def setUp(self) -> None:
    """
    Build the genes, core hits, model hits and multi-system hits used by the tests.

    The model 'foo' gets gspD as mandatory gene and sctJ as accessory gene
    (both created with ``multi_system=True``); sctN is attached to sctJ as an
    exchangeable. Five core hits are then wrapped both as ModelHit
    (``self.mhit_1`` .. ``self.mhit_5``) and as MultiSystem
    (``self.ms_1`` .. ``self.ms_5``).
    """
    args = argparse.Namespace()
    args.sequence_db = self.find_data("base", "test_1.fasta")
    args.db_type = 'gembase'
    args.models_dir = self.find_data('models')
    cfg = Config(MacsyDefaults(), args)

    model_name = 'foo'
    self.models_location = ModelLocation(path=os.path.join(args.models_dir, model_name))
    model = Model(model_name, 10)

    # we need to reset the ProfileFactory
    # because it's like a singleton
    # so other tests are influenced by ProfileFactory and its configuration
    # for instance search_genes get profile without hmmer_exe
    # (a single instance is enough: the factory is built once and kept on self)
    self.profile_factory = ProfileFactory(cfg)

    c_gene_gspd = CoreGene(self.models_location, "gspD", self.profile_factory)
    gene_gspd = ModelGene(c_gene_gspd, model, multi_system=True)

    c_gene_sctj = CoreGene(self.models_location, "sctJ", self.profile_factory)
    gene_sctj = ModelGene(c_gene_sctj, model, multi_system=True)

    # sctN can replace sctJ
    c_gene_sctn = CoreGene(self.models_location, "sctN", self.profile_factory)
    gene_sctn = Exchangeable(c_gene_sctn, gene_sctj)
    gene_sctj.add_exchangeable(gene_sctn)

    model.add_mandatory_gene(gene_gspd)
    model.add_accessory_gene(gene_sctj)

    # CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
    #         profile_coverage, sequence_coverage, begin_match, end_match)
    #                                                           pos      score
    chit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
    chit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
    chit_3 = CoreHit(c_gene_gspd, "hit_3", 803, "replicon_id", 10, 1.0, 3.0, 1.0, 1.0, 10, 20)
    chit_4 = CoreHit(c_gene_sctn, "hit_4", 803, "replicon_id", 14, 1.0, 4.0, 1.0, 1.0, 10, 20)
    chit_5 = CoreHit(c_gene_gspd, "hit_5", 803, "replicon_id", 20, 1.0, 2.0, 1.0, 1.0, 10, 20)

    self.mhit_1 = ModelHit(chit_1, gene_gspd, GeneStatus.MANDATORY)
    self.mhit_2 = ModelHit(chit_2, gene_sctj, GeneStatus.ACCESSORY)
    self.mhit_3 = ModelHit(chit_3, gene_gspd, GeneStatus.MANDATORY)
    self.mhit_4 = ModelHit(chit_4, gene_sctn, GeneStatus.ACCESSORY)
    self.mhit_5 = ModelHit(chit_5, gene_gspd, GeneStatus.MANDATORY)

    self.ms_1 = MultiSystem(chit_1, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
    self.ms_2 = MultiSystem(chit_2, gene_ref=gene_sctj, gene_status=GeneStatus.ACCESSORY)
    self.ms_3 = MultiSystem(chit_3, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
    self.ms_4 = MultiSystem(chit_4, gene_ref=gene_sctn, gene_status=GeneStatus.ACCESSORY)
    self.ms_5 = MultiSystem(chit_5, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
def setUp(self):
    """
    Create the configuration, the location of the model 'foo'
    and a ProfileFactory bound to this configuration.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
        res_search_dir=tempfile.gettempdir(),
        log_level=30,
    )
    self.cfg = Config(MacsyDefaults(), cli_args)
    self.model_name = 'foo'
    model_path = os.path.join(cli_args.models_dir, self.model_name)
    self.model_location = ModelLocation(path=model_path)
    self.profile_factory = ProfileFactory(self.cfg)
def test_search_systems_unordered(self):
    """
    Run search_systems on an 'unordered' dataset and check the ids
    of the two collections it returns.
    """
    logger = logging.getLogger('macsypy.macsyfinder')
    macsypy.logger_set_level(level='ERROR')
    defaults = MacsyDefaults()

    out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems')
    os.mkdir(out_dir)
    seq_db = self.find_data('base', 'VICH001.B.00001.C001.prt')
    model_dir = self.find_data('data_set', 'models')

    # test unordered replicon
    cmd_line = f"--sequence-db {seq_db} --db-type=unordered --models-dir {model_dir} --models set_1 all -w 4 -o {out_dir}"
    _, parsed_args = parse_args(cmd_line.split())
    config = Config(defaults, parsed_args)
    systems, uncomplete_sys = search_systems(config,
                                             ModelBank(),
                                             GeneBank(),
                                             ProfileFactory(config),
                                             logger)

    found_sys_id = [syst.id for syst in systems]
    self.assertListEqual(found_sys_id,
                         ['Unordered_T2SS_4',
                          'Unordered_MSH_3',
                          'Unordered_T4P_5',
                          'Unordered_T4bP_6'])

    found_uncomplete_sys_id = [syst.id for syst in uncomplete_sys]
    self.assertListEqual(found_uncomplete_sys_id,
                         ['Unordered_Archaeal-T4P_1',
                          'Unordered_ComM_2',
                          'Unordered_Tad_7'])
def test_search_systems_model_unknown(self):
    """
    Ask for a model which does not exist and check the error reported
    through the (replaced) sys.exit.
    """
    logger = logging.getLogger('macsypy.macsyfinder')
    macsypy.logger_set_level(level='ERROR')
    defaults = MacsyDefaults()

    out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems')
    os.mkdir(out_dir)
    seq_db = self.find_data('base', 'test_1.fasta')
    model_dir = self.find_data('data_set', 'models')

    cmd_line = f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models nimporaoik -w 4 -o {out_dir}"
    _, parsed_args = parse_args(cmd_line.split())
    config = Config(defaults, parsed_args)
    model_bank = ModelBank()
    gene_bank = GeneBank()
    profile_factory = ProfileFactory(config)

    # sys.exit is swapped for self.fake_exit during the call
    # and always restored afterwards
    genuine_exit = sys.exit
    sys.exit = self.fake_exit
    try:
        with self.assertRaises(TypeError) as ctx:
            search_systems(config, model_bank, gene_bank, profile_factory, logger)
        expected_msg = "macsyfinder: \"No such model definition: 'nimporaoik'\""
        self.assertEqual(str(ctx.exception), expected_msg)
    finally:
        sys.exit = genuine_exit
class TestCoreGene(MacsyTest):
    """Tests of CoreGene: name, model family, profile and hash."""

    def setUp(self):
        """Create the configuration, the 'foo' model location and a ProfileFactory."""
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        args.res_search_dir = tempfile.gettempdir()
        args.log_level = 30
        self.cfg = Config(MacsyDefaults(), args)
        self.model_name = 'foo'
        self.model_location = ModelLocation(path=os.path.join(args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)

    def tearDown(self):
        """Remove the working directory (best effort)."""
        # ignore_errors keeps the cleanup best-effort (the directory may not exist)
        # without the bare except, which also swallowed KeyboardInterrupt/SystemExit
        shutil.rmtree(self.cfg.working_dir(), ignore_errors=True)

    def test_core_gene(self):
        """Check CoreGene attributes and that the hash depends on the gene name."""
        model_fqn = "foo/bar"
        model = Model(model_fqn, 10)
        gene_name = 'toto'
        cg = CoreGene(self.model_location, gene_name, self.profile_factory)
        self.assertEqual(cg.name, gene_name)
        self.assertEqual(cg.model_family_name, model.family_name)
        self.assertEqual(cg.profile, self.profile_factory.get_profile(cg, self.model_location))

        # two core genes built from the same arguments must share the same hash
        cg2 = CoreGene(self.model_location, gene_name, self.profile_factory)
        self.assertIsInstance(hash(cg), int)
        self.assertEqual(hash(cg), hash(cg2))

        # a core gene with another name must have a different hash
        gene_name = 'totote'
        cg3 = CoreGene(self.model_location, gene_name, self.profile_factory)
        self.assertNotEqual(hash(cg), hash(cg3))
def test_search_recover(self):
    """
    Run search_genes twice: a first job, then a second job configured
    with the first one as previous run, and check the recovered hit.
    """
    gene_name = "abc"

    # first job searching using hmmsearch
    c_gene_abc = CoreGene(self.model_location, gene_name, self.profile_factory)
    report = search_genes([c_gene_abc], self.cfg)
    expected_hit = [Hit(c_gene_abc, "ESCO030p01_000260", 706, "ESCO030p01", 26,
                        float(1.000e-200), float(660.800), float(1.000), float(0.714),
                        160, 663)]

    # second job using recover
    # disable hmmer to be sure that test use the recover inner function
    self.cfg.hmmer = lambda: "hmmer_disable"

    # and create a new dir for the second job
    previous_job_path = self.cfg.working_dir()
    self.cfg.previous_run = lambda: previous_job_path
    self.cfg.out_dir = lambda: os.path.join(self.tmp_dir, 'job_2')
    second_job_dir = self.cfg.out_dir()
    os.mkdir(second_job_dir)

    # rerun with previous run
    # but we have to reset the profile attached to the gene gene._profile._report
    self.profile_factory = ProfileFactory(self.cfg)
    c_gene_abc = CoreGene(self.model_location, gene_name, self.profile_factory)
    report = search_genes([c_gene_abc], self.cfg)

    self.assertEqual(len(report), 1)
    recovered_hit = report[0].hits[0]
    self.assertEqual(expected_hit[0], recovered_hit)
def setUp(self):
    """
    Start from an empty temporary directory, create the output directory
    of the first job ('job_1'), then build the configuration, the indexes
    and the ProfileFactory.
    """
    self.tmp_dir = os.path.join(tempfile.gettempdir(), 'test_macsyfinder_search_genes')
    # always start from a clean directory
    if os.path.exists(self.tmp_dir):
        shutil.rmtree(self.tmp_dir)
    os.mkdir(self.tmp_dir)

    job_dir = os.path.join(self.tmp_dir, 'job_1')
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_base.fa"),
        db_type='gembase',
        models_dir=self.find_data('models'),
        log_level=30,
        out_dir=job_dir,
        res_search_dir=job_dir,
    )
    os.mkdir(job_dir)

    self.cfg = Config(MacsyDefaults(), cli_args)
    self.model_name = 'foo'
    model_path = os.path.join(cli_args.models_dir, self.model_name)
    self.model_location = ModelLocation(path=model_path)

    indexes = Indexes(self.cfg)
    indexes._build_my_indexes()
    self.profile_factory = ProfileFactory(self.cfg)
def setUp(self) -> None:
    """
    Build the model 'foo/T2SS' with gspD (mandatory) and sctJ (accessory)
    and one Hit for each of these two genes.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), cli_args)

    location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))
    t2ss = Model("foo/T2SS", 10)
    factory = ProfileFactory(cfg)

    self.c_gene_gspd = CoreGene(location, "gspD", factory)
    self.gene_gspd = ModelGene(self.c_gene_gspd, t2ss)

    self.c_gene_sctj = CoreGene(location, "sctJ", factory)
    self.gene_sctj = ModelGene(self.c_gene_sctj, t2ss)

    t2ss.add_mandatory_gene(self.gene_gspd)
    t2ss.add_accessory_gene(self.gene_sctj)

    self.hit_1 = Hit(self.c_gene_gspd, "hit_1", 803, "replicon_id", 2,
                     1.0, 1.0, 1.0, 1.0, 10, 20)
    self.hit_2 = Hit(self.c_gene_sctj, "hit_2", 803, "replicon_id", 3,
                     1.0, 1.0, 1.0, 1.0, 10, 20)
def setUp(self) -> None:
    """
    Build the model 'foo/T2SS' with gspD (mandatory, created with loner
    and multi_system set) and sctJ (accessory), four CoreHit and the
    corresponding ModelHit.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), cli_args)

    location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))
    t2ss = Model("foo/T2SS", 10)
    factory = ProfileFactory(cfg)

    self.cg_gspd = CoreGene(location, "gspD", factory)
    self.mg_gspd = ModelGene(self.cg_gspd, t2ss, loner=True, multi_system=True)

    self.cg_sctj = CoreGene(location, "sctJ", factory)
    self.mg_sctj = ModelGene(self.cg_sctj, t2ss)

    t2ss.add_mandatory_gene(self.mg_gspd)
    t2ss.add_accessory_gene(self.mg_sctj)

    # only the gene, the hit id and the position differ between the hits
    self.chit_1 = CoreHit(self.cg_gspd, "hit_1", 803, "replicon_id", 2,
                          1.0, 1.0, 1.0, 1.0, 10, 20)
    self.chit_2 = CoreHit(self.cg_sctj, "hit_2", 803, "replicon_id", 3,
                          1.0, 1.0, 1.0, 1.0, 10, 20)
    self.chit_3 = CoreHit(self.cg_gspd, "hit_3", 803, "replicon_id", 10,
                          1.0, 1.0, 1.0, 1.0, 10, 20)
    self.chit_4 = CoreHit(self.cg_gspd, "hit_4", 803, "replicon_id", 20,
                          1.0, 1.0, 1.0, 1.0, 10, 20)

    mandatory = GeneStatus.MANDATORY
    self.mhit_1 = ModelHit(self.chit_1, self.mg_gspd, mandatory)
    self.mhit_2 = ModelHit(self.chit_2, self.mg_sctj, GeneStatus.ACCESSORY)
    self.mhit_3 = ModelHit(self.chit_3, self.mg_gspd, mandatory)
    self.mhit_4 = ModelHit(self.chit_4, self.mg_gspd, mandatory)
def setUp(self):
    """
    Create a clean output directory holding a copy of the sequence file,
    then build the configuration, the hmmer directory, the ProfileFactory
    and the indexes.
    """
    models_dir = self.find_data('models')
    res_search_dir = tempfile.gettempdir()
    out_dir = os.path.join(res_search_dir, 'test_macsyfinder_Report')
    # always start from a clean directory
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)

    # the sequence file is copied in the output directory
    # and the configuration points to this copy
    seq_db = self.find_data("base", "test_base.fa")
    shutil.copy(seq_db, out_dir)

    cli_args = argparse.Namespace(
        db_type='gembase',
        models_dir=models_dir,
        res_search_dir=res_search_dir,
        log_level=30,
        out_dir=out_dir,
        sequence_db=os.path.join(out_dir, os.path.basename(seq_db)),
    )
    self.cfg = Config(MacsyDefaults(), cli_args)

    hmmer_path = os.path.join(self.cfg.out_dir(), self.cfg.hmmer_dir())
    os.mkdir(hmmer_path)

    self.model_name = 'foo'
    self.model_location = ModelLocation(path=os.path.join(models_dir, self.model_name))

    # we need to reset the ProfileFactory
    # because it's like a singleton
    # so other tests are influenced by ProfileFactory and its configuration
    # for instance search_genes get profile without hmmer_exe
    self.profile_factory = ProfileFactory(self.cfg)

    indexes = Indexes(self.cfg)
    indexes.build()
class TestProfileFactory(MacsyTest):
    """Tests of ProfileFactory.get_profile."""

    def setUp(self):
        """Create the configuration, the 'foo' model location and a ProfileFactory."""
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        args.res_search_dir = tempfile.gettempdir()
        args.log_level = 30
        self.cfg = Config(MacsyDefaults(), args)
        self.model_name = 'foo'
        self.models_location = ModelLocation(path=os.path.join(args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)

    def tearDown(self):
        """Remove the working directory (best effort)."""
        # working_dir is a method: it must be called to get the path.
        # Passing the bound method itself made rmtree fail, and the bare except
        # hid the failure, so nothing was ever cleaned up.
        # ignore_errors keeps the cleanup best-effort (the directory may not exist).
        shutil.rmtree(self.cfg.working_dir(), ignore_errors=True)

    def test_get_profile(self):
        """get_profile returns a Profile attached to the requested gene."""
        gene_name = 'sctJ_FLG'
        gene = CoreGene(self.models_location, gene_name, self.profile_factory)
        profile = self.profile_factory.get_profile(gene, self.models_location)
        self.assertIsInstance(profile, Profile)
        self.assertEqual(profile.gene.name, gene_name)

    def test_get_uniq_object(self):
        """Two calls of get_profile for the same gene give equal profiles."""
        gene_name = 'sctJ_FLG'
        gene = CoreGene(self.models_location, gene_name, self.profile_factory)
        profile1 = self.profile_factory.get_profile(gene, self.models_location)
        profile2 = self.profile_factory.get_profile(gene, self.models_location)
        self.assertEqual(profile1, profile2)

    def test_unknow_profile(self):
        """get_profile raises MacsypyError when the gene name matches no profile."""
        gene_name = 'sctJ_FLG'
        gene = CoreGene(self.models_location, gene_name, self.profile_factory)
        # rename the gene after its creation so that no profile matches it
        gene._name = "bar"
        with self.assertRaises(MacsypyError) as ctx:
            self.profile_factory.get_profile(gene, self.models_location)
        self.assertEqual(str(ctx.exception),
                         f"'{self.model_name}/{gene.name}': No such profile")
def setUp(self) -> None:
    """
    Create the configuration and a ProfileFactory, then the systems
    returned by _build_systems.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    self.cfg = Config(MacsyDefaults(), cli_args)

    # we need to reset the ProfileFactory
    # because it's like a singleton
    # so other tests are influenced by ProfileFactory and its configuration
    # for instance search_genes get profile without hmmer_exe
    self.profile_factory = ProfileFactory(self.cfg)
    self.systems = _build_systems(self.cfg, self.profile_factory)
def setUp(self) -> None:
    """
    Create the command line namespace (kept on self), the configuration,
    the 'foo' model location, a ProfileFactory and the hit weights.
    """
    self.args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
        res_search_dir="blabla",
    )
    self.cfg = Config(MacsyDefaults(), self.args)

    self.model_name = 'foo'
    model_path = os.path.join(self.args.models_dir, self.model_name)
    self.model_location = ModelLocation(path=model_path)
    self.profile_factory = ProfileFactory(self.cfg)

    weights = self.cfg.hit_weights()
    self.hit_weights = HitWeight(**weights)
def setUp(self) -> None:
    """
    Create the configuration, the 'foo' model location, a ProfileFactory
    and the hit weights, and restart the AbstractSetOfHits id counter.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    self.cfg = Config(MacsyDefaults(), cli_args)

    self.model_name = 'foo'
    model_path = os.path.join(cli_args.models_dir, self.model_name)
    self.model_location = ModelLocation(path=model_path)
    self.profile_factory = ProfileFactory(self.cfg)

    weights = self.cfg.hit_weights()
    self.hit_weights = HitWeight(**weights)

    # reset the uniq id number for AbstractSetOfHits
    # to have predictable results
    AbstractSetOfHits._id = itertools.count(1)
def setUp(self):
    """
    Create the command line namespace (kept on self), a clean output
    directory, the configuration, the 'foo' model location and a ProfileFactory.
    """
    sequence_db = self.find_data("base", "test_1.fasta")
    models_dir = self.find_data('models')
    res_search_dir = tempfile.gettempdir()
    out_dir = os.path.join(res_search_dir, 'test_macsyfinder_Model')
    self.args = argparse.Namespace(
        sequence_db=sequence_db,
        db_type='gembase',
        models_dir=models_dir,
        res_search_dir=res_search_dir,
        log_level=30,
        out_dir=out_dir,
    )

    # always start from a clean directory
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)

    self.cfg = Config(MacsyDefaults(), self.args)
    self.model_name = 'foo'
    self.model_location = ModelLocation(path=os.path.join(models_dir, self.model_name))
    self.profile_factory = ProfileFactory(self.cfg)
def setUp(self):
    """
    Create the configuration, the banks, the ProfileFactory, a ModelRegistry
    filled with every location found by scan_models_dir, and the
    DefinitionParser built from all of them.
    """
    defaults = MacsyDefaults()
    self.args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
        res_search_dir=tempfile.gettempdir(),
    )
    self.cfg = Config(defaults, self.args)

    self.model_bank = ModelBank()
    self.gene_bank = GeneBank()
    self.profile_factory = ProfileFactory(self.cfg)

    self.model_registry = ModelRegistry()
    for location in scan_models_dir(self.args.models_dir):
        self.model_registry.add(location)

    self.parser = DefinitionParser(self.cfg,
                                   self.model_bank,
                                   self.gene_bank,
                                   self.model_registry,
                                   self.profile_factory)
def test_likely_systems_to_tsv(self):
    """
    Serialize one LikelySystem holding a mandatory, an accessory, a neutral
    and a forbidden hit with likely_systems_to_tsv and compare with the
    expected text; then check the output produced for an empty list.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='unordered',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), cli_args)
    location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))

    # we need to reset the ProfileFactory
    # because it's like a singleton
    # so other tests are influenced by ProfileFactory and its configuration
    # for instance search_genes get profile without hmmer_exe
    factory = ProfileFactory(cfg)
    model = Model("foo/T2SS", 10)

    # one gene per status
    c_gene_gspd = CoreGene(location, "gspD", factory)
    gene_gspd = ModelGene(c_gene_gspd, model)
    model.add_mandatory_gene(gene_gspd)

    c_gene_sctj = CoreGene(location, "sctJ", factory)
    gene_sctj = ModelGene(c_gene_sctj, model)
    model.add_accessory_gene(gene_sctj)

    c_gene_sctc = CoreGene(location, "sctC", factory)
    gene_sctc = ModelGene(c_gene_sctc, model)
    model.add_neutral_gene(gene_sctc)

    c_gene_tadz = CoreGene(location, "tadZ", factory)
    gene_tadz = ModelGene(c_gene_tadz, model)
    model.add_forbidden_gene(gene_tadz)

    # the hits differ by their id and their sequence length
    hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
    hit_2 = Hit(c_gene_sctj, "hit_2", 804, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
    hit_3 = Hit(c_gene_sctc, "hit_3", 805, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_3 = ValidHit(hit_3, gene_sctc, GeneStatus.NEUTRAL)
    hit_4 = Hit(c_gene_tadz, "hit_4", 806, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_4 = ValidHit(hit_4, gene_tadz, GeneStatus.FORBIDDEN)

    system_1 = LikelySystem(model, [v_hit_1], [v_hit_2], [v_hit_3], [v_hit_4])

    columns = ["replicon", "hit_id", "gene_name", "hit_pos", "model_fqn", "sys_id",
               "sys_wholeness", "hit_gene_ref", "hit_status", "hit_seq_len",
               "hit_i_eval", "hit_score", "hit_profile_cov", "hit_seq_cov",
               "hit_begin_match", "hit_end_match", "used_in"]
    # expected order: mandatory, accessory, forbidden then neutral
    rows = [
        ["replicon_id", "hit_1", "gspD", "1", "foo/T2SS", "replicon_id_T2SS_1", "1.000",
         "gspD", "mandatory", "803", "1.0", "1.000", "1.000", "1.000", "10", "20", ""],
        ["replicon_id", "hit_2", "sctJ", "1", "foo/T2SS", "replicon_id_T2SS_1", "1.000",
         "sctJ", "accessory", "804", "1.0", "1.000", "1.000", "1.000", "10", "20", ""],
        ["replicon_id", "hit_4", "tadZ", "1", "foo/T2SS", "replicon_id_T2SS_1", "1.000",
         "tadZ", "forbidden", "806", "1.0", "1.000", "1.000", "1.000", "10", "20", ""],
        ["replicon_id", "hit_3", "sctC", "1", "foo/T2SS", "replicon_id_T2SS_1", "1.000",
         "sctC", "neutral", "805", "1.0", "1.000", "1.000", "1.000", "10", "20", ""],
    ]

    sol_tsv = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# Likely Systems found:"""
    sol_tsv += "\n\n"
    sol_tsv += "\t".join(columns) + "\n"
    for row in rows:
        sol_tsv += "\t".join(row) + "\n"
    sol_tsv += "\n"

    f_out = StringIO()
    track_multi_systems_hit = HitSystemTracker([system_1])
    likely_systems_to_tsv([system_1], track_multi_systems_hit, f_out)
    self.assertMultiLineEqual(sol_tsv, f_out.getvalue())

    # no system at all
    f_out = StringIO()
    likely_systems_to_tsv([], track_multi_systems_hit, f_out)
    expected_out = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# No Likely Systems found
"""
    self.assertEqual(expected_out, f_out.getvalue())
def test_SpecialHitSerializer_tsv(self):
    """
    Serialize two Loner (each one being the counterpart of the other)
    with TsvSpecialHitSerializer and compare with the expected text.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), cli_args)
    location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))

    # we need to reset the ProfileFactory
    # because it's like a singleton
    # so other tests are influenced by ProfileFactory and its configuration
    # for instance search_genes get profile without hmmer_exe
    factory = ProfileFactory(cfg)
    model = Model("foo/T2SS", 10)

    cg_gspd = CoreGene(location, "gspD", factory)
    mg_gspd = ModelGene(cg_gspd, model, loner=True)

    cg_sctj = CoreGene(location, "sctJ", factory)
    mg_sctj = ModelGene(cg_sctj, model)

    cg_abc = CoreGene(location, "abc", factory)
    mg_abc = ModelGene(cg_abc, model)

    model.add_mandatory_gene(mg_gspd)
    model.add_accessory_gene(mg_sctj)
    model.add_accessory_gene(mg_abc)

    chit_abc = CoreHit(cg_abc, "hit_abc", 803, "replicon_id", 3,
                       1.0, 1.0, 1.0, 1.0, 10, 20)
    chit_sctj = CoreHit(cg_sctj, "hit_sctj", 803, "replicon_id", 4,
                        1.0, 1.0, 1.0, 1.0, 10, 20)
    chit_gspd1 = CoreHit(cg_gspd, "hit_gspd1", 803, "replicon_id", 20,
                         1.0, 2.0, 1.0, 1.0, 10, 20)
    chit_gspd2 = CoreHit(cg_gspd, "hit_gspd2", 803, "replicon_id", 30,
                         1.0, 3.0, 1.0, 1.0, 10, 20)

    mhit_abc = ModelHit(chit_abc, mg_abc, GeneStatus.ACCESSORY)
    mhit_sctj = ModelHit(chit_sctj, mg_sctj, GeneStatus.ACCESSORY)
    mhit_gspd1 = ModelHit(chit_gspd1, mg_gspd, GeneStatus.MANDATORY)
    mhit_gspd2 = ModelHit(chit_gspd2, mg_gspd, GeneStatus.MANDATORY)

    l_gspd1 = Loner(mhit_gspd1, counterpart=[mhit_gspd2])
    l_gspd2 = Loner(mhit_gspd2, counterpart=[mhit_gspd1])

    serializer = TsvSpecialHitSerializer()
    txt = serializer.serialize([l_gspd1, l_gspd2])

    expected_lines = [
        ['replicon', 'model_fqn', 'function', 'gene_name', 'hit_id', 'hit_pos',
         'hit_status', 'hit_seq_len', 'hit_i_eval', 'hit_score', 'hit_profile_cov',
         'hit_seq_cov', 'hit_begin_match', 'hit_end_match'],
        ['replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd1', '20', 'mandatory',
         '803', '1.000e+00', '2.000', '1.000', '1.000', '10', '20'],
        ['replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd2', '30', 'mandatory',
         '803', '1.000e+00', '3.000', '1.000', '1.000', '10', '20'],
    ]
    # every line, the last one included, ends with a newline
    expected_txt = "".join("\t".join(fields) + "\n" for fields in expected_lines)

    self.maxDiff = None
    self.assertEqual(txt, expected_txt)
def setUp(self) -> None:
    """
    Build the model 'foo/model_A' made of five genes, each of them having
    one exchangeable, and one Hit per core gene stored in ``self.c_hits``.

    sctN and sctJ are mandatory, gspD is accessory, toto is neutral
    and abc is forbidden.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    self.cfg = Config(MacsyDefaults(), cli_args)
    self.model_name = 'foo'
    self.model_location = ModelLocation(path=os.path.join(cli_args.models_dir, self.model_name))
    self.profile_factory = ProfileFactory(self.cfg)
    self.model = Model("foo/model_A", 10)

    def gene_with_exchangeable(ref_name, alt_name):
        # create the model gene 'ref_name' and register 'alt_name' as its exchangeable
        # return the reference core gene, the model gene and the alternate core gene
        core_ref = CoreGene(self.model_location, ref_name, self.profile_factory)
        model_gene = ModelGene(core_ref, self.model)
        core_alt = CoreGene(self.model_location, alt_name, self.profile_factory)
        model_gene.add_exchangeable(Exchangeable(core_alt, model_gene))
        return core_ref, model_gene, core_alt

    c_gene_sctn, gene_sctn, c_gene_sctn_flg = gene_with_exchangeable("sctN", "sctN_FLG")
    c_gene_sctj, gene_sctj, c_gene_sctj_flg = gene_with_exchangeable("sctJ", "sctJ_FLG")
    c_gene_gspd, gene_gspd, c_gene_flgb = gene_with_exchangeable("gspD", "flgB")
    c_gene_abc, gene_abc, c_gene_tadz = gene_with_exchangeable("abc", "tadZ")
    c_gene_toto, gene_toto, c_gene_totote = gene_with_exchangeable("toto", "totote")

    self.model.add_mandatory_gene(gene_sctn)
    self.model.add_mandatory_gene(gene_sctj)
    self.model.add_accessory_gene(gene_gspd)
    self.model.add_neutral_gene(gene_toto)
    self.model.add_forbidden_gene(gene_abc)

    # (key in c_hits, core gene, hit id); everything else is common to all hits
    hits_description = (
        ('h_sctj', c_gene_sctj, "hit_sctj"),
        ('h_sctj_flg', c_gene_sctj_flg, "hit_sctj_flg"),
        ('h_sctn', c_gene_sctn, "hit_sctn"),
        ('h_sctn_flg', c_gene_sctn_flg, "hit_sctn_flg"),
        ('h_gspd', c_gene_gspd, "hit_gspd"),
        ('h_gspd_an', c_gene_flgb, "hit_gspd_an"),
        ('h_abc', c_gene_abc, "hit_abc"),
        ('h_abc_ho', c_gene_tadz, "hit_abc_ho"),
        ('h_toto', c_gene_toto, "hit_toto"),
        ('h_toto_ho', c_gene_totote, "hit_toto_ho"),
    )
    self.c_hits = {
        key: Hit(core_gene, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        for key, core_gene, hit_id in hits_description
    }
def test_systems_to_tsv(self):
    """
    Serialize one System made of a single cluster of two hits with
    systems_to_tsv and compare with the expected text;
    then check the output produced for an empty list.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), cli_args)
    location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))

    # we need to reset the ProfileFactory
    # because it's like a singleton
    # so other tests are influenced by ProfileFactory and its configuration
    # for instance search_genes get profile without hmmer_exe
    factory = ProfileFactory(cfg)
    model = Model("foo/T2SS", 10)

    c_gene_gspd = CoreGene(location, "gspD", factory)
    gene_gspd = ModelGene(c_gene_gspd, model)
    model.add_mandatory_gene(gene_gspd)

    c_gene_sctj = CoreGene(location, "sctJ", factory)
    gene_sctj = ModelGene(c_gene_sctj, model)
    model.add_accessory_gene(gene_sctj)

    hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
    hit_2 = Hit(c_gene_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)

    cluster = Cluster([v_hit_1, v_hit_2], model, HitWeight(**cfg.hit_weights()))
    system_1 = System(model, [cluster], cfg.redundancy_penalty())

    columns = ["replicon", "hit_id", "gene_name", "hit_pos", "model_fqn", "sys_id",
               "sys_loci", "sys_wholeness", "sys_score", "sys_occ", "hit_gene_ref",
               "hit_status", "hit_seq_len", "hit_i_eval", "hit_score",
               "hit_profile_cov", "hit_seq_cov", "hit_begin_match", "hit_end_match",
               "used_in"]
    row_hit_1 = ["replicon_id", "hit_1", "gspD", "1", "foo/T2SS", system_1.id, "1",
                 "1.000", "1.500", "1", "gspD", "mandatory", "803", "1.0", "1.000",
                 "1.000", "1.000", "10", "20", ""]
    row_hit_2 = ["replicon_id", "hit_2", "sctJ", "1", "foo/T2SS", system_1.id, "1",
                 "1.000", "1.500", "1", "sctJ", "accessory", "803", "1.0", "1.000",
                 "1.000", "1.000", "10", "20", ""]

    system_tsv = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# Systems found:
"""
    system_tsv += "\t".join(columns) + "\n"
    system_tsv += "\t".join(row_hit_1) + "\n"
    system_tsv += "\t".join(row_hit_2) + "\n\n"

    f_out = StringIO()
    track_multi_systems_hit = HitSystemTracker([system_1])
    systems_to_tsv([system_1], track_multi_systems_hit, f_out)
    self.assertMultiLineEqual(system_tsv, f_out.getvalue())

    # test No system found
    system_str = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# No Systems found
"""
    f_out = StringIO()
    track_multi_systems_hit = HitSystemTracker([])
    systems_to_tsv([], track_multi_systems_hit, f_out)
    self.assertMultiLineEqual(system_str, f_out.getvalue())
def main(args=None, loglevel=None):
    """
    Main entry point to MacSyFinder.

    Parse the command line, create the working directory, initialize the
    loggers and check that the required options are provided; then run
    :func:`search_systems` and write the results in the working directory:

    * for 'gembase' and 'ordered_replicon' db types: all_systems.txt,
      all_systems.tsv, rejected_clusters.txt, all_best_solutions.tsv
      and best_solution.tsv
    * for the other db types: all_systems.txt, all_systems.tsv
      and uncomplete_systems.txt

    :param args: the arguments passed on the command line without the program name
                 (``sys.argv[1:]`` is used when *args* is None)
    :type args: List of string
    :param loglevel: the output verbosity. If it is not set, the level
                     provided by the configuration is used.
    :type loglevel: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raise ValueError: if the working directory already exists and is
                       not an empty directory
    :raise OptionError: if --previous-run is not set and one of
                        --models, --sequence-db or --db-type is missing
    """
    import time

    args = sys.argv[1:] if args is None else args
    parser, parsed_args = parse_args(args)

    defaults = MacsyDefaults()
    config = Config(defaults, parsed_args)

    ###########################
    # creation of working dir
    ###########################
    working_dir = config.working_dir()
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
    elif not os.path.isdir(working_dir):
        raise ValueError(f"'{working_dir}' already exists and is not a directory")
    elif os.listdir(working_dir):
        raise ValueError(f"'{working_dir}' already exists and is not a empty")

    ################
    # init loggers #
    ################
    macsypy.init_logger(log_file=os.path.join(config.working_dir(), config.log_file()),
                        out=not config.mute())
    if not loglevel:
        # logs are specify from args options
        macsypy.logger_set_level(level=config.log_level())
    else:
        # used by unit tests to mute or unmute logs
        macsypy.logger_set_level(level=loglevel)

    logger = logging.getLogger('macsypy.macsyfinder')

    if parsed_args.list_models:
        print(list_models(parsed_args), file=sys.stdout)
        sys.exit(0)
    else:
        if not parsed_args.previous_run:
            # without a previous run these options are required;
            # only the first missing one is reported
            required_options = (('models', '--models'),
                                ('sequence_db', '--sequence-db'),
                                ('db_type', '--db-type'))
            for attr_name, option in required_options:
                if not getattr(parsed_args, attr_name):
                    parser.print_help()
                    print()
                    # no traceback displayed for this error
                    sys.tracebacklimit = 0
                    raise OptionError(f"argument {option} or --previous-run is required.")

        # NOTE(review): _log is not defined in this function,
        # presumably a module-level logger - confirm
        _log.info(f"command used: {' '.join(sys.argv)}")

        models = ModelBank()
        genes = GeneBank()
        profile_factory = ProfileFactory(config)

        macsypy.hit.hit_weight = macsypy.hit.HitWeight(itself=3,
                                                       exchangeable=.75,
                                                       mandatory=2,
                                                       accessory=.25,
                                                       neutral=1.5)

        logger.info("\n{:#^70}".format(" Searching systems "))
        all_systems, rejected_clusters = search_systems(config, models, genes, profile_factory, logger)

        track_multi_systems_hit = HitSystemTracker(all_systems)
        if config.db_type() in ('gembase', 'ordered_replicon'):
            #############################
            # Ordered/Gembase replicons #
            #############################

            ###########################
            # select the best systems #
            ###########################
            logger.info("\n{:#^70}".format(" Computing best solutions "))
            best_solutions = []
            one_best_solution = []

            # group systems found by replicon
            # before to search best system combination
            # NOTE(review): groupby only merges consecutive items, this assumes
            # that all_systems is already ordered by replicon_name - confirm
            for rep_name, syst_group in itertools.groupby(all_systems, key=lambda s: s.replicon_name):
                syst_group = list(syst_group)
                logger.info(f"Computing best solutions for {rep_name} (nb of systems {len(syst_group)})")
                # perf_counter is monotonic: the duration cannot be skewed
                # by an adjustment of the system clock
                t0 = time.perf_counter()
                best_sol_4_1_replicon, score = find_best_solutions(syst_group)
                t1 = time.perf_counter()
                logger.info(f"It took {t1 - t0:.2f}sec to find best solution ({score:.2f}) for replicon {rep_name}")
                # if several solutions are equivalent same number of system and score is same
                # store all equivalent solution in best_solution => all_best_systems
                # pick one in one_best_solution => best_systems
                best_solutions.extend(best_sol_4_1_replicon)
                one_best_solution.append(best_sol_4_1_replicon[0])

            ##############################
            # Write the results in files #
            ##############################
            logger.info("\n{:#^70}".format(" Writing down results "))
            system_filename = os.path.join(config.working_dir(), "all_systems.txt")
            tsv_filename = os.path.join(config.working_dir(), "all_systems.tsv")

            with open(system_filename, "w") as sys_file:
                systems_to_txt(all_systems, track_multi_systems_hit, sys_file)

            with open(tsv_filename, "w") as tsv_file:
                systems_to_tsv(all_systems, track_multi_systems_hit, tsv_file)

            cluster_filename = os.path.join(config.working_dir(), "rejected_clusters.txt")
            with open(cluster_filename, "w") as clst_file:
                rejected_clusters.sort(key=lambda clst: (clst.replicon_name, clst.model, clst.hits))
                rejected_clst_to_txt(rejected_clusters, clst_file)
            if not (all_systems or rejected_clusters):
                logger.info("No Systems found in this dataset.")

            tsv_filename = os.path.join(config.working_dir(), "all_best_solutions.tsv")
            with open(tsv_filename, "w") as tsv_file:
                solutions_to_tsv(best_solutions, track_multi_systems_hit, tsv_file)

            tsv_filename = os.path.join(config.working_dir(), "best_solution.tsv")
            with open(tsv_filename, "w") as tsv_file:
                # flatten the list and sort it
                one_best_solution = [syst for sol in one_best_solution for syst in sol]
                one_best_solution.sort(key=lambda syst: (syst.replicon_name,
                                                         syst.position[0],
                                                         syst.model.fqn,
                                                         -syst.score))
                systems_to_tsv(one_best_solution, track_multi_systems_hit, tsv_file)
        else:
            #######################
            # Unordered replicons #
            #######################

            ##############################
            # Write the results in files #
            ##############################
            logger.info("\n{:#^70}".format(" Writing down results "))

            system_filename = os.path.join(config.working_dir(), "all_systems.txt")
            with open(system_filename, "w") as sys_file:
                likely_systems_to_txt(all_systems, track_multi_systems_hit, sys_file)

            system_filename = os.path.join(config.working_dir(), "all_systems.tsv")
            with open(system_filename, "w") as sys_file:
                likely_systems_to_tsv(all_systems, track_multi_systems_hit, sys_file)

            cluster_filename = os.path.join(config.working_dir(), "uncomplete_systems.txt")
            with open(cluster_filename, "w") as clst_file:
                unlikely_systems_to_txt(rejected_clusters, clst_file)

            if not (all_systems or rejected_clusters):
                logger.info("No Systems found in this dataset.")

    logger.info("END")
def test_search_systems(self):
    """
    Run search_systems on three command lines and check what it returns:
    a gembase replicon searched with every model of set_1, the same replicon
    searched with the Tad model only, and test_1.fasta searched with T4bP.
    """
    logger = logging.getLogger('macsypy.macsyfinder')
    macsypy.logger_set_level(level='ERROR')
    defaults = MacsyDefaults()
    out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems')
    os.mkdir(out_dir)

    def run_search(command_line):
        # Every run parses its own command line and gets brand-new banks
        # and a brand-new profile factory built from that configuration.
        _, parsed_args = parse_args(command_line.split())
        config = Config(defaults, parsed_args)
        return search_systems(config, ModelBank(), GeneBank(), ProfileFactory(config), logger)

    # gembase replicon, all the models of set_1
    seq_db = self.find_data('base', 'VICH001.B.00001.C001.prt')
    model_dir = self.find_data('data_set', 'models')
    systems, rejected_clst = run_search(
        f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 all -w 4 -o {out_dir}"
    )
    # (system id, score) in the order the systems are expected to be returned
    expected = [
        ('VICH001.B.00001.C001_MSH_5', 10.5),
        ('VICH001.B.00001.C001_MSH_7', 10.0),
        ('VICH001.B.00001.C001_T4P_25', 12.0),
        ('VICH001.B.00001.C001_T4P_23', 9.5),
        ('VICH001.B.00001.C001_T4P_21', 9.0),
        ('VICH001.B.00001.C001_T4P_22', 8.5),
        ('VICH001.B.00001.C001_T4P_17', 6.0),
        ('VICH001.B.00001.C001_T4P_16', 5.0),
        ('VICH001.B.00001.C001_T4bP_26', 5.5),
        ('VICH001.B.00001.C001_T4P_24', 10.5),
        ('VICH001.B.00001.C001_T4P_18', 7.5),
        ('VICH001.B.00001.C001_T4P_19', 7.0),
        ('VICH001.B.00001.C001_T4P_20', 8.0),
        ('VICH001.B.00001.C001_T2SS_10', 8.3),
        ('VICH001.B.00001.C001_T2SS_9', 7.5),
    ]
    self.assertListEqual([s.id for s in systems], [sys_id for sys_id, _ in expected])
    self.assertListEqual([s.score for s in systems], [score for _, score in expected])
    self.assertEqual(len(rejected_clst), 11)

    # same replicon, Tad model only: no system is expected
    systems, rejected_clst = run_search(
        f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 Tad -w 4 -o {out_dir}"
    )
    self.assertEqual(systems, [])

    # test_1.fasta with T4bP: neither systems nor rejected clusters are expected
    seq_db = self.find_data('base', 'test_1.fasta')
    systems, rejected_clst = run_search(
        f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 T4bP -w 4 -o {out_dir}"
    )
    self.assertEqual(systems, [])
    self.assertEqual(rejected_clst, [])
def test_systems_to_txt(self):
    """
    Check the text written by systems_to_txt, first for an empty list of systems,
    then for one System made of a single cluster holding a mandatory and an accessory hit.
    """
    # NOTE(review): the expected texts in this test are reproduced exactly as written
    # (each on one line); confirm their line breaks against what systems_to_txt emits.
    expected_txt = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # No Systems found """
    out = StringIO()
    systems_to_txt([], HitSystemTracker([]), out)
    self.assertMultiLineEqual(expected_txt, out.getvalue())

    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), args)
    models_location = ModelLocation(path=os.path.join(args.models_dir, 'foo'))
    # The factory is built from this test's own configuration; the original author
    # notes that it acts like a singleton, so tests could otherwise influence each other.
    profile_factory = ProfileFactory(cfg)
    model = Model("foo/T2SS", 10)

    def declare_gene(name, register):
        # Create the core gene, wrap it for the model and register it with its status.
        core_gene = CoreGene(models_location, name, profile_factory)
        model_gene = ModelGene(core_gene, model)
        register(model_gene)
        return core_gene, model_gene

    core_gspd, gspd = declare_gene("gspD", model.add_mandatory_gene)
    core_sctj, sctj = declare_gene("sctJ", model.add_accessory_gene)

    gspd_hit = ValidHit(
        Hit(core_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        gspd,
        GeneStatus.MANDATORY,
    )
    sctj_hit = ValidHit(
        Hit(core_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        sctj,
        GeneStatus.ACCESSORY,
    )
    cluster = Cluster([gspd_hit, sctj_hit], model, HitWeight(**cfg.hit_weights()))
    system_1 = System(model, [cluster], cfg.redundancy_penalty())

    # The expected id is taken from the System id counter after system_1 was created:
    # one less than the value next() hands out now.
    expected_txt = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Systems found: system id = replicon_id_T2SS_{next(System._id) - 1} model = foo/T2SS replicon = replicon_id clusters = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 1)] occ = 1 wholeness = 1.000 loci nb = 1 score = 1.500 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) neutral genes: ============================================================ """
    out = StringIO()
    systems_to_txt([system_1], HitSystemTracker([system_1]), out)
    self.assertMultiLineEqual(expected_txt, out.getvalue())
def test_unnlikely_systems_to_txt(self):
    """
    Check the text written by unlikely_systems_to_txt for one UnlikelySystem holding a
    mandatory, an accessory, a neutral and a forbidden hit, then for an empty list.
    """
    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='unordered',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), args)
    models_location = ModelLocation(path=os.path.join(args.models_dir, 'foo'))
    # The factory is built from this test's own configuration; the original author
    # notes that it acts like a singleton, so tests could otherwise influence each other.
    profile_factory = ProfileFactory(cfg)
    model = Model("foo/T2SS", 10)

    # One row per gene: its name, how it is registered in the model,
    # then the id, sequence length and status of the hit matching it.
    specs = [
        ("gspD", model.add_mandatory_gene, "hit_1", 803, GeneStatus.MANDATORY),
        ("sctJ", model.add_accessory_gene, "hit_2", 804, GeneStatus.ACCESSORY),
        ("sctC", model.add_neutral_gene, "hit_3", 805, GeneStatus.NEUTRAL),
        ("tadZ", model.add_forbidden_gene, "hit_4", 806, GeneStatus.FORBIDDEN),
    ]

    # All the genes are declared first ...
    declared = []
    for gene_name, register, _, _, _ in specs:
        core_gene = CoreGene(models_location, gene_name, profile_factory)
        model_gene = ModelGene(core_gene, model)
        register(model_gene)
        declared.append((core_gene, model_gene))

    # ... then one hit per gene, in the same order.
    valid_hits = []
    for (core_gene, model_gene), (_, _, hit_id, seq_len, status) in zip(declared, specs):
        hit = Hit(core_gene, hit_id, seq_len, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        valid_hits.append(ValidHit(hit, model_gene, status))
    mandatory_hit, accessory_hit, neutral_hit, forbidden_hit = valid_hits

    reason = "why it not a system"
    system_1 = UnlikelySystem(model, [mandatory_hit], [accessory_hit], [neutral_hit], [forbidden_hit], reason)

    # NOTE(review): the expected texts in this test are reproduced exactly as written
    # (each on one line); confirm their line breaks against what unlikely_systems_to_txt emits.
    expected_txt = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Unlikely Systems found: This replicon probably not contains a system foo/T2SS: {reason} system id = replicon_id_T2SS_1 model = foo/T2SS replicon = replicon_id hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 1), ('hit_3', 'sctC', 1), ('hit_4', 'tadZ', 1)] wholeness = 1.000 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) neutral genes: \t- sctC: 1 (sctC) forbidden genes: \t- tadZ: 1 (tadZ) Use ordered replicon to have better prediction. ============================================================ """
    out = StringIO()
    unlikely_systems_to_txt([system_1], out)
    self.assertMultiLineEqual(expected_txt, out.getvalue())

    # Nothing to report: only the header and the "no systems" line are expected.
    out = StringIO()
    unlikely_systems_to_txt([], out)
    expected_txt = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # No Unlikely Systems found """
    self.assertEqual(expected_txt, out.getvalue())
def test_solutions_to_tsv(self):
    """
    Check the TSV written by solutions_to_tsv for two solutions: [sys_A, sys_B] and
    [sys_A, sys_C], with a HitSystemTracker that only knows sys_A and sys_B.
    """
    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    cfg = Config(MacsyDefaults(), args)
    models_location = ModelLocation(path=os.path.join(args.models_dir, 'foo'))
    # The factory is built from this test's own configuration; the original author
    # notes that it acts like a singleton, so tests could otherwise influence each other.
    profile_factory = ProfileFactory(cfg)

    model_a = Model("foo/A", 10)
    model_b = Model("foo/B", 10)
    model_c = Model("foo/C", 10)

    def declare_gene(name, model):
        # Create the core gene, then its counterpart attached to the given model.
        core_gene = CoreGene(models_location, name, profile_factory)
        return core_gene, ModelGene(core_gene, model)

    def add_alternate(alternate_core, reference_gene):
        # Declare alternate_core as exchangeable with reference_gene.
        reference_gene.add_exchangeable(Exchangeable(alternate_core, reference_gene))

    def make_hit(core_gene, hit_id):
        return Hit(core_gene, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)

    # genes attached to model B
    core_sctn_flg, sctn_flg = declare_gene("sctN_FLG", model_b)
    core_sctj_flg, sctj_flg = declare_gene("sctJ_FLG", model_b)
    core_flgb, flgb = declare_gene("flgB", model_b)
    core_tadz, tadz = declare_gene("tadZ", model_b)

    # genes attached to model A, each exchangeable with one of the genes above
    core_sctn, sctn = declare_gene("sctN", model_a)
    add_alternate(core_sctn_flg, sctn)
    core_sctj, sctj = declare_gene("sctJ", model_a)
    add_alternate(core_sctj_flg, sctj)
    core_gspd, gspd = declare_gene("gspD", model_a)
    add_alternate(core_flgb, gspd)
    core_abc, abc = declare_gene("abc", model_a)
    add_alternate(core_tadz, abc)

    model_a.add_mandatory_gene(sctn)
    model_a.add_mandatory_gene(sctj)
    model_a.add_accessory_gene(gspd)
    model_a.add_forbidden_gene(abc)

    model_b.add_mandatory_gene(sctn_flg)
    model_b.add_mandatory_gene(sctj_flg)
    model_b.add_accessory_gene(flgb)
    model_b.add_accessory_gene(tadz)

    model_c.add_mandatory_gene(sctn_flg)
    model_c.add_mandatory_gene(sctj_flg)
    model_c.add_mandatory_gene(flgb)
    model_c.add_accessory_gene(tadz)
    model_c.add_accessory_gene(gspd)

    hit_sctj = make_hit(core_sctj, "hit_sctj")
    hit_sctn = make_hit(core_sctn, "hit_sctn")
    hit_gspd = make_hit(core_gspd, "hit_gspd")
    hit_sctj_flg = make_hit(core_sctj_flg, "hit_sctj_flg")
    hit_flgb = make_hit(core_flgb, "hit_flgB")
    hit_tadz = make_hit(core_tadz, "hit_tadZ")

    model_a._min_mandatory_genes_required = 2
    model_a._min_genes_required = 2
    hit_weights = HitWeight(**cfg.hit_weights())
    clst_1 = Cluster(
        [
            ValidHit(hit_sctj, sctj, GeneStatus.MANDATORY),
            ValidHit(hit_sctn, sctn, GeneStatus.MANDATORY),
            ValidHit(hit_gspd, gspd, GeneStatus.ACCESSORY),
        ],
        model_a,
        hit_weights,
    )
    clst_2 = Cluster(
        [
            ValidHit(hit_sctj, sctj, GeneStatus.MANDATORY),
            ValidHit(hit_sctn, sctn, GeneStatus.MANDATORY),
        ],
        model_a,
        hit_weights,
    )

    model_b._min_mandatory_genes_required = 1
    model_b._min_genes_required = 2
    clst_3 = Cluster(
        [
            ValidHit(hit_sctj_flg, sctj_flg, GeneStatus.MANDATORY),
            ValidHit(hit_tadz, tadz, GeneStatus.ACCESSORY),
            ValidHit(hit_flgb, flgb, GeneStatus.ACCESSORY),
        ],
        model_b,
        hit_weights,
    )

    model_c._min_mandatory_genes_required = 1
    model_c._min_genes_required = 2
    clst_4 = Cluster(
        [
            ValidHit(hit_sctj_flg, sctj_flg, GeneStatus.MANDATORY),
            ValidHit(hit_tadz, tadz, GeneStatus.ACCESSORY),
            ValidHit(hit_flgb, flgb, GeneStatus.MANDATORY),
            ValidHit(hit_gspd, gspd, GeneStatus.ACCESSORY),
        ],
        model_c,
        hit_weights,
    )

    # The generated ids are replaced by fixed ones so the expected rows can name them.
    sys_a = System(model_a, [clst_1, clst_2], cfg.redundancy_penalty())
    sys_a.id = "sys_id_A"
    sys_b = System(model_b, [clst_3], cfg.redundancy_penalty())
    sys_b.id = "sys_id_B"
    sys_c = System(model_c, [clst_4], cfg.redundancy_penalty())
    sys_c.id = "sys_id_C"

    solution_1 = [sys_a, sys_b]
    solution_2 = [sys_a, sys_c]

    columns = [
        "sol_id", "replicon", "hit_id", "gene_name", "hit_pos", "model_fqn", "sys_id",
        "sys_loci", "sys_wholeness", "sys_score", "sys_occ", "hit_gene_ref", "hit_status",
        "hit_seq_len", "hit_i_eval", "hit_score", "hit_profile_cov", "hit_seq_cov",
        "hit_begin_match", "hit_end_match", "used_in",
    ]

    def tsv_row(sol_id, sys_fields, hit_id, gene, status, used_in):
        # sys_fields = (model_fqn, sys_id, sys_loci, sys_wholeness, sys_score, sys_occ);
        # in every expected row the gene name and the gene of reference are the same.
        return '\t'.join([
            sol_id, 'replicon_id', hit_id, gene, '1', *sys_fields, gene, status,
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', used_in,
        ])

    fields_a = ('foo/A', 'sys_id_A', '2', '1.000', '1.500', '2')
    fields_b = ('foo/B', 'sys_id_B', '1', '0.750', '2.000', '1')
    fields_c = ('foo/C', 'sys_id_C', '1', '0.800', '3.000', '1')

    # (hit_id, gene, status, used_in) for every hit of each system, cluster after cluster
    hits_a = [
        ('hit_sctj', 'sctJ', 'mandatory', ''),
        ('hit_sctn', 'sctN', 'mandatory', ''),
        ('hit_gspd', 'gspD', 'accessory', ''),
        ('hit_sctj', 'sctJ', 'mandatory', ''),
        ('hit_sctn', 'sctN', 'mandatory', ''),
    ]
    hits_b = [
        ('hit_sctj_flg', 'sctJ_FLG', 'mandatory', ''),
        ('hit_tadZ', 'tadZ', 'accessory', ''),
        ('hit_flgB', 'flgB', 'accessory', ''),
    ]
    hits_c = [
        ('hit_sctj_flg', 'sctJ_FLG', 'mandatory', 'sys_id_B'),
        ('hit_tadZ', 'tadZ', 'accessory', 'sys_id_B'),
        ('hit_flgB', 'flgB', 'mandatory', 'sys_id_B'),
        ('hit_gspd', 'gspD', 'accessory', 'sys_id_A'),
    ]

    lines = ["\t".join(columns)]
    for sol_id, sys_fields, sys_hits in (
        ('1', fields_a, hits_a),
        ('1', fields_b, hits_b),
        ('2', fields_a, hits_a),
        ('2', fields_c, hits_c),
    ):
        lines.extend(tsv_row(sol_id, sys_fields, *one_hit) for one_hit in sys_hits)
        # the rows of each system are followed by an empty line
        lines.append("")

    # NOTE(review): the header text is reproduced exactly as written (on one line);
    # confirm its line breaks against what solutions_to_tsv emits.
    expected_tsv = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Systems found: """
    expected_tsv += "".join(line + "\n" for line in lines)

    out = StringIO()
    hit_multi_sys_tracker = HitSystemTracker([sys_a, sys_b])
    solutions_to_tsv([solution_1, solution_2], hit_multi_sys_tracker, out)
    self.assertMultiLineEqual(expected_tsv, out.getvalue())
def test_rejected_clst_to_txt(self):
    """
    Check the text written by rejected_clst_to_txt for one RejectedClusters made of
    two clusters, then for an empty list.
    """
    # show the whole diff if one of the text comparisons below fails
    self.maxDiff = None

    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
        res_search_dir="blabla",
    )
    cfg = Config(MacsyDefaults(), args)
    models_location = ModelLocation(path=os.path.join(args.models_dir, 'foo'))
    profile_factory = ProfileFactory(cfg)

    model = Model("foo/T2SS", 11)
    core_gspd = CoreGene(models_location, "gspD", profile_factory)
    gspd = ModelGene(core_gspd, model)
    core_sctc = CoreGene(models_location, "sctC", profile_factory)
    sctc = ModelGene(core_sctc, model)
    model.add_mandatory_gene(gspd)
    model.add_accessory_gene(sctc)

    def valid_hit(core_gene, model_gene, status, hit_id, position, score):
        # Hit arguments, in order: gene, hit id, sequence length, replicon name, position,
        # i_eval, score, profile coverage, sequence coverage, begin match, end match.
        hit = Hit(core_gene, hit_id, 10, "replicon_1", position, 1.0, score, 1.0, 1.0, 10, 20)
        return ValidHit(hit, model_gene, status)

    # The hits at positions 40 and 50 reuse the ids "h10" and "h20"; the expected text relies on it.
    hit_at_10 = valid_hit(core_gspd, gspd, GeneStatus.MANDATORY, "h10", 10, 10.0)
    hit_at_20 = valid_hit(core_sctc, sctc, GeneStatus.ACCESSORY, "h20", 20, 20.0)
    hit_at_40 = valid_hit(core_gspd, gspd, GeneStatus.MANDATORY, "h10", 40, 10.0)
    hit_at_50 = valid_hit(core_sctc, sctc, GeneStatus.ACCESSORY, "h20", 50, 20.0)

    hit_weights = HitWeight(**cfg.hit_weights())
    first_clst = Cluster([hit_at_10, hit_at_20], model, hit_weights)
    second_clst = Cluster([hit_at_40, hit_at_50], model, hit_weights)
    rejected = RejectedClusters(model, [first_clst, second_clst], ["The reasons to reject this clusters"])

    # NOTE(review): the expected texts in this test are reproduced exactly as written
    # (each on one line); confirm their line breaks against what rejected_clst_to_txt emits.
    expected_txt = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Rejected clusters: Cluster: - model = T2SS - replicon = replicon_1 - hits = (h10, gspD, 10), (h20, sctC, 20) Cluster: - model = T2SS - replicon = replicon_1 - hits = (h10, gspD, 40), (h20, sctC, 50) These clusters have been rejected because: \t- The reasons to reject this clusters ============================================================ """
    out = StringIO()
    rejected_clst_to_txt([rejected], out)
    self.assertMultiLineEqual(expected_txt, out.getvalue())

    # Nothing rejected: only the header and the "no rejected clusters" line are expected.
    expected_txt = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # No Rejected clusters """
    out = StringIO()
    rejected_clst_to_txt([], out)
    self.assertMultiLineEqual(expected_txt, out.getvalue())