def test_hit_weight_default(self):
    """
    A HitWeight built from a Config that carries no weight option
    must expose the default weights.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    weights = HitWeight(**Config(MacsyDefaults(), cli_args).hit_weights())

    # (HitWeight field, expected default value)
    expected_defaults = (
        ('mandatory', 1),
        ('accessory', 0.5),
        ('itself', 1),
        ('exchangeable', 0.8),
        ('loner_multi_system', 0.7),
    )
    for field, value in expected_defaults:
        self.assertEqual(getattr(weights, field), value)
def test_hit_weights(self):
    """
    Check Config.hit_weights() first without any override, then with some
    weights overridden through the parsed arguments.
    """
    weight_names = ('mandatory', 'accessory', 'neutral', 'itself', 'exchangeable', 'loner_multi_system')

    # without override every weight must come from the defaults
    cfg = Config(self.defaults, self.parsed_args)
    default = {}
    for name in weight_names:
        default[name] = self.defaults[f"{name}_weight"]
    self.assertDictEqual(default, cfg.hit_weights())

    # override four weights; 'neutral' and 'itself' keep their default value
    overrides = {
        'mandatory': 2,
        'accessory': 1,
        'exchangeable': 0.5,
        'loner_multi_system': 0.2,
    }
    for name, value in overrides.items():
        setattr(self.parsed_args, f"{name}_weight", value)
    cfg = Config(self.defaults, self.parsed_args)

    expected = {
        'mandatory': 2,
        'accessory': 1,
        'neutral': self.defaults.neutral_weight,
        'itself': self.defaults.itself_weight,
        'exchangeable': .5,
        'loner_multi_system': .2,
    }
    self.assertDictEqual(cfg.hit_weights(), expected)
def test_model_conf(self):
    """
    Check the values exposed by Config when the models option is
    "Model_w_conf all" and models_dir points to the 'models' test data.
    """
    self.parsed_args.models_dir = self.find_data('models')
    self.parsed_args.models = "Model_w_conf all"
    cfg = Config(self.defaults, self.parsed_args)

    # NOTE(review): the last key is 'out_of_cluster' whereas test_hit_weights
    # uses 'loner_multi_system' -- confirm which key hit_weights() really returns.
    expected_weights = {
        'mandatory': 13.0,
        'accessory': 14.0,
        'neutral': 0.0,
        'itself': 11.0,
        'exchangeable': 12.0,
        'out_of_cluster': 10.0,
    }
    self.assertDictEqual(cfg.hit_weights(), expected_weights)

    # scalar options: (value returned by cfg, expected value)
    scalar_checks = (
        (cfg.i_evalue_sel(), 0.012),
        (cfg.e_value_search(), 0.12),
        (cfg.coverage_profile(), 0.55),
    )
    for observed, wanted in scalar_checks:
        self.assertEqual(observed, wanted)
    self.assertTrue(cfg.no_cut_ga())
def test_hit_weight_not_default(self):
    """
    A HitWeight built from a Config whose arguments set every weight
    must expose these weights.
    """
    # weights set on the arguments, in the order they are assigned
    given = (
        ('mandatory', 2.0),
        ('accessory', 3.0),
        ('neutral', 4.0),
        ('exchangeable', 5.0),
        ('itself', 6.0),
        ('loner_multi_system', 12),
    )
    args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    for name, value in given:
        setattr(args, f"{name}_weight", value)

    hit_weight = HitWeight(**Config(MacsyDefaults(), args).hit_weights())

    # loner_multi_system is given as the int 12 and compared to the float 12.0
    expected = (
        ('mandatory', 2.0),
        ('accessory', 3.0),
        ('neutral', 4.0),
        ('exchangeable', 5.0),
        ('itself', 6.0),
        ('loner_multi_system', 12.0),
    )
    for name, value in expected:
        self.assertEqual(getattr(hit_weight, name), value)
class TestBuildCluster(MacsyTest):
    """Exercise ``build_clusters`` on linear and circular replicons."""

    def setUp(self) -> None:
        """Build the configuration, model location, profile factory and hit weights used by the test."""
        self.args = argparse.Namespace()
        self.args.sequence_db = self.find_data("base", "test_1.fasta")
        self.args.db_type = 'gembase'
        self.args.models_dir = self.find_data('models')
        self.args.res_search_dir = "blabla"
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_name = 'foo'
        self.model_location = ModelLocation(path=os.path.join(self.args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)
        self.hit_weights = HitWeight(**self.cfg.hit_weights())

    @staticmethod
    def _hit(core_gene, hit_id, position, score):
        """Return a Hit on "replicon_1"; only the gene, id, position and score vary between hits."""
        # Hit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #     profile_coverage, sequence_coverage, begin_match, end_match)
        return Hit(core_gene, hit_id, 10, "replicon_1", position, 1.0, score, 1.0, 1.0, 10, 20)

    @staticmethod
    def _replicon_info(topology, max_position, nb_genes):
        """
        Return a RepliconInfo(topology, 1, max_position, genes).

        The replicon info handles the topology type, the min/max positions in the
        sequence dataset and the list of genes; each gene is represented by a
        tuple (seq_id, length).
        """
        genes = [(f"g_{i}", i * 10) for i in range(1, nb_genes + 1)]
        return RepliconInfo(topology, 1, max_position, genes)

    def _check_clusters(self, hits, rep_info, model, expected, shuffle=True):
        """
        Run build_clusters and compare the hits of each cluster to *expected*.

        :param hits: the hits given to build_clusters (shuffled in place when *shuffle* is True)
        :param rep_info: the RepliconInfo describing the replicon
        :param model: the model the clusters are built for
        :param expected: one list of hits per expected cluster, in the expected order
        :param shuffle: whether the hits are shuffled before being clustered
        """
        if shuffle:
            random.shuffle(hits)
        clusters = build_clusters(hits, rep_info, model, self.hit_weights)
        self.assertEqual(len(clusters), len(expected))
        for cluster, expected_hits in zip(clusters, expected):
            self.assertListEqual(cluster.hits, expected_hits)

    def test_build_clusters(self):
        """Check the clusters built for several hit layouts and replicon topologies."""
        rep_info = self._replicon_info('linear', 60, 6)
        model = Model("foo/T2SS", 11)

        core_genes = []
        model_genes = []
        for g_name in ('gspD', 'sctC', 'sctJ', 'sctN', 'abc'):
            core_gene = CoreGene(self.model_location, g_name, self.profile_factory)
            core_genes.append(core_gene)
            model_genes.append(ModelGene(core_gene, model))
        model_genes[4]._loner = True

        model.add_mandatory_gene(model_genes[0])
        model.add_mandatory_gene(model_genes[1])
        model.add_accessory_gene(model_genes[2])
        model.add_accessory_gene(model_genes[3])
        model.add_neutral_gene(model_genes[4])

        hit = self._hit
        h10 = hit(core_genes[0], "h10", 10, 10.0)
        h11 = hit(core_genes[0], "h11", 10, 11.0)
        h20 = hit(core_genes[1], "h20", 20, 20.0)
        h21 = hit(core_genes[2], "h21", 20, 21.0)
        h30 = hit(core_genes[2], "h30", 30, 30.0)
        h31 = hit(core_genes[1], "h31", 30, 31.0)
        h50 = hit(core_genes[2], "h50", 50, 50.0)
        h51 = hit(core_genes[2], "h51", 50, 51.0)
        h60 = hit(core_genes[2], "h60", 60, 60.0)
        h61 = hit(core_genes[3], "h61", 60, 61.0)

        # case replicon is linear, 2 clusters
        self._check_clusters([h10, h11, h20, h21, h30, h31, h50, h51, h60, h61],
                             rep_info, model,
                             [[h11, h21, h31], [h51, h61]])

        # case replicon is linear with a single hit (not loner) between 2 clusters
        h70 = hit(core_genes[3], "h70", 70, 80.0)
        h80 = hit(core_genes[4], "h80", 80, 80.0)
        self._check_clusters([h10, h11, h20, h21, h50, h51, h70, h80],
                             rep_info, model,
                             [[h11, h21], [h70, h80]])

        # replicon is linear, 3 clusters, the last one contains only one hit (loner)
        rep_info = self._replicon_info('linear', 100, 100)
        h80 = hit(core_genes[4], "h80", 80, 80.0)
        self._check_clusters([h10, h11, h20, h21, h30, h31, h50, h51, h60, h61, h80],
                             rep_info, model,
                             [[h11, h21, h31], [h51, h61], [h80]])

        # replicon is circular and contains only one cluster (hits are not shuffled)
        rep_info = self._replicon_info('circular', 60, 6)
        self._check_clusters([h10, h20, h30],
                             rep_info, model,
                             [[h10, h20, h30]],
                             shuffle=False)

        # replicon is circular, the last cluster is merged with the first one,
        # so we have only one cluster (hits are not shuffled)
        rep_info = self._replicon_info('circular', 60, 6)
        self._check_clusters([h10, h11, h20, h21, h30, h31, h50, h51, h60, h61],
                             rep_info, model,
                             [[h51, h61, h11, h21, h31]],
                             shuffle=False)

        # replicon is circular, the last hit is incorporated to the first cluster
        rep_info = self._replicon_info('circular', 80, 8)
        h80 = hit(core_genes[3], "h80", 80, 80.0)
        self._check_clusters([h10, h11, h20, h21, h30, h31, h50, h51, h60, h61, h80],
                             rep_info, model,
                             [[h80, h11, h21, h31], [h51, h61]])

        # same hits but the replicon is linear: the last hit is not merged with the first cluster
        rep_info = self._replicon_info('linear', 80, 8)
        self._check_clusters([h10, h11, h20, h21, h30, h31, h50, h51, h60, h61, h80],
                             rep_info, model,
                             [[h11, h21, h31], [h51, h61]])

        # case replicon is linear, 2 clusters, the hits 10,11,12 and 50,51 are contiguous
        h10 = hit(core_genes[0], "h10", 10, 11.0)
        h11 = hit(core_genes[2], "h11", 11, 21.0)
        h12 = hit(core_genes[1], "h12", 12, 31.0)
        h50 = hit(core_genes[2], "h50", 50, 51.0)
        h51 = hit(core_genes[3], "h51", 51, 61.0)
        self._check_clusters([h10, h11, h12, h50, h51],
                             rep_info, model,
                             [[h10, h11, h12], [h50, h51]])

        # case replicon is linear, one cluster with one loner hit
        # (shuffling a single hit is a no-op, so it is skipped)
        h80 = hit(core_genes[4], "h80", 80, 80.0)
        self._check_clusters([h80],
                             rep_info, model,
                             [[h80]],
                             shuffle=False)

        # case replicon is linear, no hits
        clusters = build_clusters([], rep_info, model, self.hit_weights)
        self.assertListEqual(clusters, [])
class TestCluster(MacsyTest):
    """Tests of the Cluster object: construction, container protocol, score, merge and str."""

    def setUp(self) -> None:
        """Build the configuration, model location, profile factory and hit weights shared by the tests."""
        self.args = argparse.Namespace(
            sequence_db=self.find_data("base", "test_1.fasta"),
            db_type='gembase',
            models_dir=self.find_data('models'),
            res_search_dir="blabla",
        )
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_name = 'foo'
        self.model_location = ModelLocation(path=os.path.join(self.args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)
        self.hit_weights = HitWeight(**self.cfg.hit_weights())

    def _core(self, gene_name):
        """Return the CoreGene named *gene_name* taken from the test model location."""
        return CoreGene(self.model_location, gene_name, self.profile_factory)

    @staticmethod
    def _hit(core_gene, hit_id, position, replicon="replicon_1"):
        """Return a Hit of length 10 whose score is the position expressed as a float."""
        # Hit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #     profile_coverage, sequence_coverage, begin_match, end_match)
        return Hit(core_gene, hit_id, 10, replicon, position, 1.0, float(position), 1.0, 1.0, 10, 20)

    def _mandatory_pair(self, model, replicon="replicon_1"):
        """Return two mandatory ValidHits: h10 (gspD, position 10) and h20 (sctC, position 20)."""
        core_gspd = self._core("gspD")
        core_sctc = self._core("sctC")
        gene_gspd = ModelGene(core_gspd, model)
        gene_sctc = ModelGene(core_sctc, model)
        v_h10 = ValidHit(self._hit(core_gspd, "h10", 10, replicon), gene_gspd, GeneStatus.MANDATORY)
        v_h20 = ValidHit(self._hit(core_sctc, "h20", 20, replicon), gene_sctc, GeneStatus.MANDATORY)
        return v_h10, v_h20

    def _gspd_sctc_sctj_hits(self, model):
        """
        Return four ValidHits on "replicon_1": h10 (gspD) and h20 (sctC) mandatory,
        h30 and h50 (both sctJ) accessory.
        """
        core_gspd = self._core("gspD")
        core_sctc = self._core("sctC")
        core_sctj = self._core("sctJ")
        gene_gspd = ModelGene(core_gspd, model)
        gene_sctc = ModelGene(core_sctc, model)
        gene_sctj = ModelGene(core_sctj, model)
        v_h10 = ValidHit(self._hit(core_gspd, "h10", 10), gene_gspd, GeneStatus.MANDATORY)
        v_h20 = ValidHit(self._hit(core_sctc, "h20", 20), gene_sctc, GeneStatus.MANDATORY)
        v_h30 = ValidHit(self._hit(core_sctj, "h30", 30), gene_sctj, GeneStatus.ACCESSORY)
        v_h50 = ValidHit(self._hit(core_sctj, "h50", 50), gene_sctj, GeneStatus.ACCESSORY)
        return v_h10, v_h20, v_h30, v_h50

    def test_init(self):
        """Building a Cluster with hits located on different replicons must raise a MacsypyError."""
        model_1 = Model("foo/T2SS", 11)
        core_gspd = self._core("gspD")
        core_sctc = self._core("sctC")
        core_sctj = self._core("sctJ")
        gene_1 = ModelGene(core_gspd, model_1)

        # (core gene, hit id, position, replicon, status); every hit is bound to gene_1
        hit_specs = (
            (core_gspd, "h10", 10, "replicon_1", GeneStatus.MANDATORY),
            (core_sctc, "h20", 20, "replicon_1", GeneStatus.MANDATORY),
            (core_sctj, "h30", 30, "replicon_2", GeneStatus.ACCESSORY),
            (core_sctj, "h50", 50, "replicon_2", GeneStatus.ACCESSORY),
        )
        valid_hits = [
            ValidHit(self._hit(core, hit_id, position, replicon), gene_1, status)
            for core, hit_id, position, replicon, status in hit_specs
        ]

        with self.assertRaises(MacsypyError) as ctx, self.catch_log():
            Cluster(valid_hits, model_1, self.hit_weights)
        msg = "Cannot build a cluster from hits coming from different replicons"
        self.assertEqual(str(ctx.exception), msg)

    def test_replicon_name(self):
        """The cluster replicon_name is the replicon name of its hits."""
        model = Model("foo/T2SS", 11)
        replicon_name = "replicon_1"
        v_h10, v_h20 = self._mandatory_pair(model, replicon_name)
        cluster = Cluster([v_h10, v_h20], model, self.hit_weights)
        self.assertEqual(cluster.replicon_name, replicon_name)

    def test_len(self):
        """len() of a cluster built from two hits is 2."""
        model = Model("foo/T2SS", 11)
        v_h10, v_h20 = self._mandatory_pair(model)
        cluster = Cluster([v_h10, v_h20], model, self.hit_weights)
        self.assertEqual(len(cluster), 2)

    def test_loner(self):
        """loner() is true for the one-hit cluster and false for the two-hit cluster."""
        model = Model("foo/bar", 11)
        v_h10, v_h20 = self._mandatory_pair(model)
        single = Cluster([v_h10], model, self.hit_weights)
        double = Cluster([v_h10, v_h20], model, self.hit_weights)
        self.assertTrue(single.loner())
        self.assertFalse(double.loner())

    def test_contains(self):
        """The ``in`` operator tells whether a hit belongs to the cluster."""
        model = Model("foo/T2SS", 11)
        v_h10, v_h20, v_h30, v_h50 = self._gspd_sctc_sctj_hits(model)
        cluster = Cluster([v_h10, v_h20, v_h50], model, self.hit_weights)
        self.assertIn(v_h10, cluster)
        self.assertNotIn(v_h30, cluster)

    def test_fulfilled_function(self):
        """fulfilled_function() for a gene hit directly, a gene not hit, and a gene hit through its exchangeable."""
        model = Model("foo/T2SS", 11)
        core_gspd = self._core("gspD")
        core_sctc = self._core("sctC")
        core_sctj = self._core("sctJ")
        core_sctj_flg = self._core("sctJ_FLG")
        gene_gspd = ModelGene(core_gspd, model)
        gene_sctc = ModelGene(core_sctc, model)
        gene_sctj = ModelGene(core_sctj, model)
        # sctJ_FLG is declared as exchangeable of sctJ
        exch_sctj_flg = Exchangeable(core_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(exch_sctj_flg)

        v_h10 = ValidHit(self._hit(core_gspd, "h10", 10), gene_gspd, GeneStatus.MANDATORY)
        v_h20 = ValidHit(self._hit(core_sctc, "h20", 20), gene_sctc, GeneStatus.MANDATORY)
        cluster = Cluster([v_h10, v_h20], model, self.hit_weights)
        self.assertTrue(cluster.fulfilled_function(gene_gspd))
        self.assertFalse(cluster.fulfilled_function(gene_sctj))

        # a hit on the exchangeable fulfils the function of the gene it replaces
        v_h50 = ValidHit(self._hit(core_sctj_flg, "h50", 50), exch_sctj_flg, GeneStatus.ACCESSORY)
        cluster = Cluster([v_h10, v_h50], model, self.hit_weights)
        self.assertTrue(cluster.fulfilled_function(gene_sctj))

    def test_score(self):
        """Check the score of clusters mixing mandatory, accessory, neutral, exchangeable and loner hits."""

        def hit_at_1(core_gene, hit_id, seq_length):
            # every hit of this test is at position 1 on "replicon_id" with a score of 1.0
            return Hit(core_gene, hit_id, seq_length, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)

        model = Model("foo/T2SS", 10)

        core_gspd = self._core("gspD")
        gene_gspd = ModelGene(core_gspd, model)
        model.add_mandatory_gene(gene_gspd)

        core_tadz = self._core("tadZ")
        gene_tadz = ModelGene(core_tadz, model)
        model.add_mandatory_gene(gene_tadz)

        # the variables are named sctj but the core gene is "sctC"
        core_sctj = self._core("sctC")
        gene_sctj = ModelGene(core_sctj, model)
        core_sctj_flg = self._core("sctJ_FLG")
        analog_sctj_flg = Exchangeable(core_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(analog_sctj_flg)
        model.add_accessory_gene(gene_sctj)

        core_sctn = self._core("sctN")
        gene_sctn = ModelGene(core_sctn, model, loner=True)
        core_sctn_flg = self._core("sctN_FLG")
        homolog_sctn_flg = Exchangeable(core_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(homolog_sctn_flg)
        model.add_accessory_gene(gene_sctn)

        core_toto = self._core("toto")
        gene_toto = ModelGene(core_toto, model)
        model.add_neutral_gene(gene_toto)

        core_flie = self._core("fliE")
        gene_flie = ModelGene(core_flie, model, loner=True, multi_system=True)
        model.add_mandatory_gene(gene_flie)

        v_h_gspd = ValidHit(hit_at_1(core_gspd, "h_gspd", 10), gene_gspd, GeneStatus.MANDATORY)
        v_h_tadz = ValidHit(hit_at_1(core_tadz, "h_tadz", 20), gene_tadz, GeneStatus.MANDATORY)
        v_h_sctj = ValidHit(hit_at_1(core_sctj, "h_sctj", 30), gene_sctj, GeneStatus.ACCESSORY)
        # only referenced by the disabled check below
        v_h_sctj_an = ValidHit(hit_at_1(core_sctj_flg, "h_sctj_an", 30), analog_sctj_flg, GeneStatus.ACCESSORY)
        h_sctn = hit_at_1(core_sctn, "sctn", 40)
        v_h_sctn = ValidHit(h_sctn, gene_sctn, GeneStatus.ACCESSORY)
        v_h_sctn_hom = ValidHit(hit_at_1(core_sctn_flg, "h_scth_hom", 30), homolog_sctn_flg, GeneStatus.ACCESSORY)
        # NOTE(review): this hit is built from the sctN core gene although it is bound to gene_toto -- confirm
        v_h_toto = ValidHit(hit_at_1(core_sctn, "toto", 50), gene_toto, GeneStatus.NEUTRAL)
        v_h_flie = ValidHit(hit_at_1(core_flie, "h_flie", 100), gene_flie, GeneStatus.MANDATORY)

        score_cases = (
            # 2 mandatory, 2 accessory, no analog/homolog
            ([v_h_gspd, v_h_tadz, v_h_sctj, v_h_sctn], 3.0),
            # 2 mandatory, 2 accessory, 1 neutral, no analog/homolog
            ([v_h_gspd, v_h_tadz, v_h_sctj, v_h_sctn, v_h_toto], 3.0),
            # 1 mandatory + 1 mandatory duplicated once,
            # 1 accessory + 1 accessory duplicated once, no analog/homolog
            ([v_h_gspd, v_h_tadz, v_h_sctj, v_h_sctn, v_h_gspd, v_h_sctn], 3.0),
            # 2 mandatory, 1 accessory + 1 accessory homolog
            ([v_h_gspd, v_h_tadz, v_h_sctj, v_h_sctn_hom], 2.9),
        )
        for members, expected_score in score_cases:
            cluster = Cluster(members, model, self.hit_weights)
            self.assertEqual(cluster.score, expected_score)

        # disabled check, kept from the original test:
        # 2 mandatory, 1 accessory + 1 accessory analog of the first accessory
        # c1 = Cluster([v_h_gspd, v_h_tadz, v_h_sctj, v_h_sctj_an], model, self.hit_weights)
        # self.assertEqual(c1.score, 2.5)

        # test loners multi system
        cluster = Cluster([v_h_flie], model, self.hit_weights)
        self.assertEqual(cluster.score, 0.7)
        # test the cache score
        self.assertEqual(cluster.score, 0.7)

        # a hit with the FORBIDDEN status makes the score computation fail
        non_valid_hit = ValidHit(h_sctn, gene_sctn, GeneStatus.FORBIDDEN)
        cluster = Cluster([v_h_gspd, non_valid_hit, v_h_tadz], model, self.hit_weights)
        with self.assertRaises(MacsypyError) as ctx:
            cluster.score
        self.assertEqual(str(ctx.exception),
                         "a Cluster contains hit which is neither mandatory nor accessory")

    def test_merge(self):
        """merge() appends the hits of the other cluster, or prepends them with before=True."""
        model = Model("foo/T2SS", 11)
        v_h10, v_h20, v_h30, v_h50 = self._gspd_sctc_sctj_hits(model)

        def fresh_clusters():
            # merge() modifies the cluster, so each case starts from new clusters
            first = Cluster([v_h10, v_h20], model, self.hit_weights)
            second = Cluster([v_h30, v_h50], model, self.hit_weights)
            return first, second

        c1, c2 = fresh_clusters()
        c1.merge(c2)
        self.assertListEqual(c1.hits, [v_h10, v_h20, v_h30, v_h50])

        c1, c2 = fresh_clusters()
        c2.merge(c1)
        self.assertListEqual(c2.hits, [v_h30, v_h50, v_h10, v_h20])

        c1, c2 = fresh_clusters()
        c1.merge(c2, before=True)
        self.assertListEqual(c1.hits, [v_h30, v_h50, v_h10, v_h20])

        # merging clusters built for two different models is an error
        model_2 = Model("foo/T3SS", 11)
        core_sctj = self._core("sctJ")
        gene_sctj = ModelGene(core_sctj, model)
        v_h30 = ValidHit(self._hit(core_sctj, "h30", 30, "replicon_2"), gene_sctj, GeneStatus.ACCESSORY)
        v_h50 = ValidHit(self._hit(core_sctj, "h50", 50, "replicon_2"), gene_sctj, GeneStatus.ACCESSORY)
        c3 = Cluster([v_h30, v_h50], model_2, self.hit_weights)
        with self.assertRaises(MacsypyError) as ctx:
            c1.merge(c3)
        self.assertEqual(str(ctx.exception), "Try to merge Clusters from different model")

    def test_str(self):
        """Check the string representation of a cluster."""
        model = Model("foo/T2SS", 11)
        v_h10, v_h20 = self._mandatory_pair(model)
        cluster = Cluster([v_h10, v_h20], model, self.hit_weights)
        # NOTE(review): the line breaks / indentation of this literal may have been
        # flattened to single spaces -- verify against the output of Cluster.__str__
        expected = """Cluster: - model = T2SS - replicon = replicon_1 - hits = (h10, gspD, 10), (h20, sctC, 20)"""
        self.assertEqual(str(cluster), expected)
class TestHitFunc(MacsyTest):
    """Tests of the functions working on loner hits: ``get_loners`` and ``filter_loners``."""

    # names of the genes of the test model, in the order used to index them in the tests
    _GENE_NAMES = ('gspD', 'sctC', 'sctJ', 'sctN', 'abc')

    def setUp(self) -> None:
        """Build the configuration, model location, profile factory and hit weights shared by the tests."""
        self.args = argparse.Namespace()
        self.args.sequence_db = self.find_data("base", "test_1.fasta")
        self.args.db_type = 'gembase'
        self.args.models_dir = self.find_data('models')
        self.args.res_search_dir = "blabla"
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_name = 'foo'
        self.model_location = ModelLocation(path=os.path.join(self.args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)
        self.hit_weights = HitWeight(**self.cfg.hit_weights())

    def _build_model(self, loner_indexes):
        """
        Build the "foo/T2SS" model: gspD and sctC mandatory, sctJ and sctN accessory, abc neutral.

        :param loner_indexes: the indexes (in ``_GENE_NAMES``) of the genes flagged as loner
        :return: the model and the list of its core genes, in ``_GENE_NAMES`` order
        """
        model = Model("foo/T2SS", 11)
        core_genes = [CoreGene(self.model_location, g_name, self.profile_factory)
                      for g_name in self._GENE_NAMES]
        model_genes = [ModelGene(core_gene, model) for core_gene in core_genes]
        for idx in loner_indexes:
            model_genes[idx]._loner = True

        model.add_mandatory_gene(model_genes[0])
        model.add_mandatory_gene(model_genes[1])
        model.add_accessory_gene(model_genes[2])
        model.add_accessory_gene(model_genes[3])
        model.add_neutral_gene(model_genes[4])
        return model, core_genes

    @staticmethod
    def _hit(core_gene, hit_id, position, score):
        """Return a Hit on "replicon_1"; only the gene, id, position and score vary between hits."""
        # Hit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #     profile_coverage, sequence_coverage, begin_match, end_match)
        return Hit(core_gene, hit_id, 10, "replicon_1", position, 1.0, score, 1.0, 1.0, 10, 20)

    def test_get_loners(self):
        """get_loners must return one single-hit cluster per hit matching a loner gene."""
        model, core_genes = self._build_model(loner_indexes=(3, 4))

        h10 = self._hit(core_genes[0], "h10", 10, 10.0)
        h20 = self._hit(core_genes[1], "h20", 20, 20.0)
        h30 = self._hit(core_genes[2], "h30", 30, 30.0)
        h61 = self._hit(core_genes[3], "h61", 60, 61.0)
        h80 = self._hit(core_genes[4], "h80", 80, 80.0)

        # loners are clusters of one hit
        loners = get_loners([h10, h20, h30, h61, h80], model, self.hit_weights)
        hit_from_clusters = [cluster.hits[0] for cluster in loners]
        self.assertListEqual(hit_from_clusters, [h61, h80])

    def test_filter_loners(self):
        """filter_loners must discard the loner clusters whose hit is already in the given cluster."""
        model, core_genes = self._build_model(loner_indexes=(2, 3, 4))

        h10 = self._hit(core_genes[0], "h10", 10, 10.0)
        h20 = self._hit(core_genes[1], "h20", 20, 20.0)
        h30 = self._hit(core_genes[2], "h30", 30, 30.0)
        h40 = self._hit(core_genes[3], "h40", 40, 61.0)
        h50 = self._hit(core_genes[4], "h50", 50, 80.0)

        # every loner is already part of the cluster: nothing is left
        c1 = Cluster([h10, h20, h30, h40, h50], model, self.hit_weights)
        filtered_loners = filter_loners(c1,
                                        [Cluster([h30], model, self.hit_weights),
                                         Cluster([h40], model, self.hit_weights),
                                         Cluster([h50], model, self.hit_weights)]
                                        )
        self.assertListEqual(filtered_loners, [])

        # only h40 is part of the cluster: the two other loners are kept
        c1 = Cluster([h10, h20, h40], model, self.hit_weights)
        c30 = Cluster([h30], model, self.hit_weights)
        c40 = Cluster([h40], model, self.hit_weights)
        c50 = Cluster([h50], model, self.hit_weights)
        filtered_loners = filter_loners(c1, [c30, c40, c50])
        self.assertListEqual(filtered_loners, [c30, c50])
class SerializationTest(MacsyTest):
    """Check the txt/tsv serializers of systems, solutions, likely and unlikely systems."""

    def setUp(self) -> None:
        """Create the configuration, model location, profile factory and hit weights used by the tests."""
        cli_args = argparse.Namespace()
        cli_args.sequence_db = self.find_data("base", "test_1.fasta")
        cli_args.db_type = 'gembase'
        cli_args.models_dir = self.find_data('models')
        self.cfg = Config(MacsyDefaults(), cli_args)
        self.model_name = 'foo'
        self.model_location = ModelLocation(path=os.path.join(cli_args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)
        self.hit_weights = HitWeight(**self.cfg.hit_weights())
        # restart the unique id counter of AbstractSetOfHits
        # so that the generated ids are predictable
        AbstractSetOfHits._id = itertools.count(1)

    @staticmethod
    def _hit(core_gene, hit_id, position=1, begin=10, end=20):
        """Build a hit on "replicon_id" where only the id, the position and the match bounds vary."""
        return Hit(core_gene, hit_id, 803, "replicon_id", position, 1.0, 1.0, 1.0, 1.0, begin, end)

    @staticmethod
    def _gene_with_exchangeable(core_gene, alternate_core_gene, model):
        """
        :return: a ModelGene of *model* built on *core_gene*, to which an Exchangeable
                 built on *alternate_core_gene* has been added.
        """
        gene = ModelGene(core_gene, model)
        gene.add_exchangeable(Exchangeable(alternate_core_gene, gene))
        return gene

    def _foo_model_valid_hits(self):
        """
        Build the ``foo/FOO`` model (gspD mandatory, sctJ and sctN accessory, abc forbidden)
        and one valid hit per gene (``hit_1`` .. ``hit_4`` at positions 1 .. 4).

        :return: the model and the list of the four valid hits.
        """
        model = Model("foo/FOO", 10)
        layout = (("gspD", model.add_mandatory_gene, GeneStatus.MANDATORY),
                  ("sctJ", model.add_accessory_gene, GeneStatus.ACCESSORY),
                  ("sctN", model.add_accessory_gene, GeneStatus.ACCESSORY),
                  ("abc", model.add_forbidden_gene, GeneStatus.FORBIDDEN))
        genes = []
        for name, register, status in layout:
            core = CoreGene(self.model_location, name, self.profile_factory)
            gene = ModelGene(core, model)
            register(gene)
            genes.append((core, gene, status))

        valid_hits = []
        for rank, (core, gene, status) in enumerate(genes, start=1):
            hit = self._hit(core, f"hit_{rank}", position=rank)
            valid_hits.append(ValidHit(hit, gene, status))
        return model, valid_hits

    def test_SystemSerializer_str(self):
        """Text serialization of a two-loci system sharing one hit with another system."""
        model_name = 'foo'
        location = ModelLocation(path=os.path.join(self.cfg.models_dir(), model_name))
        model_A = Model("foo/A", 10)
        model_B = Model("foo/B", 10)

        core = {name: CoreGene(location, name, self.profile_factory)
                for name in ("sctN_FLG", "sctJ_FLG", "flgB", "tadZ", "sctN", "sctJ", "gspD", "abc")}

        gene_sctn_flg = ModelGene(core["sctN_FLG"], model_B)
        gene_sctj_flg = ModelGene(core["sctJ_FLG"], model_B)
        gene_tadZ = ModelGene(core["tadZ"], model_B)
        gene_sctn = self._gene_with_exchangeable(core["sctN"], core["sctN_FLG"], model_A)
        gene_sctj = self._gene_with_exchangeable(core["sctJ"], core["sctJ_FLG"], model_A)
        gene_gspd = self._gene_with_exchangeable(core["gspD"], core["flgB"], model_A)
        gene_abc = self._gene_with_exchangeable(core["abc"], core["tadZ"], model_A)

        for gene in (gene_sctn, gene_sctj):
            model_A.add_mandatory_gene(gene)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_forbidden_gene(gene_abc)

        for gene in (gene_sctn_flg, gene_sctj_flg):
            model_B.add_mandatory_gene(gene)
        # gspD is registered in both models, its hit is used by both systems below
        for gene in (gene_gspd, gene_tadZ):
            model_B.add_accessory_gene(gene)

        h_sctj = self._hit(core["sctJ"], "hit_sctj")
        h_sctn = self._hit(core["sctN"], "hit_sctn")
        h_gspd = self._hit(core["gspD"], "hit_gspd")
        h_sctj_flg = self._hit(core["sctJ_FLG"], "hit_sctj_flg")
        h_tadZ = self._hit(core["tadZ"], "hit_tadZ")

        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        c1 = Cluster([ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
                      ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY),
                      ValidHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)],
                     model_A, self.hit_weights)
        c2 = Cluster([ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
                      ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)],
                     model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        c3 = Cluster([ValidHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY),
                      ValidHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY),
                      ValidHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)],
                     model_B, self.hit_weights)

        sys_A = System(model_A, [c1, c2], self.cfg.redundancy_penalty())
        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [c3], self.cfg.redundancy_penalty())
        sys_B.id = "sys_id_B"
        tracker = HitSystemTracker([sys_A, sys_B])
        serializer = TxtSystemSerializer()

        # NOTE(review): the line breaks of this expected text were restored by hand,
        # confirm them against the serializer output
        sys_str = f"""system id = {sys_A.id}
model = foo/A
replicon = replicon_id
clusters = [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1), ('hit_gspd', 'gspD', 1)], [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1)]
occ = 2
wholeness = 1.000
loci nb = 2
score = 1.500

mandatory genes:
\t- sctN: 2 (sctN, sctN)
\t- sctJ: 2 (sctJ, sctJ)

accessory genes:
\t- gspD: 1 (gspD [sys_id_B])

neutral genes:
"""
        self.assertEqual(sys_str, serializer.serialize(sys_A, tracker))

    def test_SystemSerializer_tsv(self):
        """Tsv serialization of a multi-loci system where one hit matches an exchangeable gene."""
        model = Model("foo/T2SS", 10)

        core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(core_gspd, model)
        model.add_mandatory_gene(gene_gspd)

        core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(core_sctj, model)
        model.add_accessory_gene(gene_sctj)

        core_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(core_sctn, model)
        core_sctn_flg = CoreGene(self.model_location, "sctN_FLG", self.profile_factory)
        gene_sctn_flg = Exchangeable(core_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_flg)
        model.add_accessory_gene(gene_sctn)

        v_h_gspd = ValidHit(self._hit(core_gspd, "h_gspd", position=10, begin=10, end=20),
                            gene_gspd, GeneStatus.MANDATORY)
        v_h_sctj = ValidHit(self._hit(core_sctj, "h_sctj", position=20, begin=20, end=30),
                            gene_sctj, GeneStatus.ACCESSORY)
        v_h_sctn_flg = ValidHit(self._hit(core_sctn_flg, "h_sctn_flg", position=30, begin=30, end=40),
                                gene_sctn_flg, GeneStatus.ACCESSORY)

        c1 = Cluster([v_h_gspd, v_h_sctj], model, self.hit_weights)
        c2 = Cluster([v_h_sctn_flg], model, self.hit_weights)
        sys_multi_loci = System(model, [c1, c2], self.cfg.redundancy_penalty())
        tracker = HitSystemTracker([sys_multi_loci])
        serializer = TsvSystemSerializer()

        # columns describing the system are the same on every row
        sys_cols = ["foo/T2SS", sys_multi_loci.id, "1", "1.000", "1.900", "1"]
        scores = ["803", "1.0", "1.000", "1.000", "1.000"]
        expected_rows = [
            ["replicon_id", "h_gspd", "gspD", "10", *sys_cols, "gspD", "mandatory", *scores, "10", "20", ""],
            ["replicon_id", "h_sctj", "sctJ", "20", *sys_cols, "sctJ", "accessory", *scores, "20", "30", ""],
            # the hit matches the exchangeable sctN_FLG, the gene of reference is sctN
            ["replicon_id", "h_sctn_flg", "sctN_FLG", "30", *sys_cols, "sctN", "accessory", *scores, "30", "40", ""],
        ]
        sys_tsv = "".join("\t".join(row) + "\n" for row in expected_rows)
        self.assertEqual(sys_tsv, serializer.serialize(sys_multi_loci, tracker))

    def test_SolutionSerializer_tsv(self):
        """Tsv serialization of a solution made of two systems."""
        model_name = 'foo'
        location = ModelLocation(path=os.path.join(self.cfg.models_dir(), model_name))
        model_A = Model("foo/A", 10)
        model_B = Model("foo/B", 10)

        core = {name: CoreGene(location, name, self.profile_factory)
                for name in ("sctN_FLG", "sctJ_FLG", "flgB", "tadZ", "sctN", "sctJ", "gspD", "abc")}

        gene_sctn_flg = ModelGene(core["sctN_FLG"], model_B)
        gene_sctj_flg = ModelGene(core["sctJ_FLG"], model_B)
        gene_flgB = ModelGene(core["flgB"], model_B)
        gene_tadZ = ModelGene(core["tadZ"], model_B)
        gene_sctn = self._gene_with_exchangeable(core["sctN"], core["sctN_FLG"], model_A)
        gene_sctj = self._gene_with_exchangeable(core["sctJ"], core["sctJ_FLG"], model_A)
        gene_gspd = self._gene_with_exchangeable(core["gspD"], core["flgB"], model_A)
        gene_abc = self._gene_with_exchangeable(core["abc"], core["tadZ"], model_A)

        for gene in (gene_sctn, gene_sctj):
            model_A.add_mandatory_gene(gene)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_forbidden_gene(gene_abc)

        for gene in (gene_sctn_flg, gene_sctj_flg):
            model_B.add_mandatory_gene(gene)
        for gene in (gene_flgB, gene_tadZ):
            model_B.add_accessory_gene(gene)

        h_sctj = self._hit(core["sctJ"], "hit_sctj")
        h_sctn = self._hit(core["sctN"], "hit_sctn")
        h_gspd = self._hit(core["gspD"], "hit_gspd")
        h_sctj_flg = self._hit(core["sctJ_FLG"], "hit_sctj_flg")
        h_flgB = self._hit(core["flgB"], "hit_flgB")
        h_tadZ = self._hit(core["tadZ"], "hit_tadZ")

        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        c1 = Cluster([ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
                      ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY),
                      ValidHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)],
                     model_A, self.hit_weights)
        c2 = Cluster([ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
                      ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)],
                     model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        c3 = Cluster([ValidHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY),
                      ValidHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY),
                      ValidHit(h_flgB, gene_flgB, GeneStatus.ACCESSORY)],
                     model_B, self.hit_weights)

        sys_A = System(model_A, [c1, c2], self.cfg.redundancy_penalty())
        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [c3], self.cfg.redundancy_penalty())
        sys_B.id = "sys_id_B"

        sol = [sys_A, sys_B]
        sol_id = '12'
        tracker = HitSystemTracker([sys_A, sys_B])
        serializer = TsvSolutionSerializer()

        def system_rows(sys_cols, hits):
            """Expected lines for one system, followed by the blank line closing the system."""
            lines = ['\t'.join([sol_id, 'replicon_id', hit_id, gene, '1', *sys_cols, gene, status,
                                '803', '1.0', '1.000', '1.000', '1.000', '10', '20', ''])
                     for hit_id, gene, status in hits]
            return "\n".join(lines) + "\n\n"

        sol_tsv = system_rows(['foo/A', 'sys_id_A', '2', '1.000', '1.500', '2'],
                              [('hit_sctj', 'sctJ', 'mandatory'),
                               ('hit_sctn', 'sctN', 'mandatory'),
                               ('hit_gspd', 'gspD', 'accessory'),
                               ('hit_sctj', 'sctJ', 'mandatory'),
                               ('hit_sctn', 'sctN', 'mandatory')])
        sol_tsv += system_rows(['foo/B', 'sys_id_B', '1', '0.750', '2.000', '1'],
                               [('hit_sctj_flg', 'sctJ_FLG', 'mandatory'),
                                ('hit_tadZ', 'tadZ', 'accessory'),
                                ('hit_flgB', 'flgB', 'accessory')])

        ser = serializer.serialize(sol, sol_id, tracker)
        self.assertEqual(ser, sol_tsv)

    def test_LikelySystemSerializer_txt(self):
        """Text serialization of a likely system which also contains a forbidden hit."""
        model, (v_hit_1, v_hit_2, v_hit_3, v_hit_4) = self._foo_model_valid_hits()
        ls_1 = LikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [v_hit_4])
        tracker = HitSystemTracker([ls_1])
        ser = TxtLikelySystemSerializer()

        txt = ser.serialize(ls_1, tracker)
        # NOTE(review): the line breaks of this expected text were restored by hand,
        # confirm them against the serializer output
        expected_txt = """This replicon contains genetic materials needed for system foo/FOO
WARNING there quorum is reached but there is also some forbidden genes.

system id = replicon_id_FOO_1
model = foo/FOO
replicon = replicon_id
hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)]
wholeness = 1.000

mandatory genes:
\t- gspD: 1 (gspD)

accessory genes:
\t- sctJ: 1 (sctJ)
\t- sctN: 1 (sctN)

neutral genes:

forbidden genes:
\t- abc: 1 (abc)

Use ordered replicon to have better prediction.
"""
        self.assertEqual(txt, expected_txt)

    def test_UnlikelySystemSerializer_txt(self):
        """Text serialization of an unlikely system, the reasons are reported after the title."""
        model, (v_hit_1, v_hit_2, v_hit_3, v_hit_4) = self._foo_model_valid_hits()
        ser = TxtUnikelySystemSerializer()
        ls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [v_hit_4], ["the reason why"])

        txt = ser.serialize(ls_1)
        # NOTE(review): the line breaks of this expected text were restored by hand,
        # confirm them against the serializer output
        expected_txt = """This replicon probably not contains a system foo/FOO:
the reason why

system id = replicon_id_FOO_1
model = foo/FOO
replicon = replicon_id
hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)]
wholeness = 1.000

mandatory genes:
\t- gspD: 1 (gspD)

accessory genes:
\t- sctJ: 1 (sctJ)
\t- sctN: 1 (sctN)

neutral genes:

forbidden genes:
\t- abc: 1 (abc)

Use ordered replicon to have better prediction.
"""
        self.assertEqual(txt, expected_txt)
def test_systems_to_txt(self):
    """Check the text report written by ``systems_to_txt``, without and with one system."""
    # NOTE(review): the line breaks of the expected texts below were restored by hand,
    # confirm them against the output of systems_to_txt

    # first case: no system at all
    system_str = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# No Systems found
"""
    f_out = StringIO()
    empty_tracker = HitSystemTracker([])
    systems_to_txt([], empty_tracker, f_out)
    self.assertMultiLineEqual(system_str, f_out.getvalue())

    # second case: one system made of one cluster of two hits
    cli_args = argparse.Namespace()
    cli_args.sequence_db = self.find_data("base", "test_1.fasta")
    cli_args.db_type = 'gembase'
    cli_args.models_dir = self.find_data('models')
    cfg = Config(MacsyDefaults(), cli_args)

    models_location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))

    # we need to reset the ProfileFactory
    # because it behaves like a singleton,
    # so other tests are influenced by the ProfileFactory and its configuration
    # for instance search_genes gets profiles without hmmer_exe
    profile_factory = ProfileFactory(cfg)

    model = Model("foo/T2SS", 10)

    core_gspd = CoreGene(models_location, "gspD", profile_factory)
    gene_gspd = ModelGene(core_gspd, model)
    model.add_mandatory_gene(gene_gspd)

    core_sctj = CoreGene(models_location, "sctJ", profile_factory)
    gene_sctj = ModelGene(core_sctj, model)
    model.add_accessory_gene(gene_sctj)

    def make_hit(core_gene, hit_id):
        return Hit(core_gene, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)

    v_hit_1 = ValidHit(make_hit(core_gspd, "hit_1"), gene_gspd, GeneStatus.MANDATORY)
    v_hit_2 = ValidHit(make_hit(core_sctj, "hit_2"), gene_sctj, GeneStatus.ACCESSORY)

    cluster = Cluster([v_hit_1, v_hit_2], model, HitWeight(**cfg.hit_weights()))
    system_1 = System(model, [cluster], cfg.redundancy_penalty())

    # the id of system_1 is derived from the System counter:
    # the value taken just after the creation of the system, minus one
    system_str = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# Systems found:

system id = replicon_id_T2SS_{next(System._id) - 1}
model = foo/T2SS
replicon = replicon_id
clusters = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 1)]
occ = 1
wholeness = 1.000
loci nb = 1
score = 1.500

mandatory genes:
\t- gspD: 1 (gspD)

accessory genes:
\t- sctJ: 1 (sctJ)

neutral genes:

============================================================
"""
    f_out = StringIO()
    tracker = HitSystemTracker([system_1])
    systems_to_txt([system_1], tracker, f_out)
    self.assertMultiLineEqual(system_str, f_out.getvalue())
def test_rejected_clst_to_txt(self):
    """Check the text report written by ``rejected_clst_to_txt``, with and without rejected clusters."""
    cli_args = argparse.Namespace()
    cli_args.sequence_db = self.find_data("base", "test_1.fasta")
    cli_args.db_type = 'gembase'
    cli_args.models_dir = self.find_data('models')
    cli_args.res_search_dir = "blabla"
    cfg = Config(MacsyDefaults(), cli_args)

    models_location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))
    profile_factory = ProfileFactory(cfg)

    model = Model("foo/T2SS", 11)
    core_gspd = CoreGene(models_location, "gspD", profile_factory)
    gene_1 = ModelGene(core_gspd, model)
    core_sctc = CoreGene(models_location, "sctC", profile_factory)
    gene_2 = ModelGene(core_sctc, model)
    model.add_mandatory_gene(gene_1)
    model.add_accessory_gene(gene_2)

    def make_hit(core_gene, hit_id, position, score):
        return Hit(core_gene, hit_id, 10, "replicon_1", position, 1.0, score, 1.0, 1.0, 10, 20)

    # the second pair of hits reuses the ids "h10" / "h20" at other positions,
    # the expected report below relies on these ids
    v_h10 = ValidHit(make_hit(core_gspd, "h10", 10, 10.0), gene_1, GeneStatus.MANDATORY)
    v_h20 = ValidHit(make_hit(core_sctc, "h20", 20, 20.0), gene_2, GeneStatus.ACCESSORY)
    v_h40 = ValidHit(make_hit(core_gspd, "h10", 40, 10.0), gene_1, GeneStatus.MANDATORY)
    v_h50 = ValidHit(make_hit(core_sctc, "h20", 50, 20.0), gene_2, GeneStatus.ACCESSORY)

    hit_weights = HitWeight(**cfg.hit_weights())
    c1 = Cluster([v_h10, v_h20], model, hit_weights)
    c2 = Cluster([v_h40, v_h50], model, hit_weights)
    rejected = RejectedClusters(model, [c1, c2], ["The reasons to reject this clusters"])

    # NOTE(review): the line breaks of the expected texts below were restored by hand,
    # confirm them against the output of rejected_clst_to_txt
    rej_clst_str = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# Rejected clusters:

Cluster:
- model = T2SS
- replicon = replicon_1
- hits = (h10, gspD, 10), (h20, sctC, 20)
Cluster:
- model = T2SS
- replicon = replicon_1
- hits = (h10, gspD, 40), (h20, sctC, 50)
These clusters have been rejected because:
\t- The reasons to reject this clusters
============================================================
"""
    f_out = StringIO()
    rejected_clst_to_txt([rejected], f_out)
    # display the whole diff if the comparison fails
    self.maxDiff = None
    self.assertMultiLineEqual(rej_clst_str, f_out.getvalue())

    # nothing has been rejected
    rej_clst_str = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# No Rejected clusters
"""
    f_out = StringIO()
    rejected_clst_to_txt([], f_out)
    self.assertMultiLineEqual(rej_clst_str, f_out.getvalue())
def test_solutions_to_tsv(self):
    """Check the tsv report written by ``solutions_to_tsv`` for two solutions sharing one system."""
    cli_args = argparse.Namespace()
    cli_args.sequence_db = self.find_data("base", "test_1.fasta")
    cli_args.db_type = 'gembase'
    cli_args.models_dir = self.find_data('models')
    cfg = Config(MacsyDefaults(), cli_args)

    models_location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))

    # we need to reset the ProfileFactory
    # because it behaves like a singleton,
    # so other tests are influenced by the ProfileFactory and its configuration
    # for instance search_genes gets profiles without hmmer_exe
    profile_factory = ProfileFactory(cfg)

    model_A = Model("foo/A", 10)
    model_B = Model("foo/B", 10)
    model_C = Model("foo/C", 10)

    core = {name: CoreGene(models_location, name, profile_factory)
            for name in ("sctN_FLG", "sctJ_FLG", "flgB", "tadZ", "sctN", "sctJ", "gspD", "abc")}

    def gene_with_exchangeable(ref_name, alternate_name):
        """Gene of model_A built on *ref_name*, exchangeable with *alternate_name*."""
        gene = ModelGene(core[ref_name], model_A)
        gene.add_exchangeable(Exchangeable(core[alternate_name], gene))
        return gene

    gene_sctn_flg = ModelGene(core["sctN_FLG"], model_B)
    gene_sctj_flg = ModelGene(core["sctJ_FLG"], model_B)
    gene_flgB = ModelGene(core["flgB"], model_B)
    gene_tadZ = ModelGene(core["tadZ"], model_B)
    gene_sctn = gene_with_exchangeable("sctN", "sctN_FLG")
    gene_sctj = gene_with_exchangeable("sctJ", "sctJ_FLG")
    gene_gspd = gene_with_exchangeable("gspD", "flgB")
    gene_abc = gene_with_exchangeable("abc", "tadZ")

    for gene in (gene_sctn, gene_sctj):
        model_A.add_mandatory_gene(gene)
    model_A.add_accessory_gene(gene_gspd)
    model_A.add_forbidden_gene(gene_abc)

    for gene in (gene_sctn_flg, gene_sctj_flg):
        model_B.add_mandatory_gene(gene)
    for gene in (gene_flgB, gene_tadZ):
        model_B.add_accessory_gene(gene)

    for gene in (gene_sctn_flg, gene_sctj_flg, gene_flgB):
        model_C.add_mandatory_gene(gene)
    for gene in (gene_tadZ, gene_gspd):
        model_C.add_accessory_gene(gene)

    def make_hit(gene_name, hit_id):
        return Hit(core[gene_name], hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)

    h_sctj = make_hit("sctJ", "hit_sctj")
    h_sctn = make_hit("sctN", "hit_sctn")
    h_gspd = make_hit("gspD", "hit_gspd")
    h_sctj_flg = make_hit("sctJ_FLG", "hit_sctj_flg")
    h_flgB = make_hit("flgB", "hit_flgB")
    h_tadZ = make_hit("tadZ", "hit_tadZ")

    model_A._min_mandatory_genes_required = 2
    model_A._min_genes_required = 2
    hit_weights = HitWeight(**cfg.hit_weights())
    c1 = Cluster([ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
                  ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY),
                  ValidHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)],
                 model_A, hit_weights)
    c2 = Cluster([ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
                  ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)],
                 model_A, hit_weights)

    model_B._min_mandatory_genes_required = 1
    model_B._min_genes_required = 2
    c3 = Cluster([ValidHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY),
                  ValidHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY),
                  ValidHit(h_flgB, gene_flgB, GeneStatus.ACCESSORY)],
                 model_B, hit_weights)

    model_C._min_mandatory_genes_required = 1
    model_C._min_genes_required = 2
    c4 = Cluster([ValidHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY),
                  ValidHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY),
                  ValidHit(h_flgB, gene_flgB, GeneStatus.MANDATORY),
                  ValidHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)],
                 model_C, hit_weights)

    sys_A = System(model_A, [c1, c2], cfg.redundancy_penalty())
    sys_A.id = "sys_id_A"
    sys_B = System(model_B, [c3], cfg.redundancy_penalty())
    sys_B.id = "sys_id_B"
    sys_C = System(model_C, [c4], cfg.redundancy_penalty())
    sys_C.id = "sys_id_C"

    sol_1 = [sys_A, sys_B]
    sol_2 = [sys_A, sys_C]
    sol_id_1 = '1'
    sol_id_2 = '2'

    def system_rows(sol_id, sys_cols, hits):
        """Expected lines for one system, followed by the blank line closing the system."""
        lines = ['\t'.join([sol_id, 'replicon_id', hit_id, gene, '1', *sys_cols, gene, status,
                            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', used_in])
                 for hit_id, gene, status, used_in in hits]
        return "\n".join(lines) + "\n\n"

    # columns describing each system, and the hits reported for it
    cols_A = ['foo/A', 'sys_id_A', '2', '1.000', '1.500', '2']
    cols_B = ['foo/B', 'sys_id_B', '1', '0.750', '2.000', '1']
    cols_C = ['foo/C', 'sys_id_C', '1', '0.800', '3.000', '1']
    hits_A = [('hit_sctj', 'sctJ', 'mandatory', ''),
              ('hit_sctn', 'sctN', 'mandatory', ''),
              ('hit_gspd', 'gspD', 'accessory', ''),
              ('hit_sctj', 'sctJ', 'mandatory', ''),
              ('hit_sctn', 'sctN', 'mandatory', '')]
    hits_B = [('hit_sctj_flg', 'sctJ_FLG', 'mandatory', ''),
              ('hit_tadZ', 'tadZ', 'accessory', ''),
              ('hit_flgB', 'flgB', 'accessory', '')]
    hits_C = [('hit_sctj_flg', 'sctJ_FLG', 'mandatory', 'sys_id_B'),
              ('hit_tadZ', 'tadZ', 'accessory', 'sys_id_B'),
              ('hit_flgB', 'flgB', 'mandatory', 'sys_id_B'),
              ('hit_gspd', 'gspD', 'accessory', 'sys_id_A')]

    # NOTE(review): the line breaks of this header were restored by hand,
    # confirm them against the output of solutions_to_tsv
    sol_tsv = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# Systems found:
"""
    sol_tsv += "\t".join(["sol_id", "replicon", "hit_id", "gene_name", "hit_pos", "model_fqn",
                          "sys_id", "sys_loci", "sys_wholeness", "sys_score", "sys_occ",
                          "hit_gene_ref", "hit_status", "hit_seq_len", "hit_i_eval", "hit_score",
                          "hit_profile_cov", "hit_seq_cov", "hit_begin_match", "hit_end_match",
                          "used_in"])
    sol_tsv += "\n"
    sol_tsv += system_rows(sol_id_1, cols_A, hits_A)
    sol_tsv += system_rows(sol_id_1, cols_B, hits_B)
    sol_tsv += system_rows(sol_id_2, cols_A, hits_A)
    sol_tsv += system_rows(sol_id_2, cols_C, hits_C)

    f_out = StringIO()
    # the tracker only knows sys_A and sys_B
    tracker = HitSystemTracker([sys_A, sys_B])
    solutions_to_tsv([sol_1, sol_2], tracker, f_out)
    self.assertMultiLineEqual(sol_tsv, f_out.getvalue())
def test_systems_to_tsv(self):
    """Check the tsv report written by ``systems_to_tsv``, with one system then without any."""
    cli_args = argparse.Namespace()
    cli_args.sequence_db = self.find_data("base", "test_1.fasta")
    cli_args.db_type = 'gembase'
    cli_args.models_dir = self.find_data('models')
    cfg = Config(MacsyDefaults(), cli_args)

    models_location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))

    # we need to reset the ProfileFactory
    # because it behaves like a singleton,
    # so other tests are influenced by the ProfileFactory and its configuration
    # for instance search_genes gets profiles without hmmer_exe
    profile_factory = ProfileFactory(cfg)

    model = Model("foo/T2SS", 10)

    core_gspd = CoreGene(models_location, "gspD", profile_factory)
    gene_gspd = ModelGene(core_gspd, model)
    model.add_mandatory_gene(gene_gspd)

    core_sctj = CoreGene(models_location, "sctJ", profile_factory)
    gene_sctj = ModelGene(core_sctj, model)
    model.add_accessory_gene(gene_sctj)

    def make_hit(core_gene, hit_id):
        return Hit(core_gene, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)

    v_hit_1 = ValidHit(make_hit(core_gspd, "hit_1"), gene_gspd, GeneStatus.MANDATORY)
    v_hit_2 = ValidHit(make_hit(core_sctj, "hit_2"), gene_sctj, GeneStatus.ACCESSORY)

    cluster = Cluster([v_hit_1, v_hit_2], model, HitWeight(**cfg.hit_weights()))
    system_1 = System(model, [cluster], cfg.redundancy_penalty())

    # NOTE(review): the line breaks of the expected headers below were restored by hand,
    # confirm them against the output of systems_to_tsv
    system_tsv = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# Systems found:
"""
    column_names = ["replicon", "hit_id", "gene_name", "hit_pos", "model_fqn", "sys_id",
                    "sys_loci", "sys_wholeness", "sys_score", "sys_occ", "hit_gene_ref",
                    "hit_status", "hit_seq_len", "hit_i_eval", "hit_score", "hit_profile_cov",
                    "hit_seq_cov", "hit_begin_match", "hit_end_match", "used_in"]
    # columns describing the system are the same on every row
    sys_cols = ["foo/T2SS", system_1.id, "1", "1.000", "1.500", "1"]
    hit_cols = ["803", "1.0", "1.000", "1.000", "1.000", "10", "20", ""]
    rows = [
        column_names,
        ["replicon_id", "hit_1", "gspD", "1", *sys_cols, "gspD", "mandatory", *hit_cols],
        ["replicon_id", "hit_2", "sctJ", "1", *sys_cols, "sctJ", "accessory", *hit_cols],
    ]
    # one line per row, the last one is followed by an empty line
    system_tsv += "\n".join("\t".join(row) for row in rows) + "\n\n"

    f_out = StringIO()
    tracker = HitSystemTracker([system_1])
    systems_to_tsv([system_1], tracker, f_out)
    self.assertMultiLineEqual(system_tsv, f_out.getvalue())

    # test No system found
    system_str = f"""# macsyfinder {macsypy.__version__}
# {' '.join(sys.argv)}
# No Systems found
"""
    f_out = StringIO()
    empty_tracker = HitSystemTracker([])
    systems_to_tsv([], empty_tracker, f_out)
    self.assertMultiLineEqual(system_str, f_out.getvalue())
class MatchMakerTest(MacsyTest):
    """
    Tests for ``OrderedMatchMaker`` and ``UnorderedMatchMaker``.

    Every test works on the model built in :meth:`setUp` and tunes the quorum
    by assigning ``_min_mandatory_genes_required`` / ``_min_genes_required``
    directly on that model before each scenario.
    """

    def setUp(self) -> None:
        """
        Build the configuration, the model ``foo/model_A`` and one hit per gene.

        Model content:

        * mandatory: sctN (exchangeable sctN_FLG), sctJ (exchangeable sctJ_FLG)
        * accessory: gspD (exchangeable flgB)
        * neutral: toto (exchangeable totote)
        * forbidden: abc (exchangeable tadZ)

        ``self.c_hits`` maps a short key to a ``Hit`` on the matching core gene;
        the ``*_flg`` / ``*_an`` / ``*_ho`` keys are hits on the exchangeables.
        """
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        self.cfg = Config(MacsyDefaults(), args)
        self.model_name = 'foo'
        self.model_location = ModelLocation(
            path=os.path.join(args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)
        self.model = Model("foo/model_A", 10)

        # sctN and its exchangeable sctN_FLG
        c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, self.model)
        c_gene_sctn_flg = CoreGene(self.model_location, "sctN_FLG", self.profile_factory)
        gene_sctn_flg = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_flg)

        # sctJ and its exchangeable sctJ_FLG
        c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, self.model)
        c_gene_sctj_flg = CoreGene(self.model_location, "sctJ_FLG", self.profile_factory)
        gene_sctj_flg = Exchangeable(c_gene_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctj_flg)

        # gspD and its exchangeable flgB
        c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, self.model)
        c_gene_flgb = CoreGene(self.model_location, "flgB", self.profile_factory)
        gene_gspd_an = Exchangeable(c_gene_flgb, gene_gspd)
        gene_gspd.add_exchangeable(gene_gspd_an)

        # abc and its exchangeable tadZ
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, self.model)
        c_gene_tadz = CoreGene(self.model_location, "tadZ", self.profile_factory)
        gene_abc_ho = Exchangeable(c_gene_tadz, gene_abc)
        gene_abc.add_exchangeable(gene_abc_ho)

        # toto and its exchangeable totote
        c_gene_toto = CoreGene(self.model_location, "toto", self.profile_factory)
        gene_toto = ModelGene(c_gene_toto, self.model)
        c_gene_totote = CoreGene(self.model_location, "totote", self.profile_factory)
        gene_toto_ho = Exchangeable(c_gene_totote, gene_toto)
        gene_toto.add_exchangeable(gene_toto_ho)

        # Register each reference gene in the model with its status.
        self.model.add_mandatory_gene(gene_sctn)
        self.model.add_mandatory_gene(gene_sctj)
        self.model.add_accessory_gene(gene_gspd)
        self.model.add_neutral_gene(gene_toto)
        self.model.add_forbidden_gene(gene_abc)

        # One hit per core gene; all hits share the same numeric fields and
        # differ only by core gene and hit id.
        self.c_hits = {
            'h_sctj': Hit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            'h_sctj_flg': Hit(c_gene_sctj_flg, "hit_sctj_flg", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            'h_sctn': Hit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            'h_sctn_flg': Hit(c_gene_sctn_flg, "hit_sctn_flg", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            'h_gspd': Hit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            'h_gspd_an': Hit(c_gene_flgb, "hit_gspd_an", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            'h_abc': Hit(c_gene_abc, "hit_abc", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            'h_abc_ho': Hit(c_gene_tadz, "hit_abc_ho", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            'h_toto': Hit(c_gene_toto, "hit_toto", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            'h_toto_ho': Hit(c_gene_totote, "hit_toto_ho", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        }

    def test_sort_hits_by_status(self):
        """
        ``sort_hits_by_status`` must return four lists, in the order
        (mandatory, accessory, neutral, forbidden), for hits on the reference
        genes and for hits on their exchangeables.
        """
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())

        # Hits on the reference genes.
        mandatory_exp = [self.c_hits['h_sctn'], self.c_hits['h_sctj']]
        accessory_exp = [self.c_hits['h_gspd']]
        neutral_exp = [self.c_hits['h_toto']]
        forbidden_exp = [self.c_hits['h_abc']]
        mandatory, accessory, neutral, forbidden = ordered_match_maker.sort_hits_by_status(
            mandatory_exp + accessory_exp + neutral_exp + forbidden_exp)
        # The returned objects are compared to the input hits through their gene name.
        self.assertListEqual([h.gene.name for h in mandatory_exp],
                             [h.gene.name for h in mandatory])
        self.assertListEqual([h.gene.name for h in accessory_exp],
                             [h.gene.name for h in accessory])
        self.assertListEqual([h.gene.name for h in neutral_exp],
                             [h.gene.name for h in neutral])
        self.assertListEqual([h.gene.name for h in forbidden_exp],
                             [h.gene.name for h in forbidden])

        # Do the same, but with hits on the exchangeables.
        mandatory_exp_exch = [
            self.c_hits['h_sctn_flg'], self.c_hits['h_sctj_flg']
        ]
        accessory_exp_exch = [self.c_hits['h_gspd_an']]
        neutral_exp_exch = [self.c_hits['h_toto_ho']]
        forbidden_exp_exch = [self.c_hits['h_abc_ho']]
        mandatory, accessory, neutral, forbidden = ordered_match_maker.sort_hits_by_status(
            mandatory_exp_exch + accessory_exp_exch + neutral_exp_exch + forbidden_exp_exch)
        self.assertListEqual([h.gene.name for h in mandatory_exp_exch],
                             [h.gene.name for h in mandatory])
        self.assertListEqual([h.gene.name for h in accessory_exp_exch],
                             [h.gene.name for h in accessory])
        self.assertListEqual([h.gene.name for h in neutral_exp_exch],
                             [h.gene.name for h in neutral])
        self.assertListEqual([h.gene.name for h in forbidden_exp_exch],
                             [h.gene.name for h in forbidden])

        # Check gene_ref on the results of the exchangeable call above:
        # gene_ref.alternate_of() must name the reference gene, i.e. the gene
        # of the corresponding hit in the first (non-exchangeable) lists.
        self.assertListEqual(
            [h.gene.name for h in mandatory_exp],
            [h.gene_ref.alternate_of().name for h in mandatory])
        self.assertListEqual(
            [h.gene.name for h in accessory_exp],
            [h.gene_ref.alternate_of().name for h in accessory])
        self.assertListEqual([h.gene.name for h in neutral_exp],
                             [h.gene_ref.alternate_of().name for h in neutral])
        self.assertListEqual(
            [h.gene.name for h in forbidden_exp],
            [h.gene_ref.alternate_of().name for h in forbidden])

    def test_ordered_match(self):
        """
        ``OrderedMatchMaker.match`` on one or several clusters.

        Each scenario sets the two quorum attributes on the model, builds the
        cluster(s), and checks that the result is either a ``System`` or a
        ``RejectedClusters`` carrying the expected ``reasons``.
        """
        #####################
        # test single locus #
        #####################

        # One mandatory gene is missing.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 3
        # NOTE(review): here the Cluster receives the value returned by
        # cfg.hit_weights() as is -- confirm this is what Cluster expects.
        c1 = Cluster([self.c_hits['h_sctj'], self.c_hits['h_gspd']],
                     self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1])
        self.assertIsInstance(res, RejectedClusters)
        self.assertEqual(res.reasons, [
            "The quorum of mandatory genes required (2) is not reached: 1",
            "The quorum of genes required (3) is not reached: 2"
        ])

        # Every quorum is reached.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        c1 = Cluster([
            self.c_hits['h_sctj'], self.c_hits['h_sctn'], self.c_hits['h_gspd']
        ], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1])
        self.assertIsInstance(res, System)

        # With one mandatory analog (sctJ replaced by the hit on sctJ_FLG).
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        c1 = Cluster([
            self.c_hits['h_sctj_flg'], self.c_hits['h_sctn'], self.c_hits['h_gspd']
        ], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1])
        self.assertIsInstance(res, System)

        # With one accessory analog (gspD replaced by the hit on flgB).
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        c1 = Cluster([
            self.c_hits['h_sctj'], self.c_hits['h_sctn'], self.c_hits['h_gspd_an']
        ], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1])
        self.assertIsInstance(res, System)

        # The min_genes_required quorum is not reached.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 4
        c1 = Cluster([
            self.c_hits['h_sctj'], self.c_hits['h_sctn_flg'], self.c_hits['h_gspd']
        ], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1])
        self.assertIsInstance(res, RejectedClusters)
        self.assertListEqual(
            res.reasons,
            ["The quorum of genes required (4) is not reached: 3"])

        # The min_genes_required quorum is still not reached when a neutral
        # gene is added: the expected count stays at 3.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 4
        c1 = Cluster([
            self.c_hits['h_sctj'], self.c_hits['h_sctn_flg'],
            self.c_hits['h_gspd'], self.c_hits['h_toto']
        ], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1])
        self.assertIsInstance(res, RejectedClusters)
        self.assertEqual(
            res.reasons,
            ["The quorum of genes required (4) is not reached: 3"])

        # Same scenario with the exchangeable of the neutral gene.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 4
        c1 = Cluster([
            self.c_hits['h_sctj'], self.c_hits['h_sctn_flg'],
            self.c_hits['h_gspd'], self.c_hits['h_toto_ho']
        ], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1])
        self.assertIsInstance(res, RejectedClusters)
        self.assertEqual(
            res.reasons,
            ["The quorum of genes required (4) is not reached: 3"])

        # The cluster contains a forbidden gene.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        c1 = Cluster([
            self.c_hits['h_sctj'], self.c_hits['h_sctn'],
            self.c_hits['h_gspd'], self.c_hits['h_abc']
        ], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1])
        self.assertIsInstance(res, RejectedClusters)
        self.assertEqual(res.reasons,
                         ["There is 1 forbidden genes occurrence(s): abc"])

        # The cluster contains the homolog of a forbidden gene; the reason
        # names the gene that was hit (tadZ), not the reference gene (abc).
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        c1 = Cluster([
            self.c_hits['h_sctj'], self.c_hits['h_sctn'],
            self.c_hits['h_gspd'], self.c_hits['h_abc_ho']
        ], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1])
        self.assertIsInstance(res, RejectedClusters)
        self.assertEqual(res.reasons,
                         ["There is 1 forbidden genes occurrence(s): tadZ"])

        #####################
        #  test multi loci  #
        #####################

        # Mandatory genes in one cluster, the accessory gene in another.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        c1 = Cluster([self.c_hits['h_sctj'], self.c_hits['h_sctn']],
                     self.model, self.cfg.hit_weights())
        c2 = Cluster([self.c_hits['h_gspd']], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1, c2])
        self.assertIsInstance(res, System)

        # With one analog and one homolog (only hits on exchangeables).
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        c1 = Cluster([self.c_hits['h_sctj_flg'], self.c_hits['h_sctn_flg']],
                     self.model, self.cfg.hit_weights())
        c2 = Cluster([self.c_hits['h_gspd_an']], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1, c2])
        self.assertIsInstance(res, System)

        # With one analog, one homolog and one forbidden gene, in 3 clusters.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        c1 = Cluster([self.c_hits['h_sctj_flg'], self.c_hits['h_sctn_flg']],
                     self.model, self.cfg.hit_weights())
        c2 = Cluster([self.c_hits['h_gspd']], self.model, self.cfg.hit_weights())
        c3 = Cluster([self.c_hits['h_abc']], self.model, self.cfg.hit_weights())
        ordered_match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())
        res = ordered_match_maker.match([c1, c2, c3])
        # NOTE(review): unlike the other rejection scenarios, the type of `res`
        # is not asserted here before reading `res.reasons`.
        self.assertEqual(res.reasons,
                         ["There is 1 forbidden genes occurrence(s): abc"])

    def test_unordered_match(self):
        """
        ``UnorderedMatchMaker.match`` on a flat list of hits.

        The result is expected to be a ``LikelySystem`` when the quorum is
        reached (forbidden hits are then kept in ``_forbidden_hits``) and an
        ``UnlikelySystem`` carrying ``reasons`` otherwise.
        """
        # One mandatory gene is missing.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 3
        hits = [self.c_hits['h_sctj'], self.c_hits['h_gspd']]
        unordered_match_maker = UnorderedMatchMaker(self.model)
        res = unordered_match_maker.match(hits)
        self.assertIsInstance(res, UnlikelySystem)
        self.assertEqual(res.reasons, [
            "The quorum of mandatory genes required (2) is not reached: 1",
            "The quorum of genes required (3) is not reached: 2"
        ])

        # Every quorum is reached.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        hits = [
            self.c_hits['h_sctj'], self.c_hits['h_sctn'], self.c_hits['h_gspd']
        ]
        unordered_match_maker = UnorderedMatchMaker(self.model)
        res = unordered_match_maker.match(hits)
        self.assertIsInstance(res, LikelySystem)

        # With one mandatory analog.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        hits = [
            self.c_hits['h_sctj_flg'], self.c_hits['h_sctn'], self.c_hits['h_gspd']
        ]
        unordered_match_maker = UnorderedMatchMaker(self.model)
        res = unordered_match_maker.match(hits)
        self.assertIsInstance(res, LikelySystem)

        # With one accessory analog.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        hits = [
            self.c_hits['h_sctj'], self.c_hits['h_sctn'], self.c_hits['h_gspd_an']
        ]
        unordered_match_maker = UnorderedMatchMaker(self.model)
        res = unordered_match_maker.match(hits)
        self.assertIsInstance(res, LikelySystem)

        # The min_genes_required quorum is not reached.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 4
        hits = [
            self.c_hits['h_sctj'], self.c_hits['h_sctn_flg'], self.c_hits['h_gspd']
        ]
        unordered_match_maker = UnorderedMatchMaker(self.model)
        res = unordered_match_maker.match(hits)
        self.assertIsInstance(res, UnlikelySystem)
        self.assertEqual(
            res.reasons,
            ["The quorum of genes required (4) is not reached: 3"])

        # The min_genes_required quorum is still not reached when a neutral
        # gene is added: the expected count stays at 3.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 4
        hits = [
            self.c_hits['h_sctj'], self.c_hits['h_sctn_flg'],
            self.c_hits['h_gspd'], self.c_hits['h_toto']
        ]
        unordered_match_maker = UnorderedMatchMaker(self.model)
        res = unordered_match_maker.match(hits)
        self.assertIsInstance(res, UnlikelySystem)
        self.assertEqual(
            res.reasons,
            ["The quorum of genes required (4) is not reached: 3"])

        # Same scenario with the exchangeable of the neutral gene.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 4
        hits = [
            self.c_hits['h_sctj'], self.c_hits['h_sctn_flg'],
            self.c_hits['h_gspd'], self.c_hits['h_toto_ho']
        ]
        unordered_match_maker = UnorderedMatchMaker(self.model)
        res = unordered_match_maker.match(hits)
        self.assertIsInstance(res, UnlikelySystem)
        self.assertEqual(
            res.reasons,
            ["The quorum of genes required (4) is not reached: 3"])

        # The hits contain a forbidden gene: the result is still a
        # LikelySystem, and the forbidden hit is listed among its hits.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        allowed_hits = [
            self.c_hits['h_sctj'], self.c_hits['h_sctn'], self.c_hits['h_gspd']
        ]
        forbidden_hits = [self.c_hits['h_abc']]
        unordered_match_maker = UnorderedMatchMaker(self.model)
        res = unordered_match_maker.match(allowed_hits + forbidden_hits)
        self.assertIsInstance(res, LikelySystem)
        self.assertListEqual([(h.id, h.position) for h in res.hits],
                             [(h.id, h.position) for h in allowed_hits + forbidden_hits])
        self.assertListEqual(res._forbidden_hits, [self.c_hits['h_abc']])

        # The hits contain the homolog of a forbidden gene.
        self.model._min_mandatory_genes_required = 2
        self.model._min_genes_required = 1
        hits = [
            self.c_hits['h_sctj'], self.c_hits['h_sctn'],
            self.c_hits['h_gspd'], self.c_hits['h_abc_ho']
        ]
        unordered_match_maker = UnorderedMatchMaker(self.model)
        res = unordered_match_maker.match(hits)
        self.assertIsInstance(res, LikelySystem)
        self.assertListEqual(res._forbidden_hits, [self.c_hits['h_abc_ho']])
class SerializationTest(MacsyTest):
    """
    Tests for the text / TSV serializers of systems, solutions,
    likely / unlikely systems and special (loner) hits.

    NOTE(review): the triple-quoted expected strings in this class are
    reproduced exactly as they appear in the whitespace-collapsed view of
    this file; their original line breaks could not be recovered and should
    be confirmed against the serializers' real output.
    """

    def setUp(self) -> None:
        """
        Build the configuration, model location, profile factory and hit
        weights shared by the tests, and restart the id counters.
        """
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        self.cfg = Config(MacsyDefaults(), args)
        self.model_name = 'foo'
        self.model_location = ModelLocation(
            path=os.path.join(args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)
        self.hit_weights = HitWeight(**self.cfg.hit_weights())
        # Restart the unique-id counters of System and AbstractUnordered so
        # that the ids of (Likely/Unlikely)Systems are predictable: the
        # expected texts below contain e.g. "replicon_id_FOO_1".
        System._id = itertools.count(1)
        AbstractUnordered._id = itertools.count(1)

    def test_SystemSerializer_str(self):
        """
        ``TxtSystemSerializer.serialize`` on a two-cluster system of model
        ``foo/A`` whose gspD hit is also used by a system of model ``foo/B``
        (reported as ``gspD [sys_id_B]`` in the expected text).
        """
        model_name = 'foo'
        model_location = ModelLocation(
            path=os.path.join(self.cfg.models_dir()[0], model_name))
        model_A = Model("foo/A", 10)
        model_B = Model("foo/B", 10)

        # Genes owned by model B.
        c_gene_sctn_flg = CoreGene(model_location, "sctN_FLG", self.profile_factory)
        gene_sctn_flg = ModelGene(c_gene_sctn_flg, model_B)
        c_gene_sctj_flg = CoreGene(model_location, "sctJ_FLG", self.profile_factory)
        gene_sctj_flg = ModelGene(c_gene_sctj_flg, model_B)
        c_gene_flgB = CoreGene(model_location, "flgB", self.profile_factory)
        c_gene_tadZ = CoreGene(model_location, "tadZ", self.profile_factory)
        gene_tadZ = ModelGene(c_gene_tadZ, model_B)

        # Genes owned by model A, each with an exchangeable built on one of
        # the core genes above.
        c_gene_sctn = CoreGene(model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model_A)
        gene_sctn_hom = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_hom)
        c_gene_sctj = CoreGene(model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model_A)
        gene_sctj_an = Exchangeable(c_gene_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctj_an)
        c_gene_gspd = CoreGene(model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model_A)
        gene_gspd_an = Exchangeable(c_gene_flgB, gene_gspd)
        gene_gspd.add_exchangeable(gene_gspd_an)
        c_gene_abc = CoreGene(model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model_A)
        gene_abc_ho = Exchangeable(c_gene_tadZ, gene_abc)
        gene_abc.add_exchangeable(gene_abc_ho)

        model_A.add_mandatory_gene(gene_sctn)
        model_A.add_mandatory_gene(gene_sctj)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_forbidden_gene(gene_abc)

        # gene_gspd (created for model A) is also registered in model B.
        model_B.add_mandatory_gene(gene_sctn_flg)
        model_B.add_mandatory_gene(gene_sctj_flg)
        model_B.add_accessory_gene(gene_gspd)
        model_B.add_accessory_gene(gene_tadZ)

        h_sctj = CoreHit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_sctn = CoreHit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_gspd = CoreHit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_sctj_flg = CoreHit(c_gene_sctj_flg, "hit_sctj_flg", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_tadZ = CoreHit(c_gene_tadZ, "hit_tadZ", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)

        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        c1 = Cluster([
            ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
            ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY),
            ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)
        ], model_A, self.hit_weights)
        c2 = Cluster([
            ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
            ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)
        ], model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        c3 = Cluster([
            ModelHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY),
            ModelHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY),
            ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)
        ], model_B, self.hit_weights)

        # The ids are overwritten so the expected text does not depend on
        # the id counter.
        sys_A = System(model_A, [c1, c2], self.cfg.redundancy_penalty())
        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [c3], self.cfg.redundancy_penalty())
        sys_B.id = "sys_id_B"
        hit_multi_sys_tracker = HitSystemTracker([sys_A, sys_B])
        system_serializer = TxtSystemSerializer()

        sys_str = f"""system id = {sys_A.id} model = foo/A replicon = replicon_id clusters = [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1), ('hit_gspd', 'gspD', 1)], [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1)] occ = 2 wholeness = 1.000 loci nb = 2 score = 1.500 mandatory genes: \t- sctN: 2 (sctN, sctN) \t- sctJ: 2 (sctJ, sctJ) accessory genes: \t- gspD: 1 (gspD [sys_id_B]) neutral genes: """
        self.assertEqual(
            sys_str,
            system_serializer.serialize(sys_A, hit_multi_sys_tracker))

    def test_SystemSerializer_tsv(self):
        """
        ``TsvSystemSerializer.serialize`` on a system made of one regular
        cluster and one cluster holding a single ``Loner`` hit.

        In the expected rows the loner is reported with ``-1`` in the column
        where the other hits have ``1``, and its counterpart ``h_sctn_flg``
        appears in the second-to-last column.
        """
        model = Model("foo/T2SS", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        # sctN is declared as a loner and has sctN_FLG as exchangeable.
        c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model, loner=True)
        c_gene_sctn_flg = CoreGene(self.model_location, "sctN_FLG", self.profile_factory)
        gene_sctn_flg = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_flg)
        model.add_accessory_gene(gene_sctn)

        # CoreHit argument order, as noted by the original author:
        # CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #         profile_coverage, sequence_coverage, begin_match, end_match)
        ch_gspd = CoreHit(c_gene_gspd, "h_gspd", 803, "replicon_id", 10, 1.0, 1.0, 1.0, 1.0, 10, 20)
        mh_gspd = ModelHit(ch_gspd, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
        ch_sctj = CoreHit(c_gene_sctj, "h_sctj", 803, "replicon_id", 20, 1.0, 1.0, 1.0, 1.0, 20, 30)
        mh_sctj = ModelHit(ch_sctj, gene_ref=gene_sctj, gene_status=GeneStatus.ACCESSORY)
        ch_sctn_flg = CoreHit(c_gene_sctn_flg, "h_sctn_flg", 803, "replicon_id", 40, 1.0, 1.0, 1.0, 1.0, 30, 40)
        mh_sctn_flg = ModelHit(ch_sctn_flg, gene_ref=gene_sctn_flg, gene_status=GeneStatus.ACCESSORY)
        ch_sctn = CoreHit(c_gene_sctn, "h_sctn", 803, "replicon_id", 80, 1.0, 1.0, 1.0, 1.0, 30, 40)
        mh_sctn = Loner(ch_sctn, gene_ref=gene_sctn, gene_status=GeneStatus.ACCESSORY,
                        counterpart=[mh_sctn_flg])

        c1 = Cluster([mh_gspd, mh_sctj], model, self.hit_weights)
        c2 = Cluster([mh_sctn], model, self.hit_weights)
        # Author's note on the score: 1.5 and .35, i.e. 1.85 (the "1.850"
        # column of the expected rows below).
        sys_multi_loci = System(model, [c1, c2], self.cfg.redundancy_penalty())
        hit_multi_sys_tracker = HitSystemTracker([sys_multi_loci])
        system_serializer = TsvSystemSerializer()

        sys_tsv = "\t".join([
            "replicon_id", "h_gspd", "gspD", "10", "foo/T2SS", sys_multi_loci.id,
            "1", "1", "1.000", "1.850", "1", "gspD", "mandatory", "803", "1.0",
            "1.000", "1.000", "1.000", "10", "20", "", ""
        ])
        sys_tsv += "\n"
        sys_tsv += "\t".join([
            "replicon_id", "h_sctj", "sctJ", "20", "foo/T2SS", sys_multi_loci.id,
            "1", "1", "1.000", "1.850", "1", "sctJ", "accessory", "803", "1.0",
            "1.000", "1.000", "1.000", "20", "30", "", ""
        ])
        sys_tsv += "\n"
        sys_tsv += "\t".join([
            "replicon_id", "h_sctn", "sctN", "80", "foo/T2SS", sys_multi_loci.id,
            "1", "-1", "1.000", "1.850", "1", "sctN", "accessory", "803", "1.0",
            "1.000", "1.000", "1.000", "30", "40", "h_sctn_flg", ""
        ])
        sys_tsv += "\n"
        # Show the full diff on failure.
        self.maxDiff = None
        self.assertEqual(
            sys_tsv,
            system_serializer.serialize(sys_multi_loci, hit_multi_sys_tracker))

    def test_SolutionSerializer_tsv(self):
        """
        ``TsvSolutionSerializer.serialize`` on a solution of two systems.

        The expected text has one row per hit, each row starting with the
        solution id, and an empty line after the rows of each system.
        """
        model_name = 'foo'
        model_location = ModelLocation(
            path=os.path.join(self.cfg.models_dir()[0], model_name))

        ###########
        # Model B #
        ###########
        model_B = Model("foo/B", 10)
        c_gene_sctn_flg = CoreGene(model_location, "sctN_FLG", self.profile_factory)
        gene_sctn_flg = ModelGene(c_gene_sctn_flg, model_B)
        c_gene_sctj_flg = CoreGene(model_location, "sctJ_FLG", self.profile_factory)
        gene_sctj_flg = ModelGene(c_gene_sctj_flg, model_B)
        c_gene_flgB = CoreGene(model_location, "flgB", self.profile_factory)
        gene_flgB = ModelGene(c_gene_flgB, model_B)
        c_gene_tadZ = CoreGene(model_location, "tadZ", self.profile_factory)
        gene_tadZ = ModelGene(c_gene_tadZ, model_B)

        model_B.add_mandatory_gene(gene_sctn_flg)
        model_B.add_mandatory_gene(gene_sctj_flg)
        model_B.add_accessory_gene(gene_flgB)
        model_B.add_accessory_gene(gene_tadZ)

        ###########
        # Model A #
        ###########
        model_A = Model("foo/A", 10)
        c_gene_sctn = CoreGene(model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model_A)
        gene_sctn_hom = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_hom)
        c_gene_sctj = CoreGene(model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model_A)
        gene_sctj_an = Exchangeable(c_gene_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctj_an)
        c_gene_gspd = CoreGene(model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model_A)
        gene_gspd_an = Exchangeable(c_gene_flgB, gene_gspd)
        gene_gspd.add_exchangeable(gene_gspd_an)
        # abc is declared as a loner in model A.
        c_gene_abc = CoreGene(model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model_A, loner=True)
        gene_abc_ho = Exchangeable(c_gene_tadZ, gene_abc)
        gene_abc.add_exchangeable(gene_abc_ho)

        model_A.add_mandatory_gene(gene_sctn)
        model_A.add_mandatory_gene(gene_sctj)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_accessory_gene(gene_abc)

        # CoreHit argument order, as noted by the original author:
        # CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #         profile_coverage, sequence_coverage, begin_match, end_match)
        h_sctj = CoreHit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        mh_sctj = ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY)
        h_sctn = CoreHit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
        mh_sctn = ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)
        h_gspd = CoreHit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
        mh_gspd = ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)

        h_sctj_flg = CoreHit(c_gene_sctj_flg, "hit_sctj_flg", 803, "replicon_id", 10, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_flgB = CoreHit(c_gene_flgB, "hit_flgB", 803, "replicon_id", 11, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_abc = CoreHit(c_gene_abc, "hit_abc", 803, "replicon_id", 20, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_abc2 = CoreHit(c_gene_abc, "hit_abc2", 803, "replicon_id", 50, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_tadZ = CoreHit(c_gene_tadZ, "hit_tadZ", 803, "replicon_id", 40, 1.0, 1.0, 1.0, 1.0, 10, 20)

        mh_sctj_flg = ModelHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY)
        mh_flgB = ModelHit(h_flgB, gene_flgB, GeneStatus.ACCESSORY)
        # mh_abc is built but not placed in any cluster below; the abc hit
        # enters the system through the Loner of cluster c3.
        mh_abc = ModelHit(h_abc, gene_abc, GeneStatus.ACCESSORY)
        mh_abc2 = ModelHit(h_abc2, gene_abc, GeneStatus.ACCESSORY)
        mh_tadZ = ModelHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY)

        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        c1 = Cluster([mh_sctj, mh_sctn, mh_gspd], model_A, self.hit_weights)
        c2 = Cluster([mh_sctj, mh_sctn], model_A, self.hit_weights)
        c3 = Cluster([
            Loner(h_abc, gene_ref=gene_abc, gene_status=GeneStatus.ACCESSORY,
                  counterpart=[mh_abc2])
        ], model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        c5 = Cluster([mh_sctj_flg, mh_tadZ, mh_flgB], model_B, self.hit_weights)

        sys_A = System(model_A, [c1, c2, c3], self.cfg.redundancy_penalty())
        # author's score note: 2.5, 2 , 0.35 = 4.85 - (2 * 1.5) = 1.85
        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [c5], self.cfg.redundancy_penalty())
        # author's score note: 2.0
        sys_B.id = "sys_id_B"

        sol = Solution([sys_A, sys_B])
        sol_id = '12'
        hit_multi_sys_tracker = HitSystemTracker([sys_A, sys_B])
        sol_serializer = TsvSolutionSerializer()

        # --- rows of sys_A: cluster c1, cluster c2, then the loner ---
        sol_tsv = '\t'.join([
            sol_id, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A', 'sys_id_A',
            '2', '1', '1.000', '1.850', '2', 'sctJ', 'mandatory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctn', 'sctN', '2', 'foo/A', 'sys_id_A',
            '2', '1', '1.000', '1.850', '2', 'sctN', 'mandatory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_gspd', 'gspD', '3', 'foo/A', 'sys_id_A',
            '2', '1', '1.000', '1.850', '2', 'gspD', 'accessory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A', 'sys_id_A',
            '2', '2', '1.000', '1.850', '2', 'sctJ', 'mandatory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctn', 'sctN', '2', 'foo/A', 'sys_id_A',
            '2', '2', '1.000', '1.850', '2', 'sctN', 'mandatory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_abc', 'abc', '20', 'foo/A', 'sys_id_A',
            '2', '-1', '1.000', '1.850', '2', 'abc', 'accessory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', 'hit_abc2', ''
        ])
        sol_tsv += "\n"
        # empty line closing sys_A
        sol_tsv += "\n"

        # --- rows of sys_B; expected in position order (10, 11, 40), which
        # differs from the order the hits were given to cluster c5 ---
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctj_flg', 'sctJ_FLG', '10', 'foo/B',
            'sys_id_B', '1', '1', '0.750', '2.000', '1', 'sctJ_FLG', 'mandatory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_flgB', 'flgB', '11', 'foo/B', 'sys_id_B',
            '1', '1', '0.750', '2.000', '1', 'flgB', 'accessory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_tadZ', 'tadZ', '40', 'foo/B', 'sys_id_B',
            '1', '1', '0.750', '2.000', '1', 'tadZ', 'accessory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        # empty line closing sys_B
        sol_tsv += "\n"

        ser = sol_serializer.serialize(sol, sol_id, hit_multi_sys_tracker)
        # Show the full diff on failure.
        self.maxDiff = None
        self.assertEqual(ser, sol_tsv)

    def test_LikelySystemSerializer_txt(self):
        """
        ``TxtLikelySystemSerializer.serialize`` on a ``LikelySystem`` holding
        one mandatory, two accessory, no neutral and one forbidden hit.
        """
        model = Model("foo/FOO", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model)
        model.add_forbidden_gene(gene_abc)

        # Hits at positions 1 to 4, one per gene.
        hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
        hit_4 = CoreHit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_4 = ModelHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)

        # Hit lists are passed as: mandatory, accessory, neutral, forbidden
        # (order read from the statuses of the hits given here).
        ls_1 = LikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [v_hit_4])
        hit_multi_sys_tracker = HitSystemTracker([ls_1])
        ser = TxtLikelySystemSerializer()

        txt = ser.serialize(ls_1, hit_multi_sys_tracker)
        expected_txt = """This replicon contains genetic materials needed for system foo/FOO WARNING there quorum is reached but there is also some forbidden genes. system id = replicon_id_FOO_1 model = foo/FOO replicon = replicon_id hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)] wholeness = 1.000 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) \t- sctN: 1 (sctN) neutral genes: forbidden genes: \t- abc: 1 (abc) Use ordered replicon to have better prediction. """
        self.assertEqual(txt, expected_txt)

    def test_UnlikelySystemSerializer_txt(self):
        """
        ``TxtUnikelySystemSerializer.serialize`` on an ``UnlikelySystem``
        built from the same hits as the likely-system test plus one reason.
        """
        model = Model("foo/FOO", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model)
        model.add_forbidden_gene(gene_abc)

        # Hits at positions 1 to 4, one per gene.
        hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
        hit_4 = CoreHit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_4 = ModelHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)

        ser = TxtUnikelySystemSerializer()
        # The last argument is the list of reasons; it shows up after
        # "foo/FOO:" in the expected text.
        ls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [v_hit_4],
                              ["the reason why"])
        # Unlike the likely-system serializer, no hit tracker is passed here.
        txt = ser.serialize(ls_1)
        expected_txt = """This replicon probably not contains a system foo/FOO: the reason why system id = replicon_id_FOO_1 model = foo/FOO replicon = replicon_id hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)] wholeness = 1.000 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) \t- sctN: 1 (sctN) neutral genes: forbidden genes: \t- abc: 1 (abc) Use ordered replicon to have better prediction. """
        self.assertEqual(txt, expected_txt)

    def test_SpecialHitSerializer_tsv(self):
        """
        ``TsvSpecialHitSerializer.serialize`` on two ``Loner`` hits of the
        same gene: a header line followed by one row per loner.
        """
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        cfg = Config(MacsyDefaults(), args)
        model_name = 'foo'
        models_location = ModelLocation(
            path=os.path.join(args.models_dir, model_name))

        # A fresh ProfileFactory is created for this test: it behaves like a
        # singleton, so a factory configured by another test would leak its
        # configuration here (e.g. search_genes getting a profile without
        # hmmer_exe).
        profile_factory = ProfileFactory(cfg)

        model = Model("foo/T2SS", 10)
        gene_name = "gspD"
        cg_gspd = CoreGene(models_location, gene_name, profile_factory)
        mg_gspd = ModelGene(cg_gspd, model, loner=True)
        gene_name = "sctJ"
        cg_sctj = CoreGene(models_location, gene_name, profile_factory)
        mg_sctj = ModelGene(cg_sctj, model)
        gene_name = "abc"
        cg_abc = CoreGene(models_location, gene_name, profile_factory)
        mg_abc = ModelGene(cg_abc, model)

        model.add_mandatory_gene(mg_gspd)
        model.add_accessory_gene(mg_sctj)
        model.add_accessory_gene(mg_abc)

        chit_abc = CoreHit(cg_abc, "hit_abc", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
        chit_sctj = CoreHit(cg_sctj, "hit_sctj", 803, "replicon_id", 4, 1.0, 1.0, 1.0, 1.0, 10, 20)
        # The two gspD hits differ by position (20 / 30) and score (2.0 / 3.0).
        chit_gspd1 = CoreHit(cg_gspd, "hit_gspd1", 803, "replicon_id", 20, 1.0, 2.0, 1.0, 1.0, 10, 20)
        chit_gspd2 = CoreHit(cg_gspd, "hit_gspd2", 803, "replicon_id", 30, 1.0, 3.0, 1.0, 1.0, 10, 20)
        # mhit_abc and mhit_sctj are built but not passed to the serializer.
        mhit_abc = ModelHit(chit_abc, mg_abc, GeneStatus.ACCESSORY)
        mhit_sctj = ModelHit(chit_sctj, mg_sctj, GeneStatus.ACCESSORY)
        mhit_gspd1 = ModelHit(chit_gspd1, mg_gspd, GeneStatus.MANDATORY)
        mhit_gspd2 = ModelHit(chit_gspd2, mg_gspd, GeneStatus.MANDATORY)
        # Each loner names the other gspD hit as its counterpart.
        l_gspd1 = Loner(mhit_gspd1, counterpart=[mhit_gspd2])
        l_gspd2 = Loner(mhit_gspd2, counterpart=[mhit_gspd1])

        ser = TsvSpecialHitSerializer()
        txt = ser.serialize([l_gspd1, l_gspd2])

        expected_txt = "\t".join([
            'replicon', 'model_fqn', 'function', 'gene_name', 'hit_id',
            'hit_pos', 'hit_status', 'hit_seq_len', 'hit_i_eval', 'hit_score',
            'hit_profile_cov', 'hit_seq_cov', 'hit_begin_match', 'hit_end_match'
        ])
        expected_txt += "\n"
        expected_txt += "\t".join([
            'replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd1', '20',
            'mandatory', '803', '1.000e+00', '2.000', '1.000', '1.000', '10', '20'
        ])
        expected_txt += "\n"
        expected_txt += "\t".join([
            'replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd2', '30',
            'mandatory', '803', '1.000e+00', '3.000', '1.000', '1.000', '10', '20'
        ])
        expected_txt += "\n"
        # Show the full diff on failure.
        self.maxDiff = None
        self.assertEqual(txt, expected_txt)
def test_functional_dark_theme(self):
    """
    Run ``macsyconfig --defaults`` in a scratch directory, then check the text
    printed on stdout and the configuration file it generates.

    The generated ``macsyfinder.conf`` is parsed back through :class:`Config`.
    Each option is compared with the :class:`MacsyDefaults` value, except the
    options listed below as not set by macsyconfig, and ``cfg_file`` which
    must be the generated file name.
    """
    cur_dir = os.getcwd()
    tmpdir = os.path.join(tempfile.gettempdir(), 'tmp-macsyconfig')
    if os.path.exists(tmpdir):
        # start from a clean scratch directory
        shutil.rmtree(tmpdir)
    os.mkdir(tmpdir)
    try:
        # Everything after the directory creation is inside the try block, so
        # the working directory is restored and the scratch directory removed
        # even if the setup below fails.
        os.chdir(tmpdir)
        cf_default = MacsyDefaults(system_models_dir='')
        cf_args = argparse.Namespace(cfg_file="macsyfinder.conf")

        with self.catch_io(out=True):
            macsyconfig_args = "macsyconfig --defaults"
            msf_cfg.main(macsyconfig_args.split()[1:])
            stdout = sys.stdout.getvalue()

        # NOTE(review): this literal is reproduced exactly as it appears in
        # this file; confirm its line breaks against the real macsyconfig output.
        expected_stdout = f"""Welcome to the MacSyFinder {msf_vers} configuration utility. Please enter values for the following settings (just press Enter to accept a default value, if one is given in brackets). Configuring directories options: Configuring hmmer options: Configuring score_opt options: Configuring general options: Configuring base options: A configuration file 'macsyfinder.conf' has been generated.. Place it in canonical location * in /etc/macsyfinder for system wide configuration (must named macsyfinder.conf) * in <VIRTUALENV>/etc if you use a virtualenv (must named macsyfinder.conf) * in ~/.macsyfinder for user wide configuration (must named macsyfinder.conf) * where you run the analysis for local configuration (must named macsyfinder.conf) * you can also put anywhere on the filesystems and use MACSY_CONF environment variable to indicate where to find it or specify it on the macsyfinder command line with option --cfg-file can be named as you want. """
        # I don't know why, but the ANSI color escape sequences are removed
        # when stdout is captured.
        self.maxDiff = None
        self.assertEqual(stdout, expected_stdout)

        # reparse the generated config file and check the result
        cfg = Config(cf_default, cf_args)
        not_set_by_macsyconfig = (
            'inter_gene_max_space', 'max_nb_genes', 'min_genes_required',
            'min_mandatory_genes_required', 'multi_loci', 'models_dir', 'out_dir',
        )
        for opt_name in cf_default.keys():
            if opt_name in not_set_by_macsyconfig:
                # not set in macsyconfig
                continue
            opt_value = getattr(cfg, opt_name)()
            if opt_name == 'cfg_file':
                self.assertEqual(opt_value, "macsyfinder.conf")
            else:
                self.assertEqual(
                    opt_value,
                    cf_default[opt_name],
                    msg=f"{opt_name}: {opt_value} {cf_default[opt_name]}")

        # NOTE(review): some tests in this file expect a 'loner_multi_system'
        # key from hit_weights() instead of 'out_of_cluster'; confirm which
        # key the current Config exposes.
        self.assertDictEqual(
            cfg.hit_weights(),
            {
                'mandatory': cf_default['mandatory_weight'],
                'accessory': cf_default['accessory_weight'],
                'neutral': cf_default['neutral_weight'],
                'itself': cf_default['itself_weight'],
                'exchangeable': cf_default['exchangeable_weight'],
                'out_of_cluster': cf_default['out_of_cluster_weight']
            })
    finally:
        os.chdir(cur_dir)
        shutil.rmtree(tmpdir)
class TestRejectedCluster(MacsyTest):
    """Tests for :class:`RejectedClusters`: construction, ``str()`` rendering and ``hits``."""

    def setUp(self) -> None:
        """Create the configuration, model location, profile factory and hit weights used by every test."""
        self.args = argparse.Namespace()
        self.args.sequence_db = self.find_data("base", "test_1.fasta")
        self.args.db_type = 'gembase'
        self.args.models_dir = self.find_data('models')
        self.args.res_search_dir = "blabla"

        self.cfg = Config(MacsyDefaults(), self.args)

        self.model_name = 'foo'
        self.model_location = ModelLocation(
            path=os.path.join(self.args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)
        # hit_weights() returns a dict: unpack it into keyword arguments (as the
        # other tests of this file do) rather than passing the dict itself as
        # the first positional field of HitWeight.
        self.hit_weights = HitWeight(**self.cfg.hit_weights())

    def test_init(self):
        """A RejectedClusters built from one cluster exposes it through ``clusters`` and keeps its reasons."""
        model = Model("foo/T2SS", 11)

        c_gene_1 = CoreGene(self.model_location, "gspD", self.profile_factory)
        gene_1 = ModelGene(c_gene_1, model)
        model.add_mandatory_gene(gene_1)

        c_gene_2 = CoreGene(self.model_location, "sctC", self.profile_factory)
        gene_2 = ModelGene(c_gene_2, model)
        model.add_accessory_gene(gene_2)

        # Hit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #     profile_coverage, sequence_coverage, begin_match, end_match)
        # NOTE(review): names come from the original note, minus a `model`
        # entry that these calls do not pass; confirm against Hit.
        h10 = Hit(c_gene_1, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20)
        v_h10 = ValidHit(h10, gene_1, GeneStatus.MANDATORY)
        h20 = Hit(c_gene_2, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20)
        v_h20 = ValidHit(h20, gene_2, GeneStatus.ACCESSORY)
        c1 = Cluster([v_h10, v_h20], model, self.hit_weights)

        # A bare Cluster (not a list) is passed; the assertion below expects
        # it to be exposed as a one-element list.
        r_c = RejectedClusters(model, c1, ["bla"])
        self.assertListEqual(r_c.clusters, [c1])
        self.assertEqual(r_c.reasons, ['bla'])

    def test_str(self):
        """``str()`` of a RejectedClusters lists each cluster, then the rejection reasons."""
        model = Model("foo/T2SS", 11)

        c_gene_1 = CoreGene(self.model_location, "gspD", self.profile_factory)
        gene_1 = ModelGene(c_gene_1, model)
        model.add_mandatory_gene(gene_1)

        c_gene_2 = CoreGene(self.model_location, "sctC", self.profile_factory)
        gene_2 = ModelGene(c_gene_2, model)
        model.add_accessory_gene(gene_2)

        # Hit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #     profile_coverage, sequence_coverage, begin_match, end_match)
        # NOTE(review): names come from the original note, minus a `model`
        # entry that these calls do not pass; confirm against Hit.
        h10 = Hit(c_gene_1, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20)
        v_h10 = ValidHit(h10, gene_1, GeneStatus.MANDATORY)
        h20 = Hit(c_gene_2, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20)
        v_h20 = ValidHit(h20, gene_2, GeneStatus.ACCESSORY)
        h40 = Hit(c_gene_1, "h40", 10, "replicon_1", 40, 1.0, 10.0, 1.0, 1.0, 10, 20)
        v_h40 = ValidHit(h40, gene_1, GeneStatus.MANDATORY)
        h50 = Hit(c_gene_2, "h50", 10, "replicon_1", 50, 1.0, 20.0, 1.0, 1.0, 10, 20)
        v_h50 = ValidHit(h50, gene_2, GeneStatus.ACCESSORY)

        c1 = Cluster([v_h10, v_h20], model, self.hit_weights)
        c2 = Cluster([v_h40, v_h50], model, self.hit_weights)
        r_c = RejectedClusters(model, [c1, c2], ["bla"])

        # NOTE(review): this literal is reproduced exactly as it appears in
        # this file; confirm its line breaks against str(RejectedClusters).
        expected_str = """Cluster: - model = T2SS - replicon = replicon_1 - hits = (h10, gspD, 10), (h20, sctC, 20) Cluster: - model = T2SS - replicon = replicon_1 - hits = (h40, gspD, 40), (h50, sctC, 50) These clusters have been rejected because: \t- bla """
        self.assertEqual(expected_str, str(r_c))

    def test_hits(self):
        """``hits`` returns the hits of all rejected clusters, in cluster order."""
        model = Model("foo/T2SS", 10)

        c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)

        c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)

        c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)

        hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = Hit(c_gene_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = Hit(c_gene_sctn, "hit_3", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        v_hit_3 = ValidHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)

        rc = RejectedClusters(
            model,
            [
                Cluster([v_hit_1, v_hit_2], model, self.hit_weights),
                Cluster([v_hit_3], model, self.hit_weights),
            ],
            ["bla bla"])

        self.assertEqual(rc.hits, [v_hit_1, v_hit_2, v_hit_3])
        self.assertEqual(rc.reasons, ["bla bla"])