def test_contains(self):
    """Check ``in`` on a Cluster: a hit it was built from is a member, another hit is not."""
    t2ss = Model("foo/T2SS", 11)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    core_sctc = CoreGene(self.model_location, "sctC", self.profile_factory)
    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    gspd = ModelGene(core_gspd, t2ss)
    sctc = ModelGene(core_sctc, t2ss)
    sctj = ModelGene(core_sctj, t2ss)

    # Hit positional arguments as used here: core gene, hit id, sequence length,
    # replicon name, position, i_eval, score, profile coverage, sequence coverage,
    # begin match, end match.
    member_10 = ValidHit(
        Hit(core_gspd, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20),
        gspd, GeneStatus.MANDATORY)
    member_20 = ValidHit(
        Hit(core_sctc, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20),
        sctc, GeneStatus.MANDATORY)
    outsider_30 = ValidHit(
        Hit(core_sctj, "h30", 10, "replicon_1", 30, 1.0, 30.0, 1.0, 1.0, 10, 20),
        sctj, GeneStatus.ACCESSORY)
    member_50 = ValidHit(
        Hit(core_sctj, "h50", 10, "replicon_1", 50, 1.0, 50.0, 1.0, 1.0, 10, 20),
        sctj, GeneStatus.ACCESSORY)

    # outsider_30 is deliberately left out of the cluster.
    cluster = Cluster([member_10, member_20, member_50], t2ss, self.hit_weights)

    self.assertIn(member_10, cluster)
    self.assertNotIn(outsider_30, cluster)
def test_hits(self):
    """Check that LikelySystem.hits returns the hits it was given, in the order they were passed."""
    t2ss = Model("foo/T2SS", 10)

    # Build the three genes first and register each one in the model.
    core_genes = {}
    model_genes = {}
    for gene_name, register in (("gspD", t2ss.add_mandatory_gene),
                                ("sctJ", t2ss.add_accessory_gene),
                                ("sctN", t2ss.add_accessory_gene)):
        core_genes[gene_name] = CoreGene(self.model_location, gene_name, self.profile_factory)
        model_genes[gene_name] = ModelGene(core_genes[gene_name], t2ss)
        register(model_genes[gene_name])

    # Then one valid hit per gene; only the id, gene and status differ between hits.
    first, second, third = [
        ValidHit(
            Hit(core_genes[gene_name], hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            model_genes[gene_name], status)
        for hit_id, gene_name, status in (("hit_1", "gspD", GeneStatus.MANDATORY),
                                          ("hit_2", "sctJ", GeneStatus.ACCESSORY),
                                          ("hit_3", "sctN", GeneStatus.ACCESSORY))
    ]

    likely = LikelySystem(t2ss, [first], [second, third], [], [])
    self.assertListEqual(likely.hits, [first, second, third])
def test_fulfilled_function(self):
    """Check Cluster.fulfilled_function for a direct hit, a missing gene and an exchangeable hit."""
    t2ss = Model("foo/T2SS", 11)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    core_sctc = CoreGene(self.model_location, "sctC", self.profile_factory)
    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    core_sctj_flg = CoreGene(self.model_location, "sctJ_FLG", self.profile_factory)

    gspd = ModelGene(core_gspd, t2ss)
    sctc = ModelGene(core_sctc, t2ss)
    sctj = ModelGene(core_sctj, t2ss)
    # sctJ_FLG is declared as an exchangeable of sctJ.
    sctj_flg = Exchangeable(core_sctj_flg, sctj)
    sctj.add_exchangeable(sctj_flg)

    gspd_hit = ValidHit(
        Hit(core_gspd, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20),
        gspd, GeneStatus.MANDATORY)
    sctc_hit = ValidHit(
        Hit(core_sctc, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20),
        sctc, GeneStatus.MANDATORY)

    # Cluster holding gspD and sctC hits only: gspD is fulfilled, sctJ is not.
    without_sctj = Cluster([gspd_hit, sctc_hit], t2ss, self.hit_weights)
    self.assertTrue(without_sctj.fulfilled_function(gspd))
    self.assertFalse(without_sctj.fulfilled_function(sctj))

    # A hit on the exchangeable must count for sctJ.
    flg_hit = ValidHit(
        Hit(core_sctj_flg, "h50", 10, "replicon_1", 50, 1.0, 50.0, 1.0, 1.0, 10, 20),
        sctj_flg, GeneStatus.ACCESSORY)
    with_exchangeable = Cluster([gspd_hit, flg_hit], t2ss, self.hit_weights)
    self.assertTrue(with_exchangeable.fulfilled_function(sctj))
def test_hits(self):
    """Check that RejectedClusters exposes the hits of all its clusters and the given reasons."""
    t2ss = Model("foo/T2SS", 10)

    # Build the three genes first and register each one in the model.
    core_genes = {}
    model_genes = {}
    for gene_name, register in (("gspD", t2ss.add_mandatory_gene),
                                ("sctJ", t2ss.add_accessory_gene),
                                ("sctN", t2ss.add_accessory_gene)):
        core_genes[gene_name] = CoreGene(self.model_location, gene_name, self.profile_factory)
        model_genes[gene_name] = ModelGene(core_genes[gene_name], t2ss)
        register(model_genes[gene_name])

    # Then one valid hit per gene; only the id, gene and status differ between hits.
    first, second, third = [
        ValidHit(
            Hit(core_genes[gene_name], hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
            model_genes[gene_name], status)
        for hit_id, gene_name, status in (("hit_1", "gspD", GeneStatus.MANDATORY),
                                          ("hit_2", "sctJ", GeneStatus.ACCESSORY),
                                          ("hit_3", "sctN", GeneStatus.ACCESSORY))
    ]

    two_hit_cluster = Cluster([first, second], t2ss, self.hit_weights)
    one_hit_cluster = Cluster([third], t2ss, self.hit_weights)
    rejected = RejectedClusters(t2ss, [two_hit_cluster, one_hit_cluster], ["bla bla"])

    self.assertEqual(rejected.hits, [first, second, third])
    self.assertEqual(rejected.reasons, ["bla bla"])
def test_init(self):
    """Check the LikelySystem id prefix and that a second instance gets the next id number."""
    model_a = Model("foo/model_A", 10)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    gspd = ModelGene(core_gspd, model_a)
    model_a.add_mandatory_gene(gspd)

    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    sctj = ModelGene(core_sctj, model_a)
    model_a.add_accessory_gene(sctj)

    gspd_hit = ValidHit(
        Hit(core_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        gspd, GeneStatus.MANDATORY)
    sctj_hit = ValidHit(
        Hit(core_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        sctj, GeneStatus.ACCESSORY)

    first = LikelySystem(model_a, [gspd_hit], [sctj_hit], [], [])
    self.assertTrue(first.id.startswith('replicon_id_model_A_'))

    second = LikelySystem(model_a, [gspd_hit, sctj_hit], [], [], [])
    # The last "_"-separated field of the id is compared as an integer counter.
    second_rank = int(second.id.rsplit('_', 1)[-1])
    first_rank = int(first.id.rsplit('_', 1)[-1])
    self.assertEqual(second_rank, first_rank + 1)
def setUp(self) -> None:
    """Create the genes and the two raw hits shared by the tests of this case.

    Sets ``c_gene_gspd``, ``gene_gspd``, ``c_gene_sctj``, ``gene_sctj``,
    ``hit_1`` and ``hit_2`` on the instance.
    """
    cli_args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
    )
    config = Config(MacsyDefaults(), cli_args)

    location = ModelLocation(path=os.path.join(cli_args.models_dir, 'foo'))
    t2ss = Model("foo/T2SS", 10)
    factory = ProfileFactory(config)

    self.c_gene_gspd = CoreGene(location, "gspD", factory)
    self.gene_gspd = ModelGene(self.c_gene_gspd, t2ss)
    self.c_gene_sctj = CoreGene(location, "sctJ", factory)
    self.gene_sctj = ModelGene(self.c_gene_sctj, t2ss)

    # gspD is registered as mandatory, sctJ as accessory.
    t2ss.add_mandatory_gene(self.gene_gspd)
    t2ss.add_accessory_gene(self.gene_sctj)

    # The two hits differ by id, gene and position (2 and 3).
    self.hit_1 = Hit(self.c_gene_gspd, "hit_1", 803, "replicon_id", 2,
                     1.0, 1.0, 1.0, 1.0, 10, 20)
    self.hit_2 = Hit(self.c_gene_sctj, "hit_2", 803, "replicon_id", 3,
                     1.0, 1.0, 1.0, 1.0, 10, 20)
def test_str(self):
    """Compare ``str()`` of an UnlikelySystem built from three hits and one reason with a fixed text."""
    model = Model("foo/T2SS", 10)
    # gspD is registered as mandatory; sctJ and sctN as accessory.
    c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    gene_gspd = ModelGene(c_gene_gspd, model)
    model.add_mandatory_gene(gene_gspd)
    c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    gene_sctj = ModelGene(c_gene_sctj, model)
    model.add_accessory_gene(gene_sctj)
    c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    gene_sctn = ModelGene(c_gene_sctn, model)
    model.add_accessory_gene(gene_sctn)
    # One hit per gene, at positions 1, 2 and 3; these positions appear in the expected text.
    hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
    hit_2 = Hit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
    hit_3 = Hit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_3 = ValidHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
    uls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [], ["reason"])
    # NOTE(review): the expected text is reproduced exactly as it appears in this view;
    # if the literal originally contained line breaks, confirm them against the real file.
    expected_str = """(hit_1, gspD, 1), (hit_2, sctJ, 2), (hit_3, sctN, 3): These hits does not probably constitute a system because: reason"""
    self.assertEqual(str(uls_1), expected_str)
def test_str(self):
    """Check that str() of a LikelySystem is its hits rendered as "(id, gene name, position)", comma-separated."""
    t2ss = Model("foo/T2SS", 10)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    gspd = ModelGene(core_gspd, t2ss)
    t2ss.add_mandatory_gene(gspd)

    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    sctj = ModelGene(core_sctj, t2ss)
    t2ss.add_accessory_gene(sctj)

    core_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    sctn = ModelGene(core_sctn, t2ss)
    t2ss.add_accessory_gene(sctn)

    gspd_hit = ValidHit(
        Hit(core_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        gspd, GeneStatus.MANDATORY)
    sctj_hit = ValidHit(
        Hit(core_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        sctj, GeneStatus.ACCESSORY)
    sctn_hit = ValidHit(
        Hit(core_sctn, "hit_3", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        sctn, GeneStatus.ACCESSORY)

    likely = LikelySystem(t2ss, [gspd_hit], [sctj_hit, sctn_hit], [], [])

    # The expected text is derived from the hits themselves rather than hard-coded.
    rendered_hits = []
    for h in (gspd_hit, sctj_hit, sctn_hit):
        rendered_hits.append(f"({h.id}, {h.gene.name}, {h.position})")
    self.assertEqual(str(likely), ', '.join(rendered_hits))
def test_get_loners(self):
    """Check that get_loners returns one single-hit cluster for each hit on a gene flagged as loner."""
    t2ss = Model("foo/T2SS", 11)
    # Replicon description (topology, min/max positions, list of (seq_id, length) genes).
    # It is built here but not used by the rest of this test.
    rep_info = RepliconInfo('linear', 1, 60, [(f"g_{i}", i * 10) for i in range(1, 7)])

    gene_pairs = []
    for gene_name in ('gspD', 'sctC', 'sctJ', 'sctN', 'abc'):
        core = CoreGene(self.model_location, gene_name, self.profile_factory)
        gene_pairs.append((core, ModelGene(core, t2ss)))
    core_genes = [core for core, _ in gene_pairs]
    model_genes = [gene for _, gene in gene_pairs]

    # Only sctN and abc are flagged as loners.
    for loner_gene in model_genes[3:5]:
        loner_gene._loner = True

    t2ss.add_mandatory_gene(model_genes[0])
    t2ss.add_mandatory_gene(model_genes[1])
    t2ss.add_accessory_gene(model_genes[2])
    t2ss.add_accessory_gene(model_genes[3])
    t2ss.add_neutral_gene(model_genes[4])

    # (hit id, position, score), one per core gene, in gene order.
    hit_specs = (("h10", 10, 10.0),
                 ("h20", 20, 20.0),
                 ("h30", 30, 30.0),
                 ("h61", 60, 61.0),
                 ("h80", 80, 80.0))
    all_hits = [
        Hit(core, hit_id, 10, "replicon_1", position, 1.0, score, 1.0, 1.0, 10, 20)
        for core, (hit_id, position, score) in zip(core_genes, hit_specs)
    ]

    # Loners come back as clusters of one hit; unwrap that hit for the comparison.
    loner_clusters = get_loners(all_hits, t2ss, self.hit_weights)
    loner_hits = [cluster.hits[0] for cluster in loner_clusters]
    self.assertListEqual(loner_hits, all_hits[3:])
def test_get_position(self):
    """Check that Hit.get_position() returns the position the hit was created with."""
    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    position = 3450
    sequence_coverage = (741.0 - 104.0 + 1) / 803
    hit = Hit(core_gspd, "PSAE001c01_006940", 803, "PSAE001c01", position,
              1.2e-234, 779.2, 1.0, sequence_coverage, 104, 741)
    self.assertEqual(hit.get_position(), position)
def test_UnlikelySystemSerializer_txt(self):
    """Compare the text produced by TxtUnikelySystemSerializer for an UnlikelySystem with a fixed report."""
    model = Model("foo/FOO", 10)
    # gspD is mandatory, sctJ and sctN are accessory, abc is forbidden.
    c_gene_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    gene_gspd = ModelGene(c_gene_gspd, model)
    model.add_mandatory_gene(gene_gspd)
    c_gene_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    gene_sctj = ModelGene(c_gene_sctj, model)
    model.add_accessory_gene(gene_sctj)
    c_gene_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    gene_sctn = ModelGene(c_gene_sctn, model)
    model.add_accessory_gene(gene_sctn)
    c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
    gene_abc = ModelGene(c_gene_abc, model)
    model.add_forbidden_gene(gene_abc)
    # One hit per gene, at positions 1 to 4; these positions appear in the expected report.
    hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
    hit_2 = Hit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
    hit_3 = Hit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_3 = ValidHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
    hit_4 = Hit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0, 1.0, 1.0, 10, 20)
    v_hit_4 = ValidHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)
    ser = TxtUnikelySystemSerializer()
    ls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [v_hit_4], ["the reason why"])
    txt = ser.serialize(ls_1)
    # NOTE(review): the expected report is reproduced exactly as it appears in this view;
    # it looks like a multi-line text whose line breaks were lost, so confirm the
    # whitespace against the real file.
    # NOTE(review): the expected id ends in "_1", which presumably depends on how many
    # systems were created before this test runs - confirm against the fixtures.
    expected_txt = """This replicon probably not contains a system foo/FOO: the reason why system id = replicon_id_FOO_1 model = foo/FOO replicon = replicon_id hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)] wholeness = 1.000 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) \t- sctN: 1 (sctN) neutral genes: forbidden genes: \t- abc: 1 (abc) Use ordered replicon to have better prediction. """
    self.assertEqual(txt, expected_txt)
def test_SystemSerializer_tsv(self):
    """Compare TsvSystemSerializer output for a two-cluster System with hand-built tab-separated rows."""
    t2ss = Model("foo/T2SS", 10)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    gspd = ModelGene(core_gspd, t2ss)
    t2ss.add_mandatory_gene(gspd)

    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    sctj = ModelGene(core_sctj, t2ss)
    t2ss.add_accessory_gene(sctj)

    # sctN gets sctN_FLG as an exchangeable before being registered as accessory.
    core_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
    sctn = ModelGene(core_sctn, t2ss)
    core_sctn_flg = CoreGene(self.model_location, "sctN_FLG", self.profile_factory)
    sctn_flg = Exchangeable(core_sctn_flg, sctn)
    sctn.add_exchangeable(sctn_flg)
    t2ss.add_accessory_gene(sctn)

    gspd_hit = ValidHit(
        Hit(core_gspd, "h_gspd", 803, "replicon_id", 10, 1.0, 1.0, 1.0, 1.0, 10, 20),
        gspd, GeneStatus.MANDATORY)
    sctj_hit = ValidHit(
        Hit(core_sctj, "h_sctj", 803, "replicon_id", 20, 1.0, 1.0, 1.0, 1.0, 20, 30),
        sctj, GeneStatus.ACCESSORY)
    sctn_flg_hit = ValidHit(
        Hit(core_sctn_flg, "h_sctn_flg", 803, "replicon_id", 30, 1.0, 1.0, 1.0, 1.0, 30, 40),
        sctn_flg, GeneStatus.ACCESSORY)

    first_locus = Cluster([gspd_hit, sctj_hit], t2ss, self.hit_weights)
    second_locus = Cluster([sctn_flg_hit], t2ss, self.hit_weights)
    multi_loci = System(t2ss, [first_locus, second_locus], self.cfg.redundancy_penalty())
    tracker = HitSystemTracker([multi_loci])
    serializer = TsvSystemSerializer()

    # One row per hit; the third row reports the hit on sctN_FLG under the function sctN.
    expected_rows = [
        ["replicon_id", "h_gspd", "gspD", "10", "foo/T2SS", multi_loci.id,
         "1", "1.000", "1.900", "1", "gspD", "mandatory", "803", "1.0",
         "1.000", "1.000", "1.000", "10", "20", ""],
        ["replicon_id", "h_sctj", "sctJ", "20", "foo/T2SS", multi_loci.id,
         "1", "1.000", "1.900", "1", "sctJ", "accessory", "803", "1.0",
         "1.000", "1.000", "1.000", "20", "30", ""],
        ["replicon_id", "h_sctn_flg", "sctN_FLG", "30", "foo/T2SS", multi_loci.id,
         "1", "1.000", "1.900", "1", "sctN", "accessory", "803", "1.0",
         "1.000", "1.000", "1.000", "30", "40", ""],
    ]
    # Every row, including the last one, is terminated by a newline.
    expected_tsv = "".join("\t".join(fields) + "\n" for fields in expected_rows)

    self.assertEqual(expected_tsv, serializer.serialize(multi_loci, tracker))
def test_eq(self):
    """Check that hits built from the same values compare equal and differing hits do not."""
    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)

    reference_args = ("PSAE001c01_006940", 803, "PSAE001c01", 3450,
                      1.2e-234, 779.2, 1.0, (741.0 - 104.0 + 1) / 803, 104, 741)
    other_args = ("PSAE001c01_013980", 759, "PSAE001c01", 4146,
                  3.7e-76, 255.8, 1.0, (736.0 - 105.0 + 1) / 759, 105, 736)

    # Two distinct objects created from identical arguments, plus a different hit.
    reference = Hit(core_gspd, *reference_args)
    twin = Hit(core_gspd, *reference_args)
    other = Hit(core_gspd, *other_args)

    self.assertEqual(reference, twin)
    self.assertNotEqual(reference, other)
def test_hash(self):
    """Check that hash(Hit) is an int, equal for identical hits and different when only the id differs."""
    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)

    def build(hit_id):
        # All hits share every value except their identifier.
        return Hit(core_gspd, hit_id, 803, "PSAE001c01", 3450,
                   1.2e-234, 779.2, 1.0, (741.0 - 104.0 + 1) / 803, 104, 741)

    reference = build("PSAE001c01_006940")
    twin = build("PSAE001c01_006940")
    other_id = build("PSAE001c01_006941")

    self.assertIsInstance(hash(reference), int)
    self.assertEqual(hash(reference), hash(twin))
    self.assertNotEqual(hash(reference), hash(other_id))
def test_search_recover(self):
    """Run search_genes normally, then again from the previous run with hmmer disabled, and check the hit."""
    target = "abc"

    # First job: a regular search; its return value is not inspected here.
    first_core = CoreGene(self.model_location, target, self.profile_factory)
    search_genes([first_core], self.cfg)
    expected = Hit(first_core, "ESCO030p01_000260", 706, "ESCO030p01", 26,
                   1.000e-200, 660.800, 1.000, 0.714, 160, 663)

    # Second job: point hmmer at a bogus executable so that only the recover
    # path can produce results.
    self.cfg.hmmer = lambda: "hmmer_disable"
    # The first job's working directory becomes the previous run, and the
    # second job writes into a fresh directory.
    first_job_dir = self.cfg.working_dir()
    self.cfg.previous_run = lambda: first_job_dir
    self.cfg.out_dir = lambda: os.path.join(self.tmp_dir, 'job_2')
    os.mkdir(self.cfg.out_dir())

    # A new factory and core gene are needed so that the profile attached to
    # the gene is reset before rerunning.
    self.profile_factory = ProfileFactory(self.cfg)
    second_core = CoreGene(self.model_location, target, self.profile_factory)
    recovered = search_genes([second_core], self.cfg)

    self.assertEqual(len(recovered), 1)
    self.assertEqual(expected, recovered[0].hits[0])
def test_len(self):
    """Check that len() of a Cluster built from two hits is 2."""
    t2ss = Model("foo/T2SS", 11)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    core_sctc = CoreGene(self.model_location, "sctC", self.profile_factory)
    gspd = ModelGene(core_gspd, t2ss)
    sctc = ModelGene(core_sctc, t2ss)

    members = [
        ValidHit(Hit(core_gspd, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20),
                 gspd, GeneStatus.MANDATORY),
        ValidHit(Hit(core_sctc, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20),
                 sctc, GeneStatus.MANDATORY),
    ]

    cluster = Cluster(members, t2ss, self.hit_weights)
    self.assertEqual(len(cluster), 2)
def test_cmp(self):
    """Check Hit ordering for hits with different ids and for hits sharing the same id."""
    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)

    def high_score_hit():
        # id ...006940, score 779.2
        return Hit(core_gspd, "PSAE001c01_006940", 803, "PSAE001c01", 3450,
                   1.2e-234, 779.2, 1.0, (741.0 - 104.0 + 1) / 803, 104, 741)

    def low_score_hit(hit_id):
        # score 255.8, id chosen by the caller
        return Hit(core_gspd, hit_id, 759, "PSAE001c01", 4146,
                   3.7e-76, 255.8, 1.0, (736.0 - 105.0 + 1) / 759, 105, 736)

    # Different ids: the hit with id ...013980 is expected to sort after ...006940.
    smaller = high_score_hit()
    larger = low_score_hit("PSAE001c01_013980")
    self.assertGreater(larger, smaller)
    self.assertLess(smaller, larger)

    # Same id: the hit with the higher score (779.2 vs 255.8) is expected to be the greater one.
    best = high_score_hit()
    worst = low_score_hit("PSAE001c01_006940")
    self.assertGreater(best, worst)
    self.assertLess(worst, best)
def test_search(self):
    """Check that search_genes on the "abc" gene yields one report whose first hit is the expected one."""
    core_abc = CoreGene(self.model_location, "abc", self.profile_factory)
    reports = search_genes([core_abc], self.cfg)

    expected = Hit(core_abc, "ESCO030p01_000260", 706, "ESCO030p01", 26,
                   1.000e-200, 660.800, 1.000, 0.714, 160, 663)

    self.assertEqual(len(reports), 1)
    self.assertEqual(expected, reports[0].hits[0])
def test_reason(self):
    """Check that UnlikelySystem.reasons returns the reasons given at construction."""
    model_a = Model("foo/model_A", 10)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    gspd = ModelGene(core_gspd, model_a)
    model_a.add_mandatory_gene(gspd)

    # sctJ is registered as a forbidden gene in this model.
    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    sctj = ModelGene(core_sctj, model_a)
    model_a.add_forbidden_gene(sctj)

    mandatory_hit = ValidHit(
        Hit(core_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        gspd, GeneStatus.MANDATORY)
    forbidden_hit = ValidHit(
        Hit(core_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20),
        sctj, GeneStatus.FORBIDDEN)

    why = ["forbidden gene"]
    unlikely = UnlikelySystem(model_a, [mandatory_hit], [], [], [forbidden_hit], why)
    self.assertEqual(unlikely.reasons, why)
def test_str(self):
    """Compare ``str()`` of a RejectedClusters holding two clusters and one reason with a fixed text."""
    model = Model("foo/T2SS", 11)
    c_gene_1 = CoreGene(self.model_location, "gspD", self.profile_factory)
    gene_1 = ModelGene(c_gene_1, model)
    model.add_mandatory_gene(gene_1)
    c_gene_2 = CoreGene(self.model_location, "sctC", self.profile_factory)
    gene_2 = ModelGene(c_gene_2, model)
    model.add_accessory_gene(gene_2)
    # Hit positional arguments as used here: core gene, hit id, sequence length,
    # replicon name, position, i_eval, score, profile coverage, sequence coverage,
    # begin match, end match.
    h10 = Hit(c_gene_1, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20)
    v_h10 = ValidHit(h10, gene_1, GeneStatus.MANDATORY)
    h20 = Hit(c_gene_2, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20)
    v_h20 = ValidHit(h20, gene_2, GeneStatus.ACCESSORY)
    h40 = Hit(c_gene_1, "h40", 10, "replicon_1", 40, 1.0, 10.0, 1.0, 1.0, 10, 20)
    v_h40 = ValidHit(h40, gene_1, GeneStatus.MANDATORY)
    h50 = Hit(c_gene_2, "h50", 10, "replicon_1", 50, 1.0, 20.0, 1.0, 1.0, 10, 20)
    v_h50 = ValidHit(h50, gene_2, GeneStatus.ACCESSORY)
    # Two clusters, each with one gspD hit and one sctC hit.
    c1 = Cluster([v_h10, v_h20], model, self.hit_weights)
    c2 = Cluster([v_h40, v_h50], model, self.hit_weights)
    r_c = RejectedClusters(model, [c1, c2], ["bla"])
    # NOTE(review): the expected text is reproduced exactly as it appears in this view;
    # it looks like a multi-line text whose line breaks were lost, so confirm the
    # whitespace against the real file.
    expected_str = """Cluster: - model = T2SS - replicon = replicon_1 - hits = (h10, gspD, 10), (h20, sctC, 20) Cluster: - model = T2SS - replicon = replicon_1 - hits = (h40, gspD, 40), (h50, sctC, 50) These clusters have been rejected because: \t- bla """
    self.assertEqual(expected_str, str(r_c))
def test_extract_concurent(self):
    """Run ``extract()`` on five reports of the same hmm output in parallel threads.

    Every report points at the same copied search result file. After all
    worker threads have finished, each report is saved and its hits are
    compared with the expected list.
    """
    # Local import: the file-level imports are not visible from this block.
    import threading

    gene_name = "gspD"
    c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    shutil.copy(self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
                self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(),
                               gene_name + self.cfg.res_search_suffix())

    reports = [OrderedHMMReport(c_gene, report_path, self.cfg) for _ in range(5)]

    # Keep a handle on every thread we start and join exactly those. The former
    # code joined every thread returned by threading.enumerate(), which also
    # waits on threads this test does not own, and relied on the deprecated
    # threading.currentThread() alias to skip the main thread.
    workers = [threading.Thread(target=report.extract) for report in reports]
    for thread in workers:
        thread.start()
    for thread in workers:
        thread.join()

    # Hit positional arguments as used here: core gene, hit id, sequence length,
    # replicon name, position, i_eval, score, profile coverage, sequence coverage,
    # begin match, end match.
    hits = [
        Hit(c_gene, "NC_xxxxx_xx_056141", 803, RepliconDB.ordered_replicon_name, 141,
            float(2e-236), float(779.2), float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741),
        Hit(c_gene, "PSAE001c01_006940", 803, RepliconDB.ordered_replicon_name, 68,
            float(1.2e-234), float(779.2), float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741),
        Hit(c_gene, "PSAE001c01_013980", 759, RepliconDB.ordered_replicon_name, 69,
            float(3.7e-76), float(255.8), float(1.000000), (736.0 - 105.0 + 1) / 759, 105, 736),
        Hit(c_gene, "PSAE001c01_017350", 600, RepliconDB.ordered_replicon_name, 70,
            float(3.2e-27), float(94.2), float(0.500000), (506.0 - 226.0 + 1) / 600, 226, 506),
        Hit(c_gene, "PSAE001c01_018920", 776, RepliconDB.ordered_replicon_name, 71,
            float(6.1e-183), float(608.4), float(1.000000), (606.0 - 48.0 + 1) / 776, 48, 606),
        Hit(c_gene, "PSAE001c01_031420", 658, RepliconDB.ordered_replicon_name, 73,
            float(1.8e-210), float(699.3), float(1.000000), (614.0 - 55.0 + 1) / 658, 55, 614),
    ]

    for report in reports:
        report.save_extract()
        self.assertEqual(len(report.hits), len(hits))
        self.assertListEqual(report.hits, hits)
def test_filter_loners(self):
    """Check that filter_loners drops the loner clusters whose hit already belongs to the given cluster."""
    t2ss = Model("foo/T2SS", 11)

    gene_pairs = []
    for gene_name in ('gspD', 'sctC', 'sctJ', 'sctN', 'abc'):
        core = CoreGene(self.model_location, gene_name, self.profile_factory)
        gene_pairs.append((core, ModelGene(core, t2ss)))
    core_genes = [core for core, _ in gene_pairs]
    model_genes = [gene for _, gene in gene_pairs]

    # sctJ, sctN and abc are flagged as loners.
    for loner_gene in model_genes[2:5]:
        loner_gene._loner = True

    t2ss.add_mandatory_gene(model_genes[0])
    t2ss.add_mandatory_gene(model_genes[1])
    t2ss.add_accessory_gene(model_genes[2])
    t2ss.add_accessory_gene(model_genes[3])
    t2ss.add_neutral_gene(model_genes[4])

    # (hit id, position, score), one per core gene, in gene order.
    hit_specs = (("h10", 10, 10.0),
                 ("h20", 20, 20.0),
                 ("h30", 30, 30.0),
                 ("h40", 40, 61.0),
                 ("h50", 50, 80.0))
    h10, h20, h30, h40, h50 = [
        Hit(core, hit_id, 10, "replicon_1", position, 1.0, score, 1.0, 1.0, 10, 20)
        for core, (hit_id, position, score) in zip(core_genes, hit_specs)
    ]

    def single(hit):
        # A loner is represented as a cluster holding exactly one hit.
        return Cluster([hit], t2ss, self.hit_weights)

    # Case 1: the cluster already contains every loner hit, so nothing is kept.
    full_cluster = Cluster([h10, h20, h30, h40, h50], t2ss, self.hit_weights)
    kept = filter_loners(full_cluster, [single(h30), single(h40), single(h50)])
    self.assertListEqual(kept, [])

    # Case 2: only h40 is in the cluster, so the loners for h30 and h50 are kept.
    partial_cluster = Cluster([h10, h20, h40], t2ss, self.hit_weights)
    loner_30 = single(h30)
    loner_40 = single(h40)
    loner_50 = single(h50)
    kept = filter_loners(partial_cluster, [loner_30, loner_40, loner_50])
    self.assertListEqual(kept, [loner_30, loner_50])
def test_init(self):
    """Check that RejectedClusters given a single Cluster exposes it as a one-element list, with the reasons."""
    t2ss = Model("foo/T2SS", 11)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    gspd = ModelGene(core_gspd, t2ss)
    t2ss.add_mandatory_gene(gspd)

    core_sctc = CoreGene(self.model_location, "sctC", self.profile_factory)
    sctc = ModelGene(core_sctc, t2ss)
    t2ss.add_accessory_gene(sctc)

    gspd_hit = ValidHit(
        Hit(core_gspd, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20),
        gspd, GeneStatus.MANDATORY)
    sctc_hit = ValidHit(
        Hit(core_sctc, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20),
        sctc, GeneStatus.ACCESSORY)
    lone_cluster = Cluster([gspd_hit, sctc_hit], t2ss, self.hit_weights)

    # The cluster is passed bare, not wrapped in a list, on purpose.
    rejected = RejectedClusters(t2ss, lone_cluster, ["bla"])

    self.assertListEqual(rejected.clusters, [lone_cluster])
    self.assertEqual(rejected.reasons, ['bla'])
def test_init(self):
    """Check that building a Cluster from hits on two different replicons raises MacsypyError."""
    t2ss = Model("foo/T2SS", 11)

    core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
    core_sctc = CoreGene(self.model_location, "sctC", self.profile_factory)
    core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
    gspd = ModelGene(core_gspd, t2ss)

    # (core gene, hit id, replicon, position, score, status): the first two hits
    # are on replicon_1, the last two on replicon_2. Every valid hit is attached
    # to the same model gene.
    hit_specs = (
        (core_gspd, "h10", "replicon_1", 10, 10.0, GeneStatus.MANDATORY),
        (core_sctc, "h20", "replicon_1", 20, 20.0, GeneStatus.MANDATORY),
        (core_sctj, "h30", "replicon_2", 30, 30.0, GeneStatus.ACCESSORY),
        (core_sctj, "h50", "replicon_2", 50, 50.0, GeneStatus.ACCESSORY),
    )
    mixed_hits = [
        ValidHit(Hit(core, hit_id, 10, replicon, position, 1.0, score, 1.0, 1.0, 10, 20),
                 gspd, status)
        for core, hit_id, replicon, position, score, status in hit_specs
    ]

    with self.assertRaises(MacsypyError) as ctx, self.catch_log():
        Cluster(mixed_hits, t2ss, self.hit_weights)

    self.assertEqual(str(ctx.exception),
                     "Cannot build a cluster from hits coming from different replicons")
def test_save_extract(self):
    """Check that GembaseHMMReport.save_extract() writes a file equal to a hand-built expected extract."""
    gene_name = "gspD"
    gene = CoreGene(self.model_location, gene_name, self.profile_factory)
    # Copy the stored hmm search output for the gene into the working directory.
    shutil.copy(self.find_data("hmm", gene_name + self.cfg.res_search_suffix()), self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(), gene_name + self.cfg.res_search_suffix())
    report = GembaseHMMReport(gene, report_path, self.cfg)
    report.extract()
    report.save_extract()
    # The extract file is looked up under the hmmer directory inside the working directory.
    extract_filename = gene_name + self.cfg.res_extract_suffix()
    extract_path = os.path.join(self.cfg.working_dir(), self.cfg.hmmer_dir(), extract_filename)
    self.assertTrue(os.path.exists(extract_path))
    self.assertTrue(os.path.isfile(extract_path))
    # Hits written to the expected extract, in this order.
    hits = [
        Hit(gene, "NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, float(2e-236), float(779.2), float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741),
        Hit(gene, "PSAE001c01_006940", 803, "PSAE001c01", 68, float(1.2e-234), float(779.2), float(1.000000), (741.0 - 104.0 + 1) / 803, 104, 741),
        Hit(gene, "PSAE001c01_013980", 759, "PSAE001c01", 69, float(3.7e-76), float(255.8), float(1.000000), (736.0 - 105.0 + 1) / 759, 105, 736),
        Hit(gene, "PSAE001c01_017350", 600, "PSAE001c01", 70, float(3.2e-27), float(94.2), float(0.500000), (506.0 - 226.0 + 1) / 600, 226, 506),
        Hit(gene, "PSAE001c01_018920", 776, "PSAE001c01", 71, float(6.1e-183), float(608.4), float(1.000000), (606.0 - 48.0 + 1) / 776, 48, 606),
        Hit(gene, "PSAE001c01_031420", 658, "PSAE001c01", 73, float(1.8e-210), float(699.3), float(1.000000), (614.0 - 55.0 + 1) / 658, 55, 614)
    ]
    expected_extract_path = os.path.join(self.cfg.working_dir(), 'expected_extract')
    with open(expected_extract_path, 'w') as expected_extract:
        # NOTE(review): the header template is reproduced exactly as it appears in this
        # view; it looks like a multi-line text whose line breaks (and possibly tabs)
        # were lost, so confirm the whitespace against the real file.
        extract = """# gene: {name} extract from {path} hmm output # profile length= {len_profile:d} # i_evalue threshold= {i_evalue:.3f} # coverage threshold= {cov:.3f} # hit_id replicon_name position_hit hit_sequence_length gene_name gene_system i_eval score profile_coverage sequence_coverage begin end """.format(name=gene.name, path=report_path,
                      len_profile=len(gene.profile), i_evalue=self.cfg.i_evalue_sel(), cov=self.cfg.coverage_profile())
        expected_extract.write(extract)
        # Each hit is written using its str() representation.
        for h in hits:
            expected_extract.write(str(h))
    # Compare only once the expected file has been closed.
    self.assertFileEqual(extract_path, expected_extract_path)
def test_best_hit(self):
    """Check that best_hit() is None before extract() and the expected hit afterwards."""
    target = 'gspD'
    core_gspd = CoreGene(self.model_location, target, self.profile_factory)

    # Copy the stored hmm search output for the gene into the working directory.
    search_file = target + self.cfg.res_search_suffix()
    shutil.copy(self.find_data("hmm", search_file), self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(), target + self.cfg.res_search_suffix())

    report = GembaseHMMReport(core_gspd, report_path, self.cfg)
    self.assertIsNone(report.best_hit())

    report.extract()
    found = report.best_hit()
    expected = Hit(core_gspd, "NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141,
                   2e-236, 779.2, 1.0, (741.0 - 104.0 + 1) / 803, 104, 741)
    self.assertEqual(expected, found)
def test_merge(self):
    """
    Check Cluster.merge in four situations:

    - c1.merge(c2) puts the hits of c2 after those of c1,
    - c2.merge(c1) puts the hits of c1 after those of c2,
    - c1.merge(c2, before=True) puts the hits of c2 before those of c1,
    - merging a cluster built for another model raises MacsypyError.
    """
    model = Model("foo/T2SS", 11)

    c_gene_1 = CoreGene(self.model_location, "gspD", self.profile_factory)
    c_gene_2 = CoreGene(self.model_location, "sctC", self.profile_factory)
    c_gene_3 = CoreGene(self.model_location, "sctJ", self.profile_factory)

    gene_1 = ModelGene(c_gene_1, model)
    gene_2 = ModelGene(c_gene_2, model)
    gene_3 = ModelGene(c_gene_3, model)

    # Hit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
    #     profile_coverage, sequence_coverage, begin_match, end_match)
    h10 = Hit(c_gene_1, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20)
    v_h10 = ValidHit(h10, gene_1, GeneStatus.MANDATORY)
    h20 = Hit(c_gene_2, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20)
    v_h20 = ValidHit(h20, gene_2, GeneStatus.MANDATORY)
    h30 = Hit(c_gene_3, "h30", 10, "replicon_1", 30, 1.0, 30.0, 1.0, 1.0, 10, 20)
    v_h30 = ValidHit(h30, gene_3, GeneStatus.ACCESSORY)
    h50 = Hit(c_gene_3, "h50", 10, "replicon_1", 50, 1.0, 50.0, 1.0, 1.0, 10, 20)
    v_h50 = ValidHit(h50, gene_3, GeneStatus.ACCESSORY)

    def fresh_clusters():
        """Return a new (c1, c2) pair; merge results are read back from the receiving cluster."""
        return (Cluster([v_h10, v_h20], model, self.hit_weights),
                Cluster([v_h30, v_h50], model, self.hit_weights))

    # default: the merged hits are appended
    c1, c2 = fresh_clusters()
    c1.merge(c2)
    self.assertListEqual(c1.hits, [v_h10, v_h20, v_h30, v_h50])

    # same operation, the other way round
    c1, c2 = fresh_clusters()
    c2.merge(c1)
    self.assertListEqual(c2.hits, [v_h30, v_h50, v_h10, v_h20])

    # before=True: the merged hits are prepended
    c1, c2 = fresh_clusters()
    c1.merge(c2, before=True)
    self.assertListEqual(c1.hits, [v_h30, v_h50, v_h10, v_h20])

    # cluster attached to a different model
    model_2 = Model("foo/T3SS", 11)
    other_c_gene = CoreGene(self.model_location, "sctJ", self.profile_factory)
    # the gene is bound to model_2, the model of the cluster that holds its hits
    # (it used to be bound to `model`, leaving the fixture inconsistent)
    other_gene = ModelGene(other_c_gene, model_2)
    other_h30 = Hit(other_c_gene, "h30", 10, "replicon_2", 30, 1.0, 30.0, 1.0, 1.0, 10, 20)
    other_v_h30 = ValidHit(other_h30, other_gene, GeneStatus.ACCESSORY)
    other_h50 = Hit(other_c_gene, "h50", 10, "replicon_2", 50, 1.0, 50.0, 1.0, 1.0, 10, 20)
    other_v_h50 = ValidHit(other_h50, other_gene, GeneStatus.ACCESSORY)
    c3 = Cluster([other_v_h30, other_v_h50], model_2, self.hit_weights)

    with self.assertRaises(MacsypyError) as ctx:
        c1.merge(c3)
    self.assertEqual(str(ctx.exception), "Try to merge Clusters from different model")
def test_get_best_hits(self):
    """
    Check which of two hits get_best_hits returns first for each
    comparison criterion, and that an unknown criterion raises MacsypyError.
    """
    model = Model("foo/T2SS", 10)
    # NOTE(review): the sibling tests read self.model_location (singular);
    # confirm this fixture really exposes `models_location`.
    core_gene = CoreGene(self.models_location, "gspD", self.profile_factory)
    gene_gspd = ModelGene(core_gene, model)

    def hit_pair(i_evals, scores, profile_coverages):
        """Build the two competing hits; only the compared fields vary between scenarios."""
        # Hit(gene, id, hit_seq_len, replicon_name, position, i_eval,
        #     score, profile_coverage, sequence_coverage, begin, end)
        first = Hit(gene_gspd, "PSAE001c01_006940", 803, "PSAE001c01", 3450,
                    i_evals[0], scores[0], profile_coverages[0],
                    (741.0 - 104.0 + 1) / 803, 104, 741)
        second = Hit(gene_gspd, "PSAE001c01_013980", 759, "PSAE001c01", 3450,
                     i_evals[1], scores[1], profile_coverages[1],
                     (736.0 - 105.0 + 1) / 759, 105, 736)
        return first, second

    # no key given (score): the hit with the higher score comes first
    first, second = hit_pair((1.2e-234, 3.7e-76), (10, 11), (1.0, 1.0))
    best = get_best_hits([first, second])
    self.assertEqual(best[0], second)

    # i_eval: the hit with the lower i_eval comes first
    first, second = hit_pair((10, 11), (10, 10), (1.0, 1.0))
    best = get_best_hits([first, second], key='i_eval')
    self.assertEqual(best[0], first)

    # profile_coverage: the hit with the higher coverage comes first
    first, second = hit_pair((10, 10), (10, 10), (10, 11))
    best = get_best_hits([first, second], key='profile_coverage')
    self.assertEqual(best[0], second)

    # unknown criterion
    with self.assertRaises(MacsypyError) as ctx:
        get_best_hits([first, second], key='nimportnaoik')
    expected_msg = ('The criterion for Hits comparison nimportnaoik does not exist or is not available.\n'
                    'It must be either "score", "i_eval" or "profile_coverage".')
    self.assertEqual(expected_msg, str(ctx.exception))
def test_str(self):
    """
    str(Hit) must equal the tab-separated, newline-terminated line
    formatted from the same field values.
    """
    core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)
    fields = {
        'id': "PSAE001c01_006940",
        'hit_seq_len': 803,
        'replicon_name': "PSAE001c01",
        'position': 694,
        'i_eval': 1.2e-234,
        'score': 779.2,
        'gene_name': core_gene.name,
        'profil_coverage': 1.0,
        'sequence_coverage': 638.0,
        'begin': 104,
        'end': 741,
    }

    # positional order expected by Hit after the gene argument
    ctor_order = ('id', 'hit_seq_len', 'replicon_name', 'position', 'i_eval', 'score',
                  'profil_coverage', 'sequence_coverage', 'begin', 'end')
    hit = Hit(core_gene, *(fields[key] for key in ctor_order))

    template = (
        "{id}\t{replicon_name}\t{position:d}\t{hit_seq_len:d}\t{gene_name}\t{i_eval:.3e}"
        "\t{score:.3f}\t{profil_coverage:.3f}\t{sequence_coverage:.3f}\t{begin:d}\t{end:d}\n"
    )
    self.assertEqual(template.format(**fields), str(hit))
def test_extract(self):
    """
    Extract the gspD hits with an OrderedHMMReport and compare them with
    the six expected hits; then check that extract() returns None on a
    report whose hits were assigned beforehand.
    """
    name = "gspD"
    core_gene = CoreGene(self.model_location, name, self.profile_factory)
    search_file = name + self.cfg.res_search_suffix()
    shutil.copy(self.find_data("hmm", search_file), self.cfg.working_dir())
    report_path = os.path.join(self.cfg.working_dir(), search_file)

    parsed = OrderedHMMReport(core_gene, report_path, self.cfg)
    parsed.extract()
    self.assertEqual(len(parsed.hits), 6)

    # every expected hit carries the replicon name RepliconDB defines for ordered datasets
    replicon = RepliconDB.ordered_replicon_name
    # (hit_id, hit_seq_length, pos_hit, i_eval, score, profile_coverage,
    #  begin_match, end_match)
    rows = [
        ("NC_xxxxx_xx_056141", 803, 141, 2e-236, 779.2, 1.0, 104, 741),
        ("PSAE001c01_006940", 803, 68, 1.2e-234, 779.2, 1.0, 104, 741),
        ("PSAE001c01_013980", 759, 69, 3.7e-76, 255.8, 1.0, 105, 736),
        ("PSAE001c01_017350", 600, 70, 3.2e-27, 94.2, 0.5, 226, 506),
        ("PSAE001c01_018920", 776, 71, 6.1e-183, 608.4, 1.0, 48, 606),
        ("PSAE001c01_031420", 658, 73, 1.8e-210, 699.3, 1.0, 55, 614),
    ]
    expected_hits = [
        # sequence coverage is the matched span divided by the hit sequence length
        Hit(core_gene, hit_id, seq_len, replicon, pos, i_eval, score, prof_cov,
            (end - begin + 1) / seq_len, begin, end)
        for hit_id, seq_len, pos, i_eval, score, prof_cov, begin, end in rows
    ]
    self.assertListEqual(expected_hits, parsed.hits)

    # a report that already holds hits: extract() must return None
    prefilled = OrderedHMMReport(core_gene, report_path, self.cfg)
    prefilled.hits = expected_hits
    self.assertIsNone(prefilled.extract())