Esempio n. 1
0
    def setUp(self) -> None:
        """Create the configuration, model, genes and hits shared by the tests.

        The model "foo/T2SS" receives gspD as a mandatory gene (created with
        ``loner=True`` and ``multi_system=True``) and sctJ as an accessory
        gene. Four core hits located on "replicon_id" and their matching
        model hits are stored on the test instance.
        """
        args = argparse.Namespace(
            sequence_db=self.find_data("base", "test_1.fasta"),
            db_type='gembase',
            models_dir=self.find_data('models'),
        )
        cfg = Config(MacsyDefaults(), args)

        location = ModelLocation(path=os.path.join(args.models_dir, 'foo'))
        model = Model("foo/T2SS", 10)
        factory = ProfileFactory(cfg)

        self.cg_gspd = CoreGene(location, "gspD", factory)
        self.mg_gspd = ModelGene(self.cg_gspd, model, loner=True, multi_system=True)

        self.cg_sctj = CoreGene(location, "sctJ", factory)
        self.mg_sctj = ModelGene(self.cg_sctj, model)

        model.add_mandatory_gene(self.mg_gspd)
        model.add_accessory_gene(self.mg_sctj)

        def core_hit(core_gene, hit_id, position):
            # every hit only differs by its gene, its id and its position
            return CoreHit(core_gene, hit_id, 803, "replicon_id", position, 1.0, 1.0, 1.0, 1.0, 10, 20)

        self.chit_1 = core_hit(self.cg_gspd, "hit_1", 2)
        self.chit_2 = core_hit(self.cg_sctj, "hit_2", 3)
        self.chit_3 = core_hit(self.cg_gspd, "hit_3", 10)
        self.chit_4 = core_hit(self.cg_gspd, "hit_4", 20)

        self.mhit_1 = ModelHit(self.chit_1, self.mg_gspd, GeneStatus.MANDATORY)
        self.mhit_2 = ModelHit(self.chit_2, self.mg_sctj, GeneStatus.ACCESSORY)
        self.mhit_3 = ModelHit(self.chit_3, self.mg_gspd, GeneStatus.MANDATORY)
        self.mhit_4 = ModelHit(self.chit_4, self.mg_gspd, GeneStatus.MANDATORY)
Esempio n. 2
0
    def test_str(self):
        model = Model("foo/T2SS", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD",
                               self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ",
                               self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN",
                               self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)

        hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
        uls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [], [],
                               ["reason"])

        expected_str = """(hit_1, gspD, 1), (hit_2, sctJ, 2), (hit_3, sctN, 3): These hits does not probably constitute a system because:
reason"""
        self.assertEqual(str(uls_1), expected_str)
Esempio n. 3
0
    def test_get_position(self):
        """get_position() must give back the position passed to CoreHit."""
        core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)
        position = 3450

        hit = CoreHit(core_gene, "PSAE001c01_006940", 803, "PSAE001c01", position,
                      1.2e-234, 779.2, 1.0, (741.0 - 104.0 + 1) / 803, 104, 741)

        self.assertEqual(hit.get_position(), position)
Esempio n. 4
0
    def test_init(self):
        """Check the id assigned to LikelySystem instances.

        The id must start with ``replicon_id_model_A_`` and the numeric
        suffix of a second system must be one more than the first one.
        """
        model = Model("foo/model_A", 10)

        core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        model_gspd = ModelGene(core_gspd, model)
        model.add_mandatory_gene(model_gspd)

        core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        model_sctj = ModelGene(core_sctj, model)
        model.add_accessory_gene(model_sctj)

        core_hit_1 = CoreHit(core_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        mandatory_hit = ModelHit(core_hit_1, model_gspd, GeneStatus.MANDATORY)
        core_hit_2 = CoreHit(core_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        accessory_hit = ModelHit(core_hit_2, model_sctj, GeneStatus.ACCESSORY)

        def counter_of(system):
            # the numeric counter is the last '_' separated field of the id
            return int(system.id.rsplit('_', 1)[-1])

        first = LikelySystem(model, [mandatory_hit], [accessory_hit], [], [])
        self.assertTrue(first.id.startswith('replicon_id_model_A_'))

        # a second system must get the next counter value
        second = LikelySystem(model, [mandatory_hit, accessory_hit], [], [], [])
        self.assertEqual(counter_of(second), counter_of(first) + 1)
Esempio n. 5
0
    def test_hits(self):
        """LikelySystem.hits must list the mandatory hit then the accessory hits."""
        model = Model("foo/T2SS", 10)

        core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        model_gspd = ModelGene(core_gspd, model)
        model.add_mandatory_gene(model_gspd)

        core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        model_sctj = ModelGene(core_sctj, model)
        model.add_accessory_gene(model_sctj)

        core_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
        model_sctn = ModelGene(core_sctn, model)
        model.add_accessory_gene(model_sctn)

        def core_hit(core_gene, hit_id):
            # all the hits share the same replicon, position and scores
            return CoreHit(core_gene, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)

        mandatory_hit = ModelHit(core_hit(core_gspd, "hit_1"), model_gspd, GeneStatus.MANDATORY)
        accessory_hit_a = ModelHit(core_hit(core_sctj, "hit_2"), model_sctj, GeneStatus.ACCESSORY)
        accessory_hit_b = ModelHit(core_hit(core_sctn, "hit_3"), model_sctn, GeneStatus.ACCESSORY)

        likely = LikelySystem(model, [mandatory_hit], [accessory_hit_a, accessory_hit_b], [], [])

        self.assertListEqual(likely.hits, [mandatory_hit, accessory_hit_a, accessory_hit_b])
Esempio n. 6
0
    def test_str(self):
        """Check the string representation of a LikelySystem.

        The expected text is the comma separated list of
        ``(hit id, gene name, position)`` for every hit of the system.
        """
        model = Model("foo/T2SS", 10)

        core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        model_gspd = ModelGene(core_gspd, model)
        model.add_mandatory_gene(model_gspd)

        core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        model_sctj = ModelGene(core_sctj, model)
        model.add_accessory_gene(model_sctj)

        core_sctn = CoreGene(self.model_location, "sctN", self.profile_factory)
        model_sctn = ModelGene(core_sctn, model)
        model.add_accessory_gene(model_sctn)

        def core_hit(core_gene, hit_id):
            # all the hits share the same replicon, position and scores
            return CoreHit(core_gene, hit_id, 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)

        mandatory_hit = ModelHit(core_hit(core_gspd, "hit_1"), model_gspd, GeneStatus.MANDATORY)
        accessory_hit_a = ModelHit(core_hit(core_sctj, "hit_2"), model_sctj, GeneStatus.ACCESSORY)
        accessory_hit_b = ModelHit(core_hit(core_sctn, "hit_3"), model_sctn, GeneStatus.ACCESSORY)

        likely = LikelySystem(model, [mandatory_hit], [accessory_hit_a, accessory_hit_b], [], [])

        descriptions = []
        for mhit in (mandatory_hit, accessory_hit_a, accessory_hit_b):
            descriptions.append(f"({mhit.id}, {mhit.gene.name}, {mhit.position})")
        expected_str = ', '.join(descriptions)

        self.assertEqual(str(likely), expected_str)
Esempio n. 7
0
    def test_UnlikelySystemSerializer_txt(self):
        model = Model("foo/FOO", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD",
                               self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ",
                               self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN",
                               self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model)
        model.add_forbidden_gene(gene_abc)

        hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
        hit_4 = CoreHit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_4 = ModelHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)
        ser = TxtUnikelySystemSerializer()

        ls_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [],
                              [v_hit_4], ["the reason why"])
        txt = ser.serialize(ls_1)
        expected_txt = """This replicon probably not contains a system foo/FOO:
the reason why

system id = replicon_id_FOO_1
model = foo/FOO
replicon = replicon_id
hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)]
wholeness = 1.000

mandatory genes:
\t- gspD: 1 (gspD)

accessory genes:
\t- sctJ: 1 (sctJ)
\t- sctN: 1 (sctN)

neutral genes:

forbidden genes:
\t- abc: 1 (abc)

Use ordered replicon to have better prediction.
"""
        self.assertEqual(txt, expected_txt)
Esempio n. 8
0
    def setUp(self) -> None:
        """Create the model, genes, model hits and multi-system hits for the tests.

        The model "foo" gets gspD as mandatory gene and sctJ as accessory
        gene (both created with ``multi_system=True``); sctN is registered as
        an exchangeable of sctJ. Five core hits located on "replicon_id" are
        wrapped both as ModelHit (``self.mhit_*``) and as MultiSystem
        (``self.ms_*``).
        """
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        cfg = Config(MacsyDefaults(), args)

        model_name = 'foo'
        self.models_location = ModelLocation(path=os.path.join(args.models_dir, model_name))

        model = Model(model_name, 10)

        # A new ProfileFactory is created for every test so that the tests do
        # not influence each other through the factory and its configuration
        # (for instance search_genes getting a profile without hmmer_exe).
        # It is built only once: the previous extra instance bound to an
        # unused local variable has been removed.
        self.profile_factory = ProfileFactory(cfg)

        gene_name = "gspD"
        c_gene_gspd = CoreGene(self.models_location, gene_name, self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model, multi_system=True)

        gene_name = "sctJ"
        c_gene_sctj = CoreGene(self.models_location, gene_name, self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model, multi_system=True)

        # sctN is not added to the model directly, only as exchangeable of sctJ
        gene_name = "sctN"
        c_gene_sctn = CoreGene(self.models_location, gene_name, self.profile_factory)
        gene_sctn = Exchangeable(c_gene_sctn, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctn)

        model.add_mandatory_gene(gene_gspd)
        model.add_accessory_gene(gene_sctj)

        #        CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #                       profile_coverage, sequence_coverage, begin_match, end_match
        #                                                        pos      score
        chit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 2, 1.0, 1.0, 1.0, 1.0, 10, 20)
        chit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 3, 1.0, 1.0, 1.0, 1.0, 10, 20)
        chit_3 = CoreHit(c_gene_gspd, "hit_3", 803, "replicon_id", 10, 1.0, 3.0, 1.0, 1.0, 10, 20)
        chit_4 = CoreHit(c_gene_sctn, "hit_4", 803, "replicon_id", 14, 1.0, 4.0, 1.0, 1.0, 10, 20)
        chit_5 = CoreHit(c_gene_gspd, "hit_5", 803, "replicon_id", 20, 1.0, 2.0, 1.0, 1.0, 10, 20)

        self.mhit_1 = ModelHit(chit_1, gene_gspd, GeneStatus.MANDATORY)
        self.mhit_2 = ModelHit(chit_2, gene_sctj, GeneStatus.ACCESSORY)
        self.mhit_3 = ModelHit(chit_3, gene_gspd, GeneStatus.MANDATORY)
        self.mhit_4 = ModelHit(chit_4, gene_sctn, GeneStatus.ACCESSORY)
        self.mhit_5 = ModelHit(chit_5, gene_gspd, GeneStatus.MANDATORY)

        self.ms_1 = MultiSystem(chit_1, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
        self.ms_2 = MultiSystem(chit_2, gene_ref=gene_sctj, gene_status=GeneStatus.ACCESSORY)
        self.ms_3 = MultiSystem(chit_3, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
        self.ms_4 = MultiSystem(chit_4, gene_ref=gene_sctn, gene_status=GeneStatus.ACCESSORY)
        self.ms_5 = MultiSystem(chit_5, gene_ref=gene_gspd, gene_status=GeneStatus.MANDATORY)
Esempio n. 9
0
    def test_eq(self):
        """Two CoreHit built from the same values are equal, a different hit is not."""
        core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)

        def reference_hit():
            # build a brand new object at each call, always with the same values
            return CoreHit(core_gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450, 1.2e-234, 779.2,
                           1.0, (741.0 - 104.0 + 1) / 803, 104, 741)

        hit = reference_hit()
        twin = reference_hit()
        other = CoreHit(core_gene, "PSAE001c01_013980", 759, "PSAE001c01", 4146, 3.7e-76, 255.8,
                        1.0, (736.0 - 105.0 + 1) / 759, 105, 736)

        self.assertEqual(hit, twin)
        self.assertNotEqual(hit, other)
Esempio n. 10
0
    def test_hash(self):
        """Check the hash of CoreHit objects.

        The hash must be an int, must be the same for two hits built from the
        same values, and must differ for a hit which only differs by its id.
        """
        gene_name = "gspD"
        gene = CoreGene(self.model_location, gene_name, self.profile_factory)

        h0 = CoreHit(gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450, 1.2e-234, 779.2,
                     1.0, (741.0 - 104.0 + 1) / 803, 104, 741)
        h1 = CoreHit(gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450, 1.2e-234, 779.2,
                     1.0, (741.0 - 104.0 + 1) / 803, 104, 741)
        # same values as h0 except the hit id
        h2 = CoreHit(gene, "PSAE001c01_006941", 803, "PSAE001c01", 3450, 1.2e-234, 779.2,
                     1.0, (741.0 - 104.0 + 1) / 803, 104, 741)

        # assertIsInstance reports the actual type on failure,
        # unlike assertTrue(isinstance(...)) which only says "False is not true"
        self.assertIsInstance(hash(h0), int)
        self.assertEqual(hash(h0), hash(h1))
        self.assertNotEqual(hash(h0), hash(h2))
Esempio n. 11
0
    def test_search_recover(self):
        """Run search_genes twice, the second time from a previous run.

        For the second job ``cfg.hmmer`` is replaced by a dummy value,
        ``cfg.previous_run`` points to the working dir of the first job and
        a new output directory is created. The report of the second job must
        contain the expected hit.
        """
        gene_name = "abc"

        # first job: the return value is not used, only the second job is checked
        first_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
        search_genes([first_gene], self.cfg)
        expected = CoreHit(first_gene, "ESCO030p01_000260", 706, "ESCO030p01", 26,
                           1.000e-200, 660.800, 1.000, 0.714, 160, 663)

        # second job: hmmer is disabled so the test goes through the recover code path
        self.cfg.hmmer = lambda: "hmmer_disable"
        # the working dir must be read before out_dir is overridden below
        previous_job_path = self.cfg.working_dir()
        self.cfg.previous_run = lambda: previous_job_path
        self.cfg.out_dir = lambda: os.path.join(self.tmp_dir, 'job_2')
        os.mkdir(self.cfg.out_dir())

        # a new factory and a new gene are needed
        # to get rid of the profile/report attached to the first gene
        self.profile_factory = ProfileFactory(self.cfg)
        second_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
        reports = search_genes([second_gene], self.cfg)

        self.assertEqual(len(reports), 1)
        self.assertEqual(expected, reports[0].hits[0])
Esempio n. 12
0
    def test_cmp(self):
        """Check the ordering of CoreHit objects.

        First with two hits having different ids, then with two hits sharing
        the same id but having different scores.
        """
        core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)

        # hits with different ids: the hit with id ..._013980 is expected
        # to be greater than the one with id ..._006940
        lower_id = CoreHit(core_gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450, 1.2e-234,
                           779.2, 1.0, (741.0 - 104.0 + 1) / 803, 104, 741)
        higher_id = CoreHit(core_gene, "PSAE001c01_013980", 759, "PSAE001c01", 4146, 3.7e-76,
                            255.8, 1.0, (736.0 - 105.0 + 1) / 759, 105, 736)
        self.assertGreater(higher_id, lower_id)
        self.assertLess(lower_id, higher_id)

        # hits with the same id: the hit with the score 779.2 is expected
        # to be greater than the one with the score 255.8
        high_score = CoreHit(core_gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450, 1.2e-234,
                             779.2, 1.0, (741.0 - 104.0 + 1) / 803, 104, 741)
        low_score = CoreHit(core_gene, "PSAE001c01_006940", 759, "PSAE001c01", 4146, 3.7e-76,
                            255.8, 1.0, (736.0 - 105.0 + 1) / 759, 105, 736)
        self.assertGreater(high_score, low_score)
        self.assertLess(low_score, high_score)
Esempio n. 13
0
 def test_search(self):
     """search_genes on the gene "abc" must give one report starting with the expected hit."""
     core_gene = CoreGene(self.model_location, "abc", self.profile_factory)
     reports = search_genes([core_gene], self.cfg)

     expected = CoreHit(core_gene, "ESCO030p01_000260", 706, "ESCO030p01", 26,
                        1.000e-200, 660.800, 1.000, 0.714, 160, 663)

     self.assertEqual(len(reports), 1)
     self.assertEqual(expected, reports[0].hits[0])
Esempio n. 14
0
    def test_reason(self):
        """UnlikelySystem.reasons must give back the reasons given at construction."""
        model = Model("foo/model_A", 10)

        core_gspd = CoreGene(self.model_location, "gspD", self.profile_factory)
        model_gspd = ModelGene(core_gspd, model)
        model.add_mandatory_gene(model_gspd)

        core_sctj = CoreGene(self.model_location, "sctJ", self.profile_factory)
        model_sctj = ModelGene(core_sctj, model)
        model.add_forbidden_gene(model_sctj)

        core_hit_1 = CoreHit(core_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        mandatory_hit = ModelHit(core_hit_1, model_gspd, GeneStatus.MANDATORY)
        core_hit_2 = CoreHit(core_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        forbidden_hit = ModelHit(core_hit_2, model_sctj, GeneStatus.FORBIDDEN)

        reasons = ["forbidden gene"]
        unlikely = UnlikelySystem(model, [mandatory_hit], [], [], [forbidden_hit], reasons)

        self.assertEqual(unlikely.reasons, reasons)
Esempio n. 15
0
    def test_extract_concurent(self):
        """Extract five reports of the same hmm output file in concurrent threads.

        Five OrderedHMMReport are created on the same gspD search output and
        ``extract`` is called on each of them in its own thread. Once every
        thread is finished, each report must hold the six expected hits.
        """
        gene_name = "gspD"
        c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
        shutil.copy(
            self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
            self.cfg.working_dir())
        report_path = os.path.join(self.cfg.working_dir(),
                                   gene_name + self.cfg.res_search_suffix())
        reports = [OrderedHMMReport(c_gene, report_path, self.cfg) for _ in range(5)]

        import threading

        # Keep a reference on every started thread and join each of them.
        # The previous code called join() once, after the loop over
        # threading.enumerate(), so only the last enumerated thread was
        # waited for (and joining the main thread raises RuntimeError).
        workers = [threading.Thread(target=report.extract) for report in reports]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        #          gene, model,     hit_id,        hit_seq_length replicon_name, pos_hit, i_eval,  score,
        #          profile_coverage, sequence_coverage, begin_match, end_match
        hits = [
            CoreHit(c_gene, "NC_xxxxx_xx_056141", 803,
                    RepliconDB.ordered_replicon_name, 141, 2e-236,
                    779.2, 1.0, (741.0 - 104.0 + 1) / 803,
                    104, 741),
            CoreHit(c_gene, "PSAE001c01_006940", 803,
                    RepliconDB.ordered_replicon_name, 68, 1.2e-234,
                    779.2, 1.0, (741.0 - 104.0 + 1) / 803,
                    104, 741),
            CoreHit(c_gene, "PSAE001c01_013980", 759,
                    RepliconDB.ordered_replicon_name, 69, 3.7e-76,
                    255.8, 1.0, (736.0 - 105.0 + 1) / 759,
                    105, 736),
            CoreHit(c_gene, "PSAE001c01_017350", 600,
                    RepliconDB.ordered_replicon_name, 70, 3.2e-27,
                    94.2, 0.5, (506.0 - 226.0 + 1) / 600,
                    226, 506),
            CoreHit(c_gene, "PSAE001c01_018920", 776,
                    RepliconDB.ordered_replicon_name, 71, 6.1e-183,
                    608.4, 1.0, (606.0 - 48.0 + 1) / 776,
                    48, 606),
            CoreHit(c_gene, "PSAE001c01_031420", 658,
                    RepliconDB.ordered_replicon_name, 73, 1.8e-210,
                    699.3, 1.0, (614.0 - 55.0 + 1) / 658,
                    55, 614),
        ]
        for report in reports:
            report.save_extract()
            self.assertEqual(len(report.hits), len(hits))
            self.assertListEqual(report.hits, hits)
Esempio n. 16
0
    def test_extract(self):
        """Check OrderedHMMReport.extract in three situations.

        1. On a fresh report, extract must produce the six expected hits.
        2. On a report whose ``hits`` are already set, extract must return None.
        3. When the last line of the sequence index file is removed, extract
           must raise a MacsypyError with the expected message.
        """
        gene_name = "gspD"
        c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
        shutil.copy(
            self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
            self.cfg.working_dir())
        report_path = os.path.join(self.cfg.working_dir(),
                                   gene_name + self.cfg.res_search_suffix())
        report = OrderedHMMReport(c_gene, report_path, self.cfg)
        report.extract()
        self.assertEqual(len(report.hits), 6)
        #           gene, model,     hit_id,         hit_seq_ length   replicon_name, pos_hit, i_eval,
        #           score,       profile_coverage, sequence_coverage, begin_match, end_match
        hits = [
            CoreHit(c_gene, "NC_xxxxx_xx_056141", 803,
                    RepliconDB.ordered_replicon_name, 141, 2e-236,
                    779.2, 1.0, (741.0 - 104.0 + 1) / 803,
                    104, 741),
            CoreHit(c_gene, "PSAE001c01_006940", 803,
                    RepliconDB.ordered_replicon_name, 68, 1.2e-234,
                    779.2, 1.0, (741.0 - 104.0 + 1) / 803,
                    104, 741),
            CoreHit(c_gene, "PSAE001c01_013980", 759,
                    RepliconDB.ordered_replicon_name, 69, 3.7e-76,
                    255.8, 1.0, (736.0 - 105.0 + 1) / 759,
                    105, 736),
            CoreHit(c_gene, "PSAE001c01_017350", 600,
                    RepliconDB.ordered_replicon_name, 70, 3.2e-27,
                    94.2, 0.5, (506.0 - 226.0 + 1) / 600,
                    226, 506),
            CoreHit(c_gene, "PSAE001c01_018920", 776,
                    RepliconDB.ordered_replicon_name, 71, 6.1e-183,
                    608.4, 1.0, (606.0 - 48.0 + 1) / 776,
                    48, 606),
            CoreHit(c_gene, "PSAE001c01_031420", 658,
                    RepliconDB.ordered_replicon_name, 73, 1.8e-210,
                    699.3, 1.0, (614.0 - 55.0 + 1) / 658,
                    55, 614),
        ]
        self.assertListEqual(hits, report.hits)

        # a report which already holds hits must not be extracted again
        report = OrderedHMMReport(c_gene, report_path, self.cfg)
        report.hits = hits
        self.assertIsNone(report.extract())

        # drop the last line of the sequence index file
        index_file = self.cfg.sequence_db() + '.idx'
        with open(index_file, 'r') as idx_file:
            idx = idx_file.readlines()
        idx = idx[:-1]
        with open(index_file, 'w') as idx_file:
            idx_file.writelines(idx)
        report = OrderedHMMReport(c_gene, report_path, self.cfg)
        with self.assertRaises(MacsypyError) as ctx:
            with self.catch_log():
                report.extract()
        # The message check must be located after the assertRaises block:
        # inside the block it followed the raising call and was never executed.
        self.assertEqual(
            str(ctx.exception),
            "hit id 'NC_xxxxx_xx_056141' was not indexed, rebuild sequence 'test_base.fa' index"
        )
Esempio n. 17
0
    def test_get_best_hits_4_func(self):
        """Check get_best_hit_4_func with each selection criterion.

        Two loners of the same gene are compared successively on the default
        criterion, on 'i_eval' and on 'profile_coverage'. Finally an unknown
        criterion must raise a MacsypyError.
        """
        model = Model("foo/T2SS", 10)
        gene_name = "gspD"
        c_gene_gspd = CoreGene(self.models_location, gene_name, self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model, loner=True)

        def core_hit_pair(i_evals, scores, profile_coverages):
            # CoreHit(gene, id, hit_seq_len, replicon_name, position, i_eval,
            #         score, profile_coverage, sequence_coverage, begin, end)
            first = CoreHit(gene_gspd, "PSAE001c01_006940", 803, "PSAE001c01", 3450, i_evals[0],
                            scores[0], profile_coverages[0], (741.0 - 104.0 + 1) / 803, 104, 741)
            second = CoreHit(gene_gspd, "PSAE001c01_013980", 759, "PSAE001c01", 3450, i_evals[1],
                             scores[1], profile_coverages[1], (736.0 - 105.0 + 1) / 759, 105, 736)
            return first, second

        def loner_pair(first, second):
            # each loner gets the model hit of the other core hit as counterpart
            first_mhit = ModelHit(first, gene_gspd, GeneStatus.ACCESSORY)
            second_mhit = ModelHit(second, gene_gspd, GeneStatus.ACCESSORY)
            first_loner = Loner(first, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY,
                                counterpart=[second_mhit])
            second_loner = Loner(second, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY,
                                 counterpart=[first_mhit])
            return first_loner, second_loner

        ######################
        # based on the score #
        ######################
        # scores are 10 and 11: the second loner is expected
        loner_0, loner_1 = loner_pair(*core_hit_pair((1.2e-234, 3.7e-76), (10, 11), (1.0, 1.0)))
        best = get_best_hit_4_func(gene_name, [loner_0, loner_1])
        self.assertEqual(best, loner_1)

        #######################
        # based on the i_eval #
        #######################
        # i_eval are 10 and 11: the first loner is expected
        loner_0, loner_1 = loner_pair(*core_hit_pair((10, 11), (10, 10), (1.0, 1.0)))
        best = get_best_hit_4_func(gene_name, [loner_0, loner_1], key='i_eval')
        self.assertEqual(best, loner_0)

        #################################
        # based on the profile_coverage #
        #################################
        # profile coverages are 10 and 11: the second loner is expected
        loner_0, loner_1 = loner_pair(*core_hit_pair((10, 10), (10, 10), (10, 11)))
        best = get_best_hit_4_func(gene_name, [loner_0, loner_1], key='profile_coverage')
        self.assertEqual(best, loner_1)

        # bad criterion
        # NOTE(review): this calls get_best_hits, not get_best_hit_4_func
        # which is the function exercised above - confirm this is intended.
        with self.assertRaises(MacsypyError) as ctx:
            get_best_hits([loner_0, loner_1], key='nimportnaoik')
        self.assertEqual('The criterion for Hits comparison nimportnaoik does not exist or is not available.\n'
                         'It must be either "score", "i_eval" or "profile_coverage".', str(ctx.exception))
Esempio n. 18
0
    def test_save_extract(self):
        gene_name = "gspD"
        gene = CoreGene(self.model_location, gene_name, self.profile_factory)
        shutil.copy(
            self.find_data("hmm", gene_name + self.cfg.res_search_suffix()),
            self.cfg.working_dir())
        report_path = os.path.join(self.cfg.working_dir(),
                                   gene_name + self.cfg.res_search_suffix())
        report = GembaseHMMReport(gene, report_path, self.cfg)
        report.extract()
        report.save_extract()
        extract_filename = gene_name + self.cfg.res_extract_suffix()
        extract_path = os.path.join(self.cfg.working_dir(),
                                    self.cfg.hmmer_dir(), extract_filename)
        self.assertTrue(os.path.exists(extract_path))
        self.assertTrue(os.path.isfile(extract_path))

        hits = [
            CoreHit(gene, "NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141,
                    float(2e-236), float(779.2), float(1.000000),
                    (741.0 - 104.0 + 1) / 803, 104, 741),
            CoreHit(gene, "PSAE001c01_006940", 803, "PSAE001c01", 68,
                    float(1.2e-234), float(779.2), float(1.000000),
                    (741.0 - 104.0 + 1) / 803, 104, 741),
            CoreHit(gene, "PSAE001c01_013980", 759, "PSAE001c01", 69,
                    float(3.7e-76), float(255.8), float(1.000000),
                    (736.0 - 105.0 + 1) / 759, 105, 736),
            CoreHit(gene, "PSAE001c01_017350", 600, "PSAE001c01", 70,
                    float(3.2e-27), float(94.2), float(0.500000),
                    (506.0 - 226.0 + 1) / 600, 226, 506),
            CoreHit(gene, "PSAE001c01_018920", 776, "PSAE001c01", 71,
                    float(6.1e-183), float(608.4), float(1.000000),
                    (606.0 - 48.0 + 1) / 776, 48, 606),
            CoreHit(gene, "PSAE001c01_031420", 658, "PSAE001c01", 73,
                    float(1.8e-210), float(699.3), float(1.000000),
                    (614.0 - 55.0 + 1) / 658, 55, 614)
        ]

        expected_extract_path = os.path.join(self.cfg.working_dir(),
                                             'expected_extract')
        with open(expected_extract_path, 'w') as expected_extract:
            extract = """# gene: {name} extract from {path} hmm output
# profile length= {len_profile:d}
# i_evalue threshold= {i_evalue:.3f}
# coverage threshold= {cov:.3f}
# hit_id replicon_name position_hit hit_sequence_length gene_name gene_system i_eval score profile_coverage sequence_coverage begin end
""".format(name=gene.name,
            path=report_path,
            len_profile=len(gene.profile),
            i_evalue=self.cfg.i_evalue_sel(),
            cov=self.cfg.coverage_profile())
            expected_extract.write(extract)
            for h in hits:
                expected_extract.write(str(h))

        self.assertFileEqual(extract_path, expected_extract_path)
Esempio n. 19
0
    def test_best_hit(self):
        """Check GembaseHMMReport.best_hit before and after the extraction.

        best_hit must return None as long as extract has not been called,
        then the expected hit once the report has been extracted.
        """
        gene_name = 'gspD'
        core_gene = CoreGene(self.model_location, gene_name, self.profile_factory)

        search_output = gene_name + self.cfg.res_search_suffix()
        shutil.copy(self.find_data("hmm", search_output), self.cfg.working_dir())
        report_path = os.path.join(self.cfg.working_dir(),
                                   gene_name + self.cfg.res_search_suffix())
        report = GembaseHMMReport(core_gene, report_path, self.cfg)

        # nothing has been extracted yet
        self.assertIsNone(report.best_hit())

        report.extract()
        found = report.best_hit()
        expected = CoreHit(core_gene, "NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141,
                           2e-236, 779.2, 1.0, (741.0 - 104.0 + 1) / 803, 104, 741)
        self.assertEqual(expected, found)
Esempio n. 20
0
    def test_str(self):
        """Check the string representation of a CoreHit.

        The expected text is a tab separated line, terminated by a newline,
        built from the values used to create the hit.
        """
        core_gene = CoreGene(self.model_location, "gspD", self.profile_factory)

        hit_prop = dict(
            id="PSAE001c01_006940",
            hit_seq_len=803,
            replicon_name="PSAE001c01",
            position=694,
            i_eval=1.2e-234,
            score=779.2,
            gene_name=core_gene.name,
            profil_coverage=1.0,
            sequence_coverage=638.0,
            begin=104,
            end=741,
        )

        # constructor arguments, in the order expected by CoreHit after the gene
        ctor_keys = ('id', 'hit_seq_len', 'replicon_name', 'position', 'i_eval', 'score',
                     'profil_coverage', 'sequence_coverage', 'begin', 'end')
        hit = CoreHit(core_gene, *[hit_prop[key] for key in ctor_keys])

        template = ("{id}\t{replicon_name}\t{position:d}\t{hit_seq_len:d}\t{gene_name}\t{i_eval:.3e}"
                    "\t{score:.3f}\t{profil_coverage:.3f}\t{sequence_coverage:.3f}\t{begin:d}\t{end:d}\n")
        expected = template.format(**hit_prop)

        self.assertEqual(expected, str(hit))
Esempio n. 21
0
    def test_str(self):
        """str(report) is the comment header followed by one line per extracted hit."""
        name = 'gspD'
        core_gene = CoreGene(self.model_location, name, self.profile_factory)

        # Stage the stored hmm search output for this gene in the working dir.
        report_file = name + self.cfg.res_search_suffix()
        shutil.copy(self.find_data("hmm", report_file), self.cfg.working_dir())
        report_path = os.path.join(self.cfg.working_dir(), report_file)

        hmm_report = GembaseHMMReport(core_gene, report_path, self.cfg)
        hmm_report.extract()

        # (hit_id, seq_len, replicon, position, i_eval, score, profile_cov, begin, end)
        rows = (
            ("NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141, 2e-236, 779.2, 1.0, 104, 741),
            ("PSAE001c01_006940", 803, "PSAE001c01", 68, 1.2e-234, 779.2, 1.0, 104, 741),
            ("PSAE001c01_013980", 759, "PSAE001c01", 69, 3.7e-76, 255.8, 1.0, 105, 736),
            ("PSAE001c01_017350", 600, "PSAE001c01", 70, 3.2e-27, 94.2, 0.5, 226, 506),
            ("PSAE001c01_018920", 776, "PSAE001c01", 71, 6.1e-183, 608.4, 1.0, 48, 606),
            ("PSAE001c01_031420", 658, "PSAE001c01", 73, 1.8e-210, 699.3, 1.0, 55, 614),
        )
        # Sequence coverage is the matched span divided by the sequence length.
        expected_hits = [
            CoreHit(core_gene, hit_id, seq_len, replicon, pos, i_eval, score, prof_cov,
                    (end - begin + 1) / seq_len, begin, end)
            for hit_id, seq_len, replicon, pos, i_eval, score, prof_cov, begin, end in rows
        ]

        header = [
            f"# gene: {core_gene.name} extract from {report_path} hmm output\n",
            f"# profile length= {len(core_gene.profile):d}\n",
            f"# i_evalue threshold= {self.cfg.i_evalue_sel():.3f}\n",
            f"# coverage threshold= {self.cfg.coverage_profile():.3f}\n",
            "# hit_id replicon_name position_hit hit_sequence_length gene_name gene_system i_eval score "
            "profile_coverage sequence_coverage begin end\n",
        ]
        expected = "".join(header + [str(hit) for hit in expected_hits])
        self.assertMultiLineEqual(str(hmm_report), expected)
Esempio n. 22
0
    def test_extract(self):
        """GeneralHMMReport.extract() yields the six expected hits, all on replicon "Unordered"."""
        name = "gspD"
        core_gene = CoreGene(self.model_location, name, self.profile_factory)

        # Stage the stored hmm search output for this gene in the working dir.
        report_file = name + self.cfg.res_search_suffix()
        shutil.copy(self.find_data("hmm", report_file), self.cfg.working_dir())
        report_path = os.path.join(self.cfg.working_dir(), report_file)

        hmm_report = GeneralHMMReport(core_gene, report_path, self.cfg)
        hmm_report.extract()
        self.assertEqual(len(hmm_report.hits), 6)

        # CoreHit(gene, hit_id, hit_seq_length, replicon_name, pos_hit, i_eval, score,
        #         profile_coverage, sequence_coverage, begin_match, end_match)
        # rows: (hit_id, seq_len, position, i_eval, score, profile_cov, begin, end)
        rows = (
            ("NC_xxxxx_xx_056141", 803, 141, 2e-236, 779.2, 1.0, 104, 741),
            ("PSAE001c01_006940", 803, 68, 1.2e-234, 779.2, 1.0, 104, 741),
            ("PSAE001c01_013980", 759, 69, 3.7e-76, 255.8, 1.0, 105, 736),
            ("PSAE001c01_017350", 600, 70, 3.2e-27, 94.2, 0.5, 226, 506),
            ("PSAE001c01_018920", 776, 71, 6.1e-183, 608.4, 1.0, 48, 606),
            ("PSAE001c01_031420", 658, 73, 1.8e-210, 699.3, 1.0, 55, 614),
        )
        expected_hits = [
            CoreHit(core_gene, hit_id, seq_len, "Unordered", pos, i_eval, score, prof_cov,
                    (end - begin + 1) / seq_len, begin, end)
            for hit_id, seq_len, pos, i_eval, score, prof_cov, begin, end in rows
        ]
        self.assertListEqual(expected_hits, hmm_report.hits)

        # A report whose hits are already set returns None from extract().
        prefilled = GeneralHMMReport(core_gene, report_path, self.cfg)
        prefilled.hits = expected_hits
        self.assertIsNone(prefilled.extract())
Esempio n. 23
0
    def test_sort_hits_by_status(self):
        """sort_hits_by_status() splits hits into mandatory/accessory/neutral/forbidden lists."""
        match_maker = OrderedMatchMaker(self.model, self.cfg.redundancy_penalty())

        def gene_names(hits):
            return [hit.gene.name for hit in hits]

        # Expected groups, in the order the method returns them:
        # mandatory, accessory, neutral, forbidden.
        plain_groups = (
            [self.m_hits['mh_sctn'], self.m_hits['mh_sctj']],
            [self.m_hits['mh_gspd']],
            [self.m_hits['mh_toto']],
            [self.m_hits['mh_abc']],
        )
        mandatory, accessory, neutral, forbidden = match_maker.sort_hits_by_status(
            [hit for group in plain_groups for hit in group])
        for wanted, got in zip(plain_groups, (mandatory, accessory, neutral, forbidden)):
            self.assertListEqual(gene_names(wanted), gene_names(got))

        # Same check, this time with hits matching exchangeable genes.
        exchangeable_groups = (
            [self.m_hits['mh_sctn_flg'], self.m_hits['mh_sctj_flg']],
            [self.m_hits['mh_gspd_ex']],
            [self.m_hits['mh_toto_ex']],
            [self.m_hits['mh_abc_ex']],
        )
        mandatory, accessory, neutral, forbidden = match_maker.sort_hits_by_status(
            [hit for group in exchangeable_groups for hit in group])
        sorted_exchangeable = (mandatory, accessory, neutral, forbidden)
        for wanted, got in zip(exchangeable_groups, sorted_exchangeable):
            self.assertListEqual(gene_names(wanted), gene_names(got))

        # gene_ref must be the model gene: alternate_of() on the exchangeable
        # results has to give back the names of the plain genes.
        for wanted, got in zip(plain_groups, sorted_exchangeable):
            self.assertListEqual(
                gene_names(wanted),
                [hit.gene_ref.alternate_of().name for hit in got])

        # A hit whose gene belongs to another model must be rejected.
        other_model = Model("foo/model_B", 10)
        flie_core = CoreGene(self.model_location, "fliE", self.profile_factory)
        flie_core_hit = CoreHit(flie_core, "hit_fliE", 803, "replicon_id", 1,
                                1.0, 1.0, 1.0, 1.0, 10, 20)
        flie_gene = ModelGene(flie_core, other_model)
        flie_hit = ModelHit(flie_core_hit, flie_gene, GeneStatus.NEUTRAL)
        with self.assertRaises(MacsypyError) as ctx:
            with self.catch_log():
                match_maker.sort_hits_by_status([flie_hit])
        self.assertEqual(str(ctx.exception),
                         "Gene 'fliE' not found in model 'foo/model_B'")
Esempio n. 24
0
    def test_LikelySystemSerializer_txt(self):
        model = Model("foo/FOO", 10)
        c_gene_gspd = CoreGene(self.model_location, "gspD",
                               self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model)
        model.add_mandatory_gene(gene_gspd)
        c_gene_sctj = CoreGene(self.model_location, "sctJ",
                               self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model)
        model.add_accessory_gene(gene_sctj)
        c_gene_sctn = CoreGene(self.model_location, "sctN",
                               self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model)
        model.add_accessory_gene(gene_sctn)
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model)
        model.add_forbidden_gene(gene_abc)

        hit_1 = CoreHit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.MANDATORY)
        hit_2 = CoreHit(c_gene_sctj, "hit_2", 803, "replicon_id", 2, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_2 = ModelHit(hit_2, gene_sctj, GeneStatus.ACCESSORY)
        hit_3 = CoreHit(c_gene_sctn, "hit_3", 803, "replicon_id", 3, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_3 = ModelHit(hit_3, gene_sctn, GeneStatus.ACCESSORY)
        hit_4 = CoreHit(c_gene_abc, "hit_4", 803, "replicon_id", 4, 1.0, 1.0,
                        1.0, 1.0, 10, 20)
        v_hit_4 = ModelHit(hit_4, gene_abc, GeneStatus.FORBIDDEN)

        ls_1 = LikelySystem(model, [v_hit_1], [v_hit_2, v_hit_3], [],
                            [v_hit_4])
        hit_multi_sys_tracker = HitSystemTracker([ls_1])
        ser = TxtLikelySystemSerializer()

        txt = ser.serialize(ls_1, hit_multi_sys_tracker)
        expected_txt = """This replicon contains genetic materials needed for system foo/FOO
WARNING there quorum is reached but there is also some forbidden genes.

system id = replicon_id_FOO_1
model = foo/FOO
replicon = replicon_id
hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 2), ('hit_3', 'sctN', 3), ('hit_4', 'abc', 4)]
wholeness = 1.000

mandatory genes:
\t- gspD: 1 (gspD)

accessory genes:
\t- sctJ: 1 (sctJ)
\t- sctN: 1 (sctN)

neutral genes:

forbidden genes:
\t- abc: 1 (abc)

Use ordered replicon to have better prediction.
"""
        self.assertEqual(txt, expected_txt)
Esempio n. 25
0
    def test_SolutionSerializer_tsv(self):
        """Serialize a two-system Solution to TSV and compare it with a hand-built table.

        The fixture builds model B (two mandatory, two accessory genes) and
        model A, whose four genes each carry an Exchangeable made from one of
        model B's core genes.  System A is made of three clusters (the third
        one holds a Loner), system B of a single cluster.
        """
        model_name = 'foo'
        model_location = ModelLocation(
            path=os.path.join(self.cfg.models_dir()[0], model_name))

        ###########
        # Model B #
        ###########
        model_B = Model("foo/B", 10)
        c_gene_sctn_flg = CoreGene(model_location, "sctN_FLG",
                                   self.profile_factory)
        gene_sctn_flg = ModelGene(c_gene_sctn_flg, model_B)
        c_gene_sctj_flg = CoreGene(model_location, "sctJ_FLG",
                                   self.profile_factory)
        gene_sctj_flg = ModelGene(c_gene_sctj_flg, model_B)
        c_gene_flgB = CoreGene(model_location, "flgB", self.profile_factory)
        gene_flgB = ModelGene(c_gene_flgB, model_B)
        c_gene_tadZ = CoreGene(model_location, "tadZ", self.profile_factory)
        gene_tadZ = ModelGene(c_gene_tadZ, model_B)

        model_B.add_mandatory_gene(gene_sctn_flg)
        model_B.add_mandatory_gene(gene_sctj_flg)
        model_B.add_accessory_gene(gene_flgB)
        model_B.add_accessory_gene(gene_tadZ)

        ###########
        # Model A #
        ###########
        # Each model A gene gets an exchangeable built on a model B core gene.
        model_A = Model("foo/A", 10)
        c_gene_sctn = CoreGene(model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model_A)
        gene_sctn_hom = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_hom)

        c_gene_sctj = CoreGene(model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model_A)
        gene_sctj_an = Exchangeable(c_gene_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctj_an)

        c_gene_gspd = CoreGene(model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model_A)
        gene_gspd_an = Exchangeable(c_gene_flgB, gene_gspd)
        gene_gspd.add_exchangeable(gene_gspd_an)

        # abc is the only gene declared as a loner.
        c_gene_abc = CoreGene(model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model_A, loner=True)
        gene_abc_ho = Exchangeable(c_gene_tadZ, gene_abc)
        gene_abc.add_exchangeable(gene_abc_ho)

        model_A.add_mandatory_gene(gene_sctn)
        model_A.add_mandatory_gene(gene_sctj)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_accessory_gene(gene_abc)

        #       CoreHit(gene, hit_id, hit_seq_length, replicon_name, position, i_eval, score,
        #         profile_coverage, sequence_coverage, begin_match, end_match
        #                                                           pos      score
        h_sctj = CoreHit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        mh_sctj = ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY)
        h_sctn = CoreHit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 2, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        mh_sctn = ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)
        h_gspd = CoreHit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 3, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        mh_gspd = ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)

        h_sctj_flg = CoreHit(c_gene_sctj_flg, "hit_sctj_flg", 803,
                             "replicon_id", 10, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_flgB = CoreHit(c_gene_flgB, "hit_flgB", 803, "replicon_id", 11, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        h_abc = CoreHit(c_gene_abc, "hit_abc", 803, "replicon_id", 20, 1.0,
                        1.0, 1.0, 1.0, 10, 20)
        h_abc2 = CoreHit(c_gene_abc, "hit_abc2", 803, "replicon_id", 50, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        h_tadZ = CoreHit(c_gene_tadZ, "hit_tadZ", 803, "replicon_id", 40, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        mh_sctj_flg = ModelHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY)
        mh_flgB = ModelHit(h_flgB, gene_flgB, GeneStatus.ACCESSORY)
        # NOTE(review): mh_abc is not referenced again in this test.
        mh_abc = ModelHit(h_abc, gene_abc, GeneStatus.ACCESSORY)
        mh_abc2 = ModelHit(h_abc2, gene_abc, GeneStatus.ACCESSORY)
        mh_tadZ = ModelHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY)

        # The quorum thresholds are set directly on the private attributes.
        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        c1 = Cluster([mh_sctj, mh_sctn, mh_gspd], model_A, self.hit_weights)
        c2 = Cluster([mh_sctj, mh_sctn], model_A, self.hit_weights)
        # Third cluster: a single Loner on hit_abc with hit_abc2 as counterpart.
        c3 = Cluster([
            Loner(h_abc,
                  gene_ref=gene_abc,
                  gene_status=GeneStatus.ACCESSORY,
                  counterpart=[mh_abc2])
        ], model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        c5 = Cluster([mh_sctj_flg, mh_tadZ, mh_flgB], model_B,
                     self.hit_weights)

        sys_A = System(model_A, [c1, c2, c3], self.cfg.redundancy_penalty())
        # score =               2.5, 2 , 0.35 = 4.85 - (2 * 1.5) = 1.85

        # Fixed ids keep the expected table independent of generated ids.
        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [c5], self.cfg.redundancy_penalty())
        # score =                2.0
        sys_B.id = "sys_id_B"

        sol = Solution([sys_A, sys_B])
        sol_id = '12'

        hit_multi_sys_tracker = HitSystemTracker([sys_A, sys_B])
        sol_serializer = TsvSolutionSerializer()

        # Expected output: one tab-separated row per hit, each row starting
        # with the solution id; every system is followed by an empty line.
        sol_tsv = '\t'.join([
            sol_id, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A',
            'sys_id_A', '2', '1', '1.000', '1.850', '2', 'sctJ', 'mandatory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctn', 'sctN', '2', 'foo/A',
            'sys_id_A', '2', '1', '1.000', '1.850', '2', 'sctN', 'mandatory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_gspd', 'gspD', '3', 'foo/A',
            'sys_id_A', '2', '1', '1.000', '1.850', '2', 'gspD', 'accessory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        # Rows of the second cluster: same hits, ninth column is '2'.
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A',
            'sys_id_A', '2', '2', '1.000', '1.850', '2', 'sctJ', 'mandatory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctn', 'sctN', '2', 'foo/A',
            'sys_id_A', '2', '2', '1.000', '1.850', '2', 'sctN', 'mandatory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        # Loner row: ninth column is '-1' and 'hit_abc2' fills the
        # second-to-last column.
        # NOTE(review): presumably locus number and counterpart columns - confirm.
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_abc', 'abc', '20', 'foo/A', 'sys_id_A',
            '2', '-1', '1.000', '1.850', '2', 'abc', 'accessory', '803', '1.0',
            '1.000', '1.000', '1.000', '10', '20', 'hit_abc2', ''
        ])
        sol_tsv += "\n"
        sol_tsv += "\n"
        # Rows of system B.
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_sctj_flg', 'sctJ_FLG', '10', 'foo/B',
            'sys_id_B', '1', '1', '0.750', '2.000', '1', 'sctJ_FLG',
            'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20',
            '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_flgB', 'flgB', '11', 'foo/B',
            'sys_id_B', '1', '1', '0.750', '2.000', '1', 'flgB', 'accessory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += '\t'.join([
            sol_id, 'replicon_id', 'hit_tadZ', 'tadZ', '40', 'foo/B',
            'sys_id_B', '1', '1', '0.750', '2.000', '1', 'tadZ', 'accessory',
            '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '', ''
        ])
        sol_tsv += "\n"
        sol_tsv += "\n"
        ser = sol_serializer.serialize(sol, sol_id, hit_multi_sys_tracker)
        # Show the full diff if the long strings differ.
        self.maxDiff = None
        self.assertEqual(ser, sol_tsv)
Esempio n. 26
0
    def test_parse_hmm_body(self):
        def make_hmm_group(hmm_string):
            hmm_file = StringIO(hmm_string)
            hmm_hits = (
                x[1] for x in groupby(hmm_file, lambda l: l.startswith('>>')))
            header = next(hmm_hits)
            body = next(hmm_hits)
            return body

        gene_name = "gspD"
        c_gene = CoreGene(self.model_location, gene_name, self.profile_factory)
        report_path = os.path.join(self.cfg.working_dir(),
                                   gene_name + self.cfg.res_search_suffix())
        report = GembaseHMMReport(c_gene, report_path, self.cfg)

        # with one significant hit
        hmm = """>> NC_xxxxx_xx_056141  C ATG TAA 6260390 6261757 Valid PA5567 1368 _NP_254254.1_ PA5567 1 6260390 6261757 | tRNA modific
   #    score  bias  c-Evalue  i-Evalue hmmfrom  hmm to    alifrom  ali to    envfrom  env to     acc
 ---   ------ ----- --------- --------- ------- -------    ------- -------    ------- -------    ----
   1 !  779.2   5.5  1.4e-237    2e-236       1     596 []     104     741 ..     104     741 .. 0.93

  Alignments for each domain:
"""
        body = make_hmm_group(hmm)
        hits = report._parse_hmm_body('NC_xxxxx_xx_056141', 596, 803, 0.5,
                                      'NC_xxxxx_xx', 141, 0.5, body)
        expected_hits = [
            CoreHit(c_gene, "NC_xxxxx_xx_056141", 803, "NC_xxxxx_xx", 141,
                    float(2e-236), float(779.2), float(1.000000),
                    (741.0 - 104.0 + 1) / 803, 104, 741)
        ]
        self.assertListEqual(hits, expected_hits)
        # with no significant hit
        hmm = """>> PSAE001c01_051090  C ATG TGA 5675714 5677858 Valid pilQ 2145 _PA5040_NP_253727.1_ PA5040 1 5675714 5677858 | type 4 f
   #    score  bias  c-Evalue  i-Evalue hmmfrom  hmm to    alifrom  ali to    envfrom  env to     acc
 ---   ------ ----- --------- --------- ------- -------    ------- -------    ------- -------    ----
   1 !   27.1   0.2   6.3e-10   6.6e-07       1     120 [.     286     402 ..     286     407 .. 0.86
   2 !  186.2   0.1   4.2e-58   4.3e-55     294     590 ..     405     709 ..     397     712 .. 0.84

  Alignments for each domain:
"""
        body = make_hmm_group(hmm)
        hits = report._parse_hmm_body('NC_xxxxx_xx_056141', 596, 803, 0.5,
                                      'NC_xxxxx_xx', 141, 0.5, body)
        expected_hits = []
        self.assertListEqual(hits, expected_hits)

        # with no hit
        hmm = """>> PSAE001c01_051090  C ATG TGA 5675714 5677858 Valid pilQ 2145 _PA5040_NP_253727.1_ PA5040 1 5675714 5677858 | type 4 f
        bla bla
        """
        body = make_hmm_group(hmm)
        hits = report._parse_hmm_body('NC_xxxxx_xx_056141', 596, 803, 0.5,
                                      'NC_xxxxx_xx', 141, 0.5, body)
        expected_hits = []
        self.assertListEqual(hits, expected_hits)

        # with invalid hmm
        hmm = """>> NC_xxxxx_xx_056141  C ATG TAA 6260390 6261757 Valid PA5567 1368 _NP_254254.1_ PA5567 1 6260390 6261757 | tRNA modific
   #    score  bias  c-Evalue  i-Evalue hmmfrom  hmm to    alifrom  ali to    envfrom  env to     acc
 ---   ------ ----- --------- --------- ------- -------    ------- -------    ------- -------    ----
   1 !  779.2   5.5  1.4e-237    foo       1     596 []     104     741 ..     104     741 .. 0.93

  Alignments for each domain:
"""
        body = make_hmm_group(hmm)
        with self.assertRaises(ValueError) as ctx:
            report._parse_hmm_body('NC_xxxxx_xx_056141', 596, 803, 0.5,
                                   'NC_xxxxx_xx', 141, 0.5, body)
        self.assertEqual(
            str(ctx.exception),
            """Invalid line to parse :   1 !  779.2   5.5  1.4e-237    foo       1     596 []     104     741 ..     104     741 .. 0.93
:could not convert string to float: 'foo'""")
Esempio n. 27
0
    def test_SpecialHitSerializer_tsv(self):
        """TsvSpecialHitSerializer writes a header row and one row per Loner."""
        args = argparse.Namespace(
            sequence_db=self.find_data("base", "test_1.fasta"),
            db_type='gembase',
            models_dir=self.find_data('models'),
        )
        cfg = Config(MacsyDefaults(), args)

        models_location = ModelLocation(path=os.path.join(args.models_dir, 'foo'))

        # ProfileFactory behaves like a singleton, so it is rebuilt here:
        # otherwise its configuration leaks between tests (for instance
        # search_genes would get a profile without hmmer_exe).
        profile_factory = ProfileFactory(cfg)
        model = Model("foo/T2SS", 10)

        gspd_core = CoreGene(models_location, "gspD", profile_factory)
        gspd_gene = ModelGene(gspd_core, model, loner=True)

        sctj_core = CoreGene(models_location, "sctJ", profile_factory)
        sctj_gene = ModelGene(sctj_core, model)

        abc_core = CoreGene(models_location, "abc", profile_factory)
        abc_gene = ModelGene(abc_core, model)

        model.add_mandatory_gene(gspd_gene)
        model.add_accessory_gene(sctj_gene)
        model.add_accessory_gene(abc_gene)

        abc_core_hit = CoreHit(abc_core, "hit_abc", 803, "replicon_id", 3,
                               1.0, 1.0, 1.0, 1.0, 10, 20)
        sctj_core_hit = CoreHit(sctj_core, "hit_sctj", 803, "replicon_id", 4,
                                1.0, 1.0, 1.0, 1.0, 10, 20)
        # The two gspD hits differ by position (20 / 30) and score (2.0 / 3.0).
        gspd_core_hit_1 = CoreHit(gspd_core, "hit_gspd1", 803, "replicon_id", 20,
                                  1.0, 2.0, 1.0, 1.0, 10, 20)
        gspd_core_hit_2 = CoreHit(gspd_core, "hit_gspd2", 803, "replicon_id", 30,
                                  1.0, 3.0, 1.0, 1.0, 10, 20)
        # The abc and sctJ model hits are built but not serialized.
        abc_hit = ModelHit(abc_core_hit, abc_gene, GeneStatus.ACCESSORY)
        sctj_hit = ModelHit(sctj_core_hit, sctj_gene, GeneStatus.ACCESSORY)
        gspd_hit_1 = ModelHit(gspd_core_hit_1, gspd_gene, GeneStatus.MANDATORY)
        gspd_hit_2 = ModelHit(gspd_core_hit_2, gspd_gene, GeneStatus.MANDATORY)
        # Each loner names the other gspD hit as its counterpart.
        loner_1 = Loner(gspd_hit_1, counterpart=[gspd_hit_2])
        loner_2 = Loner(gspd_hit_2, counterpart=[gspd_hit_1])

        serializer = TsvSpecialHitSerializer()
        txt = serializer.serialize([loner_1, loner_2])

        expected_rows = (
            ('replicon', 'model_fqn', 'function', 'gene_name', 'hit_id',
             'hit_pos', 'hit_status', 'hit_seq_len', 'hit_i_eval', 'hit_score',
             'hit_profile_cov', 'hit_seq_cov', 'hit_begin_match',
             'hit_end_match'),
            ('replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd1', '20',
             'mandatory', '803', '1.000e+00', '2.000', '1.000', '1.000', '10',
             '20'),
            ('replicon_id', 'foo/T2SS', 'gspD', 'gspD', 'hit_gspd2', '30',
             'mandatory', '803', '1.000e+00', '3.000', '1.000', '1.000', '10',
             '20'),
        )
        expected_txt = "".join("\t".join(row) + "\n" for row in expected_rows)
        self.maxDiff = None
        self.assertEqual(txt, expected_txt)
Esempio n. 28
0
    def test_SystemSerializer_str(self):
        """Check the text produced by TxtSystemSerializer for system A.

        Two systems are built on the same replicon: system A (model foo/A,
        two clusters) and system B (model foo/B, one cluster).  The gspD core
        hit is used in clusters of both systems, and the expected text for
        system A tags gspD with "[sys_id_B]".
        """
        model_name = 'foo'
        model_location = ModelLocation(
            path=os.path.join(self.cfg.models_dir()[0], model_name))
        model_A = Model("foo/A", 10)
        model_B = Model("foo/B", 10)

        # Genes attached to model B.
        c_gene_sctn_flg = CoreGene(model_location, "sctN_FLG",
                                   self.profile_factory)
        gene_sctn_flg = ModelGene(c_gene_sctn_flg, model_B)
        c_gene_sctj_flg = CoreGene(model_location, "sctJ_FLG",
                                   self.profile_factory)
        gene_sctj_flg = ModelGene(c_gene_sctj_flg, model_B)
        # flgB is only used below as an exchangeable of gspD (no ModelGene).
        c_gene_flgB = CoreGene(model_location, "flgB", self.profile_factory)
        c_gene_tadZ = CoreGene(model_location, "tadZ", self.profile_factory)
        gene_tadZ = ModelGene(c_gene_tadZ, model_B)

        # Genes attached to model A, each with one exchangeable.
        c_gene_sctn = CoreGene(model_location, "sctN", self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, model_A)
        gene_sctn_hom = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_hom)

        c_gene_sctj = CoreGene(model_location, "sctJ", self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, model_A)
        gene_sctj_an = Exchangeable(c_gene_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctj_an)

        c_gene_gspd = CoreGene(model_location, "gspD", self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, model_A)
        gene_gspd_an = Exchangeable(c_gene_flgB, gene_gspd)
        gene_gspd.add_exchangeable(gene_gspd_an)

        c_gene_abc = CoreGene(model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, model_A)
        gene_abc_ho = Exchangeable(c_gene_tadZ, gene_abc)
        gene_abc.add_exchangeable(gene_abc_ho)

        model_A.add_mandatory_gene(gene_sctn)
        model_A.add_mandatory_gene(gene_sctj)
        model_A.add_accessory_gene(gene_gspd)
        model_A.add_forbidden_gene(gene_abc)

        model_B.add_mandatory_gene(gene_sctn_flg)
        model_B.add_mandatory_gene(gene_sctj_flg)
        # NOTE(review): gene_gspd was created for model_A but is also
        # registered as accessory in model_B - confirm this is intended.
        model_B.add_accessory_gene(gene_gspd)
        model_B.add_accessory_gene(gene_tadZ)

        # Every core hit sits at position 1 of the same replicon.
        h_sctj = CoreHit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        h_sctn = CoreHit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)
        h_gspd = CoreHit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)

        h_sctj_flg = CoreHit(c_gene_sctj_flg, "hit_sctj_flg", 803,
                             "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20)
        h_tadZ = CoreHit(c_gene_tadZ, "hit_tadZ", 803, "replicon_id", 1, 1.0,
                         1.0, 1.0, 1.0, 10, 20)

        # The quorum thresholds are set directly on the private attributes.
        model_A._min_mandatory_genes_required = 2
        model_A._min_genes_required = 2
        c1 = Cluster([
            ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
            ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY),
            ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)
        ], model_A, self.hit_weights)

        c2 = Cluster([
            ModelHit(h_sctj, gene_sctj, GeneStatus.MANDATORY),
            ModelHit(h_sctn, gene_sctn, GeneStatus.MANDATORY)
        ], model_A, self.hit_weights)

        model_B._min_mandatory_genes_required = 1
        model_B._min_genes_required = 2
        # This cluster wraps the same h_gspd core hit as c1.
        c3 = Cluster([
            ModelHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY),
            ModelHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY),
            ModelHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY)
        ], model_B, self.hit_weights)

        # Fixed ids keep the expected text independent of generated ids.
        sys_A = System(model_A, [c1, c2], self.cfg.redundancy_penalty())
        sys_A.id = "sys_id_A"
        sys_B = System(model_B, [c3], self.cfg.redundancy_penalty())
        sys_B.id = "sys_id_B"
        hit_multi_sys_tracker = HitSystemTracker([sys_A, sys_B])
        system_serializer = TxtSystemSerializer()

        # Only system A is serialized and compared.
        sys_str = f"""system id = {sys_A.id}
model = foo/A
replicon = replicon_id
clusters = [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1), ('hit_gspd', 'gspD', 1)], [('hit_sctj', 'sctJ', 1), ('hit_sctn', 'sctN', 1)]
occ = 2
wholeness = 1.000
loci nb = 2
score = 1.500

mandatory genes:
\t- sctN: 2 (sctN, sctN)
\t- sctJ: 2 (sctJ, sctJ)

accessory genes:
\t- gspD: 1 (gspD [sys_id_B])

neutral genes:
"""
        self.assertEqual(
            sys_str, system_serializer.serialize(sys_A, hit_multi_sys_tracker))
Esempio n. 29
0
    def test_filter(self):
        """Model.filter() keeps hits on the model's genes and exchangeables, drops the others."""
        model = Model("foo/bar", 10)
        other_model = Model("foo/buz", 10)

        def core(name):
            return CoreGene(self.model_location, name, self.profile_factory)

        def fake_hit(gene):
            # Only the gene matters here; all other hit fields are dummies.
            return CoreHit(gene, f"PSAE001c01_{gene.name}",
                           1, "PSAE001c01", 1, 1.0, 1.0, 1.0, 1.0, 1, 2)

        # mandatory gene with one exchangeable
        sctJ_FLG = ModelGene(core('sctJ_FLG'), model)
        model.add_mandatory_gene(sctJ_FLG)
        sctj = Exchangeable(core('sctJ'), sctJ_FLG)
        sctJ_FLG.add_exchangeable(sctj)

        # accessory gene with one exchangeable
        sctN_FLG = ModelGene(core('sctN_FLG'), model)
        model.add_accessory_gene(sctN_FLG)
        sctn = Exchangeable(core('sctN'), sctN_FLG)
        sctN_FLG.add_exchangeable(sctn)

        # forbidden gene
        sctC = ModelGene(core('sctC'), model)
        model.add_forbidden_gene(sctC)

        # neutral gene with one exchangeable
        toto = ModelGene(core('toto'), model)
        model.add_neutral_gene(toto)
        totote = Exchangeable(core('totote'), toto)
        toto.add_exchangeable(totote)

        # gene and exchangeable attached to the other model
        gspd = ModelGene(core('gspD'), other_model)
        tadz = Exchangeable(core('tadZ'), gspd)
        gspd.add_exchangeable(tadz)

        hit_to_keep = [fake_hit(gene)
                       for gene in (sctJ_FLG, sctN_FLG, sctC, toto, totote)]
        hit_to_filter_out = [fake_hit(gene) for gene in (gspd, tadz)]

        filtered_hits = model.filter(hit_to_keep + hit_to_filter_out)

        self.assertListEqual(sorted(hit_to_keep), sorted(filtered_hits))
Esempio n. 30
0
    def setUp(self) -> None:
        """
        Build the configuration, model, genes and hits shared by the tests.

        Attributes set on the test case:

        * ``self.cfg``: Config built from the test sequence db and models dir.
        * ``self.model_name``, ``self.model_location``, ``self.profile_factory``:
          what is needed to create core genes of the ``foo`` model family.
        * ``self.model``: model ``foo/model_A`` with sctN and sctJ mandatory,
          gspD accessory, toto neutral and abc forbidden; each of these genes
          gets one exchangeable.
        * ``self.c_hits``: one CoreHit per core gene, keyed ``ch_<gene>``.
        * ``self.m_hits``: one ModelHit per core hit, keyed ``mh_<gene>``.
        """
        # Minimal command-line-like namespace used to build the configuration.
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        self.cfg = Config(MacsyDefaults(), args)

        self.model_name = 'foo'
        self.model_location = ModelLocation(
            path=os.path.join(args.models_dir, self.model_name))
        self.profile_factory = ProfileFactory(self.cfg)

        self.model = Model("foo/model_A", 10)

        # sctN, exchangeable with sctN_FLG
        c_gene_sctn = CoreGene(self.model_location, "sctN",
                               self.profile_factory)
        gene_sctn = ModelGene(c_gene_sctn, self.model)

        c_gene_sctn_flg = CoreGene(self.model_location, "sctN_FLG",
                                   self.profile_factory)
        gene_sctn_flg = Exchangeable(c_gene_sctn_flg, gene_sctn)
        gene_sctn.add_exchangeable(gene_sctn_flg)

        # sctJ, exchangeable with sctJ_FLG
        c_gene_sctj = CoreGene(self.model_location, "sctJ",
                               self.profile_factory)
        gene_sctj = ModelGene(c_gene_sctj, self.model)
        c_gene_sctj_flg = CoreGene(self.model_location, "sctJ_FLG",
                                   self.profile_factory)
        gene_sctj_flg = Exchangeable(c_gene_sctj_flg, gene_sctj)
        gene_sctj.add_exchangeable(gene_sctj_flg)

        # gspD, exchangeable with flgB
        c_gene_gspd = CoreGene(self.model_location, "gspD",
                               self.profile_factory)
        gene_gspd = ModelGene(c_gene_gspd, self.model)

        c_gene_flgb = CoreGene(self.model_location, "flgB",
                               self.profile_factory)
        gene_gspd_ex = Exchangeable(c_gene_flgb, gene_gspd)
        gene_gspd.add_exchangeable(gene_gspd_ex)

        # abc, exchangeable with tadZ
        c_gene_abc = CoreGene(self.model_location, "abc", self.profile_factory)
        gene_abc = ModelGene(c_gene_abc, self.model)
        c_gene_tadz = CoreGene(self.model_location, "tadZ",
                               self.profile_factory)
        gene_abc_ex = Exchangeable(c_gene_tadz, gene_abc)
        gene_abc.add_exchangeable(gene_abc_ex)

        # toto, exchangeable with totote
        c_gene_toto = CoreGene(self.model_location, "toto",
                               self.profile_factory)
        gene_toto = ModelGene(c_gene_toto, self.model)
        c_gene_totote = CoreGene(self.model_location, "totote",
                                 self.profile_factory)
        gene_toto_ex = Exchangeable(c_gene_totote, gene_toto)
        gene_toto.add_exchangeable(gene_toto_ex)

        # Register each reference gene in the model with its status.
        self.model.add_mandatory_gene(gene_sctn)
        self.model.add_mandatory_gene(gene_sctj)
        self.model.add_accessory_gene(gene_gspd)
        self.model.add_neutral_gene(gene_toto)
        self.model.add_forbidden_gene(gene_abc)

        # One core hit per core gene; apart from the gene and the hit id,
        # all hits are created with the same arguments.
        # Note that the ids of the '*_ex' entries end with '_an' / '_ho',
        # not '_ex'.
        self.c_hits = {
            'ch_sctj':
            CoreHit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0, 1.0,
                    1.0, 1.0, 10, 20),
            'ch_sctj_flg':
            CoreHit(c_gene_sctj_flg, "hit_sctj_flg", 803, "replicon_id", 1,
                    1.0, 1.0, 1.0, 1.0, 10, 20),
            'ch_sctn':
            CoreHit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 1, 1.0, 1.0,
                    1.0, 1.0, 10, 20),
            'ch_sctn_flg':
            CoreHit(c_gene_sctn_flg, "hit_sctn_flg", 803, "replicon_id", 1,
                    1.0, 1.0, 1.0, 1.0, 10, 20),
            'ch_gspd':
            CoreHit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 1, 1.0, 1.0,
                    1.0, 1.0, 10, 20),
            'ch_gspd_ex':
            CoreHit(c_gene_flgb, "hit_gspd_an", 803, "replicon_id", 1, 1.0,
                    1.0, 1.0, 1.0, 10, 20),
            'ch_abc':
            CoreHit(c_gene_abc, "hit_abc", 803, "replicon_id", 1, 1.0, 1.0,
                    1.0, 1.0, 10, 20),
            'ch_abc_ex':
            CoreHit(c_gene_tadz, "hit_abc_ho", 803, "replicon_id", 1, 1.0, 1.0,
                    1.0, 1.0, 10, 20),
            'ch_toto':
            CoreHit(c_gene_toto, "hit_toto", 803, "replicon_id", 1, 1.0, 1.0,
                    1.0, 1.0, 10, 20),
            'ch_toto_ex':
            CoreHit(c_gene_totote, "hit_toto_ho", 803, "replicon_id", 1, 1.0,
                    1.0, 1.0, 1.0, 10, 20),
        }
        # Model hits wrap the core hits; an exchangeable's hit carries the
        # same status as the hit of the gene it can replace.
        self.m_hits = {
            'mh_sctj':
            ModelHit(self.c_hits['ch_sctj'], gene_sctj, GeneStatus.MANDATORY),
            'mh_sctj_flg':
            ModelHit(self.c_hits['ch_sctj_flg'], gene_sctj_flg,
                     GeneStatus.MANDATORY),
            'mh_sctn':
            ModelHit(self.c_hits['ch_sctn'], gene_sctn, GeneStatus.MANDATORY),
            'mh_sctn_flg':
            ModelHit(self.c_hits['ch_sctn_flg'], gene_sctn_flg,
                     GeneStatus.MANDATORY),
            'mh_gspd':
            ModelHit(self.c_hits['ch_gspd'], gene_gspd, GeneStatus.ACCESSORY),
            'mh_gspd_ex':
            ModelHit(self.c_hits['ch_gspd_ex'], gene_gspd_ex,
                     GeneStatus.ACCESSORY),
            'mh_abc':
            ModelHit(self.c_hits['ch_abc'], gene_abc, GeneStatus.FORBIDDEN),
            'mh_abc_ex':
            ModelHit(self.c_hits['ch_abc_ex'], gene_abc_ex,
                     GeneStatus.FORBIDDEN),
            'mh_toto':
            ModelHit(self.c_hits['ch_toto'], gene_toto, GeneStatus.NEUTRAL),
            'mh_toto_ex':
            ModelHit(self.c_hits['ch_toto_ex'], gene_toto_ex,
                     GeneStatus.NEUTRAL)
        }