コード例 #1
0
    def setUp(self):
        """
        Prepare a fresh scratch directory, a configuration pointing to the test
        sequence file, the built indexes and a profile factory.
        """
        # always start from an empty scratch directory
        scratch = os.path.join(tempfile.gettempdir(), 'test_macsyfinder_search_genes')
        self.tmp_dir = scratch
        if os.path.exists(scratch):
            shutil.rmtree(scratch)
        os.mkdir(scratch)

        macsypy.init_logger()
        macsypy.logger_set_level(30)

        job_dir = os.path.join(scratch, 'job_1')
        args = argparse.Namespace(
            sequence_db=self.find_data("base", "test_base.fa"),
            db_type='gembase',
            models_dir=self.find_data('models'),
            log_level=30,
            out_dir=job_dir,
            # the search results are written in the output directory itself
            res_search_dir=job_dir,
            no_cut_ga=True,
            # the indexes go in the scratch directory, not next to the sequence file
            index_dir=scratch,
        )
        os.mkdir(job_dir)

        self.cfg = Config(MacsyDefaults(), args)

        self.model_name = 'foo'
        model_path = os.path.join(args.models_dir, self.model_name)
        self.model_location = ModelLocation(path=model_path)

        Indexes(self.cfg).build()
        self.profile_factory = ProfileFactory(self.cfg)
コード例 #2
0
    def setUp(self):
        """
        Prepare an empty output directory holding a copy of the test sequence
        file, then the configuration, the hmmer directory, the model location,
        a profile factory and the built indexes.
        """
        models_dir = self.find_data('models')
        search_dir = tempfile.gettempdir()
        out_dir = os.path.join(search_dir, 'test_macsyfinder_Report')
        args = argparse.Namespace(db_type='gembase',
                                  models_dir=models_dir,
                                  res_search_dir=search_dir,
                                  log_level=30,
                                  out_dir=out_dir)
        # always start from an empty output directory
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.mkdir(out_dir)

        # work on a copy of the sequence file located in the output directory
        original_db = self.find_data("base", "test_base.fa")
        shutil.copy(original_db, out_dir)
        args.sequence_db = os.path.join(out_dir, os.path.basename(original_db))
        self.cfg = Config(MacsyDefaults(), args)

        hmmer_dir = os.path.join(self.cfg.out_dir(), self.cfg.hmmer_dir())
        os.mkdir(hmmer_dir)

        self.model_name = 'foo'
        self.model_location = ModelLocation(
            path=os.path.join(models_dir, self.model_name))

        # A new ProfileFactory is created for each test: according to the
        # original author it behaves like a singleton, so without this reset
        # the tests influence each other through the factory and its
        # configuration (for instance search_genes gets profiles without hmmer_exe).
        self.profile_factory = ProfileFactory(self.cfg)

        Indexes(self.cfg).build()
コード例 #3
0
    def test_iter(self):
        """
        Iterating over indexes which are not built raises a MacsypyError;
        once built, the iteration yields the expected tuples
        (presumably sequence id, sequence length, rank) in order.
        """
        indexes = Indexes(self.cfg)
        with self.assertRaises(MacsypyError) as ctx:
            next(iter(indexes))

        self.assertEqual(str(ctx.exception), 'Build index before to use it.')

        indexes.build()
        # (locus number, second field of the entry) in the expected order;
        # the third field of an entry is its 1-based position in this table
        locus_table = (
            ('01359', 200), ('01360', 484), ('01361', 406), ('01390', 326),
            ('01391', 54), ('01392', 206), ('01393', 477), ('01394', 126),
            ('01395', 405), ('01396', 572), ('01397', 721), ('01398', 467),
            ('01399', 720), ('01400', 559), ('01401', 153), ('01402', 4558),
            ('01500', 120), ('01501', 344), ('01502', 478), ('01503', 724),
            ('01504', 309), ('01505', 390), ('01506', 419), ('01540', 353),
            ('01541', 229), ('01542', 267), ('01543', 328), ('01544', 258),
            ('01545', 228), ('01546', 538), ('01547', 77), ('01548', 476),
            ('01549', 324), ('01550', 387), ('01551', 382), ('01552', 149),
            ('01553', 319), ('01554', 237), ('01555', 74), ('01556', 362),
            ('01557', 170), ('01558', 77), ('01559', 296), ('01560', 405),
            ('01561', 182), ('01562', 445), ('01563', 212), ('01564', 387),
            ('01565', 414),
        )
        expected_idx = [('VICH001.B.00001.C001_' + locus, value, position)
                        for position, (locus, value) in enumerate(locus_table, start=1)]
        self.assertListEqual(list(iter(indexes)), expected_idx)
コード例 #4
0
 def test_build_no_idx(self):
     """Build the indexes from scratch and check the paths reported for them."""
     # fall back on formatdb only when makeblastdb is not available
     makeblastdb_missing = not which('makeblastdb')
     if makeblastdb_missing and which('formatdb'):
         self.cfg.options['index_db_exe'] = 'formatdb'

     indexes = Indexes(self.cfg)
     indexes.build()
     found_idx = indexes.find_my_indexes()
     found_hmmer_idx = indexes.find_hmmer_indexes()

     seq_db = self.cfg.sequence_db
     # the macsyfinder index is expected in the directory of the sequence file
     expected_idx = os.path.join(os.path.dirname(seq_db), indexes.name + ".idx")
     self.assertEqual(found_idx, expected_idx)

     # one hmmer index file per suffix, named after the sequence file
     hmmer_suffixes = ('.phr', '.pin', '.psd', '.psi', '.psq')
     expected_hmmer_idx = [seq_db + suffix for suffix in hmmer_suffixes]
     self.assertEqual(found_hmmer_idx, expected_hmmer_idx)
コード例 #5
0
 def test_build_no_idx(self):
     """
     When no index exists, build() creates them: check the location of the
     macsyfinder index and the list of the hmmer index files.
     """
     if not which('makeblastdb'):
         # makeblastdb is missing: use formatdb instead when it is installed
         if which('formatdb'):
             self.cfg.options['index_db_exe'] = 'formatdb'
     index_builder = Indexes(self.cfg)
     index_builder.build()
     macsyfinder_index = index_builder.find_my_indexes()
     hmmer_indexes = index_builder.find_hmmer_indexes()

     db_dir = os.path.dirname(self.cfg.sequence_db)
     self.assertEqual(macsyfinder_index,
                      os.path.join(db_dir, index_builder.name + ".idx"))

     expected_hmmer_indexes = []
     for extension in ('.phr', '.pin', '.psd', '.psi', '.psq'):
         expected_hmmer_indexes.append(self.cfg.sequence_db + extension)
     self.assertEqual(hmmer_indexes, expected_hmmer_indexes)
コード例 #6
0
 def test_build_not_writable(self):
     """
     Check that build() raises an IOError with an explicit message when the
     directory holding the sequence file is not writable.
     """
     # Skip on Windows, where setting the folder permissions does not affect
     # the files inside, and when run as root (e.g. in a Singularity
     # container), where removing the permissions does not prevent writing:
     # in both cases the expected error cannot be triggered.
     if os.name == 'nt' or (hasattr(os, 'getuid') and os.getuid() == 0):
         self.skipTest('directory permissions are not enforced on Windows or when run as root')
     idx = Indexes(self.cfg)
     idx_dir = os.path.dirname(self.cfg.sequence_db())
     # remove all permissions on the directory (explicit octal notation)
     os.chmod(idx_dir, 0o000)
     try:
         with self.assertRaises(IOError) as ctx:
             with self.catch_log():
                 idx.build()
         # raw string: "\(" is a regex escape, not a Python string escape
         self.assertRegex(str(ctx.exception),
                          r"cannot build indexes, \(.+/test_macsyfinder_indexes\) is not writable")
     finally:
         # always give the permissions back, otherwise the directory cannot be cleaned up
         os.chmod(idx_dir, 0o777)
コード例 #7
0
    def _fill_my_db(self, macsyfinder_idx: str, db: Dict) -> None:
        """
        Complete *db* in place: the value of every sequence id which is already
        a key of *db* is replaced by the (length, rank) found in the indexes.
        Sequence ids which are not keys of *db* are ignored.

        :param macsyfinder_idx: the path the macsyfinder index corresponding to the dataset
                                (not read here: the indexes are built from the configuration)
        :type  macsyfinder_idx: string
        :param db: the database containing all sequence id of the hits.
        :type db: dict
        """
        indexes = Indexes(self.cfg)
        indexes.build()
        for seq_id, seq_length, seq_rank in indexes:
            if seq_id not in db:
                continue
            db[seq_id] = (seq_length, seq_rank)
コード例 #8
0
 def test_build_with_idx(self):
     """
     Check that build() keeps the indexes which already exist: the fake
     (empty) index files created beforehand must still be empty afterwards.
     """
     if not which('makeblastdb') and which('formatdb'):
         self.cfg.options['index_db_exe'] = 'formatdb'
     # put fake hmmer indexes
     suffixes = ('.phr', '.pin', '.psd', '.psi', '.psq')
     for s in suffixes:
         # only the existence of the empty file matters: close it right away
         with open(self.cfg.sequence_db + s, 'w'):
             pass
     idx = Indexes(self.cfg)
     # put a fake macsyfinder index next to the sequence file
     with open(os.path.join(os.path.dirname(self.cfg.sequence_db), idx.name + ".idx"), 'w'):
         pass
     idx.build()
     my_idx = idx.find_my_indexes()
     hmmer_idx = idx.find_hmmer_indexes()
     for f in hmmer_idx + [my_idx]:
         self.assertEqual(os.path.getsize(f), 0)
コード例 #9
0
 def test_build_force(self):
     """
     Check that build(force=True) rebuilds the indexes even if some already
     exist: no index file may be empty afterwards.
     """
     if not which('makeblastdb') and which('formatdb'):
         self.cfg.options['index_db_exe'] = 'formatdb'

     # put fake hmmer indexes
     suffixes = ('.phr', '.pin', '.psd', '.psi', '.psq')
     for s in suffixes:
         # create the empty file and release the handle immediately,
         # so that no open descriptor is left behind
         with open(self.cfg.sequence_db + s, 'w'):
             pass
     idx = Indexes(self.cfg)
     idx.build(force=True)
     my_idx = idx.find_my_indexes()
     hmmer_idx = idx.find_hmmer_indexes()
     for f in hmmer_idx + [my_idx]:
         self.assertNotEqual(os.path.getsize(f), 0)
コード例 #10
0
 def test_build_with_idx(self):
     """
     Create empty hmmer and macsyfinder index files, call build() and check
     that all of them are still empty, i.e. that they have not been rebuilt.
     """
     if not which('makeblastdb') and which('formatdb'):
         self.cfg.options['index_db_exe'] = 'formatdb'
     # put fake hmmer indexes
     suffixes = ('.phr', '.pin', '.psd', '.psi', '.psq')
     for s in suffixes:
         new_idx = self.cfg.sequence_db + s
         # the context manager closes the file as soon as it is created
         with open(new_idx, 'w'):
             pass
     idx = Indexes(self.cfg)
     # put a fake macsyfinder index
     my_idx_path = os.path.join(os.path.dirname(self.cfg.sequence_db), idx.name + ".idx")
     with open(my_idx_path, 'w'):
         pass
     idx.build()
     my_idx = idx.find_my_indexes()
     hmmer_idx = idx.find_hmmer_indexes()
     for f in hmmer_idx + [my_idx]:
         self.assertEqual(os.path.getsize(f), 0)
コード例 #11
0
    def parse(self) -> List[LightHit]:
        """
        Read the raw hmmer output file and extract the hits it describes.

        The length and the position of the matched sequences are looked up
        in the sequence indexes, which are built first.

        :return: The extracted hits, sorted by decreasing score
        """
        indexes = Indexes(self.cfg)
        idx_path = indexes.build()
        # sequence id -> (sequence length, position) for the sequences of the hmmer output
        seq_db = self._build_my_db(self._hmmer_raw_out)
        self._fill_my_db(idx_path, seq_db)

        collected = []
        with open(self._hmmer_raw_out, 'r') as raw_out:
            i_evalue_sel = self.cfg.i_evalue_sel()
            coverage_threshold = self.cfg.coverage_profile()
            # keep only the groups of lines, the grouping key itself is not used
            chunks = (lines for _, lines in groupby(raw_out, self._hit_start))
            # drop summary
            next(chunks)
            # the chunks are consumed by pairs: the header of a hit, then its body
            for header in chunks:
                hit_id = self._parse_hmm_header(header)
                seq_lg, position_hit = seq_db[hit_id]

                replicon_name = self._get_replicon_name(hit_id)

                body = next(chunks)
                collected.extend(
                    self._parse_hmm_body(hit_id, self.gene_profile_lg,
                                         seq_lg, coverage_threshold,
                                         replicon_name, position_hit,
                                         i_evalue_sel, body))
        return sorted(collected, key=lambda hit: -hit.score)
コード例 #12
0
    def test_fill_my_db(self):
        """
        Check that _fill_my_db associates the expected pair of values
        with each sequence id found in the gspD hmmer output.
        """
        gene_name = "gspD"
        args = argparse.Namespace(
            db_type='gembase',
            models_dir=self.find_data('models'),
            log_level=30,
            sequence_db=self.find_data("base", "test_base.fa"),
            index_dir=self.tmpdir,
        )
        cfg = Config(MacsyDefaults(), args)
        gspD_hmmer_path = self.find_data('hmm', 'gspD.search_hmm.out')

        idx_path = Indexes(cfg).build()
        profile = macsyprofile.HmmProfile(gene_name, 596, gspD_hmmer_path, cfg)

        hit_db = profile._build_my_db(gspD_hmmer_path)
        profile._fill_my_db(idx_path, hit_db)

        expected_db = {
            'PSAE001c01_031420': (658, 73),
            'PSAE001c01_051090': (714, 75),
            'PSAE001c01_018920': (776, 71),
            'PSAE001c01_043580': (416, 74),
            'PSAE001c01_017350': (600, 70),
            'PSAE001c01_013980': (759, 69),
            'PSAE001c01_026600': (273, 72),
            'NC_xxxxx_xx_056141': (803, 141),
            'PSAE001c01_006940': (803, 68),
        }
        self.assertDictEqual(hit_db, expected_db)
コード例 #13
0
 def test_build_no_idx(self):
     """build() returns the path of the index: <directory of the sequence file>/<index name>.idx"""
     indexes = Indexes(self.cfg)
     built_idx_path = indexes.build()
     seq_db_dir = os.path.dirname(self.cfg.sequence_db())
     expected_path = os.path.join(seq_db_dir, indexes.name + ".idx")
     self.assertEqual(built_idx_path, expected_path)
コード例 #14
0
    def test_fill_gembase_min_max_oredered_replicon(self):
        """
        Check that _fill_gembase_min_max raises a MacsypyError with an explicit
        message when it is run on the 'ordered_replicon_base.fasta' sequence file.
        """
        source_db = self.find_data("base", "ordered_replicon_base.fasta")
        shutil.copy(source_db, self.args.out_dir)
        copied_db = os.path.join(self.args.out_dir, os.path.basename(source_db))
        self.args.sequence_db = copied_db
        cfg = Config(MacsyDefaults(), self.args)

        Indexes(cfg).build()
        # replace the RepliconDB constructor by the fake one of this test class
        RepliconDB.__init__ = self.fake_init
        replicon_db = RepliconDB(cfg)
        with self.assertRaises(MacsypyError) as ctx, self.catch_log():
            replicon_db._fill_gembase_min_max({}, self.cfg.replicon_topology())

        expected_msg = (f"Error during sequence-db '{self.args.sequence_db}' parsing. "
                        f"Are you sure db-type is 'gembase'?")
        self.assertEqual(str(ctx.exception), expected_msg)
コード例 #15
0
    def test_fill_ordered_replicon_min_max(self):
        """
        Check that _fill_ordered_min_max registers a single replicon with the
        configured topology and with 1 and 52 as min and max.
        """
        source_db = self.find_data("base", "ordered_replicon_base.fasta")
        shutil.copy(source_db, self.args.out_dir)
        copied_db = os.path.join(self.args.out_dir, os.path.basename(source_db))
        self.args.sequence_db = copied_db
        cfg = Config(MacsyDefaults(), self.args)

        Indexes(cfg).build()
        # replace the RepliconDB constructor by the fake one of this test class
        RepliconDB.__init__ = self.fake_init
        replicon_db = RepliconDB(cfg)
        replicon_db._fill_ordered_min_max(cfg.replicon_topology())

        self.assertEqual(len(replicon_db._DB), 1)
        replicon = replicon_db[RepliconDB.ordered_replicon_name]
        self.assertEqual(replicon.topology, cfg.replicon_topology())
        self.assertEqual(replicon.min, 1)
        self.assertEqual(replicon.max, 52)
コード例 #16
0
    def test_build_with_idx(self):
        """
        Check how build() deals with an index file which already exists:

          - an index in the new format is kept as it is,
          - an index in an old format (no path as first line, or old field
            separator) is reported in the log and the index is rebuilt,
          - the rebuilt index has the expected content.
        """
        idx = Indexes(self.cfg)
        seq_db = self.cfg.sequence_db()
        sep = idx._field_separator
        idx_path = os.path.join(os.path.dirname(seq_db), idx.name + ".idx")

        # case new style idx
        idx_content_new = f"{seq_db}\nVICH001.B.00001.C001_01359{sep}200{sep}1\n"
        with open(idx_path, 'w') as idx_file:
            idx_file.write(idx_content_new)
        idx.build()
        # the file has not been modified by build
        self.assertEqual(os.path.getsize(idx_path), len(idx_content_new))

        old_format_contents = (
            # case old style no path as first line
            "VICH001.B.00001.C001_01359;200;1\n",
            # case old style bad separator
            f"{seq_db}\nVICH001.B.00001.C001_01359;200;1\n",
        )
        for idx_content_old in old_format_contents:
            with open(idx_path, 'w') as idx_file:
                idx_file.write(idx_content_old)
            with self.catch_log(log_name='macsypy') as log:
                idx.build()
                log_msg = log.get_value().strip()
            self.assertEqual(
                log_msg,
                f"The '{idx_path}' index file is in old format. Force index building."
            )

        # case idx seems valid read it
        with open(idx_path) as idx_file_test:
            data = idx_file_test.read()

        # (locus number, second field of the entry) in the expected order;
        # the third field of an entry is its 1-based position in this table
        locus_table = (
            ('01359', 200), ('01360', 484), ('01361', 406), ('01390', 326),
            ('01391', 54), ('01392', 206), ('01393', 477), ('01394', 126),
            ('01395', 405), ('01396', 572), ('01397', 721), ('01398', 467),
            ('01399', 720), ('01400', 559), ('01401', 153), ('01402', 4558),
            ('01500', 120), ('01501', 344), ('01502', 478), ('01503', 724),
            ('01504', 309), ('01505', 390), ('01506', 419), ('01540', 353),
            ('01541', 229), ('01542', 267), ('01543', 328), ('01544', 258),
            ('01545', 228), ('01546', 538), ('01547', 77), ('01548', 476),
            ('01549', 324), ('01550', 387), ('01551', 382), ('01552', 149),
            ('01553', 319), ('01554', 237), ('01555', 74), ('01556', 362),
            ('01557', 170), ('01558', 77), ('01559', 296), ('01560', 405),
            ('01561', 182), ('01562', 445), ('01563', 212), ('01564', 387),
            ('01565', 414),
        )
        # first line: the path of the sequence file, then one line per entry
        expected_lines = [f"{seq_db}"]
        expected_lines.extend(
            f"VICH001.B.00001.C001_{locus}{sep}{value}{sep}{position}"
            for position, (locus, value) in enumerate(locus_table, start=1))
        new_content = "\n".join(expected_lines) + "\n"

        self.assertEqual(data, new_content)
コード例 #17
0
 def test_build_force(self):
     """After build(force=True) the macsyfinder index file must not be empty."""
     indexes = Indexes(self.cfg)
     indexes.build(force=True)
     idx_size = os.path.getsize(indexes.find_my_indexes())
     self.assertNotEqual(idx_size, 0)
コード例 #18
0
 def test_build_with_idx(self):
     """An index file which already exists (here an empty one) is still empty after build()."""
     indexes = Indexes(self.cfg)
     fake_idx_path = os.path.join(os.path.dirname(self.cfg.sequence_db()), indexes.name + ".idx")
     # create the empty index file
     with open(fake_idx_path, 'w'):
         pass
     indexes.build()
     idx_size = os.path.getsize(indexes.find_my_indexes())
     self.assertEqual(idx_size, 0)
コード例 #19
0
def search_systems(config, model_bank, gene_bank, profile_factory, logger):
    """
    Do the job, this function is the orchestrator of all the macsyfinder mechanics.
    At the end several files are produced containing the results

      - macsyfinder.conf: The set of variables used to run this job
      - macsyfinder.systems: The list of the potential systems
      - macsyfinder.rejected_cluster: The list of all clusters and cluster combinations
                                      which have been rejected and the reason
      - macsyfinder.log: the copy of the standard output

    :param config: The MacSyFinder Configuration
    :type config: :class:`macsypy.config.Config` object
    :param model_bank: The bank populated with the available models
    :type model_bank: :class:`macsypy.model.ModelBank` object
    :param gene_bank: the bank containing all genes
    :type gene_bank: :class:`macsypy.gene.GeneBank` object
    :param profile_factory: The profile factory
    :type profile_factory: :class:`macsypy.gene.ProfileFactory`
    :param logger: The logger used to display information to the user.
                   It must be initialized. see :func:`macsypy.init_logger`
    :type logger: :class:`colorlog.Logger` object
    :return: the systems and rejected clusters found
    :rtype: ([:class:`macsypy.system.System`, ...], [:class:`macsypy.cluster.RejectedCluster`, ...])
    :raise SystemExit: if a requested model is not found in the registry (KeyError)
                       or if the search of the genes fails
    :raise AssertionError: if the db_type of the configuration is not
                           'ordered_replicon', 'gembase' or 'unordered'
    """
    working_dir = config.working_dir()
    config.save(path_or_buf=os.path.join(working_dir, config.cfg_name))
    registry = ModelRegistry()
    models_loc_available = scan_models_dir(
        config.models_dir(),
        profile_suffix=config.profile_suffix(),
        relative_path=config.relative_path())
    for model_loc in models_loc_available:
        registry.add(model_loc)

    # build indexes
    # Every other option of config is read by calling its accessor. If 'idx'
    # is such an accessor, passing it uncalled would always be truthy and the
    # indexes would be rebuilt at each run, so it is called when it is callable.
    # NOTE(review): confirm whether Config.idx is an accessor or a plain attribute.
    force_idx = config.idx
    if callable(force_idx):
        force_idx = force_idx()
    idx = Indexes(config)
    idx.build(force=force_idx)

    # create models
    parser = DefinitionParser(config, model_bank, gene_bank, registry,
                              profile_factory)
    try:
        models_def_to_detect = get_def_to_detect(config.models(), registry)
    except KeyError as err:
        sys.exit(f"macsyfinder: {err}")

    parser.parse(models_def_to_detect)

    logger.info(
        f"MacSyFinder's results will be stored in working_dir {working_dir}")
    logger.info(f"Analysis launched on {config.sequence_db()} for model(s):")

    for m in models_def_to_detect:
        logger.info(f"\t- {m.fqn}")

    models_to_detect = [
        model_bank[model_loc.fqn] for model_loc in models_def_to_detect
    ]
    all_genes = []
    for model in models_to_detect:
        genes = model.mandatory_genes + model.accessory_genes + model.neutral_genes + model.forbidden_genes
        # Exchangeable (formerly homologs/analogs) are also added because they can "replace" an important gene...
        ex_genes = []
        for g in genes:
            ex_genes += g.exchangeables
        all_genes += (genes + ex_genes)
    #############################################
    # this part of code is executed in parallel
    #############################################
    try:
        all_reports = search_genes(all_genes, config)
    except Exception as err:
        # top level boundary: whatever the failure is, stop with its message
        sys.exit(str(err))
    #############################################
    # end of parallel code
    #############################################
    all_hits = [hit for report in all_reports for hit in report.hits]

    if not all_hits:
        # No hits detected
        return [], []

    # It's important to keep this sorting to have in last all_hits version
    # the hits with the same replicon_name and position sorted by score
    # the best score in first
    hits_by_replicon = {}
    for hit in all_hits:
        hits_by_replicon.setdefault(hit.replicon_name, []).append(hit)

    for rep_name in hits_by_replicon:
        hits_by_replicon[rep_name] = get_best_hits(
            hits_by_replicon[rep_name], key='score')
        hits_by_replicon[rep_name].sort(key=attrgetter('position'))

    models_to_detect = sorted(models_to_detect, key=attrgetter('name'))
    db_type = config.db_type()
    if db_type in ('ordered_replicon', 'gembase'):
        systems, rejected_clusters = _search_in_ordered_replicon(
            hits_by_replicon, models_to_detect, config, logger)
        return systems, rejected_clusters
    if db_type == "unordered":
        likely_systems, rejected_hits = _search_in_unordered_replicon(
            hits_by_replicon, models_to_detect, logger)
        return likely_systems, rejected_hits
    # explicit raise instead of 'assert False': assert statements are removed
    # when python runs with -O, and the function would then silently return None
    raise AssertionError(f"dbtype have an invalid value {db_type}")
コード例 #20
0
class Test(MacsyTest):
    def __init__(self, methodName='runTest'):
        """
        Prepare the two RepliconDB constructors the tests switch between.

        ``real_init`` keeps a reference to the genuine ``RepliconDB.__init__``
        so it can be put back after a test; ``fake_init`` is a reduced
        constructor that only sets the attributes below and leaves ``_DB``
        empty, so the ``_fill_*`` methods can be exercised one at a time.

        :param str methodName: name of the test method to run.
        """
        super().__init__(methodName)

        # remember the genuine constructor before any test monkeypatches it
        self.real_init = RepliconDB.__init__

        def _bare_replicon_db_init(replicon_db, config):
            # same attributes as set by the original stub, but _DB stays empty
            replicon_db.cfg = config
            replicon_db._idx = Indexes(config)
            replicon_db.sequence_idx = replicon_db._idx.find_my_indexes()
            replicon_db.topology_file = config.topology_file()
            replicon_db._DB = {}

        self.fake_init = _bare_replicon_db_init

    def setUp(self):
        """
        Build a fresh working directory, configuration and index per test.

        The 'test_base.fa' data file is copied into a newly created
        'test_macsyfinder_repliconDB' directory under the system temp dir,
        a gembase Config pointing at that copy is created and the Indexes
        are built. The expected (gene, length) lists used by the assertions
        are stored on the instance; note that every list ends with its last
        entry repeated, exactly as in the reference data.
        """
        args = argparse.Namespace()
        self.args = args
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        args.res_search_dir = tempfile.gettempdir()
        args.log_level = 30
        args.out_dir = os.path.join(args.res_search_dir,
                                    'test_macsyfinder_repliconDB')

        # always start from an empty output directory
        out_dir = args.out_dir
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.mkdir(out_dir)

        # work on a copy of the sequence file located in the output directory
        fasta_src = self.find_data("base", "test_base.fa")
        shutil.copy(fasta_src, out_dir)
        args.sequence_db = os.path.join(out_dir, os.path.basename(fasta_src))
        self.cfg = Config(MacsyDefaults(), args)

        self.ESCO030p01_genes = [
            ('000010', 886), ('000020', 291),
            ('000030', 656), ('000040', 500),
            ('000050', 407), ('000060', 144),
            ('000070', 183), ('000080', 121),
            ('000090', 199), ('000100', 325),
            ('000110', 425), ('000120', 171),
            ('000130', 277), ('000140', 133),
            ('000150', 108), ('000160', 295),
            ('000170', 273), ('000180', 367),
            ('000190', 573), ('000200', 343),
            ('000210', 295), ('000220', 108),
            ('000230', 117), ('000240', 153),
            ('000250', 479), ('000260', 706),
            ('000270', 998), ('000280', 171),
            ('000290', 108), ('000300', 295),
            ('000310', 165), ('000320', 243),
            ('000330', 295), ('000340', 108),
            ('000350', 1755), ('000360', 248),
            ('000370', 286), ('000380', 186),
            ('000390', 83), ('000400', 153),
            ('000410', 69), ('000420', 295),
            ('000430', 108), ('000440', 145),
            ('000450', 59), ('000460', 124),
            ('000470', 246), ('000480', 325),
            ('000490', 54), ('000500', 95),
            ('000510', 83),
            ('000520', 56), ('000530', 401),
            ('000540', 320), ('000550', 256),
            ('000560', 73), ('000570', 144),
            ('000580', 258), ('000590', 133),
            ('000600', 140), ('000610', 63),
            ('000620', 138), ('000630', 68),
            ('000640', 169), ('000650', 127),
            ('000660', 295), ('000670', 108),
            ('000670', 108),
        ]

        self.PSAE001c01_genes = [
            ('006940', 803), ('013980', 759),
            ('017350', 600), ('018920', 776),
            ('026600', 273), ('031420', 658),
            ('043580', 416), ('051090', 714),
            ('055870', 449), ('055880', 447),
            ('055890', 588), ('055900', 292),
            ('055910', 262), ('055920', 166),
            ('055930', 288), ('055940', 194),
            ('055950', 567), ('055960', 188),
            ('055970', 247), ('055980', 252),
            ('055990', 455), ('056000', 450),
            ('056010', 260), ('056020', 246),
            ('056030', 70), ('056040', 133),
            ('056050', 284), ('056060', 585),
            ('056070', 435), ('056080', 342),
            ('056090', 252), ('056100', 122),
            ('056110', 213), ('056120', 400),
            ('056130', 134), ('056140', 138),
            ('056150', 397), ('056160', 298),
            ('056170', 186), ('056180', 445),
            ('056190', 414), ('056200', 132),
            ('056210', 674), ('056220', 319),
            ('056230', 394), ('056240', 207),
            ('056250', 401), ('056260', 611),
            ('056270', 257), ('056280', 169),
            ('056290', 454), ('056300', 141),
            ('056310', 458), ('056320', 286),
            ('056330', 514), ('056340', 178),
            ('056350', 156), ('056360', 85),
            ('056370', 289), ('056380', 126),
            ('056390', 290), ('056400', 262),
            ('056410', 214), ('056420', 630),
            ('056430', 127), ('056440', 455),
            ('056440', 455),
        ]

        self.NCDB_genes = [
            ('056134', 289), ('056135', 126), ('056136', 290),
            ('056137', 262), ('056138', 214), ('056139', 630),
            ('056140', 127), ('056141', 803), ('056141', 803),
        ]

        self.idx = Indexes(self.cfg)
        self.idx.build()

    def tearDown(self):
        """
        Remove the working directory and restore the real RepliconDB constructor.

        The directory removal is best effort: a failure there must not mask
        the outcome of the test itself. Only ``Exception`` subclasses are
        ignored, so ``KeyboardInterrupt`` / ``SystemExit`` still propagate,
        and the ``finally`` clause guarantees ``RepliconDB.__init__`` is put
        back even in that case, so a monkeypatched constructor never leaks
        into other tests.
        """
        try:
            shutil.rmtree(self.cfg.working_dir())
        except Exception:
            # best-effort cleanup: a missing or locked directory is not a test failure
            pass
        finally:
            RepliconDB.__init__ = self.real_init

    def test_fill_topology(self):
        """
        A topology file written as '<replicon> : <topology>' lines must be
        read back by ``_fill_topology`` as the same replicon -> topology dict.
        """
        expected_topology = {'ESCO030p01': 'circular', 'PSAE001c01': 'linear'}
        topo_path = self.args.sequence_db + ".topo"
        self.args.topology_file = topo_path

        topo_content = ''.join('{0} : {1}\n'.format(replicon, topology)
                               for replicon, topology in expected_topology.items())
        with open(topo_path, 'w') as topo_file:
            topo_file.write(topo_content)

        cfg = Config(MacsyDefaults(), self.args)
        # use the reduced constructor so only _fill_topology is exercised
        RepliconDB.__init__ = self.fake_init
        parsed_topology = RepliconDB(cfg)._fill_topology()
        self.assertDictEqual(expected_topology, parsed_topology)

    def test_fill_ordered_replicon_min_max(self):
        """
        With 'ordered_replicon_base.fasta' as sequence db,
        ``_fill_ordered_min_max`` must register a single replicon, stored
        under ``RepliconDB.ordered_replicon_name``, spanning positions 1..52
        with the topology taken from the configuration.
        """
        fasta_src = self.find_data("base", "ordered_replicon_base.fasta")
        out_dir = self.args.out_dir
        shutil.copy(fasta_src, out_dir)
        self.args.sequence_db = os.path.join(out_dir,
                                             os.path.basename(fasta_src))
        cfg = Config(MacsyDefaults(), self.args)

        # the index of the new sequence file must exist before RepliconDB is created
        Indexes(cfg).build()

        RepliconDB.__init__ = self.fake_init
        db = RepliconDB(cfg)
        db._fill_ordered_min_max(cfg.replicon_topology())

        self.assertEqual(len(db._DB), 1)
        replicon = db[RepliconDB.ordered_replicon_name]
        self.assertEqual(replicon.topology, cfg.replicon_topology())
        self.assertEqual(replicon.min, 1)
        self.assertEqual(replicon.max, 52)

    def test_fill_gembase_min_max_default_topology(self):
        """
        Without any per-replicon topology, ``_fill_gembase_min_max`` must
        register the three replicons of the test base, each 'circular', with
        the expected position range and gene list.
        """
        RepliconDB.__init__ = self.fake_init
        db = RepliconDB(self.cfg)
        db._fill_gembase_min_max({}, self.cfg.replicon_topology())

        # replicon name -> (min, max, genes); checked in this order
        expected = {
            'ESCO030p01': (1, 67, self.ESCO030p01_genes),
            'PSAE001c01': (68, 133, self.PSAE001c01_genes),
            'NC_xxxxx_xx': (134, 141, self.NCDB_genes),
        }

        self.assertEqual(len(db._DB), 3)
        self.assertEqual(set(db._DB.keys()),
                         {'ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx'})
        for replicon_name, (first, last, genes) in expected.items():
            replicon = db[replicon_name]
            self.assertEqual(replicon.topology, 'circular')
            self.assertEqual(replicon.min, first)
            self.assertEqual(replicon.max, last)
            self.assertEqual(replicon.genes, genes)

    def test_fill_gembase_min_max_oredered_replicon(self):
        """
        Feeding 'ordered_replicon_base.fasta' to ``_fill_gembase_min_max``
        must raise a MacsypyError whose message names the sequence db and
        asks whether db-type is really 'gembase'.
        """
        fasta_src = self.find_data("base", "ordered_replicon_base.fasta")
        out_dir = self.args.out_dir
        shutil.copy(fasta_src, out_dir)
        self.args.sequence_db = os.path.join(out_dir,
                                             os.path.basename(fasta_src))
        cfg = Config(MacsyDefaults(), self.args)

        Indexes(cfg).build()
        RepliconDB.__init__ = self.fake_init
        db = RepliconDB(cfg)

        # catch_log is entered inside assertRaises, as in the nested form
        with self.assertRaises(MacsypyError) as ctx, self.catch_log():
            db._fill_gembase_min_max({}, self.cfg.replicon_topology())

        expected_msg = (
            f"Error during sequence-db '{self.args.sequence_db}' parsing. "
            f"Are you sure db-type is 'gembase'?")
        self.assertEqual(str(ctx.exception), expected_msg)

    def test_fill_gembase_min_max_with_topology(self):
        """
        With a topology file declaring ESCO030p01 circular and PSAE001c01
        linear, ``_fill_gembase_min_max`` must apply these topologies and use
        the default passed as second argument ('circular') for the replicon
        missing from the file.
        """
        topo_path = self.args.sequence_db + ".topo"
        self.args.topology_file = topo_path
        with open(topo_path, 'w') as topo_file:
            topo_file.write(
                '# topology file\nESCO030p01 : circular\nPSAE001c01 : linear\n'
            )

        cfg = Config(MacsyDefaults(), self.args)
        RepliconDB.__init__ = self.fake_init
        db = RepliconDB(cfg)
        db._fill_gembase_min_max(db._fill_topology(), 'circular')

        # replicon name -> (topology, min, max, genes); checked in this order
        expected = {
            'ESCO030p01': ('circular', 1, 67, self.ESCO030p01_genes),
            'PSAE001c01': ('linear', 68, 133, self.PSAE001c01_genes),
            'NC_xxxxx_xx': ('circular', 134, 141, self.NCDB_genes),
        }

        self.assertEqual(len(db._DB), 3)
        self.assertEqual(set(db._DB.keys()),
                         {'ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx'})
        for replicon_name, (topology, first, last, genes) in expected.items():
            replicon = db[replicon_name]
            self.assertEqual(replicon.topology, topology)
            self.assertEqual(replicon.min, first)
            self.assertEqual(replicon.max, last)
            self.assertEqual(replicon.genes, genes)

    def test_in(self):
        """Membership test: the three replicons of the test base are in the db, 'toto' is not."""
        db = RepliconDB(self.cfg)
        for replicon_name in ('ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx'):
            self.assertIn(replicon_name, db)
        self.assertNotIn('toto', db)

    def test_getitem(self):
        """
        ``db[name]`` must return the expected RepliconInfo for each known
        replicon and raise KeyError for an unknown name.
        """
        db = RepliconDB(self.cfg)
        expected = {
            'ESCO030p01': RepliconInfo(self.cfg.replicon_topology(), 1, 67,
                                       self.ESCO030p01_genes),
            'PSAE001c01': RepliconInfo(self.cfg.replicon_topology(), 68, 133,
                                       self.PSAE001c01_genes),
            'NC_xxxxx_xx': RepliconInfo("circular", 134, 141, self.NCDB_genes),
        }
        for replicon_name, replicon_info in expected.items():
            self.assertEqual(replicon_info, db[replicon_name])
        with self.assertRaises(KeyError):
            db['foo']

    def test_get(self):
        """
        ``db.get`` must return the RepliconInfo of a known replicon (ignoring
        the default), None for an unknown name without default, and the
        default when one is supplied.
        """
        db = RepliconDB(self.cfg)
        esco_info = RepliconInfo(self.cfg.replicon_topology(), 1, 67,
                                 self.ESCO030p01_genes)
        psae_info = RepliconInfo(self.cfg.replicon_topology(), 68, 133,
                                 self.PSAE001c01_genes)
        nc_info = RepliconInfo("circular", 134, 141, self.NCDB_genes)

        # known replicons
        self.assertEqual(esco_info, db.get('ESCO030p01'))
        self.assertEqual(psae_info, db.get('PSAE001c01'))
        self.assertEqual(nc_info, db.get('NC_xxxxx_xx', 'foo'))

        # unknown replicon, without then with a default value
        missing = db.get('foo')
        self.assertIsNone(missing)
        self.assertEqual('bar', db.get('foo', 'bar'))

    def test_items(self):
        """``db.items()`` must yield the three (name, RepliconInfo) pairs, in any order."""
        db = RepliconDB(self.cfg)
        esco_info = RepliconInfo(self.cfg.replicon_topology(), 1, 67,
                                 self.ESCO030p01_genes)
        psae_info = RepliconInfo(self.cfg.replicon_topology(), 68, 133,
                                 self.PSAE001c01_genes)
        nc_info = RepliconInfo("circular", 134, 141, self.NCDB_genes)

        expected_items = [
            ('ESCO030p01', esco_info),
            ('NC_xxxxx_xx', nc_info),
            ('PSAE001c01', psae_info),
        ]
        self.assertCountEqual(list(db.items()), expected_items)

    def test_iteritems(self):
        """
        ``db.iteritems()`` must produce the (name, RepliconInfo) pairs in the
        order ESCO030p01, PSAE001c01, NC_xxxxx_xx; each pair is checked in
        its own subTest.
        """
        db = RepliconDB(self.cfg)
        esco_info = RepliconInfo(self.cfg.replicon_topology(), 1, 67,
                                 self.ESCO030p01_genes)
        psae_info = RepliconInfo(self.cfg.replicon_topology(), 68, 133,
                                 self.PSAE001c01_genes)
        nc_info = RepliconInfo("circular", 134, 141, self.NCDB_genes)

        expected_sequence = [
            ('ESCO030p01', esco_info),
            ('PSAE001c01', psae_info),
            ('NC_xxxxx_xx', nc_info),
        ]
        produced = db.iteritems()
        for item in expected_sequence:
            with self.subTest(item=item):
                self.assertEqual(next(produced), item)

    def test_names(self):
        """``replicon_names()`` must return the three replicon names as a list, in this order."""
        expected_names = ['ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx']
        replicon_db = RepliconDB(self.cfg)
        self.assertListEqual(replicon_db.replicon_names(), expected_names)

    def test_replicon_infos(self):
        """``replicon_infos()`` must return the three expected RepliconInfo, in any order."""
        db = RepliconDB(self.cfg)
        esco_info = RepliconInfo(self.cfg.replicon_topology(), 1, 67,
                                 self.ESCO030p01_genes)
        psae_info = RepliconInfo(self.cfg.replicon_topology(), 68, 133,
                                 self.PSAE001c01_genes)
        nc_info = RepliconInfo("circular", 134, 141, self.NCDB_genes)

        expected_infos = [esco_info, nc_info, psae_info]
        self.assertCountEqual(db.replicon_infos(), expected_infos)