def setUp(self):
    """Build a fresh config, sequence index and profile factory for each test."""
    # Scratch directory for this test module; wipe leftovers from a
    # previous (possibly crashed) run before recreating it.
    self.tmp_dir = os.path.join(tempfile.gettempdir(), 'test_macsyfinder_search_genes')
    if os.path.exists(self.tmp_dir):
        shutil.rmtree(self.tmp_dir)
    os.mkdir(self.tmp_dir)
    macsypy.init_logger()
    macsypy.logger_set_level(30)  # 30 == logging.WARNING
    # Minimal command-line surrogate consumed by Config
    args = argparse.Namespace()
    args.sequence_db = self.find_data("base", "test_base.fa")
    args.db_type = 'gembase'
    args.models_dir = self.find_data('models')
    args.log_level = 30
    args.out_dir = os.path.join(self.tmp_dir, 'job_1')
    args.res_search_dir = args.out_dir
    args.no_cut_ga = True
    args.index_dir = os.path.join(self.tmp_dir)
    # out_dir must exist before Config is instantiated
    os.mkdir(args.out_dir)
    self.cfg = Config(MacsyDefaults(), args)
    self.model_name = 'foo'
    self.model_location = ModelLocation(path=os.path.join(args.models_dir, self.model_name))
    # Build the sequence index once so the tests can rely on it
    idx = Indexes(self.cfg)
    idx.build()
    self.profile_factory = ProfileFactory(self.cfg)
def setUp(self):
    """Prepare an isolated out_dir, config and sequence index for Report tests."""
    args = argparse.Namespace()
    args.db_type = 'gembase'
    args.models_dir = self.find_data('models')
    args.res_search_dir = tempfile.gettempdir()
    args.log_level = 30
    args.out_dir = os.path.join(args.res_search_dir, 'test_macsyfinder_Report')
    # start from a clean output directory
    if os.path.exists(args.out_dir):
        shutil.rmtree(args.out_dir)
    os.mkdir(args.out_dir)
    # copy the sequence db into out_dir so the indexes are created there
    seq_db = self.find_data("base", "test_base.fa")
    shutil.copy(seq_db, args.out_dir)
    args.sequence_db = os.path.join(args.out_dir, os.path.basename(seq_db))
    self.cfg = Config(MacsyDefaults(), args)
    os.mkdir(os.path.join(self.cfg.out_dir(), self.cfg.hmmer_dir()))
    self.model_name = 'foo'
    self.model_location = ModelLocation(path=os.path.join(args.models_dir, self.model_name))
    # we need to reset the ProfileFactory
    # because it's like a singleton,
    # so other tests are influenced by ProfileFactory and its configuration
    # for instance search_genes gets a profile without hmmer_exe
    self.profile_factory = ProfileFactory(self.cfg)
    idx = Indexes(self.cfg)
    idx.build()
def test_iter(self):
    """Iterating an unbuilt Indexes raises MacsypyError; after build() it
    yields (sequence_id, sequence_length, rank) triplets in dataset order."""
    idx = Indexes(self.cfg)
    # iterating before build() must fail loudly
    with self.assertRaises(MacsypyError) as ctx:
        next(iter(idx))
    self.assertEqual(str(ctx.exception), 'Build index before to use it.')
    idx.build()
    # full expected content of the test_base.fa index
    expected_idx = [('VICH001.B.00001.C001_01359', 200, 1),
                    ('VICH001.B.00001.C001_01360', 484, 2),
                    ('VICH001.B.00001.C001_01361', 406, 3),
                    ('VICH001.B.00001.C001_01390', 326, 4),
                    ('VICH001.B.00001.C001_01391', 54, 5),
                    ('VICH001.B.00001.C001_01392', 206, 6),
                    ('VICH001.B.00001.C001_01393', 477, 7),
                    ('VICH001.B.00001.C001_01394', 126, 8),
                    ('VICH001.B.00001.C001_01395', 405, 9),
                    ('VICH001.B.00001.C001_01396', 572, 10),
                    ('VICH001.B.00001.C001_01397', 721, 11),
                    ('VICH001.B.00001.C001_01398', 467, 12),
                    ('VICH001.B.00001.C001_01399', 720, 13),
                    ('VICH001.B.00001.C001_01400', 559, 14),
                    ('VICH001.B.00001.C001_01401', 153, 15),
                    ('VICH001.B.00001.C001_01402', 4558, 16),
                    ('VICH001.B.00001.C001_01500', 120, 17),
                    ('VICH001.B.00001.C001_01501', 344, 18),
                    ('VICH001.B.00001.C001_01502', 478, 19),
                    ('VICH001.B.00001.C001_01503', 724, 20),
                    ('VICH001.B.00001.C001_01504', 309, 21),
                    ('VICH001.B.00001.C001_01505', 390, 22),
                    ('VICH001.B.00001.C001_01506', 419, 23),
                    ('VICH001.B.00001.C001_01540', 353, 24),
                    ('VICH001.B.00001.C001_01541', 229, 25),
                    ('VICH001.B.00001.C001_01542', 267, 26),
                    ('VICH001.B.00001.C001_01543', 328, 27),
                    ('VICH001.B.00001.C001_01544', 258, 28),
                    ('VICH001.B.00001.C001_01545', 228, 29),
                    ('VICH001.B.00001.C001_01546', 538, 30),
                    ('VICH001.B.00001.C001_01547', 77, 31),
                    ('VICH001.B.00001.C001_01548', 476, 32),
                    ('VICH001.B.00001.C001_01549', 324, 33),
                    ('VICH001.B.00001.C001_01550', 387, 34),
                    ('VICH001.B.00001.C001_01551', 382, 35),
                    ('VICH001.B.00001.C001_01552', 149, 36),
                    ('VICH001.B.00001.C001_01553', 319, 37),
                    ('VICH001.B.00001.C001_01554', 237, 38),
                    ('VICH001.B.00001.C001_01555', 74, 39),
                    ('VICH001.B.00001.C001_01556', 362, 40),
                    ('VICH001.B.00001.C001_01557', 170, 41),
                    ('VICH001.B.00001.C001_01558', 77, 42),
                    ('VICH001.B.00001.C001_01559', 296, 43),
                    ('VICH001.B.00001.C001_01560', 405, 44),
                    ('VICH001.B.00001.C001_01561', 182, 45),
                    ('VICH001.B.00001.C001_01562', 445, 46),
                    ('VICH001.B.00001.C001_01563', 212, 47),
                    ('VICH001.B.00001.C001_01564', 387, 48),
                    ('VICH001.B.00001.C001_01565', 414, 49)]
    self.assertListEqual(list(iter(idx)), expected_idx)
def test_build_no_idx(self):
    """Building with no pre-existing index must create both the macsyfinder
    index and the hmmer/blast index files next to the sequence db."""
    # fall back to formatdb when makeblastdb is not on the PATH
    if not which('makeblastdb') and which('formatdb'):
        self.cfg.options['index_db_exe'] = 'formatdb'
    idx = Indexes(self.cfg)
    idx.build()
    my_idx = idx.find_my_indexes()
    hmmer_idx = idx.find_hmmer_indexes()
    # NOTE(review): self.cfg.sequence_db is used here without parentheses,
    # unlike the sequence_db() calls elsewhere in this file — confirm the
    # Config flavour in scope exposes it as an attribute.
    self.assertEqual(my_idx, os.path.join(os.path.dirname(self.cfg.sequence_db), idx.name + ".idx"))
    self.assertEqual(hmmer_idx,
                     [self.cfg.sequence_db + suffix
                      for suffix in ('.phr', '.pin', '.psd', '.psi', '.psq')])
def test_build_no_idx(self):
    """Build indexes from scratch and check both index flavours are located."""
    # Use formatdb as the indexer when makeblastdb is missing but formatdb exists.
    if not which('makeblastdb') and which('formatdb'):
        self.cfg.options['index_db_exe'] = 'formatdb'
    indexes = Indexes(self.cfg)
    indexes.build()
    seq_db = self.cfg.sequence_db
    expected_my_idx = os.path.join(os.path.dirname(seq_db), indexes.name + ".idx")
    self.assertEqual(indexes.find_my_indexes(), expected_my_idx)
    blast_suffixes = ('.phr', '.pin', '.psd', '.psi', '.psq')
    expected_hmmer_idx = [seq_db + suffix for suffix in blast_suffixes]
    self.assertEqual(indexes.find_hmmer_indexes(), expected_hmmer_idx)
def test_build_not_writable(self):
    """build() must raise IOError when the index directory is not writable."""
    # Skip test on Windows, since setting the folder permissions is not affecting files inside
    # in Singularity container tests are run as root and this test has no sense
    idx = Indexes(self.cfg)
    idx_dir = os.path.join(os.path.dirname(self.cfg.sequence_db()))
    # remove all permissions on the directory holding the indexes
    os.chmod(idx_dir, 0000)
    try:
        with self.assertRaises(IOError) as ctx:
            with self.catch_log():
                idx.build()
        self.assertRegex(str(ctx.exception),
                         "cannot build indexes, \(.+/test_macsyfinder_indexes\) is not writable")
    finally:
        # always restore permissions so tearDown can clean the directory up
        os.chmod(idx_dir, 0o777)
def _fill_my_db(self, macsyfinder_idx: str, db: Dict) -> None:
    """
    Fill *db* with (length, rank) information for every hit sequence id.

    :param macsyfinder_idx: the path to the macsyfinder index corresponding
                            to the dataset.
                            NOTE(review): not read here — the index is rebuilt
                            from self.cfg instead; confirm this is intended.
    :param db: maps the sequence ids of the hits to their (length, rank).
    """
    indexes = Indexes(self.cfg)
    indexes.build()
    for seq_id, seq_length, seq_rank in indexes:
        if seq_id not in db:
            continue
        db[seq_id] = (seq_length, seq_rank)
def test_build_with_idx(self):
    """build() must leave pre-existing index files untouched.

    Fix: the fake index files are now closed right after creation —
    the original `open(new_idx, 'w')` calls leaked open file handles
    (ResourceWarning under unittest/pytest).
    """
    # fall back to formatdb when makeblastdb is not on the PATH
    if not which('makeblastdb') and which('formatdb'):
        self.cfg.options['index_db_exe'] = 'formatdb'
    # put fake (empty) hmmer indexes in place
    suffixes = ('.phr', '.pin', '.psd', '.psi', '.psq')
    for s in suffixes:
        open(self.cfg.sequence_db + s, 'w').close()
    idx = Indexes(self.cfg)
    # and a fake (empty) macsyfinder index
    my_idx_path = os.path.join(os.path.dirname(self.cfg.sequence_db), idx.name + ".idx")
    open(my_idx_path, 'w').close()
    idx.build()
    my_idx = idx.find_my_indexes()
    hmmer_idx = idx.find_hmmer_indexes()
    # pre-existing indexes were not regenerated: all must still be empty
    for f in hmmer_idx + [my_idx]:
        self.assertEqual(os.path.getsize(f), 0)
def test_build_force(self):
    """build(force=True) must regenerate pre-existing (fake, empty) index files.

    Fix: the fake index files are now closed right after creation —
    the original `open(new_idx, 'w')` calls leaked open file handles
    (ResourceWarning under unittest/pytest).
    """
    # fall back to formatdb when makeblastdb is not on the PATH
    if not which('makeblastdb') and which('formatdb'):
        self.cfg.options['index_db_exe'] = 'formatdb'
    # put fake (empty) hmmer indexes in place
    suffixes = ('.phr', '.pin', '.psd', '.psi', '.psq')
    for s in suffixes:
        open(self.cfg.sequence_db + s, 'w').close()
    idx = Indexes(self.cfg)
    idx.build(force=True)
    my_idx = idx.find_my_indexes()
    hmmer_idx = idx.find_hmmer_indexes()
    # force=True rebuilt everything, so no index file may stay empty
    for f in hmmer_idx + [my_idx]:
        self.assertNotEqual(os.path.getsize(f), 0)
def test_build_with_idx(self):
    """build() must leave pre-existing index files untouched.

    Fix: the fake index files are now closed right after creation —
    the original `open(new_idx, 'w')` calls leaked open file handles
    (ResourceWarning under unittest/pytest).
    """
    # fall back to formatdb when makeblastdb is not on the PATH
    if not which('makeblastdb') and which('formatdb'):
        self.cfg.options['index_db_exe'] = 'formatdb'
    # put fake (empty) hmmer indexes in place
    suffixes = ('.phr', '.pin', '.psd', '.psi', '.psq')
    for s in suffixes:
        open(self.cfg.sequence_db + s, 'w').close()
    idx = Indexes(self.cfg)
    # and a fake (empty) macsyfinder index
    my_idx_path = os.path.join(os.path.dirname(self.cfg.sequence_db), idx.name + ".idx")
    open(my_idx_path, 'w').close()
    idx.build()
    my_idx = idx.find_my_indexes()
    hmmer_idx = idx.find_hmmer_indexes()
    # pre-existing indexes were not regenerated: all must still be empty
    for f in hmmer_idx + [my_idx]:
        self.assertEqual(os.path.getsize(f), 0)
def parse(self) -> List[LightHit]:
    """
    Parse a hmm output file, extract all hits and do some basic computation
    (coverage profile).

    :return: the extracted hits, sorted by decreasing score.
    """
    all_hits = []
    # (re)build the sequence index so lengths/ranks can be resolved
    idx = Indexes(self.cfg)
    macsyfinder_idx = idx.build()
    my_db = self._build_my_db(self._hmmer_raw_out)
    self._fill_my_db(macsyfinder_idx, my_db)
    with open(self._hmmer_raw_out, 'r') as hmm_out:
        i_evalue_sel = self.cfg.i_evalue_sel()
        coverage_threshold = self.cfg.coverage_profile()
        # groupby alternates header-groups and body-groups, so the loop
        # below consumes them pairwise (hit header, then its body)
        hmm_hits = (x[1] for x in groupby(hmm_out, self._hit_start))
        # drop summary
        next(hmm_hits)
        for hmm_hit in hmm_hits:
            hit_id = self._parse_hmm_header(hmm_hit)
            seq_lg, position_hit = my_db[hit_id]
            replicon_name = self._get_replicon_name(hit_id)
            # the group following a header is the corresponding hit body
            body = next(hmm_hits)
            l_hit = self._parse_hmm_body(hit_id, self.gene_profile_lg, seq_lg, coverage_threshold,
                                         replicon_name, position_hit, i_evalue_sel, body)
            all_hits += l_hit
        # best score first
        hits = sorted(all_hits, key=lambda h: -h.score)
    return hits
def test_fill_my_db(self):
    """_fill_my_db must set (length, rank) for every hit id found in the index."""
    gene_name = "gspD"
    args = argparse.Namespace()
    args.db_type = 'gembase'
    args.models_dir = self.find_data('models')
    args.log_level = 30
    args.sequence_db = self.find_data("base", "test_base.fa")
    args.index_dir = self.tmpdir
    cfg = Config(MacsyDefaults(), args)
    gspD_hmmer_path = self.find_data('hmm', 'gspD.search_hmm.out')
    idx = Indexes(cfg)
    macsyfinder_idx = idx.build()
    # 596 is the gspD profile length used by HmmProfile
    hmm_prof = macsyprofile.HmmProfile(gene_name, 596, gspD_hmmer_path, cfg)
    db = hmm_prof._build_my_db(gspD_hmmer_path)
    hmm_prof._fill_my_db(macsyfinder_idx, db)
    self.assertDictEqual(db,
                         {'PSAE001c01_031420': (658, 73),
                          'PSAE001c01_051090': (714, 75),
                          'PSAE001c01_018920': (776, 71),
                          'PSAE001c01_043580': (416, 74),
                          'PSAE001c01_017350': (600, 70),
                          'PSAE001c01_013980': (759, 69),
                          'PSAE001c01_026600': (273, 72),
                          'NC_xxxxx_xx_056141': (803, 141),
                          'PSAE001c01_006940': (803, 68)})
def test_build_no_idx(self):
    """Building from scratch returns the index path next to the sequence db."""
    indexes = Indexes(self.cfg)
    built_path = indexes.build()
    seq_db_dir = os.path.dirname(self.cfg.sequence_db())
    expected_path = os.path.join(seq_db_dir, indexes.name + ".idx")
    self.assertEqual(built_path, expected_path)
def test_fill_gembase_min_max_oredered_replicon(self):
    """_fill_gembase_min_max on an ordered-replicon fasta must raise MacsypyError."""
    seq_ori = self.find_data("base", "ordered_replicon_base.fasta")
    shutil.copy(seq_ori, self.args.out_dir)
    self.args.sequence_db = os.path.join(self.args.out_dir, os.path.basename(seq_ori))
    cfg = Config(MacsyDefaults(), self.args)
    idx = Indexes(cfg)
    idx.build()
    # bypass the real constructor (avoids its parsing side effects)
    RepliconDB.__init__ = self.fake_init
    db = RepliconDB(cfg)
    with self.assertRaises(MacsypyError) as ctx:
        with self.catch_log() as log:
            # NOTE(review): uses self.cfg although a local cfg was built above —
            # confirm which config is intended here
            db._fill_gembase_min_max({}, self.cfg.replicon_topology())
    self.assertEqual(str(ctx.exception),
                     f"Error during sequence-db '{self.args.sequence_db}' parsing. "
                     f"Are you sure db-type is 'gembase'?")
def test_fill_ordered_replicon_min_max(self):
    """_fill_ordered_min_max registers a single replicon spanning all sequences."""
    fasta_src = self.find_data("base", "ordered_replicon_base.fasta")
    shutil.copy(fasta_src, self.args.out_dir)
    self.args.sequence_db = os.path.join(self.args.out_dir, os.path.basename(fasta_src))
    cfg = Config(MacsyDefaults(), self.args)
    fresh_idx = Indexes(cfg)
    fresh_idx.build()
    # bypass the real constructor (avoids its parsing side effects)
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(cfg)
    replicon_db._fill_ordered_min_max(cfg.replicon_topology())
    self.assertEqual(len(replicon_db._DB), 1)
    info = replicon_db[RepliconDB.ordered_replicon_name]
    self.assertEqual(info.topology, cfg.replicon_topology())
    self.assertEqual(info.min, 1)
    self.assertEqual(info.max, 52)
def test_build_with_idx(self):
    """Exercise build() against pre-existing index files: a valid new-style
    index is reused untouched; old-style indexes (no db path on the first
    line, or wrong field separator) force a rebuild with a warning."""
    idx = Indexes(self.cfg)
    # case new style idx
    with open(os.path.join(os.path.dirname(self.cfg.sequence_db()), idx.name + ".idx"), 'w') as idx_file:
        idx_content_new = f"{self.cfg.sequence_db()}\nVICH001.B.00001.C001_01359{idx._field_separator}200{idx._field_separator}1\n"
        idx_file.write(idx_content_new)
    my_idx = idx.build()
    # valid index must be left exactly as written (same size)
    self.assertEqual(os.path.getsize(idx_file.name), len(idx_content_new))
    # case old style no path as first line
    idx_path = os.path.join(os.path.dirname(self.cfg.sequence_db()), idx.name + ".idx")
    with open(idx_path, 'w') as idx_file:
        idx_content_old = "VICH001.B.00001.C001_01359;200;1\n"
        idx_file.write(idx_content_old)
    with self.catch_log(log_name='macsypy') as log:
        _ = idx.build()
        log_msg = log.get_value().strip()
    self.assertEqual(log_msg,
                     f"The '{idx_path}' index file is in old format. Force index building.")
    # case old style bad separator
    idx_path = os.path.join(os.path.dirname(self.cfg.sequence_db()), idx.name + ".idx")
    with open(idx_path, 'w') as idx_file:
        idx_content_old = f"{self.cfg.sequence_db()}\nVICH001.B.00001.C001_01359;200;1\n"
        idx_file.write(idx_content_old)
    with self.catch_log(log_name='macsypy') as log:
        _ = idx.build()
        log_msg = log.get_value().strip()
    self.assertEqual(log_msg,
                     f"The '{idx_path}' index file is in old format. Force index building.")
    # case idx seems valid read it
    with open(os.path.join(os.path.dirname(self.cfg.sequence_db()), idx.name + ".idx")) as idx_file_test:
        data = idx_file_test.read()
    # expected rebuilt index: db path header then one line per sequence
    new_content = f"""{self.cfg.sequence_db()}
VICH001.B.00001.C001_01359{idx._field_separator}200{idx._field_separator}1
VICH001.B.00001.C001_01360{idx._field_separator}484{idx._field_separator}2
VICH001.B.00001.C001_01361{idx._field_separator}406{idx._field_separator}3
VICH001.B.00001.C001_01390{idx._field_separator}326{idx._field_separator}4
VICH001.B.00001.C001_01391{idx._field_separator}54{idx._field_separator}5
VICH001.B.00001.C001_01392{idx._field_separator}206{idx._field_separator}6
VICH001.B.00001.C001_01393{idx._field_separator}477{idx._field_separator}7
VICH001.B.00001.C001_01394{idx._field_separator}126{idx._field_separator}8
VICH001.B.00001.C001_01395{idx._field_separator}405{idx._field_separator}9
VICH001.B.00001.C001_01396{idx._field_separator}572{idx._field_separator}10
VICH001.B.00001.C001_01397{idx._field_separator}721{idx._field_separator}11
VICH001.B.00001.C001_01398{idx._field_separator}467{idx._field_separator}12
VICH001.B.00001.C001_01399{idx._field_separator}720{idx._field_separator}13
VICH001.B.00001.C001_01400{idx._field_separator}559{idx._field_separator}14
VICH001.B.00001.C001_01401{idx._field_separator}153{idx._field_separator}15
VICH001.B.00001.C001_01402{idx._field_separator}4558{idx._field_separator}16
VICH001.B.00001.C001_01500{idx._field_separator}120{idx._field_separator}17
VICH001.B.00001.C001_01501{idx._field_separator}344{idx._field_separator}18
VICH001.B.00001.C001_01502{idx._field_separator}478{idx._field_separator}19
VICH001.B.00001.C001_01503{idx._field_separator}724{idx._field_separator}20
VICH001.B.00001.C001_01504{idx._field_separator}309{idx._field_separator}21
VICH001.B.00001.C001_01505{idx._field_separator}390{idx._field_separator}22
VICH001.B.00001.C001_01506{idx._field_separator}419{idx._field_separator}23
VICH001.B.00001.C001_01540{idx._field_separator}353{idx._field_separator}24
VICH001.B.00001.C001_01541{idx._field_separator}229{idx._field_separator}25
VICH001.B.00001.C001_01542{idx._field_separator}267{idx._field_separator}26
VICH001.B.00001.C001_01543{idx._field_separator}328{idx._field_separator}27
VICH001.B.00001.C001_01544{idx._field_separator}258{idx._field_separator}28
VICH001.B.00001.C001_01545{idx._field_separator}228{idx._field_separator}29
VICH001.B.00001.C001_01546{idx._field_separator}538{idx._field_separator}30
VICH001.B.00001.C001_01547{idx._field_separator}77{idx._field_separator}31
VICH001.B.00001.C001_01548{idx._field_separator}476{idx._field_separator}32
VICH001.B.00001.C001_01549{idx._field_separator}324{idx._field_separator}33
VICH001.B.00001.C001_01550{idx._field_separator}387{idx._field_separator}34
VICH001.B.00001.C001_01551{idx._field_separator}382{idx._field_separator}35
VICH001.B.00001.C001_01552{idx._field_separator}149{idx._field_separator}36
VICH001.B.00001.C001_01553{idx._field_separator}319{idx._field_separator}37
VICH001.B.00001.C001_01554{idx._field_separator}237{idx._field_separator}38
VICH001.B.00001.C001_01555{idx._field_separator}74{idx._field_separator}39
VICH001.B.00001.C001_01556{idx._field_separator}362{idx._field_separator}40
VICH001.B.00001.C001_01557{idx._field_separator}170{idx._field_separator}41
VICH001.B.00001.C001_01558{idx._field_separator}77{idx._field_separator}42
VICH001.B.00001.C001_01559{idx._field_separator}296{idx._field_separator}43
VICH001.B.00001.C001_01560{idx._field_separator}405{idx._field_separator}44
VICH001.B.00001.C001_01561{idx._field_separator}182{idx._field_separator}45
VICH001.B.00001.C001_01562{idx._field_separator}445{idx._field_separator}46
VICH001.B.00001.C001_01563{idx._field_separator}212{idx._field_separator}47
VICH001.B.00001.C001_01564{idx._field_separator}387{idx._field_separator}48
VICH001.B.00001.C001_01565{idx._field_separator}414{idx._field_separator}49
"""
    self.assertEqual(data, new_content)
def test_build_force(self):
    """build(force=True) must write a non-empty macsyfinder index."""
    indexes = Indexes(self.cfg)
    indexes.build(force=True)
    index_path = indexes.find_my_indexes()
    self.assertNotEqual(os.path.getsize(index_path), 0)
def test_build_with_idx(self):
    """An already-present index file is kept untouched by build()."""
    indexes = Indexes(self.cfg)
    seq_db_dir = os.path.dirname(self.cfg.sequence_db())
    preexisting = os.path.join(seq_db_dir, indexes.name + ".idx")
    # drop an empty index file in place before building
    open(preexisting, 'w').close()
    indexes.build()
    # build() must not have regenerated it: still empty
    self.assertEqual(os.path.getsize(indexes.find_my_indexes()), 0)
def search_systems(config, model_bank, gene_bank, profile_factory, logger):
    """
    Do the job, this function is the orchestrator of all the macsyfinder mechanics.
    At the end several files are produced containing the results

      - macsyfinder.conf: The set of variables used to run this job
      - macsyfinder.systems: The list of the potential systems
      - macsyfinder.rejected_cluster: The list of all clusters and cluster
        combinations which have been rejected and the reason
      - macsyfinder.log: the copy of the standard output

    :param config: The MacSyFinder Configuration
    :type config: :class:`macsypy.config.Config` object
    :param model_bank: The bank populated with the available models
    :type model_bank: :class:`macsypy.model.ModelBank` object
    :param gene_bank: the bank containing all genes
    :type gene_bank: :class:`macsypy.gene.GeneBank` object
    :param profile_factory: The profile factory
    :type profile_factory: :class:`macsypy.gene.ProfileFactory`
    :param logger: The logger use to display information to the user.
                   It must be initialized. see :func:`macsypy.init_logger`
    :type logger: :class:`colorlog.Logger` object
    :return: the systems and rejected clusters found
    :rtype: ([:class:`macsypy.system.System`, ...], [:class:`macsypy.cluster.RejectedCluster`, ...])
    """
    working_dir = config.working_dir()
    # persist the effective configuration alongside the results
    config.save(path_or_buf=os.path.join(working_dir, config.cfg_name))

    # register every model package found in the models dir
    registry = ModelRegistry()
    models_loc_available = scan_models_dir(config.models_dir(),
                                           profile_suffix=config.profile_suffix(),
                                           relative_path=config.relative_path())
    for model_loc in models_loc_available:
        registry.add(model_loc)

    # build indexes
    idx = Indexes(config)
    idx.build(force=config.idx)

    # create models
    parser = DefinitionParser(config, model_bank, gene_bank, registry, profile_factory)
    try:
        models_def_to_detect = get_def_to_detect(config.models(), registry)
    except KeyError as err:
        # unknown model name requested on the command line: abort with a message
        sys.exit(f"macsyfinder: {err}")
    parser.parse(models_def_to_detect)

    # NOTE(review): this message concatenates 'working_dir' and the path with no
    # separator — confirm whether a ': ' was intended.
    logger.info(f"MacSyFinder's results will be stored in working_dir{working_dir}")
    logger.info(f"Analysis launched on {config.sequence_db()} for model(s):")
    for m in models_def_to_detect:
        logger.info(f"\t- {m.fqn}")

    models_to_detect = [model_bank[model_loc.fqn] for model_loc in models_def_to_detect]
    # collect every gene the search must cover
    all_genes = []
    for model in models_to_detect:
        genes = model.mandatory_genes + model.accessory_genes + model.neutral_genes + model.forbidden_genes
        # Exchangeable (formerly homologs/analogs) are also added
        # because they can "replace" an important gene...
        ex_genes = []
        for g in genes:
            ex_genes += g.exchangeables
        all_genes += (genes + ex_genes)

    #############################################
    # this part of code is executed in parallel
    #############################################
    try:
        all_reports = search_genes(all_genes, config)
    except Exception as err:
        sys.exit(str(err))
    #############################################
    # end of parallel code
    #############################################

    # flatten the per-report hit lists into one list
    all_hits = [hit for subl in [report.hits for report in all_reports] for hit in subl]

    if len(all_hits) > 0:
        # It's important to keep this sorting to have in last all_hits version
        # the hits with the same replicon_name and position sorted by score
        # the best score in first
        hits_by_replicon = {}
        for hit in all_hits:
            if hit.replicon_name in hits_by_replicon:
                hits_by_replicon[hit.replicon_name].append(hit)
            else:
                hits_by_replicon[hit.replicon_name] = [hit]
        for rep_name in hits_by_replicon:
            # keep only the best-scoring hit per position, then order by position
            hits_by_replicon[rep_name] = get_best_hits(hits_by_replicon[rep_name], key='score')
            hits_by_replicon[rep_name].sort(key=attrgetter('position'))
        models_to_detect = sorted(models_to_detect, key=attrgetter('name'))
        db_type = config.db_type()
        if db_type in ('ordered_replicon', 'gembase'):
            systems, rejected_clusters = _search_in_ordered_replicon(hits_by_replicon, models_to_detect,
                                                                     config, logger)
            return systems, rejected_clusters
        elif db_type == "unordered":
            likely_systems, rejected_hits = _search_in_unordered_replicon(hits_by_replicon, models_to_detect,
                                                                          logger)
            return likely_systems, rejected_hits
        else:
            assert False, f"dbtype have an invalid value {db_type}"
    else:
        # No hits detected
        return [], []
class Test(MacsyTest):
    """Tests of RepliconDB: topology parsing, min/max filling and the
    dict-like access API (in / getitem / get / items / names / infos)."""

    def __init__(self, methodName='runTest'):
        super(Test, self).__init__(methodName)

        def fake_init(obj, cfg):
            # Lightweight stand-in for RepliconDB.__init__: sets up the same
            # attributes but skips the sequence-db parsing done by the real one.
            obj.cfg = cfg
            obj._idx = Indexes(cfg)
            obj.sequence_idx = obj._idx.find_my_indexes()
            obj.topology_file = cfg.topology_file()
            obj._DB = {}
        # keep both the fake and the real __init__ so tests can swap them
        # and tearDown can restore the original
        self.fake_init = fake_init
        self.real_init = RepliconDB.__init__

    def setUp(self):
        """Build a clean out_dir, config, expected gene fixtures and the index."""
        self.args = argparse.Namespace()
        self.args.db_type = 'gembase'
        self.args.models_dir = self.find_data('models')
        self.args.res_search_dir = tempfile.gettempdir()
        self.args.log_level = 30
        self.args.out_dir = os.path.join(self.args.res_search_dir, 'test_macsyfinder_repliconDB')
        if os.path.exists(self.args.out_dir):
            shutil.rmtree(self.args.out_dir)
        os.mkdir(self.args.out_dir)
        seq_db = self.find_data("base", "test_base.fa")
        shutil.copy(seq_db, self.args.out_dir)
        self.args.sequence_db = os.path.join(self.args.out_dir, os.path.basename(seq_db))
        self.cfg = Config(MacsyDefaults(), self.args)
        # expected (gene_id, length) pairs per replicon in test_base.fa
        self.ESCO030p01_genes = [('000010', 886), ('000020', 291), ('000030', 656), ('000040', 500),
                                 ('000050', 407), ('000060', 144), ('000070', 183), ('000080', 121),
                                 ('000090', 199), ('000100', 325), ('000110', 425), ('000120', 171),
                                 ('000130', 277), ('000140', 133), ('000150', 108), ('000160', 295),
                                 ('000170', 273), ('000180', 367), ('000190', 573), ('000200', 343),
                                 ('000210', 295), ('000220', 108), ('000230', 117), ('000240', 153),
                                 ('000250', 479), ('000260', 706), ('000270', 998), ('000280', 171),
                                 ('000290', 108), ('000300', 295), ('000310', 165), ('000320', 243),
                                 ('000330', 295), ('000340', 108), ('000350', 1755), ('000360', 248),
                                 ('000370', 286), ('000380', 186), ('000390', 83), ('000400', 153),
                                 ('000410', 69), ('000420', 295), ('000430', 108), ('000440', 145),
                                 ('000450', 59), ('000460', 124), ('000470', 246), ('000480', 325),
                                 ('000490', 54), ('000500', 95), ('000510', 83), ('000520', 56),
                                 ('000530', 401), ('000540', 320), ('000550', 256), ('000560', 73),
                                 ('000570', 144), ('000580', 258), ('000590', 133), ('000600', 140),
                                 ('000610', 63), ('000620', 138), ('000630', 68), ('000640', 169),
                                 ('000650', 127), ('000660', 295), ('000670', 108), ('000670', 108)]
        self.PSAE001c01_genes = [('006940', 803), ('013980', 759), ('017350', 600), ('018920', 776),
                                 ('026600', 273), ('031420', 658), ('043580', 416), ('051090', 714),
                                 ('055870', 449), ('055880', 447), ('055890', 588), ('055900', 292),
                                 ('055910', 262), ('055920', 166), ('055930', 288), ('055940', 194),
                                 ('055950', 567), ('055960', 188), ('055970', 247), ('055980', 252),
                                 ('055990', 455), ('056000', 450), ('056010', 260), ('056020', 246),
                                 ('056030', 70), ('056040', 133), ('056050', 284), ('056060', 585),
                                 ('056070', 435), ('056080', 342), ('056090', 252), ('056100', 122),
                                 ('056110', 213), ('056120', 400), ('056130', 134), ('056140', 138),
                                 ('056150', 397), ('056160', 298), ('056170', 186), ('056180', 445),
                                 ('056190', 414), ('056200', 132), ('056210', 674), ('056220', 319),
                                 ('056230', 394), ('056240', 207), ('056250', 401), ('056260', 611),
                                 ('056270', 257), ('056280', 169), ('056290', 454), ('056300', 141),
                                 ('056310', 458), ('056320', 286), ('056330', 514), ('056340', 178),
                                 ('056350', 156), ('056360', 85), ('056370', 289), ('056380', 126),
                                 ('056390', 290), ('056400', 262), ('056410', 214), ('056420', 630),
                                 ('056430', 127), ('056440', 455), ('056440', 455)]
        self.NCDB_genes = [('056134', 289), ('056135', 126), ('056136', 290), ('056137', 262),
                           ('056138', 214), ('056139', 630), ('056140', 127), ('056141', 803),
                           ('056141', 803)]
        self.idx = Indexes(self.cfg)
        self.idx.build()

    def tearDown(self):
        try:
            shutil.rmtree(self.cfg.working_dir())
        except:
            # best-effort cleanup: ignore a missing/locked working dir
            pass
        # restore the real constructor patched by some tests
        RepliconDB.__init__ = self.real_init

    def test_fill_topology(self):
        """_fill_topology must parse 'name : topology' lines from the topo file."""
        self.args.topology_file = self.args.sequence_db + ".topo"
        db_send = {'ESCO030p01': 'circular', 'PSAE001c01': 'linear'}
        with open(self.args.topology_file, 'w') as f:
            for k, v in list(db_send.items()):
                f.write('{0} : {1}\n'.format(k, v))
        cfg = Config(MacsyDefaults(), self.args)
        RepliconDB.__init__ = self.fake_init
        db = RepliconDB(cfg)
        rcv_topo = db._fill_topology()
        self.assertDictEqual(db_send, rcv_topo)

    def test_fill_ordered_replicon_min_max(self):
        """_fill_ordered_min_max registers a single replicon spanning all sequences."""
        seq_ori = self.find_data("base", "ordered_replicon_base.fasta")
        shutil.copy(seq_ori, self.args.out_dir)
        self.args.sequence_db = os.path.join(self.args.out_dir, os.path.basename(seq_ori))
        cfg = Config(MacsyDefaults(), self.args)
        idx = Indexes(cfg)
        idx.build()
        RepliconDB.__init__ = self.fake_init
        db = RepliconDB(cfg)
        db._fill_ordered_min_max(cfg.replicon_topology())
        self.assertEqual(len(db._DB), 1)
        rep = db[RepliconDB.ordered_replicon_name]
        self.assertEqual(rep.topology, cfg.replicon_topology())
        self.assertEqual(rep.min, 1)
        self.assertEqual(rep.max, 52)

    def test_fill_gembase_min_max_default_topology(self):
        """With no topology file every replicon gets the default topology."""
        RepliconDB.__init__ = self.fake_init
        db = RepliconDB(self.cfg)
        db._fill_gembase_min_max({}, self.cfg.replicon_topology())
        self.assertEqual(len(db._DB), 3)
        self.assertEqual(set(db._DB.keys()), set(['ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx']))
        PRRU001c01 = db['ESCO030p01']
        self.assertEqual(PRRU001c01.topology, 'circular')
        self.assertEqual(PRRU001c01.min, 1)
        self.assertEqual(PRRU001c01.max, 67)
        self.assertEqual(PRRU001c01.genes, self.ESCO030p01_genes)
        PSAE001c01 = db['PSAE001c01']
        self.assertEqual(PSAE001c01.topology, 'circular')
        self.assertEqual(PSAE001c01.min, 68)
        self.assertEqual(PSAE001c01.max, 133)
        self.assertEqual(PSAE001c01.genes, self.PSAE001c01_genes)
        DBNC = db['NC_xxxxx_xx']
        self.assertEqual(DBNC.topology, 'circular')
        self.assertEqual(DBNC.min, 134)
        self.assertEqual(DBNC.max, 141)
        self.assertEqual(DBNC.genes, self.NCDB_genes)

    def test_fill_gembase_min_max_oredered_replicon(self):
        """_fill_gembase_min_max on an ordered-replicon fasta must raise MacsypyError."""
        seq_ori = self.find_data("base", "ordered_replicon_base.fasta")
        shutil.copy(seq_ori, self.args.out_dir)
        self.args.sequence_db = os.path.join(self.args.out_dir, os.path.basename(seq_ori))
        cfg = Config(MacsyDefaults(), self.args)
        idx = Indexes(cfg)
        idx.build()
        RepliconDB.__init__ = self.fake_init
        db = RepliconDB(cfg)
        with self.assertRaises(MacsypyError) as ctx:
            with self.catch_log() as log:
                db._fill_gembase_min_max({}, self.cfg.replicon_topology())
        self.assertEqual(str(ctx.exception),
                         f"Error during sequence-db '{self.args.sequence_db}' parsing. "
                         f"Are you sure db-type is 'gembase'?")

    def test_fill_gembase_min_max_with_topology(self):
        """The per-replicon topology from the topo file overrides the default."""
        self.args.topology_file = self.args.sequence_db + ".topo"
        with open(self.args.topology_file, 'w') as f:
            f.write('# topology file\nESCO030p01 : circular\nPSAE001c01 : linear\n')
        cfg = Config(MacsyDefaults(), self.args)
        RepliconDB.__init__ = self.fake_init
        db = RepliconDB(cfg)
        topo_dict = db._fill_topology()
        db._fill_gembase_min_max(topo_dict, 'circular')
        self.assertEqual(len(db._DB), 3)
        self.assertEqual(set(db._DB.keys()), set(['ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx']))
        ESCO030p01 = db['ESCO030p01']
        self.assertEqual(ESCO030p01.topology, 'circular')
        self.assertEqual(ESCO030p01.min, 1)
        self.assertEqual(ESCO030p01.max, 67)
        self.assertEqual(ESCO030p01.genes, self.ESCO030p01_genes)
        PSAE001c01 = db['PSAE001c01']
        self.assertEqual(PSAE001c01.topology, 'linear')
        self.assertEqual(PSAE001c01.min, 68)
        self.assertEqual(PSAE001c01.max, 133)
        self.assertEqual(PSAE001c01.genes, self.PSAE001c01_genes)
        DBNC = db['NC_xxxxx_xx']
        self.assertEqual(DBNC.topology, 'circular')
        self.assertEqual(DBNC.min, 134)
        self.assertEqual(DBNC.max, 141)
        self.assertEqual(DBNC.genes, self.NCDB_genes)

    def test_in(self):
        """Membership test against the replicon names."""
        db = RepliconDB(self.cfg)
        self.assertIn('ESCO030p01', db)
        self.assertIn('PSAE001c01', db)
        self.assertIn('NC_xxxxx_xx', db)
        self.assertNotIn('toto', db)

    def test_getitem(self):
        """db[name] returns the RepliconInfo; unknown names raise KeyError."""
        db = RepliconDB(self.cfg)
        ESCO030p01 = RepliconInfo(self.cfg.replicon_topology(), 1, 67, self.ESCO030p01_genes)
        PSAE001c01 = RepliconInfo(self.cfg.replicon_topology(), 68, 133, self.PSAE001c01_genes)
        NCXX = RepliconInfo("circular", 134, 141, self.NCDB_genes)
        self.assertEqual(ESCO030p01, db['ESCO030p01'])
        self.assertEqual(PSAE001c01, db['PSAE001c01'])
        self.assertEqual(NCXX, db['NC_xxxxx_xx'])
        self.assertRaises(KeyError, db.__getitem__, 'foo')

    def test_get(self):
        """db.get mirrors dict.get: value, None, or the provided default."""
        db = RepliconDB(self.cfg)
        ESCO030p01 = RepliconInfo(self.cfg.replicon_topology(), 1, 67, self.ESCO030p01_genes)
        PSAE001c01 = RepliconInfo(self.cfg.replicon_topology(), 68, 133, self.PSAE001c01_genes)
        NCXX = RepliconInfo("circular", 134, 141, self.NCDB_genes)
        self.assertEqual(ESCO030p01, db.get('ESCO030p01'))
        self.assertEqual(PSAE001c01, db.get('PSAE001c01'))
        self.assertEqual(NCXX, db.get('NC_xxxxx_xx', 'foo'))
        self.assertIsNone(db.get('foo'))
        self.assertEqual('bar', db.get('foo', 'bar'))

    def test_items(self):
        """db.items() yields (name, RepliconInfo) pairs (order-insensitive)."""
        db = RepliconDB(self.cfg)
        ESCO030p01 = RepliconInfo(self.cfg.replicon_topology(), 1, 67, self.ESCO030p01_genes)
        PSAE001c01 = RepliconInfo(self.cfg.replicon_topology(), 68, 133, self.PSAE001c01_genes)
        NCXX = RepliconInfo("circular", 134, 141, self.NCDB_genes)
        self.assertCountEqual(list(db.items()),
                              [('ESCO030p01', ESCO030p01), ('NC_xxxxx_xx', NCXX), ('PSAE001c01', PSAE001c01)])

    def test_iteritems(self):
        """db.iteritems() yields the pairs lazily, in insertion order."""
        db = RepliconDB(self.cfg)
        ESCO030p01 = RepliconInfo(self.cfg.replicon_topology(), 1, 67, self.ESCO030p01_genes)
        PSAE001c01 = RepliconInfo(self.cfg.replicon_topology(), 68, 133, self.PSAE001c01_genes)
        NCXX = RepliconInfo("circular", 134, 141, self.NCDB_genes)
        iter_items = db.iteritems()
        for item in [('ESCO030p01', ESCO030p01), ('PSAE001c01', PSAE001c01), ('NC_xxxxx_xx', NCXX)]:
            with self.subTest(item=item):
                self.assertEqual(next(iter_items), item)

    def test_names(self):
        """replicon_names() lists the replicon names in insertion order."""
        db = RepliconDB(self.cfg)
        exp_name = ['ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx']
        self.assertListEqual(db.replicon_names(), exp_name)

    def test_replicon_infos(self):
        """replicon_infos() yields every RepliconInfo (order-insensitive)."""
        db = RepliconDB(self.cfg)
        ESCO030p01 = RepliconInfo(self.cfg.replicon_topology(), 1, 67, self.ESCO030p01_genes)
        PSAE001c01 = RepliconInfo(self.cfg.replicon_topology(), 68, 133, self.PSAE001c01_genes)
        NCXX = RepliconInfo("circular", 134, 141, self.NCDB_genes)
        values = db.replicon_infos()
        self.assertCountEqual(values, [ESCO030p01, NCXX, PSAE001c01])