def test_fill_ordered_replicon_min_max(self):
    """Check what ``_fill_ordered_min_max`` records for an ordered replicon.

    A dedicated ``ordered_replicon`` configuration is built, the sequence
    file is copied into the working directory and indexed, then the
    database is expected to hold exactly one replicon, spanning 1 to 52,
    with the configured topology.
    """
    # Local import: the temporary directory is resolved portably instead of
    # hard-coding '/tmp'.
    import tempfile

    # tearDown() is called explicitly before replacing self.cfg
    # (presumably to discard what setUp created -- verify against setUp).
    self.tearDown()
    self.cfg = Config(
        hmmer_exe="hmmsearch",
        sequence_db=os.path.join(self._data_dir, "base", "ordered_replicon_base"),
        db_type="ordered_replicon",
        e_value_res=1,
        i_evalue_sel=0.5,
        def_dir=os.path.join(self._data_dir, 'DEF'),
        res_search_dir=tempfile.gettempdir(),
        res_search_suffix=".search_hmm.out",
        profile_dir=os.path.join(self._data_dir, 'profiles'),
        profile_suffix=".hmm",
        res_extract_suffix="",
        log_level=30,
        # os.devnull is '/dev/null' on POSIX and 'nul' on Windows.
        log_file=os.devnull,
    )

    # Work on a copy of the sequence file located in the working directory.
    shutil.copy(self.cfg.sequence_db, self.cfg.working_dir)
    self.cfg.options['sequence_db'] = os.path.join(
        self.cfg.working_dir, os.path.basename(self.cfg.sequence_db))

    idx = Indexes(self.cfg)
    idx._build_my_indexes()

    # Swap the constructor for the test's fake_init so the fill method can
    # be invoked explicitly below.
    # NOTE(review): the patch is not undone here; presumably tearDown
    # restores the real __init__ -- confirm.
    RepliconDB.__init__ = self.fake_init
    db = RepliconDB(self.cfg)
    db._fill_ordered_min_max(self.cfg.replicon_topology)

    self.assertEqual(len(db._DB), 1)
    rep = db[RepliconDB.ordered_replicon_name]
    self.assertEqual(rep.topology, self.cfg.replicon_topology)
    self.assertEqual(rep.min, 1)
    self.assertEqual(rep.max, 52)
def test_fill_gembase_min_max_with_topology(self):
    """Check ``_fill_gembase_min_max`` when a topology file is supplied.

    The topology file (with a leading comment line) declares two of the
    three replicons; the third one is expected to get the default topology
    passed to the method ('circular').
    """
    self.cfg.options['topology_file'] = self.cfg.sequence_db + ".topo"
    with open(self.cfg.topology_file, 'w') as topo_file:
        topo_file.write('# topology file\nESCO030p01 : circular\nPSAE001c01 : linear\n')

    # Use the test's fake_init as constructor so the fill steps are driven
    # explicitly from here.
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(self.cfg)
    topologies = replicon_db._fill_topology()
    replicon_db._fill_gembase_min_max(topologies, 'circular')

    self.assertEqual(len(replicon_db._DB), 3)
    self.assertEqual(set(replicon_db._DB.keys()),
                     {'ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx'})

    # (replicon name, topology, min, max, genes)
    expectations = [
        ('ESCO030p01', 'circular', 1, 67, self.ESCO030p01_genes),
        ('PSAE001c01', 'linear', 68, 133, self.PSAE001c01_genes),
        ('NC_xxxxx_xx', 'circular', 134, 141, self.NCDB_genes),
    ]
    for name, topology, first, last, genes in expectations:
        info = replicon_db[name]
        self.assertEqual(info.topology, topology)
        self.assertEqual(info.min, first)
        self.assertEqual(info.max, last)
        self.assertEqual(info.genes, genes)
def test_items(self):
    """``RepliconDB.items`` yields one ``(name, RepliconInfo)`` pair per replicon.

    The comparison ignores ordering.
    """
    # assertItemsEqual only exists in Python 2's unittest; Python 3 provides
    # the same check under the name assertCountEqual.
    try:
        assert_same_items = self.assertCountEqual
    except AttributeError:
        assert_same_items = self.assertItemsEqual

    db = RepliconDB(self.cfg)
    ESCO030p01 = RepliconInfo(self.cfg.replicon_topology, 1, 67, self.ESCO030p01_genes)
    PSAE001c01 = RepliconInfo(self.cfg.replicon_topology, 68, 133, self.PSAE001c01_genes)
    NCXX = RepliconInfo("circular", 134, 141, self.NCDB_genes)
    assert_same_items(db.items(),
                      [('ESCO030p01', ESCO030p01),
                       ('NC_xxxxx_xx', NCXX),
                       ('PSAE001c01', PSAE001c01)])
def test_fill_ordered_replicon_min_max(self):
    """Check what ``_fill_ordered_min_max`` records for an ordered replicon.

    A dedicated ``ordered_replicon`` configuration is built, the sequence
    file is copied into the working directory and indexed; the database must
    then contain a single replicon covering positions 1 to 52.
    """
    # tearDown() is called explicitly before self.cfg is replaced.
    self.tearDown()

    data_root = self._data_dir
    settings = {
        'hmmer_exe': "hmmsearch",
        'sequence_db': os.path.join(data_root, "base", "ordered_replicon_base"),
        'db_type': "ordered_replicon",
        'e_value_res': 1,
        'i_evalue_sel': 0.5,
        'def_dir': os.path.join(data_root, 'DEF'),
        'res_search_dir': tempfile.gettempdir(),
        'res_search_suffix': ".search_hmm.out",
        'profile_dir': os.path.join(data_root, 'profiles'),
        'profile_suffix': ".hmm",
        'res_extract_suffix': "",
        'log_level': 30,
        'log_file': 'NUL' if platform.system() == 'Windows' else '/dev/null',
    }
    self.cfg = Config(**settings)

    # Index a copy of the sequence file placed in the working directory.
    shutil.copy(self.cfg.sequence_db, self.cfg.working_dir)
    copied_name = os.path.basename(self.cfg.sequence_db)
    self.cfg.options['sequence_db'] = os.path.join(self.cfg.working_dir, copied_name)
    indexes = Indexes(self.cfg)
    indexes._build_my_indexes()

    # The test's fake_init stands in for the constructor so the fill method
    # is called explicitly.
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(self.cfg)
    replicon_db._fill_ordered_min_max(self.cfg.replicon_topology)

    self.assertEqual(len(replicon_db._DB), 1)
    info = replicon_db[RepliconDB.ordered_replicon_name]
    self.assertEqual(info.topology, self.cfg.replicon_topology)
    self.assertEqual(info.min, 1)
    self.assertEqual(info.max, 52)
def test_get(self):
    """``RepliconDB.get`` returns the stored info, else ``None`` or the given default."""
    replicon_db = RepliconDB(self.cfg)
    topology = self.cfg.replicon_topology

    # (replicon name, expected RepliconInfo)
    known = [
        ('ESCO030p01', RepliconInfo(topology, 1, 67, self.ESCO030p01_genes)),
        ('PSAE001c01', RepliconInfo(topology, 68, 133, self.PSAE001c01_genes)),
    ]
    for name, expected_info in known:
        self.assertEqual(expected_info, replicon_db.get(name))

    # Unknown replicon: None without default, the default otherwise.
    self.assertIsNone(replicon_db.get('foo'))
    self.assertEqual('bar', replicon_db.get('foo', 'bar'))
def test_items(self):
    """``RepliconDB.items`` yields one ``(name, RepliconInfo)`` pair per replicon.

    The comparison ignores ordering.
    """
    # assertItemsEqual only exists in Python 2's unittest; Python 3 provides
    # the same check under the name assertCountEqual.
    try:
        assert_same_items = self.assertCountEqual
    except AttributeError:
        assert_same_items = self.assertItemsEqual

    db = RepliconDB(self.cfg)
    ESCO030p01 = RepliconInfo(self.cfg.replicon_topology, 1, 67, self.ESCO030p01_genes)
    PSAE001c01 = RepliconInfo(self.cfg.replicon_topology, 68, 133, self.PSAE001c01_genes)
    assert_same_items(db.items(),
                      [('ESCO030p01', ESCO030p01), ('PSAE001c01', PSAE001c01)])
    # The statements that used to follow the assertion rebuilt the database
    # and two RepliconInfo objects without ever using them; they were
    # removed as dead code.
def test_replicon_infos(self):
    """``RepliconDB.replicon_infos`` returns the info of every replicon (order ignored)."""
    replicon_db = RepliconDB(self.cfg)
    expected_infos = [
        RepliconInfo(self.cfg.replicon_topology(), 1, 67, self.ESCO030p01_genes),
        RepliconInfo("circular", 134, 141, self.NCDB_genes),
        RepliconInfo(self.cfg.replicon_topology(), 68, 133, self.PSAE001c01_genes),
    ]
    self.assertCountEqual(replicon_db.replicon_infos(), expected_infos)
def test_fill_topology(self):
    """``_fill_topology`` must return the name -> topology mapping written to the file."""
    self.cfg.options['topology_file'] = self.cfg.sequence_db + ".topo"
    sent = {'ESCO030p01': 'circular', 'PSAE001c01': 'linear'}

    # One "<name> : <topology>" line per replicon.
    with open(self.cfg.topology_file, 'w') as topo_file:
        topo_file.writelines('%s : %s\n' % entry for entry in sent.items())

    # Use the test's fake_init as constructor so only _fill_topology is
    # exercised here.
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(self.cfg)
    received = replicon_db._fill_topology()
    self.assertDictEqual(sent, received)
def test_iteritems(self):
    """``RepliconDB.iteritems`` yields ``(name, RepliconInfo)`` pairs in the expected order."""
    replicon_db = RepliconDB(self.cfg)
    expected_sequence = [
        ('ESCO030p01',
         RepliconInfo(self.cfg.replicon_topology(), 1, 67, self.ESCO030p01_genes)),
        ('PSAE001c01',
         RepliconInfo(self.cfg.replicon_topology(), 68, 133, self.PSAE001c01_genes)),
        ('NC_xxxxx_xx',
         RepliconInfo("circular", 134, 141, self.NCDB_genes)),
    ]

    pairs = replicon_db.iteritems()
    # Each expected pair is compared with the next value produced by the
    # iterator, one sub-test per pair.
    for item in expected_sequence:
        with self.subTest(item=item):
            produced = next(pairs)
            self.assertEqual(produced, item)
def test_fill_topology(self):
    """``_fill_topology`` must return the name -> topology mapping written to the file."""
    self.args.topology_file = self.args.sequence_db + ".topo"
    sent = {'ESCO030p01': 'circular', 'PSAE001c01': 'linear'}

    # One "<name> : <topology>" line per replicon.
    with open(self.args.topology_file, 'w') as topo_file:
        for replicon_name, topology in sent.items():
            topo_file.write('{0} : {1}\n'.format(replicon_name, topology))

    config = Config(MacsyDefaults(), self.args)
    # Use the test's fake_init as constructor so only _fill_topology is
    # exercised here.
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(config)
    received = replicon_db._fill_topology()
    self.assertDictEqual(sent, received)
def test_fill_topology(self):
    """Round-trip check: topologies written to the file are read back unchanged."""
    topology_path = self.cfg.sequence_db + ".topo"
    self.cfg.options['topology_file'] = topology_path
    expected_topologies = {'ESCO030p01': 'circular', 'PSAE001c01': 'linear'}

    with open(self.cfg.topology_file, 'w') as handle:
        for name, topology in expected_topologies.items():
            line = '%s : %s\n' % (name, topology)
            handle.write(line)

    # The test's fake_init replaces the constructor; _fill_topology is then
    # called directly.
    RepliconDB.__init__ = self.fake_init
    parsed_topologies = RepliconDB(self.cfg)._fill_topology()
    self.assertDictEqual(expected_topologies, parsed_topologies)
def test_items(self):
    """``RepliconDB.items`` yields one ``(name, RepliconInfo)`` pair per replicon.

    The comparison ignores ordering.
    """
    # assertItemsEqual only exists in Python 2's unittest; Python 3 provides
    # the same check under the name assertCountEqual.
    try:
        assert_same_items = self.assertCountEqual
    except AttributeError:
        assert_same_items = self.assertItemsEqual

    db = RepliconDB(self.cfg)
    ESCO030p01 = RepliconInfo(self.cfg.replicon_topology, 1, 67, self.ESCO030p01_genes)
    PSAE001c01 = RepliconInfo(self.cfg.replicon_topology, 68, 133, self.PSAE001c01_genes)
    assert_same_items(db.items(),
                      [('ESCO030p01', ESCO030p01), ('PSAE001c01', PSAE001c01)])
    # The statements that used to follow the assertion rebuilt the database
    # and two RepliconInfo objects without ever using them; they were
    # removed as dead code.
def test_fill_gembase_min_max_default_topology(self):
    """Without topology entries every replicon is expected to be 'circular'.

    ``_fill_gembase_min_max`` is given an empty topology mapping and the
    configured default topology.
    """
    # Use the test's fake_init as constructor so the fill method is called
    # explicitly.
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(self.cfg)
    replicon_db._fill_gembase_min_max({}, self.cfg.replicon_topology)
    self.assertEqual(len(replicon_db._DB), 2)

    # (replicon name, expected min, expected max)
    for name, first, last in (('ESCO030p01', 1, 67), ('PSAE001c01', 68, 133)):
        info = replicon_db[name]
        self.assertEqual(info.topology, 'circular')
        self.assertEqual(info.min, first)
        self.assertEqual(info.max, last)
def test_fill_gembase_min_max_with_topology(self):
    """Check ``_fill_gembase_min_max`` with topologies read from a topology file."""
    self.cfg.options['topology_file'] = self.cfg.sequence_db + ".topo"
    with open(self.cfg.topology_file, 'w') as topo_file:
        topo_file.write('ESCO030p01 : circular\nPSAE001c01 : linear\n')

    # Use the test's fake_init as constructor so the fill steps are driven
    # explicitly from here.
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(self.cfg)
    topologies = replicon_db._fill_topology()
    replicon_db._fill_gembase_min_max(topologies, 'circular')
    self.assertEqual(len(replicon_db._DB), 2)

    # (replicon name, topology, min, max)
    expectations = (
        ('ESCO030p01', 'circular', 1, 67),
        ('PSAE001c01', 'linear', 68, 133),
    )
    for name, topology, first, last in expectations:
        info = replicon_db[name]
        self.assertEqual(info.topology, topology)
        self.assertEqual(info.min, first)
        self.assertEqual(info.max, last)
def test_fill_gembase_min_max_with_topology(self):
    """Topologies declared in the topology file must end up on the replicons."""
    topology_path = self.cfg.sequence_db + ".topo"
    self.cfg.options['topology_file'] = topology_path
    with open(self.cfg.topology_file, 'w') as handle:
        handle.write('ESCO030p01 : circular\nPSAE001c01 : linear\n')

    # The test's fake_init replaces the constructor; both fill steps are
    # then called by hand.
    RepliconDB.__init__ = self.fake_init
    database = RepliconDB(self.cfg)
    database._fill_gembase_min_max(database._fill_topology(), 'circular')
    self.assertEqual(len(database._DB), 2)

    circular_replicon = database['ESCO030p01']
    self.assertEqual(circular_replicon.topology, 'circular')
    self.assertEqual(circular_replicon.min, 1)
    self.assertEqual(circular_replicon.max, 67)

    linear_replicon = database['PSAE001c01']
    self.assertEqual(linear_replicon.topology, 'linear')
    self.assertEqual(linear_replicon.min, 68)
    self.assertEqual(linear_replicon.max, 133)
def _search_in_ordered_replicon(hits_by_replicon, models_to_detect, config, logger):
    """
    Search systems replicon by replicon and, within a replicon, model by model.

    For each replicon name found in *hits_by_replicon* and each model of
    *models_to_detect*, the hits kept by ``model.filter`` are turned into
    clusters, candidate combinations of clusters are assembled and every
    combination is submitted to an ``OrderedMatchMaker``.

    :param hits_by_replicon: container indexed by replicon name; iterating it
                             gives the names, indexing it gives the hits
                             handed to ``model.filter``
    :param models_to_detect: the models to look for
    :param config: configuration object; it is passed to ``RepliconDB`` and
                   its ``hit_weights()`` and ``redundancy_penalty()`` are read
    :param logger: logger receiving the info/debug trace
    :return: a 2-tuple ``(systems, rejected_clusters)``: the match results
             that are ``System`` instances, sorted by replicon name, first
             position, model fqn and decreasing score, and every other match
             result in the order produced
    """
    def _ordering_key(syst):
        return syst.replicon_name, syst.position[0], syst.model.fqn, -syst.score

    found_systems = []
    not_matched = []
    replicon_db = RepliconDB(config)

    for rep_name in hits_by_replicon:
        banner = f" Hits analysis for replicon {rep_name} "
        logger.info("\n{:#^60}".format(banner))
        rep_info = replicon_db[rep_name]

        for model in models_to_detect:
            logger.info(f"Check model {model.fqn}")
            model_hits = model.filter(hits_by_replicon[rep_name])

            hits_title = " hits related to {} ".format(model.name)
            logger.debug("{:#^80}".format(hits_title))
            logger.debug("".join(str(hit) for hit in model_hits))
            logger.debug("#" * 80)

            logger.info("Building clusters")
            hit_weights = HitWeight(**config.hit_weights())
            clusters = cluster.build_clusters(model_hits, rep_info, model, hit_weights)
            logger.debug("{:#^80}".format("CLUSTERS"))
            logger.debug("\n" + "\n".join(str(clst) for clst in clusters))
            logger.debug("#" * 80)

            logger.info("Searching systems")
            if model.multi_loci:
                # Loners already sit in the cluster list together with their
                # context, so plain combinations of every size cover them.
                candidate_sets = [
                    itertools.combinations(clusters, size)
                    for size in range(1, len(clusters) + 1)
                ]
            else:
                # Loners have to be added by hand, but only those a cluster
                # does not already contain.
                loners = cluster.get_loners(model_hits, model, hit_weights)
                candidate_sets = [
                    [[one_cluster, *cluster.filter_loners(one_cluster, loners)]]
                    for one_cluster in clusters
                ]

            for candidate_set in candidate_sets:
                for candidate in candidate_set:
                    matcher = OrderedMatchMaker(
                        model, redundancy_penalty=config.redundancy_penalty())
                    outcome = matcher.match(candidate)
                    target = found_systems if isinstance(outcome, System) else not_matched
                    target.append(outcome)

    if found_systems:
        found_systems.sort(key=_ordering_key)
    return found_systems, not_matched
def test_fill_gembase_min_max_oredered_replicon(self):
    """``_fill_gembase_min_max`` must raise ``MacsypyError`` on an ordered-replicon file.

    The ordered-replicon fasta file is copied to the output directory and
    indexed, then parsed as if it were a gembase; the error message is
    checked verbatim.
    """
    source_fasta = self.find_data("base", "ordered_replicon_base.fasta")
    shutil.copy(source_fasta, self.args.out_dir)
    copied_fasta = os.path.join(self.args.out_dir, os.path.basename(source_fasta))
    self.args.sequence_db = copied_fasta

    config = Config(MacsyDefaults(), self.args)
    indexes = Indexes(config)
    indexes.build()

    # Use the test's fake_init as constructor so the fill method is called
    # explicitly.
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(config)

    # catch_log() wraps the call; what it captures is not inspected.
    with self.assertRaises(MacsypyError) as ctx, self.catch_log():
        replicon_db._fill_gembase_min_max({}, self.cfg.replicon_topology())

    expected_message = (
        f"Error during sequence-db '{self.args.sequence_db}' parsing. "
        f"Are you sure db-type is 'gembase'?"
    )
    self.assertEqual(str(ctx.exception), expected_message)
def test_getitem(self):
    """Indexing ``RepliconDB`` returns the stored info and raises ``KeyError`` otherwise."""
    replicon_db = RepliconDB(self.cfg)
    topology = self.cfg.replicon_topology

    # (replicon name, expected RepliconInfo)
    known = [
        ('ESCO030p01', RepliconInfo(topology, 1, 67, self.ESCO030p01_genes)),
        ('PSAE001c01', RepliconInfo(topology, 68, 133, self.PSAE001c01_genes)),
    ]
    for name, expected_info in known:
        self.assertEqual(expected_info, replicon_db[name])

    with self.assertRaises(KeyError):
        replicon_db.__getitem__('foo')
def test_getitem(self):
    """Indexing ``RepliconDB`` returns the stored info and raises ``KeyError`` otherwise."""
    replicon_db = RepliconDB(self.cfg)
    topology = self.cfg.replicon_topology

    # (replicon name, expected RepliconInfo)
    known = [
        ('ESCO030p01', RepliconInfo(topology, 1, 67, self.ESCO030p01_genes)),
        ('PSAE001c01', RepliconInfo(topology, 68, 133, self.PSAE001c01_genes)),
        ('NC_xxxxx_xx', RepliconInfo("circular", 134, 141, self.NCDB_genes)),
    ]
    for name, expected_info in known:
        self.assertEqual(expected_info, replicon_db[name])

    with self.assertRaises(KeyError):
        replicon_db.__getitem__('foo')
def test_fill_ordered_replicon_min_max(self):
    """Check what ``_fill_ordered_min_max`` records for an ordered replicon.

    The fasta file is copied to the output directory and indexed; the
    database must then contain a single replicon covering positions 1 to 52
    with the configured topology.
    """
    source_fasta = self.find_data("base", "ordered_replicon_base.fasta")
    shutil.copy(source_fasta, self.args.out_dir)
    copied_fasta = os.path.join(self.args.out_dir, os.path.basename(source_fasta))
    self.args.sequence_db = copied_fasta

    config = Config(MacsyDefaults(), self.args)
    indexes = Indexes(config)
    indexes.build()

    # Use the test's fake_init as constructor so the fill method is called
    # explicitly.
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(config)
    replicon_db._fill_ordered_min_max(config.replicon_topology())

    self.assertEqual(len(replicon_db._DB), 1)
    info = replicon_db[RepliconDB.ordered_replicon_name]
    self.assertEqual(info.topology, config.replicon_topology())
    self.assertEqual(info.min, 1)
    self.assertEqual(info.max, 52)
def test_fill_gembase_min_max_default_topology(self):
    """Without topology entries every replicon is expected to be 'circular'.

    ``_fill_gembase_min_max`` is given an empty topology mapping and the
    configured default topology; three replicons are expected.
    """
    # Use the test's fake_init as constructor so the fill method is called
    # explicitly.
    RepliconDB.__init__ = self.fake_init
    replicon_db = RepliconDB(self.cfg)
    replicon_db._fill_gembase_min_max({}, self.cfg.replicon_topology)

    self.assertEqual(len(replicon_db._DB), 3)
    self.assertEqual(set(replicon_db._DB.keys()),
                     {'ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx'})

    # (replicon name, min, max, genes)
    expectations = [
        ('ESCO030p01', 1, 67, self.ESCO030p01_genes),
        ('PSAE001c01', 68, 133, self.PSAE001c01_genes),
        ('NC_xxxxx_xx', 134, 141, self.NCDB_genes),
    ]
    for name, first, last, genes in expectations:
        info = replicon_db[name]
        self.assertEqual(info.topology, 'circular')
        self.assertEqual(info.min, first)
        self.assertEqual(info.max, last)
        self.assertEqual(info.genes, genes)
def test_get(self):
    """``RepliconDB.get`` returns the stored info, else ``None`` or the given default."""
    replicon_db = RepliconDB(self.cfg)
    topology = self.cfg.replicon_topology
    first_replicon = RepliconInfo(topology, 1, 67, self.ESCO030p01_genes)
    second_replicon = RepliconInfo(topology, 68, 133, self.PSAE001c01_genes)
    third_replicon = RepliconInfo("circular", 134, 141, self.NCDB_genes)

    # Known replicons, without default value.
    for name, expected_info in (('ESCO030p01', first_replicon),
                                ('PSAE001c01', second_replicon)):
        self.assertEqual(expected_info, replicon_db.get(name))

    # A default must be ignored when the replicon exists.
    self.assertEqual(third_replicon, replicon_db.get('NC_xxxxx_xx', 'foo'))

    # Unknown replicon: None without default, the default otherwise.
    self.assertIsNone(replicon_db.get('foo'))
    self.assertEqual('bar', replicon_db.get('foo', 'bar'))
def test_in(self):
    """The ``in`` operator tells whether a replicon name is in the ``RepliconDB``."""
    replicon_db = RepliconDB(self.cfg)
    for present_name in ('ESCO030p01', 'PSAE001c01'):
        self.assertIn(present_name, replicon_db)
    self.assertNotIn('toto', replicon_db)
def test_names(self):
    """``RepliconDB.replicon_names`` returns the replicon names as a list, in this order."""
    expected_names = ['ESCO030p01', 'PSAE001c01', 'NC_xxxxx_xx']
    actual_names = RepliconDB(self.cfg).replicon_names()
    self.assertListEqual(actual_names, expected_names)