def setUp(self): if 'INTEGRON_HOME' in os.environ: self.integron_home = os.environ['INTEGRON_HOME'] self.local_install = True else: self.local_install = False self.integron_home = os.path.normpath(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder') if os.path.exists(self.tmp_dir) and os.path.isdir(self.tmp_dir): shutil.rmtree(self.tmp_dir) os.makedirs(self.tmp_dir) self.cmsearch = which('cmsearch') self.out_dir = self.tmp_dir self.model_attc_path = self.find_data(os.path.join('Models', 'attc_4.cm')) self.cpu_nb = 1 replicon_name = 'lian.001.c02.10' replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst')) topologies = Topology('lin') with FastaIterator(replicon_path) as sequences_db: sequences_db.topologies = topologies self.replicon = next(sequences_db) self.evalue_attc = 1. self.max_attc_size = 200 self.min_attc_size = 40 self.length_cm = 47 # length in 'CLEN' (value for model attc_4.cm) self.call = call_wrapper() infernal.read_infernal = read_infernal_mock(self.tmp_dir)
def setUp(self): if 'INTEGRON_HOME' in os.environ: self.integron_home = os.environ['INTEGRON_HOME'] self.local_install = True else: self.local_install = False self.integron_home = os.path.normpath( os.path.abspath( os.path.join(os.path.dirname(__file__), '..', '..'))) self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder') if os.path.exists(self.tmp_dir) and os.path.isdir(self.tmp_dir): shutil.rmtree(self.tmp_dir) os.makedirs(self.tmp_dir) self.cmsearch = which('cmsearch') self.out_dir = self.tmp_dir self.model_attc_path = self.find_data( os.path.join('Models', 'attc_4.cm')) self.cpu_nb = 1 replicon_name = 'lian.001.c02.10' replicon_path = self.find_data( os.path.join('Replicons', replicon_name + '.fst')) topologies = Topology('lin') with FastaIterator(replicon_path) as sequences_db: sequences_db.topologies = topologies self.replicon = next(sequences_db) self.evalue_attc = 1. self.max_attc_size = 200 self.min_attc_size = 40 self.length_cm = 47 # length in 'CLEN' (value for model attc_4.cm) self.call = call_wrapper() infernal.read_infernal = read_infernal_mock(self.tmp_dir)
def setUp(self): if 'INTEGRON_HOME' in os.environ: self.integron_home = os.environ['INTEGRON_HOME'] self.local_install = True else: self.local_install = False self.integron_home = os.path.normpath( os.path.abspath( os.path.join(os.path.dirname(__file__), '..', '..'))) self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder') os.makedirs(self.tmp_dir) integron_finder.PRODIGAL = which('prodigal') integron_finder.HMMSEARCH = which('hmmsearch') integron_finder.N_CPU = '1' integron_finder.MODEL_DIR = os.path.join(self.integron_home, "data", "Models") integron_finder.MODEL_integrase = os.path.join( integron_finder.MODEL_DIR, "integron_integrase.hmm") integron_finder.MODEL_phage_int = os.path.join( integron_finder.MODEL_DIR, "phage-int.hmm") integron_finder.MODEL_attc = os.path.join(self.integron_home, 'data', 'Models', 'attc_4.cm') self.columns = [ 'pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt', 'model', 'distance_2attC', 'annotation' ] self.dtype = { "pos_beg": 'int', "pos_end": 'int', "strand": 'int', "evalue": 'float', "type_elt": 'str', "annotation": 'str', "model": 'str', "distance_2attC": 'float' }
def setUp(self): if 'INTEGRON_HOME' in os.environ: self.integron_home = os.environ['INTEGRON_HOME'] self.local_install = True else: self.local_install = False self.integron_home = os.path.normpath( os.path.abspath( os.path.join(os.path.dirname(__file__), '..' '..'))) self.tmp_dir = tempfile.gettempdir() self.bin = os.path.join( self.integron_home, 'integron_finder' ) if self.local_install else which('integron_finder')
def setUp(self): if 'INTEGRON_HOME' in os.environ: self.integron_home = os.environ['INTEGRON_HOME'] self.local_install = True else: self.local_install = False self.integron_home = os.path.normpath( os.path.abspath( os.path.join(os.path.dirname(__file__), '..', '..'))) self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder') os.makedirs(self.tmp_dir) integron_finder.CMSEARCH = which('cmsearch') integron_finder.N_CPU = '1' integron_finder.MODEL_attc = os.path.join(self.integron_home, 'data', 'Models', 'attc_4.cm')
class TestProfile(MacsyTest): def setUp(self): args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'gembase' args.models_dir = self.find_data('models') args.res_search_dir = tempfile.gettempdir() args.log_level = 50 self.cfg = Config(MacsyDefaults(), args) if os.path.exists(self.cfg.working_dir()): shutil.rmtree(self.cfg.working_dir()) os.makedirs(self.cfg.working_dir()) self.model_name = 'foo' self.model_location = ModelLocation( path=os.path.join(args.models_dir, self.model_name)) self.profile_factory = ProfileFactory(self.cfg) def tearDown(self): try: shutil.rmtree(self.cfg.working_dir()) except: pass def test_len(self): model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("abc") profile = Profile(gene, self.cfg, path) self.assertEqual(len(profile), 501) def test_ga_threshold(self): # No GA threshold model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile(gene_name) profile = Profile(gene, self.cfg, path) self.assertFalse(profile.ga_threshold) # GA threshold line ends with ; gene_name = 'T5aSS_PF03797' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile(gene_name) profile = Profile(gene, self.cfg, path) self.assertTrue(profile.ga_threshold) # GA threshold line do NOT ends with ; gene_name = 'PF05930.13' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile(gene_name) profile = Profile(gene, self.cfg, path) self.assertTrue(profile.ga_threshold) # GA threshold invalid format string instead float gene_name = 'bad_GA' with self.catch_log(log_name='macsypy'): # When a CoreGene is created a Profile is automatically instanciated # So I mute the log to do not polute output c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile(gene_name) with self.catch_log(log_name='macsypy') as log: profile = Profile(gene, self.cfg, path) catch_msg = log.get_value().strip() self.assertFalse(profile.ga_threshold) self.assertEqual( catch_msg, "bad_GA GA score is not well formatted expected 2 floats got ''22.00'' ''23.00''.\n" "GA score will not used for gene 'bad_GA'.") # GA threshold invalid format only one score gene_name = 'bad_GA_2' with self.catch_log(log_name='macsypy'): # When a CoreGene is created a Profile is automatically instanciated # So I mute the log to do not polute output c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile(gene_name) with self.catch_log(log_name='macsypy') as log: profile = Profile(gene, self.cfg, path) catch_msg = log.get_value().strip() self.assertFalse(profile.ga_threshold) self.assertEqual( catch_msg, "bad_GA_2 GA score is not well formatted. expected: 'GA float float' got 'GA 22.00'.\n" "GA score will not used for gene 'bad_GA_2'.") def test_str(self): model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("abc") profile = Profile(gene, self.cfg, path) s = "{0} : {1}".format(gene.name, path) self.assertEqual(str(profile), s) @unittest.skipIf(not which('hmmsearch'), 'hmmsearch not found in PATH') def test_execute_hmm_with_GA(self): for db_type in ("gembase", "ordered_replicon", "unordered"): self.cfg._set_db_type(db_type) model = Model("foo/T2SS", 10) gene_name = 'T5aSS_PF03797' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) # case GA threshold in profile profile_path = self.model_location.get_profile("T5aSS_PF03797") profile = Profile(gene, self.cfg, profile_path) report = profile.execute() hmmer_raw_out = profile.hmm_raw_output with open(hmmer_raw_out, 'r') as hmmer_raw_out_file: first_l = hmmer_raw_out_file.readline() # a hmmsearch output file has been produced self.assertTrue( first_l.startswith( "# hmmsearch :: search profile(s) against a sequence database" )) for i in range(5): # skip 4 lines l = hmmer_raw_out_file.readline() # a hmmsearch used the abc profile line should become with: "# query HMM file: {the path tp hmm profile used}" self.assertTrue(l.find(profile_path) != -1) for i in range(3): # skip 2 lines l = hmmer_raw_out_file.readline() self.assertEqual( "# model-specific thresholding: GA cutoffs", l.strip()) # test if profile is executed only once per run report_bis = profile.execute() self.assertIs(report, report_bis) @unittest.skipIf(not which('hmmsearch'), 'hmmsearch not found in PATH') def test_execute_hmm_protected_path(self): # create a hmmdir with space in name self.cfg.hmmer_dir = lambda: 'hmmer results' # create sequence_db path with space in path seq_path = os.path.join(self.cfg.working_dir(), "test test1.fasta") shutil.copyfile(self.find_data("base", "test_1.fasta"), seq_path) self.cfg._set_sequence_db(seq_path) model = Model("foo/T2SS", 10) gene_name = 'T5aSS_PF03797' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) # case GA threshold in profile profile_path = self.model_location.get_profile("T5aSS_PF03797") profile = Profile(gene, self.cfg, profile_path) report = profile.execute() hmmer_raw_out = profile.hmm_raw_output with open(hmmer_raw_out, 'r') as hmmer_raw_out_file: first_l = hmmer_raw_out_file.readline() # a hmmsearch output file has been produced self.assertTrue( first_l.startswith( "# hmmsearch :: search profile(s) against a sequence database" )) for i in range(5): # skip 4 lines l = hmmer_raw_out_file.readline() # a hmmsearch used the abc profile line should become with: "# query HMM file: {the path tp hmm profile used}" self.assertTrue(l.find(profile_path) != -1) for i in range(3): # skip 2 lines l = hmmer_raw_out_file.readline() self.assertEqual("# model-specific thresholding: GA cutoffs", l.strip()) @unittest.skipIf(not which('hmmsearch'), 'hmmsearch not found in PATH') def test_execute_hmm_w_GA_n_nocutga(self): # case GA threshold in profile but --no-cut-ga is set args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'gembase' args.models_dir = self.find_data('models') args.res_search_dir = tempfile.gettempdir() args.log_level = 0 args.e_value_search = 0.5 args.no_cut_ga = True cfg = Config(MacsyDefaults(), args) model = Model("foo/T2SS", 10) gene_name = 'T5aSS_PF03797' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) profile_path = self.model_location.get_profile("T5aSS_PF03797") profile = Profile(gene, cfg, profile_path) report = profile.execute() hmmer_raw_out = profile.hmm_raw_output with open(hmmer_raw_out, 'r') as hmmer_raw_out_file: for i in range(9): l = hmmer_raw_out_file.readline() self.assertEqual( "# sequence reporting threshold: E-value <= 0.5", l.strip()) @unittest.skipIf(not which('hmmsearch'), 'hmmsearch not found in PATH') def test_execute_hmm_wo_GA(self): # case cut-ga but no GA threshold in hmmprofile model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) # case -cut-ga and GA threshold in profile profile_path = self.model_location.get_profile("abc") profile = Profile(gene, self.cfg, profile_path) with self.catch_log() as log: report = profile.execute() hmmer_raw_out = profile.hmm_raw_output with open(hmmer_raw_out, 'r') as hmmer_raw_out_file: first_l = hmmer_raw_out_file.readline() # a hmmsearch output file has been produced self.assertTrue( first_l.startswith( "# hmmsearch :: search profile(s) against a sequence database" )) for i in range(5): # skip 4 lines l = hmmer_raw_out_file.readline() # a hmmsearch used the abc profile line should become with: "# query HMM file: {the path tp hmm profile used}" self.assertTrue(l.find(profile_path) != -1) for i in range(3): # skip 2 lines l = hmmer_raw_out_file.readline() self.assertEqual( '# sequence reporting threshold: E-value <= 0.1', l.strip()) def test_execute_unknown_binary(self): self.cfg._options['hmmer'] = "Nimportnaoik" model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("abc", ) profile = Profile(gene, self.cfg, path) with self.catch_log(): with self.assertRaises(RuntimeError): profile.execute() def test_execute_hmmer_failed(self): fake_hmmer = os.path.join(tempfile.gettempdir(), 'hmmer_failed') with open(fake_hmmer, 'w') as hmmer: hmmer.write("""#! {} import sys sys.exit(127) """.format(sysconfig.sys.executable)) try: os.chmod(hmmer.name, 0o755) self.cfg._options['hmmer'] = hmmer.name model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("abc", ) profile = Profile(gene, self.cfg, path) with self.catch_log(): with self.assertRaisesRegex( RuntimeError, "an error occurred during Hmmer " "execution: command = .* : return code = 127 .*" ) as ctx: profile.execute() finally: try: os.unlink(fake_hmmer) except Exception: pass
class Test(MacsyTest): def setUp(self): self.tmp_dir = tempfile.gettempdir() # reset AbstractSetOfHits internal id to have predictable results (Systems, ...) id # it's works only if there is only one replicon # for gembase the order is not guarantee AbstractSetOfHits._id = itertools.count(1) self.all_systems_tsv = "all_systems.tsv" self.all_systems_txt = "all_systems.txt" self.all_best_solutions = "all_best_solutions.tsv" self.best_solution = "best_solution.tsv" self.rejected_clusters = "rejected_clusters.txt" self.uncomplete_systems = "uncomplete_systems.txt" def tearDown(self): try: pass # self.out_dir is set in self._macsyfinder_run shutil.rmtree(self.out_dir) except: pass @unittest.skipIf(not which('hmmsearch'), 'hmmsearch not found in PATH') def test_gembase(self): """ """ expected_result_dir = self.find_data("functional_test_gembase") args = "--db-type=gembase " \ f"--models-dir={self.find_data('models')} " \ "--models TFF-SF Archaeal-T4P ComM MSH T2SS T4bP T4P Tad " \ "--out-dir={out_dir} " \ f"--previous-run {expected_result_dir} " \ "--relative-path" self._macsyfinder_run(args) for file_name in (self.all_systems_tsv, self.all_best_solutions, self.best_solution): with self.subTest(file_name=file_name): expected_result = self.find_data(expected_result_dir, file_name) get_results = os.path.join(self.out_dir, file_name) self.assertTsvEqual(expected_result, get_results, comment="#") expected_result = self.find_data(expected_result_dir, self.rejected_clusters) get_results = os.path.join(self.out_dir, self.rejected_clusters) self.assertFileEqual(expected_result, get_results, comment="#") def test_only_loners(self): expected_result_dir = self.find_data("functional_tests_only_loners") args = "--db-type ordered_replicon " \ "--replicon-topology linear " \ f"--models-dir {self.find_data('models')} " \ "-m test_loners MOB_cf_T5SS " \ "-o {out_dir} " \ f"--previous-run {expected_result_dir} " \ "--relative-path" self._macsyfinder_run(args) for file_name in (self.all_systems_tsv, self.all_best_solutions, self.best_solution, self.rejected_clusters): with self.subTest(file_name=file_name): expected_result = self.find_data(expected_result_dir, file_name) get_results = os.path.join(self.out_dir, file_name) self.assertFileEqual(expected_result, get_results, comment="#") def test_ordered_circular(self): # genetic organization of test_3.fasta # gene abc mfp omf omf abc gspd # gene id 01397 01398 01548 01562 01399 01400 # pos 8 9 19 27 37 38 # clst ] [ # syst (abc,37), (gspd, 38), (abc,2), (mfp,3) expected_result_dir = self.find_data( "functional_test_ordered_circular") # TODO how to specify multi_loci = false when multi_loci =True is set in xml args = "--db-type ordered_replicon " \ "--replicon-topology circular " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS-simple-exch " \ "-o {out_dir} " \ f"--previous-run {expected_result_dir} " \ "--relative-path" self._macsyfinder_run(args) for file_name in (self.all_systems_tsv, self.all_best_solutions, self.best_solution, self.rejected_clusters): with self.subTest(file_name=file_name): expected_result = self.find_data(expected_result_dir, file_name) get_results = os.path.join(self.out_dir, file_name) self.assertFileEqual(expected_result, get_results, comment="#") def test_ordered_linear(self): # genetic organization of test_3.fasta # gene abc mfp omf omf abc gspd # gene id 01397 01398 01548 01562 01399 01400 # pos 8 9 19 27 37 38 # clst [ ] [ ] # syst no system expected_result_dir = self.find_data("functional_test_ordered_linear") # TODO how to specify multi_loci = false when multi_loci =True is set in xml args = "--db-type ordered_replicon " \ "--replicon-topology linear " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS-simple-exch " \ "-o {out_dir} " \ f"--previous-run {expected_result_dir} " \ "--relative-path" self._macsyfinder_run(args) for file_name in (self.all_systems_tsv, self.all_best_solutions, self.best_solution, self.rejected_clusters): with self.subTest(file_name=file_name): expected_result = self.find_data(expected_result_dir, file_name) get_results = os.path.join(self.out_dir, file_name) self.assertFileEqual(expected_result, get_results, comment="#") def test_ordered_multi_system(self): # genetic organization of test_1.fasta # # gene omf mfp abc mfp abc gspd omf omf omf # gene id 01360 01361 01397 01398 01399 01400 01506 01548 01562 # pos 2 3 11 12 13 14 23 32 46 # clst [ ] [ ] [ ] [ ] [ ] # syst [abc mfp abc gspd omf omf omf] expected_result_dir = self.find_data( "functional_test_ordered_multi_system") # TODO how to specify multi_loci = false when multi_loci =True is set in xml args = "--db-type ordered_replicon " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS-multi-syst-exch " \ "-o {out_dir} " \ f"--previous-run {expected_result_dir} " \ "--relative-path" self._macsyfinder_run(args) for file_name in (self.all_systems_tsv, self.all_best_solutions, self.best_solution, self.rejected_clusters): with self.subTest(file_name=file_name): expected_result = self.find_data(expected_result_dir, file_name) get_results = os.path.join(self.out_dir, file_name) self.assertFileEqual(expected_result, get_results, comment="#") def test_ordered_multi_system_loner_in_clust(self): # genetic organization of test_2.fasta # # gene abc mfp abc gspd omf omf omf # gene id 01397 01398 01399 01400 01506 01548 01562 # pos 8 9 10 11 13 29 43 # clst [ ] [ ] [ ] # syst [abc mfp abc gspd omf] expected_result_dir = self.find_data( "functional_test_ordered_multi_system_loner_in_clust") # TODO how to specify multi_loci = false when multi_loci =True is set in xml args = "--db-type ordered_replicon " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS-multi-syst-exch " \ "-o {out_dir} " \ f"--previous-run {expected_result_dir} " \ "--relative-path" self._macsyfinder_run(args) for file_name in (self.all_systems_tsv, self.all_best_solutions, self.best_solution, self.rejected_clusters): with self.subTest(file_name=file_name): expected_result = self.find_data(expected_result_dir, file_name) get_results = os.path.join(self.out_dir, file_name) self.assertFileEqual(expected_result, get_results, comment="#") def test_ordered_multi_loci(self): # genetic organization of test_4.fasta # # gene abc mfp abc gspd omf omf # gene id 01397 01398 01399 01400 01548 01562 # pos 6 7 14 15 26 40 # clst [ ] [ ] # syst abc, mfp, abc, gspd, omf, omf expected_result_dir = self.find_data( "functional_test_ordered_multi_loci") args = "--db-type ordered_replicon " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS-simple-exch " \ "-o {out_dir} " \ "--multi-loci functional/T12SS-simple-exch " \ f"--previous-run {expected_result_dir} " \ "--relative-path" self._macsyfinder_run(args) for file_name in (self.all_systems_tsv, self.all_best_solutions, self.best_solution, self.rejected_clusters): with self.subTest(file_name=file_name): expected_result = self.find_data(expected_result_dir, file_name) get_results = os.path.join(self.out_dir, file_name) self.assertFileEqual(expected_result, get_results, comment="#") def test_ordered_single_loci(self): # genetic organization of test_4.fasta # # gene abc mfp abc gspd omf omf # gene id 01397 01398 01399 01400 01548 01562 # pos 6 7 14 15 26 40 # clst [ ] [ ] # syst no system expected_result_dir = self.find_data( "functional_test_ordered_single_loci") args = "--db-type ordered_replicon " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS-simple-exch " \ "-o {out_dir} " \ f"--previous-run {expected_result_dir} " \ "--relative-path" self._macsyfinder_run(args) for file_name in (self.all_systems_tsv, self.all_best_solutions, self.best_solution, self.rejected_clusters): with self.subTest(file_name=file_name): expected_result = self.find_data(expected_result_dir, file_name) get_results = os.path.join(self.out_dir, file_name) self.assertFileEqual(expected_result, get_results, comment="#") def test_unordered(self): # genetic organization of test_4.fasta # # gene abc mfp abc gspd omf omf # gene id 01397 01398 01399 01400 01548 01562 # pos 6 7 14 15 26 40 # syst abc mfp abc gspd omf omf expected_result_dir = self.find_data("functional_test_unordered") args = "--db-type unordered " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS-simple-exch " \ "-o {out_dir} " \ f"--previous-run {expected_result_dir} " \ "--relative-path" self._macsyfinder_run(args) for file_name in (self.all_systems_tsv, self.all_systems_txt, self.uncomplete_systems): with self.subTest(file_name=file_name): expected_result = self.find_data(expected_result_dir, file_name) get_results = os.path.join(self.out_dir, file_name) self.assertFileEqual(expected_result, get_results, comment="#") def test_working_dir_exists(self): args = f"--sequence-db {self.find_data('base', 'one_replicon.fasta')} " \ "--db-type ordered_replicon " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS " \ "-o {out_dir}" self.out_dir = os.path.join(self.tmp_dir, 'macsyfinder_working_dir_exists') os.makedirs(self.out_dir) open(os.path.join(self.out_dir, 'toto.empty'), 'w').close() args = args.format(out_dir=self.out_dir) with self.assertRaises(ValueError) as ctx: macsyfinder.main(args=args.split(), loglevel='ERROR') self.assertEqual( str(ctx.exception), f"'{self.out_dir}' already exists and is not a empty") def test_working_dir_exists_and_not_dir(self): args = f"--sequence-db {self.find_data('base', 'one_replicon.fasta')} " \ "--db-type ordered_replicon " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS " \ "-o {out_dir} " self.out_dir = os.path.join( self.tmp_dir, 'macsyfinder_working_dir_exists_and_not_dir') try: open(self.out_dir, 'w').close() args = args.format(out_dir=self.out_dir) with self.assertRaises(ValueError) as ctx: macsyfinder.main(args=args.split(), loglevel='ERROR') self.assertEqual( str(ctx.exception), f"'{self.out_dir}' already exists and is not a directory") finally: os.unlink(self.out_dir) def test_no_models(self): args = f"--sequence-db {self.find_data('base', 'one_replicon.fasta')} " \ "--db-type ordered_replicon " \ f"--models-dir {self.find_data('models')} " \ "-o {out_dir} " self.out_dir = os.path.join(self.tmp_dir, 'macsyfinder_no_models') args = args.format(out_dir=self.out_dir) with self.catch_io(out=True): with self.assertRaises(OptionError) as ctx: macsyfinder.main(args=args.split(), loglevel='ERROR') self.assertEqual(str(ctx.exception), "argument --models or --previous-run is required.") def test_no_seq_db(self): args = "--db-type ordered_replicon " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS " \ "-o {out_dir} " self.out_dir = os.path.join(self.tmp_dir, 'macsyfinder_no_seq_db') args = args.format(out_dir=self.out_dir) with self.catch_io(out=True): with self.assertRaises(OptionError) as ctx: macsyfinder.main(args=args.split(), loglevel='ERROR') self.assertEqual( str(ctx.exception), "argument --sequence-db or --previous-run is required.") def test_no_db_type(self): args = f"--sequence-db {self.find_data('base', 'one_replicon.fasta')} " \ f"--models-dir {self.find_data('models')} " \ "-m functional T12SS " \ "-o {out_dir} " self.out_dir = os.path.join(self.tmp_dir, 'macsyfinder_no_db_type') args = args.format(out_dir=self.out_dir) with self.catch_io(out=True): with self.assertRaises(OptionError) as ctx: macsyfinder.main(args=args.split(), loglevel='ERROR') self.assertEqual(str(ctx.exception), "argument --db-type or --previous-run is required.") def test_model_unknown(self): args = f"--sequence-db {self.find_data('base', 'one_replicon.fasta')} " \ "--db-type ordered_replicon " \ f"--models-dir {self.find_data('models')} " \ "-m functional Unknown_model " \ "-o {out_dir}" self.out_dir = os.path.join(self.tmp_dir, 'macsyfinder_model_unkwon') os.makedirs(self.out_dir) args = args.format(out_dir=self.out_dir) with self.assertRaises(ValueError) as ctx: macsyfinder.main(args=args.split(), loglevel='ERROR') self.assertEqual(str(ctx.exception), "Unknown_model does not match with any definitions") def _macsyfinder_run(self, args_tpl): # get the name of the calling function test_name = inspect.stack()[1].function self.out_dir = os.path.join(self.tmp_dir, 'macsyfinder_{}'.format(test_name)) os.makedirs(self.out_dir) args = args_tpl.format(out_dir=self.out_dir) # print("\n############################################") # print(args) # print("##############################################") macsyfinder.main(args=args.split(), loglevel='ERROR')
class TestMacsyfinder(MacsyTest): def setUp(self): self.tmp_dir = tempfile.mkdtemp() AbstractSetOfHits._id = itertools.count(1) def tearDown(self): try: shutil.rmtree(self.tmp_dir) except: pass def test_list_models(self): cmd_args = argparse.Namespace() cmd_args.models_dir = os.path.join(self._data_dir, 'fake_model_dir') cmd_args.list_models = True rcv_list_models = list_models(cmd_args) exp_list_models = """set_1 /def_1_1 /def_1_2 /def_1_3 /def_1_4 set_2 /level_1 /def_1_1 /def_1_2 /level_2 /def_2_3 /def_2_4 """ self.assertEqual(exp_list_models, rcv_list_models) def test_systems_to_txt(self): system_str = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # No Systems found """ f_out = StringIO() track_multi_systems_hit = HitSystemTracker([]) systems_to_txt([], track_multi_systems_hit, f_out) self.assertMultiLineEqual(system_str, f_out.getvalue()) args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'gembase' args.models_dir = self.find_data('models') cfg = Config(MacsyDefaults(), args) model_name = 'foo' models_location = ModelLocation( path=os.path.join(args.models_dir, model_name)) # we need to reset the ProfileFactory # because it's a like a singleton # so other tests are influenced by ProfileFactory and it's configuration # for instance search_genes get profile without hmmer_exe profile_factory = ProfileFactory(cfg) model = Model("foo/T2SS", 10) # test if id is well incremented gene_name = "gspD" c_gene_gspd = CoreGene(models_location, gene_name, profile_factory) gene_gspd = ModelGene(c_gene_gspd, model) model.add_mandatory_gene(gene_gspd) gene_name = "sctJ" c_gene_sctj = CoreGene(models_location, gene_name, profile_factory) gene_sctj = ModelGene(c_gene_sctj, model) model.add_accessory_gene(gene_sctj) hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY) hit_2 = Hit(c_gene_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY) system_1 = System(model, [ Cluster([v_hit_1, v_hit_2], model, HitWeight(**cfg.hit_weights())) ], cfg.redundancy_penalty()) system_str = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Systems found: system id = replicon_id_T2SS_{next(System._id) - 1} model = foo/T2SS replicon = replicon_id clusters = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 1)] occ = 1 wholeness = 1.000 loci nb = 1 score = 1.500 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) neutral genes: ============================================================ """ f_out = StringIO() track_multi_systems_hit = HitSystemTracker([system_1]) systems_to_txt([system_1], track_multi_systems_hit, f_out) self.assertMultiLineEqual(system_str, f_out.getvalue()) def test_systems_to_tsv(self): args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'gembase' args.models_dir = self.find_data('models') cfg = Config(MacsyDefaults(), args) model_name = 'foo' models_location = ModelLocation( path=os.path.join(args.models_dir, model_name)) # we need to reset the ProfileFactory # because it's a like a singleton # so other tests are influenced by ProfileFactory and it's configuration # for instance search_genes get profile without hmmer_exe profile_factory = ProfileFactory(cfg) model = Model("foo/T2SS", 10) gene_name = "gspD" c_gene_gspd = CoreGene(models_location, gene_name, profile_factory) gene_gspd = ModelGene(c_gene_gspd, model) model.add_mandatory_gene(gene_gspd) gene_name = "sctJ" c_gene_sctj = CoreGene(models_location, gene_name, profile_factory) gene_sctj = ModelGene(c_gene_sctj, model) model.add_accessory_gene(gene_sctj) hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY) hit_2 = Hit(c_gene_sctj, "hit_2", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY) system_1 = System(model, [ Cluster([v_hit_1, v_hit_2], model, HitWeight(**cfg.hit_weights())) ], cfg.redundancy_penalty()) system_tsv = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Systems found: """ system_tsv += "\t".join([ "replicon", "hit_id", "gene_name", "hit_pos", "model_fqn", "sys_id", "sys_loci", "sys_wholeness", "sys_score", "sys_occ", "hit_gene_ref", "hit_status", "hit_seq_len", "hit_i_eval", "hit_score", "hit_profile_cov", "hit_seq_cov", "hit_begin_match", "hit_end_match", "used_in" ]) system_tsv += "\n" system_tsv += "\t".join([ "replicon_id", "hit_1", "gspD", "1", "foo/T2SS", system_1.id, "1", "1.000", "1.500", "1", "gspD", "mandatory", "803", "1.0", "1.000", "1.000", "1.000", "10", "20", "" ]) system_tsv += "\n" system_tsv += "\t".join([ "replicon_id", "hit_2", "sctJ", "1", "foo/T2SS", system_1.id, "1", "1.000", "1.500", "1", "sctJ", "accessory", "803", "1.0", "1.000", "1.000", "1.000", "10", "20", "" ]) system_tsv += "\n\n" f_out = StringIO() track_multi_systems_hit = HitSystemTracker([system_1]) systems_to_tsv([system_1], track_multi_systems_hit, f_out) self.assertMultiLineEqual(system_tsv, f_out.getvalue()) # test No system found system_str = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # No Systems found """ f_out = StringIO() track_multi_systems_hit = HitSystemTracker([]) systems_to_tsv([], track_multi_systems_hit, f_out) self.assertMultiLineEqual(system_str, f_out.getvalue()) def test_solutions_to_tsv(self): args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'gembase' args.models_dir = self.find_data('models') cfg = Config(MacsyDefaults(), args) model_name = 'foo' models_location = ModelLocation( path=os.path.join(args.models_dir, model_name)) # we need to reset the ProfileFactory # because it's a like a singleton # so other tests are influenced by ProfileFactory and it's configuration # for instance search_genes get profile without hmmer_exe profile_factory = ProfileFactory(cfg) model_A = Model("foo/A", 10) model_B = Model("foo/B", 10) model_C = Model("foo/C", 10) c_gene_sctn_flg = CoreGene(models_location, "sctN_FLG", profile_factory) gene_sctn_flg = ModelGene(c_gene_sctn_flg, model_B) c_gene_sctj_flg = CoreGene(models_location, "sctJ_FLG", profile_factory) gene_sctj_flg = ModelGene(c_gene_sctj_flg, model_B) c_gene_flgB = CoreGene(models_location, "flgB", profile_factory) gene_flgB = ModelGene(c_gene_flgB, model_B) c_gene_tadZ = CoreGene(models_location, "tadZ", profile_factory) gene_tadZ = ModelGene(c_gene_tadZ, model_B) c_gene_sctn = CoreGene(models_location, "sctN", profile_factory) gene_sctn = ModelGene(c_gene_sctn, model_A) gene_sctn_hom = Exchangeable(c_gene_sctn_flg, gene_sctn) gene_sctn.add_exchangeable(gene_sctn_hom) c_gene_sctj = CoreGene(models_location, "sctJ", profile_factory) gene_sctj = ModelGene(c_gene_sctj, model_A) gene_sctj_an = Exchangeable(c_gene_sctj_flg, gene_sctj) gene_sctj.add_exchangeable(gene_sctj_an) c_gene_gspd = CoreGene(models_location, "gspD", profile_factory) gene_gspd = ModelGene(c_gene_gspd, model_A) gene_gspd_an = Exchangeable(c_gene_flgB, gene_gspd) gene_gspd.add_exchangeable(gene_gspd_an) c_gene_abc = CoreGene(models_location, "abc", profile_factory) gene_abc = ModelGene(c_gene_abc, model_A) gene_abc_ho = Exchangeable(c_gene_tadZ, gene_abc) gene_abc.add_exchangeable(gene_abc_ho) model_A.add_mandatory_gene(gene_sctn) model_A.add_mandatory_gene(gene_sctj) model_A.add_accessory_gene(gene_gspd) model_A.add_forbidden_gene(gene_abc) model_B.add_mandatory_gene(gene_sctn_flg) model_B.add_mandatory_gene(gene_sctj_flg) model_B.add_accessory_gene(gene_flgB) model_B.add_accessory_gene(gene_tadZ) model_C.add_mandatory_gene(gene_sctn_flg) model_C.add_mandatory_gene(gene_sctj_flg) model_C.add_mandatory_gene(gene_flgB) model_C.add_accessory_gene(gene_tadZ) model_C.add_accessory_gene(gene_gspd) h_sctj = Hit(c_gene_sctj, "hit_sctj", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) h_sctn = Hit(c_gene_sctn, "hit_sctn", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) h_gspd = Hit(c_gene_gspd, "hit_gspd", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) h_sctj_flg = Hit(c_gene_sctj_flg, "hit_sctj_flg", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) h_flgB = Hit(c_gene_flgB, "hit_flgB", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) h_tadZ = Hit(c_gene_tadZ, "hit_tadZ", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) model_A._min_mandatory_genes_required = 2 model_A._min_genes_required = 2 hit_weights = HitWeight(**cfg.hit_weights()) c1 = Cluster([ ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY), ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY), ValidHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY) ], model_A, hit_weights) c2 = Cluster([ ValidHit(h_sctj, gene_sctj, GeneStatus.MANDATORY), ValidHit(h_sctn, gene_sctn, GeneStatus.MANDATORY) ], model_A, hit_weights) model_B._min_mandatory_genes_required = 1 model_B._min_genes_required = 2 c3 = Cluster([ ValidHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY), ValidHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY), ValidHit(h_flgB, gene_flgB, GeneStatus.ACCESSORY) ], model_B, hit_weights) model_C._min_mandatory_genes_required = 1 model_C._min_genes_required = 2 c4 = Cluster([ ValidHit(h_sctj_flg, gene_sctj_flg, GeneStatus.MANDATORY), ValidHit(h_tadZ, gene_tadZ, GeneStatus.ACCESSORY), ValidHit(h_flgB, gene_flgB, GeneStatus.MANDATORY), ValidHit(h_gspd, gene_gspd, GeneStatus.ACCESSORY) ], model_C, hit_weights) sys_A = System(model_A, [c1, c2], cfg.redundancy_penalty()) sys_A.id = "sys_id_A" sys_B = System(model_B, [c3], cfg.redundancy_penalty()) sys_B.id = "sys_id_B" sys_C = System(model_C, [c4], cfg.redundancy_penalty()) sys_C.id = "sys_id_C" sol_1 = [sys_A, sys_B] sol_2 = [sys_A, sys_C] sol_id_1 = '1' sol_id_2 = '2' sol_tsv = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Systems found: """ sol_tsv += "\t".join([ "sol_id", "replicon", "hit_id", "gene_name", "hit_pos", "model_fqn", "sys_id", "sys_loci", "sys_wholeness", "sys_score", "sys_occ", "hit_gene_ref", "hit_status", "hit_seq_len", "hit_i_eval", "hit_score", "hit_profile_cov", "hit_seq_cov", "hit_begin_match", "hit_end_match", "used_in" ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_1, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'sctJ', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_1, 'replicon_id', 'hit_sctn', 'sctN', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'sctN', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_1, 'replicon_id', 'hit_gspd', 'gspD', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'gspD', 'accessory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_1, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'sctJ', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_1, 'replicon_id', 'hit_sctn', 'sctN', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'sctN', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_1, 'replicon_id', 'hit_sctj_flg', 'sctJ_FLG', '1', 'foo/B', 'sys_id_B', '1', '0.750', '2.000', '1', 'sctJ_FLG', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_1, 'replicon_id', 'hit_tadZ', 'tadZ', '1', 'foo/B', 'sys_id_B', '1', '0.750', '2.000', '1', 'tadZ', 'accessory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_1, 'replicon_id', 'hit_flgB', 'flgB', '1', 'foo/B', 'sys_id_B', '1', '0.750', '2.000', '1', 'flgB', 'accessory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_2, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'sctJ', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_2, 'replicon_id', 'hit_sctn', 'sctN', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'sctN', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_2, 'replicon_id', 'hit_gspd', 'gspD', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'gspD', 'accessory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_2, 'replicon_id', 'hit_sctj', 'sctJ', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'sctJ', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_2, 'replicon_id', 'hit_sctn', 'sctN', '1', 'foo/A', 'sys_id_A', '2', '1.000', '1.500', '2', 'sctN', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', '' ]) sol_tsv += "\n" sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_2, 'replicon_id', 'hit_sctj_flg', 'sctJ_FLG', '1', 'foo/C', 'sys_id_C', '1', '0.800', '3.000', '1', 'sctJ_FLG', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', 'sys_id_B' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_2, 'replicon_id', 'hit_tadZ', 'tadZ', '1', 'foo/C', 'sys_id_C', '1', '0.800', '3.000', '1', 'tadZ', 'accessory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', 'sys_id_B' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_2, 'replicon_id', 'hit_flgB', 'flgB', '1', 'foo/C', 'sys_id_C', '1', '0.800', '3.000', '1', 'flgB', 'mandatory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', 'sys_id_B' ]) sol_tsv += "\n" sol_tsv += '\t'.join([ sol_id_2, 'replicon_id', 'hit_gspd', 'gspD', '1', 'foo/C', 'sys_id_C', '1', '0.800', '3.000', '1', 'gspD', 'accessory', '803', '1.0', '1.000', '1.000', '1.000', '10', '20', 'sys_id_A' ]) sol_tsv += "\n" sol_tsv += "\n" f_out = StringIO() hit_multi_sys_tracker = HitSystemTracker([sys_A, sys_B]) solutions_to_tsv([sol_1, sol_2], hit_multi_sys_tracker, f_out) self.assertMultiLineEqual(sol_tsv, f_out.getvalue()) def test_rejected_clst_to_txt(self): args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'gembase' args.models_dir = self.find_data('models') args.res_search_dir = "blabla" cfg = Config(MacsyDefaults(), args) model_name = 'foo' models_location = ModelLocation( path=os.path.join(args.models_dir, model_name)) profile_factory = ProfileFactory(cfg) model = Model("foo/T2SS", 11) gene_name = "gspD" c_gene_gspd = CoreGene(models_location, gene_name, profile_factory) gene_1 = ModelGene(c_gene_gspd, model) gene_name = "sctC" c_gene_sctc = CoreGene(models_location, gene_name, profile_factory) gene_2 = ModelGene(c_gene_sctc, model) model.add_mandatory_gene(gene_1) model.add_accessory_gene(gene_2) # Hit(gene, model, hit_id, hit_seq_length, replicon_name, position, i_eval, score, # profile_coverage, sequence_coverage, begin_match, end_match h10 = Hit(c_gene_gspd, "h10", 10, "replicon_1", 10, 1.0, 10.0, 1.0, 1.0, 10, 20) v_h10 = ValidHit(h10, gene_1, GeneStatus.MANDATORY) h20 = Hit(c_gene_sctc, "h20", 10, "replicon_1", 20, 1.0, 20.0, 1.0, 1.0, 10, 20) v_h20 = ValidHit(h20, gene_2, GeneStatus.ACCESSORY) h40 = Hit(c_gene_gspd, "h10", 10, "replicon_1", 40, 1.0, 10.0, 1.0, 1.0, 10, 20) v_h40 = ValidHit(h40, gene_1, GeneStatus.MANDATORY) h50 = Hit(c_gene_sctc, "h20", 10, "replicon_1", 50, 1.0, 20.0, 1.0, 1.0, 10, 20) v_h50 = ValidHit(h50, gene_2, GeneStatus.ACCESSORY) hit_weights = HitWeight(**cfg.hit_weights()) c1 = Cluster([v_h10, v_h20], model, hit_weights) c2 = Cluster([v_h40, v_h50], model, hit_weights) r_c = RejectedClusters(model, [c1, c2], ["The reasons to reject this clusters"]) rej_clst_str = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Rejected clusters: Cluster: - model = T2SS - replicon = replicon_1 - hits = (h10, gspD, 10), (h20, sctC, 20) Cluster: - model = T2SS - replicon = replicon_1 - hits = (h10, gspD, 40), (h20, sctC, 50) These clusters have been rejected because: \t- The reasons to reject this clusters ============================================================ """ f_out = StringIO() rejected_clst_to_txt([r_c], f_out) self.maxDiff = None self.assertMultiLineEqual(rej_clst_str, f_out.getvalue()) rej_clst_str = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # No Rejected clusters """ f_out = StringIO() rejected_clst_to_txt([], f_out) self.assertMultiLineEqual(rej_clst_str, f_out.getvalue()) def test_likely_systems_to_txt(self): args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'unordered' args.models_dir = self.find_data('models') cfg = Config(MacsyDefaults(), args) model_name = 'foo' models_location = ModelLocation( path=os.path.join(args.models_dir, model_name)) # we need to reset the ProfileFactory # because it's a like a singleton # so other tests are influenced by ProfileFactory and it's configuration # for instance search_genes get profile without hmmer_exe profile_factory = ProfileFactory(cfg) model = Model("foo/T2SS", 10) # test if id is well incremented gene_name = "gspD" c_gene_gspd = CoreGene(models_location, gene_name, profile_factory) gene_gspd = ModelGene(c_gene_gspd, model) model.add_mandatory_gene(gene_gspd) gene_name = "sctJ" c_gene_sctj = CoreGene(models_location, gene_name, profile_factory) gene_sctj = ModelGene(c_gene_sctj, model) model.add_accessory_gene(gene_sctj) gene_name = "sctC" c_gene_sctc = CoreGene(models_location, gene_name, profile_factory) gene_sctc = ModelGene(c_gene_sctc, model) model.add_neutral_gene(gene_sctc) gene_name = "tadZ" c_gene_tadz = CoreGene(models_location, gene_name, profile_factory) gene_tadz = ModelGene(c_gene_tadz, model) model.add_forbidden_gene(gene_tadz) hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY) hit_2 = Hit(c_gene_sctj, "hit_2", 804, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY) hit_3 = Hit(c_gene_sctc, "hit_3", 805, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_3 = ValidHit(hit_3, gene_sctc, GeneStatus.NEUTRAL) hit_4 = Hit(c_gene_tadz, "hit_4", 806, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_4 = ValidHit(hit_4, gene_tadz, GeneStatus.FORBIDDEN) system_1 = LikelySystem(model, [v_hit_1], [v_hit_2], [v_hit_3], [v_hit_4]) system_str = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Systems found: This replicon contains genetic materials needed for system foo/T2SS WARNING there quorum is reached but there is also some forbidden genes. system id = replicon_id_T2SS_1 model = foo/T2SS replicon = replicon_id hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 1), ('hit_3', 'sctC', 1), ('hit_4', 'tadZ', 1)] wholeness = 1.000 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) neutral genes: \t- sctC: 1 (sctC) forbidden genes: \t- tadZ: 1 (tadZ) Use ordered replicon to have better prediction. """ f_out = StringIO() track_multi_systems_hit = HitSystemTracker([system_1]) likely_systems_to_txt([system_1], track_multi_systems_hit, f_out) self.assertMultiLineEqual(system_str, f_out.getvalue()) f_out = StringIO() likely_systems_to_txt([], track_multi_systems_hit, f_out) expected_out = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # No Likely Systems found """ self.assertEqual(expected_out, f_out.getvalue()) def test_likely_systems_to_tsv(self): args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'unordered' args.models_dir = self.find_data('models') cfg = Config(MacsyDefaults(), args) model_name = 'foo' models_location = ModelLocation( path=os.path.join(args.models_dir, model_name)) # we need to reset the ProfileFactory # because it's a like a singleton # so other tests are influenced by ProfileFactory and it's configuration # for instance search_genes get profile without hmmer_exe profile_factory = ProfileFactory(cfg) model = Model("foo/T2SS", 10) # test if id is well incremented gene_name = "gspD" c_gene_gspd = CoreGene(models_location, gene_name, profile_factory) gene_gspd = ModelGene(c_gene_gspd, model) model.add_mandatory_gene(gene_gspd) gene_name = "sctJ" c_gene_sctj = CoreGene(models_location, gene_name, profile_factory) gene_sctj = ModelGene(c_gene_sctj, model) model.add_accessory_gene(gene_sctj) gene_name = "sctC" c_gene_sctc = CoreGene(models_location, gene_name, profile_factory) gene_sctc = ModelGene(c_gene_sctc, model) model.add_neutral_gene(gene_sctc) gene_name = "tadZ" c_gene_tadz = CoreGene(models_location, gene_name, profile_factory) gene_tadz = ModelGene(c_gene_tadz, model) model.add_forbidden_gene(gene_tadz) hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY) hit_2 = Hit(c_gene_sctj, "hit_2", 804, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY) hit_3 = Hit(c_gene_sctc, "hit_3", 805, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_3 = ValidHit(hit_3, gene_sctc, GeneStatus.NEUTRAL) hit_4 = Hit(c_gene_tadz, "hit_4", 806, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_4 = ValidHit(hit_4, gene_tadz, GeneStatus.FORBIDDEN) system_1 = LikelySystem(model, [v_hit_1], [v_hit_2], [v_hit_3], [v_hit_4]) sol_tsv = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Likely Systems found:""" sol_tsv += "\n\n" sol_tsv += "\t".join([ "replicon", "hit_id", "gene_name", "hit_pos", "model_fqn", "sys_id", "sys_wholeness", "hit_gene_ref", "hit_status", "hit_seq_len", "hit_i_eval", "hit_score", "hit_profile_cov", "hit_seq_cov", "hit_begin_match", "hit_end_match", "used_in" ]) sol_tsv += "\n" sol_tsv += '\t'.join([ "replicon_id", "hit_1", "gspD", "1", "foo/T2SS", "replicon_id_T2SS_1", "1.000", "gspD", "mandatory", "803", "1.0", "1.000", "1.000", "1.000", "10", "20", "" ]) sol_tsv += "\n" sol_tsv += '\t'.join([ "replicon_id", "hit_2", "sctJ", "1", "foo/T2SS", "replicon_id_T2SS_1", "1.000", "sctJ", "accessory", "804", "1.0", "1.000", "1.000", "1.000", "10", "20", "" ]) sol_tsv += "\n" sol_tsv += '\t'.join([ "replicon_id", "hit_4", "tadZ", "1", "foo/T2SS", "replicon_id_T2SS_1", "1.000", "tadZ", "forbidden", "806", "1.0", "1.000", "1.000", "1.000", "10", "20", "" ]) sol_tsv += "\n" sol_tsv += '\t'.join([ "replicon_id", "hit_3", "sctC", "1", "foo/T2SS", "replicon_id_T2SS_1", "1.000", "sctC", "neutral", "805", "1.0", "1.000", "1.000", "1.000", "10", "20", "" ]) sol_tsv += "\n" sol_tsv += "\n" f_out = StringIO() track_multi_systems_hit = HitSystemTracker([system_1]) likely_systems_to_tsv([system_1], track_multi_systems_hit, f_out) self.assertMultiLineEqual(sol_tsv, f_out.getvalue()) f_out = StringIO() likely_systems_to_tsv([], track_multi_systems_hit, f_out) expected_out = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # No Likely Systems found """ self.assertEqual(expected_out, f_out.getvalue()) def test_unnlikely_systems_to_txt(self): args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'unordered' args.models_dir = self.find_data('models') cfg = Config(MacsyDefaults(), args) model_name = 'foo' models_location = ModelLocation( path=os.path.join(args.models_dir, model_name)) # we need to reset the ProfileFactory # because it's a like a singleton # so other tests are influenced by ProfileFactory and it's configuration # for instance search_genes get profile without hmmer_exe profile_factory = ProfileFactory(cfg) model = Model("foo/T2SS", 10) # test if id is well incremented gene_name = "gspD" c_gene_gspd = CoreGene(models_location, gene_name, profile_factory) gene_gspd = ModelGene(c_gene_gspd, model) model.add_mandatory_gene(gene_gspd) gene_name = "sctJ" c_gene_sctj = CoreGene(models_location, gene_name, profile_factory) gene_sctj = ModelGene(c_gene_sctj, model) model.add_accessory_gene(gene_sctj) gene_name = "sctC" c_gene_sctc = CoreGene(models_location, gene_name, profile_factory) gene_sctc = ModelGene(c_gene_sctc, model) model.add_neutral_gene(gene_sctc) gene_name = "tadZ" c_gene_tadz = CoreGene(models_location, gene_name, profile_factory) gene_tadz = ModelGene(c_gene_tadz, model) model.add_forbidden_gene(gene_tadz) hit_1 = Hit(c_gene_gspd, "hit_1", 803, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_1 = ValidHit(hit_1, gene_gspd, GeneStatus.MANDATORY) hit_2 = Hit(c_gene_sctj, "hit_2", 804, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_2 = ValidHit(hit_2, gene_sctj, GeneStatus.ACCESSORY) hit_3 = Hit(c_gene_sctc, "hit_3", 805, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_3 = ValidHit(hit_3, gene_sctc, GeneStatus.NEUTRAL) hit_4 = Hit(c_gene_tadz, "hit_4", 806, "replicon_id", 1, 1.0, 1.0, 1.0, 1.0, 10, 20) v_hit_4 = ValidHit(hit_4, gene_tadz, GeneStatus.FORBIDDEN) reason = "why it not a system" system_1 = UnlikelySystem(model, [v_hit_1], [v_hit_2], [v_hit_3], [v_hit_4], reason) exp_txt = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # Unlikely Systems found: This replicon probably not contains a system foo/T2SS: {reason} system id = replicon_id_T2SS_1 model = foo/T2SS replicon = replicon_id hits = [('hit_1', 'gspD', 1), ('hit_2', 'sctJ', 1), ('hit_3', 'sctC', 1), ('hit_4', 'tadZ', 1)] wholeness = 1.000 mandatory genes: \t- gspD: 1 (gspD) accessory genes: \t- sctJ: 1 (sctJ) neutral genes: \t- sctC: 1 (sctC) forbidden genes: \t- tadZ: 1 (tadZ) Use ordered replicon to have better prediction. ============================================================ """ f_out = StringIO() unlikely_systems_to_txt([system_1], f_out) self.assertMultiLineEqual(exp_txt, f_out.getvalue()) f_out = StringIO() unlikely_systems_to_txt([], f_out) expected_out = f"""# macsyfinder {macsypy.__version__} # {' '.join(sys.argv)} # No Unlikely Systems found """ self.assertEqual(expected_out, f_out.getvalue()) def test_parse_args(self): command_line = "macsyfinder --sequence-db test_1.fasta --db-type=gembase --models-dir data/models/ " \ "--models functional all -w 4 --out test_1-all" parser, args = parse_args(command_line.split()[1:]) self.assertIsNone(args.cfg_file) self.assertIsNone(args.coverage_profile) self.assertIsNone(args.hmmer) self.assertIsNone(args.i_evalue_sel) self.assertIsNone(args.inter_gene_max_space) self.assertIsNone(args.max_nb_genes) self.assertIsNone(args.min_genes_required) self.assertIsNone(args.min_mandatory_genes_required) self.assertIsNone(args.multi_loci) self.assertIsNone(args.previous_run) self.assertIsNone(args.profile_suffix) self.assertIsNone(args.replicon_topology) self.assertIsNone(args.res_extract_suffix) self.assertIsNone(args.res_search_suffix) self.assertIsNone(args.topology_file) self.assertFalse(args.idx) self.assertFalse(args.list_models) self.assertFalse(args.mute) self.assertFalse(args.relative_path) self.assertEqual(args.db_type, 'gembase') self.assertEqual(args.models_dir, 'data/models/') self.assertEqual(args.out_dir, 'test_1-all') self.assertEqual(args.sequence_db, 'test_1.fasta') self.assertEqual(args.verbosity, 0) self.assertEqual(args.worker, 4) self.assertListEqual(args.models, [['functional', 'all']]) command_line = "macsyfinder --sequence-db test_!.fasta " \ "--db-type=ordered_replicon --models-dir data/models/ " \ "--models functional all -w 4 --out test_1-all " \ "--mute --multi-loci TXSscan/T2SS,TXSScan/T3SS --relative-path" parser, args = parse_args(command_line.split()[1:]) self.assertEqual(args.db_type, 'ordered_replicon') self.assertEqual(args.multi_loci, "TXSscan/T2SS,TXSScan/T3SS") self.assertTrue(args.relative_path) self.assertTrue(args.mute) command_line = "macsyfinder --sequence-db test_1.dasta " \ "--db-type=ordered_replicon --models-dir data/models/ " \ "--i-evalue-sel=0.5 " \ "--min-genes-required TXSScan/T2SS 15 --min-genes-required TXSScan/Flagellum 10" parser, args = parse_args(command_line.split()[1:]) self.assertEqual(args.i_evalue_sel, 0.5) self.assertListEqual( args.min_genes_required, [['TXSScan/T2SS', '15'], ['TXSScan/Flagellum', '10']]) @unittest.skipIf(not which('hmmsearch'), 'hmmsearch not found in PATH') def test_search_systems(self): logger = logging.getLogger('macsypy.macsyfinder') macsypy.logger_set_level(level='ERROR') defaults = MacsyDefaults() out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems') os.mkdir(out_dir) # test gembase replicon seq_db = self.find_data('base', 'VICH001.B.00001.C001.prt') model_dir = self.find_data('data_set', 'models') args = f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 all -w 4 -o {out_dir}" _, parsed_args = parse_args(args.split()) config = Config(defaults, parsed_args) model_bank = ModelBank() gene_bank = GeneBank() profile_factory = ProfileFactory(config) systems, rejected_clst = search_systems(config, model_bank, gene_bank, profile_factory, logger) expected_sys_id = [ 'VICH001.B.00001.C001_MSH_5', 'VICH001.B.00001.C001_MSH_7', 'VICH001.B.00001.C001_T4P_25', 'VICH001.B.00001.C001_T4P_23', 'VICH001.B.00001.C001_T4P_21', 'VICH001.B.00001.C001_T4P_22', 'VICH001.B.00001.C001_T4P_17', 'VICH001.B.00001.C001_T4P_16', 'VICH001.B.00001.C001_T4bP_26', 'VICH001.B.00001.C001_T4P_24', 'VICH001.B.00001.C001_T4P_18', 'VICH001.B.00001.C001_T4P_19', 'VICH001.B.00001.C001_T4P_20', 'VICH001.B.00001.C001_T2SS_10', 'VICH001.B.00001.C001_T2SS_9' ] self.assertListEqual([s.id for s in systems], expected_sys_id) expected_scores = [ 10.5, 10.0, 12.0, 9.5, 9.0, 8.5, 6.0, 5.0, 5.5, 10.5, 7.5, 7.0, 8.0, 8.3, 7.5 ] self.assertListEqual([s.score for s in systems], expected_scores) self.assertEqual(len(rejected_clst), 11) # test hits but No Systems args = f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 Tad -w 4 -o {out_dir}" _, parsed_args = parse_args(args.split()) config = Config(defaults, parsed_args) model_bank = ModelBank() gene_bank = GeneBank() profile_factory = ProfileFactory(config) systems, rejected_clst = search_systems(config, model_bank, gene_bank, profile_factory, logger) self.assertEqual(systems, []) # test No hits seq_db = self.find_data('base', 'test_1.fasta') args = f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 T4bP -w 4 -o {out_dir}" _, parsed_args = parse_args(args.split()) config = Config(defaults, parsed_args) model_bank = ModelBank() gene_bank = GeneBank() profile_factory = ProfileFactory(config) systems, rejected_clst = search_systems(config, model_bank, gene_bank, profile_factory, logger) self.assertEqual(systems, []) self.assertEqual(rejected_clst, []) def test_search_systems_unordered(self): logger = logging.getLogger('macsypy.macsyfinder') macsypy.logger_set_level(level='ERROR') defaults = MacsyDefaults() out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems') os.mkdir(out_dir) seq_db = self.find_data('base', 'VICH001.B.00001.C001.prt') model_dir = self.find_data('data_set', 'models') # test unordered replicon args = f"--sequence-db {seq_db} --db-type=unordered --models-dir {model_dir} --models set_1 all -w 4 -o {out_dir}" _, parsed_args = parse_args(args.split()) config = Config(defaults, parsed_args) model_bank = ModelBank() gene_bank = GeneBank() profile_factory = ProfileFactory(config) systems, uncomplete_sys = search_systems(config, model_bank, gene_bank, profile_factory, logger) expected_sys_id = [ 'Unordered_T2SS_4', 'Unordered_MSH_3', 'Unordered_T4P_5', 'Unordered_T4bP_6' ] self.assertListEqual([s.id for s in systems], expected_sys_id) expected_uncomplete_sys_id = [ 'Unordered_Archaeal-T4P_1', 'Unordered_ComM_2', 'Unordered_Tad_7' ] self.assertListEqual([s.id for s in uncomplete_sys], expected_uncomplete_sys_id) def test_search_systems_model_unknown(self): logger = logging.getLogger('macsypy.macsyfinder') macsypy.logger_set_level(level='ERROR') defaults = MacsyDefaults() out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems') os.mkdir(out_dir) seq_db = self.find_data('base', 'test_1.fasta') model_dir = self.find_data('data_set', 'models') args = f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models nimporaoik -w 4 -o {out_dir}" _, parsed_args = parse_args(args.split()) config = Config(defaults, parsed_args) model_bank = ModelBank() gene_bank = GeneBank() profile_factory = ProfileFactory(config) exit_ori = sys.exit sys.exit = self.fake_exit try: with self.assertRaises(TypeError) as ctx: _ = search_systems(config, model_bank, gene_bank, profile_factory, logger) self.assertEqual( str(ctx.exception), "macsyfinder: \"No such model definition: 'nimporaoik'\"") finally: sys.exit = exit_ori
class TestSearchGenes(MacsyTest): def setUp(self): self.tmp_dir = os.path.join(tempfile.gettempdir(), 'test_macsyfinder_search_genes') if os.path.exists(self.tmp_dir): shutil.rmtree(self.tmp_dir) os.mkdir(self.tmp_dir) args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_base.fa") args.db_type = 'gembase' args.models_dir = self.find_data('models') args.log_level = 30 args.out_dir = os.path.join(self.tmp_dir, 'job_1') args.res_search_dir = args.out_dir os.mkdir(args.out_dir) self.cfg = Config(MacsyDefaults(), args) self.model_name = 'foo' self.model_location = ModelLocation( path=os.path.join(args.models_dir, self.model_name)) idx = Indexes(self.cfg) idx._build_my_indexes() self.profile_factory = ProfileFactory(self.cfg) def tearDown(self): try: shutil.rmtree(self.cfg.working_dir()) #pass except: pass @unittest.skipIf(not which('hmmsearch'), 'hmmsearch not found in PATH') def test_search(self): gene_name = "abc" c_gene_abc = CoreGene(self.model_location, gene_name, self.profile_factory) report = search_genes([c_gene_abc], self.cfg) expected_hit = [ Hit(c_gene_abc, "ESCO030p01_000260", 706, "ESCO030p01", 26, float(1.000e-200), float(660.800), float(1.000), float(0.714), 160, 663) ] self.assertEqual(len(report), 1) self.assertEqual(expected_hit[0], report[0].hits[0]) @unittest.skipIf(not which('hmmsearch'), 'hmmsearch not found in PATH') def test_search_recover(self): # first job searching using hmmsearch gene_name = "abc" c_gene_abc = CoreGene(self.model_location, gene_name, self.profile_factory) report = search_genes([c_gene_abc], self.cfg) expected_hit = [ Hit(c_gene_abc, "ESCO030p01_000260", 706, "ESCO030p01", 26, float(1.000e-200), float(660.800), float(1.000), float(0.714), 160, 663) ] # second job using recover # disable hmmer to be sure that test use the recover inner function self.cfg.hmmer = lambda: "hmmer_disable" # and create a new dir for the second job previous_job_path = self.cfg.working_dir() self.cfg.previous_run = lambda: previous_job_path self.cfg.out_dir = lambda: os.path.join(self.tmp_dir, 'job_2') os.mkdir(self.cfg.out_dir()) # rerun with previous run # but we have to reset the profile attached to the gene gene._profile._report self.profile_factory = ProfileFactory(self.cfg) c_gene_abc = CoreGene(self.model_location, gene_name, self.profile_factory) report = search_genes([c_gene_abc], self.cfg) self.assertEqual(len(report), 1) self.assertEqual(expected_hit[0], report[0].hits[0])
class TestProfile(MacsyTest): def setUp(self): args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'gembase' args.models_dir = self.find_data('models') args.res_search_dir = tempfile.gettempdir() args.log_level = 0 self.cfg = Config(MacsyDefaults(), args) if os.path.exists(self.cfg.working_dir()): shutil(self.cfg.working_dir()) os.makedirs(self.cfg.working_dir()) self.model_name = 'foo' self.model_location = ModelLocation(path=os.path.join(args.models_dir, self.model_name)) self.profile_factory = ProfileFactory(self.cfg) def tearDown(self): try: shutil.rmtree(self.cfg.working_dir()) except: pass def test_len(self): model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("abc") profile = Profile(gene, self.cfg, path) self.assertEqual(len(profile), 501) def test_ga_threshold(self): model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("abc") profile = Profile(gene, self.cfg, path) self.assertFalse(profile.ga_threshold) gene_name = 'T5aSS_PF03797' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("T5aSS_PF03797") profile = Profile(gene, self.cfg, path) self.assertTrue(profile.ga_threshold) def test_str(self): model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("abc") profile = Profile(gene, self.cfg, path) s = "{0} : {1}".format(gene.name, path) self.assertEqual(str(profile), s) @unittest.skipIf(not which('hmmsearch'), 'hmmsearch not found in PATH') def test_execute(self): for db_type in ("gembase", "ordered_replicon", "unordered"): self.cfg._set_db_type(db_type) model = Model("foo/T2SS", 10) gene_name = 'T5aSS_PF03797' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) # case GA threshold in profile profile_path = self.model_location.get_profile("T5aSS_PF03797") profile = Profile(gene, self.cfg, profile_path) report = profile.execute() hmmer_raw_out = profile.hmm_raw_output with open(hmmer_raw_out, 'r') as hmmer_raw_out_file: first_l = hmmer_raw_out_file.readline() # a hmmsearch output file has been produced self.assertTrue(first_l.startswith("# hmmsearch :: search profile(s) against a sequence database")) for i in range(5): # skip 4 lines l = hmmer_raw_out_file.readline() # a hmmsearch used the abc profile line should become with: "# query HMM file: {the path tp hmm profile used}" self.assertTrue(l.find(profile_path) != -1) for i in range(3): # skip 2 lines l = hmmer_raw_out_file.readline() self.assertEqual("# model-specific thresholding: GA cutoffs", l.strip()) # test if profile is executed only once per run report_bis = profile.execute() self.assertIs(report, report_bis) # case GA threshold in profile but --no-cut-ga is set args = argparse.Namespace() args.sequence_db = self.find_data("base", "test_1.fasta") args.db_type = 'gembase' args.models_dir = self.find_data('models') args.res_search_dir = tempfile.gettempdir() args.log_level = 0 args.e_value_search = 0.5 args.no_cut_ga = True cfg = Config(MacsyDefaults(), args) profile = Profile(gene, cfg, profile_path) report = profile.execute() hmmer_raw_out = profile.hmm_raw_output with open(hmmer_raw_out, 'r') as hmmer_raw_out_file: for i in range(9): l = hmmer_raw_out_file.readline() self.assertEqual("# sequence reporting threshold: E-value <= 0.5", l.strip()) # case cut-ga but no GA threshold in hmmprofile gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) # case -cut-ga and GA threshold in profile profile_path = self.model_location.get_profile("abc") profile = Profile(gene, self.cfg, profile_path) with self.catch_log() as log: report = profile.execute() hmmer_raw_out = profile.hmm_raw_output with open(hmmer_raw_out, 'r') as hmmer_raw_out_file: first_l = hmmer_raw_out_file.readline() # a hmmsearch output file has been produced self.assertTrue(first_l.startswith("# hmmsearch :: search profile(s) against a sequence database")) for i in range(5): # skip 4 lines l = hmmer_raw_out_file.readline() # a hmmsearch used the abc profile line should become with: "# query HMM file: {the path tp hmm profile used}" self.assertTrue(l.find(profile_path) != -1) for i in range(3): # skip 2 lines l = hmmer_raw_out_file.readline() self.assertEqual('# sequence reporting threshold: E-value <= 0.1', l.strip()) def test_execute_unknown_binary(self): self.cfg._options['hmmer'] = "Nimportnaoik" model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("abc", ) profile = Profile(gene, self.cfg, path) with self.catch_log(): with self.assertRaises(RuntimeError): profile.execute() def test_execute_hmmer_failed(self): fake_hmmer = os.path.join(tempfile.gettempdir(), 'hmmer_failed') with open(fake_hmmer, 'w') as hmmer: hmmer.write("""#! {} import sys sys.exit(127) """.format(sysconfig.sys.executable)) try: os.chmod(hmmer.name, 0o755) self.cfg._options['hmmer'] = hmmer.name model = Model("foo/T2SS", 10) gene_name = 'abc' c_gene = CoreGene(self.model_location, gene_name, self.profile_factory) gene = ModelGene(c_gene, model) path = self.model_location.get_profile("abc", ) profile = Profile(gene, self.cfg, path) with self.catch_log(): with self.assertRaisesRegex(RuntimeError, "an error occurred during Hmmer " "execution: command = .* : return code = 127 .*") as ctx: profile.execute() finally: try: os.unlink(fake_hmmer) except Exception: pass
print("#" * 70) ############## WORKAROUND ################## # integron_finder is a script not a lib # to test each function in integron_finder, # we need to import integron_finder # so we need to transform the script in lib # and ad it to the path somewhere a user can write # the fisrt element of path is integron_finder.tests if not 'INTEGRON_HOME' in os.environ: INTEGRON_HOME = os.path.abspath( os.path.join(os.path.dirname(__file__), '..')) sys.path.append(INTEGRON_HOME) from tests import which integron_finder_script = which('integron_finder') if integron_finder_script is None: raise RuntimeError( 'Cannot find integron_finder, do you set INTEGRON_HOME') else: INTEGRON_HOME = os.environ['INTEGRON_HOME'] integron_finder_script = os.path.join(INTEGRON_HOME, 'integron_finder') fake_lib = os.path.join(INTEGRON_HOME, 'tests', 'fake_lib') integron_finder_lib = os.path.join(fake_lib, 'integron_finder.py') if not os.path.exists(fake_lib): os.mkdir(fake_lib) if os.path.exists(integron_finder_lib): os.unlink(integron_finder_lib)
def setUp(self): if 'INTEGRON_HOME' in os.environ: self.integron_home = os.environ['INTEGRON_HOME'] self.local_install = True else: self.local_install = False self.integron_home = os.path.normpath( os.path.abspath( os.path.join(os.path.dirname(__file__), '..', '..'))) self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder') os.makedirs(self.tmp_dir) integron_finder.PRODIGAL = which('prodigal') integron_finder.HMMSEARCH = which('hmmsearch') integron_finder.N_CPU = '1' integron_finder.MODEL_DIR = os.path.join(self.integron_home, "data", "Models") integron_finder.MODEL_integrase = os.path.join( integron_finder.MODEL_DIR, "integron_integrase.hmm") integron_finder.MODEL_phage_int = os.path.join( integron_finder.MODEL_DIR, "phage-int.hmm") integron_finder.MODEL_attc = os.path.join(self.integron_home, 'data', 'Models', 'attc_4.cm') integron_finder.circular = True integron_finder.out_dir = self.tmp_dir integron_finder.CMSEARCH = which('cmsearch') integron_finder.evalue_attc = 1. integron_finder.max_attc_size = 200 integron_finder.min_attc_size = 40 integron_finder.length_cm = 47 # length in 'CLEN' (value for model attc_4.cm) integron_finder.DISTANCE_THRESHOLD = 4000 # (4kb at least between 2 different arrays) self.columns = [ 'pos_beg', 'pos_end', 'strand', 'evalue', 'type_elt', 'model', 'distance_2attC', 'annotation' ] self.dtype = { "pos_beg": 'int', "pos_end": 'int', "strand": 'int', "evalue": 'float', "type_elt": 'str', "annotation": 'str', "model": 'str', "distance_2attC": 'float' } self.max_dtype = { 'Accession_number': 'str', 'cm_attC': 'str', 'cm_debut': 'int', 'cm_fin': 'int', 'pos_beg': 'int', 'pos_end': 'int', 'sens': 'str', 'evalue': 'float' } self.max_cols = [ 'Accession_number', 'cm_attC', 'cm_debut', 'cm_fin', 'pos_beg', 'pos_end', 'sens', 'evalue' ]