def setUp(self):
    """Build the configuration, model location, gene bank and profile factory used by the tests."""
    # 30 is the numeric value of logging.WARNING
    options = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
        res_search_dir=tempfile.gettempdir(),
        log_level=30,
    )
    self.cfg = Config(MacsyDefaults(), options)

    self.model_name = 'foo'
    model_path = os.path.join(options.models_dir, self.model_name)
    self.model_location = ModelLocation(path=model_path)

    self.gene_bank = GeneBank()
    self.profile_factory = ProfileFactory(self.cfg)
def test_min_genes_required_cfg(self):
    """
    min_genes_required is specified from the configuration,
    so this value must overload the value read from the xml definition.
    """
    model_fqn = 'foo/model_5'
    # the option value is a list of [model fully qualified name, value as string]
    self.args.min_genes_required = [[model_fqn, '4']]
    self.cfg = Config(MacsyDefaults(), self.args)

    # start from fresh banks and registry so that the new configuration is used
    self.model_bank = ModelBank()
    self.gene_bank = GeneBank()
    self.model_registry = ModelRegistry()
    for ml in scan_models_dir(self.args.models_dir):
        self.model_registry.add(ml)

    self.parser = DefinitionParser(self.cfg, self.model_bank, self.gene_bank,
                                   self.model_registry, self.profile_factory)
    models_2_detect = [self.model_registry['foo'].get_definition(model_fqn)]
    self.parser.parse(models_2_detect)

    m = self.model_bank[model_fqn]
    self.assertEqual(m.min_genes_required, 4)
def test_inter_gene_max_space_cfg(self):
    """
    inter_gene_max_space is specified from the configuration,
    so this value must overload the value read from the xml definition.
    """
    fqn = 'foo/model_5'
    self.args.inter_gene_max_space = [[fqn, '222']]
    self.cfg = Config(MacsyDefaults(), self.args)

    self.model_bank = ModelBank()
    self.gene_bank = GeneBank()
    self.model_registry = ModelRegistry()
    for location in scan_models_dir(self.args.models_dir):
        self.model_registry.add(location)

    self.parser = DefinitionParser(
        self.cfg,
        self.model_bank,
        self.gene_bank,
        self.model_registry,
        self.profile_factory,
    )
    definition = self.model_registry['foo'].get_definition(fqn)
    self.parser.parse([definition])

    parsed_model = self.model_bank[fqn]
    self.assertEqual(parsed_model.inter_gene_max_space, 222)
def setUp(self):
    """Reset the known loggers, then build the config, the empty banks and the system parser."""
    root_logger = logging.getLogger()
    root_logger.manager.loggerDict.clear()

    # add only one handler to the macsypy logger
    from macsypy.system_parser import _log
    if platform.system() == 'Windows':
        log_file = 'NUL'
    else:
        log_file = '/dev/null'
    macsy_log = _log.parent
    macsy_log.addHandler(logging.FileHandler(log_file))

    data_dir = self._data_dir
    self.cfg = Config(
        sequence_db=os.path.join(data_dir, "base", "test_base.fa"),
        db_type="gembase",
        hmmer_exe="",
        e_value_res=1,
        i_evalue_sel=0.5,
        def_dir=os.path.join(data_dir, 'DEF'),
        res_search_dir=tempfile.gettempdir(),
        res_search_suffix="",
        profile_dir=os.path.join(data_dir, 'profiles'),
        profile_suffix=".hmm",
        res_extract_suffix="",
        log_level=30,
        log_file=log_file,
    )

    # the internal storage of each bank is replaced by an empty dict before each test
    self.system_bank = SystemBank()
    self.system_bank._system_bank = {}
    self.gene_bank = GeneBank()
    self.gene_bank._genes_bank = {}

    self.parser = SystemParser(self.cfg, self.system_bank, self.gene_bank)
def test_search_systems_model_unknown(self):
    """Searching with a model name which does not exist must end with the expected error message."""
    logger = logging.getLogger('macsypy.macsyfinder')
    macsypy.logger_set_level(level='ERROR')
    defaults = MacsyDefaults()

    out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems')
    os.mkdir(out_dir)
    seq_db = self.find_data('base', 'test_1.fasta')
    model_dir = self.find_data('data_set', 'models')

    args = f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models nimporaoik -w 4 -o {out_dir}"
    _, parsed_args = parse_args(args.split())
    config = Config(defaults, parsed_args)
    model_bank = ModelBank()
    gene_bank = GeneBank()
    profile_factory = ProfileFactory(config)

    # sys.exit is swapped for self.fake_exit during the call and always restored afterwards;
    # the test expects a TypeError to come out of search_systems in that setup
    real_exit = sys.exit
    sys.exit = self.fake_exit
    try:
        with self.assertRaises(TypeError) as ctx:
            _ = search_systems(config, model_bank, gene_bank, profile_factory, logger)
        self.assertEqual(
            str(ctx.exception),
            "macsyfinder: \"No such model definition: 'nimporaoik'\"")
    finally:
        sys.exit = real_exit
def test_search_systems_unordered(self):
    """Search all the models of set_1 on an unordered replicon and check the ids of the results."""
    logger = logging.getLogger('macsypy.macsyfinder')
    macsypy.logger_set_level(level='ERROR')
    defaults = MacsyDefaults()

    out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems')
    os.mkdir(out_dir)
    seq_db = self.find_data('base', 'VICH001.B.00001.C001.prt')
    model_dir = self.find_data('data_set', 'models')

    # test unordered replicon
    args = f"--sequence-db {seq_db} --db-type=unordered --models-dir {model_dir} --models set_1 all -w 4 -o {out_dir}"
    _, parsed_args = parse_args(args.split())
    config = Config(defaults, parsed_args)
    model_bank = ModelBank()
    gene_bank = GeneBank()
    profile_factory = ProfileFactory(config)

    found, uncomplete = search_systems(config, model_bank, gene_bank, profile_factory, logger)

    found_ids = [s.id for s in found]
    self.assertListEqual(
        found_ids,
        [
            'Unordered_T2SS_4',
            'Unordered_MSH_3',
            'Unordered_T4P_5',
            'Unordered_T4bP_6',
        ])

    uncomplete_ids = [s.id for s in uncomplete]
    self.assertListEqual(
        uncomplete_ids,
        [
            'Unordered_Archaeal-T4P_1',
            'Unordered_ComM_2',
            'Unordered_Tad_7',
        ])
def setUp(self):
    """Build the arguments, configuration, banks, model registry and definition parser."""
    self.args = argparse.Namespace(
        sequence_db=self.find_data("base", "test_1.fasta"),
        db_type='gembase',
        models_dir=self.find_data('models'),
        res_search_dir=tempfile.gettempdir(),
    )
    self.cfg = Config(MacsyDefaults(), self.args)

    self.model_bank = ModelBank()
    self.gene_bank = GeneBank()
    self.profile_factory = ProfileFactory(self.cfg)

    # register every models location found in the models directory
    self.model_registry = ModelRegistry()
    for location in scan_models_dir(self.args.models_dir):
        self.model_registry.add(location)

    self.parser = DefinitionParser(
        self.cfg,
        self.model_bank,
        self.gene_bank,
        self.model_registry,
        self.profile_factory,
    )
def setUp(self):
    """
    Build the configuration, the empty system and gene banks and the system parser.

    The temporary directory and the null device are taken from the standard library
    instead of being hard-coded, so the fixture does not depend on POSIX paths.
    """
    # local import: this fixture is the only code here which is known to need it
    import tempfile

    self.cfg = Config(sequence_db=os.path.join(self._data_dir, "base", "test_base.fa"),
                      db_type="gembase",
                      hmmer_exe="",
                      e_value_res=1,
                      i_evalue_sel=0.5,
                      def_dir=os.path.join(self._data_dir, 'DEF'),
                      res_search_dir=tempfile.gettempdir(),
                      res_search_suffix="",
                      profile_dir=os.path.join(self._data_dir, 'profiles'),
                      profile_suffix=".hmm",
                      res_extract_suffix="",
                      log_level=30,
                      log_file=os.devnull)

    # the internal storage of each bank is replaced by an empty dict before each test
    self.system_bank = SystemBank()
    self.system_bank._system_bank = {}
    self.gene_bank = GeneBank()
    self.gene_bank._genes_bank = {}
    self.parser = SystemParser(self.cfg, self.system_bank, self.gene_bank)
def test_max_nb_genes_cfg(self):
    """
    Check max_nb_genes in two situations:

    - the value is only specified in the xml definition,
    - the value is specified from the configuration,
      so it must overload the value read from the xml definition.
    """
    self.model_bank = ModelBank()
    self.gene_bank = GeneBank()
    self.model_registry = ModelRegistry()
    for ml in scan_models_dir(self.args.models_dir):
        self.model_registry.add(ml)

    # max_nb_genes is specified in xml
    # no user configuration on this
    model_fqn = 'foo/model_6'  # 4 genes in this model but xml specify 3
    self.cfg = Config(MacsyDefaults(), self.args)
    self.parser = DefinitionParser(self.cfg, self.model_bank, self.gene_bank,
                                   self.model_registry, self.profile_factory)
    models_2_detect = [self.model_registry['foo'].get_definition(model_fqn)]
    self.parser.parse(models_2_detect)
    m = self.model_bank[model_fqn]
    self.assertEqual(m.max_nb_genes, 3)

    # max_nb_genes is specified from configuration
    # so this value must overload the value read from xml
    model_fqn = 'foo/model_5'  # 4 genes in this model
    self.args.max_nb_genes = [[model_fqn, '6']]
    self.cfg = Config(MacsyDefaults(), self.args)
    # the banks and the registry filled above are reused by this second parser
    self.parser = DefinitionParser(self.cfg, self.model_bank, self.gene_bank,
                                   self.model_registry, self.profile_factory)
    models_2_detect = [self.model_registry['foo'].get_definition(model_fqn)]
    self.parser.parse(models_2_detect)
    m = self.model_bank[model_fqn]
    self.assertEqual(m.max_nb_genes, 6)
def test_multi_loci_cfg(self):
    """
    multi_loci is specified from the configuration,
    so this value must overload the value read from the xml definition.
    """
    fqn = 'foo/model_5'
    self.args.multi_loci = fqn
    self.cfg = Config(MacsyDefaults(), self.args)

    self.model_bank = ModelBank()
    self.gene_bank = GeneBank()
    self.model_registry = ModelRegistry()
    for location in scan_models_dir(self.args.models_dir):
        self.model_registry.add(location)

    self.parser = DefinitionParser(
        self.cfg,
        self.model_bank,
        self.gene_bank,
        self.model_registry,
        self.profile_factory,
    )
    definition = self.model_registry['foo'].get_definition(fqn)
    self.parser.parse([definition])

    parsed_model = self.model_bank[fqn]
    self.assertTrue(parsed_model.multi_loci)
class Test(MacsyTest):
    """Tests of the GeneBank container: adding, retrieving, membership, iteration and uniqueness."""

    def setUp(self):
        """Build the configuration, model location, empty gene bank and profile factory."""
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        args.res_search_dir = tempfile.gettempdir()
        args.log_level = 30
        self.cfg = Config(MacsyDefaults(), args)
        self.model_name = 'foo'
        self.model_location = ModelLocation(path=os.path.join(args.models_dir, self.model_name))
        self.gene_bank = GeneBank()
        self.profile_factory = ProfileFactory(self.cfg)

    def tearDown(self):
        """Remove the working directory of the run, if any (best effort)."""
        # working_dir is an accessor which must be called to get the path;
        # a missing or partially removable directory is deliberately ignored
        shutil.rmtree(self.cfg.working_dir(), ignore_errors=True)

    def test_add_get_gene(self):
        """A gene can be retrieved only after being added; adding it twice changes nothing."""
        gene_name = 'sctJ_FLG'
        with self.assertRaises(KeyError) as ctx:
            self.gene_bank[f"foo/{gene_name}"]
        self.assertEqual(str(ctx.exception),
                         f"\"No such gene 'foo/{gene_name}' in this bank\"")

        model_foo = Model(self.model_name, 10)
        self.gene_bank.add_new_gene(self.model_location, gene_name, self.profile_factory)
        gene_from_bank = self.gene_bank[(model_foo.family_name, gene_name)]
        self.assertTrue(isinstance(gene_from_bank, CoreGene))
        self.assertEqual(gene_from_bank.name, gene_name)

        # adding the same gene a second time must let the bank content unchanged
        gbk_contains_before = list(self.gene_bank)
        self.gene_bank.add_new_gene(self.model_location, gene_name, self.profile_factory)
        gbk_contains_after = list(self.gene_bank)
        self.assertEqual(gbk_contains_before, gbk_contains_after)

        # a gene without profile cannot be added
        gene_name = "bar"
        with self.assertRaises(MacsypyError) as ctx:
            self.gene_bank.add_new_gene(self.model_location, gene_name, self.profile_factory)
        self.assertEqual(str(ctx.exception),
                         f"'{self.model_name}/{gene_name}': No such profile")

    def test_contains(self):
        """A gene got from the bank is in the bank; a gene built outside the bank is not."""
        model_foo = Model("foo/bar", 10)
        gene_name = 'sctJ_FLG'

        self.gene_bank.add_new_gene(self.model_location, gene_name, self.profile_factory)
        gene_in = self.gene_bank[(model_foo.family_name, gene_name)]
        self.assertIn(gene_in, self.gene_bank)

        gene_name = 'abc'
        c_gene_out = CoreGene(self.model_location, gene_name, self.profile_factory)
        gene_out = ModelGene(c_gene_out, model_foo)
        self.assertNotIn(gene_out, self.gene_bank)

    def test_iter(self):
        """Iterating over the bank gives the genes in the order they were added."""
        genes_names = ['sctJ_FLG', 'abc']
        for g in genes_names:
            self.gene_bank.add_new_gene(self.model_location, g, self.profile_factory)
        self.assertListEqual([g.name for g in self.gene_bank], genes_names)

    def test_genes_fqn(self):
        """genes_fqn gives '<model location name>/<gene name>' for each gene of the bank."""
        genes_names = ['sctJ_FLG', 'abc']
        for g in genes_names:
            self.gene_bank.add_new_gene(self.model_location, g, self.profile_factory)
        self.assertSetEqual(
            set(self.gene_bank.genes_fqn()),
            {f"{self.model_location.name}/{g.name}" for g in self.gene_bank})

    def test_get_uniq_object(self):
        """Adding the same gene twice stores only one object in the bank."""
        gene_name = 'sctJ_FLG'
        self.gene_bank.add_new_gene(self.model_location, gene_name, self.profile_factory)
        self.gene_bank.add_new_gene(self.model_location, gene_name, self.profile_factory)
        self.assertEqual(len(self.gene_bank), 1)
def test_search_systems(self):
    """
    Run search_systems on a gembase replicon in three situations:
    systems are found, hits are found but no system, no hit at all.
    """
    logger = logging.getLogger('macsypy.macsyfinder')
    macsypy.logger_set_level(level='ERROR')
    defaults = MacsyDefaults()

    out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems')
    os.mkdir(out_dir)

    def run_search(command_line):
        # each run gets its own config, banks and profile factory
        _, parsed_args = parse_args(command_line.split())
        config = Config(defaults, parsed_args)
        model_bank = ModelBank()
        gene_bank = GeneBank()
        profile_factory = ProfileFactory(config)
        return search_systems(config, model_bank, gene_bank, profile_factory, logger)

    # test gembase replicon
    seq_db = self.find_data('base', 'VICH001.B.00001.C001.prt')
    model_dir = self.find_data('data_set', 'models')
    systems, rejected_clst = run_search(
        f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 all -w 4 -o {out_dir}"
    )

    # (system id, score) in the order the systems are expected
    expected = [
        ('VICH001.B.00001.C001_MSH_5', 10.5),
        ('VICH001.B.00001.C001_MSH_7', 10.0),
        ('VICH001.B.00001.C001_T4P_25', 12.0),
        ('VICH001.B.00001.C001_T4P_23', 9.5),
        ('VICH001.B.00001.C001_T4P_21', 9.0),
        ('VICH001.B.00001.C001_T4P_22', 8.5),
        ('VICH001.B.00001.C001_T4P_17', 6.0),
        ('VICH001.B.00001.C001_T4P_16', 5.0),
        ('VICH001.B.00001.C001_T4bP_26', 5.5),
        ('VICH001.B.00001.C001_T4P_24', 10.5),
        ('VICH001.B.00001.C001_T4P_18', 7.5),
        ('VICH001.B.00001.C001_T4P_19', 7.0),
        ('VICH001.B.00001.C001_T4P_20', 8.0),
        ('VICH001.B.00001.C001_T2SS_10', 8.3),
        ('VICH001.B.00001.C001_T2SS_9', 7.5),
    ]
    self.assertListEqual([s.id for s in systems],
                         [sys_id for sys_id, _ in expected])
    self.assertListEqual([s.score for s in systems],
                         [score for _, score in expected])
    self.assertEqual(len(rejected_clst), 11)

    # test hits but No Systems
    systems, rejected_clst = run_search(
        f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 Tad -w 4 -o {out_dir}"
    )
    self.assertEqual(systems, [])

    # test No hits
    seq_db = self.find_data('base', 'test_1.fasta')
    systems, rejected_clst = run_search(
        f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 T4bP -w 4 -o {out_dir}"
    )
    self.assertEqual(systems, [])
    self.assertEqual(rejected_clst, [])
def main(args=None, loglevel=None):
    """
    main entry point to MacSyFinder
    do some checks before launching :func:`search_systems`,
    which is the function that performs the search,
    then write the results in the working directory.

    For 'gembase' and 'ordered_replicon' db types the files written are
    all_systems.txt, all_systems.tsv, rejected_clusters.txt,
    all_best_solutions.tsv and best_solution.tsv.
    For the other db types the files written are
    all_systems.txt, all_systems.tsv and uncomplete_systems.txt.

    :param args: the arguments passed on the command line without the program name
    :type args: List of string
    :param loglevel: the output verbosity
    :type loglevel: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raise ValueError: if the working directory already exists
                       and is not an empty directory
    :raise OptionError: if neither --previous-run nor each of
                        --models, --sequence-db and --db-type is provided
    """
    import time

    args = sys.argv[1:] if args is None else args
    parser, parsed_args = parse_args(args)

    defaults = MacsyDefaults()
    config = Config(defaults, parsed_args)

    ###########################
    # creation of working dir
    ###########################
    working_dir = config.working_dir()
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
    else:
        if os.path.isdir(working_dir):
            if os.listdir(working_dir):
                raise ValueError(f"'{working_dir}' already exists and is not a empty")
        else:
            raise ValueError(f"'{working_dir}' already exists and is not a directory")

    ################
    # init loggers #
    ################
    macsypy.init_logger(log_file=os.path.join(working_dir, config.log_file()),
                        out=not config.mute())
    if not loglevel:
        # logs are specified from args options
        macsypy.logger_set_level(level=config.log_level())
    else:
        # used by unit tests to mute or unmute logs
        macsypy.logger_set_level(level=loglevel)

    logger = logging.getLogger('macsypy.macsyfinder')

    if parsed_args.list_models:
        print(list_models(parsed_args), file=sys.stdout)
        sys.exit(0)
    else:
        if not parsed_args.previous_run and not parsed_args.models:
            parser.print_help()
            print()
            # hide the traceback, only the error message is displayed to the user
            sys.tracebacklimit = 0
            raise OptionError("argument --models or --previous-run is required.")
        elif not parsed_args.previous_run and not parsed_args.sequence_db:
            parser.print_help()
            print()
            sys.tracebacklimit = 0
            raise OptionError("argument --sequence-db or --previous-run is required.")
        elif not parsed_args.previous_run and not parsed_args.db_type:
            parser.print_help()
            print()
            sys.tracebacklimit = 0
            raise OptionError("argument --db-type or --previous-run is required.")

        # use the logger created above, as everywhere else in this function
        logger.info(f"command used: {' '.join(sys.argv)}")

        models = ModelBank()
        genes = GeneBank()
        profile_factory = ProfileFactory(config)

        # module-level weights, set once before the search
        macsypy.hit.hit_weight = macsypy.hit.HitWeight(itself=3,
                                                       exchangeable=.75,
                                                       mandatory=2,
                                                       accessory=.25,
                                                       neutral=1.5)

        logger.info("\n{:#^70}".format(" Searching systems "))
        all_systems, rejected_clusters = search_systems(config, models, genes, profile_factory, logger)

        track_multi_systems_hit = HitSystemTracker(all_systems)
        if config.db_type() in ('gembase', 'ordered_replicon'):
            #############################
            # Ordered/Gembase replicons #
            #############################

            ###########################
            # select the best systems #
            ###########################
            logger.info("\n{:#^70}".format(" Computing best solutions "))
            best_solutions = []
            one_best_solution = []

            # group systems found by replicon
            # before to search best system combination
            # NOTE(review): groupby only merges adjacent items, so this assumes
            # all_systems is already ordered by replicon name - confirm against search_systems
            for rep_name, syst_group in itertools.groupby(all_systems, key=lambda s: s.replicon_name):
                syst_group = list(syst_group)
                logger.info(f"Computing best solutions for {rep_name} (nb of systems {len(syst_group)})")
                t0 = time.time()
                best_sol_4_1_replicon, score = find_best_solutions(syst_group)
                t1 = time.time()
                logger.info(f"It took {t1 - t0:.2f}sec to find best solution ({score:.2f}) for replicon {rep_name}")
                # if several solutions are equivalent same number of system and score is same
                # store all equivalent solution in best_solution => all_best_systems
                # pick one in one_best_solution => best_systems
                best_solutions.extend(best_sol_4_1_replicon)
                one_best_solution.append(best_sol_4_1_replicon[0])

            ##############################
            # Write the results in files #
            ##############################
            logger.info("\n{:#^70}".format(" Writing down results "))
            system_filename = os.path.join(working_dir, "all_systems.txt")
            tsv_filename = os.path.join(working_dir, "all_systems.tsv")

            with open(system_filename, "w") as sys_file:
                systems_to_txt(all_systems, track_multi_systems_hit, sys_file)

            with open(tsv_filename, "w") as tsv_file:
                systems_to_tsv(all_systems, track_multi_systems_hit, tsv_file)

            cluster_filename = os.path.join(working_dir, "rejected_clusters.txt")
            with open(cluster_filename, "w") as clst_file:
                rejected_clusters.sort(key=lambda clst: (clst.replicon_name, clst.model, clst.hits))
                rejected_clst_to_txt(rejected_clusters, clst_file)
            if not (all_systems or rejected_clusters):
                logger.info("No Systems found in this dataset.")

            tsv_filename = os.path.join(working_dir, "all_best_solutions.tsv")
            with open(tsv_filename, "w") as tsv_file:
                solutions_to_tsv(best_solutions, track_multi_systems_hit, tsv_file)

            tsv_filename = os.path.join(working_dir, "best_solution.tsv")
            with open(tsv_filename, "w") as tsv_file:
                # flatten the list and sort it
                one_best_solution = [syst for sol in one_best_solution for syst in sol]
                one_best_solution.sort(key=lambda syst: (syst.replicon_name,
                                                         syst.position[0],
                                                         syst.model.fqn,
                                                         -syst.score))
                systems_to_tsv(one_best_solution, track_multi_systems_hit, tsv_file)
        else:
            #######################
            # Unordered replicons #
            #######################

            ##############################
            # Write the results in files #
            ##############################
            logger.info("\n{:#^70}".format(" Writing down results "))

            system_filename = os.path.join(working_dir, "all_systems.txt")
            with open(system_filename, "w") as sys_file:
                likely_systems_to_txt(all_systems, track_multi_systems_hit, sys_file)

            system_filename = os.path.join(working_dir, "all_systems.tsv")
            with open(system_filename, "w") as sys_file:
                likely_systems_to_tsv(all_systems, track_multi_systems_hit, sys_file)

            cluster_filename = os.path.join(working_dir, "uncomplete_systems.txt")
            with open(cluster_filename, "w") as clst_file:
                unlikely_systems_to_txt(rejected_clusters, clst_file)

            if not (all_systems or rejected_clusters):
                logger.info("No Systems found in this dataset.")

    logger.info("END")