Exemple #1
0
    def setUp(self):
        """Create the configuration, the 'foo' model location and the empty banks used by each test."""
        # Options normally coming from the command line, gathered in one namespace.
        options = argparse.Namespace(
            sequence_db=self.find_data("base", "test_1.fasta"),
            db_type='gembase',
            models_dir=self.find_data('models'),
            res_search_dir=tempfile.gettempdir(),
            log_level=30,
        )
        self.cfg = Config(MacsyDefaults(), options)

        self.model_name = 'foo'
        model_path = os.path.join(options.models_dir, self.model_name)
        self.model_location = ModelLocation(path=model_path)
        self.gene_bank = GeneBank()
        self.profile_factory = ProfileFactory(self.cfg)
    def test_min_genes_required_cfg(self):
        """Check that min_genes_required set in the configuration is the value found on the parsed model."""
        # test min_genes_required is specified from configuration
        # so this value must overload the value read from xml
        model_fqn = 'foo/model_5'

        # The option is a list of [model fully qualified name, value] pairs;
        # the value is given as a string and is expected back as an int.
        min_genes_required = [[model_fqn, '4']]
        self.args.min_genes_required = min_genes_required

        # Rebuild the configuration and the parsing chain from the updated arguments.
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        models_location = scan_models_dir(self.args.models_dir)
        for ml in models_location:
            self.model_registry.add(ml)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.min_genes_required, 4)
    def test_inter_gene_max_space_cfg(self):
        """Check that inter_gene_max_space set in the configuration is the value found on the parsed model."""
        # The configuration value must take precedence over the one read from the xml.
        fqn = 'foo/model_5'
        self.args.inter_gene_max_space = [[fqn, '222']]

        # Fresh configuration, banks and registry built from the updated arguments.
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        for location in scan_models_dir(self.args.models_dir):
            self.model_registry.add(location)
        self.parser = DefinitionParser(
            self.cfg,
            self.model_bank,
            self.gene_bank,
            self.model_registry,
            self.profile_factory,
        )

        definition = self.model_registry['foo'].get_definition(fqn)
        self.parser.parse([definition])
        parsed_model = self.model_bank[fqn]
        self.assertEqual(parsed_model.inter_gene_max_space, 222)
 def setUp(self):
     """Reset the logging registry, then build a fresh configuration, banks and system parser."""
     # Empty the logging manager's registry of named loggers.
     root_logger = logging.getLogger()
     root_logger.manager.loggerDict.clear()

     # add only one handler to the macsypy logger
     from macsypy.system_parser import _log
     if platform.system() == 'Windows':
         null_device = 'NUL'
     else:
         null_device = '/dev/null'
     _log.parent.addHandler(logging.FileHandler(null_device))

     data_dir = self._data_dir
     self.cfg = Config(
         sequence_db=os.path.join(data_dir, "base", "test_base.fa"),
         db_type="gembase",
         hmmer_exe="",
         e_value_res=1,
         i_evalue_sel=0.5,
         def_dir=os.path.join(data_dir, 'DEF'),
         res_search_dir=tempfile.gettempdir(),
         res_search_suffix="",
         profile_dir=os.path.join(data_dir, 'profiles'),
         profile_suffix=".hmm",
         res_extract_suffix="",
         log_level=30,
         log_file=null_device,
     )

     # Start every test from empty banks.
     self.system_bank = SystemBank()
     self.system_bank._system_bank = {}
     self.gene_bank = GeneBank()
     self.gene_bank._genes_bank = {}
     self.parser = SystemParser(self.cfg, self.system_bank, self.gene_bank)
Exemple #5
0
    def test_search_systems_model_unknown(self):
        """Check the error raised by search_systems when the requested model does not exist."""
        logger = logging.getLogger('macsypy.macsyfinder')
        macsypy.logger_set_level(level='ERROR')
        defaults = MacsyDefaults()

        out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems')
        os.mkdir(out_dir)
        seq_db = self.find_data('base', 'test_1.fasta')
        model_dir = self.find_data('data_set', 'models')
        cmd_line = (f"--sequence-db {seq_db} --db-type=gembase "
                    f"--models-dir {model_dir} --models nimporaoik -w 4 -o {out_dir}")

        _, parsed_args = parse_args(cmd_line.split())
        config = Config(defaults, parsed_args)
        model_bank = ModelBank()
        gene_bank = GeneBank()
        profile_factory = ProfileFactory(config)

        # Swap sys.exit for the test double for the duration of the call,
        # and always put the real one back afterwards.
        real_exit = sys.exit
        sys.exit = self.fake_exit
        try:
            with self.assertRaises(TypeError) as ctx:
                search_systems(config, model_bank, gene_bank,
                               profile_factory, logger)
            expected_msg = "macsyfinder: \"No such model definition: 'nimporaoik'\""
            self.assertEqual(str(ctx.exception), expected_msg)
        finally:
            sys.exit = real_exit
Exemple #6
0
    def test_search_systems_unordered(self):
        """Run a search with db-type 'unordered' and check the ids of both returned lists."""
        logger = logging.getLogger('macsypy.macsyfinder')
        macsypy.logger_set_level(level='ERROR')
        defaults = MacsyDefaults()

        out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems')
        os.mkdir(out_dir)
        seq_db = self.find_data('base', 'VICH001.B.00001.C001.prt')
        model_dir = self.find_data('data_set', 'models')
        # test unordered replicon
        cmd_line = (f"--sequence-db {seq_db} --db-type=unordered "
                    f"--models-dir {model_dir} --models set_1 all -w 4 -o {out_dir}")

        _, parsed_args = parse_args(cmd_line.split())
        config = Config(defaults, parsed_args)
        found, uncomplete = search_systems(config,
                                           ModelBank(),
                                           GeneBank(),
                                           ProfileFactory(config),
                                           logger)

        # First returned list: ids are compared in order.
        found_ids = [syst.id for syst in found]
        self.assertListEqual(
            found_ids,
            ['Unordered_T2SS_4', 'Unordered_MSH_3', 'Unordered_T4P_5',
             'Unordered_T4bP_6'])

        # Second returned list: ids are compared in order as well.
        uncomplete_ids = [syst.id for syst in uncomplete]
        self.assertListEqual(
            uncomplete_ids,
            ['Unordered_Archaeal-T4P_1', 'Unordered_ComM_2', 'Unordered_Tad_7'])
    def setUp(self):
        """Build the arguments, configuration, banks, model registry and definition parser for each test."""
        # Keep the namespace on the instance: the tests update it and rebuild the configuration.
        self.args = argparse.Namespace(
            sequence_db=self.find_data("base", "test_1.fasta"),
            db_type='gembase',
            models_dir=self.find_data('models'),
            res_search_dir=tempfile.gettempdir(),
        )

        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.profile_factory = ProfileFactory(self.cfg)

        # Register every model location found under the models directory.
        self.model_registry = ModelRegistry()
        for location in scan_models_dir(self.args.models_dir):
            self.model_registry.add(location)

        self.parser = DefinitionParser(
            self.cfg,
            self.model_bank,
            self.gene_bank,
            self.model_registry,
            self.profile_factory,
        )
Exemple #8
0
 def setUp(self):
     """Build a fresh configuration, empty banks and a system parser for each test.

     The results directory and the log file are resolved through the standard
     library (system temporary directory, null device) instead of hard-coded
     POSIX paths, so the fixture does not depend on the platform.
     """
     # Imported here so the method does not depend on a module-level import.
     import tempfile

     self.cfg = Config(sequence_db=os.path.join(self._data_dir, "base",
                                                "test_base.fa"),
                       db_type="gembase",
                       hmmer_exe="",
                       e_value_res=1,
                       i_evalue_sel=0.5,
                       def_dir=os.path.join(self._data_dir, 'DEF'),
                       res_search_dir=tempfile.gettempdir(),
                       res_search_suffix="",
                       profile_dir=os.path.join(self._data_dir, 'profiles'),
                       profile_suffix=".hmm",
                       res_extract_suffix="",
                       log_level=30,
                       log_file=os.devnull)
     # Start every test from empty banks.
     self.system_bank = SystemBank()
     self.system_bank._system_bank = {}
     self.gene_bank = GeneBank()
     self.gene_bank._genes_bank = {}
     self.parser = SystemParser(self.cfg, self.system_bank, self.gene_bank)
    def test_max_nb_genes_cfg(self):
        """Check max_nb_genes both when it comes from the xml and when the configuration sets it."""
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        models_location = scan_models_dir(self.args.models_dir)
        for ml in models_location:
            self.model_registry.add(ml)

        # max_nb_genes is specified in xml
        # no user configuration on this
        model_fqn = 'foo/model_6'  # 4 genes in this model but xml specify 3
        self.cfg = Config(MacsyDefaults(), self.args)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.max_nb_genes, 3)

        # max_nb_genes is specified from configuration
        # so this value must overload the value read from xml
        model_fqn = 'foo/model_5'  # 4 genes in this model
        max_nb_genes = [[model_fqn, '6']]
        self.args.max_nb_genes = max_nb_genes
        # A new configuration and parser are needed to take the updated arguments into account.
        self.cfg = Config(MacsyDefaults(), self.args)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.max_nb_genes, 6)
    def test_multi_loci_cfg(self):
        """Check that a model named in the multi_loci option is parsed with multi_loci set."""
        # The configuration value must take precedence over the one read from the xml.
        fqn = 'foo/model_5'
        self.args.multi_loci = fqn

        # Fresh configuration, banks and registry built from the updated arguments.
        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        for location in scan_models_dir(self.args.models_dir):
            self.model_registry.add(location)
        self.parser = DefinitionParser(
            self.cfg,
            self.model_bank,
            self.gene_bank,
            self.model_registry,
            self.profile_factory,
        )

        definition = self.model_registry['foo'].get_definition(fqn)
        self.parser.parse([definition])
        parsed_model = self.model_bank[fqn]
        self.assertTrue(parsed_model.multi_loci)
Exemple #11
0
class Test(MacsyTest):
    """Unit tests for GeneBank: adding, retrieving, membership, iteration and uniqueness of genes."""

    def setUp(self):
        """Create the configuration, the 'foo' model location, an empty gene bank and a profile factory."""
        args = argparse.Namespace()
        args.sequence_db = self.find_data("base", "test_1.fasta")
        args.db_type = 'gembase'
        args.models_dir = self.find_data('models')
        args.res_search_dir = tempfile.gettempdir()
        args.log_level = 30
        self.cfg = Config(MacsyDefaults(), args)

        self.model_name = 'foo'
        self.model_location = ModelLocation(
            path=os.path.join(args.models_dir, self.model_name))
        self.gene_bank = GeneBank()
        self.profile_factory = ProfileFactory(self.cfg)

    def tearDown(self):
        """Remove the working directory of the configuration, if any.

        ``working_dir`` is a method of the configuration and must be called to
        obtain the path. Removal is best effort: a directory that is missing
        or cannot be removed must not make the test fail.
        """
        shutil.rmtree(self.cfg.working_dir(), ignore_errors=True)

    def test_add_get_gene(self):
        """Check gene lookup before and after insertion, and the error for a gene without profile."""
        gene_name = 'sctJ_FLG'
        # Looking up a gene which has not been added yet must fail.
        with self.assertRaises(KeyError) as ctx:
            self.gene_bank[f"foo/{gene_name}"]
        self.assertEqual(str(ctx.exception),
                         f"\"No such gene 'foo/{gene_name}' in this bank\"")
        model_foo = Model(self.model_name, 10)

        self.gene_bank.add_new_gene(self.model_location, gene_name,
                                    self.profile_factory)

        gene_from_bank = self.gene_bank[(model_foo.family_name, gene_name)]
        self.assertTrue(isinstance(gene_from_bank, CoreGene))
        self.assertEqual(gene_from_bank.name, gene_name)

        # Adding the same gene a second time must leave the bank unchanged.
        gbk_contains_before = list(self.gene_bank)
        self.gene_bank.add_new_gene(self.model_location, gene_name,
                                    self.profile_factory)
        gbk_contains_after = list(self.gene_bank)
        self.assertEqual(gbk_contains_before, gbk_contains_after)

        # Adding this gene is expected to fail with a "No such profile" error.
        gene_name = "bar"
        with self.assertRaises(MacsypyError) as ctx:
            self.gene_bank.add_new_gene(self.model_location, gene_name,
                                        self.profile_factory)
        self.assertEqual(str(ctx.exception),
                         f"'{self.model_name}/{gene_name}': No such profile")

    def test_contains(self):
        """Check that membership is true for a gene of the bank and false for a foreign gene."""
        model_foo = Model("foo/bar", 10)
        gene_name = 'sctJ_FLG'

        self.gene_bank.add_new_gene(self.model_location, gene_name,
                                    self.profile_factory)
        gene_in = self.gene_bank[(model_foo.family_name, gene_name)]
        self.assertIn(gene_in, self.gene_bank)

        # This gene is built directly, without going through the bank.
        gene_name = 'abc'
        c_gene_out = CoreGene(self.model_location, gene_name,
                              self.profile_factory)
        gene_out = ModelGene(c_gene_out, model_foo)
        self.assertNotIn(gene_out, self.gene_bank)

    def test_iter(self):
        """Check that iterating over the bank yields the genes in insertion order."""
        genes_names = ['sctJ_FLG', 'abc']
        for g in genes_names:
            self.gene_bank.add_new_gene(self.model_location, g,
                                        self.profile_factory)
        self.assertListEqual([g.name for g in self.gene_bank], genes_names)

    def test_genes_fqn(self):
        """Check that genes_fqn returns '<model location name>/<gene name>' for every gene."""
        genes_names = ['sctJ_FLG', 'abc']
        for g in genes_names:
            self.gene_bank.add_new_gene(self.model_location, g,
                                        self.profile_factory)
        self.assertSetEqual(
            set(self.gene_bank.genes_fqn()),
            {f"{self.model_location.name}/{g.name}"
             for g in self.gene_bank})

    def test_get_uniq_object(self):
        """Check that adding the same gene twice stores it only once."""
        gene_name = 'sctJ_FLG'
        self.gene_bank.add_new_gene(self.model_location, gene_name,
                                    self.profile_factory)
        self.gene_bank.add_new_gene(self.model_location, gene_name,
                                    self.profile_factory)
        self.assertEqual(len(self.gene_bank), 1)
Exemple #12
0
    def test_search_systems(self):
        """Run three gembase searches: systems found, hits without systems, and no hits at all."""
        logger = logging.getLogger('macsypy.macsyfinder')
        macsypy.logger_set_level(level='ERROR')
        defaults = MacsyDefaults()

        out_dir = os.path.join(self.tmp_dir, 'macsyfinder_test_search_systems')
        os.mkdir(out_dir)

        def run_search(cmd_line):
            # Every search starts from a new configuration, new banks and a new profile factory.
            _, parsed_args = parse_args(cmd_line.split())
            config = Config(defaults, parsed_args)
            return search_systems(config,
                                  ModelBank(),
                                  GeneBank(),
                                  ProfileFactory(config),
                                  logger)

        # test gembase replicon
        seq_db = self.find_data('base', 'VICH001.B.00001.C001.prt')
        model_dir = self.find_data('data_set', 'models')
        systems, rejected_clst = run_search(
            f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 all -w 4 -o {out_dir}"
        )

        found_ids = [syst.id for syst in systems]
        self.assertListEqual(found_ids, [
            'VICH001.B.00001.C001_MSH_5', 'VICH001.B.00001.C001_MSH_7',
            'VICH001.B.00001.C001_T4P_25', 'VICH001.B.00001.C001_T4P_23',
            'VICH001.B.00001.C001_T4P_21', 'VICH001.B.00001.C001_T4P_22',
            'VICH001.B.00001.C001_T4P_17', 'VICH001.B.00001.C001_T4P_16',
            'VICH001.B.00001.C001_T4bP_26', 'VICH001.B.00001.C001_T4P_24',
            'VICH001.B.00001.C001_T4P_18', 'VICH001.B.00001.C001_T4P_19',
            'VICH001.B.00001.C001_T4P_20', 'VICH001.B.00001.C001_T2SS_10',
            'VICH001.B.00001.C001_T2SS_9'
        ])

        found_scores = [syst.score for syst in systems]
        self.assertListEqual(found_scores, [
            10.5, 10.0, 12.0, 9.5, 9.0, 8.5, 6.0, 5.0, 5.5, 10.5, 7.5, 7.0,
            8.0, 8.3, 7.5
        ])
        self.assertEqual(len(rejected_clst), 11)

        # test hits but No Systems
        systems, rejected_clst = run_search(
            f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 Tad -w 4 -o {out_dir}"
        )
        self.assertEqual(systems, [])

        # test No hits
        seq_db = self.find_data('base', 'test_1.fasta')
        systems, rejected_clst = run_search(
            f"--sequence-db {seq_db} --db-type=gembase --models-dir {model_dir} --models set_1 T4bP -w 4 -o {out_dir}"
        )
        self.assertEqual(systems, [])
        self.assertEqual(rejected_clst, [])
Exemple #13
0
def main(args=None, loglevel=None):
    """
    Main entry point of MacSyFinder: parse the arguments, prepare the working directory
    and the loggers, check the required options, then run :func:`search_systems`
    and write the results in the working directory.

    :param args: the arguments passed on the command line without the program name
                 (``sys.argv[1:]`` is used when None)
    :type args: List of string
    :param loglevel: the output verbosity; when not set, the level comes from the configuration
    :type loglevel: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raise ValueError: if the working directory already exists and is not an empty directory
    :raise OptionError: if a required option (--models, --sequence-db, --db-type,
                        or --previous-run) is missing
    """
    args = sys.argv[1:] if args is None else args
    parser, parsed_args = parse_args(args)

    defaults = MacsyDefaults()
    config = Config(defaults, parsed_args)

    ###########################
    # creation of working dir
    ###########################
    working_dir = config.working_dir()
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
    else:
        # an existing path is accepted only if it is an empty directory
        if os.path.isdir(working_dir):
            if os.listdir(working_dir):
                raise ValueError(
                    f"'{working_dir}' already exists and is not a empty")
        else:
            raise ValueError(
                f"'{working_dir}' already exists and is not a directory")

    ################
    # init loggers #
    ################
    macsypy.init_logger(log_file=os.path.join(config.working_dir(),
                                              config.log_file()),
                        out=not config.mute())
    if not loglevel:
        # logs are specify from args options
        macsypy.logger_set_level(level=config.log_level())
    else:
        # used by unit tests to mute or unmute logs
        macsypy.logger_set_level(level=loglevel)

    logger = logging.getLogger('macsypy.macsyfinder')

    if parsed_args.list_models:
        print(list_models(parsed_args), file=sys.stdout)
        sys.exit(0)
    else:
        # without --previous-run, --models, --sequence-db and --db-type are all mandatory
        if not parsed_args.previous_run and not parsed_args.models:
            parser.print_help()
            print()
            sys.tracebacklimit = 0
            raise OptionError(
                "argument --models or --previous-run is required.")
        elif not parsed_args.previous_run and not parsed_args.sequence_db:
            parser.print_help()
            print()
            sys.tracebacklimit = 0
            raise OptionError(
                "argument --sequence-db or --previous-run is required.")
        elif not parsed_args.previous_run and not parsed_args.db_type:
            parser.print_help()
            print()
            sys.tracebacklimit = 0
            raise OptionError(
                "argument --db-type or --previous-run is required.")

        # use the same logger as every other message of this function
        logger.info(f"command used: {' '.join(sys.argv)}")

        models = ModelBank()
        genes = GeneBank()
        profile_factory = ProfileFactory(config)
        macsypy.hit.hit_weight = macsypy.hit.HitWeight(itself=3,
                                                       exchangeable=.75,
                                                       mandatory=2,
                                                       accessory=.25,
                                                       neutral=1.5)

        logger.info("\n{:#^70}".format(" Searching systems "))
        all_systems, rejected_clusters = search_systems(
            config, models, genes, profile_factory, logger)

        track_multi_systems_hit = HitSystemTracker(all_systems)
        if config.db_type() in ('gembase', 'ordered_replicon'):
            #############################
            # Ordered/Gembase replicons #
            #############################

            ###########################
            # select the best systems #
            ###########################
            logger.info("\n{:#^70}".format(" Computing best solutions "))
            best_solutions = []
            one_best_solution = []

            # group systems found by replicon
            # before to search best system combination
            # NOTE(review): itertools.groupby only groups consecutive items, so this assumes
            # all_systems is already ordered by replicon_name -- confirm against search_systems
            import time
            for rep_name, syst_group in itertools.groupby(
                    all_systems, key=lambda s: s.replicon_name):
                syst_group = list(syst_group)
                logger.info(
                    f"Computing best solutions for {rep_name} (nb of systems {len(syst_group)})"
                )
                t0 = time.time()
                best_sol_4_1_replicon, score = find_best_solutions(syst_group)
                t1 = time.time()
                logger.info(
                    f"It took {t1 - t0:.2f}sec to find best solution ({score:.2f}) for replicon {rep_name}"
                )
                # if several solutions are equivalent same number of system and score is same
                # store all equivalent solution in best_solution => all_best_systems
                # pick one in one_best_solution => best_systems
                best_solutions.extend(best_sol_4_1_replicon)
                one_best_solution.append(best_sol_4_1_replicon[0])

            ##############################
            # Write the results in files #
            ##############################
            logger.info("\n{:#^70}".format(" Writing down results "))
            system_filename = os.path.join(config.working_dir(),
                                           "all_systems.txt")
            tsv_filename = os.path.join(config.working_dir(),
                                        "all_systems.tsv")

            with open(system_filename, "w") as sys_file:
                systems_to_txt(all_systems, track_multi_systems_hit, sys_file)

            with open(tsv_filename, "w") as tsv_file:
                systems_to_tsv(all_systems, track_multi_systems_hit, tsv_file)

            cluster_filename = os.path.join(config.working_dir(),
                                            "rejected_clusters.txt")
            with open(cluster_filename, "w") as clst_file:
                rejected_clusters.sort(key=lambda clst: (
                    clst.replicon_name, clst.model, clst.hits))
                rejected_clst_to_txt(rejected_clusters, clst_file)
            if not (all_systems or rejected_clusters):
                logger.info("No Systems found in this dataset.")

            tsv_filename = os.path.join(config.working_dir(),
                                        "all_best_solutions.tsv")
            with open(tsv_filename, "w") as tsv_file:
                solutions_to_tsv(best_solutions, track_multi_systems_hit,
                                 tsv_file)

            tsv_filename = os.path.join(config.working_dir(),
                                        "best_solution.tsv")
            with open(tsv_filename, "w") as tsv_file:
                # flatten the list and sort it
                one_best_solution = [
                    syst for sol in one_best_solution for syst in sol
                ]
                one_best_solution.sort(
                    key=lambda syst: (syst.replicon_name, syst.position[0],
                                      syst.model.fqn, -syst.score))
                systems_to_tsv(one_best_solution, track_multi_systems_hit,
                               tsv_file)
        else:
            #######################
            # Unordered replicons #
            #######################

            ##############################
            # Write the results in files #
            ##############################
            logger.info("\n{:#^70}".format(" Writing down results "))

            system_filename = os.path.join(config.working_dir(),
                                           "all_systems.txt")
            with open(system_filename, "w") as sys_file:
                likely_systems_to_txt(all_systems, track_multi_systems_hit,
                                      sys_file)

            # forbidden = [s for s in all_systems if s.forbidden_occ]
            # system_filename = os.path.join(config.working_dir(), "forbidden_components.tsv")
            # with open(system_filename, "w") as sys_file:
            #     likely_systems_to_tsv(forbidden, track_multi_systems_hit, sys_file)

            system_filename = os.path.join(config.working_dir(),
                                           "all_systems.tsv")
            with open(system_filename, "w") as sys_file:
                likely_systems_to_tsv(all_systems, track_multi_systems_hit,
                                      sys_file)

            cluster_filename = os.path.join(config.working_dir(),
                                            "uncomplete_systems.txt")
            with open(cluster_filename, "w") as clst_file:
                unlikely_systems_to_txt(rejected_clusters, clst_file)

            if not (all_systems or rejected_clusters):
                logger.info("No Systems found in this dataset.")

    logger.info("END")