def test_inter_gene_max_space_cfg(self):
        # test inter_gene_max_space is specified from configuration
        # so this value must overload the value read from xml
        model_fqn = 'foo/model_5'

        inter_gene_max_space_cfg = [[model_fqn, '222']]
        self.args.inter_gene_max_space = inter_gene_max_space_cfg

        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        models_location = scan_models_dir(self.args.models_dir)
        for ml in models_location:
            self.model_registry.add(ml)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.inter_gene_max_space, 222)
    def test_min_genes_required_cfg(self):
        # test min_genes_required is specified from configuration
        # so this value must overload the value read from xml
        def_2_parse = set()
        model_fqn = 'foo/model_5'
        def_2_parse.add(model_fqn)
        parsed = set()

        min_genes_required = [[model_fqn, '4']]
        self.args.min_genes_required = min_genes_required

        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        models_location = scan_models_dir(self.args.models_dir)
        for ml in models_location:
            self.model_registry.add(ml)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.min_genes_required, 4)
Ejemplo n.º 3
0
    def test_get_def_to_detect(self):
        cmd_args = argparse.Namespace()
        cmd_args.models_dir = os.path.join(self._data_dir, 'fake_model_dir')
        cmd_args.models = ('set_1', 'def_1_1', 'def_1_2', 'def_1_3')
        registry = ModelRegistry()
        models_location = scan_models_dir(cmd_args.models_dir)
        for ml in models_location:
            registry.add(ml)

        # case where models are specified on command line
        res, model_family, model_vers = get_def_to_detect(
            ('set_1', ['def_1_1', 'def_1_2', 'def_1_3']), registry)
        model_loc = registry['set_1']
        self.assertEqual(model_family, 'set_1')
        self.assertEqual(model_vers, '0.0b2')
        exp = [
            model_loc.get_definition(name)
            for name in ('set_1/def_1_1', 'set_1/def_1_2', 'set_1/def_1_3')
        ]
        self.assertListEqual(res, exp)

        # case we search all models
        res, model_family, model_vers = get_def_to_detect(('set_1', ['all']),
                                                          registry)
        self.assertEqual(model_family, 'set_1')
        self.assertEqual(model_vers, '0.0b2')
        exp = model_loc.get_all_definitions()
        self.assertListEqual(res, exp)

        # case the models required does not exists
        with self.assertRaises(ValueError):
            get_def_to_detect(('set_1', ['FOO', 'BAR']), registry)
Ejemplo n.º 4
0
 def test_models(self):
     mr = ModelRegistry()
     model_complex_expected = ModelLocation(path=self.complex_dir)
     model_simple_expected = ModelLocation(path=self.simple_dir)
     mr.add(model_complex_expected)
     mr.add(model_simple_expected)
     models_received = sorted(mr.models())
     models_expected = sorted(
         [model_complex_expected, model_simple_expected])
     self.assertListEqual(models_received, models_expected)
Ejemplo n.º 5
0
    def test_add_get(self):
        mr = ModelRegistry()
        model_complex_expected = ModelLocation(path=self.complex_dir)
        with self.assertRaises(KeyError) as ctx:
            mr[model_complex_expected.name]

        self.assertEqual(str(ctx.exception),
                         '"No such model definition: \'complex\'"')
        mr.add(model_complex_expected)
        self.assertEqual(model_complex_expected,
                         mr[model_complex_expected.name])
Ejemplo n.º 6
0
def list_models(args):
    """
    :param args: The command line argument once parsed
    :type args: :class:`argparse.Namespace` object
    :return: a string representation of all models and submodels installed.
    :rtype: str
    """
    config = Config(MacsyDefaults(), args)
    registry = ModelRegistry()
    models_loc_available = scan_models_dir(
        config.models_dir(),
        profile_suffix=config.profile_suffix(),
        relative_path=config.relative_path())
    for model_loc in models_loc_available:
        registry.add(model_loc)
    return str(registry)
Ejemplo n.º 7
0
def _find_all_installed_packages() -> ModelRegistry:
    """
    :return: all models installed
    """
    defaults = MacsyDefaults()
    config = Config(defaults, argparse.Namespace())
    system_model_dir = config.models_dir()
    user_model_dir = os.path.join(os.path.expanduser('~'), '.macsyfinder', 'data')
    model_dirs = (system_model_dir, user_model_dir) if os.path.exists(user_model_dir) else (system_model_dir,)
    registry = ModelRegistry()
    for model_dir in model_dirs:
        try:
            for model_loc in scan_models_dir(model_dir, profile_suffix=config.profile_suffix):
                registry.add(model_loc)
        except PermissionError as err:
            _log.warning(f"{model_dir} is not readable: {err} : skip it.")
    return registry
Ejemplo n.º 8
0
    def test_str(self):
        mr = ModelRegistry()
        model_complex_expected = ModelLocation(path=self.complex_dir)
        model_simple_expected = ModelLocation(path=self.simple_dir)
        mr.add(model_complex_expected)
        mr.add(model_simple_expected)
        expected_output = """complex
        /subdef_1
                 /def_1_1
                 /def_1_2
        /subdef_2
                 /def_2_1
                 /def_2_2
simple
       /def_1
       /def_2
"""
        self.assertEqual(expected_output, str(mr))
    def setUp(self):
        defaults = MacsyDefaults()
        self.args = argparse.Namespace()
        self.args.sequence_db = self.find_data("base", "test_1.fasta")
        self.args.db_type = 'gembase'
        self.args.models_dir = self.find_data('models')
        self.args.res_search_dir = tempfile.gettempdir()

        self.cfg = Config(defaults, self.args)
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.profile_factory = ProfileFactory(self.cfg)
        self.model_registry = ModelRegistry()
        models_location = scan_models_dir(self.args.models_dir)
        for ml in models_location:
            self.model_registry.add(ml)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)
Ejemplo n.º 10
0
def _find_all_installed_packages(models_dir=None) -> ModelRegistry:
    """
    :return: all models installed
    """
    defaults = MacsyDefaults()
    args = argparse.Namespace()
    if models_dir is not None:
        args.models_dir = models_dir
    config = Config(defaults, args)
    model_dirs = config.models_dir()
    registry = ModelRegistry()
    for model_dir in model_dirs:
        try:
            for model_loc in scan_models_dir(
                    model_dir, profile_suffix=config.profile_suffix()):
                registry.add(model_loc)
        except PermissionError as err:
            _log.warning(f"{model_dir} is not readable: {err} : skip it.")
    return registry
    def test_max_nb_genes_cfg(self):
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        models_location = scan_models_dir(self.args.models_dir)
        for ml in models_location:
            self.model_registry.add(ml)

        # max_nb_genes is specified in xml
        # no user configuration on this
        self.cfg = Config(MacsyDefaults(), self.args)
        model_fqn = 'foo/model_6'  # 4 genes in this model but xml specify 3
        self.cfg = Config(MacsyDefaults(), self.args)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.max_nb_genes, 3)

        # max_nb_genes is specified from configuration
        # so this value must overload the value read from xml
        model_fqn = 'foo/model_5'  # 4 genes in this model
        max_nb_genes = [[model_fqn, '6']]
        self.args.max_nb_genes = max_nb_genes
        self.cfg = Config(MacsyDefaults(), self.args)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertEqual(m.max_nb_genes, 6)
    def test_multi_loci_cfg(self):
        # test multi_loci is specified from configuration
        # so this value must overload the value read from xml
        model_fqn = 'foo/model_5'

        self.args.multi_loci = model_fqn

        self.cfg = Config(MacsyDefaults(), self.args)
        self.model_bank = ModelBank()
        self.gene_bank = GeneBank()
        self.model_registry = ModelRegistry()
        models_location = scan_models_dir(self.args.models_dir)
        for ml in models_location:
            self.model_registry.add(ml)
        self.parser = DefinitionParser(self.cfg, self.model_bank,
                                       self.gene_bank, self.model_registry,
                                       self.profile_factory)

        models_2_detect = [
            self.model_registry['foo'].get_definition(model_fqn)
        ]
        self.parser.parse(models_2_detect)
        m = self.model_bank[model_fqn]
        self.assertTrue(m.multi_loci)
Ejemplo n.º 13
0
def search_systems(config, model_bank, gene_bank, profile_factory, logger):
    """
    Do the job, this function is the orchestrator of all the macsyfinder mechanics
    at the end several files are produced containing the results

      - macsyfinder.conf: The set of variables used to runt this job
      - macsyfinder.systems: The list of the potential systems
      - macsyfinder.rejected_cluster: The list of all clusters and clustrs combination
                                      which has been rejected and the reason
      - macsyfinder.log: the copy of the standard output

    :param config: The MacSyFinder Configuration
    :type config: :class:`macsypy.config.Config` object
    :param model_bank: The bank populated with the available models
    :type model_bank: :class:`macsypy.model.ModelBank` object
    :param gene_bank: the bank containing all genes
    :type gene_bank: :class:`macsypy.gene.GeneBank` object
    :param profile_factory: The profile factory
    :type profile_factory: :class:`macsypy.gene.ProfileFactory`
    :param logger: The logger use to display information to the user.
                   It must be initialized. see :func:`macsypy.init_logger`
    :type logger: :class:`colorlog.Logger` object
    :return: the systems and rejected clusters found
    :rtype: ([:class:`macsypy.system.System`, ...], [:class:`macsypy.cluster.RejectedCluster`, ...])
    """
    working_dir = config.working_dir()
    config.save(path_or_buf=os.path.join(working_dir, config.cfg_name))
    registry = ModelRegistry()
    models_loc_available = scan_models_dir(
        config.models_dir(),
        profile_suffix=config.profile_suffix(),
        relative_path=config.relative_path())
    for model_loc in models_loc_available:
        registry.add(model_loc)
    # build indexes
    idx = Indexes(config)
    idx.build(force=config.idx)

    # create models
    parser = DefinitionParser(config, model_bank, gene_bank, registry,
                              profile_factory)
    try:
        models_def_to_detect = get_def_to_detect(config.models(), registry)
    except KeyError as err:
        sys.exit(f"macsyfinder: {err}")

    parser.parse(models_def_to_detect)

    logger.info(
        f"MacSyFinder's results will be stored in working_dir{working_dir}")
    logger.info(f"Analysis launched on {config.sequence_db()} for model(s):")

    for m in models_def_to_detect:
        logger.info(f"\t- {m.fqn}")

    models_to_detect = [
        model_bank[model_loc.fqn] for model_loc in models_def_to_detect
    ]
    all_genes = []
    for model in models_to_detect:
        genes = model.mandatory_genes + model.accessory_genes + model.neutral_genes + model.forbidden_genes
        # Exchangeable (formerly homologs/analogs) are also added because they can "replace" an important gene...
        ex_genes = []

        for g in genes:
            ex_genes += g.exchangeables
        all_genes += (genes + ex_genes)
    #############################################
    # this part of code is executed in parallel
    #############################################
    try:
        all_reports = search_genes(all_genes, config)
    except Exception as err:
        sys.exit(str(err))
    #############################################
    # end of parallel code
    #############################################
    all_hits = [
        hit for subl in [report.hits for report in all_reports] for hit in subl
    ]

    if len(all_hits) > 0:
        # It's important to keep this sorting to have in last all_hits version
        # the hits with the same replicon_name and position sorted by score
        # the best score in first
        hits_by_replicon = {}
        for hit in all_hits:
            if hit.replicon_name in hits_by_replicon:
                hits_by_replicon[hit.replicon_name].append(hit)
            else:
                hits_by_replicon[hit.replicon_name] = [hit]

        for rep_name in hits_by_replicon:
            hits_by_replicon[rep_name] = get_best_hits(
                hits_by_replicon[rep_name], key='score')
            hits_by_replicon[rep_name].sort(key=attrgetter('position'))

        models_to_detect = sorted(models_to_detect, key=attrgetter('name'))
        db_type = config.db_type()
        if db_type in ('ordered_replicon', 'gembase'):
            systems, rejected_clusters = _search_in_ordered_replicon(
                hits_by_replicon, models_to_detect, config, logger)
            return systems, rejected_clusters
        elif db_type == "unordered":
            likely_systems, rejected_hits = _search_in_unordered_replicon(
                hits_by_replicon, models_to_detect, logger)
            return likely_systems, rejected_hits
        else:
            assert False, f"dbtype have an invalid value {db_type}"
    else:
        # No hits detected
        return [], []