Beispiel #1
0
    def test_get_best_hits_4_func(self):
        """
        Check which of two competing loners get_best_hit_4_func selects.

        Three scenarios are exercised, one per comparison criterion:
        the default one (labelled "score" in this test), 'i_eval' and
        'profile_coverage'. A last check verifies that get_best_hits raises
        a MacsypyError when the criterion is unknown.
        """
        model = Model("foo/T2SS", 10)
        gene_name = "gspD"
        core_gene = CoreGene(self.models_location, gene_name, self.profile_factory)
        gene_gspd = ModelGene(core_gene, model, loner=True)

        def build_loners(i_evals, scores, profile_coverages):
            """Build the two loners; each one has the other hit as counterpart."""
            # presumed positional order of the hit arguments:
            # gene, id, hit_seq_len, replicon_name, position, i_eval,
            # score, profile_coverage, sequence_coverage, begin, end
            hit_0 = CoreHit(gene_gspd, "PSAE001c01_006940", 803, "PSAE001c01", 3450, i_evals[0],
                            scores[0], profile_coverages[0], (741.0 - 104.0 + 1) / 803, 104, 741)
            hit_1 = CoreHit(gene_gspd, "PSAE001c01_013980", 759, "PSAE001c01", 3450, i_evals[1],
                            scores[1], profile_coverages[1], (736.0 - 105.0 + 1) / 759, 105, 736)
            model_hit_0 = ModelHit(hit_0, gene_gspd, GeneStatus.ACCESSORY)
            model_hit_1 = ModelHit(hit_1, gene_gspd, GeneStatus.ACCESSORY)
            loner_0 = Loner(hit_0, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY,
                            counterpart=[model_hit_1])
            loner_1 = Loner(hit_1, gene_ref=gene_gspd, gene_status=GeneStatus.ACCESSORY,
                            counterpart=[model_hit_0])
            return loner_0, loner_1

        # --- criterion: score (default key) -> the second loner is expected
        first, second = build_loners(i_evals=(1.2e-234, 3.7e-76),
                                     scores=(10, 11),
                                     profile_coverages=(1.0, 1.0))
        best = get_best_hit_4_func(gene_name, [first, second])
        self.assertEqual(best, second)

        # --- criterion: i_eval -> the first loner is expected
        first, second = build_loners(i_evals=(10, 11),
                                     scores=(10, 10),
                                     profile_coverages=(1.0, 1.0))
        best = get_best_hit_4_func(gene_name, [first, second], key='i_eval')
        self.assertEqual(best, first)

        # --- criterion: profile_coverage -> the second loner is expected
        first, second = build_loners(i_evals=(10, 10),
                                     scores=(10, 10),
                                     profile_coverages=(10, 11))
        best = get_best_hit_4_func(gene_name, [first, second], key='profile_coverage')
        self.assertEqual(best, second)

        # --- unknown criterion must be rejected
        expected_msg = ('The criterion for Hits comparison nimportnaoik does not exist or is not available.\n'
                        'It must be either "score", "i_eval" or "profile_coverage".')
        with self.assertRaises(MacsypyError) as ctx:
            get_best_hits([first, second], key='nimportnaoik')
        self.assertEqual(expected_msg, str(ctx.exception))
Beispiel #2
0
    def test_get_best_hits(self):
        """
        Check which of two competing hits get_best_hits puts first.

        Both hits share the same replicon name and position; they are compared
        with the default criterion (labelled "score" in this test), then with
        'i_eval' and 'profile_coverage'. An unknown criterion must raise
        a MacsypyError.
        """
        model = Model("foo/T2SS", 10)
        gene_name = "gspD"
        core_gene = CoreGene(self.models_location, gene_name, self.profile_factory)
        gene_gspd = ModelGene(core_gene, model)

        def build_hits(i_evals, scores, profile_coverages):
            """Build the two competing hits with the given per-hit values."""
            # presumed positional order of the hit arguments:
            # gene, id, hit_seq_len, replicon_name, position, i_eval,
            # score, profile_coverage, sequence_coverage, begin, end
            hit_0 = Hit(gene_gspd, "PSAE001c01_006940", 803, "PSAE001c01", 3450, i_evals[0],
                        scores[0], profile_coverages[0], (741.0 - 104.0 + 1) / 803, 104, 741)
            hit_1 = Hit(gene_gspd, "PSAE001c01_013980", 759, "PSAE001c01", 3450, i_evals[1],
                        scores[1], profile_coverages[1], (736.0 - 105.0 + 1) / 759, 105, 736)
            return hit_0, hit_1

        # --- criterion: score (default key) -> the second hit is expected
        first, second = build_hits(i_evals=(1.2e-234, 3.7e-76),
                                   scores=(10, 11),
                                   profile_coverages=(1.0, 1.0))
        best_hits = get_best_hits([first, second])
        self.assertEqual(best_hits[0], second)

        # --- criterion: i_eval -> the first hit is expected
        first, second = build_hits(i_evals=(10, 11),
                                   scores=(10, 10),
                                   profile_coverages=(1.0, 1.0))
        best_hits = get_best_hits([first, second], key='i_eval')
        self.assertEqual(best_hits[0], first)

        # --- criterion: profile_coverage -> the second hit is expected
        first, second = build_hits(i_evals=(10, 10),
                                   scores=(10, 10),
                                   profile_coverages=(10, 11))
        best_hits = get_best_hits([first, second], key='profile_coverage')
        self.assertEqual(best_hits[0], second)

        # --- unknown criterion must be rejected
        expected_msg = ('The criterion for Hits comparison nimportnaoik does not exist or is not available.\n'
                        'It must be either "score", "i_eval" or "profile_coverage".')
        with self.assertRaises(MacsypyError) as ctx:
            get_best_hits([first, second], key='nimportnaoik')
        self.assertEqual(expected_msg, str(ctx.exception))
Beispiel #3
0
def search_systems(config, model_bank, gene_bank, profile_factory, logger):
    """
    Do the job, this function is the orchestrator of all the macsyfinder mechanics.

    According to the original documentation, at the end of a run several files
    are produced containing the results:

      - macsyfinder.conf: The set of variables used to run this job
      - macsyfinder.systems: The list of the potential systems
      - macsyfinder.rejected_cluster: The list of all clusters and clusters combination
                                      which have been rejected and the reason
      - macsyfinder.log: the copy of the standard output

    The only file written directly in this function body is the copy of the
    configuration (``config.cfg_name``) saved in the working directory.

    The program exits (:func:`sys.exit`) if a requested model definition cannot be
    found in the registry or if the genes search fails.

    :param config: The MacSyFinder Configuration
    :type config: :class:`macsypy.config.Config` object
    :param model_bank: The bank populated with the available models
    :type model_bank: :class:`macsypy.model.ModelBank` object
    :param gene_bank: the bank containing all genes
    :type gene_bank: :class:`macsypy.gene.GeneBank` object
    :param profile_factory: The profile factory
    :type profile_factory: :class:`macsypy.gene.ProfileFactory`
    :param logger: The logger use to display information to the user.
                   It must be initialized. see :func:`macsypy.init_logger`
    :type logger: :class:`colorlog.Logger` object
    :return: the systems and rejected clusters found
             (two empty lists when no hit is detected)
    :rtype: ([:class:`macsypy.system.System`, ...], [:class:`macsypy.cluster.RejectedCluster`, ...])
    :raise AssertionError: if the configured db_type is not one of
                           'ordered_replicon', 'gembase' or 'unordered'
    """
    working_dir = config.working_dir()
    config.save(path_or_buf=os.path.join(working_dir, config.cfg_name))

    # register every model location found in the models directories
    registry = ModelRegistry()
    models_loc_available = scan_models_dir(
        config.models_dir(),
        profile_suffix=config.profile_suffix(),
        relative_path=config.relative_path())
    for model_loc in models_loc_available:
        registry.add(model_loc)

    # build indexes
    idx = Indexes(config)
    # NOTE(review): every other option in this function is read through a call
    # (config.working_dir(), config.db_type(), ...) whereas `config.idx` is passed
    # uncalled. If `idx` is an accessor like the others, `force` is always truthy;
    # confirm against the Config class before changing it.
    idx.build(force=config.idx)

    # create models
    parser = DefinitionParser(config, model_bank, gene_bank, registry,
                              profile_factory)
    try:
        models_def_to_detect = get_def_to_detect(config.models(), registry)
    except KeyError as err:
        sys.exit(f"macsyfinder: {err}")

    parser.parse(models_def_to_detect)

    logger.info(
        f"MacSyFinder's results will be stored in working_dir {working_dir}")
    logger.info(f"Analysis launched on {config.sequence_db()} for model(s):")

    for m in models_def_to_detect:
        logger.info(f"\t- {m.fqn}")

    models_to_detect = [
        model_bank[model_loc.fqn] for model_loc in models_def_to_detect
    ]

    # collect every gene to search, whatever its status in its model
    all_genes = []
    for model in models_to_detect:
        genes = model.mandatory_genes + model.accessory_genes + model.neutral_genes + model.forbidden_genes
        # Exchangeable (formerly homologs/analogs) are also added because they can "replace" an important gene...
        ex_genes = []
        for g in genes:
            ex_genes += g.exchangeables
        all_genes += (genes + ex_genes)

    #############################################
    # this part of code is executed in parallel
    #############################################
    try:
        all_reports = search_genes(all_genes, config)
    except Exception as err:
        # top-level boundary: turn any search failure into a clean program exit
        sys.exit(str(err))
    #############################################
    # end of parallel code
    #############################################
    all_hits = [hit for report in all_reports for hit in report.hits]

    if not all_hits:
        # No hits detected
        return [], []

    # group the hits by replicon
    hits_by_replicon = {}
    for hit in all_hits:
        hits_by_replicon.setdefault(hit.replicon_name, []).append(hit)

    # for each replicon, apply get_best_hits with the 'score' criterion,
    # then order the remaining hits by position
    for rep_name in hits_by_replicon:
        hits_by_replicon[rep_name] = get_best_hits(
            hits_by_replicon[rep_name], key='score')
        hits_by_replicon[rep_name].sort(key=attrgetter('position'))

    models_to_detect = sorted(models_to_detect, key=attrgetter('name'))
    db_type = config.db_type()
    if db_type in ('ordered_replicon', 'gembase'):
        systems, rejected_clusters = _search_in_ordered_replicon(
            hits_by_replicon, models_to_detect, config, logger)
        return systems, rejected_clusters
    if db_type == "unordered":
        likely_systems, rejected_hits = _search_in_unordered_replicon(
            hits_by_replicon, models_to_detect, logger)
        return likely_systems, rejected_hits

    # Explicit raise instead of `assert False`: assert statements are removed
    # when Python runs with -O, which would make this function return None.
    raise AssertionError(f"dbtype have an invalid value {db_type}")
def main(args=None, log_level=None) -> None:
    """
    Entry point of macsyprofile.

    Parse the hmmer outputs found in a previous macsyfinder run directory and
    write the hits in a report file (by default 'hmm_coverage.tsv' in the
    previous run directory, or the path given with --out).

    :param args: the arguments passed on the command line without the program name
                 (sys.argv[1:] is used when None)
    :type args: List of string
    :param log_level: the output verbosity (derived from the parsed verbosity when None)
    :type log_level: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raise FileNotFoundError: if the previous run path does not exist
    :raise ValueError: if the previous run path is not a directory,
                       if the directory of the output file does not exist,
                       if the report already exists and --force is not set,
                       or if the models directory cannot be found
    """
    global _log
    if args is None:
        args = sys.argv[1:]
    parsed_args = parse_args(args)

    if log_level is None:
        log_level = verbosity_to_log_level(parsed_args.verbosity)
    _log = init_logger(log_level, out=(not parsed_args.mute))

    # --- sanity checks on the previous run directory
    if not os.path.exists(parsed_args.previous_run):
        _log.critical(f"{parsed_args.previous_run}: No such directory.")
        sys.tracebacklimit = 0
        raise FileNotFoundError() from None
    if not os.path.isdir(parsed_args.previous_run):
        _log.critical(f"{parsed_args.previous_run} is not a directory.")
        sys.tracebacklimit = 0
        raise ValueError() from None

    defaults = MacsyDefaults(i_evalue_sel=1.0e9, coverage_profile=-1.0)
    cfg = Config(defaults, parsed_args)

    msf_run_path = cfg.previous_run()
    hmmer_results = os.path.join(msf_run_path, cfg.hmmer_dir())
    hmm_suffix = cfg.res_search_suffix()
    profile_suffix = cfg.profile_suffix()

    # --- where to write the report
    if not parsed_args.out:
        profile_report_path = os.path.join(cfg.previous_run(),
                                           'hmm_coverage.tsv')
    else:
        profile_report_path = os.path.normpath(parsed_args.out)
        out_dir = os.path.normpath(os.path.dirname(parsed_args.out))
        if not os.path.exists(out_dir):
            _log.critical(f"The {out_dir} directory is not writable")
            sys.tracebacklimit = 0
            raise ValueError() from None

    # never overwrite an existing report unless explicitly asked
    if not parsed_args.force and os.path.exists(profile_report_path):
        _log.critical(
            f"The file {profile_report_path} already exists. "
            f"Remove it or specify a new output name --out or use --force option"
        )
        sys.tracebacklimit = 0
        raise ValueError() from None

    hmmer_glob = os.path.join(hmmer_results, f"{parsed_args.pattern}{hmm_suffix}")
    hmmer_files = sorted(glob.glob(hmmer_glob))

    # --- locate the profiles of the first configured model family;
    # when several models directories contain it, the last one is used
    try:
        family_name = cfg.models()[0]
        candidate_dirs = [os.path.join(root, family_name) for root in cfg.models_dir()]
        existing_dirs = [path for path in candidate_dirs if os.path.exists(path)]
        model_dir = existing_dirs[-1]
        profiles_dir = os.path.join(model_dir, 'profiles')
    except IndexError:
        _log.critical(
            f"Cannot find models in conf file {msf_run_path}. "
            f"May be these results have been generated with an old version of macsyfinder."
        )
        sys.tracebacklimit = 0
        raise ValueError() from None

    _log.debug(f"hmmer_files: {hmmer_files}")
    all_hits = []
    with open(profile_report_path, 'w') as prof_out:
        print(header(args), file=prof_out)

        # parse every hmmer output and accumulate the hits
        for hmmer_out_path in hmmer_files:
            _log.info(f"parsing {hmmer_out_path}")
            gene_name = get_gene_name(hmmer_out_path, hmm_suffix)
            profile_path = os.path.join(profiles_dir,
                                        f"{gene_name}{profile_suffix}")
            profile_len = get_profile_len(profile_path)
            hmm_profile = HmmProfile(gene_name, profile_len, hmmer_out_path, cfg)
            all_hits.extend(hmm_profile.parse())

        if not all_hits:
            _log.info("No hit found")
        else:
            if parsed_args.best_hits:
                # group the hits by replicon, apply get_best_hits on each group
                # with the criterion given on the command line, then rebuild
                # the hit list with each group ordered by position
                per_replicon = {}
                for hit in all_hits:
                    per_replicon.setdefault(hit.replicon_name, []).append(hit)
                all_hits = []
                for replicon_hits in per_replicon.values():
                    best = get_best_hits(replicon_hits, key=parsed_args.best_hits)
                    all_hits.extend(sorted(best, key=lambda h: h.position))

            # final ordering of the report lines
            all_hits.sort(
                key=lambda h: (h.gene_name, h.replicon_name, h.position, h.score))
            _log.info(f"found {len(all_hits)} hits")
            for hit in all_hits:
                print(hit, file=prof_out)
            _log.info(f"result is in '{profile_report_path}'")