def test_get_best_hits_4_func(self):
    """
    Exercise get_best_hit_4_func on two loner hits of the same gene, once per
    selection criterion, then check the error raised for an unknown criterion.
    """
    model = Model("foo/T2SS", 10)
    gene_name = "gspD"
    core_gene = CoreGene(self.models_location, gene_name, self.profile_factory)
    model_gene = ModelGene(core_gene, model, loner=True)

    # sequence coverage of each hit, identical in every scenario below
    seq_cov_0 = (741.0 - 104.0 + 1) / 803
    seq_cov_1 = (736.0 - 105.0 + 1) / 759

    def as_loners(hit_0, hit_1):
        # wrap both hits as accessory loners, each one listing the other as counterpart
        model_hit_0 = ModelHit(hit_0, model_gene, GeneStatus.ACCESSORY)
        model_hit_1 = ModelHit(hit_1, model_gene, GeneStatus.ACCESSORY)
        first = Loner(hit_0, gene_ref=model_gene, gene_status=GeneStatus.ACCESSORY,
                      counterpart=[model_hit_1])
        second = Loner(hit_1, gene_ref=model_gene, gene_status=GeneStatus.ACCESSORY,
                       counterpart=[model_hit_0])
        return first, second

    # positional arguments after the gene, as the assertions below rely on them:
    #   id, hit_seq_len, replicon_name, position, i_eval, score,
    #   profile_coverage, sequence_coverage, begin, end
    # (presumed from the values varied in each scenario -- verify against CoreHit)

    ######################
    # based on the score #
    ######################
    hit_0 = CoreHit(model_gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450,
                    1.2e-234, 10, 1.0, seq_cov_0, 104, 741)
    hit_1 = CoreHit(model_gene, "PSAE001c01_013980", 759, "PSAE001c01", 3450,
                    3.7e-76, 11, 1.0, seq_cov_1, 105, 736)
    loner_0, loner_1 = as_loners(hit_0, hit_1)
    best = get_best_hit_4_func(gene_name, [loner_0, loner_1])
    self.assertEqual(best, loner_1)

    #######################
    # based on the i_eval #
    #######################
    hit_0 = CoreHit(model_gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450,
                    10, 10, 1.0, seq_cov_0, 104, 741)
    hit_1 = CoreHit(model_gene, "PSAE001c01_013980", 759, "PSAE001c01", 3450,
                    11, 10, 1.0, seq_cov_1, 105, 736)
    loner_0, loner_1 = as_loners(hit_0, hit_1)
    best = get_best_hit_4_func(gene_name, [loner_0, loner_1], key='i_eval')
    self.assertEqual(best, loner_0)

    #################################
    # based on the profile_coverage #
    #################################
    hit_0 = CoreHit(model_gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450,
                    10, 10, 10, seq_cov_0, 104, 741)
    hit_1 = CoreHit(model_gene, "PSAE001c01_013980", 759, "PSAE001c01", 3450,
                    10, 10, 11, seq_cov_1, 105, 736)
    loner_0, loner_1 = as_loners(hit_0, hit_1)
    best = get_best_hit_4_func(gene_name, [loner_0, loner_1], key='profile_coverage')
    self.assertEqual(best, loner_1)

    # bad criterion
    # NOTE(review): this calls get_best_hits, not get_best_hit_4_func, so the
    # function under test is never checked with an invalid key -- confirm intent.
    with self.assertRaises(MacsypyError) as ctx:
        get_best_hits([loner_0, loner_1], key='nimportnaoik')
    self.assertEqual('The criterion for Hits comparison nimportnaoik does not exist or is not available.\n'
                     'It must be either "score", "i_eval" or "profile_coverage".',
                     str(ctx.exception))
def test_get_best_hits(self):
    """
    Exercise get_best_hits on two hits sharing replicon and position, once per
    selection criterion, then check the error raised for an unknown criterion.
    """
    model = Model("foo/T2SS", 10)
    gene_name = "gspD"
    core_gene = CoreGene(self.models_location, gene_name, self.profile_factory)
    model_gene = ModelGene(core_gene, model)

    # sequence coverage of each hit, identical in every scenario below
    seq_cov_0 = (741.0 - 104.0 + 1) / 803
    seq_cov_1 = (736.0 - 105.0 + 1) / 759

    # positional arguments after the gene, as the assertions below rely on them:
    #   id, hit_seq_len, replicon_name, position, i_eval, score,
    #   profile_coverage, sequence_coverage, begin, end
    # (presumed from the values varied in each scenario -- verify against Hit)

    ######################
    # based on the score #
    ######################
    hit_0 = Hit(model_gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450,
                1.2e-234, 10, 1.0, seq_cov_0, 104, 741)
    hit_1 = Hit(model_gene, "PSAE001c01_013980", 759, "PSAE001c01", 3450,
                3.7e-76, 11, 1.0, seq_cov_1, 105, 736)
    best_hits = get_best_hits([hit_0, hit_1])
    self.assertEqual(best_hits[0], hit_1)

    #######################
    # based on the i_eval #
    #######################
    hit_0 = Hit(model_gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450,
                10, 10, 1.0, seq_cov_0, 104, 741)
    hit_1 = Hit(model_gene, "PSAE001c01_013980", 759, "PSAE001c01", 3450,
                11, 10, 1.0, seq_cov_1, 105, 736)
    best_hits = get_best_hits([hit_0, hit_1], key='i_eval')
    self.assertEqual(best_hits[0], hit_0)

    #################################
    # based on the profile_coverage #
    #################################
    hit_0 = Hit(model_gene, "PSAE001c01_006940", 803, "PSAE001c01", 3450,
                10, 10, 10, seq_cov_0, 104, 741)
    hit_1 = Hit(model_gene, "PSAE001c01_013980", 759, "PSAE001c01", 3450,
                10, 10, 11, seq_cov_1, 105, 736)
    best_hits = get_best_hits([hit_0, hit_1], key='profile_coverage')
    self.assertEqual(best_hits[0], hit_1)

    # bad criterion
    with self.assertRaises(MacsypyError) as ctx:
        get_best_hits([hit_0, hit_1], key='nimportnaoik')
    self.assertEqual('The criterion for Hits comparison nimportnaoik does not exist or is not available.\n'
                     'It must be either "score", "i_eval" or "profile_coverage".',
                     str(ctx.exception))
def search_systems(config, model_bank, gene_bank, profile_factory, logger):
    """
    Do the job, this function is the orchestrator of all the macsyfinder mechanics
    at the end several files are produced containing the results

      - macsyfinder.conf: The set of variables used to run this job
      - macsyfinder.systems: The list of the potential systems
      - macsyfinder.rejected_cluster: The list of all clusters and clusters combination
                                      which has been rejected and the reason
      - macsyfinder.log: the copy of the standard output

    :param config: The MacSyFinder Configuration
    :type config: :class:`macsypy.config.Config` object
    :param model_bank: The bank populated with the available models
    :type model_bank: :class:`macsypy.model.ModelBank` object
    :param gene_bank: the bank containing all genes
    :type gene_bank: :class:`macsypy.gene.GeneBank` object
    :param profile_factory: The profile factory
    :type profile_factory: :class:`macsypy.gene.ProfileFactory`
    :param logger: The logger use to display information to the user.
                   It must be initialized. see :func:`macsypy.init_logger`
    :type logger: :class:`colorlog.Logger` object
    :return: the systems and rejected clusters found,
             or two empty lists when the search produced no hit
    :rtype: ([:class:`macsypy.system.System`, ...], [:class:`macsypy.cluster.RejectedCluster`, ...])
    :raises SystemExit: if a requested model is unknown to the registry (KeyError)
                        or if the genes search fails
    :raises AssertionError: if ``config.db_type()`` is none of
                            'ordered_replicon', 'gembase' or 'unordered'
    """
    working_dir = config.working_dir()
    config.save(path_or_buf=os.path.join(working_dir, config.cfg_name))

    registry = ModelRegistry()
    models_loc_available = scan_models_dir(config.models_dir(),
                                           profile_suffix=config.profile_suffix(),
                                           relative_path=config.relative_path())
    for model_loc in models_loc_available:
        registry.add(model_loc)

    # build indexes
    # NOTE(review): `config.idx` is passed without being called whereas the other
    # options of this function are read with `config.<name>()` -- confirm that
    # `idx` is a plain attribute and not an accessor (which would always be truthy).
    idx = Indexes(config)
    idx.build(force=config.idx)

    # create models
    parser = DefinitionParser(config, model_bank, gene_bank, registry, profile_factory)
    try:
        models_def_to_detect = get_def_to_detect(config.models(), registry)
    except KeyError as err:
        sys.exit(f"macsyfinder: {err}")

    parser.parse(models_def_to_detect)

    logger.info(f"MacSyFinder's results will be stored in working_dir: {working_dir}")
    logger.info(f"Analysis launched on {config.sequence_db()} for model(s):")
    for model_def in models_def_to_detect:
        logger.info(f"\t- {model_def.fqn}")

    models_to_detect = [model_bank[model_loc.fqn] for model_loc in models_def_to_detect]

    all_genes = []
    for model in models_to_detect:
        genes = model.mandatory_genes + model.accessory_genes + model.neutral_genes + model.forbidden_genes
        # Exchangeable (formerly homologs/analogs) are also added because they can "replace" an important gene...
        ex_genes = [ex_gene for gene in genes for ex_gene in gene.exchangeables]
        all_genes += genes + ex_genes

    #############################################
    # this part of code is executed in parallel
    #############################################
    try:
        all_reports = search_genes(all_genes, config)
    except Exception as err:
        sys.exit(str(err))
    #############################################
    # end of parallel code
    #############################################

    all_hits = [hit for report in all_reports for hit in report.hits]
    if not all_hits:
        # No hits detected
        return [], []

    # It's important to keep this sorting to have in last all_hits version
    # the hits with the same replicon_name and position sorted by score
    # the best score in first
    grouped_hits = {}
    for hit in all_hits:
        grouped_hits.setdefault(hit.replicon_name, []).append(hit)

    # for each replicon keep only the best hits, ordered along the replicon
    hits_by_replicon = {
        rep_name: sorted(get_best_hits(rep_hits, key='score'), key=attrgetter('position'))
        for rep_name, rep_hits in grouped_hits.items()
    }

    models_to_detect = sorted(models_to_detect, key=attrgetter('name'))
    db_type = config.db_type()
    if db_type in ('ordered_replicon', 'gembase'):
        systems, rejected_clusters = _search_in_ordered_replicon(hits_by_replicon, models_to_detect,
                                                                 config, logger)
        return systems, rejected_clusters
    if db_type == "unordered":
        likely_systems, rejected_hits = _search_in_unordered_replicon(hits_by_replicon, models_to_detect,
                                                                      logger)
        return likely_systems, rejected_hits
    # raised explicitly (instead of `assert False`) so that the check is not
    # stripped, and None silently returned, when python runs with -O
    raise AssertionError(f"dbtype have an invalid value {db_type}")
def main(args=None, log_level=None) -> None:
    """
    main entry point to macsyprofile

    Parse the hmmer outputs found in a previous run directory and write the
    hits, one per line after a header, in a report file.

    :param args: the arguments passed on the command line without the program name
    :type args: List of string
    :param log_level: the output verbosity
    :type log_level: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raises FileNotFoundError: if the previous run path does not exist
    :raises ValueError: if the previous run path is not a directory,
                        if the directory of --out is missing or not writable,
                        if the report already exists and --force is not set,
                        or if the models directory cannot be found
    """
    global _log
    args = sys.argv[1:] if args is None else args
    parsed_args = parse_args(args)

    if log_level is None:
        log_level = verbosity_to_log_level(parsed_args.verbosity)
    _log = init_logger(log_level, out=(not parsed_args.mute))

    if not os.path.exists(parsed_args.previous_run):
        _log.critical(f"{parsed_args.previous_run}: No such directory.")
        sys.tracebacklimit = 0
        raise FileNotFoundError() from None
    elif not os.path.isdir(parsed_args.previous_run):
        _log.critical(f"{parsed_args.previous_run} is not a directory.")
        sys.tracebacklimit = 0
        raise ValueError() from None

    defaults = MacsyDefaults(i_evalue_sel=1.0e9, coverage_profile=-1.0)
    cfg = Config(defaults, parsed_args)

    msf_run_path = cfg.previous_run()
    hmmer_results = os.path.join(msf_run_path, cfg.hmmer_dir())
    hmm_suffix = cfg.res_search_suffix()
    profile_suffix = cfg.profile_suffix()

    if parsed_args.out:
        profile_report_path = os.path.normpath(parsed_args.out)
        # normpath turns the empty dirname of a bare file name into '.'
        dirname = os.path.normpath(os.path.dirname(parsed_args.out))
        # the directory must exist AND allow file creation, otherwise the report
        # could not be opened below and the user would get a raw OSError
        if not (os.path.isdir(dirname) and os.access(dirname, os.W_OK | os.X_OK)):
            _log.critical(f"The {dirname} directory is not writable")
            sys.tracebacklimit = 0
            raise ValueError() from None
    else:
        profile_report_path = os.path.join(cfg.previous_run(), 'hmm_coverage.tsv')

    if os.path.exists(profile_report_path) and not parsed_args.force:
        _log.critical(f"The file {profile_report_path} already exists. "
                      f"Remove it or specify a new output name --out or use --force option")
        sys.tracebacklimit = 0
        raise ValueError() from None

    hmmer_files = sorted(glob.glob(os.path.join(hmmer_results, f"{parsed_args.pattern}{hmm_suffix}")))

    try:
        # IndexError is raised either if no model is recorded in the configuration
        # or if none of the models directories contains this model family
        model_familly_name = cfg.models()[0]
        candidate_dirs = [os.path.join(models_root, model_familly_name) for models_root in cfg.models_dir()]
        model_dir = [path for path in candidate_dirs if os.path.exists(path)][-1]
    except IndexError:
        _log.critical(f"Cannot find models in conf file {msf_run_path}. "
                      f"May be these results have been generated with an old version of macsyfinder.")
        sys.tracebacklimit = 0
        raise ValueError() from None
    profiles_dir = os.path.join(model_dir, 'profiles')

    _log.debug(f"hmmer_files: {hmmer_files}")
    all_hits = []
    with open(profile_report_path, 'w') as prof_out:
        print(header(args), file=prof_out)
        for hmmer_out_path in hmmer_files:
            _log.info(f"parsing {hmmer_out_path}")
            gene_name = get_gene_name(hmmer_out_path, hmm_suffix)
            profile_path = os.path.join(profiles_dir, f"{gene_name}{profile_suffix}")
            gene_profile_len = get_profile_len(profile_path)
            hmm = HmmProfile(gene_name, gene_profile_len, hmmer_out_path, cfg)
            all_hits += hmm.parse()

        if all_hits:
            if parsed_args.best_hits:
                # It's important to keep this sorting to have in last all_hits version
                # the hits with the same replicon_name and position sorted by score
                # the best score in first
                hits_by_replicon = {}
                for hit in all_hits:
                    hits_by_replicon.setdefault(hit.replicon_name, []).append(hit)

                # keep only the best hits of each replicon,
                # selected on the criterion given by the user
                all_hits = []
                for rep_hits in hits_by_replicon.values():
                    best_hits = get_best_hits(rep_hits, key=parsed_args.best_hits)
                    all_hits += sorted(best_hits, key=lambda h: h.position)

            all_hits = sorted(all_hits, key=lambda h: (h.gene_name, h.replicon_name, h.position, h.score))
            _log.info(f"found {len(all_hits)} hits")
            for hit in all_hits:
                print(hit, file=prof_out)
            _log.info(f"result is in '{profile_report_path}'")
        else:
            _log.info("No hit found")