def test1_stats(self):
    """Check the statistics reported by calc_stats with one and two CPUs."""
    for n_procs in (1, 2):
        stats = calc_stats(self.motifs, self.fg_fa, self.bg_fa, ncpus=n_procs)

        # All expected statistics are present for the first motif
        for stat_name in self.stat_functions:
            self.assertIn(stat_name, list(stats.values())[0])

        # Two motifs
        self.assertEqual(2, len(stats))

        m1 = "T-box_M1713_1.01_CTAGGTGTGAA"  # not enriched
        m2 = "p53_Average_8_CATGyCnGGrCATGy"  # highly enriched

        # (assertion, motif, statistic, bound), evaluated in this order
        expectations = (
            (self.assertLess, m1, "roc_auc", 0.9),
            (self.assertGreater, m2, "roc_auc", 0.5),
            (self.assertEqual, m1, "recall_at_fdr", 0.0),
            (self.assertGreater, m2, "recall_at_fdr", 0.8),
            (self.assertGreater, m1, "ks_pvalue", 0.01),
            (self.assertLess, m2, "ks_pvalue", 0.001),
            (self.assertGreater, m1, "phyper_at_fpr", 0.1),
            (self.assertLess, m2, "phyper_at_fpr", 1e-16),
        )
        for check, motif_id, stat_name, bound in expectations:
            check(stats[motif_id][stat_name], bound)

    # Only calculate specific statistic
    stats = calc_stats(self.motifs, self.fg_fa, self.bg_fa, stats=["roc_auc"])

    self.assertEqual(1, len(list(stats.values())[0]))

    for check, motif_id, bound in (
        (self.assertLess, m1, 0.9),
        (self.assertGreater, m2, 0.5),
    ):
        check(stats[motif_id]["roc_auc"], bound)
def get_roc_values(motif, fg_file, bg_file):
    """Calculate the ROC curve values of a motif for ROC plots.

    Parameters
    ----------
    motif :
        Motif to evaluate; passed to ``calc_stats`` as-is. Its ``id``
        attribute is printed when the calculation fails.
    fg_file :
        Foreground input, passed to ``calc_stats``.
    bg_file :
        Background input, passed to ``calc_stats``.

    Returns
    -------
    tuple
        ``(None, x, y)`` where ``x`` and ``y`` are unpacked from the
        "roc_values" statistic of the first entry returned by
        ``calc_stats``. The first element is always ``None``.

    Raises
    ------
    Exception
        Any exception raised during the calculation is re-raised after
        the motif and its id have been printed.
    """
    try:
        stats = calc_stats(motif, fg_file, bg_file, stats=["roc_values"], ncpus=1)
        (x, y) = list(stats.values())[0]["roc_values"]
        return None, x, y
    except Exception:
        # Identify the offending motif, then propagate the original error.
        print(motif)
        print(motif.id)
        raise
def test1_stats(kwargs, stat_functions):
    """Check the statistics reported by calc_stats with one and two CPUs.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments for ``calc_stats``. The "ncpus" entry is set
        (overwritten in place) for every run.
    stat_functions : iterable of str
        Names of functions in ``rocmetrics`` that are expected as keys in
        the per-motif statistics.
    """
    for ncpus in [1, 2]:
        kwargs["ncpus"] = ncpus
        stats = calc_stats(**kwargs)

        for f in stat_functions:
            # Statistics with input type "pos" are not checked when the
            # input is supplied as "fg_table".
            if "fg_table" not in kwargs or getattr(rocmetrics, f).input_type != "pos":
                # Report the table actually in use; the bare name
                # `fg_table` is not defined inside this function.
                print(f, kwargs.get("fg_table"), getattr(rocmetrics, f).input_type)
                assert f in list(stats.values())[0]

        # Two motifs
        assert 2 == len(stats)

        m1 = "T-box_M1713_1.01_CTAGGTGTGAA"  # not enriched
        m2 = "p53_Average_8_CATGyCnGGrCATGy"  # highly enriched

        assert stats[m1]["roc_auc"] < 0.9
        assert stats[m2]["roc_auc"] > 0.5

        assert stats[m1]["recall_at_fdr"] == 0.0
        assert stats[m2]["recall_at_fdr"] > 0.8

        # ks_pvalue is only checked when no "fg_table" is supplied
        if "fg_table" not in kwargs:
            assert stats[m1]["ks_pvalue"] > 0.01
            assert stats[m2]["ks_pvalue"] < 0.001

        assert stats[m1]["phyper_at_fpr"] > 0.1
        assert stats[m2]["phyper_at_fpr"] < 1e-13
def test_one_statistic(kwargs):
    """Check a calc_stats run that reports a single statistic per motif."""
    # Only calculate specific statistic
    stats = calc_stats(**kwargs)
    first_motif_stats = list(stats.values())[0]
    assert 1 == len(first_motif_stats)

    depleted_id = "T-box_M1713_1.01_CTAGGTGTGAA"  # not enriched
    enriched_id = "p53_Average_8_CATGyCnGGrCATGy"  # highly enriched

    assert stats[depleted_id]["roc_auc"] < 0.9
    assert stats[enriched_id]["roc_auc"] > 0.5
def test2_stats_single_motif(self):
    """Check calc_stats when given one motif object instead of a motif file."""
    m_id = "p53_Average_8_CATGyCnGGrCATGy"

    # Pick the motif whose string representation equals the wanted id
    with open(self.motifs) as handle:
        candidates = read_motifs(handle)
        selected = [cand for cand in candidates if str(cand) == m_id]
    motif = selected[0]

    result = calc_stats(motif, self.fg_fa, self.bg_fa, stats=["roc_auc"])
    self.assertGreater(result[m_id]["roc_auc"], 0.9)
def test2_stats_single_motif(self):
    """Compute the ROC AUC for a single motif read from the motif file."""
    m_id = "p53_Average_8_CATGyCnGGrCATGy"

    with open(self.motifs) as motif_file:
        all_motifs = read_motifs(motif_file)
        # Keep only the motif whose string form matches the id
        hits = [entry for entry in all_motifs if str(entry) == m_id]
    motif = hits[0]

    single_stats = calc_stats(motif, self.fg_fa, self.bg_fa, stats=["roc_auc"])
    auc = single_stats[m_id]["roc_auc"]
    self.assertGreater(auc, 0.9)
def test2_stats_single_motif(kwargs):
    """Check calc_stats when "motifs" is replaced by one motif object."""
    m_id = "p53_Average_8_CATGyCnGGrCATGy"

    # Select the motif whose string representation equals the wanted id
    matching = [cand for cand in read_motifs(kwargs["motifs"]) if str(cand) == m_id]
    kwargs["motifs"] = matching[0]

    result = calc_stats(**kwargs)
    assert result[m_id]["roc_auc"] > 0.9
def mp_calc_stats(motifs, fg_fa, bg_fa, bg_name=None):
    """Parallel calculation of motif statistics.

    Runs ``calc_stats`` with ``ncpus=1`` and labels the result with the
    name of the background.

    Parameters
    ----------
    motifs :
        Motifs, passed to ``calc_stats``.
    fg_fa :
        Foreground input, passed to ``calc_stats``.
    bg_fa :
        Background input, passed to ``calc_stats``.
    bg_name : str, optional
        Label for the background; "default" is used when it is empty or
        not given.

    Returns
    -------
    tuple
        ``(bg_name, stats)``.

    Raises
    ------
    Exception
        Any exception raised by ``calc_stats`` is re-raised after the
        message has been written to stderr.
    """
    try:
        stats = calc_stats(motifs, fg_fa, bg_fa, ncpus=1)
    except Exception as e:
        # Report before re-raising; previously the bare `raise` came first
        # and made the message unreachable.
        sys.stderr.write("ERROR: {}\n".format(str(e)))
        raise

    if not bg_name:
        bg_name = "default"

    return bg_name, stats
def test1_stats(self):
    """Verify motif statistics for single- and dual-process runs."""
    for cpu_count in (1, 2):
        stats = calc_stats(self.motifs, self.fg_fa, self.bg_fa, ncpus=cpu_count)

        # Each expected statistic must be present for the first motif
        for func_name in self.stat_functions:
            self.assertIn(func_name, list(stats.values())[0])

        # Two motifs
        self.assertEqual(2, len(stats))

        depleted = "T-box_M1713_1.01_CTAGGTGTGAA"  # not enriched
        enriched = "p53_Average_8_CATGyCnGGrCATGy"  # highly enriched

        self.assertLess(stats[depleted]["roc_auc"], 0.9)
        self.assertGreater(stats[enriched]["roc_auc"], 0.5)

        self.assertEqual(stats[depleted]["recall_at_fdr"], 0.0)
        self.assertGreater(stats[enriched]["recall_at_fdr"], 0.8)

        self.assertGreater(stats[depleted]["ks_pvalue"], 0.01)
        self.assertLess(stats[enriched]["ks_pvalue"], 0.001)

        self.assertGreater(stats[depleted]["phyper_at_fpr"], 0.1)
        self.assertLess(stats[enriched]["phyper_at_fpr"], 1e-16)

    # Only calculate specific statistic
    stats = calc_stats(self.motifs, self.fg_fa, self.bg_fa, stats=["roc_auc"])

    first_motif_stats = list(stats.values())[0]
    self.assertEqual(1, len(first_motif_stats))

    self.assertLess(stats[depleted]["roc_auc"], 0.9)
    self.assertGreater(stats[enriched]["roc_auc"], 0.5)
def create_denovo_motif_report(inputfile, pfmfile, fgfa, background, locfa, outdir, params, stats=None):
    """Create text and graphical (.html) motif reports.

    Parameters
    ----------
    inputfile :
        Passed on to the text and graphical report writers.
    pfmfile :
        Motif file, read with ``read_motifs(..., fmt="pwm")``.
    fgfa :
        Foreground FASTA, used for the ROC plots and the statistics.
    background : dict
        Maps a background name to its FASTA file.
    locfa :
        FASTA file used for the localization plots.
    outdir : str
        Output directory; histograms are written to
        ``<outdir>/images/<motif id>_histogram.svg``.
    params : dict or None
        Optional settings. "genome" is passed to ``create_roc_plots`` and
        "cutoff_fpr" (default 0.9) to ``motif_localization``.
    stats : dict, optional
        Statistics as ``stats[motif][background]``. Calculated with
        ``calc_stats`` for every background when not supplied.
    """
    logger.info("creating de novo reports")

    # Normalise params before its first use. It used to be indexed with
    # params["genome"] before the emptiness guard, so None or {} crashed.
    if not params:
        params = {}

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    # NOTE(review): genome is None when params has no "genome" entry;
    # confirm create_roc_plots accepts that.
    create_roc_plots(pfmfile, fgfa, background, outdir, params.get("genome"))

    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            for m, s in calc_stats(fg_file=fgfa, bg_file=bgfa, motifs=motifs).items():
                stats.setdefault(m, {})[bg] = s

    stats = add_star(stats)

    cutoff_fpr = params.get("cutoff_fpr", 0.9)
    # Median sequence length of the localization FASTA
    lsize = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug(" {} {}".format(motif.id, motif))
        outfile = os.path.join(outdir, "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lsize, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pfmfile, background, closest_match, outdir, stats)
def get_roc_values(motif, fg_file, bg_file, genome):
    """Return the ROC curve values of a motif as ``(None, x, y)``.

    If the calculation fails, the motif, its id and the error message are
    printed and the exception is re-raised.
    """
    try:
        result = calc_stats(
            motifs=motif,
            fg_file=fg_file,
            bg_file=bg_file,
            genome=genome,
            stats=["roc_values"],
            ncpus=1,
        )
        # Only the first entry of the result is used
        first_entry = list(result.values())[0]
        x, y = first_entry["roc_values"]
    except Exception as e:
        print(motif)
        print(motif.id)
        print(str(e))
        raise
    return None, x, y
def mp_calc_stats(motifs, fg_fa, bg_fa, zscore, gc, genome, bg_name=None):
    """Parallel calculation of motif statistics.

    Runs ``calc_stats`` with ``ncpus=1`` and labels the result with the
    name of the background.

    Parameters
    ----------
    motifs :
        Motifs, passed to ``calc_stats``.
    fg_fa :
        Foreground input, passed as ``fg_file``.
    bg_fa :
        Background input, passed as ``bg_file``.
    zscore, gc, genome :
        Passed to ``calc_stats`` under the same keyword names.
    bg_name : str, optional
        Label for the background; "default" is used when it is empty or
        not given.

    Returns
    -------
    tuple
        ``(bg_name, stats)``.

    Raises
    ------
    Exception
        Any exception raised by ``calc_stats`` is re-raised after the
        message has been written to stderr.
    """
    try:
        stats = calc_stats(
            motifs=motifs,
            fg_file=fg_fa,
            bg_file=bg_fa,
            ncpus=1,
            zscore=zscore,
            gc=gc,
            genome=genome,
        )
    except Exception as e:
        # No fallback value is assigned: the error always propagates.
        sys.stderr.write("ERROR: {}\n".format(str(e)))
        raise

    if not bg_name:
        bg_name = "default"

    return bg_name, stats
def get_roc_values(motif, fg_file, bg_file):
    """Calculate the ROC curve values of a motif for ROC plots.

    Parameters
    ----------
    motif :
        Motif to evaluate; passed to ``calc_stats`` as-is. Its ``id``
        attribute is printed when the calculation fails.
    fg_file :
        Foreground input, passed to ``calc_stats``.
    bg_file :
        Background input, passed to ``calc_stats``.

    Returns
    -------
    tuple
        ``(None, x, y)`` where ``x`` and ``y`` are unpacked from the
        "roc_values" statistic of the first entry returned by
        ``calc_stats``. The first element is always ``None``.

    Raises
    ------
    Exception
        Any exception raised during the calculation is re-raised after
        the motif and its id have been printed.
    """
    try:
        stats = calc_stats(motif, fg_file, bg_file, stats=["roc_values"], ncpus=1)
        (x, y) = list(stats.values())[0]["roc_values"]
        return None, x, y
    except Exception:
        # The former `error = e` / `return error, [], []` fallback sat
        # behind the bare raise and could never execute.
        print(motif)
        print(motif.id)
        raise
def create_denovo_motif_report(inputfile, pwmfile, fgfa, background, locfa, outdir, params, stats=None):
    """Write the text and graphical (.html) reports for de novo motifs.

    Statistics are calculated for every background when ``stats`` is not
    supplied. ``params`` may be empty; "cutoff_fpr" defaults to 0.9.
    """
    logger.info("creating reports")

    motifs = read_motifs(pwmfile, fmt="pwm")

    # ROC plots
    create_roc_plots(pwmfile, fgfa, background, outdir)

    # Closest match in database
    closest_match = MotifComparer().get_closest_match(motifs)

    if stats is None:
        # Build stats[motif][background] from one calc_stats run per background
        stats = {}
        for bg_name, bg_fasta in background.items():
            per_motif = calc_stats(motifs, fgfa, bg_fasta)
            for motif_id, motif_stats in per_motif.items():
                stats.setdefault(motif_id, {})[bg_name] = motif_stats

    stats = add_star(stats)

    params = params if params else {}
    cutoff_fpr = params.get('cutoff_fpr', 0.9)
    seq_lengths = [len(seq) for seq in Fasta(locfa).seqs]
    lwidth = np.median(seq_lengths)

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug(" {} {}".format(motif.id, motif))
        image_name = "images/{}_histogram.svg".format(motif.id)
        outfile = os.path.join(outdir, image_name)
        motif_localization(locfa, motif, lwidth, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pwmfile, background, closest_match, outdir, stats)
def best_motif_in_cluster(
    single_pwm,
    clus_pwm,
    clusters,
    fg_fa,
    background,
    genome,
    stats=None,
    metrics=("roc_auc", "recall_at_fdr"),
):
    """Return the best motif per cluster for a clustering results.

    The motif can be either the average motif or one of the clustered motifs.

    Parameters
    ----------
    single_pwm : str
        Filename of motifs.

    clus_pwm : str
        Filename of motifs.

    clusters :
        Motif clustering result.

    fg_fa : str
        Filename of FASTA file.

    background : dict
        Dictionary for background file names.

    genome : str
        Genome name.

    stats : dict, optional
        If statistics are not supplied they will be computed. A supplied
        dictionary is updated in place with the newly computed statistics
        and a "num_cluster" entry for every best motif.

    metrics : sequence, optional
        Metrics to use for motif evaluation. Default are "roc_auc" and
        "recall_at_fdr".

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    # The default used to reach the membership test below as None and
    # raise a TypeError.
    if stats is None:
        stats = {}

    # combine original and clustered motifs
    motifs = read_motifs(single_pwm) + read_motifs(clus_pwm)
    motifs = {str(m): m for m in motifs}

    # get the statistics for those motifs that were not yet checked
    clustered_motifs = []
    for clus, singles in clusters:
        for motif in set([clus] + singles):
            if str(motif) not in stats:
                clustered_motifs.append(motifs[str(motif)])

    new_stats = {}
    for bg, bg_fa in background.items():
        for m, s in calc_stats(fg_file=fg_fa, bg_file=bg_fa, motifs=clustered_motifs, genome=genome).items():
            new_stats.setdefault(m, {})[bg] = s
    stats.update(new_stats)

    rank = rank_motifs(stats, metrics)

    # rank the motifs
    best_motifs = []
    for clus, singles in clusters:
        if len(singles) > 1:
            # NOTE(review): eval_motifs aliases singles, so the append below
            # also grows the caller's list and the "num_cluster" count.
            # `clus not in motifs` also compares a motif object against the
            # string keys of the dict. Kept as-is; confirm both are intended.
            eval_motifs = singles
            if clus not in motifs:
                eval_motifs.append(clus)
            eval_motifs = [motifs[str(e)] for e in eval_motifs]
            best_motif = sorted(eval_motifs, key=lambda x: rank[str(x)])[-1]
            best_motifs.append(best_motif)
        else:
            best_motifs.append(clus)
        for bg in background:
            stats[str(best_motifs[-1])][bg]["num_cluster"] = len(singles)

    best_motifs = sorted(best_motifs, key=lambda x: rank[str(x)], reverse=True)
    return best_motifs
def roc(args):
    """Calculate ROC_AUC and other metrics and optionally plot ROC curve.

    Parameters
    ----------
    args :
        Object with the attributes ``outfile`` (ROC plot image, optional),
        ``pwmfile`` (motif file), ``ids`` (comma-separated motif ids,
        optional), ``sample``, ``background`` and ``genome`` (passed to
        ``calc_stats``) and ``outdir`` (report directory, optional).

    The metrics table is written to ``<outdir>/gimme.roc.report.txt`` and
    an HTML report is generated when ``args.outdir`` is set. Otherwise the
    table is written to stdout.
    """
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"

    # Read everything while the file is open so the handle can be closed.
    with open(args.pwmfile) as pwm_handle:
        motifs = list(read_motifs(pwm_handle, fmt="pwm"))

    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = [m.id for m in motifs]
    motifs = [m for m in motifs if (m.id in ids)]

    stats = [
        "phyper_at_fpr",
        "roc_auc",
        "enr_at_fpr",
        "max_enrichment",
        "recall_at_fdr",
        "roc_values",
        "matches_at_fpr",
    ]

    motif_stats = calc_stats(motifs, args.sample, args.background, genome=args.genome, stats=stats)

    plot_x = []
    plot_y = []
    legend = []

    report_path = None
    f_out = sys.stdout
    if args.outdir:
        if not os.path.exists(args.outdir):
            os.makedirs(args.outdir)
        report_path = args.outdir + "/gimme.roc.report.txt"
        f_out = open(report_path, "w")

    try:
        # Print the metrics
        f_out.write(
            "Motif\t# matches\t# matches background\tP-value\tlog10 P-value\tROC AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
        )

        for motif in motifs:
            m_stats = motif_stats[str(motif)]
            if outputfile:
                x, y = m_stats["roc_values"]
                plot_x.append(x)
                plot_y.append(y)
                legend.append(motif.id)
            # A p-value of 0 cannot be log-transformed; report infinity
            log_pvalue = np.inf
            if m_stats["phyper_at_fpr"] > 0:
                log_pvalue = -np.log10(m_stats["phyper_at_fpr"])
            f_out.write(
                "{}\t{:d}\t{:d}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format(
                    motif.id,
                    m_stats["matches_at_fpr"][0],
                    m_stats["matches_at_fpr"][1],
                    m_stats["phyper_at_fpr"],
                    log_pvalue,
                    m_stats["roc_auc"],
                    m_stats["enr_at_fpr"],
                    m_stats["recall_at_fdr"],
                ))
    finally:
        # Only close a file opened here; closing sys.stdout would break
        # all later output of the process.
        if f_out is not sys.stdout:
            f_out.close()

    if args.outdir:
        html_report(
            args.outdir,
            report_path,
            args.pwmfile,
            0.01,
        )

    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=legend)
def best_motif_in_cluster(single_pwm, clus_pwm, clusters, fg_fa, background, stats=None, metrics=("roc_auc", "recall_at_fdr")):
    """Return the best motif per cluster for a clustering results.

    The motif can be either the average motif or one of the clustered motifs.

    Parameters
    ----------
    single_pwm : str
        Filename of motifs.

    clus_pwm : str
        Filename of motifs.

    clusters :
        Motif clustering result.

    fg_fa : str
        Filename of FASTA file.

    background : dict
        Dictionary for background file names.

    stats : dict, optional
        If statistics are not supplied they will be computed. A supplied
        dictionary is updated in place with the newly computed statistics
        and a "num_cluster" entry for every best motif.

    metrics : sequence, optional
        Metrics to use for motif evaluation. Default are "roc_auc" and
        "recall_at_fdr".

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    # The default used to reach the membership test below as None and
    # raise a TypeError.
    if stats is None:
        stats = {}

    # combine original and clustered motifs
    motifs = read_motifs(single_pwm) + read_motifs(clus_pwm)
    motifs = {str(m): m for m in motifs}

    # get the statistics for those motifs that were not yet checked
    clustered_motifs = []
    for clus, singles in clusters:
        for motif in set([clus] + singles):
            if str(motif) not in stats:
                clustered_motifs.append(motifs[str(motif)])

    new_stats = {}
    for bg, bg_fa in background.items():
        for m, s in calc_stats(clustered_motifs, fg_fa, bg_fa).items():
            new_stats.setdefault(m, {})[bg] = s
    stats.update(new_stats)

    rank = rank_motifs(stats, metrics)

    # rank the motifs
    best_motifs = []
    for clus, singles in clusters:
        if len(singles) > 1:
            # NOTE(review): eval_motifs aliases singles, so the append below
            # also grows the caller's list and the "num_cluster" count.
            # `clus not in motifs` also compares a motif object against the
            # string keys of the dict. Kept as-is; confirm both are intended.
            eval_motifs = singles
            if clus not in motifs:
                eval_motifs.append(clus)
            eval_motifs = [motifs[str(e)] for e in eval_motifs]
            best_motif = sorted(eval_motifs, key=lambda x: rank[str(x)])[-1]
            best_motifs.append(best_motif)
        else:
            best_motifs.append(clus)
        for bg in background:
            stats[str(best_motifs[-1])][bg]["num_cluster"] = len(singles)

    best_motifs = sorted(best_motifs, key=lambda x: rank[str(x)], reverse=True)
    return best_motifs