def match(args):
    """Print the closest database match for every sample motif; optionally plot.

    Query motifs are read from ``args.pwmfile`` and database motifs from
    ``args.dbpwmfile``. For every query the closest database motif is looked
    up ("partial" match, "wic" metric, "mean" combine) and one tab-separated
    row (motif, match, score, p-value) is printed to stdout. When
    ``args.img`` is truthy the aligned motif pairs are handed to
    ``match_plot`` together with ``args.img``.

    Args:
        args: object exposing ``pwmfile``, ``dbpwmfile`` and ``img``.

    Returns:
        None.
    """
    sample = dict((m.id, m) for m in pwmfile_to_motifs(args.pwmfile))
    db = dict((m.id, m) for m in pwmfile_to_motifs(args.dbpwmfile))

    mc = MotifComparer()
    result = mc.get_closest_match(
        sample.values(), db.values(), "partial", "wic", "mean"
    )

    # print(...) with a single argument parses on Python 2 and Python 3 alike;
    # the former print statements were a syntax error on Python 3.
    print("Motif\tMatch\tScore\tP-value")

    plotdata = []
    for query, best in result.items():
        motif = sample[query]
        dbmotif = db[best[0]]

        # A single comparison feeds both the table row and the plot alignment
        # (previously the identical call was repeated in a second loop).
        pval, pos, orient = mc.compare_motifs(
            motif, dbmotif, "partial", "wic", "mean", pval=True
        )
        print("%s\t%s\t%0.2f\t%0.3e" % (query, best[0], best[1][0], pval))

        if not args.img:
            continue

        if orient == -1:
            # Re-apply the id to the object returned by rc().
            db_id = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = db_id

        if pos < 0:
            # Negative offset: left-pad the query with uniform 0.25 columns.
            query_id = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = query_id
        elif pos > 0:
            # Positive offset: left-pad the database motif instead.
            db_id = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = db_id

        plotdata.append((motif, dbmotif, pval))

    if args.img:
        match_plot(plotdata, args.img)
def determine_closest_match(self, motifs):
    """Return the closest database motif and its p-value for each motif.

    The database file name is taken from the "motif_db" default parameter and
    resolved against the configured motif directory. Files ending in "pwm" or
    "pfm" are read in "pwm" format, files ending in "transfac" in "transfac"
    format; for any other ending the database is left empty.

    Args:
        motifs: motif objects with an ``id`` attribute. The collection is
            passed to the comparer and then iterated again, so it must be
            re-iterable (e.g. a list).

    Returns:
        dict mapping each motif id to ``[closest_db_motif, pval]``.
    """
    self.logger.debug("Determining closest matching motifs in database")

    motif_db = self.config.get_default_params()["motif_db"]
    db = os.path.join(self.config.get_motif_dir(), motif_db)

    db_motifs = []
    if db.endswith(("pwm", "pfm")):
        # Use a context manager so the handle is closed; the bare open() call
        # used here before was never closed.
        with open(db) as handle:
            db_motifs = read_motifs(handle, fmt="pwm")
    elif db.endswith("transfac"):
        db_motifs = read_motifs(db, fmt="transfac")

    mc = MotifComparer()
    db_motif_lookup = {m.id: m for m in db_motifs}
    match = mc.get_closest_match(
        motifs, db_motifs, "partial", "wic", "mean", parallel=False
    )

    closest_match = {}
    for motif in motifs:
        best = db_motif_lookup[match[motif.id][0]]
        # Only the p-value is kept; position and orientation are not needed.
        pval, _pos, _orient = mc.compare_motifs(
            motif, best, "partial", "wic", "mean", pval=True
        )
        closest_match[motif.id] = [best, pval]
    return closest_match
def match(args):
    """Report, and optionally draw, the best database hit per sample motif.

    Motifs are loaded from ``args.pwmfile`` (queries) and ``args.dbpwmfile``
    (database). A tab-separated table with motif, match, score and p-value is
    printed to stdout. If ``args.img`` is truthy, each query/hit pair is
    aligned (reverse complement and left padding as indicated by the
    comparison) and passed to ``match_plot``.

    Args:
        args: object exposing ``pwmfile``, ``dbpwmfile`` and ``img``.
    """

    def _left_pad(m, width):
        """Return a motif with ``width`` uniform 0.25 columns before ``m``'s pwm, same id."""
        ident = m.id
        padded = Motif([[0.25, 0.25, 0.25, 0.25]] * width + m.pwm)
        padded.id = ident
        return padded

    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    mc = MotifComparer()
    result = mc.get_closest_match(
        sample.values(), db.values(), "partial", "wic", "mean"
    )

    # Text table
    print("Motif\tMatch\tScore\tP-value")
    for name, hit in result.items():
        pval, _pos, _orient = mc.compare_motifs(
            sample[name], db[hit[0]], "partial", "wic", "mean", pval=True
        )
        print("%s\t%s\t%0.2f\t%0.3e" % (name, hit[0], hit[1][0], pval))

    if not args.img:
        return

    # Aligned pairs for the image
    plotdata = []
    for name, hit in result.items():
        query_motif = sample[name]
        db_motif = db[hit[0]]
        pval, pos, orient = mc.compare_motifs(
            query_motif, db_motif, "partial", "wic", "mean", pval=True
        )

        if orient == -1:
            ident = db_motif.id
            db_motif = db_motif.rc()
            db_motif.id = ident

        if pos < 0:
            query_motif = _left_pad(query_motif, -pos)
        elif pos > 0:
            db_motif = _left_pad(db_motif, pos)

        plotdata.append((query_motif, db_motif, pval))

    match_plot(plotdata, args.img)
def ap1_included(motifs):
    """Tell whether ``motifs`` contains a motif close to the consensus TGASTCA.

    The consensus motif is compared against ``motifs`` with the "seqcor"
    metric; the closest match is printed and its first score is tested
    against 0.75.

    Args:
        motifs: the motifs to search, passed as-is to ``get_closest_match``.

    Returns:
        bool: True when the first score of the closest match is >= 0.75.
    """
    consensus_motif = motif_from_consensus("TGASTCA")
    closest = MotifComparer().get_closest_match(
        consensus_motif, motifs, metric="seqcor"
    )
    print(closest)

    top_score = closest["TGASTCA"][1][0]
    return bool(top_score >= 0.75)
def ap1_included(self, motifs):
    """Tell whether any motif matches the consensus TGASTCA with p < 1e-6.

    Each motif is compared to the consensus motif individually; the search
    stops at the first one whose fourth score entry (index 3) is below 1e-6.

    Args:
        motifs: iterable of motifs, each passed to ``get_closest_match``.

    Returns:
        bool: True if at least one motif passes the threshold.
    """
    consensus_motif = motif_from_consensus("TGASTCA")
    comparer = MotifComparer()
    return any(
        comparer.get_closest_match(consensus_motif, candidate)["TGASTCA"][1][3] < 1e-6
        for candidate in motifs
    )
def ap1_included(self, motifs):
    """Tell whether ``motifs`` contains a motif close to the consensus TGASTCA.

    Uses the "seqcor" metric, prints the closest match, and tests its first
    score against 0.8.

    Args:
        motifs: the motifs to search, passed as-is to ``get_closest_match``.

    Returns:
        bool: True when the first score of the closest match is >= 0.8.
    """
    # NOTE: there is no guard for an empty ``motifs``; an earlier
    # ``len(motifs) == 0`` check was disabled in this function.
    reference = motif_from_consensus("TGASTCA")
    comparer = MotifComparer()
    best = comparer.get_closest_match(reference, motifs, metric="seqcor")
    print(best)

    score = best["TGASTCA"][1][0]
    return True if score >= 0.8 else False
def create_denovo_motif_report(
    inputfile, pfmfile, fgfa, background, locfa, outdir, params, stats=None
):
    """Create text and graphical (.html) motif reports.

    Args:
        inputfile: forwarded to the text and graphical report writers.
        pfmfile: motif file, read in "pwm" format.
        fgfa: foreground sequences, used for the ROC plots and statistics.
        background: dict mapping a background name to its sequence file.
        locfa: FASTA file used for the localization histograms.
        outdir: output directory; histograms go to ``images/`` beneath it.
        params: optional mapping; "genome" and "cutoff_fpr" (default 0.9)
            are read from it. May be None or empty.
        stats: optional precomputed statistics. When None they are computed
            per motif and per background with ``calc_stats``.
    """
    logger.info("creating de novo reports")

    # Normalise ``params`` before first use. The guard used to sit after
    # ``params["genome"]`` had already been read, so it could never protect
    # against a missing or None ``params``.
    if not params:
        params = {}

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    # NOTE(review): genome is None when "genome" is not configured; confirm
    # create_roc_plots accepts that.
    create_roc_plots(pfmfile, fgfa, background, outdir, params.get("genome"))

    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            bg_stats = calc_stats(fg_file=fgfa, bg_file=bgfa, motifs=motifs)
            for m, s in bg_stats.items():
                stats.setdefault(m, {})[bg] = s

    stats = add_star(stats)

    cutoff_fpr = params.get("cutoff_fpr", 0.9)
    # Median sequence length of the localization FASTA
    lsize = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug(" {} {}".format(motif.id, motif))
        outfile = os.path.join(outdir, "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lsize, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(
        inputfile, pfmfile, background, closest_match, outdir, stats
    )
def test1_closest_match(self):
    """Closest database match for the TATA-box test motif."""
    comparer = MotifComparer()
    closest = comparer.get_closest_match("test/data/pwmscan/TATA.pwm")

    self.assertIn("TATA-box", closest)
    hit = closest["TATA-box"]
    self.assertEqual("GM.5.0.TBP.0001", hit[0])

    hit_scores = hit[1]
    self.assertAlmostEqual(-0.1041, hit_scores[0], 4)
    self.assertEqual(0, hit_scores[1])
    self.assertEqual(1, hit_scores[2])
    # NOTE: without ``places`` the difference is rounded to 7 decimals, which
    # is a loose bound for a value of this magnitude.
    self.assertAlmostEqual(3.1666e-8, hit_scores[3])
def test1_closest_match(self):
    """The TATA-box motif should map to GM.5.0.TBP.0001 with known scores."""
    pwm_path = "test/data/pwmscan/TATA.pwm"
    result = MotifComparer().get_closest_match(pwm_path)

    self.assertIn('TATA-box', result)
    best = result['TATA-box']
    self.assertEqual('GM.5.0.TBP.0001', best[0])

    values = best[1]
    self.assertAlmostEqual(-0.1041, values[0], 4)
    # Entries 1 and 2 are checked for exact equality
    for index, expected in ((1, 0), (2, 1)):
        self.assertEqual(expected, values[index])
    # Default tolerance (7 decimal places) applies to this last check
    self.assertAlmostEqual(3.1666e-8, values[3])
def test1_closest_match(self):
    """The TATA-box motif should map to TBP_Average_1 with known scores."""
    motif_file = "test/data/pwmscan/TATA.pwm"
    comparer = MotifComparer()
    matches = comparer.get_closest_match(motif_file)
    self.assertIn('TATA-box', matches)

    tata_hit = matches['TATA-box']
    self.assertEqual('TBP_Average_1', tata_hit[0])

    score_row = tata_hit[1]
    self.assertAlmostEqual(-0.3276, score_row[0], 4)
    self.assertEqual(-1, score_row[1])
    self.assertEqual(1, score_row[2])
    # No ``places`` given, so the default 7-decimal comparison is used
    self.assertAlmostEqual(4.134e-7, score_row[3])
def test1_denovo(self):
    """Run de novo prediction and check outputs, log and AP-1 recovery."""
    run_params = {
        "tools": "BioProspector,Homer,MDmodule",
        "fraction": 0.5,
        "background": "random",
        "genome": "test/data/background/genome.fa",
    }
    gimme_motifs(
        "test/data/denovo/input.fa",
        self.outdir,
        params=run_params,
        filter_significant=True,
        cluster=True,
    )

    # NOTE(review): "gimme.clustereds.html" may be a typo for
    # "gimme.clustered.html" - confirm against the report writer.
    expected_files = (
        "gimme.denovo.pfm",
        "gimme.denovo.html",
        "gimme.clustereds.html",
        "params.txt",
        "stats.random.txt",
    )

    log_path = os.path.join(self.outdir, "gimmemotifs.log")
    with open(log_path) as handle:
        log_text = handle.read()
    self.assertIn("clustering", log_text)

    # Check if all output files are there
    for name in expected_files:
        path = os.path.join(self.outdir, name)
        self.assertTrue(os.path.exists(path))

    # Check if correct motif is predicted
    with open(os.path.join(self.outdir, "gimme.denovo.pfm")) as handle:
        predicted = read_motifs(handle)

    ap1 = motif_from_consensus("TGASTCA")
    comparer = MotifComparer()
    # Stops at the first predicted motif whose p-value is below 1e-5
    ap1_found = any(
        comparer.get_closest_match(ap1, candidate)["TGASTCA"][1][3] < 1e-5
        for candidate in predicted
    )
    self.assertTrue(ap1_found)
def determine_closest_match(self, motifs):
    """Look up the closest database motif, with p-value, for every motif.

    The motif database path is built from the configured motif directory and
    the "motif_db" default parameter. A path ending in "pwm" or "pfm" is read
    in "pwm" format and one ending in "transfac" in "transfac" format; any
    other ending leaves the database empty.

    Args:
        motifs: motif objects with an ``id`` attribute; must be re-iterable
            because it is handed to the comparer and then looped over again.

    Returns:
        dict: ``{motif_id: [closest_db_motif, pval]}``.
    """
    self.logger.debug("Determining closest matching motifs in database")

    params = self.config.get_default_params()
    db = os.path.join(self.config.get_motif_dir(), params["motif_db"])

    if db.endswith(("pwm", "pfm")):
        # The file object used to be created inline and never closed; the
        # context manager releases it as soon as the motifs are read.
        with open(db) as db_file:
            db_motifs = read_motifs(db_file, fmt="pwm")
    elif db.endswith("transfac"):
        db_motifs = read_motifs(db, fmt="transfac")
    else:
        db_motifs = []

    mc = MotifComparer()
    db_motif_lookup = {m.id: m for m in db_motifs}
    match = mc.get_closest_match(
        motifs, db_motifs, "partial", "wic", "mean", parallel=False
    )

    closest_match = {}
    for motif in motifs:
        db_motif = db_motif_lookup[match[motif.id][0]]
        # Calculate p-value (position and orientation are not used)
        pval, _, _ = mc.compare_motifs(
            motif, db_motif, "partial", "wic", "mean", pval=True
        )
        closest_match[motif.id] = [db_motif, pval]

    return closest_match
def test1_denovo(self):
    """Run de novo prediction and verify the log, output files and AP-1 motif."""
    gimme_motifs(
        "test/data/denovo/input.fa",
        self.outdir,
        params={
            "tools": "BioProspector,Homer,MDmodule",
            "fraction": 0.5,
            "background": "random",
        },
        filter_significant=True,
        cluster=True,
    )

    out = self.outdir
    with open(os.path.join(out, 'gimmemotifs.log')) as log_file:
        log_contents = log_file.read()
    self.assertIn("clustering", log_contents)

    # Check if all output files are there
    for fname in (
        "motifs.pwm",
        "motif_report.html",
        "cluster_report.html",
        "params.txt",
        "stats.random.txt",
    ):
        self.assertTrue(os.path.exists(os.path.join(out, fname)))

    # Check if correct motif is predicted
    with open(os.path.join(out, "motifs.pwm")) as pwm_file:
        predicted = read_motifs(pwm_file)

    ap1 = motif_from_consensus("TGASTCA")
    comparer = MotifComparer()
    # Lazy generator: comparisons stop at the first p-value below the cutoff
    pvalues = (
        comparer.get_closest_match(ap1, candidate)["TGASTCA"][1][3]
        for candidate in predicted
    )
    self.assertTrue(any(p < 1e-5 for p in pvalues))
def create_denovo_motif_report(
    inputfile, pwmfile, fgfa, background, locfa, outdir, params, stats=None
):
    """Write the text and graphical (.html) reports for a set of motifs.

    Args:
        inputfile: forwarded to both report writers.
        pwmfile: motif file, read in "pwm" format.
        fgfa: foreground sequences for ROC plots and statistics.
        background: dict mapping a background name to its sequence file.
        locfa: FASTA file used for the localization histograms.
        outdir: output directory; histograms go to ``images/`` beneath it.
        params: optional mapping; only "cutoff_fpr" (default 0.9) is read.
        stats: optional precomputed statistics; computed with ``calc_stats``
            per background when None.
    """
    logger.info("creating reports")

    motif_list = read_motifs(pwmfile, fmt="pwm")

    # ROC plots
    create_roc_plots(pwmfile, fgfa, background, outdir)

    # Closest match in database
    comparer = MotifComparer()
    closest_match = comparer.get_closest_match(motif_list)

    if stats is None:
        stats = {}
        for bg_name, bg_fasta in background.items():
            per_motif = calc_stats(motif_list, fgfa, bg_fasta)
            for motif_key, motif_stats in per_motif.items():
                stats.setdefault(motif_key, {})[bg_name] = motif_stats

    stats = add_star(stats)

    if not params:
        params = {}
    cutoff_fpr = params.get('cutoff_fpr', 0.9)

    seq_lengths = [len(seq) for seq in Fasta(locfa).seqs]
    lwidth = np.median(seq_lengths)

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motif_list:
        logger.debug(" {} {}".format(motif.id, motif))
        histogram = os.path.join(
            outdir, "images/{}_histogram.svg".format(motif.id)
        )
        motif_localization(locfa, motif, lwidth, histogram, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motif_list, closest_match, stats, outdir)
    _create_graphical_report(
        inputfile, pwmfile, background, closest_match, outdir, stats
    )
def motifs(args):
    """Calculate ROC_AUC and other metrics and optionally plot ROC curve.

    Steps, in order: prepare the input regions, obtain or create a
    background, collect known and/or de novo motifs, scan input and
    background, write per-motif statistics to ``gimme.roc.report.txt``,
    select non-redundant motifs, write their matches as BED files and,
    if requested, create the HTML reports.

    Args:
        args: object exposing ``outdir``, ``sample``, ``size``, ``genome``,
            ``background``, ``pfmfile``, ``known``, ``denovo``, ``tools``,
            ``analysis``, ``ncpus`` and ``report``. ``args.zscore`` and
            ``args.gc`` are set to False when no genome is given.

    Raises:
        ValueError: if ``args.outdir`` is None.
    """
    if args.outdir is None:
        raise ValueError("an output directory is required!")
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    scan_dir = os.path.join(args.outdir, "motif_scan_results")
    if not os.path.exists(scan_dir):
        os.makedirs(scan_dir)

    file_type = determine_file_type(args.sample)

    outfile = os.path.join(args.outdir, f"input.w{args.size}.bed")
    sample = args.sample
    if file_type == "narrowpeak":
        narrowpeak_to_bed(args.sample, outfile, size=args.size)
        sample = outfile
    elif args.size and args.size > 0:
        if file_type == "fasta":
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning("size parameter will be ignored for FASTA input")
        elif file_type == "bed":
            write_equalsize_bedfile(args.sample, args.size, outfile)
            sample = outfile

    genome = args.genome
    if genome is None:
        args.zscore = False
        args.gc = False

    bgfile = None
    bg = args.background
    if bg is None:
        bg = "random" if genome is None else "gc"

    if os.path.isfile(bg):
        bgfile = bg
        bg = "custom"
    else:
        # create background if not provided
        bgfile = os.path.join(args.outdir, "generated_background.{}.fa".format(bg))
        # A missing (None), zero or negative size means "no fixed size".
        # The previous ``size <= 0`` test raised TypeError for None, although
        # the input handling above already tolerates a None size.
        size = args.size if args.size and args.size > 0 else None
        if bg == "gc":
            logger.info("creating background (matched GC%)")
        else:
            logger.info("creating background (random)")

        create_background_file(
            bgfile,
            bg,
            fmt="fasta",
            genome=genome,
            inputfile=sample,
            size=size,
            number=10000,
        )

    pfmfile = args.pfmfile

    motif_list = []
    if args.known:
        motif_list = read_motifs(pfmfile, fmt="pfm")

    if args.denovo:
        gimme_motifs(
            sample,
            args.outdir,
            params={
                "tools": args.tools,
                "analysis": args.analysis,
                "background": bg,
                "custom_background": bgfile,
                "genome": args.genome,
                "size": args.size,
            },
        )
        denovo = read_motifs(os.path.join(args.outdir, "gimme.denovo.pfm"))
        mc = MotifComparer()
        result = mc.get_closest_match(denovo, dbmotifs=pfmfile, metric="seqcor")
        match_motifs = read_motifs(pfmfile, as_dict=True)

        # Start the combined motif-to-factor map from the database's own map,
        # if one exists next to the database file.
        new_map_file = os.path.join(args.outdir, "combined.motif2factors.txt")
        base = os.path.splitext(pfmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            shutil.copyfile(map_file, new_map_file)

        # From here on, work with the combined (known + de novo) motif file.
        motif_list += denovo
        pfmfile = os.path.join(args.outdir, "combined.pfm")
        with open(pfmfile, "w") as f:
            for m in motif_list:
                print(m.to_pwm(), file=f)

        with open(new_map_file, "a") as f:
            for m in denovo:
                print(
                    "{}\t{}\t{}\t{}".format(m.id, "de novo", "GimmeMotifs", "Y"),
                    file=f,
                )
                closest_id = result[m.id][0]
                if closest_id in match_motifs:
                    # Pass on the direct factors of the closest known motif.
                    for factor in match_motifs[closest_id].factors["direct"]:
                        print(
                            "{}\t{}\t{}\t{}".format(
                                m.id, factor, "inferred (GimmeMotifs)", "N"
                            ),
                            file=f,
                        )
    else:
        logger.info("skipping de novo")

    stats = [
        "phyper_at_fpr",
        "roc_auc",
        "pr_auc",
        "enr_at_fpr",
        "recall_at_fdr",
        "roc_values",
        "matches_at_fpr",
    ]

    report_file = args.outdir + "/gimme.roc.report.txt"

    # Create a table with the best score per motif for all motifs.
    # This has three reasons:
    # * Can be used to calculate statistics;
    # * Can be used to select a set of non-redundant motifs;
    # * These files are included in the output and can be used for further analysis.
    score_table = os.path.join(scan_dir, "input.motif.score.txt")
    bg_score_table = os.path.join(scan_dir, "background.motif.score.txt")

    # args.outdir is always set here (checked at the top), so the report is
    # always written to a file. The context manager closes it on errors too.
    with open(report_file, "w") as f_out:
        # Print the metrics
        f_out.write(
            "Motif\t# matches\t% matches input\t# matches background\t%matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
        )

        logger.info("creating motif scan tables")
        # NOTE(review): args.zscore / args.gc are cleared above when no genome
        # is given, but the scans pass zscore=True and gcnorm=True
        # unconditionally - confirm this is intended.
        for infile, table in [(sample, score_table), (bgfile, bg_score_table)]:
            scan_to_file(
                infile,
                pfmfile,
                filepath_or_buffer=table,
                score_table=True,
                genome=args.genome,
                zscore=True,
                gcnorm=True,
            )

        n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0]
        n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0]

        logger.info("calculating stats")
        for motif_stats in calc_stats_iterator(
            motifs=pfmfile,
            fg_table=score_table,
            bg_table=bg_score_table,
            stats=stats,
            ncpus=args.ncpus,
        ):
            for motif in motif_list:
                key = str(motif)
                if key not in motif_stats:
                    continue
                entry = motif_stats[key]

                # A p-value of 0 has no finite log; report infinity instead.
                log_pvalue = np.inf
                if entry["phyper_at_fpr"] > 0:
                    log_pvalue = -np.log10(entry["phyper_at_fpr"])

                f_out.write(
                    "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format(
                        motif.id,
                        entry["matches_at_fpr"][0],
                        entry["matches_at_fpr"][0] / n_input * 100,
                        entry["matches_at_fpr"][1],
                        entry["matches_at_fpr"][1] / n_background * 100,
                        entry["phyper_at_fpr"],
                        log_pvalue,
                        entry["roc_auc"],
                        entry["pr_auc"],
                        entry["enr_at_fpr"],
                        entry["recall_at_fdr"],
                    )
                )

    # Select a set of "non-redundant" motifs.
    # Using Recursive Feature Elimination, a set of motifs is selected that
    # best explains the peaks in comparison to the background sequences.
    nr_motifs = select_nonredundant_motifs(
        report_file,
        pfmfile,
        score_table,
        bg_score_table,
        tolerance=0.001,
    )

    # Provide BED files with motif scan results for the non-redundant motifs
    # At the moment this is not ideal, as scanning is now performed twice
    # for this set of non-redundant motifs.
    motif_dict = {m.id: m for m in motif_list}
    for motif in nr_motifs:
        with NamedTemporaryFile(mode="w") as f:
            print(motif_dict[motif].to_pwm(), file=f)
            # Flush so the scanner, which opens the file by name, sees the motif.
            f.flush()
            safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)
            scan_to_file(
                sample,
                f.name,
                filepath_or_buffer=os.path.join(scan_dir, f"{safe_name}.matches.bed"),
                bed=True,
                fpr=0.01,
                genome=args.genome,
                zscore=True,
                gcnorm=True,
            )

    if args.report:
        logger.info("creating statistics report")
        if args.outdir:
            roc_html_report(
                args.outdir,
                report_file,
                pfmfile,
                threshold=0.01,
                outname="gimme.motifs.redundant.html",
                link_matches=False,
            )
            roc_html_report(
                args.outdir,
                report_file,
                pfmfile,
                threshold=0.01,
                use_motifs=nr_motifs,
                link_matches=True,
            )
            logger.info(
                f"gimme motifs final report: {os.path.join(args.outdir, 'gimme.motifs.html')}"
            )