def determine_closest_match(self, motifs):
    """Find the closest database motif for every motif in ``motifs``.

    The database file name comes from the ``motif_db`` default parameter and
    is resolved relative to the configured motif directory. Files ending in
    ``pwm``/``pfm`` are read as ``pwm`` format, files ending in ``transfac``
    as ``transfac`` format; any other extension yields an empty database.

    Parameters
    ----------
    motifs : iterable
        Motif objects exposing an ``id`` attribute.

    Returns
    -------
    dict
        Maps each motif id to ``[closest_db_motif, pval]``.
    """
    self.logger.debug("Determining closest matching motifs in database")

    motif_db = self.config.get_default_params()["motif_db"]
    db = os.path.join(self.config.get_motif_dir(), motif_db)

    db_motifs = []
    if db.endswith(("pwm", "pfm")):
        # Close the handle deterministically; the original left it open.
        with open(db) as handle:
            db_motifs = read_motifs(handle, fmt="pwm")
    elif db.endswith("transfac"):
        db_motifs = read_motifs(db, fmt="transfac")

    mc = MotifComparer()
    db_motif_lookup = {m.id: m for m in db_motifs}
    match = mc.get_closest_match(
        motifs, db_motifs, "partial", "wic", "mean", parallel=False
    )

    closest_match = {}
    for motif in motifs:
        best = db_motif_lookup[match[motif.id][0]]
        # Re-run the pairwise comparison with pval=True to obtain the
        # p-value; position and orientation are not needed here.
        pval, _pos, _orient = mc.compare_motifs(
            motif, best, "partial", "wic", "mean", pval=True
        )
        closest_match[motif.id] = [best, pval]
    return closest_match
def match(args):
    """Print, and optionally plot, the closest database match per motif.

    Reads the query motifs from ``args.pwmfile`` and the database motifs from
    ``args.dbpwmfile``, prints one tab-separated row per query motif and, when
    ``args.img`` is set, writes an alignment plot to that path.

    Parameters
    ----------
    args : namespace
        Must provide ``pwmfile``, ``dbpwmfile`` and ``img``.
    """
    sample = dict((m.id, m) for m in pwmfile_to_motifs(args.pwmfile))
    db = dict((m.id, m) for m in pwmfile_to_motifs(args.dbpwmfile))

    mc = MotifComparer()
    result = mc.get_closest_match(
        sample.values(), db.values(), "partial", "wic", "mean"
    )

    # print(...) with a single argument behaves identically as a Python 2
    # statement and a Python 3 function call.
    print("Motif\tMatch\tScore\tP-value")

    # Keep (pval, pos, orient) per query so the plotting step does not have
    # to repeat the p-value comparison for the very same pair of motifs.
    comparisons = {}
    for query, hit in result.items():
        pval, pos, orient = mc.compare_motifs(
            sample[query], db[hit[0]], "partial", "wic", "mean", pval=True
        )
        comparisons[query] = (pval, pos, orient)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, hit[0], hit[1][0], pval))

    if args.img:
        plotdata = []
        for query, hit in result.items():
            motif = sample[query]
            dbmotif = db[hit[0]]
            pval, pos, orient = comparisons[query]

            if orient == -1:
                # Plot the database motif on the matching strand, same id.
                tmp = dbmotif.id
                dbmotif = dbmotif.rc()
                dbmotif.id = tmp

            # Left-pad whichever motif starts later with uniform columns so
            # that both logos line up.
            if pos < 0:
                tmp = motif.id
                motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
                motif.id = tmp
            elif pos > 0:
                tmp = dbmotif.id
                dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
                dbmotif.id = tmp

            plotdata.append((motif, dbmotif, pval))
        match_plot(plotdata, args.img)
def cluster(self, threshold=0.5, metric="pcc", clust_method="average"):
    """Group the motifs of this list by hierarchical clustering.

    All-vs-all scores are computed with gimmemotifs' ``MotifComparer``,
    converted with ``generate_similarity_matrix`` and fed to ``linkage`` /
    ``fcluster``. As a side effect ``self.similarity_matrix`` and
    ``self.linkage_mat`` are (re)assigned.

    Parameters
    ----------
    threshold : float
        Cut-off handed to ``fcluster`` with ``criterion="distance"``.
    metric : str
        Comparison metric passed to ``get_all_scores``; one of seqcor, pcc,
        ed, distance, wic, chisq, akl or ssd.
    clust_method : str
        Value passed as ``method`` to ``linkage``.

    Returns
    -------
    dict
        A dictionary with keys=cluster names ("Cluster_<label>") and
        values=MotifList objects.
    """
    # gimmemotifs is required; both imports are kept exactly as before.
    from gimmemotifs.motif import Motif
    from gimmemotifs.comparison import MotifComparer

    # Importing gimmemotifs changes the plotting style globally, so it is
    # set back to "ticks" here.
    sns.set_style("ticks")

    # Collect the gimmemotifs representation of every motif in this list.
    gimme_objects = []
    for motif in self:
        gimme_objects.append(motif.get_gimmemotif().gimme_obj)

    # Similarities between all motifs
    comparer = MotifComparer()
    pairwise_scores = comparer.get_all_scores(
        gimme_objects, gimme_objects, match="total", metric=metric, combine="mean"
    )
    self.similarity_matrix = generate_similarity_matrix(pairwise_scores)

    # Clustering on the condensed form of the matrix
    condensed = ssd.squareform(self.similarity_matrix.to_numpy())
    self.linkage_mat = linkage(condensed, method=clust_method)

    # Flat cluster membership, one label per motif
    memberships = fcluster(self.linkage_mat, threshold, criterion="distance")

    # Collect the motifs per cluster, in order of first appearance
    clusters = {}
    for index, membership in enumerate(memberships):
        name = "Cluster_{0}".format(membership)
        if name not in clusters:
            clusters[name] = MotifList()
        clusters[name].append(self[index])

    return clusters
def create_consensus(self):
    """Create consensus motif from MotifList.

    The two most similar motifs (as picked by ``find_best_pair``) are merged
    repeatedly until a single motif is left. That motif's pwm values are
    rounded to 5 decimals in place, and it is converted to a OneMotif whose
    name is built from the first three member names (plus "(...)" if there
    are more).
    """
    remaining = [motif.gimme_obj for motif in self]    # gimmemotif objects

    if len(remaining) > 1:
        comparer = MotifComparer()

        # Initial all-vs-all scores
        scores = comparer.get_all_scores(
            remaining, remaining, match="total", metric="pcc", combine="mean"
        )

        while len(remaining) > 1:
            # Indices of the most similar pair, lowest index first
            pair = sorted(find_best_pair(remaining, scores))
            keep_idx, drop_idx = pair[0], pair[1]

            # Merge: the new motif takes the slot of the first one and the
            # second one is dropped from the list
            merged = merge_motifs(remaining[keep_idx], remaining[drop_idx])
            del remaining[drop_idx]
            remaining[keep_idx] = merged

            if len(remaining) == 1:
                # Done merging, no scores need to be refreshed
                break

            # Add the comparisons of the new motif to the score table
            scores[merged.id] = scores.get(merged.id, {})
            for other in remaining:
                scores[merged.id][other.id] = comparer.compare_motifs(
                    merged, other, metric="pcc")
                scores[other.id][merged.id] = comparer.compare_motifs(
                    other, merged, metric="pcc")

    # Round pwm values
    gimmemotif_consensus = remaining[0]
    gimmemotif_consensus.pwm = [
        [round(value, 5) for value in row] for row in gimmemotif_consensus.pwm
    ]

    # Convert back to OneMotif obj
    onemotif_consensus = gimmemotif_to_onemotif(gimmemotif_consensus)
    onemotif_consensus.gimme_obj = gimmemotif_consensus

    # Control the naming of the new motif
    member_names = [motif.name for motif in self]
    consensus_name = ",".join(member_names[:3])
    if len(member_names) > 3:
        consensus_name += "(...)"
    else:
        consensus_name += ""
    onemotif_consensus.name = consensus_name

    return onemotif_consensus
def match(args):
    """Report the closest database motif for every motif in a pwm file.

    Query motifs are read from ``args.pwmfile`` and database motifs from
    ``args.dbpwmfile``. A tab-separated table is printed to stdout; when
    ``args.img`` is set the aligned pairs are additionally handed to
    ``match_plot`` with ``args.img`` as the output.
    """
    def uniform_columns(count):
        # ``count`` background columns used to shift a motif to the right.
        return [[0.25, 0.25, 0.25, 0.25]] * count

    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    comparer = MotifComparer()
    result = comparer.get_closest_match(
        sample.values(), db.values(), "partial", "wic", "mean"
    )

    print("Motif\tMatch\tScore\tP-value")
    for query_id, hit in result.items():
        pval, pos, orient = comparer.compare_motifs(
            sample[query_id], db[hit[0]], "partial", "wic", "mean", pval=True
        )
        print("%s\t%s\t%0.2f\t%0.3e" % (query_id, hit[0], hit[1][0], pval))

    if not args.img:
        return

    plotdata = []
    for query_id, hit in result.items():
        query_motif = sample[query_id]
        db_motif = db[hit[0]]
        pval, pos, orient = comparer.compare_motifs(
            query_motif, db_motif, "partial", "wic", "mean", pval=True
        )

        if orient == -1:
            # Use the reverse complement, keeping the original id.
            original_id = db_motif.id
            db_motif = db_motif.rc()
            db_motif.id = original_id

        if pos < 0:
            original_id = query_motif.id
            query_motif = Motif(uniform_columns(-pos) + query_motif.pwm)
            query_motif.id = original_id
        elif pos > 0:
            original_id = db_motif.id
            db_motif = Motif(uniform_columns(pos) + db_motif.pwm)
            db_motif.id = original_id

        plotdata.append((query_motif, db_motif, pval))

    match_plot(plotdata, args.img)
def _create_images(outdir, clusters):
    """Write PNG logos for every cluster and its members into ``outdir``.

    Each cluster motif is trimmed (in place) with an IC cut-off of 0.2 and
    drawn. For clusters with more than one member, every member is compared
    to the cluster motif and drawn with a left offset relative to the
    smallest reported position, reverse-complemented when the reported
    strand is neither 1 nor "+".

    Returns a list of ``[cluster_id, {"src": ...}, [{"src": ..., "alt": ...}]]``
    entries, one per cluster.
    """
    trim_ic = 0.2
    comparer = MotifComparer()
    image_index = []

    sys.stderr.write("Creating images\n")
    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster_png = "%s.png" % cluster.id
        cluster.to_img(os.path.join(outdir, cluster_png), fmt="PNG")
        entry = [cluster.id, {"src": "%s.png" % cluster.id}, []]
        image_index.append(entry)

        if len(members) > 1:
            alignments = {}
            for member in members:
                alignments[member] = comparer.compare_motifs(
                    cluster, member, "total", "wic", "mean", pval=True
                )

            # Smallest position over all members: the reference offset.
            leftmost = min(result[1] for result in alignments.values())

            for member in members:
                _, pos, strand = alignments[member]
                shift = pos - leftmost

                drawn = member
                if strand not in [1, "+"]:
                    # Draw the reverse complement under the member's own id.
                    drawn = member.rc()
                    drawn.id = member.id

                drawn.to_img(
                    os.path.join(outdir, "%s.png" % drawn.id.replace(" ", "_")),
                    fmt="PNG",
                    add_left=shift,
                )

        entry[2] = [
            {
                "src": "%s.png" % m.id.replace(" ", "_"),
                "alt": m.id.replace(" ", "_"),
            }
            for m in members
        ]

    return image_index
def ap1_included(self, motifs):
    """Return True if any motif closely matches the AP1 consensus TGASTCA.

    Every motif is compared individually; the first one whose fourth score
    entry (presumably the p-value -- confirm against MotifComparer) is below
    1e-6 ends the search.
    """
    ap1 = motif_from_consensus("TGASTCA")
    comparer = MotifComparer()
    return any(
        comparer.get_closest_match(ap1, candidate)["TGASTCA"][1][3] < 1e-6
        for candidate in motifs
    )
def ap1_included(motifs):
    """Return True if the best match of AP1 (TGASTCA) among ``motifs`` scores >= 0.75.

    The comparison uses the "seqcor" metric; the match result is printed to
    stdout before the decision is made.
    """
    comparer = MotifComparer()
    closest = comparer.get_closest_match(
        motif_from_consensus("TGASTCA"), motifs, metric="seqcor"
    )
    print(closest)

    best_score = closest["TGASTCA"][1][0]
    return True if best_score >= 0.75 else False
def ap1_included(self, motifs):
    """Tell whether AP1 (TGASTCA) is recovered among ``motifs``.

    The closest match is determined with the "seqcor" metric and printed;
    the method returns True when its first score entry is at least 0.8.
    An empty ``motifs`` collection is not special-cased.
    """
    ap1_motif = motif_from_consensus("TGASTCA")
    closest = MotifComparer().get_closest_match(ap1_motif, motifs, metric="seqcor")
    print(closest)

    seqcor_score = closest["TGASTCA"][1][0]
    return bool(seqcor_score >= 0.8)
def create_denovo_motif_report(inputfile, pfmfile, fgfa, background, locfa, outdir, params, stats=None):
    """Create text and graphical (.html) motif reports.

    Parameters
    ----------
    inputfile : str
        Name of the original input file, passed on to the report writers.
    pfmfile : str
        File with the de novo motifs (read with ``fmt="pwm"``).
    fgfa : str
        Foreground FASTA file.
    background : dict
        Maps background name to background FASTA file.
    locfa : str
        FASTA file used for the localization plots.
    outdir : str
        Output directory; histograms are written below ``images/``.
    params : dict or None
        Optional settings; ``genome`` and ``cutoff_fpr`` are read.
    stats : dict, optional
        Pre-computed motif statistics; computed per background when None.
    """
    logger.info("creating de novo reports")

    # Normalise params *before* the first lookup. Previously the genome was
    # read with params["genome"] ahead of this guard, so a None/empty params
    # crashed even though it is explicitly tolerated further down.
    if not params:
        params = {}
    # NOTE(review): a missing "genome" key is now forwarded as None instead
    # of raising KeyError -- confirm create_roc_plots accepts None.
    genome = params.get("genome")
    cutoff_fpr = params.get("cutoff_fpr", 0.9)

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    create_roc_plots(pfmfile, fgfa, background, outdir, genome)

    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            bg_stats = calc_stats(fg_file=fgfa, bg_file=bgfa, motifs=motifs)
            for m, s in bg_stats.items():
                stats.setdefault(m, {})[bg] = s

    stats = add_star(stats)

    # The median sequence length is passed to the localization plots.
    lsize = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug(" {} {}".format(motif.id, motif))
        outfile = os.path.join(outdir, "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lsize, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pfmfile, background, closest_match, outdir, stats)
def test1_closest_match(self):
    """Closest match: the TATA-box test motif maps to GM.5.0.TBP.0001."""
    comparer = MotifComparer()
    result = comparer.get_closest_match("test/data/pwmscan/TATA.pwm")

    self.assertIn("TATA-box", result)
    best = result["TATA-box"]
    self.assertEqual("GM.5.0.TBP.0001", best[0])

    # best[1] holds four entries; the last one is compared with the default
    # number of places of assertAlmostEqual.
    details = best[1]
    self.assertAlmostEqual(-0.1041, details[0], 4)
    self.assertEqual(0, details[1])
    self.assertEqual(1, details[2])
    self.assertAlmostEqual(3.1666e-8, details[3])
def test1_closest_match(self):
    """Closest match lookup for test/data/pwmscan/TATA.pwm."""
    query_file = "test/data/pwmscan/TATA.pwm"
    closest = MotifComparer().get_closest_match(query_file)
    self.assertIn('TATA-box', closest)

    hit = closest['TATA-box']
    hit_id = hit[0]
    hit_scores = hit[1]

    # Name of the best database motif
    self.assertEqual('GM.5.0.TBP.0001', hit_id)

    # The four score entries reported for that motif
    self.assertAlmostEqual(-0.1041, hit_scores[0], 4)
    self.assertEqual(0, hit_scores[1])
    self.assertEqual(1, hit_scores[2])
    self.assertAlmostEqual(3.1666e-8, hit_scores[3])
def test1_closest_match(self):
    """Closest match: TATA-box is expected to hit TBP_Average_1."""
    mc = MotifComparer()
    matches = mc.get_closest_match("test/data/pwmscan/TATA.pwm")
    self.assertIn('TATA-box', matches)

    tata_match = matches['TATA-box']
    self.assertEqual('TBP_Average_1', tata_match[0])

    reported = tata_match[1]
    # First entry is checked to 4 places, the last one with the default
    # precision of assertAlmostEqual; the middle two must match exactly.
    self.assertAlmostEqual(-0.3276, reported[0], 4)
    self.assertEqual(-1, reported[1])
    self.assertEqual(1, reported[2])
    self.assertAlmostEqual(4.134e-7, reported[3])
def test1_denovo(self):
    """De novo motif prediction: run the pipeline and check its outputs."""
    denovo_params = {
        "tools": "BioProspector,Homer,MDmodule",
        "fraction": 0.5,
        "background": "random",
        "genome": "test/data/background/genome.fa",
    }
    gimme_motifs(
        "test/data/denovo/input.fa",
        self.outdir,
        params=denovo_params,
        filter_significant=True,
        cluster=True,
    )

    # NOTE(review): "gimme.clustereds.html" is reproduced as found; confirm
    # it is the file name the pipeline really writes.
    expected_files = (
        "gimme.denovo.pfm",
        "gimme.denovo.html",
        "gimme.clustereds.html",
        "params.txt",
        "stats.random.txt",
    )

    # The log must mention the clustering step
    log_path = os.path.join(self.outdir, "gimmemotifs.log")
    with open(log_path) as handle:
        log_text = handle.read()
    self.assertIn("clustering", log_text)

    # Check if all output files are there
    for expected in expected_files:
        self.assertTrue(os.path.exists(os.path.join(self.outdir, expected)))

    # Check if correct motif is predicted: at least one predicted motif has
    # to match the AP1 consensus with a fourth score entry below 1e-5.
    with open(os.path.join(self.outdir, "gimme.denovo.pfm")) as handle:
        predicted_motifs = read_motifs(handle)

    ap1 = motif_from_consensus("TGASTCA")
    comparer = MotifComparer()
    ap1_predicted = any(
        comparer.get_closest_match(ap1, predicted)["TGASTCA"][1][3] < 1e-5
        for predicted in predicted_motifs
    )
    self.assertTrue(ap1_predicted)
def match(args):
    """Print, and optionally plot, the best database matches per motif.

    Query motifs are read from ``args.pfmfile`` and database motifs from
    ``args.dbpfmfile``. For every query the ``args.nmatches`` best hits
    ("partial" match, "seqcor" metric, "mean" combine) are printed as
    tab-separated rows. When ``args.img`` is set, each aligned pair is
    padded to equal length and passed to ``match_plot``.

    Parameters
    ----------
    args : namespace
        Must provide ``pfmfile``, ``dbpfmfile``, ``nmatches`` and ``img``.
    """
    sample = {m.id: m for m in read_motifs(args.pfmfile)}
    db = {m.id: m for m in read_motifs(args.dbpfmfile)}

    mc = MotifComparer()
    result = mc.get_best_matches(
        sample.values(), args.nmatches, db.values(), "partial", "seqcor", "mean"
    )

    plotdata = []
    print("Motif\tMatch\tScore\tP-value")
    for motif_name, matches in result.items():
        for match in matches:
            pval, pos, orient = mc.compare_motifs(
                sample[motif_name], db[match[0]], "partial", "seqcor", "mean", pval=True
            )
            print("%s\t%s\t%0.2f\t%0.3e" % (motif_name, match[0], match[1][0], pval))

            if not args.img:
                continue

            motif = sample[motif_name]
            dbmotif = db[match[0]]

            if orient == -1:
                # Show the database motif on the matching strand, same id.
                tmp = dbmotif.id
                dbmotif = dbmotif.rc()
                dbmotif.id = tmp

            # Shift whichever motif starts later to the right.
            if pos < 0:
                motif = _pad_motif(motif, left=-pos)
            elif pos > 0:
                dbmotif = _pad_motif(dbmotif, left=pos)

            # Equalise the lengths. Previously this step rebuilt the Motif
            # without copying its id (even when no padding was needed), so
            # one motif of every plotted pair lost its id.
            diff = len(motif) - len(dbmotif)
            if diff > 0:
                dbmotif = _pad_motif(dbmotif, right=diff)
            elif diff < 0:
                motif = _pad_motif(motif, right=-diff)

            plotdata.append((motif, dbmotif, pval))

    if args.img:
        match_plot(plotdata, args.img)


def _pad_motif(motif, left=0, right=0):
    """Return a copy of ``motif`` flanked by uniform columns, keeping its id."""
    left_flank = [[0.25, 0.25, 0.25, 0.25] for _ in range(left)]
    right_flank = [[0.25, 0.25, 0.25, 0.25] for _ in range(right)]
    padded = Motif(left_flank + motif.pwm + right_flank)
    padded.id = motif.id
    return padded
def determine_closest_match(self, motifs):
    """Return the closest database motif and its p-value for each motif.

    The motif database is located via ``self.config`` (``motif_db`` default
    parameter inside the motif directory). ``pwm``/``pfm`` files are read in
    ``pwm`` format and ``transfac`` files in ``transfac`` format; for any
    other extension the database stays empty.

    Parameters
    ----------
    motifs : iterable
        Motif objects exposing an ``id`` attribute.

    Returns
    -------
    dict
        ``{motif.id: [closest_db_motif, pval]}``.
    """
    self.logger.debug("Determining closest matching motifs in database")

    motif_db = self.config.get_default_params()["motif_db"]
    db = os.path.join(self.config.get_motif_dir(), motif_db)

    db_motifs = []
    if db.endswith("pwm") or db.endswith("pfm"):
        # The handle used to be opened inline and never closed.
        with open(db) as db_handle:
            db_motifs = read_motifs(db_handle, fmt="pwm")
    elif db.endswith("transfac"):
        db_motifs = read_motifs(db, fmt="transfac")

    mc = MotifComparer()
    db_motif_lookup = {m.id: m for m in db_motifs}
    match = mc.get_closest_match(
        motifs, db_motifs, "partial", "wic", "mean", parallel=False
    )

    closest_match = {}
    for motif in motifs:
        closest = db_motif_lookup[match[motif.id][0]]
        # Calculate p-value; position and orientation are not used.
        pval, _pos, _orient = mc.compare_motifs(
            motif, closest, "partial", "wic", "mean", pval=True
        )
        closest_match[motif.id] = [closest, pval]
    return closest_match
def merge_motifs(motif_1, motif_2):
    """Create the consensus motif of two motifs.

    The relative position and orientation come from gimmemotifs'
    ``MotifComparer.compare_motifs`` (metric "pcc") and are handed to
    ``motif_1.average_motifs``.

    Parameter:
    ----------
    motif_1 : Object of class Motif
        First gimmemotif object to create the consensus.
    motif_2 : Object of class Motif
        Second gimmemotif object to create consensus.

    Returns:
    --------
    consensus : Object of class Motif
        Consensus of both motifs; its id is the two input ids joined by "+".
    """
    from gimmemotifs.comparison import MotifComparer

    comparison = MotifComparer().compare_motifs(motif_1, motif_2, metric="pcc")
    _, offset, strand = comparison

    merged = motif_1.average_motifs(motif_2, pos=offset, orientation=strand)
    merged.id = "+".join((motif_1.id, motif_2.id))
    return merged
def test1_denovo(self):
    """De novo motif prediction on the bundled test input."""
    run_params = {
        "tools": "BioProspector,Homer,MDmodule",
        "fraction": 0.5,
        "background": "random",
    }
    gimme_motifs(
        "test/data/denovo/input.fa",
        self.outdir,
        params=run_params,
        filter_significant=True,
        cluster=True,
    )

    def in_outdir(name):
        # Path of an output file inside the test output directory.
        return os.path.join(self.outdir, name)

    with open(in_outdir('gimmemotifs.log')) as log_handle:
        log_text = log_handle.read()
    self.assertIn("clustering", log_text)

    # Check if all output files are there
    for name in ("motifs.pwm", "motif_report.html", "cluster_report.html",
                 "params.txt", "stats.random.txt"):
        self.assertTrue(os.path.exists(in_outdir(name)))

    # Check if correct motif is predicted
    with open(in_outdir("motifs.pwm")) as pwm_handle:
        predicted_motifs = read_motifs(pwm_handle)

    ap1 = motif_from_consensus("TGASTCA")
    comparer = MotifComparer()
    found_ap1 = False
    for predicted in predicted_motifs:
        closest = comparer.get_closest_match(ap1, predicted)
        if closest["TGASTCA"][1][3] < 1e-5:
            found_ap1 = True
            break
    self.assertTrue(found_ap1)
def create_denovo_motif_report(inputfile, pwmfile, fgfa, background, locfa, outdir, params, stats=None):
    """Create text and graphical (.html) motif reports.

    Steps, in order: read the motifs from ``pwmfile``, create the ROC plots,
    look up the closest database match per motif, compute per-background
    statistics when ``stats`` is None, draw one localization histogram per
    motif and finally write the text and graphical reports.
    """
    logger.info("creating reports")

    motifs = read_motifs(pwmfile, fmt="pwm")

    # ROC plots
    create_roc_plots(pwmfile, fgfa, background, outdir)

    # Closest match in database
    closest_match = MotifComparer().get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg_name, bg_fasta in background.items():
            per_motif = calc_stats(motifs, fgfa, bg_fasta)
            for motif_key, motif_stats in per_motif.items():
                stats.setdefault(motif_key, {})[bg_name] = motif_stats

    stats = add_star(stats)

    if not params:
        params = {}
    cutoff_fpr = params.get('cutoff_fpr', 0.9)

    # Median sequence length, passed to the localization plots
    sequence_lengths = [len(seq) for seq in Fasta(locfa).seqs]
    lwidth = np.median(sequence_lengths)

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug(" {} {}".format(motif.id, motif))
        histogram = os.path.join(
            outdir, "images/{}_histogram.svg".format(motif.id)
        )
        motif_localization(locfa, motif, lwidth, histogram, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(
        inputfile, pwmfile, background, closest_match, outdir, stats
    )
def motifs(args):
    """Calculate ROC_AUC and other metrics and optionally plot ROC curve.

    Workflow:

    1. Prepare the output directory and, where needed, convert the input to
       an equal-size BED file.
    2. Use ``args.background`` if it is an existing file, otherwise generate
       a background FASTA ("gc" when a genome is given, else "random").
    3. Collect known motifs (``args.known``) and/or run de novo prediction
       (``args.denovo``); de novo motifs are appended to a combined pfm file
       and a combined motif2factors file.
    4. Scan input and background, write ``gimme.roc.report.txt``, select
       non-redundant motifs, write per-motif BED files and, if
       ``args.report`` is set, the HTML reports.

    Parameters
    ----------
    args : namespace
        Reads ``outdir``, ``sample``, ``size``, ``genome``, ``background``,
        ``pfmfile``, ``known``, ``denovo``, ``tools``, ``analysis``,
        ``ncpus`` and ``report``; sets ``zscore`` and ``gc`` to False when no
        genome is given.

    Raises
    ------
    ValueError
        If ``args.outdir`` is None.
    """
    if args.outdir is None:
        raise ValueError("an output directory is required!")
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    scan_dir = os.path.join(args.outdir, "motif_scan_results")
    if not os.path.exists(scan_dir):
        os.makedirs(scan_dir)

    file_type = determine_file_type(args.sample)
    outfile = os.path.join(args.outdir, f"input.w{args.size}.bed")
    sample = args.sample
    if file_type == "narrowpeak":
        narrowpeak_to_bed(args.sample, outfile, size=args.size)
        sample = outfile
    elif args.size and args.size > 0:
        if file_type == "fasta":
            # logger.warn is a deprecated alias of logger.warning.
            logger.warning("size parameter will be ignored for FASTA input")
        elif file_type == "bed":
            write_equalsize_bedfile(args.sample, args.size, outfile)
            sample = outfile

    genome = args.genome
    if genome is None:
        args.zscore = False
        args.gc = False

    bgfile = None
    bg = args.background
    if bg is None:
        bg = "random" if genome is None else "gc"

    if os.path.isfile(bg):
        bgfile = bg
        bg = "custom"
    else:
        # create background if not provided
        bgfile = os.path.join(args.outdir, "generated_background.{}.fa".format(bg))
        size = args.size
        # A missing size (None) used to raise TypeError in the comparison
        # below; treat it like a non-positive size.
        if size is None or size <= 0:
            size = None
        if bg == "gc":
            logger.info("creating background (matched GC%)")
        else:
            logger.info("creating background (random)")

        create_background_file(
            bgfile,
            bg,
            fmt="fasta",
            genome=genome,
            inputfile=sample,
            size=size,
            number=10000,
        )

    pfmfile = args.pfmfile

    motifs = []
    if args.known:
        motifs = read_motifs(pfmfile, fmt="pfm")

    if args.denovo:
        gimme_motifs(
            sample,
            args.outdir,
            params={
                "tools": args.tools,
                "analysis": args.analysis,
                "background": bg,
                "custom_background": bgfile,
                "genome": args.genome,
                "size": args.size,
            },
        )
        denovo = read_motifs(os.path.join(args.outdir, "gimme.denovo.pfm"))
        mc = MotifComparer()
        result = mc.get_closest_match(denovo, dbmotifs=pfmfile, metric="seqcor")
        match_motifs = read_motifs(pfmfile, as_dict=True)

        # Start the combined mapping from the database mapping, if present.
        new_map_file = os.path.join(args.outdir, "combined.motif2factors.txt")
        base = os.path.splitext(pfmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            shutil.copyfile(map_file, new_map_file)

        # From here on the combined file is the pfm file that gets scanned.
        motifs += denovo
        pfmfile = os.path.join(args.outdir, "combined.pfm")
        with open(pfmfile, "w") as f:
            for m in motifs:
                print(m.to_pwm(), file=f)

        with open(new_map_file, "a") as f:
            for m in denovo:
                print("{}\t{}\t{}\t{}".format(m.id, "de novo", "GimmeMotifs", "Y"), file=f)
                closest_id = result[m.id][0]
                if closest_id in match_motifs:
                    # Inherit the direct factors of the closest known motif.
                    for factor in match_motifs[closest_id].factors["direct"]:
                        print(
                            "{}\t{}\t{}\t{}".format(
                                m.id, factor, "inferred (GimmeMotifs)", "N"
                            ),
                            file=f,
                        )
    else:
        logger.info("skipping de novo")

    stats = [
        "phyper_at_fpr",
        "roc_auc",
        "pr_auc",
        "enr_at_fpr",
        "recall_at_fdr",
        "roc_values",
        "matches_at_fpr",
    ]

    # Create a table with the best score per motif for all motifs.
    # This has three reasons:
    # * Can be used to calculate statistics;
    # * Can be used to select a set of non-redundant motifs;
    # * These files are included in the output and can be used for further
    #   analysis.
    score_table = os.path.join(scan_dir, "input.motif.score.txt")
    bg_score_table = os.path.join(scan_dir, "background.motif.score.txt")

    # args.outdir is validated at the top, so the report always goes to a
    # file (the former sys.stdout fallback was unreachable). The context
    # manager closes the file even when scanning or the statistics fail.
    report_path = args.outdir + "/gimme.roc.report.txt"
    with open(report_path, "w") as f_out:
        # Print the metrics
        f_out.write(
            "Motif\t# matches\t% matches input\t# matches background\t%matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
        )

        logger.info("creating motif scan tables")
        # NOTE(review): zscore/gcnorm are hard-coded to True here although
        # args.zscore and args.gc are switched off above when no genome is
        # given -- confirm this is intended.
        for infile, outfile in [(sample, score_table), (bgfile, bg_score_table)]:
            scan_to_file(
                infile,
                pfmfile,
                filepath_or_buffer=outfile,
                score_table=True,
                genome=args.genome,
                zscore=True,
                gcnorm=True,
            )

        n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0]
        n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0]

        logger.info("calculating stats")
        for motif_stats in calc_stats_iterator(
            motifs=pfmfile,
            fg_table=score_table,
            bg_table=bg_score_table,
            stats=stats,
            ncpus=args.ncpus,
        ):
            for motif in motifs:
                key = str(motif)
                if key not in motif_stats:
                    continue
                s = motif_stats[key]

                # -log10(p) is undefined for p == 0; report infinity instead.
                log_pvalue = np.inf
                if s["phyper_at_fpr"] > 0:
                    log_pvalue = -np.log10(s["phyper_at_fpr"])

                f_out.write(
                    "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format(
                        motif.id,
                        s["matches_at_fpr"][0],
                        s["matches_at_fpr"][0] / n_input * 100,
                        s["matches_at_fpr"][1],
                        s["matches_at_fpr"][1] / n_background * 100,
                        s["phyper_at_fpr"],
                        log_pvalue,
                        s["roc_auc"],
                        s["pr_auc"],
                        s["enr_at_fpr"],
                        s["recall_at_fdr"],
                    )
                )

    # Select a set of "non-redundant" motifs.
    # Using Recursive Feature Elimination, a set of motifs is selected that
    # best explains the peaks in comparison to the background sequences.
    nr_motifs = select_nonredundant_motifs(
        report_path,
        pfmfile,
        score_table,
        bg_score_table,
        tolerance=0.001,
    )

    # Provide BED files with motif scan results for the non-redundant motifs
    # At the moment this is not ideal, as scanning is now performed twice
    # for this set of non-redundant motifs.
    motif_dict = {m.id: m for m in motifs}
    for motif in nr_motifs:
        with NamedTemporaryFile(mode="w") as f:
            print(motif_dict[motif].to_pwm(), file=f)
            f.flush()
            # Motif ids may contain characters that are unsafe in file names.
            safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)
            scan_to_file(
                sample,
                f.name,
                filepath_or_buffer=os.path.join(scan_dir, f"{safe_name}.matches.bed"),
                bed=True,
                fpr=0.01,
                genome=args.genome,
                zscore=True,
                gcnorm=True,
            )

    if args.report:
        logger.info("creating statistics report")
        # Report on all motifs ...
        roc_html_report(
            args.outdir,
            report_path,
            pfmfile,
            threshold=0.01,
            outname="gimme.motifs.redundant.html",
            link_matches=False,
        )
        # ... and on the non-redundant selection only.
        roc_html_report(
            args.outdir,
            report_path,
            pfmfile,
            threshold=0.01,
            use_motifs=nr_motifs,
            link_matches=True,
        )
        logger.info(
            f"gimme motifs final report: {os.path.join(args.outdir, 'gimme.motifs.html')}"
        )
def cluster(args):
    """Cluster the motifs in ``args.inputfile`` and write a report.

    Inside ``args.outdir`` (created if missing) this writes one PNG per
    cluster and per member of multi-member clusters, ``cluster_report.html``,
    ``cluster_key.txt`` (cluster id and its member ids) and
    ``clustered_motifs.pwm``.

    Parameters
    ----------
    args : namespace
        Must provide ``inputfile``, ``outdir`` and ``threshold``.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    trim_ic = 0.2
    clusters = []
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(args.inputfile, "total", "wic", "mean", True, threshold=args.threshold, include_bg=True)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(outdir, "%s.png" % cluster.id), format="PNG")
        ids.append([cluster.id, {"src": "%s.png" % cluster.id}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)

            # Smallest position over all members. This replaces
            # sorted(..., cmp=...)[0][1], which only exists on Python 2.
            add_pos = min(score[1] for score in scores.values())

            for motif in members:
                _score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement under the member's own id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(os.path.join(outdir, "%s.png" % motif.id.replace(" ", "_")), format="PNG", add_left=add)
        ids[-1][2] = [
            {
                "src": "%s.png" % member.id.replace(" ", "_"),
                "alt": member.id.replace(" ", "_"),
            }
            for member in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    # The rendered report is written as UTF-8 bytes, so the file is opened
    # in binary mode; writing bytes to a text-mode file fails on Python 3.
    with open(os.path.join(outdir, "cluster_report.html"), "wb") as f:
        f.write(result.encode('utf-8'))

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for entry in ids:
            f.write("%s\t%s\n" % (entry[0], ",".join([x["alt"] for x in entry[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
# Score one chunk of motifs against all motifs and write a tab-separated
# table: one header line with the compared motif ids, then one row per chunk
# motif holding the first score entry of every comparison.
#
# Usage: <script> PWMFILE OUTFILE CHUNKSIZE CHUNK METRIC
# CHUNK is 1-based; METRIC is one of wic, seqcor, pcc, ed.
import sys
from gimmemotifs.motif import read_motifs
from gimmemotifs.comparison import seqcor, MotifComparer, _get_all_scores

pwmfile = sys.argv[1]
outfile = sys.argv[2]
chunksize = int(sys.argv[3])
chunk = int(sys.argv[4])
metric = sys.argv[5]

if metric not in ["wic", "seqcor", "pcc", "ed"]:
    raise ValueError("invalid metric {}".format(metric))

# Close the input handle once the motifs are read (it used to be leaked).
with open(pwmfile) as motif_handle:
    all_motifs = read_motifs(motif_handle)

chunk_motifs = all_motifs[(chunk - 1) * chunksize:chunk * chunksize]

# "pcc" is scored on partial alignments, every other metric on the total one.
match = "partial" if metric == "pcc" else "total"
mc = MotifComparer()
dists = mc.get_all_scores(chunk_motifs, all_motifs, match, metric, "mean", False)

# The column order is taken from the first row of the result.
cols = list(dists.values())[0]

with open(outfile, "w") as f:
    f.write("\t{}\n".format("\t".join(cols)))
    for k, v in dists.items():
        f.write("{}\t{}\n".format(
            k, "\t".join(["{:.6f}".format(v[c][0]) for c in cols])))
def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
    """Cluster the motifs in ``pfm_file`` and write report, images and pwm file.

    Parameters
    ----------
    pfm_file : str
        File with the motifs to cluster (read with ``fmt="pwm"``).
    cluster_pwm : str
        Output file for the clustered motifs.
    dir : str
        Not used by this method; images go to ``self.imgdir`` and the report
        to ``self.cluster_report``.
    threshold : float or str
        Clustering threshold; converted with ``float()``.

    Returns
    -------
    list
        ``(cluster_motif, members)`` pairs.
    """
    self.logger.info("clustering significant motifs.")

    trim_ic = 0.2
    clusters = []
    # Close the input handle once the motifs are read (it used to be leaked).
    with open(pfm_file) as handle:
        motifs = read_motifs(handle, fmt="pwm")
    if len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(pfm_file, "total", "wic", "mean", True, threshold=float(threshold), include_bg=True, progress=False)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id), format="PNG")
        ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)

            # Smallest position over all members. This replaces
            # sorted(..., cmp=...)[0][1], which only exists on Python 2.
            add_pos = min(score[1] for score in scores.values())

            for motif in members:
                _score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement under the member's own id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(
                    os.path.join(self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG",
                    add_left=add,
                )
        ids[-1][2] = [
            {
                "src": "images/%s.png" % member.id.replace(" ", "_"),
                "alt": member.id.replace(" ", "_"),
            }
            for member in members
        ]

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(
        expname=self.basename,
        motifs=ids,
        inputfile=self.inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=GM_VERSION,
    )

    # The rendered report is written as UTF-8 bytes, so the file is opened
    # in binary mode; writing bytes to a text-mode file fails on Python 3.
    with open(self.cluster_report, "wb") as f:
        f.write(result.encode('utf-8'))

    with open(cluster_pwm, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    self.logger.debug("Clustering done. See the result in %s", self.cluster_report)
    return clusters
def cluster_motifs(
    motifs,
    match="total",
    metric="wic",
    combine="mean",
    pval=True,
    threshold=0.95,
    trim_edges=False,
    edge_ic_cutoff=0.2,
    include_bg=True,
    progress=True,
):
    """
    Clusters a set of sequence motifs. Required arg 'motifs' is a file containing
    positional frequency matrices or an array with motifs.

    Optional args:

    'match', 'metric' and 'combine' specify the method used to compare and score
    the motifs. By default the WIC score is used (metric='wic'), using the
    score over the whole alignment (match='total'), with the total motif score
    calculated as the mean score of all positions (combine='mean').
    'match' can be either 'total' for the total alignment or 'subtotal' for the
    maximum scoring subsequence of the alignment.
    'metric' can be any metric defined in MotifComparer, currently: 'pcc', 'ed',
    'distance', 'wic' or 'chisq'
    'combine' determines how the total score is calculated from the score of
    individual positions and can be either 'sum' or 'mean'

    'pval' can be True or False and determines if the score should be converted to
    an empirical p-value

    'threshold' determines the score (or p-value) cutoff

    If 'trim_edges' is set to True, all motif edges with an IC below
    'edge_ic_cutoff' will be removed before clustering

    When computing the average of two motifs 'include_bg' determines if, at a
    position only present in one motif, the information in that motif should
    be kept, or if it should be averaged with background frequencies. Should
    probably be left set to True.

    'progress' determines if progress messages are written to stderr.

    Returns the root MotifTree node of the cluster tree.
    """
    # First read pfm or pfm formatted motiffile
    if not isinstance(motifs, list):
        # Close the handle once the motifs are read (it used to be leaked).
        with open(motifs) as handle:
            motifs = read_motifs(handle, fmt="pwm")

    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)

    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]

    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = {n.motif.id: n for n in nodes}
    motifs = [n.motif for n in nodes]

    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(motifs, motifs, match, metric, combine, pval, parallel=True)

    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                # The self-comparison gives the maximum attainable score.
                if pval:
                    motif_nodes[m1].maxscore = 1 - score[0]
                else:
                    motif_nodes[m1].maxscore = score[0]
            else:
                if pval:
                    # Work with 1 - p so that higher always means more similar.
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1], motif_nodes[m2])] = score

    cluster_nodes = list(nodes)
    ave_count = 1

    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        # Pick the best-scoring pair whose two nodes are both still unmerged.
        ranked = sorted(scores.keys(), key=lambda x: scores[x][0])
        i = -1
        (n1, n2) = ranked[i]
        while n1 not in cluster_nodes or n2 not in cluster_nodes:
            i -= 1
            (n1, n2) = ranked[i]

        (score, pos, orientation) = scores[(n1, n2)]
        ave_motif = n1.motif.average_motifs(n2.motif, pos, orientation, include_bg=include_bg)

        ave_motif.trim(edge_ic_cutoff)
        ave_motif.id = "Average_%s" % ave_count
        ave_count += 1

        new_node = MotifTree(ave_motif)
        self_score = mc.compare_motifs(new_node.motif, new_node.motif, match, metric, combine, pval)[0]
        if pval:
            new_node.maxscore = 1 - self_score
        else:
            new_node.maxscore = self_score

        new_node.mergescore = score
        n1.parent = new_node
        n2.parent = new_node
        new_node.left = n1
        new_node.right = n2

        # Unmerged nodes the new average still has to be compared with
        # (new_node itself is not part of ``nodes`` yet).
        cmp_nodes = {node.motif: node for node in nodes if not node.parent}

        if progress:
            # Keep the percentage in its own variable so the ``progress``
            # flag is not overwritten, and use integer division: a float
            # repeat count raises TypeError on Python 3.
            percent = int((1 - len(cmp_nodes) / float(total)) * 100)
            filled = percent // 10
            sys.stderr.write(
                "\rClustering [{0}{1}] {2}%".format(
                    "#" * filled, " " * (10 - filled), percent
                )
            )

        result = mc.get_all_scores(
            [new_node.motif], list(cmp_nodes), match, metric, combine, pval, parallel=True
        )

        for motif, n in cmp_nodes.items():
            x = result[new_node.motif.id][motif.id]
            if pval:
                x = [1 - x[0]] + x[1:]
            scores[(new_node, n)] = x

        nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]

    if progress:
        sys.stderr.write("\n")

    # The last node created is the root; let every leaf's parent decide
    # whether its merge holds at the given threshold.
    root = nodes[-1]
    for node in [node for node in nodes if not node.left]:
        node.parent.checkMerge(root, threshold)

    return root
def cluster_motifs(
    motifs,
    match="total",
    metric="wic",
    combine="mean",
    pval=True,
    threshold=0.95,
    trim_edges=False,
    edge_ic_cutoff=0.2,
    include_bg=True,
    progress=True,
):
    """
    Clusters a set of sequence motifs.

    Required arg 'motifs' is a file containing positional frequency matrices
    or a list with motifs.

    Optional args:

    'match', 'metric' and 'combine' specify the method used to compare and
    score the motifs. By default the WIC score is used (metric='wic'), using
    the score over the whole alignment (match='total'), with the total motif
    score calculated as the mean score of all positions (combine='mean').
    'match' can be either 'total' for the total alignment or 'subtotal' for
    the maximum scoring subsequence of the alignment.
    'metric' can be any metric defined in MotifComparer, currently: 'pcc',
    'ed', 'distance', 'wic' or 'chisq'.
    'combine' determines how the total score is calculated from the score of
    individual positions and can be either 'sum' or 'mean'.

    'pval' can be True or False and determines if the score should be
    converted to an empirical p-value.

    'threshold' determines the score (or p-value) cutoff.

    If 'trim_edges' is set to True, all motif edges with an IC below
    'edge_ic_cutoff' will be removed before clustering.

    When computing the average of two motifs 'include_bg' determines if, at a
    position only present in one motif, the information in that motif should
    be kept, or if it should be averaged with background frequencies. Should
    probably be left set to True.

    If 'progress' is true, status messages and a progress bar are written to
    stderr.

    Returns the root MotifTree node of the merge tree (the last node created).
    """
    # Read the motifs from file when something other than a list was given.
    if not isinstance(motifs, list):
        with open(motifs) as handle:
            motifs = read_motifs(handle, fmt="pwm")

    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)

    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]

    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = dict((n.motif.id, n) for n in nodes)
    motifs = [n.motif for n in nodes]

    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(motifs, motifs, match, metric, combine, pval, parallel=True)

    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                # Self-comparison gives the maximum attainable score of a motif
                if pval:
                    motif_nodes[m1].maxscore = 1 - score[0]
                else:
                    motif_nodes[m1].maxscore = score[0]
            else:
                if pval:
                    # Store 1 - p so that "higher is better" holds for both modes
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1], motif_nodes[m2])] = score

    cluster_nodes = list(nodes)
    ave_count = 1
    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        # Walk the pairs from best to worst score until both members are
        # still unmerged (i.e. still present in cluster_nodes).
        ranked = sorted(scores.keys(), key=lambda x: scores[x][0])
        i = -1
        (n1, n2) = ranked[i]
        while n1 not in cluster_nodes or n2 not in cluster_nodes:
            i -= 1
            (n1, n2) = ranked[i]

        (score, pos, orientation) = scores[(n1, n2)]
        ave_motif = n1.motif.average_motifs(n2.motif, pos, orientation, include_bg=include_bg)

        ave_motif.trim(edge_ic_cutoff)
        ave_motif.id = "Average_%s" % ave_count
        ave_count += 1

        new_node = MotifTree(ave_motif)
        self_score = mc.compare_motifs(
            new_node.motif, new_node.motif, match, metric, combine, pval
        )[0]
        if pval:
            new_node.maxscore = 1 - self_score
        else:
            new_node.maxscore = self_score

        new_node.mergescore = score
        n1.parent = new_node
        n2.parent = new_node
        new_node.left = n1
        new_node.right = n2

        # All nodes that have not been merged yet; new_node is not in `nodes`
        # at this point, so it is not compared with itself.
        cmp_nodes = dict((node.motif, node) for node in nodes if not node.parent)

        if progress:
            # Use a separate local so the `progress` flag is not clobbered, and
            # floor division so the bar width is an int on Python 3 as well.
            percent = int((1 - len(cmp_nodes) / float(total)) * 100)
            filled = percent // 10
            sys.stderr.write(
                "\rClustering [{0}{1}] {2}%".format("#" * filled, " " * (10 - filled), percent)
            )

        result = mc.get_all_scores(
            [new_node.motif], list(cmp_nodes.keys()), match, metric, combine, pval, parallel=True
        )

        for motif, n in cmp_nodes.items():
            x = result[new_node.motif.id][motif.id]
            if pval:
                x = [1 - x[0]] + x[1:]
            scores[(new_node, n)] = x

        nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]

    if progress:
        sys.stderr.write("\n")

    root = nodes[-1]
    for node in [node for node in nodes if not node.left]:
        # With a single input motif the only leaf is the root itself and has
        # no parent to check; skip it instead of failing on the attribute
        # lookup.
        if node.parent:
            node.parent.checkMerge(root, threshold)

    return root
for line in f: vals = line.strip().split("\t") if len(vals) == 4: m2f[vals[0]] = m2f.get(vals[0], []) + [vals[1:]] #print(m2f) # Read factor to family mapping from the CIS-BP databse anno = pd.read_table(tf_info) anno = anno[["TF_Name", "Family_Name"]].drop_duplicates().set_index("TF_Name") # read motifs motifs = dict([(m.id, m) for m in read_motifs(open(pfmfile))]) df_cluster = pd.read_table(clusterfile) ic_cutoff = 5 mc = MotifComparer() id_count = {} df = df_cluster.loc[k] sys.stderr.write(str(k) + "\n") seen_line = {} with open("{}.pfm".format(outname), "w") as out: with open("{}.motif2factors.txt".format(outname), "w") as m2f_out: print("Motif\tFactor\tEvidence\tCurated", file=m2f_out) for cluster in range(k): if cluster % 10 == 0: sys.stderr.write("{}\n".format(cluster)) out.flush() motif_ids = df[df == cluster].index motif = get_clustered_motifs(motif_ids) if motif.information_content() >= ic_cutoff: factors = []
def cluster_motifs_with_report(infile, outfile, outdir, threshold, title=None):
    """
    Cluster the motifs in a file, write the clustered motifs and an HTML report.

    Parameters
    ----------
    infile : str
        Motif file, read with ``read_motifs(infile, fmt="pwm")``.
    outfile : str
        File the clustered motifs are written to (``to_pwm()`` output).
    outdir : str
        Directory that receives ``cluster_report.html`` and an ``images``
        sub-directory with one PNG per cluster and per member motif.
    threshold : float or str
        Clustering cutoff; converted with ``float()`` and passed on to
        ``cluster_motifs``.
    title : str, optional
        Name shown in the report as input file; defaults to ``infile``.

    Returns
    -------
    list
        ``(cluster, members)`` pairs. An empty list is returned, and nothing
        is written, when the input contains no motifs.
    """
    # Cluster significant motifs
    if title is None:
        title = infile

    motifs = read_motifs(infile, fmt="pwm")

    trim_ic = 0.2
    clusters = []
    if len(motifs) == 0:
        return []
    elif len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        logger.info("clustering %d motifs.", len(motifs))
        tree = cluster_motifs(
            infile,
            "total",
            "wic",
            "mean",
            True,
            threshold=float(threshold),
            include_bg=True,
            progress=False,
        )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    img_dir = os.path.join(outdir, "images")
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        png = "images/{}.png".format(cluster.id)
        cluster.to_img(os.path.join(outdir, png), fmt="PNG")
        ids.append([cluster.id, {"src": png}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)
            # Smallest alignment position; every member is padded on the left
            # relative to it so the logos line up.
            add_pos = min(result[1] for result in scores.values())
            for motif in members:
                _, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement, but keep the original id
                    # so the image file name stays the same.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                png = "images/{}.png".format(motif.id.replace(" ", "_"))
                motif.to_img(os.path.join(outdir, png), fmt="PNG", add_left=add)
        ids[-1][2] = [
            dict(
                [
                    ("src", "images/{}.png".format(motif.id.replace(" ", "_"))),
                    ("alt", motif.id.replace(" ", "_")),
                ]
            )
            for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(
        motifs=ids,
        inputfile=title,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=__version__,
    )

    cluster_report = os.path.join(outdir, "cluster_report.html")
    with open(cluster_report, "wb") as f:
        f.write(result.encode('utf-8'))

    # Context manager so the handle is closed even if writing a motif fails.
    with open(outfile, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    logger.debug("Clustering done. See the result in %s", cluster_report)
    return clusters
def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
    """
    Cluster the motifs in pfm_file, write images, an HTML report and the
    clustered motifs.

    pfm_file    -- motif file; read here and also passed to cluster_motifs
    cluster_pwm -- output file for the clustered motifs (to_pwm() output)
    dir         -- not used in this method; images go to self.imgdir and the
                   report to self.cluster_report
    threshold   -- clustering cutoff, converted with float()

    Returns the list of (cluster, members) pairs.
    """
    self.logger.info("clustering significant motifs.")

    trim_ic = 0.2
    clusters = []
    # Close the input handle as soon as the motifs are read.
    with open(pfm_file) as handle:
        motifs = read_motifs(handle, fmt="pwm")
    if len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            pfm_file,
            "total",
            "wic",
            "mean",
            True,
            threshold=float(threshold),
            include_bg=True,
            progress=False,
        )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id), format="PNG")
        ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)
            # Smallest alignment position. min() replaces sorted(cmp=...),
            # which only exists on Python 2.
            add_pos = min(result[1] for result in scores.values())
            for motif in members:
                _, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement, but keep the original id
                    # so the image file name stays the same.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(
                    os.path.join(self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG",
                    add_left=add,
                )
        ids[-1][2] = [
            dict(
                [
                    ("src", "images/%s.png" % motif.id.replace(" ", "_")),
                    ("alt", motif.id.replace(" ", "_")),
                ]
            )
            for motif in members
        ]

    env = jinja2.Environment(loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(
        expname=self.basename,
        motifs=ids,
        inputfile=self.inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=GM_VERSION,
    )

    # The rendered report is encoded to bytes, so the file must be opened in
    # binary mode (text mode rejects bytes on Python 3).
    with open(self.cluster_report, "wb") as f:
        f.write(result.encode('utf-8'))

    with open(cluster_pwm, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    self.logger.debug("Clustering done. See the result in %s", self.cluster_report)
    return clusters
def cluster_motifs_with_report(infile, outfile, outdir, threshold, title=None):
    """
    Cluster the motifs in a file, write the clustered motifs and an HTML report.

    Parameters
    ----------
    infile : str
        Motif file, read with ``read_motifs(infile, fmt="pwm")``.
    outfile : str
        File the clustered motifs are written to (``to_pwm()`` output).
    outdir : str
        Directory that receives ``cluster_report.html`` and an ``images``
        sub-directory with one PNG per cluster and per member motif.
    threshold : float or str
        Clustering cutoff; converted with ``float()`` and passed on to
        ``cluster_motifs``.
    title : str, optional
        Name shown in the report as input file; defaults to ``infile``.

    Returns
    -------
    list
        ``(cluster, members)`` pairs. An empty list is returned, and nothing
        is written, when the input contains no motifs.
    """
    # Cluster significant motifs
    if title is None:
        title = infile

    motifs = read_motifs(infile, fmt="pwm")

    trim_ic = 0.2
    clusters = []
    if len(motifs) == 0:
        return []
    elif len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        logger.info("clustering %d motifs.", len(motifs))
        tree = cluster_motifs(
            infile,
            "total",
            "wic",
            "mean",
            True,
            threshold=float(threshold),
            include_bg=True,
            progress=False,
        )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    img_dir = os.path.join(outdir, "images")
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        png = "images/{}.png".format(cluster.id)
        cluster.to_img(os.path.join(outdir, png), fmt="PNG")
        ids.append([cluster.id, {"src": png}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)
            # Smallest alignment position; every member is padded on the left
            # relative to it so the logos line up.
            add_pos = min(result[1] for result in scores.values())
            for motif in members:
                _, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement, but keep the original id
                    # so the image file name stays the same.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                png = "images/{}.png".format(motif.id.replace(" ", "_"))
                motif.to_img(os.path.join(outdir, png), fmt="PNG", add_left=add)
        ids[-1][2] = [
            dict(
                [
                    ("src", "images/{}.png".format(motif.id.replace(" ", "_"))),
                    ("alt", motif.id.replace(" ", "_")),
                ]
            )
            for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(
        motifs=ids,
        inputfile=title,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=__version__,
    )

    cluster_report = os.path.join(outdir, "cluster_report.html")
    with open(cluster_report, "wb") as f:
        f.write(result.encode('utf-8'))

    # Context manager so the handle is closed even if writing a motif fails.
    with open(outfile, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    logger.debug("Clustering done. See the result in %s", cluster_report)
    return clusters