def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    """Scan the regions listed in a table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of a tab-separated table. The first column is used as the
        index and is passed to the scanner as the list of regions.
    genome : str
        Genome handed to ``check_threshold`` and to the scanner.
    data_dir : str
        Directory handed to ``check_threshold``.
    scoring : str
        "count" produces match counts using the checked threshold as cutoff;
        any other value produces a table of best scores.
    pwmfile : str, optional
        Motif file. When omitted, the "motif_db" entry of the default
        parameters is looked up in the configured motif directory.

    Returns
    -------
    pandas.DataFrame
        One row per region (index of the input table), one column per motif
        id read from ``pwmfile``.

    Raises
    ------
    ValueError
        If no ``pwmfile`` is given and no default database is configured.
    """
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            # Only the configured default is resolved against the motif dir.
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)

    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        scores = list(s.count(regions, cutoff=threshold))
    else:
        scores = list(s.best_score(regions))

    # Read the ids inside the context manager so the handle is always closed.
    with open(pwmfile) as handle:
        motif_names = [m.id for m in read_motifs(handle)]

    return pd.DataFrame(scores, index=df.index, columns=motif_names)
def threshold(args):
    """Print a per-motif score threshold for the requested FDR.

    For every motif the best score in each input sequence is collected. The
    reported score is the ``100 - 100 * args.fdr`` percentile of those scores;
    the reported cutoff is that score rescaled between the motif's minimum
    and maximum PWM score.

    Parameters
    ----------
    args : object
        Must provide ``fdr`` (number between 0 and 1), ``pwmfile`` and
        ``inputfile``.

    Notes
    -----
    Exits the process with status 1 when ``args.fdr`` is outside [0, 1].
    Uses the ``print`` function with a single argument, which behaves the
    same on Python 2 and Python 3.
    """
    if args.fdr < 0 or args.fdr > 1:
        print("Please specify a FDR between 0 and 1")
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)

    s = Scanner()
    s.set_motifs(args.pwmfile)

    # One row per sequence, one column per motif.
    score_table = list(s.best_score(args.inputfile))

    print("Motif\tScore\tCutoff")
    # Transpose so that each iteration sees all scores of a single motif.
    for i, scores in enumerate(np.array(score_table).transpose()):
        motif = motifs[i]
        min_score = motif.pwm_min_score()
        if len(scores) > 0:
            opt_score = scoreatpercentile(scores, 100 - (100 * args.fdr))
            cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
            print("{0}\t{1}\t{2}".format(motif.id, opt_score, cutoff))
        else:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
def threshold(args):
    """Print a per-motif score threshold for the requested FDR.

    The best score of every motif in every input sequence is gathered; per
    motif, the score at the ``100 - 100 * args.fdr`` percentile is printed
    together with that score rescaled between the motif's minimum and
    maximum PWM score.

    Parameters
    ----------
    args : object
        Must provide ``fdr`` (number between 0 and 1), ``pwmfile`` and
        ``inputfile``.

    Notes
    -----
    Exits the process with status 1 when ``args.fdr`` is outside [0, 1].
    Single-argument ``print(...)`` calls are used so the function runs on
    both Python 2 and Python 3.
    """
    if args.fdr < 0 or args.fdr > 1:
        print("Please specify a FDR between 0 and 1")
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)

    s = Scanner()
    s.set_motifs(args.pwmfile)

    # Rows are sequences, columns are motifs.
    score_table = list(s.best_score(args.inputfile))

    print("Motif\tScore\tCutoff")
    # After the transpose every row holds the scores of one motif.
    for i, scores in enumerate(np.array(score_table).transpose()):
        motif = motifs[i]
        min_score = motif.pwm_min_score()
        if len(scores) > 0:
            opt_score = scoreatpercentile(scores, 100 - (100 * args.fdr))
            cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
            print("{0}\t{1}\t{2}".format(motif.id, opt_score, cutoff))
        else:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
def command_scan(inputfile, pwmfile, nreport=1, fpr=0.01, cutoff=None,
                 bed=False, scan_rc=True, table=False, score_table=False,
                 moods=False, pvalue=None, bgfile=None, genome=None,
                 ncpus=None, normalize=False):
    """Scan the input with motifs and yield formatted output rows.

    The scanner is configured with the motif file, optionally a genome and a
    background, and (unless a score table is requested) a threshold derived
    from ``fpr`` / ``cutoff``. Rows are then produced by one of three
    helpers: ``scan_table`` when ``table`` is set, ``scan_score_table`` when
    ``score_table`` is set, and ``scan_normal`` otherwise. ``table`` takes
    precedence over ``score_table``.
    """
    motifs = read_motifs(pwmfile)
    fa = as_fasta(inputfile, genome)

    # Set up the scanner.
    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pwmfile)

    if genome:
        scanner.set_genome(genome=genome)

    if genome or bgfile:
        scanner.set_background(
            genome=genome, fname=bgfile, length=fa.median_length()
        )

    if not score_table:
        scanner.set_threshold(fpr=fpr, threshold=cutoff)

    # Pick the row producer for the requested output format.
    if table:
        rows = scan_table(
            scanner, inputfile, fa, motifs, cutoff, bgfile,
            nreport, scan_rc, pvalue, moods,
        )
    elif score_table:
        rows = scan_score_table(scanner, fa, motifs, scan_rc, normalize=normalize)
    else:
        rows = scan_normal(
            scanner, inputfile, fa, motifs, cutoff, bgfile,
            nreport, scan_rc, pvalue, moods, bed, normalize=normalize,
        )

    for line in rows:
        yield line
def roc(args):
    """Calculate ROC_AUC and other metrics and optionally plot ROC curve.

    Parameters
    ----------
    args : object
        Must provide ``pwmfile``, ``sample`` (foreground file),
        ``background`` (background file), ``outfile`` (image name or a falsy
        value to skip plotting) and ``ids`` (comma-separated motif ids, or a
        falsy value to report all motifs).

    Notes
    -----
    Prints one tab-separated line per motif. Single-argument ``print(...)``
    calls are used so the function runs on both Python 2 and Python 3.
    """
    pwmfile = args.pwmfile
    fg_file = args.sample
    bg_file = args.background
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"

    # The motifs are iterated several times below, so read_motifs must hand
    # back a materialized collection; the handle can be closed right away.
    with open(pwmfile) as handle:
        motifs = read_motifs(handle, fmt="pwm")

    s = Scanner()
    s.set_motifs(pwmfile)

    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = [m.id for m in motifs]

    # Best score per motif for every foreground / background sequence.
    fg_total = {m.id: [] for m in motifs}
    for scores in s.best_score(fg_file):
        for motif, score in zip(motifs, scores):
            fg_total[motif.id].append(score)

    bg_total = {m.id: [] for m in motifs}
    for scores in s.best_score(bg_file):
        for motif, score in zip(motifs, scores):
            bg_total[motif.id].append(score)

    plot_x = []
    plot_y = []
    # Print the metrics
    print("Motif\tROC AUC\tMNCP\tEnr. at 5% FDR\tMax enr.\tRecall at 10% FDR")
    for motif_id in ids:
        fg_vals = fg_total[motif_id]
        bg_vals = bg_total[motif_id]
        (x, y) = ROC_values(fg_vals, bg_vals)
        plot_x.append(x)
        plot_y.append(y)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        enr_fdr = enr_at_fdr(fg_vals, bg_vals)
        # Only the enrichment is reported; the score it occurs at is unused.
        max_enr, _ = max_enrichment(fg_vals, bg_vals)
        recall = recall_at_fdr(fg_vals, bg_vals, 0.1)
        # NOTE(review): "%03f" (width 3, default precision) looks like a typo
        # for "%0.3f"; left unchanged so the output format stays the same.
        print("%s\t%0.3f\t%03f\t%0.2f\t%0.2f\t%0.4f" % (
            motif_id, auc, mncp, enr_fdr, max_enr, recall))

    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=ids)
def roc(args):
    """Calculate ROC_AUC and other metrics and optionally plot ROC curve.

    Parameters
    ----------
    args : object
        Must provide ``pwmfile``, ``sample`` (foreground file),
        ``background`` (background file), ``outfile`` (image name or a falsy
        value to skip plotting) and ``ids`` (comma-separated motif ids, or a
        falsy value to report all motifs).

    Notes
    -----
    Prints one tab-separated line per motif. Single-argument ``print(...)``
    calls are used so the function runs on both Python 2 and Python 3.
    """
    pwmfile = args.pwmfile
    fg_file = args.sample
    bg_file = args.background
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"

    # The motifs are iterated several times below, so read_motifs must hand
    # back a materialized collection; the handle can be closed right away.
    with open(pwmfile) as handle:
        motifs = read_motifs(handle, fmt="pwm")

    s = Scanner()
    s.set_motifs(pwmfile)

    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = [m.id for m in motifs]

    # Best score per motif for every foreground / background sequence.
    fg_total = {m.id: [] for m in motifs}
    for scores in s.best_score(fg_file):
        for motif, score in zip(motifs, scores):
            fg_total[motif.id].append(score)

    bg_total = {m.id: [] for m in motifs}
    for scores in s.best_score(bg_file):
        for motif, score in zip(motifs, scores):
            bg_total[motif.id].append(score)

    plot_x = []
    plot_y = []
    # Print the metrics
    print("Motif\tROC AUC\tMNCP\tEnr. at 5% FDR\tMax enr.")
    for motif_id in ids:
        fg_vals = fg_total[motif_id]
        bg_vals = bg_total[motif_id]
        (x, y) = ROC_values(fg_vals, bg_vals)
        plot_x.append(x)
        plot_y.append(y)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        enr_fdr = enr_at_fdr(fg_vals, bg_vals)
        # Only the enrichment is reported; the score it occurs at is unused.
        max_enr, _ = max_enrichment(fg_vals, bg_vals)
        # NOTE(review): "%03f" (width 3, default precision) looks like a typo
        # for "%0.3f"; left unchanged so the output format stays the same.
        print("%s\t%0.3f\t%03f\t%0.2f\t%0.2f" % (
            motif_id, auc, mncp, enr_fdr, max_enr))

    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=ids)
def get_PWMScore(self, fin_regions_fa):
    """Scan motifs in every peak and write the scores to a temporary file.

    Sequences are scanned in chunks of 10,000. For each chunk the best
    GC-normalized z-score of every motif in every sequence is collected,
    a rank-based min-max scaled column ("zscoreRank") is added per chunk,
    and the result is appended as tab-separated text.

    Arguments:
        fin_regions_fa {str} -- input FASTA file (anything accepted by
            ``as_fasta`` together with ``self.genome``)

    Returns:
        str -- name of the temporary score file; it is created with
            ``delete=False``, so removing it is up to the caller.
    """
    seqs = [s.split(" ")[0] for s in as_fasta(fin_regions_fa, genome=self.genome).ids]

    s = Scanner(ncpus=self.ncore)
    s.set_motifs(self.pfmfile)
    s.set_threshold(threshold=0.0)
    s.set_genome(self.genome)

    with open(self.pfmfile) as f:
        motifs = read_motifs(f)

    chunksize = 10000  # Run 10k peaks one time.
    cols = ["enhancer", "zscore", "zscoreRank"]

    # The context manager guarantees the file is flushed and closed before
    # its name is handed to the caller, instead of relying on garbage
    # collection of the file object.
    with NamedTemporaryFile(
        mode="w", dir=mytmpdir(), delete=False
    ) as pfmscorefile, tqdm(total=len(seqs)) as pbar:
        for chunk in range(0, len(seqs), chunksize):
            chunk_seqs = seqs[chunk : chunk + chunksize]
            pfm_score = []
            # GC-normalization is used for the motif scan because many
            # sequences are GC-enriched.
            it = s.best_score(chunk_seqs, zscore=True, gc=True)
            for seq, scores in zip(chunk_seqs, it):
                for motif, score in zip(motifs, scores):
                    pfm_score.append([motif.id, seq, score])
                pbar.update(1)

            pfm_score = pd.DataFrame(pfm_score, columns=["motif", "enhancer", "zscore"])
            pfm_score = pfm_score.set_index("motif")

            # Rank and min-max normalization, as used when the model was built.
            pfm_score["zscoreRank"] = minmax_scale(rankdata(pfm_score["zscore"]))

            # Only the first chunk writes the header line.
            pfm_score[cols].to_csv(pfmscorefile, sep="\t", header=(chunk == 0))

    return pfmscorefile.name
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None, ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. A name ending in "feather" is read as a
        feather file (regions in the first column); anything else is read as
        a tab-separated table with the regions as index and "#" comments.
    genome : str
        Genome passed to the scanner.
    data_dir : str
        Not used by this function; kept in the signature for callers.
    scoring : str
        "count" for match counts at the module-level ``FPR``; any other
        value gives best scores.
    pwmfile : str, optional
        Motif file. When omitted, the "motif_db" entry of the default
        parameters is looked up in the configured motif directory.
    ncpus : int, optional
        Passed to the scanner.

    Returns
    -------
    pandas.DataFrame
        Regions as index, motif ids as columns.

    Raises
    ------
    ValueError
        If no ``pwmfile`` is given and no default database is configured.
    """
    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            # Only the configured default is resolved against the motif dir.
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)

    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR, genome=genome)
        logger.info("creating count table")
        scores = list(s.count(regions))
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        scores = list(s.best_score(regions))
        logger.info("done")

    # Read the ids inside the context manager so the handle is always closed.
    with open(pwmfile) as handle:
        motif_names = [m.id for m in read_motifs(handle)]

    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def get_motif_scores(fa, motifs):
    """Yield ``(sequence id, motif id, rescaled score)`` for the best matches.

    Every sequence in ``fa.seqfn`` is scanned with ``motifs`` (threshold 0.0,
    one reported match per motif). Negative scores are rescaled from
    ``[motif.min_score, 0]`` to ``[0, 50]``; non-negative scores from
    ``[0, motif.max_score]`` to ``[50, 100]``.
    """
    scanner = Scanner()
    scanner.set_motifs(motifs)
    scanner.set_threshold(threshold=0.0)

    seqs = Fasta(fa.seqfn)
    for seq_idx, per_motif in enumerate(scanner.scan(seqs, nreport=1)):
        intron_id = seqs.ids[seq_idx]
        for motif_idx, matches in enumerate(per_motif):
            motif = motifs[motif_idx]
            for score, pos, strand in matches:
                # Pick the source and target interval by the sign of the score.
                source, target = (
                    ([motif.min_score, 0], [0, 50])
                    if score < 0
                    else ([0, motif.max_score], [50, 100])
                )
                score_rescaled = rescale(score, orig_range=source, new_range=target)
                yield (intron_id, motif.id, score_rescaled)
def threshold(args):
    """Calculate motif score threshold for a given FPR.

    Prints a tab-separated table with, per motif, the score threshold set by
    the scanner for ``args.fpr`` on ``args.inputfile`` and that score
    rescaled between the motif's minimum and maximum PWM score. When the
    scanner has no threshold for a motif, the maximum PWM score is used.
    Exits with status 1 when ``args.fpr`` is outside [0, 1].
    """
    if args.fpr > 1 or args.fpr < 0:
        print("Please specify a FPR between 0 and 1")
        sys.exit(1)

    motifs = read_motifs(args.pwmfile)

    scanner = Scanner()
    scanner.set_motifs(args.pwmfile)
    scanner.set_threshold(args.fpr, filename=args.inputfile)

    print("Motif\tScore\tCutoff")
    for motif in motifs:
        lowest = motif.pwm_min_score()
        highest = motif.pwm_max_score()
        best = scanner.threshold[motif.id]
        if best is None:
            best = motif.pwm_max_score()
        # Position of the score within the motif's [min, max] score range.
        relative = (best - lowest) / (highest - lowest)
        print("{0}\t{1}\t{2}".format(motif.id, best, relative))
def scan(self, background_length=200, fpr=0.02, n_cpus=-1, verbose=True):
    """
    Scan DNA sequences searching for TF binding motifs.

    The result is stored in ``self.scanned_df``; ``fpr`` and
    ``background_length`` are recorded on the instance.

    Args:
        background_length (int): background length. This is used for the
            calculation of the binding score.
        fpr (float): False positive rate for motif identification.
        n_cpus (int): number of CPUs for parallel calculation.
        verbose (bool): Whether to show a progress bar.
    """
    self.fpr = fpr
    self.background_length = background_length

    print("initiating scanner ...")

    # Step 1: build a scanner with the default motifs, a background for the
    # reference genome and an FPR-based threshold.
    motif_set = default_motifs()
    scanner = Scanner(ncpus=n_cpus)
    scanner.set_motifs(motif_set)
    scanner.set_background(genome=self.ref_genome, length=background_length)
    scanner.set_threshold(fpr=fpr)

    # Step 2: fetch the peak sequences and scan them.
    print("getting DNA sequences ...")
    peak_sequences = peak2fasta(self.all_peaks, self.ref_genome)

    print("scanning motifs ...")
    self.scanned_df = scan_dna_for_motifs(
        scanner, motif_set, peak_sequences, verbose
    )

    self.__addLog("scanMotifs")
def threshold(args):
    """Calculate motif score threshold for a given FPR.

    For every motif in ``args.pwmfile`` a line with the motif id, the score
    threshold determined by the scanner on ``args.inputfile`` and the same
    score expressed relative to the motif's minimum and maximum PWM score is
    printed. A missing scanner threshold falls back to the maximum PWM score.
    The process exits with status 1 when ``args.fpr`` is outside [0, 1].
    """
    if args.fpr > 1 or args.fpr < 0:
        print("Please specify a FPR between 0 and 1")
        sys.exit(1)

    motifs = read_motifs(args.pwmfile)

    scanner = Scanner()
    scanner.set_motifs(args.pwmfile)
    scanner.set_threshold(args.fpr, filename=args.inputfile)

    print("Motif\tScore\tCutoff")
    row_template = "{0}\t{1}\t{2}"
    for motif in motifs:
        score_floor = motif.pwm_min_score()
        score_ceiling = motif.pwm_max_score()
        chosen = scanner.threshold[motif.id]
        if chosen is None:
            chosen = motif.pwm_max_score()
        # Fraction of the way from the minimum to the maximum PWM score.
        fraction = (chosen - score_floor) / (score_ceiling - score_floor)
        print(row_template.format(motif.id, chosen, fraction))
def scan_to_table(input_table, genome, scoring, pwmfile=None, ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score"

    pwmfile : str, optional
        Specify a PFM file for scanning. When omitted, the "motif_db" entry
        of the default parameters is looked up in the configured motif
        directory.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.

    Raises
    ------
    ValueError
        If no ``pwmfile`` is given and no default database is configured.
    """
    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            # Only the configured default is resolved against the motif dir.
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)

    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)
    s.set_background(genome=genome)

    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(s.count(regions))
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        scores = list(s.best_score(regions, normalize=True))
        logger.info("done")

    motif_names = [m.id for m in read_motifs(pwmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def moap(
    inputfile,
    method="hypergeom",
    scoring=None,
    outfile=None,
    motiffile=None,
    pfmfile=None,
    genome=None,
    fpr=0.01,
    ncpus=None,
    subsample=None,
    zscore=True,
    gc=True,
):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso',
        'lightningclassification', 'lightningregressor', 'bayesianridge',
        'rf', 'xgboost'. Default is 'hypergeom'.

    scoring: str, optional
        Either 'score' or 'count'

    outfile : str, optional
        Name of outputfile to save the fitted activity values.

    motiffile : str or pandas.DataFrame, optional
        Table with motif scan results. First column should be exactly the
        same regions as in the inputfile. A DataFrame is used as-is.

    pfmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied

    fpr : float, optional
        FPR for motif scanning

    ncpus : int, optional
        Number of threads to use. Default is the number specified in the
        config.

    subsample : float, optional
        When given, ``int(subsample * number_of_rows)`` rows of the input
        are sampled before fitting.

    zscore : bool, optional
        Use z-score normalized motif scores.

    gc : bool optional
        Use GC% bins for z-score.

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        For an invalid ``scoring`` value, input that does not fit the method
        type, or a missing genome when motifs have to be scanned.
    """
    if scoring and scoring not in ["score", "count"]:
        raise ValueError("valid values are 'score' and 'count'")

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype("object") in set(df.dtypes):
            raise ValueError("columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        pfmfile = pfmfile_location(pfmfile)
        try:
            motifs = read_motifs(pfmfile)
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pfmfile))
            raise

        # initialize scanner
        # NOTE(review): this scanner is not used below (scan_to_table gets
        # only the file name). It is kept because its setup calls may have
        # side effects; confirm before removing.
        s = Scanner(ncpus=ncpus)
        s.set_motifs(pfmfile)
        s.set_genome(genome)
        s.set_background(genome=genome)

        # scan for motifs; the ids come from the motifs parsed above instead
        # of parsing the file a second time.
        motif_names = [m.id for m in motifs]
        if method == "classic" or scoring == "count":
            logger.info("motif scanning (counts)")
            scores = scan_to_table(
                inputfile,
                genome,
                "count",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
        else:
            logger.info("motif scanning (scores)")
            scores = scan_to_table(
                inputfile,
                genome,
                "score",
                pfmfile=pfmfile,
                ncpus=ncpus,
                zscore=zscore,
                gc=gc,
            )
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    elif isinstance(motiffile, pd.DataFrame):
        motifs = motiffile
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    # Reuse an existing result when its shape matches what would be produced.
    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        ncols = df.shape[1]
        if ncols == 1:
            ncols = len(df.iloc[:, 0].unique())

        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            logger.warning("%s output already exists... skipping", method)
            return out

    if subsample is not None:
        n = int(subsample * df.shape[0])
        logger.debug("Subsampling %d regions", n)
        df = df.sample(n)

    motifs = motifs.loc[df.index]

    if method == "lightningregressor":
        # Without an output file the temporary directory is created relative
        # to the current directory instead of crashing on dirname(None).
        outdir = os.path.dirname(outfile) if outfile else ""
        tmpname = os.path.join(outdir, ".lightning.tmp")
        clf.fit(motifs, df, tmpdir=tmpname)
        shutil.rmtree(tmpname)
    else:
        clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write("# maelstrom - GimmeMotifs version {}\n".format(__version__))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if isinstance(motiffile, str):
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan the regions of an input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of the input table: a feather file (name ending in
        "feather", regions in the first column) or a tab-separated text file
        with the regions as index.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score"

    pfmfile : str, optional
        PFM file used for scanning. Defaults to the configured "motif_db".

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    zscore : bool, optional
        Passed to the scanner's ``best_score`` when scores are requested.

    gc : bool, optional
        Passed to the background setup and to ``best_score``.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
    """
    config = MotifConfig()
    if pfmfile is None:
        default_db = config.get_default_params().get("motif_db", None)
        if default_db is not None:
            default_db = os.path.join(config.get_motif_dir(), default_db)
        pfmfile = default_db

    if pfmfile is None:
        raise ValueError("no pfmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)

    # The background size is the median length of (a sample of at most 1000
    # of) the input regions.
    if len(regions) < 1000:
        check_regions = regions
    else:
        check_regions = np.random.choice(regions, size=1000, replace=False)
    sampled = as_fasta(check_regions, genome=genome)
    lengths = [len(seq) for seq in sampled.seqs]
    size = int(np.median(lengths))

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pfmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome, gc=gc, size=size)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(scanner.count(regions))
        logger.info("done")
    else:
        scanner.set_threshold(threshold=0.0)
        if not zscore:
            msg = "creating score table (logodds)"
        elif gc:
            msg = "creating score table (z-score, GC%)"
        else:
            msg = "creating score table (z-score)"
        logger.info(msg)
        scores = list(scanner.best_score(regions, zscore=zscore, gc=gc))
        logger.info("done")

    motif_names = [motif.id for motif in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def diff(args):
    """Plot differential motif frequencies between input files.

    Parameters
    ----------
    args : object
        Must provide ``inputfiles`` (comma-separated FASTA files, or a single
        BED file whose fifth column names the cluster of each region),
        ``bgfile``, ``outputfile``, ``pwmfile``, ``cutoff``, ``genome``
        (required for BED input), ``minenr`` and ``minfreq``.

    Notes
    -----
    Exits with status 1 when BED input is given without a genome. The
    temporary directory used for the per-cluster files is removed on every
    exit path.
    """
    infiles = args.inputfiles.split(",")
    bgfile = args.bgfile
    outfile = args.outputfile
    pwmfile = args.pwmfile
    cutoff = args.cutoff
    genome = args.genome
    minenr = float(args.minenr)
    minfreq = float(args.minfreq)

    tmpdir = mkdtemp()
    try:
        # Retrieve FASTA clusters from BED file
        if len(infiles) == 1 and infiles[0].endswith("bed"):
            if not args.genome:
                sys.stderr.write("Can't convert BED file without genome!\n")
                sys.exit(1)

            clusters = {}
            with open(infiles[0]) as bed:
                for line in bed:
                    vals = line.strip().split("\t")
                    clusters.setdefault(vals[4], []).append(vals[:3])

            infiles = []
            for cluster, regions in clusters.items():
                sys.stderr.write("Creating FASTA file for {0}\n".format(cluster))
                inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
                outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
                with open(inbed, "w") as f:
                    for vals in regions:
                        f.write("{0}\t{1}\t{2}\n".format(*vals))
                Genome(genome).track2fasta(inbed, outfa)
                infiles.append(outfa)

        pwms = {m.id: m for m in pwmfile_to_motifs(pwmfile)}
        # NOTE(review): the counts below come back in scanner order while
        # this list follows dict key order; presumably both match the motif
        # file order -- confirm on interpreters without ordered dicts.
        motifs = list(pwms)
        names = [os.path.basename(os.path.splitext(fname)[0]) for fname in infiles]

        s = Scanner()
        s.set_motifs(pwmfile)
        s.set_threshold(threshold=cutoff)

        # Get background frequencies (0.01 is a pseudocount)
        nbg = float(len(Fasta(bgfile).seqs))
        bgcounts = s.total_count(bgfile, nreport=1)
        bgfreq = [(c + 0.01) / nbg for c in bgcounts]

        # Get frequencies in input files
        freq = {}
        counts = {}
        for fname in infiles:
            mcounts = s.total_count(fname, nreport=1)
            n = float(len(Fasta(fname).seqs))
            counts[fname] = mcounts
            freq[fname] = [(c + 0.01) / n for c in mcounts]

        # Rows become motifs, columns become input files.
        freq = np.array([freq[fname] for fname in infiles]).transpose()
        counts = np.array([counts[fname] for fname in infiles]).transpose()

        diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile,
                  minenr=minenr, minfreq=minfreq)
    finally:
        shutil.rmtree(tmpdir)
def command_scan(inputfile, pwmfile, nreport=1, cutoff=0.9, bed=False,
                 scan_rc=True, table=False, score_table=False, moods=False,
                 pvalue=None, bgfile=None, genome=None):
    """Scan the input with motifs and yield formatted output lines.

    Three output modes exist: a counts table (``table``), a best-score table
    (``score_table``) and, by default, one line per match produced by
    ``format_line``. ``table`` takes precedence over ``score_table``. With
    ``moods`` the matches / counts come from ``scan_it_moods`` instead of
    the scanner.
    """
    motifs = pwmfile_to_motifs(pwmfile)

    index_dir = (
        os.path.join(MotifConfig().get_index_dir(), genome)
        if genome is not None
        else None
    )

    # initialize scanner
    scanner = Scanner()
    scanner.set_motifs(pwmfile)

    fa = as_fasta(inputfile, index_dir)

    # The match iterator is always created up front, exactly as before, even
    # though the table modes replace it.
    if moods:
        matches_it = scan_it_moods(inputfile, motifs, cutoff, bgfile,
                                   nreport, scan_rc, pvalue, table)
    else:
        matches_it = scanner.scan(fa, nreport, scan_rc, cutoff)

    if table:
        # header
        yield "\t{}".format("\t".join([m.id for m in motifs]))

        if moods:
            matches_it = scan_it_moods(inputfile, motifs, cutoff, bgfile,
                                       nreport, scan_rc, pvalue, table)
            for seq_id, counts in matches_it:
                yield "{}\t{}".format(seq_id, "\t".join([str(c) for c in counts]))
        else:
            # counts table
            matches_it = scanner.count(fa, nreport, scan_rc, cutoff)
            for idx, counts in enumerate(matches_it):
                yield "{}\t{}".format(
                    fa.ids[idx], "\t".join([str(c) for c in counts])
                )
        return

    if score_table:
        matches_it = scanner.best_score(fa, scan_rc)
        # header
        yield "\t{}".format("\t".join([m.id for m in motifs]))
        # score table
        for idx, scores in enumerate(matches_it):
            yield "{}\t{}".format(
                fa.ids[idx], "\t".join([str(v) for v in scores])
            )
        return

    if moods:
        for motif, per_seq in matches_it:
            for seq_id, matches in per_seq.items():
                for pos, score, strand in matches:
                    yield format_line(fa, seq_id, motif, score, pos, strand, bed=bed)
    else:
        for idx, result in enumerate(matches_it):
            seq_id = fa.ids[idx]
            for motif, matches in zip(motifs, result):
                for (score, pos, strand) in matches:
                    yield format_line(fa, seq_id, motif, score, pos, strand, bed=bed)
def moap(inputfile, method="classic", scoring="score", outfile=None,
         motiffile=None, pwmfile=None, genome=None, cutoff=0.95):
    """ Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. Any of 'classic', 'ks', 'lasso',
        'lightning', 'mara', 'rf'. Default is 'classic'.

    scoring: str, optional
        Either 'score' or 'count'

    outfile : str, optional
        Name of outputfile to save the fitted activity values.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the
        same regions as in the inputfile.

    pwmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied

    cutoff : float, optional
        Cutoff for motif scanning

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        For an invalid ``scoring`` or ``method``, input that does not fit
        the method, a missing genome, or a missing motif file.
    """
    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError("no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        # Parse the motif file once, closing the handle afterwards; only the
        # ids are needed here.
        try:
            with open(pwmfile) as handle:
                motif_names = [m.id for m in read_motifs(handle)]
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        if method == 'classic' or scoring == "count":
            scores = list(s.count(list(df.index), cutoff=cutoff))
        else:
            scores = list(s.best_score(list(df.index)))

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)

    # Align the motif table with the regions of the input.
    motifs = motifs.loc[df.index]

    clf = None
    if method == "ks":
        clf = KSMoap()
    elif method == "mwu":
        clf = MWMoap()
    elif method == "rf":
        clf = RFMoap()
    elif method == "lasso":
        clf = LassoMoap()
    elif method == "lightning":
        clf = LightningMoap()
    elif method == "mara":
        clf = MaraMoap()
    elif method == "more":
        clf = MoreMoap()
    elif method == "classic":
        clf = ClassicMoap()

    clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write("# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
def diff(args):
    """Plot differential motif frequencies between input files.

    Parameters
    ----------
    args : object
        Must provide ``inputfiles`` (comma-separated FASTA files, or a single
        BED file whose fifth column names the cluster of each region),
        ``bgfile``, ``outputfile``, ``pwmfile``, ``cutoff``, ``genome``
        (required for BED input), ``minenr`` and ``minfreq``.

    Notes
    -----
    Exits with status 1 when BED input is given without a genome. The
    temporary directory used for the per-cluster files is removed on every
    exit path.
    """
    infiles = args.inputfiles.split(",")
    bgfile = args.bgfile
    outfile = args.outputfile
    pwmfile = args.pwmfile
    cutoff = args.cutoff
    genome = args.genome
    minenr = float(args.minenr)
    minfreq = float(args.minfreq)

    tmpdir = mkdtemp()
    try:
        # Retrieve FASTA clusters from BED file
        if len(infiles) == 1 and infiles[0].endswith("bed"):
            if not args.genome:
                sys.stderr.write("Can't convert BED file without genome!\n")
                sys.exit(1)

            clusters = {}
            with open(infiles[0]) as bed:
                for line in bed:
                    vals = line.strip().split("\t")
                    clusters.setdefault(vals[4], []).append(vals[:3])

            infiles = []
            for cluster, regions in clusters.items():
                sys.stderr.write("Creating FASTA file for {0}\n".format(cluster))
                inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
                outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
                with open(inbed, "w") as f:
                    for vals in regions:
                        f.write("{0}\t{1}\t{2}\n".format(*vals))
                Genome(genome).track2fasta(inbed, outfa)
                infiles.append(outfa)

        pwms = {m.id: m for m in pwmfile_to_motifs(pwmfile)}
        # NOTE(review): the counts below come back in scanner order while
        # this list follows dict key order; presumably both match the motif
        # file order -- confirm on interpreters without ordered dicts.
        motifs = list(pwms)
        names = [os.path.basename(os.path.splitext(fname)[0]) for fname in infiles]

        s = Scanner()
        s.set_motifs(pwmfile)
        s.set_threshold(threshold=cutoff)

        # Get background frequencies (0.01 is a pseudocount)
        nbg = float(len(Fasta(bgfile).seqs))
        bgcounts = s.total_count(bgfile, nreport=1)
        bgfreq = [(c + 0.01) / nbg for c in bgcounts]

        # Get frequencies in input files
        freq = {}
        counts = {}
        for fname in infiles:
            mcounts = s.total_count(fname, nreport=1)
            n = float(len(Fasta(fname).seqs))
            counts[fname] = mcounts
            freq[fname] = [(c + 0.01) / n for c in mcounts]

        # Rows become motifs, columns become input files.
        freq = np.array([freq[fname] for fname in infiles]).transpose()
        counts = np.array([counts[fname] for fname in infiles]).transpose()

        diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile,
                  minenr=minenr, minfreq=minfreq)
    finally:
        shutil.rmtree(tmpdir)
def calc_stats_iterator(
    fg_file=None,
    bg_file=None,
    fg_table=None,
    bg_table=None,
    motifs=None,
    stats=None,
    genome=None,
    zscore=True,
    gc=True,
    ncpus=None,
):
    """Calculate motif enrichment metrics.

    Parameters
    ----------
    fg_file : str, optional
        Filename of a FASTA, BED or region file with positive sequences.

    bg_file : str, optional
        Filename of a FASTA, BED or region file with negative sequences.

    fg_table : str, optional
        Filename of a table with motif scan results of positive sequences.

    bg_table : str, optional
        Filename of a table with motif scan results of negative sequences.

    motifs : str, list or Motif instance, optional
        A file with motifs in pfm format, a list of Motif instances or a
        single Motif instance. If motifs is `None`, the default motif
        database is used.

    genome : str, optional
        Genome or index directory in case of BED/regions.

    stats : list, optional
        Names of metrics to calculate. See gimmemotifs.rocmetrics.__all__
        for available metrics.

    zscore : bool, optional
        Passed to the motif scan of ``fg_file`` / ``bg_file``.

    gc : bool, optional
        Passed to the motif scan and to the scanner's mean/stddev
        precalculation.

    ncpus : int, optional
        Number of cores to use.

    Yields
    ------
    result : dict
        One dictionary per chunk of 240 motifs, where keys are motif ids and
        the values are dictionaries with metric name and value pairs.

    Raises
    ------
    ValueError
        When, for foreground or background, neither or both of the file and
        the table are given.
    """
    if not stats:
        stats = rocmetrics.__all__

    if fg_table is None:
        if fg_file is None:
            raise ValueError("Need either fg_table or fg_file argument")
    elif fg_file is not None:
        raise ValueError("Need either fg_table or fg_file argument, not both")

    if bg_table is None:
        if bg_file is None:
            raise ValueError("Need either bg_table or bg_file argument")
    elif bg_file is not None:
        raise ValueError("Need either bg_table or bg_file argument, not both")

    # Tables that were actually supplied (foreground and background may mix
    # a file on one side with a table on the other).
    tables = [table for table in (fg_table, bg_table) if table is not None]

    if tables:
        # Score tables carry no match positions.
        remove_stats = [
            name for name in stats
            if getattr(rocmetrics, name).input_type == "pos"
        ]
        if remove_stats:
            logger.warning(
                "Cannot calculate stats that require position from table of motif scores."
            )
            logger.warning(
                "Skipping the following statistics: %s", ", ".join(remove_stats)
            )
            stats = [name for name in stats if name not in remove_stats]

    if isinstance(motifs, Motif):
        all_motifs = [motifs]
    elif isinstance(motifs, list):
        all_motifs = motifs
    else:
        motifs = pfmfile_location(motifs)
        all_motifs = read_motifs(motifs, fmt="pwm")

    if tables:
        # Keep only motifs that occur in every supplied table. Reading the
        # header of a table that was not given would fail, so only the
        # supplied ones are consulted.
        shared_ids = None
        for table in tables:
            columns = pd.read_csv(
                table, sep="\t", index_col=0, nrows=1, comment="#"
            ).columns
            if shared_ids is None:
                shared_ids = columns
            else:
                shared_ids = shared_ids.intersection(columns)
        all_motifs = [m for m in all_motifs if m.id in shared_ids]

    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params()["ncpus"])

    if fg_file is not None or bg_file is not None:
        if zscore or gc:
            # Precalculate mean and stddev for z-score calculation
            scanner = Scanner(ncpus=ncpus)
            scanner.set_motifs(all_motifs)
            scanner.set_genome(genome)
            scanner.set_meanstd(gc=gc)

    chunksize = 240
    # Ceiling division, so an exact multiple does not report an extra chunk.
    nchunks = (len(all_motifs) + chunksize - 1) // chunksize
    for i in range(0, len(all_motifs), chunksize):
        result = {}
        logger.debug("chunk %s of %s", i // chunksize + 1, nchunks)
        motifs = all_motifs[i : i + chunksize]

        if fg_table is None:
            fg_total = scan_to_best_match(
                fg_file, motifs, ncpus=ncpus, genome=genome, zscore=zscore, gc=gc
            )
        else:
            fg_total = pd.read_csv(
                fg_table, sep="\t", usecols=[m.id for m in motifs], comment="#"
            ).to_dict(orient="list")
            # Tables hold scores only; pair each with a missing position.
            for m in fg_total:
                fg_total[m] = [(x, None) for x in fg_total[m]]

        if bg_table is None:
            bg_total = scan_to_best_match(
                bg_file, motifs, ncpus=ncpus, genome=genome, zscore=zscore, gc=gc
            )
        else:
            bg_total = pd.read_csv(
                bg_table, sep="\t", usecols=[m.id for m in motifs], comment="#"
            ).to_dict(orient="list")
            for m in bg_total:
                bg_total[m] = [(x, None) for x in bg_total[m]]

        logger.debug("calculating statistics")

        if ncpus == 1:
            it = _single_stats(motifs, stats, fg_total, bg_total)
        else:
            it = _mp_stats(motifs, stats, fg_total, bg_total, ncpus)

        for motif_id, stat_name, ret in it:
            result.setdefault(motif_id, {})[stat_name] = ret
        yield result
def moap(inputfile, method="hypergeom", scoring=None, outfile=None,
         motiffile=None, pwmfile=None, genome=None, fpr=0.01, ncpus=None):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values. A name ending in
        "feather" is read with ``pd.read_feather``; anything else is read as
        a tab-separated table with ``#`` comments.

    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso',
        'lightningclassification', 'lightningregressor', 'bayesianridge',
        'rf', 'xgboost'. Default is 'hypergeom'.

    scoring : str, optional
        Either 'score' or 'count'.

    outfile : str, optional
        Name of outputfile to save the fitted activity values. If this file
        already exists and its shape matches the expected result, it is
        returned as-is and nothing is recomputed.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the
        same regions as in the inputfile.

    pwmfile : str, optional
        File with motifs in pwm format. Only used when motiffile is not
        supplied; falls back to the "motif_db" entry of the config.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied.

    fpr : float, optional
        FPR for motif scanning.

    ncpus : int, optional
        Number of threads to use. Default is the number specified in the
        config.

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        On an invalid ``scoring`` value, an input table that does not fit the
        method type, a missing genome, or a missing/non-existent pwmfile.
    """
    if scoring and scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError(
                "no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        # Read the motif ids once, closing the handle deterministically.
        try:
            with open(pwmfile) as handle:
                motif_names = [m.id for m in read_motifs(handle)]
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner(ncpus=ncpus)
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        regions = list(df.index)
        if method == 'classic' or scoring == "count":
            s.set_threshold(fpr=fpr)
            scores = list(s.count(regions))
        else:
            scores = list(s.best_score(regions))

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        ncols = df.shape[1]
        if ncols == 1:
            # A single column holds cluster labels: one output column per
            # distinct label.
            ncols = len(df.iloc[:, 0].unique())

        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            logger.warning("%s output already exists... skipping", method)
            return out

    motifs = motifs.loc[df.index]

    if method == "lightningregressor":
        # outfile may be None; dirname("") semantics then place the scratch
        # directory in the current working directory.
        outdir = os.path.dirname(outfile) if outfile else ""
        tmpname = os.path.join(outdir, ".lightning.tmp")
        try:
            clf.fit(motifs, df, tmpdir=tmpname)
        finally:
            # Clean up even when fitting fails, without masking that error
            # if the directory was never created.
            if os.path.isdir(tmpname):
                shutil.rmtree(tmpname)
    else:
        clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
def moap(inputfile, method="classic", scoring="score", outfile=None,
         motiffile=None, pwmfile=None, genome=None, cutoff=0.95):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. Any of 'classic', 'ks', 'mwu', 'lasso',
        'lightning', 'mara', 'more', 'rf'. Default is 'classic'.

    scoring : str, optional
        Either 'score' or 'count'.

    outfile : str, optional
        Name of outputfile to save the fitted activity values.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the
        same regions as in the inputfile.

    pwmfile : str, optional
        File with motifs in pwm format. Only used when motiffile is not
        supplied; falls back to the "motif_db" entry of the config.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied.

    cutoff : float, optional
        Cutoff for motif scanning.

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        On an invalid ``scoring`` or ``method`` value, an input table that
        does not fit the method type, a missing genome, or a
        missing/non-existent pwmfile.
    """
    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    # Lazy factories: a class name is only resolved for the chosen method.
    factories = {
        "ks": lambda: KSMoap(),
        "mwu": lambda: MWMoap(),
        "rf": lambda: RFMoap(),
        "lasso": lambda: LassoMoap(),
        "lightning": lambda: LightningMoap(),
        "mara": lambda: MaraMoap(),
        "more": lambda: MoreMoap(),
        "classic": lambda: ClassicMoap(),
    }
    # Fail before the (slow) scan rather than with an AttributeError on None
    # after it.
    if method not in factories:
        raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError(
                "no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        # Read the motif ids once, closing the handle deterministically.
        try:
            with open(pwmfile) as handle:
                motif_names = [m.id for m in read_motifs(handle)]
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        regions = list(df.index)
        if method == 'classic' or scoring == "count":
            scores = list(s.count(regions, cutoff=cutoff))
        else:
            scores = list(s.best_score(regions))

        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)

    clf = factories[method]()
    clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
def scan(self, background_length=200, fpr=0.02, n_cpus=-1, verbose=True,
         motifs=None, TF_evidence_level="direct_and_indirect",
         TF_formatting="auto"):
    """
    Scan DNA sequences searching for TF binding motifs.

    Args:
        background_length (int): background length. This is used for the calculation of the binding score.

        fpr (float): False positive rate for motif identification.

        n_cpus (int): number of CPUs for parallel calculation.

        verbose (bool): Whether to show a progress bar.

        motifs (list): a list of gimmemotifs motifs, will revert to default_motifs() if None

        TF_evidence_level (str): Please select one from ["direct", "direct_and_indirect"]. If "direct" is selected, TFs that have a binding evidence were used.
            If "direct_and_indirect" is selected, TFs with binding evidence and inferred TFs are used.
            For more information, please read explanation of Motif class in gimmemotifs documentation (https://gimmemotifs.readthedocs.io/en/master/index.html)

        TF_formatting ("auto" or other): Only used when custom motifs are given: "auto" stores False in
            self.TF_formatting, any other value is stored as is. With default motifs the value is ignored
            (True for Mouse/Human/Rat, False for the other supported species).

    Raises:
        ValueError: if motifs is None and there is no default for self.species, or if motifs is not a
            non-empty list whose first element is a Motif.
    """

    self.fpr = fpr
    self.background_length = background_length

    ## 1. initialize scanner ##
    # Species that use the gimmemotifs vertebrate database by default.
    vertebrate_species = ("Mouse", "Human", "Rat")
    # Species that use a CisBP database by default.
    cisbp_db_by_species = {
        "Zebrafish": 'CisBP_ver2_Danio_rerio.pfm',
        "S.cerevisiae": 'CisBP_ver2_Saccharomyces_cerevisiae.pfm',
        "Xenopus": 'CisBP_ver2_Xenopus_tropicalis_and_Xenopus_laevis.pfm',
        "Drosophila": 'CisBP_ver2_Drosophila_mix.pfm',
        "C.elegans": 'CisBP_ver2_Caenorhabditis_elegans.pfm',
        "Arabidopsis": 'CisBP_ver2_Arabidopsis_thaliana.pfm',
    }

    # load motif
    if motifs is None:
        if verbose:
            print(
                "No motif data entered. Loading default motifs for your species ..."
            )

        if self.species in vertebrate_species:
            motifs = default_motifs()
            self.motif_db_name = "gimme.vertebrate.v5.0"
            self.TF_formatting = True
            if verbose:
                print(
                    " Default motif for vertebrate: gimme.vertebrate.v5.0. \n For more information, please go https://gimmemotifs.readthedocs.io/en/master/overview.html \n"
                )
        elif self.species in cisbp_db_by_species:
            self.motif_db_name = cisbp_db_by_species[self.species]
            motifs = load_motifs(self.motif_db_name)
            self.TF_formatting = False
            if verbose:
                print(
                    f" Default motif for {self.species}: {self.motif_db_name}. \n For more information, please go celloracle documentation. \n"
                )
        else:
            raise ValueError(
                f"We have no default motifs for your species, {self.species}. Please set motifs."
            )
    else:
        # Check format
        if not isinstance(motifs, list):
            raise ValueError(
                "motifs should be a list of Motif object in gimmemotifs.")
        if not motifs:
            # Guard before indexing motifs[0]; an empty list has nothing to scan with.
            raise ValueError(
                "motifs is an empty list. Please provide at least one Motif object.")
        if not isinstance(motifs[0], Motif):
            raise ValueError("Motif data type was invalid.")
        if verbose:
            print("Checking your motifs... Motifs format looks good. \n")

        self.motif_db_name = "custom_motifs"
        if TF_formatting == "auto":
            self.TF_formatting = False
        else:
            self.TF_formatting = TF_formatting

    self.motifs = motifs
    self.dic_motif2TFs = _get_dic_motif2TFs(
        species=self.species,
        motifs=motifs,
        TF_evidence_level=TF_evidence_level,
        formatting=self.TF_formatting)
    self.TF_evidence_level = TF_evidence_level

    # initialize scanner
    if verbose:
        print("Initiating scanner... \n")
    s = Scanner(ncpus=n_cpus)

    # set parameters
    s.set_motifs(motifs)
    try:
        # For gimmemotifs ver 14.4
        s.set_background(genome=self.ref_genome, size=background_length)
    except TypeError:
        # For old gimmemotifs ver 13, where the keyword is `length`. Only a
        # keyword mismatch triggers the fallback; other errors propagate.
        s.set_background(genome=self.ref_genome, length=background_length)

    if verbose:
        print(
            "Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. \n"
        )
    s.set_threshold(fpr=fpr)

    ## 2. motif scan ##
    print("Convert peak info into DNA sequences ... \n")
    # Get DNA sequences
    target_sequences = peak2fasta(self.all_peaks, self.ref_genome)
    # Remove DNA sequence with zero length
    target_sequences = remove_zero_seq(fasta_object=target_sequences)

    print(
        "Scanning motifs ... It may take several hours if you proccess many peaks. \n"
    )
    self.scanned_df = scan_dna_for_motifs(s, motifs, target_sequences, verbose)

    self.__addLog("scanMotifs")