def check_threshold(outdir, genome, scoring="count"):
    """Return the motif scanning threshold file for ``genome``, creating it if needed.

    Parameters
    ----------
    outdir : str
        Directory in which the threshold file (and, if required, the
        background FASTA file) are looked up and created.
    genome : str
        Genome name; used in the file names and to locate the genome index.
    scoring : str, optional
        Only ``"count"`` needs a threshold file.

    Returns
    -------
    str or None
        Path of ``threshold.<genome>.txt`` inside ``outdir`` when ``scoring``
        is ``"count"``, otherwise ``None``.
    """
    # gimme_motifs config, to get defaults
    config = MotifConfig()

    threshold_file = None
    if scoring == "count":
        # Motif scanning threshold
        threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
        if not os.path.exists(threshold_file):
            # Random sequences from genome
            index_dir = os.path.join(config.get_index_dir(), genome)
            bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
            if not os.path.exists(bg_file):
                m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
                m.writefasta(bg_file)

            pwmfile = config.get_default_params().get("motif_db")
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

            # Pass the arguments as a list and redirect stdout ourselves, so
            # that paths containing spaces or shell metacharacters are handled
            # correctly and no shell is involved.
            cmd = ["gimme", "threshold", pwmfile, bg_file, str(FDR)]
            with open(threshold_file, "w") as out:
                sp.call(cmd, stdout=out)
    return threshold_file
def scan_it_moods(infile, motifs, cutoff, bgfile, nreport=1, scan_rc=True,
                  pvalue=None, count=False):
    """Scan the sequences of a FASTA file with MOODS, in parallel.

    This is a generator. The input is split into chunks of sequences, every
    chunk is handed to a worker process, and the items of each worker result
    are yielded in submission order.

    Parameters
    ----------
    infile : str
        FASTA file with the sequences to scan.
    motifs : list
        Motif instances; ``motif.id`` and ``motif.pwm`` are read.
    cutoff : float or str
        Converted with ``float()`` and passed to ``calc_threshold_moods``;
        only used when ``pvalue`` is None.
    bgfile : str
        FASTA file used to estimate the background distribution.
    nreport : int, optional
        Forwarded to the worker function.
    scan_rc : bool, optional
        Forwarded to the worker function.
    pvalue : float or str, optional
        If given, thresholds are derived from this p-value instead of
        ``cutoff``.
    count : bool, optional
        If True, ``scan_fa_with_motif_moods_count`` is used as worker instead
        of ``scan_fa_with_motif_moods``.
    """
    import shutil

    pseudocount = 1e-3
    bg = MOODS.tools.bg_from_sequence_dna("".join(Fasta(bgfile).seqs), 1)

    # The matrices are handed to MOODS via temporary PFM files. They are only
    # needed while parsing, so the directory is removed right afterwards.
    tmpdir = mkdtemp()
    matrices = []
    try:
        for motif in motifs:
            pfmname = os.path.join(tmpdir, "{}.pfm".format(motif.id))
            with open(pfmname, "w") as f:
                matrix = np.array(motif.pwm).transpose()
                for row in matrix:
                    f.write("{}\n".format(" ".join([str(x) for x in row])))
            matrices.append(MOODS.parsers.pfm_log_odds(pfmname, bg, pseudocount))
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    if pvalue is not None:
        thresholds = [
            MOODS.tools.threshold_from_p(m, bg, float(pvalue)) for m in matrices
        ]
    else:
        thresholds = [calc_threshold_moods(m, float(cutoff)) for m in matrices]

    # NOTE(review): this scanner is not used further in this function; it is
    # kept because set_motifs() may validate the matrices/thresholds.
    scanner = MOODS.scan.Scanner(7)
    scanner.set_motifs(matrices, bg, thresholds)

    config = MotifConfig()
    ncpus = int(config.get_default_params()["ncpus"])
    fa = Fasta(infile)

    # Use smaller chunks for small inputs so all CPUs get work. The chunk size
    # must be a positive integer, otherwise range() below fails.
    chunk = 500
    if len(fa) < ncpus * chunk:
        chunk = max(1, len(fa) // (ncpus + 1))

    func = scan_fa_with_motif_moods
    if count:
        func = scan_fa_with_motif_moods_count

    pool = mp.Pool()
    try:
        jobs = [
            pool.apply_async(
                func,
                (fa[i:i + chunk], motifs, matrices, bg, thresholds, nreport, scan_rc),
            )
            for i in range(0, len(fa), chunk)
        ]
        pool.close()

        for job in jobs:
            for ret in job.get():
                yield ret
    finally:
        # Also reached when the consumer abandons the generator early.
        pool.terminate()
        pool.join()
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    """Scan the regions of a table with motifs and return the results as a table.

    Parameters
    ----------
    input_table : str
        Tab-separated file; the first column (the index) holds the regions.
    genome : str
        Genome name, passed to the scanner and to ``check_threshold``.
    data_dir : str
        Directory passed to ``check_threshold`` as output directory.
    scoring : str
        ``"count"`` to count motif occurrences; any other value yields the
        best score per motif.
    pwmfile : str, optional
        Motif file. Defaults to the ``motif_db`` entry of the configuration.

    Returns
    -------
    pandas.DataFrame
        One row per region (same index as the input table), one column per
        motif id.

    Raises
    ------
    ValueError
        If no ``pwmfile`` is given and no default database is configured.
    """
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)

    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        scores = list(s.count(regions, cutoff=threshold))
    else:
        scores = list(s.best_score(regions))

    # Read the motif ids inside a context manager so the file is closed.
    with open(pwmfile) as f:
        motif_names = [m.id for m in read_motifs(f)]

    return pd.DataFrame(scores, index=df.index, columns=motif_names)
def scan(infile, motifs, cutoff, nreport=1, it=False):
    """Scan a FASTA file with motifs using a ``pp`` job server.

    Parameters
    ----------
    infile : str
        FASTA file to scan.
    motifs : list
        Motif instances; ``motif.id`` must be unique as it is used as key.
    cutoff : object
        Passed to ``parse_cutoff`` together with the configured default.
    nreport : int, optional
        Forwarded to ``scan_fa_with_motif``.
    it : bool, optional
        Not used by this function.

    Returns
    -------
    dict
        Maps every motif from ``motifs`` to the merged result dictionaries of
        all its chunks.
    """
    # Get configuration defaults
    config = MotifConfig()
    # Cutoff for motif scanning, only used if a cutoff is not supplied
    default_cutoff = config.get_default_params()['scan_cutoff']
    # Number of CPUs to use. Converted to int, as elsewhere in this module,
    # so the comparison with the server's CPU count is numeric.
    ncpus = int(config.get_default_params()['ncpus'])

    cutoffs = parse_cutoff(motifs, cutoff, default_cutoff)

    job_server = pp.Server(secret="beetrootsoup")
    if job_server.get_ncpus() > ncpus:
        job_server.set_ncpus(ncpus)

    total_result = {}
    jobs = []
    fa = Fasta(infile)
    for motif in motifs:
        for i in range(0, len(fa), CHUNK):
            total_result[motif] = {}
            jobs.append(job_server.submit(
                scan_fa_with_motif,
                (fa[i:i + CHUNK], motif, cutoffs[motif.id], nreport, ),
                (), ()))

    # The motif returned by a job is looked up by id to map it back to the
    # instance that was passed in.
    motifkey = dict([(m.id, m) for m in motifs])
    for job in jobs:
        motif, result = job()
        total_result[motifkey[motif.id]].update(result)

    return total_result
def get_genome(genomebuild, fastadir, indexdir=None):
    """Download a genome with its annotation and create the index.

    Parameters
    ----------
    genomebuild : str
        Name of the genome build; used as sub-directory name and as base name
        of the gene BED file.
    fastadir : str
        Directory below which the genome FASTA directory is created.
    indexdir : str, optional
        Directory below which the index is created. Defaults to the index
        directory from the configuration.

    Notes
    -----
    Exits the process with status 1 if the genome directory cannot be
    created.
    """
    config = MotifConfig()
    indexdir = indexdir or config.get_index_dir()

    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(indexdir, genomebuild)

    # Make sure the target directory exists (and can be written to)
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Annotation first ...
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    download_annotation(genomebuild, gene_file)

    # ... then the genome FASTA file
    download_genome(genomebuild, genome_dir)

    sys.stderr.write("Creating index\n")
    GenomeIndex().create_index(genome_dir, index_dir)
    create_bedtools_fa(index_dir, genome_dir)
def pfmfile_location(infile):
    """Resolve a motif file argument to an existing file.

    Parameters
    ----------
    infile : str, None or other object
        ``None`` selects the ``motif_db`` entry of the configuration. A
        string that is not an existing path is looked up in the motif
        directory, as-is and with ``.pfm`` / ``.pwm`` appended. Anything that
        is not a string is returned unchanged.

    Returns
    -------
    The resolved file name, or ``infile`` itself if it is not a string.

    Raises
    ------
    ValueError
        If no file is given and no default is configured, or if the file
        cannot be found.
    """
    config = MotifConfig()

    if infile is None:
        infile = config.get_default_params().get("motif_db", None)
        if infile is None:
            raise ValueError(
                "No motif file was given and no default "
                "database specified in the config file."
            )

    # Nothing to resolve for non-strings and for paths that already exist.
    if not isinstance(infile, six.string_types) or os.path.exists(infile):
        return infile

    base = os.path.join(config.get_motif_dir(), infile)
    for candidate in (base, base + ".pfm", base + ".pwm"):
        if os.path.exists(candidate):
            infile = candidate
            break

    if not os.path.exists(infile):
        raise ValueError("Motif file {} not found".format(infile))

    return infile
def scan(infile, motifs, cutoff, nreport=1, it=False):
    """Scan a FASTA file with motifs using the module-level worker pool.

    Parameters
    ----------
    infile : str
        FASTA file to scan.
    motifs : list
        Motif instances; ``motif.id`` is used as lookup key.
    cutoff : object
        Passed to ``parse_cutoff`` together with the configured default.
    nreport : int, optional
        Forwarded to ``scan_fa_with_motif``.
    it : bool, optional
        Not used by this function.

    Returns
    -------
    dict
        Maps motifs to the merged result dictionaries of all their chunks.
    """
    config = MotifConfig()
    # Fallback cutoff, only relevant if no cutoff is supplied
    fallback_cutoff = config.get_default_params()['scan_cutoff']
    # Configured number of CPUs (read, but not used below)
    ncpus = config.get_default_params()['ncpus']

    cutoffs = parse_cutoff(motifs, cutoff, fallback_cutoff)

    fa = Fasta(infile)
    total_result = {}
    pending = []
    for motif in motifs:
        for start in range(0, len(fa), CHUNK):
            total_result[motif] = {}
            task_args = (fa[start:start + CHUNK], motif, cutoffs[motif.id], nreport, )
            pending.append(pool.apply_async(scan_fa_with_motif, task_args))

    # Map the motif returned by a worker back to the instance passed in.
    by_id = {m.id: m for m in motifs}
    for task in pending:
        done_motif, hits = task.get()
        total_result[by_id[done_motif.id]].update(hits)

    return total_result
def scan_it(infile, motifs, cutoff, nreport=1, rc=True):
    """Scan a FASTA file with motifs and yield the results as they finish.

    This is a generator. Jobs are submitted to the module-level ``pool`` per
    motif and per chunk of sequences; results are yielded in submission
    order.

    Parameters
    ----------
    infile : str
        FASTA file to scan.
    motifs : list
        Motif instances; ``motif.id`` is used as lookup key.
    cutoff : object
        Passed to ``parse_cutoff`` together with the configured default.
    nreport : int, optional
        Forwarded to ``scan_fa_with_motif``.
    rc : bool, optional
        Forwarded to ``scan_fa_with_motif``.

    Yields
    ------
    tuple
        ``(motif, result)``, where ``motif`` is the instance from ``motifs``
        with the same id as the motif returned by the worker.
    """
    # Get configuration defaults
    config = MotifConfig()
    # Cutoff for motif scanning, only used if a cutoff is not supplied
    default_cutoff = config.get_default_params()['scan_cutoff']

    cutoffs = parse_cutoff(motifs, cutoff, default_cutoff)

    jobs = []
    fa = Fasta(infile)
    motifkey = dict([(m.id, m) for m in motifs])

    for motif in motifs:
        for i in range(0, len(fa), CHUNK):
            jobs.append(pool.apply_async(
                scan_fa_with_motif,
                (fa[i:i + CHUNK], motif, cutoffs[motif.id], nreport, rc, )))

            # Keep the backlog small. The finished job's motif gets its own
            # name: rebinding ``motif`` here would make the following chunks
            # be submitted with the wrong motif.
            while len(jobs) > 10:
                done_motif, result = jobs.pop(0).get()
                yield motifkey[done_motif.id], result

    for job in jobs:
        done_motif, result = job.get()
        yield motifkey[done_motif.id], result
def scan_it_moods(infile, motifs, cutoff, bgfile, nreport=1, scan_rc=True,
                  pvalue=None, count=False):
    """Scan the sequences of a FASTA file with MOODS, in parallel.

    This is a generator. The input is split into chunks of sequences, every
    chunk is submitted to the module-level ``pool``, and the items of each
    worker result are yielded in submission order.

    Parameters
    ----------
    infile : str
        FASTA file with the sequences to scan.
    motifs : list
        Motif instances; ``motif.id`` and ``motif.pwm`` are read.
    cutoff : float or str
        Converted with ``float()`` and passed to ``calc_threshold_moods``;
        only used when ``pvalue`` is None.
    bgfile : str
        FASTA file used to estimate the background distribution.
    nreport : int, optional
        Forwarded to the worker function.
    scan_rc : bool, optional
        Forwarded to the worker function.
    pvalue : float or str, optional
        If given, thresholds are derived from this p-value instead of
        ``cutoff``.
    count : bool, optional
        If True, ``scan_fa_with_motif_moods_count`` is used as worker instead
        of ``scan_fa_with_motif_moods``.
    """
    import shutil

    pseudocount = 1e-3
    bg = MOODS.tools.bg_from_sequence_dna("".join(Fasta(bgfile).seqs), 1)

    # The matrices are handed to MOODS via temporary PFM files. They are only
    # needed while parsing, so the directory is removed right afterwards.
    tmpdir = mkdtemp()
    matrices = []
    try:
        for motif in motifs:
            pfmname = os.path.join(tmpdir, "{}.pfm".format(motif.id))
            with open(pfmname, "w") as f:
                matrix = np.array(motif.pwm).transpose()
                for row in matrix:
                    f.write("{}\n".format(" ".join([str(x) for x in row])))
            matrices.append(MOODS.parsers.pfm_log_odds(pfmname, bg, pseudocount))
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    if pvalue is not None:
        thresholds = [MOODS.tools.threshold_from_p(m, bg, float(pvalue))
                      for m in matrices]
    else:
        thresholds = [calc_threshold_moods(m, float(cutoff)) for m in matrices]

    # NOTE(review): this scanner is not used further in this function; it is
    # kept because set_motifs() may validate the matrices/thresholds.
    scanner = MOODS.scan.Scanner(7)
    scanner.set_motifs(matrices, bg, thresholds)

    config = MotifConfig()
    ncpus = int(config.get_default_params()['ncpus'])
    fa = Fasta(infile)

    # Use smaller chunks for small inputs so all CPUs get work. The chunk size
    # must be a positive integer, otherwise range() below fails.
    chunk = 500
    if len(fa) < ncpus * chunk:
        chunk = max(1, len(fa) // (ncpus + 1))

    func = scan_fa_with_motif_moods
    if count:
        func = scan_fa_with_motif_moods_count

    jobs = []
    for i in range(0, len(fa), chunk):
        jobs.append(pool.apply_async(
            func,
            (fa[i:i + chunk], motifs, matrices, bg, thresholds, nreport, scan_rc, )))

    for job in jobs:
        for ret in job.get():
            yield ret
def default_motifs():
    """Return the motifs read from the default motif database.

    The motif directory and the ``motif_db`` entry are taken from the
    configuration.

    Raises
    ------
    ValueError
        If the motif directory or the database name is empty.
    """
    config = MotifConfig()
    motif_dir = config.get_motif_dir()
    db_name = config.get_default_params()['motif_db']

    if not (motif_dir and db_name):
        raise ValueError("default motif database not configured")

    with open(os.path.join(motif_dir, db_name)) as handle:
        return read_motifs(handle)
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None, ncpus=None):
    """Scan the regions of a table with motifs and return the results as a table.

    Parameters
    ----------
    input_table : str
        File with regions in the first column. A name ending in ``feather``
        is read with ``pandas.read_feather``; anything else is read as a
        tab-separated table in which lines starting with ``#`` are comments.
    genome : str
        Genome name, passed to the scanner.
    data_dir : str
        Not used by this function.
    scoring : str
        ``"count"`` to count motif occurrences (threshold set from ``FPR``);
        any other value yields the best score per motif.
    pwmfile : str, optional
        Motif file. Defaults to the ``motif_db`` entry of the configuration.
    ncpus : int, optional
        Passed to ``Scanner``.

    Returns
    -------
    pandas.DataFrame
        One row per region, one column per motif id.

    Raises
    ------
    ValueError
        If no ``pwmfile`` is given and no default database is configured.
    """
    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR, genome=genome)
        logger.info("creating count table")
        scores = list(s.count(regions))
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        scores = list(s.best_score(regions))
        logger.info("done")

    # Read the motif ids inside a context manager so the file is closed.
    with open(pwmfile) as f:
        motif_names = [m.id for m in read_motifs(f)]

    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
class MotifProgram:
    """Base class for wrappers around external motif prediction programs.

    Subclasses are expected to provide a ``name`` attribute and a
    ``_run_program(bin, fastafile, savedir, params)`` method; neither is
    defined here.
    """

    from gimmemotifs.config import MotifConfig
    # Configuration shared by all instances
    config = MotifConfig()

    def __init__(self):
        pass

    def bin(self):
        """Return the executable configured for this program."""
        return self.config.bin(self.name)

    def dir(self):
        """Return the directory configured for this program."""
        return self.config.dir(self.name)

    def is_configured(self):
        """Return whether the configuration knows this program."""
        return self.config.is_configured(self.name)

    def is_installed(self):
        """Return whether the program is configured and its binary is executable."""
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, savedir, params=None):
        """Run the program on ``fastafile``.

        Parameters
        ----------
        fastafile : str
            Input FASTA file.
        savedir : str
            Passed to ``_run_program``.
        params : dict, optional
            Program parameters; an empty dict is used if omitted.

        Returns
        -------
        Whatever ``_run_program`` returns, or ``([], "Killed", "Killed")``
        when interrupted with Ctrl-C.

        Raises
        ------
        ValueError
            If the program is not configured or not installed.
        """
        # A fresh dict per call instead of a shared mutable default argument.
        if params is None:
            params = {}

        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        try:
            return self._run_program(self.bin(), fastafile, savedir, params)
        except KeyboardInterrupt:
            return ([], "Killed", "Killed")
def __init__(self, scale=True, ncpus=None):
    """Predict motif activities using Support Vector Regression.

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled
        before classification.

    ncpus : int, optional
        Number of threads. If omitted, the ``ncpus`` value from the
        configuration is used (2 if that is not set).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        SVR weights.
    """
    if ncpus is None:
        # Fall back to the configured number of CPUs
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

    self.ncpus = ncpus
    self.scale = scale

    # Filled in after fitting
    self.act_ = None

    # Description of this predictor
    self.act_description = "activity values: SVR weights"
    self.ptype = "regression"
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
def get_all_scores(self, motifs, dbmotifs, match, metric, combine, pval=False, parallel=True, trim=None, ncpus=None):
    """Compare every motif in ``motifs`` with every motif in ``dbmotifs``.

    Parameters
    ----------
    motifs, dbmotifs : list
        Motif instances. If ``trim`` is set, all of them are trimmed in
        place first.
    match, metric, combine, pval
        Forwarded unchanged to ``_get_all_scores``.
    parallel : bool, optional
        If True, ``dbmotifs`` is split into batches that are scored in a
        process pool; otherwise everything is scored in one call.
    trim : optional
        Argument for ``Motif.trim``; no trimming if falsy.
    ncpus : int, optional
        Number of worker processes. Defaults to the configured ``ncpus``.

    Returns
    -------
    dict
        Nested dictionary ``scores[m1][m2]``, as produced by
        ``_get_all_scores``.
    """
    # Trim all motifs up front, if requested
    if trim:
        for group in (motifs, dbmotifs):
            for m in group:
                m.trim(trim)

    if not parallel:
        # Score everything in a single call
        return _get_all_scores(self, motifs, dbmotifs, match, metric, combine, pval)

    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params()["ncpus"])

    pool = Pool(processes=ncpus, maxtasksperchild=1000)

    # One large batch per processor keeps the parallel overhead low
    batch_len = max(1, len(dbmotifs) // ncpus)
    pending = [
        pool.apply_async(
            _get_all_scores,
            args=(self, motifs, dbmotifs[start:start + batch_len],
                  match, metric, combine, pval))
        for start in range(0, len(dbmotifs), batch_len)
    ]
    pool.close()

    # Merge the per-batch results into one nested dictionary
    scores = {}
    for task in pending:
        for m1, row in task.get().items():
            for m2, value in row.items():
                scores.setdefault(m1, {})[m2] = value

    pool.join()
    return scores
def create_roc_plots(pfmfile, fgfa, background, outdir, genome):
    """Make ROC plots for all motifs.

    Parameters
    ----------
    pfmfile : str
        Motif file, read with ``read_motifs(..., fmt="pwm", as_dict=True)``.
    fgfa : object
        Foreground sequences, passed on to ``get_roc_values``.
    background : dict
        Maps a background name to the corresponding file name.
    outdir : str
        Output directory; images are written to ``<outdir>/images``.
    genome : object
        Passed on to ``get_roc_values``.

    Notes
    -----
    Exits the process with status 1 if a worker reports an error.
    """
    motifs = read_motifs(pfmfile, fmt="pwm", as_dict=True)
    ncpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=ncpus)
    try:
        jobs = {}
        for bg, fname in background.items():
            for m in motifs.values():
                k = "{}_{}".format(str(m), bg)
                jobs[k] = pool.apply_async(
                    get_roc_values, (m, fgfa, fname, genome))
        pool.close()

        imgdir = os.path.join(outdir, "images")
        if not os.path.exists(imgdir):
            os.mkdir(imgdir)

        roc_img_file = os.path.join(outdir, "images", "{}_roc.{}.png")

        for motif in motifs.values():
            for bg in background:
                k = "{}_{}".format(str(motif), bg)
                error, x, y = jobs[k].get()
                if error:
                    logger.error("Error in thread: %s", error)
                    logger.error("Motif: %s", motif)
                    sys.exit(1)
                roc_plot(roc_img_file.format(motif.id, bg), x, y)
    finally:
        # Release the worker processes, also on the error exit above.
        pool.terminate()
        pool.join()
def __init__(self, name=None):
    """Set up output directory, logging and file names for a run.

    Parameters
    ----------
    name : str, optional
        Name of the run, also passed to ``_setup_output_dir``. If empty, a
        name is built from ``self.NAME`` and today's date.
    """
    self.config = MotifConfig()
    self.server = None

    # Default name: <NAME>_<day>_<month>_<year>
    self.name = name = name or "%s_%s" % (self.NAME, datetime.today().strftime("%d_%m_%Y"))

    # Directory for all the intermediate and output files
    self._setup_output_dir(name)

    # Logging has to be available before anything is reported
    self._setup_logging()
    self.logger.info("%s version %s", self.NAME, GM_VERSION)
    self.logger.info("output dir: %s", self.outdir)

    # Names of the intermediate and output files
    self._setup_filenames()
def __init__(self, scale=True, permute=False, ncpus=None):
    """Predict motif activities using lightning CDClassifier

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled
        before classification

    permute : boolean, optional, default False
        Stored on the instance; not used by the constructor itself.

    ncpus : int, optional
        Number of threads. If omitted, the ``ncpus`` value from the
        configuration is used (2 if that is not set).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted coefficients

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than
        the 1% from random permutation
    """
    self.act_description = ("activity values: coefficients from "
                            "fitted model")

    # Base classifier and the parameter grid that is searched
    self.cdc = CDClassifier()
    self.parameters = {
        "penalty": ["l1/l2"],
        "loss": ["squared_hinge"],
        "multiclass": [True],
        "max_iter": [20],
        "alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)],
        "C": [0.001, 0.01, 0.1, 0.5, 1.0],
        "tol": [1e-3]
    }
    self.kfolds = 10

    if ncpus is None:
        # Fall back to the configured number of CPUs
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

    self.clf = GridSearchCV(
        self.cdc,
        self.parameters,
        cv=self.kfolds,
        n_jobs=ncpus,
    )

    self.scale = scale
    self.permute = permute

    # Filled in after fitting
    self.act_ = None
    self.sig_ = None

    # Description of this predictor
    self.ptype = "classification"
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    The regions are written with equal width, split into a prediction and a
    validation set, and both sets are converted to FASTA. In addition a
    FASTA file with extended validation regions is written for the location
    plots.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. The keys ``width``, ``abs_max``,
        ``fraction``, ``genome``, ``lwidth`` and ``use_strand`` are read.

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    def _out(fname):
        # Path of an output file inside outdir
        return os.path.join(outdir, fname)

    # All regions get the same size
    width = int(params["width"])
    bedfile = _out("input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = _out("prediction.bed")
    val_bedfile = _out("validation.bed")

    # Prediction / validation split
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile, pred_bedfile, val_bedfile)
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    config = MotifConfig()  # instantiated as before; not used below

    # FASTA files for both sets
    genome = Genome(params["genome"])
    for bed in (pred_bedfile, val_bedfile):
        genome.track2fasta(bed, bed.replace(".bed", ".fa"))

    # Extended validation regions for the location plots
    lwidth = int(params["lwidth"])
    flank = (lwidth - width) // 2
    genome.track2fasta(
        val_bedfile,
        _out("localization.fa"),
        extend_up=flank,
        extend_down=flank,
        stranded=params["use_strand"],
    )
def __init__(self, ncpus=None):
    """Initialise the scanner.

    Parameters
    ----------
    ncpus : int, optional
        Number of processes. Defaults to the ``ncpus`` value from the
        configuration. A process pool is only created if this is larger
        than 1.
    """
    self.config = MotifConfig()
    self.genome = None
    self.threshold = None

    self.ncpus = (
        int(MotifConfig().get_default_params()["ncpus"])
        if ncpus is None
        else ncpus
    )

    if self.ncpus > 1:
        try:
            # Prefer a pool from a 'spawn' context ...
            spawn_ctx = mp.get_context('spawn')
            self.pool = spawn_ctx.Pool(processes=self.ncpus)
        except AttributeError:
            # ... and fall back to the default pool otherwise
            self.pool = mp.Pool(processes=self.ncpus)

    # The cache is only initialised when enabled in the configuration
    self.use_cache = False
    cache_enabled = self.config.get_default_params().get("use_cache", False)
    if cache_enabled:
        self._init_cache()
def __init__(self, matchfile, genome="hg19", number=None):
    """Create a Fasta object with random sequences produced by ``matched_gc_bedfile``.

    Parameters
    ----------
    matchfile : str
        Passed to ``matched_gc_bedfile``.
    genome : str, optional
        Genome name; the index is looked up below the configured index
        directory.
    number : int, optional
        Passed to ``matched_gc_bedfile``.
    """
    config = MotifConfig()
    index = os.path.join(config.get_index_dir(), genome)

    # Names for the temporary files.
    # NOTE(review): only the names are kept; the NamedTemporaryFile objects
    # themselves are dropped immediately.
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also if one of the steps failed
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def _write_report(outdir, ids, tree, clusters):
    """Write the cluster report, the cluster key and the clustered motifs.

    Three files are written to ``outdir``: ``cluster_report.html``,
    ``cluster_key.txt`` and ``clustered_motifs.pwm``.

    Parameters
    ----------
    outdir : str
        Output directory.
    ids : list
        One entry per cluster. Element 0 is written as cluster id and
        element 2 is a list of dicts with an ``"alt"`` key; the whole list is
        passed to the HTML template as ``motifs``.
    tree : object
        Provides ``get_clustered_motifs()``.
    clusters : list
        ``clusters[i][0]`` is a motif and ``clusters[i][1]`` its members.
    """
    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    with open(os.path.join(outdir, "cluster_report.html"), "w") as f:
        f.write(result)

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for motif_id in ids:
            f.write("%s\t%s\n" % (motif_id[0], ",".join([x["alt"] for x in motif_id[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        # A single cluster with a single member is written directly; in all
        # other cases the motifs are taken from the tree.
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None):
    """Predict motif activities using Lasso MultiTask regression

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled
        before classification

    kfolds : integer, optional, default 4
        number of kfolds for parameter search

    alpha_stepsize : float, optional, default 1.0
        stepsize for use in alpha gridsearch

    ncpus : int, optional
        Number of threads. If omitted, the ``ncpus`` value from the
        configuration is used (2 if that is not set).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted motif activities

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than
        the 1% from random permutation
    """
    self.kfolds = kfolds
    self.act_description = "activity values: coefficients from " "fitted model"
    self.scale = scale

    if ncpus is None:
        # Fall back to the configured number of CPUs
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
    self.ncpus = ncpus

    # Filled in after fitting
    self.act_ = None
    self.sig_ = None

    # Grid search over alpha = exp(-x), x in [0, 10) with the given stepsize
    estimator = MultiTaskLasso()
    alpha_grid = {
        "alpha": [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]
    }
    self.clf = GridSearchCV(
        estimator,
        alpha_grid,
        cv=kfolds,
        n_jobs=self.ncpus,
        scoring="r2",
    )

    # Description of this predictor
    self.ptype = "regression"
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
def scan_it(infile, motifs, cutoff, nreport=1, rc=True):
    """Scan a FASTA file with motifs using ``pp`` and yield results as they finish.

    This is a generator. Jobs are submitted per motif and per chunk of
    sequences; results are yielded in submission order.

    Parameters
    ----------
    infile : str
        FASTA file to scan.
    motifs : list
        Motif instances; ``motif.id`` is used as lookup key.
    cutoff : object
        Passed to ``parse_cutoff`` together with the configured default.
    nreport : int, optional
        Forwarded to ``scan_fa_with_motif``.
    rc : bool, optional
        Forwarded to ``scan_fa_with_motif``.

    Yields
    ------
    tuple
        ``(motif, result)``, where ``motif`` is the instance from ``motifs``
        with the same id as the motif returned by the job.
    """
    # Get configuration defaults
    config = MotifConfig()
    # Cutoff for motif scanning, only used if a cutoff is not supplied
    default_cutoff = config.get_default_params()['scan_cutoff']
    # Number of CPUs to use. Converted to int, as elsewhere in this module,
    # so the comparison with the server's CPU count is numeric.
    ncpus = int(config.get_default_params()['ncpus'])

    cutoffs = parse_cutoff(motifs, cutoff, default_cutoff)

    job_server = pp.Server(secret="beetrootsoup")
    # NOTE(review): this statement only reads the attribute and has no
    # effect; an assignment may have been intended - confirm.
    pp.SHOW_EXPECTED_EXCEPTIONS  # True
    if job_server.get_ncpus() > ncpus:
        job_server.set_ncpus(ncpus)

    jobs = []
    fa = Fasta(infile)
    motifkey = dict([(m.id, m) for m in motifs])

    for motif in motifs:
        for i in range(0, len(fa), CHUNK):
            jobs.append(job_server.submit(
                scan_fa_with_motif,
                (fa[i:i + CHUNK], motif, cutoffs[motif.id], nreport, rc, ),
                (), ()))

            # Keep the backlog small. The finished job's motif gets its own
            # name: rebinding ``motif`` here would make the following chunks
            # be submitted with the wrong motif.
            while len(jobs) > 10:
                job = jobs.pop(0)
                done_motif, result = job()
                yield motifkey[done_motif.id], result

    for job in jobs:
        done_motif, result = job()
        yield motifkey[done_motif.id], result
def __init__(self, ncpus=None):
    """Initialise the scanner.

    Parameters
    ----------
    ncpus : int, optional
        Number of processes. Defaults to the ``ncpus`` value from the
        configuration. A process pool is only created if this is larger
        than 1.
    """
    self.config = MotifConfig()

    # Scan settings, to be set later
    self.genome = None
    self.threshold = None
    self.background = None
    self.meanstd = {}
    self.gc_bins = [(0, 1)]

    self.ncpus = (
        int(MotifConfig().get_default_params()["ncpus"])
        if ncpus is None
        else ncpus
    )

    if self.ncpus > 1:
        # Default multiprocessing pool (a 'spawn' context is not used here)
        self.pool = mp.Pool(processes=self.ncpus)

    # The cache is only initialised when enabled in the configuration
    self.use_cache = False
    cache_enabled = self.config.get_default_params().get("use_cache", False)
    if cache_enabled:
        self._init_cache()
def pwmfile_location(infile):
    """Resolve a motif file argument to an existing file.

    Parameters
    ----------
    infile : str, None or other object
        ``None`` selects the ``motif_db`` entry of the configuration. A
        string that is not an existing path is looked up in the motif
        directory, as-is and with ``.pfm`` / ``.pwm`` appended. Anything that
        is not a string is returned unchanged.

    Returns
    -------
    The resolved file name, or ``infile`` itself if it is not a string.

    Raises
    ------
    ValueError
        If no file is given and no default is configured, or if the file
        cannot be found.
    """
    config = MotifConfig()

    if infile is None:
        infile = config.get_default_params().get("motif_db", None)
        if infile is None:
            raise ValueError("No motif file was given and no default "
                             "database specified in the config file.")

    # Nothing to resolve for non-strings and for paths that already exist.
    if not isinstance(infile, six.string_types) or os.path.exists(infile):
        return infile

    base = os.path.join(config.get_motif_dir(), infile)
    for candidate in (base, base + '.pfm', base + '.pwm'):
        if os.path.exists(candidate):
            infile = candidate
            break

    if not os.path.exists(infile):
        raise ValueError("Motif file {} not found".format(infile))

    return infile
def maelstrom_html_report(outdir, infile, pwmfile=None, threshold=2):
    """Write an HTML report with a sortable, coloured table of motif values.

    The report is written to ``<outdir>/gimme.maelstrom.report.html`` and
    the motif logos to ``<outdir>/logos``.

    Parameters
    ----------
    outdir : str
        Output directory.
    infile : str
        Tab-separated table with motifs as index.
    pwmfile : str, optional
        Motif file. If not given, ``default_motifs()`` is used.
    threshold : float, optional
        Only rows with at least one absolute value of at least
        ``threshold`` are reported.
    """
    df = pd.read_table(infile, index_col=0)
    df = df[np.any(abs(df) >= threshold, 1)]

    # Symmetric colour range around zero
    vmax = max(abs(df.min().min()), df.max().max())
    vmin = -vmax

    if pwmfile:
        with open(pwmfile) as f:
            motifs = read_motifs(f)
    else:
        motifs = default_motifs()

    # Drop the index name. Assigning None works regardless of whether
    # ``Index.name`` is a plain attribute or a property.
    df.index.name = None

    cols = df.columns

    # Factor names per motif, shortened to 30 characters with the full text
    # as tooltip
    m2f = dict([(motif.id, ",".join(motif.factors)) for motif in motifs])
    df["factors"] = [m2f.get(motif_id, "") for motif_id in df.index]
    is_long = df["factors"].str.len() > 30
    df["factors"] = '<div title="' + df["factors"] + '">' + df["factors"].str.slice(0,30)
    df.loc[is_long, "factors"] += '(...)'
    df['factors'] += '</div>'

    df["logo"] = ['<img src="logos/{}.png" height=40/>'.format(x) for x in list(df.index)]

    if not os.path.exists(outdir + "/logos"):
        os.makedirs(outdir + "/logos")
    for motif in motifs:
        if motif.id in df.index:
            motif.to_img(outdir + "/logos/{}.png".format(motif.id), fmt="PNG")

    # JavaScript and style sheet are embedded in the report
    template_dir = MotifConfig().get_template_dir()
    with open(os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8") as f:
        js = f.read()
    with open(os.path.join(template_dir, "sortable/sortable-theme-slick.css"), encoding="utf-8") as f:
        css = f.read()

    df = df[["factors", "logo"] + list(cols)]
    with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        f.write(df.style.apply(background_gradient, low=0.7, high=0.7, m=vmin, M=vmax, subset=cols).set_precision(3).set_table_attributes("data-sortable").render().replace("data-sortable", 'class="sortable-theme-slick" data-sortable'))
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
def __init__(self, scale=True, cv=3, ncpus=None):
    """Predict motif activities using lightning CDRegressor

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled
        before classification

    cv : int, optional, default 3
        Cross-validation k-fold parameter.

    ncpus : int, optional
        Number of threads. If omitted, the ``ncpus`` value from the
        configuration is used (2 if that is not set).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted coefficients

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than
        the 1% from random permutation
    """
    if ncpus is None:
        # Fall back to the configured number of CPUs
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

    self.ncpus = ncpus
    self.kfolds = cv
    self.scale = scale

    # Filled in after fitting
    self.act_ = None

    # Description of this predictor
    self.act_description = ("activity values: coefficients from "
                            "fitted model")
    self.ptype = "regression"
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
def __init__(self, ncpus=None):
    """Predict motif activities using a random forest classifier

    Parameters
    ----------
    ncpus : int, optional
        Number of threads. If omitted, the ``ncpus`` value from the
        configuration is used (2 if that is not set).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        feature importances from the model
    """
    if ncpus is None:
        # Fall back to the configured number of CPUs
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
    self.ncpus = ncpus

    # Filled in after fitting
    self.act_ = None

    # Description of this predictor
    self.act_description = ("activity values: feature importances "
                            "from fitted Random Forest model")
    self.ptype = "classification"
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
class MotifProgram(object):
    """Base class for wrappers around external motif prediction programs.

    Subclasses are expected to provide a ``name`` attribute and a
    ``_run_program(bin, fastafile, savedir, params)`` method; neither is
    defined here.
    """

    # Configuration shared by all instances
    config = MotifConfig()
    # Set to a path to use that binary instead of the configured one
    local_bin = None

    def __init__(self):
        pass

    def bin(self):
        """Return ``local_bin`` if set, otherwise the configured executable."""
        return self.local_bin or self.config.bin(self.name)

    def dir(self):
        """Return the directory configured for this program."""
        return self.config.dir(self.name)

    def is_configured(self):
        """Return whether the configuration knows this program."""
        return self.config.is_configured(self.name)

    def is_installed(self):
        """Return whether the program is configured and its binary is executable."""
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, savedir, params=None, tmp=None):
        """Run the program on ``fastafile``.

        A temporary directory, prefixed with the program name, is created
        below ``tmp`` and stored as ``self.tmpdir`` before the program is
        started.

        Returns whatever ``_run_program`` returns, or
        ``([], "Killed", "Killed")`` when interrupted with Ctrl-C.

        Raises
        ------
        ValueError
            If the program is not configured or not installed.
        """
        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        tmp_prefix = "{0}.".format(self.name)
        self.tmpdir = mkdtemp(prefix=tmp_prefix, dir=tmp)

        try:
            outcome = self._run_program(self.bin(), fastafile, savedir, params)
        except KeyboardInterrupt:
            outcome = ([], "Killed", "Killed")
        return outcome
def location(args):
    """
    Creates histogram of motif location.

    For every selected motif a ``motif_localization`` job is run in a
    process pool; the output name is ``<motif id>_histogram``.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``fastafile``, ``pwmfile``,
        ``width``, ``ids`` and ``cutoff`` are read.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # Use the length of the first sequence. list() makes this work
        # whether items() returns a list or an iterator.
        f = Fasta(fastafile)
        lwidth = len(list(f.items())[0][1])
        f = None

    motifs = pwmfile_to_motifs(pwmfile)

    # By default all motifs are used; --ids restricts the selection
    if args.ids:
        ids = set(args.ids.split(","))
    else:
        ids = set(motif.id for motif in motifs)

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = []
        for motif in motifs:
            if motif.id in ids:
                outfile = os.path.join("%s_histogram" % motif.id)
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lwidth, outfile, args.cutoff)
                    ))
        pool.close()

        for job in jobs:
            job.get()
    finally:
        # Release the worker processes, also if a job raised.
        pool.terminate()
        pool.join()
def visualize_maelstrom(outdir, sig_cutoff=3, pwmfile=None):
    """Create heatmaps of the maelstrom results in ``outdir``.

    ``final.out.csv`` is read from ``outdir`` and ``motif.relevance.png`` is
    written. If ``motif.freq.txt`` exists as well, ``motif.frequency.png``
    and ``motif.enrichment.png`` are also created.

    Parameters
    ----------
    outdir : str
        Directory with the maelstrom output; the images are saved here too.
    sig_cutoff : float, optional
        Only motifs with at least one value of at least ``sig_cutoff`` are
        shown.
    pwmfile : str, optional
        Motif file. Defaults to the ``motif_db`` entry of the configuration.

    Raises
    ------
    ValueError
        If no ``pwmfile`` is given and no default database is configured.
    """
    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is None:
            raise ValueError("no pwmfile given and no default database specified")
        pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    # Motif to factor mapping; falls back to the motif ids themselves
    mapfile = pwmfile.replace(".pwm", ".motif2factors.txt")
    if os.path.exists(mapfile):
        m2f = pd.read_csv(mapfile, sep="\t", names=["motif","factors"], index_col=0)
        m2f["factors"] = m2f["factors"].str[:50]
    else:
        with open(pwmfile) as handle:
            motifs = [m.id for m in read_motifs(handle)]
        m2f = pd.DataFrame({"factors": motifs}, index=motifs)

    sig_fname = os.path.join(outdir, "final.out.csv")
    df_sig = pd.read_table(sig_fname, index_col=0)
    f = np.any(df_sig >= sig_cutoff, 1)
    vis = df_sig[f]
    if vis.shape[0] == 0:
        sys.stderr.write("No motifs reach the threshold, skipping visualization.\n")
        return

    # cluster rows
    row_linkage = hierarchy.linkage(
        distance.pdist(vis, metric="euclidean"),
        method='complete')
    idx = hierarchy.leaves_list(row_linkage)

    vis = safe_join(vis, m2f).set_index("factors")

    # size of figure; has to be known before the first figure is created
    size = [2 + vis.shape[1] * 0.4, 1.8 + vis.shape[0] * 0.3]

    plt.figure(figsize=size)
    cg = sns.heatmap(vis.iloc[idx], cmap="viridis",
                     yticklabels=True,
                     cbar_kws={"orientation":"horizontal"})
    _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0)
    plt.title("Motif Relevance")
    plt.tight_layout()
    plt.savefig(os.path.join(outdir, "motif.relevance.png"), dpi=300)

    freq_fname = os.path.join(outdir, "motif.freq.txt")
    if os.path.exists(freq_fname):
        df_freq = pd.read_table(freq_fname, index_col=0)
        df_freq = df_freq.T
        vis_freq = df_freq.loc[vis.iloc[idx].index]
        vis_freq = safe_join(vis_freq, m2f).set_index("factors")

        plt.figure(figsize=size)
        cg = sns.heatmap(vis_freq, cmap="viridis",
                         yticklabels=True, vmin=0, vmax=0.2,
                         cbar_kws={"orientation":"horizontal"})
        _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0)
        plt.title("Motif Frequency")
        plt.tight_layout()
        plt.savefig(os.path.join(outdir, "motif.frequency.png"), dpi=300)

        plt.figure(figsize=size)

        # Enrichment relative to the lowest frequency per motif; the minimum
        # is floored at 0.01.
        row_min = vis_freq.min(1)
        row_min[row_min < 0.01] = 0.01

        cg = sns.heatmap(np.log2(vis_freq.apply(lambda x: x / row_min, 0)),
                         yticklabels=True, vmin=-5, vmax=5,
                         cbar_kws={"orientation":"horizontal"})
        _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0)
        plt.title("Motif Enrichment")
        plt.tight_layout()
        plt.savefig(os.path.join(outdir, "motif.enrichment.png"), dpi=300)
class GimmeMotifs(object):
    """Driver object for one de novo motif analysis run.

    Creating an instance sets up the output directory, the logger and the
    names of all intermediate and output files.  `run_full_analysis` then
    performs the complete pipeline: input preparation, background
    generation, motif prediction, clustering, scoring and report creation.
    """

    NAME = "gimme_motifs"
    SCAN_THRESHOLD = "0.9"

    def __init__(self, name=None):
        """Set up output directory, logging and file names.

        name -- run name, also used as output directory; defaults to
                NAME followed by today's date (dd_mm_YYYY).
        """
        self.config = MotifConfig()
        self.server = None

        if not name:
            name = "%s_%s" % (self.NAME,
                              datetime.today().strftime("%d_%m_%Y"))
        self.name = name

        # create a directory for all the intermediate and output files
        self._setup_output_dir(name)

        # setup logging
        self._setup_logging()
        self.logger.info("%s version %s", self.NAME, GM_VERSION)
        self.logger.info("output dir: %s", self.outdir)

        # setup the names of the intermediate and output files
        self._setup_filenames()

    @staticmethod
    def _load_motifs(fname, **kwargs):
        """Read all motifs from `fname` and close the file afterwards.

        Keyword arguments are passed on to read_motifs().  The result is
        materialised as a list before the handle is closed.
        """
        with open(fname) as handle:
            return list(read_motifs(handle, **kwargs))

    def job_server(self):
        """Return a usable job server, (re)creating it when necessary."""
        try:
            # Probe the current server; any failure (including the initial
            # self.server being None) means a new one has to be obtained.
            self.server.submit(job_server_ok)
        except Exception:
            self.server = self._get_job_server()
        return self.server

    def _setup_output_dir(self, name):
        """Create the output directory tree and copy the star image.

        Sets self.outdir, self.tmpdir and self.imgdir.  An already
        existing output directory only triggers a message on stderr; the
        run deliberately continues.
        """
        if os.path.exists(name):
            sys.stderr.write(
                "Output directory {} already exists!\n".format(name))
            sys.stderr.write(
                "Resuming a previous run is not yet implemented. Please specify a different name,\n"
            )
            sys.stderr.write(
                "or delete this directory if you really want to overwrite it\n"
            )
        else:
            try:
                os.makedirs(name)
            except OSError:
                sys.stderr.write(
                    "Can't create output directory {}!\n".format(name))

        self.outdir = name
        self.tmpdir = os.path.join(self.outdir, "intermediate_results")
        self.imgdir = os.path.join(self.outdir, "images")
        for subdir in (self.tmpdir, self.imgdir):
            # Handle each directory on its own: an already existing tmpdir
            # must not prevent imgdir from being created.
            try:
                os.mkdir(subdir)
            except OSError:
                pass

        star_img = os.path.join(self.config.get_template_dir(), "star.png")
        shutil.copyfile(star_img, os.path.join(self.imgdir, "star.png"))

    def _setup_logging(self):
        """Attach a DEBUG file handler and an INFO stdout handler."""
        self.logger = logging.getLogger('motif_analysis')
        self.logger.setLevel(logging.DEBUG)
        self.logger.propagate = 0

        # nice format
        file_formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        screen_formatter = logging.Formatter(
            "%(asctime)s - %(levelname)s - %(message)s")

        # Log to file
        logfile = os.path.join(self.name, "%s.log" % self.NAME)
        fh = logging.FileHandler(logfile, "w")
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(file_formatter)
        self.logger.addHandler(fh)

        # Log to screen
        sh = logging.StreamHandler(sys.stdout)
        sh.setLevel(logging.INFO)
        sh.setFormatter(screen_formatter)
        self.logger.addHandler(sh)

        self.logger.debug("Logging started")
        self.logger.info("log: %s", logfile)

    def _setup_filenames(self):
        """Derive the names of all intermediate and output files."""
        basename = os.path.split(self.name)[-1]
        self.basename = basename
        self.logger.debug("basename: %s", basename)

        def in_tmp(suffix):
            return os.path.join(self.tmpdir, basename + suffix)

        def in_out(suffix):
            return os.path.join(self.outdir, basename + suffix)

        # Intermediate files
        self.input_bed = in_tmp("_peakinputfile.bed")
        self.prediction_bed = in_tmp("_prediction.bed")
        self.prediction_fa = in_tmp("_prediction.fa")
        self.prediction_bg = in_tmp("_prediction_background.fa")
        self.validation_bed = in_tmp("_validation.bed")
        self.validation_fa = in_tmp("_validation.fa")
        self.validation_gff = in_tmp("_validation.gff")
        self.predicted_pfm = in_tmp("_all_motifs.pfm")
        self.significant_pfm = in_tmp("_significant_motifs.pfm")
        self.location_fa = in_tmp("_validation_500.fa")
        self.location_pfile = in_tmp("_localization_pvalue.txt")
        self.stats_file = in_tmp("_stats.txt")
        self.ranks_file = in_tmp("_ranks.txt")
        self.validation_cluster_gff = in_tmp("_validation_clustered.gff")
        self.cluster_pwm = in_tmp("_clustered_motifs.pwm")

        # Final output files
        self.final_pwm = in_out("_motifs.pwm")
        self.cluster_report = in_out("_cluster_report.html")
        self.motif_report = in_out("_motif_report.html")
        self.text_report = in_out("_motif_report.tsv")
        self.params_file = in_out("_params.txt")

        # Data structures to hold the background file locations:
        # self.bg_file[file type][background type] -> file name
        ftypes = {
            "bed": ".bed",
            "fa": ".fa",
            "gff": ".gff",
            "enrichment": "_enrichment.txt",
            "roc": "_significant_motifs_roc_metrics.txt",
            "cluster_gff": "_clustered.gff",
            "cluster_enrichment": "_enrichment_clustered.txt",
            "cluster_roc": "_roc_metrics_clustered.txt"
        }
        self.bg_file = {ftype: {} for ftype in ftypes}
        for bg in (FA_VALID_BGS + BED_VALID_BGS):
            for ftype, extension in ftypes.items():
                self.bg_file[ftype][bg] = os.path.join(
                    self.tmpdir, "%s_bg_%s%s" % (basename, bg, extension))

    def _is_parallel_enabled(self):
        """Parallel execution is always enabled."""
        return True

    def _get_job_server(self):
        """Return the module-level worker pool."""
        return pool

    def _check_input(self, fname):
        """ Check if the inputfile is a valid bed-file

        Logs an error and exits (SystemExit) when the file does not
        exist, a data line has fewer than three tab-separated columns, or
        columns 2 and 3 are not integers.  Lines starting with "#",
        "track" or "browser" are skipped.  A fourth column, if present,
        is not validated.
        """
        if not os.path.exists(fname):
            self.logger.error("Inputfile %s does not exist!", fname)
            sys.exit(1)

        with open(fname) as handle:
            for i, line in enumerate(handle):
                if line.startswith(("#", "track", "browser")):
                    # comment or BED specific stuff
                    continue

                vals = line.strip().split("\t")
                if len(vals) < 3:
                    self.logger.error(
                        "Expecting tab-seperated values (chromosome<tab>start<tab>end) on line %s of file %s",
                        i + 1, fname)
                    sys.exit(1)
                try:
                    int(vals[1])
                    int(vals[2])
                except ValueError:
                    self.logger.error(
                        "No valid integer coordinates on line %s of file %s",
                        i + 1, fname)
                    sys.exit(1)

    def prepare_input_bed(self, inputfile, organism="hg18", width=200,
                          fraction=0.2, abs_max=1000, use_strand=False):
        """ Create all the bed- and fasta-files necessary for motif prediction and validation

        The regions of `inputfile` are set to `width`, split into a
        prediction and a validation set (using `fraction` and `abs_max`)
        and the corresponding sequences of `organism` are written as
        FASTA.  Sets self.prediction_num and self.validation_num.
        """
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)
        use_strand = bool(use_strand)

        self.logger.info("preparing input (BED)")

        # Set all peaks to specific width
        self.logger.debug("Creating inputfile %s, width %s", self.input_bed,
                          width)
        write_equalwidth_bedfile(inputfile, width, self.input_bed)

        # Split input_bed in prediction and validation set
        self.logger.debug(
            "Splitting %s into prediction set (%s) and validation set (%s)",
            self.input_bed, self.prediction_bed, self.validation_bed)
        self.prediction_num, self.validation_num = divide_file(
            self.input_bed, self.prediction_bed, self.validation_bed,
            fraction, abs_max)

        # Make fasta files
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        for bedfile, fafile in ((self.prediction_bed, self.prediction_fa),
                                (self.validation_bed, self.validation_fa)):
            self.logger.debug("Creating %s", fafile)
            genome_index.track2fasta(index_dir, bedfile, fafile,
                                     use_strand=use_strand,
                                     ignore_missing=True)

    def prepare_input_fa(self, inputfile, width=200, fraction=0.2,
                         abs_max=1000):
        """ Create all the bed- and fasta-files necessary for motif prediction and validation

        Splits the FASTA `inputfile` into a prediction and a validation
        set.  Sets self.prediction_num and self.validation_num.  `width`
        is accepted for symmetry with prepare_input_bed but not used.
        """
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)

        self.logger.info("preparing input (FASTA)")

        # Split inputfile in prediction and validation set
        self.logger.debug(
            "Splitting %s into prediction set (%s) and validation set (%s)",
            self.inputfile, self.prediction_fa, self.validation_fa)
        self.prediction_num, self.validation_num = divide_fa_file(
            self.inputfile, self.prediction_fa, self.validation_fa,
            fraction, abs_max)

    def _create_background(self, bg_type, bedfile, fafile, outfile,
                           organism="hg18", width=200, nr_times=10):
        """Write a background FASTA file of type `bg_type` to `outfile`.

        Supported types are "random", "genomic", "gc", "promoter" and
        "user".  Generated backgrounds contain `nr_times` times as many
        sequences as `fafile`.  Returns the number of background
        sequences, or None for an unknown `bg_type`.  Exits when a
        user-specified background file does not exist.
        """
        fg = Fasta(fafile)
        nr_seqs = nr_times * len(fg)

        if bg_type == "random":
            if int(self.markov_model) >= 6:
                self.logger.warning(
                    "Are you sure about the Markov model? It seems too high!")
            else:
                order = {
                    "1": "1st",
                    "2": "2nd",
                    "3": "3rd",
                    "4": "4th",
                    "5": "5th"
                }[str(self.markov_model)]
                self.logger.debug(
                    "Creating random background (%s order Markov)" % order)

            m = MarkovFasta(fg, k=int(self.markov_model), n=nr_seqs)
            m.writefasta(outfile)
            self.logger.debug("Random background: %s", outfile)
            # return the number of random sequences created
            return len(m)

        if bg_type == "genomic":
            self.logger.debug("Creating genomic background")
            index_dir = os.path.join(self.config.get_index_dir(), organism)
            f = RandomGenomicFasta(index_dir, width, nr_seqs)
            f.writefasta(outfile)
            return len(f)

        if bg_type == "gc":
            self.logger.debug("Creating GC matched background")
            f = MatchedGcFasta(fafile, organism, nr_seqs)
            f.writefasta(outfile)
            self.logger.debug("GC matched background: %s", outfile)
            return len(f)

        if bg_type == "promoter":
            gene_file = os.path.join(self.config.get_gene_dir(),
                                     "%s.bed" % organism)
            index_dir = os.path.join(self.config.get_index_dir(), organism)
            self.logger.info(
                "Creating random promoter background (%s, using genes in %s)",
                organism, gene_file)
            f = PromoterFasta(gene_file, index_dir, width, nr_seqs)
            f.writefasta(outfile)
            self.logger.debug("Random promoter background: %s", outfile)
            return len(f)

        if bg_type == "user":
            bg_file = self.params["user_background"]
            if not os.path.exists(bg_file):
                self.logger.error(
                    "User-specified background file %s does not exist!",
                    bg_file)
                sys.exit(1)

            self.logger.info(
                "Copying user-specified background file %s to %s.",
                bg_file, outfile)
            fa = Fasta(bg_file)
            l = median([len(seq) for seq in fa.seqs])
            # Warn when the median length deviates more than 5% from width
            if l < width * 0.95 or l > width * 1.05:
                self.logger.warning(
                    "The user-specified background file %s contains sequences with a "
                    "median length of %s, while GimmeMotifs predicts motifs in sequences of length %s. This will influence the statistics! It is recommended to use background sequences of the same length.",
                    bg_file, l, width)
            fa.writefasta(outfile)
            return len(fa)

    def calculate_enrichment(self, motif_file, fg, bg):
        """ fg: [sample_fa, sample_gff] bg: [[bg1_fa, bg1_gff, bg1_enrichment], [bg2_fa, bg2_gff, bg2_enrichment], .. etc]

        Scans the foreground and all background FASTA files with the
        motifs in `motif_file`, writes the matches to the given gff files
        and writes one enrichment file per background.
        """
        self.logger.debug("Scanning background sequences with motifs")

        # define filenames
        fnames = [(fg[0], fg[1])] + [x[:2] for x in bg]

        # scan and save as gff
        for infile, outfile in fnames:
            with open(outfile, "w") as f:
                for line in command_scan(infile, motif_file, nreport=1,
                                         cutoff=self.SCAN_THRESHOLD,
                                         bed=False, scan_rc=True):
                    f.write(line + "\n")

        self.logger.debug("Calculating enrichment")
        num_sample = len(Fasta(fg[0]).items())
        for fasta_file, gff_file, out_file in bg:
            num_bg = len(Fasta(fasta_file).items())
            gff_enrichment(fg[1], gff_file, num_sample, num_bg, out_file)

    def create_background(self, background=None, organism="hg18", width=200):
        """Create the prediction background and all requested backgrounds.

        background -- list of background types, defaults to ["random"].
                      The background used for prediction is "gc" when
                      requested, otherwise the first entry.

        Returns a dict mapping each background type to the number of
        sequences created for it.
        """
        if background is None:
            background = ["random"]

        # Create background for motif prediction
        prediction_type = "gc" if "gc" in background else background[0]
        self._create_background(prediction_type, self.validation_bed,
                                self.validation_fa, self.prediction_bg,
                                organism=organism, width=width)

        # Get background fasta files
        nr_sequences = {}
        for bg in background:
            nr_sequences[bg] = self._create_background(
                bg, self.validation_bed, self.validation_fa,
                self.bg_file["fa"][bg], organism=organism, width=width)
        return nr_sequences

    def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
        """Cluster the motifs in `pfm_file` and write the cluster report.

        Writes the clustered motifs to `cluster_pwm`, motif images to
        self.imgdir and the HTML report to self.cluster_report.  The
        `dir` argument is not used.  Returns the clusters as a list of
        (cluster motif, member motifs) pairs.
        """
        self.logger.info("clustering significant motifs.")

        trim_ic = 0.2
        clusters = []
        motifs = self._load_motifs(pfm_file, fmt="pwm")
        if len(motifs) == 1:
            clusters = [[motifs[0], motifs]]
        else:
            tree = cluster_motifs(pfm_file, "total", "wic", "mean", True,
                                  threshold=float(threshold),
                                  include_bg=True, progress=False)
            clusters = tree.getResult()

        ids = []
        mc = MotifComparer()

        for cluster, members in clusters:
            cluster.trim(trim_ic)
            cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id),
                           format="PNG")
            ids.append([cluster.id, {"src": "images/%s.png" % cluster.id},
                        []])
            if len(members) > 1:
                scores = {}
                for motif in members:
                    scores[motif] = mc.compare_motifs(
                        cluster, motif, "total", "wic", "mean", pval=True)
                # Smallest position among all members; the images are
                # shifted relative to it.
                add_pos = min(score[1] for score in scores.values())
                for motif in members:
                    score, pos, strand = scores[motif]
                    add = pos - add_pos

                    if strand not in [1, "+"]:
                        # Draw the reverse complement under the same id
                        rc = motif.rc()
                        rc.id = motif.id
                        motif = rc
                    motif.to_img(os.path.join(
                        self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                        format="PNG", add_left=add)
            ids[-1][2] = [
                dict([("src",
                       "images/%s.png" % motif.id.replace(" ", "_")),
                      ("alt", motif.id.replace(" ", "_"))])
                for motif in members
            ]

        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("cluster_template.jinja.html")
        result = template.render(expname=self.basename, motifs=ids,
                                 inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"),
                                 version=GM_VERSION)

        with open(self.cluster_report, "w") as f:
            f.write(result.encode('utf-8'))

        with open(cluster_pwm, "w") as f:
            if len(clusters) == 1 and len(clusters[0][1]) == 1:
                f.write("%s\n" % clusters[0][0].to_pwm())
            else:
                for motif in tree.get_clustered_motifs():
                    f.write("%s\n" % motif.to_pwm())

        self.logger.debug("Clustering done. See the result in %s",
                          self.cluster_report)
        return clusters

    def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
        """Create a ROC plot image in self.imgdir for every motif.

        The images are named "<motif id>_<name>_roc.png".  Exits when a
        worker reports an error.
        """
        motifs = dict((m.id, m)
                      for m in self._load_motifs(pwm_file, fmt="pwm"))

        jobs = {}
        for motif_id, motif in motifs.items():
            jobs[motif_id] = self.job_server().apply_async(get_roc_values, (
                motif,
                fg_fasta,
                bg_fasta,
            ))

        roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")

        for motif_id in motifs.keys():
            error, x, y = jobs[motif_id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)
            roc_plot(roc_img_file % (motif_id, name), x, y)

    def calculate_cluster_enrichment(self, pwm, background):
        """Calculate enrichment of the motifs in `pwm` for each background."""
        fg = [self.validation_fa, self.validation_cluster_gff]
        bg = [[
            self.bg_file["fa"][bg_id],
            self.bg_file["gff"][bg_id],
            self.bg_file["cluster_enrichment"][bg_id]
        ] for bg_id in background]
        self.calculate_enrichment(pwm, fg, bg)

    def _roc_metrics(self, pwm, sample_fa, bg_fa, roc_file):
        """Score every motif in `pwm` and write the metrics to `roc_file`.

        Returns two dicts keyed by motif id: (ROC AUC, MNCP).  Exits when
        a worker reports an error.
        """
        motifs = dict((m.id, m) for m in self._load_motifs(pwm, fmt="pwm"))

        jobs = {}
        for motif_id, motif in motifs.items():
            jobs[motif_id] = self.job_server().apply_async(get_scores, (
                motif,
                sample_fa,
                bg_fa,
            ))

        all_auc = {}
        all_mncp = {}
        with open(roc_file, "w") as f:
            f.write(
                "Motif\tROC AUC\tMNCP\tMax f-measure\tSens @ max f-measure\n")
            for motif_id in motifs.keys():
                error, auc, mncp, max_f, y = jobs[motif_id].get()
                if error:
                    self.logger.error("Error in thread: %s", error)
                    sys.exit(1)
                f.write("%s\t%s\t%s\t%s\t%s\n" %
                        (motif_id, auc, mncp, max_f, y))
                all_auc[motif_id] = auc
                all_mncp[motif_id] = mncp
        return all_auc, all_mncp

    def _calc_report_values(self, pwm, background):
        """Collect the per-background statistics needed for the reports.

        Fills self.p and self.e (columns 3 and 6 of the cluster
        enrichment files), self.auc, self.mncp and self.closest_match.
        """
        self.logger.debug("Calculating final statistics for report")
        self.p = dict((b, {}) for b in background)
        self.e = dict((b, {}) for b in background)

        e_files = dict((bg, self.bg_file["cluster_enrichment"][bg])
                       for bg in background)

        for bg in self.p.keys():
            with open(e_files[bg]) as handle:
                for line in handle:
                    # skip comments and the header line
                    if line.startswith(("#", "Motif\tSig")):
                        continue
                    vals = line.strip().split("\t")
                    self.p[bg][vals[0]] = float(vals[2])
                    self.e[bg][vals[0]] = float(vals[5])

        self.auc = dict((b, {}) for b in background)
        self.mncp = dict((b, {}) for b in background)

        rocs = dict((bg, [self.bg_file["fa"][bg], self.bg_file["roc"][bg]])
                    for bg in background)

        for bg in self.auc.keys():
            bg_fasta_file, roc_file = rocs[bg]
            self.auc[bg], self.mncp[bg] = self._roc_metrics(
                pwm, self.validation_fa, bg_fasta_file, roc_file)

        motifs = self._load_motifs(pwm, fmt="pwm")
        self.closest_match = self.determine_closest_match(motifs)

    def _create_text_report(self, pwm, background):
        """Write the tab-separated motif report to self.text_report.

        Motifs are ordered by decreasing MNCP for the "gc" background
        when present, otherwise for the first background.
        """
        self.logger.debug("Creating text report")
        motifs = self._load_motifs(pwm, fmt="pwm")

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        header = "ID\tconsensus\tBest match db\tp-value best match\t" + "\t".join(
            "Enrichment (%s)\tp-value (%s)\tROC AUC (%s)\tMNCP (%s)" %
            (b, b, b, b) for b in background)

        ranked = sorted(motifs, key=lambda m: self.mncp[sort_key][m.id],
                        reverse=True)
        with open(self.text_report, "w") as f:
            f.write("%s\n" % header)
            for motif in ranked:
                vals = [
                    motif.id,
                    motif.to_consensus(),
                    self.closest_match[motif.id][0].id,
                    self.closest_match[motif.id][1]
                ]
                for bg in background:
                    vals += [
                        self.e[bg][motif.id], self.p[bg][motif.id],
                        self.auc[bg][motif.id], self.mncp[bg][motif.id]
                    ]
                f.write("%s\n" % "\t".join([str(x) for x in vals]))

    def print_params(self):
        """Write all parameters, one "name<tab>value" per line."""
        with open(self.params_file, "w") as f:
            for param, value in self.params.items():
                f.write("%s\t%s\n" % (param, value))

    def _create_report(self, pwm, background, stats=None, best_id=None):
        """Render the HTML motif report to self.motif_report.

        stats   -- dict keyed by "<motif id>_<consensus>"; the "stars"
                   entry of each value is shown in the report
        best_id -- dict mapping motif id to the label shown as "best"

        Requires _calc_report_values() to have been called.
        """
        if stats is None:
            stats = {}
        if best_id is None:
            best_id = {}

        self.logger.debug("Creating graphical report")

        class ReportMotif:
            pass

        motifs = self._load_motifs(pwm, fmt="pwm")
        for match in self.closest_match.values():
            match[0].to_img(os.path.join(self.imgdir,
                                         "%s.png" % match[0].id),
                            format="PNG")

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        roc_img_file = "%s_%s_roc"
        report_motifs = []
        sorted_motifs = sorted(motifs,
                               key=lambda m: self.mncp[sort_key][m.id],
                               reverse=True)

        for motif in sorted_motifs:
            rm = ReportMotif()
            rm.id = motif.id
            rm.id_href = {"href": "#%s" % motif.id}
            rm.id_name = {"name": motif.id}
            rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}

            rm.best = best_id[motif.id]

            rm.consensus = motif.to_consensus()
            rm.stars = stats["%s_%s" %
                             (motif.id, motif.to_consensus())]["stars"]

            rm.bg = {}
            for bg in background:
                roc_img = ("images/" +
                           os.path.basename(roc_img_file % (motif.id, bg)) +
                           ".png")
                rm.bg[bg] = {}
                rm.bg[bg]["e"] = "%0.2f" % self.e[bg].setdefault(
                    motif.id, 0.0)
                rm.bg[bg]["p"] = "%0.2f" % self.p[bg].setdefault(
                    motif.id, 1.0)
                rm.bg[bg]["auc"] = "%0.3f" % self.auc[bg][motif.id]
                rm.bg[bg]["mncp"] = "%0.3f" % self.mncp[bg][motif.id]
                rm.bg[bg]["roc_img"] = {"src": roc_img}
                rm.bg[bg]["roc_img_link"] = {"href": roc_img}

            rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
            rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}
            rm.match_img = {
                "src": "images/%s.png" % self.closest_match[motif.id][0].id
            }
            rm.match_id = self.closest_match[motif.id][0].id
            rm.match_pval = "%0.2e" % self.closest_match[motif.id][1]

            report_motifs.append(rm)

        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("report_template.jinja.html")
        result = template.render(expname=self.basename,
                                 motifs=report_motifs,
                                 inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"),
                                 version=GM_VERSION)

        with open(self.motif_report, "w") as f:
            f.write(result.encode('utf-8'))

    def determine_closest_match(self, motifs):
        """Find the closest database motif for each motif in `motifs`.

        Returns a dict mapping motif id to [database motif, p-value].
        The database is the configured default "motif_db".
        """
        self.logger.debug("Determining closest matching motifs in database")
        motif_db = self.config.get_default_params()["motif_db"]
        db = os.path.join(self.config.get_motif_dir(), motif_db)

        db_motifs = []
        if db.endswith(("pwm", "pfm")):
            db_motifs = self._load_motifs(db, fmt="pwm")
        elif db.endswith("transfac"):
            db_motifs = read_motifs(db, fmt="transfac")

        closest_match = {}
        mc = MotifComparer()
        db_motif_lookup = dict((m.id, m) for m in db_motifs)
        match = mc.get_closest_match(motifs, db_motifs, "partial", "wic",
                                     "mean", parallel=False)
        for motif in motifs:
            db_motif = db_motif_lookup[match[motif.id][0]]
            # Calculate p-value
            pval, pos, orient = mc.compare_motifs(
                motif, db_motif, "partial", "wic", "mean", pval=True)
            closest_match[motif.id] = [db_motif, pval]
        return closest_match

    def _determine_best_motif_in_cluster(self, clusters, pwm, sample_fa,
                                         bg_fa, imgdir=None):
        """Pick the best motif of every cluster and write them to `pwm`.

        For clusters with more than one member the cluster motif and its
        members are scored and the one with the highest MNCP wins.  The
        winners are renamed "GimmeMotifs_<n>".  When `imgdir` is given an
        image of every winner is written there.

        Returns (num_cluster, best_id): the number of cluster members
        keyed by "<new id>_<consensus>", and the first "_"-separated
        field of the original id keyed by the new id.
        """
        num_cluster = {}
        best_id = {}
        with open(pwm, "w") as out:
            for i, (clus, singles) in enumerate(clusters):
                best_motif = clus
                if len(singles) > 1:
                    motifs = [clus] + singles
                    tmp = NamedTemporaryFile(dir=mytmpdir())
                    tmp2 = NamedTemporaryFile(dir=mytmpdir())
                    try:
                        for m in motifs:
                            tmp.write("%s\n" % m.to_pwm())
                        tmp.flush()
                        auc, mncp = self._roc_metrics(tmp.name, sample_fa,
                                                      bg_fa, tmp2.name)
                    finally:
                        tmp.close()
                        tmp2.close()

                    ranked = sorted(motifs, key=lambda m: mncp[m.id])
                    for m in ranked:
                        self.logger.debug("sorted: %s %s %s", str(m),
                                          mncp[m.id], auc[m.id])
                    self.logger.debug("end list")
                    best_motif = ranked[-1]

                old_id = best_motif.id
                best_motif.id = "GimmeMotifs_%d" % (i + 1)
                best_id[best_motif.id] = old_id.split("_")[0]
                num_cluster["%s_%s" % (best_motif.id,
                                       best_motif.to_consensus())] = len(
                                           singles)
                if imgdir:
                    best_motif.to_img(os.path.join(imgdir, best_motif.id),
                                      format="PNG")
                out.write("%s\n" % best_motif.to_pwm())
        return num_cluster, best_id

    def run_full_analysis(self, inputfile, user_params=None):
        """ Full analysis: from bed-file to motifs (including clustering, ROC-curves, location plots and html report)

        inputfile   -- BED or FASTA file
        user_params -- dict of parameters overriding the configured
                       defaults

        Returns the file name of the HTML motif report, or None when no
        significant motifs are found.  Exits (SystemExit) on invalid
        input, when no valid background is left or when no motifs are
        predicted.
        """
        self.logger.info("starting full motif analysis")
        self.logger.debug("Using temporary directory {0}".format(mytmpdir()))

        if user_params is None:
            user_params = {}
        params = self.config.get_default_params()
        params.update(user_params)

        if params["torque"]:
            from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
            self.logger.debug("Using torque")
        else:
            from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
            self.logger.debug("Using multiprocessing")

        self.params = params

        background = [x.strip() for x in params["background"].split(",")]

        self.logger.debug("Parameters:")
        for param, value in params.items():
            self.logger.debug(" %s: %s", param, value)

        # Checking input
        self.input_type = "BED"
        # If we can load it as fasta then it is a fasta, yeh?
        try:
            Fasta(inputfile)
            self.logger.debug("Inputfile is a FASTA file")
            self.input_type = "FASTA"
        except Exception:
            # Leave it to BED
            pass

        index_msg = ("No index found for genome {}! "
                     "Has GimmeMotifs been configured correctly and is the "
                     "genome indexed?").format(params["genome"])
        index_dir = os.path.join(self.config.get_index_dir(),
                                 params["genome"])

        if self.input_type == "FASTA":
            for bg in background:
                if bg not in FA_VALID_BGS:
                    self.logger.info(
                        "Input type is FASTA, can't use background type '%s'",
                        bg)
                if bg == "genomic":
                    if not os.path.exists(index_dir):
                        self.logger.error(index_msg)
                        sys.exit(1)
            background = [bg for bg in background if bg in FA_VALID_BGS]

        elif self.input_type == "BED":
            # Does the index_dir exist? (BED-specific)
            if not os.path.exists(index_dir):
                self.logger.error(index_msg)
                sys.exit(1)

            # is it a valid bed-file etc. (BED-specific)
            self._check_input(inputfile)

            # Check for valid background
            for bg in background:
                if bg not in BED_VALID_BGS:
                    self.logger.info(
                        "Input type is BED, can't use background type '%s'",
                        bg)
            background = [bg for bg in background if bg in BED_VALID_BGS]

        if len(background) == 0:
            self.logger.error("No valid backgrounds specified!")
            sys.exit(1)

        # Maximum time?
        self.max_time = None
        max_time = None
        if params["max_time"]:
            try:
                max_time = float(params["max_time"])
            except (TypeError, ValueError):
                self.logger.debug(
                    "Could not parse max_time value, setting to no limit")

            # max_time stays None when parsing failed; never compare None
            # with a number.
            if max_time is not None and max_time > 0:
                self.logger.debug(
                    "Time limit for motif prediction: %0.2f hours" % max_time)
                max_time = 3600 * max_time
                self.max_time = max_time
                self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
            else:
                self.logger.debug(
                    "Invalid time limit for motif prediction, setting to no limit"
                )
                self.max_time = None
        else:
            self.logger.debug("No time limit for motif prediction")

        if "random" in background:
            self.markov_model = params["markov_model"]

        # Create the necessary files for motif prediction and validation
        if self.input_type == "BED":
            self.prepare_input_bed(inputfile, params["genome"],
                                   params["width"], params["fraction"],
                                   params["abs_max"], params["use_strand"])

            # Create file for location plots
            index_dir = os.path.join(self.config.get_index_dir(),
                                     params["genome"])
            lwidth = int(params["lwidth"])
            width = int(params["width"])
            # integer number of bases to add on each side
            extend = (lwidth - width) // 2

            genome_index.track2fasta(index_dir, self.validation_bed,
                                     self.location_fa, extend_up=extend,
                                     extend_down=extend,
                                     use_strand=params["use_strand"],
                                     ignore_missing=True)

        elif self.input_type == "FASTA":
            self.prepare_input_fa(inputfile, params["width"],
                                  params["fraction"], params["abs_max"])

            # File for location plots
            self.location_fa = self.validation_fa
            fa = Fasta(self.location_fa)
            seqs = fa.seqs
            lwidth = len(seqs[0])
            all_same_width = all(len(seq) == lwidth for seq in seqs)
            if not all_same_width:
                self.logger.warning(
                    "PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!"
                )

        else:
            self.logger.error("Unknown input type, shouldn't happen")
            sys.exit(1)

        # Map every available tool to whether it was requested.  Names are
        # stripped on both sides so that "a, b" style lists work.
        requested = [y.strip() for y in params["tools"].split(",")]
        tools = dict((x.strip(), x.strip() in requested)
                     for x in params["available_tools"].split(","))

        self.create_background(background, params["genome"], params["width"])

        # Predict the motifs, input is a FASTA-file
        analysis = params["analysis"]
        self.logger.info("starting motif prediction (%s)", analysis)
        self.logger.info("tools: %s",
                         ", ".join([x for x in tools.keys() if tools[x]]))

        # Use the best-ranked background for significance
        bg_file = self.bg_file["fa"][min(background,
                                         key=lambda b: BG_RANK[b])]
        self.logger.debug("Using bg_file %s for significance" % bg_file)

        result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm,
                                   analysis, params["genome"],
                                   params["use_strand"], self.prediction_bg,
                                   tools, self.job_server(),
                                   logger=self.logger,
                                   max_time=self.max_time,
                                   fg_file=self.validation_fa,
                                   bg_file=bg_file)

        motifs = result.motifs
        self.logger.info("predicted %s motifs", len(motifs))
        self.logger.debug("written to %s", self.predicted_pfm)

        if len(motifs) == 0:
            self.logger.info("no motifs found")
            sys.exit()

        # Write stats output to file
        with open(self.stats_file, "w") as f:
            stat_keys = list(list(result.stats.values())[0].keys())
            f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))

            self.logger.debug(result.stats)

            with_stats = []
            for motif in motifs:
                stats = result.stats["%s_%s" %
                                     (motif.id, motif.to_consensus())]
                if stats:
                    f.write("%s\t%s\n" %
                            (motif.id,
                             "\t".join([str(stats[k]) for k in stat_keys])))
                    with_stats.append(motif)
                else:
                    self.logger.error(
                        "No stats for motif {0}, skipping this motif!".format(
                            motif.id))
        # Drop motifs without stats only after the loop (removing while
        # iterating skips elements); done in place so that result.motifs
        # is updated as well.
        motifs[:] = with_stats
        self.motifs_with_stats = motifs

        with open(self.ranks_file, "w") as f:
            tool_names = list(
                dict((m.id.split("_")[0], 1) for m in motifs).keys())
            f.write("Metric\tType\t%s\n" % ("\t".join(tool_names)))
            for stat in ["mncp", "roc_auc", "maxenr"]:
                # best value of this statistic per tool
                best_motif = {}
                for motif in self.motifs_with_stats:
                    val = result.stats["%s_%s" % (
                        motif.id, motif.to_consensus())][stat]
                    name = motif.id.split("_")[0]
                    if val > best_motif.setdefault(name, 0):
                        best_motif[name] = val
                names = list(best_motif.keys())
                vals = [best_motif[name] for name in names]
                rank = rankdata(vals)
                ind = [names.index(x) for x in tool_names]

                f.write("%s\t%s\t%s\n" %
                        (stat, "value",
                         "\t".join([str(vals[i]) for i in ind])))
                f.write("%s\t%s\t%s\n" %
                        (stat, "rank",
                         "\t".join([str(rank[i]) for i in ind])))

        # Determine significant motifs
        nsig = 0
        with open(self.significant_pfm, "w") as f:
            for motif in motifs:
                stats = result.stats["%s_%s" %
                                     (motif.id, motif.to_consensus())]
                if (stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55
                        and stats['enr_fdr'] >= 2):
                    f.write("%s\n" % motif.to_pfm())
                    nsig += 1
        self.logger.info("%s motifs are significant", nsig)
        self.logger.debug("written to %s", self.significant_pfm)

        if nsig == 0:
            self.logger.info("no significant motifs found")
            return

        # ROC metrics of significant motifs
        for bg in background:
            self._roc_metrics(self.significant_pfm, self.validation_fa,
                              self.bg_file["fa"][bg],
                              self.bg_file["roc"][bg])

        # Cluster significant motifs
        clusters = self._cluster_motifs(self.significant_pfm,
                                        self.cluster_pwm, self.outdir,
                                        params["cluster_threshold"])

        # Determine best motif in cluster
        num_cluster, best_id = self._determine_best_motif_in_cluster(
            clusters, self.final_pwm, self.validation_fa, bg_file,
            self.imgdir)

        # Compute the statistics of the final, clustered motifs
        tmp = NamedTemporaryFile(dir=mytmpdir()).name
        p = PredictionResult(tmp, logger=self.logger,
                             job_server=self.server,
                             fg_file=self.validation_fa, bg_file=bg_file,
                             do_counter=False)
        p.add_motifs(
            ("clustering", (self._load_motifs(self.final_pwm), "", "")))
        # Wait until statistics are available for all motifs
        while len(p.stats.keys()) < len(p.motifs):
            sleep(5)

        for mid, num in num_cluster.items():
            p.stats[mid]["numcluster"] = num

        # Per statistic the three values passed to star()
        all_stats = {
            "mncp": [2, 5, 8],
            "roc_auc": [0.6, 0.75, 0.9],
            "maxenr": [10, 20, 30],
            "enr_fdr": [4, 8, 12],
            "fraction": [0.4, 0.6, 0.8],
            "ks_sig": [4, 7, 10],
            "numcluster": [3, 6, 9],
        }

        self.logger.info("creating report")

        # ROC plots
        for bg in background:
            self.create_roc_plots(self.final_pwm, self.validation_fa,
                                  self.bg_file["fa"][bg], bg)

        # Location plots
        self.logger.debug("Creating localization plots")
        motifs = self._load_motifs(self.final_pwm, fmt="pwm")
        for motif in motifs:
            m = "%s_%s" % (motif.id, motif.to_consensus())
            s = p.stats[m]
            outfile = os.path.join(self.imgdir,
                                   "%s_histogram.svg" % motif.id)
            motif_localization(self.location_fa, motif, lwidth, outfile,
                               cutoff=s["cutoff_fdr"])

            # mean star score over all statistics, rounded half up
            s["stars"] = int(
                mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) +
                0.5)
            self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

        # Calculate enrichment of final, clustered motifs
        self.calculate_cluster_enrichment(self.final_pwm, background)

        # Create report
        self.print_params()
        self._calc_report_values(self.final_pwm, background)
        self._create_report(self.final_pwm, background, stats=p.stats,
                            best_id=best_id)
        self._create_text_report(self.final_pwm, background)
        self.logger.info("finished")
        self.logger.info("output dir: %s",
                         os.path.split(self.motif_report)[0])
        self.logger.info("report: %s", os.path.split(self.motif_report)[-1])

        if not (params["keep_intermediate"]):
            self.logger.debug(
                "Deleting intermediate files. Please specifify the -k option if you want to keep these files."
            )
            shutil.rmtree(self.tmpdir)

        self.logger.debug("Done")

        return self.motif_report
def pp_predict_motifs(fastafile, outfile, analysis="small", organism="hg18",
                      single=False, background="", tools=None,
                      job_server=None, ncpus=8, max_time=None,
                      stats_fg=None, stats_bg=None):
    """Parallel prediction of motifs.

    Utility function for gimmemotifs.denovo.gimme_motifs. Probably better to
    use that, instead of this function directly.

    Parameters
    ----------
    fastafile
        Input FASTA file, handed to every motif tool job.
    outfile
        Passed to PredictionResult as its first argument.
    analysis : str, optional
        One of "xs", "small", "medium", "large" or "xl". Selects the range of
        motif widths that is tried. "xs" uses a maximum width of 5 and is
        passed on to the tools as "small".
    organism, single, background : optional
        Forwarded unchanged to the tools in the parameter dictionary.
    tools : dict, optional
        Maps a tool name to a truthy value to enable it. When empty or None,
        the comma-separated "tools" entry of the default configuration is
        used and every listed tool is enabled.
    job_server : optional
        Object providing ``apply_async``. Defaults to the module-level
        ``pool``.
    ncpus, max_time : optional
        Accepted for backward compatibility; not used by this function.
    stats_fg, stats_bg : optional
        Passed to PredictionResult as ``fg_file`` and ``background``.

    Returns
    -------
    result : PredictionResult
        The result object that collected the motifs of all jobs.
    """
    config = MotifConfig()

    if not tools:
        # get_default_params is a method: it has to be called before the
        # "tools" entry can be looked up.
        default_tools = config.get_default_params()["tools"].split(",")
        tools = {name: 1 for name in default_tools}

    wmin = 5
    step = 1
    if analysis in ["large", "xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs": 5, "small": 8, "medium": 10, "large": 14, "xl": 20}
    wmax = analysis_max[analysis]

    # The width range of "xs" is kept, but the tools only know "small".
    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    if not job_server:
        job_server = pool

    jobs = {}

    result = PredictionResult(
        outfile,
        fg_file=stats_fg,
        background=stats_bg,
        job_server=job_server,
    )

    # Dynamically load all tools: one instance of every MotifProgram subclass
    # found in tool_classes (the base class itself is excluded).
    toolio = [
        member()
        for name, member in inspect.getmembers(
            tool_classes,
            lambda x: inspect.isclass(x)
            and issubclass(x, tool_classes.MotifProgram))
        if name != 'MotifProgram'
    ]

    # TODO:
    # Add warnings for running time: Weeder, GADEM

    # Add all jobs to the job_server
    params = {
        'analysis': analysis,
        'background': background,
        "single": single,
        "organism": organism,
    }

    # Tools that don't use a specified width usually take longer
    # ie. GADEM, XXmotif, MEME
    # Start these first.
    for t in [tool for tool in toolio if not tool.use_width]:
        if t.name in tools and tools[t.name]:
            logger.debug("Starting %s job", t.name)
            job_name = t.name
            jobs[job_name] = job_server.apply_async(
                _run_tool,
                (job_name, t, fastafile, params),
                callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    # Width-based tools get one job per width.
    for t in [tool for tool in toolio if tool.use_width]:
        if t.name in tools and tools[t.name]:
            for i in range(wmin, wmax + 1, step):
                logger.debug("Starting %s job, width %s", t.name, i)
                job_name = "%s_width_%s" % (t.name, i)
                my_params = params.copy()
                my_params['width'] = i
                jobs[job_name] = job_server.apply_async(
                    _run_tool,
                    (job_name, t, fastafile, my_params),
                    callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    logger.info("all jobs submitted")
    # Block until every job has finished; get() re-raises job exceptions.
    for job in jobs.values():
        job.get()

    result.wait_for_stats()

    return result
def pp_predict_motifs(fastafile, outfile, analysis="small", organism="hg18",
                      single=False, background="", tools=None,
                      job_server=None, ncpus=8, max_time=-1,
                      stats_fg=None, stats_bg=None):
    """Parallel prediction of motifs.

    Utility function for gimmemotifs.denovo.gimme_motifs. Probably better to
    use that, instead of this function directly.

    Parameters
    ----------
    fastafile
        Input FASTA file, handed to every motif tool job.
    outfile
        Passed to PredictionResult as its first argument.
    analysis : str, optional
        One of "xs", "small", "medium", "large" or "xl". Selects the range of
        motif widths that is tried. "xs" uses a maximum width of 5 and is
        passed on to the tools as "small".
    organism, single, background : optional
        Forwarded unchanged to the tools in the parameter dictionary.
    tools : dict, optional
        Maps a tool name to a truthy value to enable it. When empty or None,
        the comma-separated "tools" entry of the default configuration is
        used and every listed tool is enabled.
    job_server : optional
        Object providing ``apply_async``. When not given, a multiprocessing
        Pool is created with the "ncpus" value of the default configuration.
    ncpus, max_time : optional
        Accepted for backward compatibility; not used by this function (the
        pool size is read from the configuration, not from ``ncpus``).
    stats_fg, stats_bg : optional
        Passed to PredictionResult as ``fg_file`` and ``background``.

    Returns
    -------
    result : PredictionResult
        The result object that collected the motifs of all jobs.
    """
    config = MotifConfig()

    if not tools:
        # get_default_params is a method: it has to be called before the
        # "tools" entry can be looked up.
        default_tools = config.get_default_params()["tools"].split(",")
        tools = {name: 1 for name in default_tools}

    wmin = 5
    step = 1
    if analysis in ["large", "xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs": 5, "small": 8, "medium": 10, "large": 14, "xl": 20}
    wmax = analysis_max[analysis]

    # The width range of "xs" is kept, but the tools only know "small".
    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    if not job_server:
        n_cpus = int(config.get_default_params()["ncpus"])
        job_server = Pool(processes=n_cpus, maxtasksperchild=1000)

    jobs = {}

    result = PredictionResult(
        outfile,
        fg_file=stats_fg,
        background=stats_bg,
        job_server=job_server,
    )

    # Dynamically load all tools: one instance of every MotifProgram subclass
    # found in tool_classes (the base class itself is excluded).
    toolio = [
        member()
        for name, member in inspect.getmembers(
            tool_classes,
            lambda x: inspect.isclass(x)
            and issubclass(x, tool_classes.MotifProgram))
        if name != 'MotifProgram'
    ]

    # TODO:
    # Add warnings for running time: Weeder, GADEM

    # Add all jobs to the job_server
    params = {
        'analysis': analysis,
        'background': background,
        "single": single,
        "organism": organism,
    }

    # Tools that don't use a specified width usually take longer
    # ie. GADEM, XXmotif, MEME
    # Start these first.
    for t in [tool for tool in toolio if not tool.use_width]:
        if t.name in tools and tools[t.name]:
            logger.debug("Starting %s job", t.name)
            job_name = t.name
            jobs[job_name] = job_server.apply_async(
                _run_tool,
                (job_name, t, fastafile, params),
                callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    # Width-based tools get one job per width.
    for t in [tool for tool in toolio if tool.use_width]:
        if t.name in tools and tools[t.name]:
            for i in range(wmin, wmax + 1, step):
                logger.debug("Starting %s job, width %s", t.name, i)
                job_name = "%s_width_%s" % (t.name, i)
                my_params = params.copy()
                my_params['width'] = i
                jobs[job_name] = job_server.apply_async(
                    _run_tool,
                    (job_name, t, fastafile, my_params),
                    callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    logger.info("all jobs submitted")
    # Block until every job has finished; get() re-raises job exceptions.
    for job in jobs.values():
        job.get()

    result.wait_for_stats()

    return result
def _create_graphical_report(inputfile, pwm, background, closest_match,
                             outdir, stats, best_id=None):
    """Create main gimme_motifs output html report.

    Writes ``motif_report.html`` into ``outdir`` and the motif logos it
    refers to into ``outdir/images``.

    Parameters
    ----------
    inputfile
        Passed to the report template as ``inputfile``.
    pwm
        Motif file, read with ``read_motifs(pwm, fmt="pwm")``.
    background : dict
        Background types; the keys are iterated and listed in the report.
    closest_match : dict
        Maps motif id to ``[match_id, scores]``; the last element of
        ``scores`` is reported as the match p-value.
    outdir : str
        Output directory.
    stats : dict
        Maps ``str(motif)`` to a per-background dictionary of statistics.
        Missing entries fall back to neutral defaults.
    best_id : dict, optional
        Currently unused (see TODO below).
    """
    if best_id is None:
        best_id = {}

    logger.debug("Creating graphical report")

    class ReportMotif(object):
        """Placeholder for motif stats."""
        pass

    config = MotifConfig()

    imgdir = os.path.join(outdir, "images")
    if not os.path.exists(imgdir):
        os.mkdir(imgdir)

    motifs = read_motifs(pwm, fmt="pwm")

    roc_img_file = "%s_roc.%s"

    dbpwm = config.get_default_params()["motif_db"]
    pwmdir = config.get_motif_dir()

    dbmotifs = read_motifs(os.path.join(pwmdir, dbpwm), as_dict=True)

    report_motifs = []
    for motif in motifs:
        # Statistics of this motif per background; an absent motif or
        # background yields an empty dict so the defaults below apply.
        motif_stats = stats.get(str(motif), {})

        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": "#%s" % motif.id}
        rm.id_name = {"name": motif.id}
        rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}
        motif.to_img(
            os.path.join(outdir, "images/{}.png".format(motif.id)),
            fmt="PNG")

        # TODO: fix best ID
        rm.best = "Gimme"  # best_id[motif.id]

        rm.consensus = motif.to_consensus()
        # Average number of stars over all backgrounds, rounded half up.
        rm.stars = int(np.mean(
            [(motif_stats.get(bg) or {}).get("stars", 0)
             for bg in background]
        ) + 0.5)

        rm.bg = {}
        for bg in background:
            rm.bg[bg] = {}
            this_stats = motif_stats.get(bg) or {}
            # TODO: fix these stats
            rm.bg[bg]["e"] = "%0.2f" % this_stats.get("enr_at_fpr", 1.0)
            rm.bg[bg]["p"] = "%0.2f" % this_stats.get("phyper_at_fpr", 1.0)
            rm.bg[bg]["auc"] = "%0.3f" % this_stats.get("roc_auc", 0.5)
            rm.bg[bg]["mncp"] = "%0.3f" % this_stats.get("mncp", 1.0)
            roc_img = ("images/"
                       + os.path.basename(roc_img_file % (motif.id, bg))
                       + ".png")
            rm.bg[bg]["roc_img"] = {"src": roc_img}
            rm.bg[bg][u"roc_img_link"] = {u"href": roc_img}

        rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
        rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}

        # Logo and p-value of the closest database motif.
        match_id = closest_match[motif.id][0]
        dbmotifs[match_id].to_img(
            os.path.join(outdir, "images/{}.png".format(match_id)),
            fmt="PNG")

        rm.match_img = {"src": "images/{}.png".format(match_id)}
        rm.match_id = match_id
        rm.match_pval = "%0.2e" % closest_match[motif.id][1][-1]

        report_motifs.append(rm)

    total_report = os.path.join(outdir, "motif_report.html")

    star_img = os.path.join(config.get_template_dir(), "star.png")
    shutil.copyfile(star_img, os.path.join(outdir, "images", "star.png"))

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("report_template.jinja.html")
    # TODO: title
    result = template.render(
        motifs=report_motifs,
        inputfile=inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=__version__,
        bg_types=list(background.keys()))

    with open(total_report, "wb") as f:
        f.write(result.encode('utf-8'))
class MotifProgram(object):
    """Motif program base class.

    The methods here rely on attributes that are not set in this class:
    ``name``, ``default_params`` and ``_run_program`` (presumably provided
    by the concrete tool subclasses).
    """

    # Shared configuration, used to look up tool binaries and directories.
    config = MotifConfig()
    # When set, overrides the configured binary (see bin()).
    local_bin = None

    def _parse_params(self, params=None, needs_background=False):
        """Parse parameters.

        Combine default and user-defined parameters.

        Parameters
        ----------
        params : dict, optional
            User-defined parameters; they override ``self.default_params``.
        needs_background : bool, optional
            Raise an error when no "background" parameter is available.

        Returns
        -------
        prm : dict
            The combined parameters.

        Raises
        ------
        ValueError
            If ``needs_background`` is set and no background is specified.
        """
        prm = self.default_params.copy()
        if params is not None:
            prm.update(params)

        # Background file is essential!
        if "background" in prm:
            # Absolute path, just to be sure
            prm["background"] = os.path.abspath(prm["background"])
        elif needs_background:
            raise ValueError("Background file needed!")

        return prm

    def _read_and_label_motifs(self, outfile, stdout, stderr, fmt="meme"):
        """Read output motifs and label with program name.

        Parameters
        ----------
        outfile : str
            Motif file written by the tool.
        stdout, stderr : str
            Output of the tool; a note is appended when ``outfile`` is
            missing.
        fmt : str, optional
            Format of ``outfile``, passed on to ``read_motifs``.

        Returns
        -------
        motifs, stdout, stderr
            ``motifs`` is empty when ``outfile`` does not exist.
        """
        if not os.path.exists(outfile):
            stdout += "\nMotif file {0} not found!\n".format(outfile)
            stderr += "\nMotif file {0} not found!\n".format(outfile)
            return [], stdout, stderr

        # Honour the requested format instead of always assuming "meme".
        motifs = read_motifs(outfile, fmt=fmt)
        for m in motifs:
            m.id = "{0}_{1}".format(self.name, m.id)
        return motifs, stdout, stderr

    def bin(self):
        """
        Get the command used to run the tool.

        Returns
        -------
        command : str
            The tool system command.
        """
        if self.local_bin:
            return self.local_bin
        return self.config.bin(self.name)

    def dir(self):
        """
        Get the installation directory of the tool.

        Returns
        -------
        dir : str
            The tool directory.
        """
        return self.config.dir(self.name)

    def is_configured(self):
        """
        Check if the tool is configured.

        Returns
        -------
        is_configured : bool
            True if the tool is configured.
        """
        return self.config.is_configured(self.name)

    def is_installed(self):
        """
        Check if the tool is installed.

        Returns
        -------
        is_installed : bool
            True if the tool is configured and its command is executable.
        """
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, params=None, tmp=None):
        """
        Run the tool and predict motifs from a FASTA file.

        Parameters
        ----------
        fastafile : str
            Name of the FASTA input file.

        params : dict, optional
            Optional parameters. For some of the tools required parameters
            are passed using this dictionary.

        tmp : str, optional
            Directory to use for creation of temporary files.

        Returns
        -------
        motifs : list of Motif instances
            The predicted motifs.

        stdout : str
            Standard out of the tool.

        stderr : str
            Standard error of the tool.

        Raises
        ------
        ValueError
            If the tool is not configured or not installed.
        """
        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        # Working directory for this run; stored on the instance.
        self.tmpdir = mkdtemp(prefix="{0}.".format(self.name), dir=tmp)
        fastafile = os.path.abspath(fastafile)

        try:
            return self._run_program(self.bin(), fastafile, params)
        except KeyboardInterrupt:
            # An interrupted tool yields no motifs instead of an error.
            return ([], "Killed", "Killed")
def write_config(self):
    """Generate a GimmeMotifs configuration file for this installation.

    Starts from ``cfg/gimmemotifs.cfg.example``, points the data directories
    at ``self.install_data``, registers every motif program and seqlogo
    binary found with ``which`` and writes the result. An existing
    configuration file is never overwritten: the new configuration is
    written next to it with a ``.tmp`` suffix instead.
    """
    from gimmemotifs.config import MotifConfig

    cfg = MotifConfig(use_config="cfg/gimmemotifs.cfg.example")

    data_dir = os.path.abspath(self.install_data)
    cfg.set_template_dir(os.path.join(data_dir, 'gimmemotifs/templates'))
    cfg.set_gene_dir(os.path.join(data_dir, 'gimmemotifs/genes'))
    cfg.set_score_dir(os.path.join(data_dir, 'gimmemotifs/score_dists'))
    cfg.set_index_dir(os.path.join(data_dir, 'gimmemotifs/genome_index'))
    cfg.set_motif_dir(os.path.join(data_dir, 'gimmemotifs/motif_databases'))
    cfg.set_bg_dir(os.path.join(data_dir, 'gimmemotifs/bg'))

    # Single-argument print(...) behaves the same as a print statement and
    # as a print function call.
    print("")
    print("Trying to locate motif programs")

    MOTIF_CLASSES = ["MDmodule", "Meme", "Weeder", "Gadem", "MotifSampler",
                     "Trawler", "Improbizer", "MoAn", "BioProspector"]
    available = []
    for program in MOTIF_CLASSES:
        # eval() only ever sees the hard-coded class names listed above.
        m = eval(program)()
        bin_path = which(m.cmd)
        if bin_path:
            print("Found %s in %s" % (m.name, bin_path))
            available.append(m.name)
            # Some tools need their installation directory, which is
            # derived from the location of the binary.
            tool_dir = None
            if program == "Weeder":
                tool_dir = bin_path.replace("weederTFBS.out", "")
            elif program == "Meme":
                tool_dir = bin_path.replace("bin/meme", "")
            elif program == "Trawler":
                tool_dir = bin_path.replace("bin/trawler.pl", "")
            cfg.set_program(m.name, {"bin": bin_path, "dir": tool_dir})
        else:
            print("Couldn't find %s" % m.name)

    print("")
    print("Trying to locate seqlogo")
    seqlogo_bin = which("seqlogo")
    if seqlogo_bin:
        print("Found seqlogo in %s" % (seqlogo_bin))
        cfg.set_seqlogo(seqlogo_bin)
    else:
        print("Couldn't find seqlogo")
    print("")

    DEFAULT_PARAMS["available_tools"] = ",".join(available)
    DEFAULT_PARAMS["tools"] = ",".join(available)
    cfg.set_default_params(DEFAULT_PARAMS)

    # Use a user-specific configfile if any other installation scheme is used
    if os.path.abspath(self.install_data) == "/usr/share":
        config_file = "/usr/share/gimmemotifs/%s" % CONFIG_NAME
    else:
        config_file = os.path.expanduser("~/.%s" % CONFIG_NAME)

    if os.path.exists(config_file):
        new_config = config_file + ".tmp"
        print("INFO: Configfile %s already exists!\n Will create %s, which contains the new config.\n If you want to use the newly generated config you can move %s to %s, otherwise you can delete %s.\n" % (config_file, new_config, new_config, config_file, new_config))
        target = new_config
    else:
        print("Writing configuration file %s" % config_file)
        target = config_file

    # Close the handle deterministically so the file is flushed to disk.
    with open(target, "wb") as f:
        cfg.write(f)

    print("Edit %s to further configure GimmeMotifs." % config_file)
def diff(args):
    """Compare motif frequencies between input files and plot the result.

    ``args`` provides ``inputfiles`` (comma-separated file names),
    ``bgfile``, ``outputfile``, ``pwmfile``, ``cutoff``, ``genome``,
    ``minenr`` and ``minfreq``. A single input file ending in "bed" is split
    into one FASTA file per cluster, using the fifth column as cluster name
    and the first three columns as region; this requires ``args.genome``.
    """
    infiles = args.inputfiles.split(",")
    bgfile = args.bgfile
    outfile = args.outputfile
    pwmfile = args.pwmfile
    cutoff = args.cutoff
    genome = args.genome
    minenr = float(args.minenr)
    minfreq = float(args.minfreq)

    tmpdir = mkdtemp()
    # Remove the temporary directory on every exit path, including errors
    # and sys.exit().
    try:
        # Retrieve FASTA clusters from BED file
        if len(infiles) == 1 and infiles[0].endswith("bed"):
            if not args.genome:
                sys.stderr.write("Can't convert BED file without genome!\n")
                sys.exit(1)

            clusters = {}
            with open(infiles[0]) as bed:
                for line in bed:
                    vals = line.strip().split("\t")
                    clusters.setdefault(vals[4], []).append(vals[:3])

            infiles = []

            config = MotifConfig()
            index_dir = config.get_index_dir()

            for cluster, regions in clusters.items():
                sys.stderr.write(
                    "Creating FASTA file for {0}\n".format(cluster))
                inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
                outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
                with open(inbed, "w") as f:
                    for vals in regions:
                        f.write("{0}\t{1}\t{2}\n".format(*vals))
                track2fasta(os.path.join(index_dir, genome), inbed, outfa)
                infiles.append(outfa)

        pwms = dict([(m.id, m) for m in pwmfile_to_motifs(pwmfile)])
        motifs = list(pwms.keys())
        names = [os.path.basename(os.path.splitext(f)[0]) for f in infiles]

        # Get background frequencies; the 0.01 pseudocount avoids zero
        # frequencies.
        nbg = float(len(Fasta(bgfile).seqs))
        bgcounts = get_counts(bgfile, pwms.values(), cutoff)
        bgfreq = [(bgcounts[m] + 0.01) / nbg for m in motifs]

        # Get frequencies in input files
        freq_rows = []
        count_rows = []
        for fname in infiles:
            c = get_counts(fname, pwms.values(), cutoff)
            n = float(len(Fasta(fname).seqs))
            freq_rows.append([(c[m] + 0.01) / n for m in motifs])
            count_rows.append([c[m] for m in motifs])

        # Transpose to one row per motif, one column per input file.
        freq = np.array(freq_rows).transpose()
        counts = np.array(count_rows).transpose()

        diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts,
                  outfile, minenr=minenr, minfreq=minfreq)
    finally:
        shutil.rmtree(tmpdir)
class MotifComparer(object):
    """Class for motif comparison.

    Compare two or more motifs using a variety of metrics. Probably the best
    metric to compare motifs is seqcor. The implementation of this metric
    is similar to the one used in Grau (2015), where motifs are scored
    according to the Pearson correlation of the scores along sequence. In
    this case a de Bruijn of k=7 is used.

    Valid metrics are:
    seqcor   - Pearson correlation of motif scores along sequence.
    pcc      - Pearson correlation coefficient of motif PFMs.
    ed       - Euclidean distance-based similarity of motif PFMs.
    distance - Distance-based similarity of motif PFMs.
    wic      - Weighted Information Content, see van Heeringen 2011.
    chisq    - Chi-squared similarity of motif PFMs.
    akl      - Similarity based on average Kullback-Leibler similarity,
               see Mahony, 2011.
    ssd      - Sum of squared distances of motif PFMs.

    Examples
    --------
    mc = MotifComparer()

    # Compare two motifs
    score, pos, strand = mc.compare_motifs(m1, m2, metric="seqcor")

    # Compare a list of motifs to another list of motifs
    mc.get_all_scores(motifs, dbmotifs, match, metric, combine)

    # Get the best match for every motif in a list of reference motifs
    mc.get_closest_match(motifs, dbmotifs=None)
    """

    def __init__(self):
        self.config = MotifConfig()
        # Metrics and combine modes handled by the ``score`` function.
        self.metrics = ["pcc", "ed", "distance", "wic"]
        self.combine = ["mean", "sum"]
        self._load_scores()

    def _load_scores(self):
        """Load the score distributions used for p-value calculation.

        Fills ``self.scoredist[metric]["<match>_<combine>"][l1][l2]`` with
        ``[mean, sd]`` pairs read from the tab-separated score files in the
        configured score directory. Missing files are skipped.
        """
        self.scoredist = {}
        for metric in self.metrics:
            self.scoredist[metric] = {"total": {}, "subtotal": {}}
            for match in ["total", "subtotal"]:
                for combine in ["mean"]:
                    key = "%s_%s" % (match, combine)
                    self.scoredist[metric][key] = {}
                    score_file = os.path.join(
                        self.config.get_score_dir(),
                        "%s_%s_%s_score_dist.txt" % (match, metric, combine))
                    if os.path.exists(score_file):
                        with open(score_file) as f:
                            for line in f:
                                l1, l2, m, sd = line.strip().split("\t")[:4]
                                dist = self.scoredist[metric][key]
                                dist.setdefault(int(l1), {})[int(l2)] = [
                                    float(m), float(sd)]

    def compare_motifs(self, m1, m2, match="total", metric="wic",
                       combine="mean", pval=False):
        """Compare two motifs.

        The similarity metric can be any of seqcor, pcc, ed, distance, wic,
        chisq, akl or ssd. If match is 'total' the similarity score is
        calculated for the whole match, including positions that are not
        present in both motifs. If match is partial or subtotal, only the
        matching positions are used to calculate the score. The score of
        individual position is combined using either the mean or the sum.

        Note that the match and combine parameters have no effect on the
        seqcor similarity metric.

        Parameters
        ----------
        m1 : Motif instance
            Motif instance 1.

        m2 : Motif instance
            Motif instance 2.

        match : str, optional
            Match can be "partial", "subtotal" or "total". Not all metrics
            use this.

        metric : str, optional
            Distance metric.

        combine : str, optional
            Combine positional scores using "mean" or "sum". Not all metrics
            use this.

        pval : bool, optional
            Calculate p-value of match. Ignored for match "subtotal".

        Returns
        -------
        score, position, strand
            None is returned for an unknown ``match`` value.
        """
        if metric == "seqcor":
            return seqcor(m1, m2)
        elif match == "partial":
            if pval:
                # Partial matches are evaluated against the "total" score
                # distribution.
                return self.pvalue(
                    m1, m2, "total", metric, combine,
                    self.max_partial(m1.pwm, m2.pwm, metric, combine))
            elif metric in ["pcc", "ed", "distance", "wic", "chisq", "ssd"]:
                return self.max_partial(m1.pwm, m2.pwm, metric, combine)
            else:
                return self.max_partial(m1.pfm, m2.pfm, metric, combine)

        elif match == "total":
            if pval:
                return self.pvalue(
                    m1, m2, match, metric, combine,
                    self.max_total(m1.pwm, m2.pwm, metric, combine))
            elif metric in ["pcc", 'akl']:
                # Slightly randomize the weight matrix
                return self.max_total(
                    m1.wiggle_pwm(), m2.wiggle_pwm(), metric, combine)
            elif metric in ["ed", "distance", "wic", "chisq", "ssd"]:
                return self.max_total(m1.pwm, m2.pwm, metric, combine)
            else:
                return self.max_total(m1.pfm, m2.pfm, metric, combine)

        elif match == "subtotal":
            if metric in ["pcc", "ed", "distance", "wic", "chisq", "ssd"]:
                return self.max_subtotal(m1.pwm, m2.pwm, metric, combine)
            else:
                return self.max_subtotal(m1.pfm, m2.pfm, metric, combine)

    def _check_length(self, l):
        """Map a motif length to a length present in the score files."""
        # Set the length to a length represented in randomly generated
        # JASPAR motifs
        if l < 4:
            return 4
        if l == 13:
            return 14
        if l == 17:
            return 18
        if l == 19:
            return 20
        if l == 21:
            return 22
        if l > 22:
            return 30
        return l

    def pvalue(self, m1, m2, match, metric, combine, score):
        """Convert a ``[score, position, strand]`` result to a p-value.

        The p-value is the upper tail of a normal distribution with the
        mean and standard deviation stored for the (adjusted) motif lengths.

        Returns
        -------
        list
            ``[pvalue, position, strand]``, or ``[1, nan, nan]`` when the
            score cannot be evaluated (e.g. an empty result).

        Raises
        ------
        KeyError
            If no score distribution is loaded for this combination.
        """
        l1 = self._check_length(len(m1.pwm))
        l2 = self._check_length(len(m2.pwm))
        m, s = self.scoredist[metric]["%s_%s" % (match, combine)][l1][l2]

        try:
            return [1 - norm.cdf(score[0], m, s), score[1], score[2]]
        except Exception as e:
            print("Error with score: {}\n{}".format(score, e))
            return [1, np.nan, np.nan]

    def score_matrices(self, matrix1, matrix2, metric, combine):
        """Score two aligned matrices of equal length.

        Returns
        -------
        score
            None if the built-in score is NaN.

        Raises
        ------
        ValueError
            For an unknown metric or an unknown combine mode.
        """
        if metric in self.metrics and combine in self.combine:
            s = score(matrix1, matrix2, metric, combine)

            # NaN is the only value that is not equal to itself.
            if s != s:
                return None
            return s

        if metric == "akl":
            func = akl
        elif metric == "chisq":
            func = chisq
        elif metric == "ssd":
            func = ssd
        else:
            try:
                func = getattr(distance, metric)
            except AttributeError:
                raise ValueError("Unknown metric '{}'".format(metric))

        scores = [func(pos1, pos2) for pos1, pos2 in zip(matrix1, matrix2)]
        if combine == "mean":
            return np.mean(scores)
        elif combine == "sum":
            return np.sum(scores)
        else:
            raise ValueError("Unknown combine")

    def _best_alignment(self, matrix1, matrix2, positions, align, metric,
                        combine):
        """Return the best ``[score, position, strand]`` over all alignments.

        ``matrix2`` is tried in forward (strand 1) and reverse-complement
        (strand -1) orientation at every position; ``align`` builds the pair
        of equal-length matrices for one position. Falsy scores (None or 0)
        are ignored. Returns an empty list when nothing could be scored.
        """
        rev_matrix2 = [row[::-1] for row in matrix2[::-1]]

        scores = []
        for strand, other in ((1, matrix2), (-1, rev_matrix2)):
            for i in positions:
                p1, p2 = align(matrix1, other, i)
                s = self.score_matrices(p1, p2, metric, combine)
                if s:
                    scores.append([s, i, strand])

        if not scores:
            return []
        # Stable sort: of equal best scores the last one evaluated wins.
        return sorted(scores, key=lambda x: x[0])[-1]

    def max_subtotal(self, matrix1, matrix2, metric, combine):
        """Best score over overlapping positions only (minimum overlap 4)."""
        min_overlap = 4

        if len(matrix1) < min_overlap or len(matrix2) < min_overlap:
            return self.max_total(matrix1, matrix2, metric, combine)

        positions = range(-(len(matrix2) - min_overlap),
                          len(matrix1) - min_overlap + 1)
        return self._best_alignment(
            matrix1, matrix2, positions, self.make_equal_length_truncate,
            metric, combine)

    def max_partial(self, matrix1, matrix2, metric, combine):
        """Best score with the second matrix fitted to the first."""
        positions = range(-(len(matrix2) - 1), len(matrix1))
        return self._best_alignment(
            matrix1, matrix2, positions,
            self.make_equal_length_truncate_second, metric, combine)

    def max_total(self, matrix1, matrix2, metric, combine):
        """Best score with both matrices padded to equal length."""
        positions = range(-(len(matrix2) - 1), len(matrix1))
        best = self._best_alignment(
            matrix1, matrix2, positions, self.make_equal_length, metric,
            combine)
        if not best:
            sys.stdout.write("No score {} {}".format(matrix1, matrix2))
        return best

    def make_equal_length(self, pwm1, pwm2, pos, bg=None):
        """Pad both matrices with background so they align at ``pos``."""
        if bg is None:
            bg = [0.25, 0.25, 0.25, 0.25]

        p1 = pwm1[:]
        p2 = pwm2[:]

        # Shift by padding on the left ...
        if pos < 1:
            p1 = [bg for _ in range(-pos)] + p1
        else:
            p2 = [bg for _ in range(pos)] + p2

        # ... and make the lengths equal by padding on the right.
        diff = len(p1) - len(p2)
        if diff > 0:
            p2 += [bg for _ in range(diff)]
        elif diff < 0:
            p1 += [bg for _ in range(-diff)]

        return p1, p2

    def make_equal_length_truncate(self, pwm1, pwm2, pos):
        """Trim both matrices to their overlap when aligned at ``pos``."""
        p1 = pwm1[:]
        p2 = pwm2[:]

        if pos < 0:
            p2 = p2[-pos:]
        elif pos > 0:
            p1 = p1[pos:]

        if len(p1) > len(p2):
            p1 = p1[:len(p2)]
        else:
            p2 = p2[:len(p1)]
        return p1, p2

    def make_equal_length_truncate_second(self, pwm1, pwm2, pos, bg=None):
        """Fit the second matrix to the first; the first is left unchanged.

        The second matrix is trimmed or padded with background so that it
        has the length of the first when aligned at ``pos``.
        """
        if bg is None:
            bg = [0.25, 0.25, 0.25, 0.25]

        p1 = pwm1[:]
        p2 = pwm2[:]

        if pos < 0:
            p2 = p2[-pos:]
        else:
            p2 = [bg for _ in range(pos)] + p2

        diff = len(p1) - len(p2)
        if diff > 0:
            p2 += [bg for _ in range(diff)]
        elif diff < 0:
            p2 = p2[:len(p1)]
        return p1, p2

    def get_all_scores(self, motifs, dbmotifs, match, metric, combine,
                       pval=False, parallel=True, trim=None, ncpus=None):
        """Pairwise comparison of a set of motifs compared to reference motifs.

        Parameters
        ----------
        motifs : list
            List of Motif instances.

        dbmotifs : list
            List of Motif instances.

        match : str
            Match can be "partial", "subtotal" or "total". Not all metrics
            use this.

        metric : str
            Distance metric.

        combine : str
            Combine positional scores using "mean" or "sum". Not all metrics
            use this.

        pval : bool , optional
            Calculate p-value of match.

        parallel : bool , optional
            Use multiprocessing for parallel execution. True by default.

        trim : float or None
            If a float value is specified, motifs are trimmed using this IC
            cutoff before comparison. The motifs are trimmed in place.

        ncpus : int or None
            Specifies the number of cores to use for parallel execution.

        Returns
        -------
        scores : dict
            Dictionary with scores.
        """
        # trim motifs first, if specified
        if trim:
            for m in motifs:
                m.trim(trim)
            for m in dbmotifs:
                m.trim(trim)

        if not parallel:
            # Do the whole thing at once if we don't want parallel
            return _get_all_scores(
                self, motifs, dbmotifs, match, metric, combine, pval)

        # hash of result scores
        scores = {}

        # Divide the job into big chunks, to keep parallel overhead to
        # minimum. Number of chunks = number of processors available
        if ncpus is None:
            ncpus = int(MotifConfig().get_default_params()["ncpus"])

        pool = Pool(processes=ncpus, maxtasksperchild=1000)

        batch_len = len(dbmotifs) // ncpus
        if batch_len <= 0:
            batch_len = 1
        jobs = []
        for i in range(0, len(dbmotifs), batch_len):
            # submit jobs to the job server
            p = pool.apply_async(
                _get_all_scores,
                args=(self, motifs, dbmotifs[i: i + batch_len],
                      match, metric, combine, pval))
            jobs.append(p)

        pool.close()
        for job in jobs:
            # Get the job result and merge it into the combined scores
            result = job.get()
            for m1, v in result.items():
                for m2, s in v.items():
                    scores.setdefault(m1, {})[m2] = s

        pool.join()

        return scores

    def get_closest_match(self, motifs, dbmotifs=None, match="partial",
                          metric="wic", combine="mean", parallel=True,
                          ncpus=None):
        """Return best match in database for motifs.

        Parameters
        ----------
        motifs : list or str
            Filename of motifs or list of motifs.

        dbmotifs : list or str, optional
            Database motifs, default will be used if not specified.

        match : str, optional

        metric : str, optional

        combine : str, optional

        parallel : bool, optional
            Passed on to get_all_scores.

        ncpus : int, optional
            Number of threads to use.

        Returns
        -------
        closest_match : dict
            Maps motif id to ``[dbmotif id, score + [pval]]``.
        """
        if dbmotifs is None:
            pwm = self.config.get_default_params()["motif_db"]
            pwmdir = self.config.get_motif_dir()
            dbmotifs = os.path.join(pwmdir, pwm)

        motifs = parse_motifs(motifs)
        dbmotifs = parse_motifs(dbmotifs)

        dbmotif_lookup = dict([(m.id, m) for m in dbmotifs])

        scores = self.get_all_scores(
            motifs, dbmotifs, match, metric, combine,
            parallel=parallel, ncpus=ncpus)

        # Keep only the highest-scoring database motif per motif.
        for motif in scores:
            scores[motif] = sorted(
                scores[motif].items(),
                key=lambda x: x[1][0]
            )[-1]

        # Add the p-value of that best match.
        for motif in motifs:
            dbmotif, score = scores[motif.id]
            pval, pos, orient = self.compare_motifs(
                motif, dbmotif_lookup[dbmotif], match, metric, combine, True)

            scores[motif.id] = [dbmotif, (list(score) + [pval])]

        return scores

    def generate_score_dist(self, motifs, match, metric, combine):
        """Write mean and sd of all pairwise scores per pair of lengths.

        The tab-separated output (length 1, length 2, mean, sd) is written
        to the score file for this match/metric/combine combination in the
        configured score directory.
        """
        score_file = os.path.join(
            self.config.get_score_dir(),
            "%s_%s_%s_score_dist.txt" % (match, metric, combine))

        # Group the motifs by length, keeping their original order.
        sorted_motifs = {}
        for motif in motifs:
            sorted_motifs.setdefault(len(motif), []).append(motif)

        with open(score_file, "w") as f:
            for l1 in sorted_motifs:
                for l2 in sorted_motifs:
                    scores = self.get_all_scores(
                        sorted_motifs[l1], sorted_motifs[l2],
                        match, metric, combine)
                    scores = [[y[0] for y in x.values() if y]
                              for x in scores.values()]
                    scores = np.array(scores).ravel()
                    f.write("%s\t%s\t%s\t%s\n" % (
                        l1, l2, np.mean(scores), np.std(scores)))
def cluster(args):
    """Cluster motifs and write an HTML report, a key file and the motifs.

    ``args`` provides ``inputfile`` (motif file), ``outdir``, ``threshold``
    and ``single``. The output directory receives one PNG per cluster and
    per member motif, ``cluster_report.html``, ``cluster_key.txt`` and
    ``clustered_motifs.pwm``.
    """
    # NOTE(review): revcomp is computed but never used; cluster_motifs below
    # receives a literal True in what is possibly the revcomp position.
    # Confirm against the cluster_motifs signature.
    revcomp = not args.single

    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    trim_ic = 0.2
    clusters = []
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(args.inputfile, "total", "wic", "mean", True,
                              threshold=args.threshold, include_bg=True)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(outdir, "%s.png" % cluster.id),
                       format="PNG")
        ids.append([cluster.id, {"src": "%s.png" % cluster.id}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(
                    cluster, motif, "total", "wic", "mean", pval=True)
            # Smallest alignment position; a key function gives the same
            # (stable) order as the former cmp function.
            add_pos = sorted(scores.values(), key=lambda x: x[1])[0][1]
            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement under the original id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(
                    os.path.join(outdir,
                                 "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG", add_left=add)
        ids[-1][2] = [
            dict([("src", "%s.png" % motif.id.replace(" ", "_")),
                  ("alt", motif.id.replace(" ", "_"))])
            for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    # The report is written as encoded bytes, so open in binary mode.
    with open(os.path.join(outdir, "cluster_report.html"), "wb") as f:
        f.write(result.encode('utf-8'))

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for entry in ids:
            f.write("%s\t%s\n" % (
                entry[0], ",".join([x["alt"] for x in entry[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
def scan_to_table(input_table, genome, scoring, pwmfile=None, ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a text-separated tab file or
        a feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score"

    pwmfile : str, optional
        Specify a PFM file for scanning.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index.
        Values are either counts or scores depending on the 'scoring'
        parameter.

    Raises
    ------
    ValueError
        If no pwmfile is given and no default motif database is configured.
    """
    config = MotifConfig()

    # Fall back to the configured default motif database.
    if pwmfile is None:
        default_db = config.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError(
                "no pwmfile given and no default database specified")
        pwmfile = os.path.join(config.get_motif_dir(), default_db)

    logger.info("reading table")
    if input_table.endswith("feather"):
        # Feather files have no index; the first column holds the regions.
        table = pd.read_feather(input_table)
        idx = table.iloc[:, 0].values
    else:
        table = pd.read_table(input_table, index_col=0, comment="#")
        idx = table.index

    regions = list(idx)

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pwmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        rows = scanner.count(regions)
    else:
        scanner.set_threshold(threshold=0.0)
        logger.info("creating score table")
        rows = scanner.best_score(regions, normalize=True)

    # One row of values per region.
    scores = list(rows)
    logger.info("done")

    motif_names = [m.id for m in read_motifs(pwmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def genome(args):
    """Download a UCSC genome and gene annotation, then index the genome.

    Reads ``args.indexdir``, ``args.fastadir`` and ``args.genomebuild``.

    Steps:

    1. Verify both directories exist and ``genePredToBed`` is on the PATH.
    2. Download the first annotation table from ``ANNOS`` that the UCSC
       listing offers and convert it to ``<gene_dir>/<genomebuild>.bed``.
    3. Download the genome archive (primary URL, then the alternative),
       unpack it, and split a single multi-sequence FASTA into one file per
       sequence.
    4. Create the index in ``<indexdir>/<genomebuild>``.

    Exits with status 1 on any unrecoverable problem.
    """
    config = MotifConfig()

    if not os.path.exists(args.indexdir):
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    if not os.path.exists(args.fastadir):
        print("FASTA dir %s does not exist!" % (args.fastadir))
        sys.exit(1)

    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)

    fastadir = args.fastadir
    genomebuild = args.genomebuild
    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(args.indexdir, args.genomebuild)

    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)

    # Collect the annotation tables offered in the UCSC directory listing.
    anno = []
    pattern = re.compile(r'\w+.Gene.txt.gz')
    listing = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
    try:
        for line in listing.readlines():
            m = pattern.search(line)
            if m:
                anno.append(m.group(0))
    finally:
        listing.close()

    sys.stderr.write("Retrieving gene annotation for {}\n".format(genomebuild))
    url = ""
    for a in ANNOS:
        if a in anno:
            url = UCSC_GENE_URL.format(genomebuild) + a
            break

    if url:
        # Only the path is needed: urlretrieve writes to it by name. The file
        # is created with delete=False, so it must be removed explicitly.
        tmp = NamedTemporaryFile(delete=False, suffix=".gz")
        tmp.close()
        try:
            urllib.urlretrieve(url, tmp.name)
            sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(tmp.name, pred, gene_file), shell=True)
        finally:
            os.unlink(tmp.name)
    else:
        sys.stderr.write("No annotation found!\n")

    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    genome_fa = None
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
        remote = genome_url.format(genomebuild)
        candidate = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))
        urllib.urlretrieve(remote, candidate)

        if check_genome_file(candidate):
            genome_fa = candidate
            break

    if genome_fa is None:
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    if genome_fa.endswith("tar.gz"):
        cmd = "tar -C {0} -xvzf {1} && rm {1}".format(genome_dir, genome_fa)
    else:
        # gunzip removes the compressed input itself, so no extra ``rm``.
        cmd = "gunzip {0}".format(genome_fa)
    sp.call(cmd, shell=True, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        # A single FASTA holding everything: split it into one file per
        # sequence inside the genome directory.
        combined = fa_files[0]
        records = list(Fasta(combined).items())
        written = set()
        for name, seq in records:
            seq_file = os.path.join(genome_dir, "{}.fa".format(name))
            with open(seq_file, "w") as out:
                out.write(">{}\n{}\n".format(name, seq))
            written.add(os.path.abspath(seq_file))
        # Do not delete the combined file if it was just rewritten as one of
        # the per-sequence files (same name).
        if os.path.abspath(combined) not in written:
            os.unlink(combined)

    sys.stderr.write("Creating index\n")
    GenomeIndex().create_index(genome_dir, index_dir)
def matched_gc_bedfile(bedfile, matchfile, genome, number):
    """Write random genomic regions whose GC distribution matches ``matchfile``.

    Parameters
    ----------
    bedfile : str
        Output BED file (chromosome, start, end).
    matchfile : str
        FASTA or BED file whose GC content and lengths are matched. FASTA is
        tried first; on any failure the file is read as BED.
    genome : str
        Name of the indexed genome; ``genome.size`` and ``genome.fa`` must
        exist in its index directory.
    number : int
        Number of regions to generate.

    Raises
    ------
    RuntimeError
        If the genome index lacks ``genome.size`` or ``genome.fa``.

    Exits with status 1 when ``matchfile`` is neither FASTA nor BED.
    """
    N_FRACTION = 0.1
    config = MotifConfig()
    index = os.path.join(config.get_index_dir(), genome)
    genome_size = os.path.join(index, "genome.size")
    genome_fa = os.path.join(index, "genome.fa")

    if not os.path.exists(genome_size) or not os.path.exists(genome_fa):
        raise RuntimeError(
            "genome files not found, please re-index {} "
            "with a recent version of gimme index".format(genome))

    try:
        fa = Fasta(matchfile)
        gc = [(seq.upper().count("C") + seq.upper().count("G")) / float(len(seq)) for seq in fa.seqs]
        lengths = [len(seq) for seq in fa.seqs]
    except Exception:
        # Not readable as FASTA: fall back to BED.
        try:
            bed = pybedtools.BedTool(matchfile)
            # NOTE(review): column 5 is presumably the GC fraction reported by
            # bedtools nuc for a 3-column BED -- confirm for wider BED input.
            gc = [float(x[4]) for x in bed.nucleotide_content(fi=genome_fa)]
            lengths = [x.length for x in bed]
        except Exception:
            sys.stderr.write("Please provide input file in BED or FASTA format\n")
            sys.exit(1)

    gc_hist, bins = np.histogram(gc, range=(0, 1), bins=20)

    length = np.median(lengths)
    if np.std(lengths) > length * 0.05:
        sys.stderr.write("Sequences do not seem to be of equal length.\n")
        sys.stderr.write("GC% matched sequences of the median length ({}) will be created\n".format(length))

    if number:
        # Scale the histogram to ``number`` regions, round per bin, then
        # nudge individual bins until the total is exactly ``number``.
        norm = number * gc_hist / (float(sum(gc_hist))) + 0.5
        inorm = norm.astype(int)

        s = np.argsort(norm - inorm)
        while sum(inorm) > number:
            if inorm[np.argmin(s)] > 0:
                inorm[np.argmin(s)] -= 1
            s[np.argmin(s)] = len(s)
        while sum(inorm) < number:
            inorm[np.argmax(s)] += 1
            s[np.argmax(s)] = 0
        gc_hist = inorm

    rnd = pybedtools.BedTool()
    # Oversample (15x) so that every GC bin can be filled. The median is a
    # float, but the region length handed to bedtools must be an integer.
    r = rnd.random(l=int(length), n=number * 15, g=genome_size).nucleotide_content(fi=genome_fa)
    # Keep coordinates plus column 8, dropping regions where column 13
    # exceeds the allowed fraction of the length.
    # NOTE(review): presumably GC fraction and N count of bedtools nuc output
    # on 6-column random intervals -- confirm.
    features = [f[:3] + [float(f[7])] for f in r if float(f[12]) <= length * N_FRACTION]

    with open(bedfile, "w") as out:
        for bin_start, bin_end, count in zip(bins[:-1], bins[1:], gc_hist):
            if count <= 0:
                continue
            rcount = 0
            for f in features:
                if bin_start <= f[3] < bin_end:
                    out.write("{}\t{}\t{}\n".format(*f[:3]))
                    rcount += 1
                    if rcount >= count:
                        break

            if count != rcount:
                sys.stderr.write(
                    "not enough random sequences found for {} <= GC < {} ({} instead of {})\n".format(
                        bin_start, bin_end, rcount, count))
def run(self):
    """Write the GimmeMotifs configuration file for the final install location.

    Starts from the build-time configuration (``self.build_cfg``), points all
    data directories at ``self.install_dir``, rewrites tool and seqlogo paths
    from ``self.build_tools_dir`` to the final tools directory, and writes
    the result to ``<install_dir>/gimmemotifs/<CONFIG_NAME>``. An existing
    config file is moved aside with a timestamp suffix first.
    """
    from gimmemotifs.config import MotifConfig

    cfg = MotifConfig(use_config=self.build_cfg)

    data_dir = self.remove_nonsense(os.path.abspath(self.install_dir))

    cfg.set_template_dir(os.path.join(data_dir, 'gimmemotifs/templates'))
    cfg.set_gene_dir(os.path.join(data_dir, 'gimmemotifs/genes'))
    cfg.set_score_dir(os.path.join(data_dir, 'gimmemotifs/score_dists'))
    cfg.set_index_dir(os.path.join(data_dir, 'gimmemotifs/genome_index'))
    cfg.set_motif_dir(os.path.join(data_dir, 'gimmemotifs/motif_databases'))
    cfg.set_bg_dir(os.path.join(data_dir, 'gimmemotifs/bg'))
    cfg.set_tools_dir(os.path.join(data_dir, 'gimmemotifs/tools'))

    final_tools_dir = self.remove_nonsense(self.install_tools_dir)
    for program in MOTIF_CLASSES:
        # NOTE(review): eval() resolves the class from its name; this is only
        # acceptable because MOTIF_CLASSES is a constant defined by the
        # project, never external input.
        m = eval(program)()
        if cfg.is_configured(m.name):
            tool_bin = cfg.bin(m.name).replace(self.build_tools_dir, final_tools_dir)
            tool_dir = cfg.dir(m.name)
            if tool_dir:
                tool_dir = tool_dir.replace(self.build_tools_dir, final_tools_dir)
            cfg.set_program(m.name, {"bin": tool_bin, "dir": tool_dir})

    seqlogo = cfg.get_seqlogo()
    seqlogo = seqlogo.replace(self.build_tools_dir, final_tools_dir)
    cfg.set_seqlogo(seqlogo)

    # Use a user-specific configfile if any other installation scheme is used
    config_file = os.path.join(self.install_dir, "gimmemotifs/%s" % CONFIG_NAME)
    self.outfiles = [config_file]

    if os.path.exists(config_file):
        # Keep the previous configuration under a timestamped name.
        timestr = time.strftime("%Y%m%d-%H%M%S")
        old_config = "{}.{}".format(config_file, timestr)
        shutil.move(config_file, old_config)
        dlog.info("INFO: Configfile %s already existed!", config_file)
        dlog.info("INFO: This config has been saved as %s", old_config)

    dlog.info("writing configuration file %s", config_file)
    # Close the handle deterministically so the config is flushed to disk.
    with open(config_file, "wb") as f:
        cfg.write(f)
class GimmeMotifs(object):
    """Driver for a complete motif analysis run.

    An instance owns an output directory (``self.outdir``) with an
    ``intermediate_results`` and an ``images`` sub-directory, a logger that
    writes to both a log file and stdout, and the names of all intermediate
    and result files. ``run_full_analysis`` is the main entry point.
    """

    NAME = "gimme_motifs"
    SCAN_THRESHOLD = "0.9"

    def __init__(self, name=None):
        """Set up output directory, logging and file names.

        ``name`` is the output directory; it defaults to
        ``gimme_motifs_<dd_mm_YYYY>``.
        """
        self.config = MotifConfig()
        self.server = None

        if not name:
            name = "%s_%s" % (self.NAME, datetime.today().strftime("%d_%m_%Y"))
        self.name = name

        # create a directory for all the intermediate and output files
        self._setup_output_dir(name)

        # setup logging
        self._setup_logging()
        self.logger.info("%s version %s", self.NAME, GM_VERSION)
        self.logger.info("output dir: %s", self.outdir)

        # setup the names of the intermediate and output files
        self._setup_filenames()

    def job_server(self):
        """Return a usable job server, (re)creating it when the probe fails."""
        try:
            self.server.submit(job_server_ok)
        except Exception:
            self.server = self._get_job_server()
        return self.server

    def _setup_output_dir(self, name):
        """Create the output directory layout and copy the star image into it."""
        if os.path.exists(name):
            sys.stderr.write("Output directory {} already exists!\n".format(name))
            sys.stderr.write("Resuming a previous run is not yet implemented. Please specify a different name,\n")
            sys.stderr.write("or delete this directory if you really want to overwrite it\n")
        else:
            try:
                os.makedirs(name)
            except OSError:
                sys.stderr.write("Can't create output directory {}!\n".format(name))

        self.outdir = name
        self.tmpdir = os.path.join(self.outdir, "intermediate_results")
        self.imgdir = os.path.join(self.outdir, "images")
        # Create each sub-directory independently: an already existing
        # tmpdir must not prevent imgdir from being created.
        for subdir in (self.tmpdir, self.imgdir):
            try:
                os.mkdir(subdir)
            except OSError:
                pass

        star_img = os.path.join(self.config.get_template_dir(), "star.png")
        shutil.copyfile(star_img, os.path.join(self.imgdir, "star.png"))

    def _setup_logging(self):
        """Configure the 'motif_analysis' logger (DEBUG to file, INFO to stdout)."""
        self.logger = logging.getLogger('motif_analysis')
        self.logger.setLevel(logging.DEBUG)
        self.logger.propagate = 0

        # nice format
        file_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        screen_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

        # Log to file
        logfile = os.path.join(self.name, "%s.log" % self.NAME)
        fh = logging.FileHandler(logfile, "w")
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(file_formatter)
        self.logger.addHandler(fh)

        # Log to screen
        sh = logging.StreamHandler(sys.stdout)
        sh.setLevel(logging.INFO)
        sh.setFormatter(screen_formatter)
        self.logger.addHandler(sh)

        self.logger.debug("Logging started")
        self.logger.info("log: %s", logfile)

    def _setup_filenames(self):
        """Derive the names of all intermediate and output files."""
        basename = os.path.split(self.name)[-1]
        self.basename = basename
        self.logger.debug("basename: {}".format(basename))

        # Um yes, there is a smarter way, I'm sure! ;)
        self.input_bed = os.path.join(self.tmpdir, "%s_peakinputfile.bed" % basename)

        self.prediction_bed = os.path.join(self.tmpdir, "%s_prediction.bed" % basename)
        self.prediction_fa = os.path.join(self.tmpdir, "%s_prediction.fa" % basename)
        self.prediction_bg = os.path.join(self.tmpdir, "%s_prediction_background.fa" % basename)

        self.validation_bed = os.path.join(self.tmpdir, "%s_validation.bed" % basename)
        self.validation_fa = os.path.join(self.tmpdir, "%s_validation.fa" % basename)
        self.validation_gff = os.path.join(self.tmpdir, "%s_validation.gff" % basename)

        self.predicted_pfm = os.path.join(self.tmpdir, "%s_all_motifs.pfm" % basename)

        self.significant_pfm = os.path.join(self.tmpdir, "%s_significant_motifs.pfm" % basename)

        self.location_fa = os.path.join(self.tmpdir, "%s_validation_500.fa" % basename)
        self.location_pfile = os.path.join(self.tmpdir, "%s_localization_pvalue.txt" % basename)
        self.stats_file = os.path.join(self.tmpdir, "%s_stats.txt" % basename)
        self.ranks_file = os.path.join(self.tmpdir, "%s_ranks.txt" % basename)

        self.validation_cluster_gff = os.path.join(self.tmpdir, "%s_validation_clustered.gff" % basename)
        self.cluster_pwm = os.path.join(self.tmpdir, "%s_clustered_motifs.pwm" % basename)
        self.final_pwm = os.path.join(self.outdir, "%s_motifs.pwm" % basename)
        self.cluster_report = os.path.join(self.outdir, "%s_cluster_report.html" % basename)
        self.motif_report = os.path.join(self.outdir, "%s_motif_report.html" % basename)
        self.text_report = os.path.join(self.outdir, "%s_motif_report.tsv" % basename)
        self.params_file = os.path.join(self.outdir, "%s_params.txt" % basename)

        # Data structures to hold the background file locations
        ftypes = {
            "bed": ".bed",
            "fa": ".fa",
            "gff": ".gff",
            "enrichment": "_enrichment.txt",
            "roc": "_significant_motifs_roc_metrics.txt",
            "cluster_gff": "_clustered.gff",
            "cluster_enrichment": "_enrichment_clustered.txt",
            "cluster_roc": "_roc_metrics_clustered.txt"
        }

        self.bg_file = dict((ftype, {}) for ftype in ftypes)

        for bg in (FA_VALID_BGS + BED_VALID_BGS):
            for ftype, extension in ftypes.items():
                self.bg_file[ftype][bg] = os.path.join(self.tmpdir, "%s_bg_%s%s" % (basename, bg, extension))

    def _is_parallel_enabled(self):
        """Parallel execution is always enabled."""
        return True

    def _get_job_server(self):
        """Return the module-level worker pool."""
        return pool

    def _read_pwm_motifs(self, fname):
        """Read motifs in 'pwm' format from ``fname``, closing the file afterwards."""
        with open(fname) as handle:
            return read_motifs(handle, fmt="pwm")

    def _check_input(self, fname):
        """Check if the inputfile is a valid bed-file.

        Lines starting with '#', 'track' or 'browser' are skipped. Every
        other line needs at least three tab-separated columns with integer
        start and end. Logs an error and exits with status 1 otherwise.
        """
        if not os.path.exists(fname):
            self.logger.error("Inputfile %s does not exist!", fname)
            sys.exit(1)

        with open(fname) as handle:
            for i, line in enumerate(handle):
                if line.startswith(("#", "track", "browser")):
                    # comment or BED specific stuff
                    continue
                vals = line.strip().split("\t")
                if len(vals) < 3:
                    self.logger.error("Expecting tab-seperated values (chromosome<tab>start<tab>end) on line %s of file %s", i + 1, fname)
                    sys.exit(1)
                try:
                    int(vals[1])
                    int(vals[2])
                except ValueError:
                    self.logger.error("No valid integer coordinates on line %s of file %s", i + 1, fname)
                    sys.exit(1)
                # A non-numeric fourth column is tolerated on purpose.

    def prepare_input_bed(self, inputfile, organism="hg18", width=200, fraction=0.2, abs_max=1000, use_strand=False):
        """Create all the bed- and fasta-files necessary for motif prediction and validation."""
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)
        use_strand = bool(use_strand)

        self.logger.info("preparing input (BED)")

        # Set all peaks to specific width
        self.logger.debug("Creating inputfile %s, width %s", self.input_bed, width)
        write_equalwidth_bedfile(inputfile, width, self.input_bed)

        # Split input_bed in prediction and validation set
        self.logger.debug(
            "Splitting %s into prediction set (%s) and validation set (%s)",
            self.input_bed, self.prediction_bed, self.validation_bed)
        self.prediction_num, self.validation_num = divide_file(
            self.input_bed, self.prediction_bed, self.validation_bed, fraction, abs_max)

        # Make fasta files
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.debug("Creating %s", self.prediction_fa)
        genome_index.track2fasta(index_dir, self.prediction_bed, self.prediction_fa,
                                 use_strand=use_strand, ignore_missing=True)
        self.logger.debug("Creating %s", self.validation_fa)
        genome_index.track2fasta(index_dir, self.validation_bed, self.validation_fa,
                                 use_strand=use_strand, ignore_missing=True)

    def prepare_input_fa(self, inputfile, width=200, fraction=0.2, abs_max=1000):
        """Split a FASTA input into a prediction and a validation FASTA file."""
        self.inputfile = inputfile

        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)

        self.logger.info("preparing input (FASTA)")

        # Split inputfile in prediction and validation set
        self.logger.debug(
            "Splitting %s into prediction set (%s) and validation set (%s)",
            self.inputfile, self.prediction_fa, self.validation_fa)

        self.prediction_num, self.validation_num = divide_fa_file(
            self.inputfile, self.prediction_fa, self.validation_fa, fraction, abs_max)

    def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
        """Write a background FASTA of type ``bg_type`` to ``outfile``.

        Supported types: "random", "genomic", "gc", "promoter" and "user".
        Returns the number of background sequences, or None for an unknown
        type.
        """
        fg = Fasta(fafile)
        if bg_type == "random":
            if int(self.markov_model) >= 6:
                self.logger.warning("Are you sure about the Markov model? It seems too high!")
            else:
                names = {"1": "1st", "2": "2nd", "3": "3rd", "4": "4th", "5": "5th"}
                # Fall back to a generic suffix for orders outside 1-5.
                order = names.get(str(self.markov_model), "%sth" % self.markov_model)
                self.logger.debug("Creating random background (%s order Markov)" % order)

            m = MarkovFasta(fg, k=int(self.markov_model), n=nr_times * len(fg))
            m.writefasta(outfile)
            self.logger.debug("Random background: %s", outfile)
            # return the number of random sequences created
            return len(m)
        elif bg_type == "genomic":
            self.logger.debug("Creating genomic background")
            index_dir = os.path.join(self.config.get_index_dir(), organism)
            f = RandomGenomicFasta(index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            return len(f)
        elif bg_type == "gc":
            self.logger.debug("Creating GC matched background")
            f = MatchedGcFasta(fafile, organism, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("GC matched background: %s", outfile)
            return len(f)
        elif bg_type == "promoter":
            gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
            index_dir = os.path.join(self.config.get_index_dir(), organism)

            self.logger.info(
                "Creating random promoter background (%s, using genes in %s)",
                organism, gene_file)
            f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("Random promoter background: %s", outfile)
            return len(f)
        elif bg_type == "user":
            bg_file = self.params["user_background"]
            if not os.path.exists(bg_file):
                self.logger.error(
                    "User-specified background file %s does not exist!",
                    bg_file)
                sys.exit(1)
            else:
                self.logger.info("Copying user-specified background file %s to %s.",
                                 bg_file, outfile)
                fa = Fasta(bg_file)
                l = median([len(seq) for seq in fa.seqs])
                if l < width * 0.95 or l > width * 1.05:
                    self.logger.warning("The user-specified background file %s contains sequences with a median length of %s, while GimmeMotifs predicts motifs in sequences of length %s. This will influence the statistics! It is recommended to use background sequences of the same length.", bg_file, l, width)
                fa.writefasta(outfile)
                return len(fa)

    def calculate_enrichment(self, motif_file, fg, bg):
        """Scan foreground and backgrounds and compute motif enrichment.

        fg: [sample_fa, sample_gff]
        bg: [[bg1_fa, bg1_gff, bg1_enrichment], [bg2_fa, bg2_gff, bg2_enrichment], .. etc]
        """
        self.logger.debug("Scanning background sequences with motifs")

        # define filenames
        fnames = [(fg[0], fg[1])] + [x[:2] for x in bg]
        # scan and save as gff
        for infile, outfile in fnames:
            with open(outfile, "w") as f:
                for line in command_scan(infile, motif_file, nreport=1,
                                         cutoff=self.SCAN_THRESHOLD, bed=False, scan_rc=True):
                    f.write(line + "\n")

        self.logger.debug("Calculating enrichment")
        enrichment_cmd = gff_enrichment
        num_sample = len(Fasta(fg[0]).items())
        for fasta_file, gff_file, out_file in bg:
            num_bg = len(Fasta(fasta_file).items())
            enrichment_cmd(fg[1], gff_file, num_sample, num_bg, out_file)

    def create_background(self, background=None, organism="hg18", width=200):
        """Create the prediction background and one FASTA per background type."""
        if background is None:
            background = ["random"]

        # Create background for motif prediction ("gc" wins when requested)
        prediction_type = "gc" if "gc" in background else background[0]
        self._create_background(prediction_type, self.validation_bed, self.validation_fa,
                                self.prediction_bg, organism=organism, width=width)

        # Get background fasta files
        for bg in background:
            self._create_background(bg, self.validation_bed, self.validation_fa,
                                    self.bg_file["fa"][bg], organism=organism, width=width)

    def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
        """Cluster the motifs in ``pfm_file``; write images, report and ``cluster_pwm``.

        Returns the list of ``[cluster_motif, members]`` pairs.
        NOTE(review): the ``dir`` argument is not used by this method.
        """
        self.logger.info("clustering significant motifs.")

        trim_ic = 0.2
        tree = None
        motifs = self._read_pwm_motifs(pfm_file)
        if len(motifs) == 1:
            clusters = [[motifs[0], motifs]]
        else:
            tree = cluster_motifs(
                pfm_file,
                "total",
                "wic",
                "mean",
                True,
                threshold=float(threshold),
                include_bg=True,
                progress=False
            )
            clusters = tree.getResult()

        ids = []
        mc = MotifComparer()

        for clus, members in clusters:
            clus.trim(trim_ic)
            clus.to_img(os.path.join(self.imgdir, "%s.png" % clus.id), format="PNG")
            member_imgs = [
                {"src": "images/%s.png" % m.id.replace(" ", "_"),
                 "alt": m.id.replace(" ", "_")}
                for m in members
            ]
            ids.append([clus.id, {"src": "images/%s.png" % clus.id}, member_imgs])
            if len(members) > 1:
                scores = {}
                for motif in members:
                    scores[motif] = mc.compare_motifs(clus, motif, "total", "wic", "mean", pval=True)
                # Smallest position over all members: images are left-padded
                # relative to it.
                add_pos = min(val[1] for val in scores.values())
                for motif in members:
                    _score, pos, strand = scores[motif]
                    add = pos - add_pos

                    if strand not in [1, "+"]:
                        rc = motif.rc()
                        rc.id = motif.id
                        motif = rc
                    motif.to_img(os.path.join(self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                                 format="PNG", add_left=add)

        env = jinja2.Environment(loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("cluster_template.jinja.html")
        result = template.render(expname=self.basename, motifs=ids, inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"), version=GM_VERSION)

        # The report is encoded explicitly, so write it as bytes.
        with open(self.cluster_report, "wb") as f:
            f.write(result.encode('utf-8'))

        with open(cluster_pwm, "w") as f:
            if len(clusters) == 1 and len(clusters[0][1]) == 1:
                f.write("%s\n" % clusters[0][0].to_pwm())
            else:
                for motif in tree.get_clustered_motifs():
                    f.write("%s\n" % motif.to_pwm())

        self.logger.debug("Clustering done. See the result in %s",
                          self.cluster_report)
        return clusters

    def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
        """Create one ROC plot per motif, named ``<motif id>_<name>_roc.png``."""
        motifs = dict([(m.id, m) for m in self._read_pwm_motifs(pwm_file)])

        jobs = {}
        for motif_id, motif in motifs.items():
            jobs[motif_id] = self.job_server().apply_async(get_roc_values, (motif, fg_fasta, bg_fasta,))

        roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")

        for motif_id in motifs:
            error, x, y = jobs[motif_id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)

            roc_plot(roc_img_file % (motif_id, name), x, y)

    def calculate_cluster_enrichment(self, pwm, background):
        """Calculate enrichment of the clustered motifs for every background."""
        fg = [self.validation_fa, self.validation_cluster_gff]
        bg = [[self.bg_file["fa"][bg_id], self.bg_file["gff"][bg_id],
               self.bg_file["cluster_enrichment"][bg_id]] for bg_id in background]
        self.calculate_enrichment(pwm, fg, bg)

    def _roc_metrics(self, pwm, sample_fa, bg_fa, roc_file):
        """Compute ROC metrics per motif, write them to ``roc_file``.

        Returns two dicts keyed by motif id: ROC AUC and MNCP.
        """
        motifs = dict([(m.id, m) for m in self._read_pwm_motifs(pwm)])

        jobs = {}
        for motif_id, motif in motifs.items():
            jobs[motif_id] = self.job_server().apply_async(get_scores, (motif, sample_fa, bg_fa,))

        all_auc = {}
        all_mncp = {}
        with open(roc_file, "w") as f:
            f.write("Motif\tROC AUC\tMNCP\tMax f-measure\tSens @ max f-measure\n")
            for motif_id in motifs:
                error, auc, mncp, max_f, y = jobs[motif_id].get()
                if error:
                    self.logger.error("Error in thread: %s", error)
                    sys.exit(1)
                f.write("%s\t%s\t%s\t%s\t%s\n" % (motif_id, auc, mncp, max_f, y))
                all_auc[motif_id] = auc
                all_mncp[motif_id] = mncp

        return all_auc, all_mncp

    def _calc_report_values(self, pwm, background):
        """Collect p-values, enrichment, AUC, MNCP and closest matches for the report."""
        self.logger.debug("Calculating final statistics for report")
        self.p = dict([(b, {}) for b in background])
        self.e = dict([(b, {}) for b in background])

        e_files = dict([(bg, self.bg_file["cluster_enrichment"][bg]) for bg in background])

        for bg in self.p:
            with open(e_files[bg]) as handle:
                for line in handle:
                    if not (line.startswith("#") or line.startswith("Motif\tSig")):
                        vals = line.strip().split("\t")
                        self.p[bg][vals[0]] = float(vals[2])
                        self.e[bg][vals[0]] = float(vals[5])

        self.auc = dict([(b, {}) for b in background])
        self.mncp = dict([(b, {}) for b in background])

        rocs = dict([(bg, [self.bg_file["fa"][bg], self.bg_file["roc"][bg]]) for bg in background])

        for bg in self.auc:
            bg_fasta_file, roc_file = rocs[bg]
            self.auc[bg], self.mncp[bg] = self._roc_metrics(pwm, self.validation_fa, bg_fasta_file, roc_file)

        motifs = self._read_pwm_motifs(pwm)
        self.closest_match = self.determine_closest_match(motifs)

    def _create_text_report(self, pwm, background):
        """Write the tab-separated motif report, sorted by descending MNCP."""
        self.logger.debug("Creating text report")
        motifs = self._read_pwm_motifs(pwm)

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        header = "ID\tconsensus\tBest match db\tp-value best match\t" + "\t".join(
            "Enrichment (%s)\tp-value (%s)\tROC AUC (%s)\tMNCP (%s)" % (b, b, b, b) for b in background)

        ranked = sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True)
        with open(self.text_report, "w") as f:
            f.write("%s\n" % header)
            for motif in ranked:
                vals = [motif.id, motif.to_consensus(),
                        self.closest_match[motif.id][0].id, self.closest_match[motif.id][1]]
                for bg in background:
                    vals += [self.e[bg][motif.id], self.p[bg][motif.id],
                             self.auc[bg][motif.id], self.mncp[bg][motif.id]]
                f.write("%s\n" % "\t".join([str(x) for x in vals]))

    def print_params(self):
        """Write all parameters as ``name<TAB>value`` lines to the params file."""
        with open(self.params_file, "w") as f:
            for param, value in self.params.items():
                f.write("%s\t%s\n" % (param, value))

    def _create_report(self, pwm, background, stats=None, best_id=None):
        """Render the graphical HTML motif report."""
        if stats is None:
            stats = {}
        if best_id is None:
            best_id = {}

        self.logger.debug("Creating graphical report")

        class ReportMotif:
            pass

        motifs = self._read_pwm_motifs(pwm)
        # Logo of the closest database match for every motif
        for _m, match in self.closest_match.items():
            match[0].to_img(os.path.join(self.imgdir, "%s.png" % match[0].id), format="PNG")

        sort_key = background[0]
        if "gc" in background:
            sort_key = "gc"

        roc_img_file = "%s_%s_roc"
        report_motifs = []
        sorted_motifs = sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True)

        for motif in sorted_motifs:
            rm = ReportMotif()
            rm.id = motif.id
            rm.id_href = {"href": "#%s" % motif.id}
            rm.id_name = {"name": motif.id}
            rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}

            rm.best = best_id[motif.id]

            rm.consensus = motif.to_consensus()
            rm.stars = stats["%s_%s" % (motif.id, motif.to_consensus())]["stars"]

            rm.bg = {}
            for bg in background:
                roc_img = "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"
                rm.bg[bg] = {}
                rm.bg[bg]["e"] = "%0.2f" % self.e[bg].setdefault(motif.id, 0.0)
                rm.bg[bg]["p"] = "%0.2f" % self.p[bg].setdefault(motif.id, 1.0)
                rm.bg[bg]["auc"] = "%0.3f" % self.auc[bg][motif.id]
                rm.bg[bg]["mncp"] = "%0.3f" % self.mncp[bg][motif.id]
                rm.bg[bg]["roc_img"] = {"src": roc_img}
                rm.bg[bg]["roc_img_link"] = {"href": roc_img}

            rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
            rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}
            rm.match_img = {"src": "images/%s.png" % self.closest_match[motif.id][0].id}
            rm.match_id = self.closest_match[motif.id][0].id
            rm.match_pval = "%0.2e" % self.closest_match[motif.id][1]

            report_motifs.append(rm)

        env = jinja2.Environment(loader=jinja2.FileSystemLoader([self.config.get_template_dir()]))
        template = env.get_template("report_template.jinja.html")
        result = template.render(expname=self.basename, motifs=report_motifs, inputfile=self.inputfile,
                                 date=datetime.today().strftime("%d/%m/%Y"), version=GM_VERSION)

        # The report is encoded explicitly, so write it as bytes.
        with open(self.motif_report, "wb") as f:
            f.write(result.encode('utf-8'))

    def determine_closest_match(self, motifs):
        """Return ``{motif id: [closest database motif, p-value]}``."""
        self.logger.debug("Determining closest matching motifs in database")
        motif_db = self.config.get_default_params()["motif_db"]
        db = os.path.join(self.config.get_motif_dir(), motif_db)
        db_motifs = []
        if db.endswith("pwm") or db.endswith("pfm"):
            db_motifs = self._read_pwm_motifs(db)
        elif db.endswith("transfac"):
            db_motifs = read_motifs(db, fmt="transfac")

        closest_match = {}
        mc = MotifComparer()
        db_motif_lookup = dict([(m.id, m) for m in db_motifs])
        match = mc.get_closest_match(motifs, db_motifs, "partial", "wic", "mean", parallel=False)
        for motif in motifs:
            best = db_motif_lookup[match[motif.id][0]]
            # Calculate p-value
            pval, _pos, _orient = mc.compare_motifs(motif, best, "partial", "wic", "mean", pval=True)
            closest_match[motif.id] = [best, pval]
        return closest_match

    def _determine_best_motif_in_cluster(self, clusters, pwm, sample_fa, bg_fa, imgdir=None):
        """Pick the motif with the highest MNCP from every cluster.

        The winner is renamed ``GimmeMotifs_<n>`` and written to ``pwm``.
        Returns ``(num_cluster, best_id)``: the number of members keyed by
        ``<id>_<consensus>``, and the first '_'-separated part of the
        winner's previous id keyed by its new id.
        """
        num_cluster = {}
        best_id = {}
        with open(pwm, "w") as out:
            for i, (clus, singles) in enumerate(clusters):
                motifs = [clus] + singles
                # Text mode, because motif strings are written to it.
                tmp = NamedTemporaryFile(mode="w", dir=mytmpdir())
                tmp2 = NamedTemporaryFile(dir=mytmpdir())
                try:
                    for m in motifs:
                        tmp.write("%s\n" % m.to_pwm())
                    tmp.flush()
                    auc, mncp = self._roc_metrics(tmp.name, sample_fa, bg_fa, tmp2.name)
                finally:
                    tmp.close()
                    tmp2.close()

                ranked = sorted(motifs, key=lambda m: mncp[m.id])
                for m in ranked:
                    self.logger.debug("sorted: %s %s %s", str(m), mncp[m.id], auc[m.id])

                self.logger.debug("end list")
                best_motif = ranked[-1]
                old_id = best_motif.id
                best_motif.id = "GimmeMotifs_%d" % (i + 1)
                best_id[best_motif.id] = old_id.split("_")[0]
                num_cluster["%s_%s" % (best_motif.id, best_motif.to_consensus())] = len(singles)
                if imgdir:
                    best_motif.to_img(os.path.join(imgdir, best_motif.id), format="PNG")
                out.write("%s\n" % best_motif.to_pwm())
        return num_cluster, best_id

    def _parse_max_time(self, value):
        """Convert a max_time setting in hours to seconds.

        Returns None (no limit) when the value is empty, cannot be parsed as
        a number, or is not positive.
        """
        if not value:
            self.logger.debug("No time limit for motif prediction")
            return None
        try:
            hours = float(value)
        except (TypeError, ValueError):
            self.logger.debug("Could not parse max_time value, setting to no limit")
            return None
        if hours > 0:
            self.logger.debug("Time limit for motif prediction: %0.2f hours" % hours)
            seconds = 3600 * hours
            self.logger.debug("Max_time in seconds %0.0f" % seconds)
            return seconds
        self.logger.debug("Invalid time limit for motif prediction, setting to no limit")
        return None

    def run_full_analysis(self, inputfile, user_params=None):
        """Full analysis: from bed-file to motifs (including clustering, ROC-curves, location plots and html report).

        Returns the path of the HTML motif report, or None when no
        significant motifs were found. Exits when the input or
        configuration is invalid or when no motifs are predicted.
        """
        self.logger.info("starting full motif analysis")
        self.logger.debug("Using temporary directory {0}".format(mytmpdir()))

        if user_params is None:
            user_params = {}
        params = self.config.get_default_params()
        params.update(user_params)

        if params["torque"]:
            from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
            self.logger.debug("Using torque")
        else:
            from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
            self.logger.debug("Using multiprocessing")

        self.params = params

        background = [x.strip() for x in params["background"].split(",")]

        self.logger.debug("Parameters:")
        for param, value in params.items():
            self.logger.debug(" %s: %s", param, value)

        # Checking input
        self.input_type = "BED"
        # If we can load it as fasta then it is a fasta, yeh?
        try:
            Fasta(inputfile)
            self.logger.debug("Inputfile is a FASTA file")
            self.input_type = "FASTA"
        except Exception:
            # Leave it to BED
            pass

        index_msg = (
            "No index found for genome {}! "
            "Has GimmeMotifs been configured correctly and is the "
            "genome indexed?"
        ).format(params["genome"])
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])

        if self.input_type == "FASTA":
            for bg in background:
                if bg not in FA_VALID_BGS:
                    self.logger.info("Input type is FASTA, can't use background type '%s'", bg)
                if bg == "genomic":
                    if not os.path.exists(index_dir):
                        self.logger.error(index_msg)
                        sys.exit(1)
            background = [bg for bg in background if bg in FA_VALID_BGS]

        elif self.input_type == "BED":
            # Does the index_dir exist? (bed-specific)
            if not os.path.exists(index_dir):
                self.logger.error(index_msg)
                sys.exit(1)

            # is it a valid bed-file etc.
            self._check_input(inputfile)

            # Check for valid background
            for bg in background:
                if bg not in BED_VALID_BGS:
                    self.logger.info("Input type is BED, can't use background type '%s'", bg)
            background = [bg for bg in background if bg in BED_VALID_BGS]

        if len(background) == 0:
            self.logger.error("No valid backgrounds specified!")
            sys.exit(1)

        # Maximum time?
        self.max_time = self._parse_max_time(params["max_time"])

        if "random" in background:
            self.markov_model = params["markov_model"]

        # Create the necessary files for motif prediction and validation
        if self.input_type == "BED":
            self.prepare_input_bed(inputfile, params["genome"], params["width"],
                                   params["fraction"], params["abs_max"], params["use_strand"])

            # Create file for location plots
            index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
            lwidth = int(params["lwidth"])
            width = int(params["width"])
            # Integer division: the extension is a number of bases.
            extend = (lwidth - width) // 2
            genome_index.track2fasta(index_dir, self.validation_bed, self.location_fa,
                                     extend_up=extend, extend_down=extend,
                                     use_strand=params["use_strand"], ignore_missing=True)

        elif self.input_type == "FASTA":
            self.prepare_input_fa(inputfile, params["width"], params["fraction"], params["abs_max"])

            # File for location plots
            self.location_fa = self.validation_fa
            fa = Fasta(self.location_fa)
            seqs = fa.seqs
            lwidth = len(seqs[0])
            all_same_width = all(len(seq) == lwidth for seq in seqs)
            if not all_same_width:
                self.logger.warning("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")

        else:
            self.logger.error("Unknown input type, shouldn't happen")
            sys.exit(1)

        # Map every available tool to whether it was selected. Names are
        # stripped on both sides so "a, b" style lists compare correctly.
        selected_tools = [y.strip() for y in params["tools"].split(",")]
        tools = dict(
            (x.strip(), x.strip() in selected_tools)
            for x in params["available_tools"].split(",")
        )

        self.create_background(background, params["genome"], params["width"])

        # Predict the motifs; input is a FASTA-file
        analysis = params["analysis"]
        self.logger.info("starting motif prediction (%s)", analysis)
        self.logger.info("tools: %s", ", ".join([x for x in tools.keys() if tools[x]]))

        # The background with the lowest BG_RANK is used for significance.
        bg_file = self.bg_file["fa"][min(background, key=lambda b: BG_RANK[b])]
        self.logger.debug("Using bg_file %s for significance" % bg_file)
        result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm, analysis,
                                   params["genome"], params["use_strand"], self.prediction_bg,
                                   tools, self.job_server(), logger=self.logger,
                                   max_time=self.max_time, fg_file=self.validation_fa,
                                   bg_file=bg_file)

        motifs = result.motifs
        self.logger.info("predicted %s motifs", len(motifs))
        self.logger.debug("written to %s", self.predicted_pfm)

        if len(motifs) == 0:
            self.logger.info("no motifs found")
            sys.exit()

        # Write stats output to file
        stat_keys = list(next(iter(result.stats.values())).keys())
        self.logger.debug(result.stats)
        with_stats = []
        with open(self.stats_file, "w") as f:
            f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))
            for motif in motifs:
                stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
                if stats:
                    f.write("%s\t%s\n" % (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
                    with_stats.append(motif)
                else:
                    self.logger.error("No stats for motif {0}, skipping this motif!".format(motif.id))
        # Drop motifs without stats in place, after the loop: removing from
        # the list while iterating it would skip the following motif.
        motifs[:] = with_stats
        self.motifs_with_stats = motifs

        tool_names = list(dict((m.id.split("_")[0], 1) for m in motifs))
        with open(self.ranks_file, "w") as f:
            f.write("Metric\tType\t%s\n" % ("\t".join(tool_names)))
            for stat in ["mncp", "roc_auc", "maxenr"]:
                # Best value per tool (the part of the motif id before '_')
                best_motif = {}
                for motif in self.motifs_with_stats:
                    val = result.stats["%s_%s" % (motif.id, motif.to_consensus())][stat]
                    name = motif.id.split("_")[0]
                    if val > best_motif.setdefault(name, 0):
                        best_motif[name] = val
                names = list(best_motif)
                vals = [best_motif[name] for name in names]
                rank = rankdata(vals)
                ind = [names.index(x) for x in tool_names]

                f.write("%s\t%s\t%s\n" % (stat, "value", "\t".join([str(vals[i]) for i in ind])))
                f.write("%s\t%s\t%s\n" % (stat, "rank", "\t".join([str(rank[i]) for i in ind])))

        # Determine significant motifs
        nsig = 0
        with open(self.significant_pfm, "w") as f:
            for motif in motifs:
                stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
                if stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55 and stats['enr_fdr'] >= 2:
                    f.write("%s\n" % motif.to_pfm())
                    nsig += 1
        self.logger.info("%s motifs are significant", nsig)
        self.logger.debug("written to %s", self.significant_pfm)

        if nsig == 0:
            self.logger.info("no significant motifs found")
            return

        # ROC metrics of significant motifs
        for bg in background:
            self._roc_metrics(self.significant_pfm, self.validation_fa,
                              self.bg_file["fa"][bg], self.bg_file["roc"][bg])

        # Cluster significant motifs
        clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm,
                                        self.outdir, params["cluster_threshold"])

        # Determine best motif in cluster
        num_cluster, best_id = self._determine_best_motif_in_cluster(
            clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

        # Enable parallel and modular evaluation of results:
        # scan (multiple) files with motifs, callbacks compute the statistics.
        tmp = NamedTemporaryFile(dir=mytmpdir()).name
        p = PredictionResult(tmp, logger=self.logger, job_server=self.server,
                             fg_file=self.validation_fa, bg_file=bg_file)
        p.add_motifs(("clustering", (self._read_pwm_motifs(self.final_pwm), "", "")))
        # Wait until statistics for all motifs have arrived
        while len(p.stats.keys()) < len(p.motifs):
            sleep(5)

        for mid, num in num_cluster.items():
            p.stats[mid]["numcluster"] = num

        # Thresholds handed to star() for each statistic
        all_stats = {
            "mncp": [2, 5, 8],
            "roc_auc": [0.6, 0.75, 0.9],
            "maxenr": [10, 20, 30],
            "enr_fdr": [4, 8, 12],
            "fraction": [0.4, 0.6, 0.8],
            "ks_sig": [4, 7, 10],
            "numcluster": [3, 6, 9],
        }

        self.logger.info("creating report")

        # ROC plots
        for bg in background:
            self.create_roc_plots(self.final_pwm, self.validation_fa, self.bg_file["fa"][bg], bg)

        # Location plots
        self.logger.debug("Creating localization plots")
        motifs = self._read_pwm_motifs(self.final_pwm)
        for motif in motifs:
            m = "%s_%s" % (motif.id, motif.to_consensus())
            s = p.stats[m]
            outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
            motif_localization(self.location_fa, motif, lwidth, outfile, cutoff=s["cutoff_fdr"])

            s["stars"] = int(mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
            self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

        # Calculate enrichment of final, clustered motifs
        self.calculate_cluster_enrichment(self.final_pwm, background)

        # Create report
        self.print_params()
        self._calc_report_values(self.final_pwm, background)
        self._create_report(self.final_pwm, background, stats=p.stats, best_id=best_id)
        self._create_text_report(self.final_pwm, background)
        self.logger.info("finished")
        self.logger.info("output dir: %s", os.path.split(self.motif_report)[0])
        self.logger.info("report: %s", os.path.split(self.motif_report)[-1])

        if not(params["keep_intermediate"]):
            self.logger.debug("Deleting intermediate files. Please specifify the -k option if you want to keep these files.")
            shutil.rmtree(self.tmpdir)

        self.logger.debug("Done")

        return self.motif_report
def __init__(self):
    """Set up configuration, supported options and the score data.

    Creates a ``MotifConfig`` instance, records the names of the supported
    metrics and combine modes on the instance, and finally calls
    ``self._load_scores()``.
    """
    supported_metrics = ["pcc", "ed", "distance", "wic"]
    combine_modes = ["mean", "sum"]

    self.config = MotifConfig()
    self.metrics = supported_metrics
    self.combine = combine_modes

    # Runs last so that every attribute set above already exists.
    self._load_scores()
def moap(inputfile, method="classic", scoring="score", outfile=None,
         motiffile=None, pwmfile=None, genome=None, cutoff=0.95):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.

    method : str, optional
        Motif activity method to use. Any of 'classic', 'ks', 'mwu', 'lasso',
        'lightning', 'mara', 'more', 'rf'. Default is 'classic'.

    scoring : str, optional
        Either 'score' or 'count'.

    outfile : str, optional
        Name of outputfile to save the fitted activity values.

    motiffile : str, optional
        Table with motif scan results. First column should be exactly the
        same regions as in the inputfile.

    pwmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied (falls back to the configured default motif database).

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied.

    cutoff : float, optional
        Cutoff for motif scanning (only used when counting motifs).

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        If `scoring` or `method` is not valid, if the input table does not
        have the layout the method expects, or if no genome / usable motif
        file is available when scanning is required.
    """
    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError(
                "no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        # Parse the motif file exactly once and close the handle afterwards;
        # list() forces parsing while the file is still open.
        try:
            with open(pwmfile) as handle:
                parsed_motifs = list(read_motifs(handle))
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        motif_names = [m.id for m in parsed_motifs]
        regions = list(df.index)
        if method == 'classic' or scoring == "count":
            scores = list(s.count(regions, cutoff=cutoff))
        else:
            scores = list(s.best_score(regions))
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)

    # Put the motif table in the same row order as the input table.
    motifs = motifs.loc[df.index]

    if method == "ks":
        clf = KSMoap()
    elif method == "mwu":
        clf = MWMoap()
    elif method == "rf":
        clf = RFMoap()
    elif method == "lasso":
        clf = LassoMoap()
    elif method == "lightning":
        clf = LightningMoap()
    elif method == "mara":
        clf = MaraMoap()
    elif method == "more":
        clf = MoreMoap()
    elif method == "classic":
        clf = ClassicMoap()
    else:
        # A method can pass the table checks above and still have no
        # predictor; fail clearly instead of calling .fit on None.
        raise ValueError("method {} not valid".format(method))

    clf.fit(motifs, df)

    if outfile:
        # Header comments and the activity table go out in a single pass.
        with open(outfile, "w") as f:
            f.write("# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
def create_background_file(outfile, bg_type, fmt="fasta", size=None,
                           genome=None, inputfile=None, number=10000):
    """Create a background file for motif analysis.

    Invalid argument combinations are reported on stdout and terminate the
    program with ``sys.exit(1)``.

    Parameters
    ----------
    outfile : str
        Name of the output file.
    bg_type : str
        Type of background (gc, genomic, random or promoter).
    fmt : str, optional
        Either 'fasta' or 'bed'. 'fa' and 'fsa' are accepted as aliases of
        'fasta'.
    size : int, optional
        Size of the generated sequences, is determined from the inputfile if
        not given (median sequence length).
    genome : str, optional
        Genome name. Required for FASTA output of the gc, genomic and
        promoter backgrounds.
    inputfile : str, optional
        Input sequences. Required for the gc background; also the source for
        the random background and for deriving `size` and `number`.
    number : int, optional
        Number of background sequences. When None, the number of sequences
        in `inputfile` is used, or 10000 if there is no inputfile.
    """
    fmt = fmt.lower()
    if fmt in ["fa", "fsa"]:
        fmt = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" % (",".join(BG_TYPES)))
        sys.exit(1)

    if fmt == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and fmt == "fasta":
        if genome is None:
            print("Need a genome to create background file")
            sys.exit(1)
        # Constructed only for validation; the instance is not kept.
        Genome(genome)

    gene_file = None
    if bg_type in ["promoter"]:
        # Gene definition: prefer the annotation stored next to the genome
        # FASTA and fall back to the configured gene directory when that
        # file is missing.
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(),
                                     "{}.bed".format(genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    if number is None:
        if inputfile:
            number = number_of_seqs_in_file(inputfile)
            logger.info("Using %s of background sequences based on input file",
                        number)
        else:
            number = 10000
            logger.info(
                "Number of background sequences not specified, using 10,000 sequences"
            )

    if bg_type == "random":
        f = Fasta(inputfile)
        m = MarkovFasta(f, n=number, k=1)
        m.writefasta(outfile)
    elif bg_type == "gc":
        if fmt == "fasta":
            m = MatchedGcFasta(inputfile, genome, number=number, size=size)
            m.writefasta(outfile)
        else:
            matched_gc_bedfile(outfile, inputfile, genome, number, size=size)
    else:
        if size is None:
            # np.median returns a float; sequence sizes are integers.
            size = int(np.median(
                [len(seq) for seq in as_fasta(inputfile, genome=genome).seqs]))
        if bg_type == "promoter":
            if fmt == "fasta":
                m = PromoterFasta(gene_file, genome, size=size, n=number)
                m.writefasta(outfile)
            else:
                create_promoter_bedfile(outfile, gene_file, size, number)
        elif bg_type == "genomic":
            if fmt == "fasta":
                m = RandomGenomicFasta(genome, size, number)
                m.writefasta(outfile)
            else:
                create_random_genomic_bedfile(outfile, genome, size, number)
def background(args):
    """Generate a background file from the options in `args`.

    Reads the attributes ``inputfile``, ``outputfile``, ``bg_type``,
    ``outformat``, ``length``, ``genome``, ``number`` and ``markov_order``
    from `args`. Invalid combinations are reported and terminate the program
    with ``sys.exit(1)``.

    Parameters
    ----------
    args : object
        Namespace-like object carrying the attributes listed above.
    """
    inputfile = args.inputfile
    out = args.outputfile
    bg_type = args.bg_type
    outformat = args.outformat.lower()
    length = args.length

    # "fa" selects FASTA output below, so it must go through the same
    # validation as "fasta".
    if outformat == "fa":
        outformat = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" % (",".join(BG_TYPES)))
        sys.exit(1)

    if outformat == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    index_dir = os.path.join(config.get_index_dir(), args.genome)
    if bg_type in ["gc", "genomic", "promoter"] and outformat == "fasta":
        if not os.path.exists(index_dir):
            print("Index for %s does not exist. Has the genome been indexed for use with GimmeMotifs?" % args.genome)
            sys.exit(1)

    # Gene definition
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % args.genome)
    if bg_type in ["promoter"]:
        if not os.path.exists(gene_file):
            print("Can't find gene definition for %s (%s). See GimmeMotifs documentation on how to add gene files." % (args.genome, gene_file))
            sys.exit(1)

    # Number of sequences: an explicit number wins, otherwise match the
    # number of sequences in the input file.
    number = None
    if args.number:
        number = args.number
    elif inputfile:
        number = number_of_seqs_in_file(inputfile)
    else:
        sys.stderr.write("please provide either a number or an inputfile\n")
        sys.exit(1)

    if bg_type == "random":
        f = Fasta(inputfile)
        m = bg.MarkovFasta(f, n=number, k=args.markov_order)
        m.writefasta(out)
    elif bg_type == "gc":
        if outformat in ["fasta", "fa"]:
            m = bg.MatchedGcFasta(inputfile, args.genome, number=number)
            m.writefasta(out)
        else:
            bg.matched_gc_bedfile(out, inputfile, args.genome, number)
    elif bg_type == "promoter":
        if outformat in ["fasta", "fa"]:
            m = bg.PromoterFasta(gene_file, index_dir, length=length, n=number)
            m.writefasta(out)
        else:
            bg.create_promoter_bedfile(out, gene_file, length, number)
    elif bg_type == "genomic":
        if outformat in ["fasta", "fa"]:
            m = bg.RandomGenomicFasta(index_dir, length, number)
            m.writefasta(out)
        else:
            bg.create_random_genomic_bedfile(out, index_dir, length, number)
def create_background(
    bg_type,
    fafile,
    outfile,
    genome="hg18",
    size=200,
    nr_times=10,
    custom_background=None,
):
    """Create background of a specific type.

    Parameters
    ----------
    bg_type : str
        Name of background type: 'random', 'genomic', 'gc', 'promoter' or
        'custom'.

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name.

    size : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this times as many background sequences as compared to
        input file.

    custom_background : str, optional
        Name of an existing FASTA file, used when `bg_type` is 'custom'.

    Returns
    -------
    nr_seqs : int
        Number of sequences created.

    Raises
    ------
    ValueError
        If no gene file can be found for a promoter background, or if
        `bg_type` is not one of the supported types.
    IOError
        If `bg_type` is 'custom' and the custom background file is not
        specified or does not exist.
    """
    size = int(size)
    config = MotifConfig()
    fg = Fasta(fafile)

    if bg_type in ["genomic", "gc"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_times * len(fg))
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, size, nr_times * len(fg))
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_times * len(fg))
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        # Prefer the annotation stored next to the genome FASTA and fall
        # back to the configured gene directory when that file is missing.
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genome)
        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            raise ValueError(
                "Could not find a gene file for genome {}".format(genome))
        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome,
            gene_file,
        )
        f = PromoterFasta(gene_file, genome, size, nr_times * len(fg))
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")
        if not os.path.exists(bg_file):
            raise IOError(
                "Custom background file %s does not exist!" % bg_file)

        logger.info("Copying custom background file %s to %s.", bg_file, outfile)
        f = Fasta(bg_file)
        median_length = np.median([len(seq) for seq in f.seqs])
        # Warn when the custom sequences deviate more than 5% from `size`.
        if median_length < (size * 0.95) or median_length > (size * 1.05):
            logger.warning(
                "The custom background file %s contains sequences with a "
                "median size of %s, while GimmeMotifs predicts motifs in sequences "
                "of size %s. This will influence the statistics! It is recommended "
                "to use background sequences of the same size.",
                bg_file,
                median_length,
                size,
            )
    else:
        # Without this guard an unknown type fails below on an unbound name.
        raise ValueError("Unknown background type: {}".format(bg_type))

    f.writefasta(outfile)
    return len(f)
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan the regions listed in a table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Either a tab-separated text file (regions
        in the first column) or a feather file (name ends with "feather").

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" produces motif counts; any other value produces best scores.

    pfmfile : str, optional
        PFM file used for scanning. Defaults to the configured motif
        database.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    zscore : bool, optional
        Forwarded to the scanner's ``best_score``; not used when counting.

    gc : bool, optional
        Forwarded to the scanner's ``set_background`` and ``best_score``.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index.
        Values are either counts or scores depending on `scoring`.

    Raises
    ------
    ValueError
        If no pfmfile is given and no default database is configured.
    """
    config = MotifConfig()

    if pfmfile is None:
        default_db = config.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError("no pfmfile given and no default database specified")
        pfmfile = os.path.join(config.get_motif_dir(), default_db)

    logger.info("reading table")
    is_feather = input_table.endswith("feather")
    if not is_feather:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index
    else:
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values

    regions = list(idx)

    # Estimate the region size from at most 1000 randomly chosen regions.
    if len(regions) < 1000:
        check_regions = regions
    else:
        check_regions = np.random.choice(regions, size=1000, replace=False)
    sampled = as_fasta(check_regions, genome=genome)
    size = int(np.median([len(seq) for seq in sampled.seqs]))

    s = Scanner(ncpus=ncpus)
    s.set_motifs(pfmfile)
    s.set_genome(genome)
    s.set_background(genome=genome, gc=gc, size=size)

    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(s.count(regions))
    else:
        s.set_threshold(threshold=0.0)
        parts = ["creating score table"]
        if not zscore:
            parts.append(" (logodds)")
        else:
            parts.append(" (z-score")
            if gc:
                parts.append(", GC%")
            parts.append(")")
        logger.info("".join(parts))
        scores = list(s.best_score(regions, zscore=zscore, gc=gc))
    logger.info("done")

    motif_names = [m.id for m in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def run(self):
    """Rewrite the build-time configuration for the install location.

    Loads the configuration from ``self.build_cfg``, points every data
    directory at the install directory, replaces the build tools directory
    by the final tools directory in all configured program paths and in the
    seqlogo path, and writes the result below ``self.install_dir``. An
    existing config file is never overwritten: the new configuration is
    written next to it with a ".tmp" suffix instead.
    """
    from gimmemotifs.config import MotifConfig

    cfg = MotifConfig(use_config=self.build_cfg)

    data_dir = self.remove_nonsense(os.path.abspath(self.install_dir))

    cfg.set_template_dir(os.path.join(data_dir, 'gimmemotifs/templates'))
    cfg.set_gene_dir(os.path.join(data_dir, 'gimmemotifs/genes'))
    cfg.set_score_dir(os.path.join(data_dir, 'gimmemotifs/score_dists'))
    cfg.set_index_dir(os.path.join(data_dir, 'gimmemotifs/genome_index'))
    cfg.set_motif_dir(os.path.join(data_dir, 'gimmemotifs/motif_databases'))
    cfg.set_bg_dir(os.path.join(data_dir, 'gimmemotifs/bg'))
    cfg.set_tools_dir(os.path.join(data_dir, 'gimmemotifs/tools'))

    final_tools_dir = self.remove_nonsense(self.install_tools_dir)
    for program in MOTIF_CLASSES:
        # NOTE(review): eval assumes MOTIF_CLASSES is a trusted,
        # package-defined list of class names -- confirm.
        m = eval(program)()
        if cfg.is_configured(m.name):
            tool_bin = cfg.bin(m.name).replace(self.build_tools_dir, final_tools_dir)
            tool_dir = cfg.dir(m.name)
            if tool_dir:
                tool_dir = tool_dir.replace(self.build_tools_dir, final_tools_dir)
            cfg.set_program(m.name, {"bin": tool_bin, "dir": tool_dir})

    seqlogo = cfg.get_seqlogo()
    seqlogo = seqlogo.replace(self.build_tools_dir, final_tools_dir)
    cfg.set_seqlogo(seqlogo)

    # Use a user-specific configfile if any other installation scheme is used
    config_file = os.path.join(self.install_dir, "gimmemotifs/%s" % CONFIG_NAME)
    self.outfiles = [config_file]

    if os.path.exists(config_file):
        new_config = config_file + ".tmp"
        dlog.info("INFO: Configfile %s already exists!" % config_file)
        dlog.info("INFO: Will create %s, which contains the new config." % new_config)
        dlog.info("INFO: If you want to use the newly generated config you can move %s to %s, otherwise you can delete %s.\n" % (new_config, config_file, new_config))
        target = new_config
    else:
        dlog.info("writing configuration file %s" % config_file)
        target = config_file

    # The context manager guarantees the config is flushed and closed.
    # NOTE(review): binary mode only works if cfg.write emits byte strings
    # (as on Python 2) -- confirm before running this under Python 3.
    with open(target, "wb") as f:
        cfg.write(f)

    if os.path.abspath(self.install_dir) != "/usr/share":
        dlog.info("PLEASE NOTE: GimmeMotifs is installed in a non-standard location.")
        dlog.info("PLEASE NOTE: This is fine, but then every user should have a file called ~/.gimmemotifs.cfg")
        dlog.info("PLEASE NOTE: The file %s is fully configured during install and can be used for that purpose." % config_file)
def run(self):
    """Locate the motif programs and write the build-time configuration.

    For every class named in ``MOTIF_CLASSES`` the tool is looked up first
    in ``self.build_tools_dir`` and then, if it is not there, with
    ``which``. Every tool that is found is registered in the configuration.
    Tools listed in ``LONG_RUNNING`` are recorded as available but left out
    of the default tool selection. The configuration is written to
    ``CONFIG_NAME`` inside ``self.build_cfg``.
    """
    if not os.path.exists(self.build_cfg):
        os.mkdir(self.build_cfg)

    from gimmemotifs.config import MotifConfig
    cfg = MotifConfig(use_config="cfg/gimmemotifs.cfg.base")

    dlog.info("locating motif programs")

    # Some bundled tools live in a subdirectory of the build tools dir,
    # while the installed versions are looked up by their plain name.
    included_path = {
        "trawler.pl": "trawler/bin/trawler.pl",
        "ChIPMunk.sh": "ChIPMunk/ChIPMunk.sh",
        "hms": "HMS/hms",
    }
    installed_name = dict((path, name) for name, path in included_path.items())

    available = []
    for program in MOTIF_CLASSES:
        # Get class
        # NOTE(review): eval assumes MOTIF_CLASSES is a trusted,
        # package-defined list of class names -- confirm.
        m = eval(program)()
        cmd = included_path.get(m.cmd, m.cmd)

        tool_bin = ""
        if cmd == "/bin/false":
            # motif db
            tool_bin = "/bin/false"
        elif os.path.exists(os.path.join(self.build_tools_dir, cmd)):
            tool_bin = os.path.join(self.build_tools_dir, cmd)
            dlog.info("using included version of %s: %s" % (program, tool_bin))
        else:
            cmd = installed_name.get(cmd, cmd)
            if program in MOTIF_BINS:
                dlog.info("could not find compiled version of %s" % program)
            tool_bin = which(cmd)
            if tool_bin:
                dlog.info("using installed version of %s: %s" % (program, tool_bin))
            else:
                dlog.info("not found: %s" % program)

        if tool_bin:
            # The tool directory is the binary path minus a tool-specific
            # suffix.
            tool_dir = tool_bin.replace(m.cmd, "")
            if program == "Weeder":
                tool_dir = tool_bin.replace("weederTFBS.out", "")
            elif program == "Meme":
                tool_dir = tool_bin.replace("bin/meme.bin", "").replace("meme.bin", "")
            elif program == "Trawler":
                tool_dir = tool_bin.replace("bin/trawler.pl", "")
            elif program == "ChIPMunk":
                tool_dir = tool_bin.replace("ChIPMunk.sh", "")

            available.append(m.name)
            cfg.set_program(m.name, {"bin": tool_bin, "dir": tool_dir})

    # Weblogo
    seq_included = os.path.join(self.build_tools_dir, "seqlogo")
    if os.path.exists(seq_included):
        seqlogo = seq_included
        dlog.info("using included version of weblogo: %s" % seq_included)
    else:
        seqlogo = which("seqlogo")
        if seqlogo:
            dlog.info("using installed version of seqlogo: %s" % (seqlogo))
    if seqlogo:
        cfg.set_seqlogo(seqlogo)
    else:
        dlog.info("couldn't find seqlogo")

    # Set the available tools in the config file
    DEFAULT_PARAMS["available_tools"] = ",".join(available)

    # Build a separate list of default tools; removing items from
    # `available` while looping over it would skip the tool that follows
    # each removed entry.
    default_tools = []
    for tool in available:
        if tool in LONG_RUNNING:
            dlog.info("PLEASE NOTE: %s can take a very long time to run on large datasets. Therefore it is not added to the default tools. You can always enable it later, see documentation for details" % tool)
        else:
            default_tools.append(tool)

    DEFAULT_PARAMS["tools"] = ",".join(default_tools)
    cfg.set_default_params(DEFAULT_PARAMS)

    # Write (temporary) config file
    config_file = os.path.join(self.build_cfg, "%s" % CONFIG_NAME)
    dlog.info("writing (temporary) configuration file: %s" % config_file)
    # NOTE(review): binary mode only works if cfg.write emits byte strings
    # (as on Python 2) -- confirm before running this under Python 3.
    with open(config_file, "wb") as f:
        cfg.write(f)