def pfmfile_location(infile):
    """Resolve a motif file argument to a usable location.

    Parameters
    ----------
    infile : str, other object or None
        Motif file name. ``None`` selects the ``motif_db`` entry of the
        default parameters in the configuration. Values that are not
        strings are returned unchanged.

    Returns
    -------
    The resolved path for string input (either the path as given, or the
    matching file in the configured motif directory, optionally with a
    ``.pfm`` or ``.pwm`` extension appended); the input itself otherwise.

    Raises
    ------
    ValueError
        If no file was given and no default is configured, or if a string
        was given that cannot be resolved to an existing file.
    """
    config = MotifConfig()

    if infile is None:
        infile = config.get_default_params().get("motif_db", None)
        if infile is None:
            raise ValueError(
                "No motif file was given and no default "
                "database specified in the config file."
            )

    # Non-string input and paths that already exist need no lookup.
    if not isinstance(infile, six.string_types) or os.path.exists(infile):
        return infile

    # Otherwise look in the configured motif directory: first the name as
    # given, then with each of the known extensions.
    in_motif_dir = os.path.join(config.get_motif_dir(), infile)
    for candidate in (in_motif_dir, in_motif_dir + ".pfm", in_motif_dir + ".pwm"):
        if os.path.exists(candidate):
            return candidate

    raise ValueError("Motif file {} not found".format(infile))
def create_roc_plots(pfmfile, fgfa, background, outdir, genome):
    """Make ROC plots for all motifs.

    ROC values are computed in a worker pool (one job per motif and
    background) and every result is plotted to
    ``<outdir>/images/<motif id>_roc.<background>.png``.

    Parameters
    ----------
    pfmfile : str
        Motif file, read with ``read_motifs(..., fmt="pwm")``.
    fgfa : str
        Passed to ``get_roc_values`` (presumably the foreground FASTA
        file -- confirm against ``get_roc_values``).
    background : dict
        Maps a background name to the file name handed to
        ``get_roc_values``.
    outdir : str
        Output directory; the ``images`` sub-directory is created when it
        does not exist.
    genome : str
        Passed through to ``get_roc_values``.

    Notes
    -----
    Exits the process with status 1 when a job reports an error.
    """
    motifs = read_motifs(pfmfile, fmt="pwm", as_dict=True)
    ncpus = int(MotifConfig().get_default_params()["ncpus"])

    pool = Pool(processes=ncpus)
    try:
        jobs = {}
        for bg, fname in background.items():
            for motif in motifs.values():
                key = "{}_{}".format(str(motif), bg)
                jobs[key] = pool.apply_async(
                    get_roc_values, (motif, fgfa, fname, genome))

        imgdir = os.path.join(outdir, "images")
        if not os.path.exists(imgdir):
            os.mkdir(imgdir)
        roc_img_file = os.path.join(imgdir, "{}_roc.{}.png")

        for motif in motifs.values():
            for bg in background:
                key = "{}_{}".format(str(motif), bg)
                error, x, y = jobs[key].get()
                if error:
                    logger.error("Error in thread: %s", error)
                    logger.error("Motif: %s", motif)
                    sys.exit(1)
                roc_plot(roc_img_file.format(motif.id, bg), x, y)
    finally:
        # Always release the worker processes, also on error / sys.exit.
        pool.terminate()
        pool.join()
def __init__(self, scale=True, ncpus=None):
    """Predict motif activities using Support Vector Regression.

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled before
        classification.

    ncpus : int, optional
        Number of threads. When not given, the ``ncpus`` entry of the
        default parameters in the config is used (2 if that entry is
        missing).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        SVR weights. ``None`` until set elsewhere.
    """
    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

    # User-facing settings.
    self.scale = scale
    self.ncpus = ncpus

    # Result placeholder.
    self.act_ = None

    # Static description of this predictor.
    self.act_description = "activity values: SVR weights"
    self.ptype = "regression"
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    """Scan the regions of a table with motifs and return the result table.

    Parameters
    ----------
    input_table : str
        Tab-separated file; its first column is used as index and holds
        the regions to scan.
    genome : str
        Genome, passed to ``check_threshold`` and ``Scanner.set_genome``.
    data_dir : str
        Passed to ``check_threshold``.
    scoring : str
        ``"count"`` collects ``Scanner.count`` results (using the value
        returned by ``check_threshold`` as cutoff); any other value
        collects ``Scanner.best_score`` results.
    pwmfile : str, optional
        Motif file. Defaults to the ``motif_db`` entry of the config,
        located in the configured motif directory.

    Returns
    -------
    pandas.DataFrame
        One row per region (index of the input table), one column per
        motif id.

    Raises
    ------
    ValueError
        If no motif file is given and none is configured.
    """
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)

    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        scores = list(s.count(regions, cutoff=threshold))
    else:
        scores = list(s.best_score(regions))

    # Close the motif file deterministically instead of leaking the handle.
    with open(pwmfile) as f:
        motif_names = [m.id for m in read_motifs(f)]

    return pd.DataFrame(scores, index=df.index, columns=motif_names)
def scan_it_moods(infile, motifs, cutoff, bgfile, nreport=1, scan_rc=True,
                  pvalue=None, count=False):
    """Scan the sequences of a FASTA file with motifs using MOODS.

    This is a generator; nothing is executed before the first item is
    requested.

    Parameters
    ----------
    infile : str
        Input file, loaded with ``Fasta``.
    motifs : list of motif objects
        Every motif needs an ``id`` and a ``pwm`` attribute.
    cutoff : float
        Passed to ``calc_threshold_moods`` for every matrix; only used
        when `pvalue` is ``None``.
    bgfile : str
        FASTA file; its concatenated sequences define the background.
    nreport, scan_rc
        Forwarded unchanged to the worker function.
    pvalue : float, optional
        When given, thresholds come from ``MOODS.tools.threshold_from_p``
        instead of from `cutoff`.
    count : bool, optional
        Use ``scan_fa_with_motif_moods_count`` as worker instead of
        ``scan_fa_with_motif_moods``.

    Yields
    ------
    Every item of every worker result, in submission order.
    """
    # Local import: only needed here, for removing the temporary directory.
    import shutil

    pseudocount = 1e-3
    bg = MOODS.tools.bg_from_sequence_dna("".join(Fasta(bgfile).seqs), 1)

    # MOODS parses matrices from files, so every motif is written to a
    # temporary PFM file first. The directory is removed once all
    # matrices are parsed.
    matrices = []
    tmpdir = mkdtemp()
    try:
        for motif in motifs:
            pfmname = os.path.join(tmpdir, "{}.pfm".format(motif.id))
            with open(pfmname, "w") as f:
                for row in np.array(motif.pwm).transpose():
                    f.write("{}\n".format(" ".join([str(x) for x in row])))
            matrices.append(
                MOODS.parsers.pfm_log_odds(pfmname, bg, pseudocount))
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    if pvalue is not None:
        thresholds = [
            MOODS.tools.threshold_from_p(m, bg, float(pvalue))
            for m in matrices
        ]
    else:
        thresholds = [calc_threshold_moods(m, float(cutoff)) for m in matrices]

    ncpus = int(MotifConfig().get_default_params()["ncpus"])

    fa = Fasta(infile)
    nseqs = len(fa)
    chunk = 500
    if nseqs < ncpus * chunk:
        # Integer division, at least 1: range() needs a positive int step.
        chunk = max(1, nseqs // (ncpus + 1))

    func = scan_fa_with_motif_moods_count if count else scan_fa_with_motif_moods

    pool = mp.Pool(processes=ncpus)
    try:
        jobs = [
            pool.apply_async(
                func,
                (fa[i:i + chunk], motifs, matrices, bg, thresholds,
                 nreport, scan_rc),
            )
            for i in range(0, nseqs, chunk)
        ]
        for job in jobs:
            for ret in job.get():
                yield ret
    finally:
        # Also runs when the consumer abandons the generator early.
        pool.terminate()
        pool.join()
class MotifProgram(object):
    """Base class for a wrapper around an external motif prediction tool.

    Subclasses are expected to provide a ``name`` attribute and a
    ``_run_program`` method; neither is defined here.
    """

    # Kept at class level as in the original: the configuration is read
    # once and shared by all instances.
    from gimmemotifs.config import MotifConfig
    config = MotifConfig()

    def __init__(self):
        pass

    def bin(self):
        """Return what the configuration lists as executable of the tool."""
        return self.config.bin(self.name)

    def dir(self):
        """Return what the configuration lists as directory of the tool."""
        return self.config.dir(self.name)

    def is_configured(self):
        """Return whether the configuration knows this tool."""
        return self.config.is_configured(self.name)

    def is_installed(self):
        """Return whether the tool is configured and its binary executable."""
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, savedir, params=None):
        """Run the tool on a FASTA file.

        Parameters
        ----------
        fastafile : str
            Input file, passed to ``_run_program``.
        savedir : str
            Passed to ``_run_program``.
        params : dict, optional
            Tool parameters; an empty dict is used when omitted.

        Returns
        -------
        Whatever ``_run_program`` returns, or ``([], "Killed", "Killed")``
        when the run is interrupted from the keyboard.

        Raises
        ------
        ValueError
            If the tool is not configured or not installed.
        """
        # A fresh dict per call instead of a shared mutable default.
        if params is None:
            params = {}

        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        try:
            return self._run_program(self.bin(), fastafile, savedir, params)
        except KeyboardInterrupt:
            return ([], "Killed", "Killed")
def get_genome(genomebuild, fastadir, indexdir=None):
    """Download a genome with annotation and index it.

    Parameters
    ----------
    genomebuild : str
        Genome build name; used as sub-directory name and as base name of
        the annotation BED file.
    fastadir : str
        Directory in which the ``genomebuild`` sub-directory with the
        FASTA files is created.
    indexdir : str, optional
        Directory holding the indexes. Defaults to the index directory of
        the configuration.

    Notes
    -----
    Exits the process with status 1 when the genome directory cannot be
    created.
    """
    config = MotifConfig()
    index_root = indexdir if indexdir else config.get_index_dir()

    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(index_root, genomebuild)

    # Creating the directory doubles as a check for write permission.
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Annotation first, then the genome sequence.
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    download_annotation(genomebuild, gene_file)
    download_genome(genomebuild, genome_dir)

    sys.stderr.write("Creating index\n")
    GenomeIndex().create_index(genome_dir, index_dir)
    create_bedtools_fa(index_dir, genome_dir)
def get_all_scores(self, motifs, dbmotifs, match, metric, combine,
                   pval=False, parallel=True, trim=None, ncpus=None):
    """Score every motif against every database motif.

    Parameters
    ----------
    motifs : list
        Query motifs.
    dbmotifs : list
        Motifs to compare against.
    match, metric, combine
        Passed unchanged to ``_get_all_scores``.
    pval : bool, optional
        Passed unchanged to ``_get_all_scores``.
    parallel : bool, optional
        Split `dbmotifs` into batches and score them in a worker pool.
    trim : optional
        When truthy, every motif of both lists is trimmed in place with
        ``m.trim(trim)`` before scoring.
    ncpus : int, optional
        Number of worker processes. Defaults to the ``ncpus`` entry of the
        configuration.

    Returns
    -------
    dict
        In parallel mode a nested dict ``scores[m1][m2]`` merged from all
        batches; otherwise the return value of ``_get_all_scores``.
    """
    if trim:
        for m in motifs:
            m.trim(trim)
        for m in dbmotifs:
            m.trim(trim)

    if not parallel:
        # Do the whole thing at once.
        return _get_all_scores(self, motifs, dbmotifs, match, metric,
                               combine, pval)

    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params()["ncpus"])

    scores = {}
    pool = Pool(processes=ncpus, maxtasksperchild=1000)
    try:
        # One big batch per process keeps the parallel overhead low.
        batch_len = max(1, len(dbmotifs) // ncpus)
        jobs = [
            pool.apply_async(
                _get_all_scores,
                args=(self, motifs, dbmotifs[i:i + batch_len], match,
                      metric, combine, pval))
            for i in range(0, len(dbmotifs), batch_len)
        ]
        pool.close()

        for job in jobs:
            for m1, v in job.get().items():
                for m2, s in v.items():
                    scores.setdefault(m1, {})[m2] = s
    except BaseException:
        # Do not leave workers behind when a job fails.
        pool.terminate()
        raise
    finally:
        pool.join()

    return scores
def __init__(self, scale=True, permute=False, ncpus=None):
    """Predict motif activities using lightning CDClassifier

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled before
        classification

    permute : boolean, optional, default False
        Stored as ``self.permute``; not used in this constructor.

    ncpus : int, optional
        Number of threads, used as ``n_jobs`` of the grid search. When
        not given, the ``ncpus`` entry of the config is used (2 if that
        entry is missing).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted coefficients

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than
        the 1%t from random permutation
    """
    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

    self.act_description = ("activity values: coefficients from "
                            "fitted model")
    self.scale = scale
    self.permute = permute

    # Classifier and the hyper-parameter grid that is searched.
    self.cdc = CDClassifier()
    alphas = [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)]
    self.parameters = {
        "penalty": ["l1/l2"],
        "loss": ["squared_hinge"],
        "multiclass": [True],
        "max_iter": [20],
        "alpha": alphas,
        "C": [0.001, 0.01, 0.1, 0.5, 1.0],
        "tol": [1e-3],
    }
    self.kfolds = 10
    self.clf = GridSearchCV(
        self.cdc,
        self.parameters,
        cv=self.kfolds,
        n_jobs=ncpus,
    )

    # Results, filled in elsewhere.
    self.act_ = None
    self.sig_ = None

    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
    self.ptype = "classification"
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    The regions are brought to the same width, split into a prediction
    and a validation set, and both sets are converted to FASTA. A third
    FASTA file with extended validation regions is written for the
    location plots.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. The keys ``width``, ``abs_max``,
        ``fraction``, ``genome``, ``lwidth`` and ``use_strand`` are read.

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Regions of equal size.
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    # Prediction / validation split.
    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile, pred_bedfile, val_bedfile)
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # The configuration object is instantiated as in the original code;
    # its value is not used below.
    config = MotifConfig()

    genome = Genome(params["genome"])
    for bed in (pred_bedfile, val_bedfile):
        fasta = bed.replace(".bed", ".fa")
        genome.track2fasta(bed, fasta)

    # Validation regions, extended on both sides to the location width.
    lwidth = int(params["lwidth"])
    extend = (lwidth - width) // 2
    localization_fa = os.path.join(outdir, "localization.fa")
    genome.track2fasta(
        val_bedfile,
        localization_fa,
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
def __init__(self, ncpus=None):
    """Set up configuration, optional worker pool and optional cache.

    Parameters
    ----------
    ncpus : int, optional
        Number of processes. Defaults to the ``ncpus`` entry of the
        configuration. A pool (``self.pool``) is only created when this
        is larger than 1.
    """
    self.config = MotifConfig()
    self.threshold = None
    self.genome = None

    self.ncpus = (
        int(MotifConfig().get_default_params()["ncpus"])
        if ncpus is None
        else ncpus
    )

    if self.ncpus > 1:
        try:
            # Prefer a 'spawn' context where multiprocessing offers one.
            self.pool = mp.get_context('spawn').Pool(processes=self.ncpus)
        except AttributeError:
            self.pool = mp.Pool(processes=self.ncpus)

    self.use_cache = False
    cache_requested = self.config.get_default_params().get("use_cache", False)
    if cache_requested:
        self._init_cache()
def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None):
    """Predict motif activities using Lasso MultiTask regression

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled before
        classification

    kfolds : integer, optional, default 4
        number of kfolds for parameter search

    alpha_stepsize : float, optional, default 1.0
        stepsize for use in alpha gridsearch

    ncpus : int, optional
        Number of threads. When not given, the ``ncpus`` entry of the
        config is used (2 if that entry is missing).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted motif activities

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than
        the 1%t from random permutation
    """
    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

    self.kfolds = kfolds
    self.scale = scale
    self.ncpus = ncpus
    self.act_description = ("activity values: coefficients from "
                            "fitted model")

    # Results, filled in elsewhere.
    self.act_ = None
    self.sig_ = None

    # Grid search over exponentially decreasing alpha values.
    alphas = [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]
    self.clf = GridSearchCV(
        MultiTaskLasso(),
        {"alpha": alphas},
        cv=kfolds,
        n_jobs=self.ncpus,
        scoring="r2",
    )

    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
    self.ptype = "regression"
def default_motifs():
    """Return list of Motif instances from default motif database.

    Raises
    ------
    ValueError
        If the motif directory or the ``motif_db`` entry is missing from
        the configuration.
    """
    config = MotifConfig()
    motif_dir = config.get_motif_dir()
    # .get() so that a missing entry reaches the ValueError below instead
    # of surfacing as a bare KeyError.
    motif_db = config.get_default_params().get("motif_db")

    if not motif_dir or not motif_db:
        raise ValueError("default motif database not configured")

    with open(os.path.join(motif_dir, motif_db)) as f:
        return read_motifs(f)
def __init__(self, ncpus=None):
    """Set up configuration, optional worker pool and optional cache.

    Parameters
    ----------
    ncpus : int, optional
        Number of processes. Defaults to the ``ncpus`` entry of the
        configuration. A pool (``self.pool``) is only created when this
        is larger than 1.
    """
    self.config = MotifConfig()

    # Scanner state that is filled in later.
    self.threshold = None
    self.genome = None
    self.background = None
    self.meanstd = {}
    self.gc_bins = [(0, 1)]

    self.ncpus = (
        int(MotifConfig().get_default_params()["ncpus"])
        if ncpus is None
        else ncpus
    )

    # The pool uses the default multiprocessing start method.
    if self.ncpus > 1:
        self.pool = mp.Pool(processes=self.ncpus)

    self.use_cache = False
    cache_requested = self.config.get_default_params().get("use_cache", False)
    if cache_requested:
        self._init_cache()
def run(self):
    """Write the configuration file for the final install location.

    Starts from the build-time configuration (``self.build_cfg``), points
    all data directories to the install directory, rewrites tool paths
    from the build tools directory to the install tools directory and
    writes the result to ``<install_dir>/gimmemotifs/<CONFIG_NAME>``.
    An existing configuration file is kept under a time-stamped name.
    """
    from gimmemotifs.config import MotifConfig

    cfg = MotifConfig(use_config=self.build_cfg)

    data_dir = self.remove_nonsense(os.path.abspath(self.install_dir))
    dlog.info("data_dir: {}".format(data_dir))

    cfg.set_template_dir(os.path.join(data_dir, 'gimmemotifs/templates'))
    cfg.set_gene_dir(os.path.join(data_dir, 'gimmemotifs/genes'))
    cfg.set_score_dir(os.path.join(data_dir, 'gimmemotifs/score_dists'))
    cfg.set_motif_dir(os.path.join(data_dir, 'gimmemotifs/motif_databases'))
    cfg.set_bg_dir(os.path.join(data_dir, 'gimmemotifs/bg'))
    cfg.set_tools_dir(os.path.join(data_dir, 'gimmemotifs/tools'))

    final_tools_dir = self.remove_nonsense(self.install_tools_dir)
    build_tools_dir = os.path.abspath(self.build_tools_dir)

    for program in MOTIF_CLASSES:
        # NOTE(review): eval() resolves the class by name; this is only
        # acceptable as long as MOTIF_CLASSES is a trusted constant.
        m = eval(program)()
        if cfg.is_configured(m.name):
            tool_bin = cfg.bin(m.name).replace(build_tools_dir, final_tools_dir)
            tool_dir = cfg.dir(m.name)
            if tool_dir:
                tool_dir = tool_dir.replace(build_tools_dir, final_tools_dir)
            cfg.set_program(m.name, {"bin": tool_bin, "dir": tool_dir})

    seqlogo = cfg.get_seqlogo().replace(build_tools_dir, final_tools_dir)
    cfg.set_seqlogo(seqlogo)

    config_file = os.path.join(self.install_dir, "gimmemotifs/%s" % CONFIG_NAME)
    self.outfiles = [config_file]

    if os.path.exists(config_file):
        # Never overwrite an existing configuration: move it aside.
        timestr = time.strftime("%Y%m%d-%H%M%S")
        old_config = "{}.{}".format(config_file, timestr)
        shutil.move(config_file, old_config)
        dlog.info("INFO: Configfile %s already existed!", config_file)
        dlog.info("INFO: This config has been saved as %s", old_config)

    dlog.info("writing configuration file %s" % config_file)
    # The context manager guarantees the file is flushed and closed.
    with open(config_file, "w") as f:
        cfg.write(f)
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None,
                  ncpus=None):
    """Scan the regions of a table with motifs and return the result table.

    Parameters
    ----------
    input_table : str
        File with the regions in its first column. Names ending in
        ``feather`` are read with ``pandas.read_feather``; everything
        else as tab-separated text in which ``#`` starts a comment.
    genome : str
        Genome, used for scanning and for the count threshold.
    data_dir : str
        Not used by this function; kept in the signature for callers.
    scoring : str
        ``"count"`` creates a table of motif counts (threshold set from
        ``FPR``); any other value a table of best scores.
    pwmfile : str, optional
        Motif file. Defaults to the ``motif_db`` entry of the config,
        located in the configured motif directory.
    ncpus : int, optional
        Passed to ``Scanner``.

    Returns
    -------
    pandas.DataFrame
        One row per region, one column per motif id.

    Raises
    ------
    ValueError
        If no motif file is given and none is configured.
    """
    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index
    regions = list(idx)

    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR, genome=genome)
        logger.info("creating count table")
        scores = list(s.count(regions))
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        scores = list(s.best_score(regions))
        logger.info("done")

    # Close the motif file deterministically instead of leaking the handle.
    with open(pwmfile) as f:
        motif_names = [m.id for m in read_motifs(f)]

    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def maelstrom_html_report(outdir, infile, pwmfile=None, threshold=2):
    """Write an HTML report for a table of motif activities.

    Only rows with at least one absolute value of `threshold` or more are
    reported. Logos of the reported motifs are written to
    ``<outdir>/logos`` and the report itself to
    ``<outdir>/gimme.maelstrom.report.html``.

    Parameters
    ----------
    outdir : str
        Output directory.
    infile : str
        Tab-separated table; the first column (motif ids) is the index.
    pwmfile : str, optional
        Motif file. The default motif database is used when not given.
    threshold : float, optional
        Minimum absolute value a row needs in at least one column.
    """
    df = pd.read_table(infile, index_col=0)
    df = df[np.any(abs(df) >= threshold, 1)]

    # Symmetric colour range around zero.
    vmax = max(abs(df.min().min()), df.max().max())
    vmin = -vmax

    if pwmfile:
        with open(pwmfile) as f:
            motifs = read_motifs(f)
    else:
        motifs = default_motifs()

    # Drop the index name so it does not show up in the rendered table.
    df.index.name = None
    cols = df.columns

    motif2factors = {motif.id: ",".join(motif.factors) for motif in motifs}
    df["factors"] = [motif2factors.get(motif_id, "") for motif_id in df.index]

    # Show at most 30 characters; the full text goes into the tooltip.
    is_long = df["factors"].str.len() > 30
    df["factors"] = '<div title="' + df["factors"] + '">' + df["factors"].str.slice(0, 30)
    df.loc[is_long, "factors"] += '(...)'
    df['factors'] += '</div>'

    df["logo"] = ['<img src="logos/{}.png" height=40/>'.format(x) for x in list(df.index)]

    logo_dir = os.path.join(outdir, "logos")
    if not os.path.exists(logo_dir):
        os.makedirs(logo_dir)
    for motif in motifs:
        if motif.id in df.index:
            motif.to_img(os.path.join(logo_dir, "{}.png".format(motif.id)), fmt="PNG")

    template_dir = MotifConfig().get_template_dir()
    with open(os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8") as f:
        js = f.read()
    with open(os.path.join(template_dir, "sortable/sortable-theme-slick.css"), encoding="utf-8") as f:
        css = f.read()

    df = df[["factors", "logo"] + list(cols)]
    table_html = (
        df.style
        .apply(background_gradient, low=0.7, high=0.7, m=vmin, M=vmax, subset=cols)
        .set_precision(3)
        .set_table_attributes("data-sortable")
        .render()
        .replace("data-sortable", 'class="sortable-theme-slick" data-sortable')
    )

    report = os.path.join(outdir, "gimme.maelstrom.report.html")
    with open(report, "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        f.write(table_html)
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
def __init__(self, name=None):
    """Set up a run: output directory, logging and file names.

    Parameters
    ----------
    name : str, optional
        Name of the run, handed to ``_setup_output_dir``. Defaults to
        ``<NAME>_<dd_mm_YYYY>`` built from the class ``NAME`` and today's
        date.
    """
    self.config = MotifConfig()
    self.server = None

    if not name:
        today = datetime.today().strftime("%d_%m_%Y")
        name = "%s_%s" % (self.NAME, today)
    self.name = name

    # Directory for all the intermediate and output files.
    self._setup_output_dir(name)

    # Logging must be ready before the first message below.
    self._setup_logging()
    self.logger.info("%s version %s", self.NAME, GM_VERSION)
    self.logger.info("output dir: %s", self.outdir)

    # Names of the intermediate and output files.
    self._setup_filenames()
def _write_report(outdir, ids, tree, clusters):
    """Write the clustering report files to `outdir`.

    Three files are written: ``cluster_report.html`` (rendered from the
    ``cluster_template.jinja.html`` template), ``cluster_key.txt`` (one
    line per cluster: its id and the ``alt`` names of its members) and
    ``clustered_motifs.pwm``.

    Parameters
    ----------
    outdir : str
        Output directory.
    ids : list
        One entry per cluster; element 0 is the cluster id and element 2
        a list of dicts with an ``alt`` key.
    tree : object
        Provides ``get_clustered_motifs()``; only used when there is more
        than a single one-member cluster.
    clusters : list
        ``(cluster motif, members)`` pairs.
    """
    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    with open(os.path.join(outdir, "cluster_report.html"), "w") as f:
        f.write(result)

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for motif_id in ids:
            f.write("%s\t%s\n" % (motif_id[0], ",".join([x["alt"] for x in motif_id[2]])))

    # A single one-member cluster is written directly; in every other
    # case the clustered motifs come from the tree.
    if len(clusters) == 1 and len(clusters[0][1]) == 1:
        clustered = [clusters[0][0]]
    else:
        clustered = tree.get_clustered_motifs()

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        for motif in clustered:
            f.write("%s\n" % motif.to_pwm())
def __init__(self, scale=True, cv=3, ncpus=None):
    """Predict motif activities using lightning CDRegressor

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled before
        classification

    cv : int, optional, default 3
        Cross-validation k-fold parameter.

    ncpus : int, optional
        Number of threads. When not given, the ``ncpus`` entry of the
        config is used (2 if that entry is missing).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted coefficients

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than
        the 1%t from random permutation (not initialised here)
    """
    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

    # User-facing settings.
    self.scale = scale
    self.kfolds = cv
    self.ncpus = ncpus

    # Result placeholder.
    self.act_ = None

    # Static description of this predictor.
    self.act_description = ("activity values: coefficients from "
                            "fitted model")
    self.ptype = "regression"
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
def __init__(self, ncpus=None):
    """Predict motif activities using a random forest classifier

    Parameters
    ----------
    ncpus : int, optional
        Number of threads. When not given, the ``ncpus`` entry of the
        config is used (2 if that entry is missing).

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        feature importances from the model
    """
    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
    self.ncpus = ncpus

    # Result placeholder.
    self.act_ = None

    # Static description of this predictor.
    self.act_description = ("activity values: feature importances "
                            "from fitted Random Forest model")
    self.ptype = "classification"
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
def location(args):
    """
    Creates histogram of motif location.

    One ``motif_localization`` job is run per selected motif; the output
    name of each job is ``<motif id>_histogram``.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``fastafile``, ``pwmfile``,
        ``width``, ``ids`` (comma-separated motif ids, optional) and
        ``cutoff`` are read.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # No width given: use the length of the first sequence.
        lwidth = len(Fasta(fastafile).items()[0][1])

    motifs = pwmfile_to_motifs(pwmfile)
    if args.ids:
        wanted = set(args.ids.split(","))
        motifs = [motif for motif in motifs if motif.id in wanted]

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = [
            pool.apply_async(
                motif_localization,
                (fastafile, motif, lwidth, "%s_histogram" % motif.id,
                 args.cutoff),
            )
            for motif in motifs
        ]
        for job in jobs:
            job.get()
    finally:
        # Release the worker processes, also when a job raises.
        pool.terminate()
        pool.join()
class MotifProgram(object):
    """Base class for a wrapper around an external motif prediction tool.

    Subclasses are expected to provide a ``name`` attribute and a
    ``_run_program`` method; neither is defined here.
    """

    # Configuration shared by all tools.
    config = MotifConfig()
    # Set to override the executable listed in the configuration.
    local_bin = None

    def __init__(self):
        pass

    def bin(self):
        """Return the executable: ``local_bin`` if set, else the configured one."""
        return self.local_bin or self.config.bin(self.name)

    def dir(self):
        """Return what the configuration lists as directory of the tool."""
        return self.config.dir(self.name)

    def is_configured(self):
        """Return whether the configuration knows this tool."""
        return self.config.is_configured(self.name)

    def is_installed(self):
        """Return whether the tool is configured and its binary executable."""
        configured = self.is_configured()
        return configured and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, savedir, params=None, tmp=None):
        """Run the tool on a FASTA file.

        A temporary directory (``self.tmpdir``) is created below `tmp`
        before the tool is started.

        Returns whatever ``_run_program`` returns, or
        ``([], "Killed", "Killed")`` on a keyboard interrupt.

        Raises ``ValueError`` if the tool is not configured or not
        installed.
        """
        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            message = "%s is not installed or not correctly configured" % self.name
            raise ValueError(message)

        tmp_prefix = "{0}.".format(self.name)
        self.tmpdir = mkdtemp(prefix=tmp_prefix, dir=tmp)

        try:
            result = self._run_program(self.bin(), fastafile, savedir, params)
        except KeyboardInterrupt:
            result = ([], "Killed", "Killed")
        return result
def check_threshold(outdir, genome, scoring="count"):
    """Return the motif threshold file for a genome, creating it if needed.

    Parameters
    ----------
    outdir : str
        Directory for the threshold file and the background FASTA file.
    genome : str
        Genome name; part of both file names and of the index location.
    scoring : str, optional
        A threshold file is only relevant for ``"count"``.

    Returns
    -------
    str or None
        ``<outdir>/threshold.<genome>.txt`` for count scoring, ``None``
        for any other scoring.

    Raises
    ------
    ValueError
        If the threshold has to be computed but no default motif database
        is configured.
    """
    # gimme_motifs config, to get defaults
    config = MotifConfig()

    if scoring != "count":
        return None

    threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
    if os.path.exists(threshold_file):
        return threshold_file

    # Random sequences from the genome serve as background.
    index_dir = os.path.join(config.get_index_dir(), genome)
    bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
    if not os.path.exists(bg_file):
        m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
        m.writefasta(bg_file)

    pwmfile = config.get_default_params().get("motif_db")
    if pwmfile is None:
        raise ValueError("no default motif database specified")
    pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    # Argument list plus explicit redirection instead of a shell string:
    # paths containing spaces or shell metacharacters stay intact.
    cmd = ["gimme", "threshold", pwmfile, bg_file, str(FDR)]
    with open(threshold_file, "w") as out:
        sp.call(cmd, stdout=out)

    return threshold_file
def create_background_file(outfile, bg_type, fmt="fasta", size=None,
                           genome=None, inputfile=None, number=10000):
    """
    Create a background file for motif analysis.

    Parameters
    ----------
    outfile : str
        Name of the output file.
    bg_type : str
        Type of background (gc, genomic, random or promoter).
    fmt : str, optional
        Either 'fasta' or 'bed'.
    size : int, optional
        Size of the generated sequences, is determined from the inputfile
        if not given.
    genome : str, optional
        Genome; required for FASTA output of the gc, genomic and promoter
        backgrounds.
    inputfile : str, optional
        Input file; required for the gc background, and used to derive
        `size` and `number` when those are not given.
    number : int, optional
        Number of background sequences. ``None`` means: as many as there
        are sequences in `inputfile`, or 10,000 without an input file.

    Notes
    -----
    Invalid argument combinations and a missing gene file are reported on
    standard output and end the process with status 1.
    """
    fmt = fmt.lower()
    if fmt in ["fa", "fsa"]:
        fmt = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" %
              (",".join(BG_TYPES)))
        sys.exit(1)

    if fmt == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and fmt == "fasta":
        if genome is None:
            print("Need a genome to create background file")
            sys.exit(1)
        # Instantiated for its side effect only (presumably fails early
        # for an unknown genome -- confirm against Genome).
        Genome(genome)

    if bg_type in ["promoter"]:
        # Gene definition: prefer the annotation next to the genome FASTA
        # and fall back to <gene_dir>/<genome>.bed when it does not exist.
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(),
                                     "{}.bed".format(genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    if number is None:
        if inputfile:
            number = number_of_seqs_in_file(inputfile)
            logger.info("Using %s of background sequences based on input file",
                        number)
        else:
            number = 10000
            logger.info(
                "Number of background sequences not specified, using 10,000 sequences"
            )

    if bg_type == "random":
        m = MarkovFasta(Fasta(inputfile), n=number, k=1)
        m.writefasta(outfile)
    elif bg_type == "gc":
        if fmt == "fasta":
            m = MatchedGcFasta(inputfile, genome, number=number, size=size)
            m.writefasta(outfile)
        else:
            matched_gc_bedfile(outfile, inputfile, genome, number, size=size)
    else:
        if size is None:
            # np.median returns a float; sequence sizes must be integers.
            size = int(np.median(
                [len(seq) for seq in as_fasta(inputfile, genome=genome).seqs]))
        if bg_type == "promoter":
            if fmt == "fasta":
                m = PromoterFasta(gene_file, genome, size=size, n=number)
                m.writefasta(outfile)
            else:
                create_promoter_bedfile(outfile, gene_file, size, number)
        elif bg_type == "genomic":
            if fmt == "fasta":
                m = RandomGenomicFasta(genome, size, number)
                m.writefasta(outfile)
            else:
                create_random_genomic_bedfile(outfile, genome, size, number)
def cluster(args):
    """Cluster the motifs of a file and write images and a report.

    For every cluster an image of the cluster motif and aligned images of
    its members are written to the output directory, followed by
    ``cluster_report.html``, ``cluster_key.txt`` and
    ``clustered_motifs.pwm``.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``outdir``, ``inputfile``
        and ``threshold`` are read.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    trim_ic = 0.2

    tree = None
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(args.inputfile, "total", "wic", "mean", True,
                              threshold=args.threshold, include_bg=True)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for clus, members in clusters:
        clus.trim(trim_ic)
        clus.to_img(os.path.join(outdir, "%s.png" % clus.id), format="PNG")
        ids.append([clus.id, {"src": "%s.png" % clus.id}, []])

        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(
                    clus, motif, "total", "wic", "mean", pval=True)

            # Smallest position of any member; the offsets of all member
            # images are expressed relative to it.
            add_pos = min(score[1] for score in scores.values())

            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement, under the original id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc

                motif.to_img(
                    os.path.join(outdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG", add_left=add)

        ids[-1][2] = [
            {"src": "%s.png" % motif.id.replace(" ", "_"),
             "alt": motif.id.replace(" ", "_")}
            for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    # Binary mode: the report is written as UTF-8 encoded bytes, which
    # works on Python 2 and Python 3 alike.
    with open(os.path.join(outdir, "cluster_report.html"), "wb") as f:
        f.write(result.encode('utf-8'))

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for entry in ids:
            f.write("%s\t%s\n" % (entry[0], ",".join([x["alt"] for x in entry[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
def pp_predict_motifs(fastafile, outfile, analysis="small", organism="hg18",
                      single=False, background="", tools=None,
                      job_server=None, ncpus=8, max_time=None, stats_fg=None,
                      stats_bg=None):
    """Parallel prediction of motifs.

    Utility function for gimmemotifs.denovo.gimme_motifs. Probably better
    to use that, instead of this function directly.

    Parameters
    ----------
    fastafile : str
        Input FASTA file, handed to every tool.
    outfile : str
        Passed to ``PredictionResult``.
    analysis : str, optional
        One of ``xs``, ``small``, ``medium``, ``large`` or ``xl``; selects
        the range of motif widths. ``xs`` is passed to the tools as
        ``small``.
    organism : str, optional
        Passed to the tools as parameter.
    single : bool, optional
        Passed to the tools as parameter.
    background : str, optional
        Passed to the tools as parameter.
    tools : dict, optional
        Maps tool names to a truthy value for every tool to run. Defaults
        to the ``tools`` entry (comma-separated) of the configuration.
    job_server : pool, optional
        Object with ``apply_async``; the module-level ``pool`` is used
        when not given.
    ncpus : int, optional
        Not used by this function.
    max_time : optional
        Not used by this function; no running time limit is enforced.
    stats_fg, stats_bg : optional
        Passed to ``PredictionResult`` as ``fg_file`` and ``background``.

    Returns
    -------
    PredictionResult
        The result object that collected the motifs of all jobs.
    """
    if tools is None:
        tools = {}

    config = MotifConfig()

    if not tools:
        tools = dict(
            (x, 1) for x in config.get_default_params()["tools"].split(","))

    wmin = 5
    step = 1
    if analysis in ["large", "xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs": 5, "small": 8, "medium": 10, "large": 14, "xl": 20}
    wmax = analysis_max[analysis]

    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    if not job_server:
        job_server = pool

    jobs = {}

    result = PredictionResult(
        outfile,
        fg_file=stats_fg,
        background=stats_bg,
        job_server=job_server,
    )

    # Dynamically load all tools
    toolio = [
        x[1]() for x in inspect.getmembers(
            tool_classes,
            lambda x: inspect.isclass(x) and issubclass(
                x, tool_classes.MotifProgram))
        if x[0] != 'MotifProgram'
    ]

    # TODO:
    # Add warnings for running time: Weeder, GADEM

    ### Add all jobs to the job_server ###
    params = {
        'analysis': analysis,
        'background': background,
        "single": single,
        "organism": organism
    }

    # Tools that don't use a specified width usually take longer
    # ie. GADEM, XXmotif, MEME
    # Start these first.
    for t in [tool for tool in toolio if not tool.use_width]:
        if t.name in tools and tools[t.name]:
            logger.debug("Starting %s job", t.name)
            job_name = t.name
            jobs[job_name] = job_server.apply_async(
                _run_tool, (job_name, t, fastafile, params),
                callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    # Tools that need a width get one job per width.
    for t in [tool for tool in toolio if tool.use_width]:
        if t.name in tools and tools[t.name]:
            for i in range(wmin, wmax + 1, step):
                logger.debug("Starting %s job, width %s", t.name, i)
                job_name = "%s_width_%s" % (t.name, i)
                my_params = params.copy()
                my_params['width'] = i
                jobs[job_name] = job_server.apply_async(
                    _run_tool, (job_name, t, fastafile, my_params),
                    callback=result.add_motifs)
        else:
            logger.debug("Skipping %s", t.name)

    logger.info("all jobs submitted")
    for job in jobs.values():
        job.get()

    result.wait_for_stats()

    return result
class MotifProgram(object):
    """Base class for wrappers around external motif prediction programs.

    The methods below read ``self.name`` and ``self.default_params`` and call
    ``self._run_program``; none of these are defined in this class, so
    subclasses are expected to provide them.
    """

    # Configuration object shared by all tools; created once, when the class
    # body is executed.
    config = MotifConfig()
    # When set, this command is used instead of the configured binary.
    local_bin = None

    def _parse_params(self, params=None, needs_background=False):
        """Combine default and user-defined parameters.

        Parameters
        ----------
        params : dict, optional
            User-defined parameters; these override the tool defaults.
        needs_background : bool, optional
            If ``True``, a "background" entry must be present after merging.

        Returns
        -------
        prm : dict
            A copy of the default parameters updated with ``params``. If a
            "background" entry is present it is converted to an absolute path.

        Raises
        ------
        ValueError
            If ``needs_background`` is set and no background is specified.
        """
        prm = self.default_params.copy()
        if params is not None:
            prm.update(params)

        # Background file is essential!
        if "background" in prm:
            # Absolute path, just to be sure
            prm["background"] = os.path.abspath(prm["background"])
        elif needs_background:
            raise ValueError("Background file needed!")

        return prm

    def _read_and_label_motifs(self, outfile, stdout, stderr, fmt="meme"):
        """Read output motifs and label them with the program name.

        Parameters
        ----------
        outfile : str
            Name of the motif file written by the tool.
        stdout : str
            Standard output collected so far.
        stderr : str
            Standard error collected so far.
        fmt : str, optional
            Format of ``outfile``, passed on to ``read_motifs``.

        Returns
        -------
        motifs : list
            The motifs read from ``outfile``, each id prefixed with the tool
            name. Empty if ``outfile`` does not exist.
        stdout : str
            ``stdout``, with a note appended if ``outfile`` was not found.
        stderr : str
            ``stderr``, with a note appended if ``outfile`` was not found.
        """
        if not os.path.exists(outfile):
            # A missing output file is reported through the captured streams
            # instead of raising.
            stdout += "\nMotif file {0} not found!\n".format(outfile)
            stderr += "\nMotif file {0} not found!\n".format(outfile)
            return [], stdout, stderr

        # Honour the requested format instead of always assuming "meme".
        motifs = read_motifs(outfile, fmt=fmt)
        for m in motifs:
            m.id = "{0}_{1}".format(self.name, m.id)
        return motifs, stdout, stderr

    def bin(self):
        """
        Get the command used to run the tool.

        Returns
        -------
        command : str
            The tool system command. ``local_bin`` takes precedence over the
            configured command when it is set.
        """
        if self.local_bin:
            return self.local_bin
        else:
            return self.config.bin(self.name)

    def dir(self):
        """
        Get the installation directory of the tool.

        Returns
        -------
        dir : str
            The tool directory.
        """
        return self.config.dir(self.name)

    def is_configured(self):
        """
        Check if the tool is configured.

        Returns
        -------
        is_configured : bool
            True if the tool is configured.
        """
        return self.config.is_configured(self.name)

    def is_installed(self):
        """
        Check if the tool is installed.

        Returns
        -------
        is_installed : bool
            True if the tool is configured and its command is executable.
        """
        return self.is_configured() and os.access(self.bin(), os.X_OK)

    def run(self, fastafile, params=None, tmp=None):
        """
        Run the tool and predict motifs from a FASTA file.

        Parameters
        ----------
        fastafile : str
            Name of the FASTA input file.

        params : dict, optional
            Optional parameters. For some of the tools required parameters
            are passed using this dictionary.

        tmp : str, optional
            Directory to use for creation of temporary files.

        Returns
        -------
        motifs : list of Motif instances
            The predicted motifs.

        stdout : str
            Standard out of the tool.

        stderr : str
            Standard error of the tool.

        Raises
        ------
        ValueError
            If the tool is not configured or not installed.
        """
        if not self.is_configured():
            raise ValueError("%s is not configured" % self.name)

        if not self.is_installed():
            raise ValueError(
                "%s is not installed or not correctly configured" % self.name)

        # Working directory for this run, stored on the instance.
        self.tmpdir = mkdtemp(prefix="{0}.".format(self.name), dir=tmp)
        fastafile = os.path.abspath(fastafile)

        try:
            return self._run_program(self.bin(), fastafile, params)
        except KeyboardInterrupt:
            # An interrupt yields an empty result rather than propagating.
            return ([], "Killed", "Killed")
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan the regions listed in a table with motifs and tabulate the result.

    Parameters
    ----------
    input_table : str
        Filename of the input table. A name ending in "feather" is read as a
        feather file and its first column supplies the regions; anything else
        is read as a text table with the first column as index.
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.
    scoring : str
        "count" produces motif counts; any other value produces best scores.
    pfmfile : str, optional
        PFM file used for scanning. When omitted, the "motif_db" entry of the
        configuration is used, located in the configured motif directory.
    ncpus : int, optional
        If defined this specifies the number of cores to use.
    zscore : bool, optional
        Passed on to the scanner's ``best_score``; only used when ``scoring``
        is not "count".
    gc : bool, optional
        Passed on to the scanner's ``set_background`` and ``best_score``.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index.
        Values are either counts or scores depending on ``scoring``.

    Raises
    ------
    ValueError
        If no ``pfmfile`` is given and the configuration has no default.
    """
    config = MotifConfig()

    if pfmfile is None:
        default_db = config.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError("no pfmfile given and no default database specified")
        pfmfile = os.path.join(config.get_motif_dir(), default_db)

    logger.info("reading table")
    if input_table.endswith("feather"):
        table = pd.read_feather(input_table)
        region_index = table.iloc[:, 0].values
    else:
        table = pd.read_table(input_table, index_col=0, comment="#")
        region_index = table.index

    regions = list(region_index)

    # The region size is estimated from a sample of at most 1000 regions.
    if len(regions) < 1000:
        sample = regions
    else:
        sample = np.random.choice(regions, size=1000, replace=False)
    seq_lengths = [len(seq) for seq in as_fasta(sample, genome=genome).seqs]
    size = int(np.median(seq_lengths))

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pfmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome, gc=gc, size=size)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(scanner.count(regions))
    else:
        scanner.set_threshold(threshold=0.0)
        if zscore:
            detail = " (z-score" + (", GC%" if gc else "") + ")"
        else:
            detail = " (logodds)"
        logger.info("creating score table" + detail)
        scores = list(scanner.best_score(regions, zscore=zscore, gc=gc))
    logger.info("done")

    motif_names = [motif.id for motif in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=region_index, columns=motif_names)
def create_background(
    bg_type,
    fafile,
    outfile,
    genome="hg18",
    size=200,
    nr_times=10,
    custom_background=None,
):
    """Create background of a specific type and write it to a FASTA file.

    Parameters
    ----------
    bg_type : str
        Name of background type: "random", "genomic", "gc", "promoter" or
        "custom".

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name. Required for the "genomic" and "gc" types.

    size : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this times as many background sequences as compared to
        input file.

    custom_background : str, optional
        Name of an existing FASTA file to use as background. Required when
        ``bg_type`` is "custom".

    Returns
    -------
    nr_seqs : int
        Number of sequences created.

    Raises
    ------
    ValueError
        If ``bg_type`` is not one of the supported types, or if no gene file
        can be found for a "promoter" background.
    IOError
        If a "custom" background is requested but the file is not specified
        or does not exist.
    """
    size = int(size)
    config = MotifConfig()
    fg = Fasta(fafile)

    if bg_type in ["genomic", "gc"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    # Number of background sequences requested for the generated types.
    nr_seqs = nr_times * len(fg)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_seqs)
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, size, nr_seqs)
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_seqs)
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        # Fall back to the configured gene directory when there is no
        # annotation file next to the genome FASTA.
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genome)
        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            raise ValueError(
                "Could not find a gene file for genome {}".format(genome))

        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome,
            gene_file,
        )
        f = PromoterFasta(gene_file, genome, size, nr_seqs)
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")

        if not os.path.exists(bg_file):
            # Format the message here: a second positional argument to
            # IOError is not interpolated into the first.
            raise IOError(
                "Custom background file %s does not exist!" % bg_file)

        logger.info("Copying custom background file %s to %s.", bg_file, outfile)
        f = Fasta(bg_file)
        median_length = np.median([len(seq) for seq in f.seqs])
        # Warn when the median length deviates more than 5% from `size`.
        if median_length < (size * 0.95) or median_length > (size * 1.05):
            logger.warning(
                "The custom background file %s contains sequences with a "
                "median size of %s, while GimmeMotifs predicts motifs in sequences "
                "of size %s. This will influence the statistics! It is recommended "
                "to use background sequences of the same size.",
                bg_file,
                median_length,
                size,
            )
    else:
        raise ValueError("Unknown background type: {}".format(bg_type))

    f.writefasta(outfile)
    return len(f)