def scan(self, seqs, nreport=100, scan_rc=True, normalize=False):
    """
    Scan a set of regions / sequences and yield the matches per sequence.

    Parameters
    ----------
    seqs : object
        Input handed to ``as_fasta`` together with ``self.genome``.
    nreport : int, optional
        Forwarded unchanged to ``self._scan_sequences``.
    scan_rc : bool, optional
        Forwarded unchanged to ``self._scan_sequences``.
    normalize : bool, optional
        When true, the first element of every match is rescaled to
        ``(score - mean) / std`` using the per-motif entries of
        ``self.meanstd``.

    Yields
    ------
    list
        One item per scanned sequence: a list with one entry per motif,
        each entry being a list of match tuples.
    """
    # Fall back to a default threshold when none has been configured.
    if not self.threshold:
        sys.stderr.write(
            "Using default threshold of 0.95. This is likely not optimal!\n"
        )
        self.set_threshold(threshold=0.95)

    fasta = as_fasta(seqs, genome=self.genome)
    hits_per_seq = self._scan_sequences(fasta.seqs, nreport, scan_rc)

    # Plain scan: hand the raw results through untouched.
    if not normalize:
        for hits in hits_per_seq:
            yield hits
        return

    # Mean / std statistics are computed on demand the first time they
    # are needed.
    if len(self.meanstd) == 0:
        self.set_meanstd()

    stats = [self.meanstd.get(motif_id) for motif_id in self.motif_ids]
    means = [stat[0] for stat in stats]
    stds = [stat[1] for stat in stats]

    for hits in hits_per_seq:
        # Row ``idx`` of a result corresponds to ``self.motif_ids[idx]``.
        yield [
            [
                ((match[0] - means[idx]) / stds[idx], match[1], match[2])
                for match in motif_hits
            ]
            for idx, motif_hits in enumerate(hits)
        ]
def command_scan(inputfile, pwmfile, nreport=1, fpr=0.01, cutoff=None,
                 bed=False, scan_rc=True, table=False, score_table=False,
                 moods=False, pvalue=None, bgfile=None, genome=None,
                 ncpus=None, normalize=False):
    """
    Scan ``inputfile`` with the motifs in ``pwmfile`` and yield output rows.

    A ``Scanner`` is configured from the arguments and the actual work is
    delegated to one of three helpers, chosen in this order:

    * ``table`` is true        -> ``scan_table``
    * ``score_table`` is true  -> ``scan_score_table``
    * otherwise                -> ``scan_normal``

    Every row produced by the selected helper is yielded unchanged.
    """
    motifs = read_motifs(pwmfile)
    fa = as_fasta(inputfile, genome)

    # Set up the scanner.
    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pwmfile)

    if genome:
        scanner.set_genome(genome=genome)

    # A background is only configured when there is a source for it.
    if genome or bgfile:
        scanner.set_background(
            genome=genome,
            fname=bgfile,
            length=fa.median_length(),
        )

    # The score table path does not set a threshold.
    if not score_table:
        scanner.set_threshold(fpr=fpr, threshold=cutoff)

    if table:
        rows = scan_table(
            scanner, inputfile, fa, motifs, cutoff, bgfile,
            nreport, scan_rc, pvalue, moods,
        )
    elif score_table:
        rows = scan_score_table(
            scanner, fa, motifs, scan_rc, normalize=normalize
        )
    else:
        rows = scan_normal(
            scanner, inputfile, fa, motifs, cutoff, bgfile,
            nreport, scan_rc, pvalue, moods, bed, normalize=normalize,
        )

    for row in rows:
        yield row
def get_PWMScore(self, fin_regions_fa):
    """
    Scan every region with every motif and write the scores to a file.

    The regions are processed in chunks of 10000. For each chunk the best
    GC-normalised z-score per (motif, region) pair is collected, a
    rank-based min-max scaled column is added, and the chunk is appended
    to a tab-separated temporary file (header written for the first chunk
    only).

    Arguments:
        fin_regions_fa -- input handed to ``as_fasta`` (together with
            ``self.genome``) to obtain the region identifiers

    Returns:
        str -- path of the temporary file holding the score table; the
            file is created with ``delete=False`` so it outlives this call
    """
    # The context manager guarantees the handle is flushed and closed
    # before the file name is handed to the caller, even on errors.
    with NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False) as pfmscorefile:
        # Keep only the part of each identifier before the first space.
        seqs = [
            s.split(" ")[0]
            for s in as_fasta(fin_regions_fa, genome=self.genome).ids
        ]

        s = Scanner(ncpus=self.ncore)
        s.set_motifs(self.pfmfile)
        s.set_threshold(threshold=0.0)
        s.set_genome(self.genome)

        with open(self.pfmfile) as f:
            motifs = read_motifs(f)

        # Regions are scanned 10k at a time.
        chunksize = 10000

        with tqdm(total=len(seqs)) as pbar:
            for chunk in range(0, len(seqs), chunksize):
                chunk_seqs = seqs[chunk : chunk + chunksize]
                pfm_score = []
                # GC-normalised z-scores are used because many sequences
                # are GC-enriched.
                it = s.best_score(chunk_seqs, zscore=True, gc=True)
                for seq, scores in zip(chunk_seqs, it):
                    for motif, score in zip(motifs, scores):
                        pfm_score.append([motif.id, seq, score])
                    # One tick per region; the bar total is len(seqs).
                    pbar.update(1)

                pfm_score = pd.DataFrame(
                    pfm_score, columns=["motif", "enhancer", "zscore"]
                )
                pfm_score = pfm_score.set_index("motif")

                # Rank + min-max normalisation, matching what was used
                # when the model was built.
                # NOTE(review): the ranking is computed per chunk, not
                # over the complete input - confirm this is intended.
                pfm_score["zscoreRank"] = minmax_scale(
                    rankdata(pfm_score["zscore"])
                )

                cols = ["enhancer", "zscore", "zscoreRank"]
                # Only the very first chunk carries the column header.
                write_header = chunk == 0
                pfm_score[cols].to_csv(
                    pfmscorefile, sep="\t", header=write_header
                )

    return pfmscorefile.name
def scan(self, seqs, nreport=100, scan_rc=True):
    """
    Scan a set of regions / sequences and yield the result per sequence.

    ``seqs`` is converted with ``as_fasta`` (using ``self.genome``) and
    the resulting sequences are passed to ``self._scan_sequences`` along
    with ``nreport`` and ``scan_rc``. Each result is yielded unchanged.
    """
    # No threshold configured yet: warn on stderr and use the default.
    if not self.threshold:
        sys.stderr.write(
            "Using default threshold of 0.95. This is likely not optimal!\n"
        )
        self.set_threshold(threshold=0.95)

    sequences = as_fasta(seqs, genome=self.genome).seqs

    for hits in self._scan_sequences(sequences, nreport, scan_rc):
        yield hits
def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False):
    """
    Scan a set of regions or sequences.

    Parameters
    ----------
    seqs : object
        Input handed to ``as_fasta`` together with ``self.genome``.
    nreport : int, optional
        Forwarded unchanged to ``self._scan_sequences``.
    scan_rc : bool, optional
        Forwarded unchanged to ``self._scan_sequences``.
    zscore : bool, optional
        When true, the first element of every match is converted to
        ``(score - mean) / std`` with the values returned by
        ``self.get_motif_mean_std`` for the sequence's bin and the motif.
    gc : bool, optional
        Passed to ``self.set_meanstd`` and decides which size of
        ``self.meanstd`` counts as "already computed".

    Yields
    ------
    list
        One item per scanned sequence, one entry per motif.
    """
    if not self.threshold:
        logger.info(
            "Using default threshold of 0.95. This is likely not optimal!"
        )
        self.set_threshold(threshold=0.95)

    fasta = as_fasta(seqs, genome=self.genome)
    hits_per_seq = self._scan_sequences(fasta.seqs, nreport, scan_rc)

    if zscore:
        n_stats = len(self.meanstd)
        # With gc, at most one entry means the statistics still have to
        # be computed; without gc exactly one entry is expected.
        stats_missing = (n_stats <= 1) if gc else (n_stats != 1)
        if stats_missing:
            self.set_meanstd(gc=gc)

    # The bin of every sequence is looked up regardless of ``zscore``.
    seq_bins = [self.get_seq_bin(seq) for seq in fasta.seqs]

    logger.debug("Scanning")
    for hits, seq_bin in zip(hits_per_seq, seq_bins):
        if not zscore:
            yield hits
            continue

        scaled = []
        for idx, motif_hits in enumerate(hits):
            try:
                mean, std = self.get_motif_mean_std(
                    seq_bin, self.motif_ids[idx]
                )
            except Exception:
                # Dump the lookup state before propagating the error.
                print(self.meanstd)
                print(seq_bin, self.motif_ids[idx])
                raise
            scaled.append(
                [((hit[0] - mean) / std, hit[1], hit[2]) for hit in motif_hits]
            )
        yield scaled
def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False):
    """
    Scan a set of regions or sequences in batches of 10000.

    ``seqs`` is converted with ``as_fasta`` (using ``self.genome``). The
    sequences are then handed to ``self._scan_sequences`` one slice at a
    time, together with ``nreport``, ``scan_rc`` and ``zscore``; every
    result is yielded unchanged.

    When ``zscore`` is true, ``self.set_meanstd(gc=gc)`` is called first
    if ``self.meanstd`` does not yet have the expected number of entries
    (more than one with ``gc``, exactly one without).
    """
    fasta = as_fasta(seqs, genome=self.genome)

    if zscore:
        n_stats = len(self.meanstd)
        stats_missing = (n_stats <= 1) if gc else (n_stats != 1)
        if stats_missing:
            self.set_meanstd(gc=gc)

    batch_size = 10000
    logger.debug("Scanning")

    total = len(fasta)
    start = 0
    while start < total:
        stop = start + batch_size
        batch_hits = self._scan_sequences(
            fasta.seqs[start:stop],
            nreport,
            scan_rc,
            zscore=zscore,
        )
        for hits in batch_hits:
            yield hits
        start = stop
def create_background_file(outfile, bg_type, fmt="fasta", size=None,
                           genome=None, inputfile=None, number=10000):
    """
    Create a background file for motif analysis.

    Invalid argument combinations are reported with ``print`` and end the
    process via ``sys.exit(1)``.

    Parameters
    ----------
    outfile : str
        Name of the output file.
    bg_type : str
        Type of background (gc, genomic, random or promoter).
    fmt : str, optional
        Either 'fasta' or 'bed'. 'fa' and 'fsa' are accepted as aliases
        for 'fasta'; the value is compared case-insensitively.
    size : int, optional
        Size of the generated sequences, is determined from the inputfile
        (median sequence length) if not given. Only the genomic and
        promoter backgrounds derive it this way.
    genome : str, optional
        Genome name; required for gc, genomic and promoter backgrounds in
        FASTA format.
    inputfile : str, optional
        Input sequences; required for the gc background.
    number : int, optional
        Number of background sequences. When ``None`` it is taken from
        the number of sequences in ``inputfile`` if given, else 10000.
    """
    fmt = fmt.lower()
    if fmt in ["fa", "fsa"]:
        fmt = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" % (",".join(BG_TYPES)))
        sys.exit(1)

    if fmt == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and fmt == "fasta":
        if genome is None:
            print("Need a genome to create background file")
            sys.exit(1)
        # Instantiated for its side effects only; the result is unused.
        Genome(genome)

    if bg_type in ["promoter"]:
        # Gene definition: first try the annotation next to the genome
        # FASTA file.
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        # Fall back to <gene_dir>/<genome>.bed when that annotation is
        # absent. (Testing the string itself for emptiness can never
        # trigger this fallback, because the name is always non-empty.)
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(), "{}.bed".format(genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    if number is None:
        if inputfile:
            number = number_of_seqs_in_file(inputfile)
            logger.info("Using %s of background sequences based on input file",
                        number)
        else:
            number = 10000
            logger.info(
                "Number of background sequences not specified, using 10,000 sequences"
            )

    if bg_type == "random":
        f = Fasta(inputfile)
        m = MarkovFasta(f, n=number, k=1)
        m.writefasta(outfile)
    elif bg_type == "gc":
        if fmt == "fasta":
            m = MatchedGcFasta(inputfile, genome, number=number, size=size)
            m.writefasta(outfile)
        else:
            matched_gc_bedfile(outfile, inputfile, genome, number, size=size)
    else:
        # Promoter and genomic backgrounds need an explicit size; derive
        # it from the input sequences when the caller gave none.
        if size is None:
            size = np.median(
                [len(seq) for seq in as_fasta(inputfile, genome=genome).seqs])
        if bg_type == "promoter":
            if fmt == "fasta":
                m = PromoterFasta(gene_file, genome, size=size, n=number)
                m.writefasta(outfile)
            else:
                create_promoter_bedfile(outfile, gene_file, size, number)
        elif bg_type == "genomic":
            if fmt == "fasta":
                m = RandomGenomicFasta(genome, size, number)
                m.writefasta(outfile)
            else:
                create_random_genomic_bedfile(outfile, genome, size, number)
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score". Any value other than "count" produces scores.

    pfmfile : str, optional
        Specify a PFM file for scanning. Defaults to the configured
        "motif_db" inside the configured motif directory.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    zscore : bool, optional
        Passed to ``Scanner.best_score`` when scoring; also selects the
        wording of the progress message.

    gc : bool, optional
        Passed to ``Scanner.set_background`` and ``Scanner.best_score``.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.

    Raises
    ------
    ValueError
        If no ``pfmfile`` is given and no default database is configured.
    """
    config = MotifConfig()

    # Resolve the motif database when the caller did not supply one.
    if pfmfile is None:
        default_db = config.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError("no pfmfile given and no default database specified")
        pfmfile = os.path.join(config.get_motif_dir(), default_db)

    logger.info("reading table")
    if input_table.endswith("feather"):
        # Feather input: the region names are taken from the first column.
        table = pd.read_feather(input_table)
        idx = table.iloc[:, 0].values
    else:
        table = pd.read_table(input_table, index_col=0, comment="#")
        idx = table.index

    regions = list(idx)

    # The region size is estimated from a sample of at most 1000 regions.
    if len(regions) < 1000:
        check_regions = regions
    else:
        check_regions = np.random.choice(regions, size=1000, replace=False)

    lengths = [len(seq) for seq in as_fasta(check_regions, genome=genome).seqs]
    size = int(np.median(lengths))

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pfmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome, gc=gc, size=size)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(scanner.count(regions))
        logger.info("done")
    else:
        scanner.set_threshold(threshold=0.0)
        if zscore:
            detail = " (z-score" + (", GC%" if gc else "") + ")"
        else:
            detail = " (logodds)"
        logger.info("creating score table" + detail)
        scores = list(scanner.best_score(regions, zscore=zscore, gc=gc))
        logger.info("done")

    motif_names = [motif.id for motif in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def set_background(self, fname=None, genome=None, size=200, nseq=10000,
                   gc=False, gc_bins=None):
    """Set the background to use for FPR and z-score calculations.

    Background can be specified either as a genome name or as the
    name of a FASTA file. Nothing happens when a background has already
    been set on this object.

    Parameters
    ----------
    fname : str, optional
        Name of FASTA file to use as background.

    genome : str, optional
        Name of genome to use to retrieve random sequences. Falls back
        to ``self.genome`` when neither ``fname`` nor ``genome`` is given.

    size : int, optional
        Size of genomic sequences to retrieve. The default is 200.

    nseq : int, optional
        Number of genomic sequences to retrieve.

    gc : bool, optional
        When true, the background is generated with ``gc_bin_bedfile``
        using ``gc_bins``; otherwise ``RandomGenomicFasta`` is used.

    gc_bins : list of tuple, optional
        Bins passed to ``gc_bin_bedfile``. When ``gc`` is true and no bins
        are given, the default is (0.0, 0.2), (0.8, 1) plus 0.05-wide
        bins starting at 0.2.

    Raises
    ------
    ValueError
        If both ``genome`` and ``fname`` are given, or if neither is given
        and ``self.genome`` is not set.
    IOError
        If ``fname`` does not exist.
    """
    if self.background:
        return

    size = int(size)

    if genome and fname:
        raise ValueError("Need either genome or filename for background.")

    if fname:
        if not os.path.exists(fname):
            raise IOError(
                "Background file {} does not exist!".format(fname))

        self.background = Fasta(fname)
        self.background_hash = file_checksum(fname)
        return

    if not genome:
        if self.genome:
            genome = self.genome
        else:
            raise ValueError(
                "Need either genome or filename for background.")

    logger.debug("using background: genome {} with size {}".format(
        genome, size))

    lock.acquire()
    try:
        # Everything between acquire and release can raise (cache access,
        # background generation); the finally clause makes sure the lock
        # is never left held in that case.
        with Cache(CACHE_DIR) as cache:
            # The cache key is built from the arguments as passed in,
            # i.e. before any default bins are filled in below.
            self.background_hash = "d{}:{}:{}:{}".format(
                genome, int(size), gc, str(gc_bins))
            c = cache.get(self.background_hash)
            if c:
                fa, gc_bins = c
            else:
                fa = None

            if not fa:
                if gc:
                    if gc_bins is None:
                        gc_bins = [(0.0, 0.2), (0.8, 1)]
                        for b in np.arange(0.2, 0.799, 0.05):
                            gc_bins.append((b, b + 0.05))

                    with NamedTemporaryFile() as tmp:
                        logger.info("using {} sequences".format(nseq))
                        gc_bin_bedfile(
                            tmp.name, genome, number=nseq, length=size,
                            bins=gc_bins)
                        fa = as_fasta(tmp.name, genome=genome)
                else:
                    fa = RandomGenomicFasta(genome, size, nseq)
                cache.set(self.background_hash, (fa, gc_bins))
    finally:
        lock.release()

    self.background = fa
    if gc_bins:
        self.gc_bins = gc_bins
def command_scan(inputfile, pwmfile, nreport=1, cutoff=0.9, bed=False,
                 scan_rc=True, table=False, score_table=False, moods=False,
                 pvalue=None, bgfile=None, genome=None):
    """
    Scan ``inputfile`` with the motifs in ``pwmfile`` and yield text lines.

    Three output modes exist, checked in this order:

    * ``table``: a header line with the motif ids, then one tab-separated
      line of counts per sequence (from ``scan_it_moods`` when ``moods``
      is set, otherwise from ``Scanner.count``).
    * ``score_table``: a header line with the motif ids, then one
      tab-separated line of ``Scanner.best_score`` values per sequence.
    * otherwise: one line per match, produced by ``format_line`` (the
      ``bed`` flag is forwarded to it).

    The result iterator is created only inside the branch that consumes
    it, so no scan is started and then thrown away.
    """
    motifs = pwmfile_to_motifs(pwmfile)

    index_dir = None
    if genome is not None:
        index_dir = os.path.join(MotifConfig().get_index_dir(), genome)

    # initialize scanner
    s = Scanner()
    s.set_motifs(pwmfile)

    fa = as_fasta(inputfile, index_dir)

    if table:
        # header
        yield "\t{}".format("\t".join([m.id for m in motifs]))

        if moods:
            result_it = scan_it_moods(inputfile, motifs, cutoff, bgfile,
                                      nreport, scan_rc, pvalue, table)
            for seq_id, counts in result_it:
                yield "{}\t{}".format(
                    seq_id, "\t".join([str(x) for x in counts]))
        else:
            result_it = s.count(fa, nreport, scan_rc, cutoff)
            # counts table; results are matched to ids by position
            for i, counts in enumerate(result_it):
                yield "{}\t{}".format(
                    fa.ids[i],
                    "\t".join([str(x) for x in counts])
                )

    elif score_table:
        result_it = s.best_score(fa, scan_rc)
        # header
        yield "\t{}".format("\t".join([m.id for m in motifs]))
        # score table; results are matched to ids by position
        for i, scores in enumerate(result_it):
            yield "{}\t{}".format(
                fa.ids[i],
                "\t".join([str(x) for x in scores])
            )

    else:
        if moods:
            result_it = scan_it_moods(inputfile, motifs, cutoff, bgfile,
                                      nreport, scan_rc, pvalue, table)
            # Here matches are unpacked as (pos, score, strand).
            for motif, d in result_it:
                for seq_id, matches in d.items():
                    for pos, score, strand in matches:
                        yield format_line(fa, seq_id, motif,
                                          score, pos, strand, bed=bed)
        else:
            result_it = s.scan(fa, nreport, scan_rc, cutoff)
            # Here matches are unpacked as (score, pos, strand).
            for i, result in enumerate(result_it):
                seq_id = fa.ids[i]
                for motif, matches in zip(motifs, result):
                    for (score, pos, strand) in matches:
                        yield format_line(fa, seq_id, motif,
                                          score, pos, strand, bed=bed)