def check_threshold(outdir, genome, scoring="count"):
    """Return the motif scanning threshold file for a genome, creating it if needed.

    A threshold file is only used when ``scoring`` is ``"count"``. If the file
    ``threshold.<genome>.txt`` does not exist yet in ``outdir`` it is generated
    by running ``gimme threshold`` on a background FASTA file
    (``background.<genome>.fa`` in ``outdir``), which is itself created from
    random genomic sequences when missing.

    Parameters
    ----------
    outdir : str
        Directory in which the threshold and background files are stored.
    genome : str
        Genome name; used in the file names and to locate the genome index.
    scoring : str, optional
        Scoring method. Anything other than "count" skips threshold creation.

    Returns
    -------
    str or None
        Path of the threshold file, or None when ``scoring`` is not "count".
    """
    # gimme_motifs config, to get defaults
    config = MotifConfig()

    if scoring != "count":
        return None

    # Motif scanning threshold
    threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
    if os.path.exists(threshold_file):
        return threshold_file

    # Random sequences from genome
    index_dir = os.path.join(config.get_index_dir(), genome)
    bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
    if not os.path.exists(bg_file):
        m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
        m.writefasta(bg_file)

    pwmfile = config.get_default_params().get("motif_db")
    pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    # Pass the arguments as a list and redirect stdout ourselves instead of
    # building a shell string: paths containing spaces or shell metacharacters
    # are then handed to the program unchanged.
    cmd = ["gimme", "threshold", pwmfile, bg_file, str(FDR)]
    with open(threshold_file, "w") as out:
        sp.call(cmd, stdout=out)

    return threshold_file
def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
    """Create a background FASTA file of the requested type.

    Parameters
    ----------
    bg_type : str
        Background type: "random", "genomic", "gc", "promoter" or "user".
    bedfile : str
        Not used by this method.
    fafile : str
        FASTA file with the input sequences; its number of sequences
        determines how many background sequences are generated.
    outfile : str
        Name of the FASTA file the background is written to.
    organism : str, optional
        Organism name, used to locate the genome index and gene file.
    width : int, optional
        Width of the generated sequences ("genomic" and "promoter"); for
        "user" backgrounds the median sequence length is compared to it.
    nr_times : int, optional
        Generate this many times the number of input sequences.

    Returns
    -------
    int or None
        Number of background sequences written, or None when ``bg_type`` is
        not one of the recognised types.
    """
    fg = Fasta(fafile)
    nr_seqs = nr_times * len(fg)

    if bg_type == "random":
        markov_order = int(self.markov_model)
        if markov_order >= 6:
            self.logger.warning("Are you sure about the Markov model? It seems too high!")
        else:
            ordinals = {1: "1st", 2: "2nd", 3: "3rd", 4: "4th", 5: "5th"}
            # Fall back to a generic suffix so an order outside 1-5 (e.g. 0)
            # cannot raise a KeyError merely while composing a debug message.
            order = ordinals.get(markov_order, "%dth" % markov_order)
            self.logger.debug("Creating random background (%s order Markov)", order)

        m = MarkovFasta(fg, k=markov_order, n=nr_seqs)
        m.writefasta(outfile)
        self.logger.debug("Random background: %s", outfile)
        # return the number of random sequences created
        return len(m)

    elif bg_type == "genomic":
        self.logger.debug("Creating genomic background")
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        f = RandomGenomicFasta(index_dir, width, nr_seqs)
        f.writefasta(outfile)
        return len(f)

    elif bg_type == "gc":
        self.logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, organism, nr_seqs)
        f.writefasta(outfile)
        self.logger.debug("GC matched background: %s", outfile)
        return len(f)

    elif bg_type == "promoter":
        gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            organism, gene_file)
        f = PromoterFasta(gene_file, index_dir, width, nr_seqs)
        f.writefasta(outfile)
        self.logger.debug("Random promoter background: %s", outfile)
        return len(f)

    elif bg_type == "user":
        bg_file = self.params["user_background"]
        if not os.path.exists(bg_file):
            self.logger.error(
                "User-specified background file %s does not exist!", bg_file)
            sys.exit(1)

        self.logger.info(
            "Copying user-specified background file %s to %s.",
            bg_file, outfile)
        fa = Fasta(bg_file)
        median_length = median([len(seq) for seq in fa.seqs])
        # Warn when the median length deviates more than 5% from the width.
        if median_length < width * 0.95 or median_length > width * 1.05:
            self.logger.warning(
                "The user-specified background file %s contains sequences "
                "with a median length of %s, while GimmeMotifs predicts "
                "motifs in sequences of length %s. This will influence the "
                "statistics! It is recommended to use background sequences "
                "of the same length.",
                bg_file, median_length, width)
        fa.writefasta(outfile)
        return len(fa)
def set_background(self, fname=None, genome=None, length=200, nseq=10000):
    """Set the background to use for FPR and z-score calculations.

    Background can be specified either as a genome name or as the
    name of a FASTA file. When neither is given, ``self.genome`` is used
    if it is set.

    Parameters
    ----------
    fname : str, optional
        Name of FASTA file to use as background.

    genome : str, optional
        Name of genome to use to retrieve random sequences.

    length : int, optional
        Length of genomic sequences to retrieve. The default
        is 200.

    nseq : int, optional
        Number of genomic sequences to retrieve.

    Raises
    ------
    ValueError
        If both ``genome`` and ``fname`` are given, or if neither is given
        and ``self.genome`` is not set.
    IOError
        If ``fname`` does not exist.
    """
    length = int(length)

    if genome and fname:
        raise ValueError("Need either genome or filename for background.")

    if fname:
        if not os.path.exists(fname):
            raise IOError(
                "Background file {} does not exist!".format(fname))

        self.background = Fasta(fname)
        self.background_hash = file_checksum(fname)
        return

    if not genome:
        if not self.genome:
            raise ValueError(
                "Need either genome or filename for background.")
        genome = self.genome
        logger.info(
            "Using default background: genome {} with length {}".format(
                genome, length))

    logger.info("Using background: genome {} with length {}".format(
        genome, length))

    with Cache(CACHE_DIR) as cache:
        # The separator is a literal backslash. It is written as "\\" because
        # "\{" is an invalid escape sequence; the resulting key is unchanged,
        # so existing cache entries stay valid.
        self.background_hash = "{}\\{}".format(genome, int(length))
        fa = cache.get(self.background_hash)
        if not fa:
            fa = RandomGenomicFasta(genome, length, nseq)
            cache.set(self.background_hash, fa)

    self.background = fa
def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
    """Create a background FASTA file of the requested type.

    Parameters
    ----------
    bg_type : str
        Background type: "random", "genomic", "gc", "promoter" or "user".
    bedfile : str
        Not used by this method.
    fafile : str
        FASTA file with the input sequences; its number of sequences
        determines how many background sequences are generated.
    outfile : str
        Name of the FASTA file the background is written to.
    organism : str, optional
        Organism name, used to locate the genome index and gene file.
    width : int, optional
        Width of the generated sequences ("genomic" and "promoter"); for
        "user" backgrounds the median sequence length is compared to it.
    nr_times : int, optional
        Generate this many times the number of input sequences.

    Returns
    -------
    int or None
        Number of background sequences written, or None when ``bg_type`` is
        not one of the recognised types.
    """
    fg = Fasta(fafile)
    nr_seqs = nr_times * len(fg)

    if bg_type == "random":
        markov_order = int(self.markov_model)
        if markov_order >= 6:
            self.logger.warning(
                "Are you sure about the Markov model? It seems too high!")
        else:
            ordinals = {1: "1st", 2: "2nd", 3: "3rd", 4: "4th", 5: "5th"}
            # Fall back to a generic suffix so an order outside 1-5 (e.g. 0)
            # cannot raise a KeyError merely while composing a debug message.
            order = ordinals.get(markov_order, "%dth" % markov_order)
            self.logger.debug(
                "Creating random background (%s order Markov)", order)

        m = MarkovFasta(fg, k=markov_order, n=nr_seqs)
        m.writefasta(outfile)
        self.logger.debug("Random background: %s", outfile)
        # return the number of random sequences created
        return len(m)

    elif bg_type == "genomic":
        self.logger.debug("Creating genomic background")
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        f = RandomGenomicFasta(index_dir, width, nr_seqs)
        f.writefasta(outfile)
        return len(f)

    elif bg_type == "gc":
        self.logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, organism, nr_seqs)
        f.writefasta(outfile)
        self.logger.debug("GC matched background: %s", outfile)
        return len(f)

    elif bg_type == "promoter":
        gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            organism, gene_file)
        f = PromoterFasta(gene_file, index_dir, width, nr_seqs)
        f.writefasta(outfile)
        self.logger.debug("Random promoter background: %s", outfile)
        return len(f)

    elif bg_type == "user":
        bg_file = self.params["user_background"]
        if not os.path.exists(bg_file):
            self.logger.error(
                "User-specified background file %s does not exist!", bg_file)
            sys.exit(1)

        self.logger.info(
            "Copying user-specified background file %s to %s.",
            bg_file, outfile)
        fa = Fasta(bg_file)
        median_length = median([len(seq) for seq in fa.seqs])
        # Warn when the median length deviates more than 5% from the width.
        if median_length < width * 0.95 or median_length > width * 1.05:
            self.logger.warning(
                "The user-specified background file %s contains sequences "
                "with a median length of %s, while GimmeMotifs predicts "
                "motifs in sequences of length %s. This will influence the "
                "statistics! It is recommended to use background sequences "
                "of the same length.",
                bg_file, median_length, width)
        fa.writefasta(outfile)
        return len(fa)
def create_background(
    bg_type,
    fafile,
    outfile,
    genome="hg18",
    size=200,
    nr_times=10,
    custom_background=None,
):
    """Create background of a specific type.

    Parameters
    ----------
    bg_type : str
        Name of background type: "random", "genomic", "gc", "promoter" or
        "custom".

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name.

    size : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this times as many background sequences as compared to
        input file.

    custom_background : str, optional
        Name of a FASTA file to use as background; required when
        ``bg_type`` is "custom".

    Returns
    -------
    nr_seqs : int
        Number of sequences created.

    Raises
    ------
    ValueError
        If no gene file can be found for a "promoter" background, or if
        ``bg_type`` is not a recognised background type.
    IOError
        If a "custom" background is requested but the file is not specified
        or does not exist.
    """
    size = int(size)
    config = MotifConfig()
    fg = Fasta(fafile)
    nr_seqs = nr_times * len(fg)

    if bg_type in ["genomic", "gc"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_seqs)
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, size, nr_seqs)
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_seqs)
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        # str.replace() never yields an empty string for a real path, so test
        # whether the annotation file exists before trying the gene directory.
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genome)
        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            raise ValueError(
                "Could not find a gene file for genome {}".format(genome))

        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome,
            gene_file,
        )
        f = PromoterFasta(gene_file, genome, size, nr_seqs)
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")

        if not os.path.exists(bg_file):
            # Format the message here: exceptions do not apply %-formatting
            # to their arguments the way logging calls do.
            raise IOError(
                "Custom background file %s does not exist!" % bg_file)

        logger.info("Copying custom background file %s to %s.", bg_file,
                    outfile)
        f = Fasta(bg_file)
        median_length = np.median([len(seq) for seq in f.seqs])
        # Warn when the median length deviates more than 5% from the size.
        if median_length < (size * 0.95) or median_length > (size * 1.05):
            logger.warning(
                "The custom background file %s contains sequences with a "
                "median size of %s, while GimmeMotifs predicts motifs in sequences "
                "of size %s. This will influence the statistics! It is recommended "
                "to use background sequences of the same size.",
                bg_file,
                median_length,
                size,
            )
    else:
        # Without this the function would fail below with a NameError on `f`.
        raise ValueError("Unknown background type: {}".format(bg_type))

    f.writefasta(outfile)
    return len(f)
def set_background(self, fname=None, genome=None, size=200, nseq=10000, gc=False, gc_bins=None):
    """Set the background to use for FPR and z-score calculations.

    Background can be specified either as a genome name or as the
    name of a FASTA file. Nothing is done when a background has already
    been set. When neither ``fname`` nor ``genome`` is given,
    ``self.genome`` is used if it is set.

    Parameters
    ----------
    fname : str, optional
        Name of FASTA file to use as background.

    genome : str, optional
        Name of genome to use to retrieve random sequences.

    size : int, optional
        Size of genomic sequences to retrieve. The default
        is 200.

    nseq : int, optional
        Number of genomic sequences to retrieve.

    gc : bool, optional
        If True, select the genomic sequences per GC% bin instead of
        purely at random.

    gc_bins : list of tuple, optional
        (low, high) GC fraction bins to use when ``gc`` is True. By default
        the bins are 0-0.2, 0.8-1 and steps of 0.05 in between.

    Raises
    ------
    ValueError
        If both ``genome`` and ``fname`` are given, or if neither is given
        and ``self.genome`` is not set.
    IOError
        If ``fname`` does not exist.
    """
    if self.background:
        return

    size = int(size)

    if genome and fname:
        raise ValueError("Need either genome or filename for background.")

    if fname:
        if not os.path.exists(fname):
            raise IOError(
                "Background file {} does not exist!".format(fname))

        self.background = Fasta(fname)
        self.background_hash = file_checksum(fname)
        return

    if not genome:
        if not self.genome:
            raise ValueError(
                "Need either genome or filename for background.")
        genome = self.genome

    logger.debug("using background: genome {} with size {}".format(
        genome, size))

    lock.acquire()
    try:
        with Cache(CACHE_DIR) as cache:
            self.background_hash = "d{}:{}:{}:{}".format(
                genome, int(size), gc, str(gc_bins))
            cached = cache.get(self.background_hash)
            if cached:
                fa, gc_bins = cached
            else:
                fa = None

            if not fa:
                if gc:
                    if gc_bins is None:
                        gc_bins = [(0.0, 0.2), (0.8, 1)]
                        for b in np.arange(0.2, 0.799, 0.05):
                            gc_bins.append((b, b + 0.05))

                    with NamedTemporaryFile() as tmp:
                        logger.info("using {} sequences".format(nseq))
                        gc_bin_bedfile(tmp.name, genome, number=nseq,
                                       length=size, bins=gc_bins)
                        fa = as_fasta(tmp.name, genome=genome)
                else:
                    fa = RandomGenomicFasta(genome, size, nseq)
                cache.set(self.background_hash, (fa, gc_bins))
    finally:
        # Always release the lock, also when creating the background fails;
        # otherwise every later caller would block on acquire() forever.
        lock.release()

    self.background = fa
    if gc_bins:
        self.gc_bins = gc_bins
def set_threshold(self, fpr=None, threshold=None, genome=None, length=200, filename=None):
    """Set motif scanning threshold based on background sequences.

    Either ``threshold`` is used directly, or per-motif thresholds are
    determined for the requested ``fpr`` from background sequences (random
    sequences from ``genome`` or the sequences in ``filename``). Computed
    thresholds are stored in the cache and reused on later calls.

    Parameters
    ----------
    fpr : float, optional
        Desired FPR, between 0.0 and 1.0.

    threshold : float or str, optional
        Desired motif threshold, expressed as the fraction of the
        difference between minimum and maximum score of the PWM.
        Should either be a float between 0.0 and 1.0 or a filename
        with thresholds as created by 'gimme threshold'.

    genome : str, optional
        Name of genome to retrieve random background sequences from
        (only used together with ``fpr``).

    length : int, optional
        Length of the random genomic sequences. The default is 200.

    filename : str, optional
        Name of FASTA file with background sequences (only used together
        with ``fpr``).

    Raises
    ------
    ValueError
        If the combination of arguments is invalid, if ``fpr`` is out of
        range, or if no motifs have been set.
    IOError
        If ``filename`` does not exist.
    """
    if threshold:
        if fpr:
            raise ValueError("Need either fpr or threshold.")
        if genome:
            sys.stderr.write(
                "Parameter genome ignored when threshold is specified.\n"
                "Did you want to use fpr?\n")
        if filename:
            sys.stderr.write(
                "Parameter filename ignored when threshold is specified.\n"
                "Did you want to use fpr?\n")

    if genome and filename:
        raise ValueError("Need either genome or filename.")

    # Fail early with a clear message; otherwise a missing fpr only surfaces
    # later as a TypeError when the cache key is formatted.
    if threshold is None and fpr is None:
        raise ValueError("Need either fpr or threshold.")

    if fpr:
        fpr = float(fpr)
        if not (0.0 < fpr < 1.0):
            raise ValueError("Parameter fpr should be between 0 and 1")

    if not self.motifs:
        raise ValueError("please run set_motifs() first")

    thresholds = {}
    with open(self.motifs) as f:
        motifs = read_motifs(f)

    if threshold is not None:
        self.threshold = parse_threshold_values(self.motifs, threshold)
        return

    if filename:
        if not os.path.exists(filename):
            raise IOError("File {} does not exist.".format(filename))

        bg_hash = file_checksum(filename)
        seqs = Fasta(filename).seqs
    elif genome:
        # Literal backslash separator, written as "\\" because "\{" is an
        # invalid escape sequence; the resulting key is unchanged.
        bg_hash = "{}\\{}".format(genome, int(length))
    else:
        raise ValueError("Need genome or filename")

    with Cache(CACHE_DIR) as cache:
        scan_motifs = []
        for motif in motifs:
            k = "{}|{}|{:.4f}".format(motif.hash(), bg_hash, fpr)

            # Named differently from the `threshold` parameter so that the
            # parameter value is not overwritten before threshold_str is set.
            cached_threshold = cache.get(k)
            if cached_threshold is None:
                scan_motifs.append(motif)
            elif np.isclose(cached_threshold, motif.pwm_max_score()):
                # A threshold equal to the maximum PWM score is stored as None.
                thresholds[motif.id] = None
            else:
                thresholds[motif.id] = cached_threshold

        if len(scan_motifs) > 0:
            if genome:
                Genome(genome)
                sys.stderr.write(
                    "Determining threshold for fpr {} and length {} based on {}\n"
                    .format(fpr, int(length), genome))
                fa = RandomGenomicFasta(genome, length, 10000)
                seqs = fa.seqs
            else:
                sys.stderr.write(
                    "Determining threshold for fpr {} based on {}\n".
                    format(fpr, filename))
            for motif, motif_threshold in self._threshold_from_seqs(
                    scan_motifs, seqs, fpr):
                k = "{}|{}|{:.4f}".format(motif.hash(), bg_hash, fpr)
                cache.set(k, motif_threshold)
                if np.isclose(motif_threshold, motif.pwm_max_score()):
                    thresholds[motif.id] = None
                else:
                    thresholds[motif.id] = motif_threshold

    self.threshold_str = "{}_{}_{}_{}_{}".format(fpr, threshold, genome,
                                                 length, filename)
    self.threshold = thresholds