def check_threshold(outdir, genome, scoring="count"):
    """Return the motif scanning threshold file for ``genome``, creating it if needed.

    Only ``scoring == "count"`` uses a threshold file; for any other scoring
    mode ``None`` is returned. The file is ``threshold.<genome>.txt`` inside
    ``outdir``. When it does not exist yet, a background FASTA file
    (``background.<genome>.fa`` in ``outdir``) is written with
    ``RandomGenomicFasta`` if missing, and ``gimme threshold`` is run on the
    default motif database with the module-level ``FDR``.

    Args:
        outdir: Directory holding (or receiving) the threshold and
            background files.
        genome: Genome name, used in the file names and to locate the
            genome index directory.
        scoring: Scoring mode; only ``"count"`` triggers threshold handling.

    Returns:
        Path to the threshold file, or ``None`` when ``scoring`` is not
        ``"count"``.

    Raises:
        OSError: If the threshold file cannot be opened or ``gimme`` cannot
            be executed.
        subprocess.CalledProcessError: If ``gimme threshold`` exits with a
            non-zero status.
    """
    if scoring != "count":
        return None

    # Motif scanning threshold
    threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
    if os.path.exists(threshold_file):
        return threshold_file

    # gimme_motifs config, to get defaults; only needed when the threshold
    # file actually has to be generated.
    config = MotifConfig()

    # Random sequences from genome
    index_dir = os.path.join(config.get_index_dir(), genome)
    bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
    if not os.path.exists(bg_file):
        m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
        m.writefasta(bg_file)

    pwmfile = config.get_default_params().get("motif_db")
    pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    # Pass an argument list and redirect stdout ourselves instead of building
    # a shell string, so paths with spaces or shell metacharacters are safe.
    cmd = ["gimme", "threshold", pwmfile, bg_file, str(FDR)]
    try:
        with open(threshold_file, "w") as out:
            sp.check_call(cmd, stdout=out)
    except (OSError, sp.CalledProcessError):
        # Do not leave an empty/partial file behind: later calls would
        # mistake it for a valid threshold file because it exists.
        if os.path.exists(threshold_file):
            os.remove(threshold_file)
        raise

    return threshold_file
def check_threshold(outdir, genome, scoring="count"):
    """Return the motif scanning threshold file for ``genome``, creating it if needed.

    Only ``scoring == "count"`` uses a threshold file; for any other scoring
    mode ``None`` is returned. The file is ``threshold.<genome>.txt`` inside
    ``outdir``. When it does not exist yet, a background FASTA file
    (``background.<genome>.fa`` in ``outdir``) is written with
    ``RandomGenomicFasta`` if missing, and ``gimme threshold`` is run on the
    default motif database with the module-level ``FDR``.

    Args:
        outdir: Directory holding (or receiving) the threshold and
            background files.
        genome: Genome name, used in the file names and to locate the
            genome index directory.
        scoring: Scoring mode; only ``"count"`` triggers threshold handling.

    Returns:
        Path to the threshold file, or ``None`` when ``scoring`` is not
        ``"count"``.

    Raises:
        OSError: If the threshold file cannot be opened or ``gimme`` cannot
            be executed.
        subprocess.CalledProcessError: If ``gimme threshold`` exits with a
            non-zero status.
    """
    if scoring != "count":
        return None

    # Motif scanning threshold
    threshold_file = os.path.join(outdir, "threshold.{}.txt".format(genome))
    if os.path.exists(threshold_file):
        return threshold_file

    # gimme_motifs config, to get defaults; only needed when the threshold
    # file actually has to be generated.
    config = MotifConfig()

    # Random sequences from genome
    index_dir = os.path.join(config.get_index_dir(), genome)
    bg_file = os.path.join(outdir, "background.{}.fa".format(genome))
    if not os.path.exists(bg_file):
        m = RandomGenomicFasta(index_dir, BG_LENGTH, BG_NUMBER)
        m.writefasta(bg_file)

    pwmfile = config.get_default_params().get("motif_db")
    pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    # Pass an argument list and redirect stdout ourselves instead of building
    # a shell string, so paths with spaces or shell metacharacters are safe.
    cmd = ["gimme", "threshold", pwmfile, bg_file, str(FDR)]
    try:
        with open(threshold_file, "w") as out:
            sp.check_call(cmd, stdout=out)
    except (OSError, sp.CalledProcessError):
        # Do not leave an empty/partial file behind: later calls would
        # mistake it for a valid threshold file because it exists.
        if os.path.exists(threshold_file):
            os.remove(threshold_file)
        raise

    return threshold_file
def _create_background(self, bg_type, bedfile, fafile, outfile,
                       organism="hg18", width=200, nr_times=10):
    """Write a background FASTA file of the requested type to ``outfile``.

    Args:
        bg_type: One of ``"random"``, ``"genomic"``, ``"gc"``,
            ``"promoter"`` or ``"user"``.
        bedfile: Not used by this method.
        fafile: FASTA file with the foreground sequences; its sequence
            count scales the size of the generated backgrounds.
        outfile: Path the background FASTA is written to.
        organism: Genome name used to locate index and gene files.
        width: Sequence length for the genomic and promoter backgrounds,
            and the reference length for the user-background length check.
        nr_times: Multiplier applied to the number of foreground sequences
            to get the number of background sequences requested.

    Returns:
        The number of sequences in the background (``len`` of the object
        that was written), or ``None`` if ``bg_type`` is not one of the
        recognized values.

    Note:
        For ``bg_type == "user"`` the process exits via ``sys.exit(1)``
        when ``self.params["user_background"]`` does not exist.
    """
    fg = Fasta(fafile)

    if bg_type == "random":
        markov_order = int(self.markov_model)
        if markov_order >= 6:
            self.logger.warning(
                "Are you sure about the Markov model? It seems too high!")
        else:
            # Fall back to a generic suffix so that an order outside 1-5
            # (e.g. 0) cannot raise KeyError just to format a log message.
            ordinals = {1: "1st", 2: "2nd", 3: "3rd", 4: "4th", 5: "5th"}
            order = ordinals.get(markov_order, "%sth" % markov_order)
            self.logger.debug(
                "Creating random background (%s order Markov)", order)

        m = MarkovFasta(fg, k=markov_order, n=nr_times * len(fg))
        m.writefasta(outfile)
        self.logger.debug("Random background: %s", outfile)
        # return the number of random sequences created
        return len(m)

    elif bg_type == "genomic":
        self.logger.debug("Creating genomic background")
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        f = RandomGenomicFasta(index_dir, width, nr_times * len(fg))
        f.writefasta(outfile)
        return len(f)

    elif bg_type == "gc":
        self.logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, organism, nr_times * len(fg))
        f.writefasta(outfile)
        self.logger.debug("GC matched background: %s", outfile)
        return len(f)

    elif bg_type == "promoter":
        gene_file = os.path.join(
            self.config.get_gene_dir(), "%s.bed" % organism)
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            organism, gene_file)
        f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
        f.writefasta(outfile)
        self.logger.debug("Random promoter background: %s", outfile)
        return len(f)

    elif bg_type == "user":
        bg_file = self.params["user_background"]
        if not os.path.exists(bg_file):
            self.logger.error(
                "User-specified background file %s does not exist!",
                bg_file)
            sys.exit(1)

        self.logger.info(
            "Copying user-specified background file %s to %s.",
            bg_file, outfile)
        fa = Fasta(bg_file)
        median_length = median([len(seq) for seq in fa.seqs])
        # Warn when the median length deviates more than 5% from width.
        if median_length < width * 0.95 or median_length > width * 1.05:
            self.logger.warning(
                "The user-specified background file %s contains sequences "
                "with a median length of %s, while GimmeMotifs predicts "
                "motifs in sequences of length %s. This will influence the "
                "statistics! It is recommended to use background sequences "
                "of the same length.",
                bg_file, median_length, width)
        fa.writefasta(outfile)
        return len(fa)
def _create_background(self, bg_type, bedfile, fafile, outfile,
                       organism="hg18", width=200, nr_times=10):
    """Write a background FASTA file of the requested type to ``outfile``.

    Args:
        bg_type: One of ``"random"``, ``"genomic"``, ``"gc"``,
            ``"promoter"`` or ``"user"``.
        bedfile: Not used by this method.
        fafile: FASTA file with the foreground sequences; its sequence
            count scales the size of the generated backgrounds.
        outfile: Path the background FASTA is written to.
        organism: Genome name used to locate index and gene files.
        width: Sequence length for the genomic and promoter backgrounds,
            and the reference length for the user-background length check.
        nr_times: Multiplier applied to the number of foreground sequences
            to get the number of background sequences requested.

    Returns:
        The number of sequences in the background (``len`` of the object
        that was written), or ``None`` if ``bg_type`` is not one of the
        recognized values.

    Note:
        For ``bg_type == "user"`` the process exits via ``sys.exit(1)``
        when ``self.params["user_background"]`` does not exist.
    """
    fg = Fasta(fafile)

    if bg_type == "random":
        markov_order = int(self.markov_model)
        if markov_order >= 6:
            self.logger.warning(
                "Are you sure about the Markov model? It seems too high!")
        else:
            # Fall back to a generic suffix so that an order outside 1-5
            # (e.g. 0) cannot raise KeyError just to format a log message.
            ordinals = {1: "1st", 2: "2nd", 3: "3rd", 4: "4th", 5: "5th"}
            order = ordinals.get(markov_order, "%sth" % markov_order)
            self.logger.debug(
                "Creating random background (%s order Markov)", order)

        m = MarkovFasta(fg, k=markov_order, n=nr_times * len(fg))
        m.writefasta(outfile)
        self.logger.debug("Random background: %s", outfile)
        # return the number of random sequences created
        return len(m)

    elif bg_type == "genomic":
        self.logger.debug("Creating genomic background")
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        f = RandomGenomicFasta(index_dir, width, nr_times * len(fg))
        f.writefasta(outfile)
        return len(f)

    elif bg_type == "gc":
        self.logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, organism, nr_times * len(fg))
        f.writefasta(outfile)
        self.logger.debug("GC matched background: %s", outfile)
        return len(f)

    elif bg_type == "promoter":
        gene_file = os.path.join(
            self.config.get_gene_dir(), "%s.bed" % organism)
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            organism, gene_file)
        f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
        f.writefasta(outfile)
        self.logger.debug("Random promoter background: %s", outfile)
        return len(f)

    elif bg_type == "user":
        bg_file = self.params["user_background"]
        if not os.path.exists(bg_file):
            self.logger.error(
                "User-specified background file %s does not exist!",
                bg_file)
            sys.exit(1)

        self.logger.info(
            "Copying user-specified background file %s to %s.",
            bg_file, outfile)
        fa = Fasta(bg_file)
        median_length = median([len(seq) for seq in fa.seqs])
        # Warn when the median length deviates more than 5% from width.
        if median_length < width * 0.95 or median_length > width * 1.05:
            self.logger.warning(
                "The user-specified background file %s contains sequences "
                "with a median length of %s, while GimmeMotifs predicts "
                "motifs in sequences of length %s. This will influence the "
                "statistics! It is recommended to use background sequences "
                "of the same length.",
                bg_file, median_length, width)
        fa.writefasta(outfile)
        return len(fa)