def generatePSP(positives, negatives, outfile):
    '''Generate a discriminative position-specific prior (PSP) file.

    Runs ``psp-gen`` with `positives` as the positive set and
    `negatives` as the negative set and redirects its output to
    `outfile`. The resulting file can be used to run MEME in
    discriminative mode.

    If either input contains fewer than two sequences, a warning is
    issued, ``P.touch(outfile)`` is called and psp-gen is not run.

    Arguments
    ---------
    positives : string
        Filename of the fasta file with the positive sequences.
    negatives : string
        Filename of the fasta file with the negative sequences.
    outfile : string
        Filename that receives the psp-gen output.
    '''
    psp_options = PARAMS["psp_options"]

    nseqs_pos = int(FastaIterator.count(positives))
    nseqs_neg = int(FastaIterator.count(negatives))

    if nseqs_pos < 2 or nseqs_neg < 2:
        # The trailing space matters: the two literals are concatenated
        # before formatting and previously ran together as
        # "sequencesto run".
        E.warn("%s: input files do not have sufficent sequences "
               "to run psp-gen, skipping" % outfile)
        P.touch(outfile)
        return

    # Keep psp-gen consistent with the MEME strand setting: scan the
    # reverse complement as well unless meme_revcomp is switched off.
    if PARAMS.get("meme_revcomp", True):
        psp_options += " -revcomp"

    # The %(name)s placeholders are filled in by P.run; presumably from
    # this function's locals and PARAMS - TODO confirm.
    statement = '''psp-gen -pos %(positives)s -neg %(negatives)s %(psp_options)s > %(outfile)s '''

    P.run(statement)
def runDREME(infile, outfile, neg_file="", options=""):
    '''Run DREME on a fasta file.

    When `neg_file` is given it is handed to dreme through ``-n`` as
    the negative set; otherwise no ``-n`` option is added (the original
    documentation states that DREME then shuffles the input).

    The run is skipped - a warning is issued and ``P.touch(outfile)``
    is called - when `infile`, or a supplied `neg_file`, holds fewer
    than two sequences.

    Arguments
    ---------
    infile : string
        Filename of the fasta file with the input sequences.
    outfile : string
        Output filename; the log is written to ``outfile + ".log"``.
    neg_file : string
        Optional filename of a fasta file with negative sequences.
    options : string
        Additional command line options appended to the dreme call.
    '''
    if int(FastaIterator.count(infile)) < 2:
        E.warn("%s: less than 2 sequences - dreme skipped" % outfile)
        P.touch(outfile)
        return

    if neg_file:
        if int(FastaIterator.count(neg_file)) < 2:
            E.warn(
                "%s: less than 2 sequences in negatives file - dreme skipped"
                % outfile)
            P.touch(outfile)
            return

        # from here on neg_file holds the ready-made command line option
        neg_file = "-n %s" % neg_file

    logfile = outfile + ".log"

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]),
                               outfile)
    tmpdir = P.get_temp_dir(".")

    # Placeholders are resolved by P.run; dreme_options is not a local,
    # so it presumably comes from PARAMS - TODO confirm.
    statement = ''' dreme -p %(infile)s %(neg_file)s -png -oc %(tmpdir)s %(dreme_options)s %(options)s > %(logfile)s '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="dreme")
def runMemeCHIP(infile, outfile, motifs=None):
    '''Run the MEME-ChIP pipeline on a fasta file.

    If `infile` contains no sequences, a warning is issued,
    ``P.touch(outfile)`` is called and meme-chip is not run. Otherwise
    meme-chip writes into a temporary directory and the results are
    handed to ``collectMEMEResults`` with ``method="memechip"``.

    Arguments
    ---------
    infile : string
        Filename of the fasta file with the input sequences.
    outfile : string
        Output filename; the log is written to ``<outfile>.log``.
    motifs : list
        Optional list of motif database files, each passed to
        meme-chip via its own ``-db`` option.
    '''
    if motifs:
        motifs = " ".join("-db %s" % motif for motif in motifs)
    else:
        motifs = " "

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        # The message was previously emitted without being formatted,
        # so a literal "%s" was logged instead of the output filename.
        E.warn("%s: no sequences - meme-chip skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.getTempDir(".")

    # NOTE(review): the command uses meme_threads while the job settings
    # below are read from memechip_threads - confirm this is intended.
    statement = ''' meme-chip %(infile)s -p %(meme_threads)s -oc %(tmpdir)s -nmeme %(memechip_nmeme)s %(memechip_options)s %(motifs)s > %(outfile)s.log '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in
    # cluster_options through job_options
    if int(PARAMS["memechip_threads"]) != 1:
        # These locals are not used again in this function; presumably
        # P.run() picks them up from the caller's frame - TODO confirm.
        job_options = str(PARAMS["memechip_job_options"])
        job_threads = int(PARAMS["memechip_threads"])
        cluster_parallel_environment = str(
            PARAMS["memechip_cluster_parallel_environment"])

    # NOTE(review): called without arguments, so the statement is
    # presumably taken from the local variable `statement` - confirm
    # against the Pipeline API version in use.
    P.run()

    collectMEMEResults(tmpdir, target_path, outfile, method="memechip")
def runMEMEOnSequences(infile, outfile):
    '''Run MEME on the sequences in a fasta file to find motifs.

    The sequences in `infile` are counted first. If there are none, a
    warning is issued, ``P.touch(outfile)`` is called and MEME is not
    run. Otherwise ``meme`` is run in DNA mode with ``-revcomp``,
    writing into a temporary directory, and the results are handed to
    ``collectMEMEResults`` together with a target path below
    ``<exportdir>/meme``.

    NOTE(review): earlier documentation of this function described
    restricting the input to top-scoring intervals and masking the
    sequences. None of that happens here; presumably it is done when
    `infile` is built - confirm against the caller.

    Arguments
    ---------
    infile : string
        Filename of the fasta file with the input sequences.
    outfile : string
        Output filename; the log is written to ``<outfile>.log``.
    '''
    # Not referenced again in this function; presumably read by P.run()
    # from the caller's frame - TODO confirm.
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    if int(FastaIterator.count(infile)) == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]),
                               "meme",
                               outfile)
    tmpdir = P.getTempDir(".")

    # Placeholders that are not locals (meme_model, meme_nmotifs,
    # motifs_max_size, meme_options) presumably come from PARAMS.
    statement = ''' meme %(infile)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(motifs_max_size)s %(meme_options)s > %(outfile)s.log '''

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def runMEMEOnSequences(infile, outfile):
    '''Find motifs by running MEME on a fasta file.

    Steps performed:

    * count the sequences in `infile`; with zero sequences a warning
      is issued, ``P.touch(outfile)`` is called and the function
      returns without running MEME.
    * run ``meme`` (DNA alphabet, ``-revcomp``) into a temporary
      directory, redirecting standard output to ``<outfile>.log``.
    * pass the temporary directory and a target path below
      ``<exportdir>/meme`` to ``collectMEMEResults``.

    NOTE(review): the previous docstring mentioned selecting the top
    10% of intervals, 200 bp windows and masking. This function does
    none of these; presumably `infile` has already been prepared that
    way - verify against the caller.

    Arguments
    ---------
    infile : string
        Filename of the fasta file with the input sequences.
    outfile : string
        Output filename, also used as the prefix of the log file.
    '''
    # job_options = "-l mem_free=8000M"
    # to_cluster is presumably picked up by P.run() through inspection
    # of this frame - TODO confirm.
    to_cluster = True

    if int(FastaIterator.count(infile)) == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]),
        "meme",
        outfile)
    tmpdir = P.getTempDir(".")

    statement = ''' meme %(infile)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(motifs_max_size)s %(meme_options)s > %(outfile)s.log '''

    # called without arguments, as in the original
    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def subsampleNReadsFromFasta(infile, outfile, nreads, logfile=""):
    '''Subsample approximately `nreads` sequences from a fasta file.

    The number of sequences in `infile` is counted and converted into a
    sampling proportion, which is passed to ``fasta2fasta.py -m sample``.
    If `nreads` exceeds the number of sequences (or the file is empty)
    the proportion is 1.

    Arguments
    ---------
    infile : string
        Filename of the fasta file to sample from.
    outfile : string
        Filename passed to fasta2fasta.py via ``-S``.
    nreads : int
        Number of sequences requested.
    logfile : string
        Optional log filename, passed to fasta2fasta.py via ``-L``.
    '''
    checkParams()

    # Coerce to int as every other caller of FastaIterator.count in this
    # file does, so the comparison below is between numbers.
    nseqs = int(FastaIterator.count(infile))

    # The nseqs == 0 guard avoids a ZeroDivisionError for an empty file
    # when nreads is not greater than zero.
    if nseqs == 0 or nreads > nseqs:
        prop = 1
    else:
        prop = float(nreads) / float(nseqs)

    if logfile:
        logfile = "-L %s" % logfile

    # scriptsdir is not a local; presumably it is supplied by PARAMS
    # when P.run() fills in the placeholders - TODO confirm.
    statement = ''' python %(scriptsdir)s/fasta2fasta.py -I %(infile)s %(logfile)s -m sample --sample-proportion=%(prop)s -S %(outfile)s '''

    P.run()
def subsampleNReadsFromFasta(infile, outfile, nreads, logfile=""):
    '''Subsample approximately `nreads` sequences from a fasta file.

    Counts the sequences in `infile`, derives the proportion that
    corresponds to `nreads` and runs ``fasta2fasta.py -m sample`` with
    that proportion. The proportion is 1 when `nreads` is larger than
    the number of sequences or when the file holds no sequences.

    Arguments
    ---------
    infile : string
        Filename of the fasta file to sample from.
    outfile : string
        Filename passed to fasta2fasta.py via ``-S``.
    nreads : int
        Number of sequences requested.
    logfile : string
        Optional log filename, passed to fasta2fasta.py via ``-L``.
    '''
    checkParams()

    # int() matches how the other functions in this file treat the
    # return value of FastaIterator.count and keeps the comparison
    # below numeric.
    nseqs = int(FastaIterator.count(infile))

    # An empty file would otherwise lead to a division by zero whenever
    # nreads is zero or negative.
    if nseqs == 0 or nreads > nseqs:
        prop = 1
    else:
        prop = float(nreads) / float(nseqs)

    if logfile:
        logfile = "-L %s" % logfile

    # Placeholders are resolved by P.run(); scriptsdir is presumably a
    # PARAMS entry - TODO confirm.
    statement = ''' python %(scriptsdir)s/fasta2fasta.py -I %(infile)s %(logfile)s -m sample --sample-proportion=%(prop)s -S %(outfile)s '''

    P.run()
def runMEMEOnSequences(infile, outfile, background=None, psp=None):
    '''Run MEME on fasta sequences to find motifs.

    By default MEME calculates a zero-th order background model from
    the nucleotide frequencies in the input set. To use a different
    background set, a background file created by fasta-get-markov must
    be supplied.

    To perform discriminative analysis a position specific prior (psp)
    file must be provided. This can be generated using generatePSP.

    If `infile` contains fewer than two sequences, a warning is issued,
    ``P.touch(outfile)`` is called and MEME is not run.

    Arguments
    ---------
    infile : string
        Filename of the fasta file with the input sequences.
    outfile : string
        Output filename; standard error of meme is written to
        ``<outfile>.log``.
    background : string
        Optional background model file, passed to meme via ``-bfile``.
    psp : string
        Optional position specific prior file, passed via ``-psp``.
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs < 2:
        E.warn("%s: less than 2 sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # Get the total length of the sequences to decide the memory.
    # The with-block closes the file, so no explicit close() is needed.
    with IOTools.open_file(infile, "r") as fasta_reader:
        total_seqs_length = sum(
            len(fasta_seq.sequence)
            for fasta_seq in FastaIterator.iterate(fasta_reader))

    # If the length of all sequences is higher than 160,000bp
    # up the memory. job_memory is not used again in this function;
    # presumably P.run picks it up from this frame - TODO confirm.
    job_memory = "2G"
    if total_seqs_length > 160000:
        job_memory = "4G"

    if PARAMS.get("meme_revcomp", True):
        revcomp = "-revcomp"
    else:
        revcomp = ""

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]),
                               outfile)
    tmpdir = P.get_temp_dir(".")

    if background:
        background_model = "-bfile %s" % background
    else:
        background_model = ""

    if psp:
        E.info("Running MEME in descriminative mode")
        psp_file = "-psp %s" % psp
    else:
        psp_file = ""

    statement = ''' meme %(infile)s -dna %(revcomp)s -p %(meme_threads)s -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(background_model)s %(psp_file)s %(meme_options)s 2> %(outfile)s.log '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in
    # cluster_options through job_options
    if int(PARAMS["meme_threads"]) != 1:
        # As with job_memory, these are presumably read by P.run from
        # the caller's locals - TODO confirm.
        job_options = str(PARAMS["meme_job_options"])
        job_threads = int(PARAMS["meme_threads"])
        cluster_parallel_environment = str(
            PARAMS["meme_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)