def main():
    """Command-line entry point: refine one seed PFM into a motif with EXTREME.

    Parses the command line, reads the seed matrix selected by ``indexvalue``
    from the seed file, adds the requested pseudocounts and normalizes every
    row to sum to one, then calls ``extreme`` on the positive and negative
    sequence sets.  The output prefix handed to the writers is
    ``<motifname>/``; that directory is created if needed and reused if it
    already exists.  Sequence logos are attempted only when ``weblogolib``
    can be imported.  Start time, end time and duration go to stdout.

    Exits through ``parser.error`` (status 2) when the seed file has no seed
    at the requested index or the selected header line lacks its second
    field.  Every ``print`` uses the single-argument parenthesized form,
    which produces identical output under Python 2 and also parses under
    Python 3.
    """
    # Imported locally, as the original script did, so this function does not
    # rely on the top of the file providing these names.
    import os
    import time
    from numpy import fromstring

    description = (
        "The program applies a modified EXTREME algorithm to find motifs in "
        "a FASTA file. It accepts a positive sequence set, a negative "
        "sequence set, a list of seed PFMs, and an index number indicating "
        "which of the seed PFMs to use"
    )
    parser = ArgumentParser(description=description)
    parser.add_argument(
        'fastafile', metavar='f',
        help='FASTA file containing the sequences')
    parser.add_argument(
        'negfastafile', metavar='g',
        help='Negative FASTA file. This is for comparison so that you know '
             'the motif you discovered is over-represented.')
    parser.add_argument(
        'jfile', metavar='j',
        help='File containing PWM seeds')
    parser.add_argument(
        'indexvalue', metavar='i', type=int,
        help='Which seed from the Minimal MEME Format file to use (it is an '
             'integer ranging from 1 to the total number of PFM seeds in '
             'your file)')
    parser.add_argument(
        "-p", "--pseudocounts", type=float, default=0.0,
        help="Pseudo counts added to initial PFM guess. Default:0.0")
    parser.add_argument(
        "-maxsites", dest="maxsites", type=int, default=0,
        help="Maximum number of expected sites for the motif. If not "
             "specified, defaults to 5 times number of initial predicted "
             "sites.")
    parser.add_argument(
        "-minsites", dest="minsites", type=int, default=10,
        help="Minimum number of expected sites for the motif. Default: 10")
    parser.add_argument(
        "-t", "--tries", dest="tries", type=int, default=15,
        help="Number of tries for each motif discovered. The fudge factor "
             "is changed until the number of discovered sites is in the "
             "\"acceptable\" range")
    parser.add_argument(
        "-s", "--seed", dest="seed", type=int, default=1,
        help="Random seed")
    parser.add_argument(
        "-saveseqs", "--saveseqs", dest="saveseqs", action='store_true',
        help="If specified, save sequences to current directory")

    print("Started at:")
    print(time.ctime())
    starttime = time.time()

    args = parser.parse_args()
    minsites = args.minsites
    maxsites = args.maxsites
    tries = args.tries
    random.seed(args.seed)

    # Read the whole seed file up front; the handle is closed even if the
    # parsing below fails.
    with open(args.jfile, 'r') as jfile:
        lines = jfile.readlines()

    motifname = None
    pwm_guess = None
    seeds_seen = 0
    for i, line in enumerate(lines):
        if '>' not in line:
            continue
        # Any line containing '>' counts as a seed header.
        seeds_seen += 1
        if seeds_seen != args.indexvalue:
            continue
        parts = line.split()
        if len(parts) < 2:
            parser.error(
                "seed header %r in %s has no second field to take the motif "
                "width from" % (line.strip(), args.jfile))
        motifname = parts[0][1:]
        # The motif width is the length of the header's second field; that
        # many following lines hold the matrix rows.
        w = len(parts[1])
        rows = lines[i + 1:i + 1 + w]
        pwm_string = ' '.join(tok for row in rows for tok in row.split())
        pwm_guess = fromstring(pwm_string, sep=' ', dtype=float)
        pwm_guess = pwm_guess.reshape((w, 4))
        break

    if pwm_guess is None:
        # Previously this fell through and crashed with a NameError.
        parser.error(
            "seed index %d not found: %s contains %d seed(s)"
            % (args.indexvalue, args.jfile, seeds_seen))

    print('Using initial motif guess %s' % motifname)
    print('Adding %s pseudocounts and normalizing' % args.pseudocounts)
    pwm_guess = pwm_guess + args.pseudocounts
    pwm_guess = pwm_guess / pwm_guess.sum(axis=1)[:, newaxis]

    # make the directory (recursively)
    outdir = motifname
    outpre = outdir + "/"
    try:  # adapted from DREME.py by T. Bailey
        os.makedirs(outdir)
    except OSError as exc:
        # An already-existing directory is reused (the original hard-coded
        # clobber = True); any other failure is a real error.
        if exc.errno != errno.EEXIST:
            raise

    # Use DREME's SeqIO to read in FASTA to list
    seqs = sequence.convert_ambigs(
        sequence.readFASTA(args.fastafile, None, True))
    negseqs = sequence.convert_ambigs(
        sequence.readFASTA(args.negfastafile, None, True))

    (theta_motifs, theta_background_matrices, lambda_motifs, logevs,
     disc_pwms, disc_logevs, disc_nsites) = extreme(
        seqs, negseqs, minsites, maxsites, pwm_guess, tries)

    outputMEMEformat(disc_pwms, disc_logevs, disc_nsites, outpre)
    try:
        # The import doubles as the availability check for weblogolib.  The
        # loop stays inside the try, as in the original, so an ImportError
        # raised while writing a motif is reported the same way.
        from weblogolib import (LogoData, LogoOptions, LogoFormat,
                                png_formatter, eps_formatter,
                                unambiguous_dna_alphabet)
        motif_results = zip(theta_motifs, theta_background_matrices,
                            lambda_motifs, logevs)
        for k, result in enumerate(motif_results, 1):
            theta_motif, theta_background_matrix, lambda_motif, logev = result
            outputMotif(theta_motif, theta_background_matrix, lambda_motif,
                        logev, k, outpre)
    except ImportError:
        print("You do not have Weblogolib, so sequence logos will not be made")

    if args.saveseqs:
        print("Saving Positive sequences to Positive_seq.fa")
        with open("Positive_seq.fa", "w") as pos_file:
            for number, seq in enumerate(seqs, 1):
                pos_file.write(">sequence" + str(number) + "\n")
                pos_file.write(seq + "\n")
        print("Saving Negative sequences to Negative_seq.fa")
        with open("Negative_seq.fa", "w") as neg_file:
            for number, seq in enumerate(negseqs, 1):
                neg_file.write(">sequence" + str(number) + "\n")
                neg_file.write(seq + "\n")

    print("Ended at:")
    print(time.ctime())
    stoptime = time.time()
    duration = stoptime - starttime
    print("Duration: %s" % duration)
def main():
    """Command-line entry point for the DREME-like gapped k-mer search.

    Parses the command line, reads the positive and negative FASTA files
    through ``sequence.readFASTA`` / ``sequence.convert_ambigs``, and calls
    ``find_kmers`` with the half-site length, gap range, minimum site count,
    z-score threshold and output file name.

    NOTE(review): ``-w``, ``-minw``, ``-maxw``, ``-mink`` and ``-maxk`` are
    accepted but never forwarded to ``find_kmers``; they are kept so that
    existing command lines continue to parse.
    """
    description = "The program performs a DREME-like search for gapped k-mers"
    parser = ArgumentParser(description=description)
    parser.add_argument(
        'fastafile', metavar='f',
        help='FASTA file containing the sequences')
    parser.add_argument(
        'negativefile', metavar='n',
        help='FASTA file containing the negative sequences')
    parser.add_argument('outputfile', metavar='o', help='Output file')
    parser.add_argument(
        "-w", "--width", dest="width", type=int, default=0,
        help="Width of the motif to search for. This makes the program only "
             "search for a motif of this width. Beware if greater than 8")
    parser.add_argument(
        "-ming", dest="mingap", type=int, default=0,
        help="Minimum gap of k-mer to search for. Default: 0")
    # The help text used to claim "Default: 12" while the real default is 10;
    # the text now matches the behaviour.
    parser.add_argument(
        "-maxg", dest="maxgap", type=int, default=10,
        help="Maximum gap of k-mer to search for. Default: 10")
    parser.add_argument(
        "-l", dest="halflength", type=int, default=4,
        help="Number of non-degenerate letters per half-site. Total number "
             "of non-degenerate letters is twice this number. Default: 4")
    parser.add_argument(
        "-minw", dest="minwidth", type=int, default=3,
        help="Minimum width of the motif to search for. The default is 3, "
             "which is the width of the smallest core motif.")
    parser.add_argument(
        "-maxw", dest="maxwidth", type=int, default=8,
        help="Maximum width of the motif to search for. This program does "
             "one refinement at this width (if greater than 8), and then "
             "picks the most significant short-mer. Default: 8")
    parser.add_argument(
        "-mink", dest="mink", type=int, default=3,
        help="Minimum width of the core to search for. The default is 3, "
             "which is the width of the smallest core motif.")
    parser.add_argument(
        "-maxk", dest="maxk", type=int, default=8,
        help="Maximum width of the core to search for. Default: 8")
    parser.add_argument(
        "-z", "--zthresh", dest="zthresh", type=float, default=5,
        help="Corrected z-score threshold. Default: 5")
    parser.add_argument(
        "-minsites", "--minsites", dest="minsites", type=int, default=10,
        help="Minimum number of sites for a k-mer to be included. "
             "Default: 10")
    args = parser.parse_args()

    # Single-argument print(...) gives the same output under Python 2 and
    # also parses under Python 3.
    print('Reading positive sequence file...')
    pos_seqs = sequence.convert_ambigs(
        sequence.readFASTA(args.fastafile, None, True))
    print('Reading negative sequence file...')
    neg_seqs = sequence.convert_ambigs(
        sequence.readFASTA(args.negativefile, None, True))

    find_kmers(pos_seqs, neg_seqs, args.halflength, args.mingap, args.maxgap,
               args.minsites, args.zthresh, args.outputfile)
def main():
    """Run the DREME-like gapped k-mer search from the command line.

    Builds the argument parser, loads the positive and negative sequence
    sets with ``sequence.readFASTA`` followed by ``sequence.convert_ambigs``,
    and hands them to ``find_kmers`` together with the half-site length, the
    minimum and maximum gap, the minimum site count, the z-score threshold
    and the output file name.

    NOTE(review): the ``-w``, ``-minw``, ``-maxw``, ``-mink`` and ``-maxk``
    options are parsed but not passed on to ``find_kmers``; they remain so
    that existing invocations keep working.
    """
    parser = ArgumentParser(
        description="The program performs a DREME-like search for gapped k-mers"
    )
    parser.add_argument(
        "fastafile", metavar="f", help="FASTA file containing the sequences"
    )
    parser.add_argument(
        "negativefile",
        metavar="n",
        help="FASTA file containing the negative sequences",
    )
    parser.add_argument("outputfile", metavar="o", help="Output file")
    parser.add_argument(
        "-w",
        "--width",
        dest="width",
        help="Width of the motif to search for. This makes the program only "
        "search for a motif of this width. Beware if greater than 8",
        type=int,
        default=0,
    )
    parser.add_argument(
        "-ming",
        dest="mingap",
        help="Minimum gap of k-mer to search for. Default: 0",
        type=int,
        default=0,
    )
    # The help text previously advertised "Default: 12" although the value
    # actually used is 10; the text is corrected, the behaviour is unchanged.
    parser.add_argument(
        "-maxg",
        dest="maxgap",
        help="Maximum gap of k-mer to search for. Default: 10",
        type=int,
        default=10,
    )
    parser.add_argument(
        "-l",
        dest="halflength",
        help="Number of non-degenerate letters per half-site. Total number "
        "of non-degenerate letters is twice this number. Default: 4",
        type=int,
        default=4,
    )
    parser.add_argument(
        "-minw",
        dest="minwidth",
        help="Minimum width of the motif to search for. The default is 3, "
        "which is the width of the smallest core motif.",
        type=int,
        default=3,
    )
    parser.add_argument(
        "-maxw",
        dest="maxwidth",
        help="Maximum width of the motif to search for. This program does "
        "one refinement at this width (if greater than 8), and then picks "
        "the most significant short-mer. Default: 8",
        type=int,
        default=8,
    )
    parser.add_argument(
        "-mink",
        dest="mink",
        help="Minimum width of the core to search for. The default is 3, "
        "which is the width of the smallest core motif.",
        type=int,
        default=3,
    )
    parser.add_argument(
        "-maxk",
        dest="maxk",
        help="Maximum width of the core to search for. Default: 8",
        type=int,
        default=8,
    )
    parser.add_argument(
        "-z",
        "--zthresh",
        dest="zthresh",
        help="Corrected z-score threshold. Default: 5",
        type=float,
        default=5,
    )
    parser.add_argument(
        "-minsites",
        "--minsites",
        dest="minsites",
        help="Minimum number of sites for a k-mer to be included. Default: 10",
        type=int,
        default=10,
    )
    args = parser.parse_args()

    # print(...) with one argument behaves identically under Python 2 and is
    # also valid Python 3.
    print("Reading positive sequence file...")
    pos_seqs = sequence.convert_ambigs(
        sequence.readFASTA(args.fastafile, None, True)
    )
    print("Reading negative sequence file...")
    neg_seqs = sequence.convert_ambigs(
        sequence.readFASTA(args.negativefile, None, True)
    )

    find_kmers(
        pos_seqs,
        neg_seqs,
        args.halflength,
        args.mingap,
        args.maxgap,
        args.minsites,
        args.zthresh,
        args.outputfile,
    )
def main():
    """Command-line entry point: refine one seed PFM into a motif with EXTREME.

    Parses the command line, reads the seed matrix selected by ``indexvalue``
    from the seed file, adds the requested pseudocounts and normalizes every
    row to sum to one, then calls ``extreme`` (passing the initial online-EM
    step size) on the positive and negative sequence sets.  The output prefix
    handed to the writers is ``<motifname>/``; that directory is created if
    needed and reused if it already exists.  The ``--background`` flag is
    forwarded to ``outputMEMEformat``.  Sequence logos are attempted only
    when ``weblogolib`` can be imported.  Start time, end time and duration
    go to stdout.

    Exits through ``parser.error`` (status 2) when the seed file has no seed
    at the requested index or the selected header line lacks its second
    field.  Every ``print`` uses the single-argument parenthesized form,
    which produces identical output under Python 2 and also parses under
    Python 3.
    """
    # Imported locally, as the original script did, so this function does not
    # rely on the top of the file providing these names.
    import os
    import time
    from numpy import fromstring

    description = (
        "The program applies a modified EXTREME algorithm to find motifs in "
        "a FASTA file. It accepts a positive sequence set, a negative "
        "sequence set, a list of seed PFMs, and an index number indicating "
        "which of the seed PFMs to use"
    )
    parser = ArgumentParser(description=description)
    parser.add_argument(
        'fastafile', metavar='f',
        help='FASTA file containing the sequences')
    parser.add_argument(
        'negfastafile', metavar='g',
        help='Negative FASTA file. This is for comparison so that you know '
             'the motif you discovered is over-represented.')
    parser.add_argument(
        'jfile', metavar='j',
        help='File containing PWM seeds')
    parser.add_argument(
        'indexvalue', metavar='i', type=int,
        help='Which seed from the Minimal MEME Format file to use (it is an '
             'integer ranging from 1 to the total number of PFM seeds in '
             'your file)')
    parser.add_argument(
        "-p", "--pseudocounts", type=float, default=0.0,
        help="Pseudo counts added to initial PFM guess. Default:0.0")
    parser.add_argument(
        "-q", "--initialstep", type=float, default=0.05,
        help="The initial step size for the online EM algorithm. A VERY "
             "sensitive parameter. I get best success for ChIP size data "
             "(about 100,000 to 1,000,000 bps) with a step size of 0.05. "
             "For DNase footprinting, which usually has >5,000,000 bps, I "
             "find 0.02 works best. Default:0.05")
    parser.add_argument(
        "-maxsites", dest="maxsites", type=int, default=0,
        help="Maximum number of expected sites for the motif. If not "
             "specified, defaults to 5 times number of initial predicted "
             "sites.")
    parser.add_argument(
        "-minsites", dest="minsites", type=int, default=10,
        help="Minimum number of expected sites for the motif. Default: 10")
    parser.add_argument(
        "-t", "--tries", dest="tries", type=int, default=15,
        help="Number of tries for each motif discovered. The fudge factor "
             "is changed until the number of discovered sites is in the "
             "\"acceptable\" range")
    parser.add_argument(
        "-s", "--seed", dest="seed", type=int, default=1,
        help="Random seed")
    parser.add_argument(
        "-saveseqs", "--saveseqs", dest="saveseqs", action='store_true',
        help="If specified, save sequences to current directory")
    parser.add_argument(
        "-b", "--background", dest="background", action='store_true',
        help="If specified, the minimal MEME output will use the calculated "
             "background probabilities instead of uniform probabilities.")

    print("Started at:")
    print(time.ctime())
    starttime = time.time()

    args = parser.parse_args()
    initialstep = args.initialstep
    minsites = args.minsites
    maxsites = args.maxsites
    tries = args.tries
    random.seed(args.seed)

    # Read the whole seed file up front; the handle is closed even if the
    # parsing below fails.
    with open(args.jfile, 'r') as jfile:
        lines = jfile.readlines()

    motifname = None
    pwm_guess = None
    seeds_seen = 0
    for i, line in enumerate(lines):
        if '>' not in line:
            continue
        # Any line containing '>' counts as a seed header.
        seeds_seen += 1
        if seeds_seen != args.indexvalue:
            continue
        parts = line.split()
        if len(parts) < 2:
            parser.error(
                "seed header %r in %s has no second field to take the motif "
                "width from" % (line.strip(), args.jfile))
        motifname = parts[0][1:]
        # The motif width is the length of the header's second field; that
        # many following lines hold the matrix rows.
        w = len(parts[1])
        rows = lines[i + 1:i + 1 + w]
        pwm_string = ' '.join(tok for row in rows for tok in row.split())
        pwm_guess = fromstring(pwm_string, sep=' ', dtype=float)
        pwm_guess = pwm_guess.reshape((w, 4))
        break

    if pwm_guess is None:
        # Previously this fell through and crashed with a NameError.
        parser.error(
            "seed index %d not found: %s contains %d seed(s)"
            % (args.indexvalue, args.jfile, seeds_seen))

    print('Using initial motif guess %s' % motifname)
    print('Adding %s pseudocounts and normalizing' % args.pseudocounts)
    pwm_guess = pwm_guess + args.pseudocounts
    pwm_guess = pwm_guess / pwm_guess.sum(axis=1)[:, newaxis]

    # make the directory (recursively)
    outdir = motifname
    outpre = outdir + "/"
    try:  # adapted from DREME.py by T. Bailey
        os.makedirs(outdir)
    except OSError as exc:
        # An already-existing directory is reused (the original hard-coded
        # clobber = True); any other failure is a real error.
        if exc.errno != errno.EEXIST:
            raise

    # Use DREME's SeqIO to read in FASTA to list
    seqs = sequence.convert_ambigs(
        sequence.readFASTA(args.fastafile, None, True))
    negseqs = sequence.convert_ambigs(
        sequence.readFASTA(args.negfastafile, None, True))

    (theta_motifs, theta_background_matrices, lambda_motifs, logevs,
     disc_pwms, disc_bkg, disc_logevs, disc_nsites) = extreme(
        seqs, negseqs, minsites, maxsites, pwm_guess, initialstep, tries)

    outputMEMEformat(disc_pwms, disc_bkg, disc_logevs, disc_nsites, outpre,
                     args.background)
    try:
        # The import doubles as the availability check for weblogolib.  The
        # loop stays inside the try, as in the original, so an ImportError
        # raised while writing a motif is reported the same way.
        from weblogolib import (LogoData, LogoOptions, LogoFormat,
                                png_formatter, eps_formatter,
                                unambiguous_dna_alphabet)
        motif_results = zip(theta_motifs, theta_background_matrices,
                            lambda_motifs, logevs)
        for k, result in enumerate(motif_results, 1):
            theta_motif, theta_background_matrix, lambda_motif, logev = result
            outputMotif(theta_motif, theta_background_matrix, lambda_motif,
                        logev, k, outpre)
    except ImportError:
        print("You do not have Weblogolib, so sequence logos will not be made")

    if args.saveseqs:
        print("Saving Positive sequences to Positive_seq.fa")
        with open("Positive_seq.fa", "w") as pos_file:
            for number, seq in enumerate(seqs, 1):
                pos_file.write(">sequence" + str(number) + "\n")
                pos_file.write(seq + "\n")
        print("Saving Negative sequences to Negative_seq.fa")
        with open("Negative_seq.fa", "w") as neg_file:
            for number, seq in enumerate(negseqs, 1):
                neg_file.write(">sequence" + str(number) + "\n")
                neg_file.write(seq + "\n")

    print("Ended at:")
    print(time.ctime())
    stoptime = time.time()
    duration = stoptime - starttime
    print("Duration: %s" % duration)