Esempio n. 1
0
def main():
    usage = "usage: %prog [options] <input FASTA>"
    description = "The program applies a modified EXTREME algorithm to find motifs in a FASTA file. It accepts a positive sequence set, a negative sequence set, a list of seed PFMs, and an index number indicating which of the seed PFMs to use"
    parser = ArgumentParser(description=description)
    parser.add_argument('fastafile', metavar='f', help='FASTA file containing the sequences')
    parser.add_argument('negfastafile', metavar='g', help='Negative FASTA file. This is for comparison so that you know the motif you discovered is over-represented.')
    parser.add_argument('jfile', metavar='j', help='File containing PWM seeds')
    parser.add_argument('indexvalue', metavar='i', help='Which seed from the Minimal MEME Format file to use (it is an integer ranging from 1 to the total number of PFM seeds in your file)', type=int)
    parser.add_argument("-p", "--pseudocounts", help="Pseudo counts added to initial PFM guess. Default:0.0", type=float, default=0.0)
    parser.add_argument("-maxsites", dest="maxsites", help="Maximum number of expected sites for the motif. If not specified, defaults to 5 times number of initial predicted sites.", type=int, default=0)
    parser.add_argument("-minsites", dest="minsites", help="Minimum number of expected sites for the motif. Default: 10", type=int, default=10)
    parser.add_argument("-t", "--tries", dest="tries", help="Number of tries for each motif discovered. The fudge factor is changed until the number of discovered sites is in the \"acceptable\" range", type=int, default=15)
    parser.add_argument("-s", "--seed", dest="seed", help="Random seed", type=int, default=1)
    parser.add_argument("-saveseqs", "--saveseqs", dest="saveseqs", help="If specified, save sequences to current directory", action='store_true')
    import time
    print "Started at:"
    print time.ctime()
    starttime = time.time()
    args = parser.parse_args()
    seed = args.seed
    minsites = args.minsites
    maxsites = args.maxsites
    random.seed(seed)
    jfile = open(args.jfile,'r')
    from numpy import fromstring
    from string import join
    lines = jfile.readlines()
    j = 0
    for i in range(len(lines)):
        line = lines[i]
        if '>' in line:#This is a name line, so read in next lines for matrix
            j += 1
            if j == args.indexvalue:#at the desired index
                parts = lines[i].split()
                pos_cs = parts[1]
                motifname = parts[0][1:]
                w = len(pos_cs)
                strlines = lines[i+1:i+1+w]
                pwm_string = ''
                for strline in strlines:
                    strparts = strline.split()
                    for strpart in strparts:
                        pwm_string += strpart + ' '
                #print pwm_string
                pwm_guess = fromstring(pwm_string,sep=' ',dtype=float)
                pwm_guess = pwm_guess.reshape((w,4))
                break
    print 'Using initial motif guess',motifname
    print 'Adding',str(args.pseudocounts),'pseudocounts and normalizing'
    pwm_guess = pwm_guess + args.pseudocounts
    pwm_guess = pwm_guess/pwm_guess.sum(axis=1)[:,newaxis]
    jfile.close() 
   
    # make the directory (recursively)
    import os
    outdir = motifname
    outpre = outdir + "/"
    clobber = True
    try:#adapted from DREME.py by T. Bailey
        os.makedirs(outdir)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            if not clobber:
                print >> sys.stderr, ("output directory (%s) already exists "
                "but EXTREME was not told to clobber it") % (outdir); sys.exit(1)
        else: raise
    #Use DREME's SeqIO to read in FASTA to list
    seqs = sequence.convert_ambigs(sequence.readFASTA(args.fastafile, None, True))
    #print seqs
    negseqs = sequence.convert_ambigs(sequence.readFASTA(args.negfastafile, None, True))
    tries = args.tries
    theta_motifs, theta_background_matrices, lambda_motifs, logevs, disc_pwms, disc_logevs, disc_nsites = extreme(seqs,negseqs,minsites,maxsites,pwm_guess,tries)
    k = 1
    outputMEMEformat(disc_pwms, disc_logevs, disc_nsites, outpre)
    try:
        from weblogolib import LogoData, LogoOptions, LogoFormat, png_formatter, eps_formatter, unambiguous_dna_alphabet
        for theta_motif, theta_background_matrix, lambda_motif, logev in zip(theta_motifs, theta_background_matrices, lambda_motifs, logevs):
            outputMotif(theta_motif, theta_background_matrix, lambda_motif, logev, k, outpre)
            k = k+1
    except ImportError:
        print "You do not have Weblogolib, so sequence logos will not be made"
    
    
    if args.saveseqs:
        print "Saving Positive sequences to Positive_seq.fa"
        pos_file = open("Positive_seq.fa","w")
        for s in range(len(seqs)):
            pos_file.write(">sequence"+str(s+1)+"\n")
            pos_file.write(seqs[s]+"\n")
        pos_file.close()
        print "Saving Negative sequences to Negative_seq.fa"
        neg_file = open("Negative_seq.fa","w")
        for s in range(len(negseqs)):
            neg_file.write(">sequence"+str(s+1)+"\n")
            neg_file.write(negseqs[s]+"\n")
        neg_file.close()
    print "Ended at:"
    print time.ctime()
    stoptime = time.time()
    duration = stoptime - starttime
    print "Duration:", duration
Esempio n. 2
0
def main():
    usage = "usage: %prog [options] <input FASTA> <negative FASTA>"
    description = "The program performs a DREME-like search for gapped k-mers"
    parser = ArgumentParser(description=description)
    parser.add_argument('fastafile',
                        metavar='f',
                        help='FASTA file containing the sequences')
    parser.add_argument('negativefile',
                        metavar='n',
                        help='FASTA file containing the negative sequences')
    parser.add_argument('outputfile', metavar='o', help='Output file')
    parser.add_argument(
        "-w",
        "--width",
        dest="width",
        help=
        "Width of the motif to search for. This makes the program only search for a motif of this width. Beware if greater than 8",
        type=int,
        default=0)
    parser.add_argument("-ming",
                        dest="mingap",
                        help="Minimum gap of k-mer to search for. Default: 0",
                        type=int,
                        default=0)
    parser.add_argument("-maxg",
                        dest="maxgap",
                        help="Maximum gap of k-mer to search for. Default: 12",
                        type=int,
                        default=10)
    parser.add_argument(
        "-l",
        dest="halflength",
        help=
        "Number of non-degenerate letters per half-site. Total number of non-degenerate letters is twice this number. Default: 4",
        type=int,
        default=4)
    parser.add_argument(
        "-minw",
        dest="minwidth",
        help=
        "Minimum width of the motif to search for. The default is 3, which is the width of the smallest core motif.",
        type=int,
        default=3)
    parser.add_argument(
        "-maxw",
        dest="maxwidth",
        help=
        "Maximum width of the motif to search for. This program does one refinement at this width (if greater than 8), and then picks the most significant short-mer. Default: 8",
        type=int,
        default=8)
    parser.add_argument(
        "-mink",
        dest="mink",
        help=
        "Minimum width of the core to search for. The default is 3, which is the width of the smallest core motif.",
        type=int,
        default=3)
    parser.add_argument(
        "-maxk",
        dest="maxk",
        help="Maximum width of the core to search for. Default: 8",
        type=int,
        default=8)
    parser.add_argument("-z",
                        "--zthresh",
                        dest="zthresh",
                        help="Corrected z-score threshold. Default: 5",
                        type=float,
                        default=5)
    parser.add_argument(
        "-minsites",
        "--minsites",
        dest="minsites",
        help="Minimum number of sites for a k-mer to be included. Default: 10",
        type=int,
        default=10)
    args = parser.parse_args()
    pos_seq_file_name = args.fastafile
    neg_seq_file_name = args.negativefile
    print 'Reading positive sequence file...'
    pos_seqs = sequence.convert_ambigs(
        sequence.readFASTA(pos_seq_file_name, None, True))
    print 'Reading negative sequence file...'
    neg_seqs = sequence.convert_ambigs(
        sequence.readFASTA(neg_seq_file_name, None, True))
    halflength = args.halflength
    ming = args.mingap
    maxg = args.maxgap
    zthresh = args.zthresh
    minsites = args.minsites
    find_kmers(pos_seqs, neg_seqs, halflength, ming, maxg, minsites, zthresh,
               args.outputfile)
Esempio n. 3
0
def main():
    usage = "usage: %prog [options] <input FASTA> <negative FASTA>"
    description = "The program performs a DREME-like search for gapped k-mers"
    parser = ArgumentParser(description=description)
    parser.add_argument("fastafile", metavar="f", help="FASTA file containing the sequences")
    parser.add_argument("negativefile", metavar="n", help="FASTA file containing the negative sequences")
    parser.add_argument("outputfile", metavar="o", help="Output file")
    parser.add_argument(
        "-w",
        "--width",
        dest="width",
        help="Width of the motif to search for. This makes the program only search for a motif of this width. Beware if greater than 8",
        type=int,
        default=0,
    )
    parser.add_argument(
        "-ming", dest="mingap", help="Minimum gap of k-mer to search for. Default: 0", type=int, default=0
    )
    parser.add_argument(
        "-maxg", dest="maxgap", help="Maximum gap of k-mer to search for. Default: 12", type=int, default=10
    )
    parser.add_argument(
        "-l",
        dest="halflength",
        help="Number of non-degenerate letters per half-site. Total number of non-degenerate letters is twice this number. Default: 4",
        type=int,
        default=4,
    )
    parser.add_argument(
        "-minw",
        dest="minwidth",
        help="Minimum width of the motif to search for. The default is 3, which is the width of the smallest core motif.",
        type=int,
        default=3,
    )
    parser.add_argument(
        "-maxw",
        dest="maxwidth",
        help="Maximum width of the motif to search for. This program does one refinement at this width (if greater than 8), and then picks the most significant short-mer. Default: 8",
        type=int,
        default=8,
    )
    parser.add_argument(
        "-mink",
        dest="mink",
        help="Minimum width of the core to search for. The default is 3, which is the width of the smallest core motif.",
        type=int,
        default=3,
    )
    parser.add_argument(
        "-maxk", dest="maxk", help="Maximum width of the core to search for. Default: 8", type=int, default=8
    )
    parser.add_argument(
        "-z", "--zthresh", dest="zthresh", help="Corrected z-score threshold. Default: 5", type=float, default=5
    )
    parser.add_argument(
        "-minsites",
        "--minsites",
        dest="minsites",
        help="Minimum number of sites for a k-mer to be included. Default: 10",
        type=int,
        default=10,
    )
    args = parser.parse_args()
    pos_seq_file_name = args.fastafile
    neg_seq_file_name = args.negativefile
    print "Reading positive sequence file..."
    pos_seqs = sequence.convert_ambigs(sequence.readFASTA(pos_seq_file_name, None, True))
    print "Reading negative sequence file..."
    neg_seqs = sequence.convert_ambigs(sequence.readFASTA(neg_seq_file_name, None, True))
    halflength = args.halflength
    ming = args.mingap
    maxg = args.maxgap
    zthresh = args.zthresh
    minsites = args.minsites
    find_kmers(pos_seqs, neg_seqs, halflength, ming, maxg, minsites, zthresh, args.outputfile)
Esempio n. 4
0
def main():
    usage = "usage: %prog [options] <input FASTA>"
    description = "The program applies a modified EXTREME algorithm to find motifs in a FASTA file. It accepts a positive sequence set, a negative sequence set, a list of seed PFMs, and an index number indicating which of the seed PFMs to use"
    parser = ArgumentParser(description=description)
    parser.add_argument('fastafile', metavar='f', help='FASTA file containing the sequences')
    parser.add_argument('negfastafile', metavar='g', help='Negative FASTA file. This is for comparison so that you know the motif you discovered is over-represented.')
    parser.add_argument('jfile', metavar='j', help='File containing PWM seeds')
    parser.add_argument('indexvalue', metavar='i', help='Which seed from the Minimal MEME Format file to use (it is an integer ranging from 1 to the total number of PFM seeds in your file)', type=int)
    parser.add_argument("-p", "--pseudocounts", help="Pseudo counts added to initial PFM guess. Default:0.0", type=float, default=0.0)
    parser.add_argument("-q", "--initialstep", help="The initial step size for the online EM algorithm. A VERY sensitive parameter. I get best success for ChIP size data (about 100,000 to 1,000,000 bps) with a step size of 0.05. For DNase footprinting, which usually has >5,000,000 bps, I find 0.02 works best. Default:0.05", type=float, default=0.05)    
    parser.add_argument("-maxsites", dest="maxsites", help="Maximum number of expected sites for the motif. If not specified, defaults to 5 times number of initial predicted sites.", type=int, default=0)
    parser.add_argument("-minsites", dest="minsites", help="Minimum number of expected sites for the motif. Default: 10", type=int, default=10)
    parser.add_argument("-t", "--tries", dest="tries", help="Number of tries for each motif discovered. The fudge factor is changed until the number of discovered sites is in the \"acceptable\" range", type=int, default=15)
    parser.add_argument("-s", "--seed", dest="seed", help="Random seed", type=int, default=1)
    parser.add_argument("-saveseqs", "--saveseqs", dest="saveseqs", help="If specified, save sequences to current directory", action='store_true')
    parser.add_argument("-b", "--background", dest="background", help="If specified, the minimal MEME output will use the calculated background probabilities instead of uniform probabilities.", action='store_true')
    import time
    print "Started at:"
    print time.ctime()
    starttime = time.time()
    args = parser.parse_args()
    seed = args.seed
    initialstep = args.initialstep
    minsites = args.minsites
    maxsites = args.maxsites
    random.seed(seed)
    jfile = open(args.jfile,'r')
    from numpy import fromstring
    from string import join
    lines = jfile.readlines()
    j = 0
    for i in range(len(lines)):
        line = lines[i]
        if '>' in line:#This is a name line, so read in next lines for matrix
            j += 1
            if j == args.indexvalue:#at the desired index
                parts = lines[i].split()
                pos_cs = parts[1]
                motifname = parts[0][1:]
                w = len(pos_cs)
                strlines = lines[i+1:i+1+w]
                pwm_string = ''
                for strline in strlines:
                    strparts = strline.split()
                    for strpart in strparts:
                        pwm_string += strpart + ' '
                #print pwm_string
                pwm_guess = fromstring(pwm_string,sep=' ',dtype=float)
                pwm_guess = pwm_guess.reshape((w,4))
                break
    print 'Using initial motif guess',motifname
    print 'Adding',str(args.pseudocounts),'pseudocounts and normalizing'
    pwm_guess = pwm_guess + args.pseudocounts
    pwm_guess = pwm_guess/pwm_guess.sum(axis=1)[:,newaxis]
    jfile.close() 
   
    # make the directory (recursively)
    import os
    outdir = motifname
    outpre = outdir + "/"
    clobber = True
    try:#adapted from DREME.py by T. Bailey
        os.makedirs(outdir)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            if not clobber:
                print >> sys.stderr, ("output directory (%s) already exists "
                "but EXTREME was not told to clobber it") % (outdir); sys.exit(1)
        else: raise
    #Use DREME's SeqIO to read in FASTA to list
    seqs = sequence.convert_ambigs(sequence.readFASTA(args.fastafile, None, True))
    #print seqs
    negseqs = sequence.convert_ambigs(sequence.readFASTA(args.negfastafile, None, True))
    tries = args.tries
    theta_motifs, theta_background_matrices, lambda_motifs, logevs, disc_pwms, disc_bkg, disc_logevs, disc_nsites = extreme(seqs,negseqs,minsites,maxsites,pwm_guess,initialstep,tries)
    k = 1
    outputMEMEformat(disc_pwms, disc_bkg, disc_logevs, disc_nsites, outpre, args.background)
    try:
        from weblogolib import LogoData, LogoOptions, LogoFormat, png_formatter, eps_formatter, unambiguous_dna_alphabet
        for theta_motif, theta_background_matrix, lambda_motif, logev in zip(theta_motifs, theta_background_matrices, lambda_motifs, logevs):
            outputMotif(theta_motif, theta_background_matrix, lambda_motif, logev, k, outpre)
            k = k+1
    except ImportError:
        print "You do not have Weblogolib, so sequence logos will not be made"
    
    
    if args.saveseqs:
        print "Saving Positive sequences to Positive_seq.fa"
        pos_file = open("Positive_seq.fa","w")
        for s in range(len(seqs)):
            pos_file.write(">sequence"+str(s+1)+"\n")
            pos_file.write(seqs[s]+"\n")
        pos_file.close()
        print "Saving Negative sequences to Negative_seq.fa"
        neg_file = open("Negative_seq.fa","w")
        for s in range(len(negseqs)):
            neg_file.write(">sequence"+str(s+1)+"\n")
            neg_file.write(negseqs[s]+"\n")
        neg_file.close()
    print "Ended at:"
    print time.ctime()
    stoptime = time.time()
    duration = stoptime - starttime
    print "Duration:", duration