def generatePSP(positives, negatives, outfile):
    '''Generate a discriminative PSP file from positives and negatives.

    The resulting position specific prior (PSP) file can be used
    to run MEME in discriminative mode.

    ``psp-gen`` is run with ``positives`` passed to ``-pos`` and
    ``negatives`` passed to ``-neg``; its standard output is redirected
    to ``outfile``.  If either input contains fewer than two sequences,
    ``psp-gen`` is not run: a warning is logged and ``outfile`` is
    touched instead.

    :param positives: filename of the fasta file with the positive set.
    :param negatives: filename of the fasta file with the negative set.
    :param outfile: filename the output of ``psp-gen`` is written to.
    '''

    # NOTE: the local names used as %(...)s placeholders in the statement
    # below (positives, negatives, psp_options, outfile) must keep their
    # names; P.run() is presumably what substitutes them.
    psp_options = PARAMS["psp_options"]

    nseqs_pos = int(FastaIterator.count(positives))
    nseqs_neg = int(FastaIterator.count(negatives))

    if nseqs_pos < 2 or nseqs_neg < 2:
        # The two literals are concatenated by the parser, so the first
        # one needs a trailing space to keep the words apart.
        E.warn("%s: input files do not have sufficient sequences "
               "to run psp-gen, skipping" % outfile)
        P.touch(outfile)
        return

    # get appropriate options from meme options: reverse complement
    # handling follows the "meme_revcomp" setting (on when not configured)
    if PARAMS.get("meme_revcomp", True):
        psp_options += " -revcomp"

    statement = '''psp-gen -pos %(positives)s
                           -neg %(negatives)s
                           %(psp_options)s
                   > %(outfile)s '''

    P.run(statement)
def runDREME(infile, outfile, neg_file="", options=""):
    '''Discover motifs in a fasta file with DREME.

    When ``neg_file`` is given it is handed to DREME through ``-n`` as
    the negative sequence set; without it no ``-n`` option is added
    (DREME then falls back to its default of shuffling the input).

    DREME is skipped, with a warning and a touched ``outfile``, when
    ``infile`` or a supplied ``neg_file`` holds fewer than two sequences.

    :param infile: filename of the fasta file with the sequences to scan.
    :param outfile: name used for the log file (``outfile`` + ``.log``),
        for the export location and for result collection.
    :param neg_file: optional filename of a fasta file of negatives.
    :param options: additional command line options for DREME.
    '''

    # Guard: the primary set needs at least two sequences.
    if int(FastaIterator.count(infile)) < 2:
        E.warn("%s: less than 2 sequences - dreme skipped" % outfile)
        P.touch(outfile)
        return

    # Guard: the same holds for the negative set, when one is supplied.
    if neg_file:
        if int(FastaIterator.count(neg_file)) < 2:
            E.warn(
                "%s: less than 2 sequences in negatives file - dreme skipped" %
                outfile)
            P.touch(outfile)
            return
        # From here on neg_file holds the ready-made command line option.
        neg_file = "-n %s" % neg_file

    # logfile, tmpdir, neg_file, infile and options are referenced by name
    # from the statement template and therefore must not be renamed.
    logfile = outfile + ".log"
    export_root = os.path.abspath(PARAMS["exportdir"])
    target_path = os.path.join(export_root, outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    dreme -p %(infile)s %(neg_file)s -png
        -oc %(tmpdir)s
            %(dreme_options)s
            %(options)s
       > %(logfile)s
    '''

    P.run(statement)

    # Hand the DREME output in the temporary directory to the collector.
    collectMEMEResults(tmpdir, target_path, outfile, method="dreme")
def runMemeCHIP(infile, outfile, motifs=None):
    '''Run the MEME-ChIP pipeline on the input file.

    Optional motif database files can be supplied as a list; each one
    is passed to ``meme-chip`` with its own ``-db`` option.

    If ``infile`` contains no sequences, ``meme-chip`` is not run: a
    warning is logged and ``outfile`` is touched instead.

    :param infile: filename of the fasta file with the input sequences.
    :param outfile: name used for the log file (``outfile`` + ``.log``),
        for the export location and for result collection.
    :param motifs: optional list of motif database filenames.
    '''

    if motifs:
        motifs = " ".join("-db %s" % motif for motif in motifs)
    else:
        motifs = " "

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        # The format string needs its argument, otherwise a literal
        # "%s" ends up in the log.
        E.warn("%s: no sequences - meme-chip skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.getTempDir(".")

    # NOTE(review): the statement passes "meme_threads" to -p while the
    # job below is sized from "memechip_threads" - confirm which of the
    # two settings is intended.
    statement = '''
    meme-chip %(infile)s
             -p %(meme_threads)s 
             -oc %(tmpdir)s
             -nmeme %(memechip_nmeme)s
             %(memechip_options)s     
             %(motifs)s > %(outfile)s.log '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in cluster_options
    # through job_options
    #
    # These locals are not referenced again in this function; presumably
    # P.run() picks them up from the local scope, in the same way it
    # finds ``statement`` - do not rename or remove them.
    if int(PARAMS["memechip_threads"]) != 1:
        job_options = str(PARAMS["memechip_job_options"])
        job_threads = int(PARAMS["memechip_threads"])
        cluster_parallel_environment = str(
            PARAMS["memechip_cluster_parallel_environment"])

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile, method="memechip")
Beispiel #4
0
def runMEMEOnSequences(infile, outfile):
    '''Search the sequences in a fasta file for motifs with MEME.

    MEME is invoked in DNA mode with reverse complement search
    (``-dna -revcomp``); model, number of motifs, maximum dataset size
    and further options are filled in from the ``meme_model``,
    ``meme_nmotifs``, ``motifs_max_size`` and ``meme_options``
    placeholders.  Standard output goes to ``outfile`` + ``.log``.

    An input without sequences is not analysed: a warning is logged and
    ``outfile`` is touched instead.

    NOTE(review): no selection of top intervals, trimming around peaks
    or masking happens in this function; presumably ``infile`` has been
    prepared accordingly by the caller - confirm.

    :param infile: filename of the fasta file with the input sequences.
    :param outfile: name used for the log file, the export location
        below ``<exportdir>/meme`` and for result collection.
    '''
    # Not referenced again in this function; presumably read by P.run()
    # from the local scope - keep the name.
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    # Nothing to do for an empty input file.
    if int(FastaIterator.count(infile)) == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    export_root = os.path.abspath(PARAMS["exportdir"])
    target_path = os.path.join(export_root, "meme", outfile)
    tmpdir = P.getTempDir(".")

    # The placeholders are resolved when the statement is run, so
    # infile, tmpdir and outfile must keep their names.
    statement = '''
        meme %(infile)s -dna -revcomp 
                        -mod %(meme_model)s 
                        -nmotifs %(meme_nmotifs)s 
                        -oc %(tmpdir)s 
                        -maxsize %(motifs_max_size)s 
                        %(meme_options)s 
       > %(outfile)s.log
    '''

    P.run()

    # Hand the MEME output in the temporary directory to the collector.
    collectMEMEResults(tmpdir, target_path, outfile)
Beispiel #5
0
def runMEMEOnSequences(infile, outfile):
    '''Run MEME motif discovery on the sequences of a fasta file.

    The command is ``meme <infile> -dna -revcomp`` writing its results
    into a temporary directory (``-oc``); ``-mod``, ``-nmotifs``,
    ``-maxsize`` and any extra options come from the ``meme_model``,
    ``meme_nmotifs``, ``motifs_max_size`` and ``meme_options``
    placeholders.  Standard output is redirected to
    ``outfile`` + ``.log``.

    When ``infile`` holds no sequences MEME is skipped; a warning is
    logged and ``outfile`` is touched.

    NOTE(review): interval selection, peak-centred trimming and masking
    are not performed here; presumably they are applied when ``infile``
    is created - confirm against the caller.

    :param infile: filename of the fasta file with the input sequences.
    :param outfile: name used for the log file, the export location
        below ``<exportdir>/meme`` and for result collection.
    '''
    # Not used further in this function; presumably consumed by P.run()
    # through the local scope, so the name has to stay.
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    sequence_count = int(FastaIterator.count(infile))
    if not sequence_count:
        # Empty input: leave an empty output behind and stop.
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]),
        "meme",
        outfile)
    tmpdir = P.getTempDir(".")

    # infile, tmpdir and outfile are looked up by name when the
    # statement is run.
    statement = '''
        meme %(infile)s -dna -revcomp 
                        -mod %(meme_model)s 
                        -nmotifs %(meme_nmotifs)s 
                        -oc %(tmpdir)s 
                        -maxsize %(motifs_max_size)s 
                        %(meme_options)s 
       > %(outfile)s.log
    '''

    P.run()

    # Pass the results in the temporary directory on for collection.
    collectMEMEResults(tmpdir, target_path, outfile)
Beispiel #6
0
def subsampleNReadsFromFasta(infile, outfile, nreads, logfile=""):
    '''Subsample a fasta file down to approximately ``nreads`` sequences.

    The sampling proportion handed to ``fasta2fasta.py`` is
    ``nreads / <number of sequences in infile>``, capped at 1 when more
    sequences are requested than are available.

    :param infile: filename of the fasta file to sample from.
    :param outfile: filename passed to ``fasta2fasta.py`` via ``-S``.
    :param nreads: number of sequences requested.
    :param logfile: optional log filename, passed on via ``-L``.
    '''

    checkParams()

    # Cast to int, as every other user of FastaIterator.count in this
    # file does, so the comparison below is always numeric.
    nseqs = int(FastaIterator.count(infile))

    # An empty input has nothing to sample from; use a proportion of 1
    # instead of dividing by zero.
    if nseqs == 0 or nreads > nseqs:
        prop = 1
    else:
        prop = float(nreads) / float(nseqs)

    if logfile:
        logfile = "-L %s" % logfile

    # infile, logfile, prop and outfile are referenced by name from the
    # statement and must not be renamed.
    statement = ''' python %(scriptsdir)s/fasta2fasta.py 
                     -I %(infile)s
                     %(logfile)s
                     -m sample
                     --sample-proportion=%(prop)s
                     -S %(outfile)s '''

    P.run()
def subsampleNReadsFromFasta(infile, outfile, nreads, logfile=""):
    '''Subsample a fasta file down to approximately ``nreads`` sequences.

    ``fasta2fasta.py`` is run in ``sample`` mode with a proportion of
    ``nreads / <number of sequences in infile>``.  The proportion is 1
    when ``nreads`` exceeds the number of sequences available.

    :param infile: filename of the fasta file to sample from.
    :param outfile: filename passed to ``fasta2fasta.py`` via ``-S``.
    :param nreads: number of sequences requested.
    :param logfile: optional log filename, passed on via ``-L``.
    '''

    checkParams()

    # The other functions in this file wrap FastaIterator.count in
    # int(); do the same here so that the comparison is numeric.
    nseqs = int(FastaIterator.count(infile))

    # Guard against a division by zero for an input without sequences.
    if nseqs == 0 or nreads > nseqs:
        prop = 1
    else:
        prop = float(nreads) / float(nseqs)

    if logfile:
        logfile = "-L %s" % logfile

    # The statement refers to infile, logfile, prop and outfile by name;
    # keep these local names unchanged.
    statement = ''' python %(scriptsdir)s/fasta2fasta.py 
                     -I %(infile)s
                     %(logfile)s
                     -m sample
                     --sample-proportion=%(prop)s
                     -S %(outfile)s '''

    P.run()
def runMEMEOnSequences(infile, outfile, background=None, psp=None):
    '''Run MEME on fasta sequences to find motifs.

    By default MEME calculates a zero-th order background
    model from the nucleotide frequencies in the input set.

    To use a different background set, a background
    file created by fasta-get-markov must be supplied.

    To perform discriminative analysis a position specific
    prior (psp) file must be provided. This can be generated
    using generatePSP.

    If ``infile`` contains fewer than two sequences MEME is not run:
    a warning is logged and ``outfile`` is touched instead.

    :param infile: filename of the fasta file with the input sequences.
    :param outfile: name used for the log file (``outfile`` + ``.log``),
        for the export location and for result collection.
    :param background: optional background model file (``-bfile``).
    :param psp: optional position specific prior file (``-psp``).
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs < 2:
        E.warn("%s: less than 2 sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # Get the total length of the sequences to decide the memory.
    # The with-block closes the file, no explicit close() is required.
    with IOTools.open_file(infile, "r") as fasta_reader:
        total_seqs_length = sum(
            len(fasta_seq.sequence)
            for fasta_seq in FastaIterator.iterate(fasta_reader))

    # If the length of all sequences is higher than 160,000bp
    # up the memory.  job_memory is not referenced again in this
    # function; presumably P.run() reads it from the local scope.
    if total_seqs_length > 160000:
        job_memory = "4G"
    else:
        job_memory = "2G"

    # Reverse complement search follows the "meme_revcomp" setting and
    # is switched on when the setting is absent.
    if PARAMS.get("meme_revcomp", True):
        revcomp = "-revcomp"
    else:
        revcomp = ""

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")
    if background:
        background_model = "-bfile %s" % background
    else:
        background_model = ""

    if psp:
        E.info("Running MEME in descriminative mode")
        psp_file = "-psp %s" % psp
    else:
        psp_file = ""

    # infile, revcomp, tmpdir, background_model, psp_file and outfile
    # are referenced by name from the statement; do not rename them.
    statement = '''
    meme %(infile)s -dna %(revcomp)s
    -p %(meme_threads)s
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(meme_max_size)s
    %(background_model)s
    %(psp_file)s
    %(meme_options)s
       2> %(outfile)s.log
    '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in cluster_options
    # through job_options
    #
    # As with job_memory, these locals are presumably consumed by P.run().
    if int(PARAMS["meme_threads"]) != 1:
        job_options = str(PARAMS["meme_job_options"])
        job_threads = int(PARAMS["meme_threads"])
        cluster_parallel_environment = str(
            PARAMS["meme_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)