Python PipelineMotifs Examples, PipelineMotifs Python Examples

Example #1

0

Show file

def exportMotifControlSequences(infile, outfile):
    '''Export control sequences flanking the intervals of a bed file.

    For each interval the left and right sequence segment of the
    same size is exported.  The work is delegated to
    ``PipelineMotifs.exportSequencesFromBedFile`` in "leftright" mode,
    using the masker configured as ``motifs_masker``.

    infile -- input file handed to the exporter
    outfile -- output file handed to the exporter
    '''
    masker = PARAMS['motifs_masker']
    PipelineMotifs.exportSequencesFromBedFile(
        infile,
        outfile,
        masker=masker,
        mode="leftright")

Example #2

0

Show file

def exportMotifDetectionSequences(infile, outfile):
    '''Export the sequences of a bed file for motif discovery.

    Thin wrapper around ``PipelineMotifs.exportSequencesFromBedFile``
    that passes the masker configured as ``motifs_masker``; no mode
    is given, so the exporter's default applies.

    This method requires the _interval tables.

    infile -- input file handed to the exporter
    outfile -- output file handed to the exporter
    '''
    masker = PARAMS['motifs_masker']
    PipelineMotifs.exportSequencesFromBedFile(
        infile,
        outfile,
        masker=masker)

Example #3

0

Show file

File: pipeline_motifs.py Project: santayana/cgat

def loadMast(infile, outfile):
    '''Parse a mast file and load it into the database.

    The actual work is delegated to ``PipelineMotifs.loadMAST``.

    Notes carried over from the original documentation: several
    motif runs are parsed and added to the same table, and columns
    for the control data are added as well.
    '''
    load_mast = PipelineMotifs.loadMAST
    load_mast(infile, outfile)

Example #4

0

Show file

File: pipeline_motifs.py Project: santayana/cgat

def runMast(infiles, outfile):
    '''Run mast on all intervals and motifs.

    The actual work is delegated to ``PipelineMotifs.runMAST``.

    Notes carried over from the original documentation: all results
    for an E-value up to 10000 are collected so that all sequences
    are output and MAST curves can be computed.  The value 10000 is
    a heuristic.
    '''
    run_mast = PipelineMotifs.runMAST
    run_mast(infiles, outfile)

Example #5

0

Show file

File: pipeline_motifs.py Project: BioinformaticsArchive/cgat

def loadMast(infile, outfile):
    '''Load the results of a mast file into the database.

    Delegates to ``PipelineMotifs.loadMAST``.

    As described in the original notes, several motif runs are
    parsed and added to the same table; columns for the control
    data are added as well.
    '''
    PipelineMotifs.loadMAST(infile, outfile)

Example #6

0

Show file

File: pipeline_motifs.py Project: BioinformaticsArchive/cgat

def runMast(infiles, outfile):
    '''Run mast on all intervals and motifs.

    Delegates to ``PipelineMotifs.runMAST``.

    As described in the original notes, all results for an E-value
    up to 10000 are collected so that all sequences are output and
    MAST curves can be computed.  10000 is a heuristic.
    '''
    PipelineMotifs.runMAST(infiles, outfile)

Example #7

0

Show file

File: pipeline_motifs.py Project: santayana/cgat

def exportMotifDiscoverySequences(infile, outfile):
    '''Write the sequences that are supplied to motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N
    ratio are supplied:

    1. The top *motifs_proportion* intervals sorted by peakval.
    2. Only a region +/- *motifs_halfwidth* around the peak.
    3. At least *motifs_min_sequences*.  If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    The track name is *infile* with the "_intervals.load" suffix
    removed.  When the writer reports zero sequences, a warning is
    issued and ``P.touch`` is called on *outfile*.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    # Exactly the mapping ``**locals()`` produces at this point,
    # spelled out so the names fed into the parameters are explicit.
    params = P.substituteParameters(
        infile=infile,
        outfile=outfile,
        track=track,
        dbhandle=dbhandle)

    options = dict(
        full=False,
        masker=P.asList(params['motifs_masker']),
        halfwidth=int(params["motifs_halfwidth"]),
        maxsize=int(params["motifs_max_size"]),
        proportion=params["motifs_proportion"],
        min_sequences=params["motifs_min_sequences"],
        num_sequences=params["motifs_num_sequences"],
        order=params['motifs_score'],
    )
    n_written = PipelineMotifs.writeSequencesForIntervals(
        track, outfile, dbhandle, **options)

    if n_written != 0:
        return

    E.warn("%s: no sequences - meme skipped" % outfile)
    P.touch(outfile)

Example #8

0

Show file

File: pipeline_motifs.py Project: BioinformaticsArchive/cgat

def exportMotifDiscoverySequences(infile, outfile):
    '''Export the sequences used for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N
    ratio are supplied:

    1. The top *motifs_proportion* intervals sorted by peakval.
    2. Only a region +/- *motifs_halfwidth* around the peak.
    3. At least *motifs_min_sequences*.  If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    If the writer reports zero sequences, a warning is issued and
    ``P.touch`` is called on *outfile*.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    # NOTE: ``locals()`` passes infile, outfile, track and dbhandle by
    # name; do not rename or add locals above this line.
    settings = P.substituteParameters(**locals())

    count = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(settings['motifs_masker']),
        halfwidth=int(settings["motifs_halfwidth"]),
        maxsize=int(settings["motifs_max_size"]),
        proportion=settings["motifs_proportion"],
        min_sequences=settings["motifs_min_sequences"],
        num_sequences=settings["motifs_num_sequences"],
        order=settings['motifs_score'])

    if count == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)

Example #9

0

Show file

File: pipeline_motifs.py Project: santayana/cgat

def runMeme(infile, outfile):
    '''Run MEME to find motifs in the discovery sequences.

    Delegates to ``PipelineMotifs.runMEMEOnSequences``.

    Notes carried over from the original documentation (none of this
    filtering happens in this function itself):

    * To increase the signal/noise ratio, MEME is not run on all
      intervals; only the top 10% of intervals (peakval) are used,
      and of those only the segment of 200 bp around the peak
      rather than the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker.
    '''
    # The stripped name is not used below.  The call itself is kept
    # because it may reject an unexpected suffix - TODO confirm.
    P.snip(infile, ".discovery.fasta")

    PipelineMotifs.runMEMEOnSequences(infile, outfile)

Example #10

0

Show file

File: pipeline_motifs.py Project: BioinformaticsArchive/cgat

def runMeme(infile, outfile):
    '''Find motifs by running MEME on *infile*.

    The work is done by ``PipelineMotifs.runMEMEOnSequences``.

    Background from the original documentation (the selection is not
    performed in this function):

    * In order to increase the signal/noise ratio, MEME is not run
      on all intervals but only on the top 10% of intervals
      (peakval).  Also, only the segment of 200 bp around the peak
      is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker.
    '''
    # Result intentionally discarded: nothing below uses the track
    # name.  The call is retained as it may check the file suffix -
    # TODO confirm.
    P.snip(infile, ".discovery.fasta")

    run_meme = PipelineMotifs.runMEMEOnSequences
    run_meme(infile, outfile)

Example #11

0

Show file

def buildBackgroundSequences(infile, outfile, npeaks, width, masker):
    '''Get the peak sequences, masked or not as specified in the ini file.

    The track name is *infile* with the "_intervals.load" suffix
    removed.  Sequences are written by
    ``PipelineMotifs.writeSequencesForIntervals`` with
    ``shift="leftright"``, ordered by 'peakval', without a proportion
    filter and limited by ``PARAMS["motifs_max_size"]``.

    infile -- name ending in "_intervals.load" identifying the track
    outfile -- file the sequences are written to
    npeaks -- forwarded as ``num_sequences``
    width -- forwarded as ``halfwidth``
    masker -- a single masker; it is wrapped in a list for the writer

    A warning is issued if no sequences were written.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=[masker],
        halfwidth=width,
        maxsize=int(PARAMS["motifs_max_size"]),
        proportion=None,
        num_sequences=npeaks,
        order='peakval',
        shift="leftright")

    if nseq == 0:
        # The original referenced the undefined name
        # ``outfile_background`` here, which raised NameError instead
        # of emitting the warning.
        E.warn("%s: no sequences in background" % outfile)

Example #12

0

Show file

File: pipeline_motifs.py Project: santayana/cgat

def runTomTom(infile, outfile):
    '''Compare ab-initio motifs against tomtom.

    Delegates to ``PipelineMotifs.runTomTom`` with the same
    arguments.
    '''
    compare_motifs = PipelineMotifs.runTomTom
    compare_motifs(infile, outfile)

Example #13

0

Show file

File: pipeline_motifs.py Project: BioinformaticsArchive/cgat

def runTomTom(infile, outfile):
    '''Compare ab-initio motifs against tomtom.

    Thin wrapper: *infile* and *outfile* are passed unchanged to
    ``PipelineMotifs.runTomTom``.
    '''
    PipelineMotifs.runTomTom(infile, outfile)