Example #1
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(track,
                                      tmpfasta,
                                      dbhandle,
                                      full=True,
                                      masker="dust",
                                      proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
    BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
    '''
        P.run()

    os.unlink(tmpfasta)
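Every example on this page ends with P.touch(): the CGAT pipelines record task completion by creating (or refreshing the timestamp of) an empty flag file that ruffus can check on the next run. A minimal sketch of that behaviour, assuming P.touch simply mimics the Unix touch command:

import os

def touch(filename):
    # opening in append mode creates the file if it is missing;
    # os.utime then bumps atime/mtime to the current time
    with open(filename, "a"):
        os.utime(filename, None)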
Example #3
def loadFastqc( infile, outfile ):
    '''load FASTQC stats.'''
    
    track = P.snip( infile, ".fastqc" )

    filename = os.path.join( PARAMS["exportdir"], "fastqc", track + "*_fastqc", "fastqc_data.txt" )

    for fn in glob.glob( filename ):
        prefix = os.path.basename( os.path.dirname( fn ) )
        results = []
        
        for name, status, header, data in FastqcSectionIterator(IOTools.openFile( fn )):
            # do not collect basic stats, see loadFastQCSummary
            if name == "Basic Statistics": continue

            parser = CSV2DB.buildParser()
            (options, args) = parser.parse_args([])
            options.tablename = prefix + "_" + re.sub(" ", "_", name ) 
            options.allow_empty = True

            inf = cStringIO.StringIO( "\n".join( [header] + data ) + "\n" )
            CSV2DB.run( inf, options )
            results.append( (name, status ) )

        # load status table
        parser = CSV2DB.buildParser()
        (options, args) = parser.parse_args([])
        options.tablename = prefix + "_status"
        options.allow_empty = True

        inf = cStringIO.StringIO( "\n".join( ["name\tstatus"] + ["\t".join( x ) for x in results ] ) + "\n" )
        CSV2DB.run( inf, options )

    P.touch( outfile )
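loadFastqc relies on FastqcSectionIterator to split fastqc_data.txt into its modules. A sketch of such an iterator, assuming the standard FastQC layout in which each section opens with ">>Name<tab>status" and closes with ">>END_MODULE" (an illustrative reimplementation, not the library code):

def FastqcSectionIterator(infile):
    # yield (name, status, header, data) for each ">>" section
    name, status, header, data = None, None, None, []
    for line in infile:
        line = line.rstrip("\n")
        if line.startswith(">>END_MODULE"):
            yield name, status, header, data
        elif line.startswith(">>"):
            name, status = line[2:].split("\t")[:2]
            header, data = None, []
        elif line.startswith("#"):
            header = line[1:]
        else:
            data.append(line)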
Example #4
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                               outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
           tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path): shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Example #6
def buildAssemblyBWAIndices(infile, outfile):
    '''
    build bwa indices
    '''
    statement = '''bwa index %(infile)s'''
    P.run()
    P.touch(outfile)
Example #7
def importGO(infile, outfile, suffix):
    '''import GO results into a table.'''

    x = "_expdiff.%s" % suffix
    assert infile.endswith(x)
    track, method, control = getExpressionMatch(infile[:-len(x)] + ".expdiff")

    if track == control: return

    tablename = "%(track)s_vs_%(control)s_%(method)s_%(suffix)s" % locals()

    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall |\
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              --allow-empty \
              --index=category \
              --index=goid \
              --table=%(tablename)s \
    > %(outfile)s
    '''
    P.run()
Example #9
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''gather BAM file alignment statistics using Picard '''

    to_cluster = True
    job_options = getPicardOptions()

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have problems if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s 
                       | python %(scriptsdir)s/bam2bam.py -v 0 --set-sequence --sam 
                       | CollectMultipleMetrics 
                                       INPUT=/dev/stdin 
                                       REFERENCE_SEQUENCE=%(genome_file)s
                                       ASSUME_SORTED=true 
                                       OUTPUT=%(outfile)s 
                                       VALIDATION_STRINGENCY=SILENT 
                       >& %(outfile)s'''

    P.run()
Example #10
File: PipelineGO.py  Project: yangjl/cgat
def runGOFromDatabase( outfile, outdir, 
                       statement_fg, 
                       statement_bg, 
                       go_file,
                       ontology_file = None,
                       samples = 1000 ):
    '''Take gene lists from the SQL database using
    ``statement_fg`` and ``statement_bg``.
    '''

    dbhandle = sqlite3.connect( PARAMS["database"] )
    
    cc = dbhandle.cursor()
    fg = set( [x[0] for x in cc.execute( statement_fg).fetchall() ] )
    bg = set( [x[0] for x in cc.execute( statement_bg).fetchall() ] )

    if len(fg) == 0:
        P.touch( outfile )
        return

    fg_file = os.path.join( outdir, "foreground" )
    bg_file = os.path.join( outdir, "background" )
    outf = open( fg_file, "w")
    outf.write("\n".join( map(str, fg ) ) + "\n" )
    outf.close()
    outf = open( bg_file, "w")
    outf.write("\n".join( map(str, bg ) ) + "\n" )
    outf.close()
    
    runGOFromFiles( outfile, outdir, 
                    fg_file, bg_file, 
                    go_file,
                    ontology_file = ontology_file,
                    samples = samples )
Example #12
def mergeEffectsPerGene( infile, outfile ):
    '''summarize effects on a per-gene level.'''
    
    tablename = outfile[:-len(".load")]

    dbhandle = connect()

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT 
           track,
           gene_id, 
           COUNT(*) AS ntranscripts,
           MIN(e.nalleles) AS min_nalleles,
           MAX(e.nalleles) AS max_nalleles,
           MIN(e.stop_min) AS min_stop_min,
           MAX(e.stop_min) AS max_stop_min,
           MIN(e.stop_max) AS min_stop_max,
           MAX(e.stop_max) AS max_stop_max,
           SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_knockout,
           SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1  
                     ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info as i, effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id, track
    ''' % locals()
    
    Database.executewait( dbhandle, "DROP TABLE IF EXISTS %(tablename)s" % locals() )
    Database.executewait( dbhandle, statement )
    Database.executewait( dbhandle, "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" % locals())
    dbhandle.commit()

    P.touch(outfile)
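The two CASE expressions implement the canonical nonsense-mediated decay (NMD) rule: a transcript is counted as knocked out when a premature stop codon ends the truncated CDS upstream of the last exon-exon junction. As a hypothetical Python helper (semantics inferred from the column names: stop_min/stop_max count codons, cds_len and last_exon_start are in bases):

def triggers_nmd(stop, cds_len, last_exon_start):
    # a premature stop triggers NMD if the truncated CDS ends
    # upstream of the start of the last exon
    return stop > 0 and cds_len - stop * 3 < last_exon_start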
Example #14
File: pipeline_idr.py  Project: jmadzo/cgat
def removeBamfiles(infiles, outfile):
    for bamfile in infiles:
        bam_index = bamfile + ".bai"
        os.unlink(bamfile)
        if os.path.exists(bam_index):
            os.unlink(bam_index)
    P.touch(outfile)
Example #16
def plotFalsePositiveRates(infile, outfile):
    '''
    barplot the false positive rates across
    taxonomic levels
    '''
    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")'''
      % infile)
    for i in [0, 1]:
        # specificity
        outf = P.snip(outfile, ".pdf") + ".%i.specificity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,],
             aes(x=reorder(level, fp_rate), y=fp_rate, fill=track))''' % i)
        R('''plot2 <- plot1 + geom_bar(position="dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(values=c("cadetblue", "slategray", "lightblue"))''')
        R('''ggsave("%s")''' % outf)

        # sensitivity
        outf = P.snip(outfile, ".pdf") + ".%i.sensitivity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,],
             aes(x=reorder(level, fp_rate), y=tp_rate, fill=track))''' % i)
        R('''plot2 <- plot1 + geom_bar(position="dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(values=c("cadetblue", "slategray", "lightblue"))''')
        R('''ggsave("%s")''' % outf)

    P.touch(outfile)
Example #17
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak 
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substituteParameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.asList(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
Example #18
def loadFilteredContigLengths(infile, outfile):
    '''
    load contig lengths
    '''
    outname = P.snip(os.path.dirname(infile), ".dir") + "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname)
    P.touch(outfile)
Example #22
def runMACS( infile, outfile ):
    '''run MACS for peak detection.

    The output bed files contain the P-value as their score field.
    '''
    to_cluster = True

    if infile.endswith( ".norm.bam"):

        track = infile[:-len(".norm.bam")]
        if track.startswith("control"):
            P.touch( outfile )
            return

        format = "bam"
        suffix = ".norm.bam"

    elif infile.endswith( ".bam"):

        track = infile[:-len(".bam")]
        if track.startswith("control"):
            P.touch( outfile )
            return

        format = "bam"
        suffix = ".norm.bam"
        
    elif infile.endswith(".bed.gz"):

        track = infile[:-len(".bed.gz")]
        if track.startswith("control"):
            outs = open( outfile, "w")
            outs.close()
            return

        format = "bed"
        suffix = ".bed.gz"
        
    control = getControl( track )

    if control is not None:
        control += suffix
    else:
        E.info("%s: no control for track %s" % (outfile, track) )

    if control: control = "-c %s" % control
    else: control = ""

    statement = '''
    macs -t %(infile)s %(control)s \
    --diag \
    --name=%(outfile)s \
    --format=%(format)s \
    %(macs_options)s >& %(outfile)s''' 
    
    P.run( **dict( locals().items() + PARAMS.items() ) )
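The final line spells out the mechanism behind every statement = ...; P.run() pair on this page: P.run() picks up the string named statement in the caller's frame and %-interpolates it with the caller's locals merged with the global PARAMS dictionary before executing it through the shell. A minimal sketch of that contract (ignoring the cluster submission the real P.run() also handles):

import subprocess

def run(statement, **substitutions):
    # fill in the %(name)s placeholders and execute through the shell
    subprocess.check_call(statement % substitutions, shell=True)

So %(tmpfasta)s in Example #1 resolves to a local variable, while %(macs_options)s above comes out of PARAMS.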
Example #23
def loadContigLengths(infile, outfile):
    '''
    load contig lengths
    '''
    outname = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname, "--index=scaffold_name")
    P.touch(outfile)
Example #24
def buildAssemblyBowtie2Indices(infile, outfile):
    '''
    build bowtie2 indices
    '''
    outbase = P.snip(infile, ".fa")
    statement = '''bowtie2-build -f %(infile)s %(outbase)s'''
    P.run()
    P.touch(outfile)
Example #25
def loadContigSummary(infile, outfile):
    '''
    load contig summary stats for each assembler
    '''
    outname = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + os.path.basename(infile) + ".load"
    P.load(infile, outname)
    P.touch(outfile)
Example #26
def loadContigLengths(infile, outfile):
    '''
    load contig lengths
    '''
    outname = P.snip(os.path.dirname(infile), ".dir") + "_" + P.snip(
        os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname)
    P.touch(outfile)
Example #27
def loadContigGCContent(infile, outfile):
    '''
    load contig GC content
    '''
    outname = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname, "--index=id")
    P.touch(outfile)
Example #28
def estimateInsertSizes(infiles, outfile):
    """
    Plots the internal insert size distribution and calculates the average
    and standard deviation based on the FWHM.
    """

    infiles = " ".join(infiles)

    to_cluster = USECLUSTER

    statement = '''
    zcat %(infiles)s | python %(rmaadir)s/return_insert_sizes.py > %(outfile)s
    '''
    P.run()
    # required to resolve strange timing issues
    # when trying to open the file in the next command
    P.touch(outfile)
    ins_sizes_array = numpy.array(
        [map(int, x[:-1].split("\t")) for x in open(outfile, "r")])

    max_freq = ins_sizes_array[:, 1].max()
    half_max = float(max_freq) / 2.0
    E.info("maximum frequency=%i, halfwidth=%i" % (max_freq, half_max))

    # get half width coordinates
    for bin, value in ins_sizes_array:
        if value < half_max:
            continue
        FWHMmin = bin
        break

    for bin, value in ins_sizes_array[::-1]:
        if value < half_max:
            continue
        FWHMmax = bin
        break

    FWHM = FWHMmax - FWHMmin
    std_dev = int(float(FWHM) / 2.3548)
    ins_size = int(FWHMmin + float(FWHM) / 2.0) - \
        PARAMS["remove_bases_from_right"]

    E.info("For %s FWHM is %i ranging from %i to %i. std dev %i and "
           "ins size %i" % (infiles, FWHM, FWHMmin, FWHMmax,
                            std_dev, ins_size))

    x, y = [], []

    for bin, value in ins_sizes_array:
        if FWHMmin - 2 * std_dev < bin < FWHMmax + 2 * std_dev:
            x.append(bin)
            y.append(value)

    if PLOT:
        pylab.title("Insert size")
        pylab.xlabel('inner distance between sequenced ends')
        pylab.ylabel('frequency based on unique eland mappings')
        pylab.scatter(x, y)
        pylab.savefig(outfile + ".png")

    fwhm_file = open(outfile + ".txt", "w")
    fwhm_file.write("%s\t%s\n" % (ins_size, std_dev))
    fwhm_file.close()
Example #29
File: pipeline_idr.py  Project: jmadzo/cgat
def reMergeBamfiles(infiles, sentinal):
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinal)
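IDR.filterBadLibraries is not shown on this page; a plausible sketch of its contract, inferred from the call site (bad_samples holds substrings taken from the options_to_remove parameter):

def filterBadLibraries(infiles, bad_samples):
    # hypothetical reimplementation: drop every file whose name
    # mentions one of the unwanted sample identifiers
    return [fn for fn in infiles
            if not any(bad in fn for bad in bad_samples)]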
Example #30
def buildBwaIndices(infile, outfile):
    '''
    build bwa indices
    '''
    to_cluster = True
    statement = '''bwa index %(infile)s'''
    P.run()
    P.touch(outfile)
Example #31
def buildAssemblyBowtieIndices(infile, outfile):
    '''
    build bowtie indices
    '''
    outbase = TRACKS.getTracks()[0]
    directory = os.path.dirname(infile)
    statement = '''bowtie-build -f %(infile)s %(directory)s/%(outbase)s'''
    P.run()
    P.touch(outfile)
Example #33
def buildBowtie2Indices(infile, outfile):
    '''
    build bowtie2 indices
    '''
    to_cluster = True
    outbase = P.snip(infile, ".fa")
    statement = '''bowtie2-build -f %(infile)s %(outbase)s'''
    P.run()
    P.touch(outfile)
Example #36
def buildAssemblyBowtieIndices(infile, outfile):
    '''
    build bowtie indices
    '''
    outbase = P.snip(infile, ".fa")
    statement = '''bowtie-build -f %(infile)s %(outbase)s'''
    P.run()
    P.touch(outfile)
Example #37
File: pipeline_idr.py  Project: jmadzo/cgat
def poolSampleBamfiles(infiles, sentinal):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"

    IDR.mergeBams(infiles, outfile)

    P.touch(sentinal)
Example #39
def callPeaksOnIndividualReplicates(infile, outfile):
    infile = P.snip(infile, ".sentinel") + ".bam"
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks
    IDR.callIDRPeaks(infile, outfile, PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"], PARAMS_PEAKCALLER)

    P.touch(outfile)
Example #40
def splitPooledBamfiles(infile, sentinel):
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)
Example #41
def plotRNASEQTagData( infiles, outfile ):
    '''plot tag count statistics for the differential expression design.'''

    design_file = infiles[0]
    geneset_file = infiles[1]
    bamfiles = infiles[2]

    #IMS: now running on feature counts
    infile = os.path.join( "feature_counts.dir", P.snip( geneset_file, ".gtf.gz") + ".feature_counts.tsv.gz" )
    Expression.plotTagStats( infile, design_file, outfile )

    P.touch( outfile )
Example #42
def poolInputBamfiles(infiles, sentinal):
    """
    Merge filtered input files for each tissue, with the option of excluding
    undesirable libraries.
    """
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"
    bad_samples = PARAMS["filter_remove_inputs"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinal)
Example #44
File: pipeline_idr.py  Project: jmadzo/cgat
def splitPooledBamfiles(infile, sentinal):
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'
    module = P.snip(IDR.__file__, ".py")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    P.touch(sentinal)
Example #45
File: pipeline_idr.py  Project: jmadzo/cgat
def callPeaksOnPooledReplicates(infile, outfile):
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks on pooled replicates
    IDR.callIDRPeaks(infile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     PARAMS_PEAKCALLER,
                     pseudoreplicate=False)

    P.touch(outfile)
Example #46
def loadEdgeR( infile, outfile ):
    '''load EdgeR per-chunk summary stats.'''

    for fn in glob.glob( infile + "*_summary.tsv" ):
        prefix = P.snip(fn[len(infile)+1:], "_summary.tsv")

        P.load( fn, 
                prefix + ".deseq_summary.load", 
                collapse = 0,
                transpose = "sample")

    P.touch( outfile )
Example #49
def makeAnnotatorGeneSets( infile, outfile, slice ):
    '''compute annotator overlap between sets.
    '''
    
    workspaces = ("genomic", "alignable", slice )

    track = infile[:-len(".gtf.gz")]

    infiles = ANNOTATOR_TRACKS

    related = getRelatedTracks( infile, infiles )

    if related:
        E.info("removing related tracks %s from %s" % \
                   ( related, infile ) )
        related = set(related)
        infiles = [x for x in TRACKS if x not in related ]
        
    tmpdir = tempfile.mkdtemp( dir = os.getcwd() )

    annotations = os.path.join( tmpdir, "annotations")
    PAnnotator.buildGeneSetAnnotations( infiles,
                                        annotations,
                                        slice )

    segments = PAnnotator.buildAnnotatorSlicedSegments( tmpdir, 
                                                        outfile, 
                                                        track, 
                                                        slice )

    if not segments:
        E.warn( "no segments for %s - no annotator results" % outfile )
        shutil.rmtree( tmpdir )
        P.touch( outfile )
        return

    workspaces, synonyms = PAnnotator.buildAnnotatorWorkSpace( tmpdir, 
                                                               outfile,
                                                               workspaces = workspaces,
                                                               gc_control = True )
    
    PAnnotator.runAnnotator( tmpdir, 
                             outfile, 
                             annotations, 
                             segments, 
                             workspaces, 
                             synonyms )

    shutil.rmtree( tmpdir )
Example #50
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the 
    top 10% of intervals (peakval) are used. 
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "meme",
                               outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
Example #51
def splitBamfiles(infile, sentinel):
    """
    For all tracks, split the filtered bamfile in two using pysam
    """
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)
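The try/except above leans on P.snip raising ValueError when the suffix does not match, so the lookup works whether IDR.__file__ points at the .py source or a compiled .pyc file. A sketch of that behaviour (the error message is illustrative):

def snip(s, suffix):
    # strip ``suffix`` from ``s``; unlike plain slicing, fail loudly
    # when the suffix is absent
    if not s.endswith(suffix):
        raise ValueError("'%s' does not end in '%s'" % (s, suffix))
    return s[:-len(suffix)]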
Example #52
def loadGeneSummary(infile, outfile):
    '''summarize binding information per gene.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    cc.execute("""DROP TABLE IF EXISTS %(table)s """ % locals())
    cc.execute("""CREATE TABLE %(table)s AS
            SELECT gene_id, SUM( tata ) AS tata, SUM( cpg ) AS cpg 
                   FROM promotorinfo_transcripts AS p,
                        annotations.transcript_info as i
                   WHERE i.transcript_id = p.transcript_id
                   GROUP BY gene_id""" % locals())
    cc.close()

    P.touch(outfile)
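Several examples (this one and Example #12) query tables qualified with annotations. through a connect() helper that is not shown. A sketch under the assumption that it opens the project database and attaches the annotations database under that alias (the annotations_database key is illustrative):

import sqlite3

def connect():
    # open the project database and attach the annotations database
    # under the alias used in the queries above (assumed layout)
    dbh = sqlite3.connect(PARAMS["database"])
    dbh.execute("ATTACH DATABASE '%s' AS annotations" %
                PARAMS["annotations_database"])
    return dbh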