def get_gc(qsorted_bam_file, reference_fasta, prefix, java_heap=None):
    '''
    Collect GC bias metrics with picard tools (CollectGcBiasMetrics).

    Note that the reference MUST be the same fasta file that generated
    the bowtie indices. Assumes picard was already loaded into space
    (module add picard-tools).

    Args:
        qsorted_bam_file: path of the BAM file handed to picard as I=.
        reference_fasta: path of the reference fasta handed to picard
            as R=.
        prefix: output path prefix; '.gc.txt', '.gcPlot.pdf' and
            '.gcSummary.txt' are appended to build the output names.
        java_heap: value for the JVM -Xmx option (e.g. '4G'). When None
            the heap limit defaults to 10G.

    Returns:
        Tuple (output_file, plot_file, summary_file) with the paths
        passed to picard. The command is run best-effort: the paths are
        returned even when the command exits with a non-zero status
        (a warning is logged in that case).
    '''
    import shlex  # local import so the block does not rely on file-level imports

    def _quoted(value):
        # Shell-escape a caller-supplied value so that spaces or shell
        # metacharacters cannot split or alter the command line.
        return shlex.quote(str(value))

    logging.info('Getting GC bias...')
    output_file = '{0}.gc.txt'.format(prefix)
    plot_file = '{0}.gcPlot.pdf'.format(prefix)
    summary_file = '{0}.gcSummary.txt'.format(prefix)
    if java_heap is None:
        java_heap_param = '-Xmx10G'
    else:
        java_heap_param = '-Xmx{}'.format(java_heap)
    # NOTE(review): the value of locate_picard() is interpolated as-is
    # because its exact form is not visible here - confirm whether it
    # needs quoting as well.
    get_gc_metrics = ('java {6} -XX:ParallelGCThreads=1 -jar '
                      '{5} '
                      'CollectGcBiasMetrics R={0} I={1} O={2} '
                      'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE '
                      'VERBOSITY=ERROR QUIET=TRUE '
                      'ASSUME_SORTED=FALSE '
                      'CHART={3} S={4}').format(_quoted(reference_fasta),
                                                _quoted(qsorted_bam_file),
                                                _quoted(output_file),
                                                _quoted(plot_file),
                                                _quoted(summary_file),
                                                locate_picard(),
                                                _quoted(java_heap_param))
    logging.info(get_gc_metrics)
    exit_status = os.system(get_gc_metrics)
    if exit_status != 0:
        # Keep the original best-effort behaviour, but do not hide the
        # failure from whoever reads the log.
        logging.warning('CollectGcBiasMetrics exited with status %s',
                        exit_status)
    return output_file, plot_file, summary_file
Esempio n. 2
0
def run_preseq(bam_w_dups, prefix):
    '''
    Runs preseq. Look at preseq data output to get PBC/NRF.

    The input BAM is first sorted with samtools into
    '<prefix>.sorted.bam'; 'preseq lc_extrap' is then run on that sorted
    copy, and the sorted copy is deleted afterwards.

    Args:
        bam_w_dups: path of the BAM file handed to 'samtools sort'.
        prefix: output path prefix used for the temporary sorted BAM and
            for the '.preseq.dat' / '.preseq.log' outputs.

    Returns:
        Tuple (preseq_data, preseq_log) with the output paths. Both
        commands are run best-effort: failures are logged as warnings
        and the paths are returned regardless.
    '''
    import shlex  # local import so the block does not rely on file-level imports

    def _quoted(value):
        # Shell-escape a path so spaces/metacharacters stay one argument.
        return shlex.quote(str(value))

    sorted_bam = '{0}.sorted.bam'.format(prefix)
    preseq_data = '{0}.preseq.dat'.format(prefix)
    preseq_log = '{0}.preseq.log'.format(prefix)

    # First sort because this file no longer exists...
    sort_bam = 'samtools sort -o {0} -T {1} -@ 2 {2}'.format(
        _quoted(sorted_bam), _quoted(prefix), _quoted(bam_w_dups))
    sort_status = os.system(sort_bam)
    if sort_status != 0:
        logging.warning('samtools sort exited with status %s', sort_status)

    logging.info('Running preseq...')
    # stderr of preseq (-v output) is captured in the log file.
    preseq = ('preseq lc_extrap '
              '-P -B -o {0} {1} -seed 1 -v 2> {2}').format(
                  _quoted(preseq_data), _quoted(sorted_bam),
                  _quoted(preseq_log))
    logging.info(preseq)
    preseq_status = os.system(preseq)
    if preseq_status != 0:
        logging.warning('preseq exited with status %s', preseq_status)

    # Delete the temporary sorted BAM directly instead of spawning a
    # shell for 'rm'; a missing file is not fatal (sorting may have
    # failed above), matching the original best-effort cleanup.
    try:
        os.remove(sorted_bam)
    except OSError as err:
        logging.warning('Could not remove %s: %s', sorted_bam, err)
    return preseq_data, preseq_log
Esempio n. 3
0
def run_preseq(bam_w_dups, prefix, nth=1, mem_gb=None):
    '''
    Runs preseq. Look at preseq data output to get PBC/NRF.

    The input BAM is sorted via samtools_sort() and 'preseq lc_extrap'
    is run on the result; the value returned by samtools_sort() is used
    as the path of that temporary sorted BAM and is removed afterwards.

    Args:
        bam_w_dups: BAM file passed to samtools_sort().
        prefix: output path prefix for the '.preseq.dat' and
            '.preseq.log' files.
        nth: forwarded to samtools_sort() (presumably a thread count -
            confirm against that helper).
        mem_gb: forwarded to samtools_sort() (presumably a memory limit
            in GB - confirm against that helper).

    Returns:
        Tuple (preseq_data, preseq_log) with the output paths.
    '''
    # First sort because this file no longer exists...
    sort_bam = samtools_sort(bam_w_dups, nth, mem_gb)

    logging.info('Running preseq...')
    preseq_data = '{0}.preseq.dat'.format(prefix)
    preseq_log = '{0}.preseq.log'.format(prefix)

    try:
        # stderr of preseq (-v output) is captured in the log file.
        run_shell_cmd('preseq lc_extrap -P -B -o {preseq_data} {sort_bam} '
                      '-seed 1 -v 2> {preseq_log}'.format(
                          preseq_data=preseq_data,
                          sort_bam=sort_bam,
                          preseq_log=preseq_log,
                      ))
    finally:
        # Always drop the temporary sorted BAM, even if the preseq call
        # raises, so failed runs do not leave large files behind.
        rm_f(sort_bam)

    return preseq_data, preseq_log
def make_tss_plot(bam_file,
                  tss,
                  prefix,
                  chromsizes,
                  read_len,
                  bins=400,
                  bp_edge=2000,
                  processes=8,
                  greenleaf_norm=True):
    '''
    Take bootstraps, generate tss plots, and get a mean and
    standard deviation on the plot. Produces 2 plots. One is the
    aggregation plot alone, while the other also shows the signal
    at each TSS ordered by strength.

    Args:
        bam_file: BAM file loaded through metaseq.genomic_signal.
        tss: TSS annotation loaded through pybedtools.BedTool.
        prefix: output path prefix for the two PNG plots and the QC file.
        chromsizes: genome/chromosome sizes passed to BedTool.slop as g=.
        read_len: read length; reads are shifted by -read_len / 2.
        bins: number of bins across each extended TSS window.
        bp_edge: number of bp the TSS is extended on both sides.
        processes: number of processes passed to metaseq's array().
        greenleaf_norm: if True, normalize by the average signal of the
            edge bins (fold enrichment); otherwise normalize by the
            number of mapped reads in millions.

    Returns:
        Tuple (tss_plot_file, tss_plot_large_file, tss_log_file). The
        log file holds the maximum of the mean normalized profile.
    '''
    logging.info('Generating tss plot...')
    tss_plot_file = '{0}.tss_enrich.png'.format(prefix)
    tss_plot_large_file = '{0}.large_tss_enrich.png'.format(prefix)
    tss_log_file = '{0}.tss_enrich.qc'.format(prefix)

    # Load the TSS file
    tss = pybedtools.BedTool(tss)
    tss_ext = tss.slop(b=bp_edge, g=chromsizes)

    # Load the bam file
    # Need to shift reads and just get ends, just load bed file?
    bam = metaseq.genomic_signal(bam_file, 'bam')
    # Shift to center the read on the cut site
    # NOTE(review): under Python 3 '-read_len / 2' is a float - confirm
    # that metaseq accepts a non-integer shift_width.
    bam_array = bam.array(tss_ext,
                          bins=bins,
                          shift_width=-read_len / 2,
                          processes=processes,
                          stranded=True)
    # Normalization (Greenleaf style): Find the avg height
    # at the end bins and take fold change over that
    if greenleaf_norm:
        # Use enough bins to cover 100 bp on either end. Keep at least
        # one bin: with bins wider than 100 bp the count would be 0,
        # which made the average below divide by zero.
        num_edge_bins = max(1, int(100 / (2 * bp_edge / bins)))
        bin_means = bam_array.mean(axis=0)
        avg_noise = (sum(bin_means[:num_edge_bins]) +
                     sum(bin_means[-num_edge_bins:])) / (2 * num_edge_bins)
        bam_array /= avg_noise
    else:
        bam_array /= bam.mapped_read_count() / 1e6

    # Mean profile across all TSS windows (after normalization); used
    # for both the line plot and the QC value.
    mean_signal = bam_array.mean(axis=0)

    # Generate a line plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.linspace(-bp_edge, bp_edge, bins)

    ax.plot(x, mean_signal, color='r', label='Mean')
    ax.axvline(0, linestyle=':', color='k')

    # Note the middle high point (TSS)
    tss_point_val = max(mean_signal)

    # write tss_point_val to file
    with open(tss_log_file, 'w') as fp:
        fp.write(str(tss_point_val))

    ax.set_xlabel('Distance from TSS (bp)')
    if greenleaf_norm:
        ax.set_ylabel('TSS Enrichment')
    else:
        ax.set_ylabel('Average read coverage (per million mapped reads)')
    ax.legend(loc='best')

    fig.savefig(tss_plot_file)
    # Release the figure; pyplot otherwise keeps it alive for the whole
    # process, which accumulates when many samples are plotted.
    plt.close(fig)

    # Print a more complicated plot with lots of info

    # Find a safe upper percentile - we can't use X if the Xth percentile is 0
    # (np.percentile replaces matplotlib.mlab.prctile, which is no
    # longer available in current matplotlib releases.)
    upper_prct = 99
    if np.percentile(bam_array.ravel(), upper_prct) == 0.0:
        upper_prct = 100.0

    # NOTE: this changes the font size globally for later plots too.
    plt.rcParams['font.size'] = 8
    fig = metaseq.plotutils.imshow(bam_array,
                                   x=x,
                                   figsize=(5, 10),
                                   vmin=5,
                                   vmax=upper_prct,
                                   percentile=True,
                                   line_kwargs=dict(color='k', label='All'),
                                   fill_kwargs=dict(color='k', alpha=0.3),
                                   sort_by=bam_array.mean(axis=1))

    # And save the file
    fig.savefig(tss_plot_large_file)
    plt.close(fig)

    return tss_plot_file, tss_plot_large_file, tss_log_file