Ejemplo n.º 1
0
def make_output_and(cov, control_cov, out_prefix, window):
    """Dump main/control coverage around a splice site and plot it to a PDF.

    Writes ``<out_prefix>_raw.txt`` with one tab-separated line per position
    (position, main coverage, control coverage) and ``<out_prefix>.pdf`` with
    a point plot of both series coloured by label.

    Args:
        cov: Main coverage values; must support indexing, ``len()`` and
            ``+`` concatenation with ``control_cov`` (e.g. a list).
            Element 0 is paired with the left-most position.
        control_cov: Control coverage values, indexed like ``cov``.
        out_prefix: Path prefix for both output files.
        window: Integer window size; positions run from ``-window // 2``
            through ``window // 2`` inclusive.

    Raises:
        IndexError: If either sequence is shorter than the position range.
    """
    # Floor division keeps the bounds integral on both Python 2 and 3.
    first = -window // 2
    positions = list(range(first, window // 2 + 1))

    # dump raw counts to file
    # Index relative to the first position: for an odd window the old
    # ``window/2 + i`` started at -1 and wrapped around to the last element,
    # which disagreed with the in-order pairing used for the plot below.
    with open('%s_raw.txt' % out_prefix, 'w') as raw_out:
        for pos in positions:
            raw_out.write('%d\t%e\t%e\n' %
                          (pos, cov[pos - first], control_cov[pos - first]))

    # make plot data structures (positions repeated once per series)
    splice_i = ro.IntVector(positions * 2)
    cov_r = ro.FloatVector(cov + control_cov)
    labels = ro.StrVector(['Main'] * len(cov) + ['Control'] * len(control_cov))
    df = ro.DataFrame({'splice_i': splice_i, 'cov': cov_r, 'label': labels})

    # construct plot
    gp = (ggplot2.ggplot(df) +
          ggplot2.aes_string(x='splice_i', y='cov', colour='label') +
          ggplot2.geom_point() +
          ggplot2.scale_x_continuous('Position relative to splice site') +
          ggplot2.scale_y_continuous('Coverage') +
          ggplot2.scale_colour_discrete(''))

    # plot to file; always close the device, even if plotting fails
    grdevices.pdf(file='%s.pdf' % out_prefix)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
Ejemplo n.º 2
0
def make_output_and(cov, control_cov, out_prefix, window):
    """Write splice-site coverage to a raw text file and a PDF point plot.

    Produces ``<out_prefix>_raw.txt`` (tab-separated: position, main
    coverage, control coverage) and ``<out_prefix>.pdf`` showing both series
    coloured by label.

    Args:
        cov: Main coverage values; must support indexing, ``len()`` and
            ``+`` concatenation with ``control_cov`` (e.g. a list).
            Element 0 is paired with the left-most position.
        control_cov: Control coverage values, indexed like ``cov``.
        out_prefix: Path prefix for both output files.
        window: Integer window size; positions run from ``-window // 2``
            through ``window // 2`` inclusive.

    Raises:
        IndexError: If either sequence is shorter than the position range.
    """
    # Floor division gives integral bounds on both Python 2 and 3.
    start = -window // 2
    positions = list(range(start, window // 2 + 1))

    # dump raw counts to file
    # Offsets are taken from ``start`` so an odd window can no longer yield
    # index -1 (which silently read the last element); this matches the
    # in-order pairing of positions and coverage used for the plot.
    with open("%s_raw.txt" % out_prefix, "w") as raw_out:
        for pos in positions:
            offset = pos - start
            raw_out.write("%d\t%e\t%e\n" % (pos, cov[offset], control_cov[offset]))

    # make plot data structures (positions repeated once per series)
    splice_i = ro.IntVector(positions * 2)
    cov_r = ro.FloatVector(cov + control_cov)
    labels = ro.StrVector(["Main"] * len(cov) + ["Control"] * len(control_cov))
    df = ro.DataFrame({"splice_i": splice_i, "cov": cov_r, "label": labels})

    # construct plot
    gp = (
        ggplot2.ggplot(df)
        + ggplot2.aes_string(x="splice_i", y="cov", colour="label")
        + ggplot2.geom_point()
        + ggplot2.scale_x_continuous("Position relative to splice site")
        + ggplot2.scale_y_continuous("Coverage")
        + ggplot2.scale_colour_discrete("")
    )

    # plot to file; the device is closed even when plotting raises
    grdevices.pdf(file="%s.pdf" % out_prefix)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
Ejemplo n.º 3
0
def make_output_and(te_tss_cov, control_te_tss_cov, out_prefix, upstream,
                    downstream):
    """Write per-key raw coverage files and PDF plots for whitelisted keys.

    Recreates the directories ``<out_prefix>_raw`` and ``<out_prefix>_plot``
    (any existing content is deleted) and, for every key of ``te_tss_cov``
    whose first two elements are both whitelisted, writes
    ``<out_prefix>_raw/<k0>_<k1>.txt`` and ``<out_prefix>_plot/<k0>_<k1>.pdf``
    with ``/`` in the key parts replaced by ``_``.

    Args:
        te_tss_cov: Mapping from key (indexable; ``key[0]`` and ``key[1]``
            are strings) to a list of main coverage values.
        control_te_tss_cov: Mapping with the same keys holding the control
            coverage lists.
        out_prefix: Path prefix for the two output directories.
        upstream: Number of positions before position 0.
        downstream: Number of positions after position 0.

    Raises:
        KeyError: If a selected key is missing from ``control_te_tss_cov``.
        IndexError: If a coverage list is shorter than
            ``upstream + downstream + 1``.
    """
    # Whitelists for key[0] and key[1]; tuples keep the original
    # equality-based membership test and are built only once.
    keep_first = ('n', '*', 'HERVH-int', 'L2a', 'AluSx', 'AluJb', 'MIRb',
                  'LTR7')
    keep_second = ('n', '*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1',
                   'LTR/ERVL-MaLR', 'LINE/L2', 'LTR/ERVL', 'SINE/MIR',
                   'DNA/hAT-Charlie', 'LTR/ERVK', 'DNA/TcMar-Tigger')

    # Select the keys once, together with the file stem shared by the raw
    # and plot outputs.
    selected = [
        (te, '%s_%s' % (te[0].replace('/', '_'), te[1].replace('/', '_')))
        for te in te_tss_cov
        if te[0] in keep_first and te[1] in keep_second
    ]

    positions = list(range(-upstream, downstream + 1))

    # clean raw counts dir
    raw_dir = '%s_raw' % out_prefix
    if os.path.isdir(raw_dir):
        shutil.rmtree(raw_dir)
    os.mkdir(raw_dir)

    # dump raw counts to file
    for te, stem in selected:
        main_cov = te_tss_cov[te]
        ctrl_cov = control_te_tss_cov[te]
        with open('%s/%s.txt' % (raw_dir, stem), 'w') as raw_out:
            for i in positions:
                raw_out.write('%d\t%e\t%e\n' %
                              (i, main_cov[upstream + i],
                               ctrl_cov[upstream + i]))

    # clean plot dirs
    plot_dir = '%s_plot' % out_prefix
    if os.path.isdir(plot_dir):
        shutil.rmtree(plot_dir)
    os.mkdir(plot_dir)

    # make data structures shared by every plot
    tss_i = ro.IntVector(positions * 2)
    labels = ro.StrVector(['Main'] * len(positions) +
                          ['Control'] * len(positions))

    for te, stem in selected:
        cov = ro.FloatVector(te_tss_cov[te] + control_te_tss_cov[te])
        df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

        # construct full plot
        gp = (ggplot2.ggplot(df) +
              ggplot2.aes_string(x='tss_i', y='cov', colour='label') +
              ggplot2.geom_point() +
              ggplot2.scale_x_continuous('TSS index') +
              ggplot2.scale_y_continuous('Coverage') +
              ggplot2.scale_colour_discrete(''))

        # plot to file; always close the device, even if plotting fails
        grdevices.pdf(file='%s/%s.pdf' % (plot_dir, stem))
        try:
            gp.plot()
        finally:
            grdevices.dev_off()
Ejemplo n.º 4
0
def main():
    """Plot smoothed main/control coverage from a raw coverage file.

    Command line: ``[options] <raw file>``. Each non-blank line of the raw
    file is split on whitespace; column 2 is read as the main coverage and
    column 3 as the control coverage. The first column is not used: the
    x positions are regenerated from ``-u`` / ``-d``. The plot is written to
    ``<out_prefix>_and.pdf``.

    Options:
        -d      TSS downstream extent (int, default 2000)
        -o      output prefix (default 'tss')
        -u      TSS upstream extent (int, default 5000)
        --ymax  upper y-axis limit (float, default: no limit)
    """
    usage = 'usage: %prog [options] <raw file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', default=2000, type='int', help='TSS downstream [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='tss', help='Output prefix [Default: %default]')
    parser.add_option('-u', dest='upstream', default=5000, type='int', help='TSS upstream [Default: %default]')
    parser.add_option('--ymax', dest='ymax', default=None, type='float', help='Y-coordinate limit [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() reports the message and exits the program
        parser.error('Must provide raw file')
    raw_file = args[0]

    # collect data
    main_cov = []
    control_cov = []
    with open(raw_file) as raw_in:
        for line in raw_in:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            main_cov.append(float(fields[1]))
            control_cov.append(float(fields[2]))

    # data structures
    # Repeat the positions once per series so all three columns have the
    # same length, in step with the 'Main' / 'Control' labels.
    positions = list(range(-options.upstream, options.downstream + 1))
    tss_i = ro.IntVector(positions * 2)
    labels = ro.StrVector(['Main'] * len(positions) + ['Control'] * len(positions))
    cov = ro.FloatVector(main_cov + control_cov)

    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

    # plot: loess-smoothed curve per label
    gp = (ggplot2.ggplot(df) +
          ggplot2.aes_string(x='tss_i', y='cov', colour='label') +
          ggplot2.geom_smooth(method='loess', size=1, span=0.2, se=False) +
          ggplot2.scale_x_continuous('TSS Position') +
          ggplot2.scale_colour_discrete('') +
          ggplot2.theme_bw())

    if options.ymax is None:
        gp += ggplot2.scale_y_continuous('Coverage')
    else:
        gp += ggplot2.scale_y_continuous('Coverage', limits=ro.FloatVector([0, options.ymax]))

    # save to file; always close the device, even if plotting fails
    grdevices.pdf(file='%s_and.pdf' % options.out_prefix)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
Ejemplo n.º 5
0
def make_output_and(tss_cov, control_tss_cov, out_prefix, upstream, downstream):
    """Dump main/control coverage around the TSS and draw two PDF plots.

    Writes ``<out_prefix>_raw.txt`` (tab-separated: position, main coverage,
    control coverage), ``<out_prefix>_full.pdf`` covering every position and
    ``<out_prefix>_zoom.pdf`` with the x axis limited to [-1000, 1000].

    Args:
        tss_cov: Main coverage values; must support indexing, ``len()`` and
            ``+`` concatenation with ``control_tss_cov`` (e.g. a list).
            Index ``upstream + i`` holds position ``i``.
        control_tss_cov: Control coverage values, indexed like ``tss_cov``.
        out_prefix: Path prefix for the three output files.
        upstream: Number of positions before position 0.
        downstream: Number of positions after position 0.

    Raises:
        IndexError: If either sequence is shorter than
            ``upstream + downstream + 1``.
    """
    positions = list(range(-upstream, downstream + 1))

    # dump raw counts to file
    with open('%s_raw.txt' % out_prefix, 'w') as raw_out:
        for i in positions:
            raw_out.write('%d\t%e\t%e\n' %
                          (i, tss_cov[upstream + i], control_tss_cov[upstream + i]))

    # make plot data structures (positions repeated once per series)
    tss_i = ro.IntVector(positions * 2)
    cov = ro.FloatVector(tss_cov + control_tss_cov)
    labels = ro.StrVector(['Main'] * len(tss_cov) + ['Control'] * len(control_tss_cov))
    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

    # The two plots differ only in their x scale and output suffix.
    variants = (
        ('full', ggplot2.scale_x_continuous('TSS index')),
        ('zoom', ggplot2.scale_x_continuous('TSS index', limits=ro.IntVector([-1000, 1000]))),
    )
    for suffix, x_scale in variants:
        gp = (ggplot2.ggplot(df) +
              ggplot2.aes_string(x='tss_i', y='cov', colour='label') +
              ggplot2.geom_point() +
              x_scale +
              ggplot2.scale_y_continuous('Coverage') +
              ggplot2.scale_colour_discrete(''))

        # plot to file; always close the device, even if plotting fails
        grdevices.pdf(file='%s_%s.pdf' % (out_prefix, suffix))
        try:
            gp.plot()
        finally:
            grdevices.dev_off()
Ejemplo n.º 6
0
def make_output_and(te_tss_cov, control_te_tss_cov, out_prefix, upstream, downstream):
    """Write raw coverage files and PDF plots for each whitelisted key.

    The directories ``<out_prefix>_raw`` and ``<out_prefix>_plot`` are
    deleted if present and recreated. For every key of ``te_tss_cov`` whose
    first two elements are both whitelisted, the function writes
    ``<out_prefix>_raw/<k0>_<k1>.txt`` and ``<out_prefix>_plot/<k0>_<k1>.pdf``
    (``/`` in the key parts is replaced by ``_``).

    Args:
        te_tss_cov: Mapping from key (indexable; ``key[0]`` and ``key[1]``
            are strings) to a list of main coverage values.
        control_te_tss_cov: Mapping with the same keys holding the control
            coverage lists.
        out_prefix: Path prefix for the two output directories.
        upstream: Number of positions before position 0.
        downstream: Number of positions after position 0.

    Raises:
        KeyError: If a selected key is missing from ``control_te_tss_cov``.
        IndexError: If a coverage list is shorter than
            ``upstream + downstream + 1``.
    """
    # Whitelists for key[0] and key[1], built once instead of per iteration.
    # Tuples preserve the original equality-based membership test.
    first_ok = ('n', '*', 'HERVH-int', 'L2a', 'AluSx', 'AluJb', 'MIRb', 'LTR7')
    second_ok = ('n', '*', 'LINE/L1', 'SINE/Alu', 'LTR/ERV1', 'LTR/ERVL-MaLR', 'LINE/L2', 'LTR/ERVL', 'SINE/MIR', 'DNA/hAT-Charlie', 'LTR/ERVK', 'DNA/TcMar-Tigger')

    def file_stem(te):
        # File-system friendly name shared by the raw and plot outputs.
        return '%s_%s' % (te[0].replace('/', '_'), te[1].replace('/', '_'))

    chosen = [te for te in te_tss_cov if te[0] in first_ok and te[1] in second_ok]
    positions = list(range(-upstream, downstream + 1))

    # clean raw counts dir
    raw_dir = '%s_raw' % out_prefix
    if os.path.isdir(raw_dir):
        shutil.rmtree(raw_dir)
    os.mkdir(raw_dir)

    # dump raw counts to file
    for te in chosen:
        with open('%s/%s.txt' % (raw_dir, file_stem(te)), 'w') as raw_out:
            for i in positions:
                raw_out.write('%d\t%e\t%e\n' % (i, te_tss_cov[te][upstream + i], control_te_tss_cov[te][upstream + i]))

    # clean plot dirs
    plot_dir = '%s_plot' % out_prefix
    if os.path.isdir(plot_dir):
        shutil.rmtree(plot_dir)
    os.mkdir(plot_dir)

    # make data structures shared by every plot
    tss_i = ro.IntVector(positions * 2)
    labels = ro.StrVector(['Main'] * len(positions) + ['Control'] * len(positions))

    for te in chosen:
        cov = ro.FloatVector(te_tss_cov[te] + control_te_tss_cov[te])
        df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

        # construct full plot
        gp = (ggplot2.ggplot(df) +
              ggplot2.aes_string(x='tss_i', y='cov', colour='label') +
              ggplot2.geom_point() +
              ggplot2.scale_x_continuous('TSS index') +
              ggplot2.scale_y_continuous('Coverage') +
              ggplot2.scale_colour_discrete(''))

        # plot to file; always close the device, even if plotting fails
        grdevices.pdf(file='%s/%s.pdf' % (plot_dir, file_stem(te)))
        try:
            gp.plot()
        finally:
            grdevices.dev_off()
Ejemplo n.º 7
0
def main():
    """Plot smoothed main/control coverage read from a raw coverage file.

    Command line: ``[options] <raw file>``. Every non-blank line of the raw
    file is split on whitespace; the second field is the main coverage and
    the third the control coverage. The first field is ignored because the
    x positions are rebuilt from ``-u`` / ``-d``. Output goes to
    ``<out_prefix>_and.pdf``.

    Options:
        -d      TSS downstream extent (int, default 2000)
        -o      output prefix (default 'tss')
        -u      TSS upstream extent (int, default 5000)
        --ymax  upper y-axis limit (float, default: no limit)
    """
    usage = 'usage: %prog [options] <raw file>'
    parser = OptionParser(usage)
    parser.add_option('-d',
                      dest='downstream',
                      default=2000,
                      type='int',
                      help='TSS downstream [Default: %default]')
    parser.add_option('-o',
                      dest='out_prefix',
                      default='tss',
                      help='Output prefix [Default: %default]')
    parser.add_option('-u',
                      dest='upstream',
                      default=5000,
                      type='int',
                      help='TSS upstream [Default: %default]')
    parser.add_option('--ymax',
                      dest='ymax',
                      default=None,
                      type='float',
                      help='Y-coordinate limit [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() reports the message and exits the program
        parser.error('Must provide raw file')
    raw_file = args[0]

    # collect data
    main_cov = []
    control_cov = []
    with open(raw_file) as raw_in:
        for line in raw_in:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            main_cov.append(float(fields[1]))
            control_cov.append(float(fields[2]))

    # data structures
    # Positions are repeated once per series so that all three columns have
    # the same length and line up with the 'Main' / 'Control' labels.
    positions = list(range(-options.upstream, options.downstream + 1))
    tss_i = ro.IntVector(positions * 2)
    labels = ro.StrVector(['Main'] * len(positions) +
                          ['Control'] * len(positions))
    cov = ro.FloatVector(main_cov + control_cov)

    df = ro.DataFrame({'tss_i': tss_i, 'cov': cov, 'label': labels})

    # plot: loess-smoothed curve per label
    gp = (ggplot2.ggplot(df) +
          ggplot2.aes_string(x='tss_i', y='cov', colour='label') +
          ggplot2.geom_smooth(method='loess', size=1, span=0.2, se=False) +
          ggplot2.scale_x_continuous('TSS Position') +
          ggplot2.scale_colour_discrete('') +
          ggplot2.theme_bw())

    if options.ymax is None:
        gp += ggplot2.scale_y_continuous('Coverage')
    else:
        gp += ggplot2.scale_y_continuous('Coverage',
                                         limits=ro.FloatVector(
                                             [0, options.ymax]))

    # save to file; always close the device, even if plotting fails
    grdevices.pdf(file='%s_and.pdf' % options.out_prefix)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
Ejemplo n.º 8
0
# Stack samplebs and samplebg into one frame.
samples = pd.concat((samplebs, samplebg))
# re-index to avoid duplicate row.names in Rdf
samples.index = npy.arange(len(samples))
samplesgrouped = samples.groupby(['model'])
# Variance of the 'Zweighted' column within each 'model' group.
variances = samplesgrouped['Zweighted'].aggregate(npy.var)
# Single-argument print(...) prints the same text as the Python 2 print
# statement and, unlike it, also parses on Python 3.
print(variances)
print(variances['BG'] / variances['BS'])
print(estimatesum(samples))
print(samplesgrouped['Zweighted'].aggregate(estimatesum))
print(trueZnsum)

# Density of 'Z' per model on a log10 x axis. The png()/dev_off() calls are
# left commented out, so the plot goes to the default graphics device.
# grdevices.png(file="sampled-Z.png", width=4, height=3, units="in", res=300)
rsamples = com.convert_to_r_dataframe(samples)
pp = ggplot2.ggplot(rsamples) + \
    ggplot2.aes_string(x='Z', color='factor(model)') + \
    ggplot2.scale_colour_discrete(name="model") + \
    ggplot2.geom_density() + \
    ggplot2.scale_x_log10()
# ggplot2.scale_x_continuous(limits=FloatVector((0, 1)))
pp.plot()
# grdevices.dev_off()


def makeestimate(sampler, numsamples, **kwargs):
    """Draw samples with ``sample`` and return ``estimatesum`` of 'Zweighted'.

    ``sampler``, ``numsamples`` and any extra keyword arguments are passed
    straight through to ``sample``; the 'Zweighted' entry of its result is
    then handed to ``estimatesum``.
    """
    drawn = sample(sampler, numsamples, **kwargs)
    weighted = drawn['Zweighted']
    return estimatesum(weighted)


def makeestimates(sampler, numsamples, numestimates, **kwargs):
    estimates = [
        makeestimate(sampler, numsamples, **kwargs)
Ejemplo n.º 9
0
    logging.debug('True sum: %s', trueZnsum)

# Collect the per-run results into one frame, one column per list.
emdf = pd.DataFrame(
    {
        'BSdists': distsbs,
        'BGdists': distsbg,
        'truesums': truesums,
        'varratios': varratios,
    }
)

# Plot sampled Z: density of 'Z' per model on a log10 x axis
logging.info('Plotting sampled Zn')
grdevices.png(file="sampled-Z.png", width=4, height=3, units="in", res=300)
rsamples = com.convert_to_r_dataframe(samples)
pp = (
    ggplot2.ggplot(rsamples)
    + ggplot2.aes_string(x='Z', color='factor(model)')
    + ggplot2.scale_colour_discrete(name="model")
    + ggplot2.geom_density()
    + ggplot2.scale_x_log10()
)
# ggplot2.scale_x_continuous(limits=FloatVector((0, 1)))
pp.plot()
grdevices.dev_off()

# Plot likelihood ratios: density of 'ir' for the rows whose model is 'BS'
logging.info('Plotting likelihood ratios from binding site samples')
grdevices.png(
    file="sampled-ratios.png", width=4, height=3, units="in", res=300
)
rsamplesbs = com.convert_to_r_dataframe(samples[samples['model'] == 'BS'])
pp = (
    ggplot2.ggplot(rsamplesbs)
    + ggplot2.aes_string(x='ir')
    + ggplot2.geom_density()
    + ggplot2.scale_x_log10()
)