Ejemplo n.º 1
0
def interval(locus_table, interval_table, intervals, loci, boxplot = True):
    """Assemble a ggplot of 'pi' against 'interval' from database results.

    The query text built by get_interval_query() is embedded in an R
    dbGetQuery(con, ...) call and the result is bound to the R variable
    `data`; the plot is then built from that R-side data frame.

    Args:
        locus_table, interval_table, intervals, loci: forwarded to
            get_interval_query().
        boxplot: if true, draw one boxplot per interval overlaid with
            jittered points coloured by locus; otherwise draw bars filled
            by locus, one facet per locus, with rotated x labels and the
            legend hidden.

    Returns:
        The composed ggplot object; nothing is drawn or saved here.
    """
    query = get_interval_query(intervals, loci, locus_table, interval_table)
    result = robjects.r('''data <- dbGetQuery(con, {})'''.format(query))

    # `interval` is used as a factor, so its level order has to be set
    # explicitly on the R side.  order_intervals() supplies the level
    # specification (per the original author's note, ordered by the first
    # integer value of each interval).
    levels = order_intervals(result[1])
    robjects.r('''data$interval <- factor(data$interval, {})'''.format(levels))

    plot = ggplot2.ggplot(robjects.r('''data'''))
    if boxplot:
        plot = plot + ggplot2.aes_string(x='interval', y='pi')
        plot = plot + ggplot2.geom_boxplot(**{'outlier.size': 0, 'alpha': 0.3})
        plot = plot + ggplot2.geom_jitter(
            ggplot2.aes_string(color='locus'),
            size=3,
            alpha=0.6,
            position=ggplot2.position_jitter(width=0.25),
        )
    else:
        plot = plot + ggplot2.aes_string(x='interval', y='pi', fill='locus')
        plot = plot + ggplot2.geom_bar()
        plot = plot + ggplot2.facet_wrap(robjects.Formula('~ locus'))
        plot = plot + ggplot2.opts(**{
            'axis.text.x': ggplot2.theme_text(angle=-90, hjust=0),
            'legend.position': 'none',
        })

    # Both variants share the same axis titles.
    plot = plot + ggplot2.scale_y_continuous('phylogenetic informativeness')
    plot = plot + ggplot2.scale_x_discrete('interval (years ago)')
    return plot
Ejemplo n.º 2
0
def compare_sum_barplot(locus_table, interval_table, intervals, loci, names,
        rows):
    """Build a dodged bar plot of summed 'pi' per interval and database.

    The R-side `data` frame is aggregated with
    aggregate(pi ~ interval + db, data = data, sum) into `agg_data`, and
    the bars are filled by factor(db).

    Args:
        locus_table, interval_table, intervals, names, rows: forwarded to
            get_r_data_by_top().
        intervals: also controls re-levelling -- the `interval` factor is
            only re-ordered when more than one interval is given.
        loci: accepted but not used by this function.

    Returns:
        The composed ggplot object; nothing is drawn or saved here.
    """
    # The return value is not needed here.  The aggregate() call below reads
    # the R variable `data` -- presumably populated by this helper (confirm
    # against get_r_data_by_top).
    get_r_data_by_top(locus_table, interval_table, intervals, names, rows)

    summed = robjects.r('''agg_data <- aggregate(pi ~ interval + db, data = data, sum)''')
    if len(intervals) > 1:
        levels = order_intervals(summed[0])
        robjects.r('''agg_data$interval <- factor(agg_data$interval,{})'''.format(levels))

    plot = ggplot2.ggplot(robjects.r('''agg_data'''))
    plot = plot + ggplot2.aes_string(x='interval', y='pi', fill='factor(db)')
    plot = plot + ggplot2.geom_bar(**{
        'position': 'dodge',
        'colour': '#767676',
        'alpha': 0.6,
    })
    plot = plot + ggplot2.scale_y_continuous('net phylogenetic informativeness')
    plot = plot + ggplot2.scale_x_discrete('interval (years ago)')
    plot = plot + ggplot2.scale_fill_brewer("database", palette="Blues")
    return plot
Ejemplo n.º 3
0
def main():
    """Compare two mutation profiles and save the result as a PDF heatmap.

    Command line: [options] <mut1 file> <mut2 file>.  Both files are parsed
    with parse_mutations(), combined by compute_relative_profile(), printed
    with print_table(), and drawn as a tile plot over the symbols
    '_', 'A', 'C', 'G', 'T' (x axis titled 'Read', y axis 'Reference').

    Options:
        -m  stored as options.mut_norm; not consulted in this function.
        -o  output PDF path (default 'mut_cmp.pdf').
        -r  forwarded to parse_mutations() as its second argument.

    Exits through parser.error() unless exactly two positional arguments
    are supplied.
    """
    usage = 'usage: %prog [options] <mut1 file> <mut2 file>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='mut_norm', action='store_true', default=False, help='Normalize by # mutations (as opposed to sequenced bp) [Default: %default]')
    parser.add_option('-o', dest='output_pdf', default='mut_cmp.pdf', help='Output pdf file for heatmap [Default: %default]')
    parser.add_option('-r', dest='raw', action='store_true', default=False, help='Use raw mutation counts (as opposed to normalized for ACGT content) [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the expanded usage line itself before
        # exiting; passing the raw template would echo a literal '%prog'.
        parser.error('Must provide two mutation files')
    mut1_file, mut2_file = args

    mutation_profile1, seq_bp1 = parse_mutations(mut1_file, options.raw)
    mutation_profile2, seq_bp2 = parse_mutations(mut2_file, options.raw)

    relative_mutation_profile = compute_relative_profile(mutation_profile1, seq_bp1, mutation_profile2, seq_bp2)

    print_table(relative_mutation_profile)

    # make plotting data structures: one row per ordered symbol pair
    nts = ['_', 'A', 'C', 'G', 'T']
    pairs = [(nt1, nt2) for nt1 in nts for nt2 in nts]

    df = ro.DataFrame({
        'nt1': ro.StrVector([nt1 for nt1, _ in pairs]),
        'nt2': ro.StrVector([nt2 for _, nt2 in pairs]),
        'rel': ro.FloatVector([relative_mutation_profile[pair] for pair in pairs]),
    })

    # plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='nt2', y='nt1', fill='rel') + \
        ggplot2.geom_tile() + \
        ggplot2.scale_x_discrete('Read') + \
        ggplot2.scale_y_discrete('Reference') + \
        ggplot2.scale_fill_gradient2('log2 enrichment', low='darkblue', mid='white', high='darkred')

    # save to file; always close the PDF device, even if rendering fails
    grdevices.pdf(file=options.output_pdf)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
Ejemplo n.º 4
0
def plot_coef(feat_mat_dir,
              model_dir,
              expt_names,
              pref,
              outfile=None,
              height=120,
              fsize=12):
    """Plot per-feature classification and regression coefficients.

    For every experiment name `ex`, the model is read from
    `<model_dir>/<pref><ex>_model.pkl` and the feature names from
    `<feat_mat_dir>/<ex>_feat_mat.npz`.  Coefficients of all experiments
    are joined along axis 1 and shown as one boxplot per feature, faceted
    into 'Classification' and 'Regression' panels.

    Args:
        feat_mat_dir: directory holding the `*_feat_mat.npz` files.
        model_dir: directory holding the `*_model.pkl` files.
        expt_names: experiment names; must not be empty.
        pref: prefix put in front of each model file name.
        outfile: target passed to ggsave; when None the plot is drawn on
            the current device with gp.plot() instead.
        height: plot height passed to ggsave (in mm).
        fsize: base font size of the axis text; strip text uses fsize + 1.

    Returns:
        The wide pandas DataFrame with the columns 'feature',
        'Classification' and 'Regression'.

    Raises:
        ValueError: if `expt_names` is empty, or if an experiment's feature
            names differ from those of the first experiment.
    """
    expt_names = list(expt_names)
    if not expt_names:
        # Previously this fell through to an UnboundLocalError below.
        raise ValueError('expt_names must contain at least one experiment')

    feat_names = None
    clf_coef = None
    reg_coef = None
    for ex in expt_names:
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        # Only the feature names are needed from the feature-matrix file.
        _, _, tmp_feat_names, _ = read_feat_mat(feat_mat_file)

        if feat_names is None:
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
            continue

        # Explicit check instead of assert: it survives `python -O` and
        # also catches differing lengths, which zip() alone would hide.
        same_names = (len(feat_names) == len(tmp_feat_names) and
                      all(a == b for a, b in zip(feat_names, tmp_feat_names)))
        if not same_names:
            raise ValueError(
                'feature names of experiment %r differ from those of %r'
                % (ex, expt_names[0]))
        clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis=1)
        reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis=1)

    nexpt = len(expt_names)

    # Assumes each model returns its coefficients as a (features x 1)
    # column, so that after concatenation there is one row per coefficient
    # and one column per experiment -- TODO confirm against the model
    # class.  The reshape reads the data row-first, which lines up with
    # repeating every feature name nexpt times.
    df = pd.DataFrame({
        'feature': np.repeat(feat_names, nexpt),
        'Classification': np.reshape(clf_coef, (clf_coef.size, )),
        'Regression': np.reshape(reg_coef, (reg_coef.size, ))
    })

    df2 = pd.melt(df, id_vars='feature', var_name='fun')
    r_df = com.convert_to_r_dataframe(df2)
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(feature)', y = 'value') + \
        ggplot2.facet_wrap('fun', scales = 'free_y') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('Importance') + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + \
        ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
                         'axis.text.y':ggplot2.element_text(size = fsize),
                         'strip.text.x':ggplot2.element_text(size = fsize + 1)})
    # Width grows with the number of experiments but never drops below 80.
    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        ro.r.ggsave(filename=outfile,
                    plot=gp,
                    width=w,
                    height=height,
                    unit='mm')
    return df
Ejemplo n.º 5
0
def plot_cv_r2(pandas_df, outfile, fsize = 10, height = 120, max_width = 50, xlab = 'Parameters'):
    """Save a boxplot of the 'r2' column grouped by the 'title' column.

    Args:
        pandas_df: table with at least the columns 'title' and 'r2'.
        outfile: file name handed to ggsave.
        fsize: font size of the axis text.
        height: plot height in mm.
        max_width: despite its name this is the lower bound of the width;
            the width used is max(5 * number of distinct titles, max_width).
        xlab: title of the x axis.
    """
    n_titles = len(set(pandas_df['title']))
    r_frame = com.convert_to_r_dataframe(pandas_df)

    gp = ggplot2.ggplot(r_frame)
    gp = gp + ggplot2.aes_string(x='factor(title)', y='r2')
    gp = gp + ggplot2.geom_boxplot()
    gp = gp + ggplot2.scale_y_continuous('R-squared')
    gp = gp + ggplot2.scale_x_discrete(xlab)
    gp = gp + ggplot2.theme_bw()
    gp = gp + ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size=fsize, angle=65, vjust=1, hjust=1),
        'axis.text.y': ggplot2.element_text(size=fsize),
    })

    plot_width = max(5 * n_titles, max_width)
    ro.r.ggsave(filename=outfile, plot=gp, width=plot_width, height=height, unit='mm')
Ejemplo n.º 6
0
def plot_cv_r2(pandas_df,
               outfile,
               fsize=10,
               height=120,
               max_width=50,
               xlab='Parameters'):
    """Save a boxplot of the 'r2' column grouped by the 'title' column.

    Args:
        pandas_df: table with at least the columns 'title' and 'r2'.
        outfile: file name handed to ggsave.
        fsize: font size of the axis text.
        height: plot height in mm.
        max_width: despite its name this is the lower bound of the width;
            the width used is max(5 * number of distinct titles, max_width).
        xlab: title of the x axis.
    """
    n_titles = len(set(pandas_df['title']))
    r_frame = com.convert_to_r_dataframe(pandas_df)

    gp = ggplot2.ggplot(r_frame)
    gp = gp + ggplot2.aes_string(x='factor(title)', y='r2')
    gp = gp + ggplot2.geom_boxplot()
    gp = gp + ggplot2.scale_y_continuous('R-squared')
    gp = gp + ggplot2.scale_x_discrete(xlab)
    gp = gp + ggplot2.theme_bw()
    gp = gp + ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size=fsize, angle=65, vjust=1, hjust=1),
        'axis.text.y': ggplot2.element_text(size=fsize),
    })

    plot_width = max(5 * n_titles, max_width)
    ro.r.ggsave(filename=outfile, plot=gp, width=plot_width, height=height, unit='mm')
Ejemplo n.º 7
0
def plot_thresh_distr(motif_names, thresh, out_dir, width = 350):
    """Write the thresholds above 1 to disk and save a boxplot per motif.

    Rows with a threshold of 1 or less are dropped.  The remaining table
    is written tab-separated to `<out_dir>/count_thresh.txt`, and a
    coordinate-flipped boxplot of it (y scale limited to 0-70) is saved as
    `count_thresh_bar.pdf` and `count_thresh_bar.png` in the same
    directory.

    Args:
        motif_names: values for the 'motif' column.
        thresh: values for the 'thresh' column, aligned with motif_names.
        out_dir: directory receiving the three output files.
        width: plot width in mm; the height is fixed at 300 mm.
    """
    table = pd.DataFrame({'motif': motif_names, 'thresh': thresh})
    keep = table['thresh'] > 1
    table = table[keep]

    txt_path = os.path.join(out_dir, 'count_thresh.txt')
    table.to_csv(txt_path, sep='\t', index=False)

    text_size = 10
    r_table = com.convert_to_r_dataframe(table)
    gp = ggplot2.ggplot(r_table)
    gp = gp + ggplot2.aes_string(x='factor(motif)', y='thresh')
    gp = gp + ggplot2.geom_boxplot()
    gp = gp + ggplot2.scale_y_continuous('Threshold counts',
                                         limits=ro.IntVector([0, 70]))
    gp = gp + ggplot2.scale_x_discrete('')
    gp = gp + ggplot2.theme_bw()
    gp = gp + ggplot2.coord_flip()
    gp = gp + ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size=text_size),
        'axis.text.y': ggplot2.element_text(size=text_size, hjust=1),
        'strip.text.x': ggplot2.element_text(size=text_size + 1),
    })

    # Same figure in both formats.
    for suffix in ('.pdf', '.png'):
        target = os.path.join(out_dir, 'count_thresh_bar' + suffix)
        ro.r.ggsave(filename=target, plot=gp, width=width, height=300, unit='mm')
Ejemplo n.º 8
0
def compare_mean_boxplot(locus_table, interval_table, intervals, loci, names, rows):
    """Build boxplots of 'pi' per interval, filled by database.

    The plot is built from the R variable `data`; boxes are filled by
    factor(db).

    Args:
        locus_table, interval_table, intervals, names, rows: forwarded to
            get_r_data_by_top().
        intervals: also controls re-levelling -- the `interval` factor is
            only re-ordered when more than one interval is given.
        loci: accepted but not used by this function.

    Returns:
        The composed ggplot object; nothing is drawn or saved here.
    """
    result = get_r_data_by_top(locus_table, interval_table, intervals, names,
            rows)
    if len(intervals) > 1:
        # Fix the level order of the `interval` factor on the R side.
        levels = order_intervals(result[1])
        robjects.r('''data$interval <- factor(data$interval, {})'''.format(levels))

    plot = ggplot2.ggplot(robjects.r('''data'''))
    plot = plot + ggplot2.aes_string(x='interval', y='pi')
    box_options = {
        'outlier.size': 3,
        'outlier.colour': '#767676',
        'outlier.alpha': 0.3,
        'alpha': 0.6,
    }
    plot = plot + ggplot2.geom_boxplot(ggplot2.aes_string(fill='factor(db)'),
                                       **box_options)
    plot = plot + ggplot2.scale_y_continuous('mean phylogenetic informativeness')
    plot = plot + ggplot2.scale_x_discrete('interval (years ago)')
    plot = plot + ggplot2.scale_fill_brewer("database", palette='Blues')
    return plot
Ejemplo n.º 9
0
def plot_thresh_distr(motif_names, thresh, out_dir, width=350):
    """Write the thresholds above 1 to disk and save a boxplot per motif.

    Rows with a threshold of 1 or less are dropped.  The remaining table
    is written tab-separated to `<out_dir>/count_thresh.txt`, and a
    coordinate-flipped boxplot of it (y scale limited to 0-70) is saved as
    `count_thresh_bar.pdf` and `count_thresh_bar.png` in the same
    directory.

    Args:
        motif_names: values for the 'motif' column.
        thresh: values for the 'thresh' column, aligned with motif_names.
        out_dir: directory receiving the three output files.
        width: plot width in mm; the height is fixed at 300 mm.
    """
    table = pd.DataFrame({'motif': motif_names, 'thresh': thresh})
    keep = table['thresh'] > 1
    table = table[keep]

    txt_path = os.path.join(out_dir, 'count_thresh.txt')
    table.to_csv(txt_path, sep='\t', index=False)

    text_size = 10
    r_table = com.convert_to_r_dataframe(table)
    gp = ggplot2.ggplot(r_table)
    gp = gp + ggplot2.aes_string(x='factor(motif)', y='thresh')
    gp = gp + ggplot2.geom_boxplot()
    gp = gp + ggplot2.scale_y_continuous('Threshold counts',
                                         limits=ro.IntVector([0, 70]))
    gp = gp + ggplot2.scale_x_discrete('')
    gp = gp + ggplot2.theme_bw()
    gp = gp + ggplot2.coord_flip()
    gp = gp + ggplot2.theme(**{
        'axis.text.x': ggplot2.element_text(size=text_size),
        'axis.text.y': ggplot2.element_text(size=text_size, hjust=1),
        'strip.text.x': ggplot2.element_text(size=text_size + 1),
    })

    # Same figure in both formats.
    for suffix in ('.pdf', '.png'):
        target = os.path.join(out_dir, 'count_thresh_bar' + suffix)
        ro.r.ggsave(filename=target, plot=gp, width=width, height=300, unit='mm')
Ejemplo n.º 10
0
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile = None, height = 120, fsize = 12):
    """Plot per-feature classification and regression coefficients.

    For every experiment name `ex`, the model is read from
    `<model_dir>/<pref><ex>_model.pkl` and the feature names from
    `<feat_mat_dir>/<ex>_feat_mat.npz`.  Coefficients of all experiments
    are joined along axis 1 and shown as one boxplot per feature, faceted
    into 'Classification' and 'Regression' panels.

    Args:
        feat_mat_dir: directory holding the `*_feat_mat.npz` files.
        model_dir: directory holding the `*_model.pkl` files.
        expt_names: experiment names; must not be empty.
        pref: prefix put in front of each model file name.
        outfile: target passed to ggsave; when None the plot is drawn on
            the current device with gp.plot() instead.
        height: plot height passed to ggsave (in mm).
        fsize: base font size of the axis text; strip text uses fsize + 1.

    Returns:
        The wide pandas DataFrame with the columns 'feature',
        'Classification' and 'Regression'.

    Raises:
        ValueError: if `expt_names` is empty, or if an experiment's feature
            names differ from those of the first experiment.
    """
    expt_names = list(expt_names)
    if not expt_names:
        # Previously this fell through to an UnboundLocalError below.
        raise ValueError('expt_names must contain at least one experiment')

    feat_names = None
    clf_coef = None
    reg_coef = None
    for ex in expt_names:
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        # Only the feature names are needed from the feature-matrix file.
        _, _, tmp_feat_names, _ = read_feat_mat(feat_mat_file)

        if feat_names is None:
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
            continue

        # Explicit check instead of assert: it survives `python -O` and
        # also catches differing lengths, which zip() alone would hide.
        same_names = (len(feat_names) == len(tmp_feat_names) and
                      all(a == b for a, b in zip(feat_names, tmp_feat_names)))
        if not same_names:
            raise ValueError(
                'feature names of experiment %r differ from those of %r'
                % (ex, expt_names[0]))
        clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis = 1)
        reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis = 1)

    nexpt = len(expt_names)

    # Assumes each model returns its coefficients as a (features x 1)
    # column, so that after concatenation there is one row per coefficient
    # and one column per experiment -- TODO confirm against the model
    # class.  The reshape reads the data row-first, which lines up with
    # repeating every feature name nexpt times.
    df = pd.DataFrame({'feature':np.repeat(feat_names, nexpt),
                       'Classification':np.reshape(clf_coef, (clf_coef.size,)),
                       'Regression':np.reshape(reg_coef, (reg_coef.size,))})

    df2 = pd.melt(df, id_vars = 'feature', var_name = 'fun')
    r_df = com.convert_to_r_dataframe(df2)
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(feature)', y = 'value') + \
        ggplot2.facet_wrap('fun', scales = 'free_y') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('Importance') + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + \
        ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
                         'axis.text.y':ggplot2.element_text(size = fsize),
                         'strip.text.x':ggplot2.element_text(size = fsize + 1)})
    # Width grows with the number of experiments but never drops below 80.
    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        ro.r.ggsave(filename = outfile, plot = gp, width = w, height = height, unit = 'mm')
    return df
Ejemplo n.º 11
0
def main():
    """Compare two mutation profiles and save the result as a PDF heatmap.

    Command line: [options] <mut1 file> <mut2 file>.  Both files are parsed
    with parse_mutations(), combined by compute_relative_profile(), printed
    with print_table(), and drawn as a tile plot over the symbols
    '_', 'A', 'C', 'G', 'T' (x axis titled 'Read', y axis 'Reference').

    Options:
        -m  stored as options.mut_norm; not consulted in this function.
        -o  output PDF path (default 'mut_cmp.pdf').
        -r  forwarded to parse_mutations() as its second argument.

    Exits through parser.error() unless exactly two positional arguments
    are supplied.
    """
    usage = 'usage: %prog [options] <mut1 file> <mut2 file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-m',
        dest='mut_norm',
        action='store_true',
        default=False,
        help=
        'Normalize by # mutations (as opposed to sequenced bp) [Default: %default]'
    )
    parser.add_option('-o',
                      dest='output_pdf',
                      default='mut_cmp.pdf',
                      help='Output pdf file for heatmap [Default: %default]')
    parser.add_option(
        '-r',
        dest='raw',
        action='store_true',
        default=False,
        help=
        'Use raw mutation counts (as opposed to normalized for ACGT content) [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the expanded usage line itself before
        # exiting; passing the raw template would echo a literal '%prog'.
        parser.error('Must provide two mutation files')
    mut1_file, mut2_file = args

    mutation_profile1, seq_bp1 = parse_mutations(mut1_file, options.raw)
    mutation_profile2, seq_bp2 = parse_mutations(mut2_file, options.raw)

    relative_mutation_profile = compute_relative_profile(
        mutation_profile1, seq_bp1, mutation_profile2, seq_bp2)

    print_table(relative_mutation_profile)

    # make plotting data structures: one row per ordered symbol pair
    nts = ['_', 'A', 'C', 'G', 'T']
    pairs = [(nt1, nt2) for nt1 in nts for nt2 in nts]

    df = ro.DataFrame({
        'nt1': ro.StrVector([nt1 for nt1, _ in pairs]),
        'nt2': ro.StrVector([nt2 for _, nt2 in pairs]),
        'rel': ro.FloatVector([relative_mutation_profile[pair] for pair in pairs]),
    })

    # plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='nt2', y='nt1', fill='rel') + \
        ggplot2.geom_tile() + \
        ggplot2.scale_x_discrete('Read') + \
        ggplot2.scale_y_discrete('Reference') + \
        ggplot2.scale_fill_gradient2('log2 enrichment', low='darkblue', mid='white', high='darkred')

    # save to file; always close the PDF device, even if rendering fails
    grdevices.pdf(file=options.output_pdf)
    try:
        gp.plot()
    finally:
        grdevices.dev_off()