def plot_squiggle(args, filename, start_times, mean_signals):
    """
    Use rpy2 to create a squiggle plot of the read.

    The events are plotted as a step curve of mean signal against start
    time (relative to the first start time), split into stacked facets.

    Parameters:
        args: options object; ``num_facets``, ``theme_bw`` and ``saveas``
            are read from it.
        filename: name of the read; shown in the title and, when saving,
            its basename plus ``"." + args.saveas`` is the plot file name.
        start_times: non-empty sequence of event start times.
        mean_signals: sequence of mean signal values, one per start time.

    Raises:
        Exception: if ``args.saveas`` is set and the plot file already
            exists in the current directory.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]
    total_time = start_times[-1] - t_0

    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
    r_mean_signals = robjects.FloatVector(mean_signals)

    # infer the appropriate number of events given the number of facets.
    # Floor division is spelled out so the facet index stays a whole number
    # regardless of how "/" treats integers.
    num_events = len(r_mean_signals)
    events_per_facet = (num_events // args.num_facets) + 1

    # dummy variable to control faceting
    facet_category = robjects.FloatVector(
        [(i // events_per_facet) + 1 for i in range(len(start_times))])

    # make a data frame of the start times and mean signals
    d = {'start': r_start_times,
         'mean': r_mean_signals,
         'cat': facet_category}
    df = robjects.DataFrame(d)

    title = ('Squiggle plot for read: ' + filename
             + "\nTotal time (sec): " + str(total_time))
    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='start', y='mean') \
        + ggplot2.geom_step(size=0.25) \
        + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1,
                             scales="free_x") \
        + ggplot2.scale_x_continuous('Time (seconds)') \
        + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
        + ggplot2.ggtitle(title)

    # theme_bw() is a complete theme: adding it replaces earlier theme
    # settings, so it must come BEFORE the title-size tweak or the tweak
    # is lost.
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()
    pp = pp + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})

    if args.saveas is not None:
        plot_file = os.path.basename(filename) + "." + args.saveas
        if os.path.isfile(plot_file):
            raise Exception(
                'Cannot create plot for %s: plot file %s already exists'
                % (filename, plot_file))
        if args.saveas == "pdf":
            grdevices.pdf(plot_file, width=8.5, height=11)
        elif args.saveas == "png":
            grdevices.png(plot_file, width=8.5, height=11,
                          units="in", res=300)
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
def interval(locus_table, interval_table, intervals, loci, boxplot=True):
    """Build a ggplot2 plot of 'pi' per interval.

    The query produced by ``get_interval_query`` is run in R via
    ``dbGetQuery(con, ...)`` and stored in the R variable ``data``.

    Parameters:
        locus_table, interval_table, intervals, loci: forwarded to
            ``get_interval_query`` to build the query text.
        boxplot: when true, draw box plots with jittered per-locus points;
            otherwise draw bars faceted by locus.

    Returns:
        The composed ggplot2 plot object (not yet rendered).
    """
    query = get_interval_query(intervals, loci, locus_table, interval_table)
    result = robjects.r('''data <- dbGetQuery(con, {})'''.format(query))

    # 'interval' is a factor, so its levels have to be re-ordered explicitly
    # (order_intervals supplies the level specification from the second
    # column of the query result); sorting in R is less than pleasant.
    level_spec = order_intervals(result[1])
    robjects.r(
        '''data$interval <- factor(data$interval, {})'''.format(level_spec))

    base = ggplot2.ggplot(robjects.r('''data'''))

    if not boxplot:
        return (
            base
            + ggplot2.aes_string(x='interval', y='pi', fill='locus')
            + ggplot2.geom_bar()
            + ggplot2.facet_wrap(robjects.Formula('~ locus'))
            + ggplot2.opts(**{
                'axis.text.x': ggplot2.theme_text(angle=-90, hjust=0),
                'legend.position': 'none',
            })
            + ggplot2.scale_y_continuous('phylogenetic informativeness')
            + ggplot2.scale_x_discrete('interval (years ago)')
        )

    return (
        base
        + ggplot2.aes_string(x='interval', y='pi')
        + ggplot2.geom_boxplot(**{'outlier.size': 0, 'alpha': 0.3})
        + ggplot2.geom_jitter(
            ggplot2.aes_string(color='locus'),
            size=3,
            alpha=0.6,
            position=ggplot2.position_jitter(width=0.25))
        + ggplot2.scale_y_continuous('phylogenetic informativeness')
        + ggplot2.scale_x_discrete('interval (years ago)')
    )
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile=None,
              height=120, fsize=12):
    """Box-plot classification and regression coefficients across experiments.

    For every experiment name the model is loaded from
    ``<model_dir>/<pref><name>_model.pkl`` and the feature names from
    ``<feat_mat_dir>/<name>_feat_mat.npz``; the coefficients of all
    experiments are concatenated column-wise and plotted per feature.

    Parameters:
        feat_mat_dir: directory holding the ``*_feat_mat.npz`` files.
        model_dir: directory holding the ``*_model.pkl`` files.
        expt_names: iterable of experiment names (at least one).
        pref: prefix prepended to each model file name.
        outfile: if None the plot is drawn on the current device, otherwise
            it is written to this path with ``ggsave``.
        height: height of the saved plot, in millimetres.
        fsize: font size of the axis text (facet strip text uses fsize + 1).

    Returns:
        pandas.DataFrame with columns 'feature', 'Classification' and
        'Regression' (the frame before melting).

    Raises:
        ValueError: if ``expt_names`` is empty.
        AssertionError: if an experiment's feature names differ from those
            of the first experiment.
    """
    nexpt = 0
    feat_names = clf_coef = reg_coef = None
    for ex in expt_names:
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        # Only the feature names are needed from the feature-matrix file.
        (_, _, tmp_feat_names, _) = read_feat_mat(feat_mat_file)
        if nexpt == 0:
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
        else:
            assert all(a == b for a, b in zip(feat_names, tmp_feat_names))
            clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis=1)
            reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis=1)
        nexpt += 1

    # Fail with a clear message instead of an unbound-variable error below.
    if nexpt == 0:
        raise ValueError('plot_coef needs at least one experiment name')

    # Now clf_coef has one row per coefficient and one column per experiment.
    # The reshape below will read the data row-first, which matches
    # repeating each feature name nexpt times.
    df = pd.DataFrame({
        'feature': np.repeat(feat_names, nexpt),
        'Classification': np.reshape(clf_coef, (clf_coef.size, )),
        'Regression': np.reshape(reg_coef, (reg_coef.size, )),
    })
    df2 = pd.melt(df, id_vars='feature', var_name='fun')
    r_df = com.convert_to_r_dataframe(df2)

    gp = ggplot2.ggplot(r_df) \
        + ggplot2.aes_string(x='factor(feature)', y='value') \
        + ggplot2.facet_wrap('fun', scales='free_y') \
        + ggplot2.geom_boxplot() \
        + ggplot2.scale_y_continuous('Importance') \
        + ggplot2.scale_x_discrete('') \
        + ggplot2.theme_bw() \
        + ggplot2.theme(**{
            'axis.text.x': ggplot2.element_text(size=fsize, angle=65,
                                                vjust=1, hjust=1),
            'axis.text.y': ggplot2.element_text(size=fsize),
            'strip.text.x': ggplot2.element_text(size=fsize + 1),
        })

    # Plot width grows with the number of experiments, with a floor of 80.
    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        # Use ggsave's real argument name 'units' rather than relying on
        # R partial argument matching of 'unit'.
        ro.r.ggsave(filename=outfile, plot=gp, width=w, height=height,
                    units='mm')
    return df
def plotStats(data, outFolder, tiles, prop="qual", prefix="", high="yellow", low="blue", pdf=False, detail=True):
    """Save an overview scatter plot of `prop` and, optionally, one plot per tile.

    Parameters:
        data: R data frame (indexed with .rx / .rx2) with at least the
            columns "x", "y", "tile" and the column named by `prop`.
        outFolder: directory receiving the overview PNG; per-tile plots go
            to its "detailPlots" sub-folder.
        tiles: iterable of tile numbers to plot in detail.
        prop: column mapped to the point colour.
        prefix: optional file name prefix.
        high, low: colours for the two ends of the colour gradient.
        pdf: when true, also save every detail plot as PDF.
        detail: when false, only the overview plot is produced.
    """
    # overview plot
    overview = ggplot.ggplot(data) \
        + ggplot.aes_string(x="x", y="y", col=prop) \
        + ggplot.geom_point(size=0.1) \
        + ggplot.facet_wrap(robjects.Formula("~ tile")) \
        + ggplot.scale_colour_gradient(high=high, low=low) \
        + ggplot.ggtitle("Overview %s" % (prop))

    if prefix:
        overviewName = "%s_overview_%s.png" % (prefix, prop)
    else:
        overviewName = "overview_%s.png" % (prop)
    overview.save(os.path.join(outFolder, overviewName), scale=2)

    if not detail:
        return

    # detail plots
    detailFolder = os.path.join(outFolder, "detailPlots")
    for t in tiles:
        # keep only the rows belonging to the current tile
        tileRows = data.rx(data.rx2("tile").ro == t, True)
        tilePlot = ggplot.ggplot(tileRows) \
            + ggplot.aes_string(x="x", y="y", col=prop) \
            + ggplot.geom_point(size=1) \
            + ggplot.facet_wrap(robjects.Formula("~ tile")) \
            + ggplot.scale_colour_gradient(high=high, low=low) \
            + ggplot.ggtitle("%i %s" % (t, prop))

        if prefix:
            pngName = "%s_%i_%s.png" % (prefix, t, prop)
        else:
            pngName = "%i_%s.png" % (t, prop)
        tilePlot.save(os.path.join(detailFolder, pngName), scale=2)

        if pdf:
            pdfName = "%s%i_%s.pdf" % (prefix, t, prop)
            tilePlot.save(os.path.join(detailFolder, pdfName), scale=2)
def plot_dupl_url(self):
    """Plot and save the histogram of pages per URL (URL-level duplicates).

    Uses the rows of ``self.histogr`` whose 'type' is 'url', one facet per
    crawl, and writes the image below PLOTDIR.

    Returns:
        The ggplot2 plot object that was saved.
    """
    # -- pages per URL (URL-level duplicates)
    histogr = self.histogr
    url_rows = histogr[histogr['type'].isin(['url'])]
    plot = (
        ggplot2.ggplot(url_rows)
        + ggplot2.aes_string(x='count', y='frequency')
        + ggplot2.geom_jitter()
        + ggplot2.facet_wrap('crawl', ncol=5)
        + ggplot2.labs(title='Pages per URL (URL-level duplicates)',
                       x='(duplicate) pages per URL',
                       y='log(frequency)')
        + ggplot2.scale_y_log10()
    )
    # a log-log scale would additionally need ggplot2.scale_x_log10()
    target = os.path.join(PLOTDIR, 'crawler/histogr_url_dupl.png')
    plot.save(target)
    return plot
def plot_host_domain_tld(self):
    """Plot and save frequency polygons of URLs per host / domain / TLD.

    Uses the rows of ``self.histogr`` whose 'type' is host, domain or tld
    and whose 'type_counted' is 'url'; one facet per crawl, both axes on a
    log10 scale. The image is written below PLOTDIR.

    Returns:
        The ggplot2 plot object that was saved.
    """
    # -- pages/URLs per host / domain / tld
    histogr = self.histogr
    by_type = histogr[histogr['type'].isin(['host', 'domain', 'tld'])]
    url_counts = by_type[by_type['type_counted'].isin(['url'])]
    target = os.path.join(PLOTDIR, 'crawler/histogr_host_domain_tld.png')
    # the descriptive text is used as the x-axis label; the plot title
    # itself is left empty
    x_label = 'URLs per Host / Domain / TLD'
    plot = (
        ggplot2.ggplot(url_counts)
        + ggplot2.aes_string(x='count', weight='frequency', color='type')
        + ggplot2.geom_freqpoly(bins=20)
        + ggplot2.facet_wrap('crawl', ncol=4)
        + ggplot2.labs(title='', x=x_label, y='Frequency')
        + ggplot2.scale_y_log10()
        + ggplot2.scale_x_log10()
    )
    plot.save(target)
    return plot
def multiple_locus_net_informativeness_facet(locus_table, net_pi_table, loci):
    """Build a faceted scatter plot of net 'pi' over time, one panel per locus.

    The data are fetched in R with ``dbGetQuery(con, ...)`` by joining
    ``locus_table`` and ``net_pi_table`` on ``id``.

    Parameters:
        locus_table: name of the table providing the ``locus`` column.
        net_pi_table: name of the table joined on ``id``.
        loci: non-empty sequence of locus names; if the first entry is
            'all' (case-insensitive) no locus filter is applied.

    Returns:
        The composed ggplot2 plot object (not yet rendered).
    """
    if loci[0].lower() != 'all':
        # Build the IN (...) list by hand. Formatting tuple(loci) directly
        # yields "('a',)" for a single locus (a trailing comma is invalid
        # SQL) and u'...' prefixes for unicode names on Python 2.
        # Embedded single quotes are doubled as SQL requires.
        # NOTE(review): the query is embedded in a double-quoted R string,
        # so a locus name containing '"' would still break it.
        quoted = ["'" + name.replace("'", "''") + "'" for name in loci]
        locus_list = '(' + ', '.join(quoted) + ')'
        qry = '''"SELECT {0}.locus, time, pi FROM {0}, {1} WHERE {0}.id = {1}.id and locus in {2}"'''.format(
            locus_table, net_pi_table, locus_list)
    else:
        qry = '''"SELECT {0}.locus, time, pi FROM {0}, {1} WHERE {0}.id = {1}.id"'''.format(
            locus_table, net_pi_table)

    frame = robjects.r('''dbGetQuery(con, {})'''.format(qry))
    gg_frame = ggplot2.ggplot(frame)
    plot = gg_frame \
        + ggplot2.aes_string(x='time', y='pi') \
        + ggplot2.geom_point(ggplot2.aes_string(colour='locus'),
                             size=3, alpha=0.4) \
        + ggplot2.scale_x_reverse('years ago') \
        + ggplot2.facet_wrap(robjects.Formula('~ locus')) \
        + ggplot2.opts(**{'legend.position': 'none'}) \
        + ggplot2.scale_y_continuous('phylogenetic informativeness')
    return plot
def mem_usage_graph(cfg):
    """Write 'bargraph-memusage.pdf': memory usage per problem and language.

    For every (variation, language, problem) combination in `cfg`, the
    first line of the file named by ``get_mem_output`` is read as a float.
    The values are drawn as dodged bars, one facet per variation.
    """
    r = robjects.r
    varis, langs, probs, mems = [], [], [], []

    for var in cfg.variations:
        for lang in cfg.languages:
            for prob in cfg.problems:
                with open(get_mem_output(lang, prob, var), 'r') as mem_file:
                    first_line = mem_file.readline()
                    mems.append(float(first_line))
                varis.append(pretty_varis[var])
                langs.append(pretty_langs[lang])
                probs.append(prob)

    # memory usage is a simple histogram with all information in one graph.
    r.pdf('bargraph-memusage.pdf', height=pdf_height(), width=pdf_width())

    columns = {
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Mem': FloatVector(mems),
    }
    base = ggplot2.ggplot(robjects.DataFrame(columns))

    # the x labels are rotated so that they do not overlap
    chart = (
        base
        + ggplot2.opts(**{'axis.text.x': ggplot2.theme_text(angle=90, hjust=1)})
        + ggplot2.aes_string(x='Problem', y='Mem', fill='Language')
        + ggplot2.geom_bar(position='dodge', stat='identity')
        + ggplot2.facet_wrap('Variation')
        + ggplot2_options()
        + ggplot2_colors()
        + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
        + robjects.r('ylab("Memory usage (in bytes)")')
    )
    chart.plot()
    r['dev.off']()
def mem_usage_graph(cfg):
    """Write 'bargraph-memusage.pdf': memory usage per problem and language.

    For every (variation, language, problem) combination in `cfg`, the
    first line of the file named by ``get_mem_output`` is read as a float.
    The values are drawn as dodged bars, one facet per variation.
    """
    r = robjects.r
    varis, langs, probs, mems = [], [], [], []

    for var in cfg.variations:
        for lang in cfg.languages:
            for prob in cfg.problems:
                with open(get_mem_output(lang, prob, var), 'r') as mem_file:
                    first_line = mem_file.readline()
                    mems.append(float(first_line))
                varis.append(pretty_varis[var])
                langs.append(pretty_langs[lang])
                probs.append(prob)

    # memory usage is a simple histogram with all information in one graph.
    r.pdf('bargraph-memusage.pdf', height=pdf_height(), width=pdf_width())

    columns = {
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Mem': FloatVector(mems),
    }
    base = ggplot2.ggplot(robjects.DataFrame(columns))

    # the x labels are rotated so that they do not overlap
    chart = (
        base
        + ggplot2.opts(**{'axis.text.x': ggplot2.theme_text(angle=90, hjust=1)})
        + ggplot2.aes_string(x='Problem', y='Mem', fill='Language')
        + ggplot2.geom_bar(position='dodge', stat='identity')
        + ggplot2.facet_wrap('Variation')
        + ggplot2_options()
        + ggplot2_colors()
        + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
        + robjects.r('ylab("Memory usage (in bytes)")')
    )
    chart.plot()
    r['dev.off']()
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile=None,
              height=120, fsize=12):
    """Box-plot classification and regression coefficients across experiments.

    For every experiment name the model is loaded from
    ``<model_dir>/<pref><name>_model.pkl`` and the feature names from
    ``<feat_mat_dir>/<name>_feat_mat.npz``; the coefficients of all
    experiments are concatenated column-wise and plotted per feature.

    Parameters:
        feat_mat_dir: directory holding the ``*_feat_mat.npz`` files.
        model_dir: directory holding the ``*_model.pkl`` files.
        expt_names: iterable of experiment names (at least one).
        pref: prefix prepended to each model file name.
        outfile: if None the plot is drawn on the current device, otherwise
            it is written to this path with ``ggsave``.
        height: height of the saved plot, in millimetres.
        fsize: font size of the axis text (facet strip text uses fsize + 1).

    Returns:
        pandas.DataFrame with columns 'feature', 'Classification' and
        'Regression' (the frame before melting).

    Raises:
        ValueError: if ``expt_names`` is empty.
        AssertionError: if an experiment's feature names differ from those
            of the first experiment.
    """
    nexpt = 0
    feat_names = clf_coef = reg_coef = None
    for ex in expt_names:
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        # Only the feature names are needed from the feature-matrix file.
        (_, _, tmp_feat_names, _) = read_feat_mat(feat_mat_file)
        if nexpt == 0:
            feat_names = tmp_feat_names
            clf_coef = model.clf_coef()
            reg_coef = model.reg_coef()
        else:
            assert all(a == b for a, b in zip(feat_names, tmp_feat_names))
            clf_coef = np.concatenate((clf_coef, model.clf_coef()), axis=1)
            reg_coef = np.concatenate((reg_coef, model.reg_coef()), axis=1)
        nexpt += 1

    # Fail with a clear message instead of an unbound-variable error below.
    if nexpt == 0:
        raise ValueError('plot_coef needs at least one experiment name')

    # Now clf_coef has one row per coefficient and one column per experiment.
    # The reshape below will read the data row-first, which matches
    # repeating each feature name nexpt times.
    df = pd.DataFrame({
        'feature': np.repeat(feat_names, nexpt),
        'Classification': np.reshape(clf_coef, (clf_coef.size, )),
        'Regression': np.reshape(reg_coef, (reg_coef.size, )),
    })
    df2 = pd.melt(df, id_vars='feature', var_name='fun')
    r_df = com.convert_to_r_dataframe(df2)

    gp = ggplot2.ggplot(r_df) \
        + ggplot2.aes_string(x='factor(feature)', y='value') \
        + ggplot2.facet_wrap('fun', scales='free_y') \
        + ggplot2.geom_boxplot() \
        + ggplot2.scale_y_continuous('Importance') \
        + ggplot2.scale_x_discrete('') \
        + ggplot2.theme_bw() \
        + ggplot2.theme(**{
            'axis.text.x': ggplot2.element_text(size=fsize, angle=65,
                                                vjust=1, hjust=1),
            'axis.text.y': ggplot2.element_text(size=fsize),
            'strip.text.x': ggplot2.element_text(size=fsize + 1),
        })

    # Plot width grows with the number of experiments, with a floor of 80.
    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        # Use ggsave's real argument name 'units' rather than relying on
        # R partial argument matching of 'unit'.
        ro.r.ggsave(filename=outfile, plot=gp, width=w, height=height,
                    units='mm')
    return df
def rpy2_plotter(anno, clusters, name):
    """Write a PDF of per-category histograms of cluster feature counts.

    Only the ten categories with the largest "n_ft" in `anno` are kept;
    for these, the "n_features" column of `clusters` is drawn as a
    histogram (bin width 1), one facet per category, into the file `name`.
    """
    pandas2ri.activate()
    grdevices = importr('grDevices')
    rprint = robjects.globalenv.get("print")

    # ten categories with the highest n_ft
    top_anno = anno.sort_values(by="n_ft", ascending=False).head(n=10)
    top_categories = top_anno["category"].tolist()

    selected = clusters[clusters["category"].isin(top_categories)]
    r_clusters = pandas2ri.py2ri(selected)

    histogram = (
        ggplot2.ggplot(r_clusters)
        + ggplot2.aes_string(x="n_features")
        + ggplot2.geom_histogram(binwidth=1)
        + ggplot2.facet_wrap(robjects.Formula("~category"), ncol=5)
        + ggplot2.labs(x="Number of Features",
                       y="Number of Clusters",
                       title="Clusters distribution")
    )

    grdevices.pdf(file=name, width=11.692, height=8.267)
    rprint(histogram)
    grdevices.dev_off()
min_samples_leaf=random_search.best_params_['min_samples_leaf'], min_samples_split=random_search.best_params_['min_samples_split'], bootstrap=random_search.best_params_['bootstrap']) rf.fit(X_train, Y_train) # Check MSE and R^2 of RF approach mean_squared_error(Y_test, rf.predict(X_test)) r2_score(Y_test, rf.predict(X_test)) # Prepare the RF output to be plotted pickups_2015_actual = pickups_14_15[pickups_14_15['year'] == 2015][['borough', 'nbhd', 'date', 'hour', 'passenger_count']] pickups_2015_pred = pickups_2015_actual.copy() pickups_2015_pred['passenger_count'] = rf.predict(X_test) pickups_2015_actual['type'] = 'actual' pickups_2015_pred['type'] = 'predicted' pickups_2015 = pd.concat([pickups_2015_actual, pickups_2015_pred], axis=0) pickups_2015 = pickups_2015.groupby(['borough', 'nbhd', 'date', 'type'])['passenger_count'].sum().reset_index() # Pick a random date to look at pickups_2015 = pickups_2015[pickups_2015['date'] == '2015-05-07'] pickups_2015 = pd.merge(pickups_2015, nbhd_borders, how='right', on=['nbhd']).dropna() pickups_2015['passenger_count'] = np.log1p(pickups_2015['passenger_count']) # Plot actual vs. predicted p6 = ggplot2.ggplot(pandas2ri.py2ri(pickups_2015)) + \ ggplot2.aes_string(x='lon', y='lat', group='nbhd', fill='passenger_count') + \ ggplot2.geom_polygon() + \ ggplot2.facet_wrap(robjects.Formula('~ type')) + \ ggplot2.scale_fill_gradient(low='yellow', high='red') + \ ggplot2.theme(legend_position='bottom') + \ ggplot2.labs(x='', y='', title='Actual vs. Expected Total Passengers on 5-7-2015', fill='Passenger Count\n(Log-Scale)') p6.save('./plots/actual_pred_pickups.png', width=7, height=5)
# First operand: the `times` / `combos` values; second operand: the
# `times_r` / `combos_r` values. "+" on rpy2 vectors concatenates.
py_times = FloatVector(list(times))
d['time'] = py_times + FloatVector(times_r)

py_loops = IntVector([combo[-1] for combo in combos])
r_loops = IntVector([combo[3] for combo in combos_r])
d['n_loop'] = py_loops + r_loops

# one "<code>:<sequence>" label per row
n_rows = len(d['n_loop'])
d['group'] = StrVector(
    [d['code'][row] + ':' + d['sequence'][row] for row in range(n_rows)])
dataf = DataFrame(d)

from rpy2.robjects.lib import ggplot2

# lines and points share the same mapping
xy_by_code = ggplot2.aes_string(x="n_loop", y="time", colour="code")
p = (
    ggplot2.ggplot(dataf)
    + ggplot2.geom_line(xy_by_code)
    + ggplot2.geom_point(xy_by_code)
    + ggplot2.facet_wrap(Formula('~sequence'))
    + ggplot2.scale_y_continuous('running time')
    + ggplot2.scale_x_continuous('repeated n times')
    + ggplot2.xlim(0, max(n_loops))
    + ggplot2.labs(title="Benchmark (running time)")
)

from rpy2.robjects.packages import importr

# render the plot into a PNG file
grdevices = importr('grDevices')
grdevices.png('../../_static/benchmark_sum.png', width=712, height=512)
p.plot()
grdevices.dev_off()

stats = importr('stats')
def as_dataframe(cfg, results, basis):
    """Collect timing, memory and speedup data, save it, and plot speedups.

    One row is built per (variation, language, problem, thread count).
    Rows whose language or problem is the artificial 'ideal' entry carry a
    speedup equal to the thread count. The full table is saved to
    'performance.Rda' and the 'expertpar' speedups are plotted to
    'speedup-expertpar-all.pdf'.

    Parameters:
        cfg: configuration object; ``variations``, ``languages``,
            ``problems`` and ``threads`` are read from it.
        results: nested mapping indexed as
            ``results[thread][prob][var][lang][0]``, giving the samples
            whose mean is stored in the 'Time' column.
        basis: baseline for the speedup: 'seq' (the matching sequential
            variation), 'p1' (the same variation with one thread) or
            'fastest' (whichever of the two has the lower mean).

    Returns:
        None. The results are the two files written.
    """
    r = robjects.r
    varis = []
    langs = []
    probs = []
    times = []
    threads = []
    # speedups, with upper and lower bounds below
    speedups = []
    speedup_lowers = []
    speedup_uppers = []
    ses = []   # half-width of the confidence interval on the time
    mems = []  # memory usage

    langs_ideal = list(cfg.languages)
    langs_ideal.append('ideal')
    probs_ideal = list(cfg.problems)
    probs_ideal.append('ideal')

    for var in cfg.variations:
        is_seq = var.find('seq') >= 0
        for lang in langs_ideal:
            for prob in probs_ideal:
                for thread in cfg.threads:
                    if lang == 'ideal' and prob == 'ideal':
                        continue
                    elif lang == 'ideal' or prob == 'ideal':
                        # reference row: speedup equals the thread count
                        varis.append(var)
                        langs.append(pretty_langs[lang])
                        probs.append(prob)
                        threads.append(thread)
                        speedups.append(thread)
                        speedup_lowers.append(thread)
                        speedup_uppers.append(thread)
                        times.append(0)
                        ses.append(0)
                        mems.append(0)
                        continue

                    varis.append(var)
                    langs.append(pretty_langs[lang])
                    probs.append(prob)
                    threads.append(thread)

                    # sequential variations are always looked up under the
                    # last configured thread count
                    data_thread = cfg.threads[-1] if is_seq else thread
                    vals = FloatVector(results[data_thread][prob][var][lang][0])
                    mean_time = mean(vals)
                    times.append(mean_time)

                    #
                    # time confidence interval
                    #
                    # The keyword must be exactly "conf.level": with a
                    # leading space it does not match t.test's argument and
                    # R silently falls back to the default level.
                    t_result = r['t.test'](
                        FloatVector(vals),
                        **{"conf.level": 0.999}).rx('conf.int')[0]
                    ses.append((t_result[1] - t_result[0]) / 2)

                    #
                    # memory usage
                    #
                    mem_filename = get_mem_output(lang, prob, var)
                    with open(mem_filename, 'r') as mem_file:
                        mem = mem_file.readline()
                        mems.append(float(mem))

                    # we include dummy data for the sequential case to
                    # avoid the speedup calculation below
                    if is_seq:
                        speedups.append(1)
                        speedup_lowers.append(1)
                        speedup_uppers.append(1)
                        continue

                    #
                    # speedup values and confidence intervals
                    #
                    seq_vals = results[cfg.threads[-1]][prob][
                        var.replace('par', 'seq')][lang][0]

                    # sequential base
                    base = FloatVector(seq_vals)
                    # base with p = 1
                    base_p1 = FloatVector(results[1][prob][var][lang][0])
                    # use fastest sequential program
                    if basis == 'fastest' and mean(base_p1) < mean(base):
                        base = base_p1
                    elif basis == 'seq':
                        pass
                    elif basis == 'p1':
                        base = base_p1

                    labels = (['Base'] * r.length(base)[0]
                              + ['N'] * r.length(vals)[0])
                    # "+" on rpy2 vectors concatenates base and vals
                    ratio_df = DataFrame({'Times': base + vals,
                                          'Type': StrVector(labels)})
                    ratio_test = r['pairwiseCI'](
                        r('Times ~ Type'), data=ratio_df,
                        control='N',
                        method='Param.ratio',
                        **{'var.equal': False})[0][0]

                    speedups.append(mean(base) / mean_time)
                    speedup_lowers.append(ratio_test[1][0])
                    speedup_uppers.append(ratio_test[2][0])

    df = robjects.DataFrame({
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Threads': IntVector(threads),
        'Time': FloatVector(times),
        'SE': FloatVector(ses),
        'Speedup': FloatVector(speedups),
        'SpeedupLower': FloatVector(speedup_lowers),
        'SpeedupUpper': FloatVector(speedup_uppers),
        'Mem': FloatVector(mems),
    })

    r.assign('df', df)
    r('save (df, file="performance.Rda")')

    # reshape the data to make variation not a column itself, but a part of
    # the other columns describe ie, time, speedup, etc.
    #
    # also, remove the 'ideal' problem as we don't want it in this plot.
    df = r('''
redf = reshape (df,
                timevar="Variation",
                idvar = c("Language","Problem","Threads"),
                direction="wide")
redf$Problem <- factor(redf$Problem, levels = c("randmat","thresh","winnow","outer","product","chain"))
redf[which(redf$Problem != "ideal"),]
''')

    r.pdf('speedup-expertpar-all.pdf', height=6.5, width=10)

    change_name = 'Language'

    gg = ggplot2.ggplot(df)

    limits = ggplot2.aes(ymax='SpeedupUpper.expertpar',
                         ymin='SpeedupLower.expertpar')

    pp = gg + \
        ggplot2.geom_line() + ggplot2.geom_point(size=2.5) + \
        robjects.r('scale_color_manual(values = c("#ffcb7e", "#1da06b", "#b94646", "#00368a", "#CCCCCC"))') + \
        ggplot2.aes_string(x='Threads', y='Speedup.expertpar',
                           group=change_name, color=change_name,
                           shape=change_name) + \
        ggplot2.geom_errorbar(limits, width=0.25) + \
        ggplot2.opts(**{
            'axis.title.x': ggplot2.theme_text(family='serif', face='bold', size=10, vjust=-0.2),
            'axis.title.y': ggplot2.theme_text(family='serif', face='bold', size=10, angle=90, vjust=0.2),
            'axis.text.x': ggplot2.theme_text(family='serif', size=10),
            'axis.text.y': ggplot2.theme_text(family='serif', size=10),
            'legend.title': ggplot2.theme_text(family='serif', face='bold', size=10),
            'legend.text': ggplot2.theme_text(family='serif', size=10),
            'strip.text.x': ggplot2.theme_text(family='serif', size=10),
            'aspect.ratio': 1,
        }) + \
        robjects.r('ylab("Speedup")') + \
        robjects.r('xlab("Number of cores")') + \
        ggplot2.facet_wrap('Problem', nrow=2)

    pp.plot()

    r['dev.off']()
def plot_qc_reads(qc_df):
    """
    Plot number of reads part of a pipeline QC file.

    Draws one horizontal dot panel per read-count column (log10 scale),
    with one point per sample, on the current R graphics device.

    Parameters:
        qc_df: pandas DataFrame with a "sample" column and the count
            columns "num_reads", "num_mapped", "num_unique_mapped" and
            "num_junctions". Missing values are treated as 0.

    Returns:
        None.
    """
    count_cols = ["num_reads", "num_mapped", "num_unique_mapped",
                  "num_junctions"]
    # Record NA values as 0
    qc_df = qc_df.fillna(0)
    qc_df = qc_df[["sample"] + count_cols]
    melted_qc = pandas.melt(qc_df, id_vars=["sample"])
    qc_r = conversion_pydataframe(melted_qc)

    # Turn 'variable' into a factor whose levels follow count_cols.
    labels = robj.StrVector(tuple(count_cols))
    variable_i = qc_r.names.index('variable')
    qc_r[variable_i] = robj.FactorVector(qc_r[variable_i], levels=labels)

    ggplot2.theme_set(ggplot2.theme_bw(12))
    scales = importr("scales")
    # NOTE: this changes the global R option and is not restored here.
    r.options(scipen=4)
    p = ggplot2.ggplot(qc_r) + \
        ggplot2.geom_point(aes_string(x="sample", y="value")) + \
        ggplot2.scale_y_continuous(
            trans=scales.log10_trans(),
            breaks=scales.trans_breaks("log10", robj.r('function(x) 10^x')),
            labels=scales.trans_format("log10", robj.r('math_format(10^.x)'))) + \
        r.xlab("CLIP-Seq samples") + \
        r.ylab("No. reads") + \
        ggplot2.coord_flip() + \
        ggplot2.facet_wrap(Formula("~ variable"), ncol=1) + \
        theme(**{"panel.grid.major.x": element_blank(),
                 "panel.grid.minor.x": element_blank(),
                 "panel.grid.major.y": theme_line(size=0.5, colour="grey66", linetype=3)})
    p.plot()
def plot_squiggle(args, filename, start_times, mean_signals):
    """
    Use rpy2 to create a squiggle plot of the read.

    The events are plotted as a step curve of mean signal against start
    time (relative to the first start time), split into stacked facets.

    Parameters:
        args: options object; ``num_facets``, ``theme_bw`` and ``saveas``
            are read from it.
        filename: name of the read; shown in the title and, when saving,
            its basename plus ``"." + args.saveas`` is the plot file name.
        start_times: non-empty sequence of event start times.
        mean_signals: sequence of mean signal values, one per start time.

    Raises:
        Exception: if ``args.saveas`` is set and the plot file already
            exists in the current directory.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]
    total_time = start_times[-1] - t_0

    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
    r_mean_signals = robjects.FloatVector(mean_signals)

    # infer the appropriate number of events given the number of facets.
    # Floor division is spelled out so the facet index stays a whole number
    # regardless of how "/" treats integers.
    num_events = len(r_mean_signals)
    events_per_facet = (num_events // args.num_facets) + 1

    # dummy variable to control faceting
    facet_category = robjects.FloatVector(
        [(i // events_per_facet) + 1 for i in range(len(start_times))])

    # make a data frame of the start times and mean signals
    d = {'start': r_start_times,
         'mean': r_mean_signals,
         'cat': facet_category}
    df = robjects.DataFrame(d)

    title = ('Squiggle plot for read: ' + filename
             + "\nTotal time (sec): " + str(total_time))
    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='start', y='mean') \
        + ggplot2.geom_step(size=0.25) \
        + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1,
                             scales="free_x") \
        + ggplot2.scale_x_continuous('Time (seconds)') \
        + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
        + ggplot2.ggtitle(title)

    # theme_bw() is a complete theme: adding it replaces earlier theme
    # settings, so it must come BEFORE the title-size tweak or the tweak
    # is lost.
    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()
    pp = pp + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})

    if args.saveas is not None:
        plot_file = os.path.basename(filename) + "." + args.saveas
        if os.path.isfile(plot_file):
            raise Exception(
                'Cannot create plot for %s: plot file %s already exists'
                % (filename, plot_file))
        if args.saveas == "pdf":
            grdevices.pdf(plot_file, width=8.5, height=11)
        elif args.saveas == "png":
            grdevices.png(plot_file, width=8.5, height=11,
                          units="in", res=300)
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
def as_dataframe(cfg, results, basis):
    """Collect timing, memory and speedup data, save it, and plot speedups.

    One row is built per (variation, language, problem, thread count).
    Rows whose language or problem is the artificial 'ideal' entry carry a
    speedup equal to the thread count. The full table is saved to
    'performance.Rda' and the 'expertpar' speedups are plotted to
    'speedup-expertpar-all.pdf'.

    Parameters:
        cfg: configuration object; ``variations``, ``languages``,
            ``problems`` and ``threads`` are read from it.
        results: nested mapping indexed as
            ``results[thread][prob][var][lang][0]``, giving the samples
            whose mean is stored in the 'Time' column.
        basis: baseline for the speedup: 'seq' (the matching sequential
            variation), 'p1' (the same variation with one thread) or
            'fastest' (whichever of the two has the lower mean).

    Returns:
        None. The results are the two files written.
    """
    r = robjects.r
    varis = []
    langs = []
    probs = []
    times = []
    threads = []
    # speedups, with upper and lower bounds below
    speedups = []
    speedup_lowers = []
    speedup_uppers = []
    ses = []   # half-width of the confidence interval on the time
    mems = []  # memory usage

    langs_ideal = list(cfg.languages)
    langs_ideal.append('ideal')
    probs_ideal = list(cfg.problems)
    probs_ideal.append('ideal')

    for var in cfg.variations:
        is_seq = var.find('seq') >= 0
        for lang in langs_ideal:
            for prob in probs_ideal:
                for thread in cfg.threads:
                    if lang == 'ideal' and prob == 'ideal':
                        continue
                    elif lang == 'ideal' or prob == 'ideal':
                        # reference row: speedup equals the thread count
                        varis.append(var)
                        langs.append(pretty_langs[lang])
                        probs.append(prob)
                        threads.append(thread)
                        speedups.append(thread)
                        speedup_lowers.append(thread)
                        speedup_uppers.append(thread)
                        times.append(0)
                        ses.append(0)
                        mems.append(0)
                        continue

                    varis.append(var)
                    langs.append(pretty_langs[lang])
                    probs.append(prob)
                    threads.append(thread)

                    # sequential variations are always looked up under the
                    # last configured thread count
                    data_thread = cfg.threads[-1] if is_seq else thread
                    vals = FloatVector(results[data_thread][prob][var][lang][0])
                    mean_time = mean(vals)
                    times.append(mean_time)

                    #
                    # time confidence interval
                    #
                    # The keyword must be exactly "conf.level": with a
                    # leading space it does not match t.test's argument and
                    # R silently falls back to the default level.
                    t_result = r['t.test'](
                        FloatVector(vals),
                        **{"conf.level": 0.999}).rx('conf.int')[0]
                    ses.append((t_result[1] - t_result[0]) / 2)

                    #
                    # memory usage
                    #
                    mem_filename = get_mem_output(lang, prob, var)
                    with open(mem_filename, 'r') as mem_file:
                        mem = mem_file.readline()
                        mems.append(float(mem))

                    # we include dummy data for the sequential case to
                    # avoid the speedup calculation below
                    if is_seq:
                        speedups.append(1)
                        speedup_lowers.append(1)
                        speedup_uppers.append(1)
                        continue

                    #
                    # speedup values and confidence intervals
                    #
                    seq_vals = results[cfg.threads[-1]][prob][
                        var.replace('par', 'seq')][lang][0]

                    # sequential base
                    base = FloatVector(seq_vals)
                    # base with p = 1
                    base_p1 = FloatVector(results[1][prob][var][lang][0])
                    # use fastest sequential program
                    if basis == 'fastest' and mean(base_p1) < mean(base):
                        base = base_p1
                    elif basis == 'seq':
                        pass
                    elif basis == 'p1':
                        base = base_p1

                    labels = (['Base'] * r.length(base)[0]
                              + ['N'] * r.length(vals)[0])
                    # "+" on rpy2 vectors concatenates base and vals
                    ratio_df = DataFrame({'Times': base + vals,
                                          'Type': StrVector(labels)})
                    ratio_test = r['pairwiseCI'](
                        r('Times ~ Type'), data=ratio_df,
                        control='N',
                        method='Param.ratio',
                        **{'var.equal': False})[0][0]

                    speedups.append(mean(base) / mean_time)
                    speedup_lowers.append(ratio_test[1][0])
                    speedup_uppers.append(ratio_test[2][0])

    df = robjects.DataFrame({
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Threads': IntVector(threads),
        'Time': FloatVector(times),
        'SE': FloatVector(ses),
        'Speedup': FloatVector(speedups),
        'SpeedupLower': FloatVector(speedup_lowers),
        'SpeedupUpper': FloatVector(speedup_uppers),
        'Mem': FloatVector(mems),
    })

    r.assign('df', df)
    r('save (df, file="performance.Rda")')

    # reshape the data to make variation not a column itself, but a part of
    # the other columns describe ie, time, speedup, etc.
    #
    # also, remove the 'ideal' problem as we don't want it in this plot.
    df = r('''
redf = reshape (df,
                timevar="Variation",
                idvar = c("Language","Problem","Threads"),
                direction="wide")
redf$Problem <- factor(redf$Problem, levels = c("randmat","thresh","winnow","outer","product","chain"))
redf[which(redf$Problem != "ideal"),]
''')

    r.pdf('speedup-expertpar-all.pdf', height=6.5, width=10)

    change_name = 'Language'

    gg = ggplot2.ggplot(df)

    limits = ggplot2.aes(ymax='SpeedupUpper.expertpar',
                         ymin='SpeedupLower.expertpar')

    pp = gg + \
        ggplot2.geom_line() + ggplot2.geom_point(size=2.5) + \
        robjects.r('scale_color_manual(values = c("#ffcb7e", "#1da06b", "#b94646", "#00368a", "#CCCCCC"))') + \
        ggplot2.aes_string(x='Threads', y='Speedup.expertpar',
                           group=change_name, color=change_name,
                           shape=change_name) + \
        ggplot2.geom_errorbar(limits, width=0.25) + \
        ggplot2.opts(**{
            'axis.title.x': ggplot2.theme_text(family='serif', face='bold', size=10, vjust=-0.2),
            'axis.title.y': ggplot2.theme_text(family='serif', face='bold', size=10, angle=90, vjust=0.2),
            'axis.text.x': ggplot2.theme_text(family='serif', size=10),
            'axis.text.y': ggplot2.theme_text(family='serif', size=10),
            'legend.title': ggplot2.theme_text(family='serif', face='bold', size=10),
            'legend.text': ggplot2.theme_text(family='serif', size=10),
            'strip.text.x': ggplot2.theme_text(family='serif', size=10),
            'aspect.ratio': 1,
        }) + \
        robjects.r('ylab("Speedup")') + \
        robjects.r('xlab("Number of cores")') + \
        ggplot2.facet_wrap('Problem', nrow=2)

    pp.plot()

    r['dev.off']()
def test_vars(self):
    """facet_wrap() accepts a facet specification built with ggplot2.vars()."""
    base = ggplot2.ggplot(mtcars)
    mapping = ggplot2.aes(x='wt', y='mpg')
    points = ggplot2.geom_point()
    facets = ggplot2.facet_wrap(ggplot2.vars('gears'))
    gp = base + mapping + points + facets
    assert isinstance(gp, ggplot2.GGPlot)
[x[0] for x in combos_r]) d['n_loop'] = IntVector([x[-1] for x in combos]) + IntVector( [x[1] for x in combos_r]) d['group'] = StrVector( [d['code'][x] + ':' + d['sequence'][x] for x in xrange(len(d['n_loop']))]) dataf = DataFrame(d) from rpy2.robjects.lib import ggplot2 p = ggplot2.ggplot(dataf) + \ ggplot2.geom_line(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \ ggplot2.geom_point(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \ ggplot2.facet_wrap(Formula('~sequence')) + \ ggplot2.scale_y_continuous('running time') + \ ggplot2.scale_x_continuous('repeated n times', ) + \ ggplot2.xlim(0, max(n_loops)) + \ ggplot2.opts(title = "Benchmark (running time)") from rpy2.robjects.packages import importr grdevices = importr('grDevices') grdevices.png('../../_static/benchmark_sum.png', width=712, height=512) p.plot() grdevices.dev_off() #base = importr("base") stats = importr('stats') nlme = importr("nlme") fit = nlme.lmList(Formula('time ~ n_loop | group'),
def test_as_labeller(self, labeller):
    """facet_wrap() accepts a labeller produced by ggplot2.as_labeller().

    A dict labeller is first converted to an R vector with dict2rvec.
    """
    spec = ggplot2.dict2rvec(labeller) if isinstance(labeller, dict) else labeller
    base = ggplot2.ggplot(mtcars)
    facets = ggplot2.facet_wrap(rl('~am'),
                                labeller=ggplot2.as_labeller(spec))
    gp = base + facets
    assert isinstance(gp, ggplot2.GGPlot)
iris_py = pandas.read_csv("/home/yarden/iris.csv") iris_py = iris_py.rename(columns={"Name": "Species"}) corrs = [] from scipy.stats import spearmanr for species in set(iris_py.Species): entries = iris_py[iris_py["Species"] == species] c = spearmanr(entries["SepalLength"], entries["SepalWidth"]) print "c: ", c # compute r.cor(x, y) and divide up by Species # Assume we get a vector of length Species saying what the # correlation is for each Species' Petal Length/Width p = ggplot2.ggplot(iris) + \ ggplot2.geom_point(ggplot2.aes_string(x="Sepal.Length", y="Sepal.Width")) + \ ggplot2.facet_wrap(Formula("~Species")) p.plot() r["dev.off"]() sys.exit(1) grdevices = importr('grDevices') ggplot2.theme_set(ggplot2.theme_bw(12)) p = ggplot2.ggplot(iris) + \ ggplot2.geom_point(ggplot2.aes_string(x="Sepal.Length", y="Sepal.Width")) + \ ggplot2.facet_wrap(Formula('~ Species'), ncol=2, nrow = 2) + \ ggplot2.geom_text(aes_string(x="Sepal.Length", y="Sepal.Width"), label="t") + \ ggplot2.GBaseObject(r('ggplot2::coord_fixed')()) # aspect ratio p.plot()