Ejemplo n.º 1
0
def plot_squiggle(args, filename, start_times, mean_signals):
	"""
	Use rpy2 to create a squiggle plot of the read.

	The mean signal of every event is drawn as a step plot against its
	start time (relative to the first event), split into stacked facets.

	Arguments:
	args -- options object; ``num_facets``, ``theme_bw`` and ``saveas``
		are read from it.
	filename -- read name; shown in the plot title and, when saving, its
		basename plus "." plus ``args.saveas`` is the output file name.
	start_times -- non-empty sequence of event start times.
	mean_signals -- mean signal per event, parallel to ``start_times``.

	Raises Exception if the plot file to be written already exists.
	"""
	r = robjects.r
	r.library("ggplot2")
	grdevices = importr('grDevices')

	# set t_0 as the first measured time for the read.
	t_0 = start_times[0]
	total_time = start_times[-1] - t_0
	# adjust times to be relative to t_0
	r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
	r_mean_signals = robjects.FloatVector(mean_signals)

	# infer the appropriate number of events given the number of facets.
	# Floor division keeps the facet ids integral on Python 3 as well; with
	# true division every event would end up in a facet of its own.
	num_events = len(r_mean_signals)
	events_per_facet = (num_events // args.num_facets) + 1
	# dummy variable to control faceting
	facet_category = robjects.FloatVector(
		[(i // events_per_facet) + 1 for i in range(len(start_times))])

	# make a data frame of the start times and mean signals
	d = {'start': r_start_times, 'mean': r_mean_signals, 'cat': facet_category}
	df = robjects.DataFrame(d)

	title = 'Squiggle plot for read: ' + filename \
		+ "\nTotal time (sec): " + str(total_time)
	pp = ggplot2.ggplot(df) \
		+ ggplot2.aes_string(x='start', y='mean') \
		+ ggplot2.geom_step(size=0.25) \
		+ ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \
		+ ggplot2.scale_x_continuous('Time (seconds)') \
		+ ggplot2.scale_y_continuous('Mean signal (picoamps)') \
		+ ggplot2.ggtitle(title)
	if args.theme_bw:
		# theme_bw() is a complete theme: adding it replaces every earlier
		# theme() setting, so it has to come before the title-size tweak.
		pp = pp + ggplot2.theme_bw()
	pp = pp + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})

	if args.saveas is not None:
		plot_file = os.path.basename(filename) + "." + args.saveas
		if os.path.isfile(plot_file):
			raise Exception('Cannot create plot for %s: plot file %s already exists' % (filename, plot_file))
		if args.saveas == "pdf":
			grdevices.pdf(plot_file, width=8.5, height=11)
		elif args.saveas == "png":
			grdevices.png(plot_file, width=8.5, height=11,
				units="in", res=300)
		try:
			pp.plot()
		finally:
			# close the device even when drawing fails so the file is
			# flushed and no device is left open
			grdevices.dev_off()
	else:
		pp.plot()
		# keep the plot open until user hits enter
		print('Type enter to exit.')
		try:
			wait_for_enter = raw_input  # Python 2
		except NameError:
			wait_for_enter = input  # Python 3
		wait_for_enter()
Ejemplo n.º 2
0
def interval(locus_table, interval_table, intervals, loci, boxplot = True):
    """Build a ggplot of phylogenetic informativeness ('pi') per interval.

    The query text comes from get_interval_query() and is run through R's
    dbGetQuery on the R-side connection `con`; the result is stored in the
    R global `data`.  With `boxplot` true the plot is a boxplot per interval
    with jittered points coloured by locus on top; otherwise it is a bar
    chart faceted by locus.  The plot object is returned, not drawn.
    """
    query = get_interval_query(intervals, loci, locus_table, interval_table)
    fetched = robjects.r('''data <- dbGetQuery(con, {})'''.format(query))
    # `interval` is a factor, so its levels are re-declared explicitly;
    # order_intervals() supplies the second argument of R's factor() from
    # the second column of the query result.
    level_order = order_intervals(fetched[1])
    robjects.r('''data$interval <- factor(data$interval, {})'''.format(level_order))
    canvas = ggplot2.ggplot(robjects.r('''data'''))

    if not boxplot:
        # bar chart: one facet per locus, rotated x labels, no legend
        bars = canvas + ggplot2.aes_string(x = 'interval', y = 'pi', fill = 'locus')
        bars = bars + ggplot2.geom_bar()
        bars = bars + ggplot2.facet_wrap(robjects.Formula('~ locus'))
        bars = bars + ggplot2.opts(**{
            'axis.text.x': ggplot2.theme_text(angle = -90, hjust = 0),
            'legend.position': 'none',
        })
        bars = bars + ggplot2.scale_y_continuous('phylogenetic informativeness')
        bars = bars + ggplot2.scale_x_discrete('interval (years ago)')
        return bars

    # boxplot with the individual loci jittered on top
    boxes = canvas + ggplot2.aes_string(x = 'interval', y = 'pi')
    boxes = boxes + ggplot2.geom_boxplot(**{'outlier.size': 0, 'alpha': 0.3})
    boxes = boxes + ggplot2.geom_jitter(
        ggplot2.aes_string(color = 'locus'),
        size = 3,
        alpha = 0.6,
        position = ggplot2.position_jitter(width = 0.25))
    boxes = boxes + ggplot2.scale_y_continuous('phylogenetic informativeness')
    boxes = boxes + ggplot2.scale_x_discrete('interval (years ago)')
    return boxes
Ejemplo n.º 3
0
def plot_coef(feat_mat_dir,
              model_dir,
              expt_names,
              pref,
              outfile=None,
              height=120,
              fsize=12):
    """Box-plot the classification and regression coefficients per feature.

    For every experiment name the model ``<pref><name>_model.pkl`` is read
    from `model_dir` and the feature names from ``<name>_feat_mat.npz`` in
    `feat_mat_dir`; all experiments must share the same feature names.

    Args:
        feat_mat_dir: directory holding the feature-matrix files.
        model_dir: directory holding the pickled models.
        expt_names: iterable of experiment names; must not be empty.
        pref: prefix of the model file names.
        outfile: file to save the plot to; when None the plot is drawn on
            the current R device instead.
        height: height of the saved plot in mm.
        fsize: font size of the axis text (strip text uses ``fsize + 1``).

    Returns:
        The un-melted pandas DataFrame with columns 'feature',
        'Classification' and 'Regression'.

    Raises:
        ValueError: if `expt_names` is empty.
        AssertionError: if the experiments disagree on feature names.
    """
    feat_names = None
    clf_parts = []
    reg_parts = []
    for ex in expt_names:
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        (_, _, tmp_feat_names, _) = read_feat_mat(feat_mat_file)

        if feat_names is None:
            feat_names = tmp_feat_names
        else:
            assert all(a == b for a, b in zip(feat_names, tmp_feat_names)), \
                'feature names of %s differ from the first experiment' % ex
        clf_parts.append(model.clf_coef())
        reg_parts.append(model.reg_coef())

    nexpt = len(clf_parts)
    if nexpt == 0:
        raise ValueError('expt_names must contain at least one experiment')

    # A single experiment is used as-is; several are joined column-wise so
    # that there is one row per coefficient and one column per experiment.
    if nexpt == 1:
        clf_coef = clf_parts[0]
        reg_coef = reg_parts[0]
    else:
        clf_coef = np.concatenate(clf_parts, axis=1)
        reg_coef = np.concatenate(reg_parts, axis=1)

    # The reshape below reads the data row-first, which matches repeating
    # every feature name once per experiment.
    df = pd.DataFrame({
        'feature': np.repeat(feat_names, nexpt),
        'Classification': np.reshape(clf_coef, (clf_coef.size, )),
        'Regression': np.reshape(reg_coef, (reg_coef.size, ))
    })

    df2 = pd.melt(df, id_vars='feature', var_name='fun')
    r_df = com.convert_to_r_dataframe(df2)
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(feature)', y = 'value') + \
        ggplot2.facet_wrap('fun', scales = 'free_y') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('Importance') + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + \
        ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
                         'axis.text.y':ggplot2.element_text(size = fsize),
                         'strip.text.x':ggplot2.element_text(size = fsize + 1)})
    # the plot gets wider with the number of experiments, but never
    # narrower than 80 mm
    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        # ggsave's argument is `units`; `unit` only worked through R's
        # partial argument matching
        ro.r.ggsave(filename=outfile,
                    plot=gp,
                    width=w,
                    height=height,
                    units='mm')
    return df
def plotStats(data,
              outFolder,
              tiles,
              prop="qual",
              prefix="",
              high="yellow",
              low="blue",
              pdf=False,
              detail=True):
    """Save x/y scatter plots of `data` coloured by the column `prop`.

    An overview PNG covering all tiles is written to `outFolder`.  When
    `detail` is true, one PNG per entry of `tiles` (and, when `pdf` is
    true, also a PDF) is written to the "detailPlots" sub-folder of
    `outFolder`.  A non-empty `prefix` is prepended to the file names.
    `high` and `low` are the end colours of the colour gradient.
    """
    def scatter(frame, point_size, title):
        # shared recipe: points coloured by `prop`, one facet per tile
        chart = ggplot.ggplot(frame)
        return chart + ggplot.aes_string(x="x", y="y", col=prop) \
            + ggplot.geom_point(size=point_size) \
            + ggplot.facet_wrap(robjects.Formula("~ tile")) \
            + ggplot.scale_colour_gradient(high=high, low=low) \
            + ggplot.ggtitle(title)

    # overview plot
    overview = scatter(data, 0.1, "Overview %s" % (prop))
    overview_name = ("%s_overview_%s.png" % (prefix, prop)) if prefix \
        else ("overview_%s.png" % (prop))
    overview.save(os.path.join(outFolder, overview_name), scale=2)

    if not detail:
        return

    # detail plots
    detail_dir = os.path.join(outFolder, "detailPlots")
    for tile in tiles:
        # keep only the rows that belong to this tile
        tile_rows = data.rx(data.rx2("tile").ro == tile, True)
        tile_plot = scatter(tile_rows, 1, "%i %s" % (tile, prop))
        png_name = ("%s_%i_%s.png" % (prefix, tile, prop)) if prefix \
            else ("%i_%s.png" % (tile, prop))
        tile_plot.save(os.path.join(detail_dir, png_name), scale=2)
        if pdf:
            # NOTE: unlike the PNG name, no underscore separates the
            # prefix from the tile number here
            pdf_name = "%s%i_%s.pdf" % (prefix, tile, prop)
            tile_plot.save(os.path.join(detail_dir, pdf_name), scale=2)
Ejemplo n.º 5
0
 def plot_dupl_url(self):
     """Plot pages per URL (URL-level duplicates), one facet per crawl.

     Uses the rows of self.histogr whose 'type' is 'url', saves the plot
     as 'crawler/histogr_url_dupl.png' below PLOTDIR and returns it.
     """
     histogram = self.histogr
     url_rows = histogram[histogram['type'].isin(['url'])]
     chart = ggplot2.ggplot(url_rows)
     layers = [
         ggplot2.aes_string(x='count', y='frequency'),
         ggplot2.geom_jitter(),
         ggplot2.facet_wrap('crawl', ncol=5),
         ggplot2.labs(title='Pages per URL (URL-level duplicates)',
                      x='(duplicate) pages per URL',
                      y='log(frequency)'),
         # only the y axis is logarithmic; a log-log plot would also add
         # ggplot2.scale_x_log10()
         ggplot2.scale_y_log10(),
     ]
     for layer in layers:
         chart = chart + layer
     chart.save(os.path.join(PLOTDIR, 'crawler/histogr_url_dupl.png'))
     return chart
 def plot_host_domain_tld(self):
     """Plot URLs per host / domain / TLD as frequency polygons.

     Uses the rows of self.histogr whose 'type' is 'host', 'domain' or
     'tld' and whose 'type_counted' is 'url'; one facet per crawl, both
     axes logarithmic.  The plot is saved as
     'crawler/histogr_host_domain_tld.png' below PLOTDIR and returned.
     """
     histogram = self.histogr
     by_type = histogram[histogram['type'].isin(['host', 'domain', 'tld'])]
     url_counts = by_type[by_type['type_counted'].isin(['url'])]
     target = os.path.join(PLOTDIR, 'crawler/histogr_host_domain_tld.png')
     chart = ggplot2.ggplot(url_counts)
     layers = [
         ggplot2.aes_string(x='count', weight='frequency', color='type'),
         ggplot2.geom_freqpoly(bins=20),
         ggplot2.facet_wrap('crawl', ncol=4),
         # the descriptive text is used as x-axis label, the title is blank
         ggplot2.labs(title='', x='URLs per Host / Domain / TLD',
                      y='Frequency'),
         ggplot2.scale_y_log10(),
         ggplot2.scale_x_log10(),
     ]
     for layer in layers:
         chart = chart + layer
     chart.save(target)
     return chart
Ejemplo n.º 7
0
def multiple_locus_net_informativeness_facet(locus_table, net_pi_table, loci):
    """Build a per-locus faceted scatter plot of net informativeness.

    Selects locus, time and pi by joining `locus_table` and `net_pi_table`
    on id, run through R's dbGetQuery on the R-side connection `con`.
    When the first entry of `loci` is 'all' (case-insensitive) every locus
    is selected; otherwise only the loci listed.

    Returns the ggplot object (not drawn).
    Raises IndexError if `loci` is empty.
    """
    qry = ('SELECT {0}.locus, time, pi FROM {0}, {1} '
           'WHERE {0}.id = {1}.id').format(locus_table, net_pi_table)
    if loci[0].lower() != 'all':
        # Build the IN list by hand: str(tuple(loci)) renders a single
        # locus as ('x',), which is not valid SQL.  Single quotes are
        # doubled for SQL; backslashes and double quotes are escaped
        # because the query is embedded in a double-quoted R string.
        quoted = ", ".join(
            "'{}'".format(str(name).replace("'", "''")) for name in loci)
        locus_filter = "({})".format(quoted)
        locus_filter = locus_filter.replace('\\', '\\\\').replace('"', '\\"')
        qry = '{} and locus in {}'.format(qry, locus_filter)
    frame = robjects.r('''dbGetQuery(con, "{}")'''.format(qry))
    gg_frame = ggplot2.ggplot(frame)
    plot = gg_frame + ggplot2.aes_string(x = 'time', y='pi') + \
        ggplot2.geom_point(ggplot2.aes_string(colour = 'locus'), size = 3, \
        alpha = 0.4) + ggplot2.scale_x_reverse('years ago') + \
        ggplot2.facet_wrap(robjects.Formula('~ locus')) + \
        ggplot2.opts(**{'legend.position' : 'none'}) + \
        ggplot2.scale_y_continuous('phylogenetic informativeness')
    return plot
 def plot_dupl_url(self):
     """Plot pages per URL (URL-level duplicates), one facet per crawl.

     Uses the rows of self.histogr whose 'type' is 'url', saves the plot
     as 'crawler/histogr_url_dupl.png' below PLOTDIR and returns it.
     """
     histogram = self.histogr
     url_rows = histogram[histogram['type'].isin(['url'])]
     chart = ggplot2.ggplot(url_rows)
     layers = [
         ggplot2.aes_string(x='count', y='frequency'),
         ggplot2.geom_jitter(),
         ggplot2.facet_wrap('crawl', ncol=5),
         ggplot2.labs(title='Pages per URL (URL-level duplicates)',
                      x='(duplicate) pages per URL',
                      y='log(frequency)'),
         # only the y axis is logarithmic; a log-log plot would also add
         # ggplot2.scale_x_log10()
         ggplot2.scale_y_log10(),
     ]
     for layer in layers:
         chart = chart + layer
     chart.save(os.path.join(PLOTDIR, 'crawler/histogr_url_dupl.png'))
     return chart
Ejemplo n.º 9
0
 def plot_host_domain_tld(self):
     """Plot URLs per host / domain / TLD as frequency polygons.

     Uses the rows of self.histogr whose 'type' is 'host', 'domain' or
     'tld' and whose 'type_counted' is 'url'; one facet per crawl, both
     axes logarithmic.  The plot is saved as
     'crawler/histogr_host_domain_tld.png' below PLOTDIR and returned.
     """
     histogram = self.histogr
     by_type = histogram[histogram['type'].isin(['host', 'domain', 'tld'])]
     url_counts = by_type[by_type['type_counted'].isin(['url'])]
     target = os.path.join(PLOTDIR,
                           'crawler/histogr_host_domain_tld.png')
     chart = ggplot2.ggplot(url_counts)
     layers = [
         ggplot2.aes_string(x='count', weight='frequency', color='type'),
         ggplot2.geom_freqpoly(bins=20),
         ggplot2.facet_wrap('crawl', ncol=4),
         # the descriptive text is used as x-axis label, the title is blank
         ggplot2.labs(title='', x='URLs per Host / Domain / TLD',
                      y='Frequency'),
         ggplot2.scale_y_log10(),
         ggplot2.scale_x_log10(),
     ]
     for layer in layers:
         chart = chart + layer
     chart.save(target)
     return chart
Ejemplo n.º 10
0
def mem_usage_graph(cfg):
    """Draw memory usage per problem as a bar graph in 'bargraph-memusage.pdf'.

    For every (variation, language, problem) combination of `cfg` the
    first line of the file named by get_mem_output() is parsed as a float.
    The bars are grouped by language and faceted by variation.
    """
    r = robjects.r
    varis = []
    langs = []
    probs = []
    mems = []
    for var in cfg.variations:
        for lang in cfg.languages:
            for prob in cfg.problems:
                with open(get_mem_output(lang, prob, var), 'r') as mem_file:
                    mems.append(float(mem_file.readline()))
                varis.append(pretty_varis[var])
                langs.append(pretty_langs[lang])
                probs.append(prob)

    # memory usage is a simple histogram with all information in one graph.
    r.pdf('bargraph-memusage.pdf', height=pdf_height(), width=pdf_width())
    try:
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Variation': StrVector(varis),
            'Mem': FloatVector(mems)
        })

        gp = ggplot2.ggplot(df)

        # we rotate the x labels to make sure they don't overlap
        pp = gp + \
            ggplot2.opts(**{'axis.text.x': ggplot2.theme_text(angle=90, hjust=1)}) + \
            ggplot2.aes_string(x='Problem', y='Mem', fill='Language') + \
            ggplot2.geom_bar(position='dodge', stat='identity') + \
            ggplot2.facet_wrap('Variation') + \
            ggplot2_options() + \
            ggplot2_colors() + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
            robjects.r('ylab("Memory usage (in bytes)")')

        pp.plot()
    finally:
        # close the PDF device even when building or drawing the plot
        # fails, so no device is left open
        r['dev.off']()
Ejemplo n.º 11
0
def mem_usage_graph (cfg):
  """Draw memory usage per problem as a bar graph in 'bargraph-memusage.pdf'.

  For every (variation, language, problem) combination of `cfg` the first
  line of the file named by get_mem_output() is parsed as a float.  The
  bars are grouped by language and faceted by variation.
  """
  r = robjects.r
  varis = []
  langs = []
  probs = []
  mems = []
  for var in cfg.variations:
    for lang in cfg.languages:
      for prob in cfg.problems:
        with open(get_mem_output(lang, prob, var), 'r') as mem_file:
          mems.append(float(mem_file.readline()))
        varis.append(pretty_varis[var])
        langs.append(pretty_langs[lang])
        probs.append(prob)

  # memory usage is a simple histogram with all information in one graph.
  r.pdf('bargraph-memusage.pdf', height=pdf_height(), width=pdf_width())
  try:
    df = robjects.DataFrame({'Language': StrVector(langs),
                             'Problem': StrVector(probs),
                             'Variation': StrVector(varis),
                             'Mem': FloatVector(mems)
                             })

    gp = ggplot2.ggplot(df)

    # we rotate the x labels to make sure they don't overlap
    pp = gp + \
        ggplot2.opts(**{'axis.text.x': ggplot2.theme_text(angle=90, hjust=1)}) + \
        ggplot2.aes_string(x='Problem', y='Mem', fill='Language') + \
        ggplot2.geom_bar(position='dodge', stat='identity') + \
        ggplot2.facet_wrap('Variation') + \
        ggplot2_options() + \
        ggplot2_colors() + \
        robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
        robjects.r('ylab("Memory usage (in bytes)")')

    pp.plot()
  finally:
    # close the PDF device even when building or drawing the plot fails,
    # so no device is left open
    r['dev.off']()
Ejemplo n.º 12
0
def plot_coef(feat_mat_dir, model_dir, expt_names, pref, outfile = None, height = 120, fsize = 12):
    """Box-plot the classification and regression coefficients per feature.

    For every experiment name the model ``<pref><name>_model.pkl`` is read
    from `model_dir` and the feature names from ``<name>_feat_mat.npz`` in
    `feat_mat_dir`; all experiments must share the same feature names.

    Args:
        feat_mat_dir: directory holding the feature-matrix files.
        model_dir: directory holding the pickled models.
        expt_names: iterable of experiment names; must not be empty.
        pref: prefix of the model file names.
        outfile: file to save the plot to; when None the plot is drawn on
            the current R device instead.
        height: height of the saved plot in mm.
        fsize: font size of the axis text (strip text uses ``fsize + 1``).

    Returns:
        The un-melted pandas DataFrame with columns 'feature',
        'Classification' and 'Regression'.

    Raises:
        ValueError: if `expt_names` is empty.
        AssertionError: if the experiments disagree on feature names.
    """
    feat_names = None
    clf_parts = []
    reg_parts = []
    for ex in expt_names:
        feat_mat_file = os.path.join(feat_mat_dir, ex + '_feat_mat.npz')
        model_file = os.path.join(model_dir, pref + ex + '_model.pkl')
        model = read_model(model_file)
        (_, _, tmp_feat_names, _) = read_feat_mat(feat_mat_file)

        if feat_names is None:
            feat_names = tmp_feat_names
        else:
            assert all(a == b for a, b in zip(feat_names, tmp_feat_names)), \
                'feature names of %s differ from the first experiment' % ex
        clf_parts.append(model.clf_coef())
        reg_parts.append(model.reg_coef())

    nexpt = len(clf_parts)
    if nexpt == 0:
        raise ValueError('expt_names must contain at least one experiment')

    # A single experiment is used as-is; several are joined column-wise so
    # that there is one row per coefficient and one column per experiment.
    if nexpt == 1:
        clf_coef = clf_parts[0]
        reg_coef = reg_parts[0]
    else:
        clf_coef = np.concatenate(clf_parts, axis = 1)
        reg_coef = np.concatenate(reg_parts, axis = 1)

    # The reshape below reads the data row-first, which matches repeating
    # every feature name once per experiment.
    df = pd.DataFrame({'feature':np.repeat(feat_names, nexpt),
                       'Classification':np.reshape(clf_coef, (clf_coef.size,)),
                       'Regression':np.reshape(reg_coef, (reg_coef.size,))})

    df2 = pd.melt(df, id_vars = 'feature', var_name = 'fun')
    r_df = com.convert_to_r_dataframe(df2)
    gp = ggplot2.ggplot(r_df) + ggplot2.aes_string(x = 'factor(feature)', y = 'value') + \
        ggplot2.facet_wrap('fun', scales = 'free_y') + \
        ggplot2.geom_boxplot() + ggplot2.scale_y_continuous('Importance') + \
        ggplot2.scale_x_discrete('') + ggplot2.theme_bw() + \
        ggplot2.theme(**{'axis.text.x':ggplot2.element_text(size = fsize, angle = 65, vjust = 1, hjust = 1),
                         'axis.text.y':ggplot2.element_text(size = fsize),
                         'strip.text.x':ggplot2.element_text(size = fsize + 1)})
    # the plot gets wider with the number of experiments, but never
    # narrower than 80 mm
    w = max(22 * nexpt, 80)
    if outfile is None:
        gp.plot()
    else:
        # ggsave's argument is `units`; `unit` only worked through R's
        # partial argument matching
        ro.r.ggsave(filename = outfile, plot = gp, width = w, height = height, units = 'mm')
    return df
Ejemplo n.º 13
0
def rpy2_plotter(anno, clusters, name):
    """Plot genes distribution in clusters using ggplot2 from R.

    The ten rows of `anno` with the largest "n_ft" select the categories to
    show; `clusters` is restricted to those categories and its
    "n_features" column is drawn as one histogram per category.  The plot
    is written to the PDF file `name`.

    Note: calls pandas2ri.activate(), which stays in effect afterwards.
    """
    pandas2ri.activate()
    grdevices = importr('grDevices')
    rprint = robjects.globalenv.get("print")

    top_anno = anno.sort_values(by="n_ft", ascending=False).head(n=10)
    category = top_anno["category"].tolist()
    clusters = clusters[clusters["category"].isin(category)]
    clusters = pandas2ri.py2ri(clusters)

    pp = ggplot2.ggplot(clusters) \
        + ggplot2.aes_string(x="n_features") \
        + ggplot2.geom_histogram(binwidth=1) \
        + ggplot2.facet_wrap(robjects.Formula("~category"), ncol=5) \
        + ggplot2.labs(x="Number of Features",
                       y="Number of Clusters",
                       title="Clusters distribution")

    grdevices.pdf(file=name, width=11.692, height=8.267)
    try:
        rprint(pp)
    finally:
        # close the PDF device even when printing the plot fails
        grdevices.dev_off()
Ejemplo n.º 14
0
                           min_samples_leaf=random_search.best_params_['min_samples_leaf'], 
                           min_samples_split=random_search.best_params_['min_samples_split'], 
                           bootstrap=random_search.best_params_['bootstrap'])
rf.fit(X_train, Y_train)

# Predict on the test set once; the predictions are reused for the metrics
# and for the plot data below.
rf_test_pred = rf.predict(X_test)

# Check MSE and R^2 of RF approach. The values are printed because the
# result of a bare expression statement is discarded when run as a script.
print('RF test MSE: %s' % mean_squared_error(Y_test, rf_test_pred))
print('RF test R^2: %s' % r2_score(Y_test, rf_test_pred))

# Prepare the RF output to be plotted.
# .copy() makes the selection an independent frame, so the column
# assignments below do not write to a slice of pickups_14_15.
pickups_2015_actual = pickups_14_15[pickups_14_15['year'] == 2015][['borough', 'nbhd', 'date', 'hour', 'passenger_count']].copy()
pickups_2015_pred = pickups_2015_actual.copy()
# assumes X_test holds exactly the 2015 rows, in the same order -- TODO confirm
pickups_2015_pred['passenger_count'] = rf_test_pred
pickups_2015_actual['type'] = 'actual'
pickups_2015_pred['type'] = 'predicted'
pickups_2015 = pd.concat([pickups_2015_actual, pickups_2015_pred], axis=0)
# total passengers per neighbourhood, day and type (hours summed up)
pickups_2015 = pickups_2015.groupby(['borough', 'nbhd', 'date', 'type'])['passenger_count'].sum().reset_index()
# Pick a random date to look at
pickups_2015 = pickups_2015[pickups_2015['date'] == '2015-05-07']
# attach the border rows of every neighbourhood; rows with missing values
# after the right join are dropped
pickups_2015 = pd.merge(pickups_2015, nbhd_borders, how='right', on=['nbhd']).dropna()
pickups_2015['passenger_count'] = np.log1p(pickups_2015['passenger_count'])

# Plot actual vs. predicted
p6 = ggplot2.ggplot(pandas2ri.py2ri(pickups_2015)) + \
    ggplot2.aes_string(x='lon', y='lat', group='nbhd', fill='passenger_count') + \
    ggplot2.geom_polygon() + \
    ggplot2.facet_wrap(robjects.Formula('~ type')) + \
    ggplot2.scale_fill_gradient(low='yellow', high='red') + \
    ggplot2.theme(legend_position='bottom') + \
    ggplot2.labs(x='', y='', title='Actual vs. Expected Total Passengers on 5-7-2015', fill='Passenger Count\n(Log-Scale)')
p6.save('./plots/actual_pred_pickups.png', width=7, height=5)
Ejemplo n.º 15
0
# Python-side measurements followed by the R-side ones (rpy2 vectors
# concatenate with `+`).
d['time'] = FloatVector(list(times)) + FloatVector(times_r)
d['n_loop']    = IntVector([x[-1] for x in combos]) + IntVector([x[3] for x in combos_r])
# one "<code>:<sequence>" label per row
d['group'] = StrVector([d['code'][x] + ':' + d['sequence'][x] for x in range(len(d['n_loop']))])
dataf = DataFrame(d)



from rpy2.robjects.lib import ggplot2
# The x limits are set on the named scale itself: a separate xlim() would
# add a second x scale that replaces this one and drops its axis title.
p = ggplot2.ggplot(dataf) + \
    ggplot2.geom_line(ggplot2.aes_string(x="n_loop",
                                         y="time",
                                         colour="code")) + \
    ggplot2.geom_point(ggplot2.aes_string(x="n_loop",
                                          y="time",
                                          colour="code")) + \
    ggplot2.facet_wrap(Formula('~sequence')) + \
    ggplot2.scale_y_continuous('running time') + \
    ggplot2.scale_x_continuous('repeated n times',
                               limits=FloatVector([0, max(n_loops)])) + \
    ggplot2.labs(title = "Benchmark (running time)")


from rpy2.robjects.packages import importr
grdevices = importr('grDevices')
grdevices.png('../../_static/benchmark_sum.png',
              width = 712, height = 512)
try:
    p.plot()
finally:
    # close the PNG device even when drawing fails
    grdevices.dev_off()

#base = importr("base")
stats = importr('stats')
Ejemplo n.º 16
0
def as_dataframe (cfg, results, basis):
  """
  Collect the benchmark measurements into an R data frame and plot speedups.

  One row is produced per (variation, language, problem, thread count);
  an extra 'ideal' language and 'ideal' problem get rows whose speedup
  equals the thread count.  The data frame is saved to "performance.Rda"
  and the speedups of the 'expertpar' variation are plotted, one facet
  per problem, to "speedup-expertpar-all.pdf".

  cfg     -- configuration; `variations`, `languages`, `problems` and
             `threads` are read.
  results -- timings, indexed as results[thread][prob][var][lang][0].
  basis   -- baseline for the speedup: 'seq' uses the sequential variation,
             'p1' the same variation with one thread, 'fastest' whichever
             of the two has the smaller mean; any other value behaves like
             'seq'.

  Returns None.
  """
  r = robjects.r
  varis = []
  langs = []
  probs = []
  times = []
  threads = []

  # speedups, with upper and lower bounds below
  speedups = []
  speedup_lowers = []
  speedup_uppers = []

  ses = []   # half-width of the confidence interval of the mean time
  mems = []  # memory usage

  langs_ideal = list(cfg.languages)
  langs_ideal.append('ideal')

  probs_ideal = list(cfg.problems)
  probs_ideal.append('ideal')

  for var in cfg.variations:
    is_seq = 'seq' in var
    for lang in langs_ideal:
      for prob in probs_ideal:
        for thread in cfg.threads:

          if lang == 'ideal' and prob == 'ideal':
            continue

          varis.append(var)
          langs.append(pretty_langs[lang])
          probs.append(prob)
          threads.append(thread)

          if lang == 'ideal' or prob == 'ideal':
            # ideal rows: the speedup is the thread count, nothing measured
            speedups.append(thread)
            speedup_lowers.append(thread)
            speedup_uppers.append(thread)
            times.append(0)
            ses.append(0)
            mems.append(0)
            continue

          # sequential variations are looked up under the last thread count
          source_thread = cfg.threads[-1] if is_seq else thread
          vals = FloatVector(results[source_thread][prob][var][lang][0])
          mean_time = mean(vals)
          times.append(mean_time)

          #
          # time confidence interval
          #
          # The argument name must be exactly 'conf.level'; a misspelt name
          # is swallowed by t.test's `...` and the default level is used.
          t_result = r['t.test'](vals,
                                 **{'conf.level': 0.999}).rx('conf.int')[0]
          ses.append((t_result[1] - t_result[0]) / 2)

          #
          # memory usage
          #
          mem_filename = get_mem_output(lang, prob, var)
          with open(mem_filename, 'r') as mem_file:
            mems.append(float(mem_file.readline()))

          # we include dummy data for the sequential case to avoid the
          # speedup calculation below
          if is_seq:
            speedups.append(1)
            speedup_lowers.append(1)
            speedup_uppers.append(1)
            continue

          #
          # speedup values and confidence intervals
          #
          seq_vals = results[cfg.threads[-1]][prob][var.replace('par', 'seq')][lang][0]

          # sequential base
          base = FloatVector(seq_vals)
          # base with p = 1
          base_p1 = FloatVector(results[1][prob][var][lang][0])
          # 'p1' always uses the one-thread run, 'fastest' only when it
          # beats the sequential program; everything else keeps `base`
          if basis == 'p1' or (basis == 'fastest' and mean(base_p1) < mean(base)):
            base = base_p1

          labels = ['Base'] * r.length(base)[0] + ['N'] * r.length(vals)[0]
          ratio_df = DataFrame({'Times': base + vals,
                                'Type': StrVector(labels)})
          ratio_test = r['pairwiseCI'](r('Times ~ Type'), data=ratio_df,
                                       control='N',
                                       method='Param.ratio',
                                       **{'var.equal': False})[0][0]

          speedups.append(mean(base) / mean_time)
          speedup_lowers.append(ratio_test[1][0])
          speedup_uppers.append(ratio_test[2][0])

  df = robjects.DataFrame({'Language': StrVector(langs),
                           'Problem': StrVector(probs),
                           'Variation': StrVector(varis),
                           'Threads': IntVector(threads),

                           'Time': FloatVector(times),
                           'SE': FloatVector(ses),

                           'Speedup': FloatVector(speedups),
                           'SpeedupLower': FloatVector(speedup_lowers),
                           'SpeedupUpper': FloatVector(speedup_uppers),

                           'Mem': FloatVector(mems)
                           })

  r.assign('df', df)

  r('save (df, file="performance.Rda")')

  # reshape the data to make variation not a column itself, but a part of
  # the other columns describe ie, time, speedup, etc.
  #
  # also, remove the 'ideal' problem as we don't want it in this plot.
  df = r('''
redf = reshape (df, 
                timevar="Variation", 
                idvar = c("Language","Problem","Threads"), 
                direction="wide")
redf$Problem <- factor(redf$Problem, levels = c("randmat","thresh","winnow","outer","product","chain"))
redf[which(redf$Problem != "ideal"),]
''')

  r.pdf('speedup-expertpar-all.pdf',
        height=6.5, width=10)
  try:
    change_name = 'Language'

    gg = ggplot2.ggplot(df)

    limits = ggplot2.aes(ymax = 'SpeedupUpper.expertpar', ymin = 'SpeedupLower.expertpar')

    pp = gg + \
        ggplot2.geom_line() + ggplot2.geom_point(size=2.5) +\
        robjects.r('scale_color_manual(values = c("#ffcb7e", "#1da06b", "#b94646", "#00368a", "#CCCCCC"))') +\
        ggplot2.aes_string(x='Threads', y='Speedup.expertpar',
                           group=change_name, color=change_name,
                           shape=change_name) + \
        ggplot2.geom_errorbar(limits, width=0.25) + \
        ggplot2.opts(**{'axis.title.x' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 10, vjust=-0.2),
                        'axis.title.y' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 10, angle=90, vjust=0.2),
                        'axis.text.x' : ggplot2.theme_text(family = 'serif', size = 10),
                        'axis.text.y' : ggplot2.theme_text(family = 'serif', size = 10),
                        'legend.title' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 10),
                        'legend.text' : ggplot2.theme_text(family = 'serif', size = 10),
                        'strip.text.x' : ggplot2.theme_text(family = 'serif', size = 10),
                        'aspect.ratio' : 1,
                        }) + \
        robjects.r('ylab("Speedup")') + \
        robjects.r('xlab("Number of cores")') + \
        ggplot2.facet_wrap('Problem', nrow = 2)

    pp.plot()
  finally:
    # close the PDF device even when building or drawing the plot fails
    r['dev.off']()
Ejemplo n.º 17
0
def plot_qc_reads(qc_df):
    """
    Plot number of reads part of a pipeline QC file.

    `qc_df` needs the columns "sample", "num_reads", "num_mapped",
    "num_unique_mapped" and "num_junctions"; missing values count as 0.
    The counts are drawn per sample on a log10 axis, one facet per count
    type, on the current R device.

    Side effects: sets the default ggplot2 theme to theme_bw(12) and the
    R option scipen to 4; neither is restored.  Returns None.
    """
    count_cols = ["num_reads",
                  "num_mapped",
                  "num_unique_mapped",
                  "num_junctions"]
    # Record NA values as 0
    qc_df = qc_df.fillna(0)
    qc_df = qc_df[["sample"] + count_cols]
    melted_qc = pandas.melt(qc_df, id_vars=["sample"])
    qc_r = conversion_pydataframe(melted_qc)
    # turn 'variable' into a factor whose levels follow count_cols
    labels = robj.StrVector(tuple(count_cols))
    variable_i = qc_r.names.index('variable')
    qc_r[variable_i] = robj.FactorVector(qc_r[variable_i],
                                         levels = labels)
    ggplot2.theme_set(ggplot2.theme_bw(12))
    scales = importr("scales")
    r.options(scipen=4)
    p = ggplot2.ggplot(qc_r) + \
        ggplot2.geom_point(aes_string(x="sample", y="value")) + \
        ggplot2.scale_y_continuous(trans=scales.log10_trans(),
                                   breaks=scales.trans_breaks("log10",
                                                              robj.r('function(x) 10^x')),
                                   labels=scales.trans_format("log10",
                                                              robj.r('math_format(10^.x)'))) + \
        r.xlab("CLIP-Seq samples") + \
        r.ylab("No. reads") + \
        ggplot2.coord_flip() + \
        ggplot2.facet_wrap(Formula("~ variable"), ncol=1) + \
        theme(**{"panel.grid.major.x": element_blank(),
                 "panel.grid.minor.x": element_blank(),
                 "panel.grid.major.y": theme_line(size=0.5,colour="grey66",linetype=3)})
    p.plot()
    # The base-graphics dotchart code that used to follow an unconditional
    # `return` here could never run and has been removed.
Ejemplo n.º 18
0
def plot_squiggle(args, filename, start_times, mean_signals):
    """
    Use rpy2 to create a squiggle plot of the read.

    The mean signal of every event is drawn as a step plot against its
    start time (relative to the first event), split into stacked facets.

    Arguments:
    args -- options object; ``num_facets``, ``theme_bw`` and ``saveas``
        are read from it.
    filename -- read name; shown in the plot title and, when saving, its
        basename plus "." plus ``args.saveas`` is the output file name.
    start_times -- non-empty sequence of event start times.
    mean_signals -- mean signal per event, parallel to ``start_times``.

    Raises Exception if the plot file to be written already exists.
    """
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]
    total_time = start_times[-1] - t_0
    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([t - t_0 for t in start_times])
    r_mean_signals = robjects.FloatVector(mean_signals)

    # infer the appropriate number of events given the number of facets.
    # Floor division keeps the facet ids integral on Python 3 as well; with
    # true division every event would end up in a facet of its own.
    num_events = len(r_mean_signals)
    events_per_facet = (num_events // args.num_facets) + 1
    # dummy variable to control faceting
    facet_category = robjects.FloatVector([(i // events_per_facet) + 1
                                           for i in range(len(start_times))])

    # make a data frame of the start times and mean signals
    d = {'start': r_start_times, 'mean': r_mean_signals, 'cat': facet_category}
    df = robjects.DataFrame(d)

    title = 'Squiggle plot for read: ' + filename \
        + "\nTotal time (sec): " + str(total_time)
    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='start', y='mean') \
        + ggplot2.geom_step(size=0.25) \
        + ggplot2.facet_wrap(robjects.Formula('~cat'), ncol=1, scales="free_x") \
        + ggplot2.scale_x_continuous('Time (seconds)') \
        + ggplot2.scale_y_continuous('Mean signal (picoamps)') \
        + ggplot2.ggtitle(title)
    if args.theme_bw:
        # theme_bw() is a complete theme: adding it replaces every earlier
        # theme() setting, so it has to come before the title-size tweak.
        pp = pp + ggplot2.theme_bw()
    pp = pp + ggplot2.theme(**{'plot.title': ggplot2.element_text(size=11)})

    if args.saveas is not None:
        plot_file = os.path.basename(filename) + "." + args.saveas
        if os.path.isfile(plot_file):
            raise Exception(
                'Cannot create plot for %s: plot file %s already exists' %
                (filename, plot_file))
        if args.saveas == "pdf":
            grdevices.pdf(plot_file, width=8.5, height=11)
        elif args.saveas == "png":
            grdevices.png(plot_file, width=8.5, height=11, units="in", res=300)
        try:
            pp.plot()
        finally:
            # close the device even when drawing fails so the file is
            # flushed and no device is left open
            grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        try:
            wait_for_enter = raw_input  # Python 2
        except NameError:
            wait_for_enter = input  # Python 3
        wait_for_enter()
Ejemplo n.º 19
0
def as_dataframe(cfg, results, basis):
    """Collect benchmark statistics into an R data frame and plot speedups.

    One row is produced for every (variation, language, problem, thread)
    combination. Each row holds the mean running time, the half-width of a
    t-test confidence interval on that time, the memory figure read from the
    file named by ``get_mem_output``, and the speedup over a baseline
    together with its lower and upper confidence bounds. Additional rows
    where either the language or the problem is ``'ideal'`` carry a speedup
    equal to the thread count and zeros for time, interval and memory.

    Side effects: the data frame is bound to the R variable ``df`` and saved
    to ``performance.Rda``; the speedup plot for the ``expertpar`` variation
    is written to ``speedup-expertpar-all.pdf``. Nothing is returned.

    Args:
        cfg: configuration object providing the ``languages``, ``problems``,
            ``variations`` and ``threads`` sequences.
        results: nested container indexed as
            ``results[thread][problem][variation][language][0]``, yielding
            the measured times for that combination.
        basis: selects the speedup baseline. ``'p1'`` uses the ``results[1]``
            run of the same variation, ``'fastest'`` uses that run only when
            its mean is lower than the sequential one, and ``'seq'`` (or any
            other value) uses the sequential variation.
    """
    r = robjects.r

    # One Python list per output column; every row appends to all of them so
    # the columns stay aligned.
    varis = []
    langs = []
    probs = []
    times = []
    threads = []

    # speedups, with upper and lower bounds below
    speedups = []
    speedup_lowers = []
    speedup_uppers = []

    ses = []  # half-widths of the time confidence intervals
    mems = []  # memory usage

    langs_ideal = list(cfg.languages) + ['ideal']
    probs_ideal = list(cfg.problems) + ['ideal']

    for var in cfg.variations:
        is_sequential = 'seq' in var

        for lang in langs_ideal:
            for prob in probs_ideal:
                for thread in cfg.threads:
                    if lang == 'ideal' and prob == 'ideal':
                        continue

                    # Row key, shared by the ideal and the measured rows.
                    varis.append(var)
                    langs.append(pretty_langs[lang])
                    probs.append(prob)
                    threads.append(thread)

                    if lang == 'ideal' or prob == 'ideal':
                        # Reference row: speedup equals the thread count.
                        speedups.append(thread)
                        speedup_lowers.append(thread)
                        speedup_uppers.append(thread)
                        times.append(0)
                        ses.append(0)
                        mems.append(0)
                        continue

                    # Sequential variations are always read from the entry
                    # of the last configured thread count, while the row
                    # itself keeps the current thread count.
                    lookup_thread = cfg.threads[-1] if is_sequential else thread

                    vals = FloatVector(results[lookup_thread][prob][var][lang][0])
                    mean_time = mean(vals)
                    times.append(mean_time)

                    #
                    # time confidence interval
                    #
                    # The R argument name must be exactly 'conf.level'; a
                    # misspelled name is swallowed by t.test's `...` and the
                    # default 0.95 level would be used instead.
                    conf_int = r['t.test'](vals, **{
                        'conf.level': 0.999
                    }).rx('conf.int')[0]
                    ses.append((conf_int[1] - conf_int[0]) / 2)

                    #
                    # memory usage
                    #
                    mem_filename = get_mem_output(lang, prob, var)
                    with open(mem_filename, 'r') as mem_file:
                        mems.append(float(mem_file.readline()))

                    # we include dummy data for the sequential case to avoid
                    # the speedup calculation below
                    if is_sequential:
                        speedups.append(1)
                        speedup_lowers.append(1)
                        speedup_uppers.append(1)
                        continue

                    #
                    # speedup values and confidence intervals
                    #
                    seq_vals = results[cfg.threads[-1]][prob][var.replace(
                        'par', 'seq')][lang][0]

                    # sequential base
                    base = FloatVector(seq_vals)
                    # base with p = 1
                    base_p1 = FloatVector(results[1][prob][var][lang][0])
                    # 'p1' always uses the p = 1 run; 'fastest' uses it only
                    # when it beats the sequential program.
                    if basis == 'p1' or (basis == 'fastest'
                                         and mean(base_p1) < mean(base)):
                        base = base_p1

                    labels = (['Base'] * r.length(base)[0] +
                              ['N'] * r.length(vals)[0])
                    ratio_df = DataFrame({
                        'Times': base + vals,  # rpy2 `+` concatenates vectors
                        'Type': StrVector(labels)
                    })
                    ratio_test = r['pairwiseCI'](r('Times ~ Type'),
                                                 data=ratio_df,
                                                 control='N',
                                                 method='Param.ratio',
                                                 **{
                                                     'var.equal': False
                                                 })[0][0]

                    speedups.append(mean(base) / mean_time)
                    speedup_lowers.append(ratio_test[1][0])
                    speedup_uppers.append(ratio_test[2][0])

    df = robjects.DataFrame({
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Threads': IntVector(threads),
        'Time': FloatVector(times),
        'SE': FloatVector(ses),
        'Speedup': FloatVector(speedups),
        'SpeedupLower': FloatVector(speedup_lowers),
        'SpeedupUpper': FloatVector(speedup_uppers),
        'Mem': FloatVector(mems)
    })

    r.assign('df', df)

    r('save (df, file="performance.Rda")')

    # reshape the data to make variation not a column itself, but a part of
    # the other columns describe ie, time, speedup, etc.
    #
    # also, remove the 'ideal' problem as we don't want it in this plot.
    df = r('''
redf = reshape (df, 
                timevar="Variation", 
                idvar = c("Language","Problem","Threads"), 
                direction="wide")
redf$Problem <- factor(redf$Problem, levels = c("randmat","thresh","winnow","outer","product","chain"))
redf[which(redf$Problem != "ideal"),]
''')

    r.pdf('speedup-expertpar-all.pdf', height=6.5, width=10)
    # Close the PDF device even when building or rendering the plot fails,
    # so a failure does not leave an open graphics device behind.
    try:
        change_name = 'Language'

        gg = ggplot2.ggplot(df)

        limits = ggplot2.aes(ymax='SpeedupUpper.expertpar',
                             ymin='SpeedupLower.expertpar')

        serif_10 = {'family': 'serif', 'size': 10}
        serif_10_bold = {'family': 'serif', 'face': 'bold', 'size': 10}

        pp = gg + \
            ggplot2.geom_line() + ggplot2.geom_point(size=2.5) + \
            robjects.r('scale_color_manual(values = c("#ffcb7e", "#1da06b", "#b94646", "#00368a", "#CCCCCC"))') + \
            ggplot2.aes_string(x='Threads', y='Speedup.expertpar',
                               group=change_name, color=change_name,
                               shape=change_name) + \
            ggplot2.geom_errorbar(limits, width=0.25) + \
            ggplot2.opts(**{
                'axis.title.x': ggplot2.theme_text(vjust=-0.2, **serif_10_bold),
                'axis.title.y': ggplot2.theme_text(angle=90, vjust=0.2, **serif_10_bold),
                'axis.text.x': ggplot2.theme_text(**serif_10),
                'axis.text.y': ggplot2.theme_text(**serif_10),
                'legend.title': ggplot2.theme_text(**serif_10_bold),
                'legend.text': ggplot2.theme_text(**serif_10),
                'strip.text.x': ggplot2.theme_text(**serif_10),
                'aspect.ratio': 1,
            }) + \
            robjects.r('ylab("Speedup")') + \
            robjects.r('xlab("Number of cores")') + \
            ggplot2.facet_wrap('Problem', nrow=2)

        pp.plot()
    finally:
        r['dev.off']()
Ejemplo n.º 20
0
 def test_vars(self):
     """Faceting with a ``vars()`` specification still yields a GGPlot."""
     # Build the plot one layer at a time, in the same order as a single
     # chained expression would evaluate them.
     plot = ggplot2.ggplot(mtcars)
     plot = plot + ggplot2.aes(x='wt', y='mpg')
     plot = plot + ggplot2.geom_point()
     plot = plot + ggplot2.facet_wrap(ggplot2.vars('gears'))
     assert isinstance(plot, ggplot2.GGPlot)
Ejemplo n.º 21
0
    [x[0] for x in combos_r])
d['n_loop'] = IntVector([x[-1] for x in combos]) + IntVector(
    [x[1] for x in combos_r])
d['group'] = StrVector(
    [d['code'][x] + ':' + d['sequence'][x] for x in xrange(len(d['n_loop']))])
dataf = DataFrame(d)

from rpy2.robjects.lib import ggplot2
p = ggplot2.ggplot(dataf) + \
    ggplot2.geom_line(ggplot2.aes_string(x="n_loop",
                                         y="time",
                                         colour="code")) + \
    ggplot2.geom_point(ggplot2.aes_string(x="n_loop",
                                          y="time",
                                          colour="code")) + \
    ggplot2.facet_wrap(Formula('~sequence')) + \
    ggplot2.scale_y_continuous('running time') + \
    ggplot2.scale_x_continuous('repeated n times', ) + \
    ggplot2.xlim(0, max(n_loops)) + \
    ggplot2.opts(title = "Benchmark (running time)")

from rpy2.robjects.packages import importr
grdevices = importr('grDevices')
grdevices.png('../../_static/benchmark_sum.png', width=712, height=512)
p.plot()
grdevices.dev_off()

#base = importr("base")
stats = importr('stats')
nlme = importr("nlme")
fit = nlme.lmList(Formula('time ~ n_loop | group'),
Ejemplo n.º 22
0
 def test_as_labeller(self, labeller):
     """A facet labeller built with ``as_labeller`` yields a GGPlot.

     ``labeller`` may be a Python dict, in which case it is first converted
     with ``ggplot2.dict2rvec``; any other value is passed through as is.
     """
     r_labeller = (ggplot2.dict2rvec(labeller)
                   if isinstance(labeller, dict) else labeller)
     base_plot = ggplot2.ggplot(mtcars)
     facets = ggplot2.facet_wrap(
         rl('~am'), labeller=ggplot2.as_labeller(r_labeller))
     plot = base_plot + facets
     assert isinstance(plot, ggplot2.GGPlot)
Ejemplo n.º 23
0
    iris_py = pandas.read_csv("/home/yarden/iris.csv")
    iris_py = iris_py.rename(columns={"Name": "Species"})
    corrs = []
    from scipy.stats import spearmanr
    for species in set(iris_py.Species):
        entries = iris_py[iris_py["Species"] == species]
        c = spearmanr(entries["SepalLength"], entries["SepalWidth"])
        print "c: ", c

    # compute r.cor(x, y) and divide up by Species
    # Assume we get a vector of length Species saying what the
    # correlation is for each Species' Petal Length/Width
    p = ggplot2.ggplot(iris) + \
        ggplot2.geom_point(ggplot2.aes_string(x="Sepal.Length", y="Sepal.Width")) + \
        ggplot2.facet_wrap(Formula("~Species")) 
    p.plot()
    r["dev.off"]()    

    sys.exit(1)
    grdevices = importr('grDevices')
    ggplot2.theme_set(ggplot2.theme_bw(12))

    p = ggplot2.ggplot(iris) + \
        ggplot2.geom_point(ggplot2.aes_string(x="Sepal.Length", y="Sepal.Width")) + \
        ggplot2.facet_wrap(Formula('~ Species'), ncol=2, nrow = 2) + \
        ggplot2.geom_text(aes_string(x="Sepal.Length", y="Sepal.Width"), label="t") + \
        ggplot2.GBaseObject(r('ggplot2::coord_fixed')()) # aspect ratio
    p.plot()