def rank_abundance_plot(counter, name):
    """Write a rank-abundance curve for *counter* as a PNG under analytics_out/.

    counter -- abundance data consumed by rank_abundance_data()
    name    -- basename used to build the output file name
    """
    grdevices.png('analytics_out/{0}_rank_abundance.png'.format(name))
    rank_values, fraction_values = rank_abundance_data(counter)
    frame = robjects.DataFrame({'rank': rank_values, 'f': fraction_values})
    # Assemble the plot layer by layer; fractions are shown on a log scale.
    plot = ggplot.ggplot(frame)
    for layer in (ggplot.aes_string(x='rank', y='f'),
                  ggplot.geom_point(),
                  ggplot.scale_y_log10(name='fraction of hits')):
        plot += layer
    plot.plot()
    grdevices.dev_off()
def plot_host_domain_tld(self):
    """Plot URL counts per host / domain / TLD as faceted frequency polygons
    on log-log axes, save the figure under PLOTDIR, and return the plot."""
    # -- pages/URLs per host / domain / tld
    selected = self.histogr
    selected = selected[selected['type'].isin(['host', 'domain', 'tld'])]
    selected = selected[selected['type_counted'].isin(['url'])]
    img_path = os.path.join(PLOTDIR, 'crawler/histogr_host_domain_tld.png')
    # data.to_csv(img_path + '.csv')
    title = 'URLs per Host / Domain / TLD'
    plot = ggplot2.ggplot(selected)
    plot += ggplot2.aes_string(x='count', weight='frequency', color='type')
    plot += ggplot2.geom_freqpoly(bins=20)
    plot += ggplot2.facet_wrap('crawl', ncol=4)
    plot += ggplot2.labs(title='', x=title, y='Frequency')
    plot += ggplot2.scale_y_log10()
    plot += ggplot2.scale_x_log10()
    plot.save(img_path)
    return plot
def plot_dupl_url(self):
    """Plot pages per URL (URL-level duplicates) as a jittered scatter with a
    log-scaled frequency axis, save it under PLOTDIR, and return the plot."""
    # -- pages per URL (URL-level duplicates)
    subset = self.histogr
    subset = subset[subset['type'].isin(['url'])]
    title = 'Pages per URL (URL-level duplicates)'
    layers = [
        ggplot2.aes_string(x='count', y='frequency'),
        ggplot2.geom_jitter(),
        ggplot2.facet_wrap('crawl', ncol=5),
        ggplot2.labs(title=title, x='(duplicate) pages per URL',
                     y='log(frequency)'),
        ggplot2.scale_y_log10(),
        # + ggplot2.scale_x_log10()  # could use log-log scale
    ]
    plot = ggplot2.ggplot(subset)
    for layer in layers:
        plot += layer
    img_path = os.path.join(PLOTDIR, 'crawler/histogr_url_dupl.png')
    plot.save(img_path)
    # data.to_csv(img_path + '.csv')
    return plot
def scatter(self, dataframe, filename, parm1, parm2, units1, units2, group, logx, logy):
    """Render a scatter plot of *parm2* vs. *parm1* to a 512x512 PNG.

    dataframe      -- R data frame with columns named by parm1/parm2/group
    filename       -- output PNG path
    parm1, parm2   -- column names plotted on the x and y axes
    units1, units2 -- unit strings appended to the axis labels
    group          -- column name used to colour the points
    logx, logy     -- truthy to apply a log10 scale on that axis
    """
    grdevices.png(file=filename, width=512, height=512)
    gg = (ggplot2.ggplot(dataframe)
          + ggplot2.aes_string(x=parm1, y=parm2, colour=group)
          + ggplot2.geom_point(alpha=0.7)
          + ggplot2.labs(x=parm1 + " " + units1, y=parm2 + " " + units2))
    # Truthiness replaces the `== True` comparisons, and the two independent
    # ifs replace the 4-way if/elif chain; scale objects are only built when
    # actually requested.
    if logx:
        gg += ggplot2.scale_x_log10()
    if logy:
        gg += ggplot2.scale_y_log10()
    gg.plot()
    grdevices.dev_off()
def scatter(self, dataframe, filename, parm1, parm2, units1, units2, group, logx, logy):
    """Render a scatter plot of *parm2* vs. *parm1* to a 512x512 PNG.

    dataframe      -- R data frame with columns named by parm1/parm2/group
    filename       -- output PNG path
    parm1, parm2   -- column names plotted on the x and y axes
    units1, units2 -- unit strings appended to the axis labels
    group          -- column name used to colour the points
    logx, logy     -- truthy to apply a log10 scale on that axis
    """
    grdevices.png(file=filename, width=512, height=512)
    gg = (ggplot2.ggplot(dataframe)
          + ggplot2.aes_string(x=parm1, y=parm2, colour=group)
          + ggplot2.geom_point(alpha=0.7)
          + ggplot2.labs(x=parm1 + " " + units1, y=parm2 + " " + units2))
    # Truthiness replaces the `== True` comparisons, and the two independent
    # ifs replace the 4-way if/elif chain; scale objects are only built when
    # actually requested.
    if logx:
        gg += ggplot2.scale_x_log10()
    if logy:
        gg += ggplot2.scale_y_log10()
    gg.plot()
    grdevices.dev_off()
def plot_domain_cumul(self, crawl):
    """Plot cumulative URL coverage over domains for one crawl.

    Filters self.histogr to domain/url rows of *crawl*, derives cumulative
    and percentage columns (printing the table), saves a log-log line plot
    under PLOTDIR, and returns the ggplot2 plot object.
    """
    # -- coverage (cumulative pages) per domain
    data = self.histogr
    data = data[data['type'].isin(['domain'])]
    data = data[data['crawl'] == crawl]
    data = data[data['type_counted'].isin(['url'])]
    # Work on a copy: the chained filters above may yield views, and
    # assigning new columns to a view triggers SettingWithCopyWarning.
    data = data.copy()
    data['urls'] = data['count'] * data['frequency']
    print(data)
    data = data[['urls', 'count', 'frequency']]
    # Largest domains first (ascending=False instead of the legacy int 0).
    data = data.sort_values(['count'], ascending=False)
    data['cum_domains'] = data['frequency'].cumsum()
    data['cum_urls'] = data['urls'].cumsum()
    data_perc = data.apply(lambda x: round(100.0 * x / float(x.sum()), 1))
    data['%domains'] = data_perc['frequency']
    data['%urls'] = data_perc['urls']
    # Hoist the grand totals out of the per-row lambdas (they were being
    # recomputed for every element).
    total_domains = float(data['frequency'].sum())
    total_urls = float(data['urls'].sum())
    data['%cum_domains'] = data['cum_domains'].apply(
        lambda x: round(100.0 * x / total_domains, 1))
    data['%cum_urls'] = data['cum_urls'].apply(
        lambda x: round(100.0 * x / total_urls, 1))
    with pandas.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.width', 200):
        print(data)
    img_path = os.path.join(PLOTDIR, 'crawler/histogr_domain_cumul.png')
    # data.to_csv(img_path + '.csv')
    title = 'Cumulative URLs for Top Domains'
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='cum_domains', y='cum_urls') \
        + ggplot2.geom_line() + ggplot2.geom_point() \
        + GGPLOT2_THEME \
        + ggplot2.labs(title=title, x='domains cumulative',
                       y='URLs cumulative') \
        + ggplot2.scale_y_log10() \
        + ggplot2.scale_x_log10()
    p.save(img_path)
    return p
def plot_domain_cumul(self, crawl):
    """Plot cumulative URL coverage over domains for one crawl.

    Filters self.histogr to domain/url rows of *crawl*, derives cumulative
    and percentage columns (printing the table), saves a log-log line plot
    under PLOTDIR, and returns the ggplot2 plot object.
    """
    # -- coverage (cumulative pages) per domain
    data = self.histogr
    data = data[data['type'].isin(['domain'])]
    data = data[data['crawl'] == crawl]
    data = data[data['type_counted'].isin(['url'])]
    # Work on a copy: the chained filters above may yield views, and
    # assigning new columns to a view triggers SettingWithCopyWarning.
    data = data.copy()
    data['urls'] = data['count'] * data['frequency']
    print(data)
    data = data[['urls', 'count', 'frequency']]
    # Largest domains first (ascending=False instead of the legacy int 0).
    data = data.sort_values(['count'], ascending=False)
    data['cum_domains'] = data['frequency'].cumsum()
    data['cum_urls'] = data['urls'].cumsum()
    data_perc = data.apply(lambda x: round(100.0 * x / float(x.sum()), 1))
    data['%domains'] = data_perc['frequency']
    data['%urls'] = data_perc['urls']
    # Hoist the grand totals out of the per-row lambdas (they were being
    # recomputed for every element).
    total_domains = float(data['frequency'].sum())
    total_urls = float(data['urls'].sum())
    data['%cum_domains'] = data['cum_domains'].apply(
        lambda x: round(100.0 * x / total_domains, 1))
    data['%cum_urls'] = data['cum_urls'].apply(
        lambda x: round(100.0 * x / total_urls, 1))
    with pandas.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.width', 200):
        print(data)
    img_path = os.path.join(PLOTDIR, 'crawler/histogr_domain_cumul.png')
    # data.to_csv(img_path + '.csv')
    title = 'Cumulative URLs for Top Domains'
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='cum_domains', y='cum_urls') \
        + ggplot2.geom_line() + ggplot2.geom_point() \
        + GGPLOT2_THEME \
        + ggplot2.labs(title=title, x='domains cumulative',
                       y='URLs cumulative') \
        + ggplot2.scale_y_log10() \
        + ggplot2.scale_x_log10()
    p.save(img_path)
    return p
ggplot2.scale_colour_manual("Color", values=colormap, breaks=colormap.names, labels=[elt[1] for elt in colormap_labels]) + \
    ggplot2.geom_point(size=3) + \
    ggplot2.scale_linetype_manual(values=linemap) + \
    ggplot2.geom_line(size=1.5)
# NOTE(review): the expression above is the tail of a plot built before this
# chunk (its left-hand side, presumably `pp = ... + \`, lies outside this
# view -- confirm against the full file before editing).
# custom y-axis lines: major lines ("breaks") are every 10^n; 9
# minor lines ("minor_breaks") between major lines
if (yscale == 'log'):
    # Major breaks at powers of ten spanning gflops_range; minor breaks are
    # the 1..9 multiples of each decade. Both are built as R vectors via ro.r.
    pp = pp + \
        ggplot2.scale_y_log10(breaks = ro.r("10^(%d:%d)" % (gflops_range[0], gflops_range[1])), minor_breaks = ro.r("rep(10^(%d:%d), each=9) * rep(1:9, %d)" % (gflops_range[0] - 1, gflops_range[1], gflops_range[1] - gflops_range[0])))
#pp.plot(vp = vp)
#-- ggplot2perfcolor-end
# Close the active R graphics device (flushes the plot to file).
grdevices.dev_off()
# grdevices.png('../../_static/graphics_ggplot2coordtranssqrt.png',
#               width = 612, height = 612)
# #-- ggplot2coordtranssqrt-begin
# pp = gp + \
#    ggplot2.aes_string(x='wt', y='mpg') + \
#    ggplot2.scale_y_sqrt() + \
#    ggplot2.geom_point()
ggplot2.scale_colour_manual("Color", values=colormap, breaks=colormap.names, labels=[elt[1] for elt in colormap_labels]) + \
    ggplot2.geom_point(size=3) + \
    ggplot2.scale_linetype_manual(values=linemap) + \
    ggplot2.geom_line(size=1.5)
# NOTE(review): the expression above is the tail of a plot built before this
# chunk (its left-hand side, presumably `pp = ... + \`, lies outside this
# view -- confirm against the full file before editing).
# custom y-axis lines: major lines ("breaks") are every 10^n; 9
# minor lines ("minor_breaks") between major lines
if (yscale == 'log'):
    # Major breaks at powers of ten spanning gflops_range; minor breaks are
    # the 1..9 multiples of each decade. Both are built as R vectors via ro.r.
    pp = pp + \
        ggplot2.scale_y_log10(breaks = ro.r("10^(%d:%d)" % (gflops_range[0], gflops_range[1])), minor_breaks = ro.r("rep(10^(%d:%d), each=9) * rep(1:9, %d)" % (gflops_range[0] - 1, gflops_range[1], gflops_range[1] - gflops_range[0])))
# Render into the viewport `vp` (defined earlier in the file -- confirm).
pp.plot(vp = vp)
#-- ggplot2perfcolor-end
# Close the active R graphics device (flushes the plot to file).
grdevices.dev_off()
# grdevices.png('../../_static/graphics_ggplot2coordtranssqrt.png',
#               width = 612, height = 612)
# #-- ggplot2coordtranssqrt-begin
# pp = gp + \
#    ggplot2.aes_string(x='wt', y='mpg') + \
#    ggplot2.scale_y_sqrt() + \
# Build an R plotmath expression for the x-axis label (area in km^2).
x_lab = r("expression(Area (km^{2}))")
# In-plot text annotations composed as raw R `annotate()` calls.
# Red labels (near max(areas)-30) describe the mean-annual regression;
# blue labels (near max(areas)-150) describe the LGM regression.
# r_sq_lab / sl / r_sq_lab_lgm / sl_lgm are label strings defined earlier
# in the file (outside this view); parse=TRUE lets R typeset them.
annotate1 = r('annotate("text", x = '+str(max(areas)-30)+', y = 0.5, color = "red", label = "Mean Annual", parse=FALSE)')
annotate2 = r('annotate("text", x = '+str(max(areas)-30)+', y = 0.42, label = "'+r_sq_lab+'", color = "red", parse=TRUE)')
annotate3 = r('annotate("text", x = '+str(max(areas)-30)+', y = 0.34, label = "slope~'+sl+'", color = "red", parse=TRUE)')
annotate4 = r('annotate("text", x = '+str(max(areas)-150)+', y = 0.7, color = "blue", label = "LGM", parse=FALSE)')
annotate5 = r('annotate("text", x = '+str(max(areas)-150)+', y = 0.6, color = "blue", label = "'+r_sq_lab_lgm+'", parse=TRUE)')
annotate6 = r('annotate("text", x = '+str(max(areas)-150)+', y = 0.5, color = "blue", label = "slope~'+sl_lgm+'", parse=TRUE)')
# Log-log scatter of discharge vs. area with lm fits for two data sets:
# dat_frame (blue points, min/max error bars) and dat_frame2 (red points).
# NOTE(review): y_lab, dat_frame and dat_frame2 are assumed defined earlier
# in the file -- confirm.
pp = ggplot2.ggplot(dat_frame) + \
    ggplot2.aes_string(y='discharge', x='areas') + \
    ggplot2.ggtitle('Area vs. Sediment Flux') + \
    ggplot2.scale_x_log10(x_lab) + \
    ggplot2.theme_bw() + \
    ggplot2.stat_smooth(method = "lm", formula = 'y ~ x') + \
    ggplot2.scale_y_log10(y_lab) + \
    annotate1 + \
    annotate2 + \
    annotate3 + \
    annotate4 + \
    annotate5 + \
    annotate6 + \
    ggplot2.geom_point(color='blue') + \
    ggplot2.geom_errorbar(ggplot2.aes_string(ymin='min',ymax='max'), data=dat_frame, width=.02, alpha=.3) + \
    ggplot2.geom_point(data=dat_frame2,color='red',show_guide='FALSE' ) + \
    ggplot2.stat_smooth(data=dat_frame2, method = "lm", formula = 'y ~ x', color='red')
# Write the figure to a PDF via R's grDevices.
grdevices = importr('grDevices')
grdevices.pdf(file="area_qs.pdf")
pp.plot()
#!/usr/bin/env python2
"""Plot sorted bootstrap p-scores against their empirical cumulative
proportion on log-log axes with a linear fit, writing pscores.eps."""
import cPickle

import numpy
from rpy2.robjects import FloatVector, DataFrame
from rpy2.robjects.lib import ggplot2
from rpy2.robjects.packages import importr

# Load the bootstrap scores. The `with` block closes the handle promptly
# (the original bare open() leaked it), and 'rb' is the correct mode for
# reading pickle files.
with open('bootstrap.pickle', 'rb') as fh:
    pscores = cPickle.load(fh)
pscores.sort()

# Empirical cumulative proportion: i/N for the i-th smallest score.
proportion = numpy.linspace(1, len(pscores), len(pscores)) / len(pscores)

dataf = DataFrame({
    'pscore': FloatVector(pscores),
    'proportion': FloatVector(proportion),
})

grdevices = importr('grDevices')
#grdevices.postscript(file="pscores.eps", width=512, height=512)
grdevices.postscript(file='pscores.eps')
(
    ggplot2.ggplot(dataf) +
    ggplot2.aes_string(y='pscore', x='proportion') +
    ggplot2.geom_point() +
    ggplot2.scale_x_log10() +
    ggplot2.scale_y_log10() +
    ggplot2.stat_smooth(method='lm')
).plot()
grdevices.dev_off()