def plot_host_domain_tld(self):
    """Plot how many URLs fall on each host, domain and TLD.

    Takes the rows of ``self.histogr`` whose ``type`` is ``host``,
    ``domain`` or ``tld`` and whose ``type_counted`` is ``url``, draws one
    frequency polygon per type (weighted by ``frequency``, both axes log10,
    one facet per crawl), saves the figure below ``PLOTDIR`` and returns the
    plot object.
    """
    histogr = self.histogr
    by_type = histogr[histogr['type'].isin(['host', 'domain', 'tld'])]
    url_counts = by_type[by_type['type_counted'].isin(['url'])]
    img_path = os.path.join(PLOTDIR, 'crawler/histogr_host_domain_tld.png')
    # The descriptive text is used as the x-axis label; the plot title
    # itself is deliberately left empty.
    x_label = 'URLs per Host / Domain / TLD'
    plot = (
        ggplot2.ggplot(url_counts)
        + ggplot2.aes_string(x='count', weight='frequency', color='type')
        + ggplot2.geom_freqpoly(bins=20)
        + ggplot2.facet_wrap('crawl', ncol=4)
        + ggplot2.labs(title='', x=x_label, y='Frequency')
        + ggplot2.scale_y_log10()
        + ggplot2.scale_x_log10()
    )
    plot.save(img_path)
    return plot
def scatter(self, dataframe, filename, parm1, parm2, units1, units2, group, logx, logy):
    """Write a 512x512 PNG scatter plot of ``parm2`` against ``parm1``.

    Args:
        dataframe: Data handed to ``ggplot2.ggplot``.
        filename: Path of the PNG file to write.
        parm1: Column mapped to the x axis; also the first part of the
            x-axis label.
        parm2: Column mapped to the y axis; also the first part of the
            y-axis label.
        units1: Text appended, after a space, to the x-axis label.
        units2: Text appended, after a space, to the y-axis label.
        group: Column mapped to the point colour.
        logx: When true, the x axis uses a log10 scale.
        logy: When true, the y axis uses a log10 scale.
    """
    # Build the complete plot before opening the graphics device, so a
    # failure while assembling it cannot leave a device open.
    gg = (
        ggplot2.ggplot(dataframe)
        + ggplot2.aes_string(x=parm1, y=parm2, colour=group)
        + ggplot2.geom_point(alpha=0.7)
        + ggplot2.labs(x=parm1 + " " + units1, y=parm2 + " " + units2)
    )
    if logx:
        gg = gg + ggplot2.scale_x_log10()
    if logy:
        gg = gg + ggplot2.scale_y_log10()

    grdevices.png(file=filename, width=512, height=512)
    try:
        gg.plot()
    finally:
        # Always close the device, even when rendering raises; otherwise
        # the PNG device stays open and later plots are drawn into it.
        grdevices.dev_off()
def scatter(self, dataframe, filename, parm1, parm2, units1, units2, group, logx, logy):
    """Write a 512x512 PNG scatter plot of ``parm2`` against ``parm1``.

    Args:
        dataframe: Data handed to ``ggplot2.ggplot``.
        filename: Path of the PNG file to write.
        parm1: Column mapped to the x axis; also the first part of the
            x-axis label.
        parm2: Column mapped to the y axis; also the first part of the
            y-axis label.
        units1: Text appended, after a space, to the x-axis label.
        units2: Text appended, after a space, to the y-axis label.
        group: Column mapped to the point colour.
        logx: When true, the x axis uses a log10 scale.
        logy: When true, the y axis uses a log10 scale.
    """
    # Build the complete plot before opening the graphics device, so a
    # failure while assembling it cannot leave a device open.
    gg = (
        ggplot2.ggplot(dataframe)
        + ggplot2.aes_string(x=parm1, y=parm2, colour=group)
        + ggplot2.geom_point(alpha=0.7)
        + ggplot2.labs(x=parm1 + " " + units1, y=parm2 + " " + units2)
    )
    if logx:
        gg = gg + ggplot2.scale_x_log10()
    if logy:
        gg = gg + ggplot2.scale_y_log10()

    grdevices.png(file=filename, width=512, height=512)
    try:
        gg.plot()
    finally:
        # Always close the device, even when rendering raises; otherwise
        # the PNG device stays open and later plots are drawn into it.
        grdevices.dev_off()
def plot_domain_cumul(self, crawl):
    """Plot cumulative URL coverage over the domains of one crawl.

    Selects from ``self.histogr`` the rows with ``type`` ``domain``,
    ``type_counted`` ``url`` and the requested ``crawl``, orders them by
    ``count`` descending and accumulates the number of domains
    (``frequency``) and URLs (``count * frequency``).  The intermediate and
    the final table are printed, and a log-log line/point plot of cumulative
    URLs against cumulative domains is saved below ``PLOTDIR``.

    Args:
        crawl: Value matched against the ``crawl`` column.

    Returns:
        The ggplot object that was saved.
    """
    histogr = self.histogr
    selected = (
        histogr['type'].isin(['domain'])
        & (histogr['crawl'] == crawl)
        & histogr['type_counted'].isin(['url'])
    )
    # Work on an explicit copy: adding columns to a filtered slice raises
    # pandas' SettingWithCopyWarning and leaves it unclear whether
    # self.histogr is affected.
    data = histogr[selected].copy()
    data['urls'] = data['count'] * data['frequency']
    print(data)

    data = data[['urls', 'count', 'frequency']]
    data = data.sort_values(['count'], ascending=False)
    data['cum_domains'] = data['frequency'].cumsum()
    data['cum_urls'] = data['urls'].cumsum()

    # Column totals are computed once instead of once per row.
    total_domains = float(data['frequency'].sum())
    total_urls = float(data['urls'].sum())
    data['%domains'] = (100.0 * data['frequency'] / total_domains).round(1)
    data['%urls'] = (100.0 * data['urls'] / total_urls).round(1)
    # The cumulative shares stay element-wise so that each value is rounded
    # exactly as before.
    data['%cum_domains'] = data['cum_domains'].apply(
        lambda x: round(100.0 * x / total_domains, 1))
    data['%cum_urls'] = data['cum_urls'].apply(
        lambda x: round(100.0 * x / total_urls, 1))

    # Print the whole table without pandas truncating rows or columns.
    with pandas.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.width', 200):
        print(data)

    img_path = os.path.join(PLOTDIR, 'crawler/histogr_domain_cumul.png')
    title = 'Cumulative URLs for Top Domains'
    p = (
        ggplot2.ggplot(data)
        + ggplot2.aes_string(x='cum_domains', y='cum_urls')
        + ggplot2.geom_line()
        + ggplot2.geom_point()
        + GGPLOT2_THEME
        + ggplot2.labs(title=title, x='domains cumulative',
                       y='URLs cumulative')
        + ggplot2.scale_y_log10()
        + ggplot2.scale_x_log10()
    )
    p.save(img_path)
    return p
def plot_domain_cumul(self, crawl):
    """Plot cumulative URL coverage over the domains of one crawl.

    Selects from ``self.histogr`` the rows with ``type`` ``domain``,
    ``type_counted`` ``url`` and the requested ``crawl``, orders them by
    ``count`` descending and accumulates the number of domains
    (``frequency``) and URLs (``count * frequency``).  The intermediate and
    the final table are printed, and a log-log line/point plot of cumulative
    URLs against cumulative domains is saved below ``PLOTDIR``.

    Args:
        crawl: Value matched against the ``crawl`` column.

    Returns:
        The ggplot object that was saved.
    """
    histogr = self.histogr
    selected = (
        histogr['type'].isin(['domain'])
        & (histogr['crawl'] == crawl)
        & histogr['type_counted'].isin(['url'])
    )
    # Work on an explicit copy: adding columns to a filtered slice raises
    # pandas' SettingWithCopyWarning and leaves it unclear whether
    # self.histogr is affected.
    data = histogr[selected].copy()
    data['urls'] = data['count'] * data['frequency']
    print(data)

    data = data[['urls', 'count', 'frequency']]
    data = data.sort_values(['count'], ascending=False)
    data['cum_domains'] = data['frequency'].cumsum()
    data['cum_urls'] = data['urls'].cumsum()

    # Column totals are computed once instead of once per row.
    total_domains = float(data['frequency'].sum())
    total_urls = float(data['urls'].sum())
    data['%domains'] = (100.0 * data['frequency'] / total_domains).round(1)
    data['%urls'] = (100.0 * data['urls'] / total_urls).round(1)
    # The cumulative shares stay element-wise so that each value is rounded
    # exactly as before.
    data['%cum_domains'] = data['cum_domains'].apply(
        lambda x: round(100.0 * x / total_domains, 1))
    data['%cum_urls'] = data['cum_urls'].apply(
        lambda x: round(100.0 * x / total_urls, 1))

    # Print the whole table without pandas truncating rows or columns.
    with pandas.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.width', 200):
        print(data)

    img_path = os.path.join(PLOTDIR, 'crawler/histogr_domain_cumul.png')
    title = 'Cumulative URLs for Top Domains'
    p = (
        ggplot2.ggplot(data)
        + ggplot2.aes_string(x='cum_domains', y='cum_urls')
        + ggplot2.geom_line()
        + ggplot2.geom_point()
        + GGPLOT2_THEME
        + ggplot2.labs(title=title, x='domains cumulative',
                       y='URLs cumulative')
        + ggplot2.scale_y_log10()
        + ggplot2.scale_x_log10()
    )
    p.save(img_path)
    return p
# Axis titles are R `expression(...)` objects evaluated through `r`.
y_lab = r("expression(Discharge (m^{3}/s))")
x_lab = r("expression(Area (km^{2}))")

# x positions of the two text columns, measured back from the largest area:
# the red block sits 30 units, the blue block 150 units, left of it.
x_red = str(max(areas) - 30)
x_blue = str(max(areas) - 150)

# Red annotations: heading, R^2 label and slope.
annotate1 = r('annotate("text", x = ' + x_red
              + ', y = 0.5, color = "red", label = "Mean Annual", parse=FALSE)')
annotate2 = r('annotate("text", x = ' + x_red
              + ', y = 0.42, label = "' + r_sq_lab
              + '", color = "red", parse=TRUE)')
annotate3 = r('annotate("text", x = ' + x_red
              + ', y = 0.34, label = "slope~' + sl
              + '", color = "red", parse=TRUE)')

# Blue annotations: heading, R^2 label and slope.
annotate4 = r('annotate("text", x = ' + x_blue
              + ', y = 0.7, color = "blue", label = "LGM", parse=FALSE)')
annotate5 = r('annotate("text", x = ' + x_blue
              + ', y = 0.6, color = "blue", label = "' + r_sq_lab_lgm
              + '", parse=TRUE)')
annotate6 = r('annotate("text", x = ' + x_blue
              + ', y = 0.5, color = "blue", label = "slope~' + sl_lgm
              + '", parse=TRUE)')

# Log-log plot of `dat_frame` (blue points with error bars and a linear fit)
# overlaid with `dat_frame2` (red points and their own linear fit).
pp = (
    ggplot2.ggplot(dat_frame)
    + ggplot2.aes_string(y='discharge', x='areas')
    + ggplot2.ggtitle('Area vs. Sediment Flux')
    + ggplot2.scale_x_log10(x_lab)
    + ggplot2.theme_bw()
    + ggplot2.stat_smooth(method="lm", formula='y ~ x')
    + ggplot2.scale_y_log10(y_lab)
    + annotate1
    + annotate2
    + annotate3
    + annotate4
    + annotate5
    + annotate6
    + ggplot2.geom_point(color='blue')
    + ggplot2.geom_errorbar(ggplot2.aes_string(ymin='min', ymax='max'),
                            data=dat_frame, width=.02, alpha=.3)
    + ggplot2.geom_point(data=dat_frame2, color='red', show_guide='FALSE')
    + ggplot2.stat_smooth(data=dat_frame2, method="lm", formula='y ~ x',
                          color='red')
)
grdevices = importr('grDevices')
# Re-number the rows 0..len(samples)-1, then group the samples by model.
samples.index = npy.arange(len(samples))
samplesgrouped = samples.groupby(['model'])
# Per-model variance of the 'Zweighted' column.
variances = samplesgrouped['Zweighted'].aggregate(npy.var)
print variances
# Ratio of the 'BG' variance to the 'BS' variance.
print variances['BG'] / variances['BS']
# estimatesum over all samples, then per model, followed by trueZnsum.
print estimatesum(samples)
print samplesgrouped['Zweighted'].aggregate(estimatesum)
print trueZnsum
# Density of Z per model on a log10 x axis.  The png/dev_off calls are
# commented out, so the plot goes to whichever graphics device is active.
# grdevices.png(file="sampled-Z.png", width=4, height=3, units="in", res=300)
rsamples = com.convert_to_r_dataframe(samples)
pp = ggplot2.ggplot(rsamples) + \
    ggplot2.aes_string(x='Z', color='factor(model)') + \
    ggplot2.scale_colour_discrete(name="model") + \
    ggplot2.geom_density() + \
    ggplot2.scale_x_log10()
# ggplot2.scale_x_continuous(limits=FloatVector((0, 1)))
pp.plot()
# grdevices.dev_off()


def makeestimate(sampler, numsamples, **kwargs):
    """Return ``estimatesum`` of the 'Zweighted' column of one sample run.

    ``sampler``, ``numsamples`` and any extra keyword arguments are passed
    straight through to ``sample``.
    """
    samples = sample(sampler, numsamples, **kwargs)
    return estimatesum(samples['Zweighted'])


def makeestimates(sampler, numsamples, numestimates, **kwargs):
    # One makeestimate() result per repetition, numestimates in total.
    estimates = [
        makeestimate(sampler, numsamples, **kwargs)
        for _ in xrange(numestimates)]
    kwargs.update({
# Gather the collected result sequences into one table, one column each.
emdf = pd.DataFrame({
    'BSdists': distsbs,
    'BGdists': distsbg,
    'truesums': truesums,
    'varratios': varratios,
})

# Both figures are written with the same PNG geometry (4x3 in at 300 dpi).
png_opts = dict(width=4, height=3, units="in", res=300)

# Plot sampled Z: one density curve per model on a log10 x axis.
logging.info('Plotting sampled Zn')
grdevices.png(file="sampled-Z.png", **png_opts)
rsamples = com.convert_to_r_dataframe(samples)
pp = (
    ggplot2.ggplot(rsamples)
    + ggplot2.aes_string(x='Z', color='factor(model)')
    + ggplot2.scale_colour_discrete(name="model")
    + ggplot2.geom_density()
    + ggplot2.scale_x_log10()
)
# Alternative x scale: ggplot2.scale_x_continuous(limits=FloatVector((0, 1)))
pp.plot()
grdevices.dev_off()

# Plot likelihood ratios: density of 'ir' for the 'BS' samples only.
logging.info('Plotting likelihood ratios from binding site samples')
grdevices.png(file="sampled-ratios.png", **png_opts)
is_bs = samples['model'] == 'BS'
rsamplesbs = com.convert_to_r_dataframe(samples[is_bs])
pp = (
    ggplot2.ggplot(rsamplesbs)
    + ggplot2.aes_string(x='ir')
    + ggplot2.geom_density()
    + ggplot2.scale_x_log10()
)
pp.plot()
grdevices.dev_off()