def plot_host_domain_tld(self):
     # -- pages/URLs per host / domain / tld
     data = self.histogr
     data = data[data['type'].isin(['host', 'domain', 'tld'])]
     data = data[data['type_counted'].isin(['url'])]
     img_path = os.path.join(PLOTDIR, 'crawler/histogr_host_domain_tld.png')
     # data.to_csv(img_path + '.csv')
     title = 'URLs per Host / Domain / TLD'
     p = ggplot2.ggplot(data) \
         + ggplot2.aes_string(x='count', weight='frequency', color='type') \
         + ggplot2.geom_freqpoly(bins=20) \
         + ggplot2.facet_wrap('crawl', ncol=4) \
         + ggplot2.labs(title='', x=title,
                        y='Frequency') \
         + ggplot2.scale_y_log10() \
         + ggplot2.scale_x_log10()
     p.save(img_path)
     return p
 def plot_host_domain_tld(self):
     # -- pages/URLs per host / domain / tld
     data = self.histogr
     data = data[data['type'].isin(['host', 'domain', 'tld'])]
     data = data[data['type_counted'].isin(['url'])]
     img_path = os.path.join(PLOTDIR,
                             'crawler/histogr_host_domain_tld.png')
     # data.to_csv(img_path + '.csv')
     title = 'URLs per Host / Domain / TLD'
     p = ggplot2.ggplot(data) \
         + ggplot2.aes_string(x='count', weight='frequency', color='type') \
         + ggplot2.geom_freqpoly(bins=20) \
         + ggplot2.facet_wrap('crawl', ncol=4) \
         + ggplot2.labs(title='', x=title,
                        y='Frequency') \
         + ggplot2.scale_y_log10() \
         + ggplot2.scale_x_log10()
     p.save(img_path)
     return p
Beispiel #3
0
	def scatter(self, dataframe, filename, parm1, parm2, units1, units2, group,logx,logy):
		grdevices.png(file=filename, width=512, height=512)
		data = ggplot2.ggplot(dataframe)
		aes = ggplot2.aes_string(x=parm1, y=parm2,colour=group)
		geom = ggplot2.geom_point(alpha = 0.7)
		labs = ggplot2.labs(x=parm1+ " " + units1, y=parm2 + " " + units2)
		xlogscale = ggplot2.scale_x_log10()
		ylogscale = ggplot2.scale_y_log10()
		
		if logx == True and logy == True:
			gg = data + aes + geom + labs + xlogscale + ylogscale
		elif logx == True:
			gg = data + aes + geom + labs + xlogscale 
		elif logy == True:
			gg = data + aes + geom + labs + ylogscale
		else:
			gg = data + aes + geom + labs 
			
		gg.plot()
		grdevices.dev_off()
Beispiel #4
0
    def scatter(self, dataframe, filename, parm1, parm2, units1, units2, group,
                logx, logy):
        grdevices.png(file=filename, width=512, height=512)
        data = ggplot2.ggplot(dataframe)
        aes = ggplot2.aes_string(x=parm1, y=parm2, colour=group)
        geom = ggplot2.geom_point(alpha=0.7)
        labs = ggplot2.labs(x=parm1 + " " + units1, y=parm2 + " " + units2)
        xlogscale = ggplot2.scale_x_log10()
        ylogscale = ggplot2.scale_y_log10()

        if logx == True and logy == True:
            gg = data + aes + geom + labs + xlogscale + ylogscale
        elif logx == True:
            gg = data + aes + geom + labs + xlogscale
        elif logy == True:
            gg = data + aes + geom + labs + ylogscale
        else:
            gg = data + aes + geom + labs

        gg.plot()
        grdevices.dev_off()
 def plot_domain_cumul(self, crawl):
     # -- coverage (cumulative pages) per domain
     data = self.histogr
     data = data[data['type'].isin(['domain'])]
     data = data[data['crawl'] == crawl]
     data = data[data['type_counted'].isin(['url'])]
     data['urls'] = data['count']*data['frequency']
     print(data)
     data = data[['urls', 'count', 'frequency']]
     data = data.sort_values(['count'], ascending=0)
     data['cum_domains'] = data['frequency'].cumsum()
     data['cum_urls'] = data['urls'].cumsum()
     data_perc = data.apply(lambda x: round(100.0*x/float(x.sum()), 1))
     data['%domains'] = data_perc['frequency']
     data['%urls'] = data_perc['urls']
     data['%cum_domains'] = data['cum_domains'].apply(
         lambda x: round(100.0*x/float(data['frequency'].sum()), 1))
     data['%cum_urls'] = data['cum_urls'].apply(
         lambda x: round(100.0*x/float(data['urls'].sum()), 1))
     with pandas.option_context('display.max_rows', None,
                                'display.max_columns', None,
                                'display.width', 200):
         print(data)
     img_path = os.path.join(PLOTDIR,
                             'crawler/histogr_domain_cumul.png')
     # data.to_csv(img_path + '.csv')
     title = 'Cumulative URLs for Top Domains'
     p = ggplot2.ggplot(data) \
         + ggplot2.aes_string(x='cum_domains', y='cum_urls') \
         + ggplot2.geom_line() + ggplot2.geom_point() \
         + GGPLOT2_THEME \
         + ggplot2.labs(title=title, x='domains cumulative',
                        y='URLs cumulative') \
         + ggplot2.scale_y_log10() \
         + ggplot2.scale_x_log10()
     p.save(img_path)
     return p
 def plot_domain_cumul(self, crawl):
     # -- coverage (cumulative pages) per domain
     data = self.histogr
     data = data[data['type'].isin(['domain'])]
     data = data[data['crawl'] == crawl]
     data = data[data['type_counted'].isin(['url'])]
     data['urls'] = data['count'] * data['frequency']
     print(data)
     data = data[['urls', 'count', 'frequency']]
     data = data.sort_values(['count'], ascending=0)
     data['cum_domains'] = data['frequency'].cumsum()
     data['cum_urls'] = data['urls'].cumsum()
     data_perc = data.apply(lambda x: round(100.0 * x / float(x.sum()), 1))
     data['%domains'] = data_perc['frequency']
     data['%urls'] = data_perc['urls']
     data['%cum_domains'] = data['cum_domains'].apply(
         lambda x: round(100.0 * x / float(data['frequency'].sum()), 1))
     data['%cum_urls'] = data['cum_urls'].apply(
         lambda x: round(100.0 * x / float(data['urls'].sum()), 1))
     with pandas.option_context('display.max_rows', None,
                                'display.max_columns', None,
                                'display.width', 200):
         print(data)
     img_path = os.path.join(PLOTDIR, 'crawler/histogr_domain_cumul.png')
     # data.to_csv(img_path + '.csv')
     title = 'Cumulative URLs for Top Domains'
     p = ggplot2.ggplot(data) \
         + ggplot2.aes_string(x='cum_domains', y='cum_urls') \
         + ggplot2.geom_line() + ggplot2.geom_point() \
         + GGPLOT2_THEME \
         + ggplot2.labs(title=title, x='domains cumulative',
                        y='URLs cumulative') \
         + ggplot2.scale_y_log10() \
         + ggplot2.scale_x_log10()
     p.save(img_path)
     return p
Beispiel #7
0

y_lab = r("expression(Discharge (m^{3}/s))")
x_lab = r("expression(Area (km^{2}))")
annotate1 = r('annotate("text", x = '+str(max(areas)-30)+', y = 0.5, color = "red", label = "Mean Annual", parse=FALSE)')
annotate2 = r('annotate("text", x = '+str(max(areas)-30)+', y = 0.42, label = "'+r_sq_lab+'", color = "red", parse=TRUE)')
annotate3 = r('annotate("text", x = '+str(max(areas)-30)+', y = 0.34, label = "slope~'+sl+'", color = "red", parse=TRUE)')

annotate4 = r('annotate("text", x = '+str(max(areas)-150)+', y = 0.7, color = "blue", label = "LGM", parse=FALSE)')
annotate5 = r('annotate("text", x = '+str(max(areas)-150)+', y = 0.6, color = "blue", label = "'+r_sq_lab_lgm+'", parse=TRUE)')
annotate6 = r('annotate("text", x = '+str(max(areas)-150)+', y = 0.5, color = "blue", label = "slope~'+sl_lgm+'", parse=TRUE)')

pp = ggplot2.ggplot(dat_frame) + \
    ggplot2.aes_string(y='discharge', x='areas') + \
    ggplot2.ggtitle('Area vs. Sediment Flux') + \
    ggplot2.scale_x_log10(x_lab) + \
    ggplot2.theme_bw() + \
    ggplot2.stat_smooth(method = "lm", formula = 'y ~ x') + \
    ggplot2.scale_y_log10(y_lab) + \
    annotate1 + \
    annotate2 + \
    annotate3 + \
    annotate4 + \
    annotate5 + \
    annotate6 + \
    ggplot2.geom_point(color='blue') + \
    ggplot2.geom_errorbar(ggplot2.aes_string(ymin='min',ymax='max'), data=dat_frame, width=.02, alpha=.3) + \
    ggplot2.geom_point(data=dat_frame2,color='red',show_guide='FALSE' ) + \
    ggplot2.stat_smooth(data=dat_frame2, method = "lm", formula = 'y ~ x', color='red')

grdevices = importr('grDevices')
Beispiel #8
0
samples.index = npy.arange(len(samples))
samplesgrouped = samples.groupby(['model'])
variances = samplesgrouped['Zweighted'].aggregate(npy.var)
print variances
print variances['BG'] / variances['BS']
print estimatesum(samples)
print samplesgrouped['Zweighted'].aggregate(estimatesum)
print trueZnsum

# grdevices.png(file="sampled-Z.png", width=4, height=3, units="in", res=300)
rsamples = com.convert_to_r_dataframe(samples)
pp = ggplot2.ggplot(rsamples) + \
    ggplot2.aes_string(x='Z', color='factor(model)') + \
    ggplot2.scale_colour_discrete(name="model") + \
    ggplot2.geom_density() + \
    ggplot2.scale_x_log10()
# ggplot2.scale_x_continuous(limits=FloatVector((0, 1)))
pp.plot()
# grdevices.dev_off()


def makeestimate(sampler, numsamples, **kwargs):
    samples = sample(sampler, numsamples, **kwargs)
    return estimatesum(samples['Zweighted'])


def makeestimates(sampler, numsamples, numestimates, **kwargs):
    estimates = [
        makeestimate(sampler, numsamples, **kwargs)
        for _ in xrange(numestimates)]
    kwargs.update({
Beispiel #9
0
emdf = pd.DataFrame({
    'BSdists' : distsbs,
    'BGdists' : distsbg,
    'truesums' : truesums,
    'varratios' : varratios,
})

# Plot sampled Z
logging.info('Plotting sampled Zn')
grdevices.png(file="sampled-Z.png", width=4, height=3, units="in", res=300)
rsamples = com.convert_to_r_dataframe(samples)
pp = ggplot2.ggplot(rsamples) + \
    ggplot2.aes_string(x='Z', color='factor(model)') + \
    ggplot2.scale_colour_discrete(name="model") + \
    ggplot2.geom_density() + \
    ggplot2.scale_x_log10()
# ggplot2.scale_x_continuous(limits=FloatVector((0, 1)))
pp.plot()
grdevices.dev_off()

# Plot likelihood ratios
logging.info('Plotting likelihood ratios from binding site samples')
grdevices.png(file="sampled-ratios.png",
              width=4, height=3, units="in", res=300)
rsamplesbs = com.convert_to_r_dataframe(samples[samples['model'] == 'BS'])
pp = ggplot2.ggplot(rsamplesbs) + \
    ggplot2.aes_string(x='ir') + \
    ggplot2.geom_density() + \
    ggplot2.scale_x_log10()
pp.plot()
grdevices.dev_off()