Example #1
0
def rank_abundance_plot(counter, name):
    """Render a rank-abundance curve for *counter* to a PNG file.

    The image is written to ``analytics_out/<name>_rank_abundance.png``;
    the y axis (fraction of hits) is drawn on a log10 scale.
    """
    out_path = 'analytics_out/{0}_rank_abundance.png'.format(name)
    grdevices.png(out_path)
    ranks, fracs = rank_abundance_data(counter)
    frame = robjects.DataFrame({'rank': ranks, 'f': fracs})
    curve = (ggplot.ggplot(frame)
             + ggplot.aes_string(x='rank', y='f')
             + ggplot.geom_point()
             + ggplot.scale_y_log10(name='fraction of hits'))
    curve.plot()
    grdevices.dev_off()
 def plot_host_domain_tld(self):
     """Plot URLs per host / domain / tld as frequency polygons.

     Facets by crawl on log-log axes, saves a PNG under PLOTDIR and
     returns the ggplot object.
     """
     hist = self.histogr
     hist = hist[hist['type'].isin(['host', 'domain', 'tld'])]
     hist = hist[hist['type_counted'].isin(['url'])]
     img_path = os.path.join(PLOTDIR, 'crawler/histogr_host_domain_tld.png')
     title = 'URLs per Host / Domain / TLD'
     plot = (ggplot2.ggplot(hist)
             + ggplot2.aes_string(x='count', weight='frequency', color='type')
             + ggplot2.geom_freqpoly(bins=20)
             + ggplot2.facet_wrap('crawl', ncol=4)
             + ggplot2.labs(title='', x=title, y='Frequency')
             + ggplot2.scale_y_log10()
             + ggplot2.scale_x_log10())
     plot.save(img_path)
     return plot
 def plot_dupl_url(self):
     """Plot pages per URL (URL-level duplicates) as a jitter plot.

     Facets by crawl with a log10 y axis, saves a PNG under PLOTDIR
     and returns the ggplot object.
     """
     hist = self.histogr
     hist = hist[hist['type'].isin(['url'])]
     title = 'Pages per URL (URL-level duplicates)'
     plot = (ggplot2.ggplot(hist)
             + ggplot2.aes_string(x='count', y='frequency')
             + ggplot2.geom_jitter()
             + ggplot2.facet_wrap('crawl', ncol=5)
             + ggplot2.labs(title=title, x='(duplicate) pages per URL',
                            y='log(frequency)')
             + ggplot2.scale_y_log10())
     img_path = os.path.join(PLOTDIR, 'crawler/histogr_url_dupl.png')
     plot.save(img_path)
     return plot
 def plot_dupl_url(self):
     """Jitter plot of URL-level duplicates (pages per URL), per crawl.

     Writes a PNG under PLOTDIR and returns the plot object.
     """
     row_filter = ['url']
     subset = self.histogr[self.histogr['type'].isin(row_filter)]
     plot_title = 'Pages per URL (URL-level duplicates)'
     fig = ggplot2.ggplot(subset)
     fig += ggplot2.aes_string(x='count', y='frequency')
     fig += ggplot2.geom_jitter()
     fig += ggplot2.facet_wrap('crawl', ncol=5)
     fig += ggplot2.labs(title=plot_title, x='(duplicate) pages per URL',
                         y='log(frequency)')
     fig += ggplot2.scale_y_log10()
     fig.save(os.path.join(PLOTDIR, 'crawler/histogr_url_dupl.png'))
     return fig
 def plot_host_domain_tld(self):
     """Frequency polygons of URLs counted per host, domain and tld.

     One facet per crawl, both axes log10-scaled; the figure is saved
     under PLOTDIR and the plot object is returned.
     """
     subset = self.histogr
     subset = subset[subset['type'].isin(['host', 'domain', 'tld'])]
     subset = subset[subset['type_counted'].isin(['url'])]
     target = os.path.join(PLOTDIR,
                           'crawler/histogr_host_domain_tld.png')
     axis_label = 'URLs per Host / Domain / TLD'
     fig = ggplot2.ggplot(subset)
     fig += ggplot2.aes_string(x='count', weight='frequency', color='type')
     fig += ggplot2.geom_freqpoly(bins=20)
     fig += ggplot2.facet_wrap('crawl', ncol=4)
     fig += ggplot2.labs(title='', x=axis_label, y='Frequency')
     fig += ggplot2.scale_y_log10()
     fig += ggplot2.scale_x_log10()
     fig.save(target)
     return fig
Example #6
0
	def scatter(self, dataframe, filename, parm1, parm2, units1, units2, group, logx, logy):
		"""Write a 512x512 PNG scatter plot of parm1 vs. parm2 to filename.

		Points are coloured by *group*; axis labels append the given
		units to the parameter names. ``logx`` / ``logy`` independently
		switch the corresponding axis to a log10 scale.
		"""
		grdevices.png(file=filename, width=512, height=512)
		gg = ggplot2.ggplot(dataframe) \
			+ ggplot2.aes_string(x=parm1, y=parm2, colour=group) \
			+ ggplot2.geom_point(alpha=0.7) \
			+ ggplot2.labs(x=parm1 + " " + units1, y=parm2 + " " + units2)
		# Truthiness instead of '== True', and add each scale
		# independently instead of enumerating all four combinations.
		if logx:
			gg += ggplot2.scale_x_log10()
		if logy:
			gg += ggplot2.scale_y_log10()
		gg.plot()
		grdevices.dev_off()
Example #7
0
    def scatter(self, dataframe, filename, parm1, parm2, units1, units2, group,
                logx, logy):
        """Write a 512x512 PNG scatter plot of parm1 vs. parm2 to filename.

        Points are coloured by *group*; axis labels append the given
        units to the parameter names. ``logx`` / ``logy`` independently
        switch the corresponding axis to a log10 scale.
        """
        grdevices.png(file=filename, width=512, height=512)
        gg = ggplot2.ggplot(dataframe) \
            + ggplot2.aes_string(x=parm1, y=parm2, colour=group) \
            + ggplot2.geom_point(alpha=0.7) \
            + ggplot2.labs(x=parm1 + " " + units1, y=parm2 + " " + units2)
        # Truthiness instead of '== True', and add each scale
        # independently instead of enumerating all four combinations.
        if logx:
            gg += ggplot2.scale_x_log10()
        if logy:
            gg += ggplot2.scale_y_log10()
        gg.plot()
        grdevices.dev_off()
 def plot_domain_cumul(self, crawl):
     """Plot cumulative URL coverage over domains for a single *crawl*.

     Filters the histogram to domain/url rows of *crawl*, computes
     per-domain URL totals plus cumulative and percentage columns
     (printed as diagnostics), then draws cumulative URLs vs.
     cumulative domains on log-log axes. Saves a PNG and returns
     the plot object.
     """
     data = self.histogr
     data = data[data['type'].isin(['domain'])]
     data = data[data['crawl'] == crawl]
     data = data[data['type_counted'].isin(['url'])]
     # Work on a copy: the chained filters above yield views, and
     # assigning new columns into a view raises SettingWithCopyWarning.
     data = data.copy()
     data['urls'] = data['count'] * data['frequency']
     print(data)
     data = data[['urls', 'count', 'frequency']]
     # ascending expects a boolean, not 0
     data = data.sort_values(['count'], ascending=False)
     data['cum_domains'] = data['frequency'].cumsum()
     data['cum_urls'] = data['urls'].cumsum()
     data_perc = data.apply(lambda x: round(100.0 * x / float(x.sum()), 1))
     data['%domains'] = data_perc['frequency']
     data['%urls'] = data_perc['urls']
     # hoist the invariant totals out of the per-element lambdas
     total_domains = float(data['frequency'].sum())
     total_urls = float(data['urls'].sum())
     data['%cum_domains'] = data['cum_domains'].apply(
         lambda x: round(100.0 * x / total_domains, 1))
     data['%cum_urls'] = data['cum_urls'].apply(
         lambda x: round(100.0 * x / total_urls, 1))
     with pandas.option_context('display.max_rows', None,
                                'display.max_columns', None,
                                'display.width', 200):
         print(data)
     img_path = os.path.join(PLOTDIR,
                             'crawler/histogr_domain_cumul.png')
     title = 'Cumulative URLs for Top Domains'
     p = ggplot2.ggplot(data) \
         + ggplot2.aes_string(x='cum_domains', y='cum_urls') \
         + ggplot2.geom_line() + ggplot2.geom_point() \
         + GGPLOT2_THEME \
         + ggplot2.labs(title=title, x='domains cumulative',
                        y='URLs cumulative') \
         + ggplot2.scale_y_log10() \
         + ggplot2.scale_x_log10()
     p.save(img_path)
     return p
 def plot_domain_cumul(self, crawl):
     """Plot cumulative URL coverage over domains for a single *crawl*.

     Filters the histogram to domain/url rows of *crawl*, computes
     per-domain URL totals plus cumulative and percentage columns
     (printed as diagnostics), then draws cumulative URLs vs.
     cumulative domains on log-log axes. Saves a PNG and returns
     the plot object.
     """
     data = self.histogr
     data = data[data['type'].isin(['domain'])]
     data = data[data['crawl'] == crawl]
     data = data[data['type_counted'].isin(['url'])]
     # Work on a copy: the chained filters above yield views, and
     # assigning new columns into a view raises SettingWithCopyWarning.
     data = data.copy()
     data['urls'] = data['count'] * data['frequency']
     print(data)
     data = data[['urls', 'count', 'frequency']]
     # ascending expects a boolean, not 0
     data = data.sort_values(['count'], ascending=False)
     data['cum_domains'] = data['frequency'].cumsum()
     data['cum_urls'] = data['urls'].cumsum()
     data_perc = data.apply(lambda x: round(100.0 * x / float(x.sum()), 1))
     data['%domains'] = data_perc['frequency']
     data['%urls'] = data_perc['urls']
     # hoist the invariant totals out of the per-element lambdas
     total_domains = float(data['frequency'].sum())
     total_urls = float(data['urls'].sum())
     data['%cum_domains'] = data['cum_domains'].apply(
         lambda x: round(100.0 * x / total_domains, 1))
     data['%cum_urls'] = data['cum_urls'].apply(
         lambda x: round(100.0 * x / total_urls, 1))
     with pandas.option_context('display.max_rows', None,
                                'display.max_columns', None,
                                'display.width', 200):
         print(data)
     img_path = os.path.join(PLOTDIR, 'crawler/histogr_domain_cumul.png')
     title = 'Cumulative URLs for Top Domains'
     p = ggplot2.ggplot(data) \
         + ggplot2.aes_string(x='cum_domains', y='cum_urls') \
         + ggplot2.geom_line() + ggplot2.geom_point() \
         + GGPLOT2_THEME \
         + ggplot2.labs(title=title, x='domains cumulative',
                        y='URLs cumulative') \
         + ggplot2.scale_y_log10() \
         + ggplot2.scale_x_log10()
     p.save(img_path)
     return p
Example #10
0
        ggplot2.scale_colour_manual("Color",
                                    values=colormap,
                                    breaks=colormap.names,
                                    labels=[elt[1] for elt in
                                            colormap_labels]) + \
        ggplot2.geom_point(size=3) + \
        ggplot2.scale_linetype_manual(values=linemap) + \
        ggplot2.geom_line(size=1.5)

    # custom y-axis lines: major lines ("breaks") are every 10^n; 9
    #   minor lines ("minor_breaks") between major lines
    if (yscale == 'log'):
        pp = pp + \
            ggplot2.scale_y_log10(breaks = ro.r("10^(%d:%d)" % (gflops_range[0],
                                                                gflops_range[1])),
                                  minor_breaks =
                                  ro.r("rep(10^(%d:%d), each=9) * rep(1:9, %d)" %
                                       (gflops_range[0] - 1, gflops_range[1],
                                        gflops_range[1] - gflops_range[0])))

    #pp.plot(vp = vp)
#-- ggplot2perfcolor-end
grdevices.dev_off()

# grdevices.png('../../_static/graphics_ggplot2coordtranssqrt.png',
#               width = 612, height = 612)
# #-- ggplot2coordtranssqrt-begin
# pp = gp + \
#      ggplot2.aes_string(x='wt', y='mpg') + \
#      ggplot2.scale_y_sqrt() + \
#      ggplot2.geom_point()
Example #11
0
      ggplot2.scale_colour_manual("Color", 
                                  values=colormap,
                                  breaks=colormap.names,
                                  labels=[elt[1] for elt in 
                                          colormap_labels]) + \
      ggplot2.geom_point(size=3) + \
      ggplot2.scale_linetype_manual(values=linemap) + \
      ggplot2.geom_line(size=1.5)

  # custom y-axis lines: major lines ("breaks") are every 10^n; 9
  #   minor lines ("minor_breaks") between major lines
  if (yscale == 'log'):
    pp = pp + \
        ggplot2.scale_y_log10(breaks = ro.r("10^(%d:%d)" % (gflops_range[0], 
                                                            gflops_range[1])),
                              minor_breaks = 
                              ro.r("rep(10^(%d:%d), each=9) * rep(1:9, %d)" %
                                   (gflops_range[0] - 1, gflops_range[1], 
                                    gflops_range[1] - gflops_range[0])))

  pp.plot(vp = vp)
#-- ggplot2perfcolor-end
grdevices.dev_off()



# grdevices.png('../../_static/graphics_ggplot2coordtranssqrt.png',
#               width = 612, height = 612)
# #-- ggplot2coordtranssqrt-begin
# pp = gp + \
#      ggplot2.aes_string(x='wt', y='mpg') + \
#      ggplot2.scale_y_sqrt() + \
Example #12
0
# NOTE(review): this fragment depends on names defined earlier in the
# original script (r, areas, r_sq_lab, sl, r_sq_lab_lgm, sl_lgm, y_lab,
# dat_frame, dat_frame2, ggplot2, importr) -- it is not runnable alone.

# x-axis label as an R plotmath expression (area in km^2)
x_lab = r("expression(Area (km^{2}))")
# Red annotations near the right edge: "Mean Annual" caption plus its
# R^2 and slope labels, the latter two parsed as plotmath.
annotate1 = r('annotate("text", x = '+str(max(areas)-30)+', y = 0.5, color = "red", label = "Mean Annual", parse=FALSE)')
annotate2 = r('annotate("text", x = '+str(max(areas)-30)+', y = 0.42, label = "'+r_sq_lab+'", color = "red", parse=TRUE)')
annotate3 = r('annotate("text", x = '+str(max(areas)-30)+', y = 0.34, label = "slope~'+sl+'", color = "red", parse=TRUE)')

# Blue annotations, slightly further left: "LGM" caption plus its
# R^2 and slope labels.
annotate4 = r('annotate("text", x = '+str(max(areas)-150)+', y = 0.7, color = "blue", label = "LGM", parse=FALSE)')
annotate5 = r('annotate("text", x = '+str(max(areas)-150)+', y = 0.6, color = "blue", label = "'+r_sq_lab_lgm+'", parse=TRUE)')
annotate6 = r('annotate("text", x = '+str(max(areas)-150)+', y = 0.5, color = "blue", label = "slope~'+sl_lgm+'", parse=TRUE)')

# Log-log scatter of discharge vs. area with linear (lm) fits for both
# data frames, error bars on the first, and all six annotations.
pp = ggplot2.ggplot(dat_frame) + \
    ggplot2.aes_string(y='discharge', x='areas') + \
    ggplot2.ggtitle('Area vs. Sediment Flux') + \
    ggplot2.scale_x_log10(x_lab) + \
    ggplot2.theme_bw() + \
    ggplot2.stat_smooth(method = "lm", formula = 'y ~ x') + \
    ggplot2.scale_y_log10(y_lab) + \
    annotate1 + \
    annotate2 + \
    annotate3 + \
    annotate4 + \
    annotate5 + \
    annotate6 + \
    ggplot2.geom_point(color='blue') + \
    ggplot2.geom_errorbar(ggplot2.aes_string(ymin='min',ymax='max'), data=dat_frame, width=.02, alpha=.3) + \
    ggplot2.geom_point(data=dat_frame2,color='red',show_guide='FALSE' ) + \
    ggplot2.stat_smooth(data=dat_frame2, method = "lm", formula = 'y ~ x', color='red')

grdevices = importr('grDevices')

# Render the plot into a PDF graphics device.
grdevices.pdf(file="area_qs.pdf")
pp.plot()
# NOTE(review): no grdevices.dev_off() visible here -- presumably the
# device is closed later in the original script; verify.
Example #13
0
#!/usr/bin/env python2
"""Plot bootstrap p-scores against their empirical CDF on log-log axes."""

import cPickle
import numpy
from rpy2.robjects import FloatVector, DataFrame
from rpy2.robjects.lib import ggplot2
from rpy2.robjects.packages import importr

# Use a context manager so the file handle is closed, and open in
# binary mode as pickled data requires (the original leaked the handle
# and relied on text mode working on POSIX).
with open('bootstrap.pickle', 'rb') as fh:
    pscores = cPickle.load(fh)
pscores.sort()
# Empirical CDF: proportion of scores at or below each sorted score.
proportion = numpy.linspace(1, len(pscores), len(pscores)) / len(pscores)
dataf = DataFrame({
    'pscore': FloatVector(pscores),
    'proportion': FloatVector(proportion),
})

grdevices = importr('grDevices')
grdevices.postscript(file='pscores.eps')
(
    ggplot2.ggplot(dataf)
    + ggplot2.aes_string(y='pscore', x='proportion')
    + ggplot2.geom_point()
    + ggplot2.scale_x_log10()
    + ggplot2.scale_y_log10()
    + ggplot2.stat_smooth(method='lm')
).plot()
grdevices.dev_off()