def _plot_with_rpy2(self, regions, filename):
    from rpy2 import robjects
    import rpy2.robjects.lib.ggplot2 as ggplot2
    from rpy2.robjects.lib import grid
    from rpy2.robjects.packages import importr
    grdevices = importr('grDevices')
    base = importr('base')
    grdevices.pdf(file=filename + '.pdf')
    t = [x for x in range(-self.num_bins, self.num_bins + 1)]
    for region in regions[:self.num_regs]:
        if not np.any(region.weighted):
            logger.warning("No data for region located on bin " +
                           str(region.bin) + ". Not plotting this one.")
            continue
        # integer division: 'middle' is used as a slice index below
        middle = (len(region.weighted[0]) - 1) // 2
        if middle < self.num_bins:
            logger.error("There are fewer bins calculated for regions "
                         "than requested for plotting.")
            sys.exit(1)
        d = {'map': robjects.StrVector(
                 [str(m) for sublist in
                  [[x] * len(t) for x in range(len(region.weighted))]
                  for m in sublist]),
             't': robjects.FloatVector(t * len(region.weighted)),
             'e': robjects.FloatVector(
                 [i for sublist in region.weighted
                  for i in sublist[middle - self.num_bins:middle + self.num_bins + 1]]),
             'p': robjects.FloatVector(
                 [-np.log10(x) for sublist in region.pvalues
                  for x in sublist[middle - self.num_bins:middle + self.num_bins + 1]]),
             'c': robjects.FloatVector(
                 [-np.log10(x) for sublist in region.corrected_pvalues
                  for x in sublist[middle - self.num_bins:middle + self.num_bins + 1]])}
        dataf = robjects.DataFrame(d)
        gp = ggplot2.ggplot(dataf)
        # first yellow second red
        p1 = gp + ggplot2.geom_line(mapping=ggplot2.aes_string(x='t', y='e', group='map', colour='map'),
                                    alpha=0.8) \
            + ggplot2.scale_y_continuous(trans='log2') \
            + ggplot2.ggtitle("\n".join(wrap("Bin " + str(region.bin) + " : " + str(region.positions)))) \
            + ggplot2.labs(y="log Intensity") \
            + ggplot2.theme_classic() \
            + ggplot2.theme(**{'axis.title.x': ggplot2.element_blank(),
                               'axis.text.y': ggplot2.element_text(angle=45),
                               'axis.text.x': ggplot2.element_blank(),
                               'legend.position': 'none'}) \
            + ggplot2.scale_colour_brewer(palette="Set1")
        p2 = gp + ggplot2.geom_line(mapping=ggplot2.aes_string(x='t', y='p', group='map', colour='map'),
                                    alpha=0.8) \
            + ggplot2.labs(y="-log10(p-value)") \
            + ggplot2.theme_classic() \
            + ggplot2.theme(**{'axis.title.x': ggplot2.element_blank(),
                               'axis.text.x': ggplot2.element_blank(),
                               'legend.position': 'none'}) \
            + ggplot2.scale_colour_brewer(palette="Set1")
        p3 = gp + ggplot2.geom_line(mapping=ggplot2.aes_string(x='t', y='c', group='map', colour='map'),
                                    alpha=0.8) \
            + ggplot2.labs(y="-log10(q-value)",
                           x='bins (' + str(self.bin_res) + ' bp each)') \
            + ggplot2.geom_hline(mapping=ggplot2.aes_string(yintercept=str(-np.log10(self.threshold))),
                                 colour='black', alpha=0.8, linetype='dashed') \
            + ggplot2.theme_classic() \
            + ggplot2.theme(**{'legend.position': 'none'}) \
            + ggplot2.scale_colour_brewer(palette="Set1")
        # stack the three panels into one column and draw them on the current page
        g1 = ggplot2.ggplot2.ggplotGrob(p1)
        g2 = ggplot2.ggplot2.ggplotGrob(p2)
        g3 = ggplot2.ggplot2.ggplotGrob(p3)
        robjects.globalenv["g"] = base.rbind(g1, g2, g3, size='first')
        robjects.r("grid::grid.draw(g)")
        grid.newpage()
        logger.debug('Plotted region ' + str(region.bin))
    grdevices.dev_off()
def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    data = data[['crawl', 'percentage', 'type']]
    categories = []
    for value in row_filter:
        if re.search('^fetcher:(?:aggr:)?', value):
            replacement = re.sub('^fetcher:(?:aggr:)?', '', value)
            categories.append(replacement)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # list.reverse() reverses in place and returns None, so reverse first
    # and pass the list itself to pandas.Categorical
    categories.reverse()
    data['type'] = pandas.Categorical(data['type'], ordered=True,
                                      categories=categories)
    ratio = 0.1 + len(data['crawl'].unique()) * .03
    # print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='percentage', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=True)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom', 'aspect.ratio': ratio}) \
        + ggplot2.labs(title='Percentage of Fetch Status', x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path, height=int(7 * ratio), width=7)
    return p
def gg_funcs(functions, bottom, top, N=1000, labels=["Baseline"],
             title="Consumption and Cash-on-Hand", ylab="y", xlab="x",
             loc=loc, ltitle='Variable', file_name=None):
    # 'loc' defaults to the module-level legend-location constant
    if isinstance(functions, list):
        function_list = functions
    else:
        function_list = [functions]
    step = (top - bottom) / N
    x = np.arange(bottom, top, step)
    fig = pd.DataFrame({'x': x})
    for i, function in enumerate(function_list):
        fig[labels[i]] = function(x)
    fig = pd.melt(fig, id_vars=['x'])
    g = gg.ggplot(fig) + \
        mp.base_plot + mp.line + mp.point + \
        mp.theme_bw(base_size=9) + mp.fte_theme + mp.colors + \
        gg.labs(title=title, y=ylab, x=xlab) + \
        mp.legend_f(loc) + mp.legend_t_c(ltitle) + mp.legend_t_s(ltitle)
    if file_name is not None:
        mp.ggsave(file_name, g)
    return g
def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    categories = []
    for value in row_filter:
        if re.search('^crawldb:status:db_', value):
            replacement = re.sub('^crawldb:status:db_', '', value)
            categories.append(replacement)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # list.reverse() returns None, so reverse in place before use
    categories.reverse()
    data['type'] = pandas.Categorical(data['type'], ordered=True,
                                      categories=categories)
    data['size'] = data['size'].astype(float)
    ratio = 0.1 + len(data['crawl'].unique()) * .03
    print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='size', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='Pastel1', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=False)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom', 'aspect.ratio': ratio}) \
        + ggplot2.labs(title='CrawlDb Size and Status Counts\n(before crawling)',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path, height=int(7 * ratio), width=7)
    return p
def line_plot(self, data, title, ylabel, img_file,
              x='date', y='size', c='type', clabel=''):
    if PLOTLIB == 'ggplot':
        # date_label = "%Y\n%b"
        date_label = "%Y\n%W"  # year + week number
        p = ggplot(data, aes(x=x, y=y, color=c)) \
            + ggtitle(title) \
            + ylab(ylabel) \
            + xlab(' ') \
            + scale_x_date(breaks=date_breaks('3 months'), labels=date_label) \
            + geom_line() + geom_point()
    elif PLOTLIB == 'rpy2.ggplot2':
        # convert y axis to float because R uses 32-bit signed integers,
        # values > 2 bln. (2^31) will overflow
        data[y] = data[y].astype(float)
        p = ggplot2.ggplot(data) \
            + ggplot2.aes_string(x=x, y=y, color=c) \
            + ggplot2.geom_line() + ggplot2.geom_point() \
            + GGPLOT2_THEME \
            + ggplot2.labs(title=title, x='', y=ylabel, color=clabel)
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path)
    # data.to_csv(img_path + '.csv')
    return p
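Several of the snippets here add a module-level GGPLOT2_THEME constant that is defined elsewhere in their source repository. A minimal stand-in (purely an assumption, not the repository's actual definition) that makes them runnable in isolation:

    import rpy2.robjects.lib.ggplot2 as ggplot2
    # hypothetical stand-in for the repository's shared theme constant
    GGPLOT2_THEME = ggplot2.theme(**{'legend.key': ggplot2.element_blank()})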
def plot_total_bp(parser, args, tot_bp_per_pore):
    """Plot the pore performance"""
    import math

    r = robjects.r
    r.library("ggplot2")
    grdevices = importr("grDevices")

    flowcell_layout = minion_flowcell_layout()

    pore_values = []
    for pore in flowcell_layout:
        if pore in tot_bp_per_pore:
            pore_values.append(math.log10(tot_bp_per_pore[pore]))
        else:
            pore_values.append(0)

    # make a data frame of the lists
    # (range objects cannot be multiplied in Python 3, hence list();
    # log10 values are floats, so a FloatVector is used for them)
    d = {
        "rownum": robjects.IntVector(list(range(1, 17)) * 32),
        "colnum": robjects.IntVector(sorted(list(range(1, 33)) * 16)),
        "log10_tot_bp": robjects.FloatVector(pore_values),
        "labels": robjects.IntVector(flowcell_layout),
    }
    df = robjects.DataFrame(d)
    gp = gg.ggplot(df)
    pp = (
        gp
        + gg.aes_string(y="factor(rownum, rev(rownum))", x="factor(colnum)")
        + gg.geom_point(gg.aes_string(color="log10_tot_bp"), size=7)
        + gg.geom_text(gg.aes_string(label="labels"), colour="white", size=2)
        + gg.scale_colour_gradient2(low="black", mid="black", high="red")
        + gg.coord_fixed(ratio=1.4)
        + gg.labs(x=gg.NULL, y=gg.NULL)
    )
    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=11, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file, width=11, height=8.5,
                          units="in", res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()
        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print("Type enter to exit.")
        input()
def plot_start(x, y):
    # This import emits a warning message, so it is done inside the
    # function: the warning only appears when the function is called.
    import rpy2.robjects.lib.ggplot2 as ggplot2
    utils = importr('utils')
    data = utils.read_csv(glob('*.csv')[0])
    plot = ggplot2.ggplot(data)
    plot = (plot +
            ggplot2.aes_string(x=x, y=y) +
            ggplot2.geom_point() +
            ggplot2.scale_colour_gradient(low="yellow", high="red") +
            ggplot2.labs(title="mtcars", x='wt', y='mpg'))
    plot.save('point.png')
def histogram(self, dataframe, filename, parm, group, units):
    with suppress_stdout():
        grdevices.png(file=filename, width=512, height=512)
        data = ggplot2.ggplot(dataframe)
        aes = ggplot2.aes_string(x=parm, fill=group)
        geom = ggplot2.geom_histogram(colour="black")
        labs = ggplot2.labs(x=parm + " " + units)
        gg = data + aes + geom + labs
        gg.plot()
        grdevices.dev_off()
def plot_similarity_matrix(self, item_type, image_file, title):
    '''Plot similarities of crawls (overlap of unique items) as heat map matrix'''
    data = defaultdict(dict)
    n = 1
    for crawl1 in self.similarity[item_type]:
        for crawl2 in self.similarity[item_type][crawl1]:
            similarity = self.similarity[item_type][crawl1][crawl2]
            data['crawl1'][n] = MonthlyCrawl.short_name(crawl1)
            data['crawl2'][n] = MonthlyCrawl.short_name(crawl2)
            data['similarity'][n] = similarity
            data['sim_rounded'][n] = similarity  # to be rounded
            n += 1
    data = pandas.DataFrame(data)
    print(data)
    # select median of similarity values as midpoint of similarity scale
    midpoint = data['similarity'].median()
    decimals = 3
    textsize = 2
    minshown = .0005
    if (data['similarity'].max() - data['similarity'].min()) > .2:
        decimals = 2
        textsize = 2.8
        minshown = .005
    data['sim_rounded'] = data['sim_rounded'].apply(
        lambda x: ('{0:.' + str(decimals) + 'f}').format(x).lstrip('0')
        if x >= minshown else '0')
    print('Median of similarities for', item_type, '=', midpoint)
    matrix_size = len(self.similarity[item_type])
    if matrix_size > self.MAX_MATRIX_SIZE:
        n = 0
        for crawl1 in sorted(self.similarity[item_type], reverse=True):
            short_name = MonthlyCrawl.short_name(crawl1)
            if n > self.MAX_MATRIX_SIZE:
                data = data[data['crawl1'] != short_name]
                data = data[data['crawl2'] != short_name]
            n += 1
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl2', y='crawl1',
                             fill='similarity', label='sim_rounded') \
        + ggplot2.geom_tile(color="white") \
        + ggplot2.scale_fill_gradient2(low="red", high="blue", mid="white",
                                       midpoint=midpoint, space="Lab") \
        + GGPLOT2_THEME \
        + ggplot2.coord_fixed() \
        + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle=45,
                                                               vjust=1, hjust=1)}) \
        + ggplot2.labs(title=title, x='', y='') \
        + ggplot2.geom_text(color='black', size=textsize)
    img_path = os.path.join(PLOTDIR, image_file)
    p.save(img_path)
    return p
def direct_taxon_abundance_box_plot(data, plot_file_path, title, xlabel, ylabel):
    grdevices.pdf(file=plot_file_path)
    gp = ggplot2.ggplot(data)
    pp = gp \
        + ggplot2.aes_string(x='genotype', y='abundance') \
        + ggplot2.geom_boxplot() \
        + ggplot2.ggtitle(title) \
        + ggplot2.labs(x=xlabel, y=ylabel) \
        + ggplot2.geom_jitter(position=ggplot2.position_jitter(w=0.1)) \
        + ggplot2.geom_point()
    pp.plot()
    grdevices.dev_off()
def plot_dupl_url(self):
    # -- pages per URL (URL-level duplicates)
    row_filter = ['url']
    data = self.histogr
    data = data[data['type'].isin(row_filter)]
    title = 'Pages per URL (URL-level duplicates)'
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='count', y='frequency') \
        + ggplot2.geom_jitter() \
        + ggplot2.facet_wrap('crawl', ncol=5) \
        + ggplot2.labs(title=title, x='(duplicate) pages per URL',
                       y='log(frequency)') \
        + ggplot2.scale_y_log10()
    # + ggplot2.scale_x_log10()  # could use log-log scale
    img_path = os.path.join(PLOTDIR, 'crawler/histogr_url_dupl.png')
    p.save(img_path)
    # data.to_csv(img_path + '.csv')
    return p
def plot_host_domain_tld(self):
    # -- pages/URLs per host / domain / tld
    data = self.histogr
    data = data[data['type'].isin(['host', 'domain', 'tld'])]
    data = data[data['type_counted'].isin(['url'])]
    img_path = os.path.join(PLOTDIR, 'crawler/histogr_host_domain_tld.png')
    # data.to_csv(img_path + '.csv')
    title = 'URLs per Host / Domain / TLD'
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='count', weight='frequency', color='type') \
        + ggplot2.geom_freqpoly(bins=20) \
        + ggplot2.facet_wrap('crawl', ncol=4) \
        + ggplot2.labs(title='', x=title, y='Frequency') \
        + ggplot2.scale_y_log10() \
        + ggplot2.scale_x_log10()
    p.save(img_path)
    return p
def plot_histogram(fastq_file, plot_filename_png):
    """Plots a histogram of the read length distribution in fastq_file
    and saves it to plot_filename_png"""
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')
    sizes = []
    with open(fastq_file, 'rb') as f:
        # skip the first line (record header)
        for _ in itertools.islice(f, 0, 1):
            pass
        # every 4th line from here holds the raw sequence letters
        fourthlines = itertools.islice(f, 0, None, 4)
        for line in fourthlines:
            sizes.append(len(line.strip()))
    sizes = robjects.IntVector(sizes)
    sizes_min = min(sizes)
    sizes_max = max(sizes)
    binwidth = (sizes_max - sizes_min) / 20
    df = robjects.DataFrame({'sizes': sizes})
    # plot
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='sizes') \
        + ggplot2.geom_histogram(binwidth=binwidth) \
        + ggplot2.theme_grey() \
        + ggplot2.labs(title=plot_filename_png,
                       x="Size (in nucleotides)", y="Count")
    grdevices.png(plot_filename_png, width=8.5, height=8.5,
                  units="in", res=300)
    pp.plot()
    grdevices.dev_off()
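A hypothetical invocation of the function above; both file names are placeholders, not paths from the original project:

    plot_histogram('reads.fastq', 'reads_length_hist.png')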
def scatter(self, dataframe, filename, parm1, parm2, units1, units2,
            group, logx, logy):
    grdevices.png(file=filename, width=512, height=512)
    data = ggplot2.ggplot(dataframe)
    aes = ggplot2.aes_string(x=parm1, y=parm2, colour=group)
    geom = ggplot2.geom_point(alpha=0.7)
    labs = ggplot2.labs(x=parm1 + " " + units1, y=parm2 + " " + units2)
    gg = data + aes + geom + labs
    # add log scales as requested
    if logx:
        gg += ggplot2.scale_x_log10()
    if logy:
        gg += ggplot2.scale_y_log10()
    gg.plot()
    grdevices.dev_off()
def plot_stacked_bar(self, data, row_filter, img_file, ratio=1.0):
    if len(row_filter) > 0:
        data = data[data['type'].isin(row_filter)]
    for value in row_filter:
        if re.search('^fetcher:(?:aggr:)?', value):
            replacement = re.sub('^fetcher:(?:aggr:)?', '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='percentage', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=True)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom', 'aspect.ratio': ratio}) \
        + ggplot2.labs(title='Percentage of Fetch Status', x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path)
    return p
def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
    if len(row_filter) > 0:
        data = data[data['type'].isin(row_filter)]
    for value in row_filter:
        if re.search('^fetcher:(?:aggr:)?', value):
            replacement = re.sub('^fetcher:(?:aggr:)?', '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='percentage', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=True)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom', 'aspect.ratio': ratio}) \
        + ggplot2.labs(title='Percentage of Fetch Status', x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path)
    return p
def rpy2_plotter(anno, clusters, name):
    """Plot the distribution of genes across clusters using ggplot2 from R."""
    pandas2ri.activate()
    grdevices = importr('grDevices')
    rprint = robjects.globalenv.get("print")
    anno = anno.sort_values(by="n_ft", ascending=False)
    anno = anno.head(n=10)
    category = anno["category"].tolist()
    clusters = clusters[clusters["category"].isin(category)]
    clusters = pandas2ri.py2ri(clusters)
    pp = ggplot2.ggplot(clusters) \
        + ggplot2.aes_string(x="n_features") \
        + ggplot2.geom_histogram(binwidth=1) \
        + ggplot2.facet_wrap(robjects.Formula("~category"), ncol=5) \
        + ggplot2.labs(x="Number of Features", y="Number of Clusters",
                       title="Clusters distribution")
    grdevices.pdf(file=name, width=11.692, height=8.267)
    rprint(pp)
    grdevices.dev_off()
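Note that `pandas2ri.py2ri` was removed in rpy2 3.x. A sketch of the equivalent conversion under the rpy2 3.x converter interface, following the pattern from the rpy2 documentation (`clusters` is the pandas DataFrame from the snippet above):

    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter

    # convert a pandas DataFrame to an R data.frame (rpy2 >= 3.0)
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_clusters = ro.conversion.py2rpy(clusters)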
def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
    if len(row_filter) > 0:
        data = data[data['type'].isin(row_filter)]
    for value in row_filter:
        if re.search('^crawldb:status:db_', value):
            replacement = re.sub('^crawldb:status:db_', '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
    data['size'] = data['size'].astype(float)
    print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='size', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='Pastel1', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=False)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom', 'aspect.ratio': ratio}) \
        + ggplot2.labs(title='CrawlDb Size and Status Counts (before crawling)',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path)
    return p
def plot_domain_cumul(self, crawl):
    # -- coverage (cumulative pages) per domain
    data = self.histogr
    data = data[data['type'].isin(['domain'])]
    data = data[data['crawl'] == crawl]
    data = data[data['type_counted'].isin(['url'])]
    data['urls'] = data['count'] * data['frequency']
    print(data)
    data = data[['urls', 'count', 'frequency']]
    data = data.sort_values(['count'], ascending=False)
    data['cum_domains'] = data['frequency'].cumsum()
    data['cum_urls'] = data['urls'].cumsum()
    data_perc = data.apply(lambda x: round(100.0 * x / float(x.sum()), 1))
    data['%domains'] = data_perc['frequency']
    data['%urls'] = data_perc['urls']
    data['%cum_domains'] = data['cum_domains'].apply(
        lambda x: round(100.0 * x / float(data['frequency'].sum()), 1))
    data['%cum_urls'] = data['cum_urls'].apply(
        lambda x: round(100.0 * x / float(data['urls'].sum()), 1))
    with pandas.option_context('display.max_rows', None,
                               'display.max_columns', None,
                               'display.width', 200):
        print(data)
    img_path = os.path.join(PLOTDIR, 'crawler/histogr_domain_cumul.png')
    # data.to_csv(img_path + '.csv')
    title = 'Cumulative URLs for Top Domains'
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='cum_domains', y='cum_urls') \
        + ggplot2.geom_line() + ggplot2.geom_point() \
        + GGPLOT2_THEME \
        + ggplot2.labs(title=title, x='domains cumulative',
                       y='URLs cumulative') \
        + ggplot2.scale_y_log10() \
        + ggplot2.scale_x_log10()
    p.save(img_path)
    return p
def plot_volcano_with_r(
        data,
        xlabel='Estimated effect (change in H/L ratio)',
        title='',
        max_labels=20,
        color_background='#737373',
        color_significant='#252525',
        color_significant_muted='#252525',
        label_only_large_fc=False,
        special_labels=None,
        special_palette=None,
        base_size=12,
        label_size=3,
        x='logFC',
        y='neg_log10_p_adjust',
        special_labels_mode='all',
        xlim=None,
        skip_labels=None,
        nudges=None,
):
    r_data, r_like_data = transform_data_for_ggplot(
        data,
        label_only_large_fc=label_only_large_fc,
        special_labels=special_labels,
        max_labels=max_labels,
        special_labels_mode=special_labels_mode,
        skip_labels=skip_labels,
        nudges=nudges)
    plot = r_ggplot2.ggplot(r_data)
    plot += r_ggplot2.theme_minimal(base_size=base_size)
    plot += r_ggplot2.theme(**{
        'panel.grid.major': r_ggplot2.element_blank(),
        'panel.grid.minor': r_ggplot2.element_blank(),
        'panel.border': r_ggplot2.element_rect(fill=robjects.rinterface.NA,
                                               color="black")
    })
    plot += r_ggplot2.theme(
        text=r_ggplot2.element_text(family='Helvetica', face='plain'))
    plot += r_ggplot2.theme(**{
        'plot.title': r_ggplot2.element_text(hjust=0.5),
    })
    aes_points = r_ggplot2.aes_string(x=x, y=y, color='group')
    scale_points = r_ggplot2.scale_colour_manual(
        aes_points,
        values=r_label_palette(
            r_like_data,
            special_palette,
            color_background=color_background,
            color_significant=color_significant,
            color_significant_muted=color_significant_muted))
    plot += aes_points
    plot += scale_points
    if xlim is not None:
        plot += r_ggplot2.scale_x_continuous(
            labels=r_custom.formatterFunTwoDigits,
            limits=robjects.r.c(*xlim))
    else:
        plot += r_ggplot2.scale_x_continuous(
            labels=r_custom.formatterFunTwoDigits)
    plot += r_ggplot2.scale_y_continuous(labels=r_custom.formatterFunOneDigit)
    # threshold guides: horizontal FDR cut-off, vertical fold-change cut-offs
    plot += r_ggplot2.geom_hline(
        yintercept=float(-np.log10(FDR_THRESHOLD_RESPONSE)),
        color='#BDBDBD', alpha=.3)
    plot += r_ggplot2.geom_vline(xintercept=float(FC_THRESHOLD_RESPONSE),
                                 color='#BDBDBD', alpha=.3)
    plot += r_ggplot2.geom_vline(xintercept=-float(FC_THRESHOLD_RESPONSE),
                                 color='#BDBDBD', alpha=.3)
    plot += r_ggplot2.geom_point(**{'show.legend': False})
    aes_text = r_ggplot2.aes_string(label='label')
    plot += aes_text
    plot += r_ggrepel.geom_text_repel(
        aes_text,
        nudge_x=r_dollar(r_data, 'nudgex'),
        nudge_y=r_dollar(r_data, 'nudgey'),
        size=label_size,
        family='Helvetica',
        **{
            'show.legend': False,
            'point.padding': 0.25,
            'min.segment.length': 0,
            'segment.color': '#BDBDBD'
        },
    )
    plot += r_ggplot2.labs(x=xlabel, y='Adjusted p value (-log10)', title=title)
    plot.plot()
def rest():
    df = q1_median_q3_rep_wide
    pops = ["pdc", "dc-cd11b", "dc-cd8a"]
    stats_l = []
    for stat, (popa, popb) in product(["Q1", "median", "Q3"],
                                      product(pops, pops)):
        print(stat, popa, popb)
        # note: these overrides pin every iteration to the same fixed
        # comparison on hard-coded values (exploratory/debugging code);
        # the commented df.query() lines show the intended inputs
        popa = "hsc"
        popb = "pdc"
        stat = "median"
        mw_u, pvalue = scipy.stats.mannwhitneyu(
            [0.8, 0.81, 0.79],
            [0.4, 0.39, 0.41],
            # df.query("Population == @popa")[stat].to_numpy(),
            # df.query("Population == @popb")[stat].to_numpy(),
            use_continuity=True,
            alternative="two-sided",
        )
        stats_l.append([stat, popa, popb, mw_u, pvalue])
    stats_df = pd.DataFrame(stats_l).set_axis(
        ["stat", "popA", "popB", "U", "pvalue"], axis=1)

    kruskal_format_means = pd.pivot(
        q1_median_q3_rep_wide.query("Population in @pops"),
        index="Population",
        columns="Replicate",
        values="mean",
    )
    import scikit_posthocs
    stat, p_value = scipy.stats.kruskal(
        *[kruskal_format_means.loc[pop].to_numpy() for pop in pops])
    dunn_res_df = scikit_posthocs.posthoc_dunn(
        kruskal_format_means.to_numpy(), p_adjust='fdr_bh', sort=True)
    stat, pvalue = scipy.stats.f_oneway(
        *[kruskal_format_means.loc[pop].to_numpy() for pop in pops])

    import statsmodels.stats.multicomp
    df = kruskal_format_means.stack().reset_index()
    res = statsmodels.stats.multicomp.pairwise_tukeyhsd(
        df[0], df['Population'].to_numpy(), alpha=0.05)
    print(res.summary())
    # R equivalent:
    # wilcox.test(c(0.8, 0.79, 0.81), c(0.4, 0.39, 0.41), paired=F, exact=F)

    plot_pops = ["pdc", "dc-cd8a", "dc-cd11b"]
    results_dir = "/icgc/dkfzlsdf/analysis/hs_ontogeny/notebook-data/gNs4xcMJscaLLwlt"
    point_plot_quartiles_png = results_dir + "/point-plot-quartiles.png"
    ggplot_data = (
        q1_median_q3_rep_long.query("Population in @plot_pops")
        .sort_values("value", ascending=False)
        .groupby(["Population", "stat"])
        .apply(lambda df: df.assign(group_order=np.arange(1, df.shape[0] + 1))))
    g = (gg.ggplot(ggplot_data)
         + gg.aes_string(x="Population", y="value",
                         group="group_order", color="stat")
         + gg.geom_point(position=gg.position_dodge(width=0.5), size=1)
         + mh_rpy2_styling.gg_paper_theme
         + gg.labs(y='Methylation (%)', x=''))
    rpy2_utils.image_png2(g, (ut.cm(6), ut.cm(6)))
    ut.save_and_display(
        g,
        png_path=point_plot_quartiles_png,
        height=ut.cm(6),
        width=ut.cm(6),
    )

    g = (gg.ggplot(
            q1_median_q3_rep_wide.query("Population in @plot_pops").assign(
                sample=lambda df: df["Population"].astype(str)
                + df["Replicate"].astype(str)))
         + gg.geom_boxplot(
             gg.aes_string(
                 x="Population",
                 fill="Population",
                 group="sample",
                 lower="Q1",
                 upper="Q3",
                 middle="median",
                 ymin="min1",
                 ymax="max99",
             ),
             stat="identity",
         )
         + gg.theme(axis_text_x=gg.element_text(angle=90, hjust=1))
         + gg.scale_fill_brewer(guide=False))
    ut.save_and_display(
        g,
        png_path=point_plot_quartiles_png,
        additional_formats=tuple(),
        height=ut.cm(6),
        width=ut.cm(7),
    )
    beta_values.loc[:, ("hsc", "1")]
def test_labs(self):
    la = ggplot2.labs()
    assert isinstance(la, ggplot2.Labs)
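For context, a minimal sketch of how the `Labs` object the test checks composes into a plot; `dataf` and its columns `x` and `y` are assumed, not part of the test suite:

    import rpy2.robjects.lib.ggplot2 as ggplot2
    # labs() adds titles/axis labels like any other ggplot2 layer
    p = ggplot2.ggplot(dataf) \
        + ggplot2.aes_string(x='x', y='y') \
        + ggplot2.geom_point() \
        + ggplot2.labs(title='Example', x='x value', y='y value')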
hour_speed

# Pickups by hour (aggregated across all days)
max_passenger_count = np.max(np.log1p(pickups_hr['passenger_count']))
min_passenger_count = np.log1p(1)
for i in range(0, 24):
    temp = pickups_hr[(pickups_hr.hour == i)].reset_index()
    temp.passenger_count = np.log1p(temp.passenger_count)
    temp_r = pandas2ri.py2ri(temp)
    p = ggplot2.ggplot(temp_r) + \
        ggplot2.aes_string(x='rounded_lon', y='rounded_lat',
                           color='passenger_count') + \
        ggplot2.geom_point(size=0.5) + \
        ggplot2.scale_color_gradient(low='black', high='white',
                                     limits=np.array([min_passenger_count,
                                                      max_passenger_count])) + \
        ggplot2.xlim(-74.2, -73.7) + ggplot2.ylim(40.56, 40.93) + \
        ggplot2.labs(x=' ', y=' ', title='NYC taxi pickups %02i:00' % i) + \
        ggplot2.guides(color=False)
    p.save('./plots/taxi_pickups%02i.png' % i, width=4, height=4.5)

# Create an animated .gif of pickups by hour
import imageio
file_names = sorted(fn for fn in os.listdir('./plots')
                    if fn.startswith('taxi_pickups'))
file_names = ['plots/' + s for s in file_names]
images = []
for filename in file_names:
    images.append(imageio.imread(filename))
imageio.mimsave('./plots/pickups_movie.gif', images, duration=0.4)

# total pickups by date, color
p1 = ggplot2.ggplot(pandas2ri.py2ri(date_avgs)) + \
from rpy2.robjects.lib import ggplot2
p = ggplot2.ggplot(dataf) + \
    ggplot2.geom_line(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.geom_point(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.facet_wrap(Formula('~sequence')) + \
    ggplot2.scale_y_continuous('running time') + \
    ggplot2.scale_x_continuous('repeated n times') + \
    ggplot2.xlim(0, max(n_loops)) + \
    ggplot2.labs(title="Benchmark (running time)")

from rpy2.robjects.packages import importr
grdevices = importr('grDevices')
grdevices.png('../../_static/benchmark_sum.png', width=712, height=512)
p.plot()
grdevices.dev_off()

stats = importr('stats')
nlme = importr("nlme")
fit = nlme.lmList(Formula('time ~ n_loop | group'), data=dataf,
                  na_action=stats.na_exclude)
def show1():
    open1()
    r.source('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/R/head1.r',
             encoding="utf-8")
    data = DataFrame.from_csvfile('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/temp/day1.csv')
    pp = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='project', y='time', fill='project') \
        + ggplot2.geom_bar(stat='identity') \
        + ggplot2.ggtitle("Distribution of today's time by project") \
        + ggplot2.labs(x='Project', y='Time (min)') \
        + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle=45)})
    pp.plot()
def show4():
    open4()
    r.source('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/R/end.R',
             encoding="utf-8")
    data = DataFrame.from_csvfile('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/temp/project2.csv')
    pp = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='day', y='time', fill='factor(project)') \
        + ggplot2.geom_bar(stat='identity', position='dodge') \
        + ggplot2.ggtitle("Time comparison of two projects") \
        + ggplot2.labs(x='Date', y='Time (min)') \
        + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle=45)})
    pp.plot()
linetype='variable')
line = ggplot2.geom_line()
point = ggplot2.geom_point()
vert_line_onset = ggplot2.geom_vline(xintercept=-1.5, linetype=2,
                                     colour="#999999")
vert_line_exhaust = ggplot2.geom_vline(xintercept=5.5, linetype=2,
                                       colour="#999999")
vert_line_exhaust_FL = ggplot2.geom_vline(xintercept=3.5, linetype=2,
                                          colour="#999999")
colors = ggplot2.scale_colour_manual(values=robjects.r.palette_lines)
hollow = ggplot2.scale_shape_manual(
    values=robjects.r('c(16,17,15,18,6,7,9,3)'))
xlab = ggplot2.labs(x="Months Since First UI Check")
loc_default = robjects.r('c(1,0)')
legend_f = lambda loc=loc_default: ggplot2.theme(**{
    'legend.position': loc,
    'legend.justification': loc
})
ggsave = lambda filename, plot: robjects.r.ggsave(
    filename="../out/" + filename + ".pdf", plot=plot, width=6, height=4)
colors_alt = ggplot2.scale_colour_manual(values=robjects.r.palette_lines[1])
shape_alt = ggplot2.scale_shape_manual(values=17)
ggplot2_env = robjects.baseenv['as.environment']('package:ggplot2')


class GBaseObject(robjects.RObject):
[d['code'][x] + ':' + d['sequence'][x] for x in range(len(d['n_loop']))])
dataf = DataFrame(d)

from rpy2.robjects.lib import ggplot2
p = ggplot2.ggplot(dataf) + \
    ggplot2.geom_line(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.geom_point(ggplot2.aes_string(x="n_loop", y="time", colour="code")) + \
    ggplot2.facet_wrap(Formula('~sequence')) + \
    ggplot2.scale_y_continuous('running time') + \
    ggplot2.scale_x_continuous('repeated n times') + \
    ggplot2.xlim(0, max(n_loops)) + \
    ggplot2.labs(title="Benchmark (running time)")

from rpy2.robjects.packages import importr
grdevices = importr('grDevices')
grdevices.png('../../_static/benchmark_sum.png',
              width=812, height=612, type="cairo")
p.plot()
grdevices.dev_off()

stats = importr('stats')
nlme = importr("nlme")
fit = nlme.lmList(Formula('time ~ n_loop | group'), data=dataf,
def _plt_percountr(dat, independentpdf=False, fname='xpercount.pdf'):
    def _filt_dat(dat, item, getlabel=True):
        df = pd.DataFrame(dat[item].value_counts())
        df.columns = ['count']
        if getlabel:
            df['label'] = [list(dat[dat[item] == i]['label'])[0]
                           for i in df.index]
        n = len(df)
        mx = max(df['count'])
        return df, n, mx

    dat = dat[dat['label'] != 'NA']

    ## NUMBER OF MIRNA PER TSS
    df, n, mx = _filt_dat(dat, 'tss', False)
    df = {'count': robjects.IntVector(df['count'])}
    df = robjects.DataFrame(df)

    pt = ggplot2.ggplot(df) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5,
                               position="identity") + \
        ggplot2.xlim(-.5, mx + 1) + \
        ggplot2.aes_string(x='count') + \
        ggplot2.ggtitle('TSS [Total = %s]' % n) + \
        ggplot2.labs(x='Number of miRNA per TSS (max = %s)' % mx)

    pt_den = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='count', y='..density..') + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity',
                               origin=-.5) + \
        ggplot2.ggtitle('TSS [Total = %s]' % n) + \
        ggplot2.labs(x='Number of miRNA per TSS (max = %s)' % mx)

    ## NUMBER OF TSS PER MIRNA
    df, n, mx = _filt_dat(dat, 'mirna')
    df = {'count': robjects.IntVector(df['count']),
          'label': robjects.StrVector(df['label'])}
    df = robjects.DataFrame(df)

    _pm = ggplot2.ggplot(df) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5,
                               position="identity") + \
        ggplot2.xlim(-.5, mx + 1) + \
        ggplot2.ggtitle('miRNA [Total = %s]' % n)

    _pm_den = ggplot2.ggplot(df) + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity',
                               origin=-.5) + \
        ggplot2.ggtitle('miRNA [Total = %s]' % n)

    ## not split by label
    pm = _pm + ggplot2.aes_string(x='count')
    pm_den = _pm_den + ggplot2.aes_string(x='count', y='..density..')

    ## split by label
    pms = _pm + ggplot2.aes_string(x='count', fill='label')
    pm_dens = _pm_den + ggplot2.aes_string(x='count', fill='label',
                                           y='..density..')

    ## add x labelling (needs to be added after aes_string)
    _xlab = ggplot2.labs(x='Number of TSS per miRNA (max = %s)' % mx)
    pm += _xlab
    pm_den += _xlab
    pms += _xlab
    pm_dens += _xlab

    plots = [pt, pt_den, pm, pm_den, pms, pm_dens]
    if independentpdf:
        grdevices = importr('grDevices')
        grdevices.pdf(fname)
        for plot in plots:
            plot.plot()
        grdevices.dev_off()
    else:
        for plot in plots:
            plot.plot()
    return
import pandas as pd
import rpy2.robjects.packages as packages
import rpy2.robjects.lib.ggplot2 as ggplot2
import rpy2.robjects as ro

# Import the mtcars dataset shipped with R
R = ro.r
datasets = packages.importr('datasets')
mtcars = packages.data(datasets).fetch('mtcars')['mtcars']

# Build the plot with ggplot2
gp = ggplot2.ggplot(mtcars)
pyplot = (gp
          + ggplot2.aes_string(x='wt', y='mpg')
          + ggplot2.geom_point(ggplot2.aes_string(colour='qsec'))
          + ggplot2.scale_colour_gradient(low="yellow", high="red")
          + ggplot2.geom_smooth(method='auto')
          + ggplot2.labs(title="mtcars", x='wt', y='mpg'))
pyplot.plot()

print("\nAnalysis of Variance")
print("--------------------")

import rpy2.robjects as robjects
r = robjects.r
controle = robjects.FloatVector([4.17, 5.58, 5.18, 6.11, 4.50, 4.61,
                                 5.17, 4.53, 5.33, 5.14])
tratamento = robjects.FloatVector([4.81, 4.17, 4.41, 3.59, 5.87, 3.83,
                                   6.03, 4.89, 4.32, 4.69])
grupo = r.gl(2, 10, 20, labels=["Controle", "Tratamento"])
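The snippet stops after building the two response vectors and the grouping factor. A minimal sketch of the one-way ANOVA this setup points toward; `peso` is an assumed name for the combined response vector, not from the original code:

    # combine both groups into a single response vector (assumed continuation)
    peso = robjects.FloatVector(list(controle) + list(tratamento))
    robjects.globalenv['peso'] = peso
    robjects.globalenv['grupo'] = grupo
    fit = r.aov(robjects.Formula('peso ~ grupo'))  # one-way ANOVA
    print(r.summary(fit))                          # F statistic and p-value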