def compare_sum_barplot(locus_table, interval_table, intervals, loci, names,
        rows):
    frame = get_r_data_by_top(locus_table, interval_table, intervals, names,
            rows)
    #pdb.set_trace()
    frame2 = robjects.r('''agg_data <- aggregate(pi ~ interval + db, data = data, sum)''')
    if len(intervals) > 1:
        sort_string = '''agg_data$interval <- factor(agg_data$interval,{})'''.format(order_intervals(frame2[0]))
        robjects.r(sort_string)
    gg_frame = ggplot2.ggplot(robjects.r('''agg_data'''))
    plot = gg_frame + \
        ggplot2.aes_string(
                x = 'interval',
                y = 'pi',
                fill = 'factor(db)'
            ) + \
        ggplot2.geom_bar(**{
                'stat': 'identity',  # values are already summed by aggregate()
                'position': 'dodge',
                'colour': '#767676',
                'alpha': 0.6
            }) + \
        ggplot2.scale_y_continuous('net phylogenetic informativeness') + \
        ggplot2.scale_x_discrete('interval (years ago)') + \
        ggplot2.scale_fill_brewer("database", palette="Blues")
    return plot
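The snippets in this file all rely on the same rpy2 setup; below is a minimal sketch of the imports and module-level objects they assume. PLOTDIR and GGPLOT2_THEME are placeholders here, the original modules define their own values.

import math
import os
import re
import sys

import pandas
import rpy2.robjects as robjects
import rpy2.robjects.lib.ggplot2 as ggplot2
import rpy2.rlike.container as rlc
from rpy2.robjects.packages import importr

grdevices = importr('grDevices')    # R grDevices, for pdf()/dev.off()
PLOTDIR = 'plots'                   # placeholder output directory
GGPLOT2_THEME = ggplot2.theme_bw()  # placeholder for the shared plot theme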
def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    categories = []
    for value in row_filter:
        if re.search('^crawldb:status:db_', value):
            replacement = re.sub('^crawldb:status:db_', '', value)
            categories.append(replacement)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # list.reverse() reverses in place and returns None, so the reversed list
    # must be built explicitly before passing it as `categories`
    data['type'] = pandas.Categorical(data['type'], ordered=True,
                                      categories=list(reversed(categories)))
    data['size'] = data['size'].astype(float)
    ratio = 0.1 + len(data['crawl'].unique()) * .03
    print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='size', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='Pastel1', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=False)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='CrawlDb Size and Status Counts\n(before crawling)',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path, height=int(7 * ratio), width=7)
    return p
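A hypothetical invocation of plot_crawldb_status above: the column names ('crawl', 'type', 'size') follow the code, but the crawl ids, counts, and the `stats` object that holds the method are made up for illustration.

crawldb_counts = pandas.DataFrame({
    'crawl': ['CC-MAIN-2024-10', 'CC-MAIN-2024-10',
              'CC-MAIN-2024-18', 'CC-MAIN-2024-18'],
    'type': ['crawldb:status:db_fetched', 'crawldb:status:db_unfetched'] * 2,
    'size': [2.1e9, 1.4e9, 2.3e9, 1.2e9],
})
stats.plot_crawldb_status(crawldb_counts,
                          row_filter=['crawldb:status:db_fetched',
                                      'crawldb:status:db_unfetched'],
                          img_file='crawldb_status.png')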
def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    data = data[['crawl', 'percentage', 'type']]
    categories = []
    for value in row_filter:
        if re.search('^fetcher:(?:aggr:)?', value):
            replacement = re.sub('^fetcher:(?:aggr:)?', '', value)
            categories.append(replacement)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # list.reverse() returns None; build the reversed category list explicitly
    data['type'] = pandas.Categorical(data['type'], ordered=True,
                                      categories=list(reversed(categories)))
    ratio = 0.1 + len(data['crawl'].unique()) * .03
    # print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='percentage', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=True)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='Percentage of Fetch Status',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path, height=int(7 * ratio), width=7)
    return p
def plot(data, filename, title, ggplotter,
         xid="N", yid="RunTime", factorid="Step"):
    df = make_dataframe(data, xid, yid, factorid)
    grdevices.pdf(file=filename, width=10, height=6)
    gp = ggplot2.ggplot(df)
    pp = gp + \
        ggplot2.aes_string(x=xid, y=yid) + \
        ggplot2.aes_string(size=.5) + \
        ggplotter() + \
        ggplot2.aes_string(colour='factor(%s)' % factorid) + \
        ggplot2.aes_string(fill='factor(%s)' % factorid) + \
        ggplot2.labs(title=title) + \
        ggplot2.scale_fill_brewer(palette="Set2") + \
        ggplot2.scale_colour_brewer(palette="Set2")
    # note: ggplot2.opts() was removed from ggplot2; labs() sets the title now
    pp.plot()
    grdevices.dev_off()
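A sketch of how plot() might be called, assuming make_dataframe() accepts a dict of equal-length columns keyed by the xid/yid/factorid names (that helper is defined elsewhere in the original module); any geom constructor can be passed as ggplotter.

timings = {
    "N": [10, 100, 1000, 10, 100, 1000],
    "RunTime": [0.12, 0.95, 8.4, 0.30, 2.1, 17.6],
    "Step": ["parse", "parse", "parse", "solve", "solve", "solve"],
}
plot(timings, "runtime_by_step.pdf", "Runtime by step", ggplot2.geom_point)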
def compare_mean_boxplot(locus_table, interval_table, intervals, loci, names,
        rows):
    frame = get_r_data_by_top(locus_table, interval_table, intervals, names,
            rows)
    if len(intervals) > 1:
        sort_string = '''data$interval <- factor(data$interval, {})'''.format(order_intervals(frame[1]))
        robjects.r(sort_string)
    gg_frame = ggplot2.ggplot(robjects.r('''data'''))
    plot = gg_frame + ggplot2.aes_string(x = 'interval', y = 'pi') + \
        ggplot2.geom_boxplot(ggplot2.aes_string(fill = 'factor(db)'), **{
                'outlier.size': 3,
                'outlier.colour': '#767676',
                'outlier.alpha': 0.3,
                'alpha': 0.6
            }) + \
        ggplot2.scale_y_continuous('mean phylogenetic informativeness') + \
        ggplot2.scale_x_discrete('interval (years ago)') + \
        ggplot2.scale_fill_brewer("database", palette='Blues')
    return plot
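A sketch of rendering the returned ggplot object to PDF with grdevices, following the pdf()/plot()/dev.off() pattern used elsewhere in this file; the table names, intervals, database names, and row count passed to compare_mean_boxplot are placeholders.

plot = compare_mean_boxplot('loci', 'loci_by_interval',
                            [0, 25, 50, 75, 100], None,
                            ['uce', 'exon'], 25)
grdevices.pdf(file='mean_boxplot.pdf', width=7, height=7)
plot.plot()
grdevices.dev_off()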
def plot_stacked_bar(self, data, row_filter, img_file, ratio=1.0):
    if len(row_filter) > 0:
        data = data[data['type'].isin(row_filter)]
    for value in row_filter:
        if re.search('^fetcher:(?:aggr:)?', value):
            replacement = re.sub('^fetcher:(?:aggr:)?', '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='percentage', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=True)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='Percentage of Fetch Status',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path)
    return p
def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
    if len(row_filter) > 0:
        data = data[data['type'].isin(row_filter)]
    for value in row_filter:
        if re.search('^fetcher:(?:aggr:)?', value):
            replacement = re.sub('^fetcher:(?:aggr:)?', '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
    # print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='percentage', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=True)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='Percentage of Fetch Status',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path)
    return p
def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
    if len(row_filter) > 0:
        data = data[data['type'].isin(row_filter)]
    for value in row_filter:
        if re.search('^crawldb:status:db_', value):
            replacement = re.sub('^crawldb:status:db_', '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
    data['size'] = data['size'].astype(float)
    print(data)
    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='size', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='Pastel1', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=False)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='CrawlDb Size and Status Counts (before crawling)',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    p.save(img_path)
    return p
def rest():
    # Notebook-style exploration. q1_median_q3_rep_wide, q1_median_q3_rep_long,
    # beta_values, mh_rpy2_styling, rpy2_utils and ut are defined elsewhere;
    # product comes from itertools, gg is rpy2's ggplot2 module.
    df = q1_median_q3_rep_wide
    pops = ["pdc", "dc-cd11b", "dc-cd8a"]

    # Pairwise Mann-Whitney U tests for each quartile statistic.
    stats_l = []
    for stat, (popa, popb) in product(["Q1", "median", "Q3"],
                                      product(pops, pops)):
        print(stat, popa, popb)
        mw_u, pvalue = scipy.stats.mannwhitneyu(
            df.query("Population == @popa")[stat].to_numpy(),
            df.query("Population == @popb")[stat].to_numpy(),
            use_continuity=True,
            alternative="two-sided",
        )
        stats_l.append([stat, popa, popb, mw_u, pvalue])
    stats_df = pd.DataFrame(stats_l).set_axis(
        ["stat", "popA", "popB", "U", "pvalue"], axis=1)

    # Kruskal-Wallis with Dunn post-hoc test (and ANOVA/Tukey for comparison)
    # on the per-replicate means.
    kruskal_format_means = pd.pivot(
        q1_median_q3_rep_wide.query("Population in @pops"),
        index="Population",
        columns="Replicate",
        values="mean",
    )
    import scikit_posthocs
    stat, p_value = scipy.stats.kruskal(
        *[kruskal_format_means.loc[pop].to_numpy() for pop in pops])
    dunn_res_df = scikit_posthocs.posthoc_dunn(
        kruskal_format_means.to_numpy(), p_adjust='fdr_bh', sort=True)
    stat, pvalue = scipy.stats.f_oneway(
        *[kruskal_format_means.loc[pop].to_numpy() for pop in pops])
    import statsmodels.stats.multicomp
    df = kruskal_format_means.stack().reset_index()
    res = statsmodels.stats.multicomp.pairwise_tukeyhsd(
        df[0], df['Population'].to_numpy(), alpha=0.05)
    res.summary()
    # R equivalent:
    # wilcox.test(c(0.8, 0.79, 0.81), c(0.4, 0.39, 0.41), paired=F, exact=F)

    # Dodged point plot of the quartile statistics per population.
    plot_pops = ["pdc", "dc-cd8a", "dc-cd11b"]
    results_dir = "/icgc/dkfzlsdf/analysis/hs_ontogeny/notebook-data/gNs4xcMJscaLLwlt"
    point_plot_quartiles_png = results_dir + "/point-plot-quartiles.png"
    ggplot_data = (
        q1_median_q3_rep_long.query("Population in @plot_pops")
        .sort_values("value", ascending=False)
        .groupby(["Population", "stat"])
        .apply(lambda df: df.assign(group_order=np.arange(1, df.shape[0] + 1))))
    g = (gg.ggplot(ggplot_data)
         + gg.aes_string(x="Population", y="value",
                         group="group_order", color="stat")
         + gg.geom_point(position=gg.position_dodge(width=0.5), size=1)
         + mh_rpy2_styling.gg_paper_theme
         + gg.labs(y='Methylation (%)', x=''))
    rpy2_utils.image_png2(g, (ut.cm(6), ut.cm(6)))
    ut.save_and_display(
        g,
        png_path=point_plot_quartiles_png,
        # additional_formats=tuple(),
        height=ut.cm(6),
        width=ut.cm(6),
    )

    # Boxplot built directly from the precomputed quartiles (stat='identity').
    g = (gg.ggplot(
            q1_median_q3_rep_wide.query("Population in @plot_pops").assign(
                sample=lambda df: df["Population"].astype(str)
                + df["Replicate"].astype(str)))
         + gg.geom_boxplot(
             gg.aes_string(
                 x="Population",
                 fill="Population",
                 group="sample",
                 lower="Q1",
                 upper="Q3",
                 middle="median",
                 ymin="min1",
                 ymax="max99",
             ),
             stat="identity",
         )
         # + mh_rpy2_styling.gg_paper_theme
         + gg.theme(axis_text_x=gg.element_text(angle=90, hjust=1))
         + gg.scale_fill_brewer(guide=False))
    ut.save_and_display(
        g,
        png_path=point_plot_quartiles_png,
        additional_formats=tuple(),
        height=ut.cm(6),
        width=ut.cm(7),
    )
    # image_png2(g, (ut.cm(12), ut.cm(12)))
    beta_values.loc[:, ("hsc", "1")]
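A self-contained sketch of the test workflow rest() runs, on synthetic numbers (the real inputs come from q1_median_q3_rep_wide, which is defined outside this excerpt): a pairwise Mann-Whitney U test, then Kruskal-Wallis followed by Dunn's post-hoc test.

import scipy.stats
import scikit_posthocs

groups = {
    "pdc": [0.80, 0.81, 0.79],
    "dc-cd11b": [0.40, 0.39, 0.41],
    "dc-cd8a": [0.55, 0.57, 0.54],
}
u, p_mw = scipy.stats.mannwhitneyu(groups["pdc"], groups["dc-cd11b"],
                                   use_continuity=True,
                                   alternative="two-sided")
h, p_kw = scipy.stats.kruskal(*groups.values())
dunn = scikit_posthocs.posthoc_dunn(list(groups.values()), p_adjust='fdr_bh')
print(u, p_mw, h, p_kw)
print(dunn)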
# print(str(a))
try:
    if drug in dsumFC:  # drug, val, yr come from the file-parsing loop above
        dsumFC[drug]['Fold_Change'].append(math.log10(float(val)))
        dsumY[drug]['Year'].append(yr)
    else:
        dsumFC[drug] = {'Fold_Change': [math.log10(float(val))]}
        dsumY[drug] = {'Year': [yr]}
except Exception:
    print("FAILURE: dsumFC=" + str(dsumFC) + "\n\ndsumY=" + str(dsumY))
    sys.exit()

drugs = dsumFC.keys()
for x in drugs:
    od = rlc.OrdDict([
        ('Fold_Change', robjects.FloatVector(dsumFC[x]['Fold_Change'])),
        ('Year', robjects.FactorVector(dsumY[x]['Year'])),
        ('Drug', robjects.FactorVector([x])),
    ])
    # note: drugs.pdf is reopened (and overwritten) on every iteration
    grdevices.pdf(file="drugs.pdf", width=7, height=7)
    dataf = robjects.DataFrame(od)
    gp3 = ggplot2.ggplot(dataf)
    # ggplot2.opts() was removed from ggplot2; labs() sets the title instead
    pp3 = gp3 + ggplot2.scale_fill_brewer(palette='BrBG', name="Year") + \
        ggplot2.aes_string(x='Year', y='Fold_Change', fill='factor(Year)') + \
        ggplot2.geom_boxplot() + \
        ggplot2.labs(title=x + " Yearly Trend")
    # pp3 = gp3 + ggplot2.scale_colour_hue(h=base.c(180, 270), name="Year") + \
    #     ggplot2.aes_string(x='Year', y='Fold_Change', fill='factor(Year)') + \
    #     ggplot2.geom_boxplot() + ggplot2.labs(title=x + " Yearly Trend")
    #     # + ggplot2.scale_y_log10()
    pp3.plot()
    grdevices.dev_off()
f.close()
print("\nfinished\n")