def bargraph_language(results):
    """Write one grouped bar chart of lines of code per language.

    For every entry of the module-level ``languages`` sequence the matching
    entries of ``results`` are collected and plotted to
    ``bargraph-loc-lang-<language>.pdf``: problems on the x axis, one bar
    per variation.

    Args:
        results: mapping from ``(language, problem, variation)`` tuples to
            an integer line count.
    """
    r = robjects.r

    for language in languages:
        varis = []
        probs = []
        locs = []
        for (lang, prob, var), loc in results.items():
            if lang != language:
                continue
            varis.append(pretty_varis[var])
            probs.append(prob)
            locs.append(loc)

        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Lines': IntVector(locs),
        })
        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Lines', fill='Variation')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Lines of Code")'))

        r.pdf('bargraph-loc-lang-' + language + '.pdf',
              height=pdf_height(), width=pdf_width())
        try:
            pp.plot()
        finally:
            # Close the device even when plotting fails so that no PDF
            # device is left open.
            r['dev.off']()
def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
    """Plot CrawlDb status counts per crawl as a horizontal stacked bar chart.

    Args:
        data: pandas DataFrame with at least the columns ``crawl``,
            ``type`` and ``size``.  It is not modified.
        row_filter: sequence of ``type`` values to keep; an empty sequence
            or ``None`` keeps all rows.  A leading ``crawldb:status:db_``
            is stripped from the names for display.
        img_file: file name of the image, relative to ``PLOTDIR``.
        ratio: accepted for interface compatibility only; the aspect ratio
            is always recomputed from the number of distinct crawls.

    Returns:
        The ggplot object that was saved.
    """
    prefix = '^crawldb:status:db_'
    row_filter = row_filter or []
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    # Work on a private copy: the frame is edited below and must neither
    # alias the caller's data nor be a view of a filtered slice.
    data = data.copy()

    # Display name of every requested type, in filter order.  Names without
    # the prefix are kept as they are so that no row ends up outside the
    # category list (which would turn its type into NaN).
    categories = []
    for value in row_filter:
        replacement = value
        if re.search(prefix, value):
            replacement = re.sub(prefix, '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
        if replacement not in categories:
            categories.append(replacement)

    # list.reverse() returns None, so reverse with a slice instead.  Without
    # a filter let pandas infer the categories (categories=None).
    data['type'] = pandas.Categorical(data['type'], ordered=True,
                                      categories=categories[::-1] or None)
    data['size'] = data['size'].astype(float)
    ratio = 0.1 + len(data['crawl'].unique()) * .03
    print(data)

    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='size', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='Pastel1', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=False)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='CrawlDb Size and Status Counts\n(before crawling)',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    # int() truncates; never ask for a zero-height image.
    p.save(img_path, height=max(1, int(7 * ratio)), width=7)
    return p
def compare_sum_barplot(locus_table, interval_table, intervals, loci, names, rows):
    """Build a dodged bar chart of ``pi`` summed per interval and database.

    The R-side frame ``data`` is aggregated into ``agg_data`` (sum of
    ``pi`` by ``interval`` and ``db``) and plotted with one bar per
    database.  ``loci`` is accepted but not used.

    Returns:
        The ggplot object; nothing is drawn or saved here.
    """
    # The return value is not needed; presumably this call creates the
    # R-side `data` frame aggregated below -- TODO confirm.
    get_r_data_by_top(locus_table, interval_table, intervals, names, rows)

    aggregated = robjects.r('''agg_data <- aggregate(pi ~ interval + db, data = data, sum)''')
    if len(intervals) > 1:
        # With several intervals the factor is re-levelled using the
        # ordering produced by order_intervals().
        level_spec = order_intervals(aggregated[0])
        robjects.r('''agg_data$interval <- factor(agg_data$interval,{})'''.format(level_spec))

    chart = ggplot2.ggplot(robjects.r('''agg_data'''))
    layers = (
        ggplot2.aes_string(x='interval', y='pi', fill='factor(db)'),
        ggplot2.geom_bar(position='dodge', colour='#767676', alpha=0.6),
        ggplot2.scale_y_continuous('net phylogenetic informativeness'),
        ggplot2.scale_x_discrete('interval (years ago)'),
        ggplot2.scale_fill_brewer("database", palette="Blues"),
    )
    for layer in layers:
        chart = chart + layer
    return chart
def interval(locus_table, interval_table, intervals, loci, boxplot = True):
    """Build a per-interval plot of ``pi`` from a database query.

    The query result is stored R-side as ``data``.  With ``boxplot`` true
    the result is a box plot with jittered per-locus points; otherwise a
    bar chart faceted by locus.

    Returns:
        The ggplot object; nothing is drawn or saved here.
    """
    query = get_interval_query(intervals, loci, locus_table, interval_table)
    fetched = robjects.r('''data <- dbGetQuery(con, {})'''.format(query))

    # `interval` is a factor, so its levels are re-specified explicitly
    # (ordering supplied by order_intervals) instead of relying on R's
    # default level order.
    level_spec = order_intervals(fetched[1])
    robjects.r('''data$interval <- factor(data$interval, {})'''.format(level_spec))

    chart = ggplot2.ggplot(robjects.r('''data'''))

    if boxplot:
        layers = [
            ggplot2.aes_string(x='interval', y='pi'),
            ggplot2.geom_boxplot(**{'outlier.size': 0, 'alpha': 0.3}),
            ggplot2.geom_jitter(ggplot2.aes_string(color='locus'),
                                size=3, alpha=0.6,
                                position=ggplot2.position_jitter(width=0.25)),
        ]
    else:
        layers = [
            ggplot2.aes_string(x='interval', y='pi', fill='locus'),
            ggplot2.geom_bar(),
            ggplot2.facet_wrap(robjects.Formula('~ locus')),
            ggplot2.opts(**{
                'axis.text.x': ggplot2.theme_text(angle=-90, hjust=0),
                'legend.position': 'none',
            }),
        ]

    # Both variants end with the same axis titles.
    layers.append(ggplot2.scale_y_continuous('phylogenetic informativeness'))
    layers.append(ggplot2.scale_x_discrete('interval (years ago)'))

    for layer in layers:
        chart = chart + layer
    return chart
def main():
    """Plot the distribution of alignment lengths found in a BAM file.

    Takes exactly one command-line argument, the BAM file.  The ``qlen``
    of every read is tallied and the counts for every length between the
    smallest and largest observed value are drawn as a bar chart into
    ``align_lengths.pdf``.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    _, args = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM file')
    bam_file = args[0]

    align_lengths = {}
    bam_in = pysam.Samfile(bam_file, 'rb')
    try:
        for aligned_read in bam_in:
            align_lengths[aligned_read.qlen] = align_lengths.get(aligned_read.qlen, 0) + 1
    finally:
        # Release the file handle even if iteration fails.
        bam_in.close()

    if not align_lengths:
        # min()/max() below would raise on an empty tally.
        parser.error('No alignments found in %s' % bam_file)

    min_len = min(align_lengths)
    max_len = max(align_lengths)
    lengths = range(min_len, max_len + 1)

    # construct data frame; lengths that never occur get a count of 0
    len_r = ro.IntVector(lengths)
    counts_r = ro.IntVector([align_lengths.get(l, 0) for l in lengths])
    df = ro.DataFrame({'length': len_r, 'counts': counts_r})

    # construct full plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='length', y='counts') + \
        ggplot2.geom_bar(stat='identity') + \
        ggplot2.scale_x_continuous('Alignment length') + \
        ggplot2.scale_y_continuous('')

    # plot to file
    grdevices.pdf(file='align_lengths.pdf')
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
    """Plot fetch status percentages per crawl as a horizontal stacked bar chart.

    Args:
        data: pandas DataFrame with at least the columns ``crawl``,
            ``percentage`` and ``type``.  It is not modified.
        row_filter: sequence of ``type`` values to keep; an empty sequence
            or ``None`` keeps all rows.  A leading ``fetcher:`` or
            ``fetcher:aggr:`` is stripped from the names for display.
        img_file: file name of the image, relative to ``PLOTDIR``.
        ratio: accepted for interface compatibility only; the aspect ratio
            is always recomputed from the number of distinct crawls.

    Returns:
        The ggplot object that was saved.
    """
    prefix = '^fetcher:(?:aggr:)?'
    row_filter = row_filter or []
    if row_filter:
        data = data[data['type'].isin(row_filter)]
    # Private copy of the three plotted columns: the frame is edited below
    # and must not alias the caller's data.
    data = data[['crawl', 'percentage', 'type']].copy()

    # Display name of every requested type, in filter order.  Names without
    # the prefix are kept as they are so that no row ends up outside the
    # category list (which would turn its type into NaN).
    categories = []
    for value in row_filter:
        replacement = value
        if re.search(prefix, value):
            replacement = re.sub(prefix, '', value)
            data.replace(to_replace=value, value=replacement, inplace=True)
        if replacement not in categories:
            categories.append(replacement)

    # list.reverse() returns None, so reverse with a slice instead.  Without
    # a filter let pandas infer the categories (categories=None).
    data['type'] = pandas.Categorical(data['type'], ordered=True,
                                      categories=categories[::-1] or None)
    ratio = 0.1 + len(data['crawl'].unique()) * .03

    p = ggplot2.ggplot(data) \
        + ggplot2.aes_string(x='crawl', y='percentage', fill='type') \
        + ggplot2.geom_bar(stat='identity', position='stack', width=.9) \
        + ggplot2.coord_flip() \
        + ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                    guide=ggplot2.guide_legend(reverse=True)) \
        + GGPLOT2_THEME \
        + ggplot2.theme(**{'legend.position': 'bottom',
                           'aspect.ratio': ratio}) \
        + ggplot2.labs(title='Percentage of Fetch Status',
                       x='', y='', fill='')
    img_path = os.path.join(PLOTDIR, img_file)
    # int() truncates; never ask for a zero-height image.
    p.save(img_path, height=max(1, int(7 * ratio)), width=7)
    return p
def barPlot(self, dataframe, filename, x_parm, y_parm):
    """Draw a bar chart of ``y_parm`` against ``x_parm`` into a PNG file.

    Args:
        dataframe: data frame handed to ``ggplot2.ggplot``.
        filename: path of the 512x512 PNG to write.
        x_parm: name of the column mapped to the x axis.
        y_parm: name of the column mapped to the y axis (bar height).
    """
    gg = (ggplot2.ggplot(dataframe)
          + ggplot2.aes_string(x=x_parm, y=y_parm)
          + ggplot2.geom_bar(stat="identity"))

    grdevices.png(file=filename, width=512, height=512)
    try:
        gg.plot()
    finally:
        # Close the device even when plotting fails.
        grdevices.dev_off()
def barPlot(self, dataframe, filename, x_parm, y_parm):
    """Draw a bar chart of ``y_parm`` against ``x_parm`` into a PNG file.

    Args:
        dataframe: data frame handed to ``ggplot2.ggplot``.
        filename: path of the 512x512 PNG to write.
        x_parm: name of the column mapped to the x axis.
        y_parm: name of the column mapped to the y axis (bar height).
    """
    gg = (ggplot2.ggplot(dataframe)
          + ggplot2.aes_string(x=x_parm, y=y_parm)
          + ggplot2.geom_bar(stat="identity"))

    grdevices.png(file=filename, width=512, height=512)
    try:
        gg.plot()
    finally:
        # Close the device even when plotting fails.
        grdevices.dev_off()
def groupBar(fi_data):
    """Plot ``Percent`` per ``Class``, grouped by ``Group``, from a delimited file.

    The file is read with R's ``read.delim`` and the dodged bar chart is
    written to ``<fi_data>.Bar.pdf``.

    Args:
        fi_data: path of the input file; it must provide the columns
            ``Class``, ``Percent`` and ``Group``.

    Side effects:
        Creates the R globals ``class.data`` and ``class_data`` and rebinds
        ``ggplot2.theme`` to a wrapper accepting underscore-style argument
        names (both kept from the original implementation).
    """
    dev_off = robjects.r('dev.off')
    read_delim = robjects.r('read.delim')
    #print(fi_data)
    class_data = read_delim(fi_data, header=True, stringsAsFactors=False)

    # Round trip through the R global environment, kept as-is because other
    # R code may rely on these globals -- NOTE(review): confirm and simplify.
    robjects.r.assign('class.data', class_data)
    robjects.r('class_data <- class.data')
    class_data = robjects.r('class_data')

    # Lets theme() be called with axis_text_x=... instead of 'axis.text.x'.
    ggplot2.theme = SignatureTranslatedFunction(
        ggplot2.theme,
        init_prm_translate={'axis_text_x': 'axis.text.x',
                            'axis_text_y': 'axis.text.y',
                            'axis_text_fill': 'axis.text.fill'})

    bar = (ggplot2.ggplot(class_data)
           + ggplot2.geom_bar(stat='identity', position='dodge')
           + ggplot2.aes_string(x='Class', y='Percent', fill='Group')
           + ggplot2.theme(axis_text_x=ggplot2.element_text(angle=90, hjust=1)))

    robjects.r.pdf(fi_data + ".Bar.pdf")
    try:
        bar.plot()
    finally:
        # Close the device even when plotting fails.
        dev_off()
def bargraph_variation_diff():
    """Plot the relative coding-time cost of the expert variations.

    For each ``(standard, expert)`` pair -- ``seq``/``expertseq`` and
    ``par``/``expertpar`` -- the value ``(expert + standard) / standard - 1``
    is computed from the module-level ``result`` for every language and
    problem and written to ``bargraph-codingtime-diff-<standard>.pdf``.
    A difference of 0 is plotted when either time is missing or the
    standard time is 0.
    """
    r = robjects.r

    for (standard, expert) in [('seq', 'expertseq'), ('par', 'expertpar')]:
        langs = []
        probs = []
        diffs = []
        for lang in languages:
            for prob in problems:
                try:
                    base_time = result[lang][prob][standard]
                    expert_time = result[lang][prob][expert]
                except KeyError:
                    diff = 0
                else:
                    if base_time:
                        diff = float(expert_time + base_time) / float(base_time) - 1
                    else:
                        # A zero baseline would divide by zero; treat it
                        # like missing data.
                        diff = 0
                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(diff)

        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        })
        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Difference', fill='Language')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('ylab("Coding time difference (in percent)")')
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('scale_y_continuous(labels = percent_format())'))

        r.pdf('bargraph-codingtime-diff-' + standard + '.pdf',
              height=pdf_height(), width=pdf_width())
        try:
            pp.plot()
        finally:
            # Close the device even when plotting fails.
            r['dev.off']()
def generate_histogram(subgroups_to_sses_to_n_count, tname, file_name):
    """Write a grouped bar chart of counts per subgroup to a PNG file.

    Args:
        subgroups_to_sses_to_n_count: mapping ``subgroup -> {key -> count}``
            with integer counts.
        tname: column name for the inner keys; it is also the fill
            aesthetic, so every inner key gets its own bar colour.
        file_name: name of the PNG, relative to ``OUTPUT_PATH``.
    """
    columns_to_data = {'subgroup': [], tname: [], 'count': []}
    max_count = 0
    for subgroup, sses_to_n_count in subgroups_to_sses_to_n_count.items():
        for ss, n_count in sses_to_n_count.items():
            columns_to_data['subgroup'].append(subgroup)
            columns_to_data[tname].append(ss)
            columns_to_data['count'].append(n_count)
            max_count = max(max_count, n_count)

    subgroup_levels = ro.StrVector(
        _sort_subgroup(set(columns_to_data['subgroup'])))
    df = ro.DataFrame({
        'subgroup': ro.FactorVector(columns_to_data['subgroup'],
                                    levels=subgroup_levels),
        tname: ro.StrVector(columns_to_data[tname]),
        'count': ro.IntVector(columns_to_data['count']),
    })

    # Upper y limit: the next multiple of 1000 above the largest count.
    # Floor division is required; with true division `x / 1000 * 1000`
    # leaves x unchanged and the limit is never rounded.
    max_count = int(max_count // 1000 * 1000 + 1000)

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", histogram_file_path, df))

    pp = (ggplot2.ggplot(df)
          + ggplot2.aes_string(x='subgroup', y='count', fill=tname)
          + ggplot2.geom_bar(position="dodge", width=0.8, stat="identity")
          + ggplot2.theme_bw()
          + ggplot2.theme_classic()
          + ggplot2.theme(**{'legend.title': ggplot2.element_blank()})
          + ggplot2.theme(**{'legend.text': ggplot2.element_text(size=40)})
          + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=40, angle=45)})
          + ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=40)})
          + ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                       limits=ro.IntVector([0, max_count]))
          + ggplot2.geom_text(ggplot2.aes_string(label='count'),
                              size=6, angle=35, hjust=-0.1,
                              position=ggplot2.position_dodge(width=0.8),
                              vjust=-0.2))

    grdevices.png(file=histogram_file_path, width=1200, height=800)
    try:
        pp.plot()
        logging.info(str.format("Output step3 file {}", histogram_file_path))
    finally:
        # Close the device even when plotting fails.
        grdevices.dev_off()
def bargraph_language(cfg, values):
    """Plot mean execution times with error bars, one PDF per language.

    For every language in ``cfg.languages`` the mean of
    ``values[prob][var][lang][0]`` is plotted for each problem and
    variation into ``bargraph-executiontime-lang-<lang>.pdf``.  The error
    bars span half the width of the 99.9% t-test confidence interval on
    either side of the mean.

    Args:
        cfg: object providing ``languages``, ``problems`` and ``variations``.
        values: nested mapping ``values[prob][var][lang]`` whose element 0
            is a sequence of timings.
    """
    r = robjects.r

    for lang in cfg.languages:
        times = []
        varss = []
        probs = []
        ses = []
        for prob in cfg.problems:
            for var in cfg.variations:
                # The pretty variation names end up as the fill legend.
                varss.append(pretty_varis[var])
                probs.append(prob)

                data = FloatVector(values[prob][var][lang][0])
                times.append(r['mean'](data)[0])

                # The argument name must be exactly "conf.level"; with a
                # stray leading space R does not match it and silently
                # falls back to its default level.
                t_result = r['t.test'](
                    data, **{"conf.level": 0.999}).rx('conf.int')[0]
                ses.append((t_result[1] - t_result[0]) / 2)

        df = robjects.DataFrame({
            'Variation': StrVector(varss),
            'Problem': StrVector(probs),
            'Time': FloatVector(times),
            'SE': FloatVector(ses),
        })

        limits = ggplot2.aes(ymax='Time + SE', ymin='Time - SE')
        dodge = ggplot2.position_dodge(width=0.9)
        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Time', fill='Variation')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2.geom_errorbar(limits, position=dodge, width=0.25)
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Execution time (in seconds)")'))

        r.pdf('bargraph-executiontime-lang-' + lang + '.pdf',
              height=pdf_height(), width=pdf_width())
        try:
            pp.plot()
        finally:
            # Close the device even when plotting fails.
            r['dev.off']()
def bargraph_language (cfg, values):
    """Plot mean execution times with error bars, one PDF per language.

    For every language in ``cfg.languages`` the mean of
    ``values[prob][var][lang][0]`` is plotted for each problem and
    variation into ``bargraph-executiontime-lang-<lang>.pdf``.  The error
    bars span half the width of the 99.9% t-test confidence interval on
    either side of the mean.

    Args:
        cfg: object providing ``languages``, ``problems`` and ``variations``.
        values: nested mapping ``values[prob][var][lang]`` whose element 0
            is a sequence of timings.
    """
    r = robjects.r

    for lang in cfg.languages:
        times = []
        varss = []
        probs = []
        ses = []
        for prob in cfg.problems:
            for var in cfg.variations:
                # The pretty variation names end up as the fill legend.
                varss.append (pretty_varis [var])
                probs.append (prob)

                data = FloatVector (values[prob][var][lang][0])
                times.append (r['mean'] (data)[0])

                # The argument name must be exactly "conf.level"; with a
                # stray leading space R does not match it and silently
                # falls back to its default level.
                t_result = r['t.test'] (
                    data, **{"conf.level": 0.999}).rx ('conf.int')[0]
                ses.append ((t_result[1] - t_result[0]) / 2)

        df = robjects.DataFrame ({
            'Variation': StrVector (varss),
            'Problem': StrVector (probs),
            'Time': FloatVector (times),
            'SE': FloatVector (ses),
        })

        limits = ggplot2.aes (ymax='Time + SE', ymin='Time - SE')
        dodge = ggplot2.position_dodge (width=0.9)
        pp = (ggplot2.ggplot (df)
              + ggplot2.aes_string (x='Problem', y='Time', fill='Variation')
              + ggplot2.geom_bar (position='dodge', stat='identity')
              + ggplot2.geom_errorbar (limits, position=dodge, width=0.25)
              + ggplot2_options ()
              + ggplot2_colors ()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Execution time (in seconds)")'))

        r.pdf ('bargraph-executiontime-lang-' + lang + '.pdf',
               height=pdf_height (), width=pdf_width ())
        try:
            pp.plot ()
        finally:
            # Close the device even when plotting fails.
            r['dev.off']()
def bargraph_language():
    """Plot coding times per problem and variation, one PDF per language.

    Times are read from the module-level ``result[language][prob][var]``;
    a missing entry counts as 0.  For variations whose name starts with
    ``expert`` the time of the matching non-expert variation is added when
    it exists.  Output goes to ``bargraph-codingtime-lang-<language>.pdf``.
    """
    r = robjects.r

    for language in languages:
        varis = []
        probs = []
        times = []
        for prob in problems:
            for var in variations:
                try:
                    minutes = result[language][prob][var]
                except KeyError:
                    minutes = 0

                # for the expert times, add expert and non-expert times together
                if var.startswith('expert'):
                    try:
                        minutes = minutes + result[language][prob][
                            var.replace('expert', '')]
                    except KeyError:
                        pass

                varis.append(pretty_varis[var])
                probs.append(prob)
                times.append(minutes)

        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Time': IntVector(times),
        })
        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Time', fill='Variation')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Coding time (in minutes)")'))

        r.pdf('bargraph-codingtime-lang-' + language + '.pdf',
              height=pdf_height(), width=pdf_width())
        try:
            pp.plot()
        finally:
            # Close the device even when plotting fails.
            r['dev.off']()
def generate_step3_9_n_count_histogram(place_type_pos_type_to_count, file_name):
    """Write a grouped bar chart of counts per position and place to a PNG.

    Args:
        place_type_pos_type_to_count: mapping from ``"<place>_<pos>"`` keys
            (exactly one underscore) to integer counts.
        file_name: name of the PNG, relative to ``OUTPUT_PATH``.
    """
    columns_to_data = {'place': [], 'pos': [], 'count': []}
    max_count = 0
    for place_pos_type, n_count in place_type_pos_type_to_count.items():
        place_type, pos_type = place_pos_type.split('_')
        columns_to_data['place'].append(place_type)
        columns_to_data['pos'].append(pos_type)
        columns_to_data['count'].append(n_count)
        max_count = max(max_count, n_count)

    df = ro.DataFrame({
        'place': ro.StrVector(columns_to_data['place']),
        'pos': ro.StrVector(columns_to_data['pos']),
        'count': ro.IntVector(columns_to_data['count']),
    })

    # Upper y limit: the next multiple of the step above the largest count.
    # Floor division is required; with true division `x / step * step`
    # leaves x unchanged and the limit is never rounded.
    step = 1000 if max_count > 1000 else 100
    max_count = int(max_count // step * step + step)

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", histogram_file_path, df))

    pp = (ggplot2.ggplot(df)
          + ggplot2.aes_string(x='pos', y='count', fill='place')
          + ggplot2.geom_bar(position="dodge", stat="identity")
          + ggplot2.theme_bw()
          + ggplot2.theme_classic()
          + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=35)})
          + ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=35)})
          + ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                       limits=ro.IntVector([0, max_count]))
          + ggplot2.geom_text(ggplot2.aes_string(label='count'),
                              position=ggplot2.position_dodge(width=0.8),
                              size=10, angle=35, hjust=-0.2, vjust=-0.5))

    grdevices.png(file=histogram_file_path, width=1024, height=512)
    try:
        pp.plot()
        logging.info(str.format("Output step3 file {}", histogram_file_path))
    finally:
        # Close the device even when plotting fails.
        grdevices.dev_off()
def bargraph_variation_diff(cfg, values):
    """Plot the relative execution-time difference of the expert variations.

    For each ``(standard, expert)`` pair -- ``seq``/``expertseq`` and
    ``par``/``expertpar`` -- the value ``mean(expert) / mean(standard) - 1``
    is computed for every language and problem and written to
    ``bargraph-executiontime-diff-<standard>.pdf``.

    Args:
        cfg: object providing ``languages`` and ``problems``.
        values: nested mapping ``values[prob][variation][lang]`` whose
            element 0 is a sequence of timings.
    """
    r = robjects.r

    for (standard, expert) in [('seq', 'expertseq'), ('par', 'expertpar')]:
        langs = []
        probs = []
        diffs = []
        for lang in cfg.languages:
            for prob in cfg.problems:
                data = FloatVector(values[prob][standard][lang][0])
                data_expert = FloatVector(values[prob][expert][lang][0])

                mean = r['mean'](data)[0]
                mean_expert = r['mean'](data_expert)[0]

                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(float(mean_expert) / float(mean) - 1)

        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        })
        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Difference', fill='Language')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Execution time difference (in percent)")')
              + robjects.r('scale_y_continuous(labels = percent_format())'))

        r.pdf('bargraph-executiontime-diff-' + standard + '.pdf',
              height=pdf_height(), width=pdf_width())
        try:
            pp.plot()
        finally:
            # Close the device even when plotting fails.
            r['dev.off']()
def bargraph_variation_norm(results):
    """Plot lines of code normalized to the smallest solution per problem.

    For every entry of the module-level ``variations`` one PDF named
    ``bargraph-loc-var-norm-<variation>.pdf`` is written.  Within each
    problem every language's line count is divided by the smallest count
    among the languages.

    Args:
        results: mapping from ``(language, problem, variation)`` tuples to
            a line count; an entry must exist for every combination.
    """
    r = robjects.r

    for variation in variations:
        langs = []
        probs = []
        locs = []
        for problem in problems:
            per_lang = [(lang, results[(lang, problem, variation)])
                        for lang in languages]
            loc_min = min(loc for _, loc in per_lang)
            for lang, loc in per_lang:
                langs.append(pretty_langs[lang])
                probs.append(problem)
                locs.append(float(loc) / float(loc_min))

        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Lines': FloatVector(locs),
        })
        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Lines', fill='Language')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Lines of Code (normalized to smallest)")'))

        r.pdf('bargraph-loc-var-norm-' + variation + '.pdf',
              height=pdf_height(), width=pdf_width())
        try:
            pp.plot()
        finally:
            # Close the device even when plotting fails.
            r['dev.off']()
def mem_usage_graph(cfg):
    """Plot the recorded memory usage of every run into one PDF.

    For each variation, language and problem in ``cfg`` the first line of
    the file named by ``get_mem_output(lang, prob, var)`` is read as a
    float.  All values go into ``bargraph-memusage.pdf``, faceted by
    variation.

    Args:
        cfg: object providing ``variations``, ``languages`` and ``problems``.
    """
    r = robjects.r
    varis = []
    langs = []
    probs = []
    mems = []
    for var in cfg.variations:
        for lang in cfg.languages:
            for prob in cfg.problems:
                mem_filename = get_mem_output(lang, prob, var)
                with open(mem_filename, 'r') as mem_file:
                    mem = mem_file.readline()
                mems.append(float(mem))
                varis.append(pretty_varis[var])
                langs.append(pretty_langs[lang])
                probs.append(prob)

    df = robjects.DataFrame({
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Mem': FloatVector(mems),
    })

    # we rotate the x labels to make sure they don't overlap
    pp = (ggplot2.ggplot(df)
          + ggplot2.opts(**{'axis.text.x': ggplot2.theme_text(angle=90, hjust=1)})
          + ggplot2.aes_string(x='Problem', y='Mem', fill='Language')
          + ggplot2.geom_bar(position='dodge', stat='identity')
          + ggplot2.facet_wrap('Variation')
          + ggplot2_options()
          + ggplot2_colors()
          + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
          + robjects.r('ylab("Memory usage (in bytes)")'))

    # memory usage is a single chart with all information in one graph.
    r.pdf('bargraph-memusage.pdf', height=pdf_height(), width=pdf_width())
    try:
        pp.plot()
    finally:
        # Close the device even when plotting fails.
        r['dev.off']()
def mem_usage_graph (cfg):
    """Plot the recorded memory usage of every run into one PDF.

    For each variation, language and problem in ``cfg`` the first line of
    the file named by ``get_mem_output (lang, prob, var)`` is read as a
    float.  All values go into ``bargraph-memusage.pdf``, faceted by
    variation.

    Args:
        cfg: object providing ``variations``, ``languages`` and ``problems``.
    """
    r = robjects.r
    varis = []
    langs = []
    probs = []
    mems = []
    for var in cfg.variations:
        for lang in cfg.languages:
            for prob in cfg.problems:
                mem_filename = get_mem_output (lang, prob, var)
                with open (mem_filename, 'r') as mem_file:
                    mem = mem_file.readline()
                mems.append (float (mem))
                varis.append (pretty_varis [var])
                langs.append (pretty_langs [lang])
                probs.append (prob)

    df = robjects.DataFrame ({
        'Language': StrVector (langs),
        'Problem': StrVector (probs),
        'Variation': StrVector (varis),
        'Mem': FloatVector (mems),
    })

    # we rotate the x labels to make sure they don't overlap
    pp = (ggplot2.ggplot (df)
          + ggplot2.opts (**{'axis.text.x': ggplot2.theme_text (angle=90, hjust=1)})
          + ggplot2.aes_string (x='Problem', y='Mem', fill='Language')
          + ggplot2.geom_bar (position='dodge', stat='identity')
          + ggplot2.facet_wrap ('Variation')
          + ggplot2_options ()
          + ggplot2_colors ()
          + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
          + robjects.r('ylab("Memory usage (in bytes)")'))

    # memory usage is a single chart with all information in one graph.
    r.pdf ('bargraph-memusage.pdf', height=pdf_height (), width=pdf_width ())
    try:
        pp.plot ()
    finally:
        # Close the device even when plotting fails.
        r['dev.off']()
def singleTablePlot_gg(parser, args):
    """Show the k-mer counts of ``args.table1`` as a ggplot2 bar chart.

    The counts returned by ``kmercount_in_table`` are plotted in sorted
    k-mer order on a continuous x axis (positions 1..N) whose ticks are
    labelled with the k-mers.  The function then waits for Enter so that
    the plot window stays open.

    Args:
        parser: unused; kept for the sub-command interface.
        args: parsed arguments; only ``args.table1`` is read.
    """
    r = robjects.r
    r.library("ggplot2")

    kmerdict = kmercount_in_table(args.table1)
    kmers = sorted(kmerdict.keys())
    positions = list(range(1, len(kmers) + 1))
    labels = robjects.StrVector([str(k) for k in kmers])

    # Aesthetics must name data-frame columns, so the x positions become a
    # column of their own, and every column is an explicit R vector.
    df = robjects.DataFrame({
        'position': robjects.IntVector(positions),
        'kmers': labels,
        'counts': robjects.FloatVector([kmerdict[k] for k in kmers]),
    })

    # Ticks sit half a unit past each position, as the original
    # `0.5 + range(...)` expression intended.
    breaks = robjects.FloatVector([0.5 + p for p in positions])

    pp = ggplot2.ggplot(df) \
        + ggplot2.aes_string(x='position', y='counts') \
        + ggplot2.geom_bar(stat="identity") \
        + ggplot2.scale_x_continuous(name="kmer", breaks=breaks, labels=labels)
    pp.plot()

    print('Type enter to exit.')
    try:
        wait_for_enter = raw_input  # Python 2
    except NameError:
        wait_for_enter = input  # Python 3
    wait_for_enter()
def bargraph_variation_diff(results):
    """Plot the relative lines-of-code difference of the expert variations.

    For each ``(standard, expert)`` pair -- ``seq``/``expertseq`` and
    ``par``/``expertpar`` -- the value ``expert / standard - 1`` is computed
    for every language and problem and written to
    ``bargraph-loc-diff-<standard>.pdf``.

    Args:
        results: mapping from ``(language, problem, variation)`` tuples to
            a line count; an entry must exist for every combination.
    """
    r = robjects.r

    for (standard, expert) in [('seq', 'expertseq'), ('par', 'expertpar')]:
        langs = []
        probs = []
        diffs = []
        for lang in languages:
            for prob in problems:
                loc = results[(lang, prob, standard)]
                loc_expert = results[(lang, prob, expert)]

                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(float(loc_expert) / float(loc) - 1)

        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        })
        pp = (ggplot2.ggplot(df)
              + ggplot2.aes_string(x='Problem', y='Difference', fill='Language')
              + ggplot2.geom_bar(position='dodge', stat='identity')
              + ggplot2_options()
              + ggplot2_colors()
              + robjects.r('ylab("Lines of code difference (in percent)")')
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('scale_y_continuous(labels = percent_format())'))

        r.pdf('bargraph-loc-diff-' + standard + '.pdf',
              height=pdf_height(), width=pdf_width())
        try:
            pp.plot()
        finally:
            # Close the device even when plotting fails.
            r['dev.off']()
def plot_stacked_bar(self, data, row_filter, img_file, ratio=1.0):
    """Save a horizontal stacked bar chart of fetch status percentages.

    Rows are restricted to the ``type`` values in ``row_filter`` (when it
    is non-empty) and a leading ``fetcher:`` / ``fetcher:aggr:`` is
    stripped from those values before plotting.  The image is written to
    ``PLOTDIR/img_file`` and the ggplot object is returned.
    """
    fetcher_prefix = '^fetcher:(?:aggr:)?'

    if len(row_filter) != 0:
        data = data[data['type'].isin(row_filter)]

    for name in row_filter:
        if re.search(fetcher_prefix, name) is None:
            continue
        short_name = re.sub(fetcher_prefix, '', name)
        data.replace(to_replace=name, value=short_name, inplace=True)

    legend = ggplot2.guide_legend(reverse=True)
    theme_opts = {'legend.position': 'bottom', 'aspect.ratio': ratio}
    layers = (
        ggplot2.aes_string(x='crawl', y='percentage', fill='type'),
        ggplot2.geom_bar(stat='identity', position='stack', width=.9),
        ggplot2.coord_flip(),
        ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                  guide=legend),
        GGPLOT2_THEME,
        ggplot2.theme(**theme_opts),
        ggplot2.labs(title='Percentage of Fetch Status',
                     x='', y='', fill=''),
    )

    p = ggplot2.ggplot(data)
    for layer in layers:
        p = p + layer

    p.save(os.path.join(PLOTDIR, img_file))
    return p
def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
    """Save a horizontal stacked bar chart of fetch status percentages.

    Rows are restricted to the ``type`` values in ``row_filter`` (when it
    is non-empty) and a leading ``fetcher:`` / ``fetcher:aggr:`` is
    stripped from those values before plotting.  The image is written to
    ``PLOTDIR/img_file`` and the ggplot object is returned.
    """
    fetcher_prefix = '^fetcher:(?:aggr:)?'

    if len(row_filter) != 0:
        data = data[data['type'].isin(row_filter)]

    for name in row_filter:
        if re.search(fetcher_prefix, name) is None:
            continue
        short_name = re.sub(fetcher_prefix, '', name)
        data.replace(to_replace=name, value=short_name, inplace=True)

    legend = ggplot2.guide_legend(reverse=True)
    theme_opts = {'legend.position': 'bottom', 'aspect.ratio': ratio}
    layers = (
        ggplot2.aes_string(x='crawl', y='percentage', fill='type'),
        ggplot2.geom_bar(stat='identity', position='stack', width=.9),
        ggplot2.coord_flip(),
        ggplot2.scale_fill_brewer(palette='RdYlGn', type='sequential',
                                  guide=legend),
        GGPLOT2_THEME,
        ggplot2.theme(**theme_opts),
        ggplot2.labs(title='Percentage of Fetch Status',
                     x='', y='', fill=''),
    )

    p = ggplot2.ggplot(data)
    for layer in layers:
        p = p + layer

    p.save(os.path.join(PLOTDIR, img_file))
    return p
def bargraph_variation_diff (cfg, values):
    """Plot the relative execution-time difference of the expert variations.

    For each ``(standard, expert)`` pair -- ``seq``/``expertseq`` and
    ``par``/``expertpar`` -- the value ``mean(expert) / mean(standard) - 1``
    is computed for every language and problem and written to
    ``bargraph-executiontime-diff-<standard>.pdf``.

    Args:
        cfg: object providing ``languages`` and ``problems``.
        values: nested mapping ``values[prob][variation][lang]`` whose
            element 0 is a sequence of timings.
    """
    r = robjects.r

    for (standard, expert) in [('seq', 'expertseq'), ('par', 'expertpar')]:
        langs = []
        probs = []
        diffs = []
        for lang in cfg.languages:
            for prob in cfg.problems:
                data = FloatVector (values[prob][standard][lang][0])
                data_expert = FloatVector (values[prob][expert][lang][0])

                mean = r['mean'] (data)[0]
                mean_expert = r['mean'] (data_expert)[0]

                langs.append (pretty_langs [lang])
                probs.append (prob)
                diffs.append (float (mean_expert) / float (mean) - 1)

        df = robjects.DataFrame ({
            'Language': StrVector (langs),
            'Problem': StrVector (probs),
            'Difference': FloatVector (diffs),
        })
        pp = (ggplot2.ggplot (df)
              + ggplot2.aes_string (x='Problem', y='Difference', fill='Language')
              + ggplot2.geom_bar (position='dodge', stat='identity')
              + ggplot2_options ()
              + ggplot2_colors ()
              + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
              + robjects.r('ylab("Execution time difference (in percent)")')
              + robjects.r('scale_y_continuous(labels = percent_format())'))

        r.pdf ('bargraph-executiontime-diff-' + standard + '.pdf',
               height=pdf_height (), width=pdf_width ())
        try:
            pp.plot ()
        finally:
            # Close the device even when plotting fails.
            r['dev.off']()
def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
    """Save a horizontal stacked bar chart of CrawlDb status counts.

    Rows are restricted to the ``type`` values in ``row_filter`` (when it
    is non-empty), a leading ``crawldb:status:db_`` is stripped from those
    values and the ``size`` column is converted to float.  The frame is
    printed, the image is written to ``PLOTDIR/img_file`` and the ggplot
    object is returned.
    """
    status_prefix = '^crawldb:status:db_'

    if len(row_filter) != 0:
        data = data[data['type'].isin(row_filter)]

    for name in row_filter:
        if re.search(status_prefix, name) is None:
            continue
        short_name = re.sub(status_prefix, '', name)
        data.replace(to_replace=name, value=short_name, inplace=True)

    data['size'] = data['size'].astype(float)
    print(data)

    legend = ggplot2.guide_legend(reverse=False)
    theme_opts = {'legend.position': 'bottom', 'aspect.ratio': ratio}
    layers = (
        ggplot2.aes_string(x='crawl', y='size', fill='type'),
        ggplot2.geom_bar(stat='identity', position='stack', width=.9),
        ggplot2.coord_flip(),
        ggplot2.scale_fill_brewer(palette='Pastel1', type='sequential',
                                  guide=legend),
        GGPLOT2_THEME,
        ggplot2.theme(**theme_opts),
        ggplot2.labs(title='CrawlDb Size and Status Counts (before crawling)',
                     x='', y='', fill=''),
    )

    p = ggplot2.ggplot(data)
    for layer in layers:
        p = p + layer

    p.save(os.path.join(PLOTDIR, img_file))
    return p
def main():
    """Plot the distribution of alignment lengths found in a BAM file.

    Takes exactly one command-line argument, the BAM file.  The ``qlen``
    of every read is tallied and the counts for every length between the
    smallest and largest observed value are drawn as a bar chart into
    ``align_lengths.pdf``.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    _, args = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM file')
    bam_file = args[0]

    align_lengths = {}
    bam_in = pysam.Samfile(bam_file, 'rb')
    try:
        for aligned_read in bam_in:
            align_lengths[aligned_read.qlen] = align_lengths.get(
                aligned_read.qlen, 0) + 1
    finally:
        # Release the file handle even if iteration fails.
        bam_in.close()

    if not align_lengths:
        # min()/max() below would raise on an empty tally.
        parser.error('No alignments found in %s' % bam_file)

    min_len = min(align_lengths)
    max_len = max(align_lengths)
    lengths = range(min_len, max_len + 1)

    # construct data frame; lengths that never occur get a count of 0
    len_r = ro.IntVector(lengths)
    counts_r = ro.IntVector([align_lengths.get(l, 0) for l in lengths])
    df = ro.DataFrame({'length': len_r, 'counts': counts_r})

    # construct full plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='length', y='counts') + \
        ggplot2.geom_bar(stat='identity') + \
        ggplot2.scale_x_continuous('Alignment length') + \
        ggplot2.scale_y_continuous('')

    # plot to file
    grdevices.pdf(file='align_lengths.pdf')
    try:
        gp.plot()
    finally:
        grdevices.dev_off()
# Restructure hour data hour_arr = [] hits_arr = [] for hour in hour_hits: hour_arr.append(datetime.min + timedelta(hours=int(hour[0:2]))) hits_arr.append(hour_hits[hour]) hour = POSIXct(hour_arr) hits = IntVector(hits_arr) grdevices.png('analytics_out/hits_by_time.png') df = robjects.DataFrame({'hour': POSIXct(hour), 'hits': IntVector(hits)}) pp = ggplot.ggplot(df) + \ ggplot.aes_string(x = 'hour', y = 'hits') + \ ggplot.scale_x_datetime(labels = scales.date_format('%H:%M UTC')) + \ ggplot.geom_bar(stat = 'identity') pp.plot() grdevices.dev_off() # Restructure circular hit time datax time_hits_vec = FloatVector(time_hits) hits_circ = circular.circular(time_hits_vec, units='hours', template='clock24') hits_density = circular.density_circular(hits_circ, bw=100) print('Von Mises fit for hits by time (hours past 00:00 UTC)') hits_mle = circular.mle_vonmises(hits_circ) mu = base.cbind(hits_mle.rx('mu'))[0][0] mu_se = hits_mle.rx('se.mu')[0][0] kappa = hits_mle.rx('kappa')[0][0] kappa_se = hits_mle.rx('se.kappa')[0][0] print('MLE: mu = %0.2f (%0.2f) kappa = %0.2f (%0.2f)' %
'axis.title.x':element_text(size=size,color=robjects.r.color_axis_title, vjust=0), #'panel.grid.major':element_line(color=robjects.r.color_grid_major,size=.25), 'axis.title.y':element_text(size=size,color=robjects.r.color_axis_title,angle=90)}) #??? efficiently change legend titles #right now it takes two legend calls to make this work #alternatives that tried and failed #base_plot = lambda gr_name = 'variable': ggplot2.aes_string(x='x', y='value',group=gr_name,colour=gr_name, shape = gr_name) #colors = ggplot2.scale_colour_manual(values=robjects.r.palette_lines, name = ltitle) pandas2ri.activate() #set up basic, repetitive plot features base_plot = ggplot2.aes_string(x='x', y='value',group='variable',colour='variable', shape = 'variable') line = ggplot2.geom_line() point = ggplot2.geom_point() bar = ggplot2.geom_bar(stat="identity") vert_line_onset = ggplot2.geom_vline(xintercept=-1, linetype=2, colour="red", alpha=0.25) vert_line_exhaust = ggplot2.geom_vline(xintercept=5, linetype=2, colour="red", alpha=0.25) ltitle = "crazy" ltitle_default = 'Variable' #colors = lambda ltitle = ltitle_default: ggplot2.scale_colour_manual(values=robjects.r.palette_lines, name = ltitle) colors = ggplot2.scale_colour_manual(values=robjects.r.palette_lines) legend_t_c = lambda ltitle = ltitle_default: ggplot2.scale_color_discrete(name = ltitle) legend_t_s = lambda ltitle = ltitle_default: ggplot2.scale_shape_discrete(name = ltitle) loc_default = robjects.r('c(1,0)') legend_f = lambda loc = loc_default: ggplot2.theme(**{'legend.position':loc, 'legend.justification':loc}) ggsave = lambda filename, plot: robjects.r.ggsave(filename=out_path + filename + ".pdf", plot=plot, width = 6, height = 4) colors_alt = ggplot2.scale_colour_manual(values=robjects.r.palette_lines[1]) shape_alt = ggplot2.scale_shape_manual(values=17)
def bargraph_variation():
    """Write two coding-time bar charts (raw and normalized) per variation.

    Reads the module-level ``variations``, ``problems``, ``languages``,
    ``result`` and ``pretty_langs``. For every variation it writes

    * ``bargraph-codingtime-var-<var>.pdf``      -- the values as looked up
    * ``bargraph-codingtime-var-norm-<var>.pdf`` -- each problem's values
      divided by the smallest non-zero value for that problem

    Every PDF device opened here is closed again, even when plotting
    fails; previously only the second device of each pair was closed, so
    the first one stayed open.
    """
    r = robjects.r

    for var in variations:
        values = []
        # values normalized per problem
        nvalues = []
        langs = []
        probs = []

        for prob in problems:
            # one group of bars per problem, one bar per language
            lvalues = []
            for lang in languages:
                langs.append(pretty_langs[lang])
                probs.append(prob)

                try:
                    value = result[lang][prob][var]
                except KeyError:
                    print("Warning: no value for:")
                    print(lang, prob, var)
                    # FIXME to account for missing seq-version of Erlang
                    value = 0

                # for the expert times, add expert and non-expert times
                # together
                if var.startswith('expert'):
                    try:
                        value = value + result[lang][prob][
                            var.replace('expert', '')]
                    except KeyError:
                        pass

                lvalues.append(value)

            values.extend(lvalues)

            # Missing entries are recorded as 0 and must not become the
            # normalization base.
            nonzero = [x for x in lvalues if x != 0]
            if nonzero:
                # float() keeps the ratio exact when the times are ints and
                # "/" would otherwise be integer division.
                lmin = float(min(nonzero))
                nvalues.extend([x / lmin for x in lvalues])
            else:
                # No data at all for this problem: nothing to normalize to.
                nvalues.extend([0.0] * len(lvalues))

        plots = (
            ('bargraph-codingtime-var-' + var + '.pdf', values,
             'ylab("Coding time (in minutes)")'),
            ('bargraph-codingtime-var-norm-' + var + '.pdf', nvalues,
             'ylab("Coding time (normalized to fastest)")'),
        )
        for filename, times, ylab in plots:
            r.pdf(filename, height=pdf_height(), width=pdf_width())
            try:
                df = robjects.DataFrame({
                    'Language': StrVector(langs),
                    'Problem': StrVector(probs),
                    'Time': FloatVector(times),
                })
                pp = ggplot2.ggplot(df) + \
                    ggplot2.aes_string(x='Problem', y='Time', fill='Language') + \
                    ggplot2.geom_bar(position='dodge', stat='identity') + \
                    ggplot2_options() + \
                    ggplot2_colors() + \
                    robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
                    robjects.r(ylab)
                pp.plot()
            finally:
                # close the device that was just opened so the file is
                # finalized and devices do not pile up across variations
                r['dev.off']()
def speedup_diffs(values, basis):
    """Plot the ratio of expert to non-expert speedup per language/problem.

    Reads the module-level ``languages``, ``problems``, ``pretty_langs``
    and ``cfg`` and writes ``bargraph-speedup-diff.pdf``.

    values -- nested mapping indexed as
              ``values[threads][problem][variation][language][0]``; the
              innermost item is passed to ``FloatVector`` and averaged.
    basis  -- how the sequential baseline is picked:
              'seq'     -> the sequential variant's mean,
              'p1'      -> the parallel variant's mean at 1 thread,
              'fastest' -> the smaller of the two.
              Any other value behaves like 'seq'.
    """
    r = robjects.r

    speedups = {}
    for var in ['par', 'expertpar']:
        speedups[var] = {}
        for lang in languages:
            speedups[var][lang] = {}
            i = 0    # problems processed for this language
            p1 = 0   # how often the 1-thread run was used as baseline
            print(lang)
            for prob in problems:
                i = i + 1
                speedups[var][lang][prob] = []

                # mean of the matching sequential variant
                # ('par' -> 'seq', 'expertpar' -> 'expertseq')
                base = r.mean(FloatVector(
                    values[cfg.threads[-1]][prob][var.replace('par', 'seq')][lang][0]))[0]
                # base with p = 1
                base_p1 = r.mean(FloatVector(values[1][prob][var][lang][0]))[0]

                # use fastest sequential program
                if basis == 'fastest' and base_p1 < base:
                    base = base_p1
                    p1 = p1 + 1
                elif basis == 'seq':
                    pass
                elif basis == 'p1':
                    base = base_p1

                # NOTE(review): 32 is hard-coded here while the baseline
                # above indexes cfg.threads[-1] -- confirm they agree.
                mn = r.mean(FloatVector(values[32][prob][var][lang][0]))[0]
                speedups[var][lang][prob].append(float(base) / float(mn))
            print(i)
            print(p1)

    langs = []
    probs = []
    diffs = []
    for lang in languages:
        for prob in problems:
            sp = speedups['par'][lang][prob][0]
            sp_expert = speedups['expertpar'][lang][prob][0]
            langs.append(pretty_langs[lang])
            probs.append(prob)
            diffs.append(float(sp_expert) / float(sp))

    r.pdf('bargraph-speedup-diff.pdf', height=pdf_height(), width=pdf_width())
    try:
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        })

        # "%g" instead of "%d": R's sprintf rejects "%d" for doubles that
        # are not whole numbers, which breaks the plot as soon as an axis
        # break is fractional. Whole-number breaks print the same as before.
        scale = r('''
            xformatter <- function(x) {
              sprintf("%g x", x)
            }
            scale_y_continuous(labels = xformatter)
        ''')

        pp = ggplot2.ggplot(df) + \
            ggplot2.aes_string(x='Problem', y='Difference', fill='Language') + \
            ggplot2.geom_bar(position='dodge', stat='identity') + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
            ggplot2_options() + \
            ggplot2_colors() + \
            r('ylab("Change in speedup")') + \
            scale
        pp.plot()
    finally:
        # always release the PDF device, even if plotting failed
        r['dev.off']()
def bargraph_variation(cfg, values):
    """Write execution-time bar charts with error bars for each variation.

    For every entry of ``cfg.variations`` two PDFs are produced:

    * ``bargraph-executiontime-var-<var>.pdf``      -- mean times
    * ``bargraph-executiontime-var-norm-<var>.pdf`` -- means divided by the
      smallest mean of the same problem

    cfg    -- object providing ``variations``, ``problems``, ``languages``.
    values -- nested mapping indexed as
              ``values[problem][variation][language][0]``; the innermost
              item is passed to ``FloatVector``.

    The error bars are half the width of the 99.9% confidence interval
    returned by R's ``t.test``. Each PDF device is closed right after its
    plot; previously the first device of every pair was left open.
    """
    r = robjects.r

    for var in cfg.variations:
        avgs = []
        ses = []
        # normalized values
        navgs = []
        nses = []
        langs = []
        probs = []

        for prob in cfg.problems:
            # one group of bars per problem, one bar per language
            lavgs = []
            lses = []
            for lang in cfg.languages:
                data = FloatVector(values[prob][var][lang][0])
                langs.append(pretty_langs[lang])
                probs.append(prob)

                lavgs.append(r['mean'](data)[0])

                t_result = r['t.test'](
                    data, **{"conf.level": 0.999}).rx('conf.int')[0]
                lses.append((t_result[1] - t_result[0]) / 2)

            avgs.extend(lavgs)
            ses.extend(lses)

            lmin = min(lavgs)
            navgs.extend([la / lmin for la in lavgs])
            nses.extend([ls / lmin for ls in lses])

        # 'TimeLabel' is built but not referenced by either plot below.
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Time': FloatVector(avgs),
            'SE': FloatVector(ses),
            'NormTime': FloatVector(navgs),
            'NormSE': FloatVector(nses),
            'TimeLabel': StrVector([str(round(avg, 1)) + "s" for avg in avgs]),
        })

        # (file name, value column, error column, y-axis label)
        plots = (
            ('bargraph-executiontime-var-' + var + '.pdf',
             'Time', 'SE',
             'ylab("Execution time (in seconds)")'),
            ('bargraph-executiontime-var-norm-' + var + '.pdf',
             'NormTime', 'NormSE',
             'ylab("Execution time (normalized to fastest)")'),
        )
        for filename, column, error, ylab in plots:
            r.pdf(filename, height=pdf_height(), width=pdf_width())
            try:
                limits = ggplot2.aes(ymax=column + ' + ' + error,
                                     ymin=column + ' - ' + error)
                dodge = ggplot2.position_dodge(width=0.9)
                pp = ggplot2.ggplot(df) + \
                    ggplot2.aes_string(x='Problem', y=column, fill='Language') + \
                    ggplot2.geom_bar(position='dodge', stat='identity') + \
                    ggplot2.geom_errorbar(limits, position=dodge, width=0.25) + \
                    ggplot2_options() + \
                    ggplot2_colors() + \
                    robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
                    robjects.r(ylab)
                pp.plot()
            finally:
                # close the device opened for this plot so the file is
                # finalized and devices do not accumulate
                r['dev.off']()
def speedup_diffs(values, basis):
    """Write ``bargraph-speedup-diff.pdf``: expert speedup / plain speedup.

    For the 'par' and 'expertpar' variations a speedup (baseline mean
    divided by the 32-thread mean) is computed per language and problem;
    the chart shows the 'expertpar' speedup divided by the 'par' speedup.

    Uses the module-level ``languages``, ``problems``, ``pretty_langs``
    and ``cfg``.

    values -- nested mapping indexed as
              ``values[threads][problem][variation][language][0]``.
    basis  -- baseline selection: 'seq' keeps the sequential variant's
              mean, 'p1' takes the 1-thread mean of the parallel variant,
              'fastest' takes whichever of the two is smaller. Unknown
              values fall through and keep the sequential mean.
    """
    r = robjects.r

    def mean_of(samples):
        # arithmetic mean computed by R, returned as a Python float
        return r.mean(FloatVector(samples))[0]

    speedups = {}
    for var in ['par', 'expertpar']:
        seq_var = var.replace('par', 'seq')
        per_lang = speedups[var] = {}
        for lang in languages:
            per_prob = per_lang[lang] = {}
            i = 0    # number of problems seen for this language
            p1 = 0   # times the 1-thread run beat the sequential baseline
            print(lang)
            for prob in problems:
                i = i + 1

                base = mean_of(values[cfg.threads[-1]][prob][seq_var][lang][0])
                # base with p = 1
                base_p1 = mean_of(values[1][prob][var][lang][0])

                # use fastest sequential program
                if basis == 'fastest' and base_p1 < base:
                    base = base_p1
                    p1 = p1 + 1
                elif basis == 'seq':
                    pass
                elif basis == 'p1':
                    base = base_p1

                # NOTE(review): thread count 32 is hard-coded, unlike the
                # cfg.threads[-1] lookup above -- confirm this is intended.
                mn = mean_of(values[32][prob][var][lang][0])
                # stored as a one-element list; read back with [0] below
                per_prob[prob] = [float(base) / float(mn)]
            print(i)
            print(p1)

    langs = []
    probs = []
    diffs = []
    for lang in languages:
        for prob in problems:
            sp = speedups['par'][lang][prob][0]
            sp_expert = speedups['expertpar'][lang][prob][0]
            langs.append(pretty_langs[lang])
            probs.append(prob)
            diffs.append(float(sp_expert) / float(sp))

    r.pdf('bargraph-speedup-diff.pdf', height=pdf_height(), width=pdf_width())
    try:
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        })

        # The label formatter uses "%g" rather than "%d": R's sprintf only
        # accepts "%d" for whole-valued numbers, so a fractional axis break
        # used to abort the plot. Whole numbers are rendered unchanged.
        scale = r('''
            xformatter <- function(x) {
              sprintf("%g x", x)
            }
            scale_y_continuous(labels = xformatter)
        ''')

        pp = ggplot2.ggplot(df) + \
            ggplot2.aes_string(x='Problem', y='Difference', fill='Language') + \
            ggplot2.geom_bar(position='dodge', stat='identity') + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
            ggplot2_options() + \
            ggplot2_colors() + \
            r('ylab("Change in speedup")') + \
            scale
        pp.plot()
    finally:
        # release the PDF device even when building/plotting raised
        r['dev.off']()
def show4():
    """Draw a dodged bar chart of time per day, filled by project.

    Calls ``open4()``, sources the ``end.R`` script through ``r.source``,
    loads ``project2.csv`` into a data frame and plots ``time`` against
    ``day`` with one bar per ``factor(project)``. The chart title and axis
    labels are the Chinese strings given below; x-axis tick labels are
    rotated by 45 degrees.
    """
    open4()
    r.source('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/R/end.R',
             encoding="utf-8")
    data = DataFrame.from_csvfile(
        'D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/temp/project2.csv')

    # build the plot one layer at a time, in the original order
    pp = ggplot2.ggplot(data)
    pp = pp + ggplot2.aes_string(x='day', y='time', fill='factor(project)')
    pp = pp + ggplot2.geom_bar(stat='identity', position='dodge')
    pp = pp + ggplot2.ggtitle("两项目时间对比图")
    pp = pp + ggplot2.labs(x='日期', y='时间 (min)')
    pp = pp + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle=45)})
    pp.plot()
print("Start plotting...") grdevices = importr('grDevices') ro.r('''change_name=function(pop_size, generations,freq){ name=sprintf("../results/mcm_%sNe_%sfreq_%sgen.png", pop_size,freq,generations) return(name)} ''') name = ro.r['change_name'] name = name(args.ps, args.gen, args.freq) print("Output figure in:", name) grdevices.png(file=name, width=700, height=700) gp = ggplot2.ggplot(res2) pp = gp + ggplot2.aes_string( x='Counts', y='Proportion') + ggplot2.geom_bar( stat="identity", color="darkgoldenrod3") + ggplot2.theme_bw() pp.plot() grdevices.dev_off() print("Plot done!") elif ( args.diff ): ###references:doi: 10.1093/molbev/msx254 && https://doi.org/10.1111/j.1365-294X.2010.04997.x p = args.freq N = args.ps t = args.gen newx = [] x = np.arange(0, 1.001, 0.001001001) res = [0] * len(x) print("Estimating allele counts") for i in range(1, 101):
def bargraph_variation(cfg, values):
    """Plot mean execution times per problem and language, per variation.

    Two PDFs are written for each variation in ``cfg.variations``:
    ``bargraph-executiontime-var-<var>.pdf`` with the mean times and
    ``bargraph-executiontime-var-norm-<var>.pdf`` with the means divided
    by the smallest mean of the same problem.

    cfg    -- object providing ``variations``, ``problems``, ``languages``.
    values -- nested mapping indexed as
              ``values[problem][variation][language][0]``; the innermost
              item is passed to ``FloatVector``.

    Error bars span +/- half the width of the 99.9% confidence interval
    reported by R's ``t.test``.

    Fix over the earlier version: the device opened for the first
    (absolute) plot is now closed too, instead of only the second one.
    """
    r = robjects.r

    def draw(filename, column, error, ylab):
        # Open a PDF device, draw one bar chart of `column` with +/- `error`
        # bars from the enclosing `df`, and always close the device again.
        r.pdf(filename, height=pdf_height(), width=pdf_width())
        try:
            limits = ggplot2.aes(ymax=column + ' + ' + error,
                                 ymin=column + ' - ' + error)
            dodge = ggplot2.position_dodge(width=0.9)
            pp = ggplot2.ggplot(df) + \
                ggplot2.aes_string(x='Problem', y=column, fill='Language') + \
                ggplot2.geom_bar(position='dodge', stat='identity') + \
                ggplot2.geom_errorbar(limits, position=dodge, width=0.25) + \
                ggplot2_options() + \
                ggplot2_colors() + \
                robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
                robjects.r(ylab)
            pp.plot()
        finally:
            r['dev.off']()

    for var in cfg.variations:
        avgs = []
        ses = []
        # normalized values
        navgs = []
        nses = []
        langs = []
        probs = []

        for prob in cfg.problems:
            # aggregate by problem: one bar per language
            lavgs = []
            lses = []
            for lang in cfg.languages:
                data = FloatVector(values[prob][var][lang][0])
                langs.append(pretty_langs[lang])
                probs.append(prob)

                mean = r['mean'](data)[0]
                lavgs.append(mean)

                t_result = r['t.test'](data, **{
                    "conf.level": 0.999
                }).rx('conf.int')[0]
                lses.append((t_result[1] - t_result[0]) / 2)

            avgs.extend(lavgs)
            ses.extend(lses)

            lmin = min(lavgs)
            navgs.extend([la / lmin for la in lavgs])
            nses.extend([ls / lmin for ls in lses])

        # 'TimeLabel' is built but neither plot below references it.
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Time': FloatVector(avgs),
            'SE': FloatVector(ses),
            'NormTime': FloatVector(navgs),
            'NormSE': FloatVector(nses),
            'TimeLabel': StrVector([str(round(avg, 1)) + "s" for avg in avgs]),
        })

        # plot histogram of actual times
        draw('bargraph-executiontime-var-' + var + '.pdf',
             'Time', 'SE',
             'ylab("Execution time (in seconds)")')

        # plot histogram of times normalized with respect to fastest time
        # for a problem
        draw('bargraph-executiontime-var-norm-' + var + '.pdf',
             'NormTime', 'NormSE',
             'ylab("Execution time (normalized to fastest)")')
def show1():
    """Draw a bar chart of time per project from ``day1.csv``.

    Calls ``open1()``, sources the ``head1.r`` script through ``r.source``,
    loads ``day1.csv`` into a data frame and plots ``time`` against
    ``project``, with the bars filled by ``project``. The chart title and
    axis labels are the Chinese strings given below; x-axis tick labels
    are rotated by 45 degrees.
    """
    open1()
    r.source('D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/R/head1.r',
             encoding="utf-8")
    data = DataFrame.from_csvfile(
        'D:/Postgraduate/Course/2-semester/R-language/TimeAnalyze/Programe/temp/day1.csv')

    # build the plot one layer at a time, in the original order
    pp = ggplot2.ggplot(data)
    pp = pp + ggplot2.aes_string(x='project', y='time', fill='project')
    pp = pp + ggplot2.geom_bar(stat='identity')
    pp = pp + ggplot2.ggtitle("今日项目时间分布图")
    pp = pp + ggplot2.labs(x='项目', y='时间 (min)')
    pp = pp + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle=45)})
    pp.plot()