def main():
    """Report the row with the largest ``MaxRSS`` from a ``|``-delimited table.

    Command line:
        -i/--input    Table to read; first row is the header (default "-").
        -o/--output   Where to write the report (default "-").
        -p/--profile  One or two values. The first is expanded by
                      ``parse_profile``; the second, if present, is appended
                      verbatim as an extra leading column.

    Output is two tab-separated lines: a header line and a line holding the
    profile fields followed by the selected input row.

    Raises:
        ValueError: if the input has no data rows, or (from ``list.index``)
            if the header has no ``MaxRSS`` column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    # The profile is indexed unconditionally below, so let argparse insist on
    # it instead of failing later with a TypeError on None.
    parser.add_argument("-p", "--profile", nargs='+', required=True)
    args = parser.parse_args()

    profile = list(parse_profile(args.profile[0]))
    if len(args.profile) > 1:
        profile.append(args.profile[1])

    with fileopen(args.input, 'rt') as inp:
        lines = list(csv.reader(inp, delimiter="|"))

    if len(lines) < 2:
        raise ValueError(
            "Expected a header and at least one data row, got {} line(s)".format(
                len(lines)))

    header = lines[0]
    maxrss_col = header.index('MaxRSS')

    # max() returns the first of several equal maxima, which matches a
    # "strictly greater" scan, but unlike that scan it still yields a row when
    # no value exceeds zero (previously that case crashed on `*None`).
    maxrow = max(lines[1:], key=lambda row: parse_size(row[maxrss_col]))

    with fileopen(args.output, 'wt') as out:
        print(
            *('prog', 'prog2', 'threads', 'dataset', 'qcut'), *header,
            sep="\t", file=out)
        print(*profile, *maxrow, sep="\t", file=out)
def main():
    """Condense a resource-usage report into one tab-separated line.

    The labels matched ("Percent of CPU this job got", "Elapsed (wall clock)
    time", "Maximum resident set size (kbytes)") look like the verbose output
    of GNU ``time`` -- TODO confirm against the producer. They are read from
    fixed positions: lines 3, 4 and 9 (0-based) of the stripped input.

    Command line:
        -i/--input    Report to read (default "-").
        -o/--output   Where to write the summary (default "-").
        -p/--profile  Profile string, expanded by ``parse_profile`` into
                      prog, prog2, threads, dataset and qcut.

    Output columns: the five profile fields, wall-clock duration in seconds,
    CPU percentage, and the reported kbytes value divided by 1000.

    Raises:
        ValueError: if an expected line is missing or does not match.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-p", "--profile")
    args = parser.parse_args()

    prog, prog2, threads, dataset, qcut = parse_profile(args.profile)

    with fileopen(args.input, 'rt') as inp:
        lines = [line.strip() for line in inp]

    def match_line(index, *patterns):
        """Return the first match of `patterns` against line `index`."""
        if index >= len(lines):
            raise ValueError(
                "Input has only {} line(s); expected a line at index {}".format(
                    len(lines), index))
        for pattern in patterns:
            found = re.match(pattern, lines[index])
            if found is not None:
                return found
        raise ValueError(
            "Unexpected content on line {}: {!r}".format(index, lines[index]))

    cpu_match = match_line(3, r"Percent of CPU this job got: (\d+)%")
    cpu_pct = float(cpu_match.group(1))

    # Two spellings of the elapsed time are accepted: "[Hh ]Mm S.Ss" and
    # "[H:]M:S.S". The hours group is optional in both.
    wc_prefix = r"Elapsed \(wall clock\) time \(h:mm:ss or m:ss\): "
    wc_match = match_line(
        4,
        wc_prefix + r"(?:(\d+)h )?(\d+)m ([\d\.]+)s",
        wc_prefix + r"(?:(\d+):)?(\d+):([\d\.]+)")
    hrs = int(wc_match.group(1) or 0)
    mins = int(wc_match.group(2))
    secs = float(wc_match.group(3))
    duration_secs = (hrs * 3600) + (mins * 60) + secs

    memory_match = match_line(
        9, r"Maximum resident set size \(kbytes\): (\d+)")
    memory_mbytes = int(memory_match.group(1)) / 1000

    with fileopen(args.output, "wt") as out:
        print(
            prog, prog2, threads, dataset, qcut, duration_secs, cpu_pct,
            memory_mbytes, sep="\t", file=out)
def main():
    """Load a table of repeated header/data row pairs and emit metrics.

    The tab-separated input is read as: line 0 is the header, every
    odd-numbered line is a data row, and every later even-numbered line must
    repeat the header exactly. An empty even-numbered line ends the input.

    Command line:
        -i/--input    Table to read (default "-").
        -o/--output   Passed to ``Metrics`` as the output argument (default "-").
        -m/--metric   Zero or more ``NAME=COLUMN`` pairs; each is passed to
                      ``Metrics.show(NAME, COLUMN)``.
        -f/--formats  Any of txt, tex, pickle, svg (default: txt).

    Raises:
        ValueError: if a repeated header line differs from the first header.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-m", "--metric", nargs='*', default=None)
    parser.add_argument(
        "-f", "--formats", choices=('txt', 'tex', 'pickle', 'svg'),
        nargs='*', default=['txt'])
    args = parser.parse_args()

    # Validate the metric specs before doing any work. `args.metric` is None
    # when -m is omitted, which previously crashed the loop with a TypeError.
    requested = []
    for spec in args.metric or ():
        name, sep, column = spec.partition('=')
        if not sep:
            parser.error(
                "metric must have the form NAME=COLUMN, got {!r}".format(spec))
        requested.append((name, column))

    header = None
    rows = []
    with fileopen(args.input, 'rt') as inp:
        for i, line in enumerate(csv.reader(inp, delimiter="\t")):
            if i == 0:
                header = line
            elif i % 2:
                rows.append(line)
            elif not line:
                # A blank line where a header repeat is expected ends the data
                break
            elif line != header:
                raise ValueError(
                    "Line {} should repeat the header but is {!r}".format(
                        i, line))

    metrics = Metrics(
        header, rows, args.output, args.formats, os.path.dirname(__file__))
    for name, column in requested:
        metrics.show(name, column)
def main():
    """Condense a resource-usage report into one tab-separated line.

    The labels matched ("Percent of CPU this job got", "Elapsed (wall clock)
    time", "Maximum resident set size (kbytes)") look like the verbose output
    of GNU ``time`` -- TODO confirm against the producer. They are read from
    fixed positions: lines 3, 4 and 9 (0-based) of the stripped input.

    Command line:
        -i/--input    Report to read (default "-").
        -o/--output   Where to write the summary (default "-").
        -p/--profile  Profile string, expanded by ``parse_profile`` into
                      prog, prog2, threads, dataset and qcut.

    Output columns: the five profile fields, wall-clock duration in seconds,
    CPU percentage, and the reported kbytes value divided by 1000.

    Raises:
        ValueError: if an expected line is missing or does not match.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-p", "--profile")
    args = parser.parse_args()

    prog, prog2, threads, dataset, qcut = parse_profile(args.profile)

    with fileopen(args.input, 'rt') as inp:
        lines = [line.strip() for line in inp]

    def match_line(index, *patterns):
        """Return the first match of `patterns` against line `index`."""
        if index >= len(lines):
            raise ValueError(
                "Input has only {} line(s); expected a line at index {}".format(
                    len(lines), index))
        for pattern in patterns:
            found = re.match(pattern, lines[index])
            if found is not None:
                return found
        raise ValueError(
            "Unexpected content on line {}: {!r}".format(index, lines[index]))

    cpu_match = match_line(3, r"Percent of CPU this job got: (\d+)%")
    cpu_pct = float(cpu_match.group(1))

    # Two spellings of the elapsed time are accepted: "[Hh ]Mm S.Ss" and
    # "[H:]M:S.S". The hours group is optional in both.
    wc_prefix = r"Elapsed \(wall clock\) time \(h:mm:ss or m:ss\): "
    wc_match = match_line(
        4,
        wc_prefix + r"(?:(\d+)h )?(\d+)m ([\d\.]+)s",
        wc_prefix + r"(?:(\d+):)?(\d+):([\d\.]+)")
    hrs = int(wc_match.group(1) or 0)
    mins = int(wc_match.group(2))
    secs = float(wc_match.group(3))
    duration_secs = (hrs * 3600) + (mins * 60) + secs

    memory_match = match_line(
        9, r"Maximum resident set size \(kbytes\): (\d+)")
    memory_mbytes = int(memory_match.group(1)) / 1000

    with fileopen(args.output, "wt") as out:
        print(
            prog, prog2, threads, dataset, qcut, duration_secs, cpu_pct,
            memory_mbytes, sep="\t", file=out)
def main():
    """Summarise memory and CPU model lines as one tab-separated record.

    Scans the input for the first ``MemTotal:`` line and for every
    ``model name :`` line (the labels resemble /proc/meminfo and
    /proc/cpuinfo -- TODO confirm what the caller concatenates).

    Command line:
        -i/--input    Text to scan (default "-").
        -o/--output   Where to write the record (default "-").
        -p/--profile  One or two values. The first is expanded by
                      ``parse_profile``; the second, if present, is appended
                      verbatim.

    Output columns: the profile fields, the number of ``model name`` lines,
    the captured MemTotal text (``None`` if no such line was seen), and a
    "; "-joined list of "<count> <model>" entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    # The profile is indexed unconditionally below, so let argparse insist on
    # it instead of failing later with a TypeError on None.
    parser.add_argument("-p", "--profile", nargs='+', required=True)
    args = parser.parse_args()

    profile = list(parse_profile(args.profile[0]))
    if len(args.profile) > 1:
        profile.append(args.profile[1])

    with fileopen(args.input, 'rt') as inp:
        lines = [line.strip() for line in inp]

    # Raw strings: "\s" and "\d" are invalid escapes in an ordinary literal
    # and trigger a warning on current Python versions.
    mem_matcher = re.compile(r"MemTotal:\s+(\d+ .*)")
    cpu_matcher = re.compile(r"model name\s*:\s*(.*)")

    mem = None
    cpus = defaultdict(int)
    for line in lines:
        # Only the first MemTotal line is kept; once found, the memory
        # pattern is no longer tried.
        if mem is None:
            mem_match = mem_matcher.match(line)
            if mem_match:
                mem = mem_match.group(1)
                continue
        cpu_match = cpu_matcher.match(line)
        if cpu_match:
            cpus[cpu_match.group(1)] += 1

    total_cpus = sum(cpus.values())
    cpu_str = "; ".join(
        "{} {}".format(count, cpu) for cpu, count in cpus.items())

    with fileopen(args.output, "wt") as out:
        print(*profile, total_cpus, mem, cpu_str, sep="\t", file=out)
def show(self, name, column, **kwargs):
    """Write the table for `column` once per configured output format.

    Each output path is "<self.output>.<name>.<format>". The 'txt' format is
    written as tab-separated text without the index and 'pickle' as a pickled
    table. Any other format is delegated to the method named "<name>_<format>"
    on this object, called with (table, column, outfile).

    `kwargs` is accepted but not used.
    """
    frame = self._get_table(column)
    stem = "{}.{}".format(self.output, name)

    for fmt in self.formats:
        destination = "{}.{}".format(stem, fmt)

        if fmt == 'txt':
            frame.to_csv(destination, sep="\t", index=False)
            continue

        if fmt == 'pickle':
            import pickle
            with fileopen(destination, 'wb') as handle:
                pickle.dump(frame, handle)
            continue

        # Everything else is rendered by a format-specific method
        renderer = getattr(self, "{}_{}".format(name, fmt))
        renderer(frame, column, destination)
def mem_tex(self, table, column, outfile, name=None, caption=None):
    """Render the per-group maximum of `column` as a LaTeX table.

    The 'Program' column is dropped and 'Program2' takes its name; `column`
    is renamed to 'Memory'. Rows are grouped by Dataset, Threads and Program,
    keeping the maximum Memory per group, and a 'MemoryMB' column is added as
    Memory / 1,000,000 rounded to one decimal (so Memory is presumably in
    bytes -- TODO confirm). The result is passed to the
    "job_memory_table.tex" Mako template found under `self.template_path`.

    Args:
        table: DataFrame with Program, Program2, Dataset, Threads and
            `column` columns.
        column: Name of the column holding the memory measurement.
        outfile: Path the rendered template is written to.
        name: Passed to the template as `name`.
        caption: Passed to the template as `caption`.
    """
    texdat = (
        table
        # `axis` must be given by keyword: pandas 2 no longer accepts it
        # positionally.
        .drop('Program', axis=1)
        .rename(columns={'Program2': 'Program', column: 'Memory'})
        .groupby(['Dataset', 'Threads', 'Program'])
        # The string alias is the supported spelling; passing the builtin
        # `max` is deprecated in recent pandas.
        .agg({'Memory': 'max'}))
    texdat = texdat.assign(MemoryMB=(texdat['Memory'] / 1000000).round(1))

    from mako.template import Template
    table_template = Template(filename=os.path.join(
        self.template_path, "job_memory_table.tex"))
    with fileopen(outfile, "wt") as o:
        o.write(table_template.render(
            name=name, caption=caption, table=texdat))
def main():
    """Convert raw benchmark measurements into a table and/or a figure.

    The input is headerless tab-separated text with the columns Program,
    Program2, Threads, Dataset, Quality, DurationSecs, CPUPct and MemoryMB.

    Command line:
        -i/--input    Measurements to read (default "-").
        -o/--output   Output path prefix; ".txt", ".pickle", ".tex" and
                      ".svg" are appended per format (default "-").
        -n/--name     Passed to the LaTeX template as `name`.
        -c/--caption  Passed to the LaTeX template as `caption`.
        -t/--threads  Overwrite the Threads column with this value.
        -f/--formats  Any of txt, tex, svg, pickle (default: tex svg).

    Outputs:
        txt     The parsed table, tab-separated with a header.
        pickle  The parsed table, pickled.
        tex     "performance_table.tex" (next to this script) rendered with
                the min and max of each measurement per Threads/Program2.
        svg     One bar-chart panel each for runtime, memory and CPU, showing
                the mean per Threads value, coloured by program.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-n", "--name", default="table")
    parser.add_argument("-c", "--caption", default="")
    parser.add_argument(
        "-t", "--threads", type=int, default=None,
        help="Set all rows to have the same value for the Threads column.")
    parser.add_argument(
        "-f", "--formats", choices=('txt', 'tex', 'svg', 'pickle'),
        nargs='*', default=['tex', 'svg'])
    args = parser.parse_args()

    # read raw data
    with fileopen(args.input, "rt") as inp:
        table = pd.read_csv(
            inp, sep='\t',
            names=('Program', 'Program2', 'Threads', 'Dataset', 'Quality',
                   'DurationSecs', 'CPUPct', 'MemoryMB'),
            dtype={'Program': 'category', 'Dataset': 'category'})

    # Compare against None so that an explicit "-t 0" is honoured too
    if args.threads is not None:
        table['Threads'] = args.threads

    measures = ['DurationSecs', 'CPUPct', 'MemoryMB']

    # save table (useful if input was stdin)
    if 'txt' in args.formats:
        table.to_csv(args.output + ".txt", sep="\t", index=False)

    if 'pickle' in args.formats:
        import pickle
        pickle_file = args.output + '.pickle'
        with fileopen(pickle_file, 'wb') as out:
            pickle.dump(table, out)

    # generate latex table
    if 'tex' in args.formats:
        texdat = table.melt(
            id_vars=['Program2', 'Threads', 'Dataset', 'Quality'],
            value_vars=measures)
        # String aliases instead of the builtins min/max, which recent pandas
        # deprecates as aggregation arguments; column labels are unchanged.
        texdat = (
            texdat
            .groupby(['Threads', 'Program2', 'variable'])
            .agg({'value': ['min', 'max']})
            .sort_index())
        texdat.columns = texdat.columns.droplevel()

        from mako.template import Template
        table_template = Template(filename=os.path.join(
            os.path.dirname(__file__), "performance_table.tex"))
        tex_file = args.output + ".tex"
        with fileopen(tex_file, "wt") as o:
            o.write(table_template.render(
                name=args.name, caption=args.caption, table=texdat))

    # generate figure
    if 'svg' in args.formats:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import seaborn as sb
        sb.set(style="whitegrid")
        import numpy as np

        # Profile name -> display name. Programs missing from this mapping
        # become NaN in the plot data.
        display_names = dict(zip(
            ('adapterremoval', 'atropos (adapter)', 'atropos (insert)',
             'seqpurge', 'skewer'),
            ('AdapterRemoval', 'Atropos (adapter)', 'Atropos (insert)',
             'SeqPurge', 'Skewer')))

        svgdat = table.melt(
            id_vars=['Program', 'Threads', 'Dataset', 'Quality'],
            value_vars=measures)
        svgdat['Program'] = svgdat['Program'].map(display_names)
        svgdat['Program'] = svgdat['Program'].astype('category')
        svgdat['Dataset'] = svgdat['Dataset'].astype('category')
        # The category order fixes the left-to-right order of the panels,
        # which the axis labels below rely on.
        svgdat['variable'] = pd.Categorical(
            svgdat['variable'],
            categories=['DurationSecs', 'MemoryMB', 'CPUPct'])

        threads = svgdat.Threads.unique()

        # seaborn renamed factorplot to catplot and later removed the old
        # name; prefer the new one and fall back for old installations.
        facet_plot = getattr(sb, 'catplot', None) or sb.factorplot
        plot = facet_plot(
            x='Threads', y="value", col="variable", hue="Program",
            data=svgdat, kind="bar", sharey=False, estimator=np.mean)

        if len(threads) == 1:
            # A single thread count makes the x axis uninformative
            plot.set_xticklabels('')
            plot.set_xlabels('')
        else:
            plot.set_xlabels('Threads')

        plot.axes[0, 0].set_ylabel('Runtime (sec)')
        plot.axes[0, 1].set_ylabel('Memory (Mb)')
        plot.axes[0, 2].set_ylabel('CPU (%)')
        plot.fig.subplots_adjust(wspace=0.35)
        plot.set_titles('')

        svg_file = args.output + ".svg"
        plot.savefig(svg_file)
def main():
    """Summarise trimming accuracy on simulated data as txt/pickle/LaTeX.

    The input is headerless tab-separated text whose columns are named by the
    module-level ``summary_fields``.

    Command line:
        -i/--input            Summary table to read (default "-").
        -o/--output           Output path prefix (required); ".txt",
                              ".pickle" and ".tex" are appended per format.
        -n/--name             Passed to the LaTeX template as `name`.
        -c/--caption          Passed to the LaTeX template as `caption`.
        -e/--error-rate-file  Tab-separated table with 'Requested' and
                              'Actual' columns; replaces dataset names with
                              the mean actual rate rounded to 3 decimals.
        -t/--tool-name-file   Tab-separated table with 'ProfileName' and
                              'DisplayName' columns; replaces program names.
        -f/--formats          One or more of txt, tex, pickle (default: tex).

    For 'tex', only rows with threads == 4 and qcut == 0 are used. Error
    counts, their fractions of retained reads, and total error rates are
    pivoted into a (dataset, program) x (datalevel, variable) table and
    rendered through "simulated_accuracy_table.tex" next to this script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    # Every format appends a suffix to the prefix, so it cannot be omitted
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-n", "--name", default="table")
    parser.add_argument("-c", "--caption", default="")
    parser.add_argument(
        "-e", "--error-rate-file",
        help="Table generated by adjust_error_profiles.R that maps requested "
             "to actual error rates. Required for latex output.")
    parser.add_argument(
        "-t", "--tool-name-file",
        help="File that maps profile names to display names for tools")
    parser.add_argument(
        "-f", "--formats", choices=('txt', 'tex', 'pickle'), nargs="+",
        default=['tex'])
    args = parser.parse_args()

    with fileopen(args.input, 'rt') as inp:
        table = pd.read_csv(inp, sep="\t", names=summary_fields)

    if 'txt' in args.formats:
        with fileopen(args.output + '.txt', 'wt') as out:
            table.to_csv(out, sep="\t", index=False)

    if 'pickle' in args.formats:
        import pickle
        with fileopen(args.output + '.pickle', 'wb') as out:
            pickle.dump(table, out, protocol=pickle.HIGHEST_PROTOCOL)

    if 'tex' not in args.formats:
        return

    adapter_cols = (
        "non-adapter reads trimmed",
        "adapter reads overtrimmed",
        "total adapter reads undertrimmed")
    adapter_pct_cols = tuple('pct' + col for col in adapter_cols)
    base_cols = ("overtrimmed bases", "undertrimmed bases")
    total_cols = ("reads total error", "bases total error")
    all_cols = adapter_cols + adapter_pct_cols + base_cols + total_cols
    new_cols = (
        "Wrongly Trimmed", "Over-trimmed", "Under-trimmed",
        "Wrongly Trimmed", "Over-trimmed", "Under-trimmed",
        "Over-trimmed", "Under-trimmed",
        "Total Error", "Total Error")
    col_map = dict(zip(all_cols, new_cols))

    # Since we're evaluating adapter trimming accuracy, the number of threads
    # doesn't matter (there's no randomness, so the results should be the same
    # for every run), and we don't want to consider any quality trimming.
    # Copy the selection so the column assignments below act on an
    # independent frame rather than on a slice of `table`.
    textable = table[(table.threads == 4) & (table.qcut == 0)].copy()

    # Add additional columns
    textable["total adapter reads undertrimmed"] = (
        textable["adapter reads untrimmed"] +
        textable["adapter reads undertrimmed"])
    retained = textable['retained reads']
    for adapter_col, adapter_pct_col in zip(adapter_cols, adapter_pct_cols):
        textable[adapter_pct_col] = textable[adapter_col] / retained
    # Select with lists: a tuple in .loc can be read as a multi-level key
    textable["reads total error"] = (
        textable.loc[:, list(adapter_cols)].apply(sum, axis=1) / retained)
    textable["bases total error"] = (
        textable.loc[:, list(base_cols)].apply(sum, axis=1) /
        textable['total ref bases'])

    # Melt into tidy format
    textable = textable.melt(
        id_vars=['dataset', 'program'], value_vars=list(all_cols))

    # Add the "level" - reads/read pct/bases
    def to_level(var):
        if 'pct' in var:
            return 'pct'
        elif 'reads' in var:
            return 'reads'
        else:
            return 'bases'

    textable['datalevel'] = [to_level(var) for var in textable.variable]

    # Replace the variable names with those we want in the final table
    textable = textable.replace({'variable': col_map})

    # Finally, pivot the table into the grouped format we want to use in the
    # latex template and sort
    textable = textable.pivot_table(
        index=['dataset', 'program'],
        columns=['datalevel', 'variable']).sort_index()

    # Drop the unnecessary first column level
    textable.columns = textable.columns.droplevel(0)

    # Replace dataset names with actual error rates
    if args.error_rate_file:
        with open(args.error_rate_file, 'rt') as inp:
            error_rate_table = pd.read_csv(inp, sep="\t")
        error_rate_table = error_rate_table.groupby('Requested').mean()
        datasets = textable.index.levels[0]
        # `level` must be given by keyword: pandas 2 no longer accepts it
        # positionally.
        textable.index = textable.index.set_levels(
            datasets.map(
                lambda x: str(round(error_rate_table.loc[x, 'Actual'], 3))
            ).values,
            level='dataset')

    # Replace tool names with display versions
    if args.tool_name_file:
        with open(args.tool_name_file, 'rt') as inp:
            tool_name_table = pd.read_csv(
                inp, sep="\t", index_col='ProfileName')
        programs = textable.index.levels[1]
        textable.index = textable.index.set_levels(
            programs.map(
                lambda x: tool_name_table.loc[x, 'DisplayName']).values,
            level='program')

    # Now render the template
    from mako import exceptions
    from mako.template import Template
    table_template = Template(filename=os.path.join(
        os.path.dirname(__file__), "simulated_accuracy_table.tex"))
    tex_file = args.output + ".tex"
    with fileopen(tex_file, "wt") as o:
        try:
            o.write(table_template.render(
                name=args.name, caption=args.caption, table=textable))
        except Exception:
            # Best effort: show Mako's own report of where the template
            # failed. A bare `except:` would also trap KeyboardInterrupt and
            # SystemExit.
            print(exceptions.text_error_template().render())
def main():
    """Summarise trimming accuracy on simulated data as txt/pickle/LaTeX.

    The input is headerless tab-separated text whose columns are named by the
    module-level ``summary_fields``.

    Command line:
        -i/--input            Summary table to read (default "-").
        -o/--output           Output path prefix (required); ".txt",
                              ".pickle" and ".tex" are appended per format.
        -n/--name             Passed to the LaTeX template as `name`.
        -c/--caption          Passed to the LaTeX template as `caption`.
        -e/--error-rate-file  Tab-separated table with 'Requested' and
                              'Actual' columns; replaces dataset names with
                              the mean actual rate rounded to 3 decimals.
        -t/--tool-name-file   Tab-separated table with 'ProfileName' and
                              'DisplayName' columns; replaces program names.
        -f/--formats          One or more of txt, tex, pickle (default: tex).

    For 'tex', only rows with threads == 4 and qcut == 0 are used. Error
    counts, their fractions of retained reads, and total error rates are
    pivoted into a (dataset, program) x (datalevel, variable) table and
    rendered through "simulated_accuracy_table.tex" next to this script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    # Every format appends a suffix to the prefix, so it cannot be omitted
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-n", "--name", default="table")
    parser.add_argument("-c", "--caption", default="")
    parser.add_argument(
        "-e", "--error-rate-file",
        help="Table generated by adjust_error_profiles.R that maps requested "
             "to actual error rates. Required for latex output.")
    parser.add_argument(
        "-t", "--tool-name-file",
        help="File that maps profile names to display names for tools")
    parser.add_argument(
        "-f", "--formats", choices=('txt', 'tex', 'pickle'), nargs="+",
        default=['tex'])
    args = parser.parse_args()

    with fileopen(args.input, 'rt') as inp:
        table = pd.read_csv(inp, sep="\t", names=summary_fields)

    if 'txt' in args.formats:
        with fileopen(args.output + '.txt', 'wt') as out:
            table.to_csv(out, sep="\t", index=False)

    if 'pickle' in args.formats:
        import pickle
        with fileopen(args.output + '.pickle', 'wb') as out:
            pickle.dump(table, out, protocol=pickle.HIGHEST_PROTOCOL)

    if 'tex' not in args.formats:
        return

    adapter_cols = (
        "non-adapter reads trimmed",
        "adapter reads overtrimmed",
        "total adapter reads undertrimmed")
    adapter_pct_cols = tuple('pct' + col for col in adapter_cols)
    base_cols = ("overtrimmed bases", "undertrimmed bases")
    total_cols = ("reads total error", "bases total error")
    all_cols = adapter_cols + adapter_pct_cols + base_cols + total_cols
    new_cols = (
        "Wrongly Trimmed", "Over-trimmed", "Under-trimmed",
        "Wrongly Trimmed", "Over-trimmed", "Under-trimmed",
        "Over-trimmed", "Under-trimmed",
        "Total Error", "Total Error")
    col_map = dict(zip(all_cols, new_cols))

    # Since we're evaluating adapter trimming accuracy, the number of threads
    # doesn't matter (there's no randomness, so the results should be the same
    # for every run), and we don't want to consider any quality trimming.
    # Copy the selection so the column assignments below act on an
    # independent frame rather than on a slice of `table`.
    textable = table[(table.threads == 4) & (table.qcut == 0)].copy()

    # Add additional columns
    textable["total adapter reads undertrimmed"] = (
        textable["adapter reads untrimmed"] +
        textable["adapter reads undertrimmed"])
    retained = textable['retained reads']
    for adapter_col, adapter_pct_col in zip(adapter_cols, adapter_pct_cols):
        textable[adapter_pct_col] = textable[adapter_col] / retained
    # Select with lists: a tuple in .loc can be read as a multi-level key
    textable["reads total error"] = (
        textable.loc[:, list(adapter_cols)].apply(sum, axis=1) / retained)
    textable["bases total error"] = (
        textable.loc[:, list(base_cols)].apply(sum, axis=1) /
        textable['total ref bases'])

    # Melt into tidy format
    textable = textable.melt(
        id_vars=['dataset', 'program'], value_vars=list(all_cols))

    # Add the "level" - reads/read pct/bases
    def to_level(var):
        if 'pct' in var:
            return 'pct'
        elif 'reads' in var:
            return 'reads'
        else:
            return 'bases'

    textable['datalevel'] = [to_level(var) for var in textable.variable]

    # Replace the variable names with those we want in the final table
    textable = textable.replace({'variable': col_map})

    # Finally, pivot the table into the grouped format we want to use in the
    # latex template and sort
    textable = textable.pivot_table(
        index=['dataset', 'program'],
        columns=['datalevel', 'variable']).sort_index()

    # Drop the unnecessary first column level
    textable.columns = textable.columns.droplevel(0)

    # Replace dataset names with actual error rates
    if args.error_rate_file:
        with open(args.error_rate_file, 'rt') as inp:
            error_rate_table = pd.read_csv(inp, sep="\t")
        error_rate_table = error_rate_table.groupby('Requested').mean()
        datasets = textable.index.levels[0]
        # `level` must be given by keyword: pandas 2 no longer accepts it
        # positionally.
        textable.index = textable.index.set_levels(
            datasets.map(
                lambda x: str(round(error_rate_table.loc[x, 'Actual'], 3))
            ).values,
            level='dataset')

    # Replace tool names with display versions
    if args.tool_name_file:
        with open(args.tool_name_file, 'rt') as inp:
            tool_name_table = pd.read_csv(
                inp, sep="\t", index_col='ProfileName')
        programs = textable.index.levels[1]
        textable.index = textable.index.set_levels(
            programs.map(
                lambda x: tool_name_table.loc[x, 'DisplayName']).values,
            level='program')

    # Now render the template
    from mako import exceptions
    from mako.template import Template
    table_template = Template(filename=os.path.join(
        os.path.dirname(__file__), "simulated_accuracy_table.tex"))
    tex_file = args.output + ".tex"
    with fileopen(tex_file, "wt") as o:
        try:
            o.write(table_template.render(
                name=args.name, caption=args.caption, table=textable))
        except Exception:
            # Best effort: show Mako's own report of where the template
            # failed. A bare `except:` would also trap KeyboardInterrupt and
            # SystemExit.
            print(exceptions.text_error_template().render())