Ejemplo n.º 1
0
def main():
    """Find the sacct record with the largest MaxRSS and emit it.

    Reads a pipe-delimited accounting table, locates the data row whose
    'MaxRSS' column parses to the strictly largest size, and writes a
    tab-separated header line followed by the profile fields and that row.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-p", "--profile", nargs='+')
    args = parser.parse_args()

    # First profile token is split into its component fields; an optional
    # second token is appended verbatim.
    fields = list(parse_profile(args.profile[0]))
    if len(args.profile) > 1:
        fields.append(args.profile[1])

    with fileopen(args.input, 'rt') as handle:
        records = list(csv.reader(handle, delimiter="|"))

    assert len(records) >= 2
    header = records[0]
    rss_idx = header.index('MaxRSS')

    # Track the record with the strictly largest parsed MaxRSS value
    # (first one wins on ties, matching a strict '>' comparison).
    best_size = 0
    best_record = None
    for record in records[1:]:
        size = parse_size(record[rss_idx])
        if size > best_size:
            best_size, best_record = size, record

    with fileopen(args.output, 'wt') as out:
        print('prog', 'prog2', 'threads', 'dataset', 'qcut', *header,
              sep="\t", file=out)
        print(*fields, *best_record, sep="\t", file=out)
Ejemplo n.º 2
0
def main():
    """Parse a GNU time(1)-style resource report into one profile row.

    Extracts CPU percentage, wall-clock duration (in seconds) and peak RSS
    (in megabytes) from a fixed-format report and writes them, prefixed by
    the parsed profile fields, as a single tab-separated line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-p", "--profile")
    args = parser.parse_args()

    prog, prog2, threads, dataset, qcut = parse_profile(args.profile)

    with fileopen(args.input, 'rt') as i:
        lines = [line.strip() for line in i.readlines()]

    # Report line 4: "Percent of CPU this job got: NN%"
    cpu = lines[3]
    cpu_match = re.match(r"Percent of CPU this job got: (\d+)%", cpu)
    assert cpu_match is not None
    cpu_pct = float(cpu_match.group(1))

    # Report line 5: wall-clock time, either "1h 2m 3.45s" or "[h:]mm:ss.ss".
    wc_time = lines[4]
    wc_match_prefix = r"Elapsed \(wall clock\) time \(h:mm:ss or m:ss\): "
    wc_match = re.match(wc_match_prefix + r"(?:(\d+)h )?(\d+)m ([\d\.]+)s",
                        wc_time)
    if wc_match is None:
        wc_match = re.match(wc_match_prefix + r"(?:(\d+):)?(\d+):([\d\.]+)",
                            wc_time)
    assert wc_match is not None
    hrs = int(wc_match.group(1) or 0)  # hours group is optional
    mins = int(wc_match.group(2))
    secs = float(wc_match.group(3))
    # (The original also built a formatted 'duration' string here, but it
    # was never used; only the total in seconds is emitted.)
    duration_secs = (hrs * 3600) + (mins * 60) + secs

    # Report line 10: peak resident set size, reported in kbytes.
    memory = lines[9]
    memory_match = re.match(r"Maximum resident set size \(kbytes\): (\d+)",
                            memory)
    assert memory_match is not None
    memory_mbytes = int(memory_match.group(1)) / 1000

    with fileopen(args.output, "wt") as out:
        print(prog,
              prog2,
              threads,
              dataset,
              qcut,
              duration_secs,
              cpu_pct,
              memory_mbytes,
              sep="\t",
              file=out)
Ejemplo n.º 3
0
def main():
    """Read a concatenated metrics table and render the requested metrics.

    The input interleaves repeated header lines with data rows (headers on
    even record indices after the first, data on odd); each repeated header
    is validated against the first, and an empty line terminates the input.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-m", "--metric", nargs='*', default=None)
    parser.add_argument(
        "-f", "--formats",
        choices=('txt', 'tex', 'pickle', 'svg'), nargs='*', default=['txt'])
    args = parser.parse_args()

    header = None
    rows = []

    with fileopen(args.input, 'rt') as inp:
        for i, line in enumerate(csv.reader(inp, delimiter="\t")):
            if i == 0:
                header = line
            elif i % 2 == 0:
                # Even records repeat the header; an empty record ends input.
                if len(line) == 0:
                    break
                else:
                    assert header == line
            else:
                rows.append(line)

    metrics = Metrics(
        header, rows, args.output, args.formats, os.path.dirname(__file__))

    # --metric defaults to None; guard so omitting the flag is a no-op
    # instead of a TypeError from iterating over None.
    for metric in args.metric or []:
        name, column = metric.split('=')
        metrics.show(name, column)
Ejemplo n.º 4
0
def main():
    """Parse a GNU time(1)-style resource report into one profile row.

    Extracts CPU percentage, wall-clock duration (in seconds) and peak RSS
    (in megabytes) and writes them, prefixed by the parsed profile fields,
    as a single tab-separated line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-p", "--profile")
    args = parser.parse_args()

    prog, prog2, threads, dataset, qcut = parse_profile(args.profile)

    with fileopen(args.input, 'rt') as i:
        lines = [line.strip() for line in i.readlines()]

    # Report line 4: "Percent of CPU this job got: NN%"
    cpu = lines[3]
    cpu_match = re.match(r"Percent of CPU this job got: (\d+)%", cpu)
    assert cpu_match is not None
    cpu_pct = float(cpu_match.group(1))

    # Report line 5: wall-clock time, either "1h 2m 3.45s" or "[h:]mm:ss.ss".
    wc_time = lines[4]
    wc_match_prefix = r"Elapsed \(wall clock\) time \(h:mm:ss or m:ss\): "
    wc_match = re.match(wc_match_prefix + r"(?:(\d+)h )?(\d+)m ([\d\.]+)s", wc_time)
    if wc_match is None:
        wc_match = re.match(wc_match_prefix + r"(?:(\d+):)?(\d+):([\d\.]+)", wc_time)
    assert wc_match is not None
    hrs = int(wc_match.group(1) or 0)  # hours group is optional
    mins = int(wc_match.group(2))
    secs = float(wc_match.group(3))
    # (The original also built a formatted 'duration' string here, but it
    # was never used; only the total in seconds is emitted.)
    duration_secs = (hrs * 3600) + (mins * 60) + secs

    # Report line 10: peak resident set size, reported in kbytes.
    memory = lines[9]
    memory_match = re.match(r"Maximum resident set size \(kbytes\): (\d+)", memory)
    assert memory_match is not None
    memory_mbytes = int(memory_match.group(1)) / 1000

    with fileopen(args.output, "wt") as out:
        print(prog, prog2, threads, dataset, qcut, duration_secs,
              cpu_pct, memory_mbytes, sep="\t", file=out)
Ejemplo n.º 5
0
def main():
    """Summarize host hardware from /proc-style meminfo + cpuinfo text.

    Counts CPUs per model-name line, captures the first MemTotal value,
    and writes one tab-separated line: profile fields, total CPU count,
    memory string, and per-model CPU counts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-p", "--profile", nargs='+')
    args = parser.parse_args()

    profile = list(parse_profile(args.profile[0]))
    if len(args.profile) > 1:
        profile.append(args.profile[1])

    with fileopen(args.input, 'rt') as i:
        lines = [line.strip() for line in i.readlines()]

    # Raw strings: "\s"/"\d" in plain literals are invalid escape sequences
    # (DeprecationWarning today, a syntax error in future Python versions).
    mem_matcher = re.compile(r"MemTotal:\s+(\d+ .*)")
    mem = None

    cpu_matcher = re.compile(r"model name\s*:\s*(.*)")
    cpus = defaultdict(int)

    for line in lines:
        # Only the first MemTotal line is captured.
        if mem is None:
            mem_match = mem_matcher.match(line)
            if mem_match:
                mem = mem_match.group(1)
                continue

        cpu_match = cpu_matcher.match(line)
        if cpu_match:
            cpus[cpu_match.group(1)] += 1

    total_cpus = sum(cpus.values())
    cpu_str = "; ".join(
        "{} {}".format(count, cpu) for cpu, count in cpus.items())

    with fileopen(args.output, "wt") as out:
        print(*profile, total_cpus, mem, cpu_str, sep="\t", file=out)
Ejemplo n.º 6
0
 def show(self, name, column, **kwargs):
     """Render the table for *column* to every configured output format.

     'txt' and 'pickle' are handled generically; any other format is
     dispatched to a method on this object named '<name>_<fmt>'.
     """
     table = self._get_table(column)
     prefix = "{}.{}".format(self.output, name)

     for fmt in self.formats:
         outfile = "{}.{}".format(prefix, fmt)
         if fmt == 'txt':
             table.to_csv(outfile, sep="\t", index=False)
             continue
         if fmt == 'pickle':
             import pickle
             with fileopen(outfile, 'wb') as out:
                 pickle.dump(table, out)
             continue
         # Fall back to a format-specific renderer, e.g. mem_tex.
         renderer = getattr(self, "{}_{}".format(name, fmt))
         renderer(table, column, outfile)
Ejemplo n.º 7
0
 def mem_tex(self, table, column, outfile, name=None, caption=None):
     """Render a per-(Dataset, Threads, Program) peak-memory LaTeX table.

     Args:
         table: DataFrame with 'Program', 'Program2', 'Dataset', 'Threads'
             and the memory column to summarize.
         column: Name of the column holding memory values in bytes.
         outfile: Path of the .tex file to write.
         name: Optional table name passed through to the mako template.
         caption: Optional caption passed through to the mako template.
     """
     texdat = (
         table
         # The positional `axis` argument to DataFrame.drop was deprecated
         # and removed in pandas 2.0; use the explicit `columns=` keyword.
         .drop(columns='Program')
         .rename(columns={
             'Program2': 'Program',
             column: 'Memory'
         })
         .groupby(['Dataset', 'Threads', 'Program'])
         .agg({'Memory': 'max'}))
     # Convert bytes to megabytes, rounded to one decimal place.
     texdat = texdat.assign(MemoryMB=round(texdat['Memory'] / 1000000, 1))

     from mako.template import Template
     table_template = Template(filename=os.path.join(
         self.template_path, "job_memory_table.tex"))
     with fileopen(outfile, "wt") as o:
         o.write(table_template.render(
             name=name, caption=caption, table=texdat))
Ejemplo n.º 8
0
def main():
    """Summarize per-tool performance measurements into table/figure outputs.

    Expects tab-separated input rows of (Program, Program2, Threads,
    Dataset, Quality, DurationSecs, CPUPct, MemoryMB); each requested
    format is written to '<output>.<ext>'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-n", "--name", default="table")
    parser.add_argument("-c", "--caption", default="")
    parser.add_argument(
        "-t",
        "--threads",
        type=int,
        default=None,
        help="Set all rows to have the same value for the Threads column.")
    parser.add_argument("-f",
                        "--formats",
                        choices=('txt', 'tex', 'svg', 'pickle'),
                        nargs='*',
                        default=['tex', 'svg'])
    args = parser.parse_args()

    # read raw data
    with fileopen(args.input, "rt") as inp:
        table = pd.read_csv(inp,
                            sep='\t',
                            names=('Program', 'Program2', 'Threads', 'Dataset',
                                   'Quality', 'DurationSecs', 'CPUPct',
                                   'MemoryMB'),
                            dtype={
                                'Program': 'category',
                                'Dataset': 'category'
                            })

    # NOTE(review): a 0 passed via --threads is falsy and would be ignored
    # here; presumably thread counts are always >= 1 — confirm.
    if args.threads:
        table.Threads = args.threads

    # save table (useful if input was stdin)

    if 'txt' in args.formats:
        table.to_csv(args.output + ".txt", sep="\t", index=False)

    if 'pickle' in args.formats:
        import pickle
        pickle_file = args.output + '.pickle'
        with fileopen(pickle_file, 'wb') as out:
            pickle.dump(table, out)

    # generate latex table
    if 'tex' in args.formats:
        # Melt to long format (one row per program/threads/dataset/quality/
        # metric), then summarize min and max per (threads, program, metric).
        texdat = table.melt(
            id_vars=['Program2', 'Threads', 'Dataset', 'Quality'],
            value_vars=['DurationSecs', 'CPUPct', 'MemoryMB'])
        texdat = (texdat.groupby(['Threads', 'Program2', 'variable']).agg({
            'value': [min, max]
        }).sort_index())
        # Drop the redundant top-level 'value' column level left by agg().
        texdat.columns = texdat.columns.droplevel()
        from mako.template import Template
        table_template = Template(filename=os.path.join(
            os.path.dirname(__file__), "performance_table.tex"))
        tex_file = args.output + ".tex"
        with fileopen(tex_file, "wt") as o:
            o.write(
                table_template.render(name=args.name,
                                      caption=args.caption,
                                      table=texdat))

    # generate figure
    if 'svg' in args.formats:
        import matplotlib
        # Non-interactive backend; must be selected before importing pyplot.
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import seaborn as sb
        sb.set(style="whitegrid")
        import numpy as np

        # (raw name, display name, color) triple per trimming tool.
        progs = list(
            zip(('adapterremoval', 'atropos (adapter)', 'atropos (insert)',
                 'seqpurge', 'skewer'),
                ('AdapterRemoval', 'Atropos (adapter)', 'Atropos (insert)',
                 'SeqPurge', 'Skewer'), sb.color_palette(n_colors=5)))
        # NOTE(review): 'pal' (display name -> color) is built but never
        # passed to the plot below — confirm whether it was meant to be
        # supplied as a palette.
        pal = dict((p[1], p[2]) for p in progs)

        svgdat = table.melt(
            id_vars=['Program', 'Threads', 'Dataset', 'Quality'],
            value_vars=['DurationSecs', 'CPUPct', 'MemoryMB'])
        # Map raw program names to their display names.
        svgdat['Program'] = svgdat['Program'].map(
            dict((p[0], p[1]) for p in progs))
        svgdat['Program'] = svgdat['Program'].astype('category')
        svgdat['Dataset'] = svgdat['Dataset'].astype('category')
        # Fix the facet order of the three metrics.
        svgdat['variable'] = pd.Categorical(
            svgdat['variable'],
            categories=['DurationSecs', 'MemoryMB', 'CPUPct'])

        threads = svgdat.Threads.unique()
        plot = sb.factorplot(x='Threads',
                             y="value",
                             col="variable",
                             hue="Program",
                             data=svgdat,
                             kind="bar",
                             sharey=False,
                             estimator=np.mean)
        # With a single thread count the x axis carries no information.
        if len(threads) == 1:
            plot.set_xticklabels('')
            plot.set_xlabels('')
        else:
            plot.set_xlabels('Threads')
        plot.axes[0, 0].set_ylabel('Runtime (sec)')
        plot.axes[0, 1].set_ylabel('Memory (Mb)')
        plot.axes[0, 2].set_ylabel('CPU (%)')
        plot.fig.subplots_adjust(wspace=0.35)
        plot.set_titles('')
        svg_file = args.output + ".svg"
        plot.savefig(svg_file)
Ejemplo n.º 9
0
def main():
    """Summarize simulated-read trimming accuracy into txt/pickle/tex output.

    Reads a tab-separated summary table, optionally remaps dataset names to
    actual error rates and tool names to display names, and renders a
    grouped LaTeX accuracy table via a mako template.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output")
    parser.add_argument("-n", "--name", default="table")
    parser.add_argument("-c", "--caption", default="")
    parser.add_argument(
        "-e", "--error-rate-file",
        # Trailing space added: the original implicit concatenation produced
        # "requestedto actual" in the help text.
        help="Table generated by adjust_error_profiles.R that maps requested "
             "to actual error rates. Required for latex output.")
    parser.add_argument(
        "-t", "--tool-name-file",
        help="File that maps profile names to display names for tools")
    parser.add_argument(
        "-f", "--formats", choices=('txt', 'tex', 'pickle'), nargs="+",
        default=['tex'])
    args = parser.parse_args()

    with fileopen(args.input, 'rt') as inp:
        table = pd.read_csv(inp, sep="\t", names=summary_fields)

    if 'txt' in args.formats:
        with fileopen(args.output + '.txt', 'wt') as out:
            table.to_csv(out, sep="\t", index=False)

    if 'pickle' in args.formats:
        import pickle
        with fileopen(args.output + '.pickle', 'wb') as out:
            pickle.dump(table, out, protocol=pickle.HIGHEST_PROTOCOL)

    if 'tex' in args.formats:
        import numpy as np

        adapter_cols = (
            "non-adapter reads trimmed",
            "adapter reads overtrimmed",
            "total adapter reads undertrimmed")
        adapter_pct_cols = tuple('pct' + col for col in adapter_cols)
        base_cols = (
            "overtrimmed bases",
            "undertrimmed bases")
        total_cols = (
            "reads total error",
            "bases total error")
        all_cols = adapter_cols + adapter_pct_cols + base_cols + total_cols
        new_cols = (
            "Wrongly Trimmed",
            "Over-trimmed",
            "Under-trimmed",
            "Wrongly Trimmed",
            "Over-trimmed",
            "Under-trimmed",
            "Over-trimmed",
            "Under-trimmed",
            "Total Error",
            "Total Error"
        )
        col_map = dict(zip(all_cols, new_cols))

        # Since we're evaluating adapter trimming accuracy, the number of
        # threads doesn't matter (there's no randomness, so the results
        # should be the same for every run), and we don't want to consider
        # any quality trimming.
        # .copy() so the derived-column assignments below modify an
        # independent frame rather than a view of `table` (avoids pandas
        # SettingWithCopyWarning / silently lost writes).
        textable = table[(table.threads==4) & (table.qcut==0)].copy()
        # Add additional columns
        textable["total adapter reads undertrimmed"] = (
            textable["adapter reads untrimmed"] +
            textable["adapter reads undertrimmed"])
        for adapter_col, adapter_pct_col in zip(adapter_cols, adapter_pct_cols):
            textable[adapter_pct_col] = textable[adapter_col] / textable['retained reads']
        textable["reads total error"] = textable.loc[:,adapter_cols].apply(sum, 1) / textable['retained reads']
        textable["bases total error"] = textable.loc[:,base_cols].apply(sum, 1) / textable['total ref bases']
        # Melt into tidy format
        textable = textable.melt(id_vars=['dataset', 'program'], value_vars=all_cols)
        # Add the "level" - reads/read pct/bases
        def to_level(var):
            if 'pct' in var:
                return 'pct'
            elif 'reads' in var:
                return 'reads'
            else:
                return 'bases'
        textable['datalevel'] = list(to_level(var) for var in textable.variable)
        # Replace the variable names with those we want in the final table
        textable = textable.replace({ 'variable' : col_map })
        # Finally, pivot the table into the grouped format we want to use in the latex template and sort
        textable = textable.pivot_table(index=['dataset', 'program'], columns=['datalevel', 'variable']).sort_index()
        # Drop the unnecessary first column level
        textable.columns = textable.columns.droplevel(0)

        # Replace dataset names with actual error rates
        if args.error_rate_file:
            with open(args.error_rate_file, 'rt') as inp:
                error_rate_table = pd.read_csv(inp, sep="\t")
            error_rate_table = error_rate_table.groupby('Requested').agg(np.mean)
            datasets = textable.index.levels[0]
            textable.index = textable.index.set_levels(
                datasets.map(lambda x: str(round(error_rate_table.loc[x,'Actual'], 3))).values,
                'dataset')

        # Replace tool names with display versions
        if args.tool_name_file:
            with open(args.tool_name_file, 'rt') as inp:
                tool_name_table = pd.read_csv(inp, sep="\t", index_col='ProfileName')
            programs = textable.index.levels[1]
            textable.index = textable.index.set_levels(
                programs.map(lambda x: tool_name_table.loc[x, 'DisplayName']).values,
                'program')

        # Now render the template
        from mako import exceptions
        from mako.template import Template
        table_template = Template(filename=os.path.join(
            os.path.dirname(__file__), "simulated_accuracy_table.tex"))
        tex_file = args.output + ".tex"
        with fileopen(tex_file, "wt") as o:
            # Catch Exception rather than a bare except so KeyboardInterrupt
            # and SystemExit still propagate; mako template failures are
            # rendered to stdout for debugging.
            try:
                o.write(table_template.render(
                    name=args.name, caption=args.caption, table=textable))
            except Exception:
                print(exceptions.text_error_template().render())
def main():
    """Summarize simulated-read trimming accuracy into txt/pickle/tex output.

    Reads a tab-separated summary table, optionally remaps dataset names to
    actual error rates and tool names to display names, and renders a
    grouped LaTeX accuracy table via a mako template.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", default="-")
    parser.add_argument("-o", "--output")
    parser.add_argument("-n", "--name", default="table")
    parser.add_argument("-c", "--caption", default="")
    parser.add_argument(
        "-e",
        "--error-rate-file",
        # Trailing space added: the original implicit concatenation produced
        # "requestedto actual" in the help text.
        help="Table generated by adjust_error_profiles.R that maps requested "
        "to actual error rates. Required for latex output.")
    parser.add_argument(
        "-t",
        "--tool-name-file",
        help="File that maps profile names to display names for tools")
    parser.add_argument("-f",
                        "--formats",
                        choices=('txt', 'tex', 'pickle'),
                        nargs="+",
                        default=['tex'])
    args = parser.parse_args()

    with fileopen(args.input, 'rt') as inp:
        table = pd.read_csv(inp, sep="\t", names=summary_fields)

    if 'txt' in args.formats:
        with fileopen(args.output + '.txt', 'wt') as out:
            table.to_csv(out, sep="\t", index=False)

    if 'pickle' in args.formats:
        import pickle
        with fileopen(args.output + '.pickle', 'wb') as out:
            pickle.dump(table, out, protocol=pickle.HIGHEST_PROTOCOL)

    if 'tex' in args.formats:
        import numpy as np

        adapter_cols = ("non-adapter reads trimmed",
                        "adapter reads overtrimmed",
                        "total adapter reads undertrimmed")
        adapter_pct_cols = tuple('pct' + col for col in adapter_cols)
        base_cols = ("overtrimmed bases", "undertrimmed bases")
        total_cols = ("reads total error", "bases total error")
        all_cols = adapter_cols + adapter_pct_cols + base_cols + total_cols
        new_cols = ("Wrongly Trimmed", "Over-trimmed", "Under-trimmed",
                    "Wrongly Trimmed", "Over-trimmed", "Under-trimmed",
                    "Over-trimmed", "Under-trimmed", "Total Error",
                    "Total Error")
        col_map = dict(zip(all_cols, new_cols))

        # Since we're evaluating adapter trimming accuracy, the number of
        # threads doesn't matter (there's no randomness, so the results
        # should be the same for every run), and we don't want to consider
        # any quality trimming.
        # .copy() so the derived-column assignments below modify an
        # independent frame rather than a view of `table` (avoids pandas
        # SettingWithCopyWarning / silently lost writes).
        textable = table[(table.threads == 4) & (table.qcut == 0)].copy()
        # Add additional columns
        textable["total adapter reads undertrimmed"] = (
            textable["adapter reads untrimmed"] +
            textable["adapter reads undertrimmed"])
        for adapter_col, adapter_pct_col in zip(adapter_cols,
                                                adapter_pct_cols):
            textable[adapter_pct_col] = textable[adapter_col] / textable[
                'retained reads']
        textable["reads total error"] = textable.loc[:, adapter_cols].apply(
            sum, 1) / textable['retained reads']
        textable["bases total error"] = textable.loc[:, base_cols].apply(
            sum, 1) / textable['total ref bases']
        # Melt into tidy format
        textable = textable.melt(id_vars=['dataset', 'program'],
                                 value_vars=all_cols)

        # Add the "level" - reads/read pct/bases
        def to_level(var):
            if 'pct' in var:
                return 'pct'
            elif 'reads' in var:
                return 'reads'
            else:
                return 'bases'

        textable['datalevel'] = list(
            to_level(var) for var in textable.variable)
        # Replace the variable names with those we want in the final table
        textable = textable.replace({'variable': col_map})
        # Finally, pivot the table into the grouped format we want to use in the latex template and sort
        textable = textable.pivot_table(index=['dataset', 'program'],
                                        columns=['datalevel',
                                                 'variable']).sort_index()
        # Drop the unnecessary first column level
        textable.columns = textable.columns.droplevel(0)

        # Replace dataset names with actual error rates
        if args.error_rate_file:
            with open(args.error_rate_file, 'rt') as inp:
                error_rate_table = pd.read_csv(inp, sep="\t")
            error_rate_table = error_rate_table.groupby('Requested').agg(
                np.mean)
            datasets = textable.index.levels[0]
            textable.index = textable.index.set_levels(
                datasets.map(lambda x: str(
                    round(error_rate_table.loc[x, 'Actual'], 3))).values,
                'dataset')

        # Replace tool names with display versions
        if args.tool_name_file:
            with open(args.tool_name_file, 'rt') as inp:
                tool_name_table = pd.read_csv(inp,
                                              sep="\t",
                                              index_col='ProfileName')
            programs = textable.index.levels[1]
            textable.index = textable.index.set_levels(
                programs.map(
                    lambda x: tool_name_table.loc[x, 'DisplayName']).values,
                'program')

        # Now render the template
        from mako import exceptions
        from mako.template import Template
        table_template = Template(filename=os.path.join(
            os.path.dirname(__file__), "simulated_accuracy_table.tex"))
        tex_file = args.output + ".tex"
        with fileopen(tex_file, "wt") as o:
            # Catch Exception rather than a bare except so KeyboardInterrupt
            # and SystemExit still propagate; mako template failures are
            # rendered to stdout for debugging.
            try:
                o.write(
                    table_template.render(name=args.name,
                                          caption=args.caption,
                                          table=textable))
            except Exception:
                print(exceptions.text_error_template().render())