Example #1
0
def main(countsfile, outpath, countsfile2, strand_symmetry, force_overwrite,
         dry_run, verbose):
    """Log-linear comparison of mutation spectra between two groups.

    With ``strand_symmetry`` set, the counts in ``countsfile`` are grouped by
    their ``strand`` column. Otherwise ``countsfile`` (group '1') is compared
    with ``countsfile2`` (group '2'). One test is done for each distinct value
    in the ``start`` column of the spectra table, and its statistics are
    printed.

    Unless ``dry_run`` is set, a run log, ``spectra_analysis.json`` and
    ``spectra_summary.txt`` are written to ``outpath``.

    ``force_overwrite`` is accepted but not consulted by this function.

    Raises
    ------
    ValueError
        If ``strand_symmetry`` is false and ``countsfile2`` was not given, or
        if the spectra table has no ``start`` values to analyse.
    """
    args = locals()

    if not strand_symmetry and not countsfile2:
        # fail before any I/O rather than somewhere inside the table loader
        raise ValueError("a second counts file is required unless doing a "
                         "strand symmetry analysis")

    table = LoadTable(countsfile, sep='\t')
    if not dry_run:
        log_file_path = os.path.join(util.abspath(outpath),
                                     'spectra_analysis.log')
        LOGGER.log_file_path = log_file_path
        LOGGER.log_message(str(args), label='vars')

    LOGGER.input_file(countsfile)
    if strand_symmetry:
        # a strand symmetry analysis does not need a second file
        group_label = 'strand'
        counts_table = util.spectra_table(table, group_label)
    else:
        group_label = 'group'

        counts_table2 = LoadTable(countsfile2, sep='\t')
        LOGGER.input_file(countsfile2)
        counts_table2 = counts_table2.with_new_column(
            'group', lambda x: '2', columns=counts_table2.header[0])
        counts_table1 = table.with_new_column(
            'group', lambda x: '1', columns=table.header[0])

        counts_table1 = util.spectra_table(counts_table1, group_label)
        counts_table2 = util.spectra_table(counts_table2, group_label)

        # now combine
        header = ['group'] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = LoadTable(header=header, rows=raw1 + raw2)

        if verbose:
            print(counts_table)

    # spectra table has [count, start, end, group] order
    # we reduce comparisons to a start base
    # neither of these change between start bases, so build them once
    columns = [c for c in counts_table.header if c != 'start']
    grp_labels = {'1': countsfile, '2': countsfile2}

    results = []
    saveable = {}
    significance = {}
    collated = None
    for start_base in counts_table.distinct_values('start'):
        subtable = counts_table.filtered('start == "%s"' % start_base)
        subtable = subtable.get_columns(columns)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label)
        rows = [list(x) for x in collated.to_records(index=False)]

        if not strand_symmetry:
            # report the originating file names rather than '1' / '2'
            grp_index = list(collated.columns).index('group')
            for row in rows:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        prob = "%.2e" % p if p < 1e-6 else "%.6f" % p

        for row in rows:
            row.insert(0, start_base)
            row.append(prob)

        results += rows

        stats = ["RE=%.6f" % total_re, "Dev=%.2f" % dev, "df=%d" % df,
                 "p=%s" % p]
        print("Start base=%s  %s" % (start_base, "  :  ".join(stats)))

        # keep every start base's result so that all of them get logged
        significance[start_base] = stats
        saveable[start_base] = dict(rel_entropy=total_re, deviance=dev,
                                    df=df, prob=p,
                                    formula=formula, stats=collated.to_json())

    if collated is None:
        # nothing was analysed, so there is no column layout for the summary
        raise ValueError("no 'start' values to analyse in %s" % countsfile)

    table = LoadTable(header=['start_base'] + list(collated.columns) +
                      ['prob'],
                      rows=results, digits=5).sorted(columns='ret')

    outpath = util.abspath(outpath)
    if not dry_run:
        util.makedirs(outpath)
        json_path = os.path.join(outpath, 'spectra_analysis.json')
        dump_json(saveable, json_path)
        LOGGER.output_file(json_path)
        table_path = os.path.join(outpath, 'spectra_summary.txt')
        table.write(table_path, sep='\t')
        LOGGER.output_file(table_path)
        LOGGER.log_message(str(significance), label="significance")
Example #2
0
def main(counts_pattern, output_path, strand_symmetric, split_dir, dry_run,
         force_overwrite):
    """Merge the per-direction counts tables into one tab delimited table.

    Every file matching ``counts_pattern`` is loaded and its rows are
    appended to a combined table, with the mutation direction parsed from
    the file name recorded in a new ``direction`` column. With
    ``strand_symmetric`` the combined table is converted to its strand
    symmetric form, and if ``split_dir`` is also given one table per
    direction is written there. Nothing is written when ``dry_run`` is set.

    Raises ValueError if an output already exists and ``force_overwrite``
    is not set.
    """
    args = locals()
    output_path = abspath(output_path)
    # split output only applies to the strand symmetric case
    split_dir = abspath(split_dir) if strand_symmetric and split_dir else None

    # check the glob pattern produces the correct number of files
    counts_files = glob.glob(counts_pattern)
    check_found_filenames(counts_files)

    counts_filename = os.path.join(output_path, 'combined_counts.txt')
    runlog_path = os.path.join(output_path, "combined_counts.log")

    if not dry_run:
        if not force_overwrite:
            if (os.path.exists(counts_filename)
                    or os.path.exists(runlog_path)):
                msg = "Either %s or %s already exist. Force overwrite of "\
                      "existing files with -F."
                raise ValueError(msg % (counts_filename, runlog_path))

        makedirs(output_path)
        if split_dir:
            makedirs(split_dir)

        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')
        for path in counts_files:
            LOGGER.input_file(path, label="count_file")

    start_time = time.time()

    # run the program
    combined_rows = []
    header = None
    num_rows = 0  # recorded from the first table; not used further here
    basenames = []
    for path in counts_files:
        basenames.append(os.path.basename(path))
        mutation = direction.findall(path)[0]
        table = LoadTable(path, sep='\t')
        if header is None:
            # the first table defines the combined header
            header = list(table.header)
            header.append('direction')
            num_rows = table.shape[0]

        for row in table.tolist():
            row.append(mutation)
            combined_rows.append(row)

    table = LoadTable(header=header, rows=combined_rows)

    if strand_symmetric:
        table = make_strand_symmetric_table(table)

    if split_dir:
        group_subtables = get_subtables(table, group_label='direction')

    if not dry_run:
        table.write(counts_filename, sep='\t')
        LOGGER.output_file(counts_filename)

        if split_dir:
            for group, subtable in group_subtables:
                # we first assume that group is part of the filenames!
                matches = [bn for bn in basenames if group in bn]
                if len(matches) == 1:
                    name = matches[0]
                else:
                    name = "%s.txt" % group

                split_path = os.path.join(split_dir, name)
                subtable.write(split_path, sep='\t')
                LOGGER.output_file(split_path)

    # determine runtime
    duration = time.time() - start_time
    if not dry_run:
        LOGGER.log_message("%.2f" % (duration / 60.),
                           label="run duration (minutes)")

    print("Done!")
def nbr(countsfile, outpath, countsfile2, first_order, strand_symmetry,
        group_label, group_ref, plot_cfg, no_type3, format, verbose, dry_run):
    '''log-linear analysis of neighbouring base influence on point mutation

    Writes estimated statistics, figures and a run log to the specified
    directory outpath.

    See documentation for count table format requirements.

    The counts in ``countsfile`` are analysed on their own, or, when
    ``countsfile2`` is given, as group '1' against group '2' (the combined
    table is then also written to ``group_counts_table.txt``). With
    ``strand_symmetry`` the grouping column is ``strand`` and the reference
    group defaults to '+'. Nothing is written when ``dry_run`` is set.

    ``format`` is accepted but not consulted by this function.

    Raises ValueError if ``first_order`` is false and the table does not have
    exactly four columns whose names start with 'pos'. Exits with status -1 if
    ``strand_symmetry`` is set and the table has no ``strand`` column.
    '''
    if no_type3:
        util.exclude_type3_fonts()

    args = locals()

    outpath = util.abspath(outpath)

    if not dry_run:
        util.makedirs(outpath)
        runlog_path = os.path.join(outpath, "analysis.log")
        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')

    counts_filename = util.abspath(countsfile)
    counts_table = util.load_table_from_delimited_file(counts_filename,
                                                       sep='\t')

    LOGGER.input_file(counts_filename, label="countsfile1_path")

    positions = [c for c in counts_table.header if c.startswith('pos')]
    if not first_order and len(positions) != 4:
        raise ValueError("Requires four positions for analysis")

    # normalise empty values to None
    group_label = group_label or None
    group_ref = group_ref or None
    if strand_symmetry:
        group_label = 'strand'
        group_ref = group_ref or '+'
        if group_label not in counts_table.header:
            print("ERROR: no column named 'strand', exiting.")
            # the bare exit() helper comes from the site module and is not
            # guaranteed to exist; raising SystemExit has the same effect
            raise SystemExit(-1)

    if countsfile2:
        print("Performing 2 group analysis")
        group_label = group_label or 'group'
        group_ref = group_ref or '1'
        counts_table1 = counts_table.with_new_column(
            group_label, lambda x: '1', columns=counts_table.header[0])

        fn2 = util.abspath(countsfile2)
        counts_table2 = util.load_table_from_delimited_file(fn2, sep='\t')

        LOGGER.input_file(fn2, label="countsfile2_path")

        counts_table2 = counts_table2.with_new_column(
            group_label, lambda x: '2', columns=counts_table2.header[0])
        # now combine
        header = [group_label] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = LoadTable(header=header, rows=raw1 + raw2)

        if not dry_run:
            outfile = os.path.join(outpath, 'group_counts_table.txt')
            counts_table.write(outfile, sep='\t')
            LOGGER.output_file(outfile, label="group_counts")

    if dry_run or verbose:
        print()
        print(counts_table)
        print()

    plot_config = util.get_plot_configs(cfg_path=plot_cfg)

    msg = single_group(counts_table, outpath, group_label, group_ref,
                       positions, plot_config, first_order,
                       dry_run)
    print(msg)
Example #4
0
def collate(base_path, output_path, exclude_paths, overwrite):
    """collates all classifier performance stats and writes
    to a single tsv file

    Files matching ``*performance.json*`` are searched for under
    ``base_path``. Those whose path matches one of the comma separated
    ``exclude_paths`` are skipped. One row per remaining file is written to
    ``collated.tsv.gz`` in ``output_path``, and a table of summary statistics
    grouped by model factors is written to ``summary_statistics.tsv.gz``.

    Exits with status 0, doing nothing, if ``collated.tsv.gz`` already exists
    and ``overwrite`` is not set.
    """
    LOGGER.log_args()
    # imported below log_args() in case that call records the local names
    # of this function
    import shlex

    outpath = os.path.join(output_path, "collated.tsv.gz")
    logfile_path = os.path.join(output_path, "collated.log")
    if os.path.exists(outpath) and not overwrite:
        click.secho(f"Skipping. {outpath} exists. "
                    "Use overwrite to force.",
                    fg='green')
        # the bare exit() helper comes from the site module and is not
        # guaranteed to exist; raising SystemExit has the same effect
        raise SystemExit(0)

    # quote the path so spaces or shell metacharacters in it cannot break,
    # or inject into, the command
    quoted_base = shlex.quote(str(base_path))
    stat_fns = exec_command(f'find {quoted_base} -name "*performance.json*"')
    stat_fns = stat_fns.splitlines()
    if not stat_fns:
        msg = f'No files matching "*performance.json*" in {base_path}'
        click.secho(msg, fg='red')
        return

    LOGGER.log_file_path = logfile_path

    records = []
    keys = set()
    exclude_paths = [] if exclude_paths is None else exclude_paths.split(',')
    num_skipped = 0
    for fn in tqdm(stat_fns, ncols=80):
        if skip_path(exclude_paths, fn):
            num_skipped += 1
            LOGGER.log_message(fn, label="SKIPPED FILE")
            continue

        LOGGER.input_file(fn)
        data = load_json(fn)
        labels = data['classification_report']['labels']
        fscores = data['classification_report']['f-score']
        row = {
            "stat_path": fn,
            "classifier_path": data["classifier_path"],
            "auc": data["auc"],
            "algorithm": data["classifier_label"],
            "mean_precision": data["mean_precision"],
            f"fscore({labels[0]})": fscores[0],
            f"fscore({labels[1]})": fscores[1],
            'balanced_accuracy': data['balanced_accuracy']
        }
        row.update(data["feature_params"])
        keys.update(row.keys())
        records.append(row)

    # records may have different keys, so missing values become None
    columns = sorted(keys)
    rows = [[record.get(c, None) for c in columns] for record in records]
    table = LoadTable(header=columns, rows=rows)
    table = table.sorted(reverse="auc")
    table = table.with_new_column(
        "name",
        lambda x: model_name_from_features(*x),
        columns=["flank_size", "feature_dim", "usegc", "proximal"])
    table = table.with_new_column("size",
                                  sample_size_from_path,
                                  columns="classifier_path")
    table.write(outpath)
    LOGGER.output_file(outpath)

    # make summary statistics via grouping by factors
    factors = [
        "algorithm", "name", "flank_size", "feature_dim", "proximal", "usegc",
        "size"
    ]
    summary = summary_stat_table(table, factors=factors)
    outpath = os.path.join(output_path, "summary_statistics.tsv.gz")
    summary.write(outpath)
    LOGGER.output_file(outpath)
    if num_skipped:
        click.secho("Skipped %d files that matched exclude_paths" %
                    num_skipped,
                    fg='red')
def _dump_order_stats(results, outpath, order):
    """Write the statistics for effects of the given order to
    ``<outpath>/<order>.json`` and record the file in the run log."""
    outfilename = os.path.join(outpath, "%d.json" % order)
    util.dump_loglin_stats(results, outfilename)
    LOGGER.output_file(outfilename, label="analysis%d" % order)


def _save_order_figure(fig, outpath, order):
    """Save fig as ``<outpath>/<order>.pdf`` and then clear it."""
    outfilename = os.path.join(outpath, "%d.pdf" % order)
    fig.savefig(outfilename, bbox_inches='tight')
    print("Wrote", outfilename)
    fig.clf()  # refresh for next section


def _offset_fontsize(plot_config, section):
    """Return 80% of the section's y-tick font size, truncated to an int."""
    return int(plot_config.get(section, 'ytick_fontsize') * .8)


def _label_multiway_fig(fig, plot_config, section):
    """Set the width of fig and add the figure-level 'Position' and 'RE'
    labels using the settings from the named plot_config section."""
    fig.set_figwidth(plot_config.get(section, 'figwidth'))
    x_fsz = plot_config.get(section, 'xlabel_fontsize')
    y_fsz = plot_config.get(section, 'ylabel_fontsize')
    fig.text(0.5, plot_config.get(section, 'xlabel_pad'), 'Position',
             ha='center', va='center', fontsize=x_fsz)
    fig.text(plot_config.get(section, 'ylabel_pad'), 0.5, 'RE',
             ha='center', va='center', rotation='vertical', fontsize=y_fsz)


def _make_summary_table(rows, outpath, dry_run):
    """Return the summary table built from rows, writing it to
    ``<outpath>/summary.txt`` unless dry_run."""
    summary = LoadTable(header=['Position', 'RE', 'Deviance', 'df',
                                'prob', 'formula'],
                        rows=rows, digits=2, space=2)
    if not dry_run:
        outfilename = os.path.join(outpath, "summary.txt")
        summary.write(outfilename, sep='\t')
        LOGGER.output_file(outfilename, label="summary")

    return summary


def single_group(counts_table, outpath, group_label, group_ref, positions,
                 plot_config, first_order, dry_run):
    """Run the position effect analyses and produce their outputs.

    Single position effects are always analysed. Unless ``first_order`` is
    set, two, three and four position effects are analysed as well, followed
    by a bar plot of the maximum relative entropy for each effect order.

    Unless ``dry_run`` is set, each analysis writes ``<order>.json`` and
    ``<order>.pdf`` to ``outpath``, and a ``summary.txt`` table (plus
    ``summary.pdf`` for the full analysis) is written at the end.

    Returns a message naming the directory holding the results.
    """
    # Collect statistical analysis results
    summary = []
    max_results = {}

    # Single position analysis
    print("Doing single position analysis")
    single_results = single_position_effects(counts_table, positions,
                                             group_label=group_label)
    summary += make_summary(single_results)

    max_results[1] = max(single_results[p]['rel_entropy']
                         for p in single_results)
    if not dry_run:
        _dump_order_stats(single_results, outpath, 1)

    section = '1-way plot'
    fig = get_single_position_fig(
        single_results, positions,
        plot_config.get(section, 'figsize'),
        group_label=group_label,
        group_ref=group_ref,
        figwidth=plot_config.get(section, 'figwidth'),
        xlabel_fontsize=plot_config.get(section, 'xlabel_fontsize'),
        ylabel_fontsize=plot_config.get(section, 'ylabel_fontsize'),
        xtick_fontsize=plot_config.get(section, 'xtick_fontsize'),
        ytick_fontsize=plot_config.get(section, 'ytick_fontsize'))

    format_offset(fig, _offset_fontsize(plot_config, section))
    if not dry_run:
        _save_order_figure(fig, outpath, 1)

    if first_order:
        _make_summary_table(summary, outpath, dry_run)
        # release the figure, as is done at the end of the full analysis
        pyplot.close('all')
        return "Done! Check %s for your results" % outpath

    # the two and three position analyses differ only in the functions used
    multiway = (
        (2, "two", get_two_position_effects, get_two_position_fig),
        (3, "three", get_three_position_effects, get_three_position_fig),
    )
    for order, word, get_effects, get_fig in multiway:
        print("Doing %s positions analysis" % word)
        results = get_effects(counts_table, positions,
                              group_label=group_label)
        summary += make_summary(results)

        max_results[order] = max(results[p]['rel_entropy'] for p in results)
        if not dry_run:
            _dump_order_stats(results, outpath, order)

        section = '%d-way plot' % order
        fig = get_fig(results, positions,
                      plot_config.get(section, 'figsize'),
                      group_label=group_label, group_ref=group_ref,
                      xtick_fontsize=plot_config.get(section,
                                                     'xtick_fontsize'),
                      ytick_fontsize=plot_config.get(section,
                                                     'ytick_fontsize'))
        _label_multiway_fig(fig, plot_config, section)
        format_offset(fig, _offset_fontsize(plot_config, section))
        if not dry_run:
            _save_order_figure(fig, outpath, order)

    print("Doing four positions analysis")
    results = get_four_position_effects(counts_table, positions,
                                        group_label=group_label)
    summary += make_summary(results)

    max_results[4] = max(results[p]['rel_entropy'] for p in results)
    if not dry_run:
        _dump_order_stats(results, outpath, 4)

    section = '4-way plot'
    fig = get_four_position_fig(results, positions,
                                plot_config.get(section, 'figsize'),
                                group_label=group_label, group_ref=group_ref)
    fig.set_figwidth(plot_config.get(section, 'figwidth'))
    # this figure is labelled via its current axes, not figure-level text
    ax = fig.gca()
    x_fsz = plot_config.get(section, 'xlabel_fontsize')
    y_fsz = plot_config.get(section, 'ylabel_fontsize')
    ax.set_xlabel('Position', fontsize=x_fsz)
    ax.set_ylabel('RE', fontsize=y_fsz)
    format_offset(fig, _offset_fontsize(plot_config, section))
    if not dry_run:
        _save_order_figure(fig, outpath, 4)

    # now generate summary plot
    section = 'summary plot'
    bar_width = 0.5
    index = numpy.arange(4)
    y_lim = max(max_results.values())
    y_fmt = util.FixedOrderFormatter(numpy.floor(numpy.log10(y_lim)))

    fig = pyplot.figure(figsize=plot_config.get(section, 'figsize'))
    ax = fig.gca()
    ax.yaxis.set_major_formatter(y_fmt)

    pyplot.bar(index, [max_results[i] for i in range(1, 5)], bar_width)
    pyplot.xticks(index + (bar_width / 2.), list(range(1, 5)),
                  fontsize=plot_config.get(section, 'xtick_fontsize'))
    x_sz = plot_config.get(section, 'xlabel_fontsize')
    y_sz = plot_config.get(section, 'ylabel_fontsize')
    ax.set_xlabel("Effect Order", fontsize=x_sz)
    ax.set_ylabel("RE$_{max}$", fontsize=y_sz)

    x_sz = plot_config.get(section, 'xtick_fontsize')
    y_sz = plot_config.get(section, 'ytick_fontsize')
    ax.tick_params(axis='x', labelsize=x_sz, pad=x_sz // 2, length=0)
    ax.tick_params(axis='y', labelsize=y_sz, pad=y_sz // 2)
    format_offset(fig, _offset_fontsize(plot_config, section))
    if not dry_run:
        outfilename = os.path.join(outpath, "summary.pdf")
        pyplot.savefig(outfilename, bbox_inches='tight')
        print("Wrote", outfilename)

    summary = _make_summary_table(summary, outpath, dry_run)

    print(summary)
    pyplot.close('all')
    msg = "Done! Check %s for your results" % outpath
    return msg