Ejemplo n.º 1
0
def main(align_path, output_path, flank_size, direction, seed, randomise, step,
         dry_run, force_overwrite):
    """Export tab delimited counts table from alignment centred on SNP position.

    Output file is written to the same path with just the file suffix changed
    from fasta to txt."""
    # Arguments:
    #   align_path / output_path: made absolute before use
    #   flank_size, direction, step, randomise: passed on to align_to_counts
    #   seed: if falsy, the current time (as a string) is used instead
    #   dry_run: when true nothing is written and nothing is logged here
    #   force_overwrite: when false, pre-existing outputs raise ValueError

    # capture the arguments as received, before any of them is rebound, so
    # the run log records what the caller supplied
    args = locals()
    if not seed:
        seed = str(time.time())

    align_path = abspath(align_path)
    output_path = abspath(output_path)

    counts_filename = get_counts_filename(align_path, output_path)
    # swap only the trailing extension; str.replace(".txt", ".log") would
    # also rewrite a ".txt" that happens to occur in a directory name
    runlog_path = os.path.splitext(counts_filename)[0] + ".log"

    if not dry_run:
        outputs_exist = (os.path.exists(counts_filename)
                         or os.path.exists(runlog_path))
        if outputs_exist and not force_overwrite:
            msg = "Either %s or %s already exist. Force overwrite of existing"\
                  " files with -F."
            raise ValueError(msg % (counts_filename, runlog_path))

        makedirs(output_path)

        # attach the log file only once we know we are allowed to write and
        # the output directory exists; a dry run never sets it
        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')
        LOGGER.input_file(align_path, label="align_path")
        LOGGER.log_message(str(seed), label="random_number_seed")

    start_time = time.time()

    # run the program
    counts_table = align_to_counts(align_path, output_path, flank_size,
                                   direction, step, seed, randomise, dry_run)

    if not dry_run:
        counts_table.write(counts_filename, sep='\t')
        LOGGER.output_file(counts_filename)

    # determine runtime, reported in minutes
    duration = time.time() - start_time
    if not dry_run:
        LOGGER.log_message("%.2f" % (duration / 60.),
                           label="run duration (minutes)")
Ejemplo n.º 2
0
def align_to_counts(align_path, output_path, flank_size, direction, step, seed,
                    randomise, dry_run):
    """Return a motif counts table derived from a fasta alignment.

    The sequences at align_path are loaded and passed through just_nucs,
    converted to an observed and a control profile, and the two profiles
    are turned into a single counts table sorted on the 'mut' column.

    direction is a string such as 'AtoC'; only the part before 'to' (the
    starting base) is used here. When randomise is true the observed
    profile is itself drawn with profile.get_control instead of
    profile.get_profiles. output_path is only created, and only when
    dry_run is false.
    """
    if not dry_run:
        makedirs(output_path)

    print("Deriving counts from sequence file")
    step = int(step)

    # the starting base is whatever precedes 'to' in the direction string
    chosen_base = direction.split('to')[0]

    alignment = load_from_fasta(os.path.abspath(align_path))
    seqs = just_nucs(alignment.array_seqs)

    # keyword arguments shared by every profile call below
    sampling = dict(chosen_base=chosen_base,
                    step=step,
                    flank_size=flank_size,
                    seed=seed)

    if randomise:
        LOGGER.log_message("A randomised selection of starting base "
                           "locations use for observed counts.")
        # we are setting a randomised set of locations as our observed SNPs
        # NOTE(review): both draws receive the same seed -- confirm that
        # get_control does not return identical samples in that case
        ctl = profile.get_control(seqs, **sampling)
        orig = profile.get_control(seqs, **sampling)
    else:
        orig, ctl = profile.get_profiles(seqs, **sampling)

    # convert profiles to a motif count table
    observed = motif_count.profile_to_seq_counts(orig, flank_size=flank_size)
    control = motif_count.profile_to_seq_counts(ctl, flank_size=flank_size)
    table = motif_count.get_count_table(observed, control, flank_size * 2)
    return table.sorted(columns='mut')
Ejemplo n.º 3
0
    def test_aln_to_counts(self):
        """exercising aln_to_counts"""
        # start from an empty output directory
        if os.path.exists(self.dirname):
            shutil.rmtree(self.dirname)

        makedirs(self.dirname)
        try:
            runner = CliRunner()
            # the command is expected to succeed (exit code 0) and to leave
            # exactly a counts table and its run log in the output directory
            r = runner.invoke(aln_to_counts_main,
                              ["-adata/sample_AtoC.fasta",
                               "-o%s" % self.dirname,
                               "-f1", "--direction=AtoC", "-S111", "-F"])
            dirlist = os.listdir(self.dirname)
            self.assertEqual(r.exit_code, 0)
            self.assertEqual(set(dirlist),
                             {"sample_AtoC.txt", "sample_AtoC.log"})
            counts = LoadTable(os.path.join(
                self.dirname, "sample_AtoC.txt"), sep="\t")
            # two columns with pos, two groups giving shape=2*16
            self.assertEqual(counts.shape[0], 32)
        finally:
            # clean up even when one of the assertions above fails
            shutil.rmtree(self.dirname)
Ejemplo n.º 4
0
def grid(fig_config, figpath, format, no_type3):
    """draws an arbitrary shaped grid of mutation motifs based on fig_config"""
    # fig_config is read for the grid shape, labels, data paths and axis
    # settings; the data behind each path is converted to logo data, drawn
    # into one shared layout, and the result is written to figpath

    args = locals()
    if no_type3:
        util.exclude_type3_fonts()

    if not figpath:
        # no explicit target: write next to the config file
        dirname = os.path.dirname(fig_config.name)
        figpath = os.path.join(dirname, "drawn_array.%s" % format)
        log_file_path = os.path.join(dirname, "drawn_array.log")
    else:
        figpath = util.abspath(figpath)
        # swap only the final extension for ".log"; splitting on every "."
        # truncates paths containing dotted directory names and collapses
        # to a bare ".log" when figpath has no extension at all
        log_file_path = os.path.splitext(figpath)[0] + ".log"

    util.makedirs(os.path.dirname(figpath))
    LOGGER.log_file_path = log_file_path
    LOGGER.log_message(str(args), label='vars')

    ncols, nrows, figsize, col_labels, row_labels, paths, axis_cfg = \
        read_plot_array_config(fig_config)
    print("ncols:", ncols)
    print("nrows:", nrows)
    print("figsize:", figsize)
    print("col_labels:", col_labels)
    print("row_labels:", row_labels)
    print("paths:", paths)
    print("axis_cfg:", axis_cfg)

    #TODO: Convert below into Cogent3 Plotly

    layout = UnionDict(shapes=[])
    # largest estimated y limit over all panels; used when the config does
    # not provide "ylim"
    adaptive_y = 0
    plottable = {}
    for coord, path in paths.items():
        data = util.load_loglin_stats(path)
        positions = sorted(data)
        heights, characters, indices = get_plot_data(data, positions)
        adaptive_y = max(adaptive_y, logo.est_ylim(heights))
        plottable[coord] = dict(char_heights=heights,
                                characters=characters,
                                position_indices=indices)

    ylim = axis_cfg.get("ylim", adaptive_y)
    for coord, kwargs in plottable.items():
        kwargs["ax"] = coord
        kwargs["ylim"] = ylim
        r = logo.draw_multi_position_cogent3(**kwargs)
        for key in r:
            if key == "shapes":
                # shapes from every panel accumulate in one list
                layout.shapes.extend(r.shapes)
            else:
                layout[key] = r[key]

    # give each column an equal share of the horizontal domain; the first
    # axis is named "xaxis", later ones "xaxis2", "xaxis3", ...
    col_width = 1 / ncols
    for i in range(ncols):
        xaxis = "xaxis" if i == 0 else "xaxis%d" % (i + 1)
        left = i * col_width
        layout[xaxis]["domain"] = [0.0 + left, left + col_width]

    print(layout)
    MARGININCHES = 0
    PPI = 100
    fig = Drawable(layout=layout,
                   width=(figsize[0] - MARGININCHES) * PPI,
                   height=(figsize[1] - MARGININCHES) * PPI)

    #export
    fig.write(path=figpath)
    click.secho("Wrote Cogent3 %s" % figpath, fg="green")
    """
Ejemplo n.º 5
0
def main(countsfile, outpath, countsfile2, strand_symmetry, force_overwrite,
         dry_run, verbose):
    """Compare mutation spectra between groups, one start base at a time.

    With strand_symmetry set the grouping label is 'strand' and only
    countsfile is read; otherwise countsfile (group '1') is compared with
    countsfile2 (group '2'). Unless dry_run is set, the per start base
    statistics are written to spectra_analysis.json, a summary table to
    spectra_summary.txt and a run log to spectra_analysis.log, all inside
    outpath.
    """
    # NOTE(review): force_overwrite is accepted but never consulted here
    args = locals()

    table = LoadTable(countsfile, sep='\t')
    if not dry_run:
        LOGGER.log_file_path = os.path.join(util.abspath(outpath),
                                            'spectra_analysis.log')
        LOGGER.log_message(str(args), label='vars')

    LOGGER.input_file(countsfile)
    if strand_symmetry:
        # a single file suffices: its 'strand' labelling defines the groups
        group_label = 'strand'
        counts_table = util.spectra_table(table, group_label)
    else:
        group_label = 'group'

        # the second file supplies the other group
        second = LoadTable(countsfile2, sep='\t')
        LOGGER.input_file(countsfile2)
        second = second.with_new_column('group', lambda x: '2',
                                        columns=second.header[0])
        first = table.with_new_column('group', lambda x: '1',
                                      columns=table.header[0])

        first = util.spectra_table(first, group_label)
        second = util.spectra_table(second, group_label)

        # stack both groups into one table with 'group' as leading column
        header = ['group'] + second.header[:-1]
        combined_rows = first.tolist(header) + second.tolist(header)
        counts_table = LoadTable(header=header, rows=combined_rows)

        if verbose:
            print(counts_table)

    # spectra table has [count, start, end, group] order
    # we reduce comparisons to a start base
    kept_columns = [c for c in counts_table.header if c != 'start']
    # file names replace the '1' / '2' group codes in the reported rows
    grp_labels = {'1': countsfile,
                  '2': countsfile2}
    results = []
    saveable = {}
    for start_base in counts_table.distinct_values('start'):
        subtable = counts_table.filtered('start == "%s"' % start_base)
        subtable = subtable.get_columns(kept_columns)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label)
        rows = [list(rec) for rec in collated.to_records(index=False)]

        if not strand_symmetry:
            grp_index = list(collated.columns).index('group')
            for row in rows:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        # very small probabilities are shown in scientific notation
        prob = "%.2e" % p if p < 1e-6 else "%.6f" % p

        for row in rows:
            row.insert(0, start_base)
            row.append(prob)

        results.extend(rows)

        significance = ["RE=%.6f" % total_re, "Dev=%.2f" % dev, "df=%d" % df,
                        "p=%s" % p]

        print("Start base=%s  %s" % (start_base, "  :  ".join(significance)))
        saveable[start_base] = dict(rel_entropy=total_re, deviance=dev,
                                    df=df, prob=p,
                                    formula=formula, stats=collated.to_json())

    # column names come from the statistics of the last start base processed
    summary_header = ['start_base'] + list(collated.columns) + ['prob']
    table = LoadTable(header=summary_header, rows=results,
                      digits=5).sorted(columns='ret')

    outpath = util.abspath(outpath)
    if not dry_run:
        util.makedirs(outpath)
        json_path = os.path.join(outpath, 'spectra_analysis.json')
        dump_json(saveable, json_path)
        LOGGER.output_file(json_path)
        table_path = os.path.join(outpath, 'spectra_summary.txt')
        table.write(table_path, sep='\t')
        LOGGER.output_file(table_path)
        # NOTE(review): only the last start base's significance is logged
        # here -- confirm that is intended
        LOGGER.log_message(str(significance), label="significance")
Ejemplo n.º 6
0
def main(counts_pattern, output_path, strand_symmetric, split_dir, dry_run,
         force_overwrite):
    """export tab delimited combined counts table by appending the 12 mutation
    direction tables, adding a new column ``direction``."""
    # Files matching counts_pattern are stacked into combined_counts.txt in
    # output_path, with a run log alongside. split_dir is honoured only
    # together with strand_symmetric; in that case one table per direction
    # is also written there. Nothing is written or logged on a dry run.
    args = locals()
    output_path = abspath(output_path)
    split_dir = abspath(split_dir) if strand_symmetric and split_dir else None

    # check the glob pattern produces the correct number of files
    counts_files = glob.glob(counts_pattern)
    check_found_filenames(counts_files)

    counts_filename = os.path.join(output_path, 'combined_counts.txt')
    runlog_path = os.path.join(output_path, "combined_counts.log")

    if not dry_run:
        outputs_exist = (os.path.exists(counts_filename)
                         or os.path.exists(runlog_path))
        if outputs_exist and not force_overwrite:
            msg = "Either %s or %s already exist. Force overwrite of "\
                  "existing files with -F."
            raise ValueError(msg % (counts_filename, runlog_path))

        makedirs(output_path)
        if split_dir:
            makedirs(split_dir)

        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')
        for fn in counts_files:
            LOGGER.input_file(fn, label="count_file")

    start_time = time.time()

    # run the program
    combined_rows = []
    header = None
    basenames = []
    for fn in counts_files:
        basenames.append(os.path.basename(fn))
        # the mutation direction is extracted from the file name
        mutation = direction.findall(fn)[0]
        table = LoadTable(fn, sep='\t')
        if header is None:
            # the first file fixes the column order for the combined table
            header = list(table.header) + ['direction']

        for row in table.tolist():
            row.append(mutation)
            combined_rows.append(row)

    table = LoadTable(header=header, rows=combined_rows)

    if strand_symmetric:
        table = make_strand_symmetric_table(table)

    if split_dir:
        group_subtables = get_subtables(table, group_label='direction')

    if not dry_run:
        table.write(counts_filename, sep='\t')
        LOGGER.output_file(counts_filename)

        if split_dir:
            for group, subtable in group_subtables:
                # we first assume that group is part of the filenames!
                # reuse the input name when exactly one matches, otherwise
                # fall back to "<group>.txt"
                matches = [bn for bn in basenames if group in bn]
                name = matches[0] if len(matches) == 1 else "%s.txt" % group

                split_path = os.path.join(split_dir, name)
                subtable.write(split_path, sep='\t')
                LOGGER.output_file(split_path)

    # determine runtime, reported in minutes
    duration = time.time() - start_time
    if not dry_run:
        LOGGER.log_message("%.2f" % (duration / 60.),
                           label="run duration (minutes)")

    print("Done!")
Ejemplo n.º 7
0
def grid(fig_config, figpath, format, no_type3):
    """draws an arbitrary shaped grid of mutation motifs based on fig_config"""
    # fig_config is read for the grid shape, labels, data paths and axis
    # settings; one logo is drawn per configured grid coordinate and the
    # finished figure is saved to figpath

    args = locals()
    if no_type3:
        util.exclude_type3_fonts()

    if not figpath:
        # no explicit target: write next to the config file
        dirname = os.path.dirname(fig_config.name)
        figpath = os.path.join(dirname, "drawn_array.%s" % format)
        log_file_path = os.path.join(dirname, "drawn_array.log")
    else:
        figpath = util.abspath(figpath)
        # swap only the final extension for ".log"; splitting on every "."
        # truncates paths containing dotted directory names and collapses
        # to a bare ".log" when figpath has no extension at all
        log_file_path = os.path.splitext(figpath)[0] + ".log"

    util.makedirs(os.path.dirname(figpath))
    LOGGER.log_file_path = log_file_path
    LOGGER.log_message(str(args), label='vars')

    ncols, nrows, figsize, col_labels, row_labels, paths, axis_cfg = \
        read_plot_array_config(fig_config)

    # squeeze=False always yields a 2D (nrows, ncols) array of axes, so
    # axes[row, col] indexing works for single-row, single-column and 1x1
    # grids without any reshaping
    fig, axes = pyplot.subplots(nrows=nrows, ncols=ncols, figsize=figsize,
                                sharex=True, sharey=True, squeeze=False)
    figwidth = fig.get_figwidth()

    # largest estimated y limit over all panels; used when the config does
    # not provide "ylim"
    adaptive_y = 0
    plottable = {}
    for coord, path in paths.items():
        data = util.load_loglin_stats(path)
        positions = sorted(data)
        heights, characters, indices = get_plot_data(data, positions)
        adaptive_y = max(adaptive_y, logo.est_ylim(heights))
        plottable[coord] = dict(char_heights=heights,
                                characters=characters,
                                position_indices=indices,
                                figwidth=figwidth,
                                verbose=False)

    ylim = axis_cfg.get("ylim", adaptive_y)
    for coord, kwargs in plottable.items():
        kwargs["ax"] = axes[coord]
        kwargs["ylim"] = ylim
        fig = logo.draw_multi_position(**kwargs)

    # tick label formatting function, wrapped once per axis below
    float_fmt = format_float(1e-3, float_places=2)

    # column titles go on the top row, position labels on the bottom row
    for col in range(ncols):
        top_ax = axes[0, col]
        top_ax.set_title(col_labels[col], fontsize=axis_cfg["xlabel_fontsize"],
                         weight="bold", y=1.1)
        btm_ax = axes[-1, col]
        for xticklabel in btm_ax.get_xticklabels():
            xticklabel.set_fontsize(axis_cfg["xtick_fontsize"])
            xticklabel.set_rotation(0)
        btm_ax.set_xlabel("Position", fontsize=axis_cfg["xlabel_fontsize"],
                          weight="bold")
        btm_ax.xaxis.labelpad = axis_cfg['xlabel_pad']

    # row labels and y tick formatting go on the left-most column
    for row in range(nrows):
        lft_ax = axes[row, 0]
        for yticklabel in lft_ax.get_yticklabels():
            yticklabel.set_fontsize(axis_cfg["ytick_fontsize"])
            yticklabel.set_rotation(0)

        lft_ax.yaxis.set_major_formatter(FuncFormatter(float_fmt))
        lft_ax.yaxis.labelpad = axis_cfg['ylabel_pad']
        lft_ax.set_ylabel(row_labels[row], rotation=0,
                          fontsize=axis_cfg['ylabel_fontsize'],
                          weight="bold")

    fig.tight_layout()
    fig.savefig(figpath)
    click.secho("Wrote %s" % figpath, fg="green")
Ejemplo n.º 8
0
def run(input_path, output_path, direction, prefix, chrom_class, gc_class,
        freq_class, adjust_strand, limit, force_overwrite, dry_run, verbose):
    """Write the records of input_path that pass the selected filters.

    freq_class ('All', 'Common', 'Rare'), gc_class ('All', 'Hi', 'Lo') and
    chrom_class ('All', 'A', 'X') each select one filter callable; any other
    value raises KeyError. Output and run log are placed in output_path and
    named from prefix, the three classes and direction. When limit is
    truthy, writing stops once that many records have been written.

    Raises IOError if input_path does not exist and ValueError if the
    output or log file already exists and force_overwrite is false.
    """
    # NOTE(review): dry_run only suppresses creation of output_path; the
    # output file and log are still written below -- confirm that is intended
    if not dry_run:
        makedirs(output_path)

    correct_freq = {
        'All': everything,
        'Common': MakeFreqCompare(MAF, ge=True, get_freq=min, verbose=verbose),
        'Rare': MakeFreqCompare(MAF, ge=False, get_freq=min, verbose=verbose)
    }[freq_class]

    correct_comp = {
        'All': everything,
        'Hi': MakeFreqCompare(0.5,
                              ge=True,
                              get_freq=get_gc_freq,
                              verbose=verbose),
        'Lo': MakeFreqCompare(0.4,
                              ge=False,
                              get_freq=get_gc_freq,
                              verbose=verbose)
    }[gc_class]

    correct_chrom = {
        'All': everything,
        'A': is_autosome,
        'X': is_xchrom
    }[chrom_class]

    seen = set()
    chroms = set()
    if not os.path.exists(input_path):
        raise IOError("no files matching %s" % input_path)

    name_components = dict(freq_class='freq_' + freq_class,
                           chrom_class='chrom_' + chrom_class,
                           gc_class='GC_' + gc_class,
                           direction=direction,
                           prefix=prefix or '')

    # output and log share one stem and differ only in their suffix
    stem = ('%(prefix)s%(freq_class)s-%(chrom_class)s-%(gc_class)s'
            '-%(direction)s' % name_components)
    outfilename = os.path.join(output_path, stem + '.fasta.gz')
    runlog_path = os.path.join(output_path, stem + '.log')

    if not force_overwrite and (os.path.exists(outfilename)
                                or os.path.exists(runlog_path)):
        msg = "Either %s or %s already exist. Force overwrite of existing files with -F."
        raise ValueError(msg % (outfilename, runlog_path))

    # attach the log file only after the overwrite check, so the check looks
    # at the state of the directory before this run touched it
    LOGGER.log_file_path = runlog_path
    LOGGER.input_file(input_path)

    direction = tuple(direction.split('to'))

    with open_(input_path) as infile:
        with open_(outfilename, 'w') as outfile:
            num = 0
            for record in filtered_records(infile,
                                           direction,
                                           seen,
                                           chroms,
                                           correct_chrom=correct_chrom,
                                           correct_freq=correct_freq,
                                           correct_comp=correct_comp,
                                           stranded=adjust_strand,
                                           verbose=False):
                outfile.write(record)
                num += 1
                if limit and num >= limit:
                    break

        # the output file has been closed by the time it is registered
        LOGGER.output_file(outfilename)
    msg = "Wrote %d records to %s" % (num, outfilename)
    print(msg)
    LOGGER.log_message(msg + "\n", label="completed")
Ejemplo n.º 9
0
def nbr(countsfile, outpath, countsfile2, first_order, strand_symmetry,
        group_label, group_ref, plot_cfg, no_type3, format, verbose, dry_run):
    '''log-linear analysis of neighbouring base influence on point mutation

    Writes estimated statistics, figures and a run log to the specified
    directory outpath.

    See documentation for count table format requirements.
    '''
    if no_type3:
        util.exclude_type3_fonts()

    # capture the arguments as received so the run log can record them
    args = locals()

    outpath = util.abspath(outpath)

    if not dry_run:
        util.makedirs(outpath)
        runlog_path = os.path.join(outpath, "analysis.log")
        LOGGER.log_file_path = runlog_path
        LOGGER.log_message(str(args), label='vars')

    counts_filename = util.abspath(countsfile)
    counts_table = util.load_table_from_delimited_file(counts_filename,
                                                       sep='\t')

    LOGGER.input_file(counts_filename, label="countsfile1_path")

    # position columns are those whose name starts with 'pos'
    positions = [c for c in counts_table.header if c.startswith('pos')]
    if not first_order and len(positions) != 4:
        raise ValueError("Requires four positions for analysis")

    # normalise empty values to None
    group_label = group_label or None
    group_ref = group_ref or None
    if strand_symmetry:
        group_label = 'strand'
        group_ref = group_ref or '+'
        if group_label not in counts_table.header:
            print("ERROR: no column named 'strand', exiting.")
            # raise SystemExit directly: the bare exit() helper is installed
            # by the site module for interactive use and may be unavailable
            raise SystemExit(-1)

    if countsfile2:
        print("Performing 2 group analysis")
        group_label = group_label or 'group'
        group_ref = group_ref or '1'
        # rows from the first file are tagged '1', the second file '2'
        counts_table1 = counts_table.with_new_column(group_label,
                                                     lambda x: '1',
                                                     columns=counts_table.header[0])

        fn2 = util.abspath(countsfile2)
        counts_table2 = util.load_table_from_delimited_file(fn2, sep='\t')

        LOGGER.input_file(fn2, label="countsfile2_path")

        counts_table2 = counts_table2.with_new_column(group_label,
                                                      lambda x: '2',
                                                      columns=counts_table2.header[0])
        # now combine, with the group column leading
        header = [group_label] + counts_table2.header[:-1]
        raw1 = counts_table1.tolist(header)
        raw2 = counts_table2.tolist(header)
        counts_table = make_table(header=header, rows=raw1 + raw2)

        if not dry_run:
            outfile = os.path.join(outpath, 'group_counts_table.txt')
            counts_table.write(outfile, sep='\t')
            LOGGER.output_file(outfile, label="group_counts")

    if dry_run or verbose:
        print()
        print(counts_table)
        print()

    plot_config = util.get_plot_configs(cfg_path=plot_cfg)

    msg = single_group(counts_table, outpath, group_label, group_ref,
                       positions, plot_config, first_order,
                       dry_run)
    print(msg)