Example #1
0
def main():
    """Choose which gaps to delete and record them in a loophash workspace."""
    import docopt
    cli_args = docopt.docopt(__doc__)

    # Setup the workspaces: derive the deletions workspace from the MSA one.
    blast_ws, msa_ws = MsaWorkspace.from_path(cli_args['<msa_workspace>'])
    dels_ws = LoophashWorkspace(msa_ws)

    # With --force, throw away any cached results before continuing.
    if cli_args['--force']:
        dels_ws.rmdir()
        dels_ws.mkdir()

    # Reuse previously computed deletions when they exist; otherwise work
    # them out from the weighted MSA.
    if dels_ws.deletions_hdf5.exists():
        deletions = pd.read_hdf(dels_ws.deletions_hdf5)
        gap_filters = GapFilters.from_toml(dels_ws.filters_toml)
    else:
        weighted_msa = load_weighted_msa(msa_ws)
        del_scores = calc_deletion_scores(weighted_msa)
        gaps, gap_filters = load_spannable_gaps(dels_ws, weighted_msa)
        deletions = choose_gaps_to_delete(del_scores, gaps, gap_filters)

    # Summarize, then persist the chosen deletions.
    print(gap_filters)
    print(deletions.describe())
    print()

    dels_ws.write_deletions(deletions, gap_filters)
    dels_ws.write_metadata()
Example #2
0
def main():
    """Choose deletions by thresholding per-residue deletion scores."""
    import docopt
    cli_args = docopt.docopt(__doc__)

    # Setup the workspaces: derive the deletions workspace from the MSA one.
    blast_ws, msa_ws = MsaWorkspace.from_path(cli_args['<msa_workspace>'])
    dels_ws = DeletionsWorkspace(msa_ws, 'threshold')

    # With --force, throw away any cached results before continuing.
    if cli_args['--force']:
        dels_ws.rmdir()
        dels_ws.mkdir()

    # Reuse previously computed deletions when they exist; otherwise work
    # them out from the weighted MSA.
    if dels_ws.deletions_hdf5.exists():
        deletions = pd.read_hdf(dels_ws.deletions_hdf5)
    else:
        weighted_msa = load_weighted_msa(msa_ws)
        del_scores = calc_deletion_scores(weighted_msa)
        deletions = choose_deletions_via_thresholds(del_scores)

    # Record the results.
    print(f"Chose {len(deletions)} deletions:\n")
    print(deletions.describe())

    dels_ws.write_deletions(deletions)
    dels_ws.write_metadata()
def main():
    """Plot the MSA %-identity histogram and the per-residue deletion scores."""
    # Parse the command-line arguments.
    import docopt
    args = docopt.docopt(__doc__)
    work_homs, work_msa = MsaWorkspace.from_path(args['<msa_workspace>'])

    msa = load_weighted_msa(work_msa)
    scores = calc_deletion_scores(msa)

    # Fork so the plotting GUI doesn't tie up the shell; the parent exits.
    if os.fork():
        sys.exit()

    # Report statistics about the MSA.
    print(f"{len(msa)} sequences aligned")
    print(f"{sum(x.percent_id > 0.3 for x in msa)} exceed 30% identity")

    counts, bin_edges = np.histogram(
            [x.percent_id for x in msa],
            bins=100,
            range=(0, 1),
    )
    # Convert bin edges to bin centers for plotting.
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    plt.subplot(2, 1, 1)
    plt.title(work_msa.relpath)
    plt.plot(bin_centers, counts)
    plt.axvline(0.3, color='red')
    plt.ylabel('Count')
    plt.xlabel('% Identity')

    # Report statistics about the deletion scores.
    mean_score = np.mean(scores)
    n_total = len(scores)
    n_above = sum(scores > mean_score)
    print(
        f"{n_above}/{n_total} residues ({100*n_above/n_total:.1f}%) have above-average deletion scores."
    )

    plt.subplot(2, 1, 2)
    plt.plot(np.arange(n_total), scores)
    plt.tight_layout()
    plt.ylabel('Score')
    plt.xlabel('Residue index')
    plt.xlim((0, n_total))
    plt.axhline(mean_score, color='red')

    plt.gcf().canvas.set_window_title(str(work_msa.relpath))
    plt.show()
Example #4
0
    def __init__(self, dels_workspace, cursor=0, low=0, high=50):
        """
        Load the deletions from the given workspace and color the loaded
        structure by per-residue deletion score.

        dels_workspace: path to a deletions workspace.
        cursor: starting index into the sorted deletions — presumably
            advanced by other methods of this class (not visible here).
        low/high: color bounds; coerced to float.
        """
        work_dels = DeletionsWorkspace.from_path(dels_workspace)
        msa = load_weighted_msa(work_dels.msa)
        scores = calc_deletion_scores(msa)

        # Sort by position so iterating the deletions walks them N→C.
        self.dels = pd.read_hdf(work_dels.deletions_hdf5).sort_values(['del_start', 'del_end'])
        self.cursor = cursor
        self.low = float(low)
        self.high = float(high)
        # Plain string: the original f-string had no placeholders (F541).
        self.sele = 'chain B and polymer.protein'

        # `space` exposes the local `scores` array to the pymol expression;
        # resi is 1-based, hence the -1 when indexing.
        cmd.alter(
                self.sele,
                'b=scores[int(resi)-1]',
                space={**locals(), **globals()},
        )
        self.redraw()
def main():
    """Overlay %-identity histograms for one or more MSA workspaces."""
    # Parse the command-line arguments.
    import docopt
    args = docopt.docopt(__doc__)

    # Collect the MSA workspaces named on the command line.  from_path()
    # returns a pair; only the second (MSA) half is needed here.
    work_msas = [
            MsaWorkspace.from_path(p)[1]
            for p in args['<msa_workspaces>']
    ]

    # A named function rather than a lambda assignment (PEP 8, E731).
    def by_limit_then_algorithm(ws):
        return ws.limit, ws.algorithm

    # Fork so the plotting GUI doesn't tie up the shell; the parent exits.
    if os.fork():
        sys.exit()

    # Make a histogram for each given MSA.
    for work_msa in sorted(work_msas, key=by_limit_then_algorithm):
        msa = load_weighted_msa(work_msa)

        counts, edges = np.histogram(
                [x.percent_id for x in msa],
                bins=100,
                range=(0, 1),
        )
        centers = (edges[:-1] + edges[1:]) / 2
        plt.plot(centers, counts, label=work_msa.relpath)

    # Show the histograms.  NOTE(review): `work_msa` below is the loop
    # variable leaked from the loop above, i.e. the last-sorted workspace;
    # presumably all workspaces share the same root — confirm.
    plt.ylabel('Count')
    plt.xlabel('% Identity')
    plt.axvline(0.3, color='grey', linestyle='--', zorder=-1)
    plt.legend(loc='best')
    plt.title(work_msa.shared.root.name)
    plt.gcf().canvas.set_window_title(work_msa.shared.root.name)
    plt.show()
Example #6
0
def paint_deletion_scores(msa_workspace, low=0, high=50):
    """
DESCRIPTION

    Color each residue by its "deletion score", a measure of how commonly it is
    deleted in homologous sequences.

USAGE

    paint_deletion_scores msa_workspace [, low [, high]]

ARGUMENTS

    msa_workspace = a path to an MSA workspace from the pipeline.  (The old
    help text said "dels_workspace", but this command takes the MSA workspace.)

    low = the percentile of the data to make the most blue {default: 0}.

    high = the percentile of the data to make the most red {default: 50}.

NOTES

    This script assumes that the only structure loaded in pymol is the one
    contained in the given workspace.
"""
    # from_path() returns a (blast, msa) workspace pair.
    work_blast, work_msa = MsaWorkspace.from_path(msa_workspace)
    msa = load_weighted_msa(work_msa)
    scores = calc_deletion_scores(msa)

    # Map the requested percentiles onto concrete score cutoffs.
    cutoff_low = np.percentile(scores, float(low))
    cutoff_high = np.percentile(scores, float(high))

    # Plain string: the original f-string had no placeholders (F541).
    # `space` exposes the local `scores` array to the pymol expression;
    # resi is 1-based, hence the -1 when indexing.
    sele = 'chain B and polymer.protein'
    cmd.alter(sele, 'b=scores[int(resi)-1]', space={**locals(), **globals()})
    cmd.spectrum('b', 'blue_white_red', sele, minimum=cutoff_low, maximum=cutoff_high)

    print(f'low:  {cutoff_low:.2f} ({float(low):.2f}%)')
    print(f'high: {cutoff_high:.2f} ({float(high):.2f}%)')
Example #7
0
def paint_deletions(dels_workspace):
    """
DESCRIPTION

    Color the protein by how often each residue is proposed to be deleted.

USAGE

    paint_deletions dels_workspace

ARGUMENTS

    dels_workspace = a path to a workspace for the final step of the pipeline,
    which is to pick deletions.

NOTES

    This script assumes that the only structure loaded in pymol is the one
    contained in the given workspace.
"""

    work_dels = DeletionsWorkspace.from_path(dels_workspace)
    msa = load_weighted_msa(work_dels.msa)
    dels = pd.read_hdf(work_dels.deletions_hdf5)

    # Per-residue count of proposed deletions covering each position.
    n_dels = count_deletions(msa, dels)

    # Spectrum bounds run from zero up to the most-deleted residue.
    # NOTE(review): max() raises on an empty sequence — assumes at least
    # one residue.
    low = 0
    high = max(n_dels)

    # Plain string: the original f-string had no placeholders (F541).
    # `space` exposes the local `n_dels` array to the pymol expression;
    # resi is 1-based, hence the -1 when indexing.
    sele = 'chain B and polymer.protein'
    cmd.alter(sele, 'b=n_dels[int(resi)-1]', space={**locals(), **globals()})
    cmd.spectrum('b', 'blue_white_red', sele, minimum=low, maximum=high)

    print(f'n:    {len(dels)}')
    print(f'low:  {low:.2f}')
    print(f'high: {high:.2f}')
def main():
    """Plot per-residue deletion counts for one or more deletion workspaces."""
    import docopt
    args = docopt.docopt(__doc__)

    plots = []

    # Build one trace per workspace: deletion count vs. residue index.
    for path in args['<dels_workspace>']:
        work_dels = DeletionsWorkspace.from_path(path)
        msa = load_weighted_msa(work_dels.msa)
        dels = pd.read_hdf(work_dels.deletions_hdf5)

        plot = Plot()
        plot.y = count_deletions(msa, dels)
        plot.x = np.arange(len(plot.y))
        plot.label = f'{work_dels.relpath} (N={len(dels)})'
        plot.seq = msa.ref_ungapped
        plots.append(plot)

    if args['--align']:
        if len(plots) != 2:
            # Bug fix: corrected the "worksapces" typo in this message.
            fatal("Must specify 2 workspaces to use the --align option.")

        # I decided to use BLOSUM62 because the two sequences in this case may
        # not be particularly similar.  I used the corresponding gap penalties
        # from BLAST 2.2.27, which I found in the reference below:
        #
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/

        alignments = align.globalds(
            plots[0].seq,
            plots[1].seq,
            blosum62,
            -11,  # gap-open penalty
            -1,   # gap-extend penalty
        )
        aligned_seq1, aligned_seq2, score, start, end = alignments[0]

        # Re-index each trace by alignment column, so both traces share a
        # common x-axis.
        aligned_x1 = [
                i for i, aa in enumerate(aligned_seq1) if aa != '-']
        aligned_x2 = [
                i for i, aa in enumerate(aligned_seq2) if aa != '-']

        plots[0].x = np.array(aligned_x1)
        plots[1].x = np.array(aligned_x2)

        # Fraction of aligned positions with identical residues (no gaps),
        # relative to the longer of the two sequences.
        percent_id = sum(x[0] == x[1] and '-' not in x
                         for x in zip(aligned_seq1, aligned_seq2))
        percent_id /= max(len(p.seq) for p in plots)

        print(f"Scaffolds aligned with {100*percent_id:.2f}% identity.")

    # Fork so the plotting GUI doesn't tie up the shell; the parent exits.
    if os.fork():
        sys.exit()

    for p in plots:
        plt.plot(p.x, p.y, label=p.label)

    plt.xlabel("aligned residue index" if args['--align'] else "residue index")
    plt.ylabel("relative deletions" if args['--normalize'] else "deletions")

    # Bug fix: the generator previously read `for x in plots` while using
    # `p.x`, so it silently reused `p` leaked from the loop above and the
    # x-limit was the last trace's endpoint repeated len(plots) times.
    plt.xlim((0, max(p.x[-1] for p in plots)))
    plt.legend(loc='best')
    plt.show()