Exemple #1
0
def main():
    """
    Choose the deletions for a loophash workspace and save them.

    The MSA workspace is taken from the `<msa_workspace>` command-line
    argument.  If the loophash workspace already has a deletions file, the
    deletions and gap filters are read back from it; otherwise they are
    chosen from the deletion scores and the spannable gaps.  Either way, the
    filters and a summary of the deletions are printed, and the deletions and
    workspace metadata are written out.
    """
    import docopt

    cli = docopt.docopt(__doc__)

    # Locate the MSA workspace and the loophash workspace derived from it.
    _, msa_workspace = MsaWorkspace.from_path(cli['<msa_workspace>'])
    workspace = LoophashWorkspace(msa_workspace)

    # `--force`: remove the workspace and create it again (rmdir + mkdir).
    if cli['--force']:
        workspace.rmdir()
        workspace.mkdir()

    # Either pick the deletions now, or reuse the ones saved previously.
    if not workspace.deletions_hdf5.exists():
        alignment = load_weighted_msa(msa_workspace)
        deletion_scores = calc_deletion_scores(alignment)
        spannable_gaps, gap_filters = load_spannable_gaps(workspace, alignment)
        deletions = choose_gaps_to_delete(
            deletion_scores,
            spannable_gaps,
            gap_filters,
        )
    else:
        deletions = pd.read_hdf(workspace.deletions_hdf5)
        gap_filters = GapFilters.from_toml(workspace.filters_toml)

    # Report what was chosen, then record it in the workspace.
    print(gap_filters)
    print(deletions.describe())
    print()

    workspace.write_deletions(deletions, gap_filters)
    workspace.write_metadata()
Exemple #2
0
def main():
    """
    Choose deletions using score thresholds and save them.

    The MSA workspace is taken from the `<msa_workspace>` command-line
    argument.  If the 'threshold' deletions workspace already has a deletions
    file, the deletions are read back from it; otherwise they are chosen from
    the deletion scores.  The number of deletions and a summary are printed,
    and the deletions and workspace metadata are written out.
    """
    import docopt

    cli = docopt.docopt(__doc__)

    # Locate the MSA workspace and the deletions workspace derived from it.
    _, msa_workspace = MsaWorkspace.from_path(cli['<msa_workspace>'])
    workspace = DeletionsWorkspace(msa_workspace, 'threshold')

    # `--force`: remove the workspace and create it again (rmdir + mkdir).
    if cli['--force']:
        workspace.rmdir()
        workspace.mkdir()

    # Either pick the deletions now, or reuse the ones saved previously.
    if not workspace.deletions_hdf5.exists():
        alignment = load_weighted_msa(msa_workspace)
        deletion_scores = calc_deletion_scores(alignment)
        deletions = choose_deletions_via_thresholds(deletion_scores)
    else:
        deletions = pd.read_hdf(workspace.deletions_hdf5)

    # Report what was chosen, then record it in the workspace.
    print(f"Chose {len(deletions)} deletions:\n")
    print(deletions.describe())

    workspace.write_deletions(deletions)
    workspace.write_metadata()
def main():
    """
    Print and plot summary statistics for an MSA and its deletion scores.

    The MSA workspace is taken from the `<msa_workspace>` command-line
    argument.  Two things are reported:

    - The distribution of percent identities in the alignment (top plot),
      with the 30% identity mark highlighted.
    - The deletion score of each residue (bottom plot), with the mean score
      highlighted.

    After the data is loaded the process forks and the parent exits, so all
    output and the plot window come from the child process.  `os.fork()` is
    only available on POSIX systems.
    """

    # Parse the command-line arguments.

    import docopt
    args = docopt.docopt(__doc__)
    work_homs, work_msa = MsaWorkspace.from_path(args['<msa_workspace>'])

    msa = load_weighted_msa(work_msa)
    scores = calc_deletion_scores(msa)

    # The parent exits right away and the child carries on; presumably this
    # is so the shell prompt returns while the plot window stays open.
    if os.fork():
        sys.exit()

    # Report statistics about the MSA.

    percent_ids = [seq.percent_id for seq in msa]

    print(f"{len(msa)} sequences aligned")
    print(f"{sum(p > 0.3 for p in percent_ids)} exceed 30% identity")

    counts, bin_edges = np.histogram(
        percent_ids,
        bins=100,
        range=(0, 1),
    )
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    plt.subplot(2, 1, 1)
    plt.title(work_msa.relpath)
    plt.plot(bin_centers, counts)
    plt.axvline(0.3, color='red')
    plt.ylabel('Count')
    plt.xlabel('% Identity')

    # Report statistics about the deletion scores.

    threshold = np.mean(scores)
    n = len(scores)
    n_del = np.count_nonzero(scores > threshold)
    print(
        f"{n_del}/{n} residues ({100*n_del/n:.1f}%) have above-average deletion scores."
    )

    plt.subplot(2, 1, 2)
    plt.plot(np.arange(n), scores)
    plt.ylabel('Score')
    plt.xlabel('Residue index')
    plt.xlim(0, n)
    plt.axhline(threshold, color='red')

    # Lay out the figure only after every label exists, otherwise the labels
    # added later are not accounted for in the spacing.
    plt.tight_layout()

    # `canvas.set_window_title()` was deprecated in Matplotlib 3.4 and later
    # removed; the figure manager provides the same method in old and new
    # versions.  The manager can be None for figures without a window.
    manager = plt.gcf().canvas.manager
    if manager is not None:
        manager.set_window_title(str(work_msa.relpath))

    plt.show()
Exemple #4
0
    def __init__(self, dels_workspace, cursor=0, low=0, high=50):
        """
        Load a set of deletions and write deletion scores into the B-factors.

        Arguments:
            dels_workspace: Passed to `DeletionsWorkspace.from_path()` to
                find the saved deletions and the MSA they were derived from.
            cursor: Stored unchanged as `self.cursor`.
            low: Converted to float and stored as `self.low`.
            high: Converted to float and stored as `self.high`.

        The deletions are kept in `self.dels`, sorted by 'del_start' and then
        'del_end'.  Every atom matching `self.sele` has its `b` property set
        to the score indexed by its residue number minus one, and finally
        `self.redraw()` is called.
        """
        self.cursor = cursor
        self.low = float(low)
        self.high = float(high)
        self.sele = 'chain B and polymer.protein'

        workspace = DeletionsWorkspace.from_path(dels_workspace)
        alignment = load_weighted_msa(workspace.msa)

        # This local must be called `scores`: the alter expression below
        # refers to it by name through the namespace built from `locals()`.
        scores = calc_deletion_scores(alignment)

        deletions = pd.read_hdf(workspace.deletions_hdf5)
        self.dels = deletions.sort_values(['del_start', 'del_end'])

        # Locals first, then globals, so that globals win on name clashes.
        namespace = dict(locals())
        namespace.update(globals())

        cmd.alter(self.sele, 'b=scores[int(resi)-1]', space=namespace)
        self.redraw()
Exemple #5
0
def paint_deletion_scores(msa_workspace, low=0, high=50):
    """
DESCRIPTION

    Color each residue by its "deletion score", a measure of how commonly it is
    deleted in homologous sequences.

USAGE

    paint_deletion_scores msa_workspace [, low [, high ]]

ARGUMENTS

    msa_workspace = a path that MsaWorkspace.from_path() can resolve to the
    workspace holding the multiple sequence alignment.  The deletion scores
    are calculated from that alignment.

    low = the percentile of the data to make the most blue {default: 0}.

    high = the percentile of the data to make the most red {default: 50}.

NOTES

    This script assumes that the only structure loaded in pymol is the one
    contained in the given workspace.

    Only atoms matching "chain B and polymer.protein" are affected.  The "b"
    property of those atoms is overwritten with the scores: residue N is given
    the N-th score, i.e. scores[N-1].
"""
    _, work_msa = MsaWorkspace.from_path(msa_workspace)
    msa = load_weighted_msa(work_msa)

    # This local must be called `scores`: the alter expression below refers
    # to it by name through the namespace built from `locals()`.
    scores = calc_deletion_scores(msa)

    # The percentiles may arrive as strings (e.g. when this is invoked as a
    # PyMOL command), so convert them once up front.
    low, high = float(low), float(high)
    cutoff_low, cutoff_high = np.percentile(scores, [low, high])

    sele = 'chain B and polymer.protein'
    cmd.alter(sele, 'b=scores[int(resi)-1]', space={**locals(), **globals()})
    cmd.spectrum('b', 'blue_white_red', sele, minimum=cutoff_low, maximum=cutoff_high)

    print(f'low:  {cutoff_low:.2f} ({low:.2f}%)')
    print(f'high: {cutoff_high:.2f} ({high:.2f}%)')