def main():
    """
    Choose which deletions to make for a loophash workspace, caching the
    results on disk so repeated runs are cheap.
    """
    # Parse arguments and set up the workspace directories.
    import docopt
    cli_args = docopt.docopt(__doc__)
    _, work_msa = MsaWorkspace.from_path(cli_args['<msa_workspace>'])

    work_dels = LoophashWorkspace(work_msa)
    if cli_args['--force']:
        work_dels.rmdir()
    work_dels.mkdir()

    # Reuse cached deletions when present; otherwise compute them from the MSA.
    if work_dels.deletions_hdf5.exists():
        deletions = pd.read_hdf(work_dels.deletions_hdf5)
        gap_filters = GapFilters.from_toml(work_dels.filters_toml)
    else:
        alignment = load_weighted_msa(work_msa)
        del_scores = calc_deletion_scores(alignment)
        candidate_gaps, gap_filters = load_spannable_gaps(work_dels, alignment)
        deletions = choose_gaps_to_delete(del_scores, candidate_gaps, gap_filters)

    # Summarize the chosen deletions on stdout.
    print(gap_filters)
    print(deletions.describe())
    print()

    # Persist the results for downstream steps.
    work_dels.write_deletions(deletions, gap_filters)
    work_dels.write_metadata()
def main():
    """
    Choose deletions via the 'threshold' strategy, caching the results on
    disk so repeated runs are cheap.
    """
    # Parse arguments and set up the workspace directories.
    import docopt
    cli_args = docopt.docopt(__doc__)
    _, work_msa = MsaWorkspace.from_path(cli_args['<msa_workspace>'])

    work_dels = DeletionsWorkspace(work_msa, 'threshold')
    if cli_args['--force']:
        work_dels.rmdir()
    work_dels.mkdir()

    # Reuse cached deletions when present; otherwise compute them from the MSA.
    if work_dels.deletions_hdf5.exists():
        deletions = pd.read_hdf(work_dels.deletions_hdf5)
    else:
        alignment = load_weighted_msa(work_msa)
        del_scores = calc_deletion_scores(alignment)
        deletions = choose_deletions_via_thresholds(del_scores)

    # Summarize the chosen deletions on stdout.
    print(f"Chose {len(deletions)} deletions:\n")
    print(deletions.describe())

    # Persist the results for downstream steps.
    work_dels.write_deletions(deletions)
    work_dels.write_metadata()
def main():
    """
    Plot summary statistics for an MSA workspace: a histogram of percent
    identity across the aligned sequences, and the per-residue deletion
    scores, with the relevant thresholds marked in red.
    """
    # Parse the command-line arguments.
    import docopt
    args = docopt.docopt(__doc__)
    work_homs, work_msa = MsaWorkspace.from_path(args['<msa_workspace>'])
    msa = load_weighted_msa(work_msa)
    scores = calc_deletion_scores(msa)

    # Daemonize: the parent process exits immediately, leaving the child to
    # show the plot in the background.  NOTE(review): os.fork() is
    # POSIX-only — this script won't run on Windows.
    if os.fork():
        sys.exit()

    # Report statistics about the MSA.
    print(f"{len(msa)} sequences aligned")
    print(f"{sum(x.percent_id > 0.3 for x in msa)} exceed 30% identity")

    # Histogram of percent identity; x is the midpoint of each bin.
    y, xx = np.histogram(
        [x.percent_id for x in msa],
        bins=100,
        range=(0, 1),
    )
    x = (xx[:-1] + xx[1:]) / 2

    plt.subplot(2, 1, 1)
    plt.title(work_msa.relpath)
    plt.plot(x, y)
    plt.axvline(0.3, color='red')  # the 30% identity cutoff reported above
    plt.ylabel('Count')
    plt.xlabel('% Identity')

    # Report statistics about the deletion scores.
    threshold = np.mean(scores)
    n = len(scores)
    n_del = sum(scores > threshold)
    print(f"{n_del}/{n} residues ({100*n_del/n:.1f}%) have above-average deletion scores.")

    i = np.arange(n)
    plt.subplot(2, 1, 2)
    plt.plot(i, scores)
    plt.tight_layout()
    plt.ylabel('Score')
    plt.xlabel('Residue index')
    plt.xlim((0, len(i)))
    plt.axhline(threshold, color='red')  # mean score, the cutoff used above
    # BUGFIX: `canvas.set_window_title` was deprecated in Matplotlib 3.4 and
    # removed in 3.6; the window title now lives on the figure manager.
    plt.gcf().canvas.manager.set_window_title(str(work_msa.relpath))
    plt.show()
def __init__(self, dels_workspace, cursor=0, low=0, high=50):
    # Load the workspace, its MSA, and the per-residue deletion scores.
    work_dels = DeletionsWorkspace.from_path(dels_workspace)
    msa = load_weighted_msa(work_dels.msa)
    scores = calc_deletion_scores(msa)

    # Cached deletions, ordered so cursor-based navigation is deterministic.
    self.dels = pd.read_hdf(work_dels.deletions_hdf5).sort_values(['del_start', 'del_end'])
    # cursor: index of the currently highlighted deletion.
    self.cursor = cursor
    # low/high: color-scale bounds, coerced to float for downstream math.
    self.low = float(low)
    self.high = float(high)
    # PyMOL selection this object operates on.
    self.sele = f'chain B and polymer.protein'

    # Stash each residue's deletion score in its b-factor.  The expression
    # string is evaluated by PyMOL, so `scores` must be visible in the
    # provided namespace — do not rename that local.
    cmd.alter(
        self.sele,
        'b=scores[int(resi)-1]',
        space={**locals(), **globals()},
    )
    self.redraw()
def paint_deletion_scores(msa_workspace, low=0, high=50):
    """
DESCRIPTION

    Color each residue by its "deletion score", a measure of how commonly it
    is deleted in homologous sequences.

USAGE

    paint_deletion_scores msa_workspace

ARGUMENTS

    msa_workspace = a path to an MSA workspace from the pipeline.

    low = the percentile of the data to make the most blue {default: 0}.

    high = the percentile of the data to make the most red {default: 50}.

NOTES

    This script assumes that the only structure loaded in pymol is the one
    contained in the given workspace.
    """
    # DOCFIX: the docstring previously called the first argument
    # `dels_workspace`, but the parameter is `msa_workspace` and it is
    # resolved with MsaWorkspace.from_path().

    work_blast, work_msa = MsaWorkspace.from_path(msa_workspace)
    msa = load_weighted_msa(work_msa)
    scores = calc_deletion_scores(msa)

    # Map the requested percentiles onto concrete score cutoffs.
    cutoff_low = np.percentile(scores, float(low))
    cutoff_high = np.percentile(scores, float(high))

    sele = f'chain B and polymer.protein'
    # Stash each residue's score in its b-factor.  The expression string is
    # evaluated by PyMOL, so `scores` must be visible in the provided
    # namespace — do not rename that local.
    cmd.alter(sele, 'b=scores[int(resi)-1]', space={**locals(), **globals()})
    cmd.spectrum('b', 'blue_white_red', sele, minimum=cutoff_low, maximum=cutoff_high)

    print(f'low: {cutoff_low:.2f} ({float(low):.2f}%)')
    print(f'high: {cutoff_high:.2f} ({float(high):.2f}%)')