def main():
    # Set up the workspaces:
    import docopt
    args = docopt.docopt(__doc__)

    work_blast, work_msa = MsaWorkspace.from_path(args['<msa_workspace>'])
    work_dels = LoophashWorkspace(work_msa)

    if args['--force']:
        work_dels.rmdir()
    work_dels.mkdir()

    # Choose which deletions to make:
    if work_dels.deletions_hdf5.exists():
        dels = pd.read_hdf(work_dels.deletions_hdf5)
        filters = GapFilters.from_toml(work_dels.filters_toml)
    else:
        msa = load_weighted_msa(work_msa)
        scores = calc_deletion_scores(msa)
        gaps, filters = load_spannable_gaps(work_dels, msa)
        dels = choose_gaps_to_delete(scores, gaps, filters)

    # Analyze the results:
    print(filters)
    print(dels.describe())
    print()

    work_dels.write_deletions(dels, filters)
    work_dels.write_metadata()
def main():
    # Set up the workspaces:
    import docopt
    args = docopt.docopt(__doc__)

    work_blast, work_msa = MsaWorkspace.from_path(args['<msa_workspace>'])
    work_dels = DeletionsWorkspace(work_msa, 'threshold')

    if args['--force']:
        work_dels.rmdir()
    work_dels.mkdir()

    # Choose which deletions to make:
    if work_dels.deletions_hdf5.exists():
        dels = pd.read_hdf(work_dels.deletions_hdf5)
    else:
        msa = load_weighted_msa(work_msa)
        scores = calc_deletion_scores(msa)
        dels = choose_deletions_via_thresholds(scores)

    # Record the results:
    print(f"Chose {len(dels)} deletions:\n")
    print(dels.describe())

    work_dels.write_deletions(dels)
    work_dels.write_metadata()
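# The script above relies on `choose_deletions_via_thresholds`, which is
# defined elsewhere in the pipeline.  Below is a minimal sketch of what such a
# chooser could look like, assuming deletions are recorded as half-open
# `del_start`/`del_end` intervals (the columns the visualization code sorts
# on) and that the cutoff is the mean score (an assumption borrowed from the
# statistics script below, not necessarily the real implementation).

def choose_deletions_via_thresholds_sketch(scores, threshold=None):
    import numpy as np
    import pandas as pd

    # Assumption: `scores` is a numpy array of per-residue deletion scores.
    if threshold is None:
        threshold = np.mean(scores)

    # Report each contiguous run of above-threshold residues as one
    # candidate deletion, using half-open [del_start, del_end) indices.
    dels = []
    start = None

    for i, above in enumerate(scores > threshold):
        if above and start is None:
            start = i
        elif not above and start is not None:
            dels.append((start, i))
            start = None

    if start is not None:
        dels.append((start, len(scores)))

    return pd.DataFrame(dels, columns=['del_start', 'del_end'])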
def main():
    # Parse the command-line arguments.
    import docopt
    args = docopt.docopt(__doc__)

    work_homs, work_msa = MsaWorkspace.from_path(args['<msa_workspace>'])
    msa = load_weighted_msa(work_msa)
    scores = calc_deletion_scores(msa)

    # Fork so the parent process can return control to the shell while the
    # child stays alive to serve the plot window.
    if os.fork():
        sys.exit()

    # Report statistics about the MSA.
    print(f"{len(msa)} sequences aligned")
    print(f"{sum(x.percent_id > 0.3 for x in msa)} exceed 30% identity")

    y, xx = np.histogram(
        [x.percent_id for x in msa],
        bins=100,
        range=(0, 1),
    )
    x = (xx[:-1] + xx[1:]) / 2

    plt.subplot(2, 1, 1)
    plt.title(work_msa.relpath)
    plt.plot(x, y)
    plt.axvline(0.3, color='red')
    plt.ylabel('Count')
    plt.xlabel('% Identity')

    # Report statistics about the deletion scores.
    threshold = np.mean(scores)
    n = len(scores)
    n_del = sum(scores > threshold)
    print(f"{n_del}/{n} residues ({100*n_del/n:.1f}%) have above-average deletion scores.")

    i = np.arange(n)
    plt.subplot(2, 1, 2)
    plt.plot(i, scores)
    plt.ylabel('Score')
    plt.xlabel('Residue index')
    plt.xlim((0, len(i)))
    plt.axhline(threshold, color='red')

    plt.tight_layout()
    plt.gcf().canvas.set_window_title(str(work_msa.relpath))
    plt.show()
def __init__(self, dels_workspace, cursor=0, low=0, high=50):
    work_dels = DeletionsWorkspace.from_path(dels_workspace)
    msa = load_weighted_msa(work_dels.msa)
    scores = calc_deletion_scores(msa)

    self.dels = (
        pd.read_hdf(work_dels.deletions_hdf5)
        .sort_values(['del_start', 'del_end'])
    )
    self.cursor = cursor
    self.low = float(low)
    self.high = float(high)
    self.sele = 'chain B and polymer.protein'

    # Stash each residue's deletion score in its b-factor, so the scores can
    # be referenced later when coloring the structure.
    cmd.alter(
        self.sele,
        'b=scores[int(resi)-1]',
        space={**locals(), **globals()},
    )
    self.redraw()
def main():
    # Parse the command-line arguments.
    import docopt
    args = docopt.docopt(__doc__)

    # Make a histogram for each given MSA.
    work_msas = [
        MsaWorkspace.from_path(p)[1]
        for p in args['<msa_workspaces>']
    ]
    by_limit_then_algorithm = lambda x: (x.limit, x.algorithm)

    if os.fork():
        sys.exit()

    for work_msa in sorted(work_msas, key=by_limit_then_algorithm):
        msa = load_weighted_msa(work_msa)

        y, xx = np.histogram(
            [x.percent_id for x in msa],
            bins=100,
            range=(0, 1),
        )
        x = (xx[:-1] + xx[1:]) / 2

        plt.plot(x, y, label=work_msa.relpath)

    # Show the histograms.
    plt.ylabel('Count')
    plt.xlabel('% Identity')
    plt.axvline(0.3, color='grey', linestyle='--', zorder=-1)
    plt.legend(loc='best')
    plt.title(work_msa.shared.root.name)
    plt.gcf().canvas.set_window_title(work_msa.shared.root.name)
    plt.show()
def paint_deletion_scores(msa_workspace, low=0, high=50):
    """
DESCRIPTION

    Color each residue by its "deletion score", a measure of how commonly it
    is deleted in homologous sequences.

USAGE

    paint_deletion_scores msa_workspace [, low [, high]]

ARGUMENTS

    msa_workspace = a path to the workspace containing the multiple sequence
    alignment used to calculate the deletion scores.

    low = the percentile of the data to make the most blue {default: 0}.

    high = the percentile of the data to make the most red {default: 50}.

NOTES

    This script assumes that the only structure loaded in pymol is the one
    contained in the given workspace.
    """
    work_blast, work_msa = MsaWorkspace.from_path(msa_workspace)
    msa = load_weighted_msa(work_msa)
    scores = calc_deletion_scores(msa)

    cutoff_low = np.percentile(scores, float(low))
    cutoff_high = np.percentile(scores, float(high))

    sele = 'chain B and polymer.protein'
    cmd.alter(sele, 'b=scores[int(resi)-1]', space={**locals(), **globals()})
    cmd.spectrum('b', 'blue_white_red', sele, minimum=cutoff_low, maximum=cutoff_high)

    print(f'low:  {cutoff_low:.2f} ({float(low):.2f}%)')
    print(f'high: {cutoff_high:.2f} ({float(high):.2f}%)')
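# For reference, `calc_deletion_scores` itself isn't shown in this file.
# Going by the description above -- a score measuring how commonly each
# residue is deleted in homologous sequences -- a heavily simplified sketch
# might look like the following.  The attribute names `msa.ref`,
# `seq.aligned`, and `seq.weight` are invented for illustration; only
# `ref_ungapped`, `percent_id`, and the per-sequence weighting are suggested
# by the surrounding code.

def calc_deletion_scores_sketch(msa):
    import numpy as np

    # Columns of the alignment occupied by the reference sequence.
    ref_cols = [i for i, aa in enumerate(msa.ref) if aa != '-']
    scores = np.zeros(len(ref_cols))

    # Score each reference residue by the weighted fraction of homologs that
    # align a gap to it.
    for seq in msa:
        for j, col in enumerate(ref_cols):
            if seq.aligned[col] == '-':
                scores[j] += seq.weight

    return scores / sum(seq.weight for seq in msa)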
def paint_deletions(dels_workspace):
    """
DESCRIPTION

    Color the protein by how often each residue is proposed to be deleted.

USAGE

    paint_deletions dels_workspace

ARGUMENTS

    dels_workspace = a path to a workspace for the final step of the
    pipeline, which is to pick deletions.

NOTES

    This script assumes that the only structure loaded in pymol is the one
    contained in the given workspace.
    """
    work_dels = DeletionsWorkspace.from_path(dels_workspace)
    msa = load_weighted_msa(work_dels.msa)
    dels = pd.read_hdf(work_dels.deletions_hdf5)
    n_dels = count_deletions(msa, dels)

    low = 0
    high = max(n_dels)

    sele = 'chain B and polymer.protein'
    cmd.alter(sele, 'b=n_dels[int(resi)-1]', space={**locals(), **globals()})
    cmd.spectrum('b', 'blue_white_red', sele, minimum=low, maximum=high)

    print(f'n:    {len(dels)}')
    print(f'low:  {low:.2f}')
    print(f'high: {high:.2f}')
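# `count_deletions` is also defined elsewhere in the pipeline.  A minimal
# sketch, assuming half-open [del_start, del_end) intervals over reference
# residue indices (the columns used elsewhere in these scripts); the real
# function may handle boundaries or weighting differently.

def count_deletions_sketch(msa, dels):
    import numpy as np

    # Count how many proposed deletions cover each reference residue.
    n_dels = np.zeros(len(msa.ref_ungapped), dtype=int)
    for row in dels.itertuples():
        n_dels[row.del_start:row.del_end] += 1
    return n_dels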
def main():
    import docopt
    args = docopt.docopt(__doc__)

    plots = []

    for path in args['<dels_workspace>']:
        work_dels = DeletionsWorkspace.from_path(path)
        msa = load_weighted_msa(work_dels.msa)
        dels = pd.read_hdf(work_dels.deletions_hdf5)

        plot = Plot()
        plot.y = count_deletions(msa, dels)
        plot.x = np.arange(len(plot.y))
        plot.label = f'{work_dels.relpath} (N={len(dels)})'
        plot.seq = msa.ref_ungapped
        plots.append(plot)

    if args['--align']:
        if len(plots) != 2:
            fatal("Must specify 2 workspaces to use the --align option.")

        # I decided to use BLOSUM62 because the two sequences in this case
        # may not be particularly similar.  I used the corresponding gap
        # penalties from BLAST 2.2.27, which I found in the reference below:
        #
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/
        alignments = align.globalds(
            plots[0].seq,
            plots[1].seq,
            blosum62,
            -11,
            -1,
        )
        aligned_seq1, aligned_seq2, score, start, end = alignments[0]

        # Map each ungapped residue index onto its column in the alignment,
        # so the two deletion profiles can share an x-axis.
        aligned_x1 = []
        aligned_x2 = []

        for i, (aa1, aa2) in enumerate(zip(aligned_seq1, aligned_seq2)):
            if aa1 != '-':
                aligned_x1.append(i)
            if aa2 != '-':
                aligned_x2.append(i)

        plots[0].x = np.array(aligned_x1)
        plots[1].x = np.array(aligned_x2)

        percent_id = sum(
            aa1 == aa2 and aa1 != '-'
            for aa1, aa2 in zip(aligned_seq1, aligned_seq2)
        )
        percent_id /= max(len(p.seq) for p in plots)
        print(f"Scaffolds aligned with {100*percent_id:.2f}% identity.")

    if os.fork():
        sys.exit()

    for p in plots:
        plt.plot(p.x, p.y, label=p.label)

    plt.xlabel("aligned residue index" if args['--align'] else "residue index")
    plt.ylabel("relative deletions" if args['--normalize'] else "deletions")
    plt.xlim((0, max(p.x[-1] for p in plots)))
    plt.legend(loc='best')
    plt.show()