def main():
    parser = argparse.ArgumentParser(
        description='Apply alphabet reduction to MSA')
    parser.add_argument('file', help='either seq file or bimarg file')
    parser.add_argument('alphamap')
    parser.add_argument('--alpha', default='protgap')
    parser.add_argument('--out')
    args = parser.parse_args(sys.argv[1:])

    alphabets = {'protein': IUPAC.protein.letters,
                 'protgap': '-' + IUPAC.protein.letters,
                 'charge': '0+-',
                 'nuc': "ACGT"}
    alpha = alphabets.get(args.alpha, args.alpha)

    with open(args.alphamap) as f:
        # assumed to be a file containing the output of alphabet reduction,
        # but only for one reduction level. Each line should look like:
        # ALPHA8 -DNAGSQFMYCI E HWP K L R T V
        newalphas = [a.split()[1:] for a in f.readlines()]

    try:
        bimarg = np.load(args.file)
    except ValueError:
        seqs = seqload.loadSeqs(args.file, alpha)[0]
        reduceSeqAlphaPerpos(seqs, newalphas, alpha, args.out)
    else:
        reduceBimAlphaPerpos(bimarg, newalphas, alpha, args.out)

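# A minimal sketch (hypothetical helper; the repo's reduceSeqAlphaPerpos does
# the real work): each whitespace-separated group on an alphamap line becomes
# one reduced letter, so a single alignment column is remapped by looking up
# which group each original letter belongs to.
def _reduce_column(col_letters, groups):
    # e.g. groups = ['-DNAGSQFMYCI', 'E', 'HWP', 'K', 'L', 'R', 'T', 'V']
    lookup = {c: gi for gi, g in enumerate(groups) for c in g}
    return [lookup[c] for c in col_letters]
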
def main_TVD(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('ref_seqs')
    parser.add_argument('--save')
    args = parser.parse_args(args)

    ref_seqs = loadSeqs(args.ref_seqs, alpha=ALPHA)[0]
    vae = loadVAE(name)

    # pairwise similarity histogram of the reference MSA, normalized
    rh = histsim(ref_seqs).astype(float)
    rh = rh / np.sum(rh)

    # same histogram for 10000 model-generated sequences (generate yields
    # batches, so concatenate); zero the last bin (identical pairs) before
    # normalizing
    seqs = np.concatenate(list(vae.generate(10000)))
    h = histsim(seqs).astype(float)
    h[-1] = 0
    h = h / np.sum(h)

    if args.save:
        np.save(args.save, h)

    plt.figure()
    plt.plot(rh, label='ref')
    plt.plot(h, label='model')
    plt.legend()
    plt.savefig("TVD_{}.png".format(name))

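# A minimal sketch (not part of the original script): the scalar total
# variation distance that main_TVD's output filename refers to is half the
# L1 distance between the two normalized histograms computed above.
def _tvd(rh, h):
    return 0.5 * np.sum(np.abs(rh - h))
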
def main_seq_accuracy(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('seqs')
    args = parser.parse_args(args)

    seqs = loadSeqs(args.seqs, alpha=ALPHA)[0]
    N, L = seqs.shape

    vae = loadVAE(name)

    # pad sequences to a multiple of batch_size by tiling, then trim the
    # per-sequence outputs back to the first N
    padN = ((N - 1) // vae.batch_size + 1) * vae.batch_size
    padseqs = np.tile(seqs, ((padN - 1) // N + 1, 1))[:padN]
    brnll = vae.single_sample(padseqs)
    brnll = brnll.reshape((brnll.shape[0], L, q))[:N, :, :]
    # normalize the per-position reconstruction scores into a PWM
    pwm = brnll / np.sum(brnll, axis=-1, keepdims=True)

    for n, s, o in zip(range(N), seqs, pwm):
        o = o.reshape(L, q)
        # fraction of positions where the reconstruction argmax matches the
        # input residue, and the log-probability of the input sequence
        acc = np.mean(s == np.argmax(o, axis=1))
        p = np.sum(np.log(o[np.arange(L), s]))

        plt.figure(figsize=(16, 4))
        plt.imshow(o.T, cmap='gray_r', interpolation='nearest')
        plt.plot(np.arange(L), s, 'r.', ms=2)
        plt.title("Seq {} Acc={:.3f} log-p={:.4f}".format(str(n), acc, p))
        plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
        plt.savefig("test_seq_{}_{}.png".format(name, n))
        plt.close()

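# Worked example of the padding arithmetic used above (values are
# illustrative): the MSA is padded up to the next multiple of batch_size.
def _pad_len(N, batch_size):
    # smallest multiple of batch_size that is >= N
    return ((N - 1) // batch_size + 1) * batch_size

assert _pad_len(250, 200) == 400
assert _pad_len(400, 200) == 400
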
def loadSequenceDir(sdir, alpha, log):
    log("Loading sequences from dir {}".format(sdir))
    seqs = []
    # load consecutively numbered files seqs-0, seqs-1, ... and stop at the
    # first missing one
    while True:
        sfile = os.path.join(sdir, 'seqs-{}'.format(len(seqs)))
        if not os.path.exists(sfile):
            break
        seqs.append(seqload.loadSeqs(sfile, names=alpha)[0].astype('<u1'))
    return seqs

def make_db(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('name')
    parser.add_argument('reps', type=int)
    parser.add_argument('msa')
    parser.add_argument('--weights')
    parser.add_argument('--npos', default='2-10')
    parser.add_argument('--topmode', default='20',
                        help='integer, string of form ">0.01", or "nonzero"')
    args = parser.parse_args(args)

    reps = args.reps
    # drop a trailing '.db' extension if present (str.strip('.db') would
    # instead strip any leading/trailing '.', 'd', 'b' characters)
    name = args.name[:-3] if args.name.endswith('.db') else args.name
    topmode = args.topmode

    dataseqs = seqload.loadSeqs(args.msa)[0]
    weights = None
    if args.weights is not None:
        weights = np.load(args.weights).astype('f4')
        print("using weights")
    Nd, L = dataseqs.shape
    msa = np.asfortranarray(dataseqs)

    npos_lo, npos_hi = [int(x) for x in args.npos.split('-')]
    npos_range = range(npos_lo, npos_hi + 1)

    root_seed = np.random.SeedSequence()

    # set up globals *before* forking
    global makedb_globals
    makedb_globals = (msa, weights, topmode)

    print("Starting workers...")
    print("")

    jobs = ((n, i, root_seed.spawn(1)[0])
            for n in npos_range[::-1] for i in range(reps))

    # use only 64 tasks per child to fix memory leak in child processes I
    # can't find. I suspect a bug in multiprocessing since others have
    # similar unsolved leaks:
    # https://stackoverflow.com/questions/21485319/high-memory-usage-using-python-multiprocessing
    # https://stackoverflow.com/questions/56922672/multiprocessing-child-task-continues-to-leak-more-memory
    with Pool(os.cpu_count(), maxtasksperchild=64) as pool:
        with open("{}.db".format(name), "wt") as f:
            for n, i, dat in pool.imap_unordered(makedb_job, jobs):
                f.write(dat)
                print("\r{} {} ".format(n, i), end="")
    print("Done!")

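# A minimal sketch (hypothetical worker, not the repo's makedb_job): each job
# tuple carries a SeedSequence spawned from root_seed, so a worker can derive
# an independent, reproducible random stream without sharing RNG state across
# processes.
def _example_job(job):
    n, i, seed = job
    rng = np.random.default_rng(seed)  # per-task generator from the spawned seed
    # ... choose position sets with rng, count subsequences in the global
    # msa, and return (n, i, formatted_record) ...
    return n, i, ''
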
def count_msas(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('db_name')
    parser.add_argument('out_name')
    parser.add_argument('msas', nargs='*')
    parser.add_argument('--weights', nargs='*')
    parser.add_argument('--score', default='pearson',
                        choices=['pearson', 'spearman', 'pcttvd'])
    args = parser.parse_args(args)

    msas = [np.asfortranarray(seqload.loadSeqs(m)[0]) for m in args.msas]
    weights = None
    if args.weights:
        weights = [np.load(w) if w != 'None' else None for w in args.weights]

    positionsets = {}
    # drop a trailing '.db' extension if present (rstrip('.db') would also
    # eat trailing '.', 'd', 'b' characters of the name itself)
    db_name = args.db_name[:-3] if args.db_name.endswith('.db') else args.db_name
    with open('{}.db'.format(db_name), "rt") as f:
        while True:
            # each db record is a block of lines ending in a blank line; an
            # empty string from readline means EOF
            lines = []
            while lines == [] or lines[-1] not in ['\n', '']:
                lines.append(f.readline())
            if lines[-1] == '':
                break

            pos = [int(p) for p in lines[0].split()]
            slines = (l.split() for l in lines[1:] if not l.isspace())
            seqs, fs = zip(*((s, float(f)) for s, f in slines))
            seqs = [[alpha.index(c) for c in s] for s in seqs]
            seqs = np.array(seqs, dtype='u1')
            fs = np.array(fs, dtype='f4')

            npos = len(pos)
            if npos not in positionsets:
                positionsets[npos] = []
            positionsets[npos].append((pos, seqs, fs))

    npos = list(positionsets.keys())
    npos.sort()

    global count_msas_globals
    count_msas_globals = (msas, weights, args.score)

    print("Starting workers...")
    with Pool(os.cpu_count(), maxtasksperchild=64) as pool:
        print("Using {} workers".format(os.cpu_count()))
        res = pool.map(count_job,
                       (d for n in npos for d in positionsets[n]))

    res = np.array(res).reshape((len(npos), -1, len(msas)))
    np.save(args.out_name, res)

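# A minimal sketch (hypothetical helper) of the record layout count_msas
# parses above: a line of position indices, then one "subsequence frequency"
# line per observed subsequence, terminated by a blank line.
def _format_record(pos, seq_strs, freqs):
    head = " ".join(str(p) for p in pos)
    body = "\n".join("{} {}".format(s, f) for s, f in zip(seq_strs, freqs))
    return head + "\n" + body + "\n\n"
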
def main_train(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('vae_type', choices=vaes.keys())
    parser.add_argument('seqs')
    parser.add_argument('latent_dim', type=int)
    parser.add_argument('args', nargs='*')
    parser.add_argument('--epochs', type=int, default=32)
    parser.add_argument('--patience')
    parser.add_argument('--batch_size', type=int, default=200)
    parser.add_argument('--TVDseqs', action='store_true')
    args = parser.parse_args(args)

    latent_dim = args.latent_dim

    seqs = loadSeqs(args.seqs, alpha=ALPHA)[0]
    N, L = seqs.shape
    print("L = {}".format(L))

    #np.random.seed(42)
    #np.random.seed(256)

    batch_size = args.batch_size
    #inner_dim = args.inner_dim
    assert N % batch_size == 0

    # hold out the last ~10% of batches for validation. Note this assumes
    # n_batches >= 10: if validation_batches were 0, seqs[:-0] would select
    # nothing and the training set would be empty.
    n_batches = N // batch_size
    validation_batches = int(n_batches * 0.1)
    train_seqs = seqs[:-validation_batches * batch_size]
    val_seqs = seqs[-validation_batches * batch_size:]

    TVDseqs = None
    if args.TVDseqs:
        TVDseqs = val_seqs[:1000]

    patience = None
    if args.patience:
        patience = int(args.patience)

    vae = vaes[args.vae_type]()
    vae.create_model(L, q, latent_dim, batch_size, *args.args)
    vae.summarize()
    hist = vae.train(args.epochs, train_seqs, val_seqs, name=name,
                     TVDseqs=TVDseqs, early_patience=patience)
    vae.save(name)

    plot_performance(vae, hist, name)

def main():
    parser = argparse.ArgumentParser(
        description='Compute phylogenetic weights')
    parser.add_argument('-alpha', default='protgap')
    parser.add_argument('sim', default='none', help="Similarity Threshold")
    parser.add_argument('seqfile')
    parser.add_argument('outfile')
    args = parser.parse_args(sys.argv[1:])

    alphabets = {'protein': IUPAC.protein.letters,
                 'protgap': '-' + IUPAC.protein.letters,
                 'charge': '0+-',
                 'nuc': "ACGT"}
    letters = alphabets.get(args.alpha, args.alpha)
    nBases = len(letters)

    seqs = seqload.loadSeqs(args.seqfile, letters)[0]
    nSeq, seqLen = seqs.shape

    # interpret the sim argument: 'none' disables reweighting, 'unique'
    # downweights exact duplicates only, 'mN' groups sequences within N
    # mutations, and a bare float gives the fractional threshold directly
    if args.sim == 'none':
        sim = 0
    elif args.sim == 'unique':
        sim = 0.5 / seqLen
    elif args.sim.startswith('m'):
        sim = (float(args.sim[1:]) + 0.5) / seqLen
    else:
        sim = float(args.sim)
    sim = 1 - sim

    if sim < 0 or sim > 1:
        raise Exception("Similarity threshold must be between 0 and 1")

    if sim != 1.0:
        similarityCutoff = int(np.ceil((1 - sim) * seqLen))
        print("Identity cutoff:", similarityCutoff, file=sys.stderr)
        weights = 1.0 / seqtools.nsim(seqs, similarityCutoff)
    else:
        weights = np.ones(seqs.shape[0])

    M_eff = np.sum(weights)
    print(M_eff)
    np.save(args.outfile, weights)

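# A minimal sketch (assumption: seqtools.nsim returns, for each sequence, the
# number of sequences differing from it in fewer than `cutoff` positions,
# including itself). A dense O(N^2 L) reference for small MSAs, giving the
# usual reciprocal-neighbor phylogenetic weights:
def _phylo_weights_dense(seqs, cutoff):
    ndiff = (seqs[:, None, :] != seqs[None, :, :]).sum(axis=2)
    return 1.0 / (ndiff < cutoff).sum(axis=1)
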
def main_energy(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('seqs')
    parser.add_argument('--ref_energy')
    parser.add_argument('--n_samples', default=1000, type=int)
    args = parser.parse_args(args)

    seqs = loadSeqs(args.seqs, alpha=ALPHA)[0]
    vae = loadVAE(name)

    nlelbo = -vae.lELBO(seqs, n_samples=args.n_samples)
    logp = vae.logp(seqs, n_samples=args.n_samples)
    np.save('nlelbo_{}'.format(name), nlelbo)
    np.save('logp_{}'.format(name), logp)

    plt.figure(figsize=(6, 4))
    plt.plot(nlelbo, logp, '.')
    plt.xlabel(r'$-\log \mathrm{ELBO}(S)$')
    plt.ylabel(r'$\log p_\theta(S)$')
    plt.savefig("energies_{}.png".format(name), dpi=300)
    plt.close()

    if args.ref_energy:
        refE = np.load(args.ref_energy)

        plt.figure()
        plt.plot(refE, nlelbo, '.')
        plt.xlabel('Ref E')
        plt.ylabel(r'$-\log$ ELBO')
        plt.title(r'$\rho = {:.3f}$'.format(pearsonr(refE, nlelbo)[0]))
        plt.savefig("energies_elbo_{}.png".format(name))
        plt.close()

        plt.figure()
        plt.plot(refE, logp, '.')
        plt.xlabel('Ref E')
        plt.ylabel(r'$\log p(x)$')
        plt.title(r'$\rho = {:.3f}$'.format(pearsonr(refE, logp)[0]))
        plt.savefig("energies_logp_{}.png".format(name))
        plt.close()

#!/usr/bin/env python3
import numpy as np
import seqload, seqtools
from numpy.random import randint
import sys, os

# Greedily thin an MSA: repeatedly pick a random remaining sequence, keep it,
# and drop all remaining sequences whose fractional identity to it is >= the
# cutoff (cutoff = 1 - argv[2]).
s = seqload.loadSeqs(sys.argv[1])[0]
cutoff = 1 - float(sys.argv[2])

L = s.shape[1]
inds = []
out_seq = []
while s.shape[0] != 0:
    ind = randint(s.shape[0])
    out_seq.append(s[ind].copy())  # copy, so we keep no reference into s
    s = s[np.sum(s == s[ind, :], axis=1) / float(L) < cutoff, :]
    print(s.shape, file=sys.stderr)

with os.fdopen(sys.stdout.fileno(), 'wb', closefd=False) as fp:
    seqload.writeSeqs(fp, np.array(out_seq), noheader=True)

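# Usage sketch (script and file names are illustrative):
#   ./thin_msa.py input_msa 0.2 > thinned_msa
# With an argument of 0.2 the cutoff above is 0.8, so every sequence with
# fractional identity >= 0.8 to a kept sequence is dropped.
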
def loadSequenceFile(sfile, alpha, log):
    log("Loading sequences from file {}".format(sfile))
    seqs = seqload.loadSeqs(sfile, names=alpha)[0].astype('<u1')
    return seqs

def main_plot_latent(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('seqs')
    parser.add_argument('--save')
    args = parser.parse_args(args)

    seqs = loadSeqs(args.seqs, alpha=ALPHA)[0]
    # only plot first 10,000
    seqs = seqs[:10000]

    vae = loadVAE(name)
    latent_dim = vae.latent_dim

    m, lv = vae.encode(seqs)
    st = np.exp(lv / 2)

    if args.save:
        np.save(args.save, np.dstack([m, lv]))

    # make 1d distribution plots
    fig = plt.figure(figsize=(12, 12))
    nx = max(latent_dim // 2, 1)
    ny = (latent_dim - 1) // nx + 1
    for z1 in range(latent_dim):
        fig.add_subplot(nx, ny, z1 + 1)
        h, b, _ = plt.hist(m[:, z1], bins=100, density=True)
        # overlay the encoder's gaussian for the first (wildtype) sequence,
        # rescaled to the histogram peak
        wm, ws = m[0][z1], st[0][z1]
        x = np.linspace(wm - 5 * ws, wm + 5 * ws, 200)
        y = norm.pdf(x, wm, ws)
        y = y * np.max(h) / np.max(y)
        plt.plot(x, y, 'r-')
        plt.xlim(-5, 5)
        plt.title('Z{}, <z{}>_std={:.2f}'.format(z1, z1, np.std(m[:, z1])))
    plt.savefig('LatentTraining_1d_{}.png'.format(name))
    plt.close()

    # special plot for l=1 case: vary the 1 dimension, make movie of output
    if vae.latent_dim == 1:
        z = np.linspace(-4, 4, vae.batch_size)
        psm = vae.decode_bernoulli(z)

        import matplotlib.animation as animation

        fig = plt.figure(figsize=(16, 4))
        ax1 = plt.subplot(211)
        ax2 = plt.subplot(212)
        h, b, _ = ax1.hist(m[:, 0], bins=100, density=True)
        ax1.set_xlim(-4, 4)

        artists = []
        for p, zi in zip(psm, z):
            zpt = ax1.plot([zi], [0], 'r.', ms=20)[0]
            im = ax2.imshow(p.T, cmap='gray_r', interpolation='nearest',
                            animated=True)
            artists.append([im, zpt])
            #print("".join(ALPHA[c] for c in np.argmax(p, axis=1)))

        ani = animation.ArtistAnimation(fig, artists, interval=40, blit=True,
                                        repeat_delay=1000)
        #ani.save('vary_z1_{}.mp4'.format(name))
        ani.save('vary_z1_{}.gif'.format(name), dpi=80, writer='imagemagick')
        plt.close()
        return

    # make 2d distribution plots, one panel per pair of latent dimensions,
    # arranged as the upper triangle of a (latent_dim-1)^2 grid
    r = 4
    s = np.linspace(-r, r, 50)
    X, Y = np.meshgrid(s, s)
    red = np.broadcast_to(np.array([1., 0, 0, 1]), (len(s), len(s), 4)).copy()

    fig = plt.figure(figsize=(12, 12))
    counter = 0
    for z1 in range(latent_dim):
        print('Var z{}: {}'.format(z1, np.var(m[:, z1])))
        for z2 in range(z1 + 1, latent_dim):
            counter += 1
            fig.add_subplot(latent_dim - 1, latent_dim - 1, counter)
            plt.hist2d(m[:, z2], m[:, z1], bins=np.linspace(-r, r, 50),
                       cmap=DarkBlue, cmin=1)

            # translucent red overlay of the encoder's gaussian for the
            # first sequence in this pair of latent dimensions
            nn = (norm.pdf(X, m[0][z2], st[0][z2]) *
                  norm.pdf(Y, m[0][z1], st[0][z1]))
            nn = nn / np.max(nn) / 1.5
            red[:, :, 3] = nn
            plt.imshow(red, extent=(-r, r, -r, r), origin='lower', zorder=2)

            ##wildtype in red
            #plt.scatter(m[0][z1], m[0][z2], c="r", alpha=1)

            # make 1std oval for wt
            #wtv = Ellipse((m[0][z2], m[0][z1]),
            #              width=st[0][z2], height=st[0][z1],
            #              facecolor='none', edgecolor='red', lw=2)
            #plt.gca().add_patch(wtv)
            #wtv = Ellipse((m[0][z2], m[0][z1]),
            #              width=2*st[0][z2], height=2*st[0][z1],
            #              facecolor='none', edgecolor='red', lw=1)
            #plt.gca().add_patch(wtv)

            plt.xlim(-4, 4)
            plt.ylim(-4, 4)

            fs = 26
            if latent_dim <= 7:
                fs *= 2
            if z1 == 0:
                plt.xlabel('$z_{{{}}}$'.format(z2), fontsize=fs,
                           labelpad=fs / 2)
                plt.gca().xaxis.set_label_position('top')
            if z2 == latent_dim - 1:
                plt.ylabel('$z_{{{}}}$'.format(z1), fontsize=fs)
                plt.gca().yaxis.set_label_position('right')
            plt.xticks([])
            plt.yticks([])
        # skip the lower-triangle cells of the next grid row
        counter += z1 + 1
    plt.subplots_adjust(right=0.92, bottom=0.01, left=0.01, top=0.92)
    plt.savefig('LatentTraining_{}.png'.format(name))
    plt.close()