def main():
    parser = argparse.ArgumentParser(
        description='Apply alphabet reduction to MSA')
    parser.add_argument('file', help='either seq file or bimarg file')
    parser.add_argument('alphamap')
    parser.add_argument('--alpha', default='protgap')
    parser.add_argument('--out')

    args = parser.parse_args(sys.argv[1:])
    alphabets = {
        'protein': IUPAC.protein.letters,
        'protgap': '-' + IUPAC.protein.letters,
        'charge': '0+-',
        'nuc': "ACGT"
    }
    alpha = alphabets.get(args.alpha, args.alpha)  # named alphabet, or literal letters

    with open(args.alphamap) as f:
        # assumed to be a file containing the output of alphabet reduction,
        # restricted to a single reduction level. Each line should look like:
        # ALPHA8 -DNAGSQFMYCI E HWP K L R T V
        newalphas = [a.split()[1:] for a in f.readlines()]
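        # e.g. the example line above parses to
        # ['-DNAGSQFMYCI', 'E', 'HWP', 'K', 'L', 'R', 'T', 'V'],
        # one group of original letters per reduced-alphabet symbol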

    try:
        bimarg = np.load(args.file)
    except ValueError:
        seqs = seqload.loadSeqs(args.file, alpha)[0]
        reduceSeqAlphaPerpos(seqs, newalphas, alpha, args.out)
    else:
        reduceBimAlphaPerpos(bimarg, newalphas, alpha, args.out)
Example #2
def main_TVD(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('ref_seqs')
    parser.add_argument('--save')
    args = parser.parse_args(args)

    ref_seqs = loadSeqs(args.ref_seqs, alpha=ALPHA)[0]

    vae = loadVAE(name)

    rh = histsim(ref_seqs).astype(float)
    rh = rh / np.sum(rh)

    seqs = np.concatenate(list(vae.generate(10000)))
    h = histsim(seqs).astype(float)
    h[-1] = 0
    h = h / np.sum(h)
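
    # not computed in the original snippet: the total variation distance the
    # function is named for, assuming both histograms have the same length
    tvd = 0.5 * np.sum(np.abs(rh - h))
    print("TVD = {:.4f}".format(tvd))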

    if args.save:
        np.save(args.save, h)

    plt.figure()
    plt.plot(rh, label='ref')
    plt.plot(h, label='model')
    plt.legend()
    plt.savefig("TVD_{}.png".format(name))
Example #3
def main_seq_accuracy(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('seqs')
    args = parser.parse_args(args)

    seqs = loadSeqs(args.seqs, alpha=ALPHA)[0]
    N, L = seqs.shape

    vae = loadVAE(name)

    # pad sequences to batch_size
    padN = ((N - 1) // vae.batch_size + 1) * vae.batch_size
    padseqs = np.tile(seqs, ((padN - 1) // N + 1, 1))[:padN]
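    # e.g. N=950, batch_size=200 -> padN=1000: tile the sequences twice and
    # keep the first 1000 rows so every batch is full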

    brnll = vae.single_sample(padseqs)
    # reshape to (N, L, q), drop the padding rows, and renormalize each
    # position into a probability distribution over the q letters
    brnll = brnll.reshape((brnll.shape[0], L, q))[:N, :, :]
    pwm = brnll / np.sum(brnll, axis=-1, keepdims=True)

    for n, s, o in zip(range(N), seqs, pwm):
        o = o.reshape(L, q)
        acc = np.mean(s == np.argmax(o, axis=1))
        p = np.sum(np.log(o[np.arange(L), s]))

        plt.figure(figsize=(16, 4))
        plt.imshow(o.T, cmap='gray_r', interpolation='nearest')
        plt.plot(np.arange(L), s, 'r.', ms=2)
        plt.title("Seq {}   Acc={:.3f} log-p={:.4f}".format(str(n), acc, p))
        plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
        plt.savefig("test_seq_{}_{}.png".format(name, n))
        plt.close()
Example #4
def loadSequenceDir(sdir, alpha, log):
    log("Loading sequences from dir {}".format(sdir))
    seqs = []
    # read seqs-0, seqs-1, ... until the next numbered file is missing
    while True:
        sfile = os.path.join(sdir, 'seqs-{}'.format(len(seqs)))
        if not os.path.exists(sfile):
            break
        seqs.append(seqload.loadSeqs(sfile, names=alpha)[0].astype('<u1'))
    return seqs
Example #5
def make_db(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('name')
    parser.add_argument('reps', type=int)
    parser.add_argument('msa')
    parser.add_argument('--weights')
    parser.add_argument('--npos', default='2-10')
    parser.add_argument('--topmode',
                        default='20',
                        help='integer, string of form ">0.01", or "nonzero"')
    args = parser.parse_args(args)

    reps = args.reps
    # note: strip('.db') would remove any of '.', 'd', 'b' from both ends;
    # remove only an actual '.db' suffix
    name = args.name[:-3] if args.name.endswith('.db') else args.name
    topmode = args.topmode

    dataseqs = seqload.loadSeqs(args.msa)[0]
    weights = None
    if args.weights is not None:
        weights = np.load(args.weights).astype('f4')
        print("using weights")

    Nd, L = dataseqs.shape
    msa = np.asfortranarray(dataseqs)

    npos_lo, npos_hi = [int(x) for x in args.npos.split('-')]
    npos_range = range(npos_lo, npos_hi + 1)

    root_seed = np.random.SeedSequence()

    # set up globals *before* forking
    global makedb_globals
    makedb_globals = (msa, weights, topmode)

    print("Starting workers...")
    print("")
    # largest npos first, presumably so the slowest jobs are scheduled early
    jobs = ((n, i, root_seed.spawn(1)[0]) for n in npos_range[::-1]
            for i in range(reps))

    # limit each child to 64 tasks to work around a memory leak in the child
    # processes that I can't track down. I suspect a bug in multiprocessing,
    # since others report similar unsolved leaks:
    # https://stackoverflow.com/questions/21485319/high-memory-usage-using-python-multiprocessing
    # https://stackoverflow.com/questions/56922672/multiprocessing-child-task-continues-to-leak-more-memory
    with Pool(os.cpu_count(), maxtasksperchild=64) as pool:
        with open("{}.db".format(name), "wt") as f:
            for n, i, dat in pool.imap_unordered(makedb_job, jobs):
                f.write(dat)
                print("\r{} {}       ".format(n, i), end="")

    print("Done!")
Example #6
def count_msas(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('db_name')
    parser.add_argument('out_name')
    parser.add_argument('msas', nargs='*')
    parser.add_argument('--weights', nargs='*')
    parser.add_argument('--score',
                        default='pearson',
                        choices=['pearson', 'spearman', 'pcttvd'])
    args = parser.parse_args(args)

    msas = [np.asfortranarray(seqload.loadSeqs(m)[0]) for m in args.msas]
    weights = None
    if args.weights:
        weights = [np.load(w) if w != 'None' else None for w in args.weights]

    positionsets = {}
    # rstrip('.db') would eat any trailing '.', 'd', 'b' characters; remove
    # only an actual '.db' suffix
    db_name = args.db_name[:-3] if args.db_name.endswith('.db') else args.db_name
    with open('{}.db'.format(db_name), "rt") as f:
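        # assumed db block layout, inferred from the parsing below: blocks are
        # separated by a blank line; the first line of a block lists the
        # positions, and each following line is a "SEQUENCE frequency" pair:
        #   3 17 42
        #   AGK 0.0123
        #   AGR 0.0088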
        while True:
            lines = []
            while lines == [] or lines[-1] not in ['\n', '']:
                lines.append(f.readline())
            if lines[-1] == '':
                break

            pos = [int(p) for p in lines[0].split()]
            slines = (l.split() for l in lines[1:] if not l.isspace())
            seqs, fs = zip(*((s, float(f)) for s, f in slines))
            seqs = [[alpha.index(c) for c in s] for s in seqs]
            seqs = np.array(seqs, dtype='u1')
            fs = np.array(fs, dtype='f4')

            npos = len(pos)
            if npos not in positionsets:
                positionsets[npos] = []
            positionsets[npos].append((pos, seqs, fs))
    npos = list(positionsets.keys())
    npos.sort()

    global count_msas_globals
    count_msas_globals = (msas, weights, args.score)

    print("Starting workers...")
    with Pool(os.cpu_count(), maxtasksperchild=64) as pool:
        print("Using {} workers".format(os.cpu_count()))
        res = pool.map(count_job, (d for n in npos for d in positionsets[n]))
    # assumes every npos level contributes the same number of position sets
    res = np.array(res).reshape((len(npos), -1, len(msas)))
    np.save(args.out_name, res)
Example #7
def main_train(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('vae_type', choices=vaes.keys())
    parser.add_argument('seqs')
    parser.add_argument('latent_dim', type=int)
    parser.add_argument('args', nargs='*')
    parser.add_argument('--epochs', type=int, default=32)
    parser.add_argument('--patience', type=int)
    parser.add_argument('--batch_size', type=int, default=200)
    parser.add_argument('--TVDseqs', action='store_true')
    args = parser.parse_args(args)

    latent_dim = args.latent_dim
    seqs = loadSeqs(args.seqs, alpha=ALPHA)[0]
    N, L = seqs.shape
    print("L = {}".format(L))

    #np.random.seed(42)
    #np.random.seed(256)
    batch_size = args.batch_size
    #inner_dim = args.inner_dim

    assert N % batch_size == 0
    n_batches = N // batch_size
    # hold out ~10% of batches for validation, but at least one, so that
    # train_seqs = seqs[:-0] (an empty slice) can never occur
    validation_batches = max(1, int(n_batches * 0.1))
    train_seqs = seqs[:-validation_batches * batch_size]
    val_seqs = seqs[-validation_batches * batch_size:]
    TVDseqs = None
    if args.TVDseqs:
        TVDseqs = val_seqs[:1000]
    patience = args.patience

    vae = vaes[args.vae_type]()
    vae.create_model(L, q, latent_dim, batch_size, *args.args)
    vae.summarize()
    hist = vae.train(args.epochs,
                     train_seqs,
                     val_seqs,
                     name=name,
                     TVDseqs=TVDseqs,
                     early_patience=patience)
    vae.save(name)
    plot_performance(vae, hist, name)
Example #8
def main():
    parser = argparse.ArgumentParser(description='Compute phylogenetic weights')
    parser.add_argument('-alpha', default='protgap')
    # a positional argument's default is only honored with nargs='?'
    parser.add_argument('sim', nargs='?', default='none',
                        help="similarity threshold")
    parser.add_argument('seqfile')
    parser.add_argument('outfile')

    args = parser.parse_args(sys.argv[1:])
    
    alphabets = {'protein': IUPAC.protein.letters, 
                 'protgap': '-' + IUPAC.protein.letters, 
                 'charge': '0+-', 
                 'nuc': "ACGT"}
    letters = alphabets.get(args.alpha, args.alpha)  # named alphabet, or literal letters
    nBases = len(letters)

    seqs = seqload.loadSeqs(args.seqfile, letters)[0]
    nSeq, seqLen = seqs.shape

    if args.sim == 'none':
        sim = 0
    elif args.sim == 'unique':
        sim = 0.5/seqLen
    elif args.sim.startswith('m'):
        sim = (float(args.sim[1:])+0.5)/seqLen
    else:
        sim = float(args.sim)

    sim = 1-sim
    if sim < 0 or sim > 1:
        raise ValueError("Similarity threshold must be between 0 and 1")

    if sim != 1.0:
        similarityCutoff = int(np.ceil((1-sim)*seqLen))
        print("Identity cutoff:", similarityCutoff, file=sys.stderr)
        weights = 1.0/seqtools.nsim(seqs, similarityCutoff)
    else:
        weights = np.ones(seqs.shape[0])
    M_eff = np.sum(weights)  # effective number of sequences after downweighting
    print(M_eff)

    np.save(args.outfile, weights)
Example #9
def main_energy(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('seqs')
    parser.add_argument('--ref_energy')
    parser.add_argument('--n_samples', default=1000, type=int)
    args = parser.parse_args(args)

    seqs = loadSeqs(args.seqs, alpha=ALPHA)[0]

    vae = loadVAE(name)

    nlelbo = -vae.lELBO(seqs, n_samples=args.n_samples)
    logp = vae.logp(seqs, n_samples=args.n_samples)

    np.save('nlelbo_{}'.format(name), nlelbo)
    np.save('logp_{}'.format(name), logp)

    plt.figure(figsize=(6, 4))
    plt.plot(nlelbo, logp, '.')
    plt.xlabel(r'$-\log \mathrm{ELBO}(S)$')
    plt.ylabel(r'$\log p_\theta(S)$')
    plt.savefig("energies_{}.png".format(name), dpi=300)
    plt.close()

    if args.ref_energy:
        refE = np.load(args.ref_energy)

        plt.figure()
        plt.plot(refE, nlelbo, '.')
        plt.xlabel('Ref E')
        plt.ylabel(r'$-\log$ ELBO')
        plt.title(r'$\rho = {:.3f}$'.format(pearsonr(refE, nlelbo)[0]))
        plt.savefig("energies_elbo_{}.png".format(name))
        plt.close()

        plt.figure()
        plt.plot(refE, logp, '.')
        plt.xlabel('Ref E')
        plt.ylabel(r'$\log p(x)$')
        plt.title(r'$\rho = {:.3f}$'.format(pearsonr(refE, logp)[0]))
        plt.savefig("energies_logp_{}.png".format(name))
        plt.close()
Example #10
#!/usr/bin/env python3
import numpy as np
import seqload, seqtools
from numpy.random import randint
import sys, os

s = seqload.loadSeqs(sys.argv[1])[0]
cutoff = 1 - float(sys.argv[2])
L = s.shape[1]

inds = []
out_seq = []
while s.shape[0] != 0:
    ind = randint(s.shape[0])
    out_seq.append(s[ind].copy())  # no ref to s
    # keep only the sequences whose fractional identity to the picked one is
    # below the cutoff
    s = s[np.sum(s == s[ind, :], axis=1) / float(L) < cutoff, :]
    print(s.shape, file=sys.stderr)

# write binary output to stdout's file descriptor without closing stdout
with os.fdopen(sys.stdout.fileno(), 'wb', closefd=False) as fp:
    seqload.writeSeqs(fp, np.array(out_seq), noheader=True)
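
# usage sketch (the script name is hypothetical):
#   python greedy_filter.py msa_file 0.2 > filtered_msa
# greedily keeps a random subset in which no two surviving sequences share
# fractional identity of 1 - 0.2 = 0.8 or more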
Example #11
def loadSequenceFile(sfile, alpha, log):
    log("Loading sequences from file {}".format(sfile))
    seqs = seqload.loadSeqs(sfile, names=alpha)[0].astype('<u1')
    return seqs
Example #12
def main_plot_latent(name, args):
    parser = argparse.ArgumentParser()
    parser.add_argument('seqs')
    parser.add_argument('--save')
    args = parser.parse_args(args)

    seqs = loadSeqs(args.seqs, alpha=ALPHA)[0]
    # only plot first 10,000
    seqs = seqs[:10000]

    vae = loadVAE(name)
    latent_dim = vae.latent_dim

    m, lv = vae.encode(seqs)
    st = np.exp(lv / 2)  # log-variance -> standard deviation

    if args.save:
        np.save(args.save, np.dstack([m, lv]))

    # make 1d distribution plots
    fig = plt.figure(figsize=(12, 12))
    nx = max(latent_dim // 2, 1)
    ny = (latent_dim - 1) // nx + 1
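    # e.g. latent_dim=8 -> a 4x2 grid; nx * ny >= latent_dim always holds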
    for z1 in range(latent_dim):
        fig.add_subplot(nx, ny, z1 + 1)
        h, b, _ = plt.hist(m[:, z1], bins=100, density=True)
        # overlay the encoder posterior of sequence 0 (the wildtype, per the
        # commented-out "wildtype in red" code below), scaled to the histogram peak
        wm, ws = m[0][z1], st[0][z1]
        x = np.linspace(wm - 5 * ws, wm + 5 * ws, 200)
        y = norm.pdf(x, wm, ws)
        y = y * np.max(h) / np.max(y)
        plt.plot(x, y, 'r-')
        plt.xlim(-5, 5)
        plt.title('Z{}, <z{}>_std={:.2f}'.format(z1, z1, np.std(m[:, z1])))

    plt.savefig('LatentTraining_1d_{}.png'.format(name))
    plt.close()

    # special plot for l=1 case: vary the 1 dimension, make movie of output
    if vae.latent_dim == 1:
        z = np.linspace(-4, 4, vae.batch_size)
        psm = vae.decode_bernoulli(z)

        import matplotlib.animation as animation
        fig = plt.figure(figsize=(16, 4))
        ax1 = plt.subplot(211)
        ax2 = plt.subplot(212)
        h, b, _ = ax1.hist(m[:, 0], bins=100, density=True)
        ax1.set_xlim(-4, 4)
        artists = []
        for p, zi in zip(psm, z):
            zpt = ax1.plot([zi], [0], 'r.', ms=20)[0]
            im = ax2.imshow(p.T,
                            cmap='gray_r',
                            interpolation='nearest',
                            animated=True)
            artists.append([im, zpt])
            #print("".join(ALPHA[c] for c in np.argmax(p, axis=1)))
        ani = animation.ArtistAnimation(fig,
                                        artists,
                                        interval=40,
                                        blit=True,
                                        repeat_delay=1000)
        #ani.save('vary_z1_{}.mp4'.format(name))
        ani.save('vary_z1_{}.gif'.format(name), dpi=80, writer='imagemagick')
        plt.close()
        return

    # make 2d distribution plots
    r = 4
    s = np.linspace(-r, r, 50)
    X, Y = np.meshgrid(s, s)
    # a solid-red RGBA image; the alpha channel is filled in per panel below
    red = np.broadcast_to(np.array([1., 0, 0, 1]), (len(s), len(s), 4)).copy()

    fig = plt.figure(figsize=(12, 12))
    counter = 0
    for z1 in range(latent_dim):
        print('Var z{}: {}'.format(z1, np.var(m[:, z1])))
        for z2 in range(z1 + 1, latent_dim):
            counter += 1
            fig.add_subplot(latent_dim - 1, latent_dim - 1, counter)

            plt.hist2d(m[:, z2],
                       m[:, z1],
                       bins=np.linspace(-r, r, 50),
                       cmap=DarkBlue,
                       cmin=1)

            nn = (norm.pdf(X, m[0][z2], st[0][z2]) *
                  norm.pdf(Y, m[0][z1], st[0][z1]))
            nn = nn / np.max(nn) / 1.5
            red[:, :, 3] = nn
            plt.imshow(red, extent=(-r, r, -r, r), origin='lower', zorder=2)

            ##wildtype in red
            #plt.scatter(m[0][z1], m[0][z2],c="r", alpha=1)
            # make 1std oval for wt
            #wtv = Ellipse((m[0][z2],  m[0][z1]),
            #              width=st[0][z2], height=st[0][z1],
            #              facecolor='none', edgecolor='red', lw=2)
            #plt.gca().add_patch(wtv)
            #wtv = Ellipse((m[0][z2],  m[0][z1]),
            #              width=2*st[0][z2], height=2*st[0][z1],
            #              facecolor='none', edgecolor='red', lw=1)
            #plt.gca().add_patch(wtv)
            plt.xlim(-4, 4)
            plt.ylim(-4, 4)

            fs = 26
            if latent_dim <= 7:
                fs *= 2
            if z1 == 0:
                plt.xlabel('$z_{{{}}}$'.format(z2),
                           fontsize=fs,
                           labelpad=fs / 2)
                plt.gca().xaxis.set_label_position('top')
            if z2 == latent_dim - 1:
                plt.ylabel('$z_{{{}}}$'.format(z1), fontsize=fs)
                plt.gca().yaxis.set_label_position('right')

            plt.xticks([])
            plt.yticks([])
        counter += z1 + 1  # skip the lower-triangle cells at the start of the next row

    plt.subplots_adjust(right=0.92, bottom=0.01, left=0.01, top=0.92)
    plt.savefig('LatentTraining_{}.png'.format(name))
    plt.close()