Ejemplo n.º 1
0
def predict_contacts(model, x, y, use_cuda):
    """Embed a minibatch and return per-sequence contact logits and labels.

    The sequences in ``x`` are packed with ``pack_sequences``, passed through
    ``model`` and unpacked back into their original order. For every sequence
    the embedding is fed to ``model.predict`` and the result is flattened;
    the matching entry of ``y`` is flattened the same way. Positions whose
    label is negative are dropped from both the logits and the labels.

    Args:
        model: callable embedding model that also exposes ``predict``.
        x: minibatch of sequences accepted by ``pack_sequences``.
        y: per-sequence label tensors, indexable in step with ``x``.
        use_cuda: when true, each label tensor is moved to the GPU first.

    Returns:
        ``(logits, y_list)``: two lists with one flat tensor per sequence,
        restricted to positions whose label is not negative.
    """
    num_seqs = len(x)
    packed, order = pack_sequences(x)
    packed = PackedSequence(Variable(packed.data), packed.batch_sizes)
    embeddings = unpack_sequences(model(packed), order)  # embed the sequences

    logits = []
    y_list = []
    for idx in range(num_seqs):
        flat_logits = model.predict(embeddings[idx].unsqueeze(0)).view(-1)

        flat_labels = y[idx].view(-1)
        if use_cuda:
            flat_labels = flat_labels.cuda()

        # negative labels mark positions to ignore
        keep = ~(flat_labels < 0)

        logits.append(flat_logits[keep])
        y_list.append(flat_labels[keep])

    return logits, y_list
Ejemplo n.º 2
0
def eval_similarity(model, test_iterator, use_cuda):
    """Evaluate pairwise similarity predictions over a whole test iterator.

    For each minibatch both sides of the pairs are concatenated, packed,
    embedded with ``model`` and unpacked; ``model.score`` is then applied to
    each pair. All logits and labels are gathered before the metrics are
    computed, so the metrics describe the full iterator, not one batch.

    The caller is responsible for putting ``model`` into eval mode.

    Args:
        model: callable embedding model that also exposes ``score``.
        test_iterator: iterable yielding ``(x0, x1, y_mb)`` minibatches, where
            ``x0`` and ``x1`` are equal-length lists of sequences and ``y_mb``
            holds the target level index of each pair.
        use_cuda: when true, the labels are moved to the GPU.

    Returns:
        ``(loss, accuracy, mse, r, rho)``: cross-entropy value, hard-label
        accuracy, mean squared error of the expected level, and the Pearson
        and Spearman correlations between expected and true levels.
    """
    y = []
    logits = []

    # Nothing here is backpropagated; disabling autograd keeps the graphs of
    # all test batches from being held in memory until the end of the loop.
    with torch.no_grad():
        for x0, x1, y_mb in test_iterator:

            if use_cuda:
                y_mb = y_mb.cuda()
            y.append(y_mb.long())

            b = len(x0)
            x = x0 + x1

            x, order = pack_sequences(x)
            x = PackedSequence(Variable(x.data), x.batch_sizes)
            z = model(x)  # embed the sequences
            z = unpack_sequences(z, order)

            z0 = z[:b]
            z1 = z[b:]

            for i in range(b):
                logits.append(model.score(z0[i], z1[i]))

        y = torch.cat(y, 0)
        logits = torch.stack(logits, 0)

        # Turn the threshold logits into one probability per level:
        # level k gets p_ge[k] * p_lt[k] with p_ge = [1, s], p_lt = [1 - s, 1]
        # and s = sigmoid(logits).
        p = torch.sigmoid(logits)
        ones = p.new_ones((p.size(0), 1))
        p_ge = torch.cat([ones, p], 1)
        p_lt = torch.cat([1 - p, ones], 1)
        p = p_ge * p_lt
        p = p / p.sum(1, keepdim=True)  # make sure p is normalized

        # NOTE(review): F.cross_entropy applies log-softmax to its input, but
        # p already holds probabilities. Left unchanged so reported losses
        # stay comparable with earlier runs - confirm whether this is intended.
        loss = F.cross_entropy(p, y).item()

        _, y_hard = torch.max(p, 1)
        # one level per probability column instead of a hard-coded count
        levels = torch.arange(p.size(1), device=p.device, dtype=p.dtype)
        y_hat = torch.sum(p * levels, 1)

        accuracy = torch.mean((y == y_hard).float()).item()
        mse = torch.mean((y.float() - y_hat)**2).item()

    y = y.cpu().numpy()
    y_hat = y_hat.cpu().numpy()

    r, _ = pearsonr(y_hat, y)
    rho, _ = spearmanr(y_hat, y)

    return loss, accuracy, mse, r, rho
    def __call__(self, x):
        """Embed a batch of numpy-encoded sequences.

        Each array in ``x`` is converted to a long tensor, the batch is packed
        with ``pack_sequences`` and moved to the GPU when ``self.use_cuda`` is
        set. With ``self.full_features`` set the batch goes through
        ``featurize`` using the stored LM embedding, LSTM stack and
        projection; otherwise it goes through ``self.model``. The result is
        unpacked back into the input order before being returned.
        """
        packed, order = pack_sequences(
            [torch.from_numpy(seq).long() for seq in x])
        if self.use_cuda:
            packed = packed.cuda()

        if not self.full_features:
            embedded = self.model(packed)  # embed the sequences
        else:
            embedded = featurize(packed, self.lm_embed, self.lstm_stack,
                                 self.proj)

        return unpack_sequences(embedded, order)
Ejemplo n.º 4
0
def similarity_grad(model, x0, x1, y, use_cuda, weight=0.5):
    """Backpropagate the weighted similarity loss for one paired minibatch.

    Both sides of the pairs are embedded in a single packed batch and each
    pair is scored with ``model.score``. The binary cross-entropy between the
    stacked logits and ``y`` is scaled by ``weight`` and backpropagated; the
    optimizer step is left to the caller. Minibatch metrics are then computed
    without gradient tracking.

    Args:
        model: callable embedding model that also exposes ``score``.
        x0, x1: equal-length lists of sequences forming the pairs.
        y: per-pair binary targets, one column per logit; the column sum is
            used as the integer level for the metrics.
        use_cuda: when true, ``y`` is moved to the GPU.
        weight: factor applied to the loss before ``backward``.

    Returns:
        ``(loss, correct, mse, b)``: cross-entropy value computed from the
        level probabilities, number of correct hard predictions, mean squared
        error of the expected level, and the number of pairs.
    """
    if use_cuda:
        y = y.cuda()
    y = Variable(y)

    b = len(x0)
    packed, order = pack_sequences(x0 + x1)
    packed = PackedSequence(Variable(packed.data), packed.batch_sizes)
    embedded = unpack_sequences(model(packed), order)  # embed the sequences

    first = embedded[:b]
    second = embedded[b:]
    logits = torch.stack(
        [model.score(first[i], second[i]) for i in range(b)], 0)

    bce = F.binary_cross_entropy_with_logits(logits, y.float())

    # backprop weighted loss
    (bce * weight).backward()

    # calculate minibatch performance metrics
    with torch.no_grad():
        probs = F.sigmoid(logits)
        unit_col = probs.new(b, 1).zero_() + 1
        at_least = torch.cat([unit_col, probs], 1)
        below_next = torch.cat([1 - probs, unit_col], 1)
        probs = at_least * below_next
        probs = probs / probs.sum(1, keepdim=True)  # keep rows normalized

        _, y_hard = torch.max(probs, 1)
        levels = torch.arange(5).to(probs.device)
        y_hat = torch.sum(probs * levels, 1)
        y = torch.sum(y.data, 1)  # collapse binary targets into a level

        # NOTE(review): F.cross_entropy receives probabilities here, not
        # logits - confirm this is the intended reported loss.
        loss = F.cross_entropy(probs, y).item()

        correct = torch.sum((y == y_hard).float()).item()
        mse = torch.mean((y.float() - y_hat)**2).item()

    return loss, correct, mse, b
Ejemplo n.º 5
0
def predict_minibatch(model, x, use_cuda):
    """Return one square logit matrix per sequence in the minibatch.

    The sequences are packed, embedded with ``model`` and unpacked into their
    original order. Each embedding is passed to ``model.predict`` and the
    output is reshaped to ``(L, L)``, where ``L`` is the first dimension of
    that embedding.

    ``use_cuda`` is accepted for interface compatibility but is not used.
    """
    count = len(x)
    packed, order = pack_sequences(x)
    packed = PackedSequence(packed.data, packed.batch_sizes)
    embedded = unpack_sequences(model(packed), order)  # embed the sequences

    def square_logits(zi):
        # reshape the prediction for one sequence into an L x L matrix
        return model.predict(zi.unsqueeze(0)).view(zi.size(0), zi.size(0))

    return [square_logits(embedded[idx]) for idx in range(count)]
    def __call__(self, x, y):
        """Score each pair ``(x[i], y[i])`` of numpy-encoded sequences.

        All sequences are embedded in one packed batch without gradient
        tracking. In ``'align'`` mode each pair is scored with
        ``self.model.score`` and the score is the expected level under the
        resulting level probabilities. In ``'coarse'`` mode the score is the
        negative L1 distance between the mean embeddings. Any other mode
        leaves the scores at zero.

        Returns:
            numpy array with one score per pair.
        """
        n = len(x)
        tensors = [
            torch.from_numpy(seq).long() for group in (x, y) for seq in group
        ]

        packed, order = pack_sequences(tensors)
        if self.use_cuda:
            packed = packed.cuda()

        with torch.no_grad():
            # embed the sequences; the first n entries belong to x, the rest to y
            embedded = unpack_sequences(self.model(packed), order)

            scores = np.zeros(n)
            mode = self.mode
            if mode == 'align':
                for i in range(n):
                    logits = self.model.score(embedded[i], embedded[i + n])
                    sig = F.sigmoid(logits).cpu()

                    # level k gets at_least[k] * below_next[k]
                    at_least = torch.ones(sig.size(0) + 1)
                    at_least[1:] = sig
                    below_next = torch.ones(sig.size(0) + 1)
                    below_next[:-1] = 1 - sig

                    probs = at_least * below_next
                    probs = probs / probs.sum()  # keep probs normalized
                    levels = torch.arange(5).float()
                    scores[i] = torch.sum(probs * levels).item()

            elif mode == 'coarse':
                mean_x = torch.stack([e.mean(0) for e in embedded[:n]], 0)
                mean_y = torch.stack([e.mean(0) for e in embedded[n:]], 0)
                l1_dist = torch.sum(torch.abs(mean_x - mean_y), 1)
                scores[:] = -l1_dist.cpu().numpy()

        return scores
Ejemplo n.º 7
0
def contacts_grad(model, x, y, use_cuda, weight=0.5):
    """Backpropagate the weighted contact-prediction loss for one minibatch.

    The sequences are packed, embedded and unpacked; ``model.predict`` is
    applied per sequence and all flattened logits are concatenated. The
    labels are flattened and concatenated the same way, and positions with a
    negative label are removed from both. The binary cross-entropy over the
    remaining positions is scaled by ``weight`` and backpropagated; the
    optimizer step is left to the caller.

    Args:
        model: callable embedding model that also exposes ``predict``.
        x: minibatch of sequences accepted by ``pack_sequences``.
        y: iterable of per-sequence label tensors.
        use_cuda: when true, the concatenated labels are moved to the GPU.
        weight: factor applied to the loss before ``backward``.

    Returns:
        ``(loss, tp, gp, pp, b)``: unweighted loss value, sum of predicted
        probability times label, sum of labels, sum of predicted
        probabilities, and the number of positions kept.
    """
    num_seqs = len(x)
    packed, order = pack_sequences(x)
    packed = PackedSequence(Variable(packed.data), packed.batch_sizes)
    embedded = unpack_sequences(model(packed), order)  # embed the sequences

    logits = torch.cat([
        model.predict(embedded[idx].unsqueeze(0)).view(-1)
        for idx in range(num_seqs)
    ], 0)

    y = torch.cat([yi.view(-1) for yi in y])
    if use_cuda:
        y = y.cuda()

    # negative labels mark positions to ignore
    keep = ~(y < 0)
    logits = logits[keep]
    y = Variable(y[keep])
    b = y.size(0)

    loss = F.binary_cross_entropy_with_logits(logits, y)

    # backprop weighted loss
    (loss * weight).backward()

    # soft counts used by the caller for recall and precision
    with torch.no_grad():
        p_hat = F.sigmoid(logits)
        tp = torch.sum(p_hat * y).item()
        gp = y.sum().item()
        pp = p_hat.sum().item()

    return loss.item(), tp, gp, pp, b
def _score_pairs(model, x0, x1):
    """Embed both sides of a paired minibatch together and score each pair."""
    b = len(x0)
    x, order = pack_sequences(x0 + x1)
    x = PackedSequence(Variable(x.data), x.batch_sizes)
    z = unpack_sequences(model(x), order)  # embed the sequences

    z0 = z[:b]
    z1 = z[b:]
    return [model.score(z0[i], z1[i]) for i in range(b)]


def _ordinal_probabilities(logits):
    """Turn a (batch, k) matrix of threshold logits into (batch, k + 1) level
    probabilities: level j gets p_ge[j] * p_lt[j] with p_ge = [1, s],
    p_lt = [1 - s, 1] and s = sigmoid(logits); rows are renormalized."""
    p = torch.sigmoid(logits)
    ones = p.new_ones((p.size(0), 1))
    p_ge = torch.cat([ones, p], 1)
    p_lt = torch.cat([1 - p, ones], 1)
    p = p_ge * p_lt
    return p / p.sum(1, keepdim=True)  # make sure p is normalized


def main():
    """Train a sequence-embedding similarity model on SCOP and log metrics.

    Parses the command line, loads the training sequences and the sampled
    test pairs, builds a weighted all-pairs training sampler, constructs the
    embedding and ordinal-regression comparison model, and runs the training
    loop. After every epoch one tab-separated ``train`` line and one ``test``
    line are written to the output (stdout or ``--output``), and the model is
    saved when ``--save-prefix`` is given.
    """
    import argparse
    parser = argparse.ArgumentParser(
        'Script for training embedding model on SCOP.')

    parser.add_argument('--dev',
                        action='store_true',
                        help='use train/dev split')

    parser.add_argument(
        '-m',
        '--model',
        choices=['ssa', 'ua', 'me'],
        default='ssa',
        help=
        'alignment scoring method for comparing sequences in embedding space [ssa: soft symmetric alignment, ua: uniform alignment, me: mean embedding] (default: ssa)'
    )
    parser.add_argument('--allow-insert',
                        action='store_true',
                        help='model insertions (default: false)')

    parser.add_argument('--norm',
                        choices=['l1', 'l2'],
                        default='l1',
                        help='comparison norm (default: l1)')

    parser.add_argument('--rnn-type',
                        choices=['lstm', 'gru'],
                        default='lstm',
                        help='type of RNN block to use (default: lstm)')
    parser.add_argument('--embedding-dim',
                        type=int,
                        default=100,
                        help='embedding dimension (default: 100)')
    parser.add_argument('--input-dim',
                        type=int,
                        default=512,
                        help='dimension of input to RNN (default: 512)')
    parser.add_argument('--rnn-dim',
                        type=int,
                        default=512,
                        help='hidden units of RNNs (default: 512)')
    parser.add_argument('--num-layers',
                        type=int,
                        default=3,
                        help='number of RNN layers (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout probability (default: 0)')

    parser.add_argument('--epoch-size',
                        type=int,
                        default=100000,
                        help='number of examples per epoch (default: 100,000)')
    # NOTE: --epoch-scale is still accepted for compatibility, but nothing in
    # this function reads it.
    parser.add_argument('--epoch-scale',
                        type=int,
                        default=5,
                        help='scaling on epoch size (default: 5)')
    parser.add_argument('--num-epochs',
                        type=int,
                        default=100,
                        help='number of epochs (default: 100)')

    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        help='minibatch size (default: 64)')

    parser.add_argument('--weight-decay',
                        type=float,
                        default=0,
                        help='L2 regularization (default: 0)')
    parser.add_argument('--lr', type=float, default=0.001)

    parser.add_argument('--tau',
                        type=float,
                        default=0.5,
                        help='sampling proportion exponent (default: 0.5)')
    parser.add_argument(
        '--augment',
        type=float,
        default=0,
        help=
        'probability of resampling amino acid for data augmentation (default: 0)'
    )
    parser.add_argument('--lm',
                        help='pretrained LM to use as initial embedding')

    parser.add_argument('-o',
                        '--output',
                        help='output file path (default: stdout)')
    parser.add_argument('--save-prefix', help='path prefix for saving models')
    parser.add_argument('-d',
                        '--device',
                        type=int,
                        default=-2,
                        help='compute device to use')

    args = parser.parse_args()

    ## set the device
    # -1 forces CPU; any other value uses CUDA when available, and a
    # non-negative value additionally selects that device index.
    d = args.device
    use_cuda = (d != -1) and torch.cuda.is_available()
    if d >= 0:
        torch.cuda.set_device(d)

    ## make the datasets
    astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.fa'
    astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.test.sampledpairs.txt'
    if args.dev:
        astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.train.fa'
        astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.dev.sampledpairs.txt'

    alphabet = Uniprot21()

    print('# loading training sequences:', astral_train_path, file=sys.stderr)
    with open(astral_train_path, 'rb') as f:
        names_train, structs_train, sequences_train = scop.parse_astral(
            f, encoder=alphabet)
    x_train = [torch.from_numpy(x).long() for x in sequences_train]
    if use_cuda:
        x_train = [x.cuda() for x in x_train]
    y_train = torch.from_numpy(structs_train)

    print('# loaded', len(x_train), 'training sequences', file=sys.stderr)

    print('# loading test sequence pairs:',
          astral_testpairs_path,
          file=sys.stderr)
    test_pairs_table = pd.read_csv(astral_testpairs_path, sep='\t')
    x0_test = [
        x.encode('utf-8').upper() for x in test_pairs_table['sequence_A']
    ]
    x0_test = [torch.from_numpy(alphabet.encode(x)).long() for x in x0_test]
    x1_test = [
        x.encode('utf-8').upper() for x in test_pairs_table['sequence_B']
    ]
    x1_test = [torch.from_numpy(alphabet.encode(x)).long() for x in x1_test]
    if use_cuda:
        x0_test = [x.cuda() for x in x0_test]
        x1_test = [x.cuda() for x in x1_test]
    y_test = test_pairs_table['similarity'].values
    y_test = torch.from_numpy(y_test).long()

    dataset_test = PairedDataset(x0_test, x1_test, y_test)
    print('# loaded', len(x0_test), 'test pairs', file=sys.stderr)

    ## make the dataset iterators
    epoch_size = args.epoch_size
    batch_size = args.batch_size

    # precompute the similarity pairs: the cumulative product along the last
    # axis keeps a 1 at a level only while all earlier levels also match
    y_train_levels = torch.cumprod(
        (y_train.unsqueeze(1) == y_train.unsqueeze(0)).long(), 2)

    # data augmentation by resampling amino acids
    augment = None
    p = 0
    if args.augment > 0:
        p = args.augment
        trans = torch.ones(len(alphabet), len(alphabet))
        trans = trans / trans.sum(1, keepdim=True)
        if use_cuda:
            trans = trans.cuda()
        augment = MultinomialResample(trans, p)
    print('# resampling amino acids with p:', p, file=sys.stderr)
    dataset_train = AllPairsDataset(x_train, y_train_levels, augment=augment)

    similarity = y_train_levels.numpy().sum(2)
    levels, counts = np.unique(similarity, return_counts=True)
    order = np.argsort(levels)
    levels = levels[order]
    counts = counts[order]

    print('#', levels, file=sys.stderr)
    print('#', counts / np.sum(counts), file=sys.stderr)

    # report the level proportions under a few reference exponents
    weight = counts**0.5
    print('#', weight / np.sum(weight), file=sys.stderr)

    weight = counts**0.33
    print('#', weight / np.sum(weight), file=sys.stderr)

    weight = counts**0.25
    print('#', weight / np.sum(weight), file=sys.stderr)

    tau = args.tau
    print('# using tau:', tau, file=sys.stderr)
    print('#', counts**tau / np.sum(counts**tau), file=sys.stderr)
    # per-pair sampling weight: level mass counts**tau spread over its pairs
    weights = counts**tau / counts
    weights = weights[similarity].ravel()
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights, epoch_size)

    # training and test dataset iterators over pairs of sequences
    train_iterator = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        sampler=sampler,
        collate_fn=collate_paired_sequences)
    test_iterator = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=batch_size,
        collate_fn=collate_paired_sequences)

    ## initialize the model
    rnn_type = args.rnn_type
    rnn_dim = args.rnn_dim
    num_layers = args.num_layers

    embedding_size = args.embedding_dim
    input_dim = args.input_dim

    dropout = args.dropout

    allow_insert = args.allow_insert

    print('# initializing model with:', file=sys.stderr)
    print('# embedding_size:', embedding_size, file=sys.stderr)
    print('# input_dim:', input_dim, file=sys.stderr)
    print('# rnn_dim:', rnn_dim, file=sys.stderr)
    print('# num_layers:', num_layers, file=sys.stderr)
    print('# dropout:', dropout, file=sys.stderr)
    print('# allow_insert:', allow_insert, file=sys.stderr)

    compare_type = args.model
    print('# comparison method:', compare_type, file=sys.stderr)

    lm = None
    if args.lm is not None:
        # NOTE: torch.load unpickles the file, so only pass trusted
        # checkpoints via --lm.
        lm = torch.load(args.lm)
        lm.eval()
        ## do not update the LM parameters
        for param in lm.parameters():
            param.requires_grad = False
        print('# using LM:', args.lm, file=sys.stderr)

    if num_layers > 0:
        embedding = bepler.models.embedding.StackedRNN(len(alphabet),
                                                       input_dim,
                                                       rnn_dim,
                                                       embedding_size,
                                                       nlayers=num_layers,
                                                       dropout=dropout,
                                                       lm=lm)
    else:
        embedding = bepler.models.embedding.Linear(len(alphabet),
                                                   input_dim,
                                                   embedding_size,
                                                   lm=lm)

    if args.norm == 'l1':
        norm = bepler.models.comparison.L1()
        print('# norm: l1', file=sys.stderr)
    elif args.norm == 'l2':
        norm = bepler.models.comparison.L2()
        print('# norm: l2', file=sys.stderr)
    model = bepler.models.comparison.OrdinalRegression(
        embedding,
        5,
        align_method=compare_type,
        compare=norm,
        allow_insertions=allow_insert)

    if use_cuda:
        model.cuda()

    ## setup training parameters and optimizer
    num_epochs = args.num_epochs

    weight_decay = args.weight_decay
    lr = args.lr

    print('# training with Adam: lr={}, weight_decay={}'.format(
        lr, weight_decay),
          file=sys.stderr)
    params = [p for p in model.parameters() if p.requires_grad]
    optim = torch.optim.Adam(params, lr=lr, weight_decay=weight_decay)

    ## train the model
    print('# training model', file=sys.stderr)

    save_prefix = args.save_prefix
    output = args.output
    close_output = output is not None
    if close_output:
        output = open(output, 'w')
    else:
        output = sys.stdout

    try:
        # width used to zero-pad epoch numbers; unlike a log10-based count
        # this is also defined when num_epochs is 0
        digits = len(str(num_epochs))
        line = '\t'.join(
            ['epoch', 'split', 'loss', 'mse', 'accuracy', 'r', 'rho'])
        print(line, file=output)

        for epoch in range(num_epochs):
            # train epoch
            model.train()
            n = 0
            loss_estimate = 0
            mse_estimate = 0
            acc_estimate = 0

            for x0, x1, y in train_iterator:

                if use_cuda:
                    y = y.cuda()
                y = Variable(y)

                b = len(x0)
                logits = torch.stack(_score_pairs(model, x0, x1), 0)

                loss = F.binary_cross_entropy_with_logits(logits, y.float())
                loss.backward()

                optim.step()
                optim.zero_grad()
                # projected gradient for bounding ordinal regression parameters
                model.clip()

                # the metrics below are only reported, never backpropagated
                with torch.no_grad():
                    p = _ordinal_probabilities(logits)

                    _, y_hard = torch.max(p, 1)
                    levels = torch.arange(p.size(1),
                                          device=p.device,
                                          dtype=p.dtype)
                    y_hat = torch.sum(p * levels, 1)
                    y = torch.sum(y.data, 1)

                    # NOTE(review): F.cross_entropy receives probabilities,
                    # not logits - kept as is so logged losses stay
                    # comparable; confirm this is intended.
                    loss = F.cross_entropy(p, y)

                    correct = torch.sum((y == y_hard).float())
                    mse = torch.sum((y.float() - y_hat)**2)

                # running means over all examples seen this epoch
                n += b
                delta = b * (loss.item() - loss_estimate)
                loss_estimate += delta / n
                delta = correct.item() - b * acc_estimate
                acc_estimate += delta / n
                delta = mse.item() - b * mse_estimate
                mse_estimate += delta / n

                # refresh the progress line whenever n crosses a multiple of 100
                if (n - b) // 100 < n // 100:
                    print(
                        '# [{}/{}] training {:.1%} loss={:.5f}, mse={:.5f}, acc={:.5f}'
                        .format(epoch + 1, num_epochs, n / epoch_size,
                                loss_estimate, mse_estimate, acc_estimate),
                        end='\r',
                        file=sys.stderr)
            print(' ' * 80, end='\r', file=sys.stderr)
            line = '\t'.join([
                str(epoch + 1).zfill(digits), 'train',
                str(loss_estimate),
                str(mse_estimate),
                str(acc_estimate), '-', '-'
            ])
            print(line, file=output)
            output.flush()

            # eval and save model
            model.eval()

            y = []
            logits = []
            with torch.no_grad():
                for x0, x1, y_mb in test_iterator:

                    if use_cuda:
                        y_mb = y_mb.cuda()
                    y.append(y_mb.long())

                    logits.extend(_score_pairs(model, x0, x1))

                y = torch.cat(y, 0)
                logits = torch.stack(logits, 0)

                p = _ordinal_probabilities(logits)

                loss = F.cross_entropy(p, y).item()

                _, y_hard = torch.max(p, 1)
                levels = torch.arange(p.size(1),
                                      device=p.device,
                                      dtype=p.dtype)
                y_hat = torch.sum(p * levels, 1)

                accuracy = torch.mean((y == y_hard).float()).item()
                mse = torch.mean((y.float() - y_hat)**2).item()

                y = y.cpu().numpy()
                y_hat = y_hat.cpu().numpy()

                r, _ = pearsonr(y_hat, y)
                rho, _ = spearmanr(y_hat, y)

            line = '\t'.join([
                str(epoch + 1).zfill(digits), 'test',
                str(loss),
                str(mse),
                str(accuracy),
                str(r),
                str(rho)
            ])
            print(line, file=output)
            output.flush()

            # save the model
            if save_prefix is not None:
                save_path = save_prefix + '_epoch' + str(epoch +
                                                         1).zfill(digits) + '.sav'
                # save from the CPU, then move back for the next epoch
                model.cpu()
                torch.save(model, save_path)
                if use_cuda:
                    model.cuda()
    finally:
        # close the log file we opened; never close stdout
        if close_output:
            output.close()