Example #1
def load_data():
    alphabet = Uniprot21()

    path = 'data/davis2011kinase/uniprot_sequences_processed.fasta'
    names, seqs = load_2line(path, alphabet)

    datasets = {'dataset0': (seqs, names)}

    return datasets
def allele_encoding(alleles):
    # Load amino acid alphabet
    alphabet = Uniprot21()

    # Encode allele sequences as arrays of amino acid indexes
    allele_list = []
    for allele in alleles:
        allele_list.append(encode_sequence(allele, alphabet))

    return allele_list
def peptide_encoding(peptides):
    # Load amino acid alphabet 
    alphabet = Uniprot21()
    # Convert peptide sequences to list of amino acid strings
    peptides = peptides.values.tolist()

    # Encode peptide sequences as arrays of amino acid indexes
    peptide_list = []
    for peptide in peptides:
        peptide_list.append(encode_sequence(peptide, alphabet))
        
    return peptide_list
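A minimal usage sketch for peptide_encoding (an assumption, not part of the original example): it presumes peptides is a pandas Series of byte strings, as the .values.tolist() call suggests, and that encode_sequence accepts a byte string plus the alphabet; the sample sequences are hypothetical.

import pandas as pd

# hypothetical peptide sequences as byte strings (Uniprot21.encode works on bytes)
peptides = pd.Series([b'ACDEFGHIK', b'LMNPQRSTVWY'])

encoded = peptide_encoding(peptides)
print(encoded[0])  # array of amino acid indices for the first peptide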
Example #4
def embed_sequence(x,
                   lm_embed,
                   lstm_stack,
                   proj,
                   include_lm=True,
                   final_only=False,
                   pool='none',
                   use_cuda=False):

    if len(x) == 0:
        return None

    alphabet = Uniprot21()
    x = x.upper()
    # convert to alphabet index
    x = alphabet.encode(x)
    x = torch.from_numpy(x)
    if use_cuda:
        #x = x.cuda()
        x = x.to(DEVICE)

    # embed the sequence
    with torch.no_grad():
        x = x.long().unsqueeze(0)
        z = embed_stack(x,
                        lm_embed,
                        lstm_stack,
                        proj,
                        include_lm=include_lm,
                        final_only=final_only)
        # pool if needed
        z = z.squeeze(0)
        if pool == 'sum':
            z = z.sum(0)
        elif pool == 'max':
            z, _ = z.max(0)
        elif pool == 'avg':
            z = z.mean(0)
        z = z.cpu().numpy()

    return z
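A hedged usage sketch for embed_sequence, assuming the lm_embed, lstm_stack, and proj modules have already been obtained from a pretrained embedding model (how they are loaded is outside this example); the sequence is hypothetical.

# hypothetical protein sequence as bytes (Uniprot21.encode works on byte strings)
seq = b'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'

# 'avg' pooling returns a single fixed-size vector per sequence
z = embed_sequence(seq, lm_embed, lstm_stack, proj, pool='avg', use_cuda=False)
print(z.shape)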
Example #5
def load_data():
    alphabet = Uniprot21()

    path = 'data/transmembrane/TOPCONS2_datasets/TM.3line'
    x_tm, y_tm = load_3line(path, alphabet)

    path = 'data/transmembrane/TOPCONS2_datasets/SP+TM.3line'
    x_tm_sp, y_tm_sp = load_3line(path, alphabet)

    path = 'data/transmembrane/TOPCONS2_datasets/Globular.3line'
    x_glob, y_glob = load_3line(path, alphabet)

    path = 'data/transmembrane/TOPCONS2_datasets/Globular+SP.3line'
    x_glob_sp, y_glob_sp = load_3line(path, alphabet)

    datasets = {
        'TM': (x_tm, y_tm),
        'SP+TM': (x_tm_sp, y_tm_sp),
        'Globular': (x_glob, y_glob),
        'Globular+SP': (x_glob_sp, y_glob_sp)
    }

    return datasets
def main():
    import argparse
    parser = argparse.ArgumentParser(
        'Script for training embedding model on SCOP.')

    parser.add_argument('--dev',
                        action='store_true',
                        help='use train/dev split')

    parser.add_argument(
        '-m',
        '--model',
        choices=['ssa', 'ua', 'me'],
        default='ssa',
        help=
        'alignment scoring method for comparing sequences in embedding space [ssa: soft symmetric alignment, ua: uniform alignment, me: mean embedding] (default: ssa)'
    )
    parser.add_argument('--allow-insert',
                        action='store_true',
                        help='model insertions (default: false)')

    parser.add_argument('--norm',
                        choices=['l1', 'l2'],
                        default='l1',
                        help='comparison norm (default: l1)')

    parser.add_argument('--rnn-type',
                        choices=['lstm', 'gru'],
                        default='lstm',
                        help='type of RNN block to use (default: lstm)')
    parser.add_argument('--embedding-dim',
                        type=int,
                        default=100,
                        help='embedding dimension (default: 100)')
    parser.add_argument('--input-dim',
                        type=int,
                        default=512,
                        help='dimension of input to RNN (default: 512)')
    parser.add_argument('--rnn-dim',
                        type=int,
                        default=512,
                        help='hidden units of RNNs (default: 512)')
    parser.add_argument('--num-layers',
                        type=int,
                        default=3,
                        help='number of RNN layers (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout probability (default: 0)')

    parser.add_argument('--epoch-size',
                        type=int,
                        default=100000,
                        help='number of examples per epoch (default: 100,000)')
    parser.add_argument('--epoch-scale',
                        type=int,
                        default=5,
                        help='scaling on epoch size (default: 5)')
    parser.add_argument('--num-epochs',
                        type=int,
                        default=100,
                        help='number of epochs (default: 100)')

    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        help='minibatch size (default: 64)')

    parser.add_argument('--weight-decay',
                        type=float,
                        default=0,
                        help='L2 regularization (default: 0)')
    parser.add_argument('--lr', type=float, default=0.001)

    parser.add_argument('--tau',
                        type=float,
                        default=0.5,
                        help='sampling proportion exponent (default: 0.5)')
    parser.add_argument(
        '--augment',
        type=float,
        default=0,
        help=
        'probability of resampling amino acid for data augmentation (default: 0)'
    )
    parser.add_argument('--lm',
                        help='pretrained LM to use as initial embedding')

    parser.add_argument('-o',
                        '--output',
                        help='output file path (default: stdout)')
    parser.add_argument('--save-prefix', help='path prefix for saving models')
    parser.add_argument('-d',
                        '--device',
                        type=int,
                        default=-2,
                        help='compute device to use')

    args = parser.parse_args()

    prefix = args.output

    ## set the device
    d = args.device
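    # device -1 forces CPU, -2 uses the current default CUDA device, and d >= 0 selects GPU d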
    use_cuda = (d != -1) and torch.cuda.is_available()
    if d >= 0:
        torch.cuda.set_device(d)

    ## make the datasets
    astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.fa'
    astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.test.sampledpairs.txt'
    if args.dev:
        astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.train.fa'
        astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.dev.sampledpairs.txt'

    alphabet = Uniprot21()

    print('# loading training sequences:', astral_train_path, file=sys.stderr)
    with open(astral_train_path, 'rb') as f:
        names_train, structs_train, sequences_train = scop.parse_astral(
            f, encoder=alphabet)
    x_train = [torch.from_numpy(x).long() for x in sequences_train]
    if use_cuda:
        x_train = [x.cuda() for x in x_train]
    y_train = torch.from_numpy(structs_train)

    print('# loaded', len(x_train), 'training sequences', file=sys.stderr)

    print('# loading test sequence pairs:',
          astral_testpairs_path,
          file=sys.stderr)
    test_pairs_table = pd.read_csv(astral_testpairs_path, sep='\t')
    x0_test = [
        x.encode('utf-8').upper() for x in test_pairs_table['sequence_A']
    ]
    x0_test = [torch.from_numpy(alphabet.encode(x)).long() for x in x0_test]
    x1_test = [
        x.encode('utf-8').upper() for x in test_pairs_table['sequence_B']
    ]
    x1_test = [torch.from_numpy(alphabet.encode(x)).long() for x in x1_test]
    if use_cuda:
        x0_test = [x.cuda() for x in x0_test]
        x1_test = [x.cuda() for x in x1_test]
    y_test = test_pairs_table['similarity'].values
    y_test = torch.from_numpy(y_test).long()

    dataset_test = PairedDataset(x0_test, x1_test, y_test)
    print('# loaded', len(x0_test), 'test pairs', file=sys.stderr)

    ## make the dataset iterators
    scale = args.epoch_scale

    epoch_size = args.epoch_size
    batch_size = args.batch_size

    # precompute the similarity pairs
    y_train_levels = torch.cumprod(
        (y_train.unsqueeze(1) == y_train.unsqueeze(0)).long(), 2)
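    # y_train_levels[i, j] is a binary vector over the SCOP hierarchy
    # (class, fold, superfamily, family) marking how deep sequences i and j agree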

    # data augmentation by resampling amino acids
    augment = None
    p = 0
    if args.augment > 0:
        p = args.augment
        trans = torch.ones(len(alphabet), len(alphabet))
        trans = trans / trans.sum(1, keepdim=True)
        if use_cuda:
            trans = trans.cuda()
        augment = MultinomialResample(trans, p)
    print('# resampling amino acids with p:', p, file=sys.stderr)
    dataset_train = AllPairsDataset(x_train, y_train_levels, augment=augment)

    similarity = y_train_levels.numpy().sum(2)
    levels, counts = np.unique(similarity, return_counts=True)
    order = np.argsort(levels)
    levels = levels[order]
    counts = counts[order]

    print('#', levels, file=sys.stderr)
    print('#', counts / np.sum(counts), file=sys.stderr)

    weight = counts**0.5
    print('#', weight / np.sum(weight), file=sys.stderr)

    weight = counts**0.33
    print('#', weight / np.sum(weight), file=sys.stderr)

    weight = counts**0.25
    print('#', weight / np.sum(weight), file=sys.stderr)

    tau = args.tau
    print('# using tau:', tau, file=sys.stderr)
    print('#', counts**tau / np.sum(counts**tau), file=sys.stderr)
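    # weight each pair by counts**tau / counts so that sampled similarity levels
    # appear with probability proportional to counts**tau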
    weights = counts**tau / counts
    weights = weights[similarity].ravel()
    #weights = np.ones(len(dataset_train))
    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weights, epoch_size)

    # dataset iterators for sampling pairs of sequences for training and testing
    train_iterator = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        sampler=sampler,
        collate_fn=collate_paired_sequences)
    test_iterator = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=batch_size,
        collate_fn=collate_paired_sequences)

    ## initialize the model
    rnn_type = args.rnn_type
    rnn_dim = args.rnn_dim
    num_layers = args.num_layers

    embedding_size = args.embedding_dim
    input_dim = args.input_dim

    dropout = args.dropout

    allow_insert = args.allow_insert

    print('# initializing model with:', file=sys.stderr)
    print('# embedding_size:', embedding_size, file=sys.stderr)
    print('# input_dim:', input_dim, file=sys.stderr)
    print('# rnn_dim:', rnn_dim, file=sys.stderr)
    print('# num_layers:', num_layers, file=sys.stderr)
    print('# dropout:', dropout, file=sys.stderr)
    print('# allow_insert:', allow_insert, file=sys.stderr)

    compare_type = args.model
    print('# comparison method:', compare_type, file=sys.stderr)

    lm = None
    if args.lm is not None:
        lm = torch.load(args.lm)
        lm.eval()
        ## do not update the LM parameters
        for param in lm.parameters():
            param.requires_grad = False
        print('# using LM:', args.lm, file=sys.stderr)

    if num_layers > 0:
        embedding = src.models.embedding.StackedRNN(len(alphabet),
                                                    input_dim,
                                                    rnn_dim,
                                                    embedding_size,
                                                    nlayers=num_layers,
                                                    dropout=dropout,
                                                    lm=lm)
    else:
        embedding = src.models.embedding.Linear(len(alphabet),
                                                input_dim,
                                                embedding_size,
                                                lm=lm)

    if args.norm == 'l1':
        norm = src.models.comparison.L1()
        print('# norm: l1', file=sys.stderr)
    elif args.norm == 'l2':
        norm = src.models.comparison.L2()
        print('# norm: l2', file=sys.stderr)
    model = src.models.comparison.OrdinalRegression(
        embedding,
        5,
        align_method=compare_type,
        compare=norm,
        allow_insertions=allow_insert)

    if use_cuda:
        model.cuda()

    ## setup training parameters and optimizer
    num_epochs = args.num_epochs

    weight_decay = args.weight_decay
    lr = args.lr

    print('# training with Adam: lr={}, weight_decay={}'.format(
        lr, weight_decay),
          file=sys.stderr)
    params = [p for p in model.parameters() if p.requires_grad]
    optim = torch.optim.Adam(params, lr=lr, weight_decay=weight_decay)

    ## train the model
    print('# training model', file=sys.stderr)

    save_prefix = args.save_prefix
    output = args.output
    if output is None:
        output = sys.stdout
    else:
        output = open(output, 'w')
    digits = int(np.floor(np.log10(num_epochs))) + 1
    line = '\t'.join(['epoch', 'split', 'loss', 'mse', 'accuracy', 'r', 'rho'])
    print(line, file=output)

    for epoch in range(num_epochs):
        # train epoch
        model.train()
        it = 0
        n = 0
        loss_estimate = 0
        mse_estimate = 0
        acc_estimate = 0

        for x0, x1, y in train_iterator:  # zip(train_iterator_0, train_iterator_1):

            if use_cuda:
                y = y.cuda()
            y = Variable(y)

            b = len(x0)
            x = x0 + x1

            x, order = pack_sequences(x)
            x = PackedSequence(Variable(x.data), x.batch_sizes)
            z = model(x)  # embed the sequences
            z = unpack_sequences(z, order)

            z0 = z[:b]
            z1 = z[b:]

            logits = []
            for i in range(b):
                z_a = z0[i]
                z_b = z1[i]
                logits.append(model.score(z_a, z_b))
            logits = torch.stack(logits, 0)

            loss = F.binary_cross_entropy_with_logits(logits, y.float())
            loss.backward()

            optim.step()
            optim.zero_grad()
            model.clip()  # projected gradient to bound the ordinal regression parameters

            p = F.sigmoid(logits)
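            # each sigmoid estimates P(similarity > level); combining P(>= k) and P(<= k)
            # below gives unnormalized probabilities over the 5 similarity levels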
            ones = p.new(b, 1).zero_() + 1
            p_ge = torch.cat([ones, p], 1)
            p_lt = torch.cat([1 - p, ones], 1)
            p = p_ge * p_lt
            p = p / p.sum(1, keepdim=True)  # make sure p is normalized

            _, y_hard = torch.max(p, 1)
            levels = torch.arange(5).to(p.device)
            y_hat = torch.sum(p * levels, 1)
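            # y holds binary indicators for each shared SCOP level; summing gives the integer level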
            y = torch.sum(y.data, 1)

            loss = F.cross_entropy(
                p, y)  # calculate cross entropy loss from p vector

            correct = torch.sum((y == y_hard).float())
            mse = torch.sum((y.float() - y_hat)**2)

            n += b
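            # streaming (running-mean) updates of the epoch's loss, accuracy, and MSE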
            delta = b * (loss.item() - loss_estimate)
            loss_estimate += delta / n
            delta = correct.item() - b * acc_estimate
            acc_estimate += delta / n
            delta = mse.item() - b * mse_estimate
            mse_estimate += delta / n

            if (n - b) // 100 < n // 100:
                print(
                    '# [{}/{}] training {:.1%} loss={:.5f}, mse={:.5f}, acc={:.5f}'
                    .format(epoch + 1, num_epochs, n / epoch_size,
                            loss_estimate, mse_estimate, acc_estimate),
                    end='\r',
                    file=sys.stderr)
        print(' ' * 80, end='\r', file=sys.stderr)
        line = '\t'.join([
            str(epoch + 1).zfill(digits), 'train',
            str(loss_estimate),
            str(mse_estimate),
            str(acc_estimate), '-', '-'
        ])
        print(line, file=output)
        output.flush()

        # eval and save model
        model.eval()

        y = []
        logits = []
        with torch.no_grad():
            for x0, x1, y_mb in test_iterator:

                if use_cuda:
                    y_mb = y_mb.cuda()
                y.append(y_mb.long())

                b = len(x0)
                x = x0 + x1

                x, order = pack_sequences(x)
                x = PackedSequence(Variable(x.data), x.batch_sizes)
                z = model(x)  # embed the sequences
                z = unpack_sequences(z, order)

                z0 = z[:b]
                z1 = z[b:]

                for i in range(b):
                    z_a = z0[i]
                    z_b = z1[i]
                    logits.append(model.score(z_a, z_b))

            y = torch.cat(y, 0)
            logits = torch.stack(logits, 0)

            p = F.sigmoid(logits).data
            ones = p.new(p.size(0), 1).zero_() + 1
            p_ge = torch.cat([ones, p], 1)
            p_lt = torch.cat([1 - p, ones], 1)
            p = p_ge * p_lt
            p = p / p.sum(1, keepdim=True)  # make sure p is normalized

            loss = F.cross_entropy(p, y).item()

            _, y_hard = torch.max(p, 1)
            levels = torch.arange(5).to(p.device)
            y_hat = torch.sum(p * levels, 1)

            accuracy = torch.mean((y == y_hard).float()).item()
            mse = torch.mean((y.float() - y_hat)**2).item()

            y = y.cpu().numpy()
            y_hat = y_hat.cpu().numpy()

            r, _ = pearsonr(y_hat, y)
            rho, _ = spearmanr(y_hat, y)

        line = '\t'.join([
            str(epoch + 1).zfill(digits), 'test',
            str(loss),
            str(mse),
            str(accuracy),
            str(r),
            str(rho)
        ])
        print(line, file=output)
        output.flush()

        # save the model
        if save_prefix is not None:
            save_path = save_prefix + '_epoch' + str(epoch +
                                                     1).zfill(digits) + '.sav'
            model.cpu()
            torch.save(model, save_path)
            if use_cuda:
                model.cuda()
Example #7
def prot2feature(examples, all_name_array, max_seq_length, do_kmer, subset_name_array, has_ppi_emb):

  alphabet = Uniprot21() ## convert string to indexing.
  # >>> alphabet.encode(b'ARNDCQEGHILKMFPSTWYVXOUBZ')
  # array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
  #       17, 18, 19, 20, 11,  4, 20, 20], dtype=uint8)

  # output is batch x max_len x dim when use @alphabet encoder
  # masking will be needed


  if subset_name_array is not None:
    print ('\nmodel runs only on a subset of labels, so 1-hot vectors are built over that GO subset, not all GO terms\n')
    all_name_array = subset_name_array

  label_map = {label : i for i, label in enumerate(all_name_array)}

  features = []

  for (ex_index, example) in tqdm(enumerate(examples)):

    if do_kmer:
      # print ('\nusing kmer with deepgo data will have seq max len 1000\n')
      max_seq_length = MAX_SEQ_LEN_KMER ## override max length for k-mer features
      input_id, input_len = prot2kmer (example.aa_seq)

    else:
      aa_seq = example.aa_seq[ 1:(len(example.aa_seq)-1) ]
      input_id = alphabet.encode(aa_seq.encode('utf-8'))
      input_id = list(input_id.astype(int))
      input_len = len(input_id)

    label = example.label.split(";") ## split 0016021;0031224;0044425
    label_id = np.zeros( len(label_map) )
    where_one = np.array ( [ label_map[g] for g in label if g in label_map ] )
    # if len(where_one) > 0:
    label_id [ where_one ] = 1 # 1-hot

    input_mask = [1] * input_len ## masking for batch mode

    ## !! careful: 0 in @alphabet is a real amino acid index, not padding, so the mask is needed later
    padding = [0] * (max_seq_length - input_len)  # pad zero until max len
    input_id = input_id + padding
    input_mask = input_mask + padding

    if do_kmer:
      input_len = MAX_SEQ_LEN_KMER ## following DeepGO, inputs are flattened, so lengths must be fixed

    if ex_index < 3:
      print ('\nsee sample {}'.format(ex_index))
      print ('sequence {}'.format(example.aa_seq))
      print ('input index {}'.format(input_id))
      print ('label index {}'.format(label_id))

    if has_ppi_emb:
      input_emb = [float(j) for j in example.prot_emb.split(";")] ## stored as a string, so convert to floats; converted to a tensor later
    else:
      input_emb = None

    features.append(InputFeatures(label_id=label_id,
                                  input_id=input_id, ## amino acid indices
                                  input_mask=input_mask,
                                  input_len=input_len,
                                  input_name=example.prot_name,
                                  input_emb=input_emb
                                  ) )

  return features
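A minimal sketch (an assumption about downstream use, not part of the original example) of collecting the returned InputFeatures into padded tensors for a minibatch; the field names match the constructor call above and the helper name features_to_tensors is hypothetical.

import numpy as np
import torch

def features_to_tensors(batch):
  # input_id and input_mask are already padded to max_seq_length above
  input_ids = torch.tensor([f.input_id for f in batch], dtype=torch.long)
  input_mask = torch.tensor([f.input_mask for f in batch], dtype=torch.long)
  labels = torch.from_numpy(np.stack([f.label_id for f in batch])).float()
  return input_ids, input_mask, labels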
Example #8
def main():
    import argparse
    parser = argparse.ArgumentParser('Script for training contact prediction model')

    parser.add_argument('--dev', action='store_true', help='use train/dev split')

    parser.add_argument('--rnn-type', choices=['lstm', 'gru'], default='lstm', help='type of RNN block to use (default: lstm)')
    parser.add_argument('--embedding-dim', type=int, default=100, help='embedding dimension (default: 100)')
    parser.add_argument('--input-dim', type=int, default=512, help='dimension of input to RNN (default: 512)')
    parser.add_argument('--rnn-dim', type=int, default=512, help='hidden units of RNNs (default: 512)')
    parser.add_argument('--num-layers', type=int, default=3, help='number of RNN layers (default: 3)')
    parser.add_argument('--dropout', type=float, default=0, help='dropout probability (default: 0)')


    parser.add_argument('--hidden-dim', type=int, default=50, help='number of hidden units for comparison layer in contact prediction (default: 50)')
    parser.add_argument('--width', type=int, default=7, help='width of convolutional filter for contact prediction (default: 7)')


    parser.add_argument('--epoch-size', type=int, default=100000, help='number of examples per epoch (default: 100,000)')
    parser.add_argument('--epoch-scale', type=int, default=5, help='report heldout performance every this many epochs (default: 5)')
    parser.add_argument('--num-epochs', type=int, default=100, help='number of epochs (default: 100)')


    parser.add_argument('--similarity-batch-size', type=int, default=64, help='minibatch size for similarity prediction loss in pairs (default: 64)')
    parser.add_argument('--contact-batch-size', type=int, default=10, help='minibatch size for contact prediction loss (default: 10)')


    parser.add_argument('--weight-decay', type=float, default=0, help='L2 regularization (default: 0)')
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--lambda', dest='lambda_', type=float, default=0.5, help='weight on the similarity objective, contact map objective weight is one minus this (default: 0.5)')

    parser.add_argument('--tau', type=float, default=0.5, help='sampling proportion exponent (default: 0.5)')
    parser.add_argument('--augment', type=float, default=0, help='probability of resampling amino acid for data augmentation (default: 0)')

    parser.add_argument('--lm', help='pretrained LM to use as initial embedding')

    parser.add_argument('-o', '--output', help='output file path (default: stdout)')
    parser.add_argument('--save-prefix', help='path prefix for saving models')
    parser.add_argument('-d', '--device', type=int, default=-2, help='compute device to use')

    args = parser.parse_args()


    prefix = args.output


    ## set the device
    d = args.device
    use_cuda = (d != -1) and torch.cuda.is_available()
    if d >= 0:
        torch.cuda.set_device(d)

    ## make the datasets
    alphabet = Uniprot21()

    astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.fa'
    astral_test_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.test.fa'
    astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.test.sampledpairs.txt'
    if args.dev:
        astral_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.train.fa'
        astral_test_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.dev.fa'
        astral_testpairs_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.dev.sampledpairs.txt'


    print('# loading training sequences:', astral_train_path, file=sys.stderr)
    x_train, structs_train, contacts_train = load_data(astral_train_path, alphabet)
    if use_cuda:
        x_train = [x.cuda() for x in x_train]
        #contacts_train = [c.cuda() for c in contacts_train]
    print('# loaded', len(x_train), 'training sequences', file=sys.stderr)

    print('# loading test sequences:', astral_test_path, file=sys.stderr)
    x_test, _, contacts_test = load_data(astral_test_path, alphabet)
    if use_cuda:
        x_test = [x.cuda() for x in x_test]
        #contacts_test = [c.cuda() for c in contacts_test]
    print('# loaded', len(x_test), 'contact map test sequences', file=sys.stderr)

    x0_test, x1_test, y_scop_test = load_scop_testpairs(astral_testpairs_path, alphabet)
    if use_cuda:
        x0_test = [x.cuda() for x in x0_test]
        x1_test = [x.cuda() for x in x1_test]
    print('# loaded', len(x0_test), 'scop test pairs', file=sys.stderr)

    ## make the dataset iterators

    # data augmentation by resampling amino acids
    augment = None
    p = 0
    if args.augment > 0:
        p = args.augment
        trans = torch.ones(len(alphabet),len(alphabet))
        trans = trans/trans.sum(1, keepdim=True)
        if use_cuda:
            trans = trans.cuda()
        augment = MultinomialResample(trans, p)
    print('# resampling amino acids with p:', p, file=sys.stderr)

    # SCOP structural similarity datasets
    scop_levels = torch.cumprod((structs_train.unsqueeze(1) == structs_train.unsqueeze(0)).long(), 2)
    scop_train = AllPairsDataset(x_train, scop_levels, augment=augment)
    scop_test = PairedDataset(x0_test, x1_test, y_scop_test)

    # contact map datasets
    cmap_train = ContactMapDataset(x_train, contacts_train, augment=augment)
    cmap_test = ContactMapDataset(x_test, contacts_test)

    # iterators for contacts data
    batch_size = args.contact_batch_size
    cmap_train_iterator = torch.utils.data.DataLoader(cmap_train
                                                    , batch_size=batch_size
                                                    , shuffle=True
                                                    , collate_fn=collate_lists
                                                    )
    cmap_test_iterator = torch.utils.data.DataLoader(cmap_test
                                                   , batch_size=batch_size
                                                   , collate_fn=collate_lists
                                                   )

    # make the SCOP training iterator have same number of minibatches
    num_steps = len(cmap_train_iterator)
    batch_size = args.similarity_batch_size
    epoch_size = num_steps*batch_size

    similarity = scop_levels.numpy().sum(2)
    levels,counts = np.unique(similarity, return_counts=True)
    order = np.argsort(levels)
    levels = levels[order]
    counts = counts[order]

    tau = args.tau
    print('# using tau:', tau, file=sys.stderr)
    print('#', counts**tau/np.sum(counts**tau), file=sys.stderr)
    weights = counts**tau/counts
    weights = weights[similarity].ravel()
    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, epoch_size)
    N = epoch_size

    # iterators for similarity data
    scop_train_iterator = torch.utils.data.DataLoader(scop_train
                                                    , batch_size=batch_size
                                                    , sampler=sampler
                                                    , collate_fn=collate_paired_sequences
                                                    )
    scop_test_iterator = torch.utils.data.DataLoader(scop_test
                                                   , batch_size=batch_size
                                                   , collate_fn=collate_paired_sequences
                                                   )

    report_steps = args.epoch_scale


    ## initialize the model 
    rnn_type = args.rnn_type
    rnn_dim = args.rnn_dim
    num_layers = args.num_layers

    embedding_size = args.embedding_dim
    input_dim = args.input_dim
    dropout = args.dropout
    
    print('# initializing embedding model with:', file=sys.stderr)
    print('# embedding_size:', embedding_size, file=sys.stderr)
    print('# input_dim:', input_dim, file=sys.stderr)
    print('# rnn_dim:', rnn_dim, file=sys.stderr)
    print('# num_layers:', num_layers, file=sys.stderr)
    print('# dropout:', dropout, file=sys.stderr)

    lm = None
    if args.lm is not None:
        print('# using pretrained LM:', args.lm, file=sys.stderr)
        lm = torch.load(args.lm)
        lm.eval()
        ## do not update the LM parameters
        for param in lm.parameters():
            param.requires_grad = False

    embedding = src.models.embedding.StackedRNN(len(alphabet), input_dim, rnn_dim
                                               , embedding_size, nlayers=num_layers
                                               , dropout=dropout, lm=lm)

    # similarity prediction parameters
    similarity_kwargs = {}

    # contact map prediction parameters
    hidden_dim = args.hidden_dim
    width = args.width
    cmap_kwargs = {'hidden_dim': hidden_dim, 'width': width}
    
    model = src.models.multitask.SCOPCM(embedding, similarity_kwargs=similarity_kwargs,
                                        cmap_kwargs=cmap_kwargs)
    if use_cuda:
        model.cuda()

    ## setup training parameters and optimizer
    num_epochs = args.num_epochs

    weight_decay = args.weight_decay
    lr = args.lr

    print('# training with Adam: lr={}, weight_decay={}'.format(lr, weight_decay), file=sys.stderr)
    params = [p for p in model.parameters() if p.requires_grad]
    optim = torch.optim.Adam(params, lr=lr, weight_decay=weight_decay)

    scop_weight = args.lambda_
    cmap_weight = 1 - scop_weight

    print('# weighting tasks with SIMILARITY: {:.3f}, CONTACTS: {:.3f}'.format(scop_weight, cmap_weight), file=sys.stderr)

    ## train the model
    print('# training model', file=sys.stderr)

    save_prefix = args.save_prefix
    output = args.output
    if output is None:
        output = sys.stdout
    else:
        output = open(output, 'w')
    digits = int(np.floor(np.log10(num_epochs))) + 1
    tokens = ['sim_loss', 'sim_mse', 'sim_acc', 'sim_r', 'sim_rho'
             ,'cmap_loss', 'cmap_pr', 'cmap_re', 'cmap_f1', 'cmap_aupr']
    line = '\t'.join(['epoch', 'split'] + tokens)
    print(line, file=output)

    prog_template = '# [{}/{}] training {:.1%} sim_loss={:.5f}, sim_acc={:.5f}, cmap_loss={:.5f}, cmap_f1={:.5f}'

    for epoch in range(num_epochs):
        # train epoch
        model.train()

        scop_n = 0
        scop_loss_accum = 0
        scop_mse_accum = 0
        scop_acc_accum = 0

        cmap_n = 0
        cmap_loss_accum = 0
        cmap_pp = 0
        cmap_pr_accum = 0
        cmap_gp = 0
        cmap_re_accum = 0

        for (cmap_x, cmap_y), (scop_x0, scop_x1, scop_y) in zip(cmap_train_iterator, scop_train_iterator):

            # calculate gradients and metrics for similarity part
            loss, correct, mse, b = similarity_grad(model, scop_x0, scop_x1, scop_y, use_cuda, weight=scop_weight)

            scop_n += b
            delta = b*(loss - scop_loss_accum)
            scop_loss_accum += delta/scop_n
            delta = correct - b*scop_acc_accum
            scop_acc_accum += delta/scop_n
            delta = b*(mse - scop_mse_accum)
            scop_mse_accum += delta/scop_n
            
            report = ((scop_n - b)//100 < scop_n//100)
           
            # calculate the contact map prediction gradients and metrics
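            # tp = true positives, gp_ = ground-truth positives, pp_ = predicted positives;
            # these maintain running recall (tp / gp) and precision (tp / pp) estimates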
            loss, tp, gp_, pp_, b = contacts_grad(model, cmap_x, cmap_y, use_cuda, weight=cmap_weight)

            cmap_gp += gp_
            delta = tp - gp_*cmap_re_accum
            cmap_re_accum += delta/cmap_gp

            cmap_pp += pp_
            delta = tp - pp_*cmap_pr_accum
            cmap_pr_accum += delta/cmap_pp

            cmap_n += b
            delta = b*(loss - cmap_loss_accum)
            cmap_loss_accum += delta/cmap_n

            ## update the parameters
            optim.step()
            optim.zero_grad()
            model.clip()

            if report:
                f1 = 2*cmap_pr_accum*cmap_re_accum/(cmap_pr_accum + cmap_re_accum)
                line = prog_template.format(epoch+1, num_epochs, scop_n/N, scop_loss_accum
                                           , scop_acc_accum, cmap_loss_accum, f1)
                print(line, end='\r', file=sys.stderr)
        print(' '*80, end='\r', file=sys.stderr)
        f1 = 2*cmap_pr_accum*cmap_re_accum/(cmap_pr_accum + cmap_re_accum)
        tokens = [ scop_loss_accum, scop_mse_accum, scop_acc_accum, '-', '-'
                 , cmap_loss_accum, cmap_pr_accum, cmap_re_accum, f1, '-']
        tokens = [x if type(x) is str else '{:.5f}'.format(x) for x in tokens]

        line = '\t'.join([str(epoch+1).zfill(digits), 'train'] + tokens)
        print(line, file=output)
        output.flush()

        # eval and save model
        if (epoch+1) % report_steps == 0:
            model.eval()
            with torch.no_grad():
                scop_loss, scop_acc, scop_mse, scop_r, scop_rho = \
                        eval_similarity(model, scop_test_iterator, use_cuda)
                cmap_loss, cmap_pr, cmap_re, cmap_f1, cmap_aupr = \
                        eval_contacts(model, cmap_test_iterator, use_cuda)

            tokens = [ scop_loss, scop_mse, scop_acc, scop_r, scop_rho
                     , cmap_loss, cmap_pr, cmap_re, cmap_f1, cmap_aupr]
            tokens = ['{:.5f}'.format(x) for x in tokens]

            line = '\t'.join([str(epoch+1).zfill(digits), 'test'] + tokens)
            print(line, file=output)
            output.flush()

            # save the model
            if save_prefix is not None:
                save_path = save_prefix + '_epoch' + str(epoch+1).zfill(digits) + '.sav'
                model.cpu()
                torch.save(model, save_path)
                if use_cuda:
                    model.cuda()
Example #9
def main():
    args = parser.parse_args()

    alph = Uniprot21()
    ntokens = len(alph)

    ## load the training sequences
    train_group, X_train = load_pfam(pfam_train, alph)
    print('# loaded',
          len(X_train),
          'sequences from',
          pfam_train,
          file=sys.stderr)

    ## load the testing sequences
    test_group, X_test = load_pfam(pfam_test, alph)
    print('# loaded',
          len(X_test),
          'sequences from',
          pfam_test,
          file=sys.stderr)

    ## initialize the model
    nin = ntokens + 1
    nout = ntokens
    embedding_dim = 21
    hidden_dim = args.hidden_dim
    num_layers = args.num_layers
    mask_idx = ntokens
    dropout = args.dropout

    tied = not args.untied

    model = src.models.sequence.BiLM(nin,
                                     nout,
                                     embedding_dim,
                                     hidden_dim,
                                     num_layers,
                                     mask_idx=mask_idx,
                                     dropout=dropout,
                                     tied=tied)
    print('# initialized model', file=sys.stderr)

    device = args.device
    use_cuda = torch.cuda.is_available() and (device == -2 or device >= 0)
    if device >= 0:
        torch.cuda.set_device(device)
    if use_cuda:
        model = model.cuda()

    ## form the data iterators and optimizer
    lr = args.lr
    l2 = args.l2
    solver = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2)

    def collate(xs):
        B = len(xs)
        N = max(len(x) for x in xs)
        lengths = np.array([len(x) for x in xs], dtype=int)

        order = np.argsort(lengths)[::-1]
        lengths = lengths[order]

        X = torch.LongTensor(B, N).zero_() + mask_idx
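        # initialize with mask_idx so padded positions are excluded by the loss mask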
        for i in range(B):
            x = xs[order[i]]
            n = len(x)
            X[i, :n] = torch.from_numpy(x)
        return X, lengths

    mb = args.minibatch_size

    train_iterator = torch.utils.data.DataLoader(X_train,
                                                 batch_size=mb,
                                                 shuffle=True,
                                                 collate_fn=collate)
    test_iterator = torch.utils.data.DataLoader(X_test,
                                                batch_size=mb,
                                                collate_fn=collate)

    ## fit the model!

    print('# training model', file=sys.stderr)

    output = sys.stdout
    if args.output is not None:
        output = open(args.output, 'w')

    num_epochs = args.num_epochs
    clip = args.clip

    save_prefix = args.save_prefix
    digits = int(np.floor(np.log10(num_epochs))) + 1

    print('epoch\tsplit\tlog_p\tperplexity\taccuracy', file=output)
    output.flush()

    for epoch in range(num_epochs):
        # train epoch
        model.train()
        it = 0
        n = 0
        accuracy = 0
        loss_accum = 0
        for X, lengths in train_iterator:
            if use_cuda:
                X = X.cuda()
            X = Variable(X)
            logp = model(X)

            mask = (X != mask_idx)
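            # mask marks real (non-padding) positions; the loss below gathers the
            # log-probability of the true residue and averages over those positions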

            index = X * mask.long()
            loss = -logp.gather(2, index.unsqueeze(2)).squeeze(2)
            loss = torch.mean(loss.masked_select(mask))

            loss.backward()

            # clip the gradient
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            solver.step()
            solver.zero_grad()

            _, y_hat = torch.max(logp, 2)
            correct = torch.sum((y_hat == X).masked_select(mask))
            #correct = torch.sum((y_hat == X)[mask.nonzero()].float())

            b = mask.long().sum().item()
            n += b
            delta = b * (loss.item() - loss_accum)
            loss_accum += delta / n
            delta = correct.item() - b * accuracy
            accuracy += delta / n

            b = X.size(0)
            it += b
            if (it - b) // 100 < it // 100:
                print(
                    '# [{}/{}] training {:.1%} loss={:.5f}, acc={:.5f}'.format(
                        epoch + 1, num_epochs, it / len(X_train), loss_accum,
                        accuracy),
                    end='\r',
                    file=sys.stderr)
        print(' ' * 80, end='\r', file=sys.stderr)

        perplex = np.exp(loss_accum)
        string = str(epoch+1).zfill(digits) + '\t' + 'train' + '\t' + str(loss_accum) \
                 + '\t' + str(perplex) + '\t' + str(accuracy)
        print(string, file=output)
        output.flush()

        # test epoch
        model.eval()
        it = 0
        n = 0
        accuracy = 0
        loss_accum = 0
        with torch.no_grad():
            for X, lengths in test_iterator:
                if use_cuda:
                    X = X.cuda()
                X = Variable(X)
                logp = model(X)

                mask = (X != mask_idx)

                index = X * mask.long()
                loss = -logp.gather(2, index.unsqueeze(2)).squeeze(2)
                loss = torch.mean(loss.masked_select(mask))

                _, y_hat = torch.max(logp, 2)
                correct = torch.sum((y_hat == X).masked_select(mask))

                b = mask.long().sum().item()
                n += b
                delta = b * (loss.item() - loss_accum)
                loss_accum += delta / n
                delta = correct.item() - b * accuracy
                accuracy += delta / n

                b = X.size(0)
                it += b
                if (it - b) // 100 < it // 100:
                    print(
                        '# [{}/{}] test {:.1%} loss={:.5f}, acc={:.5f}'.format(
                            epoch + 1, num_epochs, it / len(X_test),
                            loss_accum, accuracy),
                        end='\r',
                        file=sys.stderr)
        print(' ' * 80, end='\r', file=sys.stderr)

        perplex = np.exp(loss_accum)
        string = str(epoch+1).zfill(digits) + '\t' + 'test' + '\t' + str(loss_accum) \
                 + '\t' + str(perplex) + '\t' + str(accuracy)
        print(string, file=output)
        output.flush()

        ## save the model
        if save_prefix is not None:
            save_path = save_prefix + '_epoch' + str(epoch +
                                                     1).zfill(digits) + '.sav'
            model = model.cpu()
            torch.save(model, save_path)
            if use_cuda:
                model = model.cuda()
Example #10
def main():
    import argparse
    parser = argparse.ArgumentParser(
        'Script for evaluating contact map models.')
    parser.add_argument('model', help='path to saved model')
    parser.add_argument('--dataset',
                        default='2.06 test',
                        help='which dataset (default: 2.06 test)')
    parser.add_argument(
        '--batch-size',
        default=10,
        type=int,
        help='number of sequences to process in each batch (default: 10)')
    parser.add_argument('-o',
                        '--output',
                        help='output file path (default: stdout)')
    parser.add_argument('-d',
                        '--device',
                        type=int,
                        default=-2,
                        help='compute device to use')
    args = parser.parse_args()

    # load the data
    if args.dataset == '2.06 test':
        fasta_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.test.fa'
        contact_paths = glob.glob('data/SCOPe/pdbstyle-2.06/*/*.png')
    elif args.dataset == '2.07 test' or args.dataset == '2.07 new test':
        fasta_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.07-new.fa'
        contact_paths = glob.glob('data/SCOPe/pdbstyle-2.07/*/*.png')
    else:
        raise Exception('Bad dataset argument ' + args.dataset)

    alphabet = Uniprot21()
    x, y, names = load_data(fasta_path, contact_paths, alphabet)

    ## set the device
    d = args.device
    use_cuda = (d != -1) and torch.cuda.is_available()
    if d >= 0:
        torch.cuda.set_device(d)

    if use_cuda:
        x = [x_.cuda() for x_ in x]
        y = [y_.cuda() for y_ in y]

    model = torch.load(args.model)
    model.eval()
    if use_cuda:
        model.cuda()

    # predict contact maps
    batch_size = args.batch_size
    dataset = ContactMapDataset(x, y)
    iterator = torch.utils.data.DataLoader(dataset,
                                           batch_size=batch_size,
                                           collate_fn=collate_lists)
    logits = []
    with torch.no_grad():
        for xmb, ymb in iterator:
            lmb = predict_minibatch(model, xmb, use_cuda)
            logits += lmb

    # calculate performance metrics
    lengths = np.array([len(x_) for x_ in x])
    logits = [logit.cpu().numpy() for logit in logits]
    y = [y_.cpu().numpy() for y_ in y]

    output = args.output
    if output is None:
        output = sys.stdout
    else:
        output = open(output, 'w')
    line = '\t'.join([
        'Distance', 'Precision', 'Recall', 'F1', 'AUPR', 'Precision@L',
        'Precision@L/2', 'Precision@L/5'
    ])
    print(line, file=output)
    output.flush()

    # for all contacts
    y_flat = []
    logits_flat = []
    for i in range(len(y)):
        yi = y[i]
        mask = (yi < 0)
        y_flat.append(yi[~mask])
        logits_flat.append(logits[i][~mask])

    # calculate precision, recall, F1, and area under the precision recall curve for all contacts
    precision = np.zeros(len(x))
    recall = np.zeros(len(x))
    F1 = np.zeros(len(x))
    AUPR = np.zeros(len(x))
    prL = np.zeros(len(x))
    prL2 = np.zeros(len(x))
    prL5 = np.zeros(len(x))
    for i in range(len(x)):
        pr, re, f1, aupr = calc_metrics(logits_flat[i], y_flat[i])
        precision[i] = pr
        recall[i] = re
        F1[i] = f1
        AUPR[i] = aupr

        order = np.argsort(logits_flat[i])[::-1]
        n = lengths[i]
        topL = order[:n]
        prL[i] = y_flat[i][topL].mean()
        topL2 = order[:n // 2]
        prL2[i] = y_flat[i][topL2].mean()
        topL5 = order[:n // 5]
        prL5[i] = y_flat[i][topL5].mean()

    template = 'All\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}'
    line = template.format(precision.mean(), recall.mean(), F1.mean(),
                           AUPR.mean(), prL.mean(), prL2.mean(), prL5.mean())
    print(line, file=output)
    output.flush()

    # for Medium/Long range contacts
    y_flat = []
    logits_flat = []
    for i in range(len(y)):
        yi = y[i]
        mask = (yi < 0)

        medlong = np.tril_indices(len(yi), k=11)
        medlong_mask = np.zeros((len(yi), len(yi)), dtype=np.uint8)
        medlong_mask[medlong] = 1
        mask = mask | (medlong_mask == 1)

        y_flat.append(yi[~mask])
        logits_flat.append(logits[i][~mask])

    # calculate precision, recall, F1, and area under the precision recall curve for all contacts
    precision = np.zeros(len(x))
    recall = np.zeros(len(x))
    F1 = np.zeros(len(x))
    AUPR = np.zeros(len(x))
    prL = np.zeros(len(x))
    prL2 = np.zeros(len(x))
    prL5 = np.zeros(len(x))
    for i in range(len(x)):
        pr, re, f1, aupr = calc_metrics(logits_flat[i], y_flat[i])
        precision[i] = pr
        recall[i] = re
        F1[i] = f1
        AUPR[i] = aupr

        order = np.argsort(logits_flat[i])[::-1]
        n = lengths[i]
        topL = order[:n]
        prL[i] = y_flat[i][topL].mean()
        topL2 = order[:n // 2]
        prL2[i] = y_flat[i][topL2].mean()
        topL5 = order[:n // 5]
        prL5[i] = y_flat[i][topL5].mean()

    template = 'Medium/Long\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}'
    line = template.format(np.nanmean(precision), np.nanmean(recall),
                           np.nanmean(F1), np.nanmean(AUPR), np.nanmean(prL),
                           np.nanmean(prL2), np.nanmean(prL5))
    print(line, file=output)
    output.flush()
def main():
    import argparse
    parser = argparse.ArgumentParser(
        'Script for evaluating similarity model on SCOP test set.')

    parser.add_argument(
        'features',
        help=
        'path to saved embedding model file or "1-", "3-", or "5-mer" for k-mer features'
    )

    parser.add_argument('--num-epochs',
                        type=int,
                        default=10,
                        help='number of epochs to train for (default: 10)')
    parser.add_argument('--all-hidden',
                        action='store_true',
                        help='use all hidden layers as features')

    parser.add_argument('-v',
                        '--print-examples',
                        default=0,
                        type=int,
                        help='number of examples to print (default: 0)')

    parser.add_argument('-o',
                        '--output',
                        help='output file path (default: stdout)')
    parser.add_argument('--save-prefix', help='path prefix for saving models')
    parser.add_argument('-d',
                        '--device',
                        type=int,
                        default=-2,
                        help='compute device to use')

    args = parser.parse_args()
    num_epochs = args.num_epochs

    ## load the data
    alphabet = Uniprot21()
    secstr = SecStr8

    names_train, x_train, y_train = load_secstr(secstr_train_path, alphabet,
                                                secstr)
    names_test, x_test, y_test = load_secstr(secstr_test_path, alphabet,
                                             secstr)

    sequences_test = [
        ''.join(alphabet[c] for c in x_test[i]) for i in range(len(x_test))
    ]

    y_train = np.concatenate(y_train, 0)

    ## set the device
    d = args.device
    use_cuda = (d != -1) and torch.cuda.is_available()
    if d >= 0:
        torch.cuda.set_device(d)

    if args.features == '1-mer':
        n = len(alphabet)
        x_test = [x.astype(int) for x in x_test]
    elif args.features == '3-mer':
        x_train, n = kmer_features(x_train, len(alphabet), 3)
        x_test, _ = kmer_features(x_test, len(alphabet), 3)
    elif args.features == '5-mer':
        x_train, n = kmer_features(x_train, len(alphabet), 5)
        x_test, _ = kmer_features(x_test, len(alphabet), 5)
    else:
        features = torch.load(args.features)
        features.eval()

        if use_cuda:
            features.cuda()

        features = TorchModel(features,
                              use_cuda,
                              full_features=args.all_hidden)
        batch_size = 32  # batch size for featurizing sequences

        with torch.no_grad():
            z_train = []
            for i in range(0, len(x_train), batch_size):
                for z in features(x_train[i:i + batch_size]):
                    z_train.append(z.cpu().numpy())
            x_train = z_train

            z_test = []
            for i in range(0, len(x_test), batch_size):
                for z in features(x_test[i:i + batch_size]):
                    z_test.append(z.cpu().numpy())
            x_test = z_test

        n = x_train[0].shape[1]
        del features
        del z_train
        del z_test

    print('split', 'epoch', 'loss', 'perplexity', 'accuracy')

    if args.features.endswith('-mer'):
        x_train = np.concatenate(x_train, 0)
        model = fit_kmer_potentials(x_train, y_train, n, len(secstr))
    else:
        x_train = torch.cat([torch.from_numpy(x) for x in x_train], 0)
        if use_cuda and not args.all_hidden:
            x_train = x_train.cuda()

        num_hidden = 1024
        model = nn.Sequential(nn.Linear(n, num_hidden), nn.ReLU(),
                              nn.Linear(num_hidden, num_hidden), nn.ReLU(),
                              nn.Linear(num_hidden, len(secstr)))

        y_train = torch.from_numpy(y_train).long()
        if use_cuda:
            y_train = y_train.cuda()
            model.cuda()

        fit_nn_potentials(model,
                          x_train,
                          y_train,
                          num_epochs=num_epochs,
                          use_cuda=use_cuda)

    if use_cuda:
        model.cuda()
    model.eval()

    num_examples = args.print_examples
    if num_examples > 0:
        names_examples = names_test[:num_examples]
        x_examples = x_test[:num_examples]
        y_examples = y_test[:num_examples]

    A = np.zeros((8, 3), dtype=np.float32)
    I = np.zeros(8, dtype=int)
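    # A maps the 8 DSSP secondary structure states onto 3 coarse classes
    # (helix, sheet, coil); I gives the 3-class label for each 8-class label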
    # helix
    A[0, 0] = 1.0
    A[3, 0] = 1.0
    A[4, 0] = 1.0
    I[0] = 0
    I[3] = 0
    I[4] = 0
    # sheet
    A[1, 1] = 1.0
    A[2, 1] = 1.0
    I[1] = 1
    I[2] = 1
    # coil
    A[5, 2] = 1.0
    A[6, 2] = 1.0
    A[7, 2] = 1.0
    I[5] = 2
    I[6] = 2
    I[7] = 2

    A = torch.from_numpy(A)
    I = torch.from_numpy(I)
    if use_cuda:
        A = A.cuda()
        I = I.cuda()

    n = 0
    acc_8 = 0
    acc_3 = 0
    loss_8 = 0
    loss_3 = 0

    x_test = torch.cat([torch.from_numpy(x) for x in x_test], 0)
    y_test = torch.cat([torch.from_numpy(y).long() for y in y_test], 0)

    if use_cuda and not args.all_hidden:
        x_test = x_test.cuda()
        y_test = y_test.cuda()

    mb = 256
    with torch.no_grad():
        for i in range(0, len(x_test), mb):
            x = x_test[i:i + mb]
            y = y_test[i:i + mb]

            if use_cuda:
                x = x.cuda()
                y = y.cuda()

            potentials = model(x).view(x.size(0), -1)

            ## 8-class SS
            l = F.cross_entropy(potentials, y).item()
            _, y_hat = potentials.max(1)
            correct = torch.sum((y == y_hat).float()).item()

            n += x.size(0)
            delta = x.size(0) * (l - loss_8)
            loss_8 += delta / n
            delta = correct - x.size(0) * acc_8
            acc_8 += delta / n

            ## 3-class SS
            y = I[y]
            p = F.softmax(potentials, 1)
            p = torch.mm(p, A)  # ss3 probabilities
            log_p = torch.log(p)
            l = F.nll_loss(log_p, y).item()
            _, y_hat = log_p.max(1)
            correct = torch.sum((y == y_hat).float()).item()

            delta = x.size(0) * (l - loss_3)
            loss_3 += delta / n
            delta = correct - x.size(0) * acc_3
            acc_3 += delta / n

    print('-', '-', '8-class', '-', '3-class', '-')
    print('split', 'perplexity', 'accuracy', 'perplexity', 'accuracy')
    print('test', np.exp(loss_8), acc_8, np.exp(loss_3), acc_3)

    if num_examples > 0:
        for i in range(num_examples):
            name = names_examples[i].decode('utf-8')
            x = x_examples[i]
            y = y_examples[i]

            seq = sequences_test[i]

            print('>' + name + ' sequence')
            print(seq)
            print('')

            ss = ''.join(secstr[c] for c in y)
            ss = ss.replace(' ', 'C')
            print('>' + name + ' secstr')
            print(ss)
            print('')

            x = torch.from_numpy(x)
            if use_cuda:
                x = x.cuda()
            potentials = model(x)
            _, y_hat = torch.max(potentials, 1)
            y_hat = y_hat.cpu().numpy()

            ss_hat = ''.join(secstr[c] for c in y_hat)
            ss_hat = ss_hat.replace(' ', 'C')
            print('>' + name + ' predicted')
            print(ss_hat)
            print('')
from src.alphabets import Uniprot21
import src.scop as scop
from src.utils import pack_sequences, unpack_sequences
from src.utils import PairedDataset, AllPairsDataset, collate_paired_sequences
from src.utils import MultinomialResample
import src.models.embedding
import src.models.comparison

model = torch.load(
    "/local/datdb/ProteinEmbMethodGithub/protein-sequence-embedding-iclr2019/pretrained_models/me_L1_100d_lstm3x512_lm_i512_mb64_tau0.5_p0.05_epoch100.sav"
)

model.eval()
model.cuda()

alphabet = Uniprot21()  ##!! convert string to indexing.


def submitJobs(onto):

    os.chdir("/local/datdb/UniprotJan2020")
    ## create an array in the exact order as file
    for onto in [onto]:  # ['cc','bp','mf']:
        fin = open("uniprot-" + onto + ".tsv",
                   "r")  # Entry   Gene ontology IDs       Sequence
        fout = open("uniprot-" + onto + "-bonnie.tsv", "w")
        for index, line in tqdm(enumerate(
                fin)):  #### retain the same ordering as original input
            if index == 0:
                fout.write(line.strip() + "\tprot3dvec\n")  ## header
                continue  ##!! skip header
def main():
    import argparse
    parser = argparse.ArgumentParser(
        'Script for evaluating similarity model on SCOP test set.')

    parser.add_argument(
        'model',
        help=
        'path to saved model file or "nw-align" for Needleman-Wunsch alignment score baseline'
    )

    parser.add_argument('--dev',
                        action='store_true',
                        help='use train/dev split')

    parser.add_argument(
        '--batch-size',
        default=64,
        type=int,
        help='number of sequence pairs to process in each batch (default: 64)')

    parser.add_argument('-d',
                        '--device',
                        type=int,
                        default=-2,
                        help='compute device to use')

    parser.add_argument('--coarse',
                        action='store_true',
                        help='use coarse comparison rather than full SSA')

    args = parser.parse_args()

    scop_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.sampledpairs.txt'

    eval_paths = [
        ('2.06-test',
         'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.test.sampledpairs.txt'
         ),
        ('2.07-new',
         'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.07-new.allpairs.txt'
         )
    ]
    if args.dev:
        scop_train_path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.train.sampledpairs.txt'

        eval_paths = [(
            '2.06-dev',
            'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.dev.sampledpairs.txt'
        )]

    ## load the data
    alphabet = Uniprot21()
    x0_train, x1_train, y_train = load_pairs(scop_train_path, alphabet)

    ## load the model
    if args.model == 'nw-align':
        model = NWAlign(alphabet)
    elif args.model in ['hhalign', 'phmmer', 'TMalign']:
        model = args.model
    else:
        model = torch.load(args.model)
        model.eval()

        ## set the device
        d = args.device
        use_cuda = (d != -1) and torch.cuda.is_available()
        if d >= 0:
            torch.cuda.set_device(d)

        if use_cuda:
            model.cuda()

        mode = 'align'
        if args.coarse:
            mode = 'coarse'
        model = TorchModel(model, use_cuda, mode=mode)

    batch_size = args.batch_size

    ## for calculating the classification accuracy, first find the best partitions using the training set
    if type(model) is str:
        path = 'data/SCOPe/astral-scopedom-seqres-gd-sel-gs-bib-95-2.06.train.sampledpairs.' \
                + model + '.npy'
        scores = np.load(path)
        scores = scores.mean(1)
    else:
        scores = score_pairs(model, x0_train, x1_train, batch_size)
    thresholds = find_best_thresholds(scores, y_train)

    print(
        'Dataset\tAccuracy\tPearson\'s r\tSpearman\'s rho\tClass\tFold\tSuperfamily\tFamily'
    )

    accuracy, r, rho, aupr = calculate_metrics(scores, y_train, thresholds)

    template = '{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}'

    line = '2.06-train\t' + template.format(accuracy, r, rho, aupr[0], aupr[1],
                                            aupr[2], aupr[3])
    #line = '\t'.join(['2.06-train', str(accuracy), str(r), str(rho), str(aupr[0]), str(aupr[1]), str(aupr[2]), str(aupr[3])])
    print(line)

    for dset, path in eval_paths:
        x0_test, x1_test, y_test = load_pairs(path, alphabet)
        if type(model) is str:
            path = os.path.splitext(path)[0]
            path = path + '.' + model + '.npy'
            scores = np.load(path)
            scores = scores.mean(1)
        else:
            scores = score_pairs(model, x0_test, x1_test, batch_size)
        accuracy, r, rho, aupr = calculate_metrics(scores, y_test, thresholds)

        line = dset + '\t' + template.format(accuracy, r, rho, aupr[0],
                                             aupr[1], aupr[2], aupr[3])
        #line = '\t'.join([dset, str(accuracy), str(r), str(rho), str(aupr[0]), str(aupr[1]), str(aupr[2]), str(aupr[3])])
        print(line)
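
# The classification accuracy above depends on cut points fitted to the training scores by
# find_best_thresholds. A minimal sketch of how such thresholds could be applied, assuming
# they are increasing cut points between adjacent structural-similarity levels
# (0 = different class ... 4 = same family); the actual threshold search is not shown here.
import numpy as np


def classify_by_thresholds(scores, thresholds):
    # np.digitize maps each score to the number of thresholds it meets or exceeds,
    # which serves directly as the predicted similarity level
    return np.digitize(scores, thresholds)


def level_accuracy(scores, y_true, thresholds):
    y_pred = classify_by_thresholds(scores, thresholds)
    return float((y_pred == np.asarray(y_true)).mean())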
Exemple #14
0
def main():
    import argparse
    parser = argparse.ArgumentParser('Script for evaluating contact map models.')
    parser.add_argument('model', help='path to saved model')
    parser.add_argument('--batch-size', default=10, type=int, help='number of sequences to process in each batch (default: 10)')
    parser.add_argument('-o', '--output', help='output file path (default: stdout)')
    parser.add_argument('-d', '--device', type=int, default=-2, help='compute device to use')
    parser.add_argument('--individual', action='store_true')
    args = parser.parse_args()

    # load the data
    fasta_path = 'data/casp12/casp12.fm-domains.seq.fa'
    contact_paths = glob.glob('data/casp12/domains_T0/*.png')
    
    alphabet = Uniprot21()
    baselines = None
    if args.model == 'baselines':
        x,y,names,baselines = load_data(fasta_path, contact_paths, alphabet, baselines=True)
    else:
        x,y,names = load_data(fasta_path, contact_paths, alphabet)

    if baselines is not None:
        output = args.output
        if output is None:
            output = sys.stdout
        else:
            output = open(output, 'w')

        lengths = np.array([len(x_) for x_ in x])
        calc_baselines(baselines, y, lengths, names, output=output, individual=args.individual)

        sys.exit(0)

    ## set the device
    d = args.device
    use_cuda = (d != -1) and torch.cuda.is_available()
    if d >= 0:
        torch.cuda.set_device(d)

    if use_cuda:
        x = [x_.cuda() for x_ in x]
        y = [y_.cuda() for y_ in y]

    model = torch.load(args.model)
    model.eval()
    if use_cuda:
        model.cuda()

    # predict contact maps
    batch_size = args.batch_size
    dataset = ContactMapDataset(x, y)
    iterator = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=collate_lists)
    logits = []
    with torch.no_grad():
        for xmb,ymb in iterator:
            lmb = predict_minibatch(model, xmb, use_cuda)
            logits += lmb

    # calculate performance metrics
    lengths = np.array([len(x_) for x_ in x])
    logits = [logit.cpu().numpy() for logit in logits]
    y = [y_.cpu().numpy() for y_ in y]

    output = args.output
    if output is None:
        output = sys.stdout
    else:
        output = open(output, 'w')
    if args.individual:
        line = '\t'.join(['Distance', 'Protein', 'Precision', 'Recall', 'F1', 'AUPR', 'Precision@L', 'Precision@L/2', 'Precision@L/5'])
    else:
        line = '\t'.join(['Distance', 'Precision', 'Recall', 'F1', 'AUPR', 'Precision@L', 'Precision@L/2', 'Precision@L/5'])
    print(line, file=output)
    output.flush()

    # for all contacts
    y_flat = []
    logits_flat = []
    for i in range(len(y)):
        yi = y[i]
        # entries with a negative label (presumably positions without ground-truth
        # contact information) are excluded from the metrics
        mask = (yi < 0)
        y_flat.append(yi[~mask])
        logits_flat.append(logits[i][~mask])

    # calculate precision, recall, F1, and area under the precision recall curve for all contacts
    precision = np.zeros(len(x))
    recall = np.zeros(len(x))
    F1 = np.zeros(len(x))
    AUPR = np.zeros(len(x))
    prL = np.zeros(len(x))
    prL2 = np.zeros(len(x))
    prL5 = np.zeros(len(x))
    for i in range(len(x)):
        pr,re,f1,aupr = calc_metrics(logits_flat[i], y_flat[i])
        precision[i] = pr
        recall[i] = re
        F1[i] = f1
        AUPR[i] = aupr

        # precision@L: fraction of true contacts among the L highest-scoring pairs,
        # where L is the sequence length (and likewise for L/2 and L/5 below)
        order = np.argsort(logits_flat[i])[::-1]
        n = lengths[i]
        topL = order[:n]
        prL[i] = y_flat[i][topL].mean()
        topL2 = order[:n//2]
        prL2[i] = y_flat[i][topL2].mean()
        topL5 = order[:n//5]
        prL5[i] = y_flat[i][topL5].mean()

    if args.individual:
        template = 'All\t{}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}'
        for i in range(len(x)):
            name = names[i]
            line = template.format(name,precision[i], recall[i], F1[i], AUPR[i], prL[i], prL2[i], prL5[i])
            print(line, file=output)
    else:
        template = 'All\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}'
        line = template.format(precision.mean(), recall.mean(), F1.mean(), AUPR.mean(), prL.mean(), prL2.mean(), prL5.mean())
        print(line, file=output)
    output.flush()

    # for Medium/Long range contacts
    y_flat = []
    logits_flat = []
    for i in range(len(y)):
        yi = y[i]
        mask = (yi < 0)

        # mask the lower triangle and all pairs within 11 residues of the diagonal,
        # keeping only contacts with sequence separation >= 12 (medium/long range)
        medlong = np.tril_indices(len(yi), k=11)
        medlong_mask = np.zeros((len(yi),len(yi)), dtype=np.uint8)
        medlong_mask[medlong] = 1
        mask = mask | (medlong_mask == 1)

        y_flat.append(yi[~mask])
        logits_flat.append(logits[i][~mask])

    # calculate precision, recall, F1, and area under the precision recall curve for medium/long-range contacts
    precision = np.zeros(len(x))
    recall = np.zeros(len(x))
    F1 = np.zeros(len(x))
    AUPR = np.zeros(len(x))
    prL = np.zeros(len(x))
    prL2 = np.zeros(len(x))
    prL5 = np.zeros(len(x))
    for i in range(len(x)):
        pr,re,f1,aupr = calc_metrics(logits_flat[i], y_flat[i])
        precision[i] = pr
        recall[i] = re
        F1[i] = f1
        AUPR[i] = aupr

        order = np.argsort(logits_flat[i])[::-1]
        n = lengths[i]
        topL = order[:n]
        prL[i] = y_flat[i][topL].mean()
        topL2 = order[:n//2]
        prL2[i] = y_flat[i][topL2].mean()
        topL5 = order[:n//5]
        prL5[i] = y_flat[i][topL5].mean()

    if args.individual:
        template = 'Medium/Long\t{}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}'
        for i in range(len(x)):
            name = names[i]
            line = template.format(name,precision[i], recall[i], F1[i], AUPR[i], prL[i], prL2[i], prL5[i])
            print(line, file=output)
    else:
        template = 'Medium/Long\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}\t{:.5f}'
        line = template.format(precision.mean(), recall.mean(), F1.mean(), AUPR.mean(), prL.mean(), prL2.mean(), prL5.mean())
        print(line, file=output)
    output.flush()