Example 1
def main(args):
    """Output learned representation of RRMs from CNN_LSTM autoencoder."""

    # Load pickled vocab
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Load the already preprocessed data
    df_aligned = preprocess(preprocessed=True,
                            RRM_path=args.processed_RRM_path,
                            output_path=args.processed_RRM_path,
                            vocab=vocab)

    # Data loader
    loader = RRM_Sequence(df_aligned, vocab)
    loader = DataLoader(loader, batch_size=16, shuffle=True,
                        collate_fn=collate_fn)

    # Dimensions mirror the training script below: (alignment length, vocab size,
    # embed size) for the encoder; (embed, hidden, vocab, layers) for the decoder.
    encoderCNN = ResNetEncoder(84, 26, 64)  # TODO don't hardcode?
    decoderRNN = DecoderRNN(64, 128, 26, 1)  # TODO don't hardcode?

    # Use CUDA if available
    if torch.cuda.is_available():
        encoderCNN.cuda()
        decoderRNN.cuda()

    # Load pickled models
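    # Checkpoints saved on GPU may need torch.load(..., map_location='cpu')
    # if this export is run on a CPU-only machine.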
    with open(args.encoder_path, 'rb') as encoder:
        encoderCNN.load_state_dict(torch.load(encoder))
    with open(args.decoder_path, 'rb') as decoder:
        decoderRNN.load_state_dict(torch.load(decoder))

    # Loop over data
    for batch_idx, (names, rrms_aligned, rrms_unaligned,
                    lengths) in enumerate(loader):
        rrms_aligned = to_var(rrms_aligned)
        rrms_unaligned = to_var(rrms_unaligned)

        features = encoderCNN(rrms_aligned)

        if args.which_representation == 'encoder':
            hiddens = features
        else:
            hiddens = forward(decoderRNN, features, rrms_unaligned, lengths)

        hiddens = hiddens.data.cpu().numpy()
        # hiddens have shape (16, 64) for encoder, (16, 128) for decoder

        if batch_idx == 0:
            df = pd.DataFrame(hiddens)
            df['name'] = names
        else:
            df1 = pd.DataFrame(hiddens)
            df1['name'] = names
            df = pd.concat([df, df1])

    # Write to file
    df.to_csv(args.hidden_path)
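
A minimal sketch of how this entry point might be invoked. The repository's actual CLI wiring is not shown here; the argument names are taken from the args.* attributes used above, while the types, defaults, and required flags are assumptions:

import argparse

if __name__ == '__main__':
    # Hypothetical argument parser mirroring the args.* attributes in main().
    parser = argparse.ArgumentParser(
        description='Export learned RRM representations from a trained autoencoder.')
    parser.add_argument('--vocab_path', required=True)
    parser.add_argument('--processed_RRM_path', required=True)
    parser.add_argument('--encoder_path', required=True)
    parser.add_argument('--decoder_path', required=True)
    parser.add_argument('--which_representation', default='encoder',
                        choices=['encoder', 'decoder'])
    parser.add_argument('--hidden_path', required=True)
    main(parser.parse_args())
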
Example 2
def main(args):
    # Make a directory to save models
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Preprocess the RRM data
    vocab, df_aligned = preprocess(preprocessed=args.preprocessed,
                                   RRM_path=args.aligned_RRM_path,
                                   output_path=args.processed_RRM_path,
                                   sep=args.sep)
    df_aligned = train_test_split(df_aligned)
    with open(os.path.join(args.model_path, 'vocab.pkl'), 'wb') as f:
        pickle.dump(vocab, f)

    # Prepare the training and validation sets
    train_index = pd.read_csv('../data/train_index.csv',
                              header=None).iloc[:, 0]
    train_loader = RRM_Sequence(df_aligned.loc[train_index, :], vocab)
    train_loader = DataLoader(train_loader,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)

    val_index = pd.read_csv('../data/val_index.csv', header=None).iloc[:, 0]
    val_loader = RRM_Sequence(df_aligned.loc[val_index, :], vocab)
    val_loader = DataLoader(val_loader,
                            batch_size=args.batch_size,
                            shuffle=True,
                            collate_fn=collate_fn)

    # Define the models
    encoder = ResNetEncoder(df_aligned.shape[1], len(vocab), args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Use CUDA if available
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Define the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(train_loader)
    val_loss_history = []
    stop = False  # initialize so the early-stopping check below is always defined
    for epoch in range(args.num_epochs):
        for batch_idx, (names, rrms_aligned, rrms_unaligned,
                        lengths) in enumerate(train_loader):
            rrms_aligned = to_var(rrms_aligned)
            rrms_unaligned = to_var(rrms_unaligned)
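            # pack_padded_sequence in this PyTorch version expects batches
            # sorted by decreasing length; collate_fn is assumed to return
            # them already sorted.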
            targets = pack_padded_sequence(rrms_unaligned,
                                           lengths,
                                           batch_first=True)[0]

            # Forward, backward, and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(rrms_aligned)
            outputs = decoder(features, rrms_unaligned, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if (batch_idx + 1) % args.log_step == 0:
                val_loss = validate(val_loader, encoder, decoder, criterion)
                val_loss_history.append(val_loss)
                print(
                    'Epoch [%d/%d], Step [%d/%d], Training Loss: %.4f, Validation loss: %.4f'
                    % (epoch + 1, args.num_epochs, batch_idx + 1, total_step,
                       loss.data[0], val_loss))
                stop = early_stop(val_loss_history)
                if stop:
                    print(
                        '=== Early stopping === Validation loss not improving significantly ==='
                    )
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'decoder-anneal%s-%dcolumns-%d-%d.pkl' %
                            (args.learning_rate_annealing, df_aligned.shape[1],
                             epoch + 1, batch_idx + 1)))
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'encoder-anneal%s-%dcolumns-%d-%d.pkl' %
                            (args.learning_rate_annealing, df_aligned.shape[1],
                             epoch + 1, batch_idx + 1)))
                    break

            # Save the models
            if (batch_idx + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'decoder-anneal%s-%dcolumns-%d-%d.pkl' %
                        (args.learning_rate_annealing, df_aligned.shape[1],
                         epoch + 1, batch_idx + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'encoder-anneal%s-%dcolumns-%d-%d.pkl' %
                        (args.learning_rate_annealing, df_aligned.shape[1],
                         epoch + 1, batch_idx + 1)))

        # Decay the learning rate if specified
        if args.learning_rate_annealing:
            adjust_learning_rate(optimizer, epoch + 1)

        if stop:
            break
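
The training loop relies on early_stop and adjust_learning_rate helpers defined elsewhere in the repository. A minimal sketch of what they might look like, purely for illustration; the patience window, tolerance, and decay schedule are assumptions, not the project's actual values:

def early_stop(val_loss_history, patience=5, min_delta=1e-3):
    """Assumed heuristic: stop when the best recent validation loss is no
    longer a meaningful improvement over the best earlier one."""
    if len(val_loss_history) <= patience:
        return False
    best_recent = min(val_loss_history[-patience:])
    best_earlier = min(val_loss_history[:-patience])
    return best_earlier - best_recent < min_delta


def adjust_learning_rate(optimizer, epoch, initial_lr=1e-3, decay=0.5, step=5):
    """Assumed schedule: decay the learning rate geometrically every `step` epochs."""
    lr = initial_lr * (decay ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr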