Example #1
    def load_saved_state(self, autoclass_state_filename: str) -> None:
        # Restore the tokenizer, embedding, encoder, and classifier from a
        # previously saved checkpoint on disk.
        checkpoint = torch.load(autoclass_state_filename)

        if 'context-filter' not in checkpoint:
            print(
                "Warning: could not find context filter in saved autoclass state, "
                "using default...")
            checkpoint['context-filter'] = "default"

        assert checkpoint['tokenizer']
        assert checkpoint['tokenizer-name']
        assert checkpoint['stem-embedding']
        assert checkpoint['decoder']
        assert checkpoint['num-decoder-layers']
        assert checkpoint['encoded-size']
        assert checkpoint['hidden-size']
        assert checkpoint['context-filter']
        assert checkpoint['learning-rate']
        assert checkpoint['training-loss']
        assert checkpoint['epoch']

        # Record the hyperparameters this model was trained with, for later reporting.
        self.options = [
            ("tokenizer", checkpoint['tokenizer-name']),
            ("# input keywords", checkpoint['num-keywords']),
            ("max input length", checkpoint['max-length']),
            ("# encoder layers", checkpoint['num-encoder-layers']),
            ("hidden size", checkpoint['hidden-size']),
            ("# decoder layers", checkpoint['num-decoder-layers']),
            ("context filter", checkpoint['context-filter']),
            ("optimizer (autoencoder)", checkpoint['autoenc-optimizer']),
            ("optimizer (classifier)", checkpoint['optimizer']),
            ("learning rate (autoencoder)",
             checkpoint['autoenc-learning-rate']),
            ("learning rate (classifier)", checkpoint['learning-rate']),
            ("training loss (autoencoder)",
             "{:.4f}".format(checkpoint['autoenc-training-loss'])),
            ("training loss (classifier)",
             "{:.4f}".format(checkpoint['training-loss'])),
            ("# epochs (autoencoder)", checkpoint['autoenc-epoch'] + 1),
            ("# epochs (classifier)", checkpoint['epoch'] + 1)
        ]

        self.tokenizer = checkpoint['tokenizer']
        self.embedding = checkpoint['stem-embedding']
        # Rebuild the encoder and classifier networks and load their trained weights.
        self.encoder = maybe_cuda(
            EncoderRNN(self.tokenizer.numTokens(), checkpoint['hidden-size'],
                       checkpoint['num-encoder-layers']))
        self.encoder.load_state_dict(checkpoint['encoder'])
        print("Have {} embedding tokens".format(self.embedding.num_tokens()))
        self.decoder = maybe_cuda(
            ClassifierDNN(checkpoint['encoded-size'],
                          checkpoint['hidden-size'],
                          self.embedding.num_tokens(),
                          checkpoint['num-decoder-layers']))
        self.decoder.load_state_dict(checkpoint['decoder'])
        self.max_length = checkpoint['max-length']
        self.context_filter = checkpoint['context-filter']
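
A minimal usage sketch of the method above; the predictor class name and the checkpoint file name are hypothetical, since the class that defines load_saved_state is not shown in this example:

# Hypothetical usage: "AutoClassPredictor" and "classifier-checkpoint.dat" are
# placeholders; only load_saved_state and self.options come from the example above.
predictor = AutoClassPredictor()
predictor.load_saved_state("classifier-checkpoint.dat")
for name, value in predictor.options:
    print("{}: {}".format(name, value))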
Example #2
def main(arg_list: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Autoencoder for coq terms")
    parser.add_argument("scrape_file")
    parser.add_argument("autoencoder_weights")
    parser.add_argument("save_file")
    parser.add_argument("--num-epochs",
                        dest="num_epochs",
                        default=15,
                        type=int)
    parser.add_argument("--batch-size",
                        dest="batch_size",
                        default=256,
                        type=int)
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--print-every",
                        dest="print_every",
                        default=10,
                        type=int)
    parser.add_argument("--learning-rate",
                        dest="learning_rate",
                        default=.7,
                        type=float)
    parser.add_argument("--gamma", default=.9, type=float)
    parser.add_argument("--epoch-step", dest="epoch_step", default=5, type=int)
    parser.add_argument("--optimizer",
                        choices=list(stdargs.optimizers.keys()),
                        type=str,
                        default=list(stdargs.optimizers.keys())[0])
    parser.add_argument("--num-classifier-layers",
                        dest="num_classifier_layers",
                        default=3,
                        type=int)
    parser.add_argument("--classifier-hidden-size",
                        dest="classifier_hidden_size",
                        default=128,
                        type=int)
    parser.add_argument("--train-autoencoder",
                        dest="train_autoencoder",
                        default=False,
                        const=True,
                        action='store_const')
    args = parser.parse_args(arg_list)
    print("Loading autoencoder state...")
    autoenc_state = torch.load(args.autoencoder_weights)
    cfilter = autoenc_state['context-filter']

    # Encode the scraped data: goals become token lists, tactic stems become
    # class indices in the stem embedding.
    text_data = get_text_data(args)
    print("Encoding data...")
    start = time.time()
    tokenizer = autoenc_state['tokenizer']
    embedding = SimpleEmbedding()
    dataset = [(tokenizer.toTokenList(goal),
                embedding.encode_token(get_stem(tactic)))
               for prev_tactics, hyps, goal, tactic in text_data]
    timeTaken = time.time() - start
    print("Encoded data in {:.2f} seconds".format(timeTaken))

    # Rebuild the pretrained autoencoder's encoder and train a classifier on top of it.
    loadedAutoencoder = maybe_cuda(
        EncoderRNN(tokenizer.numTokens(), autoenc_state['hidden-size'],
                   autoenc_state['num-encoder-layers'], args.batch_size))
    loadedAutoencoder.load_state_dict(autoenc_state['encoder'])
    checkpoints = train(
        dataset, loadedAutoencoder, args.train_autoencoder,
        autoenc_state['max-length'],
        autoenc_state['hidden-size'], args.classifier_hidden_size,
        embedding.num_tokens(), args.num_classifier_layers, args.batch_size,
        args.learning_rate, args.gamma, args.epoch_step, args.num_epochs,
        args.print_every, stdargs.optimizers[args.optimizer])

    # Save a checkpoint after each epoch, overwriting the previous save file.
    for epoch, (decoder_state, autoencoder_state,
                training_loss) in enumerate(checkpoints):
        print("Autoenc training loss is {:.4f}".format(
            autoenc_state['training-loss']))
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'autoenc-training-loss': autoenc_state['training-loss'],
            'autoenc-epoch': autoenc_state['epoch'],
            'tokenizer': tokenizer,
            'tokenizer-name': autoenc_state['tokenizer-name'],
            'optimizer': args.optimizer,
            'autoenc-optimizer': autoenc_state['optimizer'],
            'learning-rate': args.learning_rate,
            'autoenc-learning-rate': autoenc_state['learning-rate'],
            'encoder': autoencoder_state,
            'decoder': decoder_state,
            'num-decoder-layers': args.num_classifier_layers,
            'num-encoder-layers': autoenc_state['num-encoder-layers'],
            'context-filter': cfilter,
            'max-length': autoenc_state['max-length'],
            'encoded-size': autoenc_state['hidden-size'],
            'hidden-size': args.classifier_hidden_size,
            'num-keywords': autoenc_state['num-keywords'],
            'stem-embedding': embedding,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
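
A sketch of how main might be wired up as a command-line entry point; the script name and data file names below are placeholders, not taken from the example:

import sys

# Hypothetical entry point; "scrape.txt", "autoenc.dat", and "classifier.dat" are
# placeholder values for the positional scrape_file, autoencoder_weights, save_file.
if __name__ == "__main__":
    main(sys.argv[1:])

# e.g.  python train_classifier.py scrape.txt autoenc.dat classifier.dat \
#            --num-epochs 20 --train-autoencoder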
Example #3
def train(dataset: ClassifySequenceDataset,
          autoencoder: EncoderRNN, train_autoencoder: bool, max_length: int,
          encoder_hidden_size: int, classifier_hidden_size: int,
          output_vocab_size: int, num_layers: int, batch_size: int,
          learning_rate: float, gamma: float, epoch_step: int, num_epochs: int,
          print_every: int, optimizer_f: Callable[..., Optimizer]) \
          -> Iterable[Checkpoint]:
    print("Initializing PyTorch...")
    in_stream = [
        normalizeSentenceLength(goal, max_length) for goal, tactic in dataset
    ]
    out_stream = [tactic for goal, tactic in dataset]
    dataloader = \
        torchdata.DataLoader(torchdata.TensorDataset(torch.LongTensor(in_stream),
                                                     torch.LongTensor(out_stream)),
                             batch_size=batch_size, num_workers=0,
                             shuffle=True, pin_memory=True, drop_last=True)

    classifier = maybe_cuda(
        ClassifierDNN(encoder_hidden_size, classifier_hidden_size,
                      output_vocab_size, num_layers, batch_size))
    optimizers = [optimizer_f(classifier.parameters(), lr=learning_rate)]
    # Only update the autoencoder's weights when fine-tuning is requested.
    if train_autoencoder:
        optimizers += [optimizer_f(autoencoder.parameters(), lr=learning_rate)]
    criterion = maybe_cuda(nn.NLLLoss())
    adjusters = [
        scheduler.StepLR(optimizer, epoch_step, gamma)
        for optimizer in optimizers
    ]

    start = time.time()
    num_items = len(dataset) * num_epochs
    total_loss = 0

    print("Training...")
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        # Decay the learning rate(s) on the fixed epoch schedule.
        for adjuster in adjusters:
            adjuster.step()
        for batch_num, (input_batch, output_batch) in enumerate(dataloader):

            # Zero the accumulated gradients before this batch
            for optimizer in optimizers:
                optimizer.zero_grad()

            # Run the classifier on pre-encoded vectors
            encoded_input_batch = autoencoder.run(
                cast(torch.LongTensor, input_batch))
            prediction_distribution = classifier.run(encoded_input_batch)

            # Get the loss
            output_var = maybe_cuda(Variable(output_batch))
            loss = criterion(prediction_distribution, output_var)

            # Update the weights
            loss.backward()
            for optimizer in optimizers:
                optimizer.step()

            # Report progress
            items_processed = (batch_num +
                               1) * batch_size + epoch * len(dataset)
            total_loss += loss.item() * batch_size
            assert isinstance(total_loss, float)

            if (batch_num + 1) % print_every == 0:

                progress = items_processed / num_items
                print("{} ({:7} {:5.2f}%) {:.4f}".format(
                    timeSince(start, progress), items_processed,
                    progress * 100, total_loss / items_processed))

        yield Checkpoint(classifier_state=classifier.state_dict(),
                         autoencoder_state=autoencoder.state_dict(),
                         training_loss=total_loss / items_processed)
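
Checkpoint itself is not shown in these examples; a plausible definition, inferred from the keyword arguments used here and the positional unpacking in Example #2, might look like this (an assumption, not the original source):

from typing import Any, Dict, NamedTuple

# Assumed shape of Checkpoint; the field order matches the
# (decoder_state, autoencoder_state, training_loss) unpacking in Example #2.
class Checkpoint(NamedTuple):
    classifier_state: Dict[str, Any]
    autoencoder_state: Dict[str, Any]
    training_loss: float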