def load_saved_state(self, autoclass_state_filename: str) -> None:
    checkpoint = torch.load(autoclass_state_filename)
    # Older saved states predate the context filter key; fall back to the default.
    if 'context-filter' not in checkpoint:
        print("Warning: could not find context filter in saved autoclass state, "
              "using default...")
        checkpoint['context-filter'] = "default"
    # Sanity-check that the checkpoint carries every field we need
    # before rebuilding the models.
    assert checkpoint['tokenizer']
    assert checkpoint['tokenizer-name']
    assert checkpoint['stem-embedding']
    assert checkpoint['decoder']
    assert checkpoint['num-decoder-layers']
    assert checkpoint['encoded-size']
    assert checkpoint['hidden-size']
    assert checkpoint['context-filter']
    assert checkpoint['learning-rate']
    assert checkpoint['training-loss']
    assert checkpoint['epoch']
    self.options = [
        ("tokenizer", checkpoint['tokenizer-name']),
        ("# input keywords", checkpoint['num-keywords']),
        ("max input length", checkpoint['max-length']),
        ("# encoder layers", checkpoint['num-encoder-layers']),
        ("hidden size", checkpoint['hidden-size']),
        ("# decoder layers", checkpoint['num-decoder-layers']),
        ("context filter", checkpoint['context-filter']),
        ("optimizer (autoencoder)", checkpoint['autoenc-optimizer']),
        ("optimizer (classifier)", checkpoint['optimizer']),
        ("learning rate (autoencoder)", checkpoint['autoenc-learning-rate']),
        ("learning rate (classifier)", checkpoint['learning-rate']),
        ("training loss (autoencoder)",
         "{:.4f}".format(checkpoint['autoenc-training-loss'])),
        ("training loss (classifier)",
         "{:.4f}".format(checkpoint['training-loss'])),
        ("# epochs (autoencoder)", checkpoint['autoenc-epoch'] + 1),
        ("# epochs (classifier)", checkpoint['epoch'] + 1),
    ]
    self.tokenizer = checkpoint['tokenizer']
    self.embedding = checkpoint['stem-embedding']
    # Rebuild the encoder and classifier with the saved hyperparameters,
    # then load their trained weights.
    self.encoder = maybe_cuda(EncoderRNN(self.tokenizer.numTokens(),
                                         checkpoint['hidden-size'],
                                         checkpoint['num-encoder-layers']))
    self.encoder.load_state_dict(checkpoint['encoder'])
    print("Have {} embedding tokens".format(self.embedding.num_tokens()))
    self.decoder = maybe_cuda(ClassifierDNN(checkpoint['encoded-size'],
                                            checkpoint['hidden-size'],
                                            self.embedding.num_tokens(),
                                            checkpoint['num-decoder-layers']))
    self.decoder.load_state_dict(checkpoint['decoder'])
    self.max_length = checkpoint['max-length']
    self.context_filter = checkpoint['context-filter']
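# Usage sketch (hedged): load_saved_state is a method, but its enclosing
# predictor class is not shown in this file, so the class name below is
# hypothetical. The keys it asserts on are exactly the ones written into the
# checkpoint dict by main() below.
#
#     predictor = AutoClassPredictor()
#     predictor.load_saved_state("autoclass-weights.dat")
#     for option_name, option_value in predictor.options:
#         print("{}: {}".format(option_name, option_value))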
def main(arg_list: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Autoencoder for coq terms")
    parser.add_argument("scrape_file")
    parser.add_argument("autoencoder_weights")
    parser.add_argument("save_file")
    parser.add_argument("--num-epochs", dest="num_epochs", default=15, type=int)
    parser.add_argument("--batch-size", dest="batch_size", default=256, type=int)
    parser.add_argument("--max-tuples", dest="max_tuples", default=None, type=int)
    parser.add_argument("--print-every", dest="print_every", default=10, type=int)
    parser.add_argument("--learning-rate", dest="learning_rate",
                        default=.7, type=float)
    parser.add_argument("--gamma", default=.9, type=float)
    parser.add_argument("--epoch-step", dest="epoch_step", default=5, type=int)
    parser.add_argument("--optimizer", choices=list(stdargs.optimizers.keys()),
                        type=str, default=list(stdargs.optimizers.keys())[0])
    parser.add_argument("--num-classifier-layers", dest="num_classifier_layers",
                        default=3, type=int)
    parser.add_argument("--classifier-hidden-size", dest="classifier_hidden_size",
                        default=128, type=int)
    parser.add_argument("--train-autoencoder", dest="train_autoencoder",
                        default=False, const=True, action='store_const')
    args = parser.parse_args(arg_list)

    print("Loading autoencoder state...")
    autoenc_state = torch.load(args.autoencoder_weights)
    cfilter = autoenc_state['context-filter']

    text_data = get_text_data(args)
    print("Encoding data...")
    start = time.time()
    tokenizer = autoenc_state['tokenizer']
    embedding = SimpleEmbedding()
    # Tokenize each goal and encode each tactic stem as a class label.
    dataset = [(tokenizer.toTokenList(goal),
                embedding.encode_token(get_stem(tactic)))
               for prev_tactics, hyps, goal, tactic in text_data]
    timeTaken = time.time() - start
    print("Encoded data in {:.2f}s".format(timeTaken))

    # Rebuild the pretrained autoencoder and load its weights.
    loadedAutoencoder = maybe_cuda(EncoderRNN(tokenizer.numTokens(),
                                              autoenc_state['hidden-size'],
                                              autoenc_state['num-encoder-layers'],
                                              args.batch_size))
    loadedAutoencoder.load_state_dict(autoenc_state['encoder'])
    checkpoints = train(dataset, loadedAutoencoder, args.train_autoencoder,
                        autoenc_state['max-length'],
                        autoenc_state['hidden-size'],
                        args.classifier_hidden_size,
                        embedding.num_tokens(),
                        args.num_classifier_layers,
                        args.batch_size, args.learning_rate,
                        args.gamma, args.epoch_step, args.num_epochs,
                        args.print_every,
                        stdargs.optimizers[args.optimizer])

    # Save a full checkpoint after every epoch; later saves overwrite
    # earlier ones, so the file always holds the most recent state.
    for epoch, (decoder_state, autoencoder_state, training_loss) \
            in enumerate(checkpoints):
        print("Autoenc training loss is {:.4f}".format(
            autoenc_state['training-loss']))
        state = {'epoch': epoch,
                 'training-loss': training_loss,
                 'autoenc-training-loss': autoenc_state['training-loss'],
                 'autoenc-epoch': autoenc_state['epoch'],
                 'tokenizer': tokenizer,
                 'tokenizer-name': autoenc_state['tokenizer-name'],
                 'optimizer': args.optimizer,
                 'autoenc-optimizer': autoenc_state['optimizer'],
                 'learning-rate': args.learning_rate,
                 'autoenc-learning-rate': autoenc_state['learning-rate'],
                 'encoder': autoencoder_state,
                 'decoder': decoder_state,
                 'num-decoder-layers': args.num_classifier_layers,
                 'num-encoder-layers': autoenc_state['num-encoder-layers'],
                 'context-filter': cfilter,
                 'max-length': autoenc_state['max-length'],
                 'encoded-size': autoenc_state['hidden-size'],
                 'hidden-size': args.classifier_hidden_size,
                 'num-keywords': autoenc_state['num-keywords'],
                 'stem-embedding': embedding}
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
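# Example invocation (a sketch; the script name and data file names are
# hypothetical, and the valid --optimizer choices come from stdargs.optimizers
# at runtime):
#
#     python autoclass.py scrape.txt autoenc-weights.dat classifier-weights.dat \
#         --num-epochs 20 --batch-size 512 --train-autoencoder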
def train(dataset: ClassifySequenceDataset,
          autoencoder: EncoderRNN, train_autoencoder: bool,
          max_length: int, encoder_hidden_size: int,
          classifier_hidden_size: int, output_vocab_size: int,
          num_layers: int, batch_size: int,
          learning_rate: float, gamma: float, epoch_step: int,
          num_epochs: int, print_every: int,
          optimizer_f: Callable[..., Optimizer]) -> Iterable[Checkpoint]:
    print("Initializing PyTorch...")
    # Pad/truncate every goal to a fixed length so the batches stack cleanly.
    in_stream = [normalizeSentenceLength(goal, max_length)
                 for goal, tactic in dataset]
    out_stream = [tactic for goal, tactic in dataset]
    dataloader = torchdata.DataLoader(
        torchdata.TensorDataset(torch.LongTensor(in_stream),
                                torch.LongTensor(out_stream)),
        batch_size=batch_size, num_workers=0,
        shuffle=True, pin_memory=True, drop_last=True)

    classifier = maybe_cuda(ClassifierDNN(encoder_hidden_size,
                                          classifier_hidden_size,
                                          output_vocab_size,
                                          num_layers, batch_size))
    # Always train the classifier; optionally fine-tune the autoencoder too.
    optimizers = [optimizer_f(classifier.parameters(), lr=learning_rate)]
    if train_autoencoder:
        optimizers += [optimizer_f(autoencoder.parameters(), lr=learning_rate)]
    criterion = maybe_cuda(nn.NLLLoss())
    adjusters = [scheduler.StepLR(optimizer, epoch_step, gamma)
                 for optimizer in optimizers]

    start = time.time()
    num_items = len(dataset) * num_epochs
    total_loss = 0.

    print("Training...")
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        # Decay the learning rate(s) every epoch_step epochs
        # (schedulers stepped at the top of the epoch, pre-1.1 PyTorch style).
        for adjuster in adjusters:
            adjuster.step()
        for batch_num, (input_batch, output_batch) in enumerate(dataloader):
            # Reset the optimizers
            for optimizer in optimizers:
                optimizer.zero_grad()

            # Run the classifier on pre-encoded vectors
            encoded_input_batch = autoencoder.run(
                cast(torch.LongTensor, input_batch))
            prediction_distribution = classifier.run(encoded_input_batch)

            # Get the loss
            output_var = maybe_cuda(Variable(output_batch))
            loss = criterion(prediction_distribution, output_var)

            # Update the weights
            loss.backward()
            for optimizer in optimizers:
                optimizer.step()

            # Report progress
            items_processed = ((batch_num + 1) * batch_size
                               + epoch * len(dataset))
            total_loss += loss.item() * batch_size
            assert isinstance(total_loss, float)
            if (batch_num + 1) % print_every == 0:
                progress = items_processed / num_items
                print("{} ({:7} {:5.2f}%) {:.4f}".format(
                    timeSince(start, progress), items_processed,
                    progress * 100, total_loss / items_processed))
        # Yield a checkpoint at the end of every epoch so the caller
        # can save intermediate states.
        yield Checkpoint(classifier_state=classifier.state_dict(),
                         autoencoder_state=autoencoder.state_dict(),
                         training_loss=total_loss / items_processed)
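# Conventional entry point so the module can be run directly; a sketch that
# assumes `sys` is imported at the top of the file (the imports are not shown
# in this excerpt).
if __name__ == "__main__":
    main(sys.argv[1:])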