def model_fn(model_dir):
    # Rebuild the architecture, then load the trained weights and the
    # tokenizer that were saved alongside the model artifact.
    model = BiLSTMModel(
        torch.zeros((41299, 300)),
        nClasses=4,
        hiddenSizeEncoder=2048,
        hiddenSizeCls=512,
        layers=1,
        dropProb=0.0)
    weights = torch.load(Path(model_dir) / '{}.pt'.format(MODEL_NAME),
                         map_location=DEVICE)
    model.load_state_dict(weights)
    model.to(DEVICE)
    model.eval()

    tokenizer = Tokenizer(Vocab())
    tokenizer.from_disk(Path(model_dir) / '{}'.format(TOKENIZER))
    return {'model': model, 'tokenizer': tokenizer}
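
# A minimal sketch of a companion predict_fn that consumes the dict returned by
# model_fn above. The `word2idx` lookup, the UNK index, and the model's
# (seq1, seq2) forward signature are assumptions made for illustration, not
# taken from the source; adapt them to the preprocessing used at training time.
def predict_fn(input_data, artifacts, word2idx, unk_idx=0):
    model = artifacts['model']
    tokenizer = artifacts['tokenizer']

    def to_tensor(text):
        # Hypothetical token-to-index mapping for a single sequence.
        idxs = [word2idx.get(tok.text, unk_idx) for tok in tokenizer(text)]
        return torch.tensor([idxs], dtype=torch.long, device=DEVICE)

    with torch.no_grad():
        logits = model(to_tensor(input_data['seq1']),
                       to_tensor(input_data['seq2']))  # assumed forward signature
    return int(torch.argmax(logits, dim=1))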
# Create reverse dicts
i2w = {v: k for k, v in w2i.items()}
i2w[UNK] = "<unk>"
i2t = {v: k for k, v in t2i.items()}
ntags = len(t2i)

log.pr_cyan("The vocabulary size is %d" % nwords)

if MODEL_TYPE == 'emb-att':
    model = EmbAttModel(nwords, EMB_SIZE, ntags)
elif MODEL_TYPE == 'emb-lstm-att':
    model = BiLSTMAttModel(nwords, EMB_SIZE, HID_SIZE, ntags)
elif MODEL_TYPE == 'no-att-only-lstm':
    model = BiLSTMModel(nwords, EMB_SIZE, HID_SIZE, ntags)
else:
    raise ValueError("model type not compatible")

calc_ce_loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Tensor type aliases: switch to the CUDA variants and move the model to GPU
# when one is available (note that `type` shadows the Python builtin).
type = torch.LongTensor
float_type = torch.FloatTensor
use_cuda = torch.cuda.is_available()
if use_cuda:
    type = torch.cuda.LongTensor
    float_type = torch.cuda.FloatTensor
    model.cuda()

print("evaluating without any training ...")
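
# A minimal sketch of one training step using the objects set up above. The
# assumption that the model takes a padded LongTensor of word ids and returns
# unnormalised logits is for illustration only; the attention variants may also
# return attention weights.
def train_step(words_batch, tags_batch):
    words = torch.tensor(words_batch).type(type)   # uses the alias defined above
    tags = torch.tensor(tags_batch).type(type)
    logits = model(words)                          # assumed forward signature
    loss = calc_ce_loss(logits, tags)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()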
# Model #######################################
# LSTM only baseline model
if args.model == 'lstm':
    lstm_kwargs = {
        'vocab_size': vocab_size,
        'pad_idx': pad_idx,
        'rnn_emb_dim': args.rnn_emb_dim,
        'rnn_dim': args.rnn_dim,
        'rnn_num_layers': args.rnn_num_layers,
        'rnn_dropout': args.rnn_dropout,
        'fc_dims': args.fc_dims,
        'fc_use_batchnorm': args.fc_use_batchnorm,
        'fc_dropout': args.fc_dropout,
    }
    model = BiLSTMModel(**lstm_kwargs)
# LSTM + CNN baseline model
elif args.model == 'cnn':
    cnnlstm_kwargs = {
        'vocab_size': vocab_size,
        'pad_idx': pad_idx,
        'rnn_emb_dim': args.rnn_emb_dim,
        'rnn_dim': args.rnn_dim,
        'rnn_num_layers': args.rnn_num_layers,
        'rnn_dropout': args.rnn_dropout,
        'cnn_feat_dim': args.cnn_feat_dim,
        'cnn_res_block_dim': args.cnn_res_block_dim,
        'cnn_num_res_blocks': args.cnn_num_res_blocks,
        'cnn_proj_dim': args.cnn_proj_dim,
        'cnn_pooling': args.cnn_pooling,
    opt.length, word2id=train_dataset.word2id,
    id2word=train_dataset.id2word)

if opt.model == 'GRU':
    model = GRUModel(
        vocab_size=train_dataset.vocab_size,
        embedding_size=opt.embedding_size,
        output_size=opt.output_dim,
        hidden_dim=opt.hidden_dim,
        n_layers=opt.n_layer,
    )
elif opt.model == 'BiLSTM':
    model = BiLSTMModel(
        vocab_size=train_dataset.vocab_size,
        embedding_size=opt.embedding_size,
        output_size=opt.output_dim,
        hidden_dim=opt.hidden_dim,
        n_layers=opt.n_layer,
        bidirectional=True)
else:
    model = RNNModel(
        vocab_size=train_dataset.vocab_size,
        embedding_size=opt.embedding_size,
        output_size=opt.output_dim,
        hidden_dim=opt.hidden_dim,
        n_layers=opt.n_layer,
    )

optimizer = optim.Adam(model.parameters(), lr=opt.lr, betas=opt.betas)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer = Trainer(model,
def train(params):
    print('=' * 89)
    print('Initializing...')

    # Initialize NLP tools
    vectorPath = Path(params.vector_cache_dir) / 'snli_1.0' / params.word_embeds
    vocab = VocabBuilder(Path(vectorPath))
    if params.rebuild_vocab or not (Path.is_file(vocab.tokenizerPath)
                                    and Path.is_dir(vocab.vectorPath)):
        print('No vocabulary found. Rebuilding from dataset')
        if not Path.is_dir(Path.cwd() / vectorPath):
            Path.mkdir(Path.cwd() / vectorPath, parents=True)
        nlp = spacy.load(params.word_embeds)
        fullTokenizer = nlp.tokenizer
        fullVectors = nlp.vocab.vectors

        # Combine all sequences in SNLI dataset
        sequences = loadSequences(
            Path.cwd() / params.data_dir / 'snli_1.0',
            filenames=('snli_1.0_train.txt', 'snli_1.0_test.txt', 'snli_1.0_dev.txt'),
            seq1Col='sentence1',
            seq2Col='sentence2',
            filterPred=lambda r: r['gold_label'] != '-',
            sep='\t')

        # Learn vocabulary from sequences
        vocab.learnVocab(sequences, fullTokenizer, fullVectors)
        vocab.toDisk()
    vocab.fromDisk()
    torch.save(vocab.tokenizer,
               Path.cwd() / params.model_dir / '{0}Tokenizer.pt'.format(params.word_embeds))

    # Preprocess datasets
    print('Preprocessing dataset...')
    datasetConsMap = {
        'snli_1.0': SNLIDataset,
        'asap-sas': ASAPSASDataset,
        'MRPC': MRPCDataset
    }
    datasetDir = Path.cwd() / params.data_dir / params.dataset
    datasetCons = datasetConsMap[params.dataset]
    trainDataLoader = DataLoader(
        datasetCons(datasetDir, tokenizer=vocab.tokenizer, split='train'),
        batch_size=params.batch_size, shuffle=True, num_workers=4)
    evalLoader = DataLoader(
        datasetCons(datasetDir, tokenizer=vocab.tokenizer, split='dev'),
        batch_size=params.batch_size, shuffle=True, num_workers=4)
    trainSize = len(trainDataLoader.dataset)
    evalSize = len(evalLoader.dataset)
    nClasses = trainDataLoader.dataset.nClasses

    # Model parameters
    modelName = 'encoder' if params.mode == 'train_encoder' else 'cls'
    if params.mode == 'train_encoder':
        model = BiLSTMModel(
            torch.Tensor(vocab.vectors.data),
            nClasses=nClasses,
            hiddenSizeEncoder=params.hidden_size_encoder,
            hiddenSizeCls=params.hidden_size_cls,
            layers=params.lstm_layers,
            dropProb=params.dropout)
    elif params.mode == 'train_cls':
        model = BiLSTMModel(
            torch.zeros(vocab.vectors.data.shape),
            nClasses=3,
            hiddenSizeEncoder=params.hidden_size_encoder,
            hiddenSizeCls=params.hidden_size_cls,
            layers=params.lstm_layers,
            dropProb=0.0)
        bestWeights = torch.load(Path.cwd() / params.model_dir / 'encoderParams.pt',
                                 map_location=DEVICE)
        model.load_state_dict(bestWeights)
        model.classifier = SeqClassifier(
            params.hidden_size_encoder * 8,
            hiddenSize=params.hidden_size_cls,
            nClasses=nClasses,
            dropProb=params.dropout)
        model.freezeEncoder()
    model = model.to(DEVICE)

    # Training parameters
    lossFunc = nn.CrossEntropyLoss()
    optimizerCons = optim.SGD if params.optimizer == 'sgd' else optim.Adam
    optimizer = optimizerCons(model.parameters(), lr=params.learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     patience=params.lr_patience,
                                                     factor=0.2)

    # Train/test loop
    bestAcc = 0
    bestLoss = 1E9
    startTime = time.time()
    bestWeights = copy.deepcopy(model.state_dict())
    stopTraining = False
    stopCount = 0
    print('-' * 89)
    print('Beginning training...')
    for epoch in range(1, params.epochs + 1):
        epochStartTime = time.time()
        trainLoss, trainCorrect = trainEpoch(model, trainDataLoader, optimizer, lossFunc)
        evalLoss, evalCorrect, scores, preds = evalEpoch(model, evalLoader, optimizer, lossFunc)

        # Epoch Loss
        epochLossTrain = trainLoss / trainSize
        epochAccTrain = trainCorrect.double() / trainSize
        epochLossEval = evalLoss / evalSize
        epochAccEval = evalCorrect.double() / evalSize
        print('[Epoch:\t{}/{}] | time {:5.2f}s | train loss: {:.4f} acc: {:.4f}\t'
              '| eval loss: {:.4f} acc: {:.4f} nCorrect: {:d}'.format(
                  epoch, params.epochs, time.time() - epochStartTime,
                  epochLossTrain, epochAccTrain,
                  epochLossEval, epochAccEval, evalCorrect))

        # Update learning rate
        scheduler.step(epochLossEval)
        if epochLossEval < bestLoss:
            bestLoss = epochLossEval
            stopCount = 0
        elif params.optimizer == 'sgd':
            optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / 5
            if optimizer.param_groups[0]['lr'] < 1e-5:
                stopTraining = True
        else:
            stopCount += 1
            if stopCount >= 4:
                stopTraining = True
        optimizer.param_groups[0]['lr'] *= (1 if params.optimizer != 'sgd' else 0.99)

        # Save state
        if epochAccEval > bestAcc:
            bestAcc = epochAccEval
            bestWeights = copy.deepcopy(model.state_dict())
            torch.save(bestWeights,
                       Path.cwd() / params.model_dir / '{0}Params.pt'.format(modelName))
            torch.save(model,
                       Path.cwd() / params.model_dir / '{0}Model.pt'.format(modelName))

        # Check for early stopping
        if stopTraining:
            break

    trainTime = time.time() - startTime
    print('Training complete in {:.0f}m {:.0f}s'.format(
        trainTime // 60, trainTime % 60))
    print('Best eval Acc: {:4f}'.format(bestAcc))

    # Load best model weights
    model.load_state_dict(bestWeights)
    return model, vocab.tokenizer
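
# A minimal sketch of how train() might be invoked. The concrete hyperparameter
# values and the argparse.Namespace construction are assumptions for
# illustration; the actual entry point presumably builds `params` with argparse.
if __name__ == '__main__':
    from argparse import Namespace
    params = Namespace(
        vector_cache_dir='.vector_cache', word_embeds='en_core_web_lg',
        rebuild_vocab=False, data_dir='data', dataset='snli_1.0',
        model_dir='models', mode='train_encoder', batch_size=64,
        hidden_size_encoder=2048, hidden_size_cls=512, lstm_layers=1,
        dropout=0.1, optimizer='sgd', learning_rate=0.1, lr_patience=1,
        epochs=20)
    model, tokenizer = train(params)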