def test(dim, args):
    """Evaluate the best saved LSTM classifier for ``dim`` on the test split.

    Loads ``weights/LSTM/<dim>/best-weights.pth``, scores every test sentence,
    prints AUC / recall / accuracy (decision threshold 0.5) and writes the same
    three numbers to ``weights/LSTM/<dim>/scores.txt``.

    Args:
        dim: Name of the dimension; selects both the dataset and the weight
            directory.
        args: Parsed options; only ``args.hidden_dim`` is read here.

    Raises:
        FileNotFoundError: If the saved weight file does not exist.
    """
    import torch
    import numpy as np
    from features import ExtractWordEmbeddings
    from preprocess_data import batchify, padBatch
    from models.lstm import LSTMClassifier
    from sklearn.utils import shuffle
    from sklearn.metrics import roc_auc_score, recall_score, accuracy_score

    # hyperparameters
    # Use the GPU only when one is actually present, so evaluation also runs
    # on CPU-only hosts instead of crashing in ``.cuda()``.
    is_cuda = torch.cuda.is_available()
    batch_size = 60
    embedding_dim = 300
    hidden_dim = args.hidden_dim
    weight_dir = 'weights/LSTM/%s' % dim
    weight_file = join(weight_dir, 'best-weights.pth')
    # Explicit raise instead of ``assert``: asserts are stripped under ``-O``.
    if not os.path.exists(weight_file):
        raise FileNotFoundError(
            "The file directory for the saved model doesn't exist")

    # load datasets
    X_t, y_t = loadDatasetForLSTM(dim, 'test')

    # load model and settings
    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim)
    # Map to CPU first so checkpoints saved on a GPU load anywhere; the model
    # is moved to the GPU afterwards when one is available.
    state_dict = torch.load(weight_file, map_location='cpu')
    model.load_state_dict(state_dict)
    if is_cuda:
        model.cuda()
    em = ExtractWordEmbeddings(emb_type='glove')

    # score the test set batch by batch
    y_scores = []
    X_t, y_t = shuffle(X_t, y_t)
    val_batches = batchify(X_t, y_t, batch_size)
    model.eval()
    with torch.no_grad():
        # Labels of each batch are not needed here: metrics are computed
        # against the full ``y_t`` once all scores are collected.
        for X_b, _ in val_batches:
            embedded = [
                em.obtain_vectors_from_sentence(sent, True) for sent in X_b
            ]
            inputs = torch.tensor(padBatch(embedded)).float()
            if is_cuda:
                inputs = inputs.cuda()
            y_scores.extend(model(inputs).tolist())

    y_preds = np.array(np.array(y_scores) >= 0.5, dtype=int)
    auc = roc_auc_score(y_true=y_t, y_score=y_scores)
    rec = recall_score(y_true=y_t, y_pred=y_preds)
    acc = accuracy_score(y_true=y_t, y_pred=y_preds)

    print('AUC: ', round(auc, 2))
    print('REC: ', round(rec, 2))
    print('ACC: ', round(acc, 2))

    with open(join(weight_dir, 'scores.txt'), 'w') as f:
        f.write('AUC: %1.2f\n' % auc)
        f.write('REC: %1.2f\n' % rec)
        f.write('ACC: %1.2f\n' % acc)
    return
class LSTMTrainer(Trainer):
    """Trainer that couples an ``LSTMClassifier`` with a cross-entropy loss.

    ``opt`` is a dict-like configuration; the keys read here are ``'cuda'``,
    ``'optim'``, ``'lr'`` and ``'max_grad_norm'``.
    """

    def __init__(self, opt, emb_matrix=None):
        """Build the model, loss and optimizer from ``opt``.

        Args:
            opt: Configuration mapping (also forwarded to ``LSTMClassifier``).
            emb_matrix: Optional embedding matrix forwarded to the model.
        """
        self.opt = opt
        self.emb_matrix = emb_matrix
        self.model = LSTMClassifier(opt, emb_matrix=emb_matrix)
        self.criterion = nn.CrossEntropyLoss()
        # Only trainable parameters are handed to the optimizer.
        self.parameters = [
            p for p in self.model.parameters() if p.requires_grad
        ]
        if opt['cuda']:
            self.model.cuda()
            self.criterion.cuda()
        self.optimizer = torch_utils.get_optimizer(
            opt['optim'], self.parameters, opt['lr'])

    def update(self, batch):
        """Run one optimization step on ``batch`` and return the loss value."""
        inputs, labels = unpack_batch(batch)

        # Step 1: init and forward
        self.model.train()
        self.optimizer.zero_grad()
        logits = self.model(inputs)
        loss = self.criterion(logits, labels)
        loss_val = loss.item()

        # Step 2: backward, with gradient-norm clipping. Frozen parameters
        # have no gradient, so clipping the trainable list is equivalent to
        # clipping ``self.model.parameters()``.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.parameters,
                                       self.opt['max_grad_norm'])

        # Step 3: update
        self.optimizer.step()
        return loss_val

    def predict(self, batch, unsort=True):
        """Evaluate ``batch`` without updating the model.

        Args:
            batch: Batch accepted by ``unpack_batch``.
            unsort: Unused; kept for interface compatibility.

        Returns:
            Tuple ``(predictions, probs, labels, loss_val)`` where
            ``predictions`` are argmax class indices, ``probs`` the softmax
            rows, ``labels`` the gold labels (all plain Python lists) and
            ``loss_val`` the cross-entropy loss as a float.
        """
        inputs, labels = unpack_batch(batch)
        self.model.eval()
        # No autograd graph is needed for inference; building one only
        # wastes memory and time.
        with torch.no_grad():
            logits = self.model(inputs)
            loss_val = self.criterion(logits, labels).item()
            probs = F.softmax(logits, 1).cpu().numpy().tolist()
            predictions = np.argmax(logits.cpu().numpy(), axis=1).tolist()
        labels = labels.detach().cpu().numpy().tolist()
        return predictions, probs, labels, loss_val
def train(dim, args):
    """Train an LSTM classifier for ``dim`` with early stopping.

    Each epoch trains on the shuffled training split, then measures the summed
    BCE loss on the dev split. Whenever the dev loss improves, the weights are
    saved to ``weights/LSTM/<dim>/best-weights.pth``. Training stops after
    ``args.max_epochs`` epochs or after 10 consecutive epochs without
    improvement, whichever comes first.

    Args:
        dim: Name of the dimension; selects the dataset and the save directory.
        args: Parsed options; ``hidden_dim``, ``max_epochs`` and ``lr`` are
            read here.
    """
    import torch
    from torch import nn, optim
    import numpy as np
    from features import ExtractWordEmbeddings
    from preprocess_data import batchify, padBatch
    from models.lstm import LSTMClassifier
    from sklearn.utils import shuffle

    # hyperparameters
    embedding_dim = 300  # changes only with different word embeddings
    hidden_dim = args.hidden_dim
    max_epochs = args.max_epochs
    # Use the GPU only when one is present so training also runs on CPU hosts.
    is_cuda = torch.cuda.is_available()
    batch_size = 60
    lr = args.lr
    n_decreases = 10  # early-stopping patience, in epochs
    save_dir = 'weights/LSTM/%s' % dim
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)

    # Loading train / validation datasets
    #   X_*: a list of tokenized sentences
    #   y_*: a list of 0 and 1
    X_tr, y_tr = loadDatasetForLSTM(dim, 'train')
    X_d, y_d = loadDatasetForLSTM(dim, 'dev')

    # load model and settings for training
    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim)
    if is_cuda:
        model.cuda()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    old_val = np.inf  # best validation loss seen so far
    em = ExtractWordEmbeddings(emb_type='glove')
    loss_fn = nn.BCELoss()

    def to_tensors(X_b, y_b):
        """Turn one batch of tokenized sentences and labels into tensors.

        ``obtain_vectors_from_sentence(sent, True)`` maps each word to an
        embedding (the second argument is ``include_unk``); ``padBatch``
        pads the batch to a common length.
        """
        embedded = [
            em.obtain_vectors_from_sentence(sent, True) for sent in X_b
        ]
        # presumably (B, seq_len, dim) after padding, per the original comment
        inputs = torch.tensor(padBatch(embedded)).float()
        targets = torch.tensor(y_b, dtype=torch.float32)
        if is_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        return inputs, targets

    cnt_decrease = 0  # consecutive epochs without improvement
    for epoch in range(1, max_epochs + 1):
        # train
        model.train()
        tr_loss = 0.0
        # reshuffle every epoch before cutting into batches
        X_tr, y_tr = shuffle(X_tr, y_tr)
        for X_b, y_b in batchify(X_tr, y_tr, batch_size):
            optimizer.zero_grad()
            inputs, targets = to_tensors(X_b, y_b)
            outputs = model(inputs)
            # NOTE(review): the original marked this line "# error here".
            # BCELoss needs outputs and targets of identical shape - confirm
            # the model output shape matches the label vector.
            loss = loss_fn(outputs, targets)
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
        print("[Epoch %d] train loss: %1.3f" % (epoch, tr_loss))

        # validate
        model.eval()
        current_loss = 0.0
        X_d, y_d = shuffle(X_d, y_d)
        with torch.no_grad():
            for X_b, y_b in batchify(X_d, y_d, batch_size):
                inputs, targets = to_tensors(X_b, y_b)
                outputs = model(inputs)
                current_loss += loss_fn(outputs, targets).item()
        print("[Epoch %d] validation loss: %1.3f" % (epoch, current_loss))

        if current_loss < old_val:
            # better than every previous epoch: checkpoint and reset patience
            torch.save(model.state_dict(), join(save_dir, 'best-weights.pth'))
            print("Updated model")
            old_val = current_loss
            cnt_decrease = 0
        else:
            cnt_decrease += 1
            if cnt_decrease >= n_decreases:
                break
    return
def main():
    """Command-line entry point: train or test the multi-label LSTM classifier.

    ``--phase train`` encodes the labelled corpora, splits them 80/20 into
    train/validation sets and calls ``train_lstm``. ``--phase test`` loads the
    saved model, scores ``--test_file`` and prints, for each sentence, its
    index, its words, the labels scoring above 0.5 and the product of those
    scores.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--phase', type=str, help='Train or test.')
    parser.add_argument('--embedding_file', type=str,
                        help='Filename to save the trained word embeddings.')
    parser.add_argument('--model_path', type=str,
                        help='The file of the lstm model.')
    parser.add_argument('--test_file', type=str,
                        help='The file of the tesing data.')
    parser.add_argument('--epochs', type=int, default=100,
                        help='The number of training epochs.')
    parser.add_argument('--batch_size', type=int, default=50,
                        help='The batch size of the training phrase.')
    args = parser.parse_args()

    phase = args.phase
    embedding_file = args.embedding_file
    model_path = args.model_path

    embeddings, word2id, id2word = load_embedding(embedding_file)
    id2label = {
        0: u'游戏',
        1: u'角色扮演',
        2: u'moba',
        3: u'运动',
        4: u'三国',
        5: u'战争',
        6: u'服饰',
        7: u'T恤',
        8: u'婚姻',
    }

    EMBEDDING_DIM = 100
    HIDDEN_DIM = 200
    LINEAR_HIDDEN_DIM = 100
    N_CLASSES = len(id2label)

    # Create the lstm model
    model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, LINEAR_HIDDEN_DIM,
                           len(word2id), N_CLASSES, embeddings)
    optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()), lr=5e-4)
    print(model)

    if phase == 'train':
        print('Load the training data and prepare labels...')
        game_roleplay = 'data/train/1k_std_rollplay.word'
        game_moba = 'data/train/1k_std_moba.word'
        game_sport = 'data/train/1k_std_sport_game.word'
        sanguo_battle = 'data/train/1k_std_sanguo.word'
        cloth_shirt = 'data/train/1k_std_cloth.word'
        marriage = 'data/train/1k_std_marriage.word'
        sport = 'data/train/1k_std_sport.word'

        # corpus name -> (file, multi-hot label vector indexed like id2label)
        corpus2label = {
            'game_roleplay': (game_roleplay, [1, 1, 0, 0, 0, 0, 0, 0, 0]),
            'game_moba': (game_moba, [1, 0, 1, 0, 0, 0, 0, 0, 0]),
            'game_sport': (game_sport, [1, 0, 0, 1, 0, 0, 0, 0, 0]),
            'sanguo_battle': (sanguo_battle, [0, 0, 0, 0, 1, 1, 0, 0, 0]),
            'cloth_shirt': (cloth_shirt, [0, 0, 0, 0, 0, 0, 1, 1, 0]),
            'marriage': (marriage, [0, 0, 0, 0, 0, 0, 0, 0, 1]),
            'sport': (sport, [0, 0, 0, 1, 0, 0, 0, 0, 0]),
        }

        corpus_data = []
        labels = []
        for file_name, label in corpus2label.values():
            print(file_name, label)
            tmp_codes, tmp_labels = encode_setence(file_name, word2id, label)
            corpus_data.extend(tmp_codes)
            labels.extend(tmp_labels)
        corpus_data, lengths = get_padding_codes(corpus_data)

        corpus_data = torch.tensor(np.array(corpus_data), dtype=torch.long)
        lengths = torch.tensor(np.array(lengths), dtype=torch.long)
        labels = torch.tensor(np.array(labels), dtype=torch.float)

        # Random 80/20 train/validation split
        n_samples = corpus_data.shape[0]
        train_size = int(n_samples * 0.8)
        indices = list(range(n_samples))
        random.shuffle(indices)
        train_indices = indices[0:train_size]
        validate_indices = indices[train_size:]

        train_data = corpus_data[train_indices, :]
        train_labels = labels[train_indices, :]
        train_lengths = lengths[train_indices]

        validate_data = corpus_data[validate_indices, :]
        validate_labels = labels[validate_indices, :]
        validate_lengths = lengths[validate_indices]

        # Move everything to the GPU when one is present. The original tested
        # the function object ``torch.cuda.is_available`` (always truthy)
        # instead of calling it, which crashed on CPU-only machines.
        if torch.cuda.is_available():
            train_data = train_data.cuda()
            train_lengths = train_lengths.cuda()
            train_labels = train_labels.cuda()
            validate_data = validate_data.cuda()
            validate_labels = validate_labels.cuda()
            validate_lengths = validate_lengths.cuda()
            model.cuda()

        text_data = TextDataset(train_data, train_labels, train_lengths)
        train_dataloader = data.DataLoader(text_data,
                                           batch_size=args.batch_size,
                                           shuffle=True)

        print('Train the LSTM text classifier model...')
        train_lstm(model, model_path, optimizer, train_dataloader,
                   validate_data, validate_labels, validate_lengths,
                   args.epochs)

    if phase == 'test':
        test_file = args.test_file
        model.load_state_dict(torch.load(model_path))
        optimizer.zero_grad()

        test_data, labels = encode_setence(test_file, word2id, 1)
        padding_test_data, lengths = get_padding_codes(test_data)
        padding_test_data = torch.tensor(np.array(padding_test_data),
                                         dtype=torch.long)
        lengths = torch.tensor(np.array(lengths), dtype=torch.long)

        scores = evaluate_lstm(model, padding_test_data, lengths)
        # Convert once to a NumPy array for the reporting loop. The original
        # assigned this to a misspelled name, so the conversion was discarded.
        scores = scores.data.cpu().numpy()

        # print the result for every test sentence
        for idx, score in enumerate(scores):
            sentence = [id2word[int(code)] for code in test_data[idx]]
            selected = np.where(score > 0.5)[0]
            tmp_labels = [id2label[i] for i in selected]
            # product of the scores of all selected labels (1.0 when none)
            tmp_score = np.array([float(score[i]) for i in selected]).prod()
            print(idx)
            print(' '.join(sentence))
            print(' '.join(tmp_labels))
            print(tmp_score)
def __init__(self, models_dir='./models/lstm_trained_models', embeddings_dir='./embeddings', is_cuda=False):
    """
    Load the three embedding extractors and one LSTM model per dimension.

    For every dimension, the first file in ``models_dir`` whose name contains
    both ``'-best.lstm'`` and the dimension name is loaded; the embedding
    type is inferred from the file name ('glove', 'word2vec' or 'fasttext').

    @param models_dir: the directory where the LSTM models are stored
    @param embeddings_dir: the directory where the embeddings are stored.
    The directory must contain the following subdirectories:
        word2vec/GoogleNews-vectors-negative300.wv
        fasttext/wiki-news-300d-1M-subword.wv
        glove/glove.42B.300d.wv
    @param is_cuda: to enable cuda
    """
    self.is_cuda = is_cuda
    self.models_dir = models_dir
    self.embeddings_dir = embeddings_dir

    # load embeddings
    self.em_glove = ExtractWordEmbeddings('glove', emb_dir=self.embeddings_dir)
    self.em_word2vec = ExtractWordEmbeddings('word2vec', emb_dir=self.embeddings_dir)
    self.em_fasttext = ExtractWordEmbeddings('fasttext', emb_dir=self.embeddings_dir)
    self.dimensions_list = [
        'support', 'knowledge', 'conflict', 'power', 'similarity', 'fun',
        'status', 'trust', 'identity', 'romance'
    ]

    # Resolve the CUDA request once, instead of re-printing the same
    # diagnostics for every dimension.
    if self.is_cuda:
        print(f'Torch version: {torch.__version__}')
        print(f'Torch CUDA available : {torch.cuda.is_available()}')
        if torch.cuda.is_available():
            print(f'Torch current device : {torch.cuda.current_device()}')
            print(f'Torch device count : {torch.cuda.device_count()}')
            print(f'Torch device name : {torch.cuda.get_device_name(0)}')
        else:
            print(
                'Cuda not available. Instantiated the TenDimensionsClassifier with CUDA=False'
            )
            self.is_cuda = False

    # Checked in this order, mirroring the original if/elif chain.
    embeddings_by_name = (
        ('glove', self.em_glove),
        ('word2vec', self.em_word2vec),
        ('fasttext', self.em_fasttext),
    )

    # load models
    self.dim2model = {}
    self.dim2embedding = {}
    model_files = os.listdir(self.models_dir)  # directory listing is loop-invariant
    for dim in self.dimensions_list:
        model = LSTMClassifier(embedding_dim=300, hidden_dim=300)
        if self.is_cuda:
            model.cuda()
        model.eval()
        for modelname in model_files:
            if '-best.lstm' not in modelname or dim not in modelname:
                continue
            em = next(
                (extractor for key, extractor in embeddings_by_name
                 if key in modelname),
                None)
            if em is None:
                # Without a recognizable embedding type the model cannot be
                # paired with the right vectors; previously this silently
                # reused the embedding of an earlier dimension (or raised a
                # NameError on the first one).
                print(f'Skipping {modelname}: unknown embedding type in file name')
                continue
            best_state = torch.load(join(self.models_dir, modelname),
                                    map_location='cpu')
            model.load_state_dict(best_state)
            self.dim2model[dim] = model
            self.dim2embedding[dim] = em
            break