def eval(tag_path, corpus_path):
    correct = 0
    total = 0
    acc_list = []
    model_name = MODEL_NAME
    embedding_dim = EMBEDDING_DIM
    hidden_dim = HIDDEN_DIM
    word_to_ix = WORD_TO_IX
    model = BiLSTM(len(word_to_ix), 5, embedding_dim, hidden_dim)
    checkpoint = torch.load(model_name)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    tag_to_ix = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4}
    sentences, tags = load_train_data(tag_path, corpus_path)
    labels = torch.tensor([[tag_to_ix[tag]] for tag in tags])
    with torch.no_grad():
        for i, sen in enumerate(tqdm(sentences)):
            input = prepare_sequence(sen, word_to_ix)
            output = model(input)
            _, predicted = torch.max(output.data, 1)
            label = labels[i]
            total += label.size(0)
            correct += (predicted == label).sum().item()
            acc = round(100 * correct / total, 2)
            acc_list.append(acc)
    assert len(acc_list) == len(sentences)
    final_acc = acc
    plt.plot(list(range(len(tags))), acc_list)
    plt.xlabel('pred_num')
    plt.ylabel('accuracy / %')
    plt.show()
    return final_acc
def predict(sentence):
    sentence = sentence.split()
    model_name = BEST_NAME
    embedding_dim = EMBEDDING_DIM
    hidden_dim = HIDDEN_DIM
    word_to_ix = WORD_TO_IX
    model = BiLSTM(len(word_to_ix), 5, embedding_dim, hidden_dim)
    checkpoint = torch.load(model_name)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()  # switch off dropout for inference
    input = prepare_sequence(sentence, word_to_ix)
    with torch.no_grad():
        output = model(input)
        print(output)
        _, predicted = torch.max(output.data, 1)
        print(predicted)
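# Minimal usage sketch (the review text below is a hypothetical example; it assumes
# the module-level constants used above, e.g. BEST_NAME and WORD_TO_IX, are defined):
# predict('this hair dryer works great and dries my hair fast')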
def get_time_to_score(tsv_path, thing, model_path):
    time_to_count = {}
    time_to_scoresum = {}
    if thing == 'hair_dryer':
        id = '732252283'
    elif thing == 'microwave':
        id = '423421857'
    else:
        id = '246038397'
    with open('train_' + thing + '_word_to_ix.json', 'r') as j:
        word_to_ix = json.load(j)
    embedding_dim = EMBEDDING_DIM
    hidden_dim = HIDDEN_DIM
    model = BiLSTM(len(word_to_ix), 5, embedding_dim, hidden_dim)
    checkpoints = torch.load(model_path)
    model.load_state_dict(checkpoints['model_state_dict'])
    model.eval()
    with open(tsv_path, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        for i, r in enumerate(reader):
            if i == 0 or r[4] != id:
                continue
            month, _, year = r[14].split('/')
            if year not in {'2014', '2015'}:
                continue
            time = get_idx_by_year_month(int(year), int(month))
            if time < 8:
                continue
            sen = (r[12] + ' ' + r[13]).lower()
            sen = re.sub(r'[^A-Za-z0-9,.!]+', ' ', sen)
            input = prepare_sequence(sen.split(), word_to_ix)
            with torch.no_grad():
                output = model(input)
                _, predicted = torch.max(output.data, 1)
            pred_score = predicted.item()
            if time not in time_to_count:
                time_to_count[time] = 0
                time_to_scoresum[time] = 0.
            time_to_count[time] += 1
            time_to_scoresum[time] += pred_score
    time_to_scoremean = {}
    for time in time_to_count.keys():
        time_to_scoremean[time] = time_to_scoresum[time] / time_to_count[time]
    print(time_to_count)
    return time_to_scoremean
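# Example call (hypothetical TSV path; 'thing' selects the product id as in the
# branches above, and MODEL_NAME is the checkpoint constant used elsewhere):
# mean_scores = get_time_to_score('reviews.tsv', 'hair_dryer', MODEL_NAME)
# print(mean_scores)  # {month_index: mean predicted star score, ...}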
def model_load_test(test_df, vocab_file, embeddings_file, pretrained_file,
                    test_prediction_dir, test_prediction_name, mode,
                    num_labels=2, max_length=50, gpu_index=0, batch_size=128):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieve model parameters from the checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    model = BiLSTM(embeddings,
                   num_labels=num_labels,
                   max_length=max_length,
                   device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing BiLSTM model on device: {} ".format(device), 20 * "=")
    batch_time, total_time, accuracy, predictions = test(model, test_loader)
    print("\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%\n"
          .format(batch_time, total_time, (accuracy * 100)))
    test_prediction = pd.DataFrame({'prediction': predictions})
    if not os.path.exists(test_prediction_dir):
        os.makedirs(test_prediction_dir)
    test_prediction.to_csv(os.path.join(test_prediction_dir, test_prediction_name),
                           index=False)
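# Example call (hypothetical file names; test_df is a pandas DataFrame with the
# columns expected by My_Dataset, and 'best.pth.tar' is an assumed checkpoint name):
# model_load_test(test_df, 'vocab.json', 'token_vec_300.bin', 'models/best.pth.tar',
#                 'output/', 'test_prediction.csv', mode='test')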
max_sent_length = 36  # set from the paper

# ---- Define Model, Loss, Optim ------
config = args
config.d_out = num_classes
config.n_directions = 2 if config.birnn else 1
print(config)
model = BiLSTM(config)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)

# ---- Test Model ------
if args.test:
    print("Test Mode: loading pre-trained model and testing on test set...")
    # model = torch.load(args.resume_snapshot, map_location=lambda storage, location: storage.cuda(args.gpu))
    model.load_state_dict(torch.load(args.resume_snapshot))
    test_acc = evaluate_dataset_batch(test_set, max_sent_length, model, w2v_map, label_to_ix)
    print("Accuracy: {}".format(test_acc))
    sys.exit(0)

# ---- Train Model ------
start = time.time()
best_val_acc = -1
iter = 0
header = ' Time Epoch Iteration Loss Train/Acc. Val/Acc.'
print(header)
log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>9.6f}'.split(','))
dev_log_template = ' '.join(
    '{:>6.0f},{:>5.0f},{:>9.0f},{:>9.6f},{:9.6f},{:11.6f}'.split(','))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        default='rnn',
        help="Available models are: 'rnn', 'cnn', 'bilstm', 'fasttext', and 'distilbert'\nDefault is 'rnn'")
    parser.add_argument('--train_data_path', type=str,
                        default="./data/train_clean.csv",
                        help="Path to the training data")
    parser.add_argument('--test_data_path', type=str,
                        default="./data/dev_clean.csv",
                        help="Path to the test data")
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--vectors', type=str, default='fasttext.simple.300d',
                        help="""
                        Pretrained vectors:
                        Visit https://github.com/pytorch/text/blob/9ce7986ddeb5b47d9767a5299954195a1a5f9043/torchtext/vocab.py#L146
                        for more
                        """)
    parser.add_argument('--max_vocab_size', type=int, default=750)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--bidirectional', type=bool, default=True)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--hidden_dim', type=int, default=64)
    parser.add_argument('--output_dim', type=int, default=1)
    parser.add_argument('--n_layers', type=int, default=2)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--n_epochs', type=int, default=5)
    parser.add_argument('--n_filters', type=int, default=100)
    parser.add_argument('--filter_sizes', type=list, default=[3, 4, 5])
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ########## BILSTM ##########
    if args.model == "bilstm":
        print('\nBiLSTM')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path, format='csv',
                                    fields=data_fields, skip_header=True,
                                    csv_reader_params={'delimiter': ","})
        test_data = TabularDataset(args.test_data_path, format='csv',
                                   fields=data_fields, skip_header=True,
                                   csv_reader_params={'delimiter': ","})
        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(args.seed))

        TEXT.build_vocab(train_data, max_size=args.max_vocab_size,
                         vectors=args.vectors, unk_init=torch.Tensor.normal_)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)
        pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
        unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

        model = BiLSTM(input_dim, embedding_dim, args.hidden_dim,
                       args.output_dim, args.n_layers, args.bidirectional,
                       args.dropout, pad_idx)

        pretrained_embeddings = TEXT.vocab.vectors
        model.embedding.weight.data.copy_(pretrained_embeddings)
        model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
        model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()
        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')
        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):
            start_time = time.time()
            train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))
            print(f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))
        test_loss, test_acc = evaluate(model, test_iterator, criterion)
        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
        # Test Loss: 0.139, Test Acc: 95.27%

    ########## VANILLA RNN ##########
    else:
        print('\nVanilla RNN')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path, format='csv',
                                    fields=data_fields, skip_header=True,
                                    csv_reader_params={'delimiter': ","})
        test_data = TabularDataset(args.test_data_path, format='csv',
                                   fields=data_fields, skip_header=True,
                                   csv_reader_params={'delimiter': ","})
        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(args.seed))

        TEXT.build_vocab(train_data, max_size=args.max_vocab_size,
                         vectors=args.vectors)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)

        model = RNN(input_dim, embedding_dim, args.hidden_dim, args.output_dim)

        pretrained_embeddings = TEXT.vocab.vectors
        model.embedding.weight.data.copy_(pretrained_embeddings)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()
        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')
        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):
            start_time = time.time()
            train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))
            print(f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))
        test_loss, test_acc = evaluate(model, test_iterator, criterion)
        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
        # Test Loss: 0.138, Test Acc: 95.05%
def model_train_validate_test(train_df, dev_df, test_df, embeddings_file,
                              vocab_file, target_dir, mode, num_labels=2,
                              max_length=50, epochs=50, batch_size=128,
                              lr=0.0005, patience=5, max_grad_norm=10.0,
                              gpu_index=0, if_save_model=False, checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the trained model will be saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if embeddings_file is not None:
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = BiLSTM(embeddings, num_labels=num_labels, device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')
    # -------------------- Preparation for training ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _ = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".format(
        valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training BiLSTM model on device: {}".format(device), 20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".format(
            epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _ = validate(model, dev_loader,
                                                             criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".format(
            epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0
            if if_save_model:
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    },
                    os.path.join(target_dir, "best.pth.tar"))
                print("save model successfully!\n")
            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(model, test_loader, criterion)
            print("Test accuracy: {:.4f}%\n".format(test_accuracy))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir, "test_prediction.csv"),
                                   index=False)
        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
class Seq_MNIST_Trainer():

    def __init__(self, trainer_params, args):
        self.args = args
        self.trainer_params = trainer_params
        random.seed(trainer_params.random_seed)
        torch.manual_seed(trainer_params.random_seed)
        if args.cuda:
            torch.cuda.manual_seed_all(trainer_params.random_seed)

        kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
        self.train_data = seq_mnist_train(trainer_params)
        self.val_data = seq_mnist_val(trainer_params)
        self.train_loader = DataLoader(self.train_data,
                                       batch_size=trainer_params.batch_size,
                                       shuffle=True, **kwargs)
        self.val_loader = DataLoader(self.val_data,
                                     batch_size=trainer_params.test_batch_size,
                                     shuffle=True, **kwargs)
        self.starting_epoch = 1
        self.prev_loss = 10000

        self.model = BiLSTM(trainer_params)
        self.criterion = wp.CTCLoss(size_average=True)
        self.labels = [i for i in range(trainer_params.num_classes - 1)]
        self.decoder = seq_mnist_decoder(labels=self.labels)

        if args.resume or args.eval or args.export:
            print("Loading model from {}".format(args.save_path))
            package = torch.load(args.save_path,
                                 map_location=lambda storage, loc: storage)
            self.model.load_state_dict(package['state_dict'])

        if args.cuda:
            torch.cuda.set_device(args.gpus)
            self.model = self.model.cuda()

        self.optimizer = optim.Adam(self.model.parameters(), lr=trainer_params.lr)

        if args.resume:
            self.optimizer.load_state_dict(package['optim_dict'])
            self.starting_epoch = package['starting_epoch']
            self.prev_loss = package['prev_loss']
            if args.cuda:
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()

        if args.init_bn_fc_fusion:
            if not trainer_params.prefused_bn_fc:
                self.model.batch_norm_fc.init_fusion()
                self.trainer_params.prefused_bn_fc = True
            else:
                raise Exception("BN and FC are already fused.")

    def serialize(self, model, trainer_params, optimizer, starting_epoch, prev_loss):
        package = {'state_dict': model.state_dict(),
                   'trainer_params': trainer_params,
                   'optim_dict': optimizer.state_dict(),
                   'starting_epoch': starting_epoch,
                   'prev_loss': prev_loss}
        return package

    def save_model(self, epoch, loss_value):
        print("Model saved at: {}\n".format(self.args.save_path))
        self.prev_loss = loss_value
        torch.save(self.serialize(model=self.model,
                                  trainer_params=self.trainer_params,
                                  optimizer=self.optimizer,
                                  starting_epoch=epoch + 1,
                                  prev_loss=self.prev_loss),
                   self.args.save_path)

    def train(self, epoch):
        self.model.train()
        for i, item in enumerate(self.train_loader):
            data, labels, output_len, lab_len = item
            data = Variable(data.transpose(1, 0), requires_grad=False)
            labels = Variable(labels.view(-1), requires_grad=False)
            output_len = Variable(output_len.view(-1), requires_grad=False)
            lab_len = Variable(lab_len.view(-1), requires_grad=False)
            if self.args.cuda:
                data = data.cuda()
            output = self.model(data)
            # print("Input = ", data.shape)
            # print("model output (x) = ", output)
            # print("GTs (y) = ", labels.type())
            # print("model output len (xs) = ", output_len.type())
            # print("GTs len (ys) = ", lab_len.type())
            # exit(0)
            loss = self.criterion(output, labels, output_len, lab_len)
            loss_value = loss.data[0]
            print("Loss value for epoch = {}/{} and batch {}/{} is = {:.4f}".format(
                epoch, self.trainer_params.epochs,
                (i + 1) * self.trainer_params.batch_size,
                len(self.train_data), loss_value))
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            if self.args.cuda:
                torch.cuda.synchronize()

    def test(self, epoch=0, save_model_flag=False):
        self.model.eval()
        loss_value = 0
        for i, item in enumerate(self.val_loader):
            data, labels, output_len, lab_len = item
            data = Variable(data.transpose(1, 0), requires_grad=False)
            labels = Variable(labels.view(-1), requires_grad=False)
            output_len = Variable(output_len.view(-1), requires_grad=False)
            lab_len = Variable(lab_len.view(-1), requires_grad=False)
            if self.args.cuda:
                data = data.cuda()
            output = self.model(data)
            # print("Input = ", data)
            # print("model output (x) = ", output.shape)
            # print("model output (x) = ", output)
            # print("Label = ", labels)
            # print("model output len (xs) = ", output_len)
            # print("GTs len (ys) = ", lab_len)
            index = random.randint(0, self.trainer_params.test_batch_size - 1)
            label = labels[index * self.trainer_params.word_size:
                           (index + 1) * self.trainer_params.word_size].data.numpy()
            label = label - 1
            prediction = self.decoder.decode(output[:, index, :],
                                             output_len[index], lab_len[index])
            accuracy = self.decoder.hit(prediction, label)
            print("Sample Label = {}".format(self.decoder.to_string(label)))
            print("Sample Prediction = {}".format(self.decoder.to_string(prediction)))
            print("Accuracy on Sample = {:.2f}%\n\n".format(accuracy))
            loss = self.criterion(output, labels, output_len, lab_len)
            loss_value += loss.data.numpy()
        loss_value /= (len(self.val_data) // self.trainer_params.test_batch_size)
        print("Average Loss Value for Val Data is = {:.4f}\n".format(float(loss_value)))
        if loss_value < self.prev_loss and save_model_flag:
            self.save_model(epoch, loss_value)

    def eval_model(self):
        self.test()

    def train_model(self):
        for epoch in range(self.starting_epoch, self.trainer_params.epochs + 1):
            self.train(epoch)
            self.test(epoch=epoch, save_model_flag=True)
            if epoch % 20 == 0:
                self.optimizer.param_groups[0]['lr'] = \
                    self.optimizer.param_groups[0]['lr'] * 0.98

    def export_model(self, simd_factor, pe):
        self.model.eval()
        self.model.export('r_model_fw_bw.hpp', simd_factor, pe)

    def export_image(self, idx=100):
        img, label = self.val_data.images[:, idx, :], self.val_data.labels[0][idx]
        img = img.transpose(1, 0)
        label -= 1
        label = self.decoder.to_string(label)

        from PIL import Image
        from matplotlib import cm
        im = Image.fromarray(np.uint8(cm.gist_earth(img) * 255))
        im.save('test_image.png')

        img = img.transpose(1, 0)
        img = np.reshape(img, (-1, 1))
        np.savetxt("test_image.txt", img, fmt='%.10f')

        f = open('test_image_gt.txt', 'w')
        f.write(label)
        f.close()
        print("Exported image with label = {}".format(label))
import torch
import torchvision

from model import BiLSTM
from data import load_dataset
from config import model_name, device

if __name__ == "__main__":
    # the string to test!
    test_string = "<s> john can"

    # ########################
    # LOAD DATASET
    # ########################
    corpus, word_to_idx, idx_to_word, train_dataset = load_dataset()

    # ########################
    # TEST VARIABLES
    # ########################
    model = BiLSTM(len(corpus))
    model.load_state_dict(torch.load(model_name))
    model.eval()

    sentence = test_string.split()
    sentence = torch.tensor([[word_to_idx[w] for w in sentence]])
    s = model.sample(sentence)
    print(test_string.split() + s)
char_to_id = mapping['char_to_id']
word_embeds = mapping['word_embeds']

model = BiLSTM(voca_size=len(word_to_id),
               word_emb_dim=100,
               pre_word_emb=word_embeds,
               char_emb_dim=25,
               char_lstm_dim=25,
               char_to_ix=char_to_id,
               n_cap=4,
               cap_emb_dim=8,
               hidden_dim=200,
               tag_to_ix=tag_to_id)

# Load the saved state dict directly (it is a dict, not a callable).
state_dict = torch.load(model_path)
model.load_state_dict(state_dict)
model.eval()


def test():
    test_sentences = loader.load_data(test_path, zeros=False)
    loader.update_tag_scheme(test_sentences, 'iob')
    test_data = loader.pepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id)
    print("%i sentences in test." % (len(test_data)))
    confusion_matrix = torch.zeros((len(tag_to_id) - 2, len(tag_to_id) - 2))
if 'cuda' in args.device:
    if torch.cuda.is_available():
        device = torch.device(args.device)
    else:
        print("cuda not available...")
print("Using device {}".format(device))

print("loading datasets...")
n = None
train_data = DataSource("train", n=n)
print("loaded {} train data".format(len(train_data)))
dev_data = DataSource("dev", n=n)
print("loaded {} dev data".format(len(dev_data)))
test_data = DataSource("test", n=n)
print("loaded {} test data".format(len(test_data)))

model = BiLSTM(128, device)
print("allocated model")

if args.restore == "":
    losses = train()
    print("graphing")
    graph_losses(losses)
else:
    model.load_state_dict(torch.load(args.restore))
    print("loaded weights from {}".format(args.restore))

confusion = evaluate()
print(confusion)
print("accuracy: {}".format(np.sum(np.diagonal(confusion))))
train_dataloader = create_dataloader("./data/wsj0_train",
                                     "./data/wsj0_train_merged_labels.npy",
                                     batch_size=batch_size, shuffle=True)
test_dataloader = create_dataloader("./data/wsj0_test", None,
                                    batch_size=batch_size, test=True, shuffle=False)

model = BiLSTM(40, 256, 47, 5, use_gpu=True)
# model = Model(40, 47, 256)
if checkpoint:
    model.load_state_dict(torch.load(checkpoint))
model = model.cuda()

ctc_loss = nn.CTCLoss()


def criterion(out, label, data_len, label_len):
    loss = ctc_loss(out, label, data_len, label_len)
    reg_loss = 0
    for param in model.parameters():
        reg_loss += (param ** 2).sum()
    factor = 0.00001
    loss += factor * reg_loss
    return loss


optimizer = Adam(model.parameters(), lr=1e-4, weight_decay=5e-5)
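# Sketch of how this criterion might be used in a training step, assuming the
# dataloader yields (data, label, data_len, label_len) and the model returns
# (T, N, C) scores as nn.CTCLoss expects; the variable names here are assumptions,
# not part of the original script:
# for data, label, data_len, label_len in train_dataloader:
#     out = model(data.cuda()).log_softmax(2)
#     loss = criterion(out, label, data_len, label_len)
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()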
def train():
    logging.basicConfig(level=logging.INFO, filename='log.txt', format='%(message)s')
    tag_path = TRAIN_TAG_PATH
    corpus_path = TRAIN_CORPUS_PATH
    save_model_name = MODEL_NAME
    best_model_name = BEST_NAME
    load_model_path = None
    embedding_dim = EMBEDDING_DIM
    hidden_dim = HIDDEN_DIM
    train_epoch = TRAIN_EPOCH
    word_to_ix = WORD_TO_IX
    start_epoch = 0
    best_score = 0.
    loss_info, train_avg_info, test_avg_info = [], [], []

    sentences, tags = load_train_data(tag_path, corpus_path)
    tag_to_ix = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4}
    label = torch.tensor([[tag_to_ix[tag]] for tag in tags])

    model = BiLSTM(len(word_to_ix), 5, embedding_dim, hidden_dim, dropout=0.3)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    if load_model_path is not None:
        checkpoints = torch.load(load_model_path)
        model.load_state_dict(checkpoints['model_state_dict'])
        optimizer.load_state_dict(checkpoints['optim_state_dict'])
        start_epoch = checkpoints['epoch']

    start_time = time.time()
    logging.info('----------------------')
    for epoch in range(start_epoch, train_epoch):
        running_loss = 0.0
        for i, sen in enumerate(tqdm(sentences)):
            optimizer.zero_grad()
            input = prepare_sequence(sen, word_to_ix)
            output = model(input)
            loss = criterion(output, label[i])
            running_loss += loss.item()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 15)
            optimizer.step()
        torch.save(
            {
                'model_state_dict': model.state_dict(),
                'optim_state_dict': optimizer.state_dict(),
                'epoch': epoch + 1
            }, save_model_name)
        train_avg = eval(TRAIN_TAG_PATH, TRAIN_CORPUS_PATH)
        test_avg = eval(TEST_TAG_PATH, TEST_CORPUS_PATH)
        loss_info.append(running_loss)
        train_avg_info.append(train_avg)
        test_avg_info.append(test_avg)
        logging.info('********')
        logging.info('epoch: {}'.format(epoch + 1))
        logging.info('loss: {}'.format(running_loss))
        logging.info('train avg: {}'.format(train_avg))
        logging.info('test avg: {}'.format(test_avg))
        if test_avg > best_score:
            torch.save({'model_state_dict': model.state_dict()}, best_model_name)
            best_score = test_avg
            print('save best')
    print('training time:', time.time() - start_time)
def getModelOptimizerTokenizer(model_type, vocab_file, embed_file=None,
                               bert_config_file=None, init_checkpoint=None,
                               label_list=None, do_lower_case=True,
                               num_train_steps=None, learning_rate=None,
                               base_learning_rate=None, warmup_proportion=None):
    if embed_file is not None:
        # pretrained embeddings were provided
        embeddings = pickle.load(open(embed_file, 'rb'))

    if model_type == "BiLSTM":
        logger.info("model = BiLSTM")
        tokenizer = WordLevelTokenizer(vocab_file=vocab_file)
        model = BiLSTM(pretrain_embeddings=embeddings, freeze=True)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                               weight_decay=1e-4)
        # if a saved checkpoint is given, load it here
        if init_checkpoint is not None:
            logger.info("retraining with saved model.")
            checkpoint = torch.load(init_checkpoint, map_location='cpu')
            model.load_state_dict(checkpoint)
    elif model_type == "BERTSimple":
        logger.info("model = BERTSimple")
        tokenizer = WordLevelTokenizer(vocab_file=vocab_file)
        bert_config = BertConfig(hidden_size=300,
                                 num_hidden_layers=12,
                                 num_attention_heads=12,
                                 intermediate_size=3072,
                                 hidden_act="gelu",
                                 hidden_dropout_prob=0.1,
                                 attention_probs_dropout_prob=0.1,
                                 max_position_embeddings=512,
                                 type_vocab_size=2,
                                 initializer_range=0.02)
        if embed_file is None:
            raise ValueError("BERTSimple needs a pretrained embedding file.")
        model = BertSimpleForSequenceClassification(bert_config,
                                                    pretrain_embeddings=embeddings,
                                                    num_labels=len(label_list),
                                                    type_id_enable=True,
                                                    position_enable=True)
        if init_checkpoint is not None:
            logger.info("retraining with saved model.")
            model.bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        # instead of BERTAdam, we use Adam so that grid search can also cover bias terms
        optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                               weight_decay=1e-4)
    elif model_type == "BERTPretrain":
        logger.info("model = BERTPretrain")
        if bert_config_file is not None:
            bert_config = BertConfig.from_json_file(bert_config_file)
        else:
            # default config
            bert_config = BertConfig(hidden_size=768,
                                     num_hidden_layers=12,
                                     num_attention_heads=12,
                                     intermediate_size=3072,
                                     hidden_act="gelu",
                                     hidden_dropout_prob=0.1,
                                     attention_probs_dropout_prob=0.1,
                                     max_position_embeddings=512,
                                     type_vocab_size=2,
                                     initializer_range=0.02)
        tokenizer = FullTokenizer(vocab_file=vocab_file,
                                  do_lower_case=do_lower_case,
                                  pretrain=False)
        # Overwrite the vocab size to be exact; this also saves space in case
        # the vocabulary has been shrunk.
        bert_config.vocab_size = len(tokenizer.vocab)
        # model and optimizer
        model = BertForSequenceClassification(bert_config, len(label_list))
        if init_checkpoint is not None:
            logger.info("retraining with saved model.")
            model.bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_steps)
    elif model_type == "ContextBERT":
        logger.info("model = ContextBERT")
        # this is the model we develop
        tokenizer = FullTokenizer(vocab_file=vocab_file,
                                  do_lower_case=do_lower_case,
                                  pretrain=False)
        if bert_config_file is not None:
            bert_config = BertConfig.from_json_file(bert_config_file)
        else:
            # default config
            bert_config = BertConfig(hidden_size=768,
                                     num_hidden_layers=12,
                                     num_attention_heads=12,
                                     intermediate_size=3072,
                                     hidden_act="gelu",
                                     hidden_dropout_prob=0.1,
                                     attention_probs_dropout_prob=0.1,
                                     max_position_embeddings=512,
                                     type_vocab_size=2,
                                     initializer_range=0.02)
        # Overwrite the vocab size to be exact; this also saves space in case
        # the vocabulary has been shrunk.
        bert_config.vocab_size = len(tokenizer.vocab)
        # model and optimizer
        model = ContextAwareBertForSequenceClassification(
            bert_config, len(label_list), init_weight=True)
        if init_checkpoint is not None:
            logger.info("retraining with saved model.")
            # only load fields that are available
            if "checkpoint" in init_checkpoint:
                # load the full state dict if it is not the original Google BERT pretrain
                model.load_state_dict(torch.load(init_checkpoint, map_location='cpu'),
                                      strict=False)
            else:
                model.bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'),
                                           strict=False)
        #######################################################################
        # Instead of BERTAdam, we use Adam to be able to perform grid search on bias.
        # An earlier variant (kept below for reference) used BERTAdam with a smaller
        # learning rate for the original BERT parameters and a higher learning rate
        # for the newly added parameters:
        # orignal_bert = BertForSequenceClassification(bert_config, len(label_list))
        # original_params = []
        # exclude_params = ["classifier.weight", "classifier.bias"]
        # for params in orignal_bert.named_parameters():
        #     if params not in exclude_params:
        #         original_params.append(params[0])
        # no_decay = ['bias', 'gamma', 'beta']
        # base_params_no_decay = list(map(lambda x: x[1],
        #     list(filter(lambda kv: kv[0] in original_params
        #                 and any(nd in kv[0] for nd in no_decay),
        #                 model.named_parameters()))))
        # base_params_decay = list(map(lambda x: x[1],
        #     list(filter(lambda kv: kv[0] in original_params
        #                 and not any(nd in kv[0] for nd in no_decay),
        #                 model.named_parameters()))))
        # params = list(map(lambda x: x[1],
        #     list(filter(lambda kv: kv[0] not in original_params
        #                 or kv[0] in exclude_params,
        #                 model.named_parameters()))))
        # optimizer_parameters = [
        #     {'params': base_params_decay, 'weight_decay_rate': 0.01},
        #     {'params': base_params_no_decay, 'weight_decay_rate': 0.0},
        #     {'params': params, 'lr': learning_rate, 'weight_decay_rate': 0.01}]
        # optimizer = BERTAdam(optimizer_parameters,
        #                      lr=base_learning_rate,
        #                      warmup=warmup_proportion,
        #                      t_total=num_train_steps)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                               weight_decay=1e-4)
        #######################################################################
    elif model_type == "HeadwiseContextBERT":
        logger.info("model = HeadwiseContextBERT")
        # this is the model we develop
        tokenizer = FullTokenizer(vocab_file=vocab_file,
                                  do_lower_case=do_lower_case,
                                  pretrain=False)
        if bert_config_file is not None:
            bert_config = BertConfig.from_json_file(bert_config_file)
        else:
            # default config
            bert_config = BertConfig()
        # Overwrite the vocab size to be exact; this also saves space in case
        # the vocabulary has been shrunk.
        bert_config.vocab_size = len(tokenizer.vocab)
        # model and optimizer
        model = HeadwiseContextAwareBertForSequenceClassification(
            bert_config, len(label_list), init_weight=True)
        if init_checkpoint is not None:
            logger.info("retraining with saved model.")
            # only load fields that are available
            if "checkpoint" in init_checkpoint:
                logger.info("retraining with a checkpoint model instead.")
                # load the full state dict if it is not the original Google BERT pretrain
                model.load_state_dict(torch.load(init_checkpoint, map_location='cpu'),
                                      strict=False)
            else:
                model.bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'),
                                           strict=False)
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_steps)
    else:
        logger.info("***** Unsupported model type *****")

    return model, optimizer, tokenizer