def main():
    nb_epochs = 30
    batch_size = 200
    hidden_size = 256
    embedding_dim = 300
    max_len = 20
    teacher_forcing = 0.6
    min_count = 2
    max_grad_norm = 5
    val_len = 5000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/yelp/models' \
                     '/baseline_frozen_pretrained'
    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'

    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    print('Dataset: {}'.format(len(dataset)))

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    print('Train {}, val: {}'.format(len(dataset_train), len(dataset_val)))

    embeddings_dir = '/home/mattd/pycharm/yelp/embeddings.npy'
    embeddings = cuda(get_pretrained_embeddings(embeddings_dir, dataset))

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(embeddings, hidden_size, padding_idx, init_idx,
                         max_len, teacher_forcing)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True,
                                 weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    lowest_loss = 500

    for epoch in range(nb_epochs):
        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            for i, (inputs, targets) in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(targets)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)
                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if epoch_loss < lowest_loss:
                # save the running best checkpoint; compare and save the
                # epoch average (was `loss`, the last batch tensor)
                save_checkpoint(model, epoch_loss, optimizer, model_filename)
                lowest_loss = epoch_loss

            if phase == 'train':
                print('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss), end='')
            else:
                print(', {} loss: {:.3f}'.format(phase, epoch_loss), end='\n')

            # print a random validation sentence: input, target and prediction
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs, targets = dataset_val[random_idx]
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                print(u'> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, SentenceDataset.EOS_TOKEN)))
                print(u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, SentenceDataset.EOS_TOKEN)))
                print(u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, SentenceDataset.EOS_TOKEN)))
                print()
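
# --- Hedged sketch (not part of the original scripts) ---------------------
# The training scripts in this file call small project-local helpers
# (`cuda`, `variable`, `argmax`, `get_pretrained_embeddings`, the datasets,
# the models and the checkpoint utilities) whose definitions are not shown
# here. The snippet below is a minimal guess at what `cuda` and `variable`
# do, based only on how they are used (move tensors/modules onto the GPU
# when one is available); the real implementations may differ.

import torch


def cuda(obj):
    # move a tensor or module to the GPU if CUDA is available (assumption)
    return obj.cuda() if torch.cuda.is_available() else obj


def variable(data):
    # wrap a batch (tensor, list or ndarray) as a tensor on the right device
    if not torch.is_tensor(data):
        data = torch.tensor(data)
    return cuda(data)
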
def main():
    nb_epochs = 50
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = "/embeddings_min2_max15.npy"
    max_grad_norm = 5
    max_len = 15
    min_count = 2
    weight_decay = 0.00001
    learning_rate = 0.001

    model_group = "/auto_encoder"
    autoencoder_name = "/auto_encoder_3"
    autoencoder_version = 1
    project_file = "/home/mattd/PycharmProjects/reddit"
    dataset_path = "/home/mattd/PycharmProjects/reddit/data/"

    string = 'nb_epochs: {}\nbatch_size: {}\nhidden_size: {}\nembedding_dim: ' \
             '{}\npretrained_embeddings: {}\nmax_len: {}\nmin_count: ' \
             '{}\nweight_decay: {}\nlearning_rate: {}\nmodel_group: ' \
             '{}\nautoencoder_name: {}\nautoencoder_version: {}\n'.format(
                 nb_epochs, batch_size, hidden_size, embedding_dim,
                 pretrained_embeddings, max_len, min_count, weight_decay,
                 learning_rate, model_group, autoencoder_name,
                 autoencoder_version)
    print(string)
    output = string + '\n'

    # embedding_filename = 'embeddings_20_1.npy'
    model_filename = '{}{}s{}'.format(
        project_file, model_group, autoencoder_name)
    new_model_filename = '{}_{}'.format(model_filename, autoencoder_version)
    output_file = '{}{}_outputs{}_{}'.format(
        project_file, model_group, autoencoder_name, autoencoder_version)
    description_filename = \
        '{}/description/description_1.txt'.format(project_file)

    # eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset_train_filename = "{}train.csv".format(dataset_path)
    dataset_val_filename = "{}validation.csv".format(dataset_path)

    dataset_train = SentenceDataset(dataset_train_filename, max_len, min_count)
    dataset_val = SentenceDataset(dataset_val_filename, max_len, min_count,
                                  dataset_train.vocab)

    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    output += string + '\n'

    # getting pretrained embeddings
    if pretrained_embeddings is not None:
        embeddings_dir = '{}{}'.format(project_file, pretrained_embeddings)
        pretrained_embeddings = cuda(get_pretrained_embeddings(embeddings_dir))
        embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset_val.vocab)
    padding_idx = dataset_val.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset_val.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(hidden_size, padding_idx, init_idx, max_len,
                         vocab_size, embedding_dim, pretrained_embeddings)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(
        parameters, amsgrad=True, weight_decay=weight_decay, lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset_val.vocab[SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
        train_loss, val_loss, found_model = load_checkpoint(
            model_filename, model, optimizer)

    if found_model:
        string = 'Loaded Model:\nlowest_validation_loss: {}\ndescription: {}' \
                 '\nlast_epoch:{}\n'.format(lowest_loss, description,
                                            last_epoch)
    else:
        string = 'No model found at {}\n'.format(model_filename)
    print(string)
    output = output + string + '\n'

    outfile = open(output_file, 'w')
    outfile.write(output)
    outfile.close()

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    # how many times per epoch the running train loss is reported
    intervals = 6

    for epoch in range(last_epoch, last_epoch + nb_epochs):
        start = time.clock()

        # if epoch == 6:
        #     model.unfreeze_embeddings()
        #     parameters = list(model.parameters())
        #     optimizer = torch.optim.Adam(
        #         parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentence_accuracy = []
            epoch_token_accuracy = []
            j = 1

            for i, inputs in tqdm(enumerate(data_loader)):
                optimizer.zero_grad()

                # the autoencoder reconstructs its own input
                inputs = variable(inputs)
                targets = variable(inputs)

                outputs = model.auto_encoder(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)
                loss = criterion(outputs, targets)

                epoch_loss.append(float(loss))
                average_epoch_loss = np.mean(epoch_loss)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                    # report the running loss `intervals` times per epoch
                    if (len(data_loader) / intervals) * j <= i + 1:
                        train_loss.append(average_epoch_loss)
                        string = (
                            'Epoch {:03d} Example {:03d} | {} loss: {:.3f}'.format(
                                epoch, i, phase, average_epoch_loss))
                        print(string, end='\n')
                        output = output + string + '\n'
                        j += 1
                else:
                    predicted = torch.argmax(
                        outputs.view(-1, max_len, vocab_size), -1)
                    batch_sentence_accuracy, batch_token_accuracy = \
                        encoder_accuracy(targets.view(-1, max_len), predicted)
                    epoch_sentence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)

            if phase == 'val':
                average_epoch_sentence_accuracy = np.mean(
                    epoch_sentence_accuracy)
                average_epoch_token_accuracy = np.mean(epoch_token_accuracy)
                time_taken = time.clock() - start

                val_loss.append(average_epoch_loss)

                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, average_epoch_loss, time_taken)
                print(string, end='')
                output = output + '\n' + string + '\n'

                string = '| sentence accuracy:{:.3f}| token accuracy:{:.3f}'.format(
                    average_epoch_sentence_accuracy,
                    average_epoch_token_accuracy)
                print(string, end='\n')
                output = output + string + '\n'

                if average_epoch_loss < lowest_loss:
                    save_checkpoint(
                        model, average_epoch_loss, optimizer,
                        new_model_filename, description_filename, epoch,
                        train_loss, val_loss)
                    lowest_loss = average_epoch_loss

                # print a random validation sentence: input, target, prediction
                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model.auto_encoder(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}\n'.format(get_sentence_from_indices(
                    inputs, dataset_val.vocab, SentenceDataset.EOS_TOKEN))
                string = string + u'= {}\n'.format(get_sentence_from_indices(
                    targets, dataset_val.vocab, SentenceDataset.EOS_TOKEN))
                string = string + u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset_val.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                output = output + string + '\n' + '\n'

        outfile = open(output_file, 'w')
        outfile.write(output)
        outfile.close()
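
# --- Hedged sketch (not part of the original script) ----------------------
# `encoder_accuracy` is only used above as
#     encoder_accuracy(targets.view(-1, max_len), predicted)
# with two (batch, max_len) index tensors and two scalar returns. A
# plausible implementation, assuming padding tokens are not masked out,
# is sketched below; the real helper in this repo may differ.

import torch


def encoder_accuracy(targets, predicted):
    # element-wise matches as floats: 1.0 where the token is reproduced
    matches = (targets == predicted).float()
    # token accuracy: fraction of individual tokens reproduced correctly
    token_accuracy = matches.mean()
    # sentence accuracy: fraction of sequences reproduced exactly
    # (min over the sequence is 1 only if every token matched)
    sentence_accuracy = matches.min(dim=1)[0].mean()
    return float(sentence_accuracy), float(token_accuracy)
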
def main():
    file = {
        "model_group": "/seq_len_exp",
        "model_name": "/generation_6",
        "old_model_name": None,
        "model_version": 0,
        "project_file": "/home/mattd/PycharmProjects/reddit/generation"
    }
    file["dataset_path"] = "{}/data/".format(file["project_file"])
    file["model_filename"] = '{}{}s{}_{}'.format(
        file["project_file"], file["model_group"], file["model_name"],
        file["model_version"])
    file["output_file"] = '{}{}_outputs{}_{}'.format(
        file["project_file"], file["model_group"], file["model_name"],
        file["model_version"])
    # check_files(file)

    use_old_model = file["old_model_name"] is not None
    params = {}
    if use_old_model:
        file["old_model_filename"] = '{}{}s{}'.format(
            file["project_file"], file["model_group"], file["old_model_name"])
        params, old_files = load_params(file["old_model_filename"])
        use_old_model = old_files != {}

    if not use_old_model:
        params = {
            "batch_size": 1000,
            "hidden_size": 256,
            "embedding_dim": 300,
            "pretrained_embeddings": True,
            "max_grad_norm": 5,
            "max_len": 30,
            "min_count": 2,
            "weight_decay": 0.00001,
            "learning_rate": 0.005,
        }

    params["num_training_examples"] = 78260
    params["num_val_examples"] = -1
    params["nb_epochs"] = 40

    if params["pretrained_embeddings"]:
        file["pretrained_embeddings_file"] = \
            "/embeddings/embeddings_min{}_max{}.npy".format(
                params["min_count"], params["max_len"])

    string = ""
    for k, v in file.items():
        string += "{}: {}\n".format(k, v)
    for k, v in params.items():
        string += "{}: {}\n".format(k, v)
    print(string)
    output = string + '\n'

    # eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset_train_filename = "{}train.csv".format(file["dataset_path"])
    dataset_val_filename = "{}validation.csv".format(file["dataset_path"])

    dataset_train = PairsDataset(
        dataset_train_filename, params["max_len"], params["min_count"])
    dataset_val = PairsDataset(
        dataset_val_filename, params["max_len"], params["min_count"],
        dataset_train.vocab)

    string = 'Vocab size {}\n'.format(len(dataset_train.vocab))
    string += 'Train {} '.format(len(dataset_train))
    if params["num_training_examples"] != -1:
        dataset_train.prune_examples(params["num_training_examples"])
        string += '> {}'.format(len(dataset_train))
    string += '\nVal: {}'.format(len(dataset_val))
    if params["num_val_examples"] != -1:
        dataset_val.prune_examples(params["num_val_examples"])
        string += '-> {}'.format(len(dataset_val))
    print(string)
    output += string + '\n'

    if params["pretrained_embeddings"]:
        embeddings_dir = '{}{}'.format(
            file["project_file"], file["pretrained_embeddings_file"])
        pretrained_embeddings = cuda(get_pretrained_embeddings(embeddings_dir))
        params["embedding_dim"] = pretrained_embeddings.shape[1]
    else:
        pretrained_embeddings = None

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, params["batch_size"], shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, params["batch_size"], shuffle=False)

    vocab_size = len(dataset_train.vocab)
    padding_idx = dataset_train.vocab[PairsDataset.PAD_TOKEN]
    init_idx = dataset_train.vocab[PairsDataset.INIT_TOKEN]

    model = Seq2SeqModel(params["hidden_size"], padding_idx, init_idx,
                         params["max_len"], vocab_size,
                         params["embedding_dim"], pretrained_embeddings)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(
        parameters, amsgrad=True, weight_decay=params["weight_decay"],
        lr=params["learning_rate"])
    criterion = torch.nn.CrossEntropyLoss()

    if use_old_model:
        model, optimizer = load_checkpoint(
            file["old_model_filename"], model, optimizer)

    lowest_loss = 100
    train_loss = []
    val_loss = []
    best_model = model
    best_optimizer = optimizer
    best_epoch = 0
    average_epoch_loss = 0
    metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

    outfile = open(file["output_file"], 'w')
    outfile.write(output)
    outfile.close()

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    # how many times per epoch the running train loss is reported
    intervals = 2
    highest_acc = 0

    for epoch in range(0, params["nb_epochs"]):
        start = time.clock()

        string = 'Epoch: {}\n'.format(epoch)
        print(string, end='')
        output = output + '\n' + string

        # if epoch == 6:
        #     model.unfreeze_embeddings()
        #     parameters = list(model.parameters())
        #     optimizer = torch.optim.Adam(
        #         parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
                string = 'Train: \n'
            else:
                model.eval()
                string = 'Validation \n'
            print(string, end='')
            output = output + '\n' + string

            epoch_loss = []
            epoch_accuracy = []
            epoch_precision = []
            epoch_recall = []
            epoch_f1 = []
            j = 1

            for i, (sentence_1, sentence_2, labels) in tqdm(
                    enumerate(data_loader)):
                optimizer.zero_grad()

                sentence_1 = variable(sentence_1)
                sentence_2 = variable(sentence_2)
                targets = variable(labels)

                outputs = model(sentence_1, sentence_2, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)
                loss = criterion(outputs, targets)

                epoch_loss.append(float(loss))
                average_epoch_loss = np.mean(epoch_loss)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        parameters, params["max_grad_norm"])
                    optimizer.step()

                    # report the running loss `intervals` times per epoch
                    if (len(data_loader) / intervals) * j <= i + 1:
                        string = ('Example {:03d} | {} loss: {:.3f}'.format(
                            i, phase, average_epoch_loss))
                        print(string, end='\n')
                        output = output + string + '\n'
                        j += 1
                else:
                    # get result metrics
                    accuracy, precision, recall, f1 = classifier_accuracy(
                        targets.cpu().numpy(),
                        torch.argmax(outputs.view(-1, 2), -1).cpu().numpy())
                    # print('{},{},{},{}'.format(accuracy, precision, recall,
                    #                            f1))
                    epoch_accuracy.append(accuracy)
                    epoch_precision.append(precision)
                    epoch_recall.append(recall)
                    epoch_f1.append(f1)

            if phase == 'val':
                time_taken = time.clock() - start

                val_loss.append(average_epoch_loss)

                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, average_epoch_loss, time_taken)
                string += ' | lowest loss: {:.3f} highest accuracy:' \
                          ' {:.3f}'.format(lowest_loss, highest_acc)
                print(string, end='\n')
                output = output + '\n' + string + '\n'

                average_epoch_accuracy = np.mean(epoch_accuracy)
                average_epoch_precision = np.mean(epoch_precision)
                average_epoch_recall = np.mean(epoch_recall)
                average_epoch_f1 = np.mean(epoch_f1)

                metrics["accuracy"].append(average_epoch_accuracy)
                metrics["precision"].append(average_epoch_precision)
                metrics["recall"].append(average_epoch_recall)
                metrics["f1"].append(average_epoch_f1)

                if average_epoch_loss < lowest_loss:
                    best_model = model
                    best_optimizer = optimizer
                    best_epoch = epoch
                    lowest_loss = average_epoch_loss

                # checkpoint both the best and the current state every epoch
                save_checkpoint(best_epoch, best_model, best_optimizer, epoch,
                                model, optimizer, train_loss, val_loss,
                                metrics, params, file)

                if average_epoch_accuracy > highest_acc:
                    highest_acc = average_epoch_accuracy

                string = "Accuracy: {:.3f}\nPrecision: {:.3f}\nRecall: {:.3f}\n" \
                         "F1: {:.3f}\n".format(
                             average_epoch_accuracy, average_epoch_precision,
                             average_epoch_recall, average_epoch_f1)
                print(string, end='\n')
                output = output + string + '\n'

                # print a random validation pair with the predicted label
                random_idx = np.random.randint(len(dataset_val))
                sentence_1, sentence_2, labels = dataset_val[random_idx]
                targets = labels
                sentence_1_var = variable(sentence_1)
                sentence_2_var = variable(sentence_2)

                # unsqueeze to get the batch dimension
                outputs_var = model(sentence_1_var.unsqueeze(0),
                                    sentence_2_var.unsqueeze(0))
                outputs = outputs_var.squeeze(0).data.cpu().numpy()

                string = '> {}\n'.format(get_sentence_from_indices(
                    sentence_1, dataset_val.vocab, PairsDataset.EOS_TOKEN))
                string = string + u'> {}\n'.format(get_sentence_from_indices(
                    sentence_2, dataset_val.vocab, PairsDataset.EOS_TOKEN))
                string = string + u'target:{}| P false:{:.3f}, P true:' \
                                  u' {:.3f}'.format(targets,
                                                    float(outputs[0]),
                                                    float(outputs[1]))
                print(string, end='\n\n')
                output = output + string + '\n' + '\n'
            else:
                train_loss.append(average_epoch_loss)

        outfile = open(file["output_file"], 'w')
        outfile.write(output)
        outfile.close()
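
# --- Hedged sketch (not part of the original script) ----------------------
# `classifier_accuracy` is called above with two flat numpy arrays (gold
# labels and argmax predictions over two classes) and returns accuracy,
# precision, recall and F1. One plausible implementation, assuming
# scikit-learn is available, is:

from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score)


def classifier_accuracy(targets, predicted):
    # all four metrics are computed on flat arrays of class indices
    return (accuracy_score(targets, predicted),
            precision_score(targets, predicted),
            recall_score(targets, predicted),
            f1_score(targets, predicted))
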
def main():
    nb_epochs = 100
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = None
    max_len = 20
    min_count = 2
    max_grad_norm = 5
    val_len = 10000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/encoder/models3' \
                     '/Baseline'
    description_filename = \
        '/home/mattd/pycharm/encoder/description/description2.txt'
    output_file = '/home/mattd/pycharm/encoder/model_outputs_3/baseline'
    outfile = open(output_file, 'w')

    eng_fr_filename = '/home/okovaleva/projects/forced_apart/autoencoder/data' \
                      '/train_1M.txt'
    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)

    string = 'Dataset: {}'.format(len(dataset))
    print(string)
    outfile.write(string + '\n')

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    outfile.write(string + '\n')

    embeddings_dir = '/home/mattd/pycharm/encoder' \
                     '/embeddings_3min.npy'
    pretrained_embeddings = cuda(
        get_pretrained_embeddings(embeddings_dir, dataset))
    embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(
        pretrained_embeddings, hidden_size, padding_idx, init_idx, max_len,
        vocab_size, embedding_dim)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True,
                                 weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
        train_loss, val_loss = load_checkpoint(model_filename, model,
                                               optimizer)
    print(description)

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    for epoch in range(last_epoch, last_epoch + nb_epochs):
        start = time.clock()

        # if epoch == 6:
        #     model.unfreeze_embeddings()
        #     parameters = list(model.parameters())
        #     optimizer = torch.optim.Adam(
        #         parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentence_accuracy = []
            epoch_token_accuracy = []

            for i, inputs in enumerate(data_loader):
                optimizer.zero_grad()

                # the autoencoder reconstructs its own input
                inputs = variable(inputs)
                targets = variable(inputs)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)
                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                if phase == 'val':
                    predicted = torch.argmax(
                        outputs.view(batch_size, max_len, -1), -1)
                    batch_sentence_accuracy, batch_token_accuracy = accuracy(
                        targets.view(batch_size, -1), predicted)
                    epoch_sentence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if phase == 'train':
                train_loss.append(epoch_loss)
                string = ('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss))
                print(string, end='\n')
                outfile.write(string + '\n')
            else:
                average_epoch_sentence_accuracy = \
                    sum(epoch_sentence_accuracy) / len(epoch_sentence_accuracy)
                average_epoch_token_accuracy = \
                    sum(epoch_token_accuracy) / len(epoch_token_accuracy)
                time_taken = time.clock() - start

                val_loss.append(epoch_loss)

                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, epoch_loss, time_taken)
                print(string, end='')
                string = '| sentence accuracy:{:.3f}| token accuracy:{:.3f}'.format(
                    average_epoch_sentence_accuracy,
                    average_epoch_token_accuracy)
                print(string, end='\n')
                outfile.write(string + '\n')

            if epoch_loss < lowest_loss:
                save_checkpoint(
                    model, epoch_loss, optimizer, model_filename,
                    description_filename, epoch, train_loss, val_loss)
                lowest_loss = epoch_loss

            # print random sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string + '\n')
                string = u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string + '\n')
                string = u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string + '\n')
                print()

    outfile.close()
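
# --- Hedged sketch (not part of the original script) ----------------------
# `get_sentence_from_indices(indices, vocab, eos_token)` is used in every
# script above to turn a sequence of token ids back into text. A minimal
# sketch is shown below; the `id2token` reverse mapping on the vocab object
# is an assumption (the real vocabulary class in this repo may expose a
# different attribute for index-to-token lookup).


def get_sentence_from_indices(indices, vocab, eos_token):
    tokens = []
    for idx in indices:
        token = vocab.id2token[int(idx)]  # assumed reverse lookup
        if token == eos_token:
            break  # stop at the end-of-sentence marker
        tokens.append(token)
    return ' '.join(tokens)
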
def main():
    nb_epochs = 30
    # nb_epochs = 1
    batch_size = 64
    hidden_size = 256
    embedding_dim = 300
    max_len = 20
    teacher_forcing = 0.6
    min_count = 2
    max_grad_norm = 5
    val_len = 5000
    weight_decay = 0.00001
    eng_fr_filename = './data/eng-fra.txt'

    dataset = TSVSentencePairDataset(eng_fr_filename, max_len, min_count)
    print('Dataset: {}'.format(len(dataset)))

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    print('Train {}, val: {}'.format(len(dataset_train), len(dataset_val)))

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[TSVSentencePairDataset.PAD_TOKEN]
    init_idx = dataset.vocab[TSVSentencePairDataset.INIT_TOKEN]

    model = Seq2SeqModel(vocab_size, embedding_dim, hidden_size, padding_idx,
                         init_idx, max_len, teacher_forcing)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[TSVSentencePairDataset.PAD_TOKEN])

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    for epoch in range(nb_epochs):
        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            for i, (inputs, targets) in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(targets)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)
                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    # clip_grad_norm was renamed clip_grad_norm_ in PyTorch 0.4
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if phase == 'train':
                print('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss), end='')
            else:
                print(', {} loss: {:.3f}'.format(phase, epoch_loss), end='\n')

            # print random sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs, targets = dataset_val[random_idx]
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model(inputs_var.unsqueeze(0))

                # sample the output tokens instead of taking the argmax:
                # squeeze the batch dim and softmax over the vocabulary so
                # multinomial receives a (max_len, vocab_size) distribution
                # outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()
                softmax = torch.nn.Softmax(dim=-1)
                outputs = softmax(outputs_var.squeeze(0))
                outputs = torch.multinomial(outputs, 1).data.view(-1)

                print(u'> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, TSVSentencePairDataset.EOS_TOKEN)))
                print(u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, TSVSentencePairDataset.EOS_TOKEN)))
                print(u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, TSVSentencePairDataset.EOS_TOKEN)))
                print()
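
# --- Hedged sketch (not part of the original script) ----------------------
# The other scripts in this file decode greedily through an `argmax` helper
# (commented out above in favour of multinomial sampling). Given how it is
# called, argmax(outputs_var).squeeze(0), it presumably reduces the last
# (vocabulary) dimension of the logits to token ids; a minimal version,
# assuming that shape convention, would be:

import torch


def argmax(outputs):
    # outputs: (batch, max_len, vocab_size) logits -> (batch, max_len) ids
    return torch.argmax(outputs, dim=-1)
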