def get_training_setting():
    with open('vocab.pkl', 'rb') as file:
        vocab = pkl.load(file)
    reverse_vocab = {v: k for k, v in vocab.items()}

    emsize = 40    # embedding dimension
    nhid = 40      # dimension of the feedforward network in nn.TransformerEncoder
    nlayers = 3    # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 4      # number of heads in the multi-head attention models
    dropout = 0.2  # dropout value
    model = TransformerModel(emsize, len(vocab), nhead, nhid, nlayers,
                             dropout).to(device)

    lr = 1e-2  # learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)

    dataloader = Data.DataLoader(
        dataset=SentenceDataset('train.csv', vocab),
        batch_size=64,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=4
    )
    val_data = Data.DataLoader(
        dataset=SentenceDataset('valid.csv', vocab),
        batch_size=64,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=4
    )
    return model, scheduler, optimizer, dataloader, val_data, reverse_vocab
def main():
    max_len = 15
    min_count = 2
    embeddings_dir = '/home/mattd/embeddings/reddit_2/'
    #dataset_path = '/home/mattd/datasets/AskReddit/'
    dataset_path = "/home/mattd/PycharmProjects/reddit/generation/data/"
    dataset_train_filename = "{}train.csv".format(dataset_path)
    dataset_val_filename = "{}validation.csv".format(dataset_path)
    save_dir = "/home/mattd/PycharmProjects/reddit/generation/embeddings/"

    dataset_train = SentenceDataset(dataset_train_filename, max_len, min_count)
    dataset_val = SentenceDataset(dataset_val_filename, max_len, min_count,
                                  dataset_train.vocab)
    #dataset.add_file(eng_fr_filename2)

    vectors = embeddings.load_from_dir(embeddings_dir)
    #emb = embeddings.load_from_dir(embeddings_dir)

    embs_matrix = np.zeros((len(dataset_val.vocab), len(vectors.matrix[0])))
    for i, token in enumerate(dataset_val.vocab.token2id):
        if vectors.has_word(token):
            embs_matrix[i] = vectors.get_vector(token)

    np.save('{}embeddings_min{}_max{}'.format(save_dir, min_count, max_len),
            embs_matrix)
def main() -> None:
    tokenizer = Tokenizer(args.vocab_file)
    vocabulary_size = len(tokenizer)

    dataset = SentenceDataset(args.input_file, tokenizer=tokenizer.encode)
    loader = DataLoader(dataset, args.batch_size, shuffle=False,
                        collate_fn=dataset.collate_fn, drop_last=False)

    searcher = BeamSearch(tokenizer.eos_index, beam_size=args.search_width)

    model = VAE(
        num_embeddings=len(tokenizer),
        dim_embedding=args.dim_embedding,
        dim_hidden=args.dim_hidden,
        dim_latent=args.dim_latent,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,
        dropout=0.,
        word_dropout=0.,
        dropped_index=tokenizer.unk_index,
    ).to(device)
    model.load_state_dict(torch.load(args.checkpoint_file, map_location=device))
    model.eval()

    print('Generating sentences...')
    all_hypotheses = []
    with torch.no_grad():
        for s in tqdm(loader):
            s = s.to(device)
            length = torch.sum(s != tokenizer.pad_index, dim=-1)
            bsz = s.shape[0]
            mean, logvar = model.encode(s, length)
            # z = model.reparameterize(mean, logvar)
            z = mean
            hidden = model.fc_hidden(z)
            hidden = hidden.view(bsz, -1, model.dim_hidden).transpose(0, 1).contiguous()
            start_predictions = torch.zeros(bsz, device=device).fill_(
                tokenizer.bos_index).long()
            start_state = {'hidden': hidden.permute(1, 0, 2)}
            predictions, log_probabilities = searcher.search(
                start_predictions, start_state, model.step)
            for preds in predictions:
                tokens = preds[0]
                tokens = tokens[tokens != tokenizer.eos_index].tolist()
                all_hypotheses.append(tokenizer.decode(tokens))
    print('Done')

    with open(args.output_file, 'w') as f:
        f.write('\n'.join(all_hypotheses))
def main(_):
    train_path = FLAGS.train_path
    val_path = FLAGS.val_path
    vocab_path = FLAGS.vocab_path
    train_batch_size = FLAGS.train_batch_size
    val_batch_size = FLAGS.val_batch_size
    save_model = FLAGS.save_model
    embed_size = FLAGS.embed_size
    hidden_size = FLAGS.hidden_size
    lr = FLAGS.lr
    epochs = FLAGS.epochs
    save_every = FLAGS.save_every
    display_every = FLAGS.display_every
    device = torch.device('cuda:0' if FLAGS.device == 'cuda' else 'cpu')

    vocab = Vocabulary.load(vocab_path)

    if FLAGS.load_model:
        load_model = FLAGS.load_model
        model = Paraphraser.load(load_model, device)
    else:
        model = Paraphraser(embed_size, hidden_size, vocab, device)
        # uniformly initialize the parameters
        for param in model.parameters():
            param.data.uniform_(-0.1, 0.1)

    train_data_source, train_data_target = read(train_path)
    val_data_source, val_data_target = read(val_path)

    train_dataset = SentenceDataset(train_data_source, train_data_target, vocab)
    train_loader = torch.utils.data.DataLoader(train_dataset, train_batch_size,
                                               shuffle=True)
    val_dataset = SentenceDataset(val_data_source, val_data_target, vocab)
    val_loader = torch.utils.data.DataLoader(val_dataset, val_batch_size)

    optimizer = torch.optim.Adam(model.parameters(), lr)

    print('Started training... ')
    train(model, vocab, train_loader, val_loader, optimizer, embed_size,
          hidden_size, epochs, save_model, save_every, display_every, device)

    convert_onnx(model.load(save_model, device), val_loader, vocab)
def main():
    embeddings_dir = '/mnt/data1/embeddings/crawl/'
    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'

    dataset = SentenceDataset(eng_fr_filename, 20, 2)
    emb = embeddings.load_from_dir(embeddings_dir)

    vocab_embs = np.zeros((len(dataset.vocab), emb.matrix.shape[1]))
    for i, token in enumerate(dataset.vocab.token2id):
        if emb.has_word(token):
            vocab_embs[i] = emb.get_vector(token)

    np.save('embeddings', vocab_embs)
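The matrix saved above can later be fed back into a model as a frozen embedding layer. The snippet below is a minimal sketch, assuming the `embeddings.npy` file produced by this function and the same vocabulary ordering; it is not part of the original script.

import numpy as np
import torch

# Load the matrix written by main() above; row i corresponds to the i-th token
# in dataset.vocab.token2id, i.e. the same ordering used when it was built.
weights = torch.from_numpy(np.load('embeddings.npy')).float()

# Wrap it in an nn.Embedding whose weights stay fixed during training.
embedding_layer = torch.nn.Embedding.from_pretrained(weights, freeze=True)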
def main():
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = None
    max_len = 20
    min_count = 2
    max_grad_norm = 5
    val_len = 10000
    weight_decay = 0.00001
    model_filename_1 = '/home/mattd/pycharm/encoder/models3/Baseline'
    model_filename_2 = '/home/mattd/pycharm/encoder/models3/Attention'
    eng_fr_filename = ('/home/okovaleva/projects/forced_apart/autoencoder/data'
                       '/train_1M.txt')

    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(pretrained_embeddings, hidden_size, padding_idx,
                         init_idx, max_len, vocab_size, embedding_dim)
    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True,
                                 weight_decay=weight_decay)
    model, optimizer, lowest_loss, description, last_epoch, \
        train_loss_1, val_loss_1 = load_checkpoint(model_filename_1, model,
                                                   optimizer)

    model = Seq2SeqModelAttention(pretrained_embeddings, hidden_size,
                                  padding_idx, init_idx, max_len, vocab_size,
                                  embedding_dim)
    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True,
                                 weight_decay=weight_decay)
    model, optimizer, lowest_loss, description, last_epoch, \
        train_loss_2, val_loss_2 = load_checkpoint(model_filename_2, model,
                                                   optimizer)

    plot_data(train_loss_1, val_loss_1)
    plot_data(train_loss_2, val_loss_2)
def do_test(model_ckpt_path, test_data_path, result_path, word_dict_path,
            emb_dim, hid_dim):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    word_dict = load_pkl(word_dict_path)
    PAD_IDX = word_dict.word2idx("<PAD>")

    print("load data...")
    testData = SentenceDataset(load_pkl(test_data_path), word_dict, PAD_IDX,
                               training=False)

    print("load model...")
    model = get_model(word_dict.get_len(), word_dict.get_len(), emb_dim,
                      hid_dim, device)
    model.load_state_dict(torch.load(model_ckpt_path))
    model.to(device)

    print("predicting...")
    make_prediction(model, testData, word_dict, result_path, device)
def main():
    embeddings_dir = ('/home/okovaleva/projects/forced_apart/autoencoder/data'
                      '/w2vec.pkl')
    eng_fr_filename = ('/home/okovaleva/projects/forced_apart/autoencoder/data'
                       '/train.txt')
    eng_fr_filename2 = ('/home/okovaleva/projects/forced_apart/autoencoder/data'
                        '/test.txt')

    dataset = SentenceDataset(eng_fr_filename, 20, 2)
    #dataset.add_file(eng_fr_filename2)
    dataset.vocab.prune_vocab(2)

    vectors = get_vectors(embeddings_dir)
    #emb = embeddings.load_from_dir(embeddings_dir)

    embs_matrix = np.zeros((len(dataset.vocab), vectors['r'].size))
    for i, token in enumerate(dataset.vocab.token2id):
        if token in vectors:
            embs_matrix[i] = vectors[token]

    np.save('embeddings_2min', embs_matrix)
def __init__(self):
    self._dataset = SentenceDataset()
    self.sentence_dataset_names = ["MSRvid", "SmartTextile", "MTurk",
                                   "environment"]
class SentenceSimEvaluation(WordSimEvaluation):
    def __init__(self):
        self._dataset = SentenceDataset()
        self.sentence_dataset_names = ["MSRvid", "SmartTextile", "MTurk",
                                       "environment"]

    def evaluate_all_sentence_datasets(self, display_table=True, **kwargs):
        cors = [self.evaluate_sentence_similarity(name, **kwargs)
                for name in self.sentence_dataset_names]
        if display_table:
            df_wpath = pd.DataFrame([cors], index=["wpath"],
                                    columns=self.sentence_dataset_names)
            return display(df_wpath)
        return cors

    def evaluate_sentence_similarity(self, dataset_name="MSRvid",
                                     metric="wpath_graph", relatedness=True,
                                     save_results=False, database="wikidata"):
        concepts, cc, texts = get_ideas_in_format(dataset_name, database=database)
        KG = DAC(concepts=concepts, dataset=dataset_name,
                 relatedness=relatedness, database=database)
        if len(KG.graph) == 0:
            print("start building knowledge graph")
            KG.build_nx_graph()
        ConSim = ConceptSimilarity(KG)
        sim_M = ConSim.similarityMatrix(lcs_pref_value="freq1", metric=metric)
        WMD = WordMoversSimilarity(sim_M, KG._concepts)

        sen_pairs, human_sim = \
            self._dataset.load_sentence_pairs_and_similarities(dataset_name)
        sim_values = []
        map_sen2bow = dict(zip(texts, [[c["id"] for c in bow] for bow in cc]))
        pg, total_len = 0, len(sen_pairs)
        remove_index = []
        for sen1, sen2 in sen_pairs:
            show_progression(pg, total_len)
            bow1 = list(set(map_sen2bow[sen1]) & set(KG._concepts))
            bow2 = list(set(map_sen2bow[sen2]) & set(KG._concepts))
            sim_value = WMD.word_mover_distance(bow1, bow2)
            if sim_value is None:
                print(sen1, sen2)
                remove_index.append(pg)
            else:
                sim_values.append(sim_value)
            pg = pg + 1

        human_sim = np.delete(human_sim, remove_index)
        cor = pearsonr(sim_values, human_sim)[0]
        if save_results:
            results = list(zip([round(x, 3) for x in sim_values], sen_pairs))
            self._dataset.save_dataset(
                dict(zip(("correlation", "similarities"), (cor, results))),
                dataset_name + "_" + metric)
        return cor

    def compute_concept_sentence_M(self, dataset_name="gold",
                                   database="wikidata", metric="wpath",
                                   lcs_pref_value="freq1", relatedness=True):
        concepts, cc, texts = get_ideas_in_format(dataset_name, database=database)
        bows = [[c["id"] for c in bow] for bow in cc]
        KG = DAC(concepts=concepts, dataset=dataset_name,
                 relatedness=relatedness, database=database)
        if len(KG.graph) == 0:
            print("start building knowledge graph")
            KG.build_nx_graph()
        ConSim = ConceptSimilarity(KG)
        sim_M = ConSim.similarityMatrix(lcs_pref_value="freq1", metric=metric)
        WMD = WordMoversSimilarity(sim_M, KG._concepts)
        ideaM = WMD.sentenceSimilarityMatrix(bows)
        con2ideaM = WMD.concepts2sentenceSIM(bows)
        SIM_data = {
            "concepts": KG._concepts,
            "ideas": texts,
            "conceptSIM": sim_M.tolist(),
            "ideaSIM": ideaM.tolist(),
            "concept2ideaSIM": con2ideaM.tolist()
        }
        print(len(SIM_data["concepts"]) == len(SIM_data["conceptSIM"]))
        self._dataset.save_dataset(SIM_data, dataset_name)
        return SIM_data
def main():
    logger = logging.getLogger(__name__)
    handler1 = logging.StreamHandler()
    handler1.setLevel(logging.INFO)
    handler2 = logging.FileHandler(filename=args.log_file, mode='w')
    handler2.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)8s %(message)s"))
    handler2.setLevel(logging.INFO)
    logger.setLevel(logging.INFO)
    logger.addHandler(handler1)
    logger.addHandler(handler2)

    tokenizer = Tokenizer(args.vocab_file)
    train_dataset = SentenceDataset(args.train_file, tokenizer.encode)
    valid_dataset = SentenceDataset(args.valid_file, tokenizer.encode)
    train_loader = DataLoader(train_dataset, args.batch_size, shuffle=True,
                              collate_fn=train_dataset.collate_fn,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset, args.batch_size, shuffle=False,
                              collate_fn=valid_dataset.collate_fn,
                              drop_last=True)

    model = VAE(
        num_embeddings=len(tokenizer),
        dim_embedding=args.dim_embedding,
        dim_hidden=args.dim_hidden,
        dim_latent=args.dim_latent,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional,
        dropout=args.dropout,
        word_dropout=args.word_dropout,
        dropped_index=tokenizer.unk_index,
    ).to(device)
    annealer = KLAnnealer(x0=args.x0, k=args.k)
    criterion = LmCrossEntropyLoss(tokenizer.pad_index, reduction='batchmean')
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 betas=(0.9, 0.98), eps=1e-09)

    logger.info('Start training')
    for epoch in range(args.num_epochs):
        train_loss, train_ce_loss, train_kl_loss = 0., 0., 0.
        valid_loss, valid_ce_loss, valid_kl_loss = 0., 0., 0.
        pbar = tqdm(train_loader)
        pbar.set_description("[Epoch %d/%d]" % (epoch, args.num_epochs))

        # Train
        model.train()
        for itr, s in enumerate(pbar):
            beta = annealer()
            s = s.to(device)
            length = torch.sum(s != tokenizer.pad_index, dim=-1)
            output, mean, logvar, z = model(s, length)
            ce_loss = criterion(output[:, :-1, :], s[:, 1:])
            kl_loss = -0.5 * torch.mean(
                torch.sum(1 + logvar - mean.pow(2) - logvar.exp(), dim=-1))
            loss = ce_loss + beta * kl_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            annealer.step()
            train_loss += loss.item()
            train_ce_loss += ce_loss.item()
            train_kl_loss += kl_loss.item()
            if itr % args.print_every == 0:
                pbar.set_postfix(loss=train_loss / (itr + 1), beta=beta)
        train_loss /= len(train_loader)
        train_ce_loss /= len(train_loader)
        train_kl_loss /= len(train_loader)

        # Valid
        model.eval()
        with torch.no_grad():
            for s in valid_loader:
                beta = annealer()
                s = s.to(device)
                length = torch.sum(s != tokenizer.pad_index, dim=-1)
                output, mean, logvar, z = model(s, length)
                ce_loss = criterion(output[:, :-1, :], s[:, 1:])
                kl_loss = -0.5 * torch.mean(
                    torch.sum(1 + logvar - mean.pow(2) - logvar.exp(), dim=-1))
                loss = ce_loss + beta * kl_loss
                valid_loss += loss.item()
                valid_ce_loss += ce_loss.item()
                valid_kl_loss += kl_loss.item()
        valid_loss /= len(valid_loader)
        valid_ce_loss /= len(valid_loader)
        valid_kl_loss /= len(valid_loader)

        logger.info(
            '[Epoch %d/%d] Training loss: %.2f, CE loss: %.2f, KL loss: %.2f, '
            'Validation loss: %.2f, CE loss: %.2f, KL loss: %.2f' % (
                epoch, args.num_epochs,
                train_loss, train_ce_loss, train_kl_loss,
                valid_loss, valid_ce_loss, valid_kl_loss,
            ))

        torch.save(model.state_dict(), args.checkpoint_file)
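The KLAnnealer class itself is not part of this listing. Its usage above (calling `annealer()` for the current beta each batch and `annealer.step()` after each optimizer update) is consistent with a logistic KL-weight schedule; the sketch below is an assumption about that behaviour, not the project's actual implementation.

import math

class KLAnnealerSketch:
    """Hypothetical logistic KL-weight schedule: beta rises from ~0 towards 1
    around training step x0, with the slope controlled by k."""

    def __init__(self, x0, k):
        self.x0 = x0
        self.k = k
        self._step = 0

    def __call__(self):
        # Current KL weight beta for the loss: ce_loss + beta * kl_loss.
        return 1.0 / (1.0 + math.exp(-self.k * (self._step - self.x0)))

    def step(self):
        # Advance the schedule; called once per training batch above.
        self._step += 1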
def main():
    nb_epochs = 30
    batch_size = 200
    hidden_size = 256
    embedding_dim = 300
    max_len = 20
    teacher_forcing = 0.6
    min_count = 2
    max_grad_norm = 5
    val_len = 5000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/yelp/models/baseline_frozen_pretrained'
    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'

    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    print('Dataset: {}'.format(len(dataset)))

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    print('Train {}, val: {}'.format(len(dataset_train), len(dataset_val)))

    embeddings_dir = '/home/mattd/pycharm/yelp/embeddings.npy'
    embeddings = cuda(get_pretrained_embeddings(embeddings_dir, dataset))

    data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size,
                                                    shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size,
                                                  shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(embeddings, hidden_size, padding_idx, init_idx,
                         max_len, teacher_forcing)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True,
                                 weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    lowest_loss = 500

    for epoch in range(nb_epochs):
        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            for i, (inputs, targets) in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(targets)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if epoch_loss < lowest_loss:
                save_checkpoint(model, loss, optimizer, model_filename)
                lowest_loss = epoch_loss

            if phase == 'train':
                print('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss), end='')
            else:
                print(', {} loss: {:.3f}'.format(phase, epoch_loss), end='\n')

            # print a random validation sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs, targets = dataset_val[random_idx]
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                print(u'> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, SentenceDataset.EOS_TOKEN)))
                print(u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, SentenceDataset.EOS_TOKEN)))
                print(u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, SentenceDataset.EOS_TOKEN)))
                print()
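The `cuda()` and `variable()` helpers used in this script and the next one are not shown in the listing. A minimal sketch of what they are assumed to do, in the pre-0.4 PyTorch style these scripts follow:

import torch
from torch.autograd import Variable

def cuda(obj):
    # Assumed helper: move a tensor or module to the GPU when one is available.
    return obj.cuda() if torch.cuda.is_available() else obj

def variable(tensor):
    # Assumed helper: wrap a tensor for autograd. Variable is a no-op wrapper
    # in modern PyTorch, kept here only to mirror the calls above.
    return cuda(Variable(tensor))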
def main():
    nb_epochs = 100
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = None
    max_len = 20
    min_count = 2
    max_grad_norm = 5
    val_len = 10000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/encoder/models3/Baseline'
    description_filename = \
        '/home/mattd/pycharm/encoder/description/description2.txt'
    output_file = '/home/mattd/pycharm/encoder/model_outputs_3/baseline'
    outfile = open(output_file, 'w')
    eng_fr_filename = ('/home/okovaleva/projects/forced_apart/autoencoder/data'
                       '/train_1M.txt')

    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    string = 'Dataset: {}'.format(len(dataset))
    print(string)
    outfile.write(string + '\n')

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    outfile.write(string + '\n')

    embeddings_dir = '/home/mattd/pycharm/encoder/embeddings_3min.npy'
    pretrained_embeddings = cuda(
        get_pretrained_embeddings(embeddings_dir, dataset))
    embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size,
                                                    shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size,
                                                  shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(pretrained_embeddings, hidden_size, padding_idx,
                         init_idx, max_len, vocab_size, embedding_dim)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True,
                                 weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
        train_loss, val_loss = load_checkpoint(model_filename, model, optimizer)
    print(description)

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    for epoch in range(last_epoch, last_epoch + nb_epochs):
        start = time.clock()
        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentence_accuracy = []
            epoch_token_accuracy = []
            for i, inputs in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(inputs)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                if phase == 'val':
                    predicted = torch.argmax(
                        outputs.view(batch_size, max_len, -1), -1)
                    batch_sentence_accuracy, batch_token_accuracy = accuracy(
                        targets.view(batch_size, -1), predicted)
                    epoch_sentence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if phase == 'train':
                train_loss.append(epoch_loss)
                string = 'Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss)
                print(string, end='\n')
                outfile.write(string + '\n')
            else:
                average_epoch_sentence_accuracy = \
                    sum(epoch_sentence_accuracy) / len(epoch_sentence_accuracy)
                average_epoch_token_accuracy = \
                    sum(epoch_token_accuracy) / len(epoch_token_accuracy)
                time_taken = time.clock() - start

                val_loss.append(epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, epoch_loss, time_taken)
                print(string, end='')
                string = '| sentence accuracy:{:.3f}| token accuracy:{:.3f}'.format(
                    average_epoch_sentence_accuracy,
                    average_epoch_token_accuracy)
                print(string, end='\n')
                outfile.write(string + '\n')

            if epoch_loss < lowest_loss:
                save_checkpoint(
                    model, epoch_loss, optimizer, model_filename,
                    description_filename, epoch, train_loss, val_loss)
                lowest_loss = epoch_loss

            # print a random validation sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                # unsqueeze to get the batch dimension
                outputs_var = model(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string + '\n')
                string = u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string + '\n')
                string = u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string + '\n')
                print()

    outfile.close()
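The `accuracy()` helper called in the validation phase above is not shown either. The sketch below assumes it compares two `(batch_size, max_len)` index tensors and returns sentence-level and token-level accuracy; unlike the real helper may do, it does not exclude padding positions.

import torch

def accuracy(targets, predicted):
    # targets, predicted: LongTensors of shape (batch_size, max_len).
    correct = targets == predicted                       # bool, per token
    token_accuracy = correct.float().mean().item()       # fraction of correct tokens
    sentence_accuracy = correct.all(dim=-1).float().mean().item()  # fully correct rows
    return sentence_accuracy, token_accuracy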
def predict(path):
    with open('vocab.pkl', 'rb') as file:
        vocab = pkl.load(file)
    reverse_vocab = {v: k for k, v in vocab.items()}

    emsize = 40    # embedding dimension
    nhid = 40      # dimension of the feedforward network in nn.TransformerEncoder
    nlayers = 3    # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 4      # number of heads in the multi-head attention models
    dropout = 0.2  # dropout value
    model = TransformerModel(emsize, len(vocab), nhead, nhid, nlayers,
                             dropout).to(device)
    model.load_state_dict(torch.load(path)['model'])

    test_data = Data.DataLoader(
        dataset=SentenceDataset('test.csv', vocab, True),
        batch_size=64,
        shuffle=False,
        collate_fn=pred_collate_fn,
        num_workers=4
    )

    model.eval()
    rewrite_indexes = []
    cut_length = []
    for batch in test_data:
        data, masks, length = batch['fail_sent'], batch['mask'], batch['length']
        if cuda:
            data, masks = data.to(device), masks.to(device)
        output = model(data).squeeze(2)
        probs = torch.sigmoid(output) > 0.6
        for i in range(data.shape[0]):
            check = ','.join([reverse_vocab[idx.item()] for idx in data[i]])
            if check == 'Nb,P,Neu,Nf,Na,VG,Neu,Nf,Ng':
                pdb.set_trace()
            d, l, p = data[i], length[i], probs[i]
            one_hot = p.tolist()[:l]
            index = [i for i in range(len(one_hot)) if one_hot[i] == 1]
            cut_length.append(len(one_hot) - len(index))
            rewrite_indexes.append(index)

    output = {}
    answers = []
    with open('answer.txt', 'r') as file:
        for line in file:
            answers.append(line.split('\n')[0])

    plt.hist(cut_length, histtype='stepfilled', alpha=0.3,
             bins=sorted(set(cut_length)))
    plt.savefig('cut_length_rouge_transformer.png')

    df = pd.read_csv('test.csv')
    for i, row in df.iterrows():
        index = rewrite_indexes[i]
        word_list = row['Original'].split(',')
        mapping = row['Mapping']
        sent = [word_list[ind] for ind in index]
        if mapping not in output:
            output[mapping] = [sent]
        else:
            output[mapping].append(sent)

    with open('rewrite_index.txt', 'w') as file:
        for key, value in output.items():
            out = ""
            for sent in value:
                out += ''.join(sent) + ','
            try:
                out = out[:-1] + '?\t' + answers[key] + '\n'
            except:
                pdb.set_trace()
            file.write(out)
def main():
    train_data = SentenceDataset(args.train_file,
                                 encoding_type=args.encoding_type,
                                 filter_threshold=args.filter_threshold)
    val_data = SentenceDataset(args.val_file,
                               encoding_type=args.encoding_type,
                               filter_threshold=args.filter_threshold)

    train_loader = torch.utils.data.DataLoader(train_data, args.batch_size,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, args.batch_size)

    print(len(train_loader))

    input_dim = len(train_data.vocab.source_vocab)
    output_dim = len(train_data.vocab.target_vocab)
    static = args.embedding_type == 'static'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    enc_embedding = Embeddings(input_dim, args.hidden_dim, args.max_len,
                               device, static)
    encoder_layer = EncoderLayer(args.hidden_dim, args.num_enc_heads,
                                 args.inner_dim, args.dropout)
    encoder = Encoder(enc_embedding, encoder_layer, args.num_enc_layers,
                      args.dropout)

    dec_embedding = Embeddings(input_dim, args.hidden_dim, args.max_len,
                               device, static)
    decoder_layer = DecoderLayer(args.hidden_dim, args.num_dec_heads,
                                 args.inner_dim, args.dropout)
    decoder = Decoder(output_dim, args.hidden_dim, dec_embedding,
                      decoder_layer, args.num_dec_layers, args.dropout)

    pad_id = train_data.vocab.source_vocab['<pad>']
    model = Transformer(encoder, decoder, pad_id, device)

    print('Transformer has {:,} trainable parameters'.format(
        count_parames(model)))

    if args.load_model is not None:
        model.load(args.load_model)
    else:
        model.apply(init_weights)

    if args.mode == 'test':
        inferencer = Inferencer(model, train_data.vocab, device)
        greedy_out = inferencer.infer_greedy(
            'helo world, I m testin a typo corector')
        print(greedy_out)
    elif args.mode == 'train':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        loss_function = nn.NLLLoss(ignore_index=pad_id)
        print('Started training...')
        train(model, train_loader, val_loader, optimizer, loss_function, device)
    else:
        raise ValueError('Mode not recognized')
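`init_weights`, applied above when no checkpoint is loaded, is another helper that is not part of this listing. A common choice for Transformer models, shown here only as an assumption about what it might do, is Xavier-uniform initialization of the linear and embedding weight matrices:

import torch.nn as nn

def init_weights(m):
    # Hypothetical initializer for model.apply(): Xavier-uniform for the weight
    # matrices of Linear and Embedding layers; biases keep their defaults.
    if isinstance(m, (nn.Linear, nn.Embedding)):
        nn.init.xavier_uniform_(m.weight)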