def main(args):
    """Train or evaluate a seq2seq translation model (args.lang -> English).

    Expects on ``args``: path_to_embeddings, path_to_data, path_to_log, lang,
    mode ('train'|'test'), batch_size, lr, hidden_size, num_hidden_layers,
    encode_max_len, decode_max_len, beam, val_size, and string-valued boolean
    flags (train_embed, self_attention, attention, gpu, save_model).
    """
    # Load source/target vocabularies from token files.
    source_tokens = load_tokens(
        os.path.join(args.path_to_embeddings, args.lang + '.tok'))
    target_tokens = load_tokens(os.path.join(args.path_to_embeddings, 'en.tok'))
    # Build idx<->token maps for both sides.
    source_idx_to_token, source_token_to_idx = {}, {}
    target_idx_to_token, target_token_to_idx = {}, {}
    for i, t in enumerate(source_tokens):
        source_idx_to_token[i] = t
        source_token_to_idx[t] = i
    for i, t in enumerate(target_tokens):
        target_idx_to_token[i] = t
        target_token_to_idx[t] = i
    # Load pretrained (fastText wiki) word embeddings; flags arrive as strings
    # from argparse, hence the str(...).lower() == 'true' dance.
    train_embed = True if str(args.train_embed).lower() == 'true' else False
    source_word_embeddings = load_word_embeddings(
        os.path.join(args.path_to_embeddings, 'wiki.' + args.lang + '.vec'),
        source_tokens, train_embed)
    target_word_embeddings = load_word_embeddings(
        os.path.join(args.path_to_embeddings, 'wiki.en.vec'),
        target_tokens, train_embed)
    print('Source word embeddings size:', source_word_embeddings.size())
    print('Target word embeddings size:', target_word_embeddings.size())
    # Build network: self-attention or bidirectional RNN encoder, RNN decoder.
    if str(args.self_attention).lower() == 'true':
        encoder = SelfAttentionEncoder(source_word_embeddings,
                                       args.encode_max_len,
                                       hidden_size=args.hidden_size)
    else:
        encoder = RNNEncoder(source_word_embeddings,
                             bidirectional=True,
                             hidden_size=args.hidden_size,
                             num_hidden_layers=args.num_hidden_layers)
    decoder = RNNDecoder(
        target_word_embeddings, args.decode_max_len,
        True if str(args.attention).lower() == 'true' else False,
        hidden_size=args.hidden_size,
        num_hidden_layers=args.num_hidden_layers)
    gpu = True if str(args.gpu).lower() == 'true' else False
    if gpu:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
        # The self-attention encoder keeps sub-modules in plain lists, so
        # .cuda() on the parent does not move them; move each one explicitly.
        if str(args.self_attention).lower() == 'true':
            for i in range(encoder.num_blocks):
                encoder.encoder_blocks[i] = encoder.encoder_blocks[i].cuda()
                for j in range(encoder.encoder_blocks[i].num_attention_heads):
                    encoder.encoder_blocks[i].Qs[j] = encoder.encoder_blocks[
                        i].Qs[j].cuda()
                    encoder.encoder_blocks[i].Ks[j] = encoder.encoder_blocks[
                        i].Ks[j].cuda()
                    encoder.encoder_blocks[i].Vs[j] = encoder.encoder_blocks[
                        i].Vs[j].cuda()
    # Resume from a checkpoint directory when one is given.
    if args.path_to_log is not None:
        load_model(encoder, decoder, args.path_to_log, gpu)
    print('Encoder and decoder built.')
    should_save_model = True if str(
        args.save_model).lower() == 'true' else False
    # Loss: the decoder emits log-softmax, so NLLLoss here is equivalent to
    # cross entropy; padding positions are ignored.
    criterion = nn.NLLLoss(ignore_index=SPECIAL_TOKENS.index('<pad>'))
    if args.mode == 'train':
        # Prepare data generators.
        # NOTE(review): the train generator is built with encode_max_len but
        # the dev generator with decode_max_len — confirm this asymmetry is
        # intentional.
        train_data_generator = TranslationGenerator(args.batch_size,
                                                    args.lang,
                                                    args.path_to_data,
                                                    'train',
                                                    source_token_to_idx,
                                                    target_token_to_idx,
                                                    args.encode_max_len)
        val_data_generator = TranslationGenerator(args.batch_size,
                                                  args.lang,
                                                  args.path_to_data,
                                                  'dev',
                                                  source_token_to_idx,
                                                  target_token_to_idx,
                                                  args.decode_max_len,
                                                  should_shuffle=False)
        # Optimize embeddings and both networks jointly.
        params = [source_word_embeddings, target_word_embeddings]
        params += list(encoder.parameters()) + list(decoder.parameters())
        optimizer = optim.Adam(params, lr=args.lr)
        best_val_bleu = 0
        val_size = val_data_generator.data_size if args.val_size <= 0 else args.val_size
        losses, val_bleus = [], []
        for itr in range(MAX_ITRS):
            encoder.train()
            decoder.train()
            # Get a batch.
            raw_X, raw_y, X, X_seq_lens, y, y_seq_lens = next(
                train_data_generator)
            if gpu:
                X = X.cuda()
                y = y.cuda()
            # Reset gradients.
            optimizer.zero_grad()
            # Forward pass - encoder then decoder (teacher-forced with y).
            encoder_output, h_n = encoder(X, input_lengths=X_seq_lens, gpu=gpu)
            output_log_softmax, preds = decoder(encoder_output,
                                                h_n,
                                                gpu=gpu,
                                                y=y,
                                                y_seq_lens=y_seq_lens)
            # Compute loss against targets shifted past the start token.
            loss = compute_loss(criterion, output_log_softmax, y[:, 1:])
            losses.append(loss.item())
            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()
            if itr % LOG_PER_ITRS == 0:
                print('Itr {}, Loss: {}'.format(itr, loss.item()))
            # Periodic validation with BLEU.
            if itr % VAL_PER_ITRS == 0:
                pred_stream, ref_stream = corpus_predict(val_data_generator,
                                                         encoder,
                                                         decoder,
                                                         target_idx_to_token,
                                                         gpu=gpu,
                                                         val_size=val_size,
                                                         beam=args.beam)
                val_bleu = corpus_bleu(pred_stream,
                                       ref_stream,
                                       tokenize='none',
                                       lowercase=True)
                print('{}, Validation BLEU: {}'.format(
                    time.strftime("%Y-%m-%d %H:%M"), val_bleu))
                val_bleus.append(val_bleu)
                # Append losses/BLEUs to log files, then reset the buffers.
                if args.path_to_log is not None:
                    if not os.path.isdir(args.path_to_log):
                        os.mkdir(args.path_to_log)
                    with open(os.path.join(args.path_to_log, 'losses'),
                              'a') as f:
                        for l in losses:
                            f.write(str(l))
                            f.write('\n')
                    with open(os.path.join(args.path_to_log, 'val_bleus'),
                              'a') as f:
                        for b in val_bleus:
                            # val_bleu objects expose a .score attribute
                            # (sacrebleu-style result).
                            f.write(str(b.score))
                            f.write('\n')
                    losses, val_bleus = [], []
                # Save a checkpoint when validation BLEU improves.
                if itr > 0 and args.path_to_log is not None and should_save_model and val_bleu.score > best_val_bleu:
                    best_val_bleu = val_bleu.score
                    save_model(encoder, decoder, args.path_to_log)
                    print('Saved model to {}'.format(args.path_to_log))
    elif args.mode == 'test':
        # Test mode: batch size 1, report corpus BLEU on the test split.
        test_data_generator = TranslationGenerator(1,
                                                   args.lang,
                                                   args.path_to_data,
                                                   'test',
                                                   source_token_to_idx,
                                                   target_token_to_idx,
                                                   args.decode_max_len,
                                                   should_shuffle=False)
        val_size = test_data_generator.data_size if args.val_size <= 0 else args.val_size
        pred_stream, ref_stream = corpus_predict(test_data_generator,
                                                 encoder,
                                                 decoder,
                                                 target_idx_to_token,
                                                 gpu=gpu,
                                                 val_size=val_size,
                                                 beam=args.beam)
        test_bleu = corpus_bleu(pred_stream,
                                ref_stream,
                                tokenize='none',
                                lowercase=True)
        print('{}, Testing BLEU: {}'.format(time.strftime("%Y-%m-%d %H:%M"),
                                            test_bleu))
collate_fn=collate_fn)).next() if __name__ == "__main__": CONTEXT_SIZE = 3 C = constant.C H = constant.H D = constant.D with open('data/prep/empathetic-dialogue/lang_shared.pkl', 'rb') as f: lang = pickle.load(f) V = len(lang) # define and load policy model encoder = RNNEncoder(V=V, D=D, H=H, L=1, embedding=None) decoder = RNNDecoder(V=V, D=D, H=H, L=1, embedding=None) model = RLSeq(encoder=encoder, decoder=decoder, vocab=lang) constant.bi = 'none' reward_model = BinaryClassifier(encoder=RNNEncoder(V=V, D=D, H=300, L=1), enc_type='rnn', H=300) constant.bi = 'bi' model.init_reward(reward_model) model.init_baseline_reward() model = load_model(model, constant.test_path) model.eval() # context = 'hello my name is Midnight' # x, _ = batchify(lang, context) # sent = model.predict_one(x)
def train_model_encdec(train_data, dev_data, input_indexer, output_indexer, args):
    """Train an encoder-decoder semantic parser and return the best parser.

    Trains embeddings, encoder, and (attention) decoder with per-module Adam
    optimizers for ``args.epochs`` epochs, evaluating denotation accuracy on
    ``dev_data`` after each epoch and keeping the best-scoring parser.

    Returns the parser with the highest dev denotation accuracy seen; falls
    back to the last epoch's parser if no epoch ever improved on the global
    ``max_denotation``.
    """
    # Sort in descending order by x_indexed, essential for pack_padded_sequence
    global max_denotation
    train_data.sort(key=lambda ex: len(ex.x_indexed), reverse=True)
    dev_data.sort(key=lambda ex: len(ex.x_indexed), reverse=True)

    # Create model
    model_input_emb = EmbeddingLayer(args.input_dim, len(input_indexer),
                                     args.emb_dropout)
    model_output_emb = EmbeddingLayer(args.output_dim, len(output_indexer),
                                      args.emb_dropout)
    model_enc = RNNEncoder(args.input_dim, args.hidden_size, args.rnn_dropout,
                           args.bidirectional)
    # len(output_indexer) is the size of the output vocabulary
    if args.attn:
        model_dec = AttnDecoder(args.output_dim, args.hidden_size,
                                len(output_indexer), args,
                                dropout=args.dec_dropout)
    else:
        model_dec = RNNDecoder(args.output_dim, args.hidden_size,
                               len(output_indexer), dropout=args.dec_dropout)
    # pack all models to pass to decode_forward function
    all_models = (model_input_emb, model_output_emb, model_enc, model_dec)

    # Create optimizers for every model
    inp_emb_optim = torch.optim.Adam(model_input_emb.parameters(), args.lr)
    out_emb_optim = torch.optim.Adam(model_output_emb.parameters(), args.lr)
    enc_optim = torch.optim.Adam(model_enc.parameters(), args.lr)
    dec_optim = torch.optim.Adam(model_dec.parameters(), args.lr)
    criterion = torch.nn.NLLLoss()

    # Iterate through epochs
    for epoch in range(1, args.epochs + 1):
        # Copy-task accuracy counters, updated inside the forward helpers.
        global total_sentences
        global exact
        total_sentences = 0.0
        exact = 0.0
        model_output_emb.train()
        model_input_emb.train()
        model_enc.train()
        model_dec.train()
        print("Epoch ", epoch)
        with open(args.eval_file, "a") as f:
            f.write("Epoch {}\n".format(epoch))
        total_loss = 0.0
        # Loop over all examples in training data
        for pair_idx in range(len(train_data)):
            # Zero gradients
            inp_emb_optim.zero_grad()
            out_emb_optim.zero_grad()
            enc_optim.zero_grad()
            dec_optim.zero_grad()
            # Forward Pass
            if args.attn:
                loss = attn_forward(train_data, all_models, pair_idx,
                                    criterion, args)
            else:
                loss = decode_forward(train_data, all_models, pair_idx,
                                      criterion, args)
            # FIX: accumulate a plain float, not the graph-attached tensor —
            # summing tensors keeps every example's autograd history alive
            # for the whole epoch (PyTorch FAQ on accumulating history).
            total_loss += loss.item()
            # Backpropagation
            loss.backward()
            # Optimizer step
            inp_emb_optim.step()
            out_emb_optim.step()
            enc_optim.step()
            dec_optim.step()
        with open(args.eval_file, "a") as f:
            f.write("Total loss is {}\n".format(total_loss))
        print("Total loss is {}".format(total_loss))
        if args.attn:
            parser = parsers.AttnParser(model_dec, model_enc, model_input_emb,
                                        model_output_emb, output_indexer, args)
        else:
            parser = parsers.Seq2SeqSemanticParser(model_dec, model_enc,
                                                   model_input_emb,
                                                   model_output_emb,
                                                   output_indexer, args)
        if args.copy:
            print("{}% correct on copy task".format(
                100 * float(exact / total_sentences)))
        else:
            pass
            # evaluate() returns a string whose last whitespace-separated
            # token is the denotation accuracy.
            denotation = evaluate(dev_data, parser, args, print_output=True)
            denotation = float(denotation.split(" ")[-1])
            if denotation > max_denotation:
                max_parser = parser
                max_denotation = denotation
    if args.copy:
        print("Done with copy task, exiting before evaluation")
        exit()
    # FIX: catch only NameError (max_parser unbound when no epoch beat the
    # incoming global max_denotation) instead of a bare except that would
    # mask unrelated failures.
    try:
        return max_parser
    except NameError:
        return parser
def train_recombination(train_data, dev_data, input_indexer, output_indexer, args):
    """Train an encoder-decoder parser with data-recombination augmentation.

    Each epoch, augments a deep copy of ``train_data`` with ``recombine``-d
    examples (entity abstraction / concatenation, mixed per ``ratios``),
    shuffles, and trains as in :func:`train_model_encdec`. Evaluates dev
    denotation accuracy per epoch and returns the best parser seen (last
    epoch's parser if none improved on the global ``max_denotation``).
    """
    global max_denotation
    # Add entity placeholders to indexers
    maybe_add_feature([], input_indexer, True, "CITYID")
    maybe_add_feature([], input_indexer, True, "CITYSTATEID")
    maybe_add_feature([], output_indexer, True, "CITYID")
    maybe_add_feature([], output_indexer, True, "CITYSTATEID")
    # Add state placeholders to indexers
    maybe_add_feature([], input_indexer, True, "STATEID")
    maybe_add_feature([], output_indexer, True, "STATEID")

    # Mixing proportions for the recombination strategies: the abstract-entity
    # ratio is split evenly between the two entity types.
    ratios = [args.abs_ent_ratio / 2, args.abs_ent_ratio / 2, args.concat_ratio]

    # Create model
    model_input_emb = EmbeddingLayer(args.input_dim, len(input_indexer),
                                     args.emb_dropout)
    model_output_emb = EmbeddingLayer(args.output_dim, len(output_indexer),
                                      args.emb_dropout)
    model_enc = RNNEncoder(args.input_dim, args.hidden_size, args.rnn_dropout,
                           args.bidirectional)
    # len(output_indexer) is the size of the output vocabulary
    if args.attn:
        model_dec = AttnDecoder(args.output_dim, args.hidden_size,
                                len(output_indexer), args,
                                dropout=args.dec_dropout)
    else:
        model_dec = RNNDecoder(args.output_dim, args.hidden_size,
                               len(output_indexer), dropout=args.dec_dropout)
    # pack all models to pass to decode_forward function
    all_models = (model_input_emb, model_output_emb, model_enc, model_dec)

    # Create optimizers for every model
    inp_emb_optim = torch.optim.Adam(model_input_emb.parameters(), args.lr)
    out_emb_optim = torch.optim.Adam(model_output_emb.parameters(), args.lr)
    enc_optim = torch.optim.Adam(model_enc.parameters(), args.lr)
    dec_optim = torch.optim.Adam(model_dec.parameters(), args.lr)
    criterion = torch.nn.NLLLoss()

    # Iterate through epochs
    for epoch in range(1, args.epochs + 1):
        # Fresh augmented copy each epoch so recombined examples never
        # accumulate across epochs.
        train_data_recomb = deepcopy(train_data)
        train_data_recomb.extend(
            recombine(train_data, input_indexer, output_indexer,
                      args.recomb_size, args, ratios=ratios))
        random.shuffle(train_data_recomb)
        # Longest target in the augmented data bounds decoding length.
        max_out_len = max([len(ex.y_indexed) for ex in train_data_recomb])
        # Copy-task accuracy counters, updated inside the forward helpers.
        global total_sentences
        global exact
        total_sentences = 0.0
        exact = 0.0
        model_output_emb.train()
        model_input_emb.train()
        model_enc.train()
        model_dec.train()
        print("Epoch ", epoch)
        with open(args.eval_file, "a") as f:
            f.write("Epoch {}\n".format(epoch))
        total_loss = 0.0
        # Loop over all examples in training data
        for pair_idx in range(len(train_data_recomb)):
            # Zero gradients
            inp_emb_optim.zero_grad()
            out_emb_optim.zero_grad()
            enc_optim.zero_grad()
            dec_optim.zero_grad()
            # Forward Pass
            if args.attn:
                if epoch == 1 and pair_idx == 0:
                    print("Running Attention Model")
                loss = attn_forward(train_data_recomb, all_models, pair_idx,
                                    criterion, args)
            else:
                if epoch == 1 and pair_idx == 0:
                    print("Running Base Model")
                loss = decode_forward(train_data_recomb, all_models, pair_idx,
                                      criterion, args)
            # FIX: accumulate a plain float, not the graph-attached tensor —
            # summing tensors keeps every example's autograd history alive
            # for the whole epoch (PyTorch FAQ on accumulating history).
            total_loss += loss.item()
            # Backpropagation
            loss.backward()
            # Optimizer step
            inp_emb_optim.step()
            out_emb_optim.step()
            enc_optim.step()
            dec_optim.step()
        with open(args.eval_file, "a") as f:
            f.write("Total loss is {}\n".format(total_loss))
        print("Total loss is {}".format(total_loss))
        if args.attn:
            parser = parsers.AttnParser(model_dec, model_enc, model_input_emb,
                                        model_output_emb, output_indexer, args,
                                        max_output_len=max_out_len)
        else:
            parser = parsers.Seq2SeqSemanticParser(model_dec, model_enc,
                                                   model_input_emb,
                                                   model_output_emb,
                                                   output_indexer, args,
                                                   max_output_len=max_out_len)
        if args.copy:
            print("{}% correct on copy task".format(
                100 * float(exact / total_sentences)))
        else:
            # FIX: evaluate() returns a string; the original wrapped it in
            # float() *before* calling .split, which raises AttributeError
            # (float has no .split). Parse the trailing accuracy token first,
            # matching train_model_encdec.
            denotation = evaluate(dev_data, parser, args, print_output=True)
            denotation = float(denotation.split(" ")[-1])
            if denotation > max_denotation:
                max_parser = parser
                max_denotation = denotation
    if args.copy:
        print("Done with copy task, exiting before evaluation")
        exit()
    # FIX: catch only NameError (max_parser unbound when no epoch beat the
    # incoming global max_denotation) instead of a bare except.
    try:
        return max_parser
    except NameError:
        return parser
H = constant.H D = constant.D V = len(train_dataset.lang) # Shared Encoder-Decoder Embedding embedding = None if constant.share_embeddings: embedding = nn.Embedding(V, D) if constant.embedding == 'fasttext': embedding.weight = nn.Parameter( torch.from_numpy(train_dataset.fasttext).float()) embedding.weight.requires_grad = constant.update_embeddings if constant.task == 'multiseq': encoder = RNNEncoder(V=V, D=D, H=H, L=1, embedding=embedding) decoder = RNNDecoder(V=V, D=D, H=H, L=1, embedding=embedding) if constant.share_rnn: decoder.rnn = encoder.rnn model = MultiSeq2Seq(C=C, encoder=encoder, decoder=decoder, vocab=train_dataset.lang) if constant.policy_model != '': seq2seq = load_model( Seq2Seq(encoder=encoder, decoder=decoder, vocab=train_dataset.lang), constant.policy_model) model.encoder = deepcopy(seq2seq.encoder) model.decoder = deepcopy(seq2seq.decoder) if constant.bi == 'bi': model.reduce_state = deepcopy(seq2seq.reduce_state)
def eval_seq2seq(model, dataloader, bleu=False, beam=False, raise_oom=False,
                 test=False, save=False):
    """Evaluate a seq2seq dialog model: loss/perplexity, optionally BLEU.

    With ``bleu`` set, also decodes hypotheses (greedy, plus beam when
    ``beam``); with ``test`` additionally set, computes sentiment agreement /
    improvement (via a BERT sentiment classifier and a user-simulation
    model), length stats, and BoW embedding similarity.

    Returns (in order of specificity):
      * test+bleu: loss, ppl, bleu_score, bleus, mean bleu, ref/gen lengths,
        distinct n-grams, sentiment pearson r, agreement, improvements,
        mean BoW similarities
      * bleu only: loss, ppl, bleu_score, bleus
      * otherwise: loss, ppl

    Retries once on CUDA OOM (``raise_oom`` guards the recursion).
    """
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=constant.pad_idx)
    loss_log = []
    ppl_log = []
    vocab = dataloader.dataset.lang
    ctx = []
    ref = []
    g_hyps = []
    b_hyps = []
    bow_sims = []
    # automated metrics
    if test and bleu:
        embedding_metrics = EmbeddingSim(dataloader.dataset.fasttext)
        # define and load sentiment clf
        sentiment_clf = BinaryClassifier(
            encoder=BertModel.from_pretrained('bert-base-cased'),
            enc_type='bert',
            H=768)
        sentiment_clf = load_model(sentiment_clf, constant.sentiment_clf)
        # define and load user model
        encoder = RNNEncoder(V=len(dataloader.dataset.lang), D=constant.D,
                             H=constant.H, L=1, embedding=None)
        decoder = RNNDecoder(V=len(dataloader.dataset.lang), D=constant.D,
                             H=constant.H, L=1, embedding=None)
        user_model = Seq2Seq(encoder=encoder, decoder=decoder,
                             vocab=dataloader.dataset.lang)
        user_model = load_model(user_model, constant.user_model)
        user_model.eval()
        if constant.USE_CUDA:
            sentiment_clf.cuda()
            user_model.cuda()
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        ref_lens = []
        gen_lens = []
        ref_sentiments = []
        gen_sentiments = []
        sentiment_agreement = []
        ref_improvement = []
        gen_improvement = []
    with torch.no_grad():
        try:
            for dialogs, lens, targets, unsort, _, _, _, _, _ in dataloader:
                logits = model(dialogs, lens, targets)
                if bleu:
                    # Calculate BLEU
                    probs, sents = model(dialogs, lens, targets, test=True)
                    # Detokenize references (up to <eou>) and contexts (up to
                    # pad), restoring original batch order via `unsort`.
                    r = [
                        " ".join([
                            vocab.index2word[x_t] for x_t in iter(
                                lambda x=iter(gens): next(x),
                                constant.eou_idx)
                        ]) for gens in targets[unsort].cpu().data.numpy()
                    ]
                    c = [
                        " ".join([
                            vocab.index2word[x_t] for x_t in iter(
                                lambda x=iter(gens): next(x),
                                constant.pad_idx)
                        ]) for gens in dialogs[unsort].cpu().data.numpy()
                    ]
                    ref += r
                    ctx += c
                    if test:
                        # calculate sentiment agreement
                        ref_sentiment = get_sentiment(
                            sentiment_clf, r, tokenizer).squeeze() > 0.5
                        gen_sentiment = get_sentiment(
                            sentiment_clf,
                            np.array(sents)[unsort].tolist(),
                            tokenizer).squeeze() > 0.5
                        sentiment_agreement += (
                            ref_sentiment == gen_sentiment
                        ).cpu().numpy().tolist()
                        ref_sentiments += ref_sentiment.cpu().numpy().tolist()
                        gen_sentiments += gen_sentiment.cpu().numpy().tolist()
                        # calculate sentiment improvement: simulate the user's
                        # reply to context+response and compare its sentiment
                        # against the context's.
                        refs = [
                            context + ' ' + sent
                            for context, sent in zip(c, r)
                        ]
                        gens = [
                            context + ' ' + sent for context, sent in zip(
                                c, np.array(sents)[unsort].tolist())
                        ]
                        ref_simulation = get_user_response(
                            user_model, targets, refs, model.vocab)
                        gen_simulation = get_user_response(
                            user_model, targets, gens, model.vocab)
                        ctx_sentiment = get_sentiment(sentiment_clf, c,
                                                      tokenizer).squeeze()
                        user_ref_sentiments = get_sentiment(
                            sentiment_clf, ref_simulation, tokenizer).squeeze()
                        user_gen_sentiments = get_sentiment(
                            sentiment_clf, gen_simulation, tokenizer).squeeze()
                        ref_improvement += (
                            user_ref_sentiments -
                            ctx_sentiment).cpu().numpy().tolist()
                        gen_improvement += (
                            user_gen_sentiments -
                            ctx_sentiment).cpu().numpy().tolist()
                        # average generation lengths
                        ref_lens += [len(t.split()) for t in r]
                        gen_lens += [len(s.split()) for s in sents]
                        # calculate BoW embedding similarity.
                        # NOTE(review): `lens`/`unsort` from the dataloader are
                        # deliberately shadowed here for the generated batch.
                        seqs = np.array(
                            [vocab.transform_one(sent) for sent in sents])
                        lens = [len(seq) for seq in seqs]
                        sort = np.argsort(lens)[::-1].tolist()
                        unsort = np.argsort(sort).tolist()
                        seqs = seqs[sort]
                        lens = np.array(lens)[sort].tolist()
                        padded_gens = np.ones(
                            (len(seqs), lens[0])).astype(int)
                        for b in range(len(seqs)):
                            padded_gens[b, :lens[b]] = np.array(seqs[b])
                        extrema, avg, greedy = embedding_metrics.sim_bow(
                            padded_gens, lens, targets.cpu().numpy()[sort],
                            [len(t.split()) for t in r])
                        bow_sims.append((extrema, avg, greedy))
                    if beam:
                        g_hyps += model.greedy_search(probs, vocab)
                        b_hyps += model.beam_search(dialogs, lens,
                                                    targets.shape[0],
                                                    targets.shape[1], vocab)
                    else:
                        g_hyps += np.array(sents)[unsort].tolist()
                # Masked CEL trick: reshape logits to (B*L, V) and targets to
                # (B*L,); criterion ignores pad positions.
                batch_size, max_target_len = targets.shape
                logits = logits.transpose(0, 1).contiguous().view(
                    batch_size * max_target_len, -1)
                targets = targets.contiguous().view(batch_size *
                                                    max_target_len)
                loss = criterion(logits, targets)
                loss_log.append(loss.item())
                ppl_log.append(math.exp(loss_log[-1]))
        except RuntimeError as e:
            if 'out of memory' in str(e) and not raise_oom:
                print('| WARNING: ran out of memory, retrying batch')
                for p in model.parameters():
                    if p.grad is not None:
                        del p.grad  # free some memory
                torch.cuda.empty_cache()
                # FIX: the original retry dropped beam/test/save, so the
                # retried pass skipped metric collection and could not return
                # the test-mode results the caller asked for. Forward every
                # flag unchanged.
                return eval_seq2seq(model, dataloader, bleu, beam=beam,
                                    raise_oom=True, test=test, save=save)
            else:
                raise e
    if not constant.grid_search:
        if save:
            if bleu and test:
                if not constant.topk:
                    fname = "samples/{}.greedy.txt".format(
                        constant.test_path.split('/')[1])
                else:
                    fname = "samples/{}.topk.{:.4f}.txt".format(
                        constant.test_path.split('/')[1],
                        pearsonr(ref_sentiments, gen_sentiments)[0])
            else:
                fname = "samples/{}.greedy.txt".format(
                    constant.test_path.split('/')[1])
            with open(fname, "w") as f:
                for i, (c, r, h) in enumerate(zip(ctx, ref, g_hyps)):
                    f.write("DIAL {}: {}\n".format(i, c))
                    f.write("GOLD: {}\n".format(r))
                    f.write("PRED: {}\n".format(h))
                    f.write("\n")
        else:
            # Print at most 100 samples to stdout.
            count = 0
            if not beam:
                for c, r, h in zip(ctx, ref, g_hyps):
                    if count < 100:
                        print("DIAL: ", c)
                        print("GOLD: ", r)
                        print("PRED: ", h)
                        print()
                        count += 1
                    else:
                        break
            else:
                for c, r, g, b in zip(ctx, ref, g_hyps, b_hyps):
                    if count < 100:
                        print("DIAL: ")
                        print(c)
                        print("GOLD: ")
                        print(r)
                        print("GRDY: ")
                        print(g)
                        print("BEAM: ")
                        print(b)
                        print()
                        count += 1
                    else:
                        break
    if bleu:
        hyps = b_hyps if beam else g_hyps
        bleu_score, bleus = moses_multi_bleu(np.array(hyps), np.array(ref),
                                             lowercase=True)
        bow_sims = np.array(bow_sims)
        if test:
            # NOTE(review): distinct_ngrams is assumed to be a module-level
            # helper — confirm it exists in this module.
            return np.mean(loss_log), np.mean(
                ppl_log
            ), bleu_score, bleus, np.mean(bleus), np.mean(ref_lens), np.mean(
                gen_lens
            ), distinct_ngrams(ref), distinct_ngrams(g_hyps), pearsonr(
                ref_sentiments, gen_sentiments
            )[0], sum(sentiment_agreement) / len(sentiment_agreement), np.mean(
                ref_improvement), np.mean(gen_improvement), np.mean(bow_sims,
                                                                    axis=0)
        else:
            return np.mean(loss_log), np.mean(ppl_log), bleu_score, bleus
    else:
        return np.mean(loss_log), np.mean(ppl_log)
def eval_rl(model, dataloader, bleu=False, raise_oom=False, save=False,
            test=False):
    """Evaluate an RL-trained dialog model: mean reward, optionally BLEU
    and sentiment/curiosity metrics.

    Branches on the ``constant`` config flags (use_sentiment,
    use_sentiment_agreement, use_curiosity, aux_reward_model) both when
    calling the model and when assembling the return tuple, mirroring the
    training-time reward decomposition. With ``test and bleu``, additionally
    computes sentiment agreement/improvement and BoW embedding similarity
    using a sentiment classifier and a user-simulation model.

    Retries once on CUDA OOM (``raise_oom`` guards the recursion).
    """
    model.eval()
    preds = []
    golds = []
    reward_log = []
    ori_reward_log = []
    aux_reward_log = []
    inv_loss_log = []
    vocab = dataloader.dataset.lang
    ctx = []
    ref = []
    g_hyps = []
    bow_sims = []
    # automated metrics
    if test and bleu:
        tokenizer = model.reward_tokenizer
        embedding_metrics = EmbeddingSim(dataloader.dataset.fasttext)
        # define and load sentiment clf (reuse the reward model when it is
        # the same checkpoint)
        if constant.reward_model == constant.sentiment_clf:
            sentiment_clf = model.reward
        else:
            sentiment_clf = BinaryClassifier(
                encoder=BertModel.from_pretrained('bert-base-cased'),
                enc_type='bert',
                H=768)
            sentiment_clf = load_model(sentiment_clf, constant.sentiment_clf)
        if constant.use_user:
            user_model = model.user_model
        else:
            # define and load user model
            encoder = RNNEncoder(V=len(dataloader.dataset.lang), D=constant.D,
                                 H=constant.H, L=1, embedding=None)
            decoder = RNNDecoder(V=len(dataloader.dataset.lang), D=constant.D,
                                 H=constant.H, L=1, embedding=None)
            user_model = Seq2Seq(encoder=encoder, decoder=decoder,
                                 vocab=dataloader.dataset.lang)
            user_model = load_model(user_model, constant.user_model)
            user_model.eval()
        if constant.USE_CUDA:
            sentiment_clf.cuda()
            user_model.cuda()
        ref_lens = []
        gen_lens = []
        ref_sentiments = []
        gen_sentiments = []
        ref_improvement = []
        gen_improvement = []
        sentiment_agreement = []
    with torch.no_grad():
        try:
            for dialogs, lens, targets, unsort, _, sentiments, sentiments_b, _, _ in dataloader:
                if constant.use_sentiment:
                    if constant.aux_reward_model != '':
                        # Combined reward: language reward plus sentiment
                        # reward weighted by lambda_aux.
                        _, _, _, R_l, R_s, _, clf_logits = model(
                            dialogs, lens, targets, sentiments=sentiments)
                        R = constant.lambda_aux * R_l + R_s
                        ori_reward_log.append(torch.mean(R_l).item())
                        aux_reward_log.append(torch.mean(R_s).item())
                    else:
                        _, _, _, R, _, clf_logits = model(
                            dialogs, lens, targets, sentiments=sentiments)
                    pred = torch.sigmoid(clf_logits.squeeze()) > 0.5
                    preds.append(pred.detach().cpu().numpy())
                    golds.append(sentiments_b.cpu().numpy())
                elif constant.use_sentiment_agreement:
                    _, _, _, R, _ = model(dialogs, lens, targets,
                                          sentiments=sentiments)
                elif constant.use_curiosity:
                    _, dec_lens_var, _, R, R_i, L_i, _ = model(
                        dialogs, lens, targets)
                    # Length-normalize the intrinsic (curiosity) reward.
                    R_i = torch.mean(
                        torch.sum(R_i.transpose(0, 1).contiguous(), dim=1) /
                        dec_lens_var.float())
                    aux_reward_log.append(torch.mean(R_i).item())
                    inv_loss_log.append(L_i.item())
                else:
                    _, _, _, R, _ = model(dialogs, lens, targets,
                                          sentiments=sentiments, test=True)
                reward_log.append(torch.mean(R).item())
                if bleu:
                    # Calculate BLEU with greedy (MLE) decoding.
                    _, sents = model(dialogs, lens, targets, test=True,
                                     use_mle=True)
                    g_hyps += np.array(sents)[unsort].tolist()
                    # Detokenize references (up to <eou>) and contexts (up to
                    # pad), restoring original batch order via `unsort`.
                    r = [
                        " ".join([
                            vocab.index2word[x_t] for x_t in iter(
                                lambda x=iter(gens): next(x),
                                constant.eou_idx)
                        ]) for gens in targets[unsort].cpu().data.numpy()
                    ]
                    c = [
                        " ".join([
                            vocab.index2word[x_t] for x_t in iter(
                                lambda x=iter(gens): next(x),
                                constant.pad_idx)
                        ]) for gens in dialogs[unsort].cpu().data.numpy()
                    ]
                    ref += r
                    ctx += c
                    if test:
                        # calculate sentiment agreement
                        ref_sentiment = get_sentiment(
                            sentiment_clf, r, tokenizer).squeeze() > 0.5
                        gen_sentiment = get_sentiment(
                            sentiment_clf,
                            np.array(sents)[unsort].tolist(),
                            tokenizer).squeeze() > 0.5
                        sentiment_agreement += (
                            ref_sentiment == gen_sentiment
                        ).cpu().numpy().tolist()
                        ref_sentiments += ref_sentiment.cpu().numpy().tolist()
                        gen_sentiments += gen_sentiment.cpu().numpy().tolist()
                        # calculate sentiment improvement via user simulation
                        refs = [
                            context + ' ' + sent
                            for context, sent in zip(c, r)
                        ]
                        gens = [
                            context + ' ' + sent for context, sent in zip(
                                c, np.array(sents)[unsort].tolist())
                        ]
                        ref_simulation = get_user_response(
                            user_model, targets, refs, model.vocab)
                        gen_simulation = get_user_response(
                            user_model, targets, gens, model.vocab)
                        ctx_sentiment = get_sentiment(sentiment_clf, c,
                                                      tokenizer).squeeze()
                        user_ref_sentiments = get_sentiment(
                            sentiment_clf, ref_simulation, tokenizer).squeeze()
                        user_gen_sentiments = get_sentiment(
                            sentiment_clf, gen_simulation, tokenizer).squeeze()
                        ref_improvement += (
                            user_ref_sentiments -
                            ctx_sentiment).cpu().numpy().tolist()
                        gen_improvement += (
                            user_gen_sentiments -
                            ctx_sentiment).cpu().numpy().tolist()
                        # average generation lengths
                        ref_lens += [len(t.split()) for t in r]
                        gen_lens += [len(s.split()) for s in sents]
                        # calculate BoW embedding similarity.
                        # NOTE(review): `lens`/`unsort` from the dataloader are
                        # deliberately shadowed here for the generated batch.
                        seqs = np.array(
                            [vocab.transform_one(sent) for sent in sents])
                        lens = [len(seq) for seq in seqs]
                        sort = np.argsort(lens)[::-1].tolist()
                        unsort = np.argsort(sort).tolist()
                        seqs = seqs[sort]
                        lens = np.array(lens)[sort].tolist()
                        padded_gens = np.ones(
                            (len(seqs), lens[0])).astype(int)
                        for b in range(len(seqs)):
                            padded_gens[b, :lens[b]] = np.array(seqs[b])
                        extrema, avg, greedy = embedding_metrics.sim_bow(
                            padded_gens, lens, targets.cpu().numpy()[sort],
                            [len(t.split()) for t in r])
                        bow_sims.append((extrema, avg, greedy))
        except RuntimeError as e:
            if 'out of memory' in str(e) and not raise_oom:
                print('| WARNING: ran out of memory, retrying batch')
                for p in model.parameters():
                    if p.grad is not None:
                        del p.grad  # free some memory
                torch.cuda.empty_cache()
                # FIX: the original retry dropped save/test, so the retried
                # pass skipped test-mode metric collection and sample saving.
                # Forward every flag unchanged.
                return eval_rl(model, dataloader, bleu, raise_oom=True,
                               save=save, test=test)
            else:
                raise e
    if not constant.grid_search:
        if save:
            if bleu and test:
                if not constant.topk:
                    fname = "samples/{}.greedy.txt".format(
                        constant.test_path.split('/')[1])
                else:
                    fname = "samples/{}.topk.{:.4f}.txt".format(
                        constant.test_path.split('/')[1],
                        pearsonr(ref_sentiments, gen_sentiments)[0])
            else:
                fname = "samples/{}.greedy.txt".format(
                    constant.test_path.split('/')[1])
            with open(fname, "w") as f:
                for i, (c, r, h) in enumerate(zip(ctx, ref, g_hyps)):
                    f.write("DIAL {}: {}\n".format(i, c))
                    f.write("GOLD: {}\n".format(r))
                    f.write("PRED: {}\n".format(h))
                    f.write("\n")
        else:
            # Print at most 100 samples to stdout.
            count = 0
            for c, r, h in zip(ctx, ref, g_hyps):
                if count < 100:
                    print("DIAL: ", c)
                    print("GOLD: ", r)
                    print("GRDY: ", h)
                    print()
                    count += 1
                else:
                    break
    if bleu:
        bleu_score, bleus = moses_multi_bleu(np.array(g_hyps), np.array(ref),
                                             lowercase=True)
        if test:
            bow_sims = np.array(bow_sims)
            if constant.use_sentiment and constant.aux_reward_model != '':
                return [
                    np.mean(reward_log),
                    np.mean(ori_reward_log),
                    np.mean(aux_reward_log)
                ], bleu_score, bleus
            elif constant.use_sentiment:
                preds = np.hstack(np.array(preds))
                golds = np.concatenate(golds)
                f1 = f1_score(preds, golds, average='weighted')
                return np.mean(reward_log), f1, bleu_score, bleus, np.mean(
                    bleus
                ), np.mean(ref_lens), np.mean(gen_lens), distinct_ngrams(
                    ref), distinct_ngrams(g_hyps), pearsonr(
                        ref_sentiments,
                        gen_sentiments)[0], sum(sentiment_agreement) / len(
                            sentiment_agreement), np.mean(
                                ref_improvement), np.mean(
                                    gen_improvement), np.mean(bow_sims,
                                                              axis=0)
            elif constant.use_curiosity:
                return np.mean(reward_log), np.mean(aux_reward_log), np.mean(
                    inv_loss_log), bleu_score, bleus
            else:
                return np.mean(reward_log), bleu_score, bleus, np.mean(
                    bleus
                ), np.mean(ref_lens), np.mean(gen_lens), distinct_ngrams(
                    ref), distinct_ngrams(g_hyps), pearsonr(
                        ref_sentiments,
                        gen_sentiments)[0], sum(sentiment_agreement) / len(
                            sentiment_agreement), np.mean(
                                ref_improvement), np.mean(
                                    gen_improvement), np.mean(bow_sims,
                                                              axis=0)
        elif constant.use_curiosity:
            return np.mean(reward_log), np.mean(aux_reward_log), np.mean(
                inv_loss_log), bleu_score, bleus
        elif constant.use_sentiment:
            if constant.use_sentiment_agreement:
                return np.mean(reward_log), bleu_score, bleus
            preds = np.hstack(np.array(preds))
            golds = np.concatenate(golds)
            f1 = f1_score(preds, golds, average='weighted')
            if constant.aux_reward_model != '':
                return [
                    np.mean(reward_log),
                    np.mean(ori_reward_log),
                    np.mean(aux_reward_log)
                ], f1, bleu_score, bleus
            else:
                return np.mean(reward_log), f1, bleu_score, bleus
        else:
            return np.mean(reward_log), bleu_score, bleus
    else:
        if test:
            if constant.use_curiosity:
                return np.mean(reward_log), np.mean(aux_reward_log), np.mean(
                    inv_loss_log)
            return np.mean(reward_log)
        elif constant.use_curiosity:
            return np.mean(reward_log), np.mean(aux_reward_log), np.mean(
                inv_loss_log)
        elif constant.use_sentiment:
            if constant.use_sentiment_agreement:
                return np.mean(reward_log)
            preds = np.hstack(np.array(preds))
            golds = np.concatenate(golds)
            f1 = f1_score(preds, golds, average='weighted')
            return np.mean(reward_log), f1
        else:
            return np.mean(reward_log)