def run_test_our_method(cla_batcher, cnn_classifier, sess_cnn, filename):
    test_to_true = read_test_result_our(filename, 1)
    test_to_false = read_test_result_our(filename, 0)
    gold_to_true = read_test_input(filename, 1)
    gold_to_false = read_test_input(filename, 0)
    list_ref = []
    list_pre = []
    right_cnn = 0
    all_cnn = 0
    for i in range(len(gold_to_true) // 64):
        example_list = []
        for j in range(64):
            # example_list.append(test_true[i*64+j])
            new_dis_example = bc.Example(test_to_true[i * 64 + j], 1,
                                         cla_batcher._vocab, cla_batcher._hps)
            # list_pre.append(test_false[i*64+j].split())
            example_list.append(new_dis_example)
            # list_ref.append([gold_text[i*64+j].split()])
        cla_batch = bc.Batch(example_list, cla_batcher._hps, cla_batcher._vocab)
        right_s, all_s, _, pre = cnn_classifier.run_eval_step(sess_cnn, cla_batch)
        right_cnn += right_s
        all_cnn += all_s
        for j in range(64):
            if len(gold_to_true[i * 64 + j].split()) > 2 and len(
                    test_to_true[i * 64 + j].split()) > 2 and 1 == pre[j]:
                list_ref.append([gold_to_true[i * 64 + j].split()])
                list_pre.append(test_to_true[i * 64 + j].split())
    for i in range(len(gold_to_false) // 64):
        example_list = []
        for j in range(64):
            # example_list.append(test_true[i*64+j])
            new_dis_example = bc.Example(test_to_false[i * 64 + j], 0,
                                         cla_batcher._vocab, cla_batcher._hps)
            # list_pre.append(test_false[i*64+j].split())
            example_list.append(new_dis_example)
            # list_ref.append([gold_text[i*64+j].split()])
        cla_batch = bc.Batch(example_list, cla_batcher._hps, cla_batcher._vocab)
        right_s, all_s, _, pre = cnn_classifier.run_eval_step(sess_cnn, cla_batch)
        right_cnn += right_s
        all_cnn += all_s
        for j in range(64):
            if len(gold_to_false[i * 64 + j].split()) > 2 and len(
                    test_to_false[i * 64 + j].split()) > 2 and 0 == pre[j]:
                list_ref.append([gold_to_false[i * 64 + j].split()])
                list_pre.append(test_to_false[i * 64 + j].split())
    tf.logging.info("cnn test acc: " + str(right_cnn * 1.0 / all_cnn))
    cc = SmoothingFunction()
    tf.logging.info("BLEU: " + str(
        corpus_bleu(list_ref, list_pre, smoothing_function=cc.method1)))
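
# A minimal, self-contained sketch (not part of the function above) of the data
# layout run_test_our_method builds for corpus_bleu: one *list of references* per
# hypothesis, where each reference is a token list. All sentences are invented.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

example_refs = [[["the", "food", "was", "great"]],      # references for hypothesis 1
                [["service", "was", "slow", "today"]]]  # references for hypothesis 2
example_hyps = [["the", "food", "was", "good"],
                ["service", "was", "slow"]]
print(corpus_bleu(example_refs, example_hyps,
                  smoothing_function=SmoothingFunction().method1))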
def evaluate_autoencoder(whichdecoder, data_source, epoch):
    # Turn on evaluation mode which disables dropout.
    eos_id = corpus.dictionary.word2idx['<eos>']
    autoencoder.eval()
    ntokens = len(corpus.dictionary.word2idx)
    n_sents = 0.0
    total_loss = 0.0
    token_accuracies = 0.0
    all_source_sents = []
    all_transfer_sents = []
    pbar = tqdm(range(len(data_source)))
    for ii in pbar:
        batch = data_source[ii]
        source, target, lengths = batch
        source = to_gpu(use_cuda, Variable(source, requires_grad=False))
        target = to_gpu(use_cuda, Variable(target, requires_grad=False))
        n_sents += source.size()[0]

        mask = target.gt(0)
        masked_target = target.masked_select(mask)
        # examples x ntokens
        output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens)

        hidden = autoencoder(0, source, lengths, noise=False, encode_only=True)

        # output: batch x seq_len x ntokens
        if whichdecoder == 0:
            output = autoencoder(0, source, lengths, noise=False)
            flattened_output = output.view(-1, ntokens)
            masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens)
            # accuracy
            max_vals1, max_indices1 = torch.max(masked_output, 1)
            token_accuracies += torch.mean(max_indices1.eq(masked_target).float()).item()
            max_values1, max_indices1 = torch.max(output, 2)
            max_indices2 = autoencoder.generate(1, hidden, maxlen=50)
        else:
            output = autoencoder(1, source, lengths, noise=False)
            flattened_output = output.view(-1, ntokens)
            masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens)
            # accuracy
            max_vals2, max_indices2 = torch.max(masked_output, 1)
            token_accuracies += torch.mean(max_indices2.eq(masked_target).float()).item()
            max_values2, max_indices2 = torch.max(output, 2)
            max_indices1 = autoencoder.generate(0, hidden, maxlen=50)

        # forward
        total_loss += criterion_ce(masked_output / args.temp, masked_target).data

        # all_source_sents, all_transfer_sents
        max_indices1 = max_indices1.view(output.size(0), -1).data.cpu().numpy()
        max_indices2 = max_indices2.view(output.size(0), -1).data.cpu().numpy()
        target = target.view(output.size(0), -1).data.cpu().numpy()
        tran_indices = max_indices2 if whichdecoder == 0 else max_indices1
        for t, tran_idx in zip(target, tran_indices):
            # real sentence
            truncated_to_eos = t.tolist().index(eos_id) if eos_id in t.tolist() else len(t)
            chars = " ".join([corpus.dictionary.idx2word[x] for x in t[:truncated_to_eos]])
            all_source_sents.append(chars)
            # transfer sentence
            truncated_to_eos = tran_idx.tolist().index(eos_id) if eos_id in tran_idx.tolist() else len(tran_idx)
            chars = " ".join([corpus.dictionary.idx2word[x] for x in tran_idx[:truncated_to_eos]])
            all_transfer_sents.append(chars)

    # compare the original and transfer
    aeoutf_from = "{}/{}_output_decoder_{}_from.txt".format(args.outf, epoch, whichdecoder)
    aeoutf_tran = "{}/{}_output_decoder_{}_tran.txt".format(args.outf, epoch, whichdecoder)
    with open(aeoutf_from, 'w') as f_from, open(aeoutf_tran, 'w') as f_trans:
        # laplacian smoothing
        # for word in corpus.dictionary.word2idx.keys():
        #     f_from.write(word + "\n")
        #     f_trans.write(word + "\n")
        for i in range(len(all_source_sents)):
            # real sentence
            f_from.write(all_source_sents[i])
            # transfer sentence
            f_trans.write(all_transfer_sents[i])
            if i != len(all_source_sents) - 1:
                f_from.write("\n")
                f_trans.write("\n")

    # bleu
    all_bleu_scores = 0.0
    for i in range(len(all_source_sents)):
        sou = all_source_sents[i].split(' ')
        tran = all_transfer_sents[i].split(' ')
        all_bleu_scores += sentence_bleu([sou], tran,
                                         smoothing_function=SmoothingFunction().method7,
                                         weights=[1.0 / 3.0] * 3)
    bleu = all_bleu_scores / n_sents * 100.0

    # forward and reverse
    loss = total_loss.item() / len(data_source)
    ppl = math.exp(loss)
    # print('bleu {:4.2f} | ppl {:4.3f}'.format(bleu, ppl))
    # logging.info('bleu {:4.2f} | ppl {:4.3f}'.format(bleu, ppl))

    # transfer
    labels = fasttext_classifier.predict(all_transfer_sents)
    truth = str(1 - whichdecoder)
    transfer = float(sum([l == truth for ll in labels for l in ll])) / n_sents * 100.0

    # load sentences to evaluate on
    arpa_path = '{}/{}_lm_{}.arpa'.format(args.outf, epoch, whichdecoder)
    kenlm_model = train_ngram_lm(args.kenlm_path, aeoutf_from, arpa_path, args.N)
    forward = get_ppl(kenlm_model, all_transfer_sents)
    kenlm_model = train_ngram_lm(args.kenlm_path, aeoutf_tran, arpa_path, args.N)
    reverse = get_ppl(kenlm_model, all_source_sents)
    # print('transfer {:4.2f} | forward {:4.3f} | reverse {:4.3f}'.format(transfer, forward, reverse))
    # logging.info('transfer {:4.2f} | forward {:4.3f} | reverse {:4.3f}'.format(transfer, forward, reverse))

    return bleu, ppl, transfer, forward, reverse
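
# Self-contained sketch of the BLEU call used in the loop above: up-to-trigram
# BLEU (weights=[1/3]*3) with NLTK's SmoothingFunction().method7. The sentences
# below are made up for illustration.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

sou = "the movie was surprisingly good".split(' ')
tran = "the movie was very good".split(' ')
print(sentence_bleu([sou], tran,
                    smoothing_function=SmoothingFunction().method7,
                    weights=[1.0 / 3.0] * 3))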
import argparse
import csv
import sys, os, pdb
import nltk
import time
import random

from nltk.translate.bleu_score import SmoothingFunction

chencherry = SmoothingFunction()

MAX_REF_COUNT = 6


def get_annotations(line):
    set_info, post_id, best, valids, confidence = line.split(',')
    annotator_name = set_info.split('_')[0]
    sitename = set_info.split('_')[1]
    best = int(best)
    valids = [int(v) for v in valids.split()]
    confidence = int(confidence)
    return post_id, annotator_name, sitename, best, valids, confidence


def read_human_annotations(human_annotations_filename):
    human_annotations_file = open(human_annotations_filename, 'r')
    annotations = {}
    for line in human_annotations_file.readlines():
        line = line.strip('\n')
        splits = line.split('\t')
        post_id1, annotator_name1, sitename1, best1, valids1, confidence1 = get_annotations(
            splits[0])
        post_id2, annotator_name2, sitename2, best2, valids2, confidence2 = get_annotations(
def analyze_decode_results(dataset, decode_results, verbose=True):
    from lang.py.parse import tokenize_code, de_canonicalize_code
    # tokenize_code = tokenize_for_bleu_eval
    import ast
    assert dataset.count == len(decode_results)

    f = f_decode = None
    if verbose:
        f = open(dataset.name + '.exact_match', 'w')
        exact_match_ids = []
        f_decode = open(dataset.name + '.decode_results.txt', 'w')

        eid_to_annot = dict()
        if data_type == 'django':
            for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)):
                eid_to_annot[raw_id] = line.strip()

        f_bleu_eval_ref = open(dataset.name + '.ref', 'w')
        f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w')

        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_oracle_bleu = 0.0
    cum_oracle_acc = 0.0
    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    all_references = []
    all_predictions = []

    if all(len(cand) == 0 for cand in decode_results):
        logging.error('Empty decoding results for the current dataset!')
        return -1, -1

    binned_results_dict = defaultdict(list)

    def get_binned_key(ast_size):
        cutoff = 50 if data_type == 'django' else 250
        k = 10 if data_type == 'django' else 25  # for hs

        if ast_size >= cutoff:
            return '%d - inf' % cutoff

        lower = int(ast_size / k) * k
        upper = lower + k
        key = '%d - %d' % (lower, upper)
        return key

    for eid in range(dataset.count):
        example = dataset.examples[eid]
        ref_code = example.code
        ref_ast_tree = ast.parse(ref_code).body[0]
        refer_source = astor.to_source(ref_ast_tree).strip()
        # refer_source = ref_code
        refer_tokens = tokenize_code(refer_source)
        cur_example_acc = 0.0

        decode_cands = decode_results[eid]
        if len(decode_cands) == 0:
            continue

        decode_cand = decode_cands[0]
        cid, cand, ast_tree, code = decode_cand
        code = astor.to_source(ast_tree).strip()

        # simple_url_2_re = re.compile('_STR:0_', re.))
        try:
            predict_tokens = tokenize_code(code)
        except:
            logging.error('error in tokenizing [%s]', code)
            continue

        if refer_tokens == predict_tokens:
            cum_acc += 1
            cur_example_acc = 1.0

            if verbose:
                exact_match_ids.append(example.raw_id)
                f.write('-' * 60 + '\n')
                f.write('example_id: %d\n' % example.raw_id)
                f.write(code + '\n')
                f.write('-' * 60 + '\n')

        if data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].iteritems():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif data_type == 'hs':
            ref_code_for_bleu = ref_code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu)

        all_references.append([refer_tokens_for_bleu])
        all_predictions.append(pred_tokens_for_bleu)

        # try:
        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                   weights=ngram_weights,
                                   smoothing_function=sm.method3)
        cum_bleu += bleu_score
        # except:
        #     pass

        if verbose:
            print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score)

            f_decode.write('-' * 60 + '\n')
            f_decode.write('example_id: %d\n' % example.raw_id)
            f_decode.write('intent: \n')

            if data_type == 'django':
                f_decode.write(eid_to_annot[example.raw_id] + '\n')
            elif data_type == 'hs':
                f_decode.write(' '.join(example.query) + '\n')

            f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n')
            f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n')

            f_decode.write('canonicalized reference: \n')
            f_decode.write(refer_source + '\n')
            f_decode.write('canonicalized prediction: \n')
            f_decode.write(code + '\n')
            f_decode.write('reference code for bleu calculation: \n')
            f_decode.write(ref_code_for_bleu + '\n')
            f_decode.write('predicted code for bleu calculation: \n')
            f_decode.write(pred_code_for_bleu + '\n')
            f_decode.write('pred_shorter_than_ref: %s\n' % shorter)
            # f_decode.write('weired: %s\n' % weired)
            f_decode.write('-' * 60 + '\n')

        # compute oracle
        best_bleu_score = 0.
        cur_oracle_acc = 0.
        for ast_tree in decode_results:
            try:
                code = astor.to_source(ast_tree).strip()
                predict_tokens = tokenize_code(code)

                if predict_tokens == refer_tokens:
                    cur_oracle_acc = 1.

                if data_type == 'django':
                    pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
                    # convert canonicalized code to raw code
                    for literal, place_holder in example.meta_data['str_map'].iteritems():
                        pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                elif data_type == 'hs':
                    pred_code_for_bleu = code

                # we apply Ling Wang's trick when evaluating BLEU scores
                pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

                ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
                cand_bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                                weights=ngram_weights,
                                                smoothing_function=sm.method3)
                if cand_bleu_score > best_bleu_score:
                    best_bleu_score = cand_bleu_score
            except:
                continue

        cum_oracle_bleu += best_bleu_score
        cum_oracle_acc += cur_oracle_acc

        ref_ast_size = example.parse_tree.size
        binned_key = get_binned_key(ref_ast_size)
        binned_results_dict[binned_key].append((bleu_score, cur_example_acc, best_bleu_score, cur_oracle_acc))

    cum_bleu /= dataset.count
    cum_acc /= dataset.count
    cum_oracle_bleu /= dataset.count
    cum_oracle_acc /= dataset.count

    logging.info('corpus level bleu: %f',
                 corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3))
    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
    logging.info('oracle bleu: %f', cum_oracle_bleu)
    logging.info('oracle accuracy: %f', cum_oracle_acc)

    keys = sorted(binned_results_dict, key=lambda x: int(x.split(' - ')[0]))
    Y = [[], [], [], []]
    X = []
    for binned_key in keys:
        entry = binned_results_dict[binned_key]
        avg_bleu = np.average([t[0] for t in entry])
        avg_acc = np.average([t[1] for t in entry])
        avg_oracle_bleu = np.average([t[2] for t in entry])
        avg_oracle_acc = np.average([t[3] for t in entry])

        print binned_key, avg_bleu, avg_acc, avg_oracle_bleu, avg_oracle_acc, len(entry)

        Y[0].append(avg_bleu)
        Y[1].append(avg_acc)
        Y[2].append(avg_oracle_bleu)
        Y[3].append(avg_oracle_acc)
        X.append(int(binned_key.split(' - ')[0]))

    import matplotlib.pyplot as plt
    from pylab import rcParams
    rcParams['figure.figsize'] = 6, 2.5

    if data_type == 'django':
        fig, ax = plt.subplots()
        ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2)
        # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2)
        ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2)
        # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2)
        ax.set_ylabel('Performance')
        ax.set_xlabel('Reference AST Size (# nodes)')
        plt.legend(loc='upper right', ncol=6)
        plt.tight_layout()
        # plt.savefig('django_acc_ast_size.pdf', dpi=300)
        # os.system('pcrop.sh django_acc_ast_size.pdf')
        plt.savefig('django_perf_ast_size.pdf', dpi=300)
        os.system('pcrop.sh django_perf_ast_size.pdf')
    else:
        fig, ax = plt.subplots()
        ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2)
        # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2)
        ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2)
        # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2)
        ax.set_ylabel('Performance')
        ax.set_xlabel('Reference AST Size (# nodes)')
        plt.legend(loc='upper right', ncol=6)
        plt.tight_layout()
        # plt.savefig('hs_bleu_ast_size.pdf', dpi=300)
        # os.system('pcrop.sh hs_bleu_ast_size.pdf')
        plt.savefig('hs_perf_ast_size.pdf', dpi=300)
        os.system('pcrop.sh hs_perf_ast_size.pdf')

    if verbose:
        f.write(', '.join(str(i) for i in exact_match_ids))
        f.close()
        f_decode.close()
        f_bleu_eval_ref.close()
        f_bleu_eval_hyp.close()

    return cum_bleu, cum_acc
def evaluate_decode_results(data_type, dataset, decode_results, verbose=True):
    from lang.py.parse import tokenize_code, de_canonicalize_code
    # tokenize_code = tokenize_for_bleu_eval
    import ast
    assert dataset.count == len(decode_results)

    f = f_decode = None
    if verbose:
        f = open(dataset.name + '.exact_match', 'w')
        exact_match_ids = []
        f_decode = open(dataset.name + '.decode_results.txt', 'w')

        eid_to_annot = dict()
        if data_type == 'django':
            for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)):
                eid_to_annot[raw_id] = line.strip()

        f_bleu_eval_ref = open(dataset.name + '.ref', 'w')
        f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w')
        f_generated_code = open(dataset.name + '.geneated_code', 'w')

        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_oracle_bleu = 0.0
    cum_oracle_acc = 0.0
    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    all_references = []
    all_predictions = []

    for eid in range(dataset.count):
        example = dataset.examples[eid]
        ref_code = example.code
        ref_ast_tree = ast.parse(ref_code).body[0]
        refer_source = astor.to_source(ref_ast_tree).strip()
        # refer_source = ref_code
        refer_tokens = tokenize_code(refer_source)
        cur_example_correct = False

        ast_tree = decode_results[eid]
        code = astor.to_source(ast_tree).strip()

        # simple_url_2_re = re.compile('_STR:0_', re.))
        try:
            predict_tokens = tokenize_code(code)
        except:
            logging.error('error in tokenizing [%s]', code)
            continue

        if refer_tokens == predict_tokens:
            cum_acc += 1
            cur_example_correct = True

            if verbose:
                exact_match_ids.append(example.raw_id)
                f.write('-' * 60 + '\n')
                f.write('example_id: %d\n' % example.raw_id)
                f.write(code + '\n')
                f.write('-' * 60 + '\n')

        if data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].iteritems():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif data_type == 'hs':
            ref_code_for_bleu = ref_code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        # The if-chunk below is for debugging purposes: sometimes the reference cannot match the prediction
        # because of inconsistent quotes (e.g., single quotes in reference, double quotes in prediction).
        # However, most of these cases are solved by canonicalizing the reference code using astor (parse the
        # reference into an AST and regenerate the code; use this regenerated one as the reference).
        weired = False
        if refer_tokens_for_bleu == pred_tokens_for_bleu and refer_tokens != predict_tokens:
            # cum_acc += 1
            weired = True
        elif refer_tokens == predict_tokens:
            # weired!
            # weired = True
            pass

        shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu)

        all_references.append([refer_tokens_for_bleu])
        all_predictions.append(pred_tokens_for_bleu)

        # try:
        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                   weights=ngram_weights,
                                   smoothing_function=sm.method3)
        cum_bleu += bleu_score
        # except:
        #     pass

        if verbose:
            print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score)

            f_decode.write('-' * 60 + '\n')
            f_decode.write('example_id: %d\n' % example.raw_id)
            f_decode.write('intent: \n')

            if data_type == 'django':
                f_decode.write(eid_to_annot[example.raw_id] + '\n')
            elif data_type == 'hs':
                f_decode.write(' '.join(example.query) + '\n')

            f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n')
            f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n')

            f_decode.write('canonicalized reference: \n')
            f_decode.write(refer_source + '\n')
            f_decode.write('canonicalized prediction: \n')
            f_decode.write(code + '\n')
            f_decode.write('reference code for bleu calculation: \n')
            f_decode.write(ref_code_for_bleu + '\n')
            f_decode.write('predicted code for bleu calculation: \n')
            f_decode.write(pred_code_for_bleu + '\n')
            f_decode.write('pred_shorter_than_ref: %s\n' % shorter)
            f_decode.write('weired: %s\n' % weired)
            f_decode.write('-' * 60 + '\n')

            # for Hiro's evaluation
            f_generated_code.write(pred_code_for_bleu.replace('\n', '#NEWLINE#') + '\n')

        # compute oracle
        best_score = 0.
        cur_oracle_acc = 0.
        for ast_tree in decode_results:
            try:
                code = astor.to_source(ast_tree).strip()
                predict_tokens = tokenize_code(code)

                if predict_tokens == refer_tokens:
                    cur_oracle_acc = 1

                if data_type == 'django':
                    pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
                    # convert canonicalized code to raw code
                    for literal, place_holder in example.meta_data['str_map'].iteritems():
                        pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                elif data_type == 'hs':
                    pred_code_for_bleu = code

                # we apply Ling Wang's trick when evaluating BLEU scores
                pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

                ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
                bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                           weights=ngram_weights,
                                           smoothing_function=sm.method3)
                if bleu_score > best_score:
                    best_score = bleu_score
            except:
                continue

        cum_oracle_bleu += best_score
        cum_oracle_acc += cur_oracle_acc

    cum_bleu /= dataset.count
    cum_acc /= dataset.count
    cum_oracle_bleu /= dataset.count
    cum_oracle_acc /= dataset.count

    logging.info('corpus level bleu: %f',
                 corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3))
    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
    logging.info('oracle bleu: %f', cum_oracle_bleu)
    logging.info('oracle accuracy: %f', cum_oracle_acc)

    if verbose:
        f.write(', '.join(str(i) for i in exact_match_ids))
        f.close()
        f_decode.close()
        f_bleu_eval_ref.close()
        f_bleu_eval_hyp.close()
        f_generated_code.close()

    print cum_bleu, cum_acc
    return cum_bleu, cum_acc
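
# Hedged side note on the `ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))`
# idiom used in both evaluators above: for references shorter than four tokens the
# weight vector is truncated, so BLEU only uses n-gram orders the reference can
# actually contain. A tiny illustration with invented tokens:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

_sm = SmoothingFunction()
_ref = ['return', 'x']                 # only 2 tokens, so use two n-gram orders
_hyp = ['return', 'x']
_weights = [0.25] * min(4, len(_ref))  # -> [0.25, 0.25]
print(sentence_bleu([_ref], _hyp, weights=_weights, smoothing_function=_sm.method3))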
def main(model_name, use_cuda, n_print, idxs_print, use_train_dataset,
         val_size, batch_size, interact, unsmear):
    # model_path = './model/' + model_name + '/'
    model_path = model_name

    if use_cuda:
        # encoder_decoder = torch.load(model_path + model_name + '.pt')
        encoder_decoder = torch.load(model_path)
    else:
        encoder_decoder = torch.load(model_path + model_name + '.pt',
                                     map_location=lambda storage, loc: storage)

    if use_cuda:
        encoder_decoder = encoder_decoder.cuda()
    else:
        encoder_decoder = encoder_decoder.cpu()

    dataset = SequencePairDataset(data_path='data/parsed/',
                                  lang=encoder_decoder.lang,
                                  use_cuda=use_cuda,
                                  val_size=val_size,
                                  data_type='dev')

    data_loader = DataLoader(dataset, batch_size=batch_size)

    get_bleu = True
    if get_bleu:
        dev_file = open("data/parsed/copynet_dev.txt", "r", encoding='utf-8')
        out_file = open("results/" + model_name.split('/')[-1] + ".txt", 'w', encoding='utf-8')
        total_score = 0.0
        num = 0.0
        for i, row in enumerate(tqdm(dev_file)):
            sql = row.split('\t')[1]
            gold_nl = row.split('\t')[0]
            predicted = encoder_decoder.get_response(sql)
            predicted = predicted.replace('<SOS>', '')
            predicted = predicted.replace('<EOS>', '')
            predicted = predicted.rstrip()
            out_file.write(predicted + "\n")
            score = sentence_bleu(
                [gold_nl.split()], predicted.split(),
                smoothing_function=SmoothingFunction().method2)
            # score = sentence_bleu(ref, pred)
            total_score += score
            num += 1
            '''
            if i == 1000:
                break
            '''
        del encoder_decoder
        dev_file.close()
        out_file.close()
        print("BLEU score on test set is " + str(total_score * 100 / num))
        return

    if interact:
        encoder_decoder.interactive(unsmear)

    if n_print is not None:
        for _ in range(n_print):
            i_seq, t_seq, i_str, t_str = random.choice(dataset)
            i_length = (i_seq > 0).sum()
            t_length = (t_seq > 0).sum()
            i_seq = i_seq[:i_length]
            t_seq = t_seq[:t_length]
            i_tokens = i_str.split()
            t_tokens = t_str.split()
            print_output(i_seq, encoder_decoder,
                         input_tokens=i_tokens,
                         target_tokens=t_tokens,
                         target_seq=t_seq)
    elif idxs_print is not None:
        for idx in idxs_print:
            i_seq, t_seq, i_str, t_str = dataset[idx]
            i_length = (i_seq > 0).sum()
            t_length = (t_seq > 0).sum()
            i_seq = i_seq[:i_length]
            t_seq = t_seq[:t_length]
            i_tokens = i_str.split()[:i_length]
            t_tokens = t_str.split()
            print_output(i_seq, encoder_decoder,
                         input_tokens=i_tokens,
                         target_tokens=t_tokens,
                         target_seq=t_seq)
    else:
        evaluate(encoder_decoder, data_loader)
learning_rate = args.learning_rate  # learning rate for the optimizer
dropout = args.dropout
top_x = args.top_x
start_epoch = 0
epochs = args.epochs  # number of epochs to train for (if early stopping is not triggered)
epochs_since_improvement = 0  # number of epochs since the last improvement in validation BLEU
best_bleu4 = 0.0  # to store the best BLEU-4 score
best_cider = 0.0  # to store the best CIDEr score
best_loss = 0.0  # to store the best cross-entropy loss
best_ours = 100.0  # to store the best value of our metric
smoothing_method = SmoothingFunction().method1  # epsilon smoothing method for BLEU
print_freq = args.print_freq  # print training/validation stats every __ batches
checkpoint = args.checkpoint  # path to checkpoint, None if none

# Read word map
# WARNING: union vocab from pretrained + didec itself
word_map_file = os.path.join(data_folder, 'WORDMAP_union.json')
with open(word_map_file, 'r') as j:
    word_map = json.load(j)
rev_word_map = {v: k for k, v in word_map.items()}
print('vocab len', len(word_map))

i2w = dict()
def test_micro_bleu_smooth2(candidates, references):
    _test(candidates, references, "micro", "smooth2",
          SmoothingFunction().method2, 3)
def test_macro_bleu_smooth1(candidates, references):
    _test(candidates, references, "macro", "smooth1",
          SmoothingFunction().method1)
        if sampled_token == '<end>':
            break
        decoded_seq[0, i + 1] = sampled_index

    ref = []
    for cap in real_captions:
        l_temp = cap.split()
        l_temp.remove('<start>')
        l_temp.remove('<end>')
        ref.append(l_temp)
    actual.append(ref)
    predicted.append(decoded_tokens[:-1])

smoothie = SmoothingFunction()
print("BLEU-1: {}".format(
    corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0),
                smoothing_function=smoothie.method4)))
print("BLEU-2: {}".format(
    corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0),
                smoothing_function=smoothie.method4)))
print("BLEU-3: {}".format(
    corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0),
def test_macro_bleu_nltk_smooth2(candidates, references):
    _test(candidates, references, "macro", "nltk_smooth2",
          SmoothingFunction().method2)
def calc_bleu(self, reference, hypothesis, weight):
    return nltk.translate.bleu_score.sentence_bleu(
        reference, hypothesis, weight,
        smoothing_function=SmoothingFunction().method1)
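
# Hedged usage sketch for calc_bleu: `reference` is already a list of token lists
# and `weight` is the n-gram weight tuple, both forwarded to sentence_bleu
# unchanged. Standalone equivalent with invented inputs:
import nltk
from nltk.translate.bleu_score import SmoothingFunction

_ref = [['a', 'small', 'example']]
_hyp = ['a', 'small', 'sample']
print(nltk.translate.bleu_score.sentence_bleu(
    _ref, _hyp, (0.5, 0.5),
    smoothing_function=SmoothingFunction().method1))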
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    _transforms = [normalize]
    if use_clip:
        _, preprocess = clip.load('ViT-B/32')
        preprocess.transforms = preprocess.transforms[:2]
        _transforms = preprocess.transforms + _transforms
    _transforms = transforms.Compose(_transforms)
    loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TEST', transform=_transforms),
        batch_size=1, shuffle=True, num_workers=1, pin_memory=True)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = list()
    hypotheses = list()

    # For each image
    for i, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        rev_word_map = {v: k for k, v in word_map.items()}

        if clip_beam_search:
            with torch.no_grad():
                image_features = encoder.clip_model.encode_image(image)
                image_features /= image_features.norm(dim=-1, keepdim=True)

        def get_clip_scores(seqs, scores):
            nonlocal top_k_scores
            special_words = ['<start>', '<end>']
            replace_words = {
                '<unk>': '<averyunpleasantword>',
                '<pad>': '<anotherveryunpleasantword>'
            }
            special_words_enc = [word_map[w] for w in special_words]
            if step == 1:
                top_k_scores, next_word_inds = scores[0].topk(k, 0, True, True)  # (s)
                return torch.zeros(k, device=device).long(), next_word_inds
            next_word_inds = scores.topk(k)[1]
            inds = []
            text = []
            weights = torch.ones(k**2).to(device)
            count = 0
            for idx, (prev_seq, next_words) in enumerate(
                    zip(seqs.tolist(), next_word_inds.tolist())):
                prev_words = [
                    rev_word_map[w] for w in prev_seq
                    if w not in special_words_enc
                ]
                for word in next_words:
                    cap_words = copy.copy(prev_words)
                    if word not in special_words:
                        word_char = rev_word_map[word]
                        word_char = replace_words.get(word_char) or word_char
                        cap_words.append(word_char)
                    text.append(' '.join(cap_words))
                    inds.append([idx, word])
                    if rev_word_map[word] == '<end>':
                        weights[count] = 1.5
                    count += 1
            inds = np.array(inds)
            text = clip.tokenize(text).to(device)
            with torch.no_grad():
                text_features = encoder.clip_model.encode_text(text)
            # Pick the top k most similar captions for the image
            text_features /= text_features.norm(dim=-1, keepdim=True)
            similarity = (image_features @ text_features.T * weights).log_softmax(dim=-1)
            top_k_scores, indices = similarity.view(-1).topk(k, 0, True, True)
            prev_inds = torch.tensor([inds[idx][0] for idx in indices], device=device)
            next_inds = torch.tensor([inds[idx][1] for idx in indices], device=device)
            return prev_inds, next_inds

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:
            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            if clip_beam_search:
                prev_word_inds, next_word_inds = get_clip_scores(seqs, scores)
            else:
                # For the first step, all k points will have the same scores (since same k previous words, h, c)
                if step == 1:
                    top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
                else:
                    # Unroll and find top scores, and their unrolled indices
                    top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

                # Convert unrolled indices to actual indices of scores
                prev_word_inds = (top_k_words / vocab_size).long()  # (s)
                next_word_inds = (top_k_words % vocab_size).long()  # (s)

            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        if len(complete_inds) > 0:
            i = complete_seqs_scores.index(max(complete_seqs_scores))
            seq = complete_seqs[i]
        else:
            i = top_k_scores.argmax().item()
            seq = seqs[i].tolist()

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    rev_word_map[w] for w in c
                    if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
                ],
                img_caps))  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([
            rev_word_map[w] for w in seq
            if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])

        assert len(references) == len(hypotheses)

    bleu4 = corpus_bleu(references, hypotheses,
                        smoothing_function=SmoothingFunction().method1)

    return bleu4
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting running in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        os.makedirs(FLAGS.log_root)

    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = [
        'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
        'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim',
        'batch_size', 'max_dec_steps', 'max_enc_steps'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_generator = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    hparam_list = [
        'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
        'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim',
        'batch_size', 'max_dec_steps'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_discriminator = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    tf.set_random_seed(111)  # a seed value for randomness

    # modes: train-classification, train-sentiment, train-cnn-classification, train-generator
    if FLAGS.mode == "train-classifier":
        # print("Start pre-training......")
        model_class = Classification(hps_discriminator, vocab)
        cla_batcher = ClaBatcher(hps_discriminator, vocab)
        sess_cls, saver_cls, train_dir_cls = setup_training_classification(model_class)
        print("Start pre-training classification......")
        run_pre_train_classification(model_class, cla_batcher, 1, sess_cls,
                                     saver_cls, train_dir_cls)  # 10
        generated = Generate_training_sample(model_class, vocab, cla_batcher, sess_cls)
        print("Generating training examples......")
        generated.generate_training_example("train")
        generated.generate_test_example("test")

    elif FLAGS.mode == "train-sentimentor":
        model_class = Classification(hps_discriminator, vocab)
        cla_batcher = ClaBatcher(hps_discriminator, vocab)
        sess_cls, saver_cls, train_dir_cls = setup_training_classification(model_class)

        print("Start pre_train_sentimentor......")
        model_sentiment = Sentimentor(hps_generator, vocab)
        sentiment_batcher = SenBatcher(hps_generator, vocab)
        sess_sen, saver_sen, train_dir_sen = setup_training_sentimentor(model_sentiment)
        util.load_ckpt(saver_cls, sess_cls, ckpt_dir="train-classification")
        run_pre_train_sentimentor(model_sentiment, sentiment_batcher, 1,
                                  sess_sen, saver_sen, train_dir_sen)  # 1

    elif FLAGS.mode == "test":
        config = {
            'n_epochs': 5,
            'kernel_sizes': [3, 4, 5],
            'dropout_rate': 0.5,
            'val_split': 0.4,
            'edim': 300,
            'n_words': None,  # Leave as None
            'std_dev': 0.05,
            'sentence_len': 50,
            'n_filters': 100,
            'batch_size': 50
        }
        config['n_words'] = 50000

        cla_cnn_batcher = CNN_ClaBatcher(hps_discriminator, vocab)
        cnn_classifier = CNN(config)
        sess_cnn_cls, saver_cnn_cls, train_dir_cnn_cls = setup_training_cnnclassifier(cnn_classifier)
        # util.load_ckpt(saver_cnn_cls, sess_cnn_cls, ckpt_dir="train-cnnclassification")
        run_train_cnn_classifier(cnn_classifier, cla_cnn_batcher, 1,
                                 sess_cnn_cls, saver_cnn_cls, train_dir_cnn_cls)  # 1

        print("Generating test transfer files")
        files = os.listdir("test-generate-transfer/")
        for file_ in files:
            run_test_our_method(cla_cnn_batcher, cnn_classifier, sess_cnn_cls,
                                "test-generate-transfer/" + file_ + "/*")

    # elif FLAGS.mode == "test":
    elif FLAGS.mode == "train-generator":
        model_class = Classification(hps_discriminator, vocab)
        cla_batcher = ClaBatcher(hps_discriminator, vocab)
        sess_cls, saver_cls, train_dir_cls = setup_training_classification(model_class)

        model_sentiment = Sentimentor(hps_generator, vocab)
        sentiment_batcher = SenBatcher(hps_generator, vocab)
        sess_sen, saver_sen, train_dir_sen = setup_training_sentimentor(model_sentiment)

        config = {
            'n_epochs': 5,
            'kernel_sizes': [3, 4, 5],
            'dropout_rate': 0.5,
            'val_split': 0.4,
            'edim': 300,
            'n_words': None,  # Leave as None
            'std_dev': 0.05,
            'sentence_len': 50,
            'n_filters': 100,
            'batch_size': 50
        }
        config['n_words'] = 50000

        cla_cnn_batcher = CNN_ClaBatcher(hps_discriminator, vocab)
        cnn_classifier = CNN(config)
        sess_cnn_cls, saver_cnn_cls, train_dir_cnn_cls = setup_training_cnnclassifier(cnn_classifier)

        model = Generator(hps_generator, vocab)
        batcher = GenBatcher(vocab, hps_generator)
        sess_ge, saver_ge, train_dir_ge = setup_training_generator(model)

        # util.load_ckpt(saver_cnn_cls, sess_cnn_cls, ckpt_dir="train-cnnclassification")
        util.load_ckpt(saver_sen, sess_sen, ckpt_dir="train-sentimentor")

        generated = Generated_sample(model, vocab, batcher, sess_ge)
        tf.logging.info("Start pre-training generator......")
        run_pre_train_generator(model, batcher, 1, sess_ge, saver_ge,
                                train_dir_ge, generated, cla_cnn_batcher,
                                cnn_classifier, sess_cnn_cls)  # 4

        generated.generate_test_negetive_example(
            "temp_negetive", batcher)  # batcher, model_class, sess_cls, cla_batcher
        generated.generate_test_positive_example("temp_positive", batcher)

        tf.logging.info("finished pre-training generator")
        # run_test_our_method(cla_cnn_batcher, cnn_classifier, sess_cnn_cls,
        #                     "temp_negetive" + "/*")

        tf.logging.info("begin reinforcement learning:")

        total_epochs = 30
        step = 1
        for epoch in range(total_epochs):
            batches = batcher.get_batches(mode='train')
            tf.logging.info("num_batches: {}".format(len(batches)))
            for i in range(len(batches)):
                current_batch = copy.deepcopy(batches[i])
                sentiment_batch = batch_sentiment_batch(current_batch, sentiment_batcher)
                result = model_sentiment.max_generator(sess_sen, sentiment_batch)
                weight = result['generated']
                current_batch.weight = weight
                sentiment_batch.weight = weight

                cla_batch = batch_classification_batch(current_batch, batcher, cla_batcher)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)

                cc = SmoothingFunction()

                reward_sentiment = 1 - np.abs(0.5 - result['y_pred_auc'])
                reward_BLEU = []
                for k in range(FLAGS.batch_size):
                    reward_BLEU.append(
                        sentence_bleu([current_batch.original_reviews[k].split()],
                                      cla_batch.original_reviews[k].split(),
                                      smoothing_function=cc.method1))
                reward_BLEU = np.array(reward_BLEU)

                reward_de = (2 / (1.0 / (1e-6 + reward_sentiment) +
                                  1.0 / (1e-6 + reward_BLEU)))

                result = model.run_train_step(sess_ge, current_batch)
                train_step = result['global_step']  # we need this to update our running average loss
                loss = result['loss']

                tf.logging.info('epoch: %d/%d, step: %d/%d, loss: %f',
                                epoch + 1, total_epochs, i + 1, len(batches), loss)

                if not step % 10000:
                    tf.logging.info("generating test examples")
                    generated.generate_test_negetive_example(
                        "test-generate-transfer/" + str(epoch) + "epoch_step" +
                        str(train_step) + "_temp_positive", batcher)
                    generated.generate_test_positive_example(
                        "test-generate/" + str(epoch) + "epoch_step" +
                        str(train_step) + "_temp_positive", batcher)
                    # saver_ge.save(sess, train_dir + "/model", global_step=train_step)
                    run_test_our_method(
                        cla_cnn_batcher, cnn_classifier, sess_cnn_cls,
                        "test-generate-transfer/" + str(epoch) + "epoch_step" +
                        str(train_step) + "_temp_positive" + "/*")

                tf.logging.info("classifying output and evaluating")
                cla_batch, bleu = output_to_classification_batch(
                    result['generated'], current_batch, batcher, cla_batcher, cc)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)
                reward_result_sentiment = result['y_pred_auc']
                reward_result_bleu = np.array(bleu)

                reward_result = (2 / (1.0 / (1e-6 + reward_result_sentiment) +
                                      1.0 / (1e-6 + reward_result_bleu)))

                current_batch.score = 1 - current_batch.score

                result = model.max_generator(sess_ge, current_batch)

                tf.logging.info("classifying output and re-evaluating")
                cla_batch, bleu = output_to_classification_batch(
                    result['generated'], current_batch, batcher, cla_batcher, cc)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)
                reward_result_transfer_sentiment = result['y_pred_auc']
                reward_result_transfer_bleu = np.array(bleu)

                reward_result_transfer = (
                    2 / (1.0 / (1e-6 + reward_result_transfer_sentiment) +
                         1.0 / (1e-6 + reward_result_transfer_bleu)))

                # tf.logging.info("reward_nonsentiment: " + str(reward_sentiment) +
                #                 " output_original_sentiment: " + str(reward_result_sentiment) +
                #                 " output_original_bleu: " + str(reward_result_bleu))

                reward = reward_result_transfer  # reward_de + reward_result_sentiment +
                # tf.logging.info("reward_de: " + str(reward_de))

                tf.logging.info("running sentiment train step")
                model_sentiment.run_train_step(sess_sen, sentiment_batch, reward)
                step += 1
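
# Quick numeric sketch (illustrative values only) of the reward shape used above:
# the harmonic mean of classifier confidence and BLEU, with 1e-6 guarding against
# division by zero. It is high only when *both* components are high.
import numpy as np

_sentiment_conf = np.array([0.9, 0.9, 0.1])
_bleu = np.array([0.8, 0.1, 0.1])
_reward = 2 / (1.0 / (1e-6 + _sentiment_conf) + 1.0 / (1e-6 + _bleu))
print(_reward)  # approximately [0.847, 0.180, 0.100]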
def compute_bleu(self, predictions):
    # Hide warnings
    warnings.filterwarnings('ignore')

    # NLTK
    # Download Punkt tokenizer (for word_tokenize method)
    # Download stopwords (for stopword removal)
    nltk.download('punkt')
    nltk.download('stopwords')

    # English stopwords
    stops = set(stopwords.words("english"))
    # stops.remove('no')

    # Stemming
    stemmer = SnowballStemmer("english")

    # Remove punctuation from string
    translator = str.maketrans('', '', string.punctuation)

    candidate_pairs = self.readresult(predictions)
    gt_pairs = self.readresult(self.gt)

    # Define max score and current score
    max_score = len(gt_pairs)
    current_score = 0

    i = 0
    for image_key in candidate_pairs:
        # Get candidate and GT caption
        candidate_caption = candidate_pairs[image_key]
        gt_caption = gt_pairs[image_key]

        # Optional - go to lowercase
        if not VqaMedEvaluator.case_sensitive:
            candidate_caption = candidate_caption.lower()
            gt_caption = gt_caption.lower()

        # Split caption into individual words (remove punctuation)
        candidate_words = nltk.tokenize.word_tokenize(
            candidate_caption.translate(translator))
        gt_words = nltk.tokenize.word_tokenize(
            gt_caption.translate(translator))

        # Optional - remove stopwords
        if VqaMedEvaluator.remove_stopwords:
            candidate_words = [
                word for word in candidate_words if word.lower() not in stops
            ]
            gt_words = [
                word for word in gt_words if word.lower() not in stops
            ]

        # Optional - apply stemming
        if VqaMedEvaluator.stemming:
            candidate_words = [stemmer.stem(word) for word in candidate_words]
            gt_words = [stemmer.stem(word) for word in gt_words]

        # Calculate BLEU score for the current caption
        try:
            # If both the GT and candidate are empty, assign a score of 1 for this caption
            if len(gt_words) == 0 and len(candidate_words) == 0:
                bleu_score = 1
            # Calculate the BLEU score
            else:
                bleu_score = nltk.translate.bleu_score.sentence_bleu(
                    [gt_words], candidate_words,
                    smoothing_function=SmoothingFunction().method0)
        # Handle problematic cases where BLEU score calculation is impossible
        except ZeroDivisionError:
            bleu_score = 0  # score this caption as 0 instead of reusing a stale value
            # raise Exception('Problem with {} {}', gt_words, candidate_words)

        # Increase calculated score
        current_score += bleu_score

    return current_score / max_score
def get_bleu4(dialog_acts, golden_utts, gen_utts, data_key):
    das2utts = {}
    for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
        intent_frequency = defaultdict(int)
        for act in das:
            cur_act = copy.copy(act)  # intent list
            facility = None  # for the '酒店设施' (hotel facilities) slot

            if '酒店设施' in cur_act[2]:
                facility = cur_act[2].split('-')[1]
            if cur_act[0] == 'Inform':
                cur_act[2] = cur_act[2].split('-')[0] + '+' + cur_act[3]
            elif cur_act[0] == 'Request':
                cur_act[2] = cur_act[2].split('-')[0]
            if cur_act[0] == 'Select':
                cur_act[2] = '源领域+' + cur_act[3]

            intent = '+'.join(cur_act[:-1])
            if '+'.join(cur_act) == 'Inform+景点+门票+免费' or cur_act[-1] == '无':
                intent = '+'.join(cur_act)
            intent_frequency[intent] += 1

            # utt content replacement
            if (act[0] in ['Inform', 'Recommend'] or '酒店设施' in intent) and not intent.endswith('无'):
                if act[3] in utt or (facility and facility in utt):
                    # value to be replaced
                    if '酒店设施' in intent:
                        value = facility
                    else:
                        value = act[3]
                    # placeholder
                    placeholder = '[' + intent + ']'
                    placeholder_one = '[' + intent + '1]'
                    placeholder_with_number = '[' + intent + str(intent_frequency[intent]) + ']'
                    if intent_frequency[intent] > 1:
                        utt = utt.replace(placeholder, placeholder_one)
                        utt = utt.replace(value, placeholder_with_number)
                        gen = gen.replace(placeholder, placeholder_one)
                        gen = gen.replace(value, placeholder_with_number)
                    else:
                        utt = utt.replace(value, placeholder)
                        gen = gen.replace(value, placeholder)

        hash_key = ''
        for act in sorted(das):
            hash_key += act2intent(act)
        das2utts.setdefault(hash_key, {'refs': [], 'gens': []})
        das2utts[hash_key]['refs'].append(utt)
        das2utts[hash_key]['gens'].append({'das': das, 'gen': gen})

    refs, gens = [], []
    for das in das2utts.keys():
        assert len(das2utts[das]['refs']) == (len(das2utts[das]['gens']))
        for gen_pair in das2utts[das]['gens']:
            lex_das = gen_pair['das']  # das w/ value
            gen = gen_pair['gen']
            lex_gen = value_replace(gen, lex_das)
            gens.append([x for x in jieba.lcut(lex_gen) if x.strip()])
            refs.append([[x for x in jieba.lcut(value_replace(s, lex_das)) if x.strip()]
                         for s in das2utts[das]['refs']])

    with open(os.path.join('', 'generated_sens_%s.json' % data_key), 'w', encoding='utf-8') as f:
        json.dump({'refs': refs, 'gens': gens}, f,
                  indent=4, sort_keys=True, ensure_ascii=False)
    print('generated_sens_%s.json saved!' % data_key)

    print('Start calculating bleu score...')
    bleu = corpus_bleu(refs, gens,
                       weights=(0.25, 0.25, 0.25, 0.25),
                       smoothing_function=SmoothingFunction().method1)
    return bleu
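
# Self-contained sketch (invented tokens) of the multi-reference layout that
# get_bleu4 builds: each generated sentence is scored against *all* references
# sharing the same delexicalized dialog-act key.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

_refs = [[['你', '好'], ['您', '好']]]  # two references for one hypothesis
_gens = [['你', '好']]
print(corpus_bleu(_refs, _gens, weights=(0.5, 0.5),
                  smoothing_function=SmoothingFunction().method1))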
def get_metrics(pred, target):
    turns = len(target)
    bleu_2 = 0
    bleu_4 = 0
    meteor = 0
    nist_2 = 0
    nist_4 = 0
    for index in range(turns):
        pred_utt = pred[index]
        target_utt = target[index]
        min_len = min(len(pred_utt), len(target_utt))
        lens = min(min_len, 4)
        if lens == 0:
            continue
        if lens >= 4:
            bleu_4_utt = sentence_bleu([target_utt], pred_utt,
                                       weights=(0.25, 0.25, 0.25, 0.25),
                                       smoothing_function=SmoothingFunction().method1)
            nist_4_utt = sentence_nist([target_utt], pred_utt, 4)
        else:
            bleu_4_utt = 0
            nist_4_utt = 0
        if lens >= 2:
            bleu_2_utt = sentence_bleu([target_utt], pred_utt,
                                       weights=(0.5, 0.5),
                                       smoothing_function=SmoothingFunction().method1)
            nist_2_utt = sentence_nist([target_utt], pred_utt, 2)
        else:
            bleu_2_utt = 0
            nist_2_utt = 0

        bleu_2 += bleu_2_utt
        bleu_4 += bleu_4_utt
        meteor += meteor_score([" ".join(target_utt)], " ".join(pred_utt))
        nist_2 += nist_2_utt
        nist_4 += nist_4_utt

    bleu_2 /= turns
    bleu_4 /= turns
    meteor /= turns
    nist_2 /= turns
    nist_4 /= turns
    return bleu_2, bleu_4, meteor, nist_2, nist_4
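
# Hedged companion sketch for the NIST and METEOR calls above. sentence_nist
# takes (references, hypothesis, n); meteor_score's expected input changed
# across NLTK releases (newer versions want pre-tokenized lists, while the
# string-joining above targets older ones), so this sketch uses the tokenized
# form. Inputs are invented.
import nltk
from nltk.translate.nist_score import sentence_nist
from nltk.translate.meteor_score import meteor_score

nltk.download('wordnet', quiet=True)  # METEOR's synonym matching needs WordNet

_target = ['how', 'are', 'you', 'today']
_pred = ['how', 'are', 'you']
print(sentence_nist([_target], _pred, 2))  # NIST up to bigrams
print(meteor_score([_target], _pred))      # tokenized METEOR (newer NLTK)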
def calculate_bleu(pred_trg, real_trg):
    smoothie = SmoothingFunction().method4
    return sentence_bleu(real_trg, pred_trg, smoothing_function=smoothie)
def test_bleu_bug():
    ref = [[[1, 3], [3], [4]]]
    gen = [[1]]
    with pytest.raises(ZeroDivisionError):
        corpus_bleu(ref, gen, smoothing_function=SmoothingFunction().method3)
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

smooth = SmoothingFunction()


def eval_bleu(ref, pred):
    """
    :param ref: list(list(list(any))), a list of reference sentences, each element of the list is a list of references
    :param pred: list(list(any)), a list of predictions
    :return: corpus bleu score
    """
    return corpus_bleu(ref, pred, smoothing_function=smooth.method1)
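
# Tiny usage example matching the documented shapes: `ref` is a list of
# reference *sets* (one set per prediction), `pred` is a list of predictions.
# Tokens are invented.
if __name__ == '__main__':
    refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']],
            [['hello', 'world', 'again', 'today']]]
    preds = [['the', 'cat', 'sat', 'on', 'mat'],
             ['hello', 'world', 'again', 'now']]
    print(eval_bleu(refs, preds))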
def rl_train(self, sess, batch, with_ce):
    feed_dict = self.run_encoder(sess, batch, only_feed_dict=True)
    feed_dict[self.decoder_inputs] = batch.decoder_inputs
    greedy_outputs = sess.run(self.greedy_words, feed_dict)
    greedy_outputs = greedy_outputs.tolist()
    gold_output = batch.question_words.tolist()

    # baseline outputs by flipping coin
    flipp = 0.1
    baseline_outputs = np.copy(batch.question_words)
    for i in range(batch.question_words.shape[0]):
        seq_len = min(self.flags.max_question_len, batch.question_lengths[i] - 1)  # don't change stop token '</s>'
        for j in range(seq_len):
            if greedy_outputs[i][j] != 0 and random.random() < flipp:
                baseline_outputs[i, j] = greedy_outputs[i][j]
    baseline_outputs = baseline_outputs.tolist()

    rl_inputs = []
    rl_outputs = []
    rl_input_lengths = []
    rewards = []
    for i, (baseline_output, greedy_output) in enumerate(
            zip(baseline_outputs, greedy_outputs)):
        _, baseline_output_words = self.word_vocab.getLexical(baseline_output)
        greedy_output, greedy_output_words = self.word_vocab.getLexical(greedy_output)
        _, gold_output_words = self.word_vocab.getLexical(gold_output[i])

        rl_inputs.append([int(batch.decoder_inputs[i, 0])] + greedy_output[:-1])
        rl_outputs.append(greedy_output)
        rl_input_lengths.append(len(greedy_output))

        baseline_output_words_list = baseline_output_words.split()
        greedy_output_words_list = greedy_output_words.split()
        gold_output_words_list = gold_output_words.split()

        if self.flags.reward_type == 'bleu':
            cc = SmoothingFunction()
            reward = sentence_bleu([gold_output_words_list],
                                   greedy_output_words_list,
                                   smoothing_function=cc.method3)
            baseline = sentence_bleu([gold_output_words_list],
                                     baseline_output_words_list,
                                     smoothing_function=cc.method3)
            rewards.append(reward - baseline)
        elif self.flags.reward_type == 'rouge':
            reward = rouge.rouge([gold_output_words], [greedy_output_words])["rouge_l/f_score"]
            baseline = rouge.rouge([gold_output_words], [baseline_output_words])["rouge_l/f_score"]
            rewards.append(reward - baseline)
        else:
            raise ValueError("Reward type is not bleu or rouge!")

    rl_inputs = padding_utils.pad_2d_vals(rl_inputs, len(rl_inputs), self.flags.max_question_len)
    rl_outputs = padding_utils.pad_2d_vals(rl_outputs, len(rl_outputs), self.flags.max_question_len)
    rl_input_lengths = np.array(rl_input_lengths, dtype=np.int32)
    rewards = np.array(rewards, dtype=np.float32)
    # reward = rescale(reward)
    assert rl_inputs.shape == rl_outputs.shape

    feed_dict = self.run_encoder(sess, batch, only_feed_dict=True)
    feed_dict[self.rewards] = rewards
    if with_ce:
        feed_dict[self.decoder_inputs_rl] = rl_inputs
        feed_dict[self.question_words_rl] = rl_outputs
        feed_dict[self.question_lengths_rl] = rl_input_lengths
        feed_dict[self.decoder_inputs] = batch.decoder_inputs
        feed_dict[self.question_words] = batch.question_words
        feed_dict[self.question_lengths] = batch.question_lengths
    else:
        feed_dict[self.decoder_inputs] = rl_inputs
        feed_dict[self.question_words] = rl_outputs
        feed_dict[self.question_lengths] = rl_input_lengths

    _, loss = sess.run([self.train_op, self.loss], feed_dict)
    return loss
# Assumed imports for this helper; it reuses NLTK's corpus-BLEU building blocks
# (note: Fraction(..., _normalize=False) matches NLTK's own usage but relies on
# a private keyword of fractions.Fraction that was removed in Python 3.12).
import math
from collections import Counter
from fractions import Fraction

from nltk.translate.bleu_score import (SmoothingFunction, brevity_penalty,
                                       closest_ref_length, modified_precision)


def modified_corpus_bleu(list_of_references,
                         hypotheses,
                         weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=None,
                         auto_reweigh=False):
    """
    Modified from nltk.translate.bleu_score.corpus_bleu; returns
    'multi-bleu.perl'-like intermediate results.

    Args:
        list_of_references: one list of reference token lists per hypothesis
        hypotheses: a list of hypothesis token lists
        weights: n-gram order weights
        smoothing_function: an optional SmoothingFunction method
        auto_reweigh: uniformly re-weight for short hypotheses

    Returns:
        (bleu, p_n, bp, ratio, hyp_lengths, ref_lengths)
    """
    # Before proceeding to compute BLEU, perform sanity checks.
    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), \
        f"The number of hypotheses and their reference(s) should be " \
        f"the same: {len(list_of_references)} != {len(hypotheses)}"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths, ) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0

    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    # it tries to retain the Fraction object as much as the
    # smoothing method allows.
    p_n = smoothing_function(p_n,
                             references=references,
                             hypothesis=hypothesis,
                             hyp_len=hyp_len)
    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp, hyp_lengths / ref_lengths, hyp_lengths, ref_lengths
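
# Hedged usage sketch for modified_corpus_bleu, unpacking the multi-bleu.perl-style
# extras it returns alongside the score (all inputs invented):
if __name__ == '__main__':
    refs = [[['the', 'cat', 'sat', 'on', 'the', 'mat']]]
    hyps = [['the', 'cat', 'is', 'on', 'the', 'mat']]
    bleu, p_n, bp, ratio, hyp_len, ref_len = modified_corpus_bleu(
        refs, hyps, smoothing_function=SmoothingFunction().method1)
    print('BLEU %.4f (precisions=%s, bp=%.3f, ratio=%.3f, %d/%d)'
          % (bleu, [float(p) for p in p_n], bp, ratio, hyp_len, ref_len))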
def evaluate_seq2seq_decode_results(dataset, seq2seq_decode_file, seq2seq_ref_file, verbose=True, is_nbest=False):
    from lang.py.parse import parse

    f_seq2seq_decode = open(seq2seq_decode_file)
    f_seq2seq_ref = open(seq2seq_ref_file)

    if verbose:
        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    decode_file_data = [l.strip() for l in f_seq2seq_decode.readlines()]
    ref_code_data = [l.strip() for l in f_seq2seq_ref.readlines()]

    if is_nbest:
        for i in xrange(len(decode_file_data)):
            d = decode_file_data[i].split(' ||| ')
            decode_file_data[i] = (int(d[0]), d[1])

    def is_well_formed_python_code(_hyp):
        try:
            _hyp = _hyp.replace('#NEWLINE#', '\n').replace('#INDENT#', ' ').replace(' #MERGE# ', '')
            hyp_ast_tree = parse(_hyp)
            return True
        except:
            return False

    for eid in range(dataset.count):
        example = dataset.examples[eid]
        cur_example_correct = False

        if is_nbest:
            # find the best-scored well-formed code from the n-best list
            n_best_list = filter(lambda x: x[0] == eid, decode_file_data)
            code = top_scored_code = n_best_list[0][1]
            for _, hyp in n_best_list:
                if is_well_formed_python_code(hyp):
                    code = hyp
                    break

            if top_scored_code != code:
                print '*' * 60
                print top_scored_code
                print code
                print '*' * 60

            code = n_best_list[0][1]
        else:
            code = decode_file_data[eid]

        code = code.replace('#NEWLINE#', '\n').replace('#INDENT#', ' ').replace(' #MERGE# ', '')
        ref_code = ref_code_data[eid].replace('#NEWLINE#', '\n').replace('#INDENT#', ' ').replace(' #MERGE# ', '')

        if code == ref_code:
            cum_acc += 1
            cur_example_correct = True

        if data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = code  # de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].iteritems():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif data_type == 'hs':
            ref_code_for_bleu = example.code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                   weights=ngram_weights,
                                   smoothing_function=sm.method3)
        cum_bleu += bleu_score

    cum_bleu /= dataset.count
    cum_acc /= dataset.count

    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
from __future__ import print_function, division

import torch
import torchtext

import seq2seq
import autoeval
from seq2seq.loss import NLLLoss
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from autoeval.eval_embedding import Embed
from autoeval.eval_distinct import distinct

smoothie = SmoothingFunction().method4


class Evaluator(object):
    """ Class to evaluate models with given datasets.

    Args:
        loss (seq2seq.loss, optional): loss for evaluator (default: seq2seq.loss.NLLLoss)
        batch_size (int, optional): batch size for evaluator (default: 64)
    """

    def __init__(self, loss=NLLLoss(), batch_size=64):
        self.loss = loss
        self.batch_size = batch_size

    def evaluate(self, model, data, vocabs=None,
def __init__(self):
    self.rouge = Rouge()
    self.smooth = SmoothingFunction().method1
    self.best_bleu = 0.
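
# Hedged usage sketch for this scorer's two components, assuming `Rouge` here is
# the pip `rouge` package's scorer (inputs invented):
from rouge import Rouge
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

_rouge_scores = Rouge().get_scores('the cat sat', 'the cat sat on the mat')
print(_rouge_scores[0]['rouge-l']['f'])
print(sentence_bleu([['the', 'cat', 'sat', 'on', 'the', 'mat']],
                    ['the', 'cat', 'sat'],
                    smoothing_function=SmoothingFunction().method1))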
def get_sentence_bleu(self, example, hyp):
    return sentence_bleu([tokenize_for_bleu_eval(example.info['example_dict']['snippet'])],
                         tokenize_for_bleu_eval(hyp.decanonical_code),
                         smoothing_function=SmoothingFunction().method3)
import time
import numpy as np
import codecs
from vocab_utils import Vocab
import namespace_utils
import NP2P_data_stream
from NP2P_model_graph import ModelGraph

FLAGS = None

import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)  # DEBUG, INFO, WARN, ERROR, and FATAL

from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu
cc = SmoothingFunction()

import metric_utils

import platform


def get_machine_name():
    return platform.node()


def vec2string(val):
    result = ""
    for v in val:
        result += " {}".format(v)
    return result.strip()
def evaluate_dataset(self, dataset, decode_results, fast_mode=False):
    examples = dataset
    assert len(examples) == len(decode_results)

    # speed up, cache tokenization results
    if not hasattr(examples[0], 'reference_code_tokens'):
        for example in examples:
            setattr(example, 'reference_code_tokens',
                    tokenize_for_bleu_eval(example.info['example_dict']['snippet']))

    if not hasattr(decode_results[0][0], 'decanonical_code_tokens'):
        for i, example in enumerate(examples):
            hyp_list = decode_results[i]
            # here we prune any hypothesis that throws an error when converting back to the decanonical code!
            # This modifies the decode_results in-place!
            filtered_hyp_list = []
            for hyp in hyp_list:
                if not hasattr(hyp, 'decanonical_code'):
                    try:
                        hyp.decanonical_code = decanonicalize_code(hyp.code, slot_map=example.info['slot_map'])
                        if hyp.decanonical_code:
                            hyp.decanonical_code_tokens = tokenize_for_bleu_eval(hyp.decanonical_code)
                            filtered_hyp_list.append(hyp)
                    except:
                        pass

            decode_results[i] = filtered_hyp_list

    if fast_mode:
        references = [e.reference_code_tokens for e in examples]
        hypotheses = [hyp_list[0].decanonical_code_tokens if hyp_list else []
                      for hyp_list in decode_results]

        bleu_tup = compute_bleu([[x] for x in references], hypotheses, smooth=False)
        bleu = bleu_tup[0]

        return bleu
    else:
        tokenized_ref_snippets = []
        hyp_code_tokens = []
        best_hyp_code_tokens = []
        sm_func = SmoothingFunction().method3
        sent_bleu_scores = []
        oracle_bleu_scores = []
        oracle_exact_match = []
        for example, hyp_list in zip(examples, decode_results):
            tokenized_ref_snippets.append(example.reference_code_tokens)
            example_hyp_bleu_scores = []
            if hyp_list:
                for i, hyp in enumerate(hyp_list):
                    hyp.bleu_score = sentence_bleu([example.reference_code_tokens],
                                                   hyp.decanonical_code_tokens,
                                                   smoothing_function=sm_func)
                    hyp.is_correct = self.is_hyp_correct(example, hyp)

                    example_hyp_bleu_scores.append(hyp.bleu_score)

                top_decanonical_code_tokens = hyp_list[0].decanonical_code_tokens
                sent_bleu_score = hyp_list[0].bleu_score

                best_hyp_idx = np.argmax(example_hyp_bleu_scores)
                oracle_sent_bleu = example_hyp_bleu_scores[best_hyp_idx]
                _best_hyp_code_tokens = hyp_list[best_hyp_idx].decanonical_code_tokens
            else:
                top_decanonical_code_tokens = []
                sent_bleu_score = 0.
                oracle_sent_bleu = 0.
                _best_hyp_code_tokens = []

            oracle_exact_match.append(any(hyp.is_correct for hyp in hyp_list))
            hyp_code_tokens.append(top_decanonical_code_tokens)
            sent_bleu_scores.append(sent_bleu_score)
            oracle_bleu_scores.append(oracle_sent_bleu)
            best_hyp_code_tokens.append(_best_hyp_code_tokens)

        bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets], hyp_code_tokens, smooth=False)
        corpus_bleu = bleu_tup[0]

        bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets], best_hyp_code_tokens, smooth=False)
        oracle_corpus_bleu = bleu_tup[0]

        avg_sent_bleu = np.average(sent_bleu_scores)
        oracle_avg_sent_bleu = np.average(oracle_bleu_scores)
        exact = sum([1 if h == r else 0
                     for h, r in zip(hyp_code_tokens, tokenized_ref_snippets)]) / float(len(examples))
        oracle_exact_match = np.average(oracle_exact_match)

        return {'corpus_bleu': corpus_bleu,
                'oracle_corpus_bleu': oracle_corpus_bleu,
                'avg_sent_bleu': avg_sent_bleu,
                'oracle_avg_sent_bleu': oracle_avg_sent_bleu,
                'exact_match': exact,
                'oracle_exact_match': oracle_exact_match}
def get_bleu(self):
    ngram = self.gram
    bleu = list()
    reference = self.get_reference()
    weight = tuple((1. / ngram for _ in range(ngram)))
    with open(self.test_data) as test_data:
        for hypothesis in test_data:
            hypothesis = nltk.word_tokenize(hypothesis)
            bleu.append(nltk.translate.bleu_score.sentence_bleu(
                reference, hypothesis, weight,
                smoothing_function=SmoothingFunction().method1))
    return sum(bleu) / len(bleu)
def bleu_score(a, b):
    cc = SmoothingFunction()
    bl = sentence_bleu([a], b, smoothing_function=cc.method1)
    return bl
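
# Usage sketch for bleu_score: both arguments are token lists, with `a` the
# single reference and `b` the hypothesis (invented sentences).
if __name__ == '__main__':
    print(bleu_score("the weather is nice".split(), "the weather is good".split()))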