def compute_metrics(self, data, preds):
    """Score predicted queries and generated utterances against the gold data.

    Args:
        data: iterable of example dicts; each is read through the keys
            ``'id'``, ``'g_query_recov'`` and ``'g_question_toks'``.
        preds: mapping from example id to a prediction dict that provides
            ``'query'`` (a string) and ``'utt_toks'`` (a token list).

    Returns:
        dict with ``'em'`` (exact-match rate over ``len(data)``), ``'bleu'``
        (query-level corpus BLEU) and ``'utt_bleu'`` (utterance-level corpus
        BLEU). Outside training the result of ``self.compute_official_eval``
        is merged in; during training ``'official_em'`` mirrors ``'em'``.
    """
    query_refs, query_hyps = [], []
    question_refs, question_hyps = [], []
    exact_matches = 0

    for example in data:
        prediction = preds[example['id']]

        # Queries are compared as lower-cased, whitespace-split token lists.
        gold_tokens = example['g_query_recov'].lower().split()
        pred_tokens = prediction['query'].lower().split()
        query_refs.append([gold_tokens])
        query_hyps.append(pred_tokens)
        if pred_tokens == gold_tokens:
            exact_matches += 1

        question_hyps.append(prediction['utt_toks'])
        question_refs.append([example['g_question_toks']])

    metrics = {'em': exact_matches / len(data)}
    metrics['bleu'] = corpus_bleu(
        query_refs,
        query_hyps,
        smoothing_function=SmoothingFunction().method3,
    )
    metrics['utt_bleu'] = corpus_bleu(
        question_refs,
        question_hyps,
        smoothing_function=SmoothingFunction().method3,
    )

    if self.training:
        metrics['official_em'] = metrics['em']
    else:
        metrics.update(self.compute_official_eval(data, preds))
    return metrics
def evaluate(gen_chars, gen_actions, gen_outputs, ref_chars, ref_actions,
             ref_outputs, idx2word):
    """Print character accuracy, action accuracy and BLEU-1..4.

    Args:
        gen_chars, ref_chars: parallel sequences compared element-wise with
            ``np.equal`` to obtain the character accuracy.
        gen_actions, ref_actions: parallel sequences compared the same way to
            obtain the action accuracy.
        gen_outputs, ref_outputs: per-sample collections of id sequences; each
            id must expose ``.item()`` and is looked up in ``idx2word``.
        idx2word: lookup from integer id to token string.

    Returns:
        None. All results are written to stdout.
    """

    def accuracy_percent(generated, reference):
        # Ratio of matching positions across all paired items, in percent.
        matched = compared = 0
        for gen_item, ref_item in zip(generated, reference):
            same = np.equal(gen_item, ref_item)
            matched += sum(same)
            compared += len(same)
        return (matched / compared) * 100

    def to_text(sample):
        # Words within a sentence and sentences within a sample are both
        # joined with a single space.
        sentences = []
        for sentence in sample:
            words = [idx2word[token.item()] for token in sentence]
            sentences.append(' '.join(words))
        return ' '.join(sentences)

    # char
    print('char_acc:{}'.format(accuracy_percent(gen_chars, ref_chars)),
          flush=True)

    # action
    print('action_acc:{}'.format(accuracy_percent(gen_actions, ref_actions)),
          flush=True)

    hypotheses = [to_text(sample) for sample in gen_outputs]
    # Every hypothesis gets exactly one reference string.
    references = [[to_text(sample)] for sample in ref_outputs]

    # TODO restore name?
    scores, _ = bleu.corpus_bleu(hypotheses, references, max_n=4)
    _, bleu1, bleu2, bleu3, bleu4 = scores
    print('bleu1:{}, bleu2:{}, bleu3:{}, bleu4:{}'.format(
        bleu1 * 100, bleu2 * 100, bleu3 * 100, bleu4 * 100), flush=True)
def evaluate_trans(thenet, references, vali_data, vali_raw_data):
    """Translate the validation set and report perplexity, BLEU and NLG metrics.

    Args:
        thenet: object whose ``translate(batch, vali_raw_data)`` returns a
            6-tuple; only the predicted batches (index 0) and predicted
            scores (index 2) are used here.
        references: references handed to ``bleu.corpus_bleu``; the first
            element of every non-``None`` entry is also used as the single
            reference set for ``NLGEval``.
        vali_data: iterable of batches exposing ``tgt``.
        vali_raw_data: forwarded unchanged to ``thenet.translate``.

    Returns:
        ``torch.FloatTensor([ppl, bleu_score, 0.0])``.
    """
    hypotheses = []
    total_score = 0.
    total_words = 0

    for batch in vali_data:
        pred_batch, _, pred_scores, _, _, _ = thenet.translate(
            batch, vali_raw_data)
        # Only the top candidate's score and tokens are kept per sample.
        total_score += sum(candidate[0] for candidate in pred_scores)
        total_words += sum(len(row) for row in batch.tgt[1:])
        for candidates in pred_batch:
            hypotheses.append(' '.join(candidates[0]))

    ppl = math.exp(-total_score / total_words)

    # corpus_bleu returns ([final, n-gram1, n-gram2, ...], [bp, ...]).
    bleu_result = bleu.corpus_bleu(hypotheses, references)
    bleu_score = bleu_result[0][0]

    first_references = [ref[0] for ref in references if ref is not None]
    nlg_ref = [first_references]
    nlg_eval = NLGEval()
    save_txt('/fl/txtfile/rnn_h1.txt', hypotheses)
    metrics_eval = nlg_eval.compute_metrics(nlg_ref, hypotheses)

    print(metrics_eval)
    print('BLEU: {}'.format(bleu_score))
    # Original author's note (names spelled as written there): the PPL for the
    # training/validation phase is computed in Statisci() in onmt/Trainer.py;
    # the PPL for translating is computed in the reprot_score function in
    # translate.py.
    print('PPL: {}'.format(ppl))

    # The last slot is reserved for the rank number.
    return torch.FloatTensor([ppl, bleu_score, 0.0])
def compute_metrics(self, data, preds):
    """Compute exact match and BLEU between predicted and gold queries.

    Args:
        data: sequence of example dicts read through ``'id'`` and ``'query'``.
        preds: mapping from example id to a prediction dict with ``'query'``.

    Returns:
        dict with ``'em'`` and ``'bleu'``. Outside training the result of
        ``self.compute_official_eval`` is merged in; during training
        ``'official_em'`` mirrors ``'em'``.

    Note:
        While training, examples without a prediction are skipped, yet the
        exact-match rate is still divided by ``len(data)``.
    """
    references, hypotheses = [], []
    exact_matches = 0

    for example in data:
        example_id = example['id']
        if example_id not in preds and self.training:
            # Missing predictions are tolerated only during training.
            continue

        gold_tokens = example['query'].lower().split()
        pred_tokens = preds[example_id]['query'].lower().split()
        references.append([gold_tokens])
        hypotheses.append(pred_tokens)
        if pred_tokens == gold_tokens:
            exact_matches += 1

    metrics = {'em': exact_matches / len(data)}
    metrics['bleu'] = corpus_bleu(
        references,
        hypotheses,
        smoothing_function=SmoothingFunction().method3,
    )

    if self.training:
        metrics['official_em'] = metrics['em']
    else:
        metrics.update(self.compute_official_eval(data, preds))
    return metrics
def compute_metrics(self, data, preds):
    """Score generated utterances and, outside training, cycle them through nl2sql.

    For every example a new nl2sql example is built whose question tokens are
    the *predicted* utterance tokens. Outside training, ``self.nl2sql``
    predicts on those rebuilt examples and its official evaluation is merged
    into the returned metrics.

    Args:
        data: iterable of example dicts read through ``'id'``, ``'db_id'``,
            ``'question'``, ``'query'``, ``'g_values'``, ``'g_question_toks'``
            and, when present, ``'g_sql'``.
        preds: mapping from example id to a dict providing ``'utt_toks'``.

    Returns:
        dict with ``'utt_bleu'`` plus, outside training, whatever
        ``self.nl2sql.compute_official_eval`` returns.

    Side effects:
        Examples lacking ``'g_sql'`` get that key filled in place.
    """
    sql_dataset = preprocess_nl2sql.SQLDataset
    tokenizer = self.bert_tokenizer

    hypotheses, references = [], []
    generated = dataset.Dataset()

    for example in data:
        prediction = preds[example['id']]
        hypotheses.append(prediction['utt_toks'])
        references.append([example['g_question_toks']])

        # make new example
        db_id = example['db_id']
        schema = self.conv.database_schemas[db_id]
        question_toks = prediction['utt_toks']
        query_context = sql_dataset.build_contexts(
            question_toks, schema, tokenizer)

        if 'g_sql' not in example:
            example['g_sql'] = self.conv.build_sql(example['query'], db_id)

        rebuilt = {
            'id': example['id'],
            'question': example['question'],
            'db_id': db_id,
            'g_question_toks': question_toks,
            'query': example['query'],
            'g_values': example['g_values'],
            'g_sql': example['g_sql'],
            'value_context': (
                [tokenizer.cls_token] + question_toks + [tokenizer.sep_token]
            ),
            'query_context': query_context,
            'invalid': False,
            'cands_query': sql_dataset.make_column_cands(query_context),
        }
        # make_cands receives the partially built example and supplies the
        # final candidate lists, replacing the provisional 'cands_query'.
        cands_query, cands_value = sql_dataset.make_cands(
            rebuilt, self.nl2sql.sql_vocab)
        rebuilt['cands_query'] = cands_query
        rebuilt['cands_value'] = cands_value
        generated.append(rebuilt)

    metrics = {
        'utt_bleu': corpus_bleu(
            references,
            hypotheses,
            smoothing_function=SmoothingFunction().method3,
        ),
    }

    if self.training:
        return metrics

    with torch.no_grad():
        self.nl2sql.eval()
        cycle_preds = self.nl2sql.run_pred(
            generated, self.nl2sql.args, verbose=True, desc='cycle_pred')
        metrics.update(
            self.nl2sql.compute_official_eval(generated, cycle_preds))
    return metrics
def populate_train_dict(self, sess, targets_batch):
    """Prepares the feed dictionary for training.

    Rolls the current hypotheses forward for ``max_length - 1`` steps,
    recording the hidden states and chosen actions, then completes every
    hypothesis, logs the batch delay and BLEU, and packs everything into a
    feed dictionary.

    Args:
        sess: session object forwarded to ``self.predict_one_step``.
        targets_batch: Target sentences in ids [batch_size, max_length]

    Returns:
        train_dict: Feed dictionary for training
    """
    max_steps = self.config.max_length
    batch_size = len(self.cur_hypos)

    hidden_states = np.zeros(
        (max_steps, batch_size, self.config.d_model), dtype=np.float32)
    actions = np.zeros((max_steps, batch_size), dtype=np.int32)

    # -1 since already have a READ
    for step in range(max_steps - 1):
        hidden_states[step] = self._get_hidden_states()
        step_actions, _, _ = self.predict_one_step(sess, hidden_states[step])
        actions[step] = step_actions
        self._update_hidden_states(actions[step])

    # Generate full hypotheses from partial hypotheses
    delay_sum = 0.0
    hypo_tokens = []
    ref_tokens = []
    for idx, partial in enumerate(self.cur_hypos):
        completed = partial[0].generate_full_hypothesis()
        self.cur_hypos[idx] = completed
        delay_sum += completed.get_average_delay()
        hypo_tokens.append([str(tok) for tok in completed.trgt_sentence])
        # One reference per hypothesis.
        ref_tokens.append([[str(tok) for tok in targets_batch[idx]]])

    cum_rewards = self._get_bacth_cumulative_rewards(targets_batch)

    # BLEU score for the batch
    _, quality = corpus_bleu(ref_tokens, hypo_tokens)
    batch_average_delay = delay_sum / len(self.cur_hypos)

    logging.info("\n batch average delay: %f\n" % batch_average_delay)
    logging.info("\n batch BLEU: %f\n" % quality)

    flat_states = np.reshape(hidden_states, (-1, self.config.d_model))
    train_dict = self.create_feed_dict(
        flat_states,
        actions.flatten(),
        cum_rewards.flatten(),
        self.config.dropout,
    )
    return train_dict
def evaluate_trans(thenet, references, vali_data, vali_raw_data):
    """Translate the validation set and return its perplexity and BLEU.

    Args:
        thenet: object whose ``translate(batch, vali_raw_data)`` returns a
            6-tuple; only the predicted batches (index 0) and predicted
            scores (index 2) are used here.
        references: references handed to ``bleu.corpus_bleu``.
        vali_data: iterable of batches exposing ``tgt``.
        vali_raw_data: forwarded unchanged to ``thenet.translate``.

    Returns:
        ``torch.FloatTensor([ppl, bleu_score, 0.0])``.
    """
    hypotheses = []
    total_score = 0.
    total_words = 0

    for batch in vali_data:
        pred_batch, _, pred_scores, _, _, _ = thenet.translate(
            batch, vali_raw_data)
        # Only the top candidate's score and tokens are kept per sample.
        total_score += sum(candidate[0] for candidate in pred_scores)
        total_words += sum(len(row) for row in batch.tgt[1:])
        for candidates in pred_batch:
            hypotheses.append(' '.join(candidates[0]))

    ppl = math.exp(-total_score / total_words)
    bleu_result = bleu.corpus_bleu(hypotheses, references)
    bleu_score = bleu_result[0][0]

    # the last reserved for rank number
    return torch.FloatTensor([ppl, bleu_score, 0.0])
for i in range(len(pre_references)): assert len(hypothesis) == len(pre_references[i]) references = [] for i in range(len(hypothesis)): ref_for_instance = [] for j in range(len(pre_references)): ref_for_instance.append(pre_references[j][i]) references.append(ref_for_instance) assert len(references) == len(pre_references) * len(hypothesis) # calculate ngram match (BLEU) tokenized_hyps = [x.split() for x in hypothesis] tokenized_refs = [[x.split() for x in reference] for reference in references] ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps) # calculate weighted ngram match keywords = [ x.strip() for x in open('keywords/' + args.lang + '.txt', 'r', encoding='utf-8').readlines() ] def make_weights(reference_tokens, key_word_list): return {token: 1 if token in key_word_list else 0.2 \ for token in reference_tokens} tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)] \
def learn_comment_iter(self):
    """Run one train-and-evaluate pass for the comment models.

    Loads the text examples listed in ``self.args``, groups them into six
    buckets (bucket 0 receives every example, buckets 1-5 are selected by the
    class ids attached to each example's last field), trains one model per
    configured comment class, then evaluates each class on the validation
    examples and prints BLEU-4, BLEU-2, METEOR and Dist-2.

    Side effects: trains ``self.nnet``, writes gold/predicted text files next
    to ``self.args.checkpoint``, prints to stdout, and leaves
    ``self.trainExamples`` / ``self.validExamples`` empty on return.

    NOTE(review): the source of this method was flattened onto a few lines, so
    nesting below is reconstructed; the spots where more than one nesting is
    plausible are marked.
    """
    self.trainExamples = []
    self.validExamples = []
    if 'train_text_files' in self.args:
        # ns[j] caps how many (shuffled) examples are taken from file j.
        train_text_files, ns = self.getDataSetDistribution('text')
        for j, file_name in enumerate(train_text_files):
            # args['train_text_files'][0] is used as a path prefix.
            trainExamples = loadTrainExamples(self.args['train_text_files'][0]+file_name)
            shuffle(trainExamples)
            self.trainExamples.extend(trainExamples[:ns[j]])
    if 'valid_text_files' in self.args:
        # args['valid_text_files'] holds (path prefix, file names); validation
        # files are loaded in full and not shuffled.
        for file_name in self.args['valid_text_files'][1]:
            validExamples = loadTrainExamples(self.args['valid_text_files'][0]+file_name)
            self.validExamples.extend(validExamples)
    # define train classification
    shuffle(self.trainExamples)
    n_e = len(self.trainExamples)
    if n_e:
        # Bucket 0: every example; buckets 1..5: examples per class id.
        class_examples = [[], [], [], [], [], []]
        for i in range(n_e):
            e = self.trainExamples[i]
            if e==None:
                continue
            # Bucket 0 receives the example as loaded, before its last field
            # is rewritten below.
            class_examples[0].append(e)
            if e[-1]!=None:
                # Work on a list copy so the last field can be replaced.
                e = list(e)
                if isinstance(e[-1], tuple):
                    # Last field is (text, class ids).
                    cs = set(e[-1][1])
                    e[-1]=e[-1][0]
                    for c in cs:
                        class_examples[c].append(e)
                    # NOTE(review): placed after the `for c in cs` loop (it
                    # tests `1 not in cs`) — confirm against the original.
                    if 'test_with_no_ai' in self.nnet.args and self.nnet.args['test_with_no_ai'] and 1 not in cs:
                        class_examples[1].append(e)
                else:
                    # Last field is a string "<class ids>@\t@<text>".
                    le = e[-1].split('@\t@')
                    assert(len(le)==2)
                    e[-1]=le[1]
                    cs = set([int(s) for s in le[0].split()])
                    for c in cs:
                        class_examples[c].append(e)
                    if 'test_with_no_ai' in self.nnet.args and self.nnet.args['test_with_no_ai'] and 1 not in cs:
                        class_examples[1].append(e)
            # Drop the reference from the source list once it has been
            # bucketed. NOTE(review): could equally sit inside the
            # `if e[-1]!=None` branch — confirm.
            self.trainExamples[i] = None
        print('TrainExamples Distribution: ', [len(e) for e in class_examples])
        if self.args['comment_chess_training']:
            print('Training Chess: ')
            self.nnet.train(class_examples[0], transform=True, models=[0])
        for m in self.nnet.args['models']['comment']:
            # Model letters map to bucket indices: 'A' -> 1, 'B' -> 2, ...
            i = ord(m)-ord('A')+1
            if len(class_examples[i]):
                print('Training Class %s:\n'%(m))
                self.nnet.train(class_examples[i], transform=True, models=[i])
    # define dev
    n_e = len(self.validExamples)
    if n_e:
        # Same bucketing as for the training examples above.
        class_examples = [[], [], [], [], [], []]
        for i in range(n_e):
            e = self.validExamples[i]
            if e==None:
                continue
            class_examples[0].append(e)
            if e[-1]!=None:
                e = list(e)
                if isinstance(e[-1], tuple):
                    cs = set(e[-1][1])
                    e[-1]=e[-1][0]
                    for c in cs:
                        class_examples[c].append(e)
                    if 'test_with_no_ai' in self.nnet.args and self.nnet.args['test_with_no_ai'] and 1 not in cs:
                        class_examples[1].append(e)
                else:
                    le = e[-1].split('@\t@')
                    assert(len(le)==2)
                    e[-1]=le[1]
                    cs = set([int(s) for s in le[0].split()])
                    for c in cs:
                        class_examples[c].append(e)
                    if 'test_with_no_ai' in self.nnet.args and self.nnet.args['test_with_no_ai'] and 1 not in cs:
                        class_examples[1].append(e)
            self.validExamples[i] = None
        print('ValidExamples Distribution: ', [len(e) for e in class_examples])
        for m in self.nnet.args['models']['comment']:
            i = ord(m)-ord('A')+1
            if len(class_examples[i]):
                print('Evaluating Class %s:'%(m))
                n = len(class_examples[i])
                bsize = self.nnet.args['batch_size']
                predict_texts = []
                gold_texts = []
                # Ceil-divide so the final, possibly short, batch is included.
                for b in range((n+bsize-1)//bsize):
                    batch = class_examples[i][b*bsize:min((b+1)*bsize,n)]
                    n_batch = len(batch)
                    if n_batch<bsize:
                        # Pad a short batch with examples from the start of
                        # the bucket; only the first n_batch results are kept.
                        batch += class_examples[i][:bsize-n_batch]
                    # Each example unpacks into five fields; the last one is
                    # the gold text.
                    boards, pis, vs, valids, texts = list(zip(*batch))
                    rets = list(self.nnet.predict(boards, [pis, valids], models=[i], transform=True))
                    for tb in range(n_batch):
                        # Non-empty predictions are post-processed; the side
                        # is "black" when the board's last entry is -1,
                        # otherwise "white".
                        if boards[tb][-1]==-1 and len(rets[tb])>0:
                            rets[tb] = self.postProcess(rets[tb], player="black")
                        elif len(rets[tb])>0:
                            rets[tb] = self.postProcess(rets[tb], player="white")
                    predict_texts.extend(rets[:n_batch])
                    gold_texts.extend(texts[:n_batch])
                    # Disabled debug output kept from the original:
                    # for k in range(n_batch):
                    #     if random.random()<0.0005 or len(self.trainExamples)==0:
                    #         print('Board: ', boards[k][-1])
                    #         print(chess.Board(boards[k][0]).unicode().replace(u'·', u'.'))
                    #         print('Move: ', self.game.action_list[pis[k][0]])
                    #         print('Expected: ', texts[k].strip())
                    #         print('Predicted: ', rets[k])
                # One reference (the stripped gold text) per prediction.
                result = bleu.corpus_bleu(predict_texts, [[t.strip()] for t in gold_texts])[0][0]
                # Disabled alternative using NLTK kept from the original:
                # refs = []
                # hyps = []
                # for p, t in zip(predict_texts, gold_texts):
                #     refs.append([t.split()])
                #     hyps.append(p.split())
                # result = nltk.translate.bleu_score.corpus_bleu(refs, hyps, auto_reweigh=True)
                print('BLEU-4 for Class %s: %.2f'%(m, result*100))
                result = bleu.corpus_bleu(predict_texts, [[t.strip()] for t in gold_texts], max_n=2)[0][0]
                print('BLEU-2 for Class %s: %.2f'%(m, result*100))
                # METEOR is computed from files, so both sides are written out
                # first.
                save2text(gold_texts, self.args.checkpoint+'_gold%d-%d.txt'%(i, self.iter))
                save2text(predict_texts, self.args.checkpoint+'_predicted%d-%d.txt'%(i, self.iter))
                print('METEOR for Class %s: '%(m), meteor.evaluate(self.args.checkpoint+'_predicted%d-%d.txt'%(i, self.iter), self.args.checkpoint+'_gold%d-%d.txt'%(i, self.iter)))
                print('Dist-2 for Class %s: '%(m), diversity.corpus_diversity(predict_texts))
    # Release the example lists once the iteration is done.
    self.trainExamples = []
    self.validExamples = []
def populate_train_dict(self, sess, targets_batch):
    """Prepares the feed dictionary for training.

    Rolls the current hypotheses forward for ``max_length - 1`` steps while
    recording hidden states, chosen actions and bootstrapped targets, then
    completes every hypothesis and writes the terminal target (batch BLEU
    plus the last delay reward) at the end of each action sequence.

    Args:
        sess: session object forwarded to ``self.predict_one_step`` and, when
            a target network is attached, to ``self.target._get_targets``.
        targets_batch: Target sentences in ids [batch_size, max_length]

    Returns:
        train_dict: Feed dictionary for training
    """
    max_steps = self.config.max_length
    batch_size = len(self.cur_hypos)

    hidden_states = np.zeros(
        (max_steps, batch_size, self.config.d_model),
        dtype=np.float32
    )
    actions = np.zeros((max_steps, batch_size), dtype=np.int32)
    # Targets hold Q-values and BLEU-based rewards, i.e. real numbers. An
    # integer buffer would silently truncate them on assignment, so the
    # buffer is float32.
    targets = np.zeros((max_steps, batch_size), dtype=np.float32)

    for step in range(max_steps - 1):  # -1 since already have a READ
        hidden_states[step, :, :] = self._get_hidden_states()
        # current best qval is the target for the previous step
        actions[step, :], qvals, optimal_actions = self.predict_one_step(
            sess, hidden_states[step, :, :])
        # NOTE: at step 0 the index step-1 is -1, so this writes the last
        # row; the padding pass below zeroes the tail of each column again.
        if hasattr(self, 'target'):
            # If the target network exists, decouple the action selection
            # from target values prediction
            targets[step - 1, :] = self.target._get_targets(
                sess, hidden_states[step, :, :], optimal_actions)
        else:
            targets[step - 1, :] = np.choose(optimal_actions, qvals.T)
        self._update_hidden_states(actions[step, :])

    BLEU_hypos = []
    BLEU_refs = []
    wue_BLEU_hypos = []  # the hypo holder for WUE decoding
    # Generate full hypotheses from partial hypotheses
    for idx, hypo in enumerate(self.cur_hypos):
        self.cur_hypos[idx] = hypo[0].generate_full_hypothesis()
        full_hypo = self.cur_hypos[idx]
        action_length = len(full_hypo.actions)
        # get hypothesis and reference for BLEU evaluation
        BLEU_hypos.append([str(x) for x in full_hypo.trgt_sentence])
        BLEU_refs.append([[str(x) for x in targets_batch[idx]]])
        if hasattr(self, 'target') and self.config.useBLEUDrop:
            wue_BLEU_hypos.append(
                [str(x) for x in self.all_wue_trans[full_hypo.lst_id]])
        # targets are just as long as the action sequence, pad the remaining
        # targets with 0
        targets[action_length - 2:, idx] = 0

    # give the quality rewards (BLEU) at the end
    _, quality = corpus_bleu(BLEU_refs, BLEU_hypos)  # BLEU score for the batch
    if self.config.useBLEUDrop:
        # NOTE(review): wue_BLEU_hypos is only filled when a target network
        # is attached; with useBLEUDrop set and no target network it is empty
        # here — confirm that combination cannot occur.
        _, wue_BLEU = corpus_bleu(BLEU_refs, wue_BLEU_hypos)
        quality = quality - wue_BLEU

    batch_average_delay = 0.0
    for idx in range(len(self.cur_hypos)):
        batch_average_delay += self.cur_hypos[idx].get_average_delay()
        # The terminal target replaces the padding at the last action slot.
        targets[len(self.cur_hypos[idx].actions) - 2, idx] = \
            quality + self.cur_hypos[idx].get_last_delay_reward(self.config)
    batch_average_delay /= len(self.cur_hypos)

    logging.info("\n batch average delay: %f\n", batch_average_delay)
    logging.info("\n batch (delta) BLEU: %f\n", quality)
    if self.config.useBLEUDrop:
        logging.info("\n batch WUE BLEU: %f\n", wue_BLEU)

    train_dict = self.create_feed_dict(
        np.reshape(hidden_states, (-1, self.config.d_model)),
        actions.flatten(),
        targets.flatten(),
        self.config.dropout
    )
    return train_dict