def get_metrics(self, ref_dir, dec_dir):
    reference = []
    decoded = []
    # Pair decoded and reference files by sorted filename.
    for i, j in zip(sorted(glob.glob(dec_dir + '/' + '*.txt')),
                    sorted(glob.glob(ref_dir + '/' + '*.txt'))):
        ref_tex = ''
        dec_tex = ''
        for k in open(i).readlines():
            dec_tex = dec_tex + k.strip()
        if len(dec_tex) == 0:
            dec_tex = ' '
        for l in open(j).readlines():
            ref_tex = ref_tex + l
        reference.append(ref_tex)
        decoded.append(dec_tex)
    if len(reference) != len(decoded):
        raise ValueError("Hypotheses and references don't have equal lengths")
    rouge_dict = rouge.rouge(decoded, reference)
    file_path = os.path.join(self._decode_dir, 'results.txt')
    f = open(file_path, 'w')
    for key in rouge_dict:
        print("%s\t%f" % (key, rouge_dict[key]), file=f)
    bleu_score = bleu.moses_multi_bleu(decoded, reference)
    print("%s\t%f" % ('bleu', bleu_score), file=f)
    f.close()
    tf.logging.info("BLEU, ROUGE values saved to results.txt")
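# Minimal usage sketch for get_metrics, assuming `decoder` is an object exposing this
# method with a writable `_decode_dir`, and that the reference/decoded directories are
# hypothetical folders holding one plain-text file per example, paired by sorted filename.
decoder._decode_dir = 'eval_out'
decoder.get_metrics(ref_dir='eval_out/reference', dec_dir='eval_out/decoded')
# -> tab-separated ROUGE and BLEU scores are written to eval_out/results.txt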
def cal_bleu(infer, ref):
    # Retry until the external multi-bleu script is available.
    while True:
        try:
            bleu_score = bleu.moses_multi_bleu(infer, ref)
            return bleu_score
        except FileNotFoundError:
            print("Failed to compute bleu_score. Sleeping for %i secs..." % 3)
            time.sleep(3)
def moses_bl_rouge(p, l):
    bl = bleu.moses_multi_bleu(p, l)
    x = rouge.rouge(p, l)
    print('Moses BLEU: %f\n'
          'ROUGE1-F: %f\nROUGE1-P: %f\nROUGE1-R: %f\n'
          'ROUGE2-F: %f\nROUGE2-P: %f\nROUGE2-R: %f\n'
          'ROUGEL-F: %f\nROUGEL-P: %f\nROUGEL-R: %f'
          % (bl,
             x['rouge_1/f_score'], x['rouge_1/p_score'], x['rouge_1/r_score'],
             x['rouge_2/f_score'], x['rouge_2/p_score'], x['rouge_2/r_score'],
             x['rouge_l/f_score'], x['rouge_l/p_score'], x['rouge_l/r_score']))
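# A small illustrative call, assuming `moses_bl_rouge` and `cal_bleu` (above) are in
# scope; the example sentences are made up, but the (hypotheses, references) argument
# order matches the snippets above.
hyps = ['the cat sat on the mat', 'a man is playing a guitar']
refs = ['the cat is sitting on the mat', 'a man plays the guitar']
moses_bl_rouge(hyps, refs)                # prints Moses BLEU plus ROUGE-1/2/L P, R, F
print('BLEU only:', cal_bleu(hyps, refs))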
def predict(self, sess, data_name):
    pred_losses = []
    preds_all = np.zeros(self.max_decoder_len)
    for step, batch in enumerate(self.getBatches(self.data_list[data_name])):
        feed = self.create_feed_dict(batch, drop=1.0, forward_only=True)
        loss, preds = sess.run([self.loss_op, self.preds], feed_dict=feed)
        pred_losses.append(loss)
        preds_all = np.row_stack((preds_all, preds))
    # Drop the all-zeros seed row before converting ids back to words.
    preds_ids = np.delete(preds_all, 0, 0)
    preds_list = self.get_words_from_ids(preds_ids)[0:len(self.data_list[data_name])]
    labels_list = self.get_targs_list(self.data[data_name]['dec_targ'])
    bl = bleu.moses_multi_bleu(preds_list, labels_list)
    return np.mean(pred_losses), bl, preds_list, labels_list
def evaluate(dataset_f, predictions_f, all_metrics=False, save_dir=""):
    with open(dataset_f) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
    with open(predictions_f) as prediction_file:
        predictions = json.load(prediction_file)
    gt = []
    pred = []
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            if str(article['title']) not in predictions:
                # needs a lookup in case of dev-v1.1.json
                continue
            for qa in paragraph['qas']:
                total += 1
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                if str(qa['id']) not in predictions:
                    prediction = ""
                else:
                    prediction = predictions[str(qa['id'])]
                if prediction == "":
                    prediction = 'n_a'
                gt.append(ground_truths[0])
                pred.append(prediction)
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    if all_metrics:
        rouge_dict = rouge(pred, gt)
        file_path = os.path.join(save_dir, 'results.txt')
        f = open(file_path, 'w')
        for key in rouge_dict:
            print("%s\t%f" % (key, rouge_dict[key]), file=f)
        bleu_score = moses_multi_bleu(pred, gt)
        print("%s\t%f" % ('bleu', bleu_score), file=f)
        print("%s\t%f" % ('f1', f1), file=f)
        print("%s\t%f" % ('exact_match', exact_match), file=f)
        f.close()
    return exact_match, f1
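# Illustrative call, assuming SQuAD-style inputs: `dataset_f` is a JSON file with a
# top-level 'data' list of articles, and `predictions_f` is a JSON dict mapping
# question ids (and article titles, for the lookup above) to predicted answer strings.
# The file names below are hypothetical.
em, f1 = evaluate('dev-v1.1.json', 'predictions.json', all_metrics=True, save_dir='.')
print('exact_match: %.2f  f1: %.2f' % (em, f1))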
def run_epoch(self, sess, epoch):
    t0 = time.time()
    losses = []
    preds_all = np.zeros(self.max_decoder_len)
    for step, batch in enumerate(self.getBatches(self.data_list['train'])):
        feed = self.create_feed_dict(batch, drop=self.params.dropout, forward_only=False)
        loss, preds, _ = sess.run([self.loss_op, self.preds, self.train_op],
                                  feed_dict=feed)
        losses.append(loss)
        preds_all = np.row_stack((preds_all, preds))
        self.logger.info('Epoch:{} \t Step:{} \t Batch Loss: {} \t Avg loss: {}'.format(
            epoch + 1, step, loss, np.mean(losses)))
    # Drop the all-zeros seed row before converting ids back to words.
    preds_ids = np.delete(preds_all, 0, 0)
    preds_list = self.get_words_from_ids(preds_ids)[0:len(self.data_list['train'])]
    labels_list = self.get_targs_list(self.data['train']['dec_targ'])
    bl = bleu.moses_multi_bleu(preds_list, labels_list)
    self.logger.info('Train Loss:{}, Train BLEU: {}'.format(np.mean(losses), bl))
    t1 = time.time()
    self.logger.info('Time to run an epoch: %i seconds', t1 - t0)
    return np.mean(losses), bl, preds_list, labels_list