Example 1
    def get_metrics(self, ref_dir, dec_dir):
        reference = []
        decoded = []

        for i, j in zip(sorted(glob.glob(dec_dir + '/' + '*.txt')),
                        sorted(glob.glob(ref_dir + '/' + '*.txt'))):
            ref_tex = ''
            dec_tex = ''

            # Accumulate the decoded (hypothesis) text, one line at a time.
            for k in open(i).readlines():
                dec_tex = dec_tex + k.strip()
                if len(dec_tex) == 0:
                    dec_tex = ' '

            # Accumulate the reference text, keeping the original line breaks.
            for l in open(j).readlines():
                ref_tex = ref_tex + l

            reference.append(ref_tex)
            decoded.append(dec_tex)

        if len(reference) != len(decoded):
            raise ValueError(
                "Hypotheses and References don't have equal lengths")

        rouge_dict = rouge.rouge(decoded, reference)
        file_path = os.path.join(self._decode_dir, 'results.txt')
        f = open(file_path, 'w')
        for key in rouge_dict:
            print("%s\t%f" % (key, rouge_dict[key]), file=f)
        bleu_score = bleu.moses_multi_bleu(decoded, reference)
        print("%s\t%f" % ('bleu', bleu_score), file=f)
        tf.logging.info("BLEU, ROUGE values saved to results.txt")
Example 2
def cal_bleu(infer, ref):

    while True:
        try:
            bleu_score = bleu.moses_multi_bleu(infer, ref)
            return bleu_score
        except FileNotFoundError:
            print("Failed to compute bleu_score. Sleeping for %i secs..." % 3)
            time.sleep(3)
Example 3
def moses_bl_rouge(p, l):
    # p: predicted/hypothesis sentences, l: reference (label) sentences.
    bl = bleu.moses_multi_bleu(p, l)
    x = rouge.rouge(p, l)
    print(
        'Moses BLEU: %f\nROUGE1-F: %f\nROUGE1-P: %f\nROUGE1-R: %f\nROUGE2-F: %f\nROUGE2-P: %f\nROUGE2-R: %f\nROUGEL-F: %f\nROUGEL-P: %f\nROUGEL-R: %f'
        %
        (bl, x['rouge_1/f_score'], x['rouge_1/p_score'], x['rouge_1/r_score'],
         x['rouge_2/f_score'], x['rouge_2/p_score'], x['rouge_2/r_score'],
         x['rouge_l/f_score'], x['rouge_l/p_score'], x['rouge_l/r_score']))
Example 4
    def predict(self, sess, data_name):
        pred_losses = []
        preds_all = np.zeros(self.max_decoder_len)
        for step, batch in enumerate(self.getBatches(self.data_list[data_name])):
            feed = self.create_feed_dict(batch, drop=1.0, forward_only=True)
            loss, preds = sess.run([self.loss_op, self.preds], feed_dict=feed)
            pred_losses.append(loss)
            preds_all = np.row_stack((preds_all, preds))

        # Drop the all-zeros seed row, then convert id sequences back to words.
        preds_ids = np.delete(preds_all, 0, 0)
        preds_list = self.get_words_from_ids(preds_ids)[0:len(self.data_list[data_name])]
        labels_list = self.get_targs_list(self.data[data_name]['dec_targ'])
        bl = bleu.moses_multi_bleu(preds_list, labels_list)

        return np.mean(pred_losses), bl, preds_list, labels_list
Example 5
def evaluate(dataset_f, predictions_f, all_metrics=False, save_dir=""):
    with open(dataset_f) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
    with open(predictions_f) as prediction_file:
        predictions = json.load(prediction_file)
    gt = []
    pred = []
    f1 = exact_match = total = count = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            # Needs a lookup in case of dev-v1.1.json.
            if str(article['title']) not in predictions:
                continue

            for qa in paragraph['qas']:
                total += 1

                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                if str(qa['id']) not in predictions:
                    prediction = ""
                else:
                    prediction = predictions[str(qa['id'])]
                if prediction == "":
                    prediction = 'n_a'
                gt.append(ground_truths[0])
                pred.append(prediction)
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(f1_score, prediction,
                                                    ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    if all_metrics:
        rouge_dict = rouge(pred, gt)
        file_path = os.path.join(save_dir, 'results.txt')
        f = open(file_path, 'w')
        for key in rouge_dict:
            print("%s\t%f" % (key, rouge_dict[key]), file=f)
        bleu_score = moses_multi_bleu(pred, gt)
        print("%s\t%f" % ('bleu', bleu_score), file=f)
        print("%s\t%f" % ('f1', f1), file=f)
        print("%s\t%f" % ('exact_match', exact_match), file=f)

    return exact_match, f1
Example 6
    def run_epoch(self, sess, epoch):
        t0 = time.time()
        losses = []
        preds_all = np.zeros(self.max_decoder_len)
        for step, batch in enumerate(self.getBatches(self.data_list['train'])):
            feed = self.create_feed_dict(batch, drop=self.params.dropout, forward_only=False)
            loss, preds, _ = sess.run([self.loss_op, self.preds, self.train_op], feed_dict=feed)
            losses.append(loss)
            preds_all = np.row_stack((preds_all, preds))
            self.logger.info('Epoch:{} \t Step:{} \t Batch Loss: {} \t Avg loss: {}'.format(epoch + 1, step, loss, np.mean(losses)))

        # Drop the all-zeros seed row, then convert id sequences back to words.
        preds_ids = np.delete(preds_all, 0, 0)
        preds_list = self.get_words_from_ids(preds_ids)[0:len(self.data_list['train'])]
        labels_list = self.get_targs_list(self.data['train']['dec_targ'])
        bl = bleu.moses_multi_bleu(preds_list, labels_list)

        self.logger.info('Train Loss: {}, Train BLEU: {}'.format(np.mean(losses), bl))
        t1 = time.time()
        self.logger.info('Time to run an epoch: %i seconds', t1 - t0)
        return np.mean(losses), bl, preds_list, labels_list
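Usage note
All six examples share the same calling convention: bleu.moses_multi_bleu(hypotheses, references) takes two parallel sequences of sentence strings and returns a single corpus-level BLEU score as a float. Below is a minimal sketch of that pattern, assuming the same bleu helper module the examples import; the toy sentences and variable names are illustrative only, not taken from the examples.

import numpy as np
import bleu  # assumed: the same helper module used in the examples above

# Hypothetical toy data; in the examples these come from model decoding.
hypotheses = np.array(["the cat sat on the mat", "a quick brown fox"])
references = np.array(["the cat is on the mat", "the quick brown fox"])

score = bleu.moses_multi_bleu(hypotheses, references)  # float BLEU, typically on a 0-100 scale
print("BLEU: %.2f" % score)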