Example #1
def main():

    references = [[]]
    hypotheses = []

    # Create NLG metrics evaluator
    nlgeval = NLGEval(metrics_to_omit=['SkipThoughtCS', 'GreedyMatchingScore', 'VectorExtremaCosineSimilarity', 'EmbeddingAverageCosineSimilarity'])

    with open('/home/jcardoso/MIMIC/encodedTestCaptionsF.json') as json_file:
        referenceCaptionsDict = json.load(json_file)

    with open('/home/jcardoso/MIMIC/encodedTrainCaptionsF.json') as json_file:
        KBCaptionsDict = json.load(json_file)

    reference_ids = list(referenceCaptionsDict.keys())

    KB_ids = list(KBCaptionsDict.keys())

    for i in tqdm(range(len(referenceCaptionsDict.keys()))):
        references[0].append(unifyCaption(referenceCaptionsDict[reference_ids[i]]))
        hypotheses.append(get_random_report(KB_ids, KBCaptionsDict))

    metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    print(metrics_dict)

    with open("RandomRefs.txt", 'w+') as file:
        for reference in references[0]:
            file.write(reference.strip() + '\n')
    with open("RandomPreds.txt", 'w+') as file:
        for hypothesis in hypotheses:
            file.write(hypothesis.strip() + '\n')

    with open("random_TestResults.txt", "w+") as file:
        for metric in metrics_dict:
            file.write(metric + ":" + str(metrics_dict[metric]) + "\n")
Example #2
    def run_metrics(self, output, refer_dataset):
        refer = refer_dataset.refer
        hypothesis = []
        references = []

        mp1 = 0.0
        mp2 = 0.0
        mean_objects = 0.0
        total = 0.0

        for row in output:
            ref_id = int(row['refID'])
            gen_sentence = row['gen_sentence']
            hypothesis.append(gen_sentence)
            references.append(
                [s['sent'] for s in refer.Refs[ref_id]['sentences']])

            total += 1.0
            mean_objects += row['n_objects']
            mp1 += row['p@1']
            mp2 += row['p@2']

        references = list(zip(*references))
        nlgeval = NLGEval(no_skipthoughts=True,
                          no_glove=True,
                          metrics_to_omit=['METEOR'])  # loads the models
        metrics_dict = nlgeval.compute_metrics(references, hypothesis)

        metrics_dict['p@1'] = mp1 / total
        metrics_dict['p@2'] = mp2 / total

        return metrics_dict
Example #3
def evaluate_trans(thenet, references, vali_data, vali_raw_data):
    hypothesis = []
    score_total = 0.
    num_word_total = 0
    for batch in vali_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src = thenet.translate(
            batch, vali_raw_data)
        score_total += sum([score[0] for score in pred_scores])
        num_word_total += sum(len(x) for x in batch.tgt[1:])
        hypothesis.extend([' '.join(x[0]) for x in pred_batch])
    ppl = math.exp(-score_total / num_word_total)
    bleu_score = bleu.corpus_bleu(
        hypothesis, references)[0][0]  #[final, n-gram1,n-gram2,...], [bp, ...]
    nlg_ref = [[x[0] for x in references if x is not None]]

    nlg_eval = NLGEval()
    save_txt('/fl/txtfile/rnn_h1.txt', hypothesis)
    metrics_eval = nlg_eval.compute_metrics(nlg_ref, hypothesis)
    print(metrics_eval)
    print('BLEU: {}'.format(bleu_score))
    # The PPL for training/validation is computed in Statisci() in onmt/Trainer.py; the PPL during translating is computed in the reprot_score function in translate.py
    print('PPL: {}'.format(ppl))

    return torch.FloatTensor([ppl, bleu_score,
                              0.0])  # the last reserved for rank number
Example #4
class NLGMetrics(BaseMetric):
    def __init__(self, *args, **kwargs):
        self.nlgeval = NLGEval(no_glove=True, no_skipthoughts=True)

    @staticmethod
    def prepare_sent(tokens: List[str]) -> str:
        return recover_desc(tokens)

    def eval(self, hypos: Iterable[List[List[str]]], references: Iterable[List[str]],
             src_references: Iterable[List[str]], *args, **kwargs) -> dict:
        # List[str]
        first_hypos = [self.prepare_sent(hypo_list[0]) for hypo_list in hypos]
        src_ref_strs = [self.prepare_sent(src_ref) for src_ref in src_references]
        # List[List[str]]
        references_lists = [[self.prepare_sent(ref) for ref in references]]
        # distinct
        metrics_dict = self.nlgeval.compute_metrics(references_lists, first_hypos)
        # relative improve
        src_metrics_dict = self.nlgeval.compute_metrics(references_lists, src_ref_strs)
        relative_metrics_dict = OrderedDict({})
        for key in metrics_dict:
            relative_metrics_dict[key] = (metrics_dict[key] - src_metrics_dict[key]) / src_metrics_dict[key]
        return {
            'Bleu_4': metrics_dict['Bleu_4'],
            'METEOR': metrics_dict['METEOR']
        }
Example #5
def get_evalutation_scores(hypothesis, refrences, testing_mode=False):
    gleu_scores = {"Gleu_1": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=1),
                   "Gleu_2": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=2),
                   "Gleu_3": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=3),
                   "Gleu_4": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=4)
                   }

    if testing_mode:
        for i in range(len(hypothesis)):
            hypothesis[i] = ' '.join(hypothesis[i])

        refs = [[]]
        for i in range(len(refrences)):
            refs[0].append(' '.join(refrences[i][0]))
            if refs[0][-1] == "":
                refs[0][-1] = "no"
        refrences = refs

        n = NLGEval()
        scores = n.compute_metrics(ref_list=refrences, hyp_list=hypothesis)
    else:
        scores = {"Bleu_1": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1.0]),
                  "Bleu_2": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1. / 2, 1. / 2]),
                  "Bleu_3": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1. / 3, 1. / 3, 1. / 3]),
                  "Bleu_4": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1. / 4, 1. / 4, 1. / 4, 1. / 4])}

    for key, val in gleu_scores.items():
        scores[key] = val
    return scores
Example #6
def get_all_nlgeval_metrics(complex_sentence, simple_sentence):
    if 'NLGEVAL' not in globals():
        global NLGEVAL
        print('Loading NLGEval models...')
        # Change False to True if you want to use skipthought or glove
        NLGEVAL = NLGEval(no_skipthoughts=True, no_glove=True)
        print('Done.')
    return NLGEVAL.compute_individual_metrics([complex_sentence], simple_sentence)
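A brief usage sketch for the helper above (the sentence pair is invented for illustration); compute_individual_metrics takes a list of reference strings and a single hypothesis string:

scores = get_all_nlgeval_metrics(
    'The quick brown fox jumps over the lazy dog.',  # complex (reference) sentence
    'A quick brown fox jumps over a lazy dog.')      # simplified (hypothesis) sentence
print(scores['Bleu_4'], scores['ROUGE_L'])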
Example #7
 def __init__(self):
     self.eval = NLGEval(no_skipthoughts=True,
                         no_glove=True,
                         metrics_to_omit=[
                             'EmbeddingAverageCosineSimilairty',
                             'VectorExtremaCosineSimilarity',
                             'GreedyMatchingScore'
                         ])
Example #8
    def __init__(self,hparams,glove_comparer):

      
        super(Metrics_Calculator, self).__init__()
        self.nlg_eval = NLGEval(metrics_to_omit=['EmbeddingAverageCosineSimilairty', 'EmbeddingAverageCosineSimilarity','GreedyMatchingScore','SkipThoughtCS','VectorExtremaCosineSimilarity'])
        self.list_dict_track  = {"data":[]}
        self.hparams = hparams
        self.glove_comparer = glove_comparer
Example #9
 def __init__(self, model, data, lr=0.001, to_record=None, patience=3,
              metric='Bleu_1'):
     self.model = model
     self.data = data
     self.criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
     self.optimizer = optim.Adam(model.parameters(), lr=lr)
     self.stats = Stats(to_record)
     self.evaluator = NLGEval(no_skipthoughts=True, no_glove=True)
     self.earlystopper = EarlyStopper(patience=patience, metric=metric)
Example #10
def evaluate(hypothesis,
             references,
             no_skipthoughts=True,
             no_glove=True,
             metrics_to_omit=['METEOR']):
    nlgeval = NLGEval(no_skipthoughts=no_skipthoughts,
                      no_glove=no_glove,
                      metrics_to_omit=metrics_to_omit)
    return nlgeval.compute_metrics(references, hypothesis)
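A hypothetical call to the wrapper above (sentences are made up): nlg-eval expects the references as a list of reference lists, each inner list aligned element-wise with the hypotheses.

hyps = ['a cat sits on the mat', 'the dog runs in the park']
refs = [['a cat is sitting on the mat', 'a dog is running in the park']]
print(evaluate(hyps, refs))  # Bleu_1..4, ROUGE_L, CIDEr (METEOR omitted by default)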
Example #11
 def __init__(self,
              config,
              metric_names=[
                  "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR",
                  "ROUGE_L", "CIDEr"
              ]):
     super().__init__(config, metric_names)
     # please install NLGEval from `https://github.com/Maluuba/nlg-eval`
     from nlgeval import NLGEval
     self.nlg = NLGEval()
Example #12
def eval_using_nlgeval(ref_list, pred_list, multiple):
    if VERBOSE:
        print('Loading the NLG eval model...')
    nlge = NLGEval(metrics_to_omit=['METEOR', 'CIDEr'],
                   no_skipthoughts=True,
                   no_glove=True)
    # nlge = NLGEval(metrics_to_omit=['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'CIDEr', 'ROUGE_L'], no_skipthoughts=True, no_glove=True)
    if VERBOSE:
        print('\nComputing Scores...')
    return nlge.compute_metrics(ref_list, pred_list, multiple=multiple)
Example #13
 def com_score(self, ref, pre):
     # for gold, hype in zip(ref, pre):
     #     temp = []
     #     temp.append(gold)
     #     metrics_dict = compute_individual_metrics(temp, hype)
     #     break
     r_list = []
     r_list.append(ref)
     nlgeval = NLGEval()
     metrics_dict = nlgeval.compute_metrics(r_list, pre)
     return metrics_dict
Example #14
 def meteor(self):
     """ Computes METEOR using the NLGEval library
         Link: https://github.com/Maluuba/nlg-eval"""
     metrics_to_omit = {
         "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "ROUGE_L", "CIDEr"
     }
     nlgeval = NLGEval(no_skipthoughts=True,
                       no_glove=True,
                       metrics_to_omit=metrics_to_omit)
     self.metrics.update(
         nlgeval.compute_metrics([self.target], self.hypothesis))
Example #15
def test_oo_api():
    with open("examples/hyp.txt") as f:
        hyp = f.readlines()
    with open("examples/ref1.txt") as f:
        ref1 = f.readlines()
    with open("examples/ref2.txt") as f:
        ref2 = f.readlines()

    nlge = NLGEval()
    res = nlge.compute_individual_metrics([ref1[0]] + [ref2[0]], hyp[0])
    res = nlge.compute_individual_metrics([ref1[1]] + [ref2[1]], hyp[1])
Example #16
def calculate_rouge(prediction, ground_truth, tokenizer):
    nlgeval = NLGEval()
    references = []
    hypotheses = []
    for x, y in zip(ground_truth, prediction):
        x = tokenizer.decode(x, skip_special_tokens=True)
        y = tokenizer.decode(y, skip_special_tokens=True)
        references.append([x])
        hypotheses.append(y)

    metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    return metrics_dict['ROUGE_L'], references, hypotheses
Example #17
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted
        # nlg-eval works on sentence strings, not token lists
        actual.append(desc_list)
        predicted.append(yhat)
    # calculate all metrics; transpose the references so there is one list per
    # reference index, each aligned element-wise with the hypotheses
    references = [list(refs) for refs in zip(*actual)]
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(references, predicted)
    return metrics_dict
Example #18
def evaluate(loader, lstmDec, linNet, VocabData):
	Index2Word = dict([val,key] for key,val in VocabData['word_dict'].items()) # dictionary from index to word
	# if torch.cuda.is_available():
	lstmDec = lstmDec.to(device).eval()
	linNet = linNet.to(device).eval()  # nn.DataParallel(linNet,device_ids=[0, 1]).to(device)
	nlgeval = NLGEval()

	ld = iter(loader)
	numiters = len(ld)
	qdar = tqdm.tqdm(range(numiters), total=numiters, ascii=True)
	loss_itr_list = []
	
	def linOut2DecIn(global_hidden, box_feat):	# box_feat [8, 4, 4096, 3, 3]
		global_hidden = global_hidden.unsqueeze(0)
		encoder_hidden = (global_hidden,torch.zeros_like(global_hidden).to(device))
		B,M,D,H,W = box_feat.size()
		encoder_outputs = box_feat.permute(0,1,3,4,2).contiguous().view(B,-1,D)
		return encoder_hidden, encoder_outputs

	def lstr(ts,pres=3):
		return str(np.round(ts.data.cpu().numpy(), 3))
	
	with torch.no_grad(): # evaluate mode
		references = [[]]
		hypothesis = []
		for i in qdar:
			# step 1: load data
			batchdata = next(ld)
			box_feats, box_global_feats, numBoxes, box_captions_gt = makeInp(*batchdata)  # box_feats: (numImage,numBoxes,512,7,7) box_global_feats: list, numImage [(512,34,56)]

			references[0] += [" ".join([Index2Word[i] for i in s if Index2Word[i] != '<PAD>']) for s in box_captions_gt.data.cpu().numpy()] #create batch of reference based on indices

			# step 2: data transform by linNet
			box_feat,box_feat_dec, global_hidden = linNet(box_feats, box_global_feats)
			
			# step 3: decode to captions by lstmDec
			encoder_hidden, encoder_outputs = linOut2DecIn(global_hidden,box_feat_dec)
			decoder_outputs, decoder_hidden, ret_dict = lstmDec(encoder_hidden=encoder_hidden, encoder_outputs=encoder_outputs, max_len=int(5*numBoxes)) # box_feat [8, 4, 4096, 3, 3]
			
			# step 4: calculate loss
				# Loss 1: Similarity loss
			lengths = torch.LongTensor(ret_dict['length']).to(device)
			decoder_outputs = torch.stack([decoder_outputs[i] for i in range(len(decoder_outputs))], 1) # decoder_outputs [8, 15, 10878]


			word_indices = decoder_outputs.argmax(2).data.cpu().numpy() #batch_size x seq_len
			hypothesis += [" ".join([Index2Word[i] for i in s if Index2Word[i] != '<PAD>']) for s in word_indices] #create batch of hypothesis based on indices
			if i == 10:
				break
		print(nlgeval.compute_metrics(references, hypothesis))
Example #19
    def test_compute_metrics_omit(self):
        n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilarity'])

        # Individual Metrics
        scores = n.compute_individual_metrics(ref=["this is a test",
                                                   "this is also a test"],
                                              hyp="this is a good test")
        self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
        self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5)
        self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5)
        self.assertEqual(7, len(scores))
Example #20
def evaluateNLG(gen_dials, ref_dialogues):
    hyp_list, ref_list = [], []
    for fname in gen_dials:
        hyp_list.extend(gen_dials[fname])  # list of sentence string
        ref_list.extend([
            s.strip() for s in ref_dialogues[fname]['sys']
        ])  # list of ref_list, each ref_list is a list of sentence string
    ref_lists = [ref_list]  # only put 1 reference

    from nlgeval import NLGEval
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(ref_list=ref_lists,
                                           hyp_list=hyp_list)
    print(metrics_dict)
    return metrics_dict
Example #21
def NLGE_evaluation(encoder,
                    decoder,
                    search_method,
                    word2ix,
                    ix2word,
                    input_seqs,
                    target_seqs,
                    templates=None):
    """
    Function that computes several metrics using the NLG-eval python package (https://github.com/Maluuba/nlg-eval)
    :param encoder: Pytorch model that serves as encoder.
    :param decoder: Pytorch model that serves as decoder.
    :param search_method: Pytorch model used for making searches during inference. (e.g GreedySearch)
    :param word2ix: Python dictionary with tokens as keys and indexes as values.
    :param ix2word: Python dictionary with indexes as keys and tokens as values.
    :param input_seqs: List containing the vectorized question that will be used for testing the model.
    :param target_seqs: List containing the vectorized ground truth answers that will be used for testing the model.
    """
    nlg_eval = NLGEval(metrics_to_omit=[
        'Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr', 'SkipThoughtCS'
    ])
    hypothesis = []
    references = []
    if templates:
        vectorizer = TfidfVectorizer(stop_words='english',
                                     lowercase=True,
                                     strip_accents='ascii')
        template2vec = vectorizer.fit_transform(templates)
    for input_seq, target_seq in tqdm(zip(input_seqs, target_seqs),
                                      total=input_seqs.shape[0]):
        input_seq, input_length, _, _, _ = prepare_data([input_seq],
                                                        [target_seq])
        tokens = search_method(input_seq, input_length, 300, word2ix['_BOS_'])
        tokens = tokens.view(
            1, -1
        )[0] if search_method.__class__.__name__ == "GreedySearchDecoder" else tokens
        answer = ' '.join([
            ix2word[token] for token in tokens.cpu().numpy()
            if token != word2ix['_PAD_']
        ])
        if templates:
            template, score = template_retrieval(answer, templates,
                                                 template2vec, vectorizer)
            if score > 0.75:
                answer = template
        hypothesis.append(answer)
        references.append(' '.join([ix2word[token] for token in target_seq]))
    return nlg_eval.compute_metrics(ref_list=[references], hyp_list=hypothesis)
Example #22
class EvaluateNL():
    def __init__(self):
        self.eval = NLGEval(no_skipthoughts=True,
                            no_glove=True,
                            metrics_to_omit=[
                                'EmbeddingAverageCosineSimilairty',
                                'VectorExtremaCosineSimilarity',
                                'GreedyMatchingScore'
                            ])

    def compute(self, refs, hyps):
        data = []
        for i, ref in enumerate(refs):
            ref = ref.replace('\n', '')
            hyp = hyps[i].replace('\n', '')

            if not ref:
                continue

            scores = self.eval.compute_individual_metrics(ref=[ref], hyp=hyp)
            scores = sorted(scores.items())
            self._metrics = [s[0] for s in scores]

            #data.append([ref, hyp])
            data.append([
                ref, hyp, *[
                    str(float('%0.6f' % (s[1]))).replace('.', ',')
                    for s in scores
                ]
            ])

        return pd.DataFrame(data,
                            columns=['Reference', 'Hypothesis', *self._metrics])
Example #23
def get_nlg_eval():
    if not hasattr(get_nlg_eval, "nlg_eval"):
        print("Loading eval data (first time only)")
        get_nlg_eval.nlg_eval = NLGEval(no_glove=True,
                                        no_skipthoughts=True,
                                        metrics_to_omit=["CIDEr"])
    return get_nlg_eval.nlg_eval
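A small sketch of how the lazy getter above might be used (the sentences are invented); repeated calls return the same cached NLGEval instance, so the model-loading cost is paid only once.

scorer = get_nlg_eval()             # loads the models on the first call only
assert scorer is get_nlg_eval()     # later calls reuse the cached instance
print(scorer.compute_individual_metrics(
    ref=['the report shows no acute findings'],
    hyp='no acute abnormality is seen'))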
Example #24
 def __init__(self,
              model_name: str = "gpt2",
              range_cand: bool = False,
              make_eval: bool = False,
              tokenizer_path: str = "default",
              pretrained_path: str = "default") -> None:
     """
     Possible models: mt5-large, mt5-base, mt5-small, gpt2, gpt3
     :param model_name:
     :param make_filter:
     :param cache_file_path:
     """
     self.logger = logging.getLogger(__name__)
     self.tokenizer_path = tokenizer_path
     self.pretrained_path = pretrained_path
     self.make_eval = make_eval
     self.range_cand = range_cand
     self.device = torch.device("cpu")
     if self.range_cand:
         self.smodel = SentenceTransformer(
             "paraphrase-xlm-r-multilingual-v1")
     if self.make_eval:
         self.ngeval = NLGEval(metrics_to_omit=[
             "EmbeddingAverageCosineSimilairty",
             "CIDEr",
             "METEOR",
             "SkipThoughtCS",
             "VectorExtremaCosineSimilarity",
             "GreedyMatchingScore",
         ])
     self.model_name = model_name
     self._check_model(model_name)
Example #25
class NLGMetric(Metric):
    def __init__(self,
                 config,
                 metric_names=[
                     "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR",
                     "ROUGE_L", "CIDEr"
                 ]):
        super().__init__(config, metric_names)
        # please install NLGEval from `https://github.com/Maluuba/nlg-eval`
        from nlgeval import NLGEval
        self.nlg = NLGEval()

    def compute_metrics(self, outputs, targets, **kwargs):
        return self.nlg.compute_metrics(hyp_list=outputs, ref_list=targets)

    def print_computed_metrics(self, metrics):
        Bleu_1 = metrics["Bleu_1"]
        Bleu_2 = metrics["Bleu_2"]
        Bleu_3 = metrics["Bleu_3"]
        Bleu_4 = metrics["Bleu_4"]
        METEOR = metrics["METEOR"]
        ROUGE_L = metrics["ROUGE_L"]
        CIDEr = metrics["CIDEr"]

        print(
            "Bleu_1: {:.4f} - Bleu_2: {:.4f} - Bleu_3: {:.4f} - Bleu_4: {:.4f} - METEOR: {:.4f} - ROUGE_L: {:.4f} - CIDEr: {:.4f}"
            .format(Bleu_1, Bleu_2, Bleu_3, Bleu_4, METEOR, ROUGE_L, CIDEr))
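The class above simply delegates to nlg-eval's compute_metrics; a minimal standalone sketch of the same call, with invented sentences:

from nlgeval import NLGEval

nlg = NLGEval()  # loads the models
outputs = ['a man is cooking', 'a dog barks']                  # hypotheses
targets = [['a man cooks in a kitchen', 'a dog is barking']]   # one reference set, aligned with outputs
print(nlg.compute_metrics(hyp_list=outputs, ref_list=targets))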
Example #26
def main(args):
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)

    samples = {}
    with open(args.gen_file) as f:
        for line in tqdm(f):
            hypo, refs = line.rstrip().split('\t')
            metrics_dict = nlgeval.compute_individual_metrics(
                refs.split('*#'), hypo)
            samples[(hypo, refs)] = metrics_dict['Bleu_4']

    for hypo, refs in sorted(samples.keys(), key=samples.__getitem__)[:args.num_samples]:
        print('BLEU:', samples[(hypo, refs)])
        print('H:', hypo)
        for r in refs.split('*#'):
            print('R:', r)
        print('---')
Example #27
def get_nlgeval():
    try:
        from nlgeval import NLGEval
    except ModuleNotFoundError:
        print(
            'nlg-eval module not installed. Please install with ',
            'pip install nlg-eval@git+https://github.com/Maluuba/nlg-eval.git')
    print('Loading NLGEval models...')
    return NLGEval(no_skipthoughts=True, no_glove=True)
Example #28
    def __init__(self):
        """
        Loads metrics without extra (slow) models.

        Calculates BLEU-1, BLEU-2, BLEU-3, BLEU-4, ROUGE-L, METEOR, and CIDEr.
        """
        self.nlgeval_metrics = NLGEval(no_overlap=False,
                                       no_glove=True,
                                       no_skipthoughts=True)
Example #29
def test_oo_api():
    with open("examples/hyp.txt") as f:
        hyp = f.readlines()
        hyp = [x.strip() for x in hyp]
    with open("examples/ref1.txt") as f:
        ref1 = f.readlines()
        ref1 = [x.strip() for x in ref1]
    with open("examples/ref2.txt") as f:
        ref2 = f.readlines()
        ref2 = [x.strip() for x in ref2]

    nlge = NLGEval()

    res = nlge.compute_individual_metrics([ref1[0]] + [ref2[0]], hyp[0])
    res = nlge.compute_individual_metrics([ref1[1]] + [ref2[1]], hyp[1])

    hyp_list = hyp
    ref_list = [ref1, ref2]
    res = nlge.compute_metrics(ref_list, hyp_list)
Example #30
def __calculate_scores(result_file, ref_file, block_print=True):
    reference_file = json.load(open(ref_file))
    ref_video_keys = sorted(list(reference_file.keys()))
    ref_text_list = sum(
        [reference_file[item]['sentences'] for item in ref_video_keys], [])

    file_data = json.load(open(result_file))
    hyp_text_list = sum(
        [[i['sentence'].lower() for i in file_data['results'][item]]
         for item in ref_video_keys], [])
    hyp_text_list = [
        '<NONE>' if len(item) == 0 else item for item in hyp_text_list
    ]  # for empty generated result
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)

    result = nlgeval.compute_metrics(hyp_list=hyp_text_list,
                                     ref_list=[ref_text_list])
    metrics = {'Average across tIoUs': result}
    return metrics