Example 1
class NLGMetrics(BaseMetric):
    def __init__(self, *args, **kwargs):
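        # no_glove/no_skipthoughts skip the embedding-based metrics, so only the
        # n-gram metrics (BLEU, METEOR, ROUGE_L, CIDEr) are computed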
        self.nlgeval = NLGEval(no_glove=True, no_skipthoughts=True)

    @staticmethod
    def prepare_sent(tokens: List[str]) -> str:
        return recover_desc(tokens)

    def eval(self, hypos: Iterable[List[List[str]]], references: Iterable[List[str]],
             src_references: Iterable[List[str]], *args, **kwargs) -> dict:
        # List[str]
        first_hypos = [self.prepare_sent(hypo_list[0]) for hypo_list in hypos]
        src_ref_strs = [self.prepare_sent(src_ref) for src_ref in src_references]
        # List[List[str]]
        references_lists = [[self.prepare_sent(ref) for ref in references]]
        # distinct
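        # NLGEval.compute_metrics takes ref_list (a list of reference streams,
        # each aligned with hyp_list) and hyp_list (one string per hypothesis)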
        metrics_dict = self.nlgeval.compute_metrics(references_lists, first_hypos)
        # relative improvement over the source references (computed but not returned)
        src_metrics_dict = self.nlgeval.compute_metrics(references_lists, src_ref_strs)
        relative_metrics_dict = OrderedDict({})
        for key in metrics_dict:
            relative_metrics_dict[key] = (metrics_dict[key] - src_metrics_dict[key]) / src_metrics_dict[key]
        return {
            'Bleu_4': metrics_dict['Bleu_4'],
            'METEOR': metrics_dict['METEOR']
        }
Example 2
def main():

    references = [[]]
    hypotheses = []

    # Create NLG metrics evaluator
    nlgeval = NLGEval(metrics_to_omit=['SkipThoughtCS', 'GreedyMatchingScore', 'VectorExtremaCosineSimilarity', 'EmbeddingAverageCosineSimilarity'])
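    # omitting the embedding-based metrics leaves BLEU 1-4, METEOR, ROUGE_L and CIDEr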

    with open('/home/jcardoso/MIMIC/encodedTestCaptionsF.json') as json_file:
        referenceCaptionsDict = json.load(json_file)

    with open('/home/jcardoso/MIMIC/encodedTrainCaptionsF.json') as json_file:
        KBCaptionsDict = json.load(json_file)

    reference_ids = list(referenceCaptionsDict.keys())

    KB_ids = list(KBCaptionsDict.keys())

    for i in tqdm(range(len(referenceCaptionsDict.keys()))):
        references[0].append(unifyCaption(referenceCaptionsDict[reference_ids[i]]))
        hypotheses.append(get_random_report(KB_ids, KBCaptionsDict))

    metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    print(metrics_dict)

    with open("RandomRefs.txt", 'w+') as file:
        for reference in references[0]:
            file.write(reference.strip() + '\n')
    with open("RandomPreds.txt", 'w+') as file:
        for hypothesis in hypotheses:
            file.write(hypothesis.strip() + '\n')

    with open("random_TestResults.txt", "w+") as file:
        for metric in metrics_dict:
            file.write(metric + ":" + str(metrics_dict[metric]) + "\n")
Example 3
    def run_metrics(self, output, refer_dataset):
        refer = refer_dataset.refer
        hypothesis = []
        references = []

        mp1 = 0.0
        mp2 = 0.0
        mean_objects = 0.0
        total = 0.0

        for row in output:
            ref_id = int(row['refID'])
            gen_sentence = row['gen_sentence']
            hypothesis.append(row['gen_sentence'])
            references.append(
                [s['sent'] for s in refer.Refs[ref_id]['sentences']])

            total += 1.0
            mean_objects += row['n_objects']
            mp1 += row['p@1']
            mp2 += row['p@2']

        references = list(zip(*references))
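        # transpose so each inner list holds the i-th reference of every sample,
        # which is the layout NLGEval.compute_metrics expects (this assumes every
        # sample has the same number of reference sentences)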
        nlgeval = NLGEval(no_skipthoughts=True,
                          no_glove=True,
                          metrics_to_omit=['METEOR'])  # loads the models
        metrics_dict = nlgeval.compute_metrics(references, hypothesis)

        metrics_dict['p@1'] = mp1 / total
        metrics_dict['p@2'] = mp2 / total

        return metrics_dict
Example 4
def evaluate_trans(thenet, references, vali_data, vali_raw_data):
    hypothesis = []
    score_total = 0.
    num_word_total = 0
    for batch in vali_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src = thenet.translate(
            batch, vali_raw_data)
        score_total += sum([score[0] for score in pred_scores])
        num_word_total += sum(len(x) for x in batch.tgt[1:])
        hypothesis.extend([' '.join(x[0]) for x in pred_batch])
    ppl = math.exp(-score_total / num_word_total)
    bleu_score = bleu.corpus_bleu(
        hypothesis, references)[0][0]  #[final, n-gram1,n-gram2,...], [bp, ...]
    nlg_ref = [[x[0] for x in references if x is not None]]
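    # keep only the first reference of each example and wrap it in a single
    # reference stream, the format expected by NLGEval.compute_metrics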

    nlg_eval = NLGEval()
    save_txt('/fl/txtfile/rnn_h1.txt', hypothesis)
    metrics_eval = nlg_eval.compute_metrics(nlg_ref, hypothesis)
    print(metrics_eval)
    print('BLEU: {}'.format(bleu_score))
    # the ppl for training/validation is computed in Statisci() in onmt/Trainer.py; the ppl for translating is computed in the reprot_score function in translate.py
    print('PPL: {}'.format(ppl))

    return torch.FloatTensor([ppl, bleu_score,
                              0.0])  # the last reserved for rank number
Example 5
class NLGMetric(Metric):
    def __init__(self,
                 config,
                 metric_names=[
                     "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR",
                     "ROUGE_L", "CIDEr"
                 ]):
        super().__init__(config, metric_names)
        # please install NLGEval from `https://github.com/Maluuba/nlg-eval`
        from nlgeval import NLGEval
        self.nlg = NLGEval()
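        # NLGEval() with default arguments also loads the slow embedding-based
        # metrics, even though only the metrics in metric_names are printed below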

    def compute_metrics(self, outputs, targets, **kwargs):
        return self.nlg.compute_metrics(hyp_list=outputs, ref_list=targets)

    def print_computed_metrics(self, metrics):
        Bleu_1 = metrics["Bleu_1"]
        Bleu_2 = metrics["Bleu_2"]
        Bleu_3 = metrics["Bleu_3"]
        Bleu_4 = metrics["Bleu_4"]
        METEOR = metrics["METEOR"]
        ROUGE_L = metrics["ROUGE_L"]
        CIDEr = metrics["CIDEr"]

        print(
            "Bleu_1: {:.4f} - Bleu_2: {:.4f} - Bleu_3: {:.4f} - Bleu_4: {:.4f} - METEOR: {:.4f} - ROUGE_L: {:.4f} - CIDEr: {:.4f}"
            .format(Bleu_1, Bleu_2, Bleu_3, Bleu_4, METEOR, ROUGE_L, CIDEr))
Example 6
def get_evalutation_scores(hypothesis, refrences, testing_mode=False):
    gleu_scores = {"Gleu_1": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=1),
                   "Gleu_2": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=2),
                   "Gleu_3": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=3),
                   "Gleu_4": gleu.corpus_gleu(refrences, hypothesis, min_len=1, max_len=4)
                   }

    if testing_mode:
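        # nlg-eval expects detokenized strings, so join the token lists back into
        # sentences; empty references get a placeholder so the scorers never see
        # an empty string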
        for i in range(len(hypothesis)):
            hypothesis[i] = ' '.join(hypothesis[i])

        refs = [[]]
        for i in range(len(refrences)):
            refs[0].append(' '.join(refrences[i][0]))
            if refs[0][-1] == "":
                refs[0][-1] = "no"
        refrences = refs

        n = NLGEval()
        scores = n.compute_metrics(ref_list=refrences, hyp_list=hypothesis)
    else:
        scores = {"Bleu_1": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1.0]),
                  "Bleu_2": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1. / 2, 1. / 2]),
                  "Bleu_3": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1. / 3, 1. / 3, 1. / 3]),
                  "Bleu_4": bleu_score.corpus_bleu(refrences, hypothesis, weights=[1. / 4, 1. / 4, 1. / 4, 1. / 4])}

    for key, val in gleu_scores.items():
        scores[key] = val
    return scores
Example 7
def evaluate(hypothesis,
             references,
             no_skipthoughts=True,
             no_glove=True,
             metrics_to_omit=['METEOR']):
    nlgeval = NLGEval(no_skipthoughts=no_skipthoughts,
                      no_glove=no_glove,
                      metrics_to_omit=metrics_to_omit)
    return nlgeval.compute_metrics(references, hypothesis)
Example 8
def eval_using_nlgeval(ref_list, pred_list, multiple):
    if VERBOSE:
        print('Loading the NLG eval model...')
    nlge = NLGEval(metrics_to_omit=['METEOR', 'CIDEr'],
                   no_skipthoughts=True,
                   no_glove=True)
    # nlge = NLGEval(metrics_to_omit=['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'CIDEr', 'ROUGE_L'], no_skipthoughts=True, no_glove=True)
    if VERBOSE:
        print('\nComputing Scores...')
    return nlge.compute_metrics(ref_list, pred_list, multiple=multiple)
Example 9
    def com_score(self, ref, pre):
        # for gold, hype in zip(ref, pre):
        #     temp = []
        #     temp.append(gold)
        #     metrics_dict = compute_individual_metrics(temp, hype)
        #     break
        # wrap `ref` as a single reference stream for compute_metrics
        r_list = []
        r_list.append(ref)
        nlgeval = NLGEval()
        metrics_dict = nlgeval.compute_metrics(r_list, pre)
        return metrics_dict
Example 10
    def meteor(self):
        """ Computes METEOR using the NLGEval library
            Link: https://github.com/Maluuba/nlg-eval"""
        metrics_to_omit = {
            "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "ROUGE_L", "CIDEr"
        }
        nlgeval = NLGEval(no_skipthoughts=True,
                          no_glove=True,
                          metrics_to_omit=metrics_to_omit)
        self.metrics.update(
            nlgeval.compute_metrics([self.target], self.hypothesis))
Example 11
def calculate_rouge(prediction, ground_truth, tokenizer):
    nlgeval = NLGEval()
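    # NLGEval() loads every metric (including the slow embedding-based ones)
    # even though only ROUGE_L is returned below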
    references = []
    hypotheses = []
    for x, y in zip(ground_truth, prediction):
        x = tokenizer.decode(x, skip_special_tokens=True)
        y = tokenizer.decode(y, skip_special_tokens=True)
        references.append([x])
        hypotheses.append(y)

    metrics_dict = nlgeval.compute_metrics(references, hypotheses)
    return metrics_dict['ROUGE_L'], references, hypotheses
Example 12
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    actual, predicted = list(), list()
    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        # store actual and predicted
        references = [d.split() for d in desc_list]
        actual.append(references)
        predicted.append(yhat.split())
    # calculate all metrics
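    # NOTE: NLGEval.compute_metrics expects detokenized sentence strings, with
    # references grouped per reference index, so the token lists collected above
    # would likely need to be joined and transposed before this call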
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(actual, predicted)
Example 13
def evaluate(loader, lstmDec, linNet, VocabData):
	Index2Word = dict([val,key] for key,val in VocabData['word_dict'].items()) # dictionary from index to word
	# if torch.cuda.is_available():
	lstmDec = lstmDec.to(device).eval()
	linNet = linNet.to(device).eval()  # nn.DataParallel(linNet,device_ids=[0, 1]).to(device)
	nlgeval = NLGEval()

	ld = iter(loader)
	numiters = len(ld)
	qdar = tqdm.tqdm(range(numiters), total=numiters, ascii=True)
	loss_itr_list = []
	
	def linOut2DecIn(global_hidden, box_feat):	# box_feat [8, 4, 4096, 3, 3]
		global_hidden = global_hidden.unsqueeze(0)
		encoder_hidden = (global_hidden,torch.zeros_like(global_hidden).to(device))
		B,M,D,H,W = box_feat.size()
		encoder_outputs = box_feat.permute(0,1,3,4,2).contiguous().view(B,-1,D)
		return encoder_hidden, encoder_outputs

	def lstr(ts,pres=3):
		return str(np.round(ts.data.cpu().numpy(), 3))
	
	with torch.no_grad(): # evaluate mode
		references = [[]]
		hypothesis = []
		for i in qdar:
			# step 1: load data
			batchdata = next(ld)
			box_feats, box_global_feats, numBoxes, box_captions_gt = makeInp(*batchdata)  # box_feats: (numImage,numBoxes,512,7,7) box_global_feats: list, numImage [(512,34,56)]

			references[0] += [" ".join([Index2Word[i] for i in s if Index2Word[i] != '<PAD>']) for s in box_captions_gt.data.cpu().numpy()] #create batch of reference based on indices

			# step 2: data transform by linNet
			box_feat,box_feat_dec, global_hidden = linNet(box_feats, box_global_feats)
			
			# step 3: decode to captions by lstmDec
			encoder_hidden, encoder_outputs = linOut2DecIn(global_hidden,box_feat_dec)
			decoder_outputs, decoder_hidden, ret_dict = lstmDec(encoder_hidden=encoder_hidden, encoder_outputs=encoder_outputs, max_len=int(5*numBoxes)) # box_feat [8, 4, 4096, 3, 3]
			
			# step 4: calculate loss
				# Loss 1: Similarity loss
			lengths = torch.LongTensor(ret_dict['length']).to(device)
			decoder_outputs = torch.stack([decoder_outputs[i] for i in range(len(decoder_outputs))], 1) # decoder_outputs [8, 15, 10878]


			word_indices = decoder_outputs.argmax(2).data.cpu().numpy() #batch_size x seq_len
			hypothesis += [" ".join([Index2Word[i] for i in s if Index2Word[i] != '<PAD>']) for s in word_indices] #create batch of hypothesis based on indices
			if i == 10:
				break
		print(nlgeval.compute_metrics(references, hypothesis))
Example 14
def evaluateNLG(gen_dials, ref_dialogues):
    hyp_list, ref_list = [], []
    for fname in gen_dials:
        hyp_list.extend(gen_dials[fname])  # list of sentence string
        ref_list.extend([
            s.strip() for s in ref_dialogues[fname]['sys']
        ])  # list of ref_list, each ref_list is a list of sentence string
    ref_lists = [ref_list]  # only put 1 reference

    from nlgeval import NLGEval
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(ref_list=ref_lists,
                                           hyp_list=hyp_list)
    print(metrics_dict)
    return metrics_dict
Example 15
def NLGE_evaluation(encoder,
                    decoder,
                    search_method,
                    word2ix,
                    ix2word,
                    input_seqs,
                    target_seqs,
                    templates=None):
    """
    Function that computes several metrics using the NLG-eval python package (https://github.com/Maluuba/nlg-eval)
    :param encoder: Pytorch model that serves as encoder.
    :param decoder: Pytorch model that serves as decoder.
    :param search_method: Pytorch model used for making searches during inference. (e.g GreedySearch)
    :param word2ix: Python dictionary with tokens as keys and indexes as values.
    :param ix2word: Python dictionary with indexes as keys and tokens as values.
    :param input_seqs: List containing the vectorized question that will be used for testing the model.
    :param target_seqs: List containing the vectorized ground truth answers that will be used for testing the model.
    """
    nlg_eval = NLGEval(metrics_to_omit=[
        'Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr', 'SkipThoughtCS'
    ])
    hypothesis = []
    references = []
    if templates:
        vectorizer = TfidfVectorizer(stop_words='english',
                                     lowercase=True,
                                     strip_accents='ascii')
        template2vec = vectorizer.fit_transform(templates)
    for input_seq, target_seq in tqdm(zip(input_seqs, target_seqs),
                                      total=input_seqs.shape[0]):
        input_seq, input_length, _, _, _ = prepare_data([input_seq],
                                                        [target_seq])
        tokens = search_method(input_seq, input_length, 300, word2ix['_BOS_'])
        tokens = tokens.view(
            1, -1
        )[0] if search_method.__class__.__name__ == "GreedySearchDecoder" else tokens
        answer = ' '.join([
            ix2word[token] for token in tokens.cpu().numpy()
            if token != word2ix['_PAD_']
        ])
        if templates:
            template, score = template_retrieval(answer, templates,
                                                 template2vec, vectorizer)
            if score > 0.75:
                answer = template
        hypothesis.append(answer)
        references.append(' '.join([ix2word[token] for token in target_seq]))
    return nlg_eval.compute_metrics(ref_list=[references], hyp_list=hypothesis)
Example 16
def test_oo_api():
    with open("examples/hyp.txt") as f:
        hyp = f.readlines()
        hyp = [x.strip() for x in hyp]
    with open("examples/ref1.txt") as f:
        ref1 = f.readlines()
        ref1 = [x.strip() for x in ref1]
    with open("examples/ref2.txt") as f:
        ref2 = f.readlines()
        ref2 = [x.strip() for x in ref2]

    nlge = NLGEval()

    res = nlge.compute_individual_metrics([ref1[0]] + [ref2[0]], hyp[0])
    res = nlge.compute_individual_metrics([ref1[1]] + [ref2[1]], hyp[1])

    hyp_list = hyp
    ref_list = [ref1, ref2]
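    # two reference streams: ref1[i] and ref2[i] are both references for hyp[i]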
    res = nlge.compute_metrics(ref_list, hyp_list)
Example 17
def __calculate_scores(result_file, ref_file, block_print=True):
    reference_file = json.load(open(ref_file))
    ref_video_keys = sorted(list(reference_file.keys()))
    ref_text_list = sum(
        [reference_file[item]['sentences'] for item in ref_video_keys], [])

    file_data = json.load(open(result_file))
    hyp_text_list = sum(
        [[i['sentence'].lower() for i in file_data['results'][item]]
         for item in ref_video_keys], [])
    hyp_text_list = [
        '<NONE>' if len(item) == 0 else item for item in hyp_text_list
    ]  # for empty generated result
    nlgeval = NLGEval(no_skipthoughts=True, no_glove=True)

    # hypotheses go in hyp_list; the references are wrapped in a single reference stream
    result = nlgeval.compute_metrics(hyp_list=hyp_text_list,
                                     ref_list=[ref_text_list])
    metrics = {'Average across tIoUs': result}
    return metrics
Example 18
def evaluateNLGFile(gen_dials_fpath, ref_dialogues_fpath):
    with open(gen_dials_fpath, 'r') as gen, open(ref_dialogues_fpath,
                                                 'r') as ref:
        gen_dials = json.load(gen)
        ref_dialogues = json.load(ref)

    hyp_list, ref_list = [], []
    for fname in gen_dials:
        hyp_list.extend(gen_dials[fname])  # list of sentence string
        ref_list.extend([
            s.strip() for s in ref_dialogues[fname]['sys']
        ])  # list of ref_list, each ref_list is a list of sentence string
    ref_lists = [ref_list]  # only put 1 reference

    from nlgeval import NLGEval
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(ref_list=ref_lists,
                                           hyp_list=hyp_list)
    print(metrics_dict)
    return metrics_dict
Example 19
def main():
    argParser = get_args()

    print(argParser)

    print(argParser.checkpoint)
    if (argParser.checkpoint is not None):
        modelInfo = torch.load(argParser.checkpoint)

    # Load model
    encoder, decoder = setupEncoderDecoder(argParser, modelInfo)

    # Create data loaders
    testLoader, _ = setupDataLoaders(argParser)

    # Load word <-> embeddings matrix index correspondence dictionaries
    idx2word, word2idx = loadWordIndexDicts(argParser)

    # Create NLG metrics evaluator
    nlgeval = NLGEval(metrics_to_omit=[
        'SkipThoughtCS', 'GreedyMatchingScore',
        'VectorExtremaCosineSimilarity', 'EmbeddingAverageCosineSimilarity'
    ])
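    # omitting the embedding/SkipThought metrics leaves BLEU 1-4, METEOR,
    # ROUGE_L and CIDEr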

    vocab_size = decoder.vocab_size

    references, hypotheses = evaluate_beam(argParser, BEAM_SIZE, encoder,
                                           decoder, testLoader, word2idx,
                                           idx2word)

    metrics_dict = nlgeval.compute_metrics(references, hypotheses)

    refs_path, preds_path = save_references_and_predictions(
        references, hypotheses, argParser.model_name, "Beam")

    with open(
            '../Experiments/' + argParser.model_name + "/BeamTestResults.txt",
            "w+") as file:
        for metric in metrics_dict:
            file.write(metric + ":" + str(metrics_dict[metric]) + "\n")
Example 20
def NLGE_evaluation(model, test_questions, test_answers, train_answers):
    """
    Function that computes several metrics using the NLG-eval python package (https://github.com/Maluuba/nlg-eval)
    :param model: sklearn tfidf model to be tested.
    :param test_questions: List containing several questions vectorized.
    :param test_answers: List containing the ground truth answers vectorized.
    :param train_answers: The pool of answers that the model will search over (typically all the training answers the model has seen).
    """
    # Creation of the pool of unique answers.
    unique_ans = np.unique(train_answers)
    possible_ans = [ans for ans in unique_ans]
    # We will not use all the metrics available in the package.
    nlg_eval = NLGEval(metrics_to_omit=['Bleu_3', 'Bleu_4', 'METEOR', 'CIDEr', 'SkipThoughtCS'])
    print ("Evaluating ranking among {} possible answers".format(len(possible_ans)))
    hypothesis = [] # List that will store our answer hypothesis.
    references = [] # List that will contain the reference answers.
    vector_doc = model.vectorizer.transform(possible_ans)
    for i in tqdm(range(len(test_questions))):
        vector_q = model.vectorizer.transform([test_questions[i]])
        result = cosine_similarity(vector_q, vector_doc)[0]
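        # rank all candidate answers by cosine similarity and keep the closest one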
        hypothesis_idx = np.argsort(result, axis=0)[::-1][0]
        hypothesis.append(possible_ans[hypothesis_idx])
        references.append(test_answers[i])
    return nlg_eval.compute_metrics(ref_list=[references], hyp_list=hypothesis)
Example 21
with open(hyp_file, "r") as f:
    hyp_dict = {
        line.strip().split(" ")[0]: " ".join(line.strip().split(" ")[1:])
        for line in f.readlines()
    }

keys = [k for k, v in hyp_dict.items()]
labels = [ref_dict[k] for k, _ in hyp_dict.items()]
decoded_preds = [v for k, v in hyp_dict.items()]

metric = load_metric("bertscore")
result_bert = metric.compute(
    predictions=decoded_preds,
    references=labels,
    lang="en",
)

nlg = NLGEval()  # loads the models
print("Key", "\t", "METEOR", "\t", "ROUGE-L")
for (key, ref, hyp) in zip(keys, labels, decoded_preds):
    metrics_dict = nlg.compute_individual_metrics([ref], hyp)
    print(key, "\t", metrics_dict["METEOR"], "\t", metrics_dict["ROUGE_L"])
refs = [[x] for x in labels]
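# note: `refs` above (one list per sample) is unused; compute_metrics expects
# ref_list as a list of reference streams, hence the single [labels] stream below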
metrics_dict = nlg.compute_metrics(ref_list=[labels], hyp_list=decoded_preds)
metric = load_metric("rouge")
result = metric.compute(predictions=decoded_preds, references=labels)
result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

print(f"RESULT {result['rouge1']} {result['rouge2']} {result['rougeL']} \
    {metrics_dict['METEOR']*100.0} {100*np.mean(result_bert['precision'])}")
Example 22
 def cal_score(self, metric):
     data_score = []
     for task_name, task in self.tasks.items():
         print("Task : " + task_name + " report ")
         if "emf1" in metric:
             em = 0
             total = 0
             f1 = 0
             for pos, predict in enumerate(task['predicted']):
                 em_list = []
                 f1_list = []
                 for target in task['targets'][pos]:
                     if _normalize_answer(
                             str(predict)
                     ) == _normalize_answer(str(target)) and len(
                             _normalize_answer(str(predict))) > 0 or len(
                                 str(predict)) == len(str(target)) == 0:
                         em_score = 1
                         f1_score = 1
                     else:
                         em_score = 0
                         f1_score = _f1_score(str(predict), str(target))
                     em_list.append(em_score)
                     f1_list.append(f1_score)
                 em += max(em_list)
                 f1 += max(f1_list)
                 data_score.append([
                     predict,
                     task['targets'][pos][em_list.index(max(em_list))], {
                         'em': max(em_list),
                         'f1': max(f1_list)
                     }
                 ])
                 total += 1
             result = {
                 "EM": em / (total or not total),
                 "F1": f1 / (total or not total)
             }
             data_score = sorted(data_score,
                                 key=lambda i: i[2]['em'],
                                 reverse=True)
         if "nlg" in metric:
             try:
                 from nlgeval import NLGEval
             except ImportError:
                 print(
                     "nlg-eval package not install, plz install it: pip install git+https://github.com/voidful/nlg-eval.git ; nlg-eval --setup ./nlg-eval-data/"
                 )
                 raise
             nlgeval = NLGEval(no_skipthoughts=True,
                               no_glove=True,
                               metrics_to_omit=["METEOR"])
             targets = task['targets']
             predicted = task['predicted']
             for t, p in zip(targets, predicted):
                 data_score.append([
                     p, t,
                     nlgeval.compute_metrics(ref_list=list(map(
                         list, zip(t))),
                                             hyp_list=[p])
                 ])
             result = nlgeval.compute_metrics(
                 ref_list=list(map(list,
                                   zip(*task['targets']))),  # transpose
                 hyp_list=predicted)
             data_score = sorted(data_score, key=lambda i: i[2]['ROUGE_L'])
         if "clas" in metric:
             from sklearn.metrics import classification_report
             from sklearn.preprocessing import MultiLabelBinarizer
             from sklearn.metrics import precision_recall_fscore_support
             target_key = [
                 t for t in self.target_list[task_name].keys() if len(t) > 0
             ]
             mlb = MultiLabelBinarizer().fit([target_key])
             # remove all blank target
             task['targets'] = [[j for j in sub if len(j) > 0]
                                for sub in task['targets']]
             # modify for tagging result
             if isinstance(task['predicteds'][0][0], list):
                 task['targets'] = sum([[[j] for j in sub]
                                        for sub in task['targets']], [])
                 task['predicteds'] = sum([[[j] for j in sub]
                                           for sub in task['predicted']],
                                          [])
                 if len(task['targets']) != len(task['predicteds']):
                     diff = len(task['targets']) - len(task['predicteds'])
                     task['predicteds'].extend([['']] * diff)
             targets = task['targets']
             predicted = task['predicteds']
             for p, t in zip(predicted, targets):
                 score = dict(
                     zip(["precision", "recall", "fbeta_score", "support"],
                         precision_recall_fscore_support(
                             mlb.transform([t]),
                             mlb.transform([p]),
                             average='weighted')))
                 data_score.append([p, t, score])
             print(mlb.classes_)
             result = classification_report(mlb.transform(targets),
                                            mlb.transform(predicted),
                                            target_names=list(mlb.classes_))
             data_score = sorted(data_score,
                                 key=lambda i: i[2]['fbeta_score'])
         yield (task_name, result, data_score)
Example 23
def main():
    argParser = get_args()

    print(argParser)

    modelInfo = None
    classifierInfo = None

    if (argParser.checkpoint is not None):
        modelInfo = torch.load(argParser.checkpoint)

    if (argParser.use_classifier_encoder) and modelInfo is None:
        classifierInfo = torch.load(argParser.classifier_checkpoint)

    if not os.path.isdir('../Experiments/' + argParser.model_name):
        os.mkdir('../Experiments/' + argParser.model_name)

    trainingEnvironment = TrainingEnvironment(argParser)

    cudnn.benchmark = True

    encoder, decoder = setupEncoderDecoder(argParser, modelInfo,
                                           classifierInfo)

    encoder_optimizer, decoder_optimizer = setupOptimizers(
        encoder, decoder, argParser, modelInfo)

    decoder_scheduler, encoder_scheduler = setupSchedulers(
        encoder_optimizer, decoder_optimizer, argParser)

    criterion = setupCriterion(argParser.loss)

    binary_criterion = nn.BCEWithLogitsLoss()

    trainLoader, valLoader = setupDataLoaders(argParser)

    # Load word <-> embeddings matrix index correspondence dictionaries
    idx2word, word2idx = loadWordIndexDicts(argParser)

    # Create NLG metrics evaluator
    nlgeval = NLGEval(metrics_to_omit=[
        'SkipThoughtCS', 'GreedyMatchingScore',
        'VectorExtremaCosineSimilarity', 'EmbeddingAverageCosineSimilarity'
    ])

    scheduled_sampling_prob = decoder.scheduled_sampling_prob

    for epoch in range(trainingEnvironment.start_epoch,
                       trainingEnvironment.epochs):

        if epoch > 1 and argParser.use_scheduled_sampling and epoch % argParser.scheduled_sampling_decay_epochs == 0:
            scheduled_sampling_prob += argParser.rate_change_scheduled_sampling_prob
            decoder.scheduled_sampling_prob = scheduled_sampling_prob

        if trainingEnvironment.epochs_since_improvement == argParser.early_stop_epoch_threshold:
            break

        train(argParser, encoder, decoder, trainLoader, word2idx, idx2word,
              criterion, encoder_optimizer, decoder_optimizer,
              binary_criterion, epoch)

        #      references, hypotheses = hierarchical_evaluate_beam(argParser, BEAM_SIZE, encoder, decoder, valLoader, word2idx, idx2word)
        references, hypotheses = evaluate_greedy(argParser, encoder, decoder,
                                                 valLoader, word2idx, idx2word)

        encoder_scheduler.step()
        decoder_scheduler.step()

        metrics_dict = nlgeval.compute_metrics(references, hypotheses)
        print(metrics_dict)

        with open('../Experiments/' + argParser.model_name + "/metrics.txt",
                  "a+") as file:
            file.write("Epoch " + str(epoch) + " results:\n")
            for metric in metrics_dict:
                file.write(metric + ":" + str(metrics_dict[metric]) + "\n")
            file.write("------------------------------------------\n")

        recent_bleu4 = metrics_dict['CIDEr']
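        # note: despite the "bleu4" naming, CIDEr is used as the selection metric here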

        #     Check if there was an improvement
        is_best = recent_bleu4 > trainingEnvironment.best_bleu4

        trainingEnvironment.best_bleu4 = max(recent_bleu4,
                                             trainingEnvironment.best_bleu4)

        print("Best BLEU: ", trainingEnvironment.best_bleu4)
        if not is_best:
            trainingEnvironment.epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (trainingEnvironment.epochs_since_improvement, ))
        else:
            trainingEnvironment.epochs_since_improvement = 0


#        recent_bleu4 = 0
#        is_best = True
#        metrics_dict = {}

# Save checkpoint
        save_checkpoint(argParser.model_name, epoch,
                        trainingEnvironment.epochs_since_improvement,
                        encoder.state_dict(), decoder.state_dict(),
                        encoder_optimizer.state_dict(),
                        decoder_optimizer.state_dict(), recent_bleu4, is_best,
                        metrics_dict, trainingEnvironment.best_loss)
Example 24
def eval(eval_filename, vocab_filename, alias2scientific_filename):
    def remove_stopwords(sent, stop_word_set):
        items = sent.split()
        items = [ite for ite in items if ite not in stop_word_set]
        return " ".join(items)

    with open("data/stopwords.txt") as f:
        stopwords = f.read().strip().split()
        stopwords = set(stopwords)

    bleu_nlgeval = NLGEval(metrics_to_omit=[
        "METEOR", "CIDEr", "ROUGE_L", "SkipThoughtCS",
        "EmbeddingAverageCosineSimilairty", "VectorExtremaCosineSimilarity",
        "GreedyMatchingScore"
    ])
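    # with everything else omitted, this evaluator only computes Bleu_1-4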
    rouge_eval = RougeEval()
    disease2x = pandas.read_csv(vocab_filename)
    disease2x = disease2x[disease2x["Is_know"] > 0]
    disease2x = dict(zip(list(disease2x["Word"]), list(disease2x["Is_know"])))
    distinct_eval = DistinctEval(grams=[1, 2])

    with open(eval_filename) as f:
        sessions = json.load(f)

    gths = [[episode["gth"] for episode in session["session"]]
            for session in sessions]
    hyps = [[episode["hyp"] for episode in session["session"]]
            for session in sessions]
    entity_gths = [[
        " ".join([i for i in x.split(" ") if i in disease2x]) for x in y
    ] for y in gths]
    entity_hyps = [[
        " ".join([i for i in x.split(" ") if i in disease2x]) for x in y
    ] for y in hyps]

    def flat(lists):
        tmp = []
        for items in lists:
            tmp += items
        return tmp

    gths = flat(gths)
    hyps = flat(hyps)
    entity_gths = flat(entity_gths)
    entity_hyps = flat(entity_hyps)

    gths = [remove_stopwords(gth, stopwords) for gth in gths]
    hyps = [remove_stopwords(hyp, stopwords) for hyp in hyps]

    ret_metrics = OrderedDict()
    ret_metric = OrderedDict()

    bleu_score_matrix = [
        bleu_nlgeval.compute_individual_metrics([gth], hyp)
        for gth, hyp in zip(gths, hyps)
    ]
    b2s = [b["Bleu_2"] for b in bleu_score_matrix]
    ret_metrics["B@2"] = b2s
    bleu_score = bleu_nlgeval.compute_metrics([gths], hyps)
    b2 = bleu_score["Bleu_2"]
    ret_metric["B@2"] = b2
    rouge1, rouge2, r1s, r2s = rouge_eval.rouge_score(hyps,
                                                      gths,
                                                      ret_matrix=True)
    ret_metrics["R@2"] = r2s
    ret_metric["R@2"] = rouge2
    dist_scores = distinct_eval.distinct_score(hyps)
    ret_metric["D@1"] = dist_scores[0]
    ret_metric["D@2"] = dist_scores[1]
    ret_metrics["D@1"] = float("nan")
    ret_metrics["D@2"] = float("nan")
    eps = 1e-24

    def compute_f1(p, r):
        return 2 * p * r / (p + r + eps)

    overlapped_entity = [[i for i in x.split() if i in y.split()]
                         for x, y in zip(entity_hyps, entity_gths)]
    overlapped_entity = [list(set(x)) for x in overlapped_entity]
    hyp_entity = [set(y.split()) for y in entity_hyps]
    gth_entity = [set(y.split()) for y in entity_gths]
    entity2prf = OrderedDict()
    for oe, he, ge in zip(overlapped_entity, hyp_entity, gth_entity):
        for e in oe:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            entity2prf[e]["TP"] += 1

        for e in he:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            if e not in oe:
                entity2prf[e]["FP"] += 1

        for e in ge:
            if e not in entity2prf:
                entity2prf[e] = {"FN": 0, "FP": 0, "TP": 0}
            if e not in oe:
                entity2prf[e]["FN"] += 1

    counter = Counter()
    for gth in gth_entity:
        counter.update(gth)
    need_entity_ind = [x[0] for x in counter.most_common() if x[1] > 5]
    print("len(need_entity_ind) = {}".format(len(need_entity_ind)))
    ret_metrics["ma-P"] = [
        entity2prf[e]["TP"] / (entity2prf[e]["TP"] + entity2prf[e]["FP"] + eps)
        for e in need_entity_ind
    ]
    ret_metrics["ma-R"] = [
        entity2prf[e]["TP"] / (entity2prf[e]["TP"] + entity2prf[e]["FN"] + eps)
        for e in need_entity_ind
    ]
    ret_metrics["ma-F1"] = [
        compute_f1(p, r)
        for (p, r) in zip(ret_metrics["ma-P"], ret_metrics["ma-R"])
    ]
    ret_metric["ma-P"] = float(np.mean(ret_metrics["ma-P"]))
    ret_metric["ma-R"] = float(np.mean(ret_metrics["ma-R"]))
    ret_metric["ma-F1"] = compute_f1(ret_metric["ma-P"], ret_metric["ma-R"])
    mi_precision = [
        len(x) / (len(y) + 1e-14) for x, y in zip(
            overlapped_entity, [set(y.split()) for y in entity_hyps])
    ]
    mi_recall = [
        len(x) / (len(y) + 1e-14) for x, y in zip(
            overlapped_entity, [set(y.split()) for y in entity_gths])
    ]
    gth_n = [len(set(ws.split())) for ws in entity_gths]
    hyp_n = [len(set(ws.split())) for ws in entity_hyps]
    ret_metric["mi-P"] = np.sum([p * w for (p, w) in zip(mi_precision, hyp_n)
                                 ]) / np.sum(hyp_n)
    ret_metric["mi-R"] = np.sum([r * w for (r, w) in zip(mi_recall, gth_n)
                                 ]) / np.sum(gth_n)
    ret_metric["mi-F1"] = compute_f1(ret_metric["mi-P"], ret_metric["mi-R"])
    ret_metrics["mi-P"] = mi_precision
    ret_metrics["mi-R"] = mi_recall
    ret_metrics["mi-F1"] = [
        compute_f1(p, r) for (p, r) in zip(mi_precision, mi_recall)
    ]
    with open("data/word2embedding.txt") as f:
        content = f.read().strip()
    single_word2embedding = {}
    for line in content.split("\n"):
        item = line.split()
        word = item[0]
        embedding = np.asarray([float(x) for x in item[1:]])
        single_word2embedding[word] = embedding
    alias2scientific = json.load(open(alias2scientific_filename))
    padding_embed = np.zeros(768)

    hyp_emb_avg = [
        np.asarray([
            np.asarray([
                single_word2embedding.get(w, padding_embed)
                for w in alias2scientific.get(e, e)
            ]).mean(0) for e in entity_hyp.split()
        ]).mean(0) if len(entity_hyp.split()) > 0 else padding_embed
        for entity_hyp in entity_hyps
    ]
    gth_emb_avg = [
        np.asarray([
            np.asarray([
                single_word2embedding.get(w, padding_embed)
                for w in alias2scientific.get(e, e)
            ]).mean(0) for e in entity_gth.split()
        ]).mean(0) if len(entity_gth.split()) > 0 else padding_embed
        for entity_gth in entity_gths
    ]
    eas = [cosine_sim(h, g) for h, g in zip(hyp_emb_avg, gth_emb_avg)]
    ea = float(np.mean(eas))
    ret_metrics["EA"] = eas
    ret_metric["EA"] = ea

    hyp_emb_means = [[
        np.asarray([
            single_word2embedding.get(w, padding_embed)
            for w in alias2scientific.get(e, e)
        ]).mean(0) for e in entity_hyp.split()
    ] if len(entity_hyp.split()) > 0 else [padding_embed]
                     for entity_hyp in entity_hyps]
    gth_emb_means = [[
        np.asarray([
            single_word2embedding.get(w, padding_embed)
            for w in alias2scientific.get(e, e)
        ]).mean(0) for e in entity_gth.split()
    ] if len(entity_gth.split()) > 0 else [padding_embed]
                     for entity_gth in entity_gths]

    def eval_embed_greedy(a, b):
        scores = []

        for j in b:
            score = []
            for i in a:
                s = cosine_sim(i, j)
                score.append(s)
            scores.append(score)

        if len(b) == 1 and b[0].sum() == 0.0:
            return None
        else:
            scores = np.asarray(scores)
            score1 = scores.max(0).mean()
            score2 = scores.max(1).mean()
            return (float(score1) + float(score2)) / 2.0

    eg_scores = [
        x for x in [
            eval_embed_greedy(a, b)
            for (a, b) in zip(hyp_emb_means, gth_emb_means)
        ] if x is not None
    ]
    eg_score = np.asarray(eg_scores).mean()
    ret_metrics["EG"] = eg_scores
    ret_metric["EG"] = eg_score

    return ret_metrics, ret_metric
Example 25
def main(_argv):

    if FLAGS.num_gpus > 0:  # only supports 1 GPU
        ctx = mx.gpu()
    else:
        ctx = mx.cpu()

    key_flags = FLAGS.get_key_flags_for_module(sys.argv[0])
    print('\n'.join(f.serialize() for f in key_flags))

    # are we using features or do we include the CNN?
    if FLAGS.feats_model is None:
        backbone_net = get_model(FLAGS.backbone, pretrained=True, ctx=ctx).features
        cnn_model = FrameModel(backbone_net, 11)  # hardcoded the number of classes
        if FLAGS.backbone_from_id:
            if os.path.exists(os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id)):
                files = os.listdir(os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id))
                files = [f for f in files if f[-7:] == '.params']
                if len(files) > 0:
                    files = sorted(files, reverse=True)  # put latest model first
                    model_name = files[0]
                    cnn_model.load_parameters(os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id, model_name), ctx=ctx)
                    print('Loaded backbone params: {}'.format(os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id, model_name)))
            else:
                raise FileNotFoundError('{}'.format(os.path.join('models', 'vision', 'experiments', FLAGS.backbone_from_id)))

        if FLAGS.freeze_backbone:
            for param in cnn_model.collect_params().values():
                param.grad_req = 'null'

        cnn_model = TimeDistributed(cnn_model.backbone)

        src_embed = cnn_model

        transform_test = transforms.Compose([
            transforms.Resize(FLAGS.data_shape + 32),
            transforms.CenterCrop(FLAGS.data_shape),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    else:
        from mxnet.gluon import nn  # need to do this to force no use of Embedding on src
        src_embed = nn.HybridSequential(prefix='src_embed_')
        with src_embed.name_scope():
            src_embed.add(nn.Dropout(rate=0.0))

        transform_train = None
        transform_test = None

    # setup the data
    data_train = TennisSet(split='train', transform=transform_train, captions=True, max_cap_len=FLAGS.tgt_max_len,
                           every=FLAGS.every, feats_model=FLAGS.feats_model)
    data_val = TennisSet(split='val', transform=transform_test, captions=True, vocab=data_train.vocab,
                         every=FLAGS.every, inference=True, feats_model=FLAGS.feats_model)
    data_test = TennisSet(split='test', transform=transform_test, captions=True, vocab=data_train.vocab,
                          every=FLAGS.every, inference=True, feats_model=FLAGS.feats_model)

    test_tgt_sentences = data_test.get_captions(split=True)
    write_sentences(test_tgt_sentences, os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, 'test_gt.txt'))

    # load embeddings for tgt_embed
    if FLAGS.emb_file:
        word_embs = nlp.embedding.TokenEmbedding.from_file(file_path=os.path.join('data', FLAGS.emb_file))
        data_test.vocab.set_embedding(word_embs)

        input_dim, output_dim = data_test.vocab.embedding.idx_to_vec.shape
        tgt_embed = gluon.nn.Embedding(input_dim, output_dim)
        tgt_embed.initialize(ctx=ctx)
        tgt_embed.weight.set_data(data_test.vocab.embedding.idx_to_vec)
    else:
        tgt_embed = None

    # setup the model
    encoder, decoder = get_gnmt_encoder_decoder(cell_type=FLAGS.cell_type,
                                                hidden_size=FLAGS.num_hidden,
                                                dropout=FLAGS.dropout,
                                                num_layers=FLAGS.num_layers,
                                                num_bi_layers=FLAGS.num_bi_layers)
    model = NMTModel(src_vocab=None, tgt_vocab=data_test.vocab, encoder=encoder, decoder=decoder,
                     embed_size=FLAGS.emb_size, prefix='gnmt_', src_embed=src_embed, tgt_embed=tgt_embed)

    model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
    static_alloc = True
    model.hybridize(static_alloc=static_alloc)
    print(model)

    if os.path.exists(os.path.join('models', 'captioning', 'experiments', FLAGS.model_id)):
        files = os.listdir(os.path.join('models', 'captioning', 'experiments', FLAGS.model_id))
        files = [f for f in files if f[-7:] == '.params']
        if len(files) > 0:
            files = sorted(files, reverse=True)  # put latest model first
            model_name = files[0]
            if model_name == 'valid_best.params':
                model_name = files[1]
            model.load_parameters(os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, model_name), ctx=ctx)
            print('Loaded model params: {}'.format(os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, model_name)))

    # setup the beam search
    translator = BeamSearchTranslator(model=model, beam_size=FLAGS.beam_size,
                                      scorer=nlp.model.BeamSearchScorer(alpha=FLAGS.lp_alpha, K=FLAGS.lp_k),
                                      max_length=FLAGS.tgt_max_len + 100)
    print('Use beam_size={}, alpha={}, K={}'.format(FLAGS.beam_size, FLAGS.lp_alpha, FLAGS.lp_k))

    # run the training
    train_data_loader, val_data_loader, test_data_loader = get_dataloaders(data_train, data_val, data_test)

    # load and evaluate the best model
    if os.path.exists(os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, 'valid_best.params')):
        model.load_parameters(os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, 'valid_best.params'))

    preds_path = os.path.join('models', 'captioning', 'experiments', FLAGS.model_id, 'best_test_out.txt')
    if not os.path.exists(preds_path):
        _, test_translation_out = evaluate(test_data_loader, model, translator, data_train, ctx)
    else:
        test_translation_out = read_sentences(preds_path)

    str_ = ''
    nlgeval = NLGEval()
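    # join the token lists back into sentence strings and wrap the references
    # in a single reference stream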
    metrics_dict = nlgeval.compute_metrics([[' '.join(sent) for sent in test_tgt_sentences]],
                                           [' '.join(sent) for sent in test_translation_out])

    for k, v in metrics_dict.items():
        str_ += ', test ' + k + '={:.4f}'.format(float(v))
    print(str_)

    write_sentences(test_translation_out, preds_path)
Example 26
        sentences=sentences_eval[idx_examples].values,
        labels=labels_eval[idx_examples].values,
        tokenizer=tokenizer,
        max_length_seq=max_length_seq,
        max_length_label=max_length_label,
    )
    results = generate_questions(
        model=model,
        dataset=metric_dataset,
        tokenizer=tokenizer,
        device=device,
        batch_size=args.batch_size,
        generation_hyperparameters=generation_hyperparameters,
    )
    references, hypothesis = [], []
    for elem in results:
        for i in range(len(elem[0])):
            references.append(elem[1][i])
            hypothesis.append(elem[0][i])
    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics([references], hypothesis)
    print("Done.")
    str_ = ""
    with open(args.output_dir + '/logs.txt', "a") as writer:
        for metric in metrics_dict:
            str_ += metric + ": {:.3f}, ".format(metrics_dict[metric])
        str_ += "Retrieval score: {:.3f}".format(
            retrieval_score(hypothesis, references))
        writer.write(str_)  # keep the write inside the with-block so the file is still open
    print(str_)
Example 27
def BLEU(candidate, references):
    precisions = []
    for i in range(4):
        pr, bp = count_ngram(candidate, references, i + 1)
        precisions.append(pr)
    bleu = geometric_mean(precisions) * bp
    return bleu


if __name__ == "__main__":
    if len(sys.argv) == 2:
        candidate, references = fetch_data_from_one(sys.argv[1])
    else:
        candidate, references = fetch_data(sys.argv[1], sys.argv[2])
    print(len(candidate))
    print(len(references[0]))
    # candidate, references = fetch_data('bleu_data/tst.txt', 'bleu_data/ref.txt')
    # bleu1 = BLEU_n(candidate, references, 1)
    # bleu2 = BLEU_n(candidate, references, 2)
    # print(bleu1)
    # print(bleu2)
    # out = open('data/bleu_out.txt', 'a', encoding='utf8')
    # out.write(sys.argv[1] + ' ' + str(bleu1) + ' ' + str(bleu2) + '\n')
    # out.close()
    from nlgeval import NLGEval

    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(references, candidate)
    print(metrics_dict)
Example 28
    def test_compute_metrics_oo(self):
        # Create the object in the test so that it can be garbage collected once the test is done.
        n = NLGEval()

        # Individual Metrics
        scores = n.compute_individual_metrics(
            ref=["this is a test", "this is also a test"],
            hyp="this is a good test")
        self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.5108729, scores['Bleu_3'], places=5)
        self.assertAlmostEqual(0.0000903602, scores['Bleu_4'], places=5)
        self.assertAlmostEqual(0.44434387, scores['METEOR'], places=5)
        self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5)
        self.assertAlmostEqual(0.980075,
                               scores['EmbeddingAverageCosineSimilairty'],
                               places=5)
        self.assertAlmostEqual(0.94509,
                               scores['VectorExtremaCosineSimilarity'],
                               places=5)
        self.assertAlmostEqual(0.960771,
                               scores['GreedyMatchingScore'],
                               places=5)
        self.assertEqual(11, len(scores))

        scores = n.compute_metrics(
            ref_list=[
                [
                    "this is one reference sentence for sentence1",
                    "this is a reference sentence for sentence2 which was generated by your model"
                ],
                [
                    "this is one more reference sentence for sentence1",
                    "this is the second reference sentence for sentence2"
                ],
            ],
            hyp_list=[
                "this is the model generated sentence1 which seems good enough",
                "this is sentence2 which has been generated by your model"
            ])
        self.assertAlmostEqual(0.55, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.428174, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.284043, scores['Bleu_3'], places=5)
        self.assertAlmostEqual(0.201143, scores['Bleu_4'], places=5)
        self.assertAlmostEqual(0.295797, scores['METEOR'], places=5)
        self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5)
        self.assertAlmostEqual(0.88469,
                               scores['EmbeddingAverageCosineSimilairty'],
                               places=5)
        self.assertAlmostEqual(0.568696,
                               scores['VectorExtremaCosineSimilarity'],
                               places=5)
        self.assertAlmostEqual(0.784205,
                               scores['GreedyMatchingScore'],
                               places=5)
        self.assertEqual(11, len(scores))

        # Non-ASCII tests.
        scores = n.compute_individual_metrics(
            ref=["Test en français.", "Le test en français."],
            hyp="Le test est en français.")
        self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5)
        self.assertAlmostEqual(0.0000051, scores['Bleu_3'], places=5)
        self.assertAlmostEqual(0, scores['Bleu_4'], places=5)
        self.assertAlmostEqual(0.48372379050300296, scores['METEOR'], places=5)
        self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5)
        self.assertAlmostEqual(0.0, scores['CIDEr'], places=5)
        self.assertAlmostEqual(0.9192341566085815,
                               scores['SkipThoughtCS'],
                               places=5)
        self.assertAlmostEqual(0.906562,
                               scores['EmbeddingAverageCosineSimilairty'],
                               places=5)
        self.assertAlmostEqual(0.815158,
                               scores['VectorExtremaCosineSimilarity'],
                               places=5)
        self.assertAlmostEqual(0.940959,
                               scores['GreedyMatchingScore'],
                               places=5)
        self.assertEqual(11, len(scores))

        scores = n.compute_individual_metrics(ref=["テスト"], hyp="テスト")
        self.assertAlmostEqual(0.99999999, scores['Bleu_1'], places=5)
        self.assertAlmostEqual(1.0, scores['METEOR'], places=3)
        self.assertAlmostEqual(1.0, scores['ROUGE_L'], places=3)
        self.assertAlmostEqual(0.0, scores['CIDEr'], places=3)
        self.assertAlmostEqual(1.0, scores['SkipThoughtCS'], places=3)
        self.assertAlmostEqual(1.0, scores['GreedyMatchingScore'], places=3)
        self.assertEqual(11, len(scores))
Example 29
                # model1_answers.append(id2word(outputs[0][i]))
                model_answers_id.append([m for m in outputs[0][i] if m > 2])

                true_answers.append(batch_answer_str[i])
                model_answers.append(
                    id2word(outputs[0][i]).replace("<EOS>", ""))

        # answers_save_path = os.path.join(args.savePath, "answer_save_beam2_23.json")
        # with open(answers_save_path, "w", encoding='UTF-8') as file:
        #     data = {"true_answers": true_answers,
        #             "model1_answers": model_answers}
        #     json.dump(data, file, ensure_ascii=False)
        #     print("save in ", answers_save_path)

        print("num_batch:", bbb, "beam_wide=", args.num_BeamSearch)
        model_n_b_beam2_metrics_dict = nlgeval.compute_metrics([true_answers],
                                                               model_answers)
        print("model_n_b_beam2:\n", model_n_b_beam2_metrics_dict)
        print("Bleu_total:", np.mean(Bleu_total))
        print("Bleu_total_1:", np.mean(Bleu_total_1))
        print("Bleu_total_2:", np.mean(Bleu_total_2))
        print("Bleu_total_3:", np.mean(Bleu_total_3))
        print("Bleu_total_4:", np.mean(Bleu_total_4))
        Bleu_total_all.append(np.mean(Bleu_total))
        Bleu_total_1_all.append(np.mean(Bleu_total_1))
        Bleu_total_2_all.append(np.mean(Bleu_total_2))
        Bleu_total_3_all.append(np.mean(Bleu_total_3))
        Bleu_total_4_all.append(np.mean(Bleu_total_4))

        print("F1_score:", np.mean(F1_score))
        F1_score_all.append(np.mean(F1_score))
Example 30
        # Perform nearest neighbour search using dot product
        # Given that vectors are normalized, this is equivalent to using cosine similarity
        D, I = index.search(encoder_out.to("cpu").detach().numpy(), k)

        for caption in caps:
            encodedCaption = [
                w for w in caption.tolist() if w not in
                {word2idx['<sos>'], word2idx['<eoc>'], word2idx['<pad>']}
            ]
            references[0].append(decodeCaption(encodedCaption, idx2word))

        for i in range(batch_size):
            hypotheses.append(train_captions[I[i][0]])
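            # the 1-NN training caption is used as the hypothesis for this image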

    return references, hypotheses


generate_train_images_matrix()

references, hypotheses = calculate_NN()

metrics_dict = nlgeval.compute_metrics(references, hypotheses)
print(metrics_dict)

with open("1NNRefs.txt", 'w+') as file:
    for reference in references[0]:
        file.write(reference.strip() + '\n')
with open("1NNPreds.txt", 'w+') as file:
    for hypothesis in hypotheses:
        file.write(hypothesis.strip() + '\n')