def CocoScore(ref, hyp, metrics_list=None, language='en'):
    """
    Obtains the COCO scores from the references and hypotheses.

    :param ref: Dictionary of reference sentences (id, sentence)
    :param hyp: Dictionary of hypothesis sentences (id, sentence)
    :param metrics_list: List of metrics to evaluate on
    :param language: Language of the sentences (for METEOR)
    :return: Dictionary of scores
    """
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.ter.ter import Ter
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.cider.cider import Cider

    if metrics_list is None:
        metrics_list = ['bleu', 'ter', 'meteor', 'rouge_l', 'cider']
    else:
        metrics_list = [metric.lower() for metric in metrics_list]

    # Build the list of (scorer, metric name(s)) pairs to evaluate.
    scorers = []
    if 'bleu' in metrics_list:
        scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))
    if 'meteor' in metrics_list:
        scorers.append((Meteor(language), "METEOR"))
    if 'ter' in metrics_list:
        scorers.append((Ter(), "TER"))
    if 'rouge_l' in metrics_list or 'rouge' in metrics_list:
        scorers.append((Rouge(), "ROUGE_L"))
    if 'cider' in metrics_list:
        scorers.append((Cider(), "CIDEr"))

    final_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(ref, hyp)
        # BLEU returns a list of scores (one per n-gram order);
        # the other metrics return a single float.
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
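# Usage sketch for CocoScore: both arguments are {sample_id: [sentence, ...]}
# dictionaries with matching keys. The ids, sentences and metric selection
# below are illustrative placeholders, and pycocoevalcap is assumed to be
# installed.
def _example_coco_score():
    ref = {0: ['a man is riding a horse'],
           1: ['two dogs play in the park']}
    hyp = {0: ['a man rides a horse'],
           1: ['two dogs are playing in a park']}
    # Restrict evaluation to BLEU and ROUGE-L; METEOR would additionally
    # need a supported language.
    return CocoScore(ref, hyp, metrics_list=['bleu', 'rouge_l'])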
def CocoScore(ref, hypo, language='en'):
    """
    Obtains the COCO scores from the references and hypotheses.

    :param ref: Dictionary of reference sentences (id, sentence)
    :param hypo: Dictionary of hypothesis sentences (id, sentence)
    :param language: Language of the sentences (for METEOR)
    :return: Dictionary of scores
    """
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.ter.ter import Ter
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.cider.cider import Cider

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(language), "METEOR"),
               (Ter(), "TER"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    final_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(ref, hypo)
        # BLEU returns a list of scores (one per n-gram order);
        # the other metrics return a single float.
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score
    return final_scores
def get_coco_score(pred_list, verbose, extra_vars, split, **kwargs):
    """
    COCO challenge metrics.

    :param pred_list: List of hypothesis sentences.
    :param verbose: If greater than 0, the metric measures are printed out.
    :param extra_vars: Extra variables:
        extra_vars[split]['references'] - dict mapping sample indices to a
            list of all valid captions (id, [sentences]).
        extra_vars['tokenize_f'] - tokenization function used during model
            training (used again for validation).
        extra_vars['detokenize_f'] - detokenization function used during
            model training (used again for validation).
        extra_vars['tokenize_hypotheses'] - whether to tokenize the
            hypotheses during evaluation.
        extra_vars['tokenize_references'] - whether to tokenize the
            references during evaluation.
    :param split: Split on which we are evaluating.
    :return: Dictionary with the COCO scores.
    """
    import logging
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.meteor import accepted_langs
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.ter.ter import Ter

    logger = logging.getLogger(__name__)
    gts = extra_vars[split]['references']

    # Tokenize hypotheses if needed.
    if extra_vars.get('tokenize_hypotheses', False):
        hypo = {idx: list(map(extra_vars['tokenize_f'], [lines.strip()]))
                for (idx, lines) in enumerate(pred_list)}
    else:
        hypo = {idx: [lines.strip()]
                for (idx, lines) in enumerate(pred_list)}

    # Tokenize references if needed.
    if extra_vars.get('tokenize_references', False):
        refs = {idx: list(map(extra_vars['tokenize_f'], gts[idx]))
                for idx in gts}
    else:
        refs = gts

    # Detokenize references if needed.
    if extra_vars.get('apply_detokenization', False):
        refs = {idx: list(map(extra_vars['detokenize_f'], refs[idx]))
                for idx in refs}

    # METEOR is only added for languages supported by pycocoevalcap.
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Ter(), "TER"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    if extra_vars.get('language', 'en') in accepted_langs:
        scorers.append((Meteor(language=extra_vars.get('language', 'en')),
                        "METEOR"))

    final_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(refs, hypo)
        # BLEU returns a list of scores; the other metrics return a float.
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    if verbose > 0:
        logger.info('Computing coco scores on the %s split...' % split)
        for metric in sorted(final_scores):
            value = final_scores[metric]
            logger.info(metric + ': ' + str(value))
    return final_scores
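# Usage sketch for get_coco_score: a minimal extra_vars layout. The split
# name 'val', the reference sentences and the flag values below are
# illustrative placeholders; in the real pipeline they come from the
# training configuration.
def _example_get_coco_score():
    extra_vars = {
        'language': 'en',
        'tokenize_hypotheses': False,
        'val': {'references': {0: ['a man is riding a horse'],
                               1: ['two dogs play in the park']}},
    }
    pred_list = ['a man rides a horse', 'two dogs are playing in a park']
    return get_coco_score(pred_list, verbose=1, extra_vars=extra_vars,
                          split='val')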
def get_coco_score(pred_list, verbose, extra_vars, split):
    """
    COCO challenge metrics.

    :param pred_list: List of hypothesis sentences.
    :param verbose: If greater than 0, the metric measures are printed out.
    :param extra_vars: Extra variables:
        extra_vars[split]['references'] - dict mapping sample indices to a
            list of all valid captions (id, [sentences]).
        extra_vars['tokenize_f'] - tokenization function used during model
            training (used again for validation).
        extra_vars['detokenize_f'] - detokenization function used during
            model training (used again for validation).
        extra_vars['tokenize_hypotheses'] - whether to tokenize the
            hypotheses during evaluation.
        extra_vars['tokenize_references'] - whether to tokenize the
            references during evaluation.
    :param split: Split on which we are evaluating.
    :return: Dictionary with the COCO scores.
    """
    import logging
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.meteor import accepted_langs
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.ter.ter import Ter

    gts = extra_vars[split]['references']

    # The tokenization flags may be given either as plain booleans or as
    # single-element lists; only the first element of a list is inspected.
    tok_hypo = extra_vars.get('tokenize_hypotheses', False)
    if isinstance(tok_hypo, list):
        tok_hypo = tok_hypo[0]
    if tok_hypo:
        hypo = {idx: list(map(extra_vars['tokenize_f'], [lines.strip()]))
                for (idx, lines) in enumerate(pred_list)}
    else:
        hypo = {idx: [lines.strip()]
                for (idx, lines) in enumerate(pred_list)}

    # Tokenize references if needed.
    tok_ref = extra_vars.get('tokenize_references', False)
    if isinstance(tok_ref, list):
        tok_ref = tok_ref[0]
    if tok_ref:
        refs = {idx: list(map(extra_vars['tokenize_f'], gts[idx]))
                for idx in gts}
    else:
        refs = gts

    # Detokenize references if needed.
    # Hypotheses are already detokenized in callbacks.py.
    if extra_vars.get('apply_detokenization_ref', False):
        refs = {idx: list(map(extra_vars['detokenize_f'], refs[idx]))
                for idx in refs}

    logging.debug('Hypotheses = %s', list(hypo.values())[:5])
    logging.debug('References = %s', list(refs.values())[:5])

    # METEOR is only added for languages supported by pycocoevalcap.
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Ter(), "TER"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    if extra_vars.get('language', 'en') in accepted_langs:
        scorers.append((Meteor(language=extra_vars.get('language', 'en')),
                        "METEOR"))

    final_scores = {}
    for scorer, method in scorers:
        score, _ = scorer.compute_score(refs, hypo)
        # BLEU returns a list of scores; the other metrics return a float.
        if isinstance(score, list):
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    if verbose > 0:
        logging.info('Computing coco scores on the %s split...' % split)
        for metric in sorted(final_scores):
            value = final_scores[metric]
            logging.info(metric + ': ' + str(value))
    return final_scores
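# Usage sketch for this variant: the tokenization flags may also be passed
# as single-element lists, e.g. tokenize_hypotheses=[True]. The lowercasing
# tokenizer, the 'test' split and the sentences below are illustrative
# placeholders.
def _example_get_coco_score_list_flags():
    extra_vars = {
        'language': 'en',
        'tokenize_hypotheses': [True],
        'tokenize_f': lambda sentence: sentence.lower(),
        'test': {'references': {0: ['A man is riding a horse.']}},
    }
    return get_coco_score(['A man rides a horse.'], verbose=0,
                          extra_vars=extra_vars, split='test')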