class CiderMetric(Metric):
    def __init__(self, agent, train_test, env, trunc, sampling):
        Metric.__init__(self, agent, train_test, "cider", "scalar", env, trunc, sampling)
        self.score_function = Cider()
        self.tokenizer = PTBTokenizer()
        self.candidates = []
        self.refs = []

    def fill_(self, **kwargs):
        pass

    def compute_(self, **kwargs):
        question_decoded = self.dataset.question_tokenizer.decode(
            kwargs["state"].text.numpy()[0], ignored=["<SOS>"], stop_at_end=True)
        ref_questions = kwargs["ref_questions_decoded"][0]
        self.candidates.append(question_decoded)
        self.refs.append([ref_questions])

    def post_treatment_(self):
        refs = {idx: list(map(_strip, ref)) for (idx, ref) in enumerate(self.refs)}
        hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(self.candidates)}
        score, scores = self.score_function.compute_score(refs, hyps)
        self.metric_history.extend(scores)
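# Every snippet in this section feeds the same two dict structures to
# Cider().compute_score. A minimal, self-contained sketch of that call
# (assuming the standard pycocoevalcap package layout; nlg-eval bundles the
# same scorer). The example ids and sentences below are illustrative only.
from pycocoevalcap.cider.cider import Cider

refs = {0: ["a man rides a horse", "someone riding a horse"],
        1: ["a dog runs on the grass"]}
hyps = {0: ["a man is riding a horse"],
        1: ["a dog running in the grass"]}

corpus_score, per_example_scores = Cider().compute_score(refs, hyps)
print(corpus_score)        # single corpus-level CIDEr value
print(per_example_scores)  # array with one CIDEr score per example id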
def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False):
    assert isinstance(hyp, six.string_types)
    if isinstance(ref, six.string_types):
        ref = ref.split('||<|>||')  # special delimiter for backward compatibility
    ref = [a.strip() for a in ref]
    refs = {0: ref}
    ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    ret_scores[m] = sc
            else:
                ret_scores[method] = score
            if isinstance(scorer, Meteor):
                scorer.close()
        del scorers

    if not no_skipthoughts:
        from nlgeval.skipthoughts import skipthoughts
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity
        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
        ref_list_T = np.array(ref_list).T.tolist()
        vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
        cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
        cosine_similarity = np.max(cosine_similarity, axis=0).mean()
        ret_scores['SkipThoughtCS'] = cosine_similarity

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np
        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        scores = scores.split('\n')
        for score in scores:
            name, value = score.split(':')
            value = float(value.strip())
            ret_scores[name] = value

    return ret_scores
def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=True, no_glove=False):
    assert isinstance(hyp, six.string_types)
    if isinstance(ref, six.string_types):
        ref = ref.split('||<|>||')  # special delimiter for backward compatibility
    ref = [a.strip() for a in ref]
    refs = {0: ref}
    ref_list = [ref]

    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    if not no_overlap:
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"),
                   (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    ret_scores[m] = sc
            else:
                ret_scores[method] = score

    return ret_scores
def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=True, no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    if not no_overlap:
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"),
                   (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    print("%s: %0.6f" % (m, sc))
                    ret_scores[m] = sc
            else:
                print("%s: %0.6f" % (method, score))
                ret_scores[method] = score
        del scorers

    return ret_scores
def load_scorers(self):
    self.scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L"),
        (Cider(), "CIDEr")
    ]
def forward(self, images, features, tag_ids, input_ids, target, all_caps, beam_size, self_crit_seq_train=None):
    if self_crit_seq_train is None:
        preds_out = self.model(images, features, input_ids, tag_ids)
        loss = self.loss_fn(
            preds_out.reshape(-1, hyper_parameters['vocab_dim']),
            target.reshape(-1))
        return loss, preds_out
    else:
        # self_crit_seq_train
        ref_list, hyp_list, scores = self.__generate__(
            images, features, tag_ids, all_caps, beam_size)
        refs = {idx: lines for (idx, lines) in enumerate(ref_list)}
        hyps = {idx: [lines] for (idx, lines) in enumerate(hyp_list)}
        _, reward = Cider().compute_score(refs, hyps)  # (N, beam_size)
        reward = torch.from_numpy(reward).to(device).view(scores.shape)
        reward_baseline = torch.mean(reward, dim=1, keepdim=True)
        loss = -scores * (reward - reward_baseline)
        loss = loss.mean()
        return loss, hyp_list[::beam_size]
def evaluate_narrative_qa(ground_truth, predicted_answers):
    """Evaluate NarrativeQA predictions."""
    scorers = [(Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
               (Rouge(), 'ROUGE_L'),
               (Cider(), 'CIDEr')]

    def preprocess(text):
        return text.lower().rstrip(' .').strip()

    common_keys = [k for k in predicted_answers if k in ground_truth]
    refs = {k: [preprocess(s) for s in ground_truth[k]] for k in common_keys}
    hyps = {k: [preprocess(predicted_answers[k])] for k in common_keys}

    ret_scores = dict(common=len(common_keys))
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, _, m in zip(score, scores, method):
                # print('%s: %0.6f' % (m, sc))
                ret_scores[m] = sc * 100
        else:
            # print('%s: %0.6f' % (method, score))
            ret_scores[method] = score * 100
        if isinstance(scorer, Meteor):
            scorer.close()
    del scorers
    return ret_scores
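# A hedged usage sketch for evaluate_narrative_qa (the question ids and answer
# strings are made up): ground_truth maps each question id to its list of gold
# answers, predicted_answers maps the same ids to one predicted string.
ground_truth = {"q1": ["He hid the map.", "He concealed the map"],
                "q2": ["In the lighthouse."]}
predicted_answers = {"q1": "he hid the map", "q2": "in a lighthouse"}

scores = evaluate_narrative_qa(ground_truth, predicted_answers)
# e.g. {'common': 2, 'Bleu_1': ..., 'ROUGE_L': ..., 'CIDEr': ...} with values scaled to 0-100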
def load_scorers(self):
    self.scorers = []

    omit_bleu_i = False
    for i in range(1, 4 + 1):
        if 'Bleu_{}'.format(i) in self.metrics_to_omit:
            omit_bleu_i = True
            if i > 1:
                self.scorers.append((Bleu(i - 1), ['Bleu_{}'.format(j) for j in range(1, i)]))
            break
    if not omit_bleu_i:
        self.scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]))

    if 'ROUGE_L' not in self.metrics_to_omit:
        self.scorers.append((Rouge(), "ROUGE_L"))
    if 'CIDEr' not in self.metrics_to_omit:
        self.scorers.append((Cider(), "CIDEr"))
def compute_metrics(ref, hyp):
    # ref = ref.split('||<|>||')  # special delimiter
    # ref = [a.strip() for a in ref]
    refs = {0: [ref]}
    # ref_list = [ref]
    hyps = {0: [hyp.strip()]}
    hyp_list = [hyp]

    ret_scores = {}
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                ret_scores[m] = sc
        else:
            ret_scores[method] = score
    return ret_scores
def compute_metrics(gt_caps, pred_caps):
    assert len(gt_caps) == len(pred_caps)
    gt_caps = add_space_to_cap_dict(gt_caps)
    pred_caps = add_space_to_cap_dict(pred_caps)

    ret_scores = {}
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(gt_caps, pred_caps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
        if isinstance(scorer, Meteor):
            scorer.close()
    del scorers
    return ret_scores
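# add_space_to_cap_dict is not shown in this snippet. A plausible minimal
# stand-in (an assumption inferred from how it is used, not the original
# helper): normalize every caption into a single space-joined string, which is
# the form the scorers expect.
def add_space_to_cap_dict(cap_dict):
    out = {}
    for key, caps in cap_dict.items():
        # Join token lists with spaces; leave plain strings unchanged.
        out[key] = [" ".join(c) if isinstance(c, (list, tuple)) else c for c in caps]
    return out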
def compute_metrics_all(references, hypothesises):
    refs = {idx: [strippedlines.strip()] for (idx, strippedlines) in enumerate(references)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hypothesises)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                # print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            # print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
    return ret_scores
def compute_metrics_by_file(references, hypothesis):
    """
    Given a list of gold (reference) file names and a prediction file, calculate metrics.
    The same line number in each file corresponds to the same instance.
    Ref: https://github.com/Maluuba/nlg-eval
    :param references: list of gold file names.
    :param hypothesis: prediction file name.
    :return: a dict mapping metric names to scores.
    """
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]

    def _strip(s):
        return s.strip()

    with open(hypothesis, encoding='utf-8') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, encoding='utf-8') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(refs, hyps)
        if isinstance(method, list):
            for sc, scs, m in zip(score, scores, method):
                # print("%s: %0.6f" % (m, sc))
                ret_scores[m] = sc
        else:
            # print("%s: %0.6f" % (method, score))
            ret_scores[method] = score
    return ret_scores
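# A hedged usage sketch for compute_metrics_by_file; the file names are made up.
# Each file holds one sentence per line, and every reference file must have the
# same number of lines as the hypothesis file.
scores = compute_metrics_by_file(
    references=["refs_annotator1.txt", "refs_annotator2.txt"],
    hypothesis="model_predictions.txt")
print(scores["Bleu_4"], scores["ROUGE_L"], scores["CIDEr"])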
def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    if not no_overlap:
        scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
                   (Meteor(), "METEOR"),
                   (Rouge(), "ROUGE_L"),
                   (Cider(), "CIDEr")]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    print("%s: %0.6f" % (m, sc))
                    ret_scores[m] = sc
            else:
                print("%s: %0.6f" % (method, score))
                ret_scores[method] = score
            if isinstance(scorer, Meteor):
                scorer.close()
        del scorers

    if not no_skipthoughts:
        from nlgeval.skipthoughts import skipthoughts
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity
        model = skipthoughts.load_model()
        encoder = skipthoughts.Encoder(model)
        vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
        ref_list_T = np.array(ref_list).T.tolist()
        vector_refs = map(
            lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
        cosine_similarity = list(
            map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs))
        cosine_similarity = np.max(cosine_similarity, axis=0).mean()
        print("SkipThoughtsCosineSimilarity: %0.6f" % (cosine_similarity))
        ret_scores['SkipThoughtCS'] = cosine_similarity
        del model

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np
        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        print(scores)
        scores = scores.split('\n')
        for score in scores:
            name, value = score.split(':')
            value = float(value.strip())
            ret_scores[name] = value

    return ret_scores
def train(train_loader, decoder, decoder_optimizer, epoch, rev_word_map):
    """
    Performs one epoch's training.
    :param train_loader: DataLoader for training data
    :param decoder: decoder model
    :param decoder_optimizer: optimizer to update decoder's weights
    :param epoch: epoch number
    :param rev_word_map: reverse word map (index -> word) used to decode captions
    """
    decoder.train()  # train mode (dropout and batchnorm are used)

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top-5 accuracy

    start = time.time()

    # Batches
    for i, (imgs, caps, caplens, allcaps) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # Move to GPU, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Forward prop.
        scores, scores1, caps_sorted, decode_lengths, sort_ind = decoder(imgs, caps, caplens)
        # /!\ scores shape: (batch_size, max_captions_real_length, vocab_size)
        # scores[0, t, :] = proba(y[t] | y[1:t-1])

        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = caps_sorted[:, 1:]  # (batch_size, max_caption_real_length)
        scores_copy = scores.clone()

        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True)
        targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)

        # Calculate cross entropy
        # crit = criterion_xe(scores, targets)

        # References
        references = list()
        allcaps = allcaps[sort_ind]  # because images were sorted in the decoder
        for j in range(allcaps.shape[0]):
            img_caps = allcaps[j].tolist()
            img_captions = list(
                map(lambda c: [rev_word_map[w] for w in c
                               if w not in {word_map['<start>'], word_map['<pad>']}],
                    img_caps))  # remove <start> and pads
            ref_caps = [' '.join(c) for c in img_captions]
            references.append(ref_caps)
            # print(references[-1])

        # Hypotheses (greedy decoding)
        hypotheses = list()
        _, preds = torch.max(scores_copy, dim=2)
        preds = preds.tolist()
        temp_preds = list()
        for j, p in enumerate(preds):
            temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
        preds = temp_preds
        # print(preds[0])
        preds_caption = list(
            map(lambda c: [rev_word_map[w] for w in c
                           if w not in {word_map['<start>'], word_map['<pad>']}],
                preds))
        preds_caption = [' '.join(c) for c in preds_caption]
        hypotheses.extend(preds_caption)

        assert len(references) == len(hypotheses)

        # Sample decoding
        samples = list()
        proba = softmax(scores_copy, dim=2)
        B, T, V = proba.size()
        sampled = np.zeros((B, T), dtype=np.int32)
        sampled_entropy = torch.zeros([B, T]).to(device)
        for b in range(B):
            for t in range(decode_lengths[b]):
                sampled[b][t] = torch.multinomial(proba[b][t].view(-1), 1).item()
                sampled_entropy[b][t] = torch.log(proba[b][t][sampled[b][t]])
        temp_sampled = list()
        for j, p in enumerate(sampled):
            temp_sampled.append(sampled[j][:decode_lengths[j]])  # remove pads
        log_proba = torch.sum(sampled_entropy, dim=1)
        sampled_caption = list(
            map(lambda c: [rev_word_map[w] for w in c
                           if w not in {word_map['<start>'], word_map['<pad>']}],
                temp_sampled))
        sampled_caption = [' '.join(c) for c in sampled_caption]
        samples.extend(sampled_caption)
        # print(samples)

        # Calculate loss: CIDEr reward for sampled captions, greedy captions as baseline
        cider = Cider()
        cider_ = Cider()
        baseline = torch.Tensor(compute_metric(cider_, references, hypotheses)).to(device)
        reward = torch.Tensor(compute_metric(cider, references, samples)).to(device)
        # print(log_proba.requires_grad)
        # loss = -(compute_metric(cider, references, samples) - compute_metric(cider_, references, hypotheses)) * crit
        loss = -torch.sum((reward - baseline) * log_proba)

        # Back prop.
        decoder_optimizer.zero_grad()
        loss.backward()

        # Clip gradients when they are getting too large
        torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, decoder.parameters()), 0.25)

        # Update weights
        decoder_optimizer.step()

        # Keep track of metrics
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.6f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
                                                                          batch_time=batch_time,
                                                                          data_time=data_time,
                                                                          loss=losses,
                                                                          top5=top5accs))
            print('Reward : ', torch.mean(reward).item())
            print('Baseline : ', torch.mean(baseline).item())
def validate(val_loader, decoder, rev_word_map):
    """
    Performs one epoch's validation.
    :param val_loader: DataLoader for validation data
    :param decoder: decoder model
    :param rev_word_map: reverse word map (index -> word) used to decode captions
    :return: average CIDEr reward over the validation set
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references_ = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses_ = list()  # hypotheses (predictions)

    # Batches
    with torch.no_grad():
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):
            # Move to GPU, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            scores, scores1, caps_sorted, decode_lengths, sort_ind = decoder(imgs, caps, caplens)
            # /!\ scores shape: (batch_size, max_captions_real_length, vocab_size)
            # scores[0, t, :] = proba(y[t] | y[1:t-1])

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]  # (batch_size, max_caption_real_length)
            scores_copy = scores.clone()

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True)
            targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)

            # Calculate cross entropy
            # crit = criterion_xe(scores, targets)

            # References
            references = list()
            allcaps = allcaps[sort_ind]  # because images were sorted in the decoder
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                img_captions = list(
                    map(lambda c: [rev_word_map[w] for w in c
                                   if w not in {word_map['<start>'], word_map['<pad>']}],
                        img_caps))  # remove <start> and pads
                ref_caps = [' '.join(c) for c in img_captions]
                references.append(ref_caps)

            # Hypotheses (greedy decoding)
            hypotheses = list()
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
            preds = temp_preds
            # print(preds[0])
            preds_caption = list(
                map(lambda c: [rev_word_map[w] for w in c
                               if w not in {word_map['<start>'], word_map['<pad>']}],
                    preds))
            preds_caption = [' '.join(c) for c in preds_caption]
            hypotheses.extend(preds_caption)

            assert len(references) == len(hypotheses)

            # Sample decoding
            samples = list()
            proba = softmax(scores_copy, dim=2)
            B, T, V = proba.size()
            sampled = np.zeros((B, T), dtype=np.int32)
            sampled_entropy = torch.zeros([B, T]).to(device)
            for b in range(B):
                for t in range(decode_lengths[b]):
                    sampled[b][t] = torch.multinomial(proba[b][t].view(-1), 1).item()
                    sampled_entropy[b][t] = torch.log(proba[b][t][sampled[b][t]])
            temp_sampled = list()
            for j, p in enumerate(sampled):
                temp_sampled.append(sampled[j][:decode_lengths[j]])  # remove pads
            log_proba = torch.sum(sampled_entropy, dim=1)
            sampled_caption = list(
                map(lambda c: [rev_word_map[w] for w in c
                               if w not in {word_map['<start>'], word_map['<pad>']}],
                    temp_sampled))
            sampled_caption = [' '.join(c) for c in sampled_caption]
            samples.extend(sampled_caption)
            # print(samples)

            # Calculate loss: CIDEr reward for sampled captions, greedy captions as baseline
            cider = Cider()
            cider_ = Cider()
            baseline = torch.Tensor(compute_metric(cider_, references, hypotheses)).to(device)
            reward = torch.Tensor(compute_metric(cider, references, samples)).to(device)
            # print(log_proba.requires_grad)
            # loss = -(compute_metric(cider, references, samples) - compute_metric(cider_, references, hypotheses)) * crit
            loss = -torch.sum((reward - baseline) * log_proba)

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(i, len(val_loader),
                                                                                batch_time=batch_time,
                                                                                loss=losses,
                                                                                top5=top5accs))

            # Store references (true captions) and hypotheses (predictions) for each image
            # If for n images we have n hypotheses, and references a, b, c... for each image, we need
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
            references_.extend(references)
            hypotheses_.extend(hypotheses)

    # Calculate BLEU-4 score
    bleu4 = corpus_bleu(references_, hypotheses_)
    bleu4 = round(bleu4, 4)

    # Calculate CIDEr
    avg_cider = Cider()
    # print(references)
    # print(hypotheses)
    print(len(compute_metric(avg_cider, references_, hypotheses_)))
    avg_reward = np.mean(compute_metric(avg_cider, references_, hypotheses_))
    print('val reward', avg_reward)

    print(
        '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}, CIDEr - {cidr}\n'.format(
            loss=losses, top5=top5accs, bleu=bleu4, cidr=avg_reward))

    return avg_reward
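# Both train() and validate() above call a compute_metric helper that is not
# shown in this section. A minimal stand-in (an assumption inferred from how it
# is used, not the original code): wrap scorer.compute_score and return the
# per-image scores so they can be turned into a reward tensor.
def compute_metric(scorer, references, hypotheses):
    # `references` is a list of lists of reference captions; `hypotheses` is a
    # list of candidate captions, one per image, in the same order.
    refs = {idx: refs_i for idx, refs_i in enumerate(references)}
    hyps = {idx: [hyp] for idx, hyp in enumerate(hypotheses)}
    _, per_image_scores = scorer.compute_score(refs, hyps)
    return per_image_scores  # one CIDEr value per image, usable as a REINFORCE reward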
def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False):
    with open(hypothesis, 'r') as f:
        hyp_list = f.readlines()
    ref_list = []
    for iidx, reference in enumerate(references):
        with open(reference, 'r') as f:
            ref_list.append(f.readlines())
    ref_list = [map(str.strip, refs) for refs in zip(*ref_list)]
    refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)}
    hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)}
    assert len(refs) == len(hyps)

    ret_scores = {}
    ret1_scores = {}
    if not no_overlap:
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]
        for scorer, method in scorers:
            score, scores = scorer.compute_score(refs, hyps)
            if isinstance(method, list):
                # BLEU returns a list of corpus-level scores (Bleu_1 .. Bleu_4)
                for sc, scs, m in zip(score, scores, method):
                    # print("%s: %0.6f" % (m, sc))
                    ret1_scores[m] = sc
            else:
                # METEOR, ROUGE_L and CIDEr return a single corpus-level score
                # print("%s: %0.6f" % (method, score))
                ret_scores[method] = score
        # print(type(ret_scores))

    # if not no_skipthoughts:
    #     from nlgeval.skipthoughts import skipthoughts
    #     import numpy as np
    #     from sklearn.metrics.pairwise import cosine_similarity
    #     model = skipthoughts.load_model()
    #     encoder = skipthoughts.Encoder(model)
    #     vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False)
    #     ref_list_T = np.array(ref_list).T.tolist()
    #     vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T)
    #     cosine_similarity = map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)
    #     cosine_similarity = np.max(cosine_similarity, axis=0).mean()
    #     print("SkipThoughtsCosineSimilairty: %0.6f" % (cosine_similarity))
    #     ret_scores['SkipThoughtCS'] = cosine_similarity

    if not no_glove:
        from nlgeval.word2vec.evaluate import eval_emb_metrics
        import numpy as np
        glove_hyps = [h.strip() for h in hyp_list]
        ref_list_T = np.array(ref_list).T.tolist()
        glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T)
        scores = eval_emb_metrics(glove_hyps, glove_refs)
        # print(scores)
        scores = scores.split('\n')
        for score in scores:
            name, value = score.split(':')
            value = float(value.strip())
            ret_scores[name] = value
    # return ret_scores

    # Weight the individual metrics (a, b, c, d, e and maximum_marks are
    # constants defined elsewhere in the original code) and turn the sum into a mark.
    ret_scores["METEOR"] = ret_scores["METEOR"] * a
    ret_scores["ROUGE_L"] = ret_scores["ROUGE_L"] * b
    ret_scores["CIDEr"] = ret_scores["CIDEr"] * c
    ret_scores["EmbeddingAverageCosineSimilairty"] = ret_scores["EmbeddingAverageCosineSimilairty"] * d
    ret_scores["VectorExtremaCosineSimilarity"] = ret_scores["VectorExtremaCosineSimilarity"] * e
    # ret_scores["GreedyMatchingScore"] = ret_scores["GreedyMatchingScore"] * f

    total = (ret_scores["METEOR"] + ret_scores["ROUGE_L"] + ret_scores["CIDEr"]
             + ret_scores["EmbeddingAverageCosineSimilairty"]
             + ret_scores["VectorExtremaCosineSimilarity"])
    marks = total * maximum_marks
    print("Marks: %0.2f" % marks)