def on_epoch_end(self, epoch, logs=None):
    # Keras callback: caption a validation batch and dump an HTML report for inspection.
    logs = logs or {}
    images, captions, X = next(self.generator)
    if self.YL is None:
        self.YL = np.ones((len(images), self.max_lengths), dtype="int32")
    X = np.array(X, dtype="float32")
    # Y_pred = np.random.uniform(size=(len(X), self.max_lengths, 5000))
    Y_pred = self.model.predict([X, self.YL])
    Y_pred = np.argmax(Y_pred, axis=-1)
    with open(self.output_dir + "index.html", "w") as f:
        f.write("<h1> Epoch " + str(epoch + 1) + "</h1>")
        f.write("<h3> Trained : " + str((epoch + 1) * 1000 * 32) + " examples.</h3>")
        f.write("<h3> Loss : " + str(logs.get("loss")) + "</h3>")
        f.write("<h3> acc : " + str(logs.get("acc")) + "</h3><hr>")
        for i in range(len(images)):
            # Decode predicted token ids and truncate at the first full stop.
            s = " ".join([self.i2w[int(k)] for k in Y_pred[i]])
            s = s.split(" .")[0].strip() + " ."
            path = self.output_dir + ("images/%i.jpg" % i)
            imsave(path, images[i])
            f.write("""
            <div>
                <img src='%s' style="width:255px;height:255px;"><br><br>
                <b>Output :</b> %s<br><br>
                <b>Expected :</b>
                <ul style="margin-top : 0">""" % ("images/%i.jpg" % i, s))
            for cap in captions[i]:
                f.write("""
                    <li>%s</li>""" % cap)
            f.write("""
                </ul>
                <b>BLEU score :</b> %.3f<br>
                <hr>
            </div>
            """ % bleu(captions[i], s))
def eval(self, hypList, refList):
    # Average sentence-level BLEU over the corpus; refList holds n_ref references per hypothesis.
    number = len(hypList)
    n_ref = len(refList) // number
    result = {'bleu_1': 0.0, 'bleu_2': 0.0, 'bleu_3': 0.0, 'bleu_4': 0.0, 'bleu': 0.0}
    for Index in range(0, number):
        ref = [refList[i].split() for i in range(Index * n_ref, (Index + 1) * n_ref)]
        ref = [r[:-1] if r[-1] == '.' else r for r in ref]
        hyp = hypList[Index].split()
        if hyp[-1] == '.':
            hyp = hyp[:-1]
        Smooth = SmoothingFunction()
        # weights with a single non-zero entry isolate one n-gram order.
        bleu_1 = bleu(ref, hyp, weights=[1], smoothing_function=Smooth.method1)
        bleu_2 = bleu(ref, hyp, weights=[0, 1], smoothing_function=Smooth.method1)
        bleu_3 = bleu(ref, hyp, weights=[0, 0, 1], smoothing_function=Smooth.method1)
        bleu_4 = bleu(ref, hyp, weights=[0, 0, 0, 1], smoothing_function=Smooth.method1)
        bleu_all = bleu(ref, hyp, weights=[0.25, 0.25, 0.25, 0.25], smoothing_function=Smooth.method1)
        result['bleu_1'] += bleu_1
        result['bleu_2'] += bleu_2
        result['bleu_3'] += bleu_3
        result['bleu_4'] += bleu_4
        result['bleu'] += bleu_all
    result['bleu_1'] /= number
    result['bleu_2'] /= number
    result['bleu_3'] /= number
    result['bleu_4'] /= number
    result['bleu'] /= number
    return result
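# A minimal, self-contained sketch of the convention these snippets appear to assume:
# `bleu` is nltk.translate.bleu_score.sentence_bleu and SmoothingFunction comes from the
# same module. Weights such as [0, 1] isolate a single n-gram order (here bigram
# precision), while [0.25] * 4 is the standard BLEU-4 mix used by eval() above.
from nltk.translate.bleu_score import sentence_bleu as bleu, SmoothingFunction

def _bleu_weights_demo():
    smooth = SmoothingFunction()
    refs = [["a", "small", "black", "dog", "runs", "fast"]]  # list of tokenized references
    hyp = ["a", "black", "dog", "runs", "fast"]              # tokenized hypothesis
    bigram_only = bleu(refs, hyp, weights=[0, 1], smoothing_function=smooth.method1)
    bleu_4 = bleu(refs, hyp, weights=[0.25, 0.25, 0.25, 0.25], smoothing_function=smooth.method1)
    return bigram_only, bleu_4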
def evaluate_pair(self, predWords, targetWords):
    """Compute the BLEU score of a prediction given a reference.

    Args:
        predWords: predicted words (a list of strings).
        targetWords: reference, same type as predWords.

    Returns:
        The BLEU score (computed with nltk.translate.bleu_score.sentence_bleu).
    """
    return bleu([self._clear_special_tokens(targetWords)],
                self._clear_special_tokens(predWords),
                smoothing_function=SMOOTH.method3)
def update(self, question, response, answers, vectorizer):
    # Tokenize the labelled-correct answers, all answers, and the response on whitespace.
    correct_answers = [a[0] for a in answers if a[1]]
    correct_answers = [re.split(r"\s+", a) for a in correct_answers]
    all_answers = [a[0] for a in answers]
    all_answers = [re.split(r"\s+", a) for a in all_answers]
    response = re.split(r"\s+", response)
    try:
        bleu_score = bleu(correct_answers, response,
                          smoothing_function=self.smoothing_function)
        bleu_score_all = bleu(all_answers, response,
                              smoothing_function=self.smoothing_function)
    except ZeroDivisionError:
        bleu_score = 0.0
        bleu_score_all = 0.0
        print("Bleu score 0 for response %s" % str(response))
    self.total_docs += 1
    self.total_bleu += bleu_score
    self.total_bleu_all += bleu_score_all
def find_best_translation(input_line, results):
    best_bleu_score = 0.0
    best_index = 0
    for index, result in enumerate(results):
        if len(result[1].split()) == 0:
            continue
        q2 = input_line.split('END')[2]
        bleu_score = bleu([q2.split()], result[1].split(), weights=(1.0,))
        # bleu_score = bleu([input_line.split()], result[1].split(), weights=(1.0,))
        if bleu_score > best_bleu_score:
            best_bleu_score = bleu_score
            best_index = index
    return best_index, best_bleu_score
def evaluate(mfccs, references, max_length_targ, encoder, decoder, targ_lang,
             device, beam_search=False, beam_width=3, alpha=0.3, nb_candidates=10):
    if not beam_search:
        result = greedy_decode(mfccs, max_length_targ, encoder, decoder, targ_lang, device)
    else:
        result = beam_search_decode(mfccs, max_length_targ, encoder, decoder, targ_lang,
                                    device=device, beam_width=beam_width,
                                    nb_candidates=nb_candidates, alpha=alpha)
    result = result.split()
    # BLEU-2: uniform weights over unigrams and bigrams.
    BLEUscore = bleu([references], result, weights=(0.5, 0.5))
    print("Input: {}".format(references))
    print("\n")
    print("Predicted translation: {}".format(result))
    print("\n")
    print("Bleu score: {}".format(BLEUscore))
def get_bleu_score(candidate_text, full_text, N=3):
    # Strip the POS tag from each word/pos-tag pair in the reference text.
    all_words = []
    for line in full_text:
        words = line.split()  # word/pos-tag pair
        for word in words:
            word = word.rsplit('/', 1)[0]
            all_words.append(word)
    weight = 1.0 / N
    bleu_score = 0.0
    candidate_seq = candidate_text.split()
    candidate_seq = [word.rsplit('/', 1)[0] for word in candidate_seq]
    # Slide an N-word window over the candidate and accumulate its BLEU against the
    # full reference, with uniform weights over the 1..N-gram orders.
    for index in range(len(candidate_seq) - N + 1):
        bleu_score += bleu([all_words], candidate_seq[index:index + N], [weight] * N)
    return bleu_score
def update(self, question, response, answers, vectorizer):
    all_answers = [a[0] for a in answers]
    all_answers = [re.split(r"\s+", a) for a in all_answers]
    response = re.split(r"\s+", response)
    similarities = []
    for a in all_answers:
        try:
            bleu_score = bleu([a], response,
                              smoothing_function=self.smoothing_function)
        except ZeroDivisionError:
            bleu_score = 0.0
        similarities.append(bleu_score)
    # MAP over the per-answer BLEU similarities, ranked against the relevance labels.
    map_score_based_on_bleu = calculateMAP(similarities, [a[1] for a in answers])
    average_bleu = np.mean(similarities)
    self.total_docs += 1
    self.mapBLEU += map_score_based_on_bleu
    self.total_average_bleu += average_bleu
    return similarities
def eval(self, hypList, refList):
    # Lower
    hypList = [it.lower() for it in hypList]
    refList = [it.lower() for it in refList]
    number = len(hypList)
    n_ref = len(refList) // number
    result = {'bleu_1': 0.0, 'bleu_2': 0.0, 'bleu_3': 0.0, 'bleu_4': 0.0, 'bleu': 0.0}
    for Index in range(0, number):
        ref = [refList[i].split() for i in range(Index * n_ref, (Index + 1) * n_ref)]
        ref = [r[:-1] if r[-1] == '.' else r for r in ref]
        hyp = hypList[Index].split()
        if hyp[-1] == '.':
            hyp = hyp[:-1]
        Smooth = SmoothingFunction()
        bleu_1 = bleu(ref, hyp, weights=[1], smoothing_function=Smooth.method1)
        bleu_2 = bleu(ref, hyp, weights=[0, 1], smoothing_function=Smooth.method1)
        bleu_3 = bleu(ref, hyp, weights=[0, 0, 1], smoothing_function=Smooth.method1)
        bleu_4 = bleu(ref, hyp, weights=[0, 0, 0, 1], smoothing_function=Smooth.method1)
        bleu_all = bleu(ref, hyp, weights=[0.25, 0.25, 0.25, 0.25], smoothing_function=Smooth.method1)
        result['bleu_1'] += bleu_1
        result['bleu_2'] += bleu_2
        result['bleu_3'] += bleu_3
        result['bleu_4'] += bleu_4
        result['bleu'] += bleu_all
    result['bleu_1'] /= number
    result['bleu_2'] /= number
    result['bleu_3'] /= number
    result['bleu_4'] /= number
    result['bleu'] /= number
    return result
def compute_match_scores(tgt_seqs, pred_seqs, do_lower=True, do_stem=True, type='exact'):
    '''
    If type='exact', returns a 1-D 0/1 array indicating whether each pred has an exactly matching tgt.
    Otherwise ('ngram', 'mixed' or 'bleu'), returns a 2-D matrix where each value v_ij is a float in
    [0, 1] giving the similarity between pred_i and tgt_j.
    :param tgt_seqs: target keyphrases, each a list of words
    :param pred_seqs: predicted keyphrases, each a list of words
    :param do_lower: lowercase all words before matching
    :param do_stem: stem all words before matching
    :param type: 'exact', 'ngram', 'mixed' or 'bleu'
    :return: match_score
    '''
    # do processing to baseline predictions
    if type == "exact":
        match_score = np.zeros(shape=(len(pred_seqs)), dtype='float32')
    else:
        match_score = np.zeros(shape=(len(pred_seqs), len(tgt_seqs)), dtype='float32')
    target_number = len(tgt_seqs)
    predicted_number = len(pred_seqs)
    metric_dict = {
        'target_number': target_number,
        'prediction_number': predicted_number,
        'correct_number': match_score
    }
    # convert target index into string
    if do_lower:
        tgt_seqs = [[w.lower() for w in seq] for seq in tgt_seqs]
        pred_seqs = [[w.lower() for w in seq] for seq in pred_seqs]
    if do_stem:
        tgt_seqs = [stem_word_list(seq) for seq in tgt_seqs]
        pred_seqs = [stem_word_list(seq) for seq in pred_seqs]
    for pred_id, pred_seq in enumerate(pred_seqs):
        if type == 'exact':
            match_score[pred_id] = 0
            for true_id, true_seq in enumerate(tgt_seqs):
                match = True
                if len(pred_seq) != len(true_seq):
                    continue
                for pred_w, true_w in zip(pred_seq, true_seq):
                    # if any aligned word pair differs, the match fails
                    if pred_w != true_w:
                        match = False
                        break
                # if every word in pred_seq matches one true_seq exactly, the match succeeds
                if match:
                    match_score[pred_id] = 1
                    break
        elif type == 'ngram':
            # use the Jaccard coefficient over 1+2 grams as the partial-match similarity
            pred_seq_set = set(pred_seq)
            pred_seq_set.update(set([pred_seq[i] + '_' + pred_seq[i + 1]
                                     for i in range(len(pred_seq) - 1)]))
            for true_id, true_seq in enumerate(tgt_seqs):
                true_seq_set = set(true_seq)
                true_seq_set.update(set([true_seq[i] + '_' + true_seq[i + 1]
                                         for i in range(len(true_seq) - 1)]))
                if float(len(set.union(*[set(true_seq_set), set(pred_seq_set)]))) > 0:
                    similarity = len(set.intersection(*[set(true_seq_set), set(pred_seq_set)])) \
                        / float(len(set.union(*[set(true_seq_set), set(pred_seq_set)])))
                else:
                    similarity = 0.0
                match_score[pred_id, true_id] = similarity
        elif type == 'mixed':
            # like the Jaccard variant, but in addition to 1+2 grams the full joined string is
            # included, so it serves as an exact+partial surrogate
            pred_seq_set = set(pred_seq)
            pred_seq_set.update(set([pred_seq[i] + '_' + pred_seq[i + 1]
                                     for i in range(len(pred_seq) - 1)]))
            pred_seq_set.update(set(['_'.join(pred_seq)]))
            for true_id, true_seq in enumerate(tgt_seqs):
                true_seq_set = set(true_seq)
                true_seq_set.update(set([true_seq[i] + '_' + true_seq[i + 1]
                                         for i in range(len(true_seq) - 1)]))
                true_seq_set.update(set(['_'.join(true_seq)]))
                if float(len(set.union(*[set(true_seq_set), set(pred_seq_set)]))) > 0:
                    similarity = len(set.intersection(*[set(true_seq_set), set(pred_seq_set)])) \
                        / float(len(set.union(*[set(true_seq_set), set(pred_seq_set)])))
                else:
                    similarity = 0.0
                match_score[pred_id, true_id] = similarity
        elif type == 'bleu':
            # account for the match of subsequences, like n-gram-based (BLEU) or LCS-based
            # n-gram precision doesn't work that well here
            for true_id, true_seq in enumerate(tgt_seqs):
                match_score[pred_id, true_id] = bleu(pred_seq, [true_seq], [0.7, 0.3, 0.0])
    return match_score
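# The 'ngram' and 'mixed' branches above both reduce to a Jaccard coefficient over
# unigram-plus-bigram sets. A compact standalone sketch of that similarity (the helper
# name is ours, not from the original code):
def jaccard_12gram(seq_a, seq_b):
    def grams(seq):
        s = set(seq)
        s.update(seq[i] + '_' + seq[i + 1] for i in range(len(seq) - 1))
        return s
    a, b = grams(seq_a), grams(seq_b)
    union = a | b
    return len(a & b) / float(len(union)) if union else 0.0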
def evaluate(self, data_loader):
    loss = 0.0
    data_size = 0
    score = {'Bleu_1': 0, 'Bleu_4': 0, 'ROUGE_L': 0, 'METEOR': 0}
    r = Rouge()
    m = Meteor()
    criterion = nn.NLLLoss()
    for iter, (batch_x, batch_y) in enumerate(data_loader):
        batch_size = batch_x.size(0)
        encoder_hidden = self.encoder.initHidden(batch_size)
        batch_x = Variable(batch_x.transpose(0, 1))
        batch_y = Variable(batch_y.transpose(0, 1))
        input_length = batch_x.size(0)
        target_length = batch_y.size(0)
        data_size += batch_size
        output = torch.LongTensor(target_length, batch_size)
        encoder_outputs = torch.zeros(self.max_length, batch_size, self.encoder.hidden_size)
        encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
        # Run the encoder over the source sequence one step at a time.
        for ei in range(input_length):
            encoder_output, encoder_hidden = self.encoder(batch_x[ei], batch_size, encoder_hidden)
            encoder_outputs[ei] = encoder_output[0]
        decoder_input = torch.LongTensor([SOS_token] * batch_size)
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        decoder_hidden = encoder_hidden
        # Greedy decoding: feed the top-1 token back in at every step.
        for di in range(target_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, batch_size, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            output[di] = topi.view(-1)
            decoder_input = topi.view(-1)
            loss += criterion(decoder_output, batch_y[di]).item()
        output = output.transpose(0, 1)  # (batch_size, target_len)
        for di in range(output.size()[0]):
            ignore = [0, 1, 2]  # [SOS_token, EOS_token, PAD_token]
            sent = [str(word.item()) for word in output[di] if word not in ignore]
            y = [str(word.item()) for word in batch_y[di] if word not in ignore]
            score['ROUGE_L'] += r.calc_score([' '.join(sent)], [' '.join(y)])
            score['Bleu_1'] += bleu([y], sent, weights=[1.0])
            score['Bleu_4'] += bleu([y], sent, weights=[0.25, 0.25, 0.25, 0.25])
            score['METEOR'] += m._score(" ".join(sent), [" ".join(y)])
    print('data amount:%d' % data_size)
    score['Bleu_1'] = score['Bleu_1'] / (target_length * data_size)
    score['Bleu_4'] = score['Bleu_4'] / (target_length * data_size)
    score['ROUGE_L'] = score['ROUGE_L'] / (target_length * data_size)
    score['METEOR'] = score['METEOR'] / (target_length * data_size)
    return loss / (target_length * data_size), score
def bleu_score(self, candidate, references):
    weights = [0.5, 0.5]
    candidate = [c for c in candidate if c != '<pad>']
    references = [[c for c in ref if c != '<pad>'] for ref in references]
    return bleu(references, candidate, weights)
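# Hypothetical usage of bleu_score() above (the `scorer` instance is illustrative only):
# the candidate is a token list and each reference is a token list, with '<pad>' entries
# stripped before scoring; weights [0.5, 0.5] give BLEU-2.
#
# scorer.bleu_score(
#     candidate=["the", "cat", "sat", "<pad>", "<pad>"],
#     references=[["the", "cat", "sat", "down", "<pad>"]],
# )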