def get_evaluation_scores(hypotheses, references, testing_mode=False):
    # Corpus-level GLEU for n-gram orders 1-4
    gleu_scores = {"Gleu_1": gleu.corpus_gleu(references, hypotheses, min_len=1, max_len=1),
                   "Gleu_2": gleu.corpus_gleu(references, hypotheses, min_len=1, max_len=2),
                   "Gleu_3": gleu.corpus_gleu(references, hypotheses, min_len=1, max_len=3),
                   "Gleu_4": gleu.corpus_gleu(references, hypotheses, min_len=1, max_len=4)}
    if testing_mode:
        # NLGEval works on detokenized strings, so join the token lists back together
        for i in range(len(hypotheses)):
            hypotheses[i] = ' '.join(hypotheses[i])
        refs = [[]]
        for i in range(len(references)):
            # use the first reference of each example
            refs[0].append(' '.join(references[i][0]))
            if refs[0][-1] == "":
                # guard against empty reference strings
                refs[0][-1] = "no"
        references = refs
        n = NLGEval()
        scores = n.compute_metrics(ref_list=references, hyp_list=hypotheses)
    else:
        scores = {"Bleu_1": bleu_score.corpus_bleu(references, hypotheses, weights=[1.0]),
                  "Bleu_2": bleu_score.corpus_bleu(references, hypotheses, weights=[1. / 2, 1. / 2]),
                  "Bleu_3": bleu_score.corpus_bleu(references, hypotheses, weights=[1. / 3, 1. / 3, 1. / 3]),
                  "Bleu_4": bleu_score.corpus_bleu(references, hypotheses, weights=[1. / 4, 1. / 4, 1. / 4, 1. / 4])}
    for key, val in gleu_scores.items():
        scores[key] = val
    return scores
def calculate_bleu(data, src_field, model, device, decodeType, max_len=30):
    cc = SmoothingFunction()
    sentBleu = 0.0
    sentGleu = 0.0
    trgs = []
    pred_trgs = []
    # bs = Beam_Search(model)
    for datum in tqdm(data):
        trg = vars(datum)['correction1']
        src = vars(datum)['orig']
        if decodeType == "greedy":
            pred_trg = translate_sentence(src, src_field, model, device, max_len)
        else:
            # pred_trg = bs(src, src_field, device)
            pred_trg = beam_search(src, src_field, model, device, max_len)
        # cut off <eos> token
        # pred_trg = pred_trg[1:-1]
        # if len(pred_trg) < 2: pred_trg.append(".")
        sentBleu += sentence_bleu([trg], pred_trg, smoothing_function=cc.method3)
        sentGleu += sentence_gleu([trg], pred_trg)
        pred_trgs.append(pred_trg)
        trgs.append([trg])
    sentBleu = sentBleu / len(data)
    sentGleu = sentGleu / len(data)
    corpusBleu = corpus_bleu(trgs, pred_trgs, smoothing_function=cc.method3)
    corpusGleu = corpus_gleu(trgs, pred_trgs)
    return sentBleu, sentGleu, corpusBleu, corpusGleu
def corpus_gleu(references: List[str], predictions: List[str]):
    if len(references) != len(predictions):
        raise ValueError("The lists must have the same length")
    # wrap each reference as a single-reference list, as NLTK expects
    references = [[o] for o in references]
    return gleu_score.corpus_gleu(references, predictions)
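# Hedged usage sketch for the corpus_gleu wrapper above; the token lists below are
# made-up examples. NLTK's gleu_score.corpus_gleu scores token sequences, so each
# item should already be tokenized (a raw string would be scored character by character).
from nltk.translate import gleu_score

example_refs = [["the", "cat", "sat"], ["hello", "world"]]
example_preds = [["the", "cat", "sat"], ["hello", "there"]]
# Wrap each reference in its own list, exactly as the wrapper above does internally.
example_score = gleu_score.corpus_gleu([[r] for r in example_refs], example_preds)
print("example corpus GLEU: %.4f" % example_score)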
def test():
    hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that',
            'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party']
    ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that',
             'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands']
    ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees', 'the',
             'military', 'forces', 'always', 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army', 'always',
             'to', 'heed', 'the', 'directions', 'of', 'the', 'party']
    hyp2 = str('he read the book because he was interested in world history').split()
    ref2a = str('he was interested in world history because he read the book').split()
    list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    hypotheses = [hyp1, hyp2]
    corpus_score = gleu.corpus_gleu(list_of_references, hypotheses)
    print("Corpus score: " + str(corpus_score))
    # The example below shows that corpus_gleu() differs from averaging sentence_gleu() over the hypotheses
    score1 = gleu.sentence_gleu([ref1a], hyp1)
    score2 = gleu.sentence_gleu([ref2a], hyp2)
    average_score = (score1 + score2) / 2
    print("Sentence score average: " + str(average_score))
def computeGLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize
    # NOTE: unlike the variants below, this version never applies the tokenizer,
    # so outputs/targets are expected to be pre-tokenized already.
    if not corpus:
        return [my_sentence_gleu([t], o) for o, t in zip(outputs, targets)]
    return corpus_gleu([[t] for t in targets], [o for o in outputs])
def computeGLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    if not corpus:
        return torch.Tensor([sentence_gleu([t], o) for o, t in zip(outputs, targets)])
    return corpus_gleu([[t] for t in targets], [o for o in outputs])
def computeGLEU(outputs, targets, corpus=False, tokenizer=None, segmenter=None):
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]
    if segmenter is not None:
        outputs = segmenter(outputs)
        targets = segmenter(targets)
    if not corpus:
        return [sentence_gleu([t], o) for o, t in zip(outputs, targets)]
    return corpus_gleu([[t] for t in targets], [o for o in outputs])
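# Hedged usage sketch for the computeGLEU variant above (made-up strings). A plain
# whitespace split stands in for the tokenizer and the segmenter is left unset;
# sentence_gleu/corpus_gleu are assumed to be imported in the surrounding module.
example_outputs = ["the cat sat on the mat", "hello world"]
example_targets = ["the cat sat on a mat", "hello there world"]
per_sentence = computeGLEU(example_outputs, example_targets, corpus=False, tokenizer=str.split)
corpus_level = computeGLEU(example_outputs, example_targets, corpus=True, tokenizer=str.split)
print(per_sentence, corpus_level)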
def _compute(
    self,
    predictions: List[List[List[str]]],
    references: List[List[str]],
    min_len: int = 1,
    max_len: int = 4,
) -> Dict[str, float]:
    return {
        "google_bleu": gleu_score.corpus_gleu(list_of_references=references,
                                              hypotheses=predictions,
                                              min_len=min_len,
                                              max_len=max_len)
    }
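# Hedged sketch of the call shape the _compute wrapper above forwards to NLTK
# (made-up token lists; the surrounding metric class and its loader are assumed).
from nltk.translate import gleu_score

example_hyps = [["the", "model", "translated", "this"]]
example_refs = [[["the", "model", "translated", "this", "sentence"]]]  # one reference list per hypothesis
print(gleu_score.corpus_gleu(list_of_references=example_refs,
                             hypotheses=example_hyps,
                             min_len=1, max_len=4))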
def corpus_score(self, list_of_references, hypotheses, score_type="BLEU"):
    """
    Compute a corpus-level score.

    :param list_of_references: list of reference texts
    :param hypotheses: hypotheses aligned with the references
    :param score_type: metric being used
    :return: corpus score
    """
    corpus_score = None
    if utils.BLEU_NAME in score_type:
        corpus_score = bleu.corpus_bleu(list_of_references, hypotheses)
    elif utils.GOOGLE_BLEU_NAME in score_type:
        corpus_score = gleu.corpus_gleu(list_of_references, hypotheses)
    print("%s corpus score: %.4f" % (score_type, corpus_score))
    return corpus_score
def txt_to_error_detection_tsv(source_txt, target_txt, main_label='i'):
    # Convert parallel txt files to tsv; returns a DataTSV data structure
    overall_correct = 0
    overall_incorrect = 0
    average_incorrect = 0
    average_correct = 0
    tracker = 1
    wrong_sents = read_txt(source_txt)
    correct_sents = read_txt(target_txt)
    assert len(wrong_sents) == len(correct_sents), \
        "Files " + source_txt + " and " + target_txt + " do not have the same number of sentences! Aborting..."
    data_list = []
    for wrong, correct in zip(wrong_sents, correct_sents):
        label_string, incorrect_counter, correct_counter = get_alignment(correct, wrong)
        overall_correct += correct_counter
        overall_incorrect += incorrect_counter
        data_list += [[wrong, label_string]]
        # running averages of per-sentence correct/incorrect counts
        average_incorrect = average_incorrect * (tracker - 1) / tracker + incorrect_counter / tracker
        average_correct = average_correct * (tracker - 1) / tracker + correct_counter / tracker
        tracker += 1

    # Calculating GLEU score
    wrong_lister = [sent.split() for sent in wrong_sents]
    right_lister = [sent.split() for sent in correct_sents]
    # word_error_rate = wer(correct_sents, wrong_sents)
    smoother = bleu_score.SmoothingFunction()
    # each hypothesis needs its own list of reference token lists
    corpus_gleu = gleu_score.corpus_gleu([[r] for r in right_lister], wrong_lister)
    # corpus_bleu = bleu_score.corpus_bleu([[r] for r in right_lister], wrong_lister, smoothing_function=smoother.method1)
    corpus_bleu = 0
    tsv_data = DataTSV(main_label)
    tsv_data.data_list = data_list
    tsv_data.propagate_list()
    return tsv_data, corpus_gleu, corpus_bleu, overall_correct, overall_incorrect, average_incorrect, average_correct
def test(self, testData):
    """
    Args:
        testData (list): [[FS, SS], ...], type(FS)=type(SS)=unicode
    """
    TSSList = self.generateSSList([FS for FS, SS in testData])
    filepath = SMT_RESULT_PATH + '/poemSMT_lmn%d_sm%.3f_lmw%.3f_be%d.txt' % (
        self.LM_GRAM_NUM, self.SMOOTHING_LAMBDA, self.LM_WEIGHT, self.BEAM_SIZE)
    saveResult(testData, TSSList, filepath)
    refList = [[SS] for FS, SS in testData]
    bleu = corpus_bleu(refList, TSSList, weights=SCORE_WEIGHT)
    gleu = corpus_gleu(refList, TSSList, min_len=1, max_len=len(SCORE_WEIGHT))
    infoStr = 'BLEU=%.4f, GLEU=%.4f |poemSMT_lmn%d_sm%.3f_lmw%.3f_be%d' % (
        bleu, gleu, self.LM_GRAM_NUM, self.SMOOTHING_LAMBDA, self.LM_WEIGHT, self.BEAM_SIZE)
    return bleu, gleu, infoStr
def get_bleu_score():
    """
    :return: BLEU (and GLEU) scores for the translation, averaged over folds.
    """
    folds = os.listdir(tr)
    # folds = [dir for dir in os.listdir(tr) if os.path.isdir(tr + dir)]
    print(folds)
    fbleu = 0
    fgleu = 0
    for fold in folds:
        target_file = os.path.join(data, fold + '/train.de')
        trans_file = os.path.join(tr, fold + '/train.en.trans.de')
        target = codecs.open(target_file, 'r', encoding='utf-8')  # reference
        trans = codecs.open(trans_file, 'r', encoding='utf-8')  # correction
        references = []
        hypotheses = []
        for pair in zip(target, trans):
            ref = [pair[0].split()]
            hp = pair[1].split()
            if '10_gram' in tr:
                # 10-gram outputs join tokens with '_'; split them back into words
                h1 = ''.join(hp)
                hp = h1.split('_')
            references.append(ref)
            hypotheses.append(hp)
        bleu_score = nltk.translate.bleu_score.corpus_bleu(references, hypotheses)
        gleu_score = gleu.corpus_gleu(references, hypotheses)
        fbleu += bleu_score
        fgleu += gleu_score
        print(f'Bleu score for {fold}: {bleu_score}')
        # print(f'Gleu score for {fold}: {gleu_score}')
    av_bleu = fbleu / len(folds)
    av_gleu = fgleu / len(folds)
    print(f'Average bleu score: {av_bleu}')
def toy(batch_size, max_len, vocab_size, seed, score_type, gs_type, output_dir,
        iterations, use_reg):
    assert gs_type in ['softmax', 'gs_hard']
    min_len = max(5, max_len // 4)
    eos_id = 2
    pad_id = 0

    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    reference_lengths = np.random.randint(low=min_len, high=max_len - 1, size=batch_size)
    reference_tokens = generate_reference(max_len, reference_lengths, vocab_size, eos_id, pad_id)
    ref_tokens_var = tf.Variable(reference_tokens, trainable=False)
    ref_onehot = onehot(reference_tokens, vocab_size)
    ref = tf.Variable(ref_onehot, dtype=tf.float32, trainable=False)

    hyp_shape = (batch_size, max_len, vocab_size)
    hyp_logits = tf.Variable(np.random.rand(*hyp_shape), dtype=tf.float32)
    preds = tf.to_int32(tf.arg_max(hyp_logits, dimension=-1))
    # preds = p(preds, 'preds')

    global_step_var = tf.Variable(0, name='global_step', dtype=tf.int32, trainable=False)
    weights = tf.sequence_mask(reference_lengths, maxlen=max_len, dtype=tf.float32)
    # w_noise = tf.distributions.Bernoulli(probs=0.01, dtype=tf.float32).sample(tf.shape(weights))
    # weights = tf.multiply(weights, w_noise)
    # mle_loss = sequence_loss(targets=ref_tokens_var,
    #                          logits=hyp_logits,
    #                          weights=weights,
    #                          average_across_batch=True)

    if gs_type == 'softmax':
        scorer_input = tf.nn.softmax(hyp_logits)
    else:
        scorer_input = gumbel_softmax(hyp_logits, 0.5, hard=True)

    scorer_class = BleuScorer if score_type == 'bleu' else GleuScorer
    scorer = scorer_class(seq_length=max_len,
                          vocab_size=vocab_size,
                          eos_idx=eos_id,
                          reference=ref,
                          hypothesis=scorer_input,
                          ngram_lengths=[1, 2, 3, 4],
                          input_type=ONEHOT_SOFT)
    score = scorer.batch_score

    length_diff = tf.abs(scorer.ref_lengths - scorer.hyp_lengths)
    ref_hyp_length_diff = tf.reduce_mean(length_diff)
    target_prob = .95
    mean_max_prob = tf.reduce_mean(
        tf.reduce_max(tf.clip_by_value(scorer_input, -.1, target_prob), axis=-1))
    # mean_max_prob = p(mean_max_prob, 'maxmean')
    # reg_on_softmax = -mean_max_prob
    reg_on_softmax = tf.reduce_mean(-tf.square(scorer_input) + scorer_input)

    # score_loss = -tf.log(1e-7 + score) + length_penalty_loss + mle_loss
    # score_loss = -tf.log(1e-7 + score) + length_penalty_loss
    # score_loss = -tf.log(1e-7 + score) + mle_loss
    scale = tf.clip_by_value(tf.to_float(global_step_var) / 1., 0., 50000.)
    # score_loss = -tf.log(1e-7 + score) + scale * reg_on_softmax + length_penalty_loss
    score_loss = -tf.log(1e-7 + score)
    if use_reg:
        score_loss = score_loss + 10000. * reg_on_softmax
    # score_loss = -score
    # score_loss = mle_loss

    optimizer = tf.train.AdamOptimizer(learning_rate=.01, beta1=0.9, beta2=0.98, epsilon=1e-8)
    grads_and_vars = optimizer.compute_gradients(score_loss)
    gradients, variables = list(zip(*grads_and_vars))
    gradient_norm = tf.global_norm(gradients)
    # gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=global_step_var)
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    tf.summary.scalar('score', scorer.batch_score)
    tf.summary.scalar('score_loss', score_loss)
    sums_op = tf.summary.merge_all()

    with tf.train.MonitoredTrainingSession(checkpoint_dir=output_dir,
                                           save_summaries_steps=5,
                                           save_checkpoint_secs=1200) as sess:
        sum_writer = SummaryWriterCache.get(output_dir)
        sess.run(init_op)
        targets = [
            train_op, global_step_var, sums_op, score_loss, score,
            gradient_norm, preds, ref_hyp_length_diff, mean_max_prob
        ]
        best_score = -np.infty
        nltk_score = 0
        for step in tqdm(range(iterations), ncols=70, leave=False, unit='batch'):
            _, global_step, graph_sums, loss_value, score_value, norm, \
                pred_values, diff, mmp = sess.run(targets)

            # Compute batch BLEU and GLEU with NLTK and save summaries of them
            cropped_y = [[_crop(reference_tokens[k, :], eos_id)] for k in range(batch_size)]
            cropped_preds = [_crop(pred_values[k, :], eos_id) for k in range(batch_size)]
            nltk_bleu = corpus_bleu(cropped_y, cropped_preds)
            nltk_gleu = corpus_gleu(cropped_y, cropped_preds)
            nltk_score = nltk_bleu if score_type == 'bleu' else nltk_gleu
            if nltk_score > best_score:
                best_score = nltk_score

            if step % 10 == 0:
                msg = ("Loss: {:.5e}, score: {:.5e}, nltk.score: {:.5e}, "
                       "norm: {:.5e}, diff: {:01.2f}, maxprob: {:.2f}")
                # print(msg.format(loss_value, score_value, nltk_score, norm, diff, mmp))
                sums = {
                    'nltk.bleu': nltk_bleu,
                    'nltk.gleu': nltk_gleu,
                }
                for label, measure in sums.items():
                    summary = tf.Summary(
                        value=[tf.Summary.Value(tag=label, simple_value=measure)])
                    sum_writer.add_summary(summary, global_step=global_step)

    best_score_file = os.path.join(output_dir, 'best_score.txt')
    with open(best_score_file, 'w') as f:
        print("best score: {}".format(best_score), file=f)
        print("last score: {}".format(nltk_score), file=f)
print("Restoring model from {}...".format(ckpt_path)) optimistic_restore(sess, ckpt_path) print("done.") try: def cond(idx): return idx < args.training if args.training else True k = 0 i = 0 while cond(k): pred_values, y_values = sess.run([preds, y]) bleu_preds.extend([_crop(p, EOS) for p in pred_values.tolist()]) bleu_references.extend([[_crop(r, EOS)] for r in y_values.tolist()]) if args.decode: for t, p in zip(y_values.tolist(), pred_values.tolist()): i += 1 print("T[{}] : {}".format(i, decode(t))) print("P[{}] : {}".format(i, decode(p))) else: print("|", end='', flush=True) k += 1 except tf.errors.OutOfRangeError: pass bleu_score = corpus_bleu(bleu_references, bleu_preds) gleu_score = corpus_gleu(bleu_references, bleu_preds) print("BLEU: {}".format(bleu_score)) print("GLEU: {}".format(gleu_score))
def main():
    from nltk.translate.gleu_score import corpus_gleu, sentence_gleu
    eos = 6
    reference_batch = [[1, 1, 2, 1, eos]]  # , [5, 1, eos, 0, 0], [2, 5, 3, eos, 1]]
    candidate_batch = [[1, 3, 1, eos, 0]]  # , [5, 2, eos, 0, 0], [2, 2, 3, eos, 0]]
    row = 0
    seq_length = len(candidate_batch[row])

    true_batch_gleu = corpus_gleu([[_crop(r, eos)] for r in reference_batch],
                                  [_crop(c, eos) for c in candidate_batch])
    gleu_score, n_match, tpfp, tpfn = custom_sentence_gleu(
        [_crop(reference_batch[row], eos)], _crop(candidate_batch[row], eos))
    true_gleu_scores = [
        sentence_gleu([_crop(reference_batch[k], eos)], _crop(candidate_batch[k], eos))
        for k in range(len(candidate_batch))
    ]
    print("true gleu: {}, n_match: {}, tpfp: {}, tpfn: {}".format(
        gleu_score, n_match, tpfp, tpfn))

    gleu_scorer = GleuScorer(seq_length=seq_length, vocab_size=eos + 1,
                             eos_idx=eos, input_type=ONEHOT_SOFT)
    # feed_hyp = np_label_smoothing(np_onehot(np.array(candidate_batch)), epsilon=1e-5)
    # feed_refs = np_label_smoothing(np_onehot(np.array(reference_batch)), epsilon=1e-5)
    feed_hyp = np_onehot(np.array(candidate_batch))
    feed_refs = np_onehot(np.array(reference_batch))
    # print("---> {}".format(feed_refs))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        feed_dict = {
            gleu_scorer.hypothesis: feed_hyp,
            gleu_scorer.reference: feed_refs
        }
        targets = [
            gleu_scorer.batch_gleu_score, gleu_scorer.sentence_n_match,
            gleu_scorer.tpfn, gleu_scorer.tpfp, gleu_scorer.sentence_gleu_score,
            gleu_scorer.individual_ngrams[0]
        ]
        (batch_gleu, n_match, tpfn, tpfp, gleu, ngram) = sess.run(targets, feed_dict=feed_dict)
        print("our gleu: {}, n_match: {}, tpfp: {}, tpfn: {}".format(
            gleu[row], n_match[row], tpfp[row], tpfn[row]))
        print("\n\nBatch gleu's. official: {}. ours: {}".format(true_batch_gleu, batch_gleu))
        print("\n\nall gleus....")
        print("true ones: {}".format(true_gleu_scores))
        print("ours: {}".format(gleu))
        print("ngram: {}".format(ngram))
def computeCorpGleu(target, reference):
    # NLTK's corpus_gleu takes the references first, then the hypotheses
    return corpus_gleu(reference, target)
def evaluation_metrics(dataset, steps, size):
    references = []
    hypotheses = []
    rouge = Rouge()
    rouge_dict = {
        "rouge-1": {"f": 0.0, "p": 0.0, "r": 0.0},
        "rouge-2": {"f": 0.0, "p": 0.0, "r": 0.0},
        "rouge-l": {"f": 0.0, "p": 0.0, "r": 0.0},
    }

    # build the references & hypotheses lists
    for inputs, targets in dataset.take(steps):
        for labels in target_tokenizer.sequences_to_texts(test_step(inputs, targets)):
            if len(labels) > 0:
                hypotheses.append(labels.split())
            else:
                hypotheses.append([""])
        for labels in input_tokenizer.sequences_to_texts(inputs.numpy()):
            references.append(word_split(labels))

    for index, hypothesis in enumerate(hypotheses):
        max_score = {
            "rouge-1": {"f": 0.0, "p": 0.0, "r": 0.0},
            "rouge-2": {"f": 0.0, "p": 0.0, "r": 0.0},
            "rouge-l": {"f": 0.0, "p": 0.0, "r": 0.0},
        }
        # one hypothesis may have several references
        for reference in references[index]:
            try:
                rouge_score = rouge.get_scores(" ".join(hypothesis), " ".join(reference))[0]
                # keep the best score
                if rouge_sum_score(rouge_score) > rouge_sum_score(max_score):
                    max_score = rouge_score
            except ValueError:
                pass
        for method_key in rouge_dict:
            # fpr iterates over f1, precision and recall
            for fpr in rouge_dict[method_key]:
                rouge_dict[method_key][fpr] += max_score[method_key][fpr]

    # average
    for method_key in rouge_dict:
        for fpr in rouge_dict[method_key]:
            rouge_dict[method_key][fpr] /= size

    bleu = bleu_score.corpus_bleu(references, hypotheses, weights=(1, ))
    gleu = gleu_score.corpus_gleu(references, hypotheses, max_len=1)
    nist = nist_score.corpus_nist(references, hypotheses, n=1)

    print("BLEU-1 Score: %.4f" % bleu)
    print("GLEU-1 Score: %.4f" % gleu)
    print("NIST-1 Score: %.4f" % nist)
    print("ROUGE Scores: %s" % rouge_dict_format(rouge_dict))
    return bleu, gleu, nist, rouge_dict
def score_compute(comp_res):
    res_wer = []
    bleu_indi1 = []
    bleu_indi2 = []
    bleu_indi3 = []
    bleu_indi4 = []
    bleu_cum2 = []
    bleu_cum3 = []
    bleu_cum4 = []
    gleu_sent = []
    meteor_score = []
    rouge_score = []
    translated = []
    reference = []

    for i in range(len(comp_res)):
        reference.append([comp_res[i][0].split(' ')])
        translated.append(comp_res[i][1].split(' '))
    bleu_corpus = corpus_bleu(reference, translated)
    # sacrebleu_corpus = sacrebleu.corpus_bleu(translated, reference)
    gleu_corpus = corpus_gleu(reference, translated)

    # evaluator object for the ROUGE-L metric
    evaluator = Rouge(metrics=['rouge-l'],
                      limit_length=True,
                      length_limit=100,
                      length_limit_type='words',
                      apply_avg=True,
                      apply_best=False,
                      alpha=0.5,  # default F1_score weight
                      weight_factor=1.2,
                      stemming=True)

    for result_pair in comp_res:
        # ------------ WER
        res_back = wer(result_pair[0].split(' '), result_pair[1].split(' '))
        res_wer.append(res_back)

        # ------------ BLEU
        ref_tokens = [result_pair[0].split(' ')]
        hyp_tokens = result_pair[1].split(' ')
        indi1_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(1, 0, 0, 0))  # individual 1-gram
        indi2_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0, 1, 0, 0))  # individual 2-gram
        indi3_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0, 0, 1, 0))  # individual 3-gram
        indi4_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0, 0, 0, 1))  # individual 4-gram
        # cumulative 2-gram, 3-gram, 4-gram BLEU
        cum2_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0.5, 0.5, 0, 0))
        cum3_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0.33, 0.33, 0.33, 0))
        cum4_gr = sentence_bleu(ref_tokens, hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25))

        gleu_s = sentence_gleu(ref_tokens, hyp_tokens)
        meteor = round(single_meteor_score(result_pair[0], result_pair[1]), 4)
        rouge_all = evaluator.get_scores(result_pair[1], result_pair[0])
        rouge_l_f1 = rouge_all['rouge-l']['f']

        bleu_indi1.append(indi1_gr)
        bleu_indi2.append(indi2_gr)
        bleu_indi3.append(indi3_gr)
        bleu_indi4.append(indi4_gr)
        bleu_cum2.append(cum2_gr)
        bleu_cum3.append(cum3_gr)
        bleu_cum4.append(cum4_gr)
        gleu_sent.append(gleu_s)
        meteor_score.append(meteor)
        rouge_score.append(rouge_l_f1)

    wer_mean = np.mean(res_wer)
    wer_var = np.var(res_wer)
    bleu_indi1_mean = np.mean(bleu_indi1)
    bleu_indi2_mean = np.mean(bleu_indi2)
    bleu_indi3_mean = np.mean(bleu_indi3)
    bleu_indi4_mean = np.mean(bleu_indi4)
    bleu_cum2_mean = np.mean(bleu_cum2)
    bleu_cum3_mean = np.mean(bleu_cum3)
    bleu_cum4_mean = np.mean(bleu_cum4)
    gleu_s_mean = np.mean(gleu_sent)
    meteor_s_mean = np.mean(meteor_score)
    rouge_s_mean = np.mean(rouge_score)

    bleus = (bleu_indi1_mean, bleu_indi2_mean, bleu_indi3_mean, bleu_indi4_mean,
             bleu_cum2_mean, bleu_cum3_mean, bleu_cum4_mean, bleu_corpus)
    gleus = (gleu_s_mean, gleu_corpus)
    return wer_mean, wer_var, bleus, gleus, meteor_s_mean, rouge_s_mean
def compute_gleu(targets, translations):
    # strip BPE continuation markers ("@@ ") before scoring
    references = [[target.replace("@@ ", "").split(" ")] for target in targets]
    translations = [t.replace("@@ ", "").split(" ") for t in translations]
    return gleu_score.corpus_gleu(references, translations)
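# Hedged usage sketch for compute_gleu above with made-up BPE-segmented strings;
# the "@@ " continuation markers are stripped before scoring, so both sides are
# compared at the word level (gleu_score is assumed to be imported in that module).
bpe_targets = ["the trans@@ lation is good"]
bpe_outputs = ["the trans@@ lation is fine"]
print(compute_gleu(bpe_targets, bpe_outputs))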
def get_gleu_score(targets, decodes):
    # wrap each target as a single-reference list, as corpus_gleu expects
    return corpus_gleu([[t] for t in targets], [o for o in decodes])


####################################################################################################
def gleu(prediction, ground_truth):
    # corpus GLEU scaled to a percentage
    return corpus_gleu([[x] for x in ground_truth], prediction) * 100.
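# Hedged usage sketch for gleu() above (made-up token lists); the ground truth is
# wrapped per hypothesis inside the function, so callers pass two parallel lists
# (corpus_gleu is assumed to be imported in that module).
ground_truth = [["good", "morning"], ["see", "you", "soon"]]
prediction = [["good", "morning"], ["see", "you", "later"]]
print(gleu(prediction, ground_truth))  # corpus GLEU as a percentage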
all_records_prediction.append(record_translation)
# print("=" * 40)
print("Done : ", text_index + 1)

bleu_score_4 = corpus_bleu(list_of_references, hypothesis)
bleu_score_1 = corpus_bleu(list_of_references, hypothesis, weights=(1.0, 0, 0, 0))
bleu_score_2 = corpus_bleu(list_of_references, hypothesis, weights=(0.5, 0.5, 0, 0))
bleu_score_3 = corpus_bleu(list_of_references, hypothesis, weights=(0.3, 0.3, 0.3, 0))
gleu_score = corpus_gleu(list_of_references, hypothesis)

bleu_score_final = "Overall BLEU Score on FFR v1.0 Test Dataset : {}".format(
    round(max(bleu_score_1, bleu_score_2, bleu_score_3, bleu_score_4) * 100, 2))
gleu_score_ = "Overall GLEU Score on FFR v1.0 Test Dataset : {}".format(
    round(gleu_score * 100, 2))

testing_scores = list()
testing_scores.append(bleu_score_final)
testing_scores.append(gleu_score_)

# np_all_results = np.array(all_bleu_scores)
# np_all_predictions = np.array(all_records_prediction)
# np.save("all_bleu_results_fr", np_all_results)
# np.save("all_records_prediction", np_all_predictions)
    outputs.append(tokens_orig)
    # bleu = bleu_score([outputs[-1]], [targets[-1]])
    # bleu_overall = bleu_score(outputs, targets)
    # BleuScores.append(bleu)
    # if np.int(np.floor((i + 1) / 5) + 1) % 2 == 0:
    #     # bleu = bleu_score(outputs, targets)
    #     print(np.int(np.floor((i + 1) / 5) + 1))
    #     bleu = bleu_score(outputs, targets)
    #     BleuScores.append(bleu)
    #     outputs = []
    #     targets = []
    if (i - 1) % 5 == 0:
        targ = line.strip().split(".", 1)[0]
        print(line.strip())
        print("+++++", targ)
        tokens_targ = [token.text.lower() for token in spacy_eng(targ)]
        targets.append([tokens_targ])

# BleuScores = np.array(BleuScores)
# print("average = ", np.mean(BleuScores))
print(bleu_score(outputs, targets))
print(corpus_bleu(targets, outputs))
print(corpus_gleu(targets, outputs))
print(BleuScores)
import os
os.system('pip install nltk')

from nltk.translate.gleu_score import corpus_gleu

ref_final = []
hyp_final = []
ref_list = input('Enter the list of references : ').split('.')
hyp_list = input('Enter the list of hypotheses : ').split('.')
for r in ref_list[:-1]:
    ref_final.append(r.split(' '))
for h in hyp_list[:-1]:
    hyp_final.append(h.split(' '))
# wrap each reference as a single-reference list, as corpus_gleu expects
print(corpus_gleu([[r] for r in ref_final], hyp_final))
# bleu = bleu_score([[tokens_orig]], [tokens_orig])
# print(bleu)
# f = open("test10k.tsv", "r")  # 9:13pm
f = open("/data/chaudhryz/ankit/test10k.tsv", "r")
BleuScores = []
targets = []
outputs = []
for i, line in enumerate(f):
    orig = line.split("\t")[0]
    targ = line.split("\t")[1]
    # print(orig)
    tokens_orig = [token.text.lower() for token in spacy_eng(orig)]
    tokens_targ = [token.text.lower() for token in spacy_eng(targ)]
    outputs.append(tokens_orig)
    targets.append([tokens_targ])
    if i % 100 == 0:
        print(i)

# print(bleu_score(outputs, targets))
print("NLTK BLEU score: ", corpus_bleu(targets, outputs))
print("NLTK GLEU score: ", corpus_gleu(targets, outputs))