def test_sentence_nist(self):
    ref_file = find("models/wmt15_eval/ref.ru")
    hyp_file = find("models/wmt15_eval/google.ru")
    mteval_output_file = find("models/wmt15_eval/mteval-13a.output")

    # Read the NIST scores from the `mteval-13a.output` file.
    # The order of the list corresponds to the order of the ngrams.
    with open(mteval_output_file) as mteval_fin:
        # The numbers are located on the fourth-to-last line of the file.
        # The first and last items on that line are the score and system names.
        mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1])

    with open(ref_file, encoding="utf8") as ref_fin:
        with open(hyp_file, encoding="utf8") as hyp_fin:
            # Whitespace-tokenize the files.
            # Note: split() strips whitespace automatically.
            hypotheses = list(map(lambda x: x.split(), hyp_fin))
            # Note that the corpus_nist input is a list of lists of references.
            references = list(map(lambda x: [x.split()], ref_fin))

    # Without smoothing.
    for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
        nltk_nist = corpus_nist(references, hypotheses, i)
        # Check that the NIST score difference is less than 0.05.
        assert abs(mteval_nist - nltk_nist) < 0.05
def get_nist(reference_file, hypotesis_file):
    if not os.path.exists(reference_file):
        # print(f"File '{reference_file}' not found")
        return 0
    if not os.path.exists(hypotesis_file):
        # print(f"File '{hypotesis_file}' not found")
        return 0
    with open(reference_file, 'r') as tf_ref, open(hypotesis_file, 'r') as tf_hyp:
        lines_ref = tf_ref.read().splitlines()
        lines_hyp = tf_hyp.read().splitlines()

    len_ref = len(lines_ref)
    len_hyp = len(lines_hyp)
    if len_ref != len_hyp:
        print("Different number of lines in files: {0} (reference), {1} (hypothesis)"
              .format(len_ref, len_hyp))
        return 0

    # Turn each line into a list of tokens.
    strings_ref = []
    for i in range(0, len(lines_ref)):
        strings_ref.append([lines_ref[i].split()])  # nested list: one reference per hypothesis

    strings_hyp = []
    for i in range(0, len(lines_hyp)):
        strings_hyp.append(lines_hyp[i].split())

    score = nist_score.corpus_nist(strings_ref, strings_hyp)
    return score
def compute_scores(pred_fname, ref_fname):
    # Read files.
    hyps = read_corpus(pred_fname, ref=False)
    refs = read_corpus(ref_fname, ref=True)

    # NIST score
    nist = ns.corpus_nist(refs, hyps, n=4)

    # BLEU score
    chencherry = bs.SmoothingFunction()
    bleu = bs.corpus_bleu(refs, hyps, smoothing_function=chencherry.method2)

    # Edit distance
    total_len = 0.0
    edi = 0.0
    for r, h in zip(refs, hyps):
        total_len += max(len(r[0]), len(h))
        edi += edit_distance(r[0], h)  # TODO: strange -- inputs should be strings, not token lists!

    bleu_score = 100.0 * round(bleu, 4)
    edist_score = 100.0 * round(1 - edi / total_len, 4)
    nist_score = round(nist, 2)
    return Scores(bleu_score, edist_score, nist_score)
def main():
    arguments = sys.argv[1:]
    num_args = len(arguments)
    if num_args != 2:
        print('Wrong number of arguments.')
        print(str(sys.argv[0]), 'system-dir', 'reference-dir')
        exit()
    system_path = arguments[0]
    ref_path = arguments[1]

    # For all files in the system path.
    for filename in os.listdir(system_path):
        print('Filename', str(filename))
        system_filename = os.path.join(system_path, filename)
        ref_filename = os.path.join(ref_path, filename)

        # Read files.
        ref = read_corpus(ref_filename, ref=True)
        hyp = read_corpus(system_filename, ref=False)

        # NIST score
        nist = ns.corpus_nist(ref, hyp, n=4)

        # BLEU score
        chencherry = bs.SmoothingFunction()
        bleu = bs.corpus_bleu(ref, hyp, smoothing_function=chencherry.method2)
        print('BLEU', str(round(bleu, 3)))

        total_len = 0.0
        edi = 0.0
        for r, h in zip(ref, hyp):
            total_len += max(len(r[0]), len(h))
            edi += edit_distance(r[0], h)
        print('DIST', str(round(1 - edi / total_len, 3)))
        print('NIST', str(round(nist, 6)))
        print()
def show_nist(reference_file, hypotesis_file):
    with open(reference_file, 'r') as tf_ref, open(hypotesis_file, 'r') as tf_hyp:
        lines_ref = tf_ref.read().splitlines()
        lines_hyp = tf_hyp.read().splitlines()
        # lines_ref = lines_ref[0:500]
        # lines_hyp = lines_hyp[0:500]

    len_ref = len(lines_ref)
    len_hyp = len(lines_hyp)
    if len_ref != len_hyp:
        print("Different number of lines in files: {0} (reference), {1} (hypothesis)"
              .format(len_ref, len_hyp))
        return

    # Turn each line into a list of tokens.
    strings_ref = []
    for i in range(0, len(lines_ref)):
        strings_ref.append([lines_ref[i].split()])  # nested list: one reference per hypothesis

    strings_hyp = []
    for i in range(0, len(lines_hyp)):
        strings_hyp.append(lines_hyp[i].split())

    score = nist_score.corpus_nist(strings_ref, strings_hyp)
    print("** NIST score (corpus): " + str(score))
def compute_metric_scores(refs, hyps):
    assert type(refs) == type(hyps) == list
    chencherry = SmoothingFunction()
    bl = corpus_bleu(refs, hyps, smoothing_function=chencherry.method2)
    ni = corpus_nist(refs, hyps, n=4)
    ed = compute_edit_distance(refs, hyps)
    return Score(bleu=bl, nist=ni, edist=ed)
def nist_bleu(refs: List[List[Sentence]], hyps: List[Sentence], n=4):
    assert len(refs) == len(hyps), f'refs:{len(refs)} == hyps:{len(hyps)} ?'
    assert len(refs) > 0
    assert n > 0
    # Hypotheses: a list of token lists.
    assert isinstance(hyps, list)
    assert isinstance(hyps[0], list)
    assert isinstance(hyps[0][0], str)
    # References: a list of lists of token lists (one or more references per hypothesis).
    assert isinstance(refs, list)
    assert isinstance(refs[0], list)
    assert isinstance(refs[0][0], list)
    assert isinstance(refs[0][0][0], str)
    return nist_score.corpus_nist(refs, hyps, n=n)
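A minimal usage sketch for the nist_bleu helper above. The data is invented purely to illustrate the nesting its asserts expect (one hypothesis token list, paired with a list of reference token lists):

# Hypothetical example data: one hypothesis, two references.
hyps = [["the", "cat", "sat", "on", "the", "mat"]]
refs = [[["the", "cat", "is", "on", "the", "mat"],
         ["there", "is", "a", "cat", "on", "the", "mat"]]]
print(nist_bleu(refs, hyps, n=2))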
def calculate_nist_score(self):
    '''This is the main method to calculate the NIST score.'''
    hypo, ref, has_one_sentence = self._p.split_references_hypothesis()
    if ref is None:
        return 0
    elif has_one_sentence:
        # One or more references with a single sentence:
        nist_score = sentence_nist(ref, hypo)  # calculate the sentence-level NIST score
        return nist_score
    else:
        # One or more references with more than one sentence:
        nist_score = corpus_nist(ref, hypo)  # calculate the corpus-level NIST score
        return nist_score
def compute_scores(predictions: List[List[str]], references: List[List[str]]):
    scores = dict()
    # BLEU
    scores['bleu'] = nt.bleu_score.corpus_bleu(
        [references], predictions,
        smoothing_function=nt.bleu_score.SmoothingFunction().method4)
    # METEOR
    # scores['meteor'] = 0.
    # for i in range(len(predictions)):
    #     scores['meteor'] += nt.meteor_score.single_meteor_score(' '.join(references[i]), ' '.join(predictions[i]))
    # scores['meteor'] /= len(predictions)
    # NIST
    scores['nist'] = nist_score.corpus_nist([references], predictions)
    # RIBES
    scores['ribes'] = nt.ribes_score.corpus_ribes([references], predictions)
    return scores
def main():
    '''
    arguments = sys.argv[1:]
    num_args = len(arguments)
    if num_args != 2:
        print('Wrong number of arguments.')
        print(sys.argv[0], 'system-dir', 'reference-dir')
        exit()
    system_path = arguments[0]
    ref_path = arguments[1]
    '''
    output_path = sys.argv[1]

    # Read files.
    ref, hyp = read(output_path)

    # NIST score
    nist = ns.corpus_nist(ref, hyp, n=4)

    # BLEU score
    chencherry = bs.SmoothingFunction()
    bleu = bs.corpus_bleu(ref, hyp, smoothing_function=chencherry.method2)
    print('BLEU', round(bleu, 3))

    total_len = 0.0
    edi = 0.0
    for r, h in zip(ref, hyp):
        try:
            total_len += max(len(r[0][0]), len(h[0]))
            edi += edit_distance(r[0][0], h[0])
        except:
            # print('r', r[0])
            # print('h', h)
            print("ERROR")
            pass
    print('DIST', round(1 - edi / total_len, 3))
    print('NIST', round(nist, 6))
    print('')
if not args.mbleu and not args.mnist and not args.mwer and not args.ed and not args.mter:  # and not args.mpyter:
    doBLEU = True
    doNIST = True
    doWER = True
    doED = True
    doTER = True

if doBLEU:
    try:
        BLEU = corpus_bleu(references_tok, hypothesis_tok)
        print("BLEU: ", round(BLEU, rbleu))
    except:
        print("ERROR: unable to calculate BLEU.")

if doNIST:
    try:
        NIST = corpus_nist(references_tok, hypothesis_tok)
        print("NIST: ", round(NIST, rnist))
    except:
        print("ERROR: unable to calculate NIST.")

if doWER:
    try:
        WER = wer_corpus(references_tok, hypothesis_tok)
        print("WER: ", round(WER, rwer))
    except:
        print("ERROR: unable to calculate WER.")

if doED:
    try:
        edtotal = 0
        chartotal = 0
        for i in range(0, len(hypothesis)):
# -*- coding: utf-8 -*-
"""NIST.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rtoEa5dozl1tvJPqVslUk-jxk_03cdET
"""

import os
os.system('pip install nltk')

from nltk.translate.nist_score import corpus_nist

ref = []
refrences = input('Enter the reference sentences: ').split('.')
for r in refrences[:-1]:
    ref.append(r.split(' '))

hyp = input('Enter the hypothesis sentence: ').split(' ')
order = int(input('Enter the order of n (n-gram): '))

score = corpus_nist([ref], [hyp], n=order)
print(score)
ref_files = [
    f for f in listdir(reference_directory)
    if isfile(join(reference_directory, f))
]
for file in ref_files:
    with open(reference_directory + file, 'r', encoding="utf8") as f:
        fileTokens = []
        for line in f:
            text = line.translate(str.maketrans('', '', string.punctuation)).lower()
            token = text.split()
            fileTokens.extend(token)
        references.append([fileTokens])

for directory in data_directories:
    candidates = []
    only_files = [f for f in listdir(directory) if isfile(join(directory, f))]
    for file in only_files:
        fileTokens = []
        with open(directory + file, 'r', encoding="utf8") as f:
            for line in f:
                text = line.translate(str.maketrans('', '', string.punctuation)).lower()
                token = text.split()
                fileTokens.extend(token)
        candidates.append(fileTokens)
    bleu_score = corpus_bleu(references, candidates)
    nist_score = corpus_nist(references, candidates)
    print(directory)
    print("bleu {}".format(bleu_score))
    print("nist {}".format(nist_score))
import sys

from nltk.translate.nist_score import corpus_nist

# Script arguments
REFERENCE_PATH = sys.argv[1]
HYPOTHESIS_PATH = sys.argv[2]

ref_sents = []
hyp_sents = []
with open(REFERENCE_PATH) as ref, open(HYPOTHESIS_PATH) as hyp:
    for line_ref, line_hyp in zip(ref, hyp):
        # corpus_nist expects tokenized sentences, with each hypothesis
        # paired with a list of tokenized references.
        ref_sents.append([line_ref.strip().split()])
        hyp_sents.append(line_hyp.strip().split())

nist = corpus_nist(ref_sents, hyp_sents)
print("NIST: %6.2f" % nist)
def eval_nist(answers, result, n=5):
    # Wrap each reference so corpus_nist receives one list of references per hypothesis.
    answers_ = [[answer] for answer in answers]
    scores = corpus_nist(answers_, result, n)
    return scores
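A hedged example of calling eval_nist, assuming answers and result are already whitespace-tokenized and aligned one-to-one (the data below is invented):

# Invented example data: two gold answers and two system outputs.
answers = [["the", "cat", "sat", "on", "the", "mat"],
           ["hello", "world"]]
result = [["the", "cat", "sat", "on", "a", "mat"],
          ["hello", "there", "world"]]
print(eval_nist(answers, result, n=2))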
def calculate():
    results_frame_text.delete('1.0', END)
    rfilename = F_frame_E_Ref.get()
    hfilename = F_frame_E_Hyp.get()
    rfile = codecs.open(rfilename, "r", encoding="utf-8")
    hfile = codecs.open(hfilename, "r", encoding="utf-8")
    sys.path.append(os.getcwd())
    selectedtokenizer = combo_tokenizersF.get()
    if not selectedtokenizer.endswith(".py"):
        selectedtokenizer = selectedtokenizer + ".py"
    spec = importlib.util.spec_from_file_location('', selectedtokenizer)
    tokenizermod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(tokenizermod)
    tokenizer = tokenizermod.Tokenizer()

    references = []
    references_tok = []
    contref = 0
    for linia in rfile:
        linia = linia.rstrip()
        # There may be more than one reference, separated by tabs.
        rd = []
        rd_tok = []
        for segment in linia.split("\t"):
            tokens = tokenizer.tokenize(segment).split(" ")
            rd.append(segment)
            rd_tok.append(tokens)
        references.append(rd)
        references_tok.append(rd_tok)
        contref += 1

    hypothesis = []
    hypothesis_tok = []
    conthyp = 0
    for linia in hfile:
        linia = linia.rstrip()
        tokens = tokenizer.tokenize(linia).split(" ")
        hypothesis.append(linia)
        hypothesis_tok.append(tokens)
        conthyp += 1

    if not contref == conthyp:
        messagebox.showerror(
            "Error",
            "Reference and hypothesis files should have the same number of lines."
        )

    if doBLEU:
        try:
            BLEU = corpus_bleu(references_tok, hypothesis_tok)
            cadena = "BLEU: " + str(round(BLEU, rbleu))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate BLEU.", sys.exc_info())
            cadena = "BLEU: Unable to calculate BLEU"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doNIST:
        try:
            NIST = corpus_nist(references_tok, hypothesis_tok)
            cadena = "NIST: " + str(round(NIST, rnist))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate NIST.", sys.exc_info())
            cadena = "NIST: Unable to calculate NIST"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doWER:
        try:
            WER = wer_corpus(references_tok, hypothesis_tok)
            cadena = "WER: " + str(round(WER, rwer))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate WER.", sys.exc_info())
            cadena = "WER: Unable to calculate WER"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doED:
        try:
            edtotal = 0
            chartotal = 0
            for i in range(0, len(hypothesis)):
                editmin = 100000000
                chartotal += len(hypothesis[i])
                for h in references[i]:
                    ed = edit_distance(hypothesis[i], h)
                    if ed < editmin:
                        editmin = ed
                edtotal += editmin
            EditDistance = 100 * (edtotal / chartotal)
            cadena = "%EdDist: " + str(round(EditDistance, reddist))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate Ed", sys.exc_info())
            cadena = "%EdDist: Unable to calculate Ed"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doTER:
        try:
            TERcorpus = ter_corpus(references_tok, hypothesis_tok)
            cadena = "TER: " + str(round(TERcorpus, rter))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate TER", sys.exc_info())
            cadena = "TER: Unable to calculate TER"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")

    print("-------------------------------------------")
    rfile.close()
    hfile.close()

    if 'selected' in F_frame_detailed.state():
        sourcesegments = []
        try:
            sourcef = codecs.open(F_frame_E_Source.get(), "r", encoding="utf-8")
            for linia in sourcef:
                linia = linia.rstrip()
                sourcesegments.append(linia)
        except:
            for i in range(0, contref):
                sourcesegments.append("")
        excelfile = F_frame_E_Detailed.get()
        textfile = F_frame_E_Detailed.get()
        if not excelfile.endswith(".xlsx"):
            excelfile = excelfile.replace(".txt", "") + ".xlsx"
        if not textfile.endswith(".txt"):
            textfile = textfile.replace(".xlsx", "") + ".txt"
        workbook = xlsxwriter.Workbook(excelfile)
        sheetAll = workbook.add_worksheet("All")
        sheetDetailed = workbook.add_worksheet("Detailed")
        sheetDetailed.set_column(1, 4, 30)
        bold = workbook.add_format({'bold': True})
        red = workbook.add_format({'color': 'red'})
        red.set_font_strikeout()
        blue = workbook.add_format({'color': 'blue'})
        text_wrap = workbook.add_format({'text_wrap': 1, 'valign': 'top'})
        sortida = codecs.open(textfile, "w", encoding="utf-8")

        sheetAll.write(0, 0, "SEGMENTS")
        sheetAll.write(0, 1, contref)
        sheetAll.write(1, 0, "BLEU")
        if doBLEU:
            sheetAll.write(1, 1, round(BLEU, rbleu))
        sheetAll.write(2, 0, "NIST")
        if doNIST:
            sheetAll.write(2, 1, round(NIST, rnist))
        sheetAll.write(3, 0, "WER")
        if doWER:
            sheetAll.write(3, 1, round(WER, rwer))
        sheetAll.write(4, 0, "%EdDist")
        if doED:
            sheetAll.write(4, 1, round(EditDistance, reddist))
        sheetAll.write(5, 0, "TER")
        if doTER:
            sheetAll.write(5, 1, round(TERcorpus, rter))

        cadenasortida = []
        cadenasortida.append("IDENT.")
        cadenasortida.append("Source")
        cadenasortida.append("Reference")
        cadenasortida.append("Hypothesis")
        cadenasortida.append("DIFF.")
        sheetDetailed.write(0, 0, "IDENT.", bold)
        sheetDetailed.write(0, 1, "Source", bold)
        sheetDetailed.write(0, 2, "Reference", bold)
        sheetDetailed.write(0, 3, "Hypothesis", bold)
        sheetDetailed.write(0, 4, "DIFF.", bold)
        column = 5
        if doBLEU:
            sheetDetailed.write(0, column, "BLEU", bold)
            cadenasortida.append("BLEU")
            columnBLEU = column
            column += 1
        if doNIST:
            sheetDetailed.write(0, column, "NIST", bold)
            cadenasortida.append("NIST")
            columnNIST = column
            column += 1
        if doWER:
            sheetDetailed.write(0, column, "WER", bold)
            cadenasortida.append("WER")
            columnWER = column
            column += 1
        if doTER:
            sheetDetailed.write(0, column, "TER", bold)
            cadenasortida.append("TER")
            columnTER = column
            column += 1
        if doED:
            sheetDetailed.write(0, column, "EditDistance", bold)
            cadenasortida.append("EditDistance")
            columnED = column
            column += 1
        sortida.write("\t".join(cadenasortida) + "\n")

        for i in range(0, len(hypothesis)):
            cadenasortida = []
            sheetDetailed.write(i + 1, 0, i + 1)
            cadenasortida.append(str(i + 1))
            sheetDetailed.write(i + 1, 1, sourcesegments[i], text_wrap)
            cadenasortida.append(sourcesegments[i].replace("\t", " "))
            sheetDetailed.write(i + 1, 3, hypothesis[i], text_wrap)
            rtok = [references_tok[i]]
            htok = [hypothesis_tok[i]]
            selectedreference = references[i][0]
            # NOTE: if there is more than one reference, the first one is used in the Excel file.
            sheetDetailed.write(i + 1, 2, selectedreference, text_wrap)
            cadenasortida.append(selectedreference.replace("\t", " "))
            cadenasortida.append(hypothesis[i].replace("\t", " "))
            dE = differencesExcel(selectedreference, hypothesis[i], red, blue, bold)
            dEtext = differences(selectedreference.replace("\t", " "),
                                 hypothesis[i].replace("\t", " "))
            cadenasortida.append(dEtext)
            sheetDetailed.write_rich_string(i + 1, 4, *dE, text_wrap)
            if doBLEU:
                try:
                    BLEU = corpus_bleu(rtok, htok)
                    sheetDetailed.write(i + 1, columnBLEU, round(BLEU, rbleu))
                    cadenasortida.append(str(round(BLEU, rbleu)))
                except:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed BLEU.")
            if doNIST:
                try:
                    NIST = corpus_nist(rtok, htok)
                    sheetDetailed.write(i + 1, columnNIST, round(NIST, rnist))
                    cadenasortida.append(str(round(NIST, rnist)))
                except:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed NIST.")
            if doWER:
                try:
                    WER = wer_corpus(rtok, htok)
                    sheetDetailed.write(i + 1, columnWER, round(WER, rwer))
                    cadenasortida.append(str(round(WER, rwer)))
                except:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed WER.")
            if doED:
                try:
                    edtotal = 0
                    chartotal = 0
                    for i2 in range(0, len(htok)):
                        editmin = 100000000
                        chartotal += len(htok[i2])
                        for h in rtok[i2]:
                            ed = edit_distance(htok[i2], h)
                            if ed < editmin:
                                editmin = ed
                        edtotal += editmin
                    EditDistance = 100 * (edtotal / chartotal)
                    sheetDetailed.write(i + 1, columnED, round(EditDistance, reddist))
                    cadenasortida.append(str(round(EditDistance, reddist)))
                except:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed Ed")
            if doTER:
                try:
                    TERcorpus = ter_corpus(rtok, htok)
                    sheetDetailed.write(i + 1, columnTER, round(TERcorpus, rter))
                    cadenasortida.append(str(round(TERcorpus, rter)))
                except:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed TER", sys.exc_info())
            sortida.write("\t".join(cadenasortida) + "\n")
        workbook.close()
def evaluation_metrics(dataset, steps, size):
    references = []
    hypotheses = []
    rouge = Rouge()
    rouge_dict = {
        "rouge-1": {"f": 0.0, "p": 0.0, "r": 0.0},
        "rouge-2": {"f": 0.0, "p": 0.0, "r": 0.0},
        "rouge-l": {"f": 0.0, "p": 0.0, "r": 0.0}
    }

    # Build the references & hypotheses lists.
    for inputs, targets in dataset.take(steps):
        for labels in target_tokenizer.sequences_to_texts(test_step(inputs, targets)):
            if len(labels) > 0:
                hypotheses.append(labels.split())
            else:
                hypotheses.append([""])
        for labels in input_tokenizer.sequences_to_texts(inputs.numpy()):
            references.append(word_split(labels))

    for index, hypothesis in enumerate(hypotheses):
        max_score = {
            "rouge-1": {"f": 0.0, "p": 0.0, "r": 0.0},
            "rouge-2": {"f": 0.0, "p": 0.0, "r": 0.0},
            "rouge-l": {"f": 0.0, "p": 0.0, "r": 0.0}
        }
        # One hypothesis may have several references.
        for reference in references[index]:
            try:
                rouge_score = rouge.get_scores(" ".join(hypothesis), " ".join(reference))[0]
                # Keep the best score.
                if rouge_sum_score(rouge_score) > rouge_sum_score(max_score):
                    max_score = rouge_score
            except ValueError:
                pass
        for method_key in rouge_dict:
            # fpr traverses f1, precision and recall.
            for fpr in rouge_dict[method_key]:
                rouge_dict[method_key][fpr] += max_score[method_key][fpr]

    # Average over the corpus.
    for method_key in rouge_dict:
        for fpr in rouge_dict[method_key]:
            rouge_dict[method_key][fpr] /= size

    bleu = bleu_score.corpus_bleu(references, hypotheses, weights=(1, ))
    gleu = gleu_score.corpus_gleu(references, hypotheses, max_len=1)
    nist = nist_score.corpus_nist(references, hypotheses, n=1)

    print("BLEU-1 Score: %.4f" % bleu)
    print("GLEU-1 Score: %.4f" % gleu)
    print("NIST-1 Score: %.4f" % nist)
    print("ROUGE Scores: %s" % rouge_dict_format(rouge_dict))
    return bleu, gleu, nist, rouge_dict
def validate(args, val_loader, model):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    val_logger = LogCollector()

    # Switch to evaluate mode.
    model.val_start()
    model.logger = val_logger

    end = time.time()
    max_length = 50
    for i, val_data in enumerate(val_loader):
        decoder_outputs, sampled_idxs, mean, logvar, z = model.forward_emb(*val_data)
        if torch.cuda.is_available():
            val_data[1] = val_data[1].cuda()
        batch_size = val_data[1].size(0)
        max_length = 50
        flattened_outputs = decoder_outputs.view(batch_size * max_length, -1)
        batch_outputs = trim_seqs(sampled_idxs)
        np_targets = trim_seqs(val_data[1].unsqueeze(-1))
        batch_targets = [[seq] for seq in np_targets]

        corpus_bleu_score = corpus_bleu(batch_targets, batch_outputs,
                                        smoothing_function=SmoothingFunction().method1)
        model.logger.update('C-BLEU', corpus_bleu_score, batch_size)

        corpus_nist_score = corpus_nist(batch_targets, batch_outputs, n=4)
        model.logger.update('C-NIST', corpus_nist_score, batch_size)

        corpus_meteor_score = 0
        rouge_scores = 0
        for j in range(batch_size):
            reference = []
            for tid in range(len(batch_targets[j][0])):
                tok = batch_targets[j][0][tid]
                reference.append(vocab_inv[str(tok)])
            ref = [str(' '.join(reference))]
            hypothesis = []
            for tid in range(len(batch_outputs[j])):
                tok = batch_outputs[j][tid]
                hypothesis.append(vocab_inv[str(tok)])
            hypo = str(' '.join(hypothesis))
            corpus_meteor_score += meteor_score(ref, hypo)
            rouge_scores += rouge.score(ref[0], hypo)['rougeL'][2]
        rouge_scores = rouge_scores / batch_size
        model.logger.update('ROUGH-L', rouge_scores, batch_size)
        corpus_meteor_score = corpus_meteor_score / batch_size
        model.logger.update('C-METEOR', corpus_meteor_score, batch_size)

        batch_time.update(time.time() - end)
        end = time.time()

        # Print log info.
        model.Eiters += 1
        if model.Eiters % args.logging_step == 0:
            print('Test: [{0}/{1}]\t'
                  '{e_log}\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  .format(i, len(val_loader), batch_time=batch_time,
                          e_log=str(model.logger)))
    return 0
def calculate():
    results_frame_text.delete('1.0', END)
    doBLEU = True
    doNIST = True
    doWER = True
    doED = True
    doTER = True
    rfilename = F_frame_E_Ref.get()
    hfilename = F_frame_E_Hyp.get()
    rfile = codecs.open(rfilename, "r", encoding="utf-8")
    hfile = codecs.open(hfilename, "r", encoding="utf-8")
    sys.path.append(os.getcwd())
    selectedtokenizer = combo_tokenizersF.get()
    tokenizer = importlib.import_module(selectedtokenizer)

    references = []
    references_tok = []
    for linia in rfile:
        linia = linia.rstrip()
        # There may be more than one reference, separated by tabs.
        rd = []
        rd_tok = []
        for segment in linia.split("\t"):
            tokens = tokenizer.tokenize(segment).split(" ")
            rd.append(segment)
            rd_tok.append(tokens)
        references.append(rd)
        references_tok.append(rd_tok)

    hypothesis = []
    hypothesis_tok = []
    for linia in hfile:
        linia = linia.rstrip()
        tokens = tokenizer.tokenize(linia).split(" ")
        hypothesis.append(linia)
        hypothesis_tok.append(tokens)

    if doBLEU:
        try:
            BLEU = corpus_bleu(references_tok, hypothesis_tok)
            cadena = "BLEU: " + str(round(BLEU, rbleu))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate BLEU.", sys.exc_info())
            cadena = "BLEU: Unable to calculate BLEU"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doNIST:
        try:
            NIST = corpus_nist(references_tok, hypothesis_tok)
            cadena = "NIST: " + str(round(NIST, rnist))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate NIST.", sys.exc_info())
            cadena = "NIST: Unable to calculate NIST"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doWER:
        try:
            WER = wer_corpus(references_tok, hypothesis_tok)
            cadena = "WER: " + str(round(WER, rwer))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate WER.", sys.exc_info())
            cadena = "WER: Unable to calculate WER"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doED:
        try:
            edtotal = 0
            chartotal = 0
            for i in range(0, len(hypothesis)):
                editmin = 100000000
                chartotal += len(hypothesis[i])
                for h in references[i]:
                    ed = edit_distance(hypothesis[i], h)
                    if ed < editmin:
                        editmin = ed
                edtotal += editmin
            EditDistance = 100 * (edtotal / chartotal)
            cadena = "%EdDist: " + str(round(EditDistance, red))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate Ed", sys.exc_info())
            cadena = "%EdDist: Unable to calculate Ed"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doTER:
        try:
            TERcorpus = ter_corpus(references_tok, hypothesis_tok)
            cadena = "TER: " + str(round(TERcorpus, rter))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except:
            print("ERROR: unable to calculate TER", sys.exc_info())
            cadena = "TER: Unable to calculate TER"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
# Read files
test_query_file = open('data/test-queries.txt', 'r')
test_queries = test_query_file.readlines()
pred_query_file = open('data/pred-queries.txt', 'r')
pred_queries = pred_query_file.readlines()

# Clean them up and get the structure right.
test_queries = [s[:-1].split(" ") for s in test_queries]
pred_queries = [s[:-1].split(" ") for s in pred_queries]

candidates = pred_queries
reference_list = []
for sentence in test_queries:
    reference_list.append([sentence])

# Calculate metrics
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.nist_score import corpus_nist

bleu_score = round(corpus_bleu(reference_list, candidates), 3)
nist_score = round(corpus_nist(reference_list, candidates), 3)
print(f"BLEU:\t{bleu_score}\nNIST:\t{nist_score}")

# OUTPUT
# BLEU: 0.758
# NIST: 5.627
def corpus_NIST(references, hypotheses):
    '''Same philosophy as with corpus BLEU.'''
    return nist_score.corpus_nist(references, hypotheses)
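For illustration, a hedged call to the corpus_NIST wrapper above; as with corpus BLEU, each hypothesis is a token list and each entry of references is a list of tokenized reference translations. The data is made up, and kept at least five tokens long because corpus_nist defaults to n=5:

references = [[["this", "is", "a", "small", "test", "sentence"]],
              [["another", "reference", "sentence", "appears", "right", "here"]]]
hypotheses = [["this", "is", "a", "small", "test", "example"],
              ["another", "reference", "sentence", "is", "right", "here"]]
print(corpus_NIST(references, hypotheses))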
import sys

from nltk.tokenize import word_tokenize
from nltk.translate.nist_score import corpus_nist

target_file = sys.argv[1]
pred_file = sys.argv[2]

list_of_references = []
with open(target_file) as f:
    for line in f:
        list_of_references.append([word_tokenize(line)])

hypotheses = []
with open(pred_file) as f:
    for line in f:
        hypotheses.append(word_tokenize(line))

for i in range(5):
    print('NIST-' + str(i + 1) + ':', corpus_nist(list_of_references, hypotheses, i + 1))