Example #1
    def test_sentence_nist(self):
        ref_file = find("models/wmt15_eval/ref.ru")
        hyp_file = find("models/wmt15_eval/google.ru")
        mteval_output_file = find("models/wmt15_eval/mteval-13a.output")

        # Read the NIST scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the n-grams.
        with open(mteval_output_file) as mteval_fin:
            # The scores are on the fourth line from the end of the file.
            # The first and last items in the list are the score and system
            # names, so they are sliced off.
            mteval_nist_scores = map(float,
                                     mteval_fin.readlines()[-4].split()[1:-1])

        with open(ref_file, encoding="utf8") as ref_fin:
            with open(hyp_file, encoding="utf8") as hyp_fin:
                # Whitespace-tokenize the files.
                # Note: split() strips whitespace automatically.
                hypotheses = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_nist input is a list of lists of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
                    nltk_nist = corpus_nist(references, hypotheses, i)
                    # Check that the NIST score difference is less than 0.05.
                    assert abs(mteval_nist - nltk_nist) < 0.05
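
For orientation, here is a minimal, self-contained sketch (not part of the test above) of the input nesting corpus_nist expects: each hypothesis is a list of tokens, and it is paired with a list of tokenized references.

from nltk.translate.nist_score import corpus_nist

# One tokenized hypothesis, paired with two tokenized references.
hypotheses = ["the cat sat on the mat".split()]
references = [["the cat is on the mat".split(),
               "there is a cat on the mat".split()]]
print(corpus_nist(references, hypotheses, n=2))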
Example #2
import os

from nltk.translate import nist_score


def get_nist(reference_file, hypothesis_file):
    if not os.path.exists(reference_file):
        return 0

    if not os.path.exists(hypothesis_file):
        return 0

    with open(reference_file, 'r') as tf_ref, open(hypothesis_file,
                                                   'r') as tf_hyp:
        lines_ref = tf_ref.read().splitlines()
        lines_hyp = tf_hyp.read().splitlines()

        len_ref = len(lines_ref)
        len_hyp = len(lines_hyp)
        if len_ref != len_hyp:
            print("Different number of lines in files: "
                  "{0} (reference), {1} (hypothesis)".format(len_ref, len_hyp))
            return 0

        # Tokenize on whitespace; wrap each reference in a list, since
        # corpus_nist expects a list of references per hypothesis.
        strings_ref = [[line.split()] for line in lines_ref]
        strings_hyp = [line.split() for line in lines_hyp]

        score = nist_score.corpus_nist(strings_ref, strings_hyp)
        return score
Example #3
def compute_scores(pred_fname, ref_fname):
    # `read_corpus` and the `Scores` namedtuple are defined elsewhere in the
    # source file; `ns`, `bs`, and `edit_distance` presumably come from NLTK:
    #   from nltk.translate import bleu_score as bs, nist_score as ns
    #   from nltk.metrics import edit_distance

    # read files
    hyps = read_corpus(pred_fname, ref=False)
    refs = read_corpus(ref_fname, ref=True)

    # NIST score
    nist = ns.corpus_nist(refs, hyps, n=4)

    # BLEU score
    chencherry = bs.SmoothingFunction()
    bleu = bs.corpus_bleu(refs, hyps, smoothing_function=chencherry.method2)

    # ED
    total_len = 0.0
    edi = 0.0
    for r, h in zip(refs, hyps):
        total_len += max(len(r[0]), len(h))
        edi += edit_distance(
            r[0],
            h)  # TODO: strange -- inputs should be strings, not charlists!

    bleu_score = 100.0 * round(bleu, 4)
    edist_score = 100.0 * round(1 - edi / total_len, 4)
    nist_score = round(nist, 2)

    return Scores(bleu_score, edist_score, nist_score)
Example #4
def main():
    arguments = sys.argv[1:]
    num_args = len(arguments)
    if num_args != 2:
        print('Wrong number of arguments.')
        print(sys.argv[0], 'system-dir', 'reference-dir')
        exit()
    system_path = arguments[0]
    ref_path = arguments[1]

    # For all files in system path.
    for filename in os.listdir(system_path):
        print('Filename', str(filename))
        system_filename = os.path.join(system_path, filename)
        ref_filename = os.path.join(ref_path, filename)

        # read files
        ref = read_corpus(ref_filename, ref=True)
        hyp = read_corpus(system_filename, ref=False)

        # NIST score
        nist = ns.corpus_nist(ref, hyp, n=4)

        # BLEU score
        chencherry = bs.SmoothingFunction()
        bleu = bs.corpus_bleu(ref, hyp, smoothing_function=chencherry.method2)
        print('BLEU', str(round(bleu, 3)))
        total_len = 0.0
        edi = 0.0
        for r, h in zip(ref, hyp):
            total_len += max(len(r[0]), len(h))
            edi += edit_distance(r[0], h)
        print('DIST', str(round(1 - edi / total_len, 3)))
        print('NIST', str(round(nist, 6)))
        print()
Example #5
def show_nist(reference_file, hypothesis_file):
    with open(reference_file, 'r') as tf_ref, open(hypothesis_file,
                                                   'r') as tf_hyp:
        lines_ref = tf_ref.read().splitlines()
        lines_hyp = tf_hyp.read().splitlines()

        len_ref = len(lines_ref)
        len_hyp = len(lines_hyp)
        if len_ref != len_hyp:
            print("Different number of lines in files: "
                  "{0} (reference), {1} (hypothesis)".format(len_ref, len_hyp))
            return

        # Tokenize on whitespace; wrap each reference in a list, since
        # corpus_nist expects a list of references per hypothesis.
        strings_ref = [[line.split()] for line in lines_ref]
        strings_hyp = [line.split() for line in lines_hyp]

        nist = nist_score.corpus_nist(strings_ref, strings_hyp)
        print("** NIST score (corpus): " + str(nist))
Example #6
    def compute_metric_scores(refs, hyps):
        assert isinstance(refs, list) and isinstance(hyps, list)
        chencherry = SmoothingFunction()

        bl = corpus_bleu(refs, hyps, smoothing_function=chencherry.method2)
        ni = corpus_nist(refs, hyps, n=4)
        ed = compute_edit_distance(refs, hyps)

        return Score(bleu=bl, nist=ni, edist=ed)
Example #7
def nist_bleu(refs: List[List[Sentence]], hyps: List[Sentence], n=4):
    assert len(refs) == len(hyps), f'refs:{len(refs)} == hyps:{len(hyps)} ? '
    assert len(refs) > 0
    assert n > 0
    assert isinstance(hyps, list)
    assert isinstance(hyps[0], list)
    assert isinstance(hyps[0][0], str)

    assert isinstance(refs, list)
    assert isinstance(refs[0], list)
    assert isinstance(refs[0][0], list)
    assert isinstance(refs[0][0][0], str)

    return nist_score.corpus_nist(refs, hyps, n=n)
Example #8
    def calculate_nist_score(self):
        '''This is the main method to calculate the NIST score.'''
        hypo, ref, has_one_sentence = self._p.split_references_hypothesis()
        if ref is None:
            return 0
        elif has_one_sentence:
            # One or more references with one sentence:
            # calculate the sentence-level NIST score.
            return sentence_nist(ref, hypo)
        else:
            # One or more references with more than one sentence:
            # calculate the corpus-level NIST score.
            return corpus_nist(ref, hypo)
Example #9
def compute_scores(predictions: List[List[str]],
                   references: List[List[str]]):
    # `nt` is presumably `nltk.translate`, and `nist_score` is
    # `nltk.translate.nist_score`; `List` comes from typing.
    scores = dict()

    # Corpus-level metrics pair predictions[i] with a *list* of references,
    # so wrap each reference in a singleton list.
    list_of_references = [[ref] for ref in references]

    # BLEU
    scores['bleu'] = nt.bleu_score.corpus_bleu(
        list_of_references, predictions,
        smoothing_function=nt.bleu_score.SmoothingFunction().method4)

    # METEOR
    # scores['meteor'] = 0.
    # for i in range(len(predictions)):
    #     scores['meteor'] += nt.meteor_score.single_meteor_score(' '.join(references[i]), ' '.join(predictions[i]))
    # scores['meteor'] /= len(predictions)

    # NIST
    scores['nist'] = nist_score.corpus_nist(list_of_references, predictions)

    # RIBES
    scores['ribes'] = nt.ribes_score.corpus_ribes(list_of_references, predictions)

    return scores
Example #10
def main():
    output_path = sys.argv[1]

    # read files
    ref, hyp = read(output_path)

    # NIST score
    nist = ns.corpus_nist(ref, hyp, n=4)

    # BLEU score
    chencherry = bs.SmoothingFunction()
    bleu = bs.corpus_bleu(ref, hyp, smoothing_function=chencherry.method2)
    print('BLEU', round(bleu, 3))
    total_len = 0.0
    edi = 0.0
    for r, h in zip(ref, hyp):
        try:
            total_len += max(len(r[0][0]), len(h[0]))
            edi += edit_distance(r[0][0], h[0])
        except Exception:
            print("ERROR")
    print('DIST', round(1 - edi / total_len, 3))
    print('NIST', round(nist, 6))
    print('')
Example #11
    def test_sentence_nist(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Read the NIST scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the n-grams.
        with open(mteval_output_file, 'r') as mteval_fin:
            # The scores are on the fourth line from the end of the file.
            # The first and last items in the list are the score and system
            # names, so they are sliced off.
            mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1])

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace-tokenize the files.
                # Note: split() strips whitespace automatically.
                hypotheses = list(map(lambda x: x.split(), hyp_fin))
                # Note that the corpus_nist input is a list of lists of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_nist in zip(range(1, 10), mteval_nist_scores):
                    nltk_nist = corpus_nist(references, hypotheses, i)
                    # Check that the NIST score difference is less than 0.05.
                    assert abs(mteval_nist - nltk_nist) < 0.05
Example #12
    if not args.mbleu and not args.mnist and not args.mwer and not args.ed and not args.mter:  # and not args.mpyter:
        doBLEU = True
        doNIST = True
        doWER = True
        doED = True
        doTER = True

    if doBLEU:
        try:
            BLEU = corpus_bleu(references_tok, hypothesis_tok)
            print("BLEU:     ", round(BLEU, rbleu))
        except Exception:
            print("ERROR: unable to calculate BLEU.")
    if doNIST:
        try:
            NIST = corpus_nist(references_tok, hypothesis_tok)
            print("NIST:     ", round(NIST, rnist))
        except Exception:
            print("ERROR: unable to calculate NIST.")
    if doWER:
        try:
            WER = wer_corpus(references_tok, hypothesis_tok)
            print("WER:      ", round(WER, rwer))
        except Exception:
            print("ERROR: unable to calculate WER.")

    if doED:
        try:
            edtotal = 0
            chartotal = 0
            for i in range(0, len(hypothesis)):
Example #13
# -*- coding: utf-8 -*-
"""NIST.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rtoEa5dozl1tvJPqVslUk-jxk_03cdET
"""
import os
os.system('pip install nltk')
from nltk.translate.nist_score import corpus_nist
ref = []
references = input('Enter the reference sentences: ').split('.')
for r in references[:-1]:
    ref.append(r.split(' '))
hyp = input('Enter the hypothesis sentence: ').split(' ')
order = int(input('Enter the order of n (n-gram): '))
score = corpus_nist([ref], [hyp], n=order)
print(score)
Example #14
ref_files = [
    f for f in listdir(reference_directory)
    if isfile(join(reference_directory, f))
]
for file in ref_files:
    with open(join(reference_directory, file), 'r', encoding="utf8") as f:
        fileTokens = []
        for line in f:
            text = line.translate(str.maketrans('', '',
                                                string.punctuation)).lower()
            token = text.split()
            fileTokens.extend(token)
        references.append([fileTokens])

for directory in data_directories:
    candidates = []
    only_files = [f for f in listdir(directory) if isfile(join(directory, f))]
    for file in only_files:
        fileTokens = []
        with open(join(directory, file), 'r', encoding="utf8") as f:
            for line in f:
                text = line.translate(str.maketrans(
                    '', '', string.punctuation)).lower()
                token = text.split()
                fileTokens.extend(token)
            candidates.append(fileTokens)
    bleu_score = corpus_bleu(references, candidates)
    nist_score = corpus_nist(references, candidates)
    print(directory)
    print("bleu {}".format(bleu_score))
    print("mist {}".format(nist_score))
Example #15
import sys
from nltk.translate.nist_score import corpus_nist

#Script arguments
REFERENCE_PATH = sys.argv[1]
HYPOTHESIS_PATH = sys.argv[2]

ref_sents = []
hyp_sents = []
with open(REFERENCE_PATH) as ref, open(HYPOTHESIS_PATH) as hyp:
    for line_ref, line_hyp in zip(ref, hyp):
        # corpus_nist expects token lists, and each hypothesis must be
        # paired with a list of tokenized references.
        ref_sents.append([line_ref.strip().split()])
        hyp_sents.append(line_hyp.strip().split())

nist = corpus_nist(ref_sents, hyp_sents)

# NIST is not a 0-1 score, so print it as-is rather than as a percentage.
print("NIST: %6.2f" % nist)
Example #16
def eval_nist(answers, result, n=5):
    answers_ = [[answer] for answer in answers]
    scores = corpus_nist(answers_, result, n)
    return scores
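A hedged usage sketch for eval_nist above, assuming answers and result are parallel lists of token lists (the first line of the function wraps each reference so that every hypothesis gets its own reference list):

from nltk.translate.nist_score import corpus_nist  # required by eval_nist

answers = ["the cat sat on the mat".split(), "hello world".split()]
result = ["a cat sat on the mat".split(), "hello there world".split()]
print(eval_nist(answers, result, n=2))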
Example #17
def calculate():
    results_frame_text.delete('1.0', END)

    rfilename = F_frame_E_Ref.get()
    hfilename = F_frame_E_Hyp.get()

    rfile = codecs.open(rfilename, "r", encoding="utf-8")
    hfile = codecs.open(hfilename, "r", encoding="utf-8")

    sys.path.append(os.getcwd())
    selectedtokenizer = combo_tokenizersF.get()
    if not selectedtokenizer.endswith(".py"):
        selectedtokenizer = selectedtokenizer + ".py"
    spec = importlib.util.spec_from_file_location('', selectedtokenizer)
    tokenizermod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(tokenizermod)
    tokenizer = tokenizermod.Tokenizer()

    references = []
    references_tok = []
    contref = 0
    for linia in rfile:
        linia = linia.rstrip()
        # There may be more than one reference, separated by tabs.
        rd = []
        rd_tok = []
        for segment in linia.split("\t"):
            tokens = tokenizer.tokenize(segment).split(" ")
            rd.append(segment)
            rd_tok.append(tokens)
        references.append(rd)
        references_tok.append(rd_tok)
        contref += 1

    hypothesis = []
    hypothesis_tok = []

    conthyp = 0
    for linia in hfile:
        linia = linia.rstrip()
        tokens = tokenizer.tokenize(linia).split(" ")
        hypothesis.append(linia)
        hypothesis_tok.append(tokens)
        conthyp += 1

    if not contref == conthyp:
        messagebox.showerror(
            "Error",
            "Reference and hypothesis files should have the same number of lines."
        )

    if doBLEU:
        try:
            BLEU = corpus_bleu(references_tok, hypothesis_tok)
            cadena = "BLEU:    " + str(round(BLEU, rbleu))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate BLEU.", sys.exc_info())
            cadena = "BLEU:  Unable to calculate BLEU"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doNIST:
        try:
            NIST = corpus_nist(references_tok, hypothesis_tok)
            cadena = "NIST:    " + str(round(NIST, rnist))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate NIST.", sys.exc_info())
            cadena = "NIST:  Unable to calculate NIST"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doWER:
        try:
            WER = wer_corpus(references_tok, hypothesis_tok)
            cadena = "WER:     " + str(round(WER, rwer))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate WER.", sys.exc_info())
            cadena = "WER:  Unable to calculate WER"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")

    if doED:
        try:
            edtotal = 0
            chartotal = 0
            for i in range(0, len(hypothesis)):
                editmin = 100000000
                chartotal += len(hypothesis[i])
                for h in references[i]:
                    ed = edit_distance(hypothesis[i], h)
                    if ed < editmin:
                        editmin = ed
                edtotal += editmin

            EditDistance = 100 * (edtotal / chartotal)
            cadena = "%EdDist: " + str(round(EditDistance, reddist))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate Ed", sys.exc_info())
            cadena = "%EdDist: Unable to calculate Ed"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")

    if doTER:
        try:
            TERcorpus = ter_corpus(references_tok, hypothesis_tok)
            cadena = "TER:     " + str(round(TERcorpus, rter))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate TER", sys.exc_info())
            cadena = "TER:  Unable to calculate TER"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    print("-------------------------------------------")
    rfile.close()
    hfile.close()

    if 'selected' in F_frame_detailed.state():

        sourcesegments = []
        try:
            sourcef = codecs.open(F_frame_E_Source.get(),
                                  "r",
                                  encoding="utf-8")
            for linia in sourcef:
                linia = linia.rstrip()
                sourcesegments.append(linia)
        except Exception:
            for i in range(0, contref):
                sourcesegments.append("")

        excelfile = F_frame_E_Detailed.get()
        textfile = F_frame_E_Detailed.get()
        if not excelfile.endswith(".xlsx"):
            excelfile = excelfile.replace(".txt", "") + ".xlsx"
        if not textfile.endswith(".txt"):
            textfile = textfile.replace(".xlsx", "") + ".txt"

        workbook = xlsxwriter.Workbook(excelfile)
        sheetAll = workbook.add_worksheet("All")
        sheetDetailed = workbook.add_worksheet("Detailed")
        sheetDetailed.set_column(1, 4, 30)
        bold = workbook.add_format({'bold': True})
        red = workbook.add_format({'color': 'red'})
        red.set_font_strikeout()
        blue = workbook.add_format({'color': 'blue'})
        text_wrap = workbook.add_format({'text_wrap': 1, 'valign': 'top'})

        sortida = codecs.open(textfile, "w", encoding="utf-8")

        sheetAll.write(0, 0, "SEGMENTS")
        sheetAll.write(0, 1, contref)
        sheetAll.write(1, 0, "BLEU")
        if doBLEU:
            sheetAll.write(1, 1, round(BLEU, rbleu))
        sheetAll.write(2, 0, "NIST")
        if doNIST:
            sheetAll.write(2, 1, round(NIST, rnist))
        sheetAll.write(3, 0, "WER")
        if doWER:
            sheetAll.write(3, 1, round(WER, rwer))
        sheetAll.write(4, 0, "%EdDist")
        if doED:
            sheetAll.write(4, 1, round(EditDistance, reddist))
        sheetAll.write(5, 0, "TER")
        if doTER:
            sheetAll.write(5, 1, round(TERcorpus, rter))

        cadenasortida = []
        cadenasortida.append("IDENT.")
        cadenasortida.append("Source.")
        cadenasortida.append("Reference")
        cadenasortida.append("Hyphotesis")
        cadenasortida.append("DIFF.")

        sheetDetailed.write(0, 0, "IDENT.", bold)
        sheetDetailed.write(0, 1, "Source", bold)
        sheetDetailed.write(0, 2, "Reference", bold)
        sheetDetailed.write(0, 3, "Hypothesis", bold)
        sheetDetailed.write(0, 4, "DIFF.", bold)
        column = 5
        if doBLEU:
            sheetDetailed.write(0, column, "BLEU", bold)
            cadenasortida.append("BLEU")
            columnBLEU = column
            column += 1
        if doNIST:
            sheetDetailed.write(0, column, "NIST", bold)
            cadenasortida.append("NIST")
            columnNIST = column
            column += 1
        if doWER:
            sheetDetailed.write(0, column, "WER", bold)
            cadenasortida.append("WER")
            columnWER = column
            column += 1
        if doTER:
            sheetDetailed.write(0, column, "TER", bold)
            cadenasortida.append("TER")
            columnTER = column
            column += 1
        if doED:
            sheetDetailed.write(0, column, "EditDistance", bold)
            cadenasortida.append("EditDistance")
            columnED = column
            column += 1

        sortida.write("\t".join(cadenasortida) + "\n")
        for i in range(0, len(hypothesis)):
            cadenasortida = []
            sheetDetailed.write(i + 1, 0, i + 1)
            cadenasortida.append(str(i + 1))
            sheetDetailed.write(i + 1, 1, sourcesegments[i], text_wrap)
            cadenasortida.append(sourcesegments[i].replace("\t", " "))
            sheetDetailed.write(i + 1, 3, hypothesis[i], text_wrap)

            rtok = [references_tok[i]]
            htok = [hypothesis_tok[i]]
            selectedreference = references[i][0]
            #NOTE: if more than one reference, the one used in the excel file is the first one

            sheetDetailed.write(i + 1, 2, selectedreference, text_wrap)
            cadenasortida.append(selectedreference.replace("\t", " "))
            cadenasortida.append(hypothesis[i].replace("\t", " "))
            dE = differencesExcel(selectedreference, hypothesis[i], red, blue,
                                  bold)
            dEtext = differences(selectedreference.replace("\t", " "),
                                 hypothesis[i].replace("\t", " "))
            cadenasortida.append(dEtext)
            sheetDetailed.write_rich_string(i + 1, 4, *dE, text_wrap)

            if doBLEU:
                try:
                    BLEU = corpus_bleu(rtok, htok)
                    sheetDetailed.write(i + 1, columnBLEU, round(BLEU, rbleu))
                    cadenasortida.append(str(round(BLEU, rbleu)))
                except Exception:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed BLEU.")
            if doNIST:
                try:
                    NIST = corpus_nist(rtok, htok)
                    sheetDetailed.write(i + 1, columnNIST, round(NIST, rnist))
                    cadenasortida.append(str(round(NIST, rnist)))
                except Exception:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed NIST.")
            if doWER:
                try:
                    WER = wer_corpus(rtok, htok)
                    sheetDetailed.write(i + 1, columnWER, round(WER, rwer))
                    cadenasortida.append(str(round(WER, rwer)))
                except Exception:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed WER.")

            if doED:
                try:
                    edtotal = 0
                    chartotal = 0
                    for i2 in range(0, len(htok)):
                        editmin = 100000000
                        chartotal += len(htok[i2])
                        for h in rtok[i2]:
                            ed = edit_distance(htok[i2], h)
                            if ed < editmin:
                                editmin = ed
                        edtotal += editmin

                    EditDistance = 100 * (edtotal / chartotal)
                    sheetDetailed.write(i + 1, columnED,
                                        round(EditDistance, reddist))
                    cadenasortida.append(str(round(EditDistance, reddist)))
                except Exception:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed Ed")

            if doTER:
                try:
                    TERcorpus = ter_corpus(rtok, htok)
                    sheetDetailed.write(i + 1, columnTER,
                                        round(TERcorpus, rter))
                    cadenasortida.append(str(round(TERcorpus, rter)))
                except Exception:
                    cadenasortida.append("")
                    print("ERROR: unable to calculate detailed TER",
                          sys.exc_info())

            sortida.write("\t".join(cadenasortida) + "\n")

        workbook.close()
Example #18
def evaluation_metrics(dataset, steps, size):
    references = []
    hypotheses = []

    rouge = Rouge()
    rouge_dict = {
        key: {"f": 0.0, "p": 0.0, "r": 0.0}
        for key in ("rouge-1", "rouge-2", "rouge-l")
    }

    # make references & hypotheses lists
    for inputs, targets in dataset.take(steps):
        for labels in target_tokenizer.sequences_to_texts(
                test_step(inputs, targets)):
            if len(labels) > 0:
                hypotheses.append(labels.split())
            else:
                hypotheses.append([""])

        for labels in input_tokenizer.sequences_to_texts(inputs.numpy()):
            references.append(word_split(labels))

    for index, hypothesis in enumerate(hypotheses):
        max_score = {
            key: {"f": 0.0, "p": 0.0, "r": 0.0}
            for key in ("rouge-1", "rouge-2", "rouge-l")
        }

        # one hypothesis may have several references
        for reference in references[index]:
            try:
                rouge_score = rouge.get_scores(" ".join(hypothesis),
                                               " ".join(reference))[0]
                # keep the best score
                if rouge_sum_score(rouge_score) > rouge_sum_score(max_score):
                    max_score = rouge_score
            except ValueError:
                pass

        for method_key in rouge_dict:
            # fpr iterates over "f", "p", "r" (F1, precision, recall).
            for fpr in rouge_dict[method_key]:
                rouge_dict[method_key][fpr] += max_score[method_key][fpr]

    # average
    for method_key in rouge_dict:
        for fpr in rouge_dict[method_key]:
            rouge_dict[method_key][fpr] /= size

    bleu = bleu_score.corpus_bleu(references, hypotheses, weights=(1, ))
    gleu = gleu_score.corpus_gleu(references, hypotheses, max_len=1)
    nist = nist_score.corpus_nist(references, hypotheses, n=1)

    print("BLEU-1 Score: %.4f" % bleu)
    print("GLEU-1 Score: %.4f" % gleu)
    print("NIST-1 Score: %.4f" % nist)
    print("ROUGE Scores: %s" % rouge_dict_format(rouge_dict))

    return bleu, gleu, nist, rouge_dict
Example #19
def validate(args, val_loader, model):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    val_logger = LogCollector()

    # switch to evaluate mode
    model.val_start()
    model.logger = val_logger
    end = time.time()
    for i, val_data in enumerate(val_loader):

        decoder_outputs, sampled_idxs, mean, logvar, z = model.forward_emb(*val_data)

        if torch.cuda.is_available():
            val_data[1] = val_data[1].cuda()

        batch_size = val_data[1].size(0)
        max_length = 50
        flattened_outputs = decoder_outputs.view(batch_size * max_length, -1)

        batch_outputs = trim_seqs(sampled_idxs)

        np_targets = trim_seqs(val_data[1].unsqueeze(-1))

        batch_targets = [[seq] for seq in np_targets]

        corpus_bleu_score = corpus_bleu(batch_targets, batch_outputs, smoothing_function=SmoothingFunction().method1)
        model.logger.update('C-BLEU', corpus_bleu_score, batch_size)
        
        corpus_nist_score = corpus_nist(batch_targets, batch_outputs, n=4)

        model.logger.update('C-NIST', corpus_nist_score, batch_size)

        corpus_meteor_score = 0
        rouge_scores = 0
        for j in range(batch_size):
            reference = []
            for tid in range(len(batch_targets[j][0])):
                tok = batch_targets[j][0][tid]
                reference.append(vocab_inv[str(tok)])
            ref = [str(' '.join(reference))]

            hypothesis = []
            for tid in range(len(batch_outputs[j])):
                tok = batch_outputs[j][tid]
                hypothesis.append(vocab_inv[str(tok)])
            hypo = str(' '.join(hypothesis))
            corpus_meteor_score += meteor_score(ref, hypo)
            rouge_scores += rouge.score(ref[0], hypo)['rougeL'][2]

        rouge_scores = rouge_scores / batch_size
        model.logger.update('ROUGE-L', rouge_scores, batch_size)
        corpus_meteor_score = corpus_meteor_score / batch_size
        model.logger.update('C-METEOR', corpus_meteor_score, batch_size)
        batch_time.update(time.time() - end)
        end = time.time()
        # Print log info
        
        model.Eiters += 1
        if model.Eiters % args.logging_step == 0:
            print('Test: [{0}/{1}]\t'
                    '{e_log}\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    .format(
                        i, len(val_loader), batch_time=batch_time,
                        e_log=str(model.logger)))
            
    print('Test: [{0}/{1}]\t'
            '{e_log}\t'
            'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
            .format(
                i, len(val_loader), batch_time=batch_time,
                e_log=str(model.logger))) 
    
    return 0
Example #20
def calculate():
    results_frame_text.delete('1.0', END)
    doBLEU = True
    doNIST = True
    doWER = True
    doED = True
    doTER = True

    rfilename = F_frame_E_Ref.get()
    hfilename = F_frame_E_Hyp.get()

    rfile = codecs.open(rfilename, "r", encoding="utf-8")
    hfile = codecs.open(hfilename, "r", encoding="utf-8")

    sys.path.append(os.getcwd())
    selectedtokenizer = combo_tokenizersF.get()
    tokenizer = importlib.import_module(selectedtokenizer)

    references = []
    references_tok = []

    for linia in rfile:
        linia = linia.rstrip()
        # There may be more than one reference, separated by tabs.
        rd = []
        rd_tok = []
        for segment in linia.split("\t"):
            tokens = tokenizer.tokenize(segment).split(" ")
            rd.append(segment)
            rd_tok.append(tokens)
        references.append(rd)
        references_tok.append(rd_tok)

    hypothesis = []
    hypothesis_tok = []

    for linia in hfile:
        linia = linia.rstrip()
        tokens = tokenizer.tokenize(linia).split(" ")
        hypothesis.append(linia)
        hypothesis_tok.append(tokens)

    if doBLEU:
        try:
            BLEU = corpus_bleu(references_tok, hypothesis_tok)
            cadena = "BLEU:    " + str(round(BLEU, rbleu))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate BLEU.", sys.exc_info())
            cadena = "BLEU:  Unable to calculate BLEU"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doNIST:
        try:
            NIST = corpus_nist(references_tok, hypothesis_tok)
            cadena = "NIST:    " + str(round(NIST, rnist))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate NIST.", sys.exc_info())
            cadena = "NIST:  Unable to calculate NIST"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
    if doWER:
        try:
            WER = wer_corpus(references_tok, hypothesis_tok)
            cadena = "WER:     " + str(round(WER, rwer))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate WER.", sys.exc_info())
            cadena = "WER:  Unable to calculate WER"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")

    if doED:
        try:
            edtotal = 0
            chartotal = 0
            for i in range(0, len(hypothesis)):
                editmin = 100000000
                chartotal += len(hypothesis[i])
                for h in references[i]:
                    ed = edit_distance(hypothesis[i], h)
                    if ed < editmin:
                        editmin = ed
                edtotal += editmin

            EditDistance = 100 * (edtotal / chartotal)
            cadena = "%EdDist: " + str(round(EditDistance, red))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate Ed", sys.exc_info())
            cadena = "%EdDist: Unable to calculate Ed"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")

    if doTER:
        try:
            TERcorpus = ter_corpus(references_tok, hypothesis_tok)
            cadena = "TER:     " + str(round(TERcorpus, rter))
            print(cadena)
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
        except Exception:
            print("ERROR: unable to calculate TER", sys.exc_info())
            cadena = "TER:  Unable to calculate TER"
            results_frame_text.insert(INSERT, cadena)
            results_frame_text.insert(INSERT, "\n")
Example #21
# Read files
with open('data/test-queries.txt', 'r') as test_query_file:
    test_queries = test_query_file.readlines()
with open('data/pred-queries.txt', 'r') as pred_query_file:
    pred_queries = pred_query_file.readlines()

# Clean them up and get the structure right
test_queries = [s[:-1].split(" ") for s in test_queries]
pred_queries = [s[:-1].split(" ") for s in pred_queries]
candidates = pred_queries
reference_list = [[sentence] for sentence in test_queries]

# Calculate metrics
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.nist_score import corpus_nist
bleu_score = round(corpus_bleu(reference_list, candidates), 3)
nist_score = round(corpus_nist(reference_list, candidates), 3)
print(f"BLEU:\t{bleu_score}\nNIST:\t{nist_score}")

# OUTPUT
# BLEU:   0.758
# NIST:   5.627
Example #22
def corpus_NIST(references, hypotheses):
    '''
    Same philosophy as with corpus BLEU: references come first, with one
    list of tokenized references per hypothesis.
    '''
    return nist_score.corpus_nist(references, hypotheses)
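A minimal usage sketch for the wrapper above; the import alias is an assumption, since the snippet does not show it:

from nltk.translate import nist_score  # assumed alias used by corpus_NIST

references = [["the cat is on the mat".split()]]
hypotheses = ["the cat sat on the mat".split()]
print(corpus_NIST(references, hypotheses))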
Example #23
import sys

from nltk.tokenize import word_tokenize
from nltk.translate.nist_score import corpus_nist

target_file = sys.argv[1]
pred_file = sys.argv[2]

list_of_references = []
with open(target_file) as f:
    for line in f:
        list_of_references.append([word_tokenize(line)])

hypotheses = []
with open(pred_file) as f:
    for line in f:
        hypotheses.append(word_tokenize(line))

for i in range(5):
    print('NIST-' + str(i + 1) + ':',
          corpus_nist(list_of_references, hypotheses, i + 1))