Example #1
import numpy as np
from nltk import word_tokenize
from nltk.metrics.distance import edit_distance
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.nist_score import sentence_nist
from scipy.stats import pearsonr

# `lcs` (longest common substring length) is assumed to be a helper defined
# elsewhere in the original project.

def load_X(sent_pairs):
    """Create a matrix where every row is a pair of sentences and every column is a feature.
    Feature (column) order is not important to the algorithm."""

    features = ["NIST", "BLEU", "Word Error Rate", "Longest common substring", "Levenshtein distance"]
    X = np.zeros((len(sent_pairs), len(features)))
    NIST = 0
    BLEU = 1
    WER = 2
    LCS = 3
    LD = 4
    for i, pair in enumerate(sent_pairs):
        t1, t2 = pair
        t1_token = word_tokenize(t1)
        t2_token = word_tokenize(t2)
        # print(f"Sentences: {t1}\t{t2}")
        # calculate the scores
        ed = edit_distance(t1_token, t2_token)
        X[i, WER] = ed / len(t1_token) + ed / len(t2_token)
        try:
            X[i, NIST] = sentence_nist([t1_token], t2_token) + sentence_nist([t2_token], t1_token)
        except ZeroDivisionError:
            X[i, NIST] = 0
        X[i, BLEU] = sentence_bleu([t1_token], t2_token) + sentence_bleu([t2_token], t1_token)
        X[i, LCS] = lcs(t1, t2)
        X[i, LD] = edit_distance(t1, t2)

    return X

def main(sts_data, output_file):
    """Calculate Pearson correlation between semantic similarity scores and string similarity metrics.
    Data is formatted as in the STS benchmark."""

    # score_types = ["NIST", "BLEU", "Word Error Rate", "Longest common substring", "Levenshtein distance"]

    # read the dataset
    texts = []
    labels = []

    with open(sts_data, 'r', encoding='utf8') as dd:
        for line in dd:
            fields = line.strip().split("\t")
            labels.append(float(fields[4]))
            t1 = fields[5].lower()
            t2 = fields[6].lower()
            texts.append((t1, t2))

    print(f"Found {len(texts)} STS pairs")

    NIST = []
    BLEU = []
    WER = []
    LCS = []
    LD = []
    for i, pair in enumerate(texts):
        t1, t2 = pair
        t1_token = word_tokenize(t1)
        t2_token = word_tokenize(t2)
        # print(f"Sentences: {t1}\t{t2}")
        # calculate the scores
        ed = edit_distance(t1_token, t2_token)
        WER.append(ed / len(t1_token) + ed / len(t2_token))
        try:
            NIST.append(
                sentence_nist([t1_token], t2_token) +
                sentence_nist([t2_token], t1_token))
        except ZeroDivisionError:
            NIST.append(0)
        BLEU.append(
            sentence_bleu([t1_token], t2_token) +
            sentence_bleu([t2_token], t1_token))
        LCS.append(lcs(t1, t2))
        LD.append(edit_distance(t1, t2))

    result = dict()
    result['NIST correlation'] = round(pearsonr(labels, NIST)[0], 3)
    result['BLEU correlation'] = round(pearsonr(labels, BLEU)[0], 3)
    result['Word Error Rate correlation'] = round(pearsonr(labels, WER)[0], 3)
    result['Longest common substring correlation'] = round(
        pearsonr(labels, LCS)[0], 3)
    result['Levenshtein distance correlation'] = round(
        pearsonr(labels, LD)[0], 3)

    with open(output_file, 'w') as out:
        out.write(f"Semantic textual similarity for {sts_data}\n")
        # write one "<metric>: <correlation>" line per metric
        for metric, corr in result.items():
            out.write(f'{metric}: {corr}\n')
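
A minimal, hypothetical driver for the two functions above; the file names are placeholders for any STS-benchmark-formatted TSV:

if __name__ == "__main__":
    # writes one "<metric>: <correlation>" line per metric to the output file
    main("sts-dev.tsv", "sts_string_similarity.txt")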
Example #3
def nist_func(x, y):
    """Symmetric sentence-level NIST: score x against y and y against x.
    Returns 0 when NLTK raises ZeroDivisionError on degenerate inputs."""
    try:
        nist1 = sentence_nist([x], y)
        nist2 = sentence_nist([y], x)
        return nist1 + nist2
    except ZeroDivisionError:
        return 0
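
A quick, hypothetical sanity check for nist_func; both arguments are token lists:

tokens = word_tokenize("the cat sat on the mat")
print(nist_func(tokens, tokens))  # identical sentences score highest
print(nist_func([], tokens))      # degenerate inputs can raise ZeroDivisionError inside NLTK; nist_func returns 0 instead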
def get_metrics(pred, target):
    # pred and target are lists of token lists, one entry per dialogue turn
    turns = len(target)
    bleu_2 = 0
    bleu_4 = 0
    meteor = 0
    nist_2 = 0
    nist_4 = 0
    for index in range(turns):
        pred_utt = pred[index]
        target_utt = target[index]
        min_len = min(len(pred_utt), len(target_utt))
        lens = min(min_len, 4)
        if lens == 0:
            continue
        if lens >= 4:
            bleu_4_utt = sentence_bleu(
                [target_utt],
                pred_utt,
                weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=SmoothingFunction().method1)
            nist_4_utt = sentence_nist([target_utt], pred_utt, 4)
        else:
            bleu_4_utt = 0
            nist_4_utt = 0
        if lens >= 2:
            bleu_2_utt = sentence_bleu(
                [target_utt],
                pred_utt,
                weights=(0.5, 0.5),
                smoothing_function=SmoothingFunction().method1)
            nist_2_utt = sentence_nist([target_utt], pred_utt, 2)
        else:
            bleu_2_utt = 0
            nist_2_utt = 0

        bleu_2 += bleu_2_utt
        bleu_4 += bleu_4_utt
        meteor += meteor_score([" ".join(target_utt)], " ".join(pred_utt))
        nist_2 += nist_2_utt
        nist_4 += nist_4_utt

    # average over all turns; turns skipped above still count in the denominator
    bleu_2 /= turns
    bleu_4 /= turns
    meteor /= turns
    nist_2 /= turns
    nist_4 /= turns
    return bleu_2, bleu_4, meteor, nist_2, nist_4
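
A sketch of calling get_metrics on toy, pre-tokenized dialogue turns (data invented for illustration; note that the meteor_score call above joins tokens back into strings, which assumes an older NLTK release, as newer ones expect token lists):

pred = [["good", "morning", "to", "you"], ["see", "you", "later"]]
target = [["good", "morning"], ["see", "you", "soon"]]
bleu_2, bleu_4, meteor, nist_2, nist_4 = get_metrics(pred, target)
print(bleu_2, bleu_4, meteor, nist_2, nist_4)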
Example #5
def nist_2(labels, preds):
    # NB: sentence_nist receives plain strings here, so NLTK iterates them
    # character by character; this is effectively a character-level NIST-2
    label = ' '.join([str(elem) for elem in labels])
    prediction = ' '.join([str(elem) for elem in preds])

    if len(prediction) < 2 or len(label) < 2:
        return 0
    return sentence_nist([label], prediction, 2)
Example #6
def calculate_metrics(data,
                      src_field,
                      trg_field,
                      model,
                      device,
                      dump_path=None,
                      model_name='default',
                      max_len=MAX_LEN):
    """Calculate BLEU and NIST metrics"""
    preds_file = dump_path + model_name + '-baseline-output.txt'
    refs_file = dump_path + model_name + '-devel-conc.txt'
    bleu_scores = []
    nist_scores = []
    multibleu_smoother = SmoothingFunction().method4
    refs = []
    preds = []

    # Group sources and references
    src_unique = list(set([tuple(e.src) for e in data.examples]))
    for src in src_unique:
        pred_trg = translate_sentence(src, src_field, trg_field, model, device,
                                      max_len)

        # Remove <eos>
        pred_trg = pred_trg[:-1]
        ref_list = get_ref_list(data, src)

        preds.append(' '.join(pred_trg))
        refs.append(ref_list)

        # Calculate scores and save
        bleu_score = sentence_bleu(ref_list,
                                   pred_trg,
                                   smoothing_function=multibleu_smoother)
        nist_score = sentence_nist(ref_list, pred_trg)
        bleu_scores.append(bleu_score)
        nist_scores.append(nist_score)

    # Dump all results in official e2e metric script compatible format
    file_refs = []
    for ref in refs:
        ref_sentences = [' '.join(tokens) for tokens in ref]
        refs_joined = '\n'.join(ref_sentences)
        file_refs.append(refs_joined)
    with open(preds_file, 'w') as pred_f:
        pred_f.write('\n'.join(preds))
    with open(refs_file, 'w') as ref_f:
        ref_f.write('\n\n'.join(file_refs))
    print(f'Writing files for {model_name}')
    print(f'Predictions in {preds_file}')
    print(f'References in {refs_file}')
    return np.mean(bleu_scores), np.mean(nist_scores)
    def get_nist(self):
        ngram = self.gram
        nist = list()
        reference = self.get_reference()
        # `weight` is computed but unused: sentence_nist takes no BLEU-style weights
        weight = tuple((1. / ngram for _ in range(ngram)))
        with open(self.test_data) as test_data:
            for hypothesis in test_data:
                hypothesis = nltk.word_tokenize(hypothesis)
                nist.append(
                    nist_score.sentence_nist(reference, hypothesis, n=ngram))
        return sum(nist) / len(nist)
Example #8
    def sim_nist(self, hyps, ref):
        """
        :param hyps: a list of hypotheses, each a list of tokens
        :param ref: a list of tokens of the reference
        :return: max NIST over the hypotheses and their mean NIST
        """
        scores = []
        for hyp in hyps:
            try:
                scores.append(sentence_nist([ref], hyp))
            except ZeroDivisionError:  # raised by NLTK on degenerate inputs
                scores.append(0.0)
        return np.max(scores), np.mean(scores)
Example #9
    def calculate_nist_score(self):
        """Main method to calculate the NIST score."""
        hypo, ref, has_one_sentence = self._p.split_references_hypothesis()
        if ref is None:
            return 0
        elif has_one_sentence:
            # one or more references, each a single sentence:
            # calculate the sentence-level NIST score
            return sentence_nist(ref, hypo)
        else:
            # one or more references with more than one sentence:
            # calculate the corpus-level NIST score
            return corpus_nist(ref, hypo)
def calculate_max_nist(list_references, list_hypothesis):
    """Average, over all hypotheses, of the best NIST score against any reference."""
    sum_nist = 0.0
    for references_items, hypothesis in zip(list_references, list_hypothesis):
        nist_per_reference = []
        for reference in references_items:
            try:
                nist_per_reference.append(
                    nist_score.sentence_nist([reference], hypothesis))
            except ZeroDivisionError:
                nist_per_reference.append(0.0)
        # count the best reference score exactly once per hypothesis
        if nist_per_reference:
            sum_nist += max(nist_per_reference)
    mean_nist = sum_nist / len(list_hypothesis)

    return mean_nist
def nist_func(x, y):
    try:
        return nist_score.sentence_nist([nltk.word_tokenize(x)],
                                        nltk.word_tokenize(y))
    except ZeroDivisionError:
        return 0
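
Unlike the token-based nist_func earlier, this variant tokenizes raw strings itself, so it can be applied directly to sentence pairs (sentences invented for illustration):

print(nist_func("the cat sat on the mat", "a cat sat on the mat"))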
    def calc_nist(self, reference, hypothesis, gram):
        return nist_score.sentence_nist(reference, hypothesis, n=gram)
Example #13
for df in dfs:

  # `smoother` is assumed to be defined earlier, e.g. SmoothingFunction().method1
  bleu_sm = []
  met = []
  nist = []

  for i in range(len(df.index)):

    row = df.loc[i]

    ref_tokens = word_tokenize(row['reference'])
    can_tokens = word_tokenize(row['candidate'])
    bleu_sm.append(sentence_bleu([ref_tokens], can_tokens, smoothing_function=smoother))
    # NB: raw-string arguments to meteor_score assume an older NLTK; newer releases expect token lists
    met.append(nltk.translate.meteor_score.meteor_score([row['reference']], row['candidate']))
    nist.append(sentence_nist([ref_tokens], can_tokens, 2))

  df['bleu_sm'] = bleu_sm
  df['met'] = met
  df['nist'] = nist

############ Evaluate various models on various feature selections

col_set = []
col_set.append(['bleu_uni'])
col_set.append(['bleu_sm'])
col_set.append(['met'])
col_set.append(['nist'])
col_set.append(['bleu_uni','bleu_sm','met','nist'])
col_set.append(['cos_s_r'])
col_set.append(['cos_s_c'])
Example #14
def get_nist_score(candidate, reference, ngrams):
    candidate = candidate.split(' ')
    reference = [reference.split(' ')]
    return float(sentence_nist(reference, candidate, ngrams))
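
Usage is string-in, float-out; here with bigrams (sentences invented for illustration):

score = get_nist_score("the cat sat on the mat", "the cat sat on a mat", 2)
print(score)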
Example #15
def nist(batch_targets, batch_outputs):
    score = {}
    # store the score in the dict instead of discarding it
    score['nist'] = sentence_nist(batch_targets, batch_outputs, n=4)
    return score
from nltk.translate.nist_score import sentence_nist
import xlrd

# Give the location of the file
# Earlier scoring variant (commented out): concatenate sentence_bleu scores for
# seven weightings and label the row by origin.
# if sheet.cell_value(i, 0) == 0:
#     score = (str(sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))) + ',' +
#              str(sentence_bleu(reference, candidate, weights=(0, 1, 0, 0))) + ',' +
#              str(sentence_bleu(reference, candidate, weights=(0, 0, 1, 0))) + ',' +
#              str(sentence_bleu(reference, candidate, weights=(0, 0, 0, 1))) + ',' +
#              str(sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0))) + ',' +
#              str(sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0))) + ',' +
#              str(sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))) + ',Human-generated')
# else:
#     score = (the same seven BLEU scores) + ',Machine-generated'

loc = ("/Users/ishitagupta/Desktop/kashgari_data.xlsx")

# To open Workbook
wb = xlrd.open_workbook(loc)
sheet = wb.sheet_by_index(2)
for i in range(1, 4):
    textreference = str(sheet.cell_value(i, 0))
    textcandidate = str(sheet.cell_value(i, 1))
    print(textreference.split())
    print(textcandidate.split())

    reference = [textreference.split()]
    candidate = textcandidate.split()
    if len(candidate) == 1:
        score = 1.0
        print(round(100 * score))
    else:
        score = str(round(100 * sentence_nist(reference, candidate, 2)))
        print(score)
Example #17
    def compute_score(self, candidate: List[str],
                      references: List[List[str]]) -> Tensor:
        # sentence_nist takes (references, hypothesis, n) in that order
        score = sentence_nist(references, candidate, n=self.ngram_order)
        return score
Example #18
def sent_NIST(reference, hypothesis):
    '''
    Same philosophy as with sentence BLEU.
    '''
    return nist_score.sentence_nist(reference, hypothesis)
    def getNistScore(self, hypSent, refSent):
        return NN.sentence_nist([refSent.split()], hypSent.split(), n=5)
Example #20
    etree.SubElement(paraphrase, 'BLUE_w3_titles').text = str(
        sentence_bleu([list_title_1], list_title_2, weights=weights3))
    etree.SubElement(paraphrase, 'BLUE_w4_titles').text = str(
        sentence_bleu([list_title_1], list_title_2, weights=weights4))

    etree.SubElement(paraphrase, 'BLUE_w1_articles').text = str(
        sentence_bleu([list_text_1], list_text_2, weights=weights1))
    etree.SubElement(paraphrase, 'BLUE_w2_articles').text = str(
        sentence_bleu([list_text_1], list_text_2, weights=weights2))
    etree.SubElement(paraphrase, 'BLUE_w3_articles').text = str(
        sentence_bleu([list_text_1], list_text_2, weights=weights3))
    etree.SubElement(paraphrase, 'BLUE_w4_articles').text = str(
        sentence_bleu([list_text_1], list_text_2, weights=weights4))

    # NIST
    nist_titles = sentence_nist([list_title_1], list_title_2, n=3)
    nist_articles = sentence_nist([list_text_1], list_text_2, n=3)
    etree.SubElement(paraphrase, 'nist_titles').text = str(nist_titles)
    etree.SubElement(paraphrase, 'nist_articles').text = str(nist_articles)
    etree.SubElement(paraphrase,
                     'nist_diff').text = str(nist_titles - nist_articles)

    # ROUGE
    title_1_space = title_1.replace(";", " ")
    title_2_space = title_2.replace(";", " ")
    text_1_space = text_1.replace(";", " ")
    text_2_space = text_2.replace(";", " ")

    rouge = Rouge()
    title_score = rouge.get_scores(title_1_space, title_2_space)[0]
    article_score = rouge.get_scores(text_1_space, text_2_space)[0]
def string_sim(sent_pairs):
    """Create a matrix where every row is a pair of sentences and every column is a feature.
    Feature (column) order is not important to the algorithm."""

    features = [
        "NIST", "BLEU", "Word Error Rate", "Longest common substring",
        "Levenshtein distance"
    ]
    nist_list = []
    bleu_list = []
    wer_list = []
    lcs_list = []
    dist_list = []
    for pair in sent_pairs:
        t1 = pair[0]
        t2 = pair[1]
        t1_token = word_tokenize(pair[0])
        t2_token = word_tokenize(pair[1])

        # NIST (symmetric: score each sentence against the other)
        try:
            nist = (nist_score.sentence_nist([t2_token], t1_token) +
                    nist_score.sentence_nist([t1_token], t2_token))
        except ZeroDivisionError:
            nist = 0
        nist_list.append(nist)

        # BLEU (symmetric)
        bleu1 = bleu_score.sentence_bleu([t1_token], t2_token)
        bleu2 = bleu_score.sentence_bleu([t2_token], t1_token)
        bleu_list.append(bleu1 + bleu2)

        # Longest common substring
        s = SequenceMatcher(None, t1, t2)
        lcs = s.find_longest_match(0, len(t1), 0, len(t2))
        lcs_list.append(lcs.size)

        # Edit distance
        dist = edit_distance(t1, t2)
        dist_list.append(dist)

        # Word error rate
        dist_wer = edit_distance(t1_token, t2_token)
        wer = dist_wer / len(t1_token) + dist_wer / len(t2_token)
        wer_list.append(wer)

    all_list = [nist_list, bleu_list, wer_list, lcs_list, dist_list]
    X = np.zeros((len(sent_pairs), len(features)))
    for i in range(len(all_list)):
        X[:, i] = np.asarray(all_list[i])

    return X
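
A small smoke test for string_sim, assuming the names used above (nist_score, bleu_score, word_tokenize, edit_distance, SequenceMatcher, numpy as np) are imported; the sentence pairs are invented:

pairs = [("a cat sat on the mat", "the cat sat on a mat"),
         ("he plays the piano", "she is playing piano")]
X = string_sim(pairs)
print(X.shape)  # (2, 5): NIST, BLEU, WER, LCS, Levenshtein columns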
Example #22
def compute_nist(hypothesis, references):
    # `ntok` is assumed to be a tokenizer instance defined elsewhere
    hypothesis = list(ntok.tokenize(hypothesis))

    references = [list(ntok.tokenize(reference)) for reference in references]

    return sentence_nist(references, hypothesis)
    def __call__(self):
        root = etree.parse(
            r'C:\Users\kiva0319\PycharmProjects\Diploma2020\processed\paraphrases.xml'
        )
        root = root.getroot()
        corpus = etree.SubElement(root, "corpus")

        result_xml = etree.Element('raw_data')
        result_doc = etree.ElementTree(result_xml)

        corpus_info = etree.SubElement(result_xml, 'head')
        etree.SubElement(corpus_info, 'description').text = "—"
        etree.SubElement(corpus_info, 'date').text = str(date.today())
        articles_list = etree.SubElement(result_xml, 'corpus')

        count = 0

        for element in root[1]:
            id = element[0].text
            old_id = element[1].text
            id_1 = element[2].text
            id_2 = element[3].text
            title_1 = element[4].text
            title_2 = element[5].text
            text_1 = element[6].text
            text_2 = element[7].text
            words_title_1 = int(element[8].text)
            words_title_2 = int(element[9].text)
            words_article_1 = int(element[10].text)
            words_article_2 = int(element[11].text)
            num_of_paragraphs_1 = int(element[12].text)
            num_of_paragraphs_2 = int(element[13].text)
            element_paragraphs_1 = element[14].text
            element_paragraphs_2 = element[15].text
            jaccard = element[16].text
            clas = element[17].text

            print(count, id, flush=True)

            # words_max = max(words_max, words_article_1)
            # words_max = max(words_max, words_article_2)
            # chars_max = max(chars_max, len(text_1))
            # chars_max = max(chars_max, len(text_2))
            # continue

            paraphrase = etree.SubElement(articles_list, 'paraphrase')
            etree.SubElement(paraphrase, 'value', name="id").text = id
            etree.SubElement(paraphrase, 'value', name="old_id").text = old_id
            etree.SubElement(paraphrase, 'value', name="id_1").text = id_1
            etree.SubElement(paraphrase, 'value', name="id_2").text = id_2
            etree.SubElement(paraphrase, 'value',
                             name="title_1").text = title_1
            etree.SubElement(paraphrase, 'value',
                             name="title_2").text = title_2
            etree.SubElement(paraphrase, 'value',
                             name="jaccard").text = jaccard
            etree.SubElement(paraphrase, 'value', name="class").text = clas

            # words and paragraphs diff
            etree.SubElement(paraphrase, 'words_title_diff').text = str(
                abs(words_title_1 - words_title_2))
            etree.SubElement(paraphrase, 'words_article_diff').text = str(
                abs(words_article_1 - words_article_2))
            etree.SubElement(paraphrase, 'paragraphs_diff').text = str(
                abs(num_of_paragraphs_1 - num_of_paragraphs_2))

            # flesch_reading_ease
            textstat.textstat.set_lang("ru")
            etree.SubElement(paraphrase,
                             'flesch_reading_ease_title_1').text = str(
                                 textstat.flesch_reading_ease(" ".join(
                                     title_1.split(";"))))
            etree.SubElement(paraphrase,
                             'flesch_reading_ease__title_2').text = str(
                                 textstat.flesch_reading_ease(" ".join(
                                     title_2.split(";"))))
            etree.SubElement(
                paraphrase, 'flesch_reading_ease_article_1').text = str(
                    textstat.flesch_reading_ease(" ".join(text_1.split(";"))) /
                    num_of_paragraphs_1)
            etree.SubElement(
                paraphrase, 'flesch_reading_ease_article_2').text = str(
                    textstat.flesch_reading_ease(" ".join(text_2.split(";"))) /
                    num_of_paragraphs_2)

            # BLEU (the element tags below keep the original "BLUE" spelling)
            weights1 = (1, 0, 0, 0)
            weights2 = (0.5, 0.5, 0, 0)
            weights3 = (0.33, 0.33, 0.33, 0)
            weights4 = (0.25, 0.25, 0.25, 0.25)

            list_title_1 = title_1.split(";")
            list_title_2 = title_2.split(";")
            list_text_1 = text_1.split(";")
            list_text_2 = text_2.split(";")

            etree.SubElement(paraphrase, 'BLUE_w1_titles').text = str(
                sentence_bleu([list_title_1], list_title_2, weights=weights1))
            etree.SubElement(paraphrase, 'BLUE_w2_titles').text = str(
                sentence_bleu([list_title_1], list_title_2, weights=weights2))
            etree.SubElement(paraphrase, 'BLUE_w3_titles').text = str(
                sentence_bleu([list_title_1], list_title_2, weights=weights3))
            etree.SubElement(paraphrase, 'BLUE_w4_titles').text = str(
                sentence_bleu([list_title_1], list_title_2, weights=weights4))

            etree.SubElement(paraphrase, 'BLUE_w1_articles').text = str(
                sentence_bleu([list_text_1], list_text_2, weights=weights1))
            etree.SubElement(paraphrase, 'BLUE_w2_articles').text = str(
                sentence_bleu([list_text_1], list_text_2, weights=weights2))
            etree.SubElement(paraphrase, 'BLUE_w3_articles').text = str(
                sentence_bleu([list_text_1], list_text_2, weights=weights3))
            etree.SubElement(paraphrase, 'BLUE_w4_articles').text = str(
                sentence_bleu([list_text_1], list_text_2, weights=weights4))

            # NIST
            nist_1_titles = 0
            nist_1_articles = 0

            nist_2_titles = 0
            nist_2_articles = 0

            nist_3_titles = 0
            nist_3_articles = 0

            try:
                nist_1_titles = sentence_nist([list_title_1],
                                              list_title_2,
                                              n=1)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_1_articles = sentence_nist([list_text_1],
                                                list_text_2,
                                                n=1)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_2_titles = sentence_nist([list_title_1],
                                              list_title_2,
                                              n=2)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_2_articles = sentence_nist([list_text_1],
                                                list_text_2,
                                                n=2)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_3_titles = sentence_nist([list_title_1],
                                              list_title_2,
                                              n=3)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_3_articles = sentence_nist([list_text_1],
                                                list_text_2,
                                                n=3)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            etree.SubElement(paraphrase,
                             'nist_1_titles').text = str(nist_1_titles)
            etree.SubElement(paraphrase,
                             'nist_1_articles').text = str(nist_1_articles)

            etree.SubElement(paraphrase,
                             'nist_2_titles').text = str(nist_2_titles)
            etree.SubElement(paraphrase,
                             'nist_2_articles').text = str(nist_2_articles)

            etree.SubElement(paraphrase,
                             'nist_3_titles').text = str(nist_3_titles)
            etree.SubElement(paraphrase,
                             'nist_3_articles').text = str(nist_3_articles)

            etree.SubElement(paraphrase,
                             'nist_1_diff').text = str(nist_1_titles -
                                                       nist_1_articles)
            etree.SubElement(paraphrase,
                             'nist_2_diff').text = str(nist_2_titles -
                                                       nist_2_articles)
            etree.SubElement(paraphrase,
                             'nist_3_diff').text = str(nist_3_titles -
                                                       nist_3_articles)

            # ROUGE
            title_1_space = title_1.replace(";", " ")
            title_2_space = title_2.replace(";", " ")
            text_1_space = text_1.replace(";", " ")
            text_2_space = text_2.replace(";", " ")

            rouge = Rouge()
            title_score = rouge.get_scores(title_1_space, title_2_space)[0]
            article_score = rouge.get_scores(text_1_space, text_2_space)[0]

            etree.SubElement(paraphrase, 'rouge-1_titles').text = str(
                title_score['rouge-1']['f'])
            etree.SubElement(paraphrase, 'rouge-2_titles').text = str(
                title_score['rouge-2']['f'])
            etree.SubElement(paraphrase, 'rouge-L_titles').text = str(
                title_score['rouge-l']['f'])

            etree.SubElement(paraphrase, 'rouge-1_articles').text = str(
                article_score['rouge-1']['f'])
            etree.SubElement(paraphrase, 'rouge-2_articles').text = str(
                article_score['rouge-2']['f'])
            etree.SubElement(paraphrase, 'rouge-L_articles').text = str(
                article_score['rouge-l']['f'])

            # METEOR
            stemmer = SnowballStemmer("russian")
            wikiwordnet = WikiWordnet()
            etree.SubElement(paraphrase, 'meteor_title').text = str(
                meteor_score([title_1_space],
                             title_2_space,
                             stemmer=stemmer,
                             wordnet=wikiwordnet))
            etree.SubElement(paraphrase, 'meteor_article').text = str(
                meteor_score([text_1_space],
                             text_2_space,
                             stemmer=stemmer,
                             wordnet=wikiwordnet))

            count += 1

        with open("processed/metrics.xml", 'wb') as out_file:
            result_doc.write(out_file,
                             xml_declaration=True,
                             encoding='utf-8',
                             pretty_print=True)
    def calc_nist(self, reference, hypothesis, gram=5):
        print("Type:", type(reference[0][0]))
        print("Lengths:", len(reference), len(hypothesis))
        print("Hypothesis 2:", hypothesis)
        return nist_score.sentence_nist(reference, hypothesis, n=gram)
def nist(predict, target, n):
    if len(predict) < n or len(target) < n:
        return 0
    return sentence_nist([target], predict, n)
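
The length guard makes the n-gram order explicit at the call site, e.g. (token lists invented for illustration):

print(nist(["the", "cat", "sat"], ["the", "cat", "sat"], 2))
print(nist(["hi"], ["hello", "there"], 2))  # predict shorter than n -> 0 without calling NLTK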
Example #26
def nist(df):
    # sentence_nist does not tokenize; split the strings into tokens first
    df['nist'] = df.apply(
        lambda x: sentence_nist([word_tokenize(x['reference'])],
                                word_tokenize(x['translation'])),
        axis=1)
    return df
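
A toy DataFrame to exercise the function (column names as in the snippet; pandas and word_tokenize assumed imported):

import pandas as pd
df = pd.DataFrame({'reference': ["the cat sat on the mat"],
                   'translation': ["the cat sat on a mat"]})
print(nist(df)['nist'])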