Code example #1
import numpy as np
from nltk.translate import gleu_score as gleu


def G_bleu_score(tru, summ, rev):

    actual = []
    predicted = []
    review = []
    for i in range(len(tru)):
        actual.append(tru[i].split(' '))
        predicted.append(summ[i].split(' '))
        review.append(rev[i].split(' '))

    gleu_actual = []
    gleu_predicted = []
    gleu_pred_to_actual = []
    for i in range(len(actual)):
        # sentence_gleu expects a list of tokenized references and a tokenized
        # hypothesis; joining the tokens back into a string would score characters.
        gleu_actual.append(gleu.sentence_gleu([actual[i]], review[i]))
        gleu_predicted.append(gleu.sentence_gleu([predicted[i]], review[i]))
        gleu_pred_to_actual.append(gleu.sentence_gleu([predicted[i]], actual[i]))

    ar = np.mean(gleu_actual)
    pr = np.mean(gleu_predicted)
    ap = np.mean(gleu_pred_to_actual)

    print('actual GLEU score: ', ar)
    print('predicted GLEU score: ', pr)
    print('actual to predicted GLEU score: ', ap)
Code example #2
from nltk.translate import gleu_score as gleu


def test():
    hyp1 = [
        'It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that',
        'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the',
        'party'
    ]
    ref1a = [
        'It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that',
        'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands'
    ]
    ref1b = [
        'It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees',
        'the', 'military', 'forces', 'always', 'being', 'under', 'the',
        'command', 'of', 'the', 'Party'
    ]
    ref1c = [
        'It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army',
        'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party'
    ]

    hyp2 = 'he read the book because he was interested in world history'.split()
    ref2a = 'he was interested in world history because he read the book'.split()

    list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
    hypotheses = [hyp1, hyp2]
    corpus_score = gleu.corpus_gleu(list_of_references, hypotheses)
    print("Corpus score: " + str(corpus_score))

    # The example below shows that corpus_gleu() differs from averaging sentence_gleu() over the hypotheses.
    score1 = gleu.sentence_gleu([ref1a], hyp1)
    score2 = gleu.sentence_gleu([ref2a], hyp2)
    average_score = (score1 + score2) / 2
    print("Sentence score average: " + str(average_score))
Code example #3
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.ribes_score import sentence_ribes

cc = SmoothingFunction()


def calc_seq_mt_features(ql, qr, signature=""):
    bleu_score_l = sentence_bleu(
        [ql], qr, smoothing_function=cc.method3)  # NIST smoothing (method3)
    bleu_score_r = sentence_bleu([qr], ql, smoothing_function=cc.method3)
    # Wrap the single reference in a list so GLEU is computed over word n-grams.
    gleu_score_l = sentence_gleu([ql], qr)
    gleu_score_r = sentence_gleu([qr], ql)

    try:
        ribes_score_l = sentence_ribes([ql], qr)
    except ZeroDivisionError:
        ribes_score_l = 0
    try:
        ribes_score_r = sentence_ribes([qr], ql)
    except ZeroDivisionError:
        ribes_score_r = 0

    feature_dict = {}

    if signature:
        signature = signature + "_"
    feature_names = [
        "bleu_score_l", "bleu_score_r", "gleu_score_l", "gleu_score_r",
        "ribes_score_l", "ribes_score_r"
    ]

    for feature_name in feature_names:
        feature_dict[signature + feature_name] = locals()[feature_name]

    return feature_dict
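
A minimal call of calc_seq_mt_features above, with invented token lists: both inputs are tokenized sentences, and the returned dict holds the six BLEU/GLEU/RIBES features, optionally prefixed with the signature.

ql = "how do i reset my password".split()
qr = "how can i change my password".split()
print(calc_seq_mt_features(ql, qr, signature="q"))  # keys like "q_bleu_score_l", ...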
Code example #4
File: mt_text_score.py Project: samuel114/NLPMetrics
    def sentence_average_score(self, list_of_references, hypotheses, score_type="BLEU"):
        """ Averages score applied for every sentence

        :param list_of_references: list of reference texts (separated into words)
        :param hypotheses: hypotheses relative to reference (separated into words)
        :param score_type: metric being used
        :return: average sentences score
        """

        sent_average_score = 0
        if utils.BLEU_NAME in score_type:
            for ref, hyp in zip(list_of_references, hypotheses):
                sent_average_score += bleu.sentence_bleu(ref, hyp)  # gram: default is between 1 and 4
        elif utils.GOOGLE_BLEU_NAME in score_type:
            for ref, hyp in zip(list_of_references, hypotheses):
                sent_average_score += gleu.sentence_gleu(ref, hyp)  # gram: default is between 1 and 4
        elif utils.WER_NAME in score_type:
            for ref, hyp in zip(list_of_references, hypotheses):
                sent_average_score += self.wer_score(ref[0], hyp)  # Assumes only 1 reference
        elif utils.TER_NAME in score_type:
            for ref, hyp in zip(list_of_references, hypotheses):
                sent_average_score += self.ter_score(ref[0], hyp)

        sent_average_score /= len(list_of_references)

        print("%s sentence average score: %.4f" % (score_type, sent_average_score))
        return sent_average_score
Code example #5
File: train.py Project: zaidhassanch/PointerNetworks
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
from nltk.translate.gleu_score import sentence_gleu, corpus_gleu

# translate_sentence and beam_search are defined elsewhere in this project.


def calculate_bleu(data, src_field, model, device, decodeType, max_len=30):
    cc = SmoothingFunction()
    sentBleu = 0.0
    sentGleu = 0.0
    trgs = []
    pred_trgs = []
    #bs = Beam_Search(model)
    for datum in tqdm(data):
        trg = vars(datum)['correction1']
        src = vars(datum)['orig']
        #translate_sentence(src, src_field, model, device, max_len = 25)
        #HERE
        if decodeType == "greedy":
            pred_trg = translate_sentence(src, src_field, model, device,
                                          max_len)
        else:
            #pred_trg = bs(src, src_field, device)
            pred_trg = beam_search(src, src_field, model, device, max_len)
        #cut off <eos> token
        #HERE
        #pred_trg = pred_trg[1:-1]
        #if len(pred_trg) < 2: pred_trg.append(".")
        sentBleu += sentence_bleu([trg],
                                  pred_trg,
                                  smoothing_function=cc.method3)
        sentGleu += sentence_gleu([trg], pred_trg)
        pred_trgs.append(pred_trg)
        trgs.append([trg])
    sentBleu = sentBleu / len(data)
    sentGleu = sentGleu / len(data)
    corpusBleu = corpus_bleu(trgs, pred_trgs, smoothing_function=cc.method3)
    corpusGleu = corpus_gleu(trgs, pred_trgs)
    return sentBleu, sentGleu, corpusBleu, corpusGleu
Code example #6
import math

import tensorflow as tf
from nltk.translate import gleu_score as gleu
from nltk.translate.bleu_score import sentence_bleu

# ShowAndTellModel, Vocabulary, CaptionGenerator, FLAGS and _load_filenames
# come from the surrounding project.


def main(_):
    model = ShowAndTellModel(FLAGS.model_path)
    vocab = Vocabulary(FLAGS.vocab_file)
    filenames = _load_filenames()
    can1 = "a table with different kinds of food"
    candidate = can1.split()
    generator = CaptionGenerator(model, vocab)
    for filename in filenames:
        with tf.gfile.GFile(filename, "rb") as f:
            image = f.read()
        captions = generator.beam_search(image)
        print("Captions: ")
        for i, caption in enumerate(captions):
            sentence = [vocab.id_to_token(w) for w in caption.sentence[1:-1]]
            sentence = " ".join(sentence)
            temp = "  %d) %s (p=%f)" % (i + 1, sentence,
                                        math.exp(caption.logprob))
            print(temp)
            comp = [sentence.split()]
            # Calculate the BLEU score
            print('BLEU cumulative 1-gram: %f' %
                  sentence_bleu(comp, candidate, weights=(1, 0, 0, 0)))
            print('BLEU cumulative 2-gram: %f' %
                  sentence_bleu(comp, candidate, weights=(0.5, 0.5, 0, 0)))
            # GLEU score
            G = gleu.sentence_gleu(comp, candidate, min_len=1, max_len=2)
            print("GLEU score for this sentence: {}".format(G))
Code example #7
    def get_score(actual_list: List[str], desired_list: List[List[str]],
                  n_gram: int):
        """
        :param desired_list: A List of a List of all possible sentences ex: [['cats are cute'], ['dogs are cute']]
        :param actual_list: A list of the sentences to be scored ex: ['cats are cute']
        :param n_gram: is the gram size ex: ['cats are cute'] -> n_gram = 3
        return gives a float of the sentence-level GLEU score
        """
        import nltk.translate.gleu_score as gleu

        if n_gram <= 4:
            return gleu.sentence_gleu(desired_list,
                                      actual_list,
                                      max_len=n_gram)
        else:
            # For an n-gram size of 5 or more, fall back to the default max_len of 4.
            return gleu.sentence_gleu(desired_list, actual_list)
Code example #8
def get_gleu_score(sentence_gleu, hyp, ref):
    """
    Return the GLEU score.
    :param sentence_gleu: nltk.translate.gleu_score.sentence_gleu
    :param hyp: hypothesis sentence (tokenized), list(str)
    :param ref: reference sentences (tokenized), list(list(str))
    :return: GLEU score
    """
    return sentence_gleu(ref, hyp)
Code example #9
from typing import Dict, List, Optional

from nltk.translate.gleu_score import sentence_gleu


def _get_sent_gleu(
        hypothesis: List[str], references: List[List[str]],
        extra_args: Optional[Dict[str, str]] = None
) -> List[float]:
    joined_references = list(zip(*references))
    return [
        sentence_gleu([rr.split() for rr in r], h.split())
        for r, h in zip(joined_references, hypothesis)
    ]
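
A minimal sketch of how _get_sent_gleu above might be called, with invented sentences: references holds one list per reference stream, each aligned with the hypotheses, and zip(*references) regroups them into one reference tuple per hypothesis.

hypothesis = ["the cat sat on the mat", "he read the book"]
references = [
    ["the cat sat on a mat", "he read a book"],               # first reference for each hypothesis
    ["a cat was sitting on the mat", "he has read the book"]  # second reference for each hypothesis
]
print(_get_sent_gleu(hypothesis, references))  # one sentence-level GLEU per hypothesis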
Code example #10
    def score_sentence(self, sentence, target):
        tgt = self.itos(target)
        sen = self.itos(sentence)
        # if (self.i == 0):
        # print(tgt)
        # print(sen)
        # print()
        self.i = 1

        return sentence_gleu([tgt], sen)
Code example #11
    def _get_reward(y_hat, y, n_gram=6, method='gleu'):
        # This method computes the reward from the sampled output and the reference sentence.
        # For now, we use GLEU from NLTK, but you can plug in your own reward function.
        # GLEU is a variation of BLEU that is better suited to reinforcement learning.
        sf = SmoothingFunction()
        score_func = {
            'gleu':
            lambda ref, hyp: sentence_gleu([ref], hyp, max_len=n_gram),
            'bleu1':
            lambda ref, hyp: sentence_bleu([ref],
                                           hyp,
                                           weights=[1. / n_gram] * n_gram,
                                           smoothing_function=sf.method1),
            'bleu2':
            lambda ref, hyp: sentence_bleu([ref],
                                           hyp,
                                           weights=[1. / n_gram] * n_gram,
                                           smoothing_function=sf.method2),
            'bleu4':
            lambda ref, hyp: sentence_bleu([ref],
                                           hyp,
                                           weights=[1. / n_gram] * n_gram,
                                           smoothing_function=sf.method4),
        }[method]

        # Since we don't calculate the reward score exactly the same way as multi-bleu.perl
        # (in particular, the tokenization differs), I recommend setting n_gram to 6.

        # |y| = (batch_size, length1)
        # |y_hat| = (batch_size, length2)

        with torch.no_grad():
            scores = []

            for b in range(y.size(0)):
                ref, hyp = [], []
                for t in range(y.size(-1)):
                    ref += [str(int(y[b, t]))]
                    if y[b, t] == data_loader.EOS:
                        break

                for t in range(y_hat.size(-1)):
                    hyp += [str(int(y_hat[b, t]))]
                    if y_hat[b, t] == data_loader.EOS:
                        break
                # The lines below are slower than the naive for-loops above.
                # ref = y[b].masked_select(y[b] != data_loader.PAD).tolist()
                # hyp = y_hat[b].masked_select(y_hat[b] != data_loader.PAD).tolist()

                scores += [score_func(ref, hyp) * 100.]
            scores = torch.FloatTensor(scores).to(y.device)
            # |scores| = (batch_size)

            return scores
Code example #12
File: utils.py Project: GZJAS/Squirrel
from nltk.translate.gleu_score import sentence_gleu, corpus_gleu


def computeGLEU(outputs, targets, corpus=False, tokenizer=None, segmenter=None):
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]

    if segmenter is not None:
        outputs = segmenter(outputs)
        targets = segmenter(targets)

    if not corpus:
        return [sentence_gleu([t], o) for o, t in zip(outputs, targets)]
    return corpus_gleu([[t] for t in targets], outputs)
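
A quick usage sketch for computeGLEU above, with invented sentences: the tokenizer is any callable that splits a string into tokens; corpus=False returns one sentence-level GLEU per pair, corpus=True a single corpus-level score.

outs = ["the cat sat on the mat", "he read the book"]
tgts = ["the cat sat on a mat", "he read a book"]
print(computeGLEU(outs, tgts, corpus=False, tokenizer=str.split))
print(computeGLEU(outs, tgts, corpus=True, tokenizer=str.split))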
Code example #13
import csv

from nltk.translate import gleu_score as gleu

# `tokenizer` is assumed to be a word tokenizer defined elsewhere in this project.


def evaluateUsingGLEU(transliterated_file, row_name):
    j, score = 0, 0
    dataReader = csv.DictReader(transliterated_file)
    for j, row in enumerate(dataReader):
        sent_man_written = row['man_written']
        reference = [tokenizer.tokenize(sent_man_written)]
        sent_machine_gen = row[row_name]
        candidate = tokenizer.tokenize(sent_machine_gen)
        score += gleu.sentence_gleu(reference, candidate)
    avg_score = score / (j + 1)
    print("Score using GLEU : ",avg_score)
Code example #14
File: utils.py Project: yoonjung/dl4mt-nonauto
import torch
import revtok
from nltk.translate.gleu_score import sentence_gleu, corpus_gleu


def computeGLEU(outputs, targets, corpus=False, tokenizer=None):
    if tokenizer is None:
        tokenizer = revtok.tokenize

    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]

    if not corpus:
        return torch.Tensor(
            [sentence_gleu([t], o) for o, t in zip(outputs, targets)])
    return corpus_gleu([[t] for t in targets], [o for o in outputs])
Code example #15
File: utils.py Project: GZJAS/Squirrel
import torch
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.gleu_score import sentence_gleu


def computeBLEU(outputs, targets, corpus=False, tokenizer=None, segmenter=None):
    outputs = [tokenizer(o) for o in outputs]
    targets = [tokenizer(t) for t in targets]

    if segmenter is not None:
        outputs = segmenter(outputs)
        targets = segmenter(targets)

    if not corpus:
        # Note: despite the function name, the per-sentence branch scores with sentence_gleu.
        return torch.Tensor(
            [sentence_gleu([t], o) for o, t in zip(outputs, targets)])
    # emulate_multibleu is only accepted by older NLTK releases.
    return corpus_bleu([[t] for t in targets], outputs, emulate_multibleu=True)
Code example #16
    def correlation(self, prev_sentences, response, dialog):
        """ Evaluate the relationship between every pair of sentences
            <previous_i, response> for i in [0, len(prev_sentence)]
            using averaged Word2Vec vectors and term-frequencies as weights.
            Also compute translation metrics scores using the previous lines as
            given references and the response as the translation hypothesis.

            TODO: - use bigrams
                  - use tf-idf trained on opposite character's dialog
                  - add more metrics (ROUGE, NIST, LEPOR, ...)

            INPUT:
            prev_sentences  -   list of previous sentences in the dialog
            response        -   response to the last sentence
            dialog          -   dialog generated so far (used as context)

            OUTPUT:
            score between 0 and MAX, where MAX is the maximum correlation score.
            MAX is just the number of metrics used to evaluate the correlation;
            it has yet to be established.
        """
        #0) Compute term frequency for each word from the given corpus
        score = 0.0
        counts = None
        corpus = prev_sentences + [response]

        if dialog:
            corpus += dialog

        counts = Counter(word_tokenize('\n'.join(corpus)))

        #1) Calculate weighted vector distance between every pair <previous_i, response>
        #   and compute the mean of such distances
        vect_dist = 0.0
        for previous in prev_sentences:
            vect_dist += 1 - np.linalg.norm(
                self._tf_weighted_sum(previous, counts) -
                self._tf_weighted_sum(response, counts))

        #TODO: scale down the weight as going back into the dialog history
        score += vect_dist / len(prev_sentences)

        #2) Sentence-level translation metrics (BLEU, GLEU) with the previous
        #   sentences as tokenized references and the response as the hypothesis
        f = SmoothingFunction().method3
        refs = [word_tokenize(p) for p in prev_sentences]
        hyp = word_tokenize(response)
        bleu = sentence_bleu(refs, hyp, smoothing_function=f)
        gleu = sentence_gleu(refs, hyp)
        metrics_score = np.mean([bleu, gleu])

        score += metrics_score

        return score
Code example #17
File: utils.py Project: adempsey/neural-editor
def gleu(reference, predict):
    """Compute sentence-level gleu score.

    Args:
        reference (list[str])
        predict (list[str])
    """
    from nltk.translate import gleu_score

    if len(predict) == 0:
        if len(reference) == 0:
            return 1.0
        else:
            return 0.0

    return gleu_score.sentence_gleu([reference], predict)
Code example #18
from nltk.translate import gleu_score


def sentence_gleu(reference: str, prediction: str):
    # Note: the raw strings are not tokenized here, so NLTK scores character
    # n-grams; split them for word-level GLEU.
    return gleu_score.sentence_gleu([reference.strip()], prediction.strip())
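
A small illustration of that distinction, with made-up sentences: NLTK scores whatever sequences it is handed, so raw strings are compared character by character while split strings are compared word by word.

from nltk.translate import gleu_score

ref = "the cat sat on the mat"
hyp = "the cat sat on a mat"
char_level = gleu_score.sentence_gleu([ref], hyp)                  # character n-grams
word_level = gleu_score.sentence_gleu([ref.split()], hyp.split())  # word n-grams
print(char_level, word_level)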
Code example #19
def reinforce(
        input_tensor,
        target_tensor,
        target_sentence,  # used for calculating GLEU score
        encoder,
        decoder,
        output_lang,
        encoder_optimizer,
        decoder_optimizer,
        max_length=MAX_LENGTH,
        teacher_forcing_ratio=0.5,
        hypothesis_to_generate=20,
        baseline_reward=0.2):

    # Part 1: Generate k hypothesis sentences
    hyp_sents = []  # list of generated sentences
    hyp_probs = []  # their respective log probabilities

    for k in range(hypothesis_to_generate):
        encoder_hidden = encoder.init_hidden()
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        input_length = input_tensor.size(0)
        target_length = target_tensor.size(0)

        encoder_outputs = torch.zeros(max_length,
                                      encoder.hidden_size,
                                      device=DEV_)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]

        decoder_input = torch.tensor([[Lang.SOS_token]], device=DEV_)
        decoder_hidden = encoder_hidden

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio \
            else False

        out_sent = []  # the kth generated sentence
        out_prob = 0  # and its respective log probability

        if use_teacher_forcing:
            # Teacher forcing: Feed the target as the next input to decoder
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                # topv, topi = decoder_output.topk(1)
                # out_sent.append(output_lang.idx2word[topi.item()])
                # out_prob += topv
                try:
                    m = Categorical(logits=decoder_output)
                    action = m.sample()

                    if action.cpu().item() == Lang.EOS_token:
                        break

                    out_sent.append(output_lang.idx2word[action.cpu().item()])
                    out_prob += decoder_output[0][action.cpu().item()]

                    decoder_input = target_tensor[di]  # Teacher forcing

                except Exception as e:
                    print(e)
                    breakpoint()
        else:
            # Without teacher forcing: use its own predictions as the next input to decoder
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                # topv, topi = decoder_output.topk(1)
                # out_sent.append(output_lang.idx2word[topi.item()])
                # out_prob += topv
                try:
                    m = Categorical(logits=decoder_output)
                    action = m.sample()

                    if action.item() == Lang.EOS_token:
                        break

                    out_sent.append(output_lang.idx2word[action.cpu().item()])
                    out_prob += decoder_output[0][action.cpu().item()]

                    decoder_input = action.detach()

                except Exception as e:
                    print(e)
                    breakpoint()

                # FIXME Check this!
                # decoder_input = topi.squeeze().detach()  # detach from history as input
                # if decoder_input.item() == Lang.EOS_token:
                #    break

        hyp_sents.append(out_sent)
        hyp_probs.append(out_prob)

    hyp_probs = torch.tensor(hyp_probs, device=DEV_)
    # FIXME normalize probability values
    hyp_probs = F.softmax(hyp_probs, dim=0)
    print(hyp_probs)

    scores = []
    for k in range(hypothesis_to_generate):
        # Score the output sentence using GLEU
        score = gleu_score.sentence_gleu([target_sentence], hyp_sents[k])
        scores.append(score)

    scores = torch.tensor(scores, device=DEV_)

    reward = torch.sum(hyp_probs * scores)
    baseline = reward / hypothesis_to_generate  # TODO CHECK BASELINE

    loss = -torch.sum(torch.log(hyp_probs)) * (reward - baseline)
    print('loss {} - reward {} - baseline {}'.format(loss, reward, baseline))
    if loss < 1e-3:
        breakpoint()
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / target_length  # TODO CHECK THIS
Code example #20
from nltk.translate.gleu_score import sentence_gleu


def computeGleu(target, reference):
    # Note: untokenized strings are scored at the character level; split them for word-level GLEU.
    return [sentence_gleu([ref.strip()], tgt.strip()) for (ref, tgt) in zip(reference, target)]
Code example #21
def compute_features(data):
    # Initialize all feature placeholders
    data[ratio_num_char_source_candidate] = []
    data[ratio_num_tokens_source_candidate] = []
    data[ratio_mean_token_length_source_candidate] = []
    data[ratio_common_bigrams_candidate_reference] = []
    data[ratio_num_token_candidate_reference] = []
    data[gleu_scores] = []
    data[bleu_scores] = []
    data[chrf_scores] = []
    data[labels] = []
    data[ratio_tree_height_candidate_reference] = []

    for line_idx in range(0, len(data[source_lines])):
        # Feature: gleu_scores (tokenize so GLEU is computed over word n-grams)
        data[gleu_scores].append(sentence_gleu([data[reference_lines][line_idx].split()],
                                               data[candidate_lines][line_idx].split()))

        # Feature: chrf_scores
        data[chrf_scores].append(sentence_chrf(data[reference_lines][line_idx], data[candidate_lines][line_idx]))

        # Feature: bleu_scores
        data[bleu_scores].append(float(data[bleu_scores_lines][line_idx]))

        # Feature: ratio_num_char_source_candidate
        data[ratio_num_char_source_candidate].append(
            len(re.sub('[\s+]', '', data[source_lines][line_idx]))
            / len(re.sub('[\s+]', '', data[candidate_lines][line_idx])))

        # Feature: ratio_num_tokens_source_candidate
        data[ratio_num_tokens_source_candidate].append(
            len(re.compile('\S+').findall(data[source_lines][line_idx]))
            / len(re.compile('\S+').findall(data[candidate_lines][line_idx])))

        # Feature: ratio_num_token_candidate_reference
        data[ratio_num_token_candidate_reference].append(
            len(re.sub('[\S+]', '', data[candidate_lines][line_idx]))
            / len(re.sub('[\S+]', '', data[reference_lines][line_idx])))

        # Feature: ratio_mean_token_length_source_candidate
        data[ratio_mean_token_length_source_candidate].append(
            np.mean(list(map(len, re.compile('\S+').findall(data[source_lines][line_idx]))))
            / np.mean(list(map(len, re.compile('\S+').findall(data[candidate_lines][line_idx])))))

        # Feature: ratio_common_bigrams_candidate_reference
        data[ratio_common_bigrams_candidate_reference].append(
            len(
                set([b for b in zip(re.compile('\S+').findall(data[reference_lines][line_idx])[:-1],
                                    re.compile('\S+').findall(data[reference_lines][line_idx])[1:])])
                &
                set([b for b in zip(re.compile('\S+').findall(data[candidate_lines][line_idx])[:-1],
                                    re.compile('\S+').findall(data[candidate_lines][line_idx])[1:])])
            )
            /
            len([b for b in zip(re.compile('\S+').findall(data[reference_lines][line_idx])[:-1],
                                re.compile('\S+').findall(data[reference_lines][line_idx])[1:])]))

        # Feature: ratio_tree_height_candidate_reference
        data[ratio_tree_height_candidate_reference].append(
            data[candidate_tree_heights][line_idx] / data[reference_tree_heights][line_idx]
        )

        # Feature: labels
        data[labels].append(1 if data[provided_labels][line_idx] == "H" else 0)
Code example #22
File: gleu.py Project: scape1989/differentiable-bleu
def main():
    from nltk.translate.gleu_score import corpus_gleu, sentence_gleu

    eos = 6
    reference_batch = [[1, 1, 2, 1,
                        eos]]  #, [5, 1, eos, 0, 0], [2, 5, 3, eos, 1]]
    candidate_batch = [[1, 3, 1, eos,
                        0]]  #, [5, 2, eos, 0, 0], [2, 2, 3, eos, 0]]
    row = 0

    seq_length = len(candidate_batch[row])

    true_batch_gleu = corpus_gleu([[_crop(r, eos)] for r in reference_batch],
                                  [_crop(c, eos) for c in candidate_batch])

    gleu_score, n_match, tpfp, tpfn = custom_sentence_gleu(
        [_crop(reference_batch[row], eos)], _crop(candidate_batch[row], eos))

    true_gleu_scores = [
        sentence_gleu([_crop(reference_batch[k], eos)],
                      _crop(candidate_batch[k], eos))
        for k in range(len(candidate_batch))
    ]
    print("true gleu: {}, n_match: {}, tpfp: {}, tpfn: {}".format(
        gleu_score, n_match, tpfp, tpfn))

    gleu_scorer = GleuScorer(seq_length=seq_length,
                             vocab_size=eos + 1,
                             eos_idx=eos,
                             input_type=ONEHOT_SOFT)

    #feed_hyp = np_label_smoothing(np_onehot(np.array(candidate_batch)), epsilon=1e-5)
    #feed_refs = np_label_smoothing(np_onehot(np.array(reference_batch)), epsilon=1e-5)

    feed_hyp = np_onehot(np.array(candidate_batch))
    feed_refs = np_onehot(np.array(reference_batch))

    #print("---> {}".format(feed_refs))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        feed_dict = {
            gleu_scorer.hypothesis: feed_hyp,
            gleu_scorer.reference: feed_refs
        }

        targets = [
            gleu_scorer.batch_gleu_score, gleu_scorer.sentence_n_match,
            gleu_scorer.tpfn, gleu_scorer.tpfp,
            gleu_scorer.sentence_gleu_score, gleu_scorer.individual_ngrams[0]
        ]
        (batch_gleu, n_match, tpfn, tpfp, gleu,
         ngram) = sess.run(targets, feed_dict=feed_dict)

    print("our gleu: {}, n_match: {}, tpfp: {}, tpfn: {}".format(
        gleu[row], n_match[row], tpfp[row], tpfn[row]))

    print("\n\nBatch gleu's. official: {}. ours: {}".format(
        true_batch_gleu, batch_gleu))

    print("\n\nall gleus....")
    print("true ones: {}".format(true_gleu_scores))
    print("ours: {}".format(gleu))

    print("ngram: {}".format(ngram))
Code example #23
import nltk.translate.gleu_score as gleu


def glue_similarity(hyp, ref):
    # Tokenize both sides; an untokenized string would be scored character by character.
    return gleu.sentence_gleu([hyp.split()], ref.split())
Code example #24
def score_compute(comp_res):

    res_wer = []
    bleu_indi1 = []
    bleu_indi2 = []
    bleu_indi3 = []
    bleu_indi4 = []
    bleu_cum2 = []
    bleu_cum3 = []
    bleu_cum4 = []
    gleu_sent = []
    meteor_score = []
    rouge_score = []

    translated = []
    reference = []
    for i in range(len(comp_res)):
        reference.append([comp_res[i][0].split(' ')])
        translated.append(comp_res[i][1].split(' '))
    bleu_corpus = corpus_bleu(reference, translated)
    #sacrebleu_corpus = sacrebleu.corpus_bleu( translated, reference)
    gleu_corpus = corpus_gleu(reference, translated)

    # evaluator obj for rouge-l metric
    evaluator = Rouge(
        metrics=['rouge-l'],
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=True,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True)

    #for result_pair in compare_results:
    for result_pair in comp_res:
        # ------------ WER
        #res_back = wer( result_pair[0].split(' '), result_pair[1].split(' '))
        res_back = wer(result_pair[0].split(' '), result_pair[1].split(' '))

        res_wer.append(res_back)

        # ----------- BLEU
        indi1_gr = sentence_bleu([result_pair[0].split(' ')],
                                 result_pair[1].split(' '),
                                 weights=(1, 0, 0, 0))  # individual 1-gram
        indi2_gr = sentence_bleu([result_pair[0].split(' ')],
                                 result_pair[1].split(' '),
                                 weights=(0, 1, 0, 0))  # individual 2-gram
        indi3_gr = sentence_bleu([result_pair[0].split(' ')],
                                 result_pair[1].split(' '),
                                 weights=(0, 0, 1, 0))  # individual 3-gram
        indi4_gr = sentence_bleu([result_pair[0].split(' ')],
                                 result_pair[1].split(' '),
                                 weights=(0, 0, 0, 1))  # individual 4-gram

        # cumulative 2-gram, 3-gram, 4-gram bleu
        cum2_gr = sentence_bleu([result_pair[0].split(' ')],
                                result_pair[1].split(' '),
                                weights=(0.5, 0.5, 0, 0))
        cum3_gr = sentence_bleu([result_pair[0].split(' ')],
                                result_pair[1].split(' '),
                                weights=(0.33, 0.33, 0.33, 0))
        cum4_gr = sentence_bleu([result_pair[0].split(' ')],
                                result_pair[1].split(' '),
                                weights=(0.25, 0.25, 0.25, 0.25))

        gleu_s = sentence_gleu([result_pair[0].split(' ')],
                               result_pair[1].split(' '))
        meteor = round(single_meteor_score(result_pair[0], result_pair[1]), 4)
        rouge_all = evaluator.get_scores(result_pair[1], result_pair[0])
        rouge_l_f1 = rouge_all['rouge-l']['f']

        bleu_indi1.append(indi1_gr)
        bleu_indi2.append(indi2_gr)
        bleu_indi3.append(indi3_gr)
        bleu_indi4.append(indi4_gr)
        bleu_cum2.append(cum2_gr)
        bleu_cum3.append(cum3_gr)
        bleu_cum4.append(cum4_gr)
        gleu_sent.append(gleu_s)
        meteor_score.append(meteor)
        rouge_score.append(rouge_l_f1)

    wer_mean = np.mean(res_wer)
    wer_var = np.var(res_wer)
    bleu_indi1_mean = np.mean(bleu_indi1)
    bleu_indi2_mean = np.mean(bleu_indi2)
    bleu_indi3_mean = np.mean(bleu_indi3)
    bleu_indi4_mean = np.mean(bleu_indi4)
    bleu_cum2_mean = np.mean(bleu_cum2)
    bleu_cum3_mean = np.mean(bleu_cum3)
    bleu_cum4_mean = np.mean(bleu_cum4)
    gleu_s_mean = np.mean(gleu_sent)
    meteor_s_mean = np.mean(meteor_score)
    rouge_s_mean = np.mean(rouge_score)

    bleus = (bleu_indi1_mean, bleu_indi2_mean, bleu_indi3_mean,
             bleu_indi4_mean, bleu_cum2_mean, bleu_cum3_mean, bleu_cum4_mean,
             bleu_corpus)
    gleus = (gleu_s_mean, gleu_corpus)

    return wer_mean, wer_var, bleus, gleus, meteor_s_mean, rouge_s_mean
Code example #25
def _gleu(x, y):
    return sentence_gleu([x.split(" ")], y.split(" "))
Code example #26
def gleu_multi(x, y):
    return sentence_gleu([xx.split(" ") for xx in x], y.split(" "))
Code example #27
from nltk.translate.gleu_score import sentence_gleu


def gleu_one(prediction, ground_truth):
    '''
    prediction: ['나', '는', ' ', '바', '보', '다', '.']
    ground_truth: ['나', '는', ' ', '바', '보', '일', '까', '?']
    '''
    return sentence_gleu([ground_truth], prediction) * 100.
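
A tiny usage sketch for gleu_one above: both arguments are already character-level token lists, as in the docstring, and the result is scaled to a 0-100 range.

pred = ['나', '는', ' ', '바', '보', '다', '.']
truth = ['나', '는', ' ', '바', '보', '일', '까', '?']
print(gleu_one(pred, truth))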
Code example #28
    num_sentences = len(hypothesis) - 1

    rf = open(ref_file, "r")
    reference = rf.read().split("\n")

    sf = open(scores_file, "w")

    gleu_score_average = 0.0

    real_num_sentences = 0
    for i in range(0, num_sentences):
        if (len(reference[i].strip()) != 0 or len(hypothesis[i].strip()) != 0):
            print("Ref" + str(i) + ": " + reference[i])
            print("Hyp" + str(i) + ": " + hypothesis[i])
            ref, hypo = reference[i].split(), hypothesis[i].split()
            gleu_score_average = gleu_score_average + gleu.sentence_gleu([ref],
                                                                         hypo)
            real_num_sentences = real_num_sentences + 1

    gleu_score_average = gleu_score_average / real_num_sentences
    print("Sentences: " + str(real_num_sentences) + "; GLEU score average: " +
          str(gleu_score_average))

    scores_str = 'GLEU: ' + str(gleu_score_average)

    sf.write(scores_str)
    sf.close()

    #print 'Average Metric Score for All Review Summary Pairs:'
    #print scores_str
Code example #29
    def predictTestset(self, sess):
        """ Try predicting the sentences from the samples.txt file.
        The sentences are saved on the modelDir under the same name
        Args:
            sess: The current running session
        """
        lines = []
        hypseqs = []
        refs = []
        hyps = []
        average_bleu = 0
        average_gleu = 0
        average_uni_ratio = 0
        average_bi_ratio = 0
        total_token = 0
        uni_dict = {}
        bi_dict = {}
        av_total_uni_ratio = 0.0
        av_total_bi_ratio = 0.0
        flag = self.textData.loadTestData(self.textData.testSamplesDir)
        if not flag:
            # Loading the file to predict
            with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME),
                      'r') as f:
                lines = f.readlines()
        else:
            for sample in self.textData.testingSamples:
                lines.append(self.textData.sequence2str(sample[0], clean=True))
                hypseqs.append(
                    self.textData.sequence2str(sample[1], clean=True))

        modelList = self._getModelList()
        if not modelList:
            print(
                'Warning: No model found in \'{}\'. Please train a model before trying to predict'
                .format(self.modelDir))
            return

        # Predicting for each model present in modelDir
        for modelName in sorted(modelList):  # TODO: Natural sorting
            print('Restoring previous model from {}'.format(modelName))
            self.saver.restore(sess, modelName)
            print('Testing...')

            saveName = modelName[:-len(
                self.MODEL_EXT
            )] + self.TEST_OUT_SUFFIX  # We remove the model extension and add the prediction suffix
            with open(saveName, 'w') as f:
                nbIgnored = 0
                index = 0
                for line in tqdm(lines, desc='Sentences'):
                    if not flag:
                        question = line[:-1]  # Remove the endl character
                    else:
                        question = line

                    answer = self.singlePredict(question)
                    if not answer:
                        nbIgnored += 1
                        continue  # Back to the beginning, try again

                    if not flag:
                        predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format(
                            question,
                            self.textData.sequence2str(answer, clean=True),
                            x=self.SENTENCES_PREFIX)
                    else:
                        predString = '{x[0]}{0}\n{x[1]}{1}\n{y}{2}\n\n'.format(
                            question,
                            self.textData.sequence2str(answer, clean=True),
                            hypseqs[index],
                            x=self.SENTENCES_PREFIX,
                            y='T: ')

                        ref = nltk.word_tokenize(
                            self.textData.sequence2str(answer, clean=True))
                        refs.append([ref])
                        hyp = nltk.word_tokenize(hypseqs[index])
                        hyps.append(hyp)

                        tokens = len(ref)
                        total_token += tokens
                        dic1 = {k: ref.count(k) for k in set(ref)}
                        uni_types = len(dic1)
                        dic2 = {}
                        for i in range(len(ref) - 1):
                            item = ref[i] + ' ' + ref[i + 1]
                            if item in dic2.keys():
                                dic2[item] += 1
                            else:
                                dic2[item] = 1
                        bi_types = len(dic2)
                        if tokens > 0:
                            uni_ratio = float(uni_types) / float(tokens)
                        else:
                            uni_ratio = 0
                        if tokens > 1:
                            bi_ratio = float(bi_types) / float(tokens - 1)
                        else:
                            bi_ratio = 0
                        for it1 in dic1.keys():
                            if it1 in uni_dict.keys():
                                uni_dict[it1] += 1
                            else:
                                uni_dict[it1] = 1
                        for it2 in dic2.keys():
                            if it2 in bi_dict.keys():
                                bi_dict[it2] += 1
                            else:
                                bi_dict[it2] = 1

                        # bleu = bleu_score.sentence_bleu([ref], hyp, smoothing_function=bleu_score.SmoothingFunction().method2, weights=[0.3, 0.3, 0.2, 0.2])
                        bleu = bleu_score.sentence_bleu(
                            [ref],
                            hyp,
                            smoothing_function=bleu_score.SmoothingFunction(
                            ).method2)
                        try:
                            gleu = gleu_score.sentence_gleu([ref], hyp)  # wrap ref: sentence_gleu expects a list of references
                        except (ZeroDivisionError):
                            print(
                                "Error: Division by zero, need smoothing function."
                            )
                            gleu = 0.0
                        predString = predString + (
                            "Sentence BLEU %.4f, Sentence Google-BLEU %.4f.\n"
                            "Unigram diversity %.4f, Bigram diversity %.4f.\n\n"
                            % (bleu, gleu, uni_ratio, bi_ratio))

                        average_bleu += bleu
                        average_gleu += gleu
                        average_bi_ratio += bi_ratio
                        average_uni_ratio += uni_ratio

                    if self.args.verbose:
                        tqdm.write(predString)
                    f.write(predString)
                    index += 1
                if flag:
                    average_bleu /= (len(lines) - nbIgnored)
                    average_gleu /= (len(lines) - nbIgnored)
                    average_uni_ratio /= (len(lines) - nbIgnored)
                    average_bi_ratio /= (len(lines) - nbIgnored)

                    av_total_uni_ratio = float(
                        len(uni_dict)) / float(total_token)
                    av_total_bi_ratio = float(
                        len(bi_dict)) / float(total_token - len(lines) +
                                              nbIgnored)
                    # corpus_bleu = bleu_score.corpus_bleu(refs, hyps,
                    #                                      smoothing_function=bleu_score.SmoothingFunction().method2,
                    #                                      weights=[0.3, 0.3, 0.2, 0.2])
                    corpus_bleu = bleu_score.corpus_bleu(
                        refs,
                        hyps,
                        smoothing_function=bleu_score.SmoothingFunction(
                        ).method2)
                    f.write(
                        "Average BLEU %.4f, Average Google-BLEU %.4f, Corpus BLEU %.4f.\n"
                        "Average Unigram diversity %.4f, Average Bigram diversity %.4f.\n"
                        "Average T-Unigram diversity %.4f, Average T-Bigram diversity %.4f."
                        % (average_bleu, average_gleu, corpus_bleu,
                           average_uni_ratio, average_bi_ratio,
                           av_total_uni_ratio, av_total_bi_ratio))
                print(
                    'Prediction finished, {}/{} sentences ignored (too long)'.
                    format(nbIgnored, len(lines)))
Code example #30
from nltk.translate import gleu_score


def compute_gleu(target, translation):
    # Merge BPE subword pieces ("@@ ") back into whole words before word-level GLEU.
    gleu = gleu_score.sentence_gleu([target.replace("@@ ", "").split(" ")],
                                    translation.replace("@@ ", "").split(" "))
    return gleu
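
A small illustration of what compute_gleu above does, with made-up BPE output: subword pieces marked with "@@ " are merged back into whole words before word-level GLEU is computed.

bpe_target = "the cat s@@ at on the mat"
bpe_translation = "the cat s@@ at on a mat"
print(compute_gleu(bpe_target, bpe_translation))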