Example #1
def bleu_corpus(hypothesis, references):
    from nltk.translate.bleu_score import corpus_bleu
    from nltk.translate import bleu_score as nltkbleu  # needed for SmoothingFunction below
    # Tokenize: one token list per hypothesis, one list of token lists per reference.
    hypothesis = [hyp.split() for hyp in hypothesis]
    references = [[ref.split()] for ref in references]
    # hypothesis = [normalize_answer(hyp).split(" ") for hyp in hypothesis]
    # references = [[normalize_answer(ref).split(" ")] for ref in references]
    b1 = corpus_bleu(
        references,
        hypothesis,
        weights=(1.0 / 1.0, ),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1)
    b2 = corpus_bleu(
        references,
        hypothesis,
        weights=(1.0 / 2.0, 1.0 / 2.0),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1)
    b3 = corpus_bleu(
        references,
        hypothesis,
        weights=(1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1)
    b4 = corpus_bleu(
        references,
        hypothesis,
        weights=(1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0, 1.0 / 4.0),
        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1)
    return (b1, b2, b3, b4)
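A minimal usage sketch for bleu_corpus, with made-up hypothesis and reference strings (assumes NLTK is installed):

hyps = ["the cat sat on the mat", "a dog barked loudly"]
refs = ["the cat is on the mat", "the dog barked loudly"]
b1, b2, b3, b4 = bleu_corpus(hyps, refs)
print("BLEU-1..4:", round(b1, 3), round(b2, 3), round(b3, 3), round(b4, 3))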
Example #2
def _bleu(guess, answers, method=None):
    """Compute approximate BLEU score between guess and a set of answers."""
    if nltkbleu is None:
        # bleu library not installed, just return a default value
        return None
    # Warning: BLEU calculation *should* include proper tokenization and
    # punctuation etc. We're using the normalize_answer for everything though,
    # so we're over-estimating our BLEU scores.  Also note that NLTK's bleu is
    # going to be slower than fairseq's (which is written in C), but fairseq's
    # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
    # works with strings, which is better suited for this module.
    if method == "method0":
        smoothing_func = nltkbleu.SmoothingFunction(epsilon=1e-12).method0
    elif method == "method1":
        smoothing_func = nltkbleu.SmoothingFunction(epsilon=1e-12).method1
    elif method == "method2":
        smoothing_func = nltkbleu.SmoothingFunction(epsilon=1e-12).method2
    elif method == "method3":
        smoothing_func = nltkbleu.SmoothingFunction(epsilon=1e-12).method3
    elif method == "method4":
        smoothing_func = nltkbleu.SmoothingFunction(epsilon=1e-12).method4
    elif method == "method5":
        smoothing_func = nltkbleu.SmoothingFunction(epsilon=1e-12).method5
    elif method == "method6":
        smoothing_func = nltkbleu.SmoothingFunction(epsilon=1e-12).method6
    elif method == "method7":
        smoothing_func = nltkbleu.SmoothingFunction(epsilon=1e-12).method7
    else:
        smoothing_func = nltkbleu.SmoothingFunction(epsilon=1e-12).method3

    return nltkbleu.sentence_bleu(
        [normalize_answer(a).split(" ") for a in answers],
        normalize_answer(guess).split(" "),
        smoothing_function=smoothing_func,
    )
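The if/elif ladder above maps a method name onto the matching SmoothingFunction attribute; an equivalent, more compact selection via getattr is sketched below (the helper name _pick_smoothing is made up here, not part of the original module):

from nltk.translate import bleu_score as nltkbleu

def _pick_smoothing(method=None):
    # Unknown or missing names fall back to method3, mirroring the else branch above.
    valid = {"method%d" % i for i in range(8)}
    smoother = nltkbleu.SmoothingFunction(epsilon=1e-12)
    return getattr(smoother, method if method in valid else "method3")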
Example #3
    def __call__(self, trainer):
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []

            use_data = random.sample(self.test_data, self.batch)

            if self.device >= 0:
                sources = [cuda.cupy.asarray(x[1]) for x, _ in use_data]
                targets = [cuda.cupy.asarray(y[1]) for _, y in use_data]
                #sourcePersona = [cuda.cupy.asarray(x[0]) for x, _ in use_data]  # unused for now, may be needed later
                targetPersona = [cuda.cupy.asarray(y[0]) for _, y in use_data]
            else:
                sources = [x[1] for x, _ in use_data]
                targets = [y[1] for _, y in use_data]
                sourcePersona = [x[0] for x, _ in use_data]
                targetPersona = [y[0] for _, y in use_data]
            
            sources = F.pad_sequence(sources, loadData.LoadData.maxlen, -1)
            targets = F.pad_sequence(targets, loadData.LoadData.maxlen, -1)

            references.extend([[t.tolist()] for t in targets.data])
            ys = [y.tolist()
                for y in self.model.predict(sources.data, targetPersona)]#batch_size
            hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses, smoothing_function=bleu_score.SmoothingFunction().method1)
        chainer.report({self.key:bleu})
Example #4
def distribScoreBleu(corpusDict, dicoTrad):
    """
        Méthode qui calcule la distribution des scores BLEU
        pour chaque type de phrase, direct et indirect

        Arg : dicoScore = dictionnaires des scores pour chaque type
        key : type de phrase, value : score BLEU
        dicoPhrases = Dictionnaire des phrases direct et indirect

        Return : DicoScore

    """
    chencherry = bleu.SmoothingFunction()
    dicoScore = defaultdict(list)
    dicoPhrases = dicoDirIndir(corpusDict)

    for directTrad in dicoTrad:
        dicoBleu = createDicoRefHyp(corpusDict, directTrad)

        for ref in dicoBleu.keys():
            if dicoBleu[ref][0] in dicoPhrases['direct_hyp']:
                bleuScore = bleu.sentence_bleu(
                    dicoBleu[ref],
                    ref.split(),
                    smoothing_function=chencherry.method3)
                dicoScore['direct_hyp'] += [bleuScore]
            elif dicoBleu[ref][0] in dicoPhrases['indirect_hyp']:
                bleuScore = bleu.sentence_bleu(
                    dicoBleu[ref],
                    ref.split(),
                    smoothing_function=chencherry.method3)
                dicoScore['indirect_hyp'] += [bleuScore]

    return dicoScore
Example #5
def scoreBleu(corpusDict, dicoTrad):
    """
        Méthode qui calcule le score bleu
        de chaque direction de traduction et qui le stock
        dans le dictionnaire de traduction approprié.

        Arg : dicoBleu = Dictionnaire qui change à chaque 
        nouvelle direction de traduction et qui est créée par createDicoRefHyp
        totalBleu = le nombre total de scoreBleu par direction de traduction

        Return : Le dictionnaire de traduction avec key : direction de traduction
        value : le score BLEU
    
    """

    dicoBleu = {}
    chencherry = bleu.SmoothingFunction()

    for directTrad in dicoTrad:
        totalBleu = 0.0
        dicoBleu = createDicoRefHyp(corpusDict, directTrad)

        for ref in dicoBleu.keys():
            totalBleu += bleu.sentence_bleu(
                dicoBleu[ref],
                ref.split(),
                smoothing_function=chencherry.method3)

        dicoTrad[directTrad] = totalBleu / len(dicoBleu.keys())

    return (dicoTrad)
Example #6
def get_bleu(pred, trg, lengths, idx2word, BLEU=[0.25, 0.25, 0.25, 0.25]):

    batch = pred.shape[0]

    cc = bleu.SmoothingFunction()

    score = 0
    b = 0
    for p, t in zip(pred, trg):
        p = p[1:lengths[b]]
        p = [idx2word[index].lower() for index in p]
        t = t[t != 0]
        t = t[1:-1]
        t = [idx2word[index].lower() for index in t]
        b += 1

        #print("pred :" , p)
        #print("target: ", t)

        score += bleu.sentence_bleu([t],
                                    p,
                                    BLEU,
                                    smoothing_function=cc.method1)

    return score
Example #7
    def get_ith_item(self, index, return_string=True):
        output = {
            "source":
            self._get_source_sentence(index, return_string=return_string),
            "reference":
            self._get_reference_sentence(index, return_string=return_string),
            "sampled":
            self._get_sampled_sentence(index, return_string=return_string),
            "attention":
            self._last_batch["attention"][index]
        }

        reference = output["reference"]
        hypothesis = output["sampled"]

        if not return_string:
            reference = " ".join(reference)
            hypothesis = " ".join(hypothesis)

        chencherry = bleu_score.SmoothingFunction()
        output["bleu-4"] = bleu_score.sentence_bleu(
            references=[reference],
            hypothesis=hypothesis,
            smoothing_function=chencherry.method1)
        return output
Example #8
def calculate_bleu_score(caption_model, dataset_encoddings, lookup_table,
                         word_to_idx, idx_to_word, max_length):
    example_count = 0
    bleu_sum = 0
    for image_key in tqdm(list(dataset_encoddings.keys()),
                          position=0,
                          leave=True):
        image = dataset_encoddings[image_key].reshape(1, IMAGE_OUTPUT_DIM)
        generated_caption = generate_caption(image, word_to_idx, idx_to_word,
                                             max_length, caption_model)

        candidate = generated_caption.split()
        references = list(
            map(lambda caption: caption.split(), lookup_table[image_key]))
        bleu_score = bleu.sentence_bleu(
            references,
            candidate,
            smoothing_function=bleu.SmoothingFunction().method1)
        bleu_sum += bleu_score

        if example_count < 5:
            log.warning(
                f'Image: {image_key}\nreal caption: {lookup_table[image_key][0]}\ngenerated caption: {generated_caption}\nBLEU: {bleu_score}'
            )
            log.warning(
                "********************************************************************"
            )
            example_count += 1

    return bleu_sum / len(dataset_encoddings)
Example #9
    def __call__(self, trainer):
        print('## Calculate BLEU')
        with chainer.no_backprop_mode():
            with chainer.using_config('train', False):
                references = []
                hypotheses = []
                for i in range(0, len(self.test_data), self.batch):
                    sources, targets = zip(*self.test_data[i:i + self.batch])
                    references.extend([[t.tolist()] for t in targets])

                    sources = [
                        chainer.dataset.to_device(self.device, x)
                        for x in sources
                    ]
                    ys = [
                        y.tolist() for y in self.model.translate(
                            sources, self.max_length, beam=False)
                    ]
                    # greedy generation for efficiency
                    hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references,
            hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1) * 100
        print('BLEU:', bleu)
        reporter.report({self.key: bleu})
Example #10
def compute_scores(pred_fname, ref_fname):

    # read files
    hyps = read_corpus(pred_fname, ref=False)
    refs = read_corpus(ref_fname, ref=True)

    # NIST score
    nist = ns.corpus_nist(refs, hyps, n=4)

    # BLEU score
    chencherry = bs.SmoothingFunction()
    bleu = bs.corpus_bleu(refs, hyps, smoothing_function=chencherry.method2)

    # ED
    total_len = 0.0
    edi = 0.0
    for r, h in zip(refs, hyps):
        total_len += max(len(r[0]), len(h))
        edi += edit_distance(
            r[0],
            h)  # TODO: strange -- inputs should be strings, not charlists!

    bleu_score = 100.0 * round(bleu, 4)
    edist_score = 100.0 * round(1 - edi / total_len, 4)
    nist_score = round(nist, 2)

    return Scores(bleu_score, edist_score, nist_score)
Example #11
    def __call__(self, trainer):
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources0, sources1, targets = zip(*self.test_data[i:i +
                                                                  self.batch])
                references.extend([[t.tolist()] for t in targets])

                sources0 = [
                    chainer.dataset.to_device(self.device, x) for x in sources0
                ]
                sources1 = [
                    chainer.dataset.to_device(self.device, x) for x in sources1
                ]
                ys = [
                    y.tolist() for y in self.model.translate(
                        sources0, sources1, self.max_length)
                ]
                hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references,
            hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        chainer.report({self.key: bleu})
Example #12
    def compute(guess: str,
                answers: List[str],
                k: int = 4) -> Optional[BleuMetric]:
        """
        Compute approximate BLEU score between guess and a set of answers.
        """
        try:
            from nltk.translate import bleu_score as nltkbleu
        except ImportError:
            # User doesn't have nltk installed, so we can't use it for bleu
            # We'll just turn off things, but we might want to warn the user
            return None

        # Warning: BLEU calculation *should* include proper tokenization and
        # punctuation etc. We're using the normalize_answer for everything though,
        # so we're over-estimating our BLEU scores.  Also note that NLTK's bleu is
        # going to be slower than fairseq's (which is written in C), but fairseq's
        # requires that everything be in arrays of ints (i.e. as tensors). NLTK's
        # works with strings, which is better suited for this module.
        weights = [1 / k for _ in range(k)]
        score = nltkbleu.sentence_bleu(
            [normalize_answer(a).split(" ") for a in answers],
            normalize_answer(guess).split(" "),
            smoothing_function=nltkbleu.SmoothingFunction(
                epsilon=1e-12).method1,
            weights=weights,
        )
        return BleuMetric(score)
Example #13
def calc_bleu_many(cand_seq, ref_sequences):
    sf = bleu_score.SmoothingFunction()

    return bleu_score.sentence_bleu(ref_sequences,
                                    cand_seq,
                                    smoothing_function=sf.method1,
                                    weights=(0.5, 0.5))
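For reference, calc_bleu_many expects pre-tokenized sequences; a toy call with made-up data, assuming the snippet's module imports nltk.translate.bleu_score as bleu_score:

cand = "the cat sat on the mat".split()
refs = ["the cat is on the mat".split(), "there is a cat on the mat".split()]
print(calc_bleu_many(cand, refs))  # BLEU-2 (weights 0.5/0.5) with method1 smoothing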
Example #14
def test_review_bleu(gts_data, generate_data, vocab, bleu_totals, length):
    type_wights = [[1., 0, 0, 0], [.5, .5, 0, 0], [1 / 3, 1 / 3, 1 / 3, 0],
                   [.25, .25, .25, .25]]

    sf = bleu_score.SmoothingFunction()

    # batch first
    gts_idx = torch.transpose(gts_data, 0, 1)
    _, generate_idx = generate_data.max(2)
    generate_idx = torch.transpose(generate_idx, 0, 1)

    gts_sentence = []
    gene_sentence = []
    # detokenize the sentence
    for token_ids in gts_idx:
        current = [vocab.itos[id] for id in token_ids.detach().cpu().numpy()]
        gts_sentence.append(current)
    for token_ids in generate_idx:
        current = [vocab.itos[id] for id in token_ids.detach().cpu().numpy()]
        gene_sentence.append(current)
    # compute bleu score
    assert len(gts_sentence) == len(gene_sentence)
    for i in range(len(gts_sentence)):
        length += 1
        for j in range(4):
            refs = gts_sentence[i]
            sample = gene_sentence[i]
            weights = type_wights[j]
            bleu_totals[j] += bleu_score.sentence_bleu(
                refs, sample, smoothing_function=sf.method1, weights=weights)

    return bleu_totals, length
Example #15
def main():
    arguments = sys.argv[1:]
    num_args = len(arguments)
    if num_args != 2:
        print('Wrong number of arguments.')
        print(str(sys.argv[0]), 'system-dir', 'reference-dir')
        exit()
    system_path = arguments[0]
    ref_path = arguments[1]

    # For all files in system path.
    for filename in os.listdir(system_path):
        print('Filename', str(filename))
        system_filename = os.path.join(system_path, filename)
        ref_filename = os.path.join(ref_path, filename)

        # read files
        ref = read_corpus(ref_filename, ref=True)
        hyp = read_corpus(system_filename, ref=False)

        # NIST score
        nist = ns.corpus_nist(ref, hyp, n=4)

        # BLEU score
        chencherry = bs.SmoothingFunction()
        bleu = bs.corpus_bleu(ref, hyp, smoothing_function=chencherry.method2)
        print('BLEU', str(round(bleu, 3)))
        total_len = 0.0
        edi = 0.0
        for r, h in zip(ref, hyp):
            total_len += max(len(r[0]), len(h))
            edi += edit_distance(r[0], h)
        print('DIST', str(round(1 - edi / total_len, 3)))
        print('NIST', str(round(nist, 6)))
        print()
Example #16
def select_question(asked: list, questions: dict, lang: str) -> dict:
    nested_punct = re.compile('([a-zA-Z]+)['+string.punctuation+']+([a-zA-Z]+)')
    replace = lambda match:match.group(1) + ' ' + match.group(2)
    scores = {}
    # Check here if lang is not english, then run morpheme function on asked if so.
    for i, question in questions.items():
        english, lang_b = question
        eng_q, eng_ans = list(english.items())[0]
        lang_b_q, lang_b_ans = list(lang_b.items())[0]
        
        if lang == 'english':
            hypothesis = [word.lower() for word in word_tokenize(eng_q, 'english') if word not in string.punctuation]
        else:
            # Here if possible, run lang_b_q.split() through a function that returns a list of morphemes.
            hypothesis = [word.lower() for word in lang_b_q.split() if word not in string.punctuation]

        fin_hypothesis = []
        for word in hypothesis:
            fin_hypothesis += re.sub(nested_punct, replace, word).split()

        score = bleu_score.sentence_bleu([fin_hypothesis], asked, smoothing_function=bleu_score.SmoothingFunction().method1)
        scores[i] = [score, lang_b_q, lang_b_ans, eng_q, eng_ans]

    sort_scores = sorted(scores.items(), key=lambda score:score[1][0], reverse=True)
    return sort_scores[:2]
Example #17
        def CalculateBleu(trainer=trainer,
                          model=model,
                          test_data=test_data,
                          key='validation/main/bleu',
                          batch=100,
                          max_length=100,
                          device=args.gpu):
            trigger = 1, 'epoch'
            priority = chainer.training.PRIORITY_WRITER
            with chainer.no_backprop_mode():
                references = []
                hypotheses = []
                for i in range(0, len(test_data), batch):
                    sources, targets = zip(*test_data[i:i + batch])
                    references.extend([[t.tolist()] for t in targets])

                    sources = [
                        chainer.dataset.to_device(device, x) for x in sources
                    ]
                    ys = [
                        y.tolist()
                        for y in model.translate(sources, max_length)
                    ]
                    hypotheses.extend(ys)

            bleu = bleu_score.corpus_bleu(
                references,
                hypotheses,
                smoothing_function=bleu_score.SmoothingFunction().method1)
            chainer.report({key: bleu})
            print(bleu)
Example #18
    def CalculateBleu(model=model,
                      test_data=test_data,
                      batch=100,
                      max_length=100,
                      device=args.gpu):
        trigger = 1, 'epoch'
        priority = chainer.training.PRIORITY_WRITER
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(test_data), batch):
                sources, targets = zip(*test_data[i:i + batch])
                references.extend([[t.tolist()] for t in targets])

                sources = [
                    chainer.dataset.to_device(device, x) for x in sources
                ]
                ys = [y.tolist() for y in model.translate(sources, max_length)]
                hypotheses.extend(ys)
            print(len(references))
            bleu = bleu_score.corpus_bleu(
                references,
                hypotheses,
                smoothing_function=bleu_score.SmoothingFunction().method1)
            print(bleu)  #
            with open(path_w, mode='a') as f:  #
                f.write('\n bleu : %f' % bleu)  #
Example #19
def compute_blue(searcher):
    """
    This function computes the bleu score.
    :return: the blue score
    """
    smoothing_fn = bleu_score.SmoothingFunction()
    score = 0
    for idx, test_data in enumerate(test_dataloader):
        question = torch.transpose(test_data[0], 0, 1).to(device)
        answer = torch.transpose(test_data[1], 0, 1)
        answer = answer[1:]
        answer = answer.reshape(1, -1).tolist()[0]
        output = searcher(question)
        output = output[1:]
        prediction = output.topk(1).indices.reshape(1, -1).tolist()[0]
        prediction = convert_to_string(prediction)
        answer = convert_to_string(answer)
        answer = [answer.split()]
        prediction = prediction.split()
        score += bleu_score.sentence_bleu(
            answer,
            prediction,
            smoothing_function=smoothing_fn.method7,
            weights=(0.5, 0.5))
        if idx % 2000 == 0:
            print('Sentences processed: [{}/{}]'.format(
                idx + 1, len(test_dataloader)))
    return (score / len(test_dataloader)) * 100
Example #20
def calc_bleu_score_for_seqs(hypo_seq, ref_seqs):
    sf = bleu_score.SmoothingFunction()
    return bleu_score.sentence_bleu(
        references=ref_seqs,
        hypothesis=hypo_seq,
        smoothing_function=sf.method1,
        weights=(0.5, 0.5),
    )
Example #21
def text_bleu(gold_txts, pred_txts):
    all_ref = [[txt.lower().split()] for txt in gold_txts]
    all_hyp = [txt.lower().split() for txt in pred_txts]
    chencherry = bs.SmoothingFunction()
    bleu = bs.corpus_bleu(all_ref,
                          all_hyp,
                          smoothing_function=chencherry.method2)
    return bleu
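A toy call to text_bleu, assuming the snippet's module imports nltk.translate.bleu_score as bs (sentences made up here):

gold = ["The cat sat on the mat .", "Dogs bark loudly ."]
pred = ["the cat sat on a mat .", "dogs bark loudly ."]
print(text_bleu(gold, pred))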
Example #22
def belu_score(predicted_seq, reference_sequences):
    smoothing_fn = bleu_score.SmoothingFunction()
    reference_sequences = np.expand_dims(reference_sequences, axis=0)
    return bleu_score.sentence_bleu(
        reference_sequences,
        predicted_seq,
        smoothing_function=smoothing_fn.method1,
        weights=(0.5, 0.5))
Example #23
def calculate_dataset_bleu(img_encoddings, word_to_idx, idx_to_word,
                           max_length, caption_model, lookup_table):
    sat_captions = load_show_attend_and_tell_captions()
    sat_bleu_average = 0
    bleu_average = 0
    count = 0

    for image_key in tqdm(img_encoddings.keys()):
        image_path = os.path.join(FLICKR_FOLDER, FLICKR_IMAGES_FOLDER,
                                  image_key + '.jpg')
        if os.path.exists(image_path):
            count += 1
            image = img_encoddings[image_key].reshape(1, IMAGE_OUTPUT_DIM)
            generated_caption = generate_caption(image, word_to_idx,
                                                 idx_to_word, max_length,
                                                 caption_model)
            sat_caption = sat_captions[image_key]

            references = list(
                map(lambda caption: caption.split(), lookup_table[image_key]))
            generated_caption_bleu = bleu.sentence_bleu(
                references,
                generated_caption.split(),
                smoothing_function=bleu.SmoothingFunction().method3)
            bleu_average += generated_caption_bleu
            sat_caption_bleu = bleu.sentence_bleu(
                references,
                sat_caption.split(),
                smoothing_function=bleu.SmoothingFunction().method3)
            sat_bleu_average += sat_caption_bleu

            if count <= 2:
                print("Generated caption: ", generated_caption)
                print("SAT caption: ", sat_caption)
                print("Original caption: ", lookup_table[image_key][0])
                print("SAT caption BLEU score: ", sat_caption_bleu)
                print("Generated caption BLEU score: ", generated_caption_bleu)

                x = plt.imread(image_path)
                plt.imshow(x)
                plt.show()
                print("_____________________________________")

    print('Model validation set BLEU: ', bleu_average / count)
    print('SAT validation set BLEU: ', sat_bleu_average / count)
Example #24
def _bleu_n(n, ref_words_list, hyp_words):
    """Computes BLEU score up to n-gram."""
    # Average weights across different n-grams.
    weights = [1.0 / n] * n
    return bleu_score.sentence_bleu(
        ref_words_list,
        hyp_words,
        weights=weights,
        smoothing_function=bleu_score.SmoothingFunction().method1)
Example #25
    def computeBLUEScore(self):
        blue_references = self.packTexts(self.breakTexts(self._references))
        blue_candidates = self._candidates  # self.breakTexts(self._candidates)
        #print(blue_references)
        #print(blue_candidates)
        self._blue_score = bleu_score.corpus_bleu(
            blue_references,
            blue_candidates,
            smoothing_function=bleu_score.SmoothingFunction().method1)
Example #26
def get_bleu(targets, predictions):
    bleu_scores = 0
    smooth = bleu_score.SmoothingFunction()
    for i in range(len(predictions)):
        score = bleu_score.sentence_bleu(targets[i],
                                         predictions[i],
                                         smoothing_function=smooth.method7)
        bleu_scores += score

    return float(bleu_scores / len(targets))
Example #27
def bleu(ref_list,
         candidateText,
         nGram=4,
         nGramType='cumulative',
         shouldSmooth=True):
    '''calculates BLEU score 
    
        _parameters_
        ref_list: expects a list of reference texts to compare (as strings)
        candidateText: the new text needing to be scored
        nGram: choose between 1-4.  Determines which ngram(s) to use in the scoring
        nGramType: 'cumulative' uses a simple average of all ngrams from 1 to nGram
        shouldSmooth: if False, calculates the BLEU score without smoothing. Recommended to use smoothing (set to True)
        
        _returns_
        score: BLEU score using nGram settings input, smoothed by default (can be turned off)
    '''

    # basic checks
    if nGram not in [1, 2, 3, 4]:
        raise ValueError('nGram must be between 1 and 4')

    if nGramType not in ['cumulative', 'exclusive']:
        raise ValueError(
            'nGramType must either be cumulative (average of nGrams less than n) or exclusive (1=unigram, etc.)'
        )

    # pre-score
    weight_dict = {
        ('cumulative', 1): (1, 0, 0, 0),
        ('cumulative', 2): (.5, .5, 0, 0),
        ('cumulative', 3): (.3333, .3333, .3333, 0),
        ('cumulative', 4): (.25, .25, .25, .25),
        ('exclusive', 1): (1, 0, 0, 0),
        ('exclusive', 2): (0, 1, 0, 0),
        ('exclusive', 3): (0, 0, 1, 0),
        ('exclusive', 4): (0, 0, 0, 1)
    }
    candidate = [removePunc(str(removeMarkupWords(candidateText))).split()]
    references = [[
        removePunc(str(removeMarkupWords(ref))).split() for ref in ref_list
    ]]
    weights = weight_dict[(nGramType, nGram)]

    # scoring
    if shouldSmooth == True:
        smoother = bleu_score.SmoothingFunction().method7
    else:
        smoother = None
    score = bleu_score.corpus_bleu(references,
                                   candidate,
                                   weights,
                                   smoothing_function=smoother)
    #print(score)
    return score
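The weight_dict above encodes the usual cumulative versus exclusive n-gram weighting; a self-contained sketch of the same idea directly against nltk, with made-up tokens and not tied to removePunc or removeMarkupWords:

from nltk.translate import bleu_score

candidate = [["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]]
references = [[["the", "fast", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]]]
smoother = bleu_score.SmoothingFunction().method7

cumulative_2 = bleu_score.corpus_bleu(references, candidate,
                                      weights=(.5, .5, 0, 0),
                                      smoothing_function=smoother)
exclusive_2 = bleu_score.corpus_bleu(references, candidate,
                                     weights=(0, 1, 0, 0),
                                     smoothing_function=smoother)
print(cumulative_2, exclusive_2)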
Example #28
def evaluate_bleu(dataloader, generator, eval_num=32, mode='sentence'):

    ref_list = []
    gen_list = []
    dataloader.test_init()

    if mode == 'corpus':
        while not dataloader.test_finished():
            batch = dataloader.test_next_batch()
            ref_list.extend(batch.astype(np.str).tolist())
            gen = generator.generate()
            gen_list.extend(gen.astype(np.str).tolist())

        return bleu.corpus_bleu(
            ref_list,
            gen_list,
            smoothing_function=bleu.SmoothingFunction().method4)
    else:
        while not dataloader.test_finished():
            batch = dataloader.test_next_batch()
            ref_list.extend(batch.astype(np.str).tolist())

        for i in range(eval_num // batch_size):
            gen = generator.generate()
            gen_list.extend(gen.astype(np.str).tolist())

        score8, score12 = 0., 0.
        for i, gen in enumerate(gen_list):
            x8 = bleu.sentence_bleu(
                ref_list,
                gen,
                weights=(1.0 / 8, ) * 8,
                smoothing_function=bleu.SmoothingFunction().method4)
            x12 = bleu.sentence_bleu(
                ref_list,
                gen,
                weights=(1.0 / 12, ) * 12,
                smoothing_function=bleu.SmoothingFunction().method4)
            score8 += x8
            score12 += x12

        return score8 / len(gen_list), score12 / len(gen_list)
Example #29
def test(modelPath):
    settings.sysAsserts()
    dataset = COCODataset('coco/images/val2017', 'coco/annotations/captions_val2017.json', True)
    with open(settings.vocabfilepath, 'rb') as dsf:
        pcklr = pickle.Unpickler(dsf)
        obj = pcklr.load()
        vocab = obj.get('vocab')
        maxCaptionLen = obj.get('maxCaptionLen')
        wordToInd = obj.get('wordToInd')

    cnn, rnn, outputNet = archs.CNN.ResNet34, archs.RNN.LSTM_2l, archs.OutputNet.FC1l
    model = archs.Architecture(cnn, rnn, outputNet, len(vocab),
                               maxCaptionLen + MainHyperparams().maxCaptionLenDelta, modelPath)

    loss = .0
    references = []
    hypotheses = []
    chencherry = bleu.SmoothingFunction()

    model.eval()
    with torch.no_grad():
        for i in range(len(dataset.coco)):
            image, captions = dataset.coco[i]
            image = image.unsqueeze(0).to(settings.device)
            captions = [c.lower() for c in captions]
            outputSeqs, outputLogProbs = model(image)
            outputSeq = [utils.indicesToSentence(vocab, outputSeqs[0])]

            loss += criterion(outputLogProbs, outputSeq, wordToInd)
            references.append(captions)
            hypotheses.append(outputSeq)

            if i % 100 == 0:
                print(f'Evaluated {i + 1}/{len(dataset.coco)}')
                print(f'-----{(dataset.coco.coco.imgs[dataset.coco.ids[i]])["file_name"]}')
                print(f'Truth: {[c.capitalize() for c in captions]}')
                print(f'Prediction: {outputSeq[0].capitalize()}')
                print("\n")

    loss /= len(dataset.coco)
    bleu2 = bleu.corpus_bleu(references, hypotheses, weights=(0.5, 0.5),
                             smoothing_function=chencherry.method1)
    bleu3 = bleu.corpus_bleu(references, hypotheses, weights=(1. / 3., 1. / 3., 1. / 3.),
                             smoothing_function=chencherry.method1)
    bleu4 = bleu.corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25),
                             smoothing_function=chencherry.method1)

    print('--------------------------------------------------------------')
    print(f'Testing dataset consists of {len(dataset.coco)} samples')
    print(f'Negative log-likelihood loss = {loss.item()}')
    print(f'Perplexity = {2 ** loss.item()}')
    print(f'BLEU-2 score = {bleu2}')
    print(f'BLEU-3 score = {bleu3}')
    print(f'BLEU-4 score = {bleu4}')
Example #30
    def __init__(self, N_gram=4, precision=2):
        from nltk.translate import bleu_score
        self.bleu_score = bleu_score
        self.N_gram = N_gram
        self._precision = precision
        self._weights = [
            tuple([round(1 / i, 4)] * i + [0.] * (N_gram - i))
            for i in range(1, N_gram + 1)
        ]
        self.smoothing_fn = bleu_score.SmoothingFunction().method3
        self.reset()
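For N_gram=4, the _weights comprehension above resolves to the standard cumulative weight tuples; a quick standalone check of that arithmetic:

N_gram = 4
weights = [tuple([round(1 / i, 4)] * i + [0.] * (N_gram - i))
           for i in range(1, N_gram + 1)]
print(weights)
# [(1.0, 0.0, 0.0, 0.0), (0.5, 0.5, 0.0, 0.0),
#  (0.3333, 0.3333, 0.3333, 0.0), (0.25, 0.25, 0.25, 0.25)]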