Example No. 1
    def test_corpus_bleu(self):
        ref_file = find('models/wmt15_eval/ref.ru')
        hyp_file = find('models/wmt15_eval/google.ru')
        mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the n-gram order (BLEU-1, BLEU-2, ...).
        with open(mteval_output_file, 'r') as mteval_fin:
            # The numbers are on the second-to-last line of the file; the first and last
            # items on that line are not per-n-gram scores, so they are dropped.
            # list() so the scores can be iterated more than once (map() is a one-shot iterator).
            mteval_bleu_scores = list(map(float, mteval_fin.readlines()[-2].split()[1:-1]))

        with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
            with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
                # Whitespace-tokenize the files.
                # Note: str.split() with no arguments also strips surrounding whitespace.
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that corpus_bleu expects a list of lists of references.
                references = list(map(lambda x: [x.split()], ref_fin))
                # Without smoothing.
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
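                    # Weights (1.0/i,)*i are uniform over 1..i-grams, i.e. cumulative BLEU-i.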
                    nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i)
                    # Check that the BLEU score difference is less than 0.005.
                    # Note: this is an approximate comparison; although a difference of
                    #       +/- 0.01 BLEU can be "statistically significant", the actual
                    #       translation quality might not differ.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005

                # With the same smoothing method used in mteval-v13a.pl
                chencherry = SmoothingFunction()
                for i, mteval_bleu in zip(range(1,10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(references, hypothesis,
                                            weights=(1.0/i,)*i,
                                            smoothing_function=chencherry.method3)
                    assert abs(mteval_bleu - nltk_bleu) < 0.005
Example No. 2
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    args = setup_args()
    logging.info(args)

    f = codecs.open('report-%s.csv'% args.model, 'w')
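    # Note: the standard-library csv.writer has no 'encoding' argument; this call assumes
    # a drop-in replacement such as unicodecsv has been imported as csv.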
    csv_f = csv.writer(f, delimiter=',', encoding='utf-8')

    src_lines = codecs.open(args.src, 'r', 'utf-8').readlines()
    src_lines_nounk = codecs.open(args.src + '.nounk', 'r', 'utf-8').readlines()

    target_lines = codecs.open(args.target, 'r', 'utf-8').readlines()
    target_lines_nounk = codecs.open(args.target + '.nounk', 'r', 'utf-8').readlines()

    gold_lines = codecs.open(args.gold, 'r', 'utf-8').readlines()
    gold_lines_nounk = codecs.open(args.gold + '.nounk', 'r', 'utf-8').readlines()

    data = ['Src', 'Src_UNK', 'Target_UNK', 'Target', 'Gold_UNK', 'Gold', 'BLEU1']
    csv_f.writerow(data)

    num_lines = len(gold_lines)
    logging.info('Num Lines: %d'% num_lines)


    references = []
    hypotheses = []
    for index in range(num_lines):
        data = []
        data.append(src_lines_nounk[index].strip())
        data.append(src_lines[index].strip())

        data.append(target_lines[index].strip())
        data.append(target_lines_nounk[index].strip())

        data.append(gold_lines[index].strip())
        data.append(gold_lines_nounk[index].strip())

        gold = gold_lines[index].strip().split()
        output = target_lines[index].strip().split()
        default = 'UNK UNK UNK UNK'.split()

        if len(output) < 4:
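            # Very short outputs (fewer than 4 tokens) get a score of 0 and a fixed 4-token
            # placeholder, presumably so the corpus-level 4-gram statistics stay well defined.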
            bleu_score = 0.0
            hypotheses.append(default)
        else:
            bleu_score = sentence_bleu([gold], output, weights=(1.0,))
            hypotheses.append(output)

        references.append([gold])
        logging.info('sentence:%d bleu:%f'%(index, bleu_score))
        data.append(str(bleu_score))
        csv_f.writerow(data)

    final_bleu = corpus_bleu(references, hypotheses)
    unigram_bleu = corpus_bleu(references, hypotheses, weights=(1.0,))
    logging.info('Final BLEU: %f Unigram_BLEU: %f '% (final_bleu, unigram_bleu))
Example No. 3
    def evaluate(self):
        bt = time.time()
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            observation = {}
            with reporter.report_scope(observation):
                for i in range(0, len(self.test_data), self.batch):
                    src, trg = zip(*self.test_data[i:i + self.batch])
                    references.extend([[t.tolist()] for t in trg])

                    src = [chainer.dataset.to_device(self.device, x)
                           for x in src]

                    if self.comm.rank == 0:
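                        # Rank 0 only drives translate() and discards the result -- the model
                        # is presumably split across two MPI ranks, so both ranks must take
                        # part in decoding while the token sequences come out on rank 1 below.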
                        self.model.translate(src, self.max_length)

                    elif self.comm.rank == 1:
                        ys = [y.tolist()
                              for y in self.model.translate(
                                  src, self.max_length)]
                        hypotheses.extend(ys)

                if self.comm.rank == 1:
                    bleu = bleu_score.corpus_bleu(
                        references, hypotheses,
                        smoothing_function=bleu_score.SmoothingFunction().method1)
                    reporter.report({'bleu': bleu}, self.model)
        et = time.time()

        if self.comm.rank == 1:
            print("BleuEvaluator(single)::evaluate(): "
                  "took {:.3f} [s]".format(et - bt))
            sys.stdout.flush()
        return observation
Example No. 4
def bleu_1(decoded, references):
    listed_references = [[s] for s in references]

    bleu_1 = 100 * corpus_bleu(listed_references, decoded,
                               weights=[1.0, 0, 0, 0],
                               smoothing_function=bleu_smoothing)
    return bleu_1
Example No. 5
 def test_corpus_bleu_with_emulate_multibleu(self):
     hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
     ref = str("Their tasks include changing a pump on the faulty stokehold ."
               "Likewise , two species that are very similar in morphology "
               "were distinguished using genetics .")
     references = [[ref.split()]]
     hypotheses = [hyp.split()]
     try:  # Check that the warning is raised, since the hypothesis has no 2-gram overlaps.
         with self.assertWarns(UserWarning):
             # Verify the (undesirably non-zero) BLEU output produced when the
             # hypothesis has no 2-gram overlaps.
             self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.4309, places=4)
     except AttributeError:
         pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.
     desired_output = corpus_bleu(references, hypotheses,
                                  emulate_multibleu=True)
     assert desired_output == 0.0
Example No. 6
def eval(): 
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")
    
    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
     
#     X, Sources, Targets = X[:33], Sources[:33], Targets[:33]
     
    # Start session         
    with g.graph.as_default():    
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")
              
            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name
             
            ## Inference
            if not os.path.exists('results'): os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                     
                    ### Get mini-batches
                    x = X[i*hp.batch_size: (i+1)*hp.batch_size]
                    sources = Sources[i*hp.batch_size: (i+1)*hp.batch_size]
                    targets = Targets[i*hp.batch_size: (i+1)*hp.batch_size]
                     
                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]
                     
                    ### Write to file
                    for source, target, pred in zip(sources, targets, preds): # sentence-wise
                        got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source +"\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()
                          
                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)
              
                ## Calculate bleu score
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100*score))
Example No. 7
def evaluation(data, classifier, normalizer, pca, params, limit=1000):

    (inputs, references, candidates) = data

    bleu_references = [[x] for x in references]
    bleu_hypotheses_baseline = best_baseline(inputs, candidates)

    baseline_bleu = corpus_bleu(bleu_references, bleu_hypotheses_baseline)
    print("Baseline BLEU: %0.10f" % baseline_bleu)

    bleu_hypotheses_reranking = best_reranking(inputs, candidates, classifier, normalizer, pca, params, limit)

    reranking_bleu = corpus_bleu(bleu_references, bleu_hypotheses_reranking)
    print("Reranking BLEU: %0.10f" % reranking_bleu)

    bleu_diff = reranking_bleu - baseline_bleu
    print("BLEU Diff: %0.10f" % bleu_diff)

    return baseline_bleu, reranking_bleu, bleu_diff, bleu_hypotheses_reranking
Example No. 8
def compute_BLEU_score_corpus():
    print("Generate Captions")

    print("Loading Vocab")

    with open('models/vocab_list_'+dataname+'.pkl', 'rb') as f:
        vocabs = pickle.load(f)
        for v in vocabs:
            if v not in vocab:
                update_vocab(v)

    generating_model = load_model(rnn_model_name, 'models/'+dataname+'/best_' + rnn_model_name + '_model_'+
                                  str(image_caption_model)+'_output_rnn_'+str(output_rnn_dim)+'_weights_iteration_' +
                                  str(iteration) + '.h5')

    print("Loading Image Caption dict")
    with open('dataset/'+dataname+'/image_caption_dict.pkl', 'rb') as f:
        image_caption_dict = pickle.load(f)

    sentences = []
    references = []

    i = 0

    # Calculate Bleu-n
    weights = [0.25,0.25,0.25,0.25]

    for key, value in image_caption_dict.items():  # .items() works on both Python 2 and 3
        print(str(i) + '/' + str(len(image_caption_dict)))
        i += 1

        image_path_new = image_path+key

        result = get_caption(generating_model, image_path_new, value['image_data'])

        sentences.append(result)

        reference = [[str(word).lower() for word in x['tokens']] for x in value['sentences']]

        references.append(reference)

    corpus_score = corpus_bleu(references, sentences, weights) * 100

    print(corpus_score)

    return corpus_score
Example No. 9
    def __call__(self, trainer):
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            for i in range(0, len(self.test_data), self.batch):
                sources, targets = zip(*self.test_data[i:i + self.batch])
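                # corpus_bleu expects, for every hypothesis, a *list* of references;
                # here each target sentence serves as the single reference.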
                references.extend([[t.tolist()] for t in targets])

                sources = [
                    chainer.dataset.to_device(self.device, x) for x in sources]
                ys = [y.tolist()
                      for y in self.model.translate(sources, self.max_length)]
                hypotheses.extend(ys)

        bleu = bleu_score.corpus_bleu(
            references, hypotheses,
            smoothing_function=bleu_score.SmoothingFunction().method1)
        reporter.report({self.key: bleu})
Example No. 10
def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
    """
    Given decoding results and reference sentences, compute corpus-level BLEU score

    Args:
        references: a list of gold-standard reference target sentences
        hypotheses: a list of hypotheses, one for each reference

    Returns:
        bleu_score: corpus-level BLEU score
    """

    if references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]

    bleu_score = corpus_bleu([[ref] for ref in references],
                             [hyp.value for hyp in hypotheses])

    return bleu_score
Example No. 11
def bleu_4_dedup(decoded, references):
    listed_references = [[s] for s in references]
    deduplicated_sentences = []

    for sentence in decoded:
        last_w = None
        dedup_snt = []

        for word in sentence:
            if word != last_w:
                dedup_snt.append(word)
                last_w = word

        deduplicated_sentences.append(dedup_snt)

    bleu_4 = 100 * corpus_bleu(listed_references, deduplicated_sentences,
                               weights=[0.25, 0.25, 0.25, 0.25],
                               smoothing_function=bleu_smoothing)
    return bleu_4
Example No. 12
def compute_bleu(batch_in, predicted):
    weights = []
    n = 4
    for i in range(n):
        weights.append(float(1.0 / n))

    # Initialize hypothesis and reference arrays
    # (one reference sentence and one tokenized prediction per example in the batch)
    hypotheses = []
    references = []

    for j, (_, sentence_in, _) in enumerate(batch_in):
        references.append([sentence_in])
        hypotheses.append(tkn.tokenize(predicted[j]))

    # Compute BLEU score
    score = corpus_bleu(references, hypotheses, weights=weights)

    # Display BLEU score
    # logging.info('BLEU-{} score: {}'.format(n, score))

    return score
Example No. 13
    def evaluate(self):
        bt = time.time()
        with chainer.no_backprop_mode():
            references = []
            hypotheses = []
            observation = {}
            with reporter.report_scope(observation):
                for i in range(0, len(self.test_data), self.batch):
                    src, trg = zip(*self.test_data[i:i + self.batch])
                    references.extend([[t.tolist()] for t in trg])

                    src = [chainer.dataset.to_device(self.device, x)
                           for x in src]
                    ys = [y.tolist()
                          for y in self.model.translate(src, self.max_length)]
                    hypotheses.extend(ys)

                bleu = bleu_score.corpus_bleu(
                    references, hypotheses,
                    smoothing_function=bleu_score.SmoothingFunction().method1)
                reporter.report({'bleu': bleu}, self.model)
        et = time.time()

        if self.comm is not None:
            # This evaluator is called via chainermn.MultiNodeEvaluator
            for i in range(0, self.comm.size):
                print('BleuEvaluator::evaluate(): '
                      'took {:.3f} [s]'.format(et - bt))
                sys.stdout.flush()
                self.comm.mpi_comm.Barrier()
        else:
            # This evaluator is called from a conventional
            # Chainer extension
            print('BleuEvaluator(single)::evaluate(): '
                  'took {:.3f} [s]'.format(et - bt))
            sys.stdout.flush()
        return observation
Example No. 14
print(score)  # prints a perfect score, because the candidate exactly matches one of the reference sentences

reference = [['the', 'cat', "is", "sitting", "on", "the", "mat"]]
test = ["on", 'the', "mat", "is", "a", "cat"]  # The hypothesis contains 0 counts of 4-gram overlaps.
print(sentence_bleu(reference, test))  # 5.5546715329196825e-78
test = ['the', 'cat', 'is', 'sitting', 'on', 'mat']
print(sentence_bleu(reference, test))  # 0.6731821382417487

##################################################################
## Part 2: corpus_bleu: compute the BLEU score for multiple sentences (such as a paragraph or a document)
# The references must be specified as a list of documents, where each document is a list of reference sentences and each reference sentence is itself a list of tokens -- i.e. a list of lists of lists of tokens
# The candidates must be specified as a list where each document is a list of tokens -- i.e. a list of lists of tokens

references = [[['this', 'is', 'a', 'test'], ['this', 'is', 'test']]]  # two references for one document
candidates = [['this', 'is', 'a', 'test']]
score = corpus_bleu(references, candidates)
print(score)  # 1.0; running this example prints a perfect score, as before

##################################################################
## Cumulative and individual BLEU scores
# The BLEU scoring functions provided in NLTK let you specify weights for the different n-gram orders when computing the score
# This gives you the flexibility to compute different kinds of BLEU scores, such as individual and cumulative n-gram scores

## Individual n-gram scores
# An individual n-gram score evaluates matches for one particular n-gram order only, such as single words (called 1-grams) or word pairs (called 2-grams or bigrams)
# The weights are specified as an array where each index corresponds to that n-gram order
# To compute a BLEU score for 1-gram matches only, specify a weight of 1 for 1-grams and 0 for 2-grams, 3-grams and 4-grams, i.e. weights = (1, 0, 0, 0):

## 1-gram individual BLEU
reference = [['this', 'is', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']
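# (The excerpt ends here. A minimal completion following the (1, 0, 0, 0) weights described
#  above -- assuming sentence_bleu is imported as in the lines further up -- would be:)
score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
print(score)  # ~0.75: three of the four candidate unigrams appear in the reference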
Example No. 15
def evaluate(opt_translates):
    eval_stats = []
    eval_results = []

    not_in_train = 0
    total = 0
    for opt_translate in opt_translates:
        src_lines = []
        with open(opt_translate.src, 'r') as f:
            src_lines = f.readlines()
        with open(opt_translate.output, 'r') as handle:
            lines = [line.strip() for line in handle]
            for i, l in enumerate(lines):
                di = opt.parser.test_src_to_di[opt_translate.predicate][
                    src_lines[i].strip()]
                total += 1
                stats = '\n++++NOT IN TRAIN++++'
                if src_lines[i].strip() not in opt.parser.train_src_to_di[
                        opt_translate.predicate]:
                    not_in_train += 1
                    stats = ''
                lexicalized_l = lexicalize_word_sequence(
                    l.split(), di.input.delexicalizationMap)

                stats += '\nSRC:' + str(src_lines[i]) + 'PRED:' + str(
                    opt_translate.predicate) + '\nMR:' + str(
                        di.input.attributeValues) + '\nREAL: ' + ' '.join(
                            lexicalized_l) + '\nDREF: ' + str(
                                di.directReference)
                logger.info(stats)

                eval_stats.append(di.output.evaluateAgainst(lexicalized_l))
                eval_results.append(
                    (" ".join(lexicalized_l), di.output.evaluationReferences))

                stats = '\nEREF: ' + str(
                    eval_stats[-1].refs) + '\nBLEU: ' + str(
                        eval_stats[-1].BLEU) + '\n'
                logger.info(stats)

                if (' '.join(lexicalized_l)).strip() == str(
                        di.directReference).strip(
                        ) and eval_stats[-1].BLEU != 1.0:
                    exit()

    realizations = []
    references = []
    for realization, refs in eval_results:
        realizations.append(realization)
        references.append(refs)
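    # Caveat: corpus_bleu treats every reference/hypothesis as a sequence of tokens, so if
    # evaluationReferences or the joined realizations are plain strings rather than token
    # lists, they will effectively be scored character by character.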
    corpusBLEU = corpus_bleu(references, realizations)
    bleu = numpy.average([e.BLEU for e in eval_stats])
    rouge = numpy.average([e.ROUGE for e in eval_stats])
    coverage = numpy.average([e.COVERAGE for e in eval_stats])

    print("corpusBLEU:", corpusBLEU)
    print("BLEU:", bleu)
    print("smoothBLEU:", numpy.average([e.BLEUSmooth for e in eval_stats]))
    print("ROUGE:", rouge)
    print("COVERAGE:", coverage)
    print("NOT IN TRAIN:", not_in_train, '/', total)

    return corpusBLEU, bleu, rouge, coverage
Example No. 16
        eq = (preds==targs).float()
        indy_acc = eq[bitmask].mean()
        eq[~bitmask] = 1
        eq = eq.reshape(og_shape)
        acc = (eq.sum(-1)==sl).float().mean()
        bleu_trgs=targs.reshape(og_shape).data.cpu().numpy()
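        # np.argmax over a boolean array gives the index of the first True value, i.e. the
        # position of the first stop token in each row (or 0 if no stop token is present).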
        trg_ends = np.argmax((bleu_trgs==stop_idx),axis=1)
        bleu_prds=preds.reshape(og_shape).data.cpu().numpy()
        prd_ends = np.argmax((bleu_prds==stop_idx),axis=1)
        btrgs = []
        bprds = []
        for i in range(len(bleu_trgs)):
            temp = bleu_trgs[i,None,:trg_ends[i]].tolist()
            btrgs.append(temp)
            bprds.append(bleu_prds[i,:prd_ends[i]].tolist())
        bleu = corpus_bleu(btrgs,bprds)
        avg_bleu += bleu
        avg_acc += acc.item()
        avg_indy_acc += indy_acc.item()
        avg_loss += loss.item()

        s="Loss:{:.5f} | Acc:{:.5f} | Bleu:{:.5f} | {:.0f}%"
        s = s.format(loss.item(), acc.item(), bleu,
                                  b/len(X)*100)
        print(s, end=len(s)*" " + "\r")
        if hyps['exp_name']=="test" and b > 5: break

val_avg_bleu = avg_bleu/n_loops
val_avg_loss = avg_loss/n_loops
val_avg_acc = avg_acc/n_loops
val_avg_indy = avg_indy_acc/n_loops
Example No. 17
def validate(val_loader, encoder, decoder, criterion, rev_word_map):
    """
    Performs one epoch's validation.

    :param val_loader: DataLoader for validation data.
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :param rev_word_map: reverse word map (index -> word), used to print sample captions
    :return: BLEU-4 score
    """
    # eval mode
    decoder.eval()
    if encoder is not None:
        encoder.eval()

    # meter
    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # explicitly disable gradient calculation to avoid CUDA memory error
    # solves the issue #57
    with torch.no_grad():
        # Batches
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):
            # break after one epoch if debugging locally
            if (args.run_local or args.debug) and i > 2:
                break

            # Move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            if encoder is not None:
                imgs = encoder(imgs)

            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
                imgs, caps, caplens)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores,
                                          decode_lengths,
                                          batch_first=True).data
            targets = pack_padded_sequence(targets,
                                           decode_lengths,
                                           batch_first=True).data

            # Calculate loss
            loss = criterion(scores, targets)

            # Add doubly stochastic attention regularization
            # We know the weights sum to 1 at a given timestep. But we also encourage
            # the weights at a single pixel p to sum to 1 across all timesteps T
            # This means we want the model to attend to every pixel over the course of generating
            # the entire sequence. Therefore, we try to minimize the difference between 1 and the sum of
            # a pixel's weights across all timesteps
            loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()

            # Keep track of metrics_roc_and_more
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print(
                    '4    Validation: [{0}/{1}]\t'
                    'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(
                        i,
                        len(val_loader),
                        batch_time=batch_time,
                        loss=losses,
                        top5=top5accs))

            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
            # References
            allcaps = allcaps[
                sort_ind]  # because images were sorted in the decoder
            for j in range(allcaps.shape[0]):  # for each example
                img_caps = allcaps[j].tolist()
                img_captions = list(
                    map(
                        lambda c: [
                            w for w in c if w not in
                            {word_map['<start>'], word_map['<pad>']}
                        ], img_caps))  # remove <start> and pads
                references.append(img_captions)

            # Hypotheses
            # get for each example the max pred at each time step (batch size, max length caption)
            pred_values, preds_ind = torch.max(scores_copy, dim=2)
            preds_ind = preds_ind.tolist()
            temp_preds = list()

            # remove pads
            for j, p in enumerate(preds_ind):
                temp_preds.append(preds_ind[j][:decode_lengths[j]])
            preds_ind = temp_preds
            hypotheses.extend(preds_ind)

            assert len(references) == len(hypotheses)

            if (i + 1) % 300 == 0:
                print('-1   ************print captions***********')
                num_to_print = 0
                for h in hypotheses:
                    if num_to_print < 100:
                        words = []
                        for w in h:
                            words.append(rev_word_map[w])
                        print('1    ' + ' '.join(words))
                        num_to_print += 1
                    else:
                        break

                print('2    **************************************')

        # Calculate BLEU-4 scores
        bleu4 = corpus_bleu(references, hypotheses)

        print(
            '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'
            .format(loss=losses, top5=top5accs, bleu=bleu4))

    return bleu4
Example No. 18
max_test_encoder_sequence_length = max([len(txt) for txt in test_input_texts])

test_encoder_input_data = np.zeros(
    (num_test_samples, max_test_encoder_sequence_length, num_encoder_tokens), dtype='float32')

for i in range(num_test_samples):
    for j in range(len(test_input_texts[i])):
        if test_input_texts[i][j] in input_words:
            test_encoder_input_data[i, j, input_words[test_input_texts[i][j]]] = 1

references = []
hypotheses = []
with open('{}.txt'.format(int(time.time())), 'w') as fi:
    for i in range(num_samples):
        decoded_sentence = decode_sequence(encoder_input_data[i:i + 1])

        references.append([target_texts[i][1:-1]])
        hypotheses.append(decoded_sentence[:-1])

    print(corpus_bleu(references, hypotheses), file=fi)

    references = []
    hypotheses = []
    for i in range(num_test_samples):
        decoded_sentence = decode_sequence(test_encoder_input_data[i:i + 1])

        references.append([test_target_texts[i][1:-1]])
        hypotheses.append(decoded_sentence[:-1])

    print(corpus_bleu(references, hypotheses), file=fi)
Example No. 19
for i in range(n):
    weights.append(float(1.0/n))

# Initialize hypothesis and reference arrays
hypotheses = []
references = []

# Create hypothesis and reference arrays taking 5 predicted captions per image (maybe we could modify this?)
for row in dataset['images']:
    caption = row['sentences'][0]['tokens']
    for idx in range(5):
        references.append([caption])
        hypotheses.append(tkn.tokenize(row['predicted caption'][idx]))

# Compute BLEU score
bleu_score = corpus_bleu(references, hypotheses, weights=weights)

# Display BLEU score
print 'BLEU score: ' + str(bleu_score)
Example No. 20
def validate(val_loader, net, encoder, decoder, criterion, word_map):
    net.eval()
    encoder.eval()
    decoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top3accs = AverageMeter()

    start = time.time()

    references = list()
    hypotheses = list()

    with torch.no_grad():
        # Batches
        for i, (imgs1, imgs2, caps, caplens, allcaps) in enumerate(val_loader):

            imgs1 = imgs1.to(device)
            imgs2 = imgs2.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            im1_enc = net(imgs1)
            im2_enc = net(imgs2)

            # Forward prop.
            l_bef, l_aft, alpha_bef, alpha_aft = encoder(im1_enc, im2_enc)
            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(l_bef, l_aft, caps, caplens)

            targets = caps_sorted[:, 1:]

            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores, decode_lengths, batch_first=True).data
            targets = pack_padded_sequence(targets, decode_lengths, batch_first=True).data

            loss = criterion(scores, targets)

            # TODO
            # Add doubly stochastic attention regularization

            losses.update(loss.item(), sum(decode_lengths))
            top3 = accuracy(scores, targets, 3)
            top3accs.update(top3, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Top-3 Accuracy {top3.val:.3f} ({top3.avg:.3f})\t'.format(i, len(val_loader),
                                                                                batch_time=batch_time,
                                                                                loss=losses, top3=top3accs))

            # References
            allcaps = allcaps[sort_ind]
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                img_captions = list(
                    map(lambda c: [w for w in c if w not in [0, 1, 2]], img_caps))
                references.append(img_captions)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:decode_lengths[j]])
            preds = temp_preds
            hypotheses.extend(preds)

        assert len(references) == len(hypotheses)

        weights1 = (1.0, 0.0, 0.0, 0.0)
        weights2 = (0.5, 0.5, 0.0, 0.0)
        weights3 = (0.33, 0.33, 0.33, 0.0)
        weights4 = (0.25, 0.25, 0.25, 0.25)

        bleu1 = corpus_bleu(references, hypotheses, weights1)
        bleu2 = corpus_bleu(references, hypotheses, weights2)
        bleu3 = corpus_bleu(references, hypotheses, weights3)
        bleu4 = corpus_bleu(references, hypotheses, weights4)

        print(
            '\n * LOSS - {loss.avg:.3f}, TOP-3 ACCURACY - {top3.avg:.3f}, BLEU-1 - {bleu11}, BLEU-2 - {bleu22}, BLEU-3 - {bleu33}, BLEU-4 - {bleu44},\n'.format(
                loss=losses,
                top3=top3accs,
                bleu11=bleu1,
                bleu22=bleu2,
                bleu33=bleu3,
                bleu44=bleu4, ))

        return bleu4
Example No. 21
        print("Loading initial model from {}...".format(ckpt_path))
        optimistic_restore(sess, ckpt_path)
    try:
        step = 0
        while True:
            step += 1
            print("training step {}...".format(step))
            training_start_time = time.time()
            ops = [gleu_train_op, global_step_var, graph_sums_op, mle_loss, gleu_score, preds, noised_y]
            _, global_step, graph_sums, loss, gleu, pred_values, y_values = sess.run(ops)
            training_step_time = time.time() - training_start_time

            # Compute batch BLEU and GLEU and save summaries of them
            cropped_y = [[_crop(y_values[k, :], EOS)] for k in range(batch_size)]
            cropped_preds = [_crop(pred_values[k, :], EOS) for k in range(batch_size)]
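            # In the NLTK versions that still accepted it, emulate_multibleu=True rounded the
            # corpus BLEU to 4 decimal places (multi-bleu.perl style); the flag has since been
            # removed from nltk.translate.bleu_score.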
            nltk_bleu = corpus_bleu(cropped_y, cropped_preds, emulate_multibleu=True)
            nltk_gleu, nltk_n_match, nltk_n_all = custom_corpus_gleu(cropped_y, cropped_preds)

            sums = {
                'nltk.bleu': nltk_bleu,
                'nltk.gleu': nltk_gleu,
                'nltk.n_match': nltk_n_match,
                'nltk.n_all': nltk_n_all,
            }

            sum_writer.add_summary(graph_sums, global_step=global_step)
            for label, measure in sums.items():
                summary = tf.Summary(value=[tf.Summary.Value(tag=label, simple_value=measure)])
                sum_writer.add_summary(summary, global_step=global_step)

            print("step took {:.2f} seconds".format(training_step_time))
Example No. 22
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TEST', transform=transforms.Compose([normalize])),
        batch_size=1, shuffle=False, num_workers=1, pin_memory=True)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = list()
    hypotheses = list()

    # For each image
    for i, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):
        if i > 14:
            break
        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        img, img_mean = model.imgEncoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = img.size(1)
        encoder_dim = img.size(3)

        # Flatten encoding
        encoder_out = img.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        if standard_gaussian:
            z = torch.randn([k, latent_size])
            if torch.cuda.is_available():
                z = z.cuda()
        else:
            h2 = torch.relu(model.fc2(img_mean))
            mu2 = model.hidden2mu2(h2)
            logv2 = model.hidden2logv2(h2)
            std = torch.exp(0.5 * logv2)
            z = torch.randn([k, latent_size])
            if torch.cuda.is_available():
                z = z.cuda()
            z = z * std + mu2

        h, c = model.attnDecoder.init_hidden_state(z)
        smth_wrong = False

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = model.attnDecoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            awe, _ = model.attnDecoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            gate = model.attnDecoder.sigmoid(model.attnDecoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = model.attnDecoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = model.attnDecoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            # ('//' keeps this as floor division; plain '/' on LongTensors is true division
            # in recent PyTorch versions and would break the beam indexing)
            prev_word_inds = top_k_words // vocab_size  # (s) beam index
            next_word_inds = top_k_words % vocab_size  # (s) next word index

            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                               next_word != word_map['<end>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                smth_wrong = True
                break
            step += 1

        if smth_wrong is not True:
            i = complete_seqs_scores.index(max(complete_seqs_scores))
            seq = complete_seqs[i]
        else:
            seq = seqs[0][:20]
            seq = [x.item() for x in seq]


        # i = complete_seqs_scores.index(max(complete_seqs_scores))
        # seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(lambda c: [w for w in c if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],
                img_caps))  # remove <start>, <end> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}])
        print(' '.join([rev_word_map[x] for x in hypotheses[-1]]))
        assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses)

    return bleu4
Example No. 23
 def bleu(self, reference, candidate):
     bleu4 = corpus_bleu(
         reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)) * 100
     return bleu4
Example No. 24
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from nltk.translate import bleu_score, meteor_score
import numpy as np
x = [1, 2, 2, 3]
hyp = ['am', 'leg']
ref = ['am', 'legend']
# print(bleu_score.sentence_bleu([ref],hyp,weights=[0.5,0.5]))
print(bleu_score.corpus_bleu([[ref]], [hyp], weights=[1.0]))

# plt.imsave(path)

# import tensorflow as tf

# print(tf.__version__)

# conda config --add channels conda-forge
# conda install keras opencv shapely tensorflow gensim pandas imgaug
# pip install --upgrade tensorflow==2.0.0-beta1
# pip install  "C:\Users\omarm\Downloads\Shapely-1.6.4.post2-cp37-cp37m-win_amd64.whl" matplotlib  pandas imgaug gensim tensorflow==2.0.0-beta1
Example No. 25
def validate(validation_loader, model_network, criterion_func):
    model_network.eval()

    batch_time = avgValsTracker()
    losses = avgValsTracker()
    top5accs = avgValsTracker()

    start = time.time()

    references = list(
    )  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # explicitly disable gradient calculation to avoid CUDA memory error
    # solves the issue #57
    with torch.no_grad():
        # Batches
        for i, (imgs, caps, caplens, allcaps) in enumerate(validation_loader):

            # Move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            # Forward prop.
            scores, caps_sorted, decode_lengths, alphas, sort_ind = model_network(
                imgs, caps, caplens)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            # Remove time-steps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores,
                                          decode_lengths,
                                          batch_first=True).data
            targets = pack_padded_sequence(targets,
                                           decode_lengths,
                                           batch_first=True).data

            # Calculate loss
            loss = criterion_func(scores, targets)

            # Add doubly stochastic attention regularization
            loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % 100 == 0:
                print(
                    'Validation: [{0}/{1}]\t'
                    'Batch Time {batch_time.value:.3f} ({batch_time.average:.3f})\t'
                    'Loss {loss.value:.4f} ({loss.average:.4f})\t'
                    'Top-5 Accuracy {top5.value:.3f} ({top5.average:.3f})\t'.
                    format(i,
                           len(validation_loader),
                           batch_time=batch_time,
                           loss=losses,
                           top5=top5accs))

            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

            # References
            # allcaps = allcaps[sort_ind]  # because images were sorted in the decoder
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                for idxx in range(len(img_caps) - 1, -1, -1):
                    if img_caps[idxx][0] == -1:
                        del img_caps[idxx]
                img_captions = list(
                    map(
                        lambda c:
                        [w for w in c if w not in {word_map['x_START_']}],
                        img_caps))  # remove <start> and pads
                references.append(img_captions)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
            preds = temp_preds
            hypotheses.extend(preds)

            assert len(references) == len(hypotheses)

        # Calculate BLEU-4 scores
        bleu4 = corpus_bleu(references, hypotheses)

        print(
            '\n * LOSS: {loss.average:.3f}, TOP-5 ACCURACY: {top5.average:.3f}, BLEU-4: {bleu}\n'
            .format(loss=losses, top5=top5accs, bleu=bleu4))

    return bleu4
Example No. 26
    def evaluate(self,
                 model,
                 data,
                 vocabs=None,
                 use_concept=False,
                 log_dir=None,
                 embed=None,
                 cur_step=0):
        """ Evaluate a model on given dataset and return performance.

        Args:
            model (seq2seq.models): model to evaluate
            data (seq2seq.dataset.dataset.Dataset): dataset to evaluate against

        Returns:
            loss (float): loss of the given model on the given dataset
        """

        eval_limit = 5000
        step_limit = int(eval_limit / self.batch_size)

        model.eval()

        loss = self.loss
        loss.reset()
        match = 0
        total = 0

        device = torch.device('cuda', 0) if torch.cuda.is_available() else None
        batch_iterator = torchtext.data.BucketIterator(
            dataset=data,
            batch_size=self.batch_size,
            sort=True,
            sort_key=lambda x: len(x.src),
            device=device,
            train=False)
        tgt_vocab = data.fields[seq2seq.tgt_field_name].vocab
        src_vocab = data.fields[seq2seq.src_field_name].vocab
        pad = tgt_vocab.stoi[data.fields[seq2seq.tgt_field_name].pad_token]

        cnt = 0
        loss_sum = 0

        context_corpus = []
        reference_corpus = []
        prediction_corpus = []
        state_corpus = []
        with torch.no_grad():
            for batch in batch_iterator:
                print(cnt)
                cnt += 1
                input_variables, input_lengths = getattr(
                    batch, seq2seq.src_field_name)

                if torch.cuda.is_available():
                    input_index = input_variables.cpu().numpy()
                else:
                    input_index = input_variables.numpy()
                input_words = [[src_vocab.itos[word] for word in line]
                               for line in input_index]
                context_corpus.extend(input_words)

                if use_concept:
                    concept, _ = getattr(batch, seq2seq.cpt_field_name)
                else:
                    concept = []
                target_variables = getattr(batch, seq2seq.tgt_field_name)

                if use_concept:
                    (decoder_outputs, decoder_hidden,
                     other), state_loss, state_print = model(
                         input_variables,
                         input_lengths.tolist(),
                         target_variables,
                         concept=concept,
                         vocabs=vocabs,
                         use_concept=use_concept,
                         track_state=use_concept)
                    state_corpus.extend(state_print)
                    """
                    decoder_outputs, decoder_hidden, other = model(input_variables, input_lengths.tolist(),
                                                                            target_variables,
                                                                            concept=concept, vocabs=vocabs,
                                                                            use_concept=use_concept,
                                                                            track_state=False)
                    """
                else:
                    decoder_outputs, decoder_hidden, other = model(
                        input_variables,
                        input_lengths.tolist(),
                        target_variables,
                        vocabs=vocabs)
                # Evaluation
                seqlist = other['sequence']
                reference = []
                prediction = []
                for step, step_output in enumerate(decoder_outputs):
                    target = target_variables[:, step + 1]
                    loss.eval_batch(
                        step_output.view(target_variables.size(0), -1), target)
                    non_padding = target.ne(pad)
                    correct = seqlist[step].view(-1).eq(target).masked_select(
                        non_padding).sum().item()
                    match += correct
                    total += non_padding.sum().item()
                    if torch.cuda.is_available():
                        pred = seqlist[step].view(-1).cpu().numpy()
                        tgt = target.view(-1).cpu().numpy()
                    else:
                        pred = seqlist[step].view(-1).numpy()
                        tgt = target.view(-1).numpy()
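                    # decoder_outputs is time-major: step t yields the t-th token of every
                    # sequence in the batch, so the per-sentence reference/prediction token
                    # lists are rebuilt position by position below.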
                    for i in range(len(step_output)):
                        target_char = tgt_vocab.itos[tgt[i]]
                        pred_char = tgt_vocab.itos[pred[i]]
                        if target_char != '<pad>':
                            if len(reference) >= i + 1:
                                reference[i].append(target_char)
                            else:
                                reference.append([target_char])
                        if pred_char != '<pad>':
                            if len(prediction) >= i + 1:
                                if prediction[i][-1] != '<eos>':
                                    prediction[i].append(pred_char)
                            else:
                                prediction.append([pred_char])
                for i in range(len(reference)):
                    reference[i] = reference[i][:-1]
                    prediction[i] = prediction[i][:-1]
                reference_corpus.extend([[line] for line in reference])
                prediction_corpus.extend(prediction)
                if cnt > step_limit:
                    break

        bleu = corpus_bleu(reference_corpus,
                           prediction_corpus,
                           smoothing_function=smoothie)
        # embedding = embed.eval_embedding(reference_corpus, prediction_corpus)
        distinct_1 = distinct(prediction_corpus, 1)
        distinct_2 = distinct(prediction_corpus, 2)
        print("Corpus BLEU: ", bleu)
        # print("Embedding dist: ", embedding)
        print("Distinct-1: ", distinct_1)
        print("Distinct-2: ", distinct_2)
        with open(log_dir + '/log.txt', 'a+', encoding='utf-8') as file:
            file.write("Distinct-1: " + str(distinct_1) + '\n')
            file.write("Distinct-2: " + str(distinct_2) + '\n\n')

        with open(log_dir + '/log-' + str(cur_step), 'w',
                  encoding='utf-8') as file:
            file.write("Corpus BLEU: " + str(bleu) + '\n')
            # file.write("Embedding Dist: " + str(embedding) + '\n')
            file.write("Distinct-1: " + str(distinct_1) + '\n')
            file.write("Distinct-2: " + str(distinct_2) + '\n\n')
            for i in range(len(reference_corpus)):
                file.write("Context: " + '\n')
                context_str = " ".join(context_corpus[i])
                context_list = context_str.split('<eou>')
                for j in range(len(context_list)):
                    file.write(context_list[j] + '\n')
                if use_concept and state_corpus:
                    file.write("\nStates: " + '\n')
                    cd_pairs = zip(state_corpus[i][0], state_corpus[i][1])
                    cd_pairs = sorted(set(cd_pairs), key=lambda x: x[1])
                    for j in range(len(state_corpus[i][0])):
                        file.write("Concept: {}. Prob: {}.\n".format(
                            cd_pairs[j][0], cd_pairs[j][1]))
                file.write("\nGold: " + ' '.join(reference_corpus[i][0]) +
                           '\n\n')
                file.write("Response: " + ' '.join(prediction_corpus[i]) +
                           '\n\n')
                file.write('\n')
        if total == 0:
            accuracy = float('nan')
        else:
            accuracy = match / total

        return loss.get_loss(), accuracy
Example No. 27
    #weights[n-1] = 1
    weights = []
    for i in range(n):
        weights.append(float(1.0 / n))

    # Create hypothesis and reference arrays taking 5 predicted captions per image (maybe we could modify this?)
    for conf_value in confidence:
        samples = 0
        # Initialize hypothesis and reference arrays
        hypotheses = []
        references = []
        for row in dataset['images']:
            caption = row['sentences'][0]['tokens']
            #for idx in range(len(row['predicted caption'])):
            #    if float(row['confidence'][idx]) >= conf_value and row['split'] in ['test'] and row['predicted caption'][idx]:
            #        samples += 1
            #        predicted = row['predicted caption'][idx]
            #        references.append([caption])
            #        hypotheses.append(tkn.tokenize(predicted))
            #if float(row['top confidence']) >= conf_value and row['split'] in ['test'] and row['top caption']:
            if float(row['top confidence']) >= conf_value and row['top caption']:
                samples += 1
                predicted = row['top caption']
                references.append([caption])
                hypotheses.append(tkn.tokenize(predicted))

        # Compute BLEU score
        bleu_score = corpus_bleu(references, hypotheses, weights=weights)

        # Display BLEU score
        print 'Confidence: {0} BLEU-{1} score: {2} Samples: {3}'.format(conf_value, n, bleu_score, samples)
Example No. 28
def evaluate_decode_results(dataset, decode_results, verbose=True):
    from lang.py.parse import tokenize_code, de_canonicalize_code
    # tokenize_code = tokenize_for_bleu_eval
    import ast
    assert dataset.count == len(decode_results)

    f = f_decode = None
    if verbose:
        f = open(dataset.name + '.exact_match', 'w')
        exact_match_ids = []
        f_decode = open(dataset.name + '.decode_results.txt', 'w')
        eid_to_annot = dict()

        if config.data_type == 'django':
            for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)):
                eid_to_annot[raw_id] = line.strip()

        f_bleu_eval_ref = open(dataset.name + '.ref', 'w')
        f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w')
        f_generated_code = open(dataset.name + '.generated_code', 'w')

        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_oracle_bleu = 0.0
    cum_oracle_acc = 0.0
    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    all_references = []
    all_predictions = []

    if all(len(cand) == 0 for cand in decode_results):
        logging.error('Empty decoding results for the current dataset!')
        return -1, -1

    for eid in range(dataset.count):
        example = dataset.examples[eid]
        ref_code = example.code
        ref_ast_tree = ast.parse(ref_code).body[0]
        refer_source = astor.to_source(ref_ast_tree).strip()
        # refer_source = ref_code
        refer_tokens = tokenize_code(refer_source)
        cur_example_correct = False

        decode_cands = decode_results[eid]
        if len(decode_cands) == 0:
            continue

        decode_cand = decode_cands[0]

        cid, cand, ast_tree, code = decode_cand
        code = astor.to_source(ast_tree).strip()

        # simple_url_2_re = re.compile('_STR:0_', re.))
        try:
            predict_tokens = tokenize_code(code)
        except:
            logging.error('error in tokenizing [%s]', code)
            continue

        if refer_tokens == predict_tokens:
            cum_acc += 1
            cur_example_correct = True

            if verbose:
                exact_match_ids.append(example.raw_id)
                f.write('-' * 60 + '\n')
                f.write('example_id: %d\n' % example.raw_id)
                f.write(code + '\n')
                f.write('-' * 60 + '\n')

        if config.data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].iteritems():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif config.data_type == 'hs':
            ref_code_for_bleu = ref_code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        # The if-chunk below is for debugging purposes: sometimes the reference does not match the prediction
        # because of inconsistent quotes (e.g., single quotes in the reference, double quotes in the prediction).
        # However, most of these cases are resolved by canonicalizing the reference code using astor (parse the
        # reference into an AST and regenerate the code; use the regenerated version as the reference).
        weird = False
        if refer_tokens_for_bleu == pred_tokens_for_bleu and refer_tokens != predict_tokens:
            # cum_acc += 1
            weird = True
        elif refer_tokens == predict_tokens:
            # weird!
            # weird = True
            pass

        shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu)

        all_references.append([refer_tokens_for_bleu])
        all_predictions.append(pred_tokens_for_bleu)

        # try:
        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3)
        cum_bleu += bleu_score
        # except:
        #    pass

        if verbose:
            print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score)

            f_decode.write('-' * 60 + '\n')
            f_decode.write('example_id: %d\n' % example.raw_id)
            f_decode.write('intent: \n')

            if config.data_type == 'django':
                f_decode.write(eid_to_annot[example.raw_id] + '\n')
            elif config.data_type == 'hs':
                f_decode.write(' '.join(example.query) + '\n')

            f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n')
            f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n')

            f_decode.write('canonicalized reference: \n')
            f_decode.write(refer_source + '\n')
            f_decode.write('canonicalized prediction: \n')
            f_decode.write(code + '\n')
            f_decode.write('reference code for bleu calculation: \n')
            f_decode.write(ref_code_for_bleu + '\n')
            f_decode.write('predicted code for bleu calculation: \n')
            f_decode.write(pred_code_for_bleu + '\n')
            f_decode.write('pred_shorter_than_ref: %s\n' % shorter)
            f_decode.write('weird: %s\n' % weird)
            f_decode.write('-' * 60 + '\n')

            # for Hiro's evaluation
            f_generated_code.write(pred_code_for_bleu.replace('\n', '#NEWLINE#') + '\n')


        # compute oracle
        best_score = 0.
        cur_oracle_acc = 0.
        for decode_cand in decode_cands[:config.beam_size]:
            cid, cand, ast_tree, code = decode_cand
            try:
                code = astor.to_source(ast_tree).strip()
                predict_tokens = tokenize_code(code)

                if predict_tokens == refer_tokens:
                    cur_oracle_acc = 1

                if config.data_type == 'django':
                    pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
                    # convert canonicalized code to raw code
                    for literal, place_holder in example.meta_data['str_map'].iteritems():
                        pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                elif config.data_type == 'hs':
                    pred_code_for_bleu = code

                # we apply Ling Wang's trick when evaluating BLEU scores
                pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

                ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
                bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                           weights=ngram_weights,
                                           smoothing_function=sm.method3)

                if bleu_score > best_score:
                    best_score = bleu_score

            except:
                continue

        cum_oracle_bleu += best_score
        cum_oracle_acc += cur_oracle_acc

    cum_bleu /= dataset.count
    cum_acc /= dataset.count
    cum_oracle_bleu /= dataset.count
    cum_oracle_acc /= dataset.count

    logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3))
    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
    logging.info('oracle bleu: %f', cum_oracle_bleu)
    logging.info('oracle accuracy: %f', cum_oracle_acc)

    if verbose:
        f.write(', '.join(str(i) for i in exact_match_ids))
        f.close()
        f_decode.close()

        f_bleu_eval_ref.close()
        f_bleu_eval_hyp.close()
        f_generated_code.close()

    return cum_bleu, cum_acc
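A minimal, self-contained sketch (assuming the same third-party astor dependency as the example above) of the canonicalization trick described in the comments of evaluate_decode_results: round-tripping code through ast and astor normalizes surface differences such as single vs. double quotes before the token comparison.

import ast

import astor  # third-party, same dependency as the example above

ref_code = "x = 'hello'"
pred_code = 'x = "hello"'

ref_canonical = astor.to_source(ast.parse(ref_code)).strip()
pred_canonical = astor.to_source(ast.parse(pred_code)).strip()

# Both snippets are regenerated with a single quoting style, so they now compare equal.
print(ref_canonical == pred_canonical)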
Ejemplo n.º 29
0
def test_bleu_bug():
	ref = [[[1, 3], [3], [4]]]
	gen = [[1]]
	with pytest.raises(ZeroDivisionError):
		corpus_bleu(ref, gen, smoothing_function=SmoothingFunction().method3)
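The test above pins down the crash for a length-1 hypothesis; a hedged workaround, mirroring the weight-truncation pattern used in other examples in this collection, is to shorten the weight vector so that no n-gram order exceeds the hypothesis length (illustration only, not the library's own fix).

from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu

ref = [[[1, 3], [3], [4]]]
gen = [[1]]
weights = (1.0,) * min(4, len(gen[0]))  # unigram-only weights for a length-1 hypothesis
print(corpus_bleu(ref, gen, weights=weights,
                  smoothing_function=SmoothingFunction().method3))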
Ejemplo n.º 30
0
def eval_wordpiece_bleu(models,
                        dataloader,
                        recog_params,
                        epoch,
                        recog_dir=None,
                        streaming=False,
                        progressbar=False,
                        fine_grained=False,
                        oracle=False,
                        teacher_force=False):
    """Evaluate a wordpiece-level model by corpus-level BLEU.

    Args:
        models (List): models to evaluate
        dataloader (torch.utils.data.DataLoader): evaluation dataloader
        recog_params (omegaconf.dictconfig.DictConfig): decoding hyperparameters
        epoch (int): current epoch
        recog_dir (str): directory path to save hypotheses
        streaming (bool): streaming decoding for session-level evaluation
        progressbar (bool): visualize progressbar
        fine_grained (bool): calculate fine-grained corpus-level BLEU distributions based on input lengths
        oracle (bool): calculate oracle corpus-level BLEU
        teacher_force (bool): conduct decoding in teacher-forcing mode
    Returns:
        c_bleu (float): corpus-level 4-gram BLEU

    """
    if recog_dir is None:
        recog_dir = 'decode_' + dataloader.set + '_ep' + \
            str(epoch) + '_beam' + str(recog_params.get('recog_beam_width'))
        recog_dir += '_lp' + str(recog_params.get('recog_length_penalty'))
        recog_dir += '_cp' + str(recog_params.get('recog_coverage_penalty'))
        recog_dir += '_' + str(recog_params.get('recog_min_len_ratio')) + '_' + \
            str(recog_params.get('recog_max_len_ratio'))
        recog_dir += '_lm' + str(recog_params.get('recog_lm_weight'))

        ref_trn_path = mkdir_join(models[0].save_path, recog_dir, 'ref.trn')
        hyp_trn_path = mkdir_join(models[0].save_path, recog_dir, 'hyp.trn')
    else:
        ref_trn_path = mkdir_join(recog_dir, 'ref.trn')
        hyp_trn_path = mkdir_join(recog_dir, 'hyp.trn')

    list_of_references_dist = {
    }  # calculate corpus-level BLEU distribution bucketed by input lengths
    hypotheses_dist = {}

    hypotheses_oracle = []
    n_oracle_hit = 0
    n_utt = 0

    # Reset data counter
    dataloader.reset(recog_params.get('recog_batch_size'))

    if progressbar:
        pbar = tqdm(total=len(dataloader))

    list_of_references = []
    hypotheses = []

    with codecs.open(hyp_trn_path, 'w', encoding='utf-8') as f_hyp, \
            codecs.open(ref_trn_path, 'w', encoding='utf-8') as f_ref:
        while True:
            batch, is_new_epoch = dataloader.next(
                recog_params.get('recog_batch_size'))
            if streaming or recog_params.get('recog_block_sync'):
                nbest_hyps_id = models[0].decode_streaming(
                    batch['xs'],
                    recog_params,
                    dataloader.idx2token[0],
                    exclude_eos=True)[0]
            else:
                nbest_hyps_id = models[0].decode(
                    batch['xs'],
                    recog_params,
                    idx2token=dataloader.idx2token[0],
                    exclude_eos=True,
                    refs_id=batch['ys'],
                    utt_ids=batch['utt_ids'],
                    speakers=batch['sessions' if dataloader.corpus ==
                                   'swbd' else 'speakers'],
                    ensemble_models=models[1:] if len(models) > 1 else [])[0]

            for b in range(len(batch['xs'])):
                ref = batch['text'][b]
                if ref[0] == '<':
                    ref = ref.split('>')[1]
                nbest_hyps = [
                    dataloader.idx2token[0](hyp_id)
                    for hyp_id in nbest_hyps_id[b]
                ]

                # Write to trn
                # speaker = str(batch['speakers'][b]).replace('-', '_')
                if streaming:
                    utt_id = str(batch['utt_ids'][b]) + '_0000000_0000001'
                else:
                    utt_id = str(batch['utt_ids'][b])
                f_ref.write(ref + '\n')
                f_hyp.write(nbest_hyps[0] + '\n')
                logger.debug('utt-id (%d/%d): %s' %
                             (n_utt + 1, len(dataloader), utt_id))
                logger.debug('Ref: %s' % ref)
                logger.debug('Hyp: %s' % nbest_hyps[0])
                logger.debug('-' * 150)

                if not streaming:
                    list_of_references += [[ref.split(' ')]]
                    hypotheses += [nbest_hyps[0].split(' ')]

                    if fine_grained:
                        xlen_bin = (batch['xlens'][b] // 200 + 1) * 200
                        if xlen_bin in hypotheses_dist.keys():
                            list_of_references_dist[xlen_bin] += [[
                                ref.split(' ')
                            ]]
                            hypotheses_dist[xlen_bin] += [hypotheses[-1]]
                        else:
                            list_of_references_dist[xlen_bin] = [[
                                ref.split(' ')
                            ]]
                            hypotheses_dist[xlen_bin] = [hypotheses[-1]]

                    # Compute oracle corpus-level BLEU (selected by sentence-level BLEU)
                    if oracle and len(nbest_hyps) > 1:
                        s_bleus_b = [
                            sentence_bleu([ref.split(' ')], hyp_n.split(' '))
                            for hyp_n in nbest_hyps
                        ]  # sentence_bleu expects a list of references
                        oracle_idx = np.argmax(np.array(s_bleus_b))
                        if oracle_idx == 0:
                            n_oracle_hit += len(batch['utt_ids'])
                        hypotheses_oracle += [
                            nbest_hyps[oracle_idx].split(' ')
                        ]

                n_utt += len(batch['utt_ids'])
                if progressbar:
                    pbar.update(len(batch['utt_ids']))

            if is_new_epoch:
                break

    if progressbar:
        pbar.close()

    # Reset data counters
    dataloader.reset()

    c_bleu = corpus_bleu(list_of_references, hypotheses) * 100

    if not streaming:
        if oracle:
            c_bleu_oracle = corpus_bleu(list_of_references,
                                        hypotheses_oracle) * 100
            oracle_hit_rate = n_oracle_hit * 100 / n_utt
            logger.info('Oracle corpus-level BLEU (%s): %.2f %%' %
                        (dataloader.set, c_bleu_oracle))
            logger.info('Oracle hit rate (%s): %.2f %%' %
                        (dataloader.set, oracle_hit_rate))

        if fine_grained:
            for len_bin, hypotheses_bin in sorted(hypotheses_dist.items(),
                                                  key=lambda x: x[0]):
                c_bleu_bin = corpus_bleu(list_of_references_dist[len_bin],
                                         hypotheses_bin) * 100
                logger.info('  corpus-level BLEU (%s): %.2f %% (%d)' %
                            (dataloader.set, c_bleu_bin, len_bin))

    logger.debug('Corpus-level BLEU (%s): %.2f %%' % (dataloader.set, c_bleu))

    return c_bleu
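A hedged, self-contained sketch of the oracle-selection step in eval_wordpiece_bleu: score every n-best hypothesis with sentence-level BLEU against the reference and keep the best one (the sentences and names below are illustrative only).

import numpy as np
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

ref = 'the cat sat on the mat'.split()
nbest_hyps = ['the cat sat on a mat',
              'a cat is sitting on the mat',
              'the cat sat on the mat']

chencherry = SmoothingFunction()
s_bleus = [sentence_bleu([ref], hyp.split(), smoothing_function=chencherry.method1)
           for hyp in nbest_hyps]
oracle_idx = int(np.argmax(np.array(s_bleus)))
print(oracle_idx, nbest_hyps[oracle_idx])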
Ejemplo n.º 31
0
def validate(val_loader, decoder, criterion_ce, i2w, device, print_freq,
             word_map, current_epoch, break_flag, top_x, smoothing_method,
             print_flag):
    """
    Performs one epoch's validation.
    :param val_loader: DataLoader for validation data.
    :param decoder: decoder model
    :param criterion_ce: cross entropy loss layer
    :param criterion_dis : discriminative loss layer
    :return: BLEU-4 score
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list(
    )  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # Batches
    with torch.no_grad():
        for i, data in enumerate(val_loader):

            if break_flag and i == 5:
                break  # only 5 batches

            print('val i', i)
            imgs, caps, caplens, allcaps = data

            # Move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.to(device)

            scores, caps_sorted, decode_lengths, sort_ind = decoder(
                imgs, caps, caplens)

            # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
            targets = caps_sorted[:, 1:]

            if print_flag:
                print_predictions(scores, targets, i2w)

            # Remove timesteps that we didn't decode at, or are pads
            # pack_padded_sequence is an easy trick to do this
            scores_copy = scores.clone()
            scores, _ = pack_padded_sequence(scores,
                                             decode_lengths,
                                             batch_first=True)
            targets, _ = pack_padded_sequence(targets,
                                              decode_lengths,
                                              batch_first=True)

            # Calculate loss
            loss = criterion_ce(scores, targets)

            # Keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, top_x)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print(
                    'Validation: [{0}/{1}]\t'
                    'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Top-{topx} Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.
                    format(i,
                           len(val_loader),
                           batch_time=batch_time,
                           loss=losses,
                           topx=top_x,
                           top5=top5accs))

            # Store references (true captions), and hypothesis (prediction) for each image
            # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
            # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

            # References
            allcaps = allcaps[
                sort_ind]  # because images were sorted in the decoder
            # DIDEC caps of other participants come here

            # print(allcaps.shape)

            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()

                img_captions = list(
                    map(
                        lambda c: [
                            w for w in c if w not in
                            {word_map['<start>'], word_map['<pad>']}
                        ], img_caps))  # remove <start> and pads

                refs_per_img = []

                for ic in img_captions:
                    if len(ic) > 0:
                        refs_per_img.append(ic)

                references.append(refs_per_img)

            # Hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            # print('scr', scores_copy)
            # print('preds', preds)

            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(
                    preds[j][:decode_lengths[j]]
                )  # remove pads (superfluous <end> tokens remain because of teacher forcing)
            preds = temp_preds
            hypotheses.extend(preds)

            assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores

    #print('refshyps')
    #print(references)
    #print(hypotheses)

    bleu4 = corpus_bleu(references,
                        hypotheses,
                        smoothing_function=smoothing_method)
    bleu4 = round(bleu4, 4)

    print(
        '\n * LOSS - {loss.avg:.3f}, TOP-{topx} ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'
        .format(loss=losses, topx=top_x, top5=top5accs, bleu=bleu4))

    return bleu4, losses
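A toy illustration of the reference/hypothesis nesting that corpus_bleu expects, matching the comment in the loop above: references is a list (one entry per image) of lists of tokenized references, while hypotheses is a flat list of tokenized predictions. The sentences are made up.

from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu

references = [
    [['a', 'cat', 'sits', 'on', 'the', 'mat'],          # ref 1a
     ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']],  # ref 1b
    [['a', 'dog', 'runs', 'in', 'the', 'park']],        # ref 2a
]
hypotheses = [
    ['a', 'cat', 'is', 'on', 'the', 'mat'],   # hyp 1
    ['a', 'dog', 'runs', 'in', 'a', 'park'],  # hyp 2
]
bleu4 = corpus_bleu(references, hypotheses,
                    smoothing_function=SmoothingFunction().method1)
print(round(bleu4, 4))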
Ejemplo n.º 32
0
def evaluate_with_beam(beam_width, data_name, model, encoder, decoder,
                       word_map, word_map_start, word_map_end, rev_word_map):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(CaptionDataset(
        data_folder,
        data_name,
        'TEST',
        transform=transforms.Compose([normalize])),
                                         batch_size=1,
                                         shuffle=True,
                                         num_workers=1,
                                         pin_memory=True)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = list()
    hypotheses = list()
    references_zh = list()
    hypotheses_zh = list()
    # For each image
    for i, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_width))):
        #if i % 1000 != 0: continue
        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(
            image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(
            1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # Decode
        seq, _ = decode_one(decoder, encoder_out, encoder_dim, enc_image_size,
                            word_map_start, word_map_end, beam_width)

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    w for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map[
                            '<pad>']
                    }
                ], img_caps))  # remove <start>, <end> and pads
        #img_captions_zh = [" ".join(rev_word_map[word] for word in sentence) for sentence in img_captions]
        img_captions_zh = [
            " ".join(str(word) for word in sentence)
            for sentence in img_captions
        ]

        references.append(img_captions)
        references_zh.append(img_captions_zh)

        # Hypotheses
        hypothese = [
            w for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ]
        hypotheses.append(hypothese)
        #hypothese_zh = [rev_word_map[item] for item in hypothese]
        hypothese_zh = [str(item) for item in hypothese]
        hypothese_zh = " ".join(hypothese_zh)
        hypotheses_zh.append(hypothese_zh)

        assert len(references) == len(hypotheses)
        assert len(references_zh) == len(hypotheses_zh)

        # Calculate BLEU-4 scores
    bleu1nltk = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    bleu2nltk = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    bleu3nltk = corpus_bleu(references,
                            hypotheses,
                            weights=(0.33, 0.33, 0.33, 0))
    bleu4nltk = corpus_bleu(references,
                            hypotheses,
                            weights=(0.25, 0.25, 0.25, 0.25))

    nlgeval = NLGEval()  # loads the models
    metrics_dict = nlgeval.compute_metrics(references_zh, hypotheses_zh)
    metrics_dict["bleu1nltk"] = bleu1nltk
    metrics_dict["bleu2nltk"] = bleu2nltk
    metrics_dict["bleu3nltk"] = bleu3nltk
    metrics_dict["bleu4nltk"] = bleu4nltk

    # write result to json file
    #print(metrics_dict)
    output = {
        "model": model[15:-8],
        "beam_width": beam_width,
        "scores": metrics_dict
    }

    output_file_name = "../evaluation/" + model[15:-8] + ".txt"
    f = open(output_file_name, "a+")
    f.writelines(json.dumps(str(output)))
    f.writelines("\n")
    f.close()
    return metrics_dict
Ejemplo n.º 33
0
def self_bleu(sents):
    return bleu.corpus_bleu([[s for (j, s) in enumerate(sents) if j != i]
                             for i in range(len(sents))], sents)
Ejemplo n.º 34
0
def validate(val_loader, encoder, decoder, criterion):
    """
    Performs one epoch's validation.

    :param val_loader: DataLoader for validation data.
    :param encoder: encoder model
    :param decoder: decoder model
    :param criterion: loss layer
    :return: BLEU-4 score
    """
    decoder.eval()  # eval mode (no dropout or batchnorm)
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list(
    )  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    # Batches
    for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):

        # Move to device, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)

        # Forward prop.
        if encoder is not None:
            imgs = encoder(imgs)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
            imgs, caps, caplens)

        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = caps_sorted[:, 1:]

        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        scores_copy = scores.clone()
        scores, _ = pack_padded_sequence(scores,
                                         decode_lengths,
                                         batch_first=True)
        targets, _ = pack_padded_sequence(targets,
                                          decode_lengths,
                                          batch_first=True)

        # Calculate loss
        loss = criterion(scores, targets)

        # Add doubly stochastic attention regularization
        loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()

        # Keep track of metrics
        losses.update(loss.item(), sum(decode_lengths))
        top5 = accuracy(scores, targets, 5)
        top5accs.update(top5, sum(decode_lengths))
        batch_time.update(time.time() - start)

        start = time.time()

        if i % print_freq == 0:
            print('Validation: [{0}/{1}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(
                      i,
                      len(val_loader),
                      batch_time=batch_time,
                      loss=losses,
                      top5=top5accs))

        # Store references (true captions), and hypothesis (prediction) for each image
        # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
        # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]

        # References
        allcaps = allcaps[
            sort_ind]  # because images were sorted in the decoder
        for j in range(allcaps.shape[0]):
            img_caps = allcaps[j].tolist()
            img_captions = list(
                map(
                    lambda c: [
                        w for w in c
                        if w not in {word_map['<start>'], word_map['<pad>']}
                    ], img_caps))  # remove <start> and pads
            references.append(img_captions)

        # Hypotheses
        _, preds = torch.max(scores_copy, dim=2)
        preds = preds.tolist()
        temp_preds = list()
        for j, p in enumerate(preds):
            temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads
        preds = temp_preds
        hypotheses.extend(preds)

        assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses)

    print(
        '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'
        .format(loss=losses, top5=top5accs, bleu=bleu4))

    return bleu4
Ejemplo n.º 35
0
def validate(val_loader, encoder, decoder, criterion):

    decoder.eval()
    if encoder is not None:
        encoder.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top5accs = AverageMeter()

    start = time.time()

    references = list()  # true captions for calculating the bleu scores
    hypotheses = list()  # hypotheses (predictions)

    with torch.no_grad():
        # batches
        for i, (imgs, caps, caplens, allcaps) in enumerate(val_loader):

            # move to device, if available
            imgs = imgs.to(device)
            caps = caps.to(device)
            caplens = caplens.unsqueeze(1).to(device)

            # forward prop
            if encoder is not None:
                imgs = encoder(imgs)

            scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
                imgs, caps, caplens)

            targets = caps_sorted[:, 1:]

            scores_copy = scores.clone()
            scores = pack_padded_sequence(scores,
                                          decode_lengths,
                                          batch_first=True).data
            targets = pack_padded_sequence(targets,
                                           decode_lengths,
                                           batch_first=True).data

            # calculate loss
            loss = criterion(scores, targets)

            # doubly stochastic attention regularization
            loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()

            # keep track of metrics
            losses.update(loss.item(), sum(decode_lengths))
            top5 = accuracy(scores, targets, 5)
            top5accs.update(top5, sum(decode_lengths))
            batch_time.update(time.time() - start)

            start = time.time()

            if i % print_freq == 0:
                print(
                    'Validation: [{0}/{1}]\t'
                    'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'.format(
                        i,
                        len(val_loader),
                        batch_time=batch_time,
                        loss=losses,
                        top5=top5accs))

            allcaps = allcaps[sort_ind]
            for j in range(allcaps.shape[0]):
                img_caps = allcaps[j].tolist()
                # TODO: strip special tokens such as <sos> before scoring (see the sketch after this example)
                references.append(img_caps)

            # hypotheses
            _, preds = torch.max(scores_copy, dim=2)
            preds = preds.tolist()
            temp_preds = list()
            for j, p in enumerate(preds):
                temp_preds.append(preds[j][:decode_lengths[j]])  # remove pads

            preds = temp_preds
            hypotheses.extend(preds)

            assert len(references) == len(hypotheses)

        # Calculate BLEU-4 scores
        bleu4 = corpus_bleu(references, hypotheses)

        print(
            '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}\n'
            .format(loss=losses, top5=top5accs, bleu=bleu4))

    return bleu4
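The loop above appends raw caption indices to references and leaves the <sos>-stripping TODO unresolved; below is a hedged sketch of one way to drop special-token ids before scoring. The word_map ids are assumptions chosen for illustration.

def strip_special_tokens(seq, special_ids):
    """Drop <start>/<end>/<pad> indices from a token-id sequence."""
    return [w for w in seq if w not in special_ids]

word_map = {'<pad>': 0, '<start>': 1, '<end>': 2}  # illustrative ids only
special_ids = {word_map['<pad>'], word_map['<start>'], word_map['<end>']}

caption = [1, 15, 7, 42, 2, 0, 0]
print(strip_special_tokens(caption, special_ids))  # -> [15, 7, 42]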
Ejemplo n.º 36
0
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TEST', transform=transforms.Compose([normalize])),
        batch_size=1, shuffle=False, num_workers=1, pin_memory=True)

    # TODO: Batched Beam Search

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = list()
    hypotheses = list()
    #bleu1_list=list()
    #bleu2_list=list()
    #bleu3_list=list()
    #bleu4_list=list()
    #result=[]

    # For each image
    for i, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size
        #print(i)
        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out, h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1), (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            # For the first step, all k points will have the same scores (since same k previous words, h, c)
            if step == 1:
                top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
            else:
                # Unroll and find top scores, and their unrolled indices
                top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

            # Convert unrolled indices to actual indices of scores
            prev_word_inds = top_k_words // vocab_size  # (s) integer division keeps the indices as longs
            next_word_inds = top_k_words % vocab_size  # (s)

            # Add new words to sequences
            seqs = torch.cat([seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [ind for ind, next_word in enumerate(next_word_inds) if
                               next_word != word_map['<end>']]
            complete_inds = list(set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly

            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        i = complete_seqs_scores.index(max(complete_seqs_scores))
        seq = complete_seqs[i]

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(lambda c: [rev_word_map[w] for w in c if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],
                img_caps))  # remove <start>, <end> and pads
        references.append(img_captions)
        #print(allcaps[0])
        #break

        # Hypotheses
        hypotheses.append([rev_word_map[w] for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}])
        
        #result_dict={}
        #result_dict['num']=i
        #result_dict['reference']=img_captions
        #result_dict['hypotheses']=[rev_word_map[w] for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}]
        #result.append(result_dict)
        assert len(references) == len(hypotheses)
        #bleu1 = sentence_bleu(img_captions,[w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],weights=(1, 0, 0, 0))
        #bleu2 = sentence_bleu(img_captions,[w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],weights=(0.5, 0.5, 0, 0))
        #bleu3 = sentence_bleu(img_captions,[w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],weights=(0.33, 0.33, 0.33, 0))
        #bleu4 = sentence_bleu(img_captions,[w for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}],weights=(0.25, 0.25, 0.25, 0.25))
        #print(bleu1)
        #bleu1_list.append(bleu1)
        #bleu2_list.append(bleu2)
        #bleu3_list.append(bleu3)
        #bleu4_list.append(bleu4)
        #print(len(bleu_list))
        #print(img_captions)
        #print([rev_word_map[w] for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']}]) 
        
        
    
    #bleu_dict={}
    #bleu_dict['bleu1']=bleu1_list
    #bleu_dict['bleu2']=bleu2_list
    #bleu_dict['bleu3']=bleu3_list
    #bleu_dict['bleu4']=bleu4_list
    #print(len(references))
    #print(len(hypotheses))
    #with open('./result_hypothese.json','w') as f:
    #   json.dump(result,f)
    #=========matplotlib============
    #print(len(bleu_list))
    #print(bleu_list)
    #plt.hist(bleu_list,bins=20,normed=0,facecolor='blue',edgecolor='black')
    #plt.bar(range(len(bleu_list)),bleu_list,fc='b')
    #plt.plot(range(len(bleu_list)),bleu_list)
    #plt.xlim((0,5000))
    #plt.ylim((0,1))
    #plt.xticks(np.arange(0,5000,1000))
    #plt.yticks(np.arange(0,1,0.1))
    #plt.xlabel('image')
    #plt.ylabel('bleu 1 scores')
    #plt.title('bleu 1')
    #plt.show()
    #sns.distplot(bleu1_list)
    #plt.show()



    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses,weights=(0.25,0.25,0.25,0.25),emulate_multibleu=True)
    bleu3 = corpus_bleu(references, hypotheses,weights=(0.33,0.33,0.33,0),emulate_multibleu=True)
    bleu2 = corpus_bleu(references, hypotheses,weights=(0.5, 0.5, 0, 0),emulate_multibleu=True)
    bleu1 = corpus_bleu(references, hypotheses,weights=(1, 0, 0, 0),emulate_multibleu=True)
    return bleu4,bleu3,bleu2,bleu1
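A small, self-contained sketch of the index arithmetic used in the beam search above: after topk over the flattened (k * vocab_size) score matrix, integer division recovers the source beam and the modulo recovers the vocabulary id. The tensors below are random placeholders.

import torch

vocab_size = 7
k = 3
scores = torch.randn(k, vocab_size)  # cumulative log-probs per (beam, word)

top_k_scores, top_k_words = scores.view(-1).topk(k)  # indices into the flattened vector
prev_word_inds = top_k_words // vocab_size  # which beam each candidate extends
next_word_inds = top_k_words % vocab_size   # which vocabulary id it appends

print(prev_word_inds.tolist(), next_word_inds.tolist())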
Ejemplo n.º 37
0
        sources = Variable(sources.cuda(), volatile=True)
        M1.zero_grad()
        M2.zero_grad()
        logits = M1(sources, None)
        logits = torch.max(logits.data.cpu(), 2)[1]
        logits = [list(x) for x in logits]
        hyp = [x[:x.index(1)] if 1 in x else x for x in logits]
        hyp = [[DS1.vocab[x] for x in y] for y in hyp]
        inters.extend(hyp)
        sources2 = (hyp, targets)
        s2, _ = DS2.pad_batch(sources2, targ=False)
        s2 = Variable(s2.cuda(), volatile=True)
        logits2 = M2(s2, None)
        logits2 = torch.max(logits2.data.cpu(), 2)[1]
        logits2 = [list(x) for x in logits2]
        hyp2 = [x[:x.index(1)] if 1 in x else x for x in logits2]
        hyp2 = [[DS2.vocab[x] for x in y] for y in hyp2]
        hyps.extend(hyp2)
        refs.extend(targets)
    bleu = corpus_bleu(refs,
                       hyps,
                       emulate_multibleu=True,
                       smoothing_function=cc.method3)
    print(bleu)
    with open(args.savestr + "hyps", 'w') as f:
        hyps = [' '.join(x) for x in hyps]
        f.write('\n'.join(hyps))
    with open(args.savestr + "refs", 'w') as f:
        refs = ['\t'.join([' '.join(x) for x in y]) for y in refs]
        f.write('\n'.join(refs))
Ejemplo n.º 38
0
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data()
    cn2idx, idx2cn = load_cn_vocab()
    en2idx, idx2en = load_en_vocab()

    #     X, Sources, Targets = X[:33], Sources[:33], Targets[:33]

    # Start session
    with g.graph.as_default():
        # A training helper that checkpoints models and computes summaries.
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            mname = open(hp.logdir + '\\checkpoint',
                         'r').read().split('"')[1]  # model name
            mname = re.findall(r'results(.*)', mname)[0]

            ## Inference
            result_trans = hp.logdir + '\\translation'
            if not os.path.exists(result_trans):
                os.mkdir(result_trans)
            with codecs.open(result_trans + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                if len(X) // hp.batch_size == 0:
                    iteration = 1
                else:
                    iteration = len(X) // hp.batch_size
                for i in range(iteration):
                    if iteration == 1:
                        x = X
                        sources = Sources
                        targets = Targets
                        preds = np.zeros((len(Sources), hp.maxlen), np.int32)
                    else:
                        ### Get mini-batches
                        x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                        sources = Sources[i * hp.batch_size:(i + 1) *
                                          hp.batch_size]
                        targets = Targets[i * hp.batch_size:(i + 1) *
                                          hp.batch_size]
                        ### Autoregressive inference
                        preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]
                    ### Write to file
                    for source, target, pred in zip(sources, targets,
                                                    preds):  # sentence-wise
                        got = " ".join(
                            idx2en[idx]
                            for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source + "\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()
                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                ## Calculate bleu score
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100 * score))
Ejemplo n.º 39
0
def per_item_dialog_bleu(y_true, y_predicted):
    y_true = (y['text'] for dialog in y_true for y in dialog)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
Ejemplo n.º 40
0
            if di != 0:
                answer += output_lang.index2word[topi.item()]
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])

            if decoder_input.item() == EOS_token:
                break

    target = [output_lang.index2word[i.item()] for i in target_tensor]
    loss.backward()

    bleu_score = corpus_bleu([target], [list(answer)], weights=(1, 0, 0, 0))

    target = ''.join(target)
    # if '!' == target[-1]:
    #     target = target[:-1]
    ind_acc = 0

    # if '!' == target[-1]:
    #     target = target[:-1]
    # if '?' in answer:
    #     target = target[:target.find('?')]
    # if '!' == answer[-1]:
    #     answer = answer[:-1]

    acc = 1 if target == answer else 0
Ejemplo n.º 41
0
def bleu(y_true, y_predicted):
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
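A toy usage sketch of the wrapper above (re-declared here so the snippet runs standalone): it lower-cases and whitespace-tokenizes both sides and assumes exactly one reference per prediction. The strings are made up.

from nltk.translate.bleu_score import corpus_bleu

def bleu(y_true, y_predicted):
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])

y_true = ['The cat sat on the mat .', 'A dog ran in the park .']
y_pred = ['the cat sat on the mat .', 'a dog ran around the park .']
print(round(bleu(y_true, y_pred), 4))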
Ejemplo n.º 42
0
def analyze_decode_results(dataset, decode_results, verbose=True):
    from lang.py.parse import tokenize_code, de_canonicalize_code
    # tokenize_code = tokenize_for_bleu_eval
    import ast
    assert dataset.count == len(decode_results)

    f = f_decode = None
    if verbose:
        f = open(dataset.name + '.exact_match', 'w')
        exact_match_ids = []
        f_decode = open(dataset.name + '.decode_results.txt', 'w')
        eid_to_annot = dict()

        if config.data_type == 'django':
            for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)):
                eid_to_annot[raw_id] = line.strip()

        f_bleu_eval_ref = open(dataset.name + '.ref', 'w')
        f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w')

        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_oracle_bleu = 0.0
    cum_oracle_acc = 0.0
    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    all_references = []
    all_predictions = []

    if all(len(cand) == 0 for cand in decode_results):
        logging.error('Empty decoding results for the current dataset!')
        return -1, -1

    binned_results_dict = defaultdict(list)
    def get_binned_key(ast_size):
        cutoff = 50 if config.data_type == 'django' else 250
        k = 10 if config.data_type == 'django' else 25 # for hs

        if ast_size >= cutoff:
            return '%d - inf' % cutoff

        lower = int(ast_size / k) * k
        upper = lower + k

        key = '%d - %d' % (lower, upper)

        return key


    for eid in range(dataset.count):
        example = dataset.examples[eid]
        ref_code = example.code
        ref_ast_tree = ast.parse(ref_code).body[0]
        refer_source = astor.to_source(ref_ast_tree).strip()
        # refer_source = ref_code
        refer_tokens = tokenize_code(refer_source)
        cur_example_acc = 0.0

        decode_cands = decode_results[eid]
        if len(decode_cands) == 0:
            continue

        decode_cand = decode_cands[0]

        cid, cand, ast_tree, code = decode_cand
        code = astor.to_source(ast_tree).strip()

        # simple_url_2_re = re.compile('_STR:0_', re.))
        try:
            predict_tokens = tokenize_code(code)
        except:
            logging.error('error in tokenizing [%s]', code)
            continue

        if refer_tokens == predict_tokens:
            cum_acc += 1
            cur_example_acc = 1.0

            if verbose:
                exact_match_ids.append(example.raw_id)
                f.write('-' * 60 + '\n')
                f.write('example_id: %d\n' % example.raw_id)
                f.write(code + '\n')
                f.write('-' * 60 + '\n')

        if config.data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].iteritems():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif config.data_type == 'hs':
            ref_code_for_bleu = ref_code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu)

        all_references.append([refer_tokens_for_bleu])
        all_predictions.append(pred_tokens_for_bleu)

        # try:
        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3)
        cum_bleu += bleu_score
        # except:
        #    pass

        if verbose:
            print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score)

            f_decode.write('-' * 60 + '\n')
            f_decode.write('example_id: %d\n' % example.raw_id)
            f_decode.write('intent: \n')

            if config.data_type == 'django':
                f_decode.write(eid_to_annot[example.raw_id] + '\n')
            elif config.data_type == 'hs':
                f_decode.write(' '.join(example.query) + '\n')

            f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n')
            f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n')

            f_decode.write('canonicalized reference: \n')
            f_decode.write(refer_source + '\n')
            f_decode.write('canonicalized prediction: \n')
            f_decode.write(code + '\n')
            f_decode.write('reference code for bleu calculation: \n')
            f_decode.write(ref_code_for_bleu + '\n')
            f_decode.write('predicted code for bleu calculation: \n')
            f_decode.write(pred_code_for_bleu + '\n')
            f_decode.write('pred_shorter_than_ref: %s\n' % shorter)
            # f_decode.write('weired: %s\n' % weired)
            f_decode.write('-' * 60 + '\n')

        # compute oracle
        best_bleu_score = 0.
        cur_oracle_acc = 0.
        for decode_cand in decode_cands[:config.beam_size]:
            cid, cand, ast_tree, code = decode_cand
            try:
                code = astor.to_source(ast_tree).strip()
                predict_tokens = tokenize_code(code)

                if predict_tokens == refer_tokens:
                    cur_oracle_acc = 1.

                if config.data_type == 'django':
                    pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
                    # convert canonicalized code to raw code
                    for literal, place_holder in example.meta_data['str_map'].iteritems():
                        pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                elif config.data_type == 'hs':
                    pred_code_for_bleu = code

                # we apply Ling Wang's trick when evaluating BLEU scores
                pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

                ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
                cand_bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                                weights=ngram_weights,
                                                smoothing_function=sm.method3)

                if cand_bleu_score > best_bleu_score:
                    best_bleu_score = cand_bleu_score

            except:
                continue

        cum_oracle_bleu += best_bleu_score
        cum_oracle_acc += cur_oracle_acc

        ref_ast_size = example.parse_tree.size
        binned_key = get_binned_key(ref_ast_size)
        binned_results_dict[binned_key].append((bleu_score, cur_example_acc, best_bleu_score, cur_oracle_acc))

    cum_bleu /= dataset.count
    cum_acc /= dataset.count
    cum_oracle_bleu /= dataset.count
    cum_oracle_acc /= dataset.count

    logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3))
    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
    logging.info('oracle bleu: %f', cum_oracle_bleu)
    logging.info('oracle accuracy: %f', cum_oracle_acc)

    keys = sorted(binned_results_dict, key=lambda x: int(x.split(' - ')[0]))

    Y = [[], [], [], []]
    X = []

    for binned_key in keys:
        entry = binned_results_dict[binned_key]
        avg_bleu = np.average([t[0] for t in entry])
        avg_acc = np.average([t[1] for t in entry])
        avg_oracle_bleu = np.average([t[2] for t in entry])
        avg_oracle_acc = np.average([t[3] for t in entry])
        print binned_key, avg_bleu, avg_acc, avg_oracle_bleu, avg_oracle_acc, len(entry)

        Y[0].append(avg_bleu)
        Y[1].append(avg_acc)
        Y[2].append(avg_oracle_bleu)
        Y[3].append(avg_oracle_acc)

        X.append(int(binned_key.split(' - ')[0]))

    import matplotlib.pyplot as plt
    from pylab import rcParams
    rcParams['figure.figsize'] = 6, 2.5

    if config.data_type == 'django':
        fig, ax = plt.subplots()
        ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2)
        # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2)
        ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2)
        # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2)
        ax.set_ylabel('Performance')
        ax.set_xlabel('Reference AST Size (# nodes)')
        plt.legend(loc='upper right', ncol=6)
        plt.tight_layout()
        # plt.savefig('django_acc_ast_size.pdf', dpi=300)
        # os.system('pcrop.sh django_acc_ast_size.pdf')
        plt.savefig('django_perf_ast_size.pdf', dpi=300)
        os.system('pcrop.sh django_perf_ast_size.pdf')
    else:
        fig, ax = plt.subplots()
        ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2)
        # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2)
        ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2)
        # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2)
        ax.set_ylabel('Performance')
        ax.set_xlabel('Reference AST Size (# nodes)')
        plt.legend(loc='upper right', ncol=6)
        plt.tight_layout()
        # plt.savefig('hs_bleu_ast_size.pdf', dpi=300)
        # os.system('pcrop.sh hs_bleu_ast_size.pdf')
        plt.savefig('hs_perf_ast_size.pdf', dpi=300)
        os.system('pcrop.sh hs_perf_ast_size.pdf')
    if verbose:
        f.write(', '.join(str(i) for i in exact_match_ids))
        f.close()
        f_decode.close()

        f_bleu_eval_ref.close()
        f_bleu_eval_hyp.close()

    return cum_bleu, cum_acc
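A standalone sketch of the AST-size binning performed by get_binned_key above, with the django settings (cutoff 50, bin width 10) hard-coded instead of read from config:

def get_binned_key(ast_size, cutoff=50, k=10):
    if ast_size >= cutoff:
        return '%d - inf' % cutoff
    lower = int(ast_size / k) * k
    return '%d - %d' % (lower, lower + k)

for size in (3, 17, 49, 50, 212):
    print(size, get_binned_key(size))  # 0 - 10, 10 - 20, 40 - 50, 50 - inf, 50 - inf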
Ejemplo n.º 43
0
def validation(beam_size):
    """
    Evaluation Process
    
    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """

    references = list(
    )  # references (true captions) for calculating BLEU-4 score
    hypotheses = list()  # hypotheses (predictions)

    start_time = datetime.datetime.now()
    print("Start training at: ", start_time)

    for j, (images, captions, caplens, all_caps) in enumerate(val_loader):
        k = beam_size
        start = datetime.datetime.now()

        images = images.to(device)

        # Forward
        encoder_out = encoder(images)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(-1)
        ## Flatten encoding
        encoder_out = encoder_out.view(
            batch_size, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)
        ## We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(k, num_pixels, encoder_dim)
        ## Tensor to store top k previous words at each step; now they're just <start>
        prev_words = torch.LongTensor([[word_map["<start>"]]] * k).to(
            device)  # (k, 1)
        ## Tensor to store top k sequences; now they're just <start>
        seqs = prev_words  # (k, 1)
        ## Tensor to store top k sequences' scores; now they're just 0
        seqs_scores = torch.zeros([k, 1]).to(device)  # (k, 1)

        # Initialize lists
        complete_seqs = []
        complete_scores = []

        print("start decode")
        # Decode
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)  # (k, decoder_dim)
        ## Iterate until all k sequences are completed
        while True:
            # Compute scores of current k previous words
            embeddings = decoder.embedding(prev_words).squeeze(
                1)  # (k, 1, embed_dim) to (k, embed_dim)
            h, c = decoder.decode_step(
                embeddings,  # (k, embed_dim)
                (h, c))  # (k, decoder_dim)
            scores = decoder.fc(decoder.dropout(h))  # (k, vocab_size)
            scores = F.log_softmax(scores, dim=1)  # scores is (k, vocab_size); softmax over the vocab dim

            # Add (i.e. multiply because of 'log' above) to current scores
            scores = seqs_scores.expand_as(scores) + scores
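            # (Illustration: because the scores are log-probabilities, this addition
            #  accumulates log P(w_1..w_t, w) = log P(w_1..w_t) + log P(w | w_1..w_t),
            #  i.e. each candidate's running total log-probability.)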
            # Take the maximum k elements in (k * vocab_size) combinations
            if step == 1:  ## Initialize
                top_scores, top_k_locations = scores[0].topk(k, 0, True, True)
            else:
                top_scores, top_k_locations = scores.view(-1).topk(
                    k, 0, True, True)
            # Row and Column indices of k largest elements
            top_k_prev_ind = top_k_locations // vocab_size  # (k, 1)
            top_k_next_ind = top_k_locations % vocab_size  # (k, 1)
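            # (Worked example with made-up numbers: if vocab_size == 5 and a flat top-k
            #  index is 12, then 12 // 5 == 2 picks beam row 2 and 12 % 5 == 2 picks
            #  word id 2, recovering the (row, column) pair from the flattened view.)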

            # Update sequences
            seqs = torch.cat(
                [seqs[top_k_prev_ind],
                 top_k_next_ind.unsqueeze(1)], dim=1)  # (k, step+1)

            # Check whether a sequence is completed
            comp_seqs_ind = [
                j for j, next_word in enumerate(top_k_next_ind)
                if next_word == word_map["<end>"]
            ]
            incomp_seqs_ind = list(
                set(range(seqs.size(0))) - set(comp_seqs_ind))

            # Deal with completed sequences
            if len(comp_seqs_ind) > 0:
                complete_seqs.extend(seqs[comp_seqs_ind].tolist())
                complete_scores.extend(seqs_scores[comp_seqs_ind])
            k -= len(comp_seqs_ind)  # reduce beam length

            # Deal with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomp_seqs_ind]
            h = h[top_k_prev_ind[incomp_seqs_ind]]
            c = c[top_k_prev_ind[incomp_seqs_ind]]
            encoder_out = encoder_out[top_k_prev_ind[incomp_seqs_ind]]
            #seqs_scores = seqs_scores[incomp_seqs_ind].unsqueeze(1)
            seqs_scores = seqs_scores[incomp_seqs_ind]
            #prev_words = top_k_next_ind[incomp_seqs_ind].unsqueeze(1)
            prev_words = top_k_next_ind[incomp_seqs_ind]

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        max_i = np.argmax(complete_scores)
        #max_i = complete_scores.index(max(complete_scores))
        max_seq = complete_seqs[max_i]

        ## Store references (true captions), and hypothesis (prediction) for each image
        ## If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
        ## references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
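        ## For illustration only (hypothetical data, shown as words for readability;
        ## the lists below actually hold word ids):
        ##   references = [[['a', 'dog', 'runs'], ['a', 'dog', 'is', 'running']],  # image 1
        ##                 [['two', 'cats', 'sleep']]]                             # image 2
        ##   hypotheses = [['a', 'dog', 'runs'], ['two', 'cats', 'sleeping']]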

        # references
        # all_caps = all_caps[sort_ind]
        print("reference")
        for k in range(all_caps.shape[0]):
            img_caps = all_caps[k].tolist()
            img_captions = list(
                map(
                    lambda c: [
                        w for w in c
                        if w not in {word_map["<start>"], word_map["<pad>"]}
                    ], img_caps))
            references.append(img_captions)

        # hypotheses
        hypotheses.append([
            w for w in max_seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])

        assert len(references) == len(hypotheses)

    ## Compute BLEU-4 Scores
    bleu4 = corpus_bleu(references, hypotheses)

    return bleu4
Ejemplo n.º 44
0
def per_item_bleu(y_true, y_predicted):
    y_predicted = itertools.chain(*y_predicted)
    return corpus_bleu([[y_t.lower().split()] for y_t in y_true],
                       [y_p.lower().split() for y_p in y_predicted])
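
# A hedged usage sketch (the data below is illustrative, not from the original
# code): y_true is a flat list of reference strings, while y_predicted arrives
# batched as a list of lists; itertools.chain flattens it so the i-th hypothesis
# lines up with the i-th reference before corpus_bleu is called.
# per_item_bleu(['the cat sat .', 'dogs bark .'],
#               [['the cat sat .'], ['dogs bark .']])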
Ejemplo n.º 45
0
def analyze_decode_results(dataset, decode_results, verbose=True):
    from lang.py.parse import tokenize_code, de_canonicalize_code
    # tokenize_code = tokenize_for_bleu_eval
    import ast
    assert dataset.count == len(decode_results)

    f = f_decode = None
    if verbose:
        f = open(dataset.name + '.exact_match', 'w')
        exact_match_ids = []
        f_decode = open(dataset.name + '.decode_results.txt', 'w')
        eid_to_annot = dict()

        if data_type == 'django':
            for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)):
                eid_to_annot[raw_id] = line.strip()

        f_bleu_eval_ref = open(dataset.name + '.ref', 'w')
        f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w')

        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_oracle_bleu = 0.0
    cum_oracle_acc = 0.0
    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    all_references = []
    all_predictions = []

    if all(len(cand) == 0 for cand in decode_results):
        logging.error('Empty decoding results for the current dataset!')
        return -1, -1

    binned_results_dict = defaultdict(list)
    def get_binned_key(ast_size):
        cutoff = 50 if data_type == 'django' else 250
        k = 10 if data_type == 'django' else 25 # for hs

        if ast_size >= cutoff:
            return '%d - inf' % cutoff

        lower = int(ast_size / k) * k
        upper = lower + k

        key = '%d - %d' % (lower, upper)

        return key
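
    # For example, with data_type == 'django' (k == 10, cutoff == 50):
    # get_binned_key(37) returns '30 - 40', while get_binned_key(73) returns '50 - inf'.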


    for eid in range(dataset.count):
        example = dataset.examples[eid]
        ref_code = example.code
        ref_ast_tree = ast.parse(ref_code).body[0]
        refer_source = astor.to_source(ref_ast_tree).strip()
        # refer_source = ref_code
        refer_tokens = tokenize_code(refer_source)
        cur_example_acc = 0.0

        decode_cands = decode_results[eid]
        if len(decode_cands) == 0:
            continue

        decode_cand = decode_cands[0]

        cid, cand, ast_tree, code = decode_cand
        code = astor.to_source(ast_tree).strip()

        # simple_url_2_re = re.compile('_STR:0_', re.))
        try:
            predict_tokens = tokenize_code(code)
        except:
            logging.error('error in tokenizing [%s]', code)
            continue

        if refer_tokens == predict_tokens:
            cum_acc += 1
            cur_example_acc = 1.0

            if verbose:
                exact_match_ids.append(example.raw_id)
                f.write('-' * 60 + '\n')
                f.write('example_id: %d\n' % example.raw_id)
                f.write(code + '\n')
                f.write('-' * 60 + '\n')

        if data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].iteritems():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif data_type == 'hs':
            ref_code_for_bleu = ref_code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu)

        all_references.append([refer_tokens_for_bleu])
        all_predictions.append(pred_tokens_for_bleu)

        # try:
        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3)
        cum_bleu += bleu_score
        # except:
        #    pass

        if verbose:
            print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score)

            f_decode.write('-' * 60 + '\n')
            f_decode.write('example_id: %d\n' % example.raw_id)
            f_decode.write('intent: \n')

            if data_type == 'django':
                f_decode.write(eid_to_annot[example.raw_id] + '\n')
            elif data_type == 'hs':
                f_decode.write(' '.join(example.query) + '\n')

            f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n')
            f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n')

            f_decode.write('canonicalized reference: \n')
            f_decode.write(refer_source + '\n')
            f_decode.write('canonicalized prediction: \n')
            f_decode.write(code + '\n')
            f_decode.write('reference code for bleu calculation: \n')
            f_decode.write(ref_code_for_bleu + '\n')
            f_decode.write('predicted code for bleu calculation: \n')
            f_decode.write(pred_code_for_bleu + '\n')
            f_decode.write('pred_shorter_than_ref: %s\n' % shorter)
            # f_decode.write('weired: %s\n' % weired)
            f_decode.write('-' * 60 + '\n')

        # compute oracle
        best_bleu_score = 0.
        cur_oracle_acc = 0.
        for cid, cand, ast_tree, code in decode_cands:
            try:
                code = astor.to_source(ast_tree).strip()
                predict_tokens = tokenize_code(code)

                if predict_tokens == refer_tokens:
                    cur_oracle_acc = 1.

                if data_type == 'django':
                    pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
                    # convert canonicalized code to raw code
                    for literal, place_holder in example.meta_data['str_map'].iteritems():
                        pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                elif data_type == 'hs':
                    pred_code_for_bleu = code

                # we apply Ling Wang's trick when evaluating BLEU scores
                pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

                ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
                cand_bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                                weights=ngram_weights,
                                                smoothing_function=sm.method3)

                if cand_bleu_score > best_bleu_score:
                    best_bleu_score = cand_bleu_score

            except:
                continue

        cum_oracle_bleu += best_bleu_score
        cum_oracle_acc += cur_oracle_acc

        ref_ast_size = example.parse_tree.size
        binned_key = get_binned_key(ref_ast_size)
        binned_results_dict[binned_key].append((bleu_score, cur_example_acc, best_bleu_score, cur_oracle_acc))

    cum_bleu /= dataset.count
    cum_acc /= dataset.count
    cum_oracle_bleu /= dataset.count
    cum_oracle_acc /= dataset.count

    logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3))
    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
    logging.info('oracle bleu: %f', cum_oracle_bleu)
    logging.info('oracle accuracy: %f', cum_oracle_acc)

    keys = sorted(binned_results_dict, key=lambda x: int(x.split(' - ')[0]))

    Y = [[], [], [], []]
    X = []

    for binned_key in keys:
        entry = binned_results_dict[binned_key]
        avg_bleu = np.average([t[0] for t in entry])
        avg_acc = np.average([t[1] for t in entry])
        avg_oracle_bleu = np.average([t[2] for t in entry])
        avg_oracle_acc = np.average([t[3] for t in entry])
        print binned_key, avg_bleu, avg_acc, avg_oracle_bleu, avg_oracle_acc, len(entry)

        Y[0].append(avg_bleu)
        Y[1].append(avg_acc)
        Y[2].append(avg_oracle_bleu)
        Y[3].append(avg_oracle_acc)

        X.append(int(binned_key.split(' - ')[0]))

    import matplotlib.pyplot as plt
    from pylab import rcParams
    rcParams['figure.figsize'] = 6, 2.5

    if data_type == 'django':
        fig, ax = plt.subplots()
        ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2)
        # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2)
        ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2)
        # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2)
        ax.set_ylabel('Performance')
        ax.set_xlabel('Reference AST Size (# nodes)')
        plt.legend(loc='upper right', ncol=6)
        plt.tight_layout()
        # plt.savefig('django_acc_ast_size.pdf', dpi=300)
        # os.system('pcrop.sh django_acc_ast_size.pdf')
        plt.savefig('django_perf_ast_size.pdf', dpi=300)
        os.system('pcrop.sh django_perf_ast_size.pdf')
    else:
        fig, ax = plt.subplots()
        ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2)
        # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2)
        ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2)
        # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2)
        ax.set_ylabel('Performance')
        ax.set_xlabel('Reference AST Size (# nodes)')
        plt.legend(loc='upper right', ncol=6)
        plt.tight_layout()
        # plt.savefig('hs_bleu_ast_size.pdf', dpi=300)
        # os.system('pcrop.sh hs_bleu_ast_size.pdf')
        plt.savefig('hs_perf_ast_size.pdf', dpi=300)
        os.system('pcrop.sh hs_perf_ast_size.pdf')
    if verbose:
        f.write(', '.join(str(i) for i in exact_match_ids))
        f.close()
        f_decode.close()

        f_bleu_eval_ref.close()
        f_bleu_eval_hyp.close()

    return cum_bleu, cum_acc
Ejemplo n.º 46
0
from nltk.translate import bleu_score

# Use sentence_bleu to evaluate single-sentence paraphrases.
# reference = a known-good translation in the target language
reference = 'The king is staying up all night drinking and dancing'.split(' ')
# hypothesis = the system's translation in the target language
hypothesis1 = 'The king doth wake tonight and takes his rouse'.split(' ')
hypothesis2 = 'The king stays up tonight and takes his rouse'.split(' ')
hypothesis3 = 'The king stays up tonight drinking and dancing'.split(' ')
hypothesis4 = 'The king stays up all night drinking and dancing'.split(' ')

for hyp in [hypothesis1, hypothesis2, hypothesis3, hypothesis4, reference]:
	print bleu_score.sentence_bleu([reference], hyp)
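
# An optional sketch, not part of the original example: short or divergent
# hypotheses can have zero higher-order n-gram matches, which pushes BLEU to 0
# and triggers a warning. Passing one of NLTK's SmoothingFunction methods
# (method1 here, chosen purely for illustration) smooths those zero counts.
smoothie = bleu_score.SmoothingFunction().method1
for hyp in [hypothesis1, hypothesis2, hypothesis3, hypothesis4, reference]:
	print bleu_score.sentence_bleu([reference], hyp, smoothing_function=smoothie)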


# Use corpus_bleu to evaluate multi-sentence paraphrases.
reference1 = 'the musicians make a ruckus to celebrate his draining another cup.'.split(' ')
hypothesis1_1 = 'The kettle-drum and trumpet thus bray out The triumph of his pledge.'.split(' ')

print bleu_score.corpus_bleu(
	[[reference], [reference1]],  # list of references for each sentence in the corpus
	[hypothesis1, hypothesis1_1]) # 1 hypothesis for each sentence in the corpus 
Ejemplo n.º 47
0
def evaluate_decode_results(data_type, dataset, decode_results, verbose=True):
    from lang.py.parse import tokenize_code, de_canonicalize_code
    # tokenize_code = tokenize_for_bleu_eval
    import ast
    assert dataset.count == len(decode_results)

    f = f_decode = None
    if verbose:
        f = open(dataset.name + '.exact_match', 'w')
        exact_match_ids = []
        f_decode = open(dataset.name + '.decode_results.txt', 'w')
        eid_to_annot = dict()

        if data_type == 'django':
            for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)):
                eid_to_annot[raw_id] = line.strip()

        f_bleu_eval_ref = open(dataset.name + '.ref', 'w')
        f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w')
        f_generated_code = open(dataset.name + '.generated_code', 'w')

        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_oracle_bleu = 0.0
    cum_oracle_acc = 0.0
    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    all_references = []
    all_predictions = []

    for eid in range(dataset.count):
        example = dataset.examples[eid]
        ref_code = example.code
        ref_ast_tree = ast.parse(ref_code).body[0]
        refer_source = astor.to_source(ref_ast_tree).strip()
        # refer_source = ref_code
        refer_tokens = tokenize_code(refer_source)
        cur_example_correct = False

        ast_tree = decode_results[eid]
        code = astor.to_source(ast_tree).strip()

        # simple_url_2_re = re.compile('_STR:0_', re.))
        try:
            predict_tokens = tokenize_code(code)
        except:
            logging.error('error in tokenizing [%s]', code)
            continue

        if refer_tokens == predict_tokens:
            cum_acc += 1
            cur_example_correct = True

            if verbose:
                exact_match_ids.append(example.raw_id)
                f.write('-' * 60 + '\n')
                f.write('example_id: %d\n' % example.raw_id)
                f.write(code + '\n')
                f.write('-' * 60 + '\n')

        if data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].iteritems():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif data_type == 'hs':
            ref_code_for_bleu = ref_code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        # The if-chunk below is for debugging purposes: sometimes the reference does not match the prediction
        # because of inconsistent quotes (e.g., single quotes in the reference, double quotes in the prediction).
        # Most of these cases are resolved by canonicalizing the reference code using astor (parse the reference
        # into an AST and regenerate the code, then use the regenerated code as the reference).
        weired = False
        if refer_tokens_for_bleu == pred_tokens_for_bleu and refer_tokens != predict_tokens:
            # cum_acc += 1
            weired = True
        elif refer_tokens == predict_tokens:
            # weired!
            # weired = True
            pass

        shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu)

        all_references.append([refer_tokens_for_bleu])
        all_predictions.append(pred_tokens_for_bleu)

        # try:
        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3)
        cum_bleu += bleu_score
        # except:
        #    pass

        if verbose:
            print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score)

            f_decode.write('-' * 60 + '\n')
            f_decode.write('example_id: %d\n' % example.raw_id)
            f_decode.write('intent: \n')

            if data_type == 'django':
                f_decode.write(eid_to_annot[example.raw_id] + '\n')
            elif data_type == 'hs':
                f_decode.write(' '.join(example.query) + '\n')

            f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n')
            f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n')

            f_decode.write('canonicalized reference: \n')
            f_decode.write(refer_source + '\n')
            f_decode.write('canonicalized prediction: \n')
            f_decode.write(code + '\n')
            f_decode.write('reference code for bleu calculation: \n')
            f_decode.write(ref_code_for_bleu + '\n')
            f_decode.write('predicted code for bleu calculation: \n')
            f_decode.write(pred_code_for_bleu + '\n')
            f_decode.write('pred_shorter_than_ref: %s\n' % shorter)
            f_decode.write('weired: %s\n' % weired)
            f_decode.write('-' * 60 + '\n')

            # for Hiro's evaluation
            f_generated_code.write(pred_code_for_bleu.replace('\n', '#NEWLINE#') + '\n')


        # compute oracle
        best_score = 0.
        cur_oracle_acc = 0.
        for ast_tree in decode_results:

            try:
                code = astor.to_source(ast_tree).strip()
                predict_tokens = tokenize_code(code)

                if predict_tokens == refer_tokens:
                    cur_oracle_acc = 1

                if data_type == 'django':
                    pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
                    # convert canonicalized code to raw code
                    for literal, place_holder in example.meta_data['str_map'].iteritems():
                        pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                elif data_type == 'hs':
                    pred_code_for_bleu = code

                # we apply Ling Wang's trick when evaluating BLEU scores
                pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

                ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
                bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                           weights=ngram_weights,
                                           smoothing_function=sm.method3)

                if bleu_score > best_score:
                    best_score = bleu_score

            except:
                continue

        cum_oracle_bleu += best_score
        cum_oracle_acc += cur_oracle_acc

    cum_bleu /= dataset.count
    cum_acc /= dataset.count
    cum_oracle_bleu /= dataset.count
    cum_oracle_acc /= dataset.count

    logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3))
    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
    logging.info('oracle bleu: %f', cum_oracle_bleu)
    logging.info('oracle accuracy: %f', cum_oracle_acc)

    if verbose:
        f.write(', '.join(str(i) for i in exact_match_ids))
        f.close()
        f_decode.close()

        f_bleu_eval_ref.close()
        f_bleu_eval_hyp.close()
        f_generated_code.close()
    print cum_bleu, cum_acc
    return cum_bleu, cum_acc
Ejemplo n.º 48
0
for line in f:
    vocab.append(line.rstrip('\n'))
f.close()

stringSentences = []
for sentence in sentences:
    stringSentence = []
    for wordIndex in sentence:
        stringSentence.append(vocab[wordIndex-1])
    stringSentences.append(stringSentence)

#stringSentences = stringSentences[:1790] + stringSentences[2100:]
#hypotheses = hypotheses[:1790] + hypotheses[2100:]

stringSentencesDev = stringSentences[2000:] 
hypothesesDev = hypotheses[2000:] 

stringSentencesTest = stringSentences[2000:] 
hypothesesTest = hypotheses[2000:] 

"""
scores = []
for i,j in zip(stringSentences, hypotheses):
    references = [i]
    scores.append(bleu_score.sentence_bleu(references, j))
average = np.average(np.array(scores))
#print bleu_score.corpus_bleu(stringSentences, hypotheses)
print average
"""
# corpus_bleu expects one list of references per hypothesis, so wrap each reference.
print bleu_score.corpus_bleu([[ref] for ref in stringSentencesDev], hypothesesDev)
Ejemplo n.º 49
0
args = parser.parse_args()

wer_score_file = os.path.dirname(args.input_1) + "/" + os.path.splitext(
    os.path.basename(args.input_1))[0] + "_evaluated_WER_and_BLEU.txt"
hypotheses = args.input_1
target = args.input_2

import nltk
from nltk.translate.bleu_score import corpus_bleu

corpus_tokenized = [s.split() for s in reference_input]
references = [[['this', 'is', 'a', 'test'], ['this', 'is', 'test']]]

candidates = [['this', 'is', 'a', 'test']]

score = corpus_bleu(references, candidates)

print(score)


def levenshtein(src,
                trg,
                sub_cost=1.0,
                del_cost=1.0,
                ins_cost=1.0,
                randomize=True):
    DEL, INS, KEEP, SUB = range(4)
    op_names = 'delete', 'insert', 'keep', 'sub'

    costs = np.zeros((len(trg) + 1, len(src) + 1))
    ops = np.zeros((len(trg) + 1, len(src) + 1), dtype=np.int32)
Ejemplo n.º 50
0
def get_bleu(references, hypotheses):
    # compute BLEU
    bleu_score = corpus_bleu([[ref[1:-1]] for ref in references],
                             [hyp[1:-1] for hyp in hypotheses])

    return bleu_score
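
# Hedged usage sketch (the tokens and markers below are illustrative): each
# reference/hypothesis is assumed to be a token list wrapped in sentence-boundary
# markers, which the [1:-1] slices above strip before scoring, e.g.
#   refs = [['<s>', 'the', 'cat', 'sat', 'down', '</s>']]
#   hyps = [['<s>', 'the', 'cat', 'sat', 'down', '</s>']]
#   get_bleu(refs, hyps)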
Ejemplo n.º 51
0
def evaluate(sess, dataloader, model, ksave_dir, mode='valid'):
    if mode == 'valid':
        # texts_path = "original_data/valid.summary"
        texts_path = "processed_data/valid/valid.box.val"
        gold_path = gold_path_valid
        evalset = dataloader.dev_set
    else:
        # texts_path = "original_data/test.summary"
        texts_path = "processed_data/test/test.box.val"
        gold_path = gold_path_test
        evalset = dataloader.test_set

    # for copy words from the infoboxes
    texts = open(texts_path, 'rt', encoding="UTF8").read().strip().split('\n')
    texts = [list(t.strip().split()) for t in texts]
    v = Vocab()

    # with copy
    pred_list, pred_list_copy, gold_list = [], [], []
    pred_unk, pred_mask = [], []

    k = 0
    for x in dataloader.batch_iter(evalset, FLAGS.batch_size, False):
        predictions, atts = model.generate(x, sess)
        atts = np.squeeze(atts)
        idx = 0
        for summary in np.array(predictions):
            with open(pred_path + str(k), 'w', -1, "utf-8") as sw:
                summary = list(summary)
                if 2 in summary:
                    summary = summary[:summary.
                                      index(2)] if summary[0] != 2 else [2]
                real_sum, unk_sum, mask_sum = [], [], []
                for tk, tid in enumerate(summary):
                    if tid == 3:
                        sub = texts[k][np.argmax(atts[tk, :len(texts[k]),
                                                      idx])]
                        real_sum.append(sub)
                        mask_sum.append("**" + str(sub) + "**")
                    else:
                        real_sum.append(v.id2word(tid))
                        mask_sum.append(v.id2word(tid))
                    unk_sum.append(v.id2word(tid))
                sw.write(" ".join([str(x) for x in real_sum]) + '\n')
                pred_list.append([str(x) for x in real_sum])
                pred_unk.append([str(x) for x in unk_sum])
                pred_mask.append([str(x) for x in mask_sum])
                k += 1
                idx += 1
    write_word(pred_mask, ksave_dir, mode + "_summary_copy.txt")
    write_word(pred_unk, ksave_dir, mode + "_summary_unk.txt")

    for tk in range(k):
        with open(gold_path + str(tk), 'r', -1, "utf-8") as g:
            gold_list.append([g.read().strip().split()])

    gold_set = [[gold_path + str(i)] for i in range(k)]
    pred_set = [pred_path + str(i) for i in range(k)]

    # recall_tmp, precision_tmp, F_measure_tmp = [],[],[]
    # scorer = rouge_scorer.RougeScorer(['rouge1'])
    # for i in range(len(pred_set)) :
    #     pred = open(pred_set[i], "rt", encoding="UTF8")
    #     pred_lines = pred.readlines()
    #     gold = open(gold_set[i][0], "rt", encoding="UTF8")
    #     gold_lines = gold.readlines()

    #     scores = scorer.score(pred_lines[0], gold_lines[0])
    #     result = list(scores.values())

    #     recall_tmp.append(result[0][1])
    #     precision_tmp.append(result[0][0])
    #     F_measure_tmp.append(result[0][2])

    # recall = np.mean(recall_tmp)
    # precision = np.mean(precision_tmp)
    # F_measure = np.mean(F_measure_tmp)

    F_measure1_tmp, F_measure2_tmp, F_measure3_tmp = [], [], []
    scorer1 = rouge_scorer.RougeScorer(['rouge1'])
    scorer2 = rouge_scorer.RougeScorer(['rouge2'])
    scorer3 = rouge_scorer.RougeScorer(['rouge3'])

    for i in range(len(pred_set)):
        pred = open(pred_set[i], "rt", encoding="UTF8")
        pred_lines = pred.readlines()
        gold = open(gold_set[i][0], "rt", encoding="UTF8")
        gold_lines = gold.readlines()

        scores1 = scorer1.score(pred_lines[0], gold_lines[0])
        scores2 = scorer2.score(pred_lines[0], gold_lines[0])
        scores3 = scorer3.score(pred_lines[0], gold_lines[0])
        result1 = list(scores1.values())
        result2 = list(scores2.values())
        result3 = list(scores3.values())

        F_measure1_tmp.append(result1[0][2])
        F_measure2_tmp.append(result2[0][2])
        F_measure3_tmp.append(result3[0][2])

    F_measure1 = np.mean(F_measure1_tmp)
    F_measure2 = np.mean(F_measure2_tmp)
    F_measure3 = np.mean(F_measure3_tmp)

    bleu = corpus_bleu(gold_list, pred_list)
    # copy_result = "with copy F_measure: %s Recall: %s Precision: %s BLEU: %s\n" % \
    # (str(F_measure), str(recall), str(precision), str(bleu))
    copy_result = "with copy F_measure of ROUGE1: %s ROUGE2: %s ROUGE3: %s BLEU: %s\n" % \
    (str(F_measure1), str(F_measure2), str(F_measure3), str(bleu))
    # print copy_result

    # for tk in range(k):
    #     with open(pred_path + str(tk), 'w', -1 ,"utf-8") as sw:
    #         sw.write(" ".join(pred_unk[tk]) + '\n')

    # bleu = corpus_bleu(gold_list, pred_unk)
    # # nocopy_result = "without copy F_measure: %s Recall: %s Precision: %s BLEU: %s\n" % \
    # # (str(F_measure), str(recall), str(precision), str(bleu))
    # nocopy_result = "without copy F_measure of ROUGE1: %s ROUGE2: %s ROUGE3: %s BLEU: %s\n" % \
    # (str(F_measure1), str(F_measure2), str(F_measure3), str(bleu))

    # print nocopy_result
    result = copy_result  #+ nocopy_result
    # print result
    if mode == 'valid':
        print(result)
    # wandb.log({'F_measure1' : F_measure1, 'F_measure2' : F_measure2, 'F_measure3' : F_measure3, 'BLEU' : bleu})
    return result
Ejemplo n.º 52
0
    def generator_test_max_example(self, positive_dir, negative_dir, num_batch):

        self.temp_positive_dir = positive_dir
        self.temp_negative_dir = negative_dir

        if not os.path.exists(self.temp_positive_dir): os.mkdir(self.temp_positive_dir)
        if not os.path.exists(self.temp_negative_dir): os.mkdir(self.temp_negative_dir)
        shutil.rmtree(self.temp_negative_dir)
        shutil.rmtree(self.temp_positive_dir)
        if not os.path.exists(self.temp_positive_dir): os.mkdir(self.temp_positive_dir)
        if not os.path.exists(self.temp_negative_dir): os.mkdir(self.temp_negative_dir)
        counter = 0
        batches = self.test_batches
        step = 0
        list_hop = []
        list_ref = []

        while step < num_batch:
            
            batch = batches[step]
            step += 1

            decode_result = self._model.max_generator(self._sess, batch)
            #decode_result = self._model.run_eval_given_step(self._sess, self.batches[self.current_batch])


            for i in range(FLAGS.batch_size):

                decoded_words_all = []
                original_review = batch.original_review_output[i]

                for j in range(FLAGS.max_dec_sen_num):

                    output_ids = [int(t) for t in decode_result['generated'][i][j]][1:]
                    decoded_words = data.outputids2words(output_ids, self._vocab, None)
                    # Remove the [STOP] token from decoded_words, if necessary
                    try:
                        fst_stop_idx = decoded_words.index(data.STOP_DECODING)  # index of the (first) [STOP] symbol
                        decoded_words = decoded_words[:fst_stop_idx]
                    except ValueError:
                        decoded_words = decoded_words

                    if len(decoded_words)<2:
                        continue

                    if len(decoded_words_all)>0:
                        new_set1 = set(decoded_words_all[len(decoded_words_all)-1].split())
                        new_set2 = set(decoded_words)
                        if len(new_set1 & new_set2) > 0.5 * len(new_set2):
                            continue
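                    # (Worked example with made-up sentences: if the previous sentence was
                    #  "the food was great" and the new one is "the food was good", 3 of the
                    #  4 unique new words overlap (> 0.5), so the near-duplicate is skipped.)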
                    decoded_output = ' '.join(decoded_words).strip()  # single string
                    decoded_words_all.append(decoded_output)
                decoded_words_all = ' '.join(decoded_words_all).strip()
                try:
                    fst_stop_idx = decoded_words_all.index(
                        data.STOP_DECODING_DOCUMENT)  # index of the (first) [STOP] symbol
                    decoded_words_all = decoded_words_all[:fst_stop_idx]
                except ValueError:
                    decoded_words_all = decoded_words_all
                decoded_words_all = decoded_words_all.replace("[UNK] ", "")
                decoded_words_all = decoded_words_all.replace("[UNK]", "")
                decoded_words_all, _ = re.subn(r"(! ){2,}", "! ", decoded_words_all)
                decoded_words_all, _ = re.subn(r"(\. ){2,}", ". ", decoded_words_all)
                self.write_negtive_temp_to_json(original_review, decoded_words_all, counter)
                list_ref.append([nltk.word_tokenize(original_review)])
                list_hop.append(nltk.word_tokenize(decoded_words_all))

                counter += 1  # this is how many examples we've decoded
            '''self.current_batch +=1
            if self.current_batch >= len(self.batches):
                self.current_batch = 0'''
        
        
        bleu_score = corpus_bleu(list_ref, list_hop)
        tf.logging.info('bleu: '  + str(bleu_score))
        eva = Evaluate()
        eva.diversity_evaluate(negative_dir + "/*")
Ejemplo n.º 53
0
        scores_ = scores.view(-1, max(decode_lens), scores.size(-1))
        recon_scores = torch.argmax(scores_, -1)

        # Convert to text for bleu score
        run_preprocess = \
            lambda x: remove_tokens(x, [SOS_TOKEN, EOS_TOKEN, PAD_TOKEN])
        all_recon_captions.extend(
            [run_preprocess(sent) \
             for sent in tensor2text(recon_scores, train_vocab)])
        # Note (BP): Wrap each gold caption in a list, since corpus_bleu expects a list of reference sentences per hypothesis.
        all_gold_captions.extend(
            [[run_preprocess(sent)] \
             for sent in  tensor2text(captions_sorted, test_vocab)])
        pbar.update()
    pbar.close()
    # Bleu score
    bleu_score = corpus_bleu(all_gold_captions, all_recon_captions)
    logging.info("Corpus bleu:\t{}".format(round(bleu_score, 4)))

    # Attention plot

    # Loss plot
    if args.create_losses_plot:
        losses_fp = os.path.join(args.model_dir, 'losses.csv')
        df_losses = pd.read_csv(losses_fp)
        sns.lineplot(x="epochs", y="val", hue="typ", data=df_losses)
        losses_out_fp = os.path.join(args.model_dir, "loss.png")
        logging.info("Saving losses plot to {}".format(losses_out_fp))
        plt.savefig(losses_out_fp)