def run_test_our_method(cla_batcher, cnn_classifier, sess_cnn, filename):
    test_to_true = read_test_result_our(filename, 1)
    test_to_false = read_test_result_our(filename, 0)

    gold_to_true = read_test_input(filename, 1)
    gold_to_false = read_test_input(filename, 0)

    list_ref = []
    list_pre = []

    right_cnn = 0
    all_cnn = 0

    for i in range(len(gold_to_true) // 64):
        example_list = []
        for j in range(64):
            # example_list.append(test_true[i*64+j])
            new_dis_example = bc.Example(test_to_true[i * 64 + j], 1,
                                         cla_batcher._vocab, cla_batcher._hps)
            # list_pre.append(test_false[i*64+j].split())
            example_list.append(new_dis_example)

            # list_ref.append([gold_text[i*64+j].split()])

        cla_batch = bc.Batch(example_list, cla_batcher._hps,
                             cla_batcher._vocab)

        right_s, all_s, _, pre = cnn_classifier.run_eval_step(
            sess_cnn, cla_batch)
        right_cnn += right_s
        all_cnn += all_s

        for j in range(64):

            if len(gold_to_true[i * 64 + j].split()) > 2 and len(
                    test_to_true[i * 64 + j].split()) > 2 and 1 == pre[j]:
                list_ref.append([gold_to_true[i * 64 + j].split()])
                list_pre.append(test_to_true[i * 64 + j].split())

    for i in range(len(gold_to_false) // 64):
        example_list = []
        for j in range(64):
            # example_list.append(test_true[i*64+j])
            new_dis_example = bc.Example(test_to_false[i * 64 + j], 0,
                                         cla_batcher._vocab, cla_batcher._hps)
            # list_pre.append(test_false[i*64+j].split())
            example_list.append(new_dis_example)

            # list_ref.append([gold_text[i*64+j].split()])

        cla_batch = bc.Batch(example_list, cla_batcher._hps,
                             cla_batcher._vocab)

        right_s, all_s, _, pre = cnn_classifier.run_eval_step(
            sess_cnn, cla_batch)
        right_cnn += right_s
        all_cnn += all_s

        for j in range(64):

            if len(gold_to_false[i * 64 + j].split()) > 2 and len(
                    test_to_false[i * 64 + j].split()) > 2 and 0 == pre[j]:
                list_ref.append([gold_to_false[i * 64 + j].split()])
                list_pre.append(test_to_false[i * 64 + j].split())

    tf.logging.info("cnn test acc: " + str(right_cnn * 1.0 / all_cnn))
    cc = SmoothingFunction()
    tf.logging.info(
        "BLEU: " +
        str(corpus_bleu(list_ref, list_pre, smoothing_function=cc.method1)))
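A minimal, self-contained sketch of the corpus_bleu call used above: each entry in list_ref is itself a list of reference token lists (one or more references per hypothesis), while list_pre holds one flat token list per hypothesis. The sentences below are illustrative placeholders, not project data.

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Each hypothesis gets a *list* of reference token lists.
list_ref = [[['the', 'movie', 'was', 'great']],
            [['the', 'food', 'tasted', 'awful']]]
# Hypotheses are flat token lists, aligned with list_ref by position.
list_pre = [['the', 'movie', 'was', 'good'],
            ['the', 'food', 'was', 'awful']]

cc = SmoothingFunction()
print(corpus_bleu(list_ref, list_pre, smoothing_function=cc.method1))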
Example 2
def evaluate_autoencoder(whichdecoder, data_source, epoch):
    # Turn on evaluation mode which disables dropout.
    eos_id = corpus.dictionary.word2idx['<eos>']
    autoencoder.eval()
    ntokens = len(corpus.dictionary.word2idx)
    n_sents = 0.0
    total_loss = 0.0
    token_accuracies = 0.0
    all_source_sents = []
    all_transfer_sents = []

    pbar = tqdm(range(len(data_source)))
    for ii in pbar:
        batch = data_source[ii]

        source, target, lengths = batch
        source = to_gpu(use_cuda, Variable(source, requires_grad=False))
        target = to_gpu(use_cuda, Variable(target, requires_grad=False))
        n_sents += source.size()[0]

        mask = target.gt(0)
        masked_target = target.masked_select(mask)
        # examples x ntokens
        output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens)

        hidden = autoencoder(0, source, lengths, noise=False, encode_only=True)

        # output: batch x seq_len x ntokens
        if whichdecoder == 0:
            output = autoencoder(0, source, lengths, noise=False)
            flattened_output = output.view(-1, ntokens)
            masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens)
            # accuracy
            max_vals1, max_indices1 = torch.max(masked_output, 1)
            token_accuracies += torch.mean(max_indices1.eq(masked_target).float()).item()

            max_values1, max_indices1 = torch.max(output, 2)
            max_indices2 = autoencoder.generate(1, hidden, maxlen=50)
        else:
            output = autoencoder(1, source, lengths, noise=False)
            flattened_output = output.view(-1, ntokens)
            masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens)
            # accuracy
            max_vals2, max_indices2 = torch.max(masked_output, 1)
            token_accuracies += torch.mean(max_indices2.eq(masked_target).float()).item()

            max_values2, max_indices2 = torch.max(output, 2)
            max_indices1 = autoencoder.generate(0, hidden, maxlen=50)

        # forward
        total_loss += criterion_ce(masked_output / args.temp, masked_target).data

        # all_source_sents, all_transfer_sents
        max_indices1 = max_indices1.view(output.size(0), -1).data.cpu().numpy()
        max_indices2 = max_indices2.view(output.size(0), -1).data.cpu().numpy()
        target = target.view(output.size(0), -1).data.cpu().numpy()
        tran_indices = max_indices2 if whichdecoder == 0 else max_indices1
        for t, tran_idx in zip(target, tran_indices):
            # real sentence
            truncated_to_eos = t.tolist().index(eos_id) if eos_id in t.tolist() else len(t)
            chars = " ".join([corpus.dictionary.idx2word[x] for x in t[:truncated_to_eos]])
            all_source_sents.append(chars)
            # transfer sentence
            truncated_to_eos = tran_idx.tolist().index(eos_id) if eos_id in tran_idx.tolist() else len(tran_idx)
            chars = " ".join([corpus.dictionary.idx2word[x] for x in tran_idx[:truncated_to_eos]])
            all_transfer_sents.append(chars)

    # compare the original and transfer
    aeoutf_from = "{}/{}_output_decoder_{}_from.txt".format(args.outf, epoch, whichdecoder)
    aeoutf_tran = "{}/{}_output_decoder_{}_tran.txt".format(args.outf, epoch, whichdecoder)
    with open(aeoutf_from, 'w') as f_from, open(aeoutf_tran, 'w') as f_trans:
        # laplacian smoothing
        # for word in corpus.dictionary.word2idx.keys():
        #    f_from.write(word + "\n")
        #    f_trans.write(word + "\n")
        for i in range(len(all_source_sents)):
            # real sentence
            f_from.write(all_source_sents[i])
            # transfer sentence
            f_trans.write(all_transfer_sents[i])
            if i != len(all_source_sents) - 1:
                f_from.write("\n")
                f_trans.write("\n")

    # bleu
    all_bleu_scores = 0.0
    for i in range(len(all_source_sents)):
        sou = all_source_sents[i].split(' ')
        tran = all_transfer_sents[i].split(' ')
        all_bleu_scores += sentence_bleu([sou], tran,
                                         smoothing_function=SmoothingFunction().method7,
                                         weights=[1.0 / 3.0] * 3)
    bleu = all_bleu_scores / n_sents * 100.0

    # forward and reverse
    loss = total_loss.item() / len(data_source)
    ppl = math.exp(loss)

    #print('bleu {:4.2f} | ppl {:4.3f}'.format(bleu, ppl))
    #logging.info('bleu {:4.2f} | ppl {:4.3f}'.format(bleu, ppl))

    # transfer
    labels = fasttext_classifier.predict(all_transfer_sents)
    truth = str(1 - whichdecoder)
    transfer = float(sum([l == truth for ll in labels for l in ll])) / n_sents * 100.0

    # load sentences to evaluate on
    arpa_path = '{}/{}_lm_{}.arpa'.format(args.outf, epoch, whichdecoder)
    kenlm_model = train_ngram_lm(args.kenlm_path, aeoutf_from, arpa_path, args.N)
    forward = get_ppl(kenlm_model, all_transfer_sents)

    kenlm_model = train_ngram_lm(args.kenlm_path, aeoutf_tran, arpa_path, args.N)
    reverse = get_ppl(kenlm_model, all_source_sents)

    #print('transfer {:4.2f} | forward {:4.3f} | reverse {:4.3f}'.format(transfer, forward, reverse))
    #logging.info('transfer {:4.2f} | forward {:4.3f} | reverse {:4.3f}'.format(transfer, forward, reverse))

    return bleu, ppl, transfer, forward, reverse
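For reference, a hedged sketch of just the scoring done at the end of evaluate_autoencoder: per-sentence BLEU with uniform trigram weights and method7 smoothing, averaged over the sentences, plus perplexity as the exponential of the mean cross-entropy. The sentences and the loss value below are made up for illustration.

import math
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

source_sents = ['the hotel staff were friendly', 'the room was dirty']   # stand-ins for all_source_sents
transfer_sents = ['the hotel staff were rude', 'the room was clean']     # stand-ins for all_transfer_sents

bleu_sum = 0.0
for sou, tran in zip(source_sents, transfer_sents):
    bleu_sum += sentence_bleu([sou.split()], tran.split(),
                              smoothing_function=SmoothingFunction().method7,
                              weights=[1.0 / 3.0] * 3)
bleu = bleu_sum / len(source_sents) * 100.0

mean_ce_loss = 2.1            # placeholder for total_loss / len(data_source)
ppl = math.exp(mean_ce_loss)
print(bleu, ppl)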
Example 3
import argparse
import csv
import sys, os, pdb
import nltk
import time
import random
from nltk.translate.bleu_score import SmoothingFunction
chencherry = SmoothingFunction()

MAX_REF_COUNT = 6


def get_annotations(line):
    set_info, post_id, best, valids, confidence = line.split(',')
    annotator_name = set_info.split('_')[0]
    sitename = set_info.split('_')[1]
    best = int(best)
    valids = [int(v) for v in valids.split()]
    confidence = int(confidence)
    return post_id, annotator_name, sitename, best, valids, confidence


def read_human_annotations(human_annotations_filename):
    human_annotations_file = open(human_annotations_filename, 'r')
    annotations = {}
    for line in human_annotations_file.readlines():
        line = line.strip('\n')
        splits = line.split('\t')
        post_id1, annotator_name1, sitename1, best1, valids1, confidence1 = get_annotations(
            splits[0])
        post_id2, annotator_name2, sitename2, best2, valids2, confidence2 = get_annotations(
Example 4
def analyze_decode_results(dataset, decode_results, verbose=True):
    from lang.py.parse import tokenize_code, de_canonicalize_code
    # tokenize_code = tokenize_for_bleu_eval
    import ast
    assert dataset.count == len(decode_results)

    f = f_decode = None
    if verbose:
        f = open(dataset.name + '.exact_match', 'w')
        exact_match_ids = []
        f_decode = open(dataset.name + '.decode_results.txt', 'w')
        eid_to_annot = dict()

        if data_type == 'django':
            for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)):
                eid_to_annot[raw_id] = line.strip()

        f_bleu_eval_ref = open(dataset.name + '.ref', 'w')
        f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w')

        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_oracle_bleu = 0.0
    cum_oracle_acc = 0.0
    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    all_references = []
    all_predictions = []

    if all(len(cand) == 0 for cand in decode_results):
        logging.error('Empty decoding results for the current dataset!')
        return -1, -1

    binned_results_dict = defaultdict(list)
    def get_binned_key(ast_size):
        cutoff = 50 if data_type == 'django' else 250
        k = 10 if data_type == 'django' else 25 # for hs

        if ast_size >= cutoff:
            return '%d - inf' % cutoff

        lower = int(ast_size / k) * k
        upper = lower + k

        key = '%d - %d' % (lower, upper)

        return key


    for eid in range(dataset.count):
        example = dataset.examples[eid]
        ref_code = example.code
        ref_ast_tree = ast.parse(ref_code).body[0]
        refer_source = astor.to_source(ref_ast_tree).strip()
        # refer_source = ref_code
        refer_tokens = tokenize_code(refer_source)
        cur_example_acc = 0.0

        decode_cands = decode_results[eid]
        if len(decode_cands) == 0:
            continue

        decode_cand = decode_cands[0]

        cid, cand, ast_tree, code = decode_cand
        code = astor.to_source(ast_tree).strip()

        # simple_url_2_re = re.compile('_STR:0_', re.))
        try:
            predict_tokens = tokenize_code(code)
        except:
            logging.error('error in tokenizing [%s]', code)
            continue

        if refer_tokens == predict_tokens:
            cum_acc += 1
            cur_example_acc = 1.0

            if verbose:
                exact_match_ids.append(example.raw_id)
                f.write('-' * 60 + '\n')
                f.write('example_id: %d\n' % example.raw_id)
                f.write(code + '\n')
                f.write('-' * 60 + '\n')

        if data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].iteritems():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif data_type == 'hs':
            ref_code_for_bleu = ref_code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu)

        all_references.append([refer_tokens_for_bleu])
        all_predictions.append(pred_tokens_for_bleu)

        # try:
        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3)
        cum_bleu += bleu_score
        # except:
        #    pass

        if verbose:
            print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score)

            f_decode.write('-' * 60 + '\n')
            f_decode.write('example_id: %d\n' % example.raw_id)
            f_decode.write('intent: \n')

            if data_type == 'django':
                f_decode.write(eid_to_annot[example.raw_id] + '\n')
            elif data_type == 'hs':
                f_decode.write(' '.join(example.query) + '\n')

            f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n')
            f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n')

            f_decode.write('canonicalized reference: \n')
            f_decode.write(refer_source + '\n')
            f_decode.write('canonicalized prediction: \n')
            f_decode.write(code + '\n')
            f_decode.write('reference code for bleu calculation: \n')
            f_decode.write(ref_code_for_bleu + '\n')
            f_decode.write('predicted code for bleu calculation: \n')
            f_decode.write(pred_code_for_bleu + '\n')
            f_decode.write('pred_shorter_than_ref: %s\n' % shorter)
            # f_decode.write('weired: %s\n' % weired)
            f_decode.write('-' * 60 + '\n')

        # compute oracle
        best_bleu_score = 0.
        cur_oracle_acc = 0.
        for cid, cand, ast_tree, code in decode_cands:  # oracle over this example's candidate decodes
            try:
                code = astor.to_source(ast_tree).strip()
                predict_tokens = tokenize_code(code)

                if predict_tokens == refer_tokens:
                    cur_oracle_acc = 1.

                if data_type == 'django':
                    pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
                    # convert canonicalized code to raw code
                    for literal, place_holder in example.meta_data['str_map'].iteritems():
                        pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                elif data_type == 'hs':
                    pred_code_for_bleu = code

                # we apply Ling Wang's trick when evaluating BLEU scores
                pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

                ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
                cand_bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                                weights=ngram_weights,
                                                smoothing_function=sm.method3)

                if cand_bleu_score > best_bleu_score:
                    best_bleu_score = cand_bleu_score

            except:
                continue

        cum_oracle_bleu += best_bleu_score
        cum_oracle_acc += cur_oracle_acc

        ref_ast_size = example.parse_tree.size
        binned_key = get_binned_key(ref_ast_size)
        binned_results_dict[binned_key].append((bleu_score, cur_example_acc, best_bleu_score, cur_oracle_acc))

    cum_bleu /= dataset.count
    cum_acc /= dataset.count
    cum_oracle_bleu /= dataset.count
    cum_oracle_acc /= dataset.count

    logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3))
    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
    logging.info('oracle bleu: %f', cum_oracle_bleu)
    logging.info('oracle accuracy: %f', cum_oracle_acc)

    keys = sorted(binned_results_dict, key=lambda x: int(x.split(' - ')[0]))

    Y = [[], [], [], []]
    X = []

    for binned_key in keys:
        entry = binned_results_dict[binned_key]
        avg_bleu = np.average([t[0] for t in entry])
        avg_acc = np.average([t[1] for t in entry])
        avg_oracle_bleu = np.average([t[2] for t in entry])
        avg_oracle_acc = np.average([t[3] for t in entry])
        print binned_key, avg_bleu, avg_acc, avg_oracle_bleu, avg_oracle_acc, len(entry)

        Y[0].append(avg_bleu)
        Y[1].append(avg_acc)
        Y[2].append(avg_oracle_bleu)
        Y[3].append(avg_oracle_acc)

        X.append(int(binned_key.split(' - ')[0]))

    import matplotlib.pyplot as plt
    from pylab import rcParams
    rcParams['figure.figsize'] = 6, 2.5

    if data_type == 'django':
        fig, ax = plt.subplots()
        ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2)
        # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2)
        ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2)
        # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2)
        ax.set_ylabel('Performance')
        ax.set_xlabel('Reference AST Size (# nodes)')
        plt.legend(loc='upper right', ncol=6)
        plt.tight_layout()
        # plt.savefig('django_acc_ast_size.pdf', dpi=300)
        # os.system('pcrop.sh django_acc_ast_size.pdf')
        plt.savefig('django_perf_ast_size.pdf', dpi=300)
        os.system('pcrop.sh django_perf_ast_size.pdf')
    else:
        fig, ax = plt.subplots()
        ax.plot(X, Y[0], 'bs--', label='BLEU', lw=1.2)
        # ax.plot(X, Y[2], 'r^--', label='oracle BLEU', lw=1.2)
        ax.plot(X, Y[1], 'r^--', label='acc', lw=1.2)
        # ax.plot(X, Y[3], 'r^--', label='oracle acc', lw=1.2)
        ax.set_ylabel('Performance')
        ax.set_xlabel('Reference AST Size (# nodes)')
        plt.legend(loc='upper right', ncol=6)
        plt.tight_layout()
        # plt.savefig('hs_bleu_ast_size.pdf', dpi=300)
        # os.system('pcrop.sh hs_bleu_ast_size.pdf')
        plt.savefig('hs_perf_ast_size.pdf', dpi=300)
        os.system('pcrop.sh hs_perf_ast_size.pdf')
    if verbose:
        f.write(', '.join(str(i) for i in exact_match_ids))
        f.close()
        f_decode.close()

        f_bleu_eval_ref.close()
        f_bleu_eval_hyp.close()

    return cum_bleu, cum_acc
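A small sketch, assuming NLTK's sentence_bleu, of the ngram_weights truncation used in both of these evaluators: capping the weight list at min(4, len(reference)) limits the scored n-gram order for very short references. The tokens below are toy values, not dataset code.

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

sm = SmoothingFunction()
refer_tokens_for_bleu = ['return', 'x']          # only two tokens, so no 3-/4-grams exist
pred_tokens_for_bleu = ['return', 'x']

ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))   # -> [0.25, 0.25]
score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                      weights=ngram_weights, smoothing_function=sm.method3)
print(ngram_weights, score)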
Example 5
def evaluate_decode_results(data_type, dataset, decode_results, verbose=True):
    from lang.py.parse import tokenize_code, de_canonicalize_code
    # tokenize_code = tokenize_for_bleu_eval
    import ast
    assert dataset.count == len(decode_results)

    f = f_decode = None
    if verbose:
        f = open(dataset.name + '.exact_match', 'w')
        exact_match_ids = []
        f_decode = open(dataset.name + '.decode_results.txt', 'w')
        eid_to_annot = dict()

        if data_type == 'django':
            for raw_id, line in enumerate(open(DJANGO_ANNOT_FILE)):
                eid_to_annot[raw_id] = line.strip()

        f_bleu_eval_ref = open(dataset.name + '.ref', 'w')
        f_bleu_eval_hyp = open(dataset.name + '.hyp', 'w')
        f_generated_code = open(dataset.name + '.generated_code', 'w')

        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_oracle_bleu = 0.0
    cum_oracle_acc = 0.0
    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    all_references = []
    all_predictions = []

    for eid in range(dataset.count):
        example = dataset.examples[eid]
        ref_code = example.code
        ref_ast_tree = ast.parse(ref_code).body[0]
        refer_source = astor.to_source(ref_ast_tree).strip()
        # refer_source = ref_code
        refer_tokens = tokenize_code(refer_source)
        cur_example_correct = False

        ast_tree = decode_results[eid]
        code = astor.to_source(ast_tree).strip()

        # simple_url_2_re = re.compile('_STR:0_', re.))
        try:
            predict_tokens = tokenize_code(code)
        except:
            logging.error('error in tokenizing [%s]', code)
            continue

        if refer_tokens == predict_tokens:
            cum_acc += 1
            cur_example_correct = True

            if verbose:
                exact_match_ids.append(example.raw_id)
                f.write('-' * 60 + '\n')
                f.write('example_id: %d\n' % example.raw_id)
                f.write(code + '\n')
                f.write('-' * 60 + '\n')

        if data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].iteritems():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif data_type == 'hs':
            ref_code_for_bleu = ref_code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        # The if-chunk below is for debugging purposes: sometimes the reference cannot match the prediction
        # because of inconsistent quotes (e.g., single quotes in the reference, double quotes in the prediction).
        # Most of these cases are resolved by canonicalizing the reference code with astor (parse the reference
        # into an AST and regenerate the code, then use the regenerated code as the reference).
        weired = False
        if refer_tokens_for_bleu == pred_tokens_for_bleu and refer_tokens != predict_tokens:
            # cum_acc += 1
            weired = True
        elif refer_tokens == predict_tokens:
            # weired!
            # weired = True
            pass

        shorter = len(pred_tokens_for_bleu) < len(refer_tokens_for_bleu)

        all_references.append([refer_tokens_for_bleu])
        all_predictions.append(pred_tokens_for_bleu)

        # try:
        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3)
        cum_bleu += bleu_score
        # except:
        #    pass

        if verbose:
            print 'raw_id: %d, bleu_score: %f' % (example.raw_id, bleu_score)

            f_decode.write('-' * 60 + '\n')
            f_decode.write('example_id: %d\n' % example.raw_id)
            f_decode.write('intent: \n')

            if data_type == 'django':
                f_decode.write(eid_to_annot[example.raw_id] + '\n')
            elif data_type == 'hs':
                f_decode.write(' '.join(example.query) + '\n')

            f_bleu_eval_ref.write(' '.join(refer_tokens_for_bleu) + '\n')
            f_bleu_eval_hyp.write(' '.join(pred_tokens_for_bleu) + '\n')

            f_decode.write('canonicalized reference: \n')
            f_decode.write(refer_source + '\n')
            f_decode.write('canonicalized prediction: \n')
            f_decode.write(code + '\n')
            f_decode.write('reference code for bleu calculation: \n')
            f_decode.write(ref_code_for_bleu + '\n')
            f_decode.write('predicted code for bleu calculation: \n')
            f_decode.write(pred_code_for_bleu + '\n')
            f_decode.write('pred_shorter_than_ref: %s\n' % shorter)
            f_decode.write('weired: %s\n' % weired)
            f_decode.write('-' * 60 + '\n')

            # for Hiro's evaluation
            f_generated_code.write(pred_code_for_bleu.replace('\n', '#NEWLINE#') + '\n')


        # compute oracle
        best_score = 0.
        cur_oracle_acc = 0.
        for ast_tree in decode_results:

            try:
                code = astor.to_source(ast_tree).strip()
                predict_tokens = tokenize_code(code)

                if predict_tokens == refer_tokens:
                    cur_oracle_acc = 1

                if data_type == 'django':
                    pred_code_for_bleu = de_canonicalize_code(code, example.meta_data['raw_code'])
                    # convert canonicalized code to raw code
                    for literal, place_holder in example.meta_data['str_map'].iteritems():
                        pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                elif data_type == 'hs':
                    pred_code_for_bleu = code

                # we apply Ling Wang's trick when evaluating BLEU scores
                pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

                ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
                bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu,
                                           weights=ngram_weights,
                                           smoothing_function=sm.method3)

                if bleu_score > best_score:
                    best_score = bleu_score

            except:
                continue

        cum_oracle_bleu += best_score
        cum_oracle_acc += cur_oracle_acc

    cum_bleu /= dataset.count
    cum_acc /= dataset.count
    cum_oracle_bleu /= dataset.count
    cum_oracle_acc /= dataset.count

    logging.info('corpus level bleu: %f', corpus_bleu(all_references, all_predictions, smoothing_function=sm.method3))
    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
    logging.info('oracle bleu: %f', cum_oracle_bleu)
    logging.info('oracle accuracy: %f', cum_oracle_acc)

    if verbose:
        f.write(', '.join(str(i) for i in exact_match_ids))
        f.close()
        f_decode.close()

        f_bleu_eval_ref.close()
        f_bleu_eval_hyp.close()
        f_generated_code.close()
    print cum_bleu, cum_acc
    return cum_bleu, cum_acc
Example 6
def main(model_name, use_cuda, n_print, idxs_print, use_train_dataset,
         val_size, batch_size, interact, unsmear):
    #model_path = './model/' + model_name + '/'
    model_path = model_name

    if use_cuda:
        #encoder_decoder = torch.load(model_path + model_name + '.pt')
        encoder_decoder = torch.load(model_path)
    else:
        encoder_decoder = torch.load(model_path + model_name + '.pt',
                                     map_location=lambda storage, loc: storage)

    if use_cuda:
        encoder_decoder = encoder_decoder.cuda()
    else:
        encoder_decoder = encoder_decoder.cpu()

    dataset = SequencePairDataset(data_path='data/parsed/',
                                  lang=encoder_decoder.lang,
                                  use_cuda=use_cuda,
                                  val_size=val_size,
                                  data_type='dev')

    data_loader = DataLoader(dataset, batch_size=batch_size)

    get_bleu = True

    if get_bleu:
        dev_file = open("data/parsed/copynet_dev.txt", "r", encoding='utf-8')
        out_file = open("results/" + model_name.split('/')[-1] + ".txt",
                        'w',
                        encoding='utf-8')
        total_score = 0.0
        num = 0.0
        for i, row in enumerate(tqdm(dev_file)):
            sql = row.split('\t')[1]
            gold_nl = row.split('\t')[0]
            predicted = encoder_decoder.get_response(sql)
            predicted = predicted.replace('<SOS>', '')
            predicted = predicted.replace('<EOS>', '')
            predicted = predicted.rstrip()
            out_file.write(predicted + "\n")

            score = sentence_bleu(
                [gold_nl.split()],
                predicted.split(),
                smoothing_function=SmoothingFunction().method2)
            # score = sentence_bleu(ref, pred)
            total_score += score
            num += 1
            '''
            if i == 1000:
                break
            '''
        del encoder_decoder
        dev_file.close()
        out_file.close()
        print("BLEU score on test set is " + str(total_score * 100 / num))
        return

    if interact:
        encoder_decoder.interactive(unsmear)

    if n_print is not None:
        for _ in range(n_print):
            i_seq, t_seq, i_str, t_str = random.choice(dataset)

            i_length = (i_seq > 0).sum()
            t_length = (t_seq > 0).sum()

            i_seq = i_seq[:i_length]
            t_seq = t_seq[:t_length]

            i_tokens = i_str.split()
            t_tokens = t_str.split()

            print_output(i_seq,
                         encoder_decoder,
                         input_tokens=i_tokens,
                         target_tokens=t_tokens,
                         target_seq=t_seq)

    elif idxs_print is not None:
        for idx in idxs_print:
            i_seq, t_seq, i_str, t_str = dataset[idx]

            i_length = (i_seq > 0).sum()
            t_length = (t_seq > 0).sum()

            i_seq = i_seq[:i_length]
            t_seq = t_seq[:t_length]

            i_tokens = i_str.split()[:i_length]
            t_tokens = t_str.split()

            print_output(i_seq,
                         encoder_decoder,
                         input_tokens=i_tokens,
                         target_tokens=t_tokens,
                         target_seq=t_seq)

    else:
        evaluate(encoder_decoder, data_loader)
Example 7
    learning_rate = args.learning_rate  # learning rate for the optimizer
    dropout = args.dropout

    top_x = args.top_x

    start_epoch = 0
    epochs = args.epochs  # number of epochs to train for (if early stopping is not triggered)
    epochs_since_improvement = 0  # keeps track of number of epochs since there's been an improvement in validation BLEU

    best_bleu4 = 0.0  # to store the best BLEU-4 score
    best_cider = 0.0  # to store the best CIDER score
    best_loss = 0.0  # to store the best Cross-Entropy loss

    best_ours = 100.0  # to store the best metric (ours)

    smoothing_method = SmoothingFunction().method1  # epsilon method for bleu

    print_freq = args.print_freq  # print training/validation stats every __ batches
    checkpoint = args.checkpoint  # path to checkpoint, None if none

    # Read word map
    # WARNING union vocab from pretrained + didec itself
    word_map_file = os.path.join(data_folder, 'WORDMAP_union.json')
    with open(word_map_file, 'r') as j:
        word_map = json.load(j)

    rev_word_map = {v: k for k, v in word_map.items()}

    print('vocab len', len(word_map))

    i2w = dict()
Example 8
def test_micro_bleu_smooth2(candidates, references):
    _test(candidates, references, "micro", "smooth2",
          SmoothingFunction().method2, 3)
Example 9
def test_macro_bleu_smooth1(candidates, references):
    _test(candidates, references, "macro", "smooth1",
          SmoothingFunction().method1)
Example 10
            if sampled_token == '<end>':
                break
            decoded_seq[0, i + 1] = sampled_index

        ref = []
        for cap in real_captions:
            l_temp = cap.split()
            l_temp.remove('<start>')
            l_temp.remove('<end>')
            ref.append(l_temp)

        actual.append(ref)

        predicted.append(decoded_tokens[:-1])

    smoothie = SmoothingFunction()

    print("BLEU-1: {}".format(
        corpus_bleu(actual,
                    predicted,
                    weights=(1.0, 0, 0, 0),
                    smoothing_function=smoothie.method4)))
    print("BLEU-2: {}".format(
        corpus_bleu(actual,
                    predicted,
                    weights=(0.5, 0.5, 0, 0),
                    smoothing_function=smoothie.method4)))
    print("BLEU-3: {}".format(
        corpus_bleu(actual,
                    predicted,
                    weights=(0.3, 0.3, 0.3, 0),
Example 11
def test_macro_bleu_nltk_smooth2(candidates, references):
    _test(candidates, references, "macro", "nltk_smooth2",
          SmoothingFunction().method2)
Example 12
 def calc_bleu(self, reference, hypothesis, weight):
     return nltk.translate.bleu_score.sentence_bleu(
         reference,
         hypothesis,
         weight,
         smoothing_function=SmoothingFunction().method1)
Example 13
def evaluate(beam_size):
    """
    Evaluation

    :param beam_size: beam size at which to generate captions for evaluation
    :return: BLEU-4 score
    """
    # DataLoader
    _transforms = [normalize]
    if use_clip:
        _, preprocess = clip.load('ViT-B/32')
        preprocess.transforms = preprocess.transforms[:2]
        _transforms = preprocess.transforms + _transforms
    _transforms = transforms.Compose(_transforms)
    loader = torch.utils.data.DataLoader(CaptionDataset(data_folder,
                                                        data_name,
                                                        'TEST',
                                                        transform=_transforms),
                                         batch_size=1,
                                         shuffle=True,
                                         num_workers=1,
                                         pin_memory=True)

    # TODO: Batched Beam Search
    # Therefore, do not use a batch_size greater than 1 - IMPORTANT!

    # Lists to store references (true captions), and hypothesis (prediction) for each image
    # If for n images, we have n hypotheses, and references a, b, c... for each image, we need -
    # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...]
    references = list()
    hypotheses = list()

    # For each image
    for i, (image, caps, caplens, allcaps) in enumerate(
            tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))):

        k = beam_size

        # Move to GPU device, if available
        image = image.to(device)  # (1, 3, 256, 256)

        # Encode
        encoder_out = encoder(
            image)  # (1, enc_image_size, enc_image_size, encoder_dim)
        enc_image_size = encoder_out.size(1)
        encoder_dim = encoder_out.size(3)

        # Flatten encoding
        encoder_out = encoder_out.view(
            1, -1, encoder_dim)  # (1, num_pixels, encoder_dim)
        num_pixels = encoder_out.size(1)

        # We'll treat the problem as having a batch size of k
        encoder_out = encoder_out.expand(
            k, num_pixels, encoder_dim)  # (k, num_pixels, encoder_dim)

        # Tensor to store top k previous words at each step; now they're just <start>
        k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to(
            device)  # (k, 1)

        # Tensor to store top k sequences; now they're just <start>
        seqs = k_prev_words  # (k, 1)

        # Tensor to store top k sequences' scores; now they're just 0
        top_k_scores = torch.zeros(k, 1).to(device)  # (k, 1)

        # Lists to store completed sequences and scores
        complete_seqs = list()
        complete_seqs_scores = list()

        # Start decoding
        step = 1
        h, c = decoder.init_hidden_state(encoder_out)

        rev_word_map = {v: k for k, v in word_map.items()}
        if clip_beam_search:
            with torch.no_grad():
                image_features = encoder.clip_model.encode_image(image)
                image_features /= image_features.norm(dim=-1, keepdim=True)

        def get_clip_scores(seqs, scores):
            nonlocal top_k_scores
            special_words = ['<start>', '<end>']
            replace_words = {
                '<unk>': '<averyunpleasantword>',
                '<pad>': '<anotherveryunpleasantword>'
            }
            special_words_enc = [word_map[w] for w in special_words]
            if step == 1:
                top_k_scores, next_word_inds = scores[0].topk(
                    k, 0, True, True)  # (s)
                return torch.zeros(k, device=device).long(), next_word_inds
            next_word_inds = scores.topk(k)[1]
            inds = []

            text = []
            weights = torch.ones(k**2).to(device)
            count = 0
            for idx, (prev_seq, next_words) in enumerate(
                    zip(seqs.tolist(), next_word_inds.tolist())):
                prev_words = [
                    rev_word_map[w] for w in prev_seq
                    if w not in special_words_enc
                ]
                for word in next_words:
                    cap_words = copy.copy(prev_words)
                    if word not in special_words_enc:
                        word_char = rev_word_map[word]
                        word_char = replace_words.get(word_char) or word_char
                        cap_words.append(word_char)
                    text.append(' '.join(cap_words))
                    inds.append([idx, word])
                    if rev_word_map[word] == '<end>':
                        weights[count] = 1.5
                    count += 1
            inds = np.array(inds)
            text = clip.tokenize(text).to(device)
            with torch.no_grad():
                text_features = encoder.clip_model.encode_text(text)

            # Pick the top k most similar captions for the image
            text_features /= text_features.norm(dim=-1, keepdim=True)
            similarity = (image_features @ text_features.T *
                          weights).log_softmax(dim=-1)
            top_k_scores, indices = similarity.view(-1).topk(k, 0, True, True)
            prev_inds = torch.tensor([inds[idx][0] for idx in indices],
                                     device=device)
            next_inds = torch.tensor([inds[idx][1] for idx in indices],
                                     device=device)

            return prev_inds, next_inds

        # s is a number less than or equal to k, because sequences are removed from this process once they hit <end>
        while True:

            embeddings = decoder.embedding(k_prev_words).squeeze(
                1)  # (s, embed_dim)

            awe, _ = decoder.attention(encoder_out,
                                       h)  # (s, encoder_dim), (s, num_pixels)

            gate = decoder.sigmoid(
                decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
            awe = gate * awe

            h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1),
                                       (h, c))  # (s, decoder_dim)

            scores = decoder.fc(h)  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=1)

            # Add
            scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)

            if clip_beam_search:
                prev_word_inds, next_word_inds = get_clip_scores(seqs, scores)
            else:

                # For the first step, all k points will have the same scores (since same k previous words, h, c)
                if step == 1:
                    top_k_scores, top_k_words = scores[0].topk(
                        k, 0, True, True)  # (s)
                else:
                    # Unroll and find top scores, and their unrolled indices
                    top_k_scores, top_k_words = scores.view(-1).topk(
                        k, 0, True, True)  # (s)

                # Convert unrolled indices to actual indices of scores
                prev_word_inds = (top_k_words / vocab_size).long()  # (s)
                next_word_inds = (top_k_words % vocab_size).long()  # (s)

            # Add new words to sequences
            seqs = torch.cat(
                [seqs[prev_word_inds],
                 next_word_inds.unsqueeze(1)], dim=1)  # (s, step+1)

            # Which sequences are incomplete (didn't reach <end>)?
            incomplete_inds = [
                ind for ind, next_word in enumerate(next_word_inds)
                if next_word != word_map['<end>']
            ]
            complete_inds = list(
                set(range(len(next_word_inds))) - set(incomplete_inds))

            # Set aside complete sequences
            if len(complete_inds) > 0:
                complete_seqs.extend(seqs[complete_inds].tolist())
                complete_seqs_scores.extend(top_k_scores[complete_inds])
            k -= len(complete_inds)  # reduce beam length accordingly
            # Proceed with incomplete sequences
            if k == 0:
                break
            seqs = seqs[incomplete_inds]
            h = h[prev_word_inds[incomplete_inds]]
            c = c[prev_word_inds[incomplete_inds]]
            encoder_out = encoder_out[prev_word_inds[incomplete_inds]]
            top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1)
            k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1)

            # Break if things have been going on too long
            if step > 50:
                break
            step += 1

        if len(complete_seqs_scores) > 0:  # at least one sequence reached <end>
            i = complete_seqs_scores.index(max(complete_seqs_scores))
            seq = complete_seqs[i]
        else:
            i = top_k_scores.argmax().item()
            seq = seqs[i].tolist()

        # References
        img_caps = allcaps[0].tolist()
        img_captions = list(
            map(
                lambda c: [
                    rev_word_map[w] for w in c if w not in {
                        word_map['<start>'], word_map['<end>'], word_map[
                            '<pad>']
                    }
                ], img_caps))  # remove <start> and pads
        references.append(img_captions)

        # Hypotheses
        hypotheses.append([
            rev_word_map[w] for w in seq if w not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])

        assert len(references) == len(hypotheses)

    bleu4 = corpus_bleu(references,
                        hypotheses,
                        smoothing_function=SmoothingFunction().method1)

    return bleu4
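A toy torch sketch (sizes made up, not the model's) of the index bookkeeping in the beam-search loop above: after flattening the (k, vocab_size) score matrix for topk, integer division recovers the beam each candidate extends and the modulo recovers the word it appends.

import torch

k, vocab_size = 3, 10
scores = torch.randn(k, vocab_size)                   # per-beam scores for every vocabulary word
top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)

prev_word_inds = (top_k_words / vocab_size).long()    # which beam each candidate extends
next_word_inds = (top_k_words % vocab_size).long()    # which word gets appended to that beam
print(prev_word_inds.tolist(), next_word_inds.tolist())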
Example 14
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(
        tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting running in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        os.makedirs(FLAGS.log_root)

    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = [
        'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
        'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim',
        'batch_size', 'max_dec_steps', 'max_enc_steps'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_generator = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    hparam_list = [
        'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std',
        'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps_discriminator = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    tf.set_random_seed(111)  # a seed value for randomness
    # modes: train-classification, train-sentiment, train-cnn-classification, train-generator

    if FLAGS.mode == "train-classifier":

        #print("Start pre-training......")
        model_class = Classification(hps_discriminator, vocab)
        cla_batcher = ClaBatcher(hps_discriminator, vocab)
        sess_cls, saver_cls, train_dir_cls = setup_training_classification(
            model_class)
        print("Start pre-training classification......")
        run_pre_train_classification(model_class, cla_batcher, 1, sess_cls,
                                     saver_cls, train_dir_cls)  #10
        generated = Generate_training_sample(model_class, vocab, cla_batcher,
                                             sess_cls)

        print("Generating training examples......")
        generated.generate_training_example("train")
        generated.generate_test_example("test")

    elif FLAGS.mode == "train-sentimentor":

        model_class = Classification(hps_discriminator, vocab)
        cla_batcher = ClaBatcher(hps_discriminator, vocab)
        sess_cls, saver_cls, train_dir_cls = setup_training_classification(
            model_class)

        print("Start pre_train_sentimentor......")
        model_sentiment = Sentimentor(hps_generator, vocab)
        sentiment_batcher = SenBatcher(hps_generator, vocab)
        sess_sen, saver_sen, train_dir_sen = setup_training_sentimentor(
            model_sentiment)
        util.load_ckpt(saver_cls, sess_cls, ckpt_dir="train-classification")
        run_pre_train_sentimentor(model_sentiment, sentiment_batcher, 1,
                                  sess_sen, saver_sen, train_dir_sen)  #1

    elif FLAGS.mode == "test":

        config = {
            'n_epochs': 5,
            'kernel_sizes': [3, 4, 5],
            'dropout_rate': 0.5,
            'val_split': 0.4,
            'edim': 300,
            'n_words': None,  # Leave as none
            'std_dev': 0.05,
            'sentence_len': 50,
            'n_filters': 100,
            'batch_size': 50
        }
        config['n_words'] = 50000

        cla_cnn_batcher = CNN_ClaBatcher(hps_discriminator, vocab)
        cnn_classifier = CNN(config)
        sess_cnn_cls, saver_cnn_cls, train_dir_cnn_cls = setup_training_cnnclassifier(
            cnn_classifier)
        #util.load_ckpt(saver_cnn_cls, sess_cnn_cls, ckpt_dir="train-cnnclassification")
        run_train_cnn_classifier(cnn_classifier, cla_cnn_batcher, 1,
                                 sess_cnn_cls, saver_cnn_cls,
                                 train_dir_cnn_cls)  #1

        print("Generating test transfer files")
        files = os.listdir("test-generate-transfer/")
        for file_ in files:
            run_test_our_method(cla_cnn_batcher, cnn_classifier, sess_cnn_cls,
                                "test-generate-transfer/" + file_ + "/*")

    #elif FLAGS.mode == "test":

    elif FLAGS.mode == "train-generator":

        model_class = Classification(hps_discriminator, vocab)
        cla_batcher = ClaBatcher(hps_discriminator, vocab)
        sess_cls, saver_cls, train_dir_cls = setup_training_classification(
            model_class)

        model_sentiment = Sentimentor(hps_generator, vocab)
        sentiment_batcher = SenBatcher(hps_generator, vocab)
        sess_sen, saver_sen, train_dir_sen = setup_training_sentimentor(
            model_sentiment)

        config = {
            'n_epochs': 5,
            'kernel_sizes': [3, 4, 5],
            'dropout_rate': 0.5,
            'val_split': 0.4,
            'edim': 300,
            'n_words': None,  # Leave as none
            'std_dev': 0.05,
            'sentence_len': 50,
            'n_filters': 100,
            'batch_size': 50
        }
        config['n_words'] = 50000

        cla_cnn_batcher = CNN_ClaBatcher(hps_discriminator, vocab)
        cnn_classifier = CNN(config)
        sess_cnn_cls, saver_cnn_cls, train_dir_cnn_cls = setup_training_cnnclassifier(
            cnn_classifier)

        model = Generator(hps_generator, vocab)
        batcher = GenBatcher(vocab, hps_generator)
        sess_ge, saver_ge, train_dir_ge = setup_training_generator(model)

        #util.load_ckpt(saver_cnn_cls, sess_cnn_cls, ckpt_dir="train-cnnclassification")
        util.load_ckpt(saver_sen, sess_sen, ckpt_dir="train-sentimentor")

        generated = Generated_sample(model, vocab, batcher, sess_ge)
        tf.logging.info("Start pre-training generator......")
        run_pre_train_generator(model, batcher, 1, sess_ge, saver_ge,
                                train_dir_ge, generated, cla_cnn_batcher,
                                cnn_classifier, sess_cnn_cls)  # 4

        generated.generate_test_negetive_example(
            "temp_negetive",
            batcher)  # batcher, model_class, sess_cls, cla_batcher
        generated.generate_test_positive_example("temp_positive", batcher)
        tf.logging.info("finished pre-training generator")

        #run_test_our_method(cla_cnn_batcher, cnn_classifier, sess_cnn_cls,
        #                    "temp_negetive" + "/*")

        tf.logging.info("begin reinforcement learning:")
        total_epochs = 30
        step = 1
        for epoch in range(total_epochs):
            batches = batcher.get_batches(mode='train')
            tf.logging.info("num_batches: {}".format(len(batches)))
            for i in range(len(batches)):
                current_batch = copy.deepcopy(batches[i])
                sentiment_batch = batch_sentiment_batch(
                    current_batch, sentiment_batcher)
                result = model_sentiment.max_generator(sess_sen,
                                                       sentiment_batch)
                weight = result['generated']
                current_batch.weight = weight
                sentiment_batch.weight = weight

                cla_batch = batch_classification_batch(current_batch, batcher,
                                                       cla_batcher)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)

                cc = SmoothingFunction()

                reward_sentiment = 1 - np.abs(0.5 - result['y_pred_auc'])
                reward_BLEU = []
                for k in range(FLAGS.batch_size):
                    reward_BLEU.append(
                        sentence_bleu(
                            [current_batch.original_reviews[k].split()],
                            cla_batch.original_reviews[k].split(),
                            smoothing_function=cc.method1))

                reward_BLEU = np.array(reward_BLEU)

                reward_de = (2 / (1.0 / (1e-6 + reward_sentiment) + 1.0 /
                                  (1e-6 + reward_BLEU)))

                result = model.run_train_step(sess_ge, current_batch)
                train_step = result[
                    'global_step']  # we need this to update our running average loss
                loss = result['loss']
                tf.logging.info('epoch: %d/%d, step: %d/%d, loss: %f',
                                epoch + 1, total_epochs, i + 1, len(batches),
                                loss)

                if not step % 10000:
                    tf.logging.info("generating test examples")
                    generated.generate_test_negetive_example(
                        "test-generate-transfer/" + str(epoch) + "epoch_step" +
                        str(train_step) + "_temp_positive", batcher)
                    generated.generate_test_positive_example(
                        "test-generate/" + str(epoch) + "epoch_step" +
                        str(train_step) + "_temp_positive", batcher)
                    # saver_ge.save(sess, train_dir + "/model", global_step=train_step)
                    run_test_our_method(
                        cla_cnn_batcher, cnn_classifier, sess_cnn_cls,
                        "test-generate-transfer/" + str(epoch) + "epoch_step" +
                        str(train_step) + "_temp_positive" + "/*")

                tf.logging.info("classifying output and evaluating")
                cla_batch, bleu = output_to_classification_batch(
                    result['generated'], current_batch, batcher, cla_batcher,
                    cc)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)
                reward_result_sentiment = result['y_pred_auc']
                reward_result_bleu = np.array(bleu)

                reward_result = (2 / (1.0 /
                                      (1e-6 + reward_result_sentiment) + 1.0 /
                                      (1e-6 + reward_result_bleu)))

                current_batch.score = 1 - current_batch.score

                result = model.max_generator(sess_ge, current_batch)

                tf.logging.info("classifying output and re-evaluating")
                cla_batch, bleu = output_to_classification_batch(
                    result['generated'], current_batch, batcher, cla_batcher,
                    cc)
                result = model_class.run_ypred_auc(sess_cls, cla_batch)
                reward_result_transfer_sentiment = result['y_pred_auc']
                reward_result_transfer_bleu = np.array(bleu)

                reward_result_transfer = (
                    2 / (1.0 /
                         (1e-6 + reward_result_transfer_sentiment) + 1.0 /
                         (1e-6 + reward_result_transfer_bleu)))

                #tf.logging.info("reward_nonsentiment: "+str(reward_sentiment) +" output_original_sentiment: "+str(reward_result_sentiment)+" output_original_bleu: "+str(reward_result_bleu))

                reward = reward_result_transfer  #reward_de + reward_result_sentiment +
                #tf.logging.info("reward_de: "+str(reward_de))

                tf.logging.info("running sentiment train step")
                model_sentiment.run_train_step(sess_sen, sentiment_batch,
                                               reward)
                step += 1
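The rewards in the loop above combine a classifier score and a BLEU score through a smoothed harmonic mean, 2 / (1/(1e-6 + a) + 1/(1e-6 + b)), which stays high only when both components are high. A small numpy sketch with made-up values:

import numpy as np

reward_sentiment = np.array([0.9, 0.9, 0.2])   # classifier confidence per example (made up)
reward_bleu = np.array([0.8, 0.1, 0.8])        # content-preservation BLEU per example (made up)

reward = 2 / (1.0 / (1e-6 + reward_sentiment) + 1.0 / (1e-6 + reward_bleu))
print(reward)   # roughly [0.85, 0.18, 0.32]: low whenever either component is low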
Example 15
    def compute_bleu(self, predictions):

        # Hide warnings
        warnings.filterwarnings('ignore')

        # NLTK
        # Download Punkt tokenizer (for word_tokenize method)
        # Download stopwords (for stopword removal)
        nltk.download('punkt')
        nltk.download('stopwords')

        # English Stopwords
        stops = set(stopwords.words("english"))

        #stops.remove('no')

        # Stemming
        stemmer = SnowballStemmer("english")

        # Remove punctuation from string
        translator = str.maketrans('', '', string.punctuation)

        candidate_pairs = self.readresult(predictions)

        gt_pairs = self.readresult(self.gt)

        # Define max score and current score
        max_score = len(gt_pairs)
        current_score = 0

        i = 0
        for image_key in candidate_pairs:

            # Get candidate and GT caption
            candidate_caption = candidate_pairs[image_key]
            gt_caption = gt_pairs[image_key]

            # Optional - Go to lowercase
            if not VqaMedEvaluator.case_sensitive:
                candidate_caption = candidate_caption.lower()
                gt_caption = gt_caption.lower()

            # Split caption into individual words (remove punctuation)
            candidate_words = nltk.tokenize.word_tokenize(
                candidate_caption.translate(translator))
            gt_words = nltk.tokenize.word_tokenize(
                gt_caption.translate(translator))

            # Optional - Remove stopwords
            if VqaMedEvaluator.remove_stopwords:
                candidate_words = [
                    word for word in candidate_words
                    if word.lower() not in stops
                ]
                gt_words = [
                    word for word in gt_words if word.lower() not in stops
                ]

            # Optional - Apply stemming
            if VqaMedEvaluator.stemming:
                candidate_words = [
                    stemmer.stem(word) for word in candidate_words
                ]
                gt_words = [stemmer.stem(word) for word in gt_words]

            # Calculate BLEU score for the current caption
            try:
                # If both the GT and candidate are empty, assign a score of 1 for this caption
                if len(gt_words) == 0 and len(candidate_words) == 0:
                    bleu_score = 1
                # Calculate the BLEU score
                else:
                    bleu_score = nltk.translate.bleu_score.sentence_bleu(
                        [gt_words],
                        candidate_words,
                        smoothing_function=SmoothingFunction().method0)
            # Handle problematic cases where BLEU score calculation is impossible
            except ZeroDivisionError:
                # Score this caption as 0 instead of reusing a stale (or
                # undefined) bleu_score value from a previous iteration.
                bleu_score = 0
                #raise Exception('Problem with {} {}', gt_words, candidate_words)

            # Increase calculated score
            current_score += bleu_score

        return current_score / max_score
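
# A minimal standalone sketch (not part of the original evaluator) of the same
# per-caption pipeline as compute_bleu for a single candidate/reference pair:
# lowercase, strip punctuation, optionally drop stopwords and stem, then take a
# smoothed sentence-level BLEU. All names below are illustrative, and the punkt
# and stopwords resources are assumed to be downloaded already.
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


def caption_bleu(candidate, reference, remove_stopwords=True, stemming=True):
    translator = str.maketrans('', '', string.punctuation)
    stops = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english")

    cand = nltk.tokenize.word_tokenize(candidate.lower().translate(translator))
    ref = nltk.tokenize.word_tokenize(reference.lower().translate(translator))
    if remove_stopwords:
        cand = [w for w in cand if w not in stops]
        ref = [w for w in ref if w not in stops]
    if stemming:
        cand = [stemmer.stem(w) for w in cand]
        ref = [stemmer.stem(w) for w in ref]
    if not ref and not cand:
        return 1.0
    try:
        return sentence_bleu([ref], cand,
                             smoothing_function=SmoothingFunction().method0)
    except ZeroDivisionError:
        return 0.0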
Example n. 16
def get_bleu4(dialog_acts, golden_utts, gen_utts, data_key):
    das2utts = {}
    for das, utt, gen in zip(dialog_acts, golden_utts, gen_utts):
        intent_frequency = defaultdict(int)
        for act in das:
            cur_act = copy.copy(act)

            # intent list
            facility = None  # for 酒店设施
            if '酒店设施' in cur_act[2]:
                facility = cur_act[2].split('-')[1]
                if cur_act[0] == 'Inform':
                    cur_act[2] = cur_act[2].split('-')[0] + '+' + cur_act[3]
                elif cur_act[0] == 'Request':
                    cur_act[2] = cur_act[2].split('-')[0]
            if cur_act[0] == 'Select':
                cur_act[2] = '源领域+' + cur_act[3]
            intent = '+'.join(cur_act[:-1])
            if '+'.join(cur_act) == 'Inform+景点+门票+免费' or cur_act[-1] == '无':
                intent = '+'.join(cur_act)

            intent_frequency[intent] += 1

            # utt content replacement
            if (act[0] in ['Inform', 'Recommend']
                    or '酒店设施' in intent) and not intent.endswith('无'):
                if act[3] in utt or (facility and facility in utt):
                    # value to be replaced
                    if '酒店设施' in intent:
                        value = facility
                    else:
                        value = act[3]

                    # placeholder
                    placeholder = '[' + intent + ']'
                    placeholder_one = '[' + intent + '1]'
                    placeholder_with_number = '[' + intent + str(
                        intent_frequency[intent]) + ']'

                    if intent_frequency[intent] > 1:
                        utt = utt.replace(placeholder, placeholder_one)
                        utt = utt.replace(value, placeholder_with_number)

                        gen = gen.replace(placeholder, placeholder_one)
                        gen = gen.replace(value, placeholder_with_number)
                    else:
                        utt = utt.replace(value, placeholder)
                        gen = gen.replace(value, placeholder)

        hash_key = ''
        for act in sorted(das):
            hash_key += act2intent(act)
        das2utts.setdefault(hash_key, {'refs': [], 'gens': []})
        das2utts[hash_key]['refs'].append(utt)
        das2utts[hash_key]['gens'].append({'das': das, 'gen': gen})

    refs, gens = [], []
    for das in das2utts.keys():
        assert len(das2utts[das]['refs']) == (len(das2utts[das]['gens']))
        for gen_pair in das2utts[das]['gens']:
            lex_das = gen_pair['das']  # das w/ value
            gen = gen_pair['gen']
            lex_gen = value_replace(gen, lex_das)
            gens.append([x for x in jieba.lcut(lex_gen) if x.strip()])
            refs.append([[
                x for x in jieba.lcut(value_replace(s, lex_das)) if x.strip()
            ] for s in das2utts[das]['refs']])

    with open(os.path.join('', 'generated_sens_%s.json' % data_key),
              'w',
              encoding='utf-8') as f:
        json.dump({
            'refs': refs,
            'gens': gens
        },
                  f,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
    print('generated_sens_%s.json saved!' % data_key)

    print('Start calculating bleu score...')
    bleu = corpus_bleu(refs,
                       gens,
                       weights=(0.25, 0.25, 0.25, 0.25),
                       smoothing_function=SmoothingFunction().method1)
    return bleu
Example n. 17
def get_metrics(pred, target):
    turns = len(target)
    bleu_2 = 0
    bleu_4 = 0
    meteor = 0
    nist_2 = 0
    nist_4 = 0
    for index in range(turns):
        pred_utt = pred[index]
        target_utt = target[index]
        min_len = min(len(pred_utt), len(target_utt))
        lens = min(min_len, 4)
        if lens == 0:
            continue
        if lens >= 4:
            bleu_4_utt = sentence_bleu([target_utt], pred_utt,
                                       weights=(0.25, 0.25, 0.25, 0.25),
                                       smoothing_function=SmoothingFunction().method1)
            nist_4_utt = sentence_nist([target_utt], pred_utt, 4)
        else:
            bleu_4_utt = 0
            nist_4_utt = 0
        if lens >= 2:
            bleu_2_utt = sentence_bleu([target_utt], pred_utt,
                                       weights=(0.5, 0.5),
                                       smoothing_function=SmoothingFunction().method1)
            nist_2_utt = sentence_nist([target_utt], pred_utt, 2)
        else:
            bleu_2_utt = 0
            nist_2_utt = 0
            
        bleu_2 += bleu_2_utt
        bleu_4 += bleu_4_utt
        meteor += meteor_score([" ".join(target_utt)], " ".join(pred_utt))
        nist_2 += nist_2_utt
        nist_4 += nist_4_utt
        
    bleu_2 /= turns
    bleu_4 /= turns
    meteor /= turns
    nist_2 /= turns
    nist_4 /= turns
    return bleu_2, bleu_4, meteor, nist_2, nist_4
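
# Hedged usage sketch for get_metrics (not from the original file): both
# arguments are lists of tokenized utterances, one entry per dialogue turn, and
# the sample sentences are made up. It assumes sentence_bleu, sentence_nist,
# meteor_score and SmoothingFunction are imported as the function requires, and
# an NLTK version whose meteor_score accepts raw strings as above.
pred_turns = [["i", "want", "a", "cheap", "hotel"],
              ["book", "it", "for", "two", "nights"]]
gold_turns = [["i", "would", "like", "a", "cheap", "hotel"],
              ["please", "book", "it", "for", "two", "nights"]]
bleu_2, bleu_4, meteor, nist_2, nist_4 = get_metrics(pred_turns, gold_turns)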
Example n. 18
def calculate_bleu(pred_trg, real_trg):
    smoothie = SmoothingFunction().method4
    return sentence_bleu(real_trg, pred_trg, smoothing_function=smoothie)
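
# Hedged usage note (illustrative values only): sentence_bleu treats its first
# argument as a list of reference token lists, so real_trg is expected to look
# like [["the", "cat", ...]] while pred_trg is a single token list.
example_refs = [["the", "cat", "sat", "on", "the", "mat"]]
example_pred = ["the", "cat", "sat", "on", "a", "mat"]
example_score = calculate_bleu(example_pred, example_refs)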
Example n. 19
def test_bleu_bug():
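    # With a one-token hypothesis there are no higher-order n-grams, so
    # corpus_bleu builds a Fraction with a zero denominator and (in the NLTK
    # version this test targets) raises ZeroDivisionError.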
    ref = [[[1, 3], [3], [4]]]
    gen = [[1]]
    with pytest.raises(ZeroDivisionError):
        corpus_bleu(ref, gen, smoothing_function=SmoothingFunction().method3)
Example n. 20
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction


smooth = SmoothingFunction()


def eval_bleu(ref, pred):
    """
    :param ref: list(list(list(any))), a list of reference sentences, each element of the list is a list of references
    :param pred: list(list(any)), a list of predictions
    :return: corpus bleu score
    """
    return corpus_bleu(ref, pred, smoothing_function=smooth.method1)
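
# Hedged usage sketch matching the docstring shapes (tokens are illustrative):
# each prediction is paired with a list of reference token lists.
example_ref = [[["the", "cat", "sat", "down"],
                ["a", "cat", "was", "sitting", "down"]],
               [["hello", "world", "my", "friend"]]]
example_pred = [["the", "cat", "sat", "down"],
                ["hello", "there", "world", "friend"]]
example_bleu = eval_bleu(example_ref, example_pred)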
Example n. 21
    def rl_train(self, sess, batch, with_ce):
        feed_dict = self.run_encoder(sess, batch, only_feed_dict=True)
        feed_dict[self.decoder_inputs] = batch.decoder_inputs

        greedy_outputs = sess.run(self.greedy_words, feed_dict)
        greedy_outputs = greedy_outputs.tolist()
        gold_output = batch.question_words.tolist()

        # baseline outputs by flipping coin
        flipp = 0.1
        baseline_outputs = np.copy(batch.question_words)
        for i in range(batch.question_words.shape[0]):
            seq_len = min(self.flags.max_question_len,
                          batch.question_lengths[i] -
                          1)  # don't change stop token '</s>'
            for j in range(seq_len):
                if greedy_outputs[i][j] != 0 and random.random() < flipp:
                    baseline_outputs[i, j] = greedy_outputs[i][j]
        baseline_outputs = baseline_outputs.tolist()

        rl_inputs = []
        rl_outputs = []
        rl_input_lengths = []
        rewards = []
        for i, (baseline_output, greedy_output) in enumerate(
                zip(baseline_outputs, greedy_outputs)):
            _, baseline_output_words = self.word_vocab.getLexical(
                baseline_output)
            greedy_output, greedy_output_words = self.word_vocab.getLexical(
                greedy_output)
            _, gold_output_words = self.word_vocab.getLexical(gold_output[i])

            rl_inputs.append([int(batch.decoder_inputs[i, 0])] +
                             greedy_output[:-1])
            rl_outputs.append(greedy_output)
            rl_input_lengths.append(len(greedy_output))

            baseline_output_words_list = baseline_output_words.split()
            greedy_output_words_list = greedy_output_words.split()
            gold_output_words_list = gold_output_words.split()

            if self.flags.reward_type == 'bleu':
                cc = SmoothingFunction()
                reward = sentence_bleu([gold_output_words_list],
                                       greedy_output_words_list,
                                       smoothing_function=cc.method3)
                baseline = sentence_bleu([gold_output_words_list],
                                         baseline_output_words_list,
                                         smoothing_function=cc.method3)
                rewards.append(reward - baseline)

            elif self.flags.reward_type == 'rouge':
                reward = rouge.rouge([gold_output_words],
                                     [greedy_output_words])["rouge_l/f_score"]
                baseline = rouge.rouge(
                    [gold_output_words],
                    [baseline_output_words])["rouge_l/f_score"]
                rewards.append(reward - baseline)

            else:
                raise ValueError("Reward type is not bleu or rouge!")

        rl_inputs = padding_utils.pad_2d_vals(rl_inputs, len(rl_inputs),
                                              self.flags.max_question_len)
        rl_outputs = padding_utils.pad_2d_vals(rl_outputs, len(rl_outputs),
                                               self.flags.max_question_len)
        rl_input_lengths = np.array(rl_input_lengths, dtype=np.int32)
        rewards = np.array(rewards, dtype=np.float32)
        #reward = rescale(reward)
        assert rl_inputs.shape == rl_outputs.shape

        feed_dict = self.run_encoder(sess, batch, only_feed_dict=True)
        feed_dict[self.rewards] = rewards

        if with_ce:
            feed_dict[self.decoder_inputs_rl] = rl_inputs
            feed_dict[self.question_words_rl] = rl_outputs
            feed_dict[self.question_lengths_rl] = rl_input_lengths
            feed_dict[self.decoder_inputs] = batch.decoder_inputs
            feed_dict[self.question_words] = batch.question_words
            feed_dict[self.question_lengths] = batch.question_lengths

        else:
            feed_dict[self.decoder_inputs] = rl_inputs
            feed_dict[self.question_words] = rl_outputs
            feed_dict[self.question_lengths] = rl_input_lengths

        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss
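
# Minimal standalone sketch (illustrative, not tied to the model or batch
# objects above) of the baseline-subtracted BLEU reward used in rl_train: the
# reward for the greedy output is its sentence BLEU against the gold words
# minus the BLEU of the perturbed baseline output.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


def bleu_reward(gold_tokens, greedy_tokens, baseline_tokens):
    cc = SmoothingFunction()
    reward = sentence_bleu([gold_tokens], greedy_tokens,
                           smoothing_function=cc.method3)
    baseline = sentence_bleu([gold_tokens], baseline_tokens,
                             smoothing_function=cc.method3)
    return reward - baseline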
Example n. 22
def modified_corpus_bleu(list_of_references,
                         hypotheses,
                         weights=(0.25, 0.25, 0.25, 0.25),
                         smoothing_function=None,
                         auto_reweigh=False):
    """
    modified from nltk.translate.bleu_score.corpus_bleu,
    returns 'multi-bleu.perl'-like intermediate results.
    Args:
        list_of_references: a list (one entry per hypothesis) of lists of
            reference token lists.
        hypotheses: a list of hypothesis token lists.
        weights: n-gram weights, uniform 4-gram by default.
        smoothing_function: an nltk SmoothingFunction method (method0 if None).
        auto_reweigh: uniformly re-weight when the corpus hypothesis length is
            below the highest n-gram order and the default weights are used.

    Returns:
        (bleu, p_n, bp, hyp_len / ref_len, hyp_len, ref_len), or 0 when there
        are no matching unigrams.
    """
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter(
    )  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter(
    )  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), f"The number of hypotheses and their reference(s) should be " \
                                                       f"the same: {len(list_of_references)} != {len(hypotheses)}"

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i, _ in enumerate(weights, start=1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weighting based on maximum hypothesis lengths if largest
    # order of n-grams < 4 and weights is set at default.
    if auto_reweigh:
        if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths, ) * hyp_lengths

    # Collects the various precision values for the different ngram orders.
    p_n = [
        Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If there's no smoothing function, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n,
                             references=references,
                             hypothesis=hypothesis,
                             hyp_len=hyp_len)
    s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s, p_n, bp, hyp_lengths / ref_lengths, hyp_lengths, ref_lengths
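
# Hedged usage sketch (token lists are illustrative; assumes Counter, Fraction,
# math and the nltk helpers modified_precision, closest_ref_length and
# brevity_penalty are imported as in the original module). With at least one
# matching unigram the function returns the 'multi-bleu.perl'-style tuple;
# otherwise it returns the scalar 0.
example_refs = [[["the", "cat", "is", "on", "the", "mat"]],
                [["there", "is", "a", "cat", "on", "the", "mat"]]]
example_hyps = [["the", "cat", "is", "on", "the", "mat"],
                ["there", "is", "a", "cat", "on", "the", "mat"]]
bleu, p_n, bp, ratio, hyp_len, ref_len = modified_corpus_bleu(example_refs,
                                                              example_hyps)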
Example n. 23
def evaluate_seq2seq_decode_results(dataset, seq2seq_decode_file, seq2seq_ref_file, verbose=True, is_nbest=False):
    from lang.py.parse import parse

    f_seq2seq_decode = open(seq2seq_decode_file)
    f_seq2seq_ref = open(seq2seq_ref_file)

    if verbose:
        logging.info('evaluating [%s] set, [%d] examples', dataset.name, dataset.count)

    cum_bleu = 0.0
    cum_acc = 0.0
    sm = SmoothingFunction()

    decode_file_data = [l.strip() for l in f_seq2seq_decode.readlines()]
    ref_code_data = [l.strip() for l in f_seq2seq_ref.readlines()]

    if is_nbest:
        for i in range(len(decode_file_data)):
            d = decode_file_data[i].split(' ||| ')
            decode_file_data[i] = (int(d[0]), d[1])

    def is_well_formed_python_code(_hyp):
        try:
            _hyp = _hyp.replace('#NEWLINE#', '\n').replace('#INDENT#', '    ').replace(' #MERGE# ', '')
            hyp_ast_tree = parse(_hyp)
            return True
        except:
            return False

    for eid in range(dataset.count):
        example = dataset.examples[eid]
        cur_example_correct = False

        if is_nbest:
            # find the best-scored well-formed code from the n-best list
            n_best_list = [x for x in decode_file_data if x[0] == eid]
            code = top_scored_code = n_best_list[0][1]
            for _, hyp in n_best_list:
                if is_well_formed_python_code(hyp):
                    code = hyp
                    break

            if top_scored_code != code:
                print('*' * 60)
                print(top_scored_code)
                print(code)
                print('*' * 60)

            code = n_best_list[0][1]
        else:
            code = decode_file_data[eid]

        code = code.replace('#NEWLINE#', '\n').replace('#INDENT#', '    ').replace(' #MERGE# ', '')
        ref_code = ref_code_data[eid].replace('#NEWLINE#', '\n').replace('#INDENT#', '    ').replace(' #MERGE# ', '')

        if code == ref_code:
            cum_acc += 1
            cur_example_correct = True


        if data_type == 'django':
            ref_code_for_bleu = example.meta_data['raw_code']
            pred_code_for_bleu = code # de_canonicalize_code(code, example.meta_data['raw_code'])
            # ref_code_for_bleu = de_canonicalize_code(ref_code_for_bleu, example.meta_data['raw_code'])
            # convert canonicalized code to raw code
            for literal, place_holder in example.meta_data['str_map'].items():
                pred_code_for_bleu = pred_code_for_bleu.replace('\'' + place_holder + '\'', literal)
                # ref_code_for_bleu = ref_code_for_bleu.replace('\'' + place_holder + '\'', literal)
        elif data_type == 'hs':
            ref_code_for_bleu = example.code
            pred_code_for_bleu = code

        # we apply Ling Wang's trick when evaluating BLEU scores
        refer_tokens_for_bleu = tokenize_for_bleu_eval(ref_code_for_bleu)
        pred_tokens_for_bleu = tokenize_for_bleu_eval(pred_code_for_bleu)

        ngram_weights = [0.25] * min(4, len(refer_tokens_for_bleu))
        bleu_score = sentence_bleu([refer_tokens_for_bleu], pred_tokens_for_bleu, weights=ngram_weights, smoothing_function=sm.method3)
        cum_bleu += bleu_score

    cum_bleu /= dataset.count
    cum_acc /= dataset.count

    logging.info('sentence level bleu: %f', cum_bleu)
    logging.info('accuracy: %f', cum_acc)
Example n. 24
from __future__ import print_function, division

import torch
import torchtext

import seq2seq
import autoeval
from seq2seq.loss import NLLLoss
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from autoeval.eval_embedding import Embed
from autoeval.eval_distinct import distinct

smoothie = SmoothingFunction().method4


class Evaluator(object):
    """ Class to evaluate models with given datasets.

    Args:
        loss (seq2seq.loss, optional): loss for evaluator (default: seq2seq.loss.NLLLoss)
        batch_size (int, optional): batch size for evaluator (default: 64)
    """
    def __init__(self, loss=NLLLoss(), batch_size=64):
        self.loss = loss
        self.batch_size = batch_size

    def evaluate(self,
                 model,
                 data,
                 vocabs=None,
Example n. 25
    def __init__(self):
        self.rouge = Rouge()
        self.smooth = SmoothingFunction().method1
        self.best_bleu = 0.
Example n. 26
    def get_sentence_bleu(self, example, hyp):
        return sentence_bleu([tokenize_for_bleu_eval(example.info['example_dict']['snippet'])],
                             tokenize_for_bleu_eval(hyp.decanonical_code),
                             smoothing_function=SmoothingFunction().method3)
Example n. 27
import time
import numpy as np
import codecs

from vocab_utils import Vocab
import namespace_utils
import NP2P_data_stream
from NP2P_model_graph import ModelGraph

FLAGS = None
import tensorflow as tf
tf.logging.set_verbosity(
    tf.logging.ERROR)  # DEBUG, INFO, WARN, ERROR, and FATAL

from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu
cc = SmoothingFunction()

import metric_utils

import platform


def get_machine_name():
    return platform.node()


def vec2string(val):
    result = ""
    for v in val:
        result += " {}".format(v)
    return result.strip()
Example n. 28
    def evaluate_dataset(self, dataset, decode_results, fast_mode=False):
        examples = dataset
        assert len(examples) == len(decode_results)

        # speed up, cache tokenization results
        if not hasattr(examples[0], 'reference_code_tokens'):
            for example in examples:
                setattr(example, 'reference_code_tokens', tokenize_for_bleu_eval(example.info['example_dict']['snippet']))

        if not hasattr(decode_results[0][0], 'decanonical_code_tokens'):
            for i, example in enumerate(examples):
                hyp_list = decode_results[i]
                # here we prune any hypothesis that throws an error when converting back to the decanonical code!
                # This modifies the decode_results in-place!
                filtered_hyp_list = []
                for hyp in hyp_list:
                    if not hasattr(hyp, 'decanonical_code'):
                        try:
                            hyp.decanonical_code = decanonicalize_code(hyp.code, slot_map=example.info['slot_map'])
                            if hyp.decanonical_code:
                                hyp.decanonical_code_tokens = tokenize_for_bleu_eval(hyp.decanonical_code)
                                filtered_hyp_list.append(hyp)
                        except: pass

                decode_results[i] = filtered_hyp_list

        if fast_mode:
            references = [e.reference_code_tokens for e in examples]
            hypotheses = [hyp_list[0].decanonical_code_tokens if hyp_list else [] for hyp_list in decode_results]

            bleu_tup = compute_bleu([[x] for x in references], hypotheses, smooth=False)
            bleu = bleu_tup[0]

            return bleu
        else:
            tokenized_ref_snippets = []
            hyp_code_tokens = []
            best_hyp_code_tokens = []
            sm_func = SmoothingFunction().method3
            sent_bleu_scores = []
            oracle_bleu_scores = []
            oracle_exact_match = []
            for example, hyp_list in zip(examples, decode_results):
                tokenized_ref_snippets.append(example.reference_code_tokens)
                example_hyp_bleu_scores = []
                if hyp_list:
                    for i, hyp in enumerate(hyp_list):
                        hyp.bleu_score = sentence_bleu([example.reference_code_tokens],
                                                       hyp.decanonical_code_tokens,
                                                       smoothing_function=sm_func)
                        hyp.is_correct = self.is_hyp_correct(example, hyp)

                        example_hyp_bleu_scores.append(hyp.bleu_score)

                    top_decanonical_code_tokens = hyp_list[0].decanonical_code_tokens
                    sent_bleu_score = hyp_list[0].bleu_score

                    best_hyp_idx = np.argmax(example_hyp_bleu_scores)
                    oracle_sent_bleu = example_hyp_bleu_scores[best_hyp_idx]
                    _best_hyp_code_tokens = hyp_list[best_hyp_idx].decanonical_code_tokens
                else:
                    top_decanonical_code_tokens = []
                    sent_bleu_score = 0.
                    oracle_sent_bleu = 0.
                    _best_hyp_code_tokens = []

                oracle_exact_match.append(any(hyp.is_correct for hyp in hyp_list))
                hyp_code_tokens.append(top_decanonical_code_tokens)
                sent_bleu_scores.append(sent_bleu_score)
                oracle_bleu_scores.append(oracle_sent_bleu)
                best_hyp_code_tokens.append(_best_hyp_code_tokens)

            bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets], hyp_code_tokens, smooth=False)
            corpus_bleu = bleu_tup[0]

            bleu_tup = compute_bleu([[x] for x in tokenized_ref_snippets], best_hyp_code_tokens, smooth=False)
            oracle_corpus_bleu = bleu_tup[0]

            avg_sent_bleu = np.average(sent_bleu_scores)
            oracle_avg_sent_bleu = np.average(oracle_bleu_scores)
            exact = sum([1 if h == r else 0 for h, r in zip(hyp_code_tokens, tokenized_ref_snippets)]) / float(
                len(examples))
            oracle_exact_match = np.average(oracle_exact_match)

            return {'corpus_bleu': corpus_bleu,
                    'oracle_corpus_bleu': oracle_corpus_bleu,
                    'avg_sent_bleu': avg_sent_bleu,
                    'oracle_avg_sent_bleu': oracle_avg_sent_bleu,
                    'exact_match': exact,
                    'oracle_exact_match': oracle_exact_match}
Example n. 29
    def get_bleu(self):
        ngram = self.gram
        bleu = list()
        reference = self.get_reference()
        weight = tuple(1. / ngram for _ in range(ngram))
        with open(self.test_data) as test_data:
            for hypothesis in test_data:
                hypothesis = nltk.word_tokenize(hypothesis)
                bleu.append(nltk.translate.bleu_score.sentence_bleu(
                    reference, hypothesis, weight,
                    smoothing_function=SmoothingFunction().method1))
        return sum(bleu) / len(bleu)
Example n. 30
def bleu_score(a, b):
    cc = SmoothingFunction()
    bl = sentence_bleu([a], b, smoothing_function=cc.method1)
    return bl
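
# Illustrative call (made-up tokens): `a` is used as the single reference and
# `b` as the hypothesis, both given as token lists.
example_score = bleu_score(["the", "cat", "sat", "on", "the", "mat"],
                           ["the", "cat", "sat", "on", "a", "mat"])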