Example 1
    def compute_metrics(self, data, preds):
        em = 0
        hyps, refs = [], []
        utt_hyps, utt_refs = [], []
        for ex in data:
            p = preds[ex['id']]
            gsql = ex['g_query_recov'].lower().split()
            psql = p['query'].lower().split()
            refs.append([gsql])
            hyps.append(psql)
            em += psql == gsql  # bools add as 0/1: running exact-match count

            utt_hyps.append(p['utt_toks'])
            utt_refs.append([ex['g_question_toks']])
        metrics = {
            'em': em / len(data),
            'bleu': corpus_bleu(refs, hyps,
                                smoothing_function=SmoothingFunction().method3),
            'utt_bleu': corpus_bleu(utt_refs, utt_hyps,
                                    smoothing_function=SmoothingFunction().method3),
        }
        if not self.training:
            metrics.update(self.compute_official_eval(data, preds))
        else:
            metrics['official_em'] = metrics['em']
        return metrics
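These compute_metrics snippets rely on NLTK's corpus_bleu, which nests references one level deeper than hypotheses: each hypothesis is a token list, while each entry of the reference list is a list of acceptable token lists. A minimal standalone sketch of that shape contract, with toy data:

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# One hypothesis (token list) per example; single-reference data is wrapped
# in an extra list because several references per example are allowed.
hyps = ['select name from users'.split()]
refs = [['select name from users'.split()]]
score = corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method3)
print(score)  # 1.0 for an exact match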
Example 2
def evaluate(gen_chars, gen_actions, gen_outputs, ref_chars, ref_actions, ref_outputs, idx2word):
    '''
    Reports char_acc, action_acc, and corpus BLEU (1-4).
    '''
    # char
    total_sum = total_length = 0
    for gc, rc in zip(gen_chars, ref_chars):
        eq = np.equal(gc, rc)
        total_sum += sum(eq)
        total_length += len(eq)
    print('char_acc:{}'.format((total_sum / total_length) * 100), flush=True)
    
    # action
    total_sum = total_length = 0
    for ga, ra in zip(gen_actions, ref_actions):
        eq = np.equal(ga, ra)
        total_sum += sum(eq)
        total_length += len(eq)
    print('action_acc:{}'.format((total_sum / total_length) * 100), flush=True)
    
    gen_outputs = [' '.join(' '.join(idx2word[id.item()] for id in s) for s in outputs)
                   for outputs in gen_outputs]
    ref_outputs = [[' '.join(' '.join(idx2word[id.item()] for id in s) for s in outputs)]
                   for outputs in ref_outputs]  # TODO restore name?
    scores, _ = bleu.corpus_bleu(gen_outputs, ref_outputs, max_n=4)
    final_bleu, bleu1, bleu2, bleu3, bleu4 = scores
    print('bleu1:{}, bleu2:{}, bleu3:{}, bleu4:{}'.format(bleu1 * 100, bleu2 * 100, bleu3 * 100, bleu4 * 100), flush=True)
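The two accuracy loops above share one pattern: np.equal gives an elementwise boolean mask, and the ratio of matches to total positions is the accuracy. A self-contained sketch with toy ids:

import numpy as np

# Token-level accuracy over a batch of sequences (toy data).
gen = [np.array([1, 2, 3]), np.array([4, 5, 6])]
ref = [np.array([1, 2, 0]), np.array([4, 5, 6])]
masks = [np.equal(g, r) for g, r in zip(gen, ref)]
acc = sum(m.sum() for m in masks) / sum(len(m) for m in masks)
print('char_acc:{}'.format(acc * 100))  # 83.33...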
Example 3
def evaluate_trans(thenet, references, vali_data, vali_raw_data):
    hypothesis = []
    score_total = 0.
    num_word_total = 0
    for batch in vali_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src = thenet.translate(
            batch, vali_raw_data)
        score_total += sum([score[0] for score in pred_scores])
        num_word_total += sum(len(x) for x in batch.tgt[1:])
        hypothesis.extend([' '.join(x[0]) for x in pred_batch])
    ppl = math.exp(-score_total / num_word_total)
    # corpus_bleu returns ([final, ngram1, ngram2, ...], [bp, ...])
    bleu_score = bleu.corpus_bleu(hypothesis, references)[0][0]
    nlg_ref = [[x[0] for x in references if x is not None]]

    nlg_eval = NLGEval()
    save_txt('/fl/txtfile/rnn_h1.txt', hypothesis)
    metrics_eval = nlg_eval.compute_metrics(nlg_ref, hypothesis)
    print(metrics_eval)
    print('BLEU: {}'.format(bleu_score))
    # ppl for training/validation is computed in Statisci() in onmt/Trainer.py;
    # ppl for translating is computed in the reprot_score function in translate.py
    print('PPL: {}'.format(ppl))

    # the last slot is reserved for the rank number
    return torch.FloatTensor([ppl, bleu_score, 0.0])
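The ppl above is the usual corpus perplexity: with score_total summing per-sentence log-probabilities and num_word_total counting target tokens, ppl = exp(-score_total / num_word_total). A toy check:

import math

# Three tokens with natural-log probabilities summing to -3.0.
log_probs = [-0.5, -1.0, -1.5]
ppl = math.exp(-sum(log_probs) / len(log_probs))
print(ppl)  # exp(1.0) ≈ 2.718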
Example 4
    def compute_metrics(self, data, preds):
        em = 0
        hyps, refs = [], []
        for ex in data:
            if ex['id'] not in preds and self.training:
                continue
            p = preds[ex['id']]
            gsql = ex['query'].lower().split()
            psql = p['query'].lower().split()
            refs.append([gsql])
            hyps.append(psql)
            em += psql == gsql  # bools add as 0/1: running exact-match count
        metrics = {
            'em': em / len(data),
            'bleu': corpus_bleu(refs, hyps,
                                smoothing_function=SmoothingFunction().method3),
        }
        if not self.training:
            metrics.update(self.compute_official_eval(data, preds))
        else:
            metrics['official_em'] = metrics['em']
        return metrics
Example 5
    def compute_metrics(self, data, preds):
        utt_hyps, utt_refs = [], []
        generated = dataset.Dataset()
        for ex in data:
            p = preds[ex['id']]
            utt_hyps.append(p['utt_toks'])
            utt_refs.append([ex['g_question_toks']])

            # make new example
            db_id = ex['db_id']
            db = self.conv.database_schemas[db_id]
            question_toks = p['utt_toks']
            query_context = preprocess_nl2sql.SQLDataset.build_contexts(
                question_toks, db, self.bert_tokenizer)

            if 'g_sql' not in ex:
                ex['g_sql'] = self.conv.build_sql(ex['query'], db_id)

            new = dict(
                id=ex['id'],
                question=ex['question'],
                db_id=db_id,
                g_question_toks=question_toks,
                query=ex['query'],
                g_values=ex['g_values'],
                g_sql=ex['g_sql'],
                value_context=[self.bert_tokenizer.cls_token] + question_toks +
                [self.bert_tokenizer.sep_token],
                query_context=query_context,
                invalid=False,
                cands_query=preprocess_nl2sql.SQLDataset.make_column_cands(
                    query_context),
            )
            new['cands_query'], new['cands_value'] = \
                preprocess_nl2sql.SQLDataset.make_cands(new, self.nl2sql.sql_vocab)
            generated.append(new)

        metrics = {
            'utt_bleu': corpus_bleu(utt_refs, utt_hyps,
                                    smoothing_function=SmoothingFunction().method3),
        }
        if not self.training:
            with torch.no_grad():
                self.nl2sql.eval()
                preds = self.nl2sql.run_pred(generated,
                                             self.nl2sql.args,
                                             verbose=True,
                                             desc='cycle_pred')
            metrics.update(self.nl2sql.compute_official_eval(generated, preds))
        return metrics
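The cycle step above scores generated questions by parsing them back to SQL with the pretrained nl2sql model, using the standard PyTorch inference guard. A minimal self-contained sketch of that guard (hypothetical toy model, not the nl2sql network):

import torch
import torch.nn as nn

# eval() disables dropout; no_grad() skips autograd bookkeeping at inference.
model = nn.Sequential(nn.Linear(4, 2), nn.Dropout(0.5))
model.eval()
with torch.no_grad():
    preds = model(torch.randn(3, 4))
print(preds.shape)  # torch.Size([3, 2])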
Example 6
    def populate_train_dict(self, sess, targets_batch):
        """Prepares the feed dictionary for training.
        Args:
            targets_batch:  Target sentences in ids [batch_size, max_length]
        Returns:
            train_dict:     Feed dictionary for training
        """
        hidden_states = np.zeros(
            (self.config.max_length, len(self.cur_hypos), self.config.d_model),
            dtype=np.float32)
        actions = np.zeros((self.config.max_length, len(self.cur_hypos)),
                           dtype=np.int32)
        for step in range(self.config.max_length - 1):  # -1 since we already have a READ
            #prev_lengths = [len(x[0].actions) for x in self.cur_hypos]
            hidden_states[step, :, :] = self._get_hidden_states()
            actions[step, :], probs, _ = self.predict_one_step(
                sess, hidden_states[step, :, :])
            #logging.info("step %d     actions:" % step)
            #logging.info(actions[step,:])
            #logging.info("probs:")
            #logging.info(np.max(probs, axis=1))
            #logging.info("Actions length of 1st sentence %d" % len(self.cur_hypos[0][0].actions))
            #logging.info("\n")
            self._update_hidden_states(actions[step, :])
            #new_lengths = [len(x[0].actions) for x in self.cur_hypos]
            #logging.info("change in actions length:")
            #logging.info(np.array(new_lengths)-np.array(prev_lengths))
            #logging.info(np.array([x[0].netRead for x in self.cur_hypos]))
            #logging.info("\n")

        batch_average_delay = 0.0
        BLEU_hypos = []
        BLEU_refs = []
        # Generate full hypotheses from partial hypotheses
        for idx, hypo in enumerate(self.cur_hypos):
            self.cur_hypos[idx] = hypo[0].generate_full_hypothesis()
            batch_average_delay += self.cur_hypos[idx].get_average_delay()
            BLEU_hypos.append(
                [str(x) for x in self.cur_hypos[idx].trgt_sentence])
            BLEU_refs.append([[str(x) for x in targets_batch[idx]]])
        cum_rewards = self._get_bacth_cumulative_rewards(targets_batch)

        _, quality = corpus_bleu(BLEU_refs, BLEU_hypos)  # BLEU score for the batch
        batch_average_delay /= len(self.cur_hypos)
        logging.info("\n     batch average delay: %f\n" % batch_average_delay)
        logging.info("\n     batch BLEU:          %f\n" % quality)

        train_dict = self.create_feed_dict(
            np.reshape(hidden_states, (-1, self.config.d_model)),
            actions.flatten(), cum_rewards.flatten(), self.config.dropout)

        return train_dict
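The feed dict above flattens the [max_length, batch] arrays so that row t*batch + b of the reshaped hidden states lines up with element t*batch + b of the flattened actions and rewards. A quick shape check with toy sizes standing in for max_length, the batch size, and d_model:

import numpy as np

T, B, D = 5, 3, 8  # toy max_length, batch size, model dim
hidden_states = np.zeros((T, B, D), dtype=np.float32)
actions = np.zeros((T, B), dtype=np.int32)

flat_h = np.reshape(hidden_states, (-1, D))  # shape (T*B, D)
flat_a = actions.flatten()                   # shape (T*B,)
assert flat_h.shape[0] == flat_a.shape[0]    # rows and actions stay aligned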
Example 7
def evaluate_trans(thenet, references, vali_data, vali_raw_data):
    hypothesis = []
    score_total = 0.
    num_word_total = 0
    for batch in vali_data:
        pred_batch, gold_batch, pred_scores, gold_scores, attn, src = thenet.translate(
            batch, vali_raw_data)
        score_total += sum([score[0] for score in pred_scores])
        num_word_total += sum(len(x) for x in batch.tgt[1:])
        hypothesis.extend([' '.join(x[0]) for x in pred_batch])
    ppl = math.exp(-score_total / num_word_total)
    bleu_score = bleu.corpus_bleu(hypothesis, references)[0][0]

    # the last reserved for rank number
    return torch.FloatTensor([ppl, bleu_score, 0.0])
Example 8
for i in range(len(pre_references)):
    assert len(hypothesis) == len(pre_references[i])

references = []
for i in range(len(hypothesis)):
    ref_for_instance = []
    for j in range(len(pre_references)):
        ref_for_instance.append(pre_references[j][i])
    references.append(ref_for_instance)
assert len(references) == len(hypothesis)  # one reference list per hypothesis

# calculate ngram match (BLEU)
tokenized_hyps = [x.split() for x in hypothesis]
tokenized_refs = [[x.split() for x in reference] for reference in references]

ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps)

# calculate weighted ngram match
with open('keywords/' + args.lang + '.txt', 'r', encoding='utf-8') as f:
    keywords = [x.strip() for x in f.readlines()]


def make_weights(reference_tokens, key_word_list):
    # keywords get full weight (1); all other tokens get 0.2
    return {token: 1 if token in key_word_list else 0.2
            for token in reference_tokens}


tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)]
                                for reference_tokens in reference] for reference in tokenized_refs]
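The weighting scheme gives language keywords full weight and everything else 0.2, so keyword mismatches are penalized more heavily; in the full CodeBLEU script these weighted references presumably feed the weighted n-gram matcher. A quick check of make_weights with toy tokens:

# Toy check: keywords weigh 1, all other tokens 0.2.
weights = make_weights(['select', 'name', 'from', 'users'], ['select', 'from'])
print(weights)  # {'select': 1, 'name': 0.2, 'from': 1, 'users': 0.2}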
Example 9
File: Coach.py Project: zhyack/SCC
    def learn_comment_iter(self):
        self.trainExamples = []
        self.validExamples = []

        if 'train_text_files' in self.args:

            train_text_files, ns = self.getDataSetDistribution('text')

            for j, file_name in enumerate(train_text_files):
                trainExamples = loadTrainExamples(self.args['train_text_files'][0]+file_name)
                shuffle(trainExamples)
                self.trainExamples.extend(trainExamples[:ns[j]])
        
        if 'valid_text_files' in self.args:
            for file_name in self.args['valid_text_files'][1]:
                validExamples = loadTrainExamples(self.args['valid_text_files'][0]+file_name)
                self.validExamples.extend(validExamples)
        
        # define train classification
        shuffle(self.trainExamples)
        n_e = len(self.trainExamples)
        if n_e:
            class_examples = [[], [], [], [], [], []]
            for i in range(n_e):
                e = self.trainExamples[i]
                if e is None:
                    continue
                class_examples[0].append(e)
                if e[-1] is not None:
                    e = list(e)
                    if isinstance(e[-1], tuple):
                        cs = set(e[-1][1])
                        e[-1] = e[-1][0]
                        for c in cs:
                            class_examples[c].append(e)
                        if 'test_with_no_ai' in self.nnet.args and self.nnet.args['test_with_no_ai'] and 1 not in cs:
                            class_examples[1].append(e)
                    else:
                        le = e[-1].split('@\t@')
                        assert len(le) == 2
                        e[-1] = le[1]
                        cs = set([int(s) for s in le[0].split()])
                        for c in cs:
                            class_examples[c].append(e)
                        if 'test_with_no_ai' in self.nnet.args and self.nnet.args['test_with_no_ai'] and 1 not in cs:
                            class_examples[1].append(e)
                self.trainExamples[i] = None
            print('TrainExamples Distribution: ', [len(e) for e in class_examples])
            if self.args['comment_chess_training']:
                print('Training Chess: ')
                self.nnet.train(class_examples[0], transform=True, models=[0])
            for m in self.nnet.args['models']['comment']:
                i = ord(m)-ord('A')+1
                if len(class_examples[i]):
                    print('Training Class %s:\n'%(m))
                    self.nnet.train(class_examples[i], transform=True, models=[i])

        # define dev
        n_e = len(self.validExamples)
        if n_e:
            class_examples = [[], [], [], [], [], []]
            for i in range(n_e):
                e = self.validExamples[i]
                if e is None:
                    continue
                class_examples[0].append(e)
                if e[-1] is not None:
                    e = list(e)
                    if isinstance(e[-1], tuple):
                        cs = set(e[-1][1])
                        e[-1] = e[-1][0]
                        for c in cs:
                            class_examples[c].append(e)
                        if 'test_with_no_ai' in self.nnet.args and self.nnet.args['test_with_no_ai'] and 1 not in cs:
                            class_examples[1].append(e)
                    else:
                        le = e[-1].split('@\t@')
                        assert len(le) == 2
                        e[-1] = le[1]
                        cs = set([int(s) for s in le[0].split()])
                        for c in cs:
                            class_examples[c].append(e)
                        if 'test_with_no_ai' in self.nnet.args and self.nnet.args['test_with_no_ai'] and 1 not in cs:
                            class_examples[1].append(e)
                self.validExamples[i] = None
            print('ValidExamples Distribution: ', [len(e) for e in class_examples])
            for m in self.nnet.args['models']['comment']:
                i = ord(m)-ord('A')+1
                if len(class_examples[i]):
                    print('Evaluating Class %s:'%(m))
                    n = len(class_examples[i])
                    bsize = self.nnet.args['batch_size']
                    predict_texts = []
                    gold_texts = []
                    for b in range((n+bsize-1)//bsize):
                        batch = class_examples[i][b*bsize:min((b+1)*bsize,n)]
                        n_batch = len(batch)
                        if n_batch<bsize:
                            batch += class_examples[i][:bsize-n_batch]
                        boards, pis, vs, valids, texts = list(zip(*batch))
                        rets = list(self.nnet.predict(boards, [pis, valids], models=[i], transform=True))
                        for tb in range(n_batch):
                            if boards[tb][-1] == -1 and len(rets[tb]) > 0:
                                rets[tb] = self.postProcess(rets[tb], player="black")
                            elif len(rets[tb]) > 0:
                                rets[tb] = self.postProcess(rets[tb], player="white")
                        predict_texts.extend(rets[:n_batch])
                        gold_texts.extend(texts[:n_batch])
                        # for k in range(n_batch):
                        #     if random.random()<0.0005 or len(self.trainExamples)==0:
                        #         print('Board: ', boards[k][-1])
                        #         print(chess.Board(boards[k][0]).unicode().replace(u'·', u'.'))
                        #         print('Move: ', self.game.action_list[pis[k][0]])
                        #         print('Expected: ', texts[k].strip())
                        #         print('Predicted: ', rets[k])
                    
                    result = bleu.corpus_bleu(predict_texts, [[t.strip()] for t in gold_texts])[0][0]
                    # refs = []
                    # hyps = []
                    # for p, t in zip(predict_texts, gold_texts):
                    #     refs.append([t.split()])
                    #     hyps.append(p.split())
                    # result = nltk.translate.bleu_score.corpus_bleu(refs, hyps, auto_reweigh=True)
                    print('BLEU-4 for Class %s: %.2f'%(m, result*100))
                    result = bleu.corpus_bleu(predict_texts, [[t.strip()] for t in gold_texts], max_n=2)[0][0]
                    print('BLEU-2 for Class %s: %.2f'%(m, result*100))
                    save2text(gold_texts, self.args.checkpoint+'_gold%d-%d.txt'%(i, self.iter))
                    save2text(predict_texts, self.args.checkpoint+'_predicted%d-%d.txt'%(i, self.iter))
                    print('METEOR for Class %s: '%(m), meteor.evaluate(self.args.checkpoint+'_predicted%d-%d.txt'%(i, self.iter), self.args.checkpoint+'_gold%d-%d.txt'%(i, self.iter)))
                    print('Dist-2 for Class %s: '%(m), diversity.corpus_diversity(predict_texts))
        self.trainExamples = []
        self.validExamples = []
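One detail worth noting in the evaluation loop above: the last batch is padded to batch_size with examples recycled from the front of the list, and only the first n_batch outputs are kept. A stripped-down sketch of that padding pattern (toy data; the doubling stands in for self.nnet.predict):

examples = list(range(10))
bsize = 4
outputs = []
for b in range((len(examples) + bsize - 1) // bsize):
    batch = examples[b * bsize:min((b + 1) * bsize, len(examples))]
    n_batch = len(batch)
    if n_batch < bsize:
        batch += examples[:bsize - n_batch]  # pad with early examples
    results = [x * 2 for x in batch]         # stand-in for the model call
    outputs.extend(results[:n_batch])        # drop the padding results
print(len(outputs))  # 10: one output per real example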
Example 10
    def populate_train_dict(self, sess, targets_batch):
        """Prepares the feed dictionary for training.
        Args:
            targets_batch:  Target sentences in ids [batch_size, max_length]
        Returns:
            train_dict:     Feed dictionary for training
        """
        hidden_states = np.zeros(
            (self.config.max_length, len(self.cur_hypos), self.config.d_model),
            dtype=np.float32)
        actions = np.zeros((self.config.max_length, len(self.cur_hypos)),
                           dtype=np.int32)
        targets = np.zeros((self.config.max_length, len(self.cur_hypos)),
                           dtype=np.int32)
        for step in range(self.config.max_length - 1):  # -1 since we already have a READ
            #prev_lengths = [len(x[0].actions) for x in self.cur_hypos]
            hidden_states[step, :, :] = self._get_hidden_states()
            # current best qval is the target for the previous step
            actions[step, :], qvals, optimal_actions = self.predict_one_step(
                sess, hidden_states[step, :, :])
            
            # If the target network exists, decouple the action selection from
            # target values prediction
            if hasattr(self, 'target'):
                targets[step - 1, :] = self.target._get_targets(
                    sess, hidden_states[step, :, :], optimal_actions)
            else:
                targets[step - 1, :] = np.choose(optimal_actions, qvals.T)
            self._update_hidden_states(actions[step, :])

        BLEU_hypos = []
        BLEU_refs = []
        wue_BLEU_hypos = []  # the hypo holder for WUE decoding
        # Generate full hypotheses from partial hypotheses
        for idx, hypo in enumerate(self.cur_hypos):
            self.cur_hypos[idx] = hypo[0].generate_full_hypothesis()
            action_length = len(self.cur_hypos[idx].actions)
            # get hypothesis and reference for BLEU evaluation
            BLEU_hypos.append([str(x) for x in self.cur_hypos[idx].trgt_sentence])
            BLEU_refs.append([[str(x) for x in targets_batch[idx]]])
            if hasattr(self, 'target') and self.config.useBLEUDrop:
                wue_BLEU_hypos.append(
                    [str(x) for x in self.all_wue_trans[self.cur_hypos[idx].lst_id]])
            # targets are only as long as the action sequence; pad the remainder with 0
            targets[action_length - 2:, idx] = 0

        # give the quality rewards (BLEU) at the end
        _, quality = corpus_bleu(BLEU_refs, BLEU_hypos) # BLEU score for the batch
        
        if self.config.useBLEUDrop:
            _, wue_BLEU = corpus_bleu(BLEU_refs, wue_BLEU_hypos)
            quality = quality - wue_BLEU
        batch_average_delay = 0.0
        for idx in range(len(self.cur_hypos)):
            batch_average_delay += self.cur_hypos[idx].get_average_delay()
            targets[len(self.cur_hypos[idx].actions)-2,idx] = \
                quality + self.cur_hypos[idx].get_last_delay_reward(self.config)

        batch_average_delay /= len(self.cur_hypos)
        logging.info("\n     batch average delay: %f\n" % batch_average_delay)
        logging.info("\n     batch (delta) BLEU:  %f\n" % quality)
        if self.config.useBLEUDrop:
            logging.info("\n     batch WUE BLEU:      %f\n" % wue_BLEU)
        train_dict = self.create_feed_dict(
            np.reshape(hidden_states, (-1, self.config.d_model)),
            actions.flatten(), targets.flatten(), self.config.dropout)

        return train_dict
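In the branch without a target network, np.choose(optimal_actions, qvals.T) picks each row's Q-value at its greedy action. A small demonstration with toy Q-values:

import numpy as np

qvals = np.array([[0.1, 0.9],   # batch of 3 hypotheses, 2 actions each
                  [0.7, 0.3],
                  [0.2, 0.8]])
optimal_actions = np.array([1, 0, 1])
# np.choose(a, qvals.T) selects qvals[i, a[i]] for each batch element i.
targets = np.choose(optimal_actions, qvals.T)
print(targets)  # [0.9 0.7 0.8]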