Example #1
def cal_bleu_rouge(keys, values, rank=None, num_print=10000):

    start = rank * num_print if rank is not None else 0
    end = start + num_print
    # values = [val[start:end] for val in values]
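    # note: with the slicing above left commented out, start and end are effectively unused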
    ref_answers, pred_answers = [], []
    for ex_idx in range(len(values[0])):
        for key_idx, key in enumerate(keys):
            value = values[key_idx][ex_idx]
            v = value[0] if isinstance(value, list) else value
            # print(f'{key}: {repr(v)}')
            if key == 'greedy':
                pred_answers.append({'answers': [str(v)]})
            elif key == 'answer':
                ref_answers.append({
                    'answers': [str(v)],
                    'question_id': ex_idx
                })

    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                # use the utils package to normalize strings to space-joined chars
                ref_dict[question_id] = normalize(ref['answers'])
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        # pred_dict holds the predictions, ref_dict holds the ground truth
        # use the utils package to calculate the bleu and rouge metrics
    else:
        bleu_rouge = None

    return bleu_rouge
Example #2
def run_eval(pred_answers_list, ref_answers_list):
    """
    Run eval.
    """
    pred_answers = pred_answers_list
    ref_answers = ref_answers_list

    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:

        assert len(pred_answers) == len(ref_answers), \
            "length distinguish: (pred_len={}) != (ref_len={})".format(len(pred_answers), len(ref_answers))

        pred_ids, ref_ids = [], []
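        # collect the question ids from both sides so we can check that the two files cover the same questions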
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id_pred = pred['question_id']
            question_id_ref = ref['question_id']
            pred_ids.append(question_id_pred)
            ref_ids.append(question_id_ref)
            if len(ref['answers']) > 0:
                pred_dict[question_id_pred] = normalize(pred['answers'])
                ref_dict[question_id_ref] = normalize(ref['answers'])

        assert set(pred_ids) == set(
            ref_ids), "There have different ids in both files."

        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None

    return bleu_rouge
Example #3
    def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False):
        pred_answers, ref_answers = [], []
        total_loss, total_num = 0, 0
        for b_itx, batch in enumerate(eval_batches):

            feed_dict = {self.p: batch['passage_token_ids'],
                         self.q: batch['question_token_ids'],
                         self.p_length: batch['passage_length'],
                         self.q_length: batch['question_length'],
                         self.start_label: batch['start_id'],
                         self.end_label: batch['end_id'],
                         self.dropout: 0.0}

            start_probs, end_probs, loss = self.sess.run([self.start_probs, self.end_probs, self.loss], feed_dict)
            total_loss += loss * len(batch['raw_data'])
            total_num += len(batch['raw_data'])
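            # passages in a batch are padded to a common length; read it off the first passage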
            padded_p_len = len(batch['passage_token_ids'][0])
            for sample, start_prob, end_prob in zip(batch['raw_data'], start_probs, end_probs):
                best_answer = self.find_best_answer(sample, start_prob, end_prob, padded_p_len)
                if save_full_info:
                    sample['pred_answers'] = [best_answer]
                    pred_answers.append(sample)
                else:
                    pred_answers.append({'question_id': sample['question_id'],
                                         'question_type': sample['question_type'],
                                         'answers': [best_answer],
                                         'entity_answers': [[]],
                                         'yesno_answers': []})
                if 'answers' in sample:
                    ref_answers.append({'question_id': sample['question_id'],
                                        'question_type': sample['question_type'],
                                        'answers': sample['answers'],
                                        'entity_answers': [[]],
                                        'yesno_answers': []})

        if result_dir is not None and result_prefix is not None:
            result_file = os.path.join(result_dir, result_prefix + '.json')
            with open(result_file, 'w', encoding='utf-8') as fout:
                for pred_answer in pred_answers:
                    fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')

            self.logger.info('Saving {} results to {}'.format(result_prefix, result_file))

        # this average loss is invalid on the test set, since we don't have the true start_id and end_id
        ave_loss = 1.0 * total_loss / total_num
        # compute the bleu and rouge scores if reference answers are provided
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        return ave_loss, bleu_rouge
Example #4
def get_score(result_dir):
    pred_answers = []
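    # each line of the result file is a JSON object holding the question_id, the predicted 'answers' and the gold 'ref_answers'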
    with open(result_dir) as fin:
        for lidx_a, line_a in enumerate(fin):
            em_answer = json.loads(line_a.strip())
            pred_answers.append(em_answer)

    # compute the bleu and rouge scores if reference answers are provided
    pred_dict, ref_dict = {}, {}
    for answer in pred_answers:
        question_id = answer['question_id']
        if len(answer['ref_answers']) > 0:
            pred_dict[question_id] = normalize(answer['answers'])
            ref_dict[question_id] = normalize(answer['ref_answers'])
    bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    return bleu_rouge
Example #5
def eval(path1, path2):
    # with open(path2, encoding='utf-8') as f:
    #     data = json.load(f)
    # id = list(int(i) for i in data.keys())

    with open(path1, encoding='utf-8') as f:
        pred_answers, ref_answers = [], []
        for lidx, line in enumerate(f):
            sample = json.loads(line.strip())
            pred_answers.append({
                'question_id': sample['question_id'],
                'question': sample['question'],
                'question_type': sample['question_type'],
                'answers': ["No Answer Present."],
                'ref_answers': sample['ref_answers'],
                'entity_answers': [[]],
                'yesno_answers': []
            })
            ref_answers.append({
                'question_id': sample['question_id'],
                'question': sample['question'],
                'question_type': sample['question_type'],
                'answers': sample['ref_answers'],
                'ref_answers': sample['ref_answers'],
                'entity_answers': [[]],
                'yesno_answers': []
            })
    pred_dict, ref_dict = {}, {}
    F1 = 0
    count = 0
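    # every prediction is the fixed string "No Answer Present.", so this effectively scores a no-answer baseline;
    # besides BLEU/ROUGE, a precision/recall F1 is accumulated over the first answer of each pair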
    for pred, ref in zip(pred_answers, ref_answers):
        question_id = ref['question_id']
        if len(ref['answers']) > 0:
            pred_dict[question_id] = normalize(pred['answers'])
            ref_dict[question_id] = normalize(ref['answers'])

            F = local_prf(pred['answers'][0].split(),
                          ref['answers'][0].split())
            F1 += F

            count += 1
    bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    F1_avg = F1 / count
    return bleu_rouge, F1_avg
Example #6
    def evaluate(self,
            infer_file,
            ret=None,
            from_file=False):
        """
        Processes and evaluates the inferred result.

        Args:
            infer_file: A file name to store or read from the inferred results.
            ret: The information returned by the inferring operation, which
                 contains the batch-level input and the batch-level
                 inferring result.
            from_file: If True, the time-consuming inferring process will be
                       skipped, and this method takes the content of infer_file
                       as input for evaluation. If False, this method takes
                       the ret as input for evaluation.

        """
        def _merge_and_normalize(obj_list):
            ret = {}
            for obj in obj_list:
                normalized = {k: normalize(v) for k, v in obj.items()}
                ret.update(normalized)
            return ret

        pred_list = []
        ref_list = []
        objs = []

        if from_file:
            ref_list, pred_list = self._read_list(infer_file)
        else:
            ref_list, pred_list, objs = self._parse_infer_ret(ret)
            with open(infer_file, 'w') as of:
                for o in objs:
                    print >> of, json.dumps(o, ensure_ascii=False).encode('utf8')
        metrics = compute_bleu_rouge(
                _merge_and_normalize(pred_list),
                _merge_and_normalize(ref_list))
        res_str = '{} {}'.format(infer_file,
                ' '.join('{}={}'.format(k, v) for k, v in metrics.items()))
        logger.info(res_str)
Example #7
def run_eval(pred_answers_list, ref_answers_list):
    """
    Run eval.
    """
    pred_answers = pred_answers_list
    ref_answers = ref_answers_list

    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None

    return bleu_rouge
Example #8
    def evaluate(self,
                 eval_batches,
                 result_dir=None,
                 result_prefix=None,
                 save_full_info=False):
        """
        Evaluates the model performance on eval_batches; results are saved if specified
        Args:
            eval_batches: iterable batch data
            result_dir: directory to save predicted answers, answers will not be saved if None
            # prefix of the file
            result_prefix: prefix of the file for saving predicted answers,
                           answers will not be saved if None
            save_full_info: if True, the pred_answers will be added to raw sample and saved
        """
        # predicted answers and reference answers
        pred_answers, ref_answers = [], []
        total_loss, total_num = 0, 0
        line_number = 1  # start reading from the first line
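        # the dev set is read directly from a hard-coded file path in a fixed number of batches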
        for bitx in range(1, 72):
            batch, line_number = self.brc_data.load_batch_size_data_set(
                line_number, '../data/devset/search.dev.json', train=True)
            feed_dict = {
                self.p: batch['passage_token_ids'],
                self.q: batch['question_token_ids'],
                self.p_length: batch['passage_length'],
                self.q_length: batch['question_length'],
                self.start_label: batch['start_id'],
                self.end_label: batch['end_id'],
                self.dropout_keep_prob: 1.0
            }
            start_probs, end_probs, loss = self.sess.run(
                [self.start_probs, self.end_probs, self.loss], feed_dict)
            total_loss += loss * len(batch['raw_data'])
            total_num += len(batch['raw_data'])
            padded_p_len = len(batch['passage_token_ids'][0])
            for sample, start_prob, end_prob in zip(batch['raw_data'],
                                                    start_probs, end_probs):

                best_answer = self.find_best_answer(sample, start_prob,
                                                    end_prob, padded_p_len)
                # if full info should be saved, the predicted answer is stored in the original sample and saved
                if save_full_info:
                    sample['pred_answers'] = [best_answer]
                    pred_answers.append(sample)
                else:
                    pred_answers.append({
                        'question_id':
                        sample['question_id'],
                        'question_type':
                        sample['question_type'],
                        'answers': [best_answer],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })
                if 'answers' in sample:
                    ref_answers.append({
                        'question_id': sample['question_id'],
                        'question_type': sample['question_type'],
                        'answers': sample['answers'],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })
        # save the predicted answers
        if result_dir is not None and result_prefix is not None:
            result_file = os.path.join(result_dir, result_prefix + '.json')
            with open(result_file, 'w') as fout:
                for pred_answer in pred_answers:
                    fout.write(
                        json.dumps(pred_answer, ensure_ascii=False) + '\n')

            self.logger.info('Saving {} results to {}'.format(
                result_prefix, result_file))

        # compute the loss and the bleu and rouge scores
        # this average loss is invalid on the test set, since we don't have the true start_id and end_id
        ave_loss = 1.0 * total_loss / total_num
        # compute the bleu and rouge scores if reference answers are provided
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            # dump the predictions and the reference answers
            pickle.dump(pred_dict, open('./logs/pred_dict.txt', 'wb'))
            pickle.dump(ref_dict, open('./logs/ref_dict.txt', 'wb'))
            result_file = os.path.join('./logs', 'pred_dict' + '.json')
            with open(result_file, 'w') as fout:
                fout.write(json.dumps(pred_dict, ensure_ascii=False) + '\n')
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        return ave_loss, bleu_rouge
Example #9
    def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False):
        """
        Evaluates the model performance on eval_batches; results are saved if specified
        Args:
            eval_batches: iterable batch data
            result_dir: directory to save predicted answers, answers will not be saved if None
            result_prefix: prefix of the file for saving predicted answers,
                           answers will not be saved if None
            save_full_info: if True, the pred_answers will be added to raw sample and saved
        """
        pred_answers, ref_answers = [], []
        total_loss, total_num = 0, 0
        for b_itx, batch in enumerate(eval_batches):
            feed_dict = {self.p: batch['passage_token_ids'],
                         self.q: batch['question_token_ids'],
                         self.p_length: batch['passage_length'],
                         self.q_length: batch['question_length'],
                         self.start_label: batch['start_id'],
                         self.end_label: batch['end_id'],
                         self.dropout_keep_prob: 1.0}
            start_probs, end_probs, loss = self.sess.run([self.start_probs,
                                                          self.end_probs, self.loss], feed_dict)

            total_loss += loss * len(batch['raw_data'])
            total_num += len(batch['raw_data'])

            padded_p_len = len(batch['passage_token_ids'][0])
            for sample, start_prob, end_prob in zip(batch['raw_data'], start_probs, end_probs):

                best_answer = self.find_best_answer(sample, start_prob, end_prob, padded_p_len)
                if save_full_info:
                    sample['pred_answers'] = [best_answer]
                    pred_answers.append(sample)
                else:
                    pred_answers.append({'question_id': sample['question_id'],
                                         'question_type': sample['question_type'],
                                         'answers': [best_answer],
                                         'entity_answers': [[]],
                                         'yesno_answers': []})
                if 'answers' in sample:
                    ref_answers.append({'question_id': sample['question_id'],
                                         'question_type': sample['question_type'],
                                         'answers': sample['answers'],
                                         'entity_answers': [[]],
                                         'yesno_answers': []})

        if result_dir is not None and result_prefix is not None:
            result_file = os.path.join(result_dir, result_prefix + '.json')
            with open(result_file, 'w') as fout:
                for pred_answer in pred_answers:
                    fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')

            self.logger.info('Saving {} results to {}'.format(result_prefix, result_file))

        # this average loss is invalid on the test set, since we don't have the true start_id and end_id
        ave_loss = 1.0 * total_loss / total_num
        # compute the bleu and rouge scores if reference answers are provided
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        return ave_loss, bleu_rouge
Example #10
    def train_analysis(self, step):

        pred_answers, ref_answers = [], []
        fake_answers = []
        ref_answers.append({
            'question_id': self.data['question_id'],
            'question_type': self.data['question_type'],
            'answers': self.data['ref_answers']
        })
        listSelectedSet = []
        all_set = []
        # print '+++++++++++++++++++++++++++++++++++++++++++'
        # print ('question_id', list2string(self.tfg.vocab.recover_from_ids(self.data['question_token_ids'])))
        for p_idx, is_selected in enumerate(
                self.data['passage_is_selected_list'], 0):
            all_set += self.data['passage_token_ids_list'][p_idx]
            if is_selected:
                # print 'is True'
                # print('title ', self.data['passage_title_token_ids_list'][p_idx])
                #print('title', list2string(self.tfg.vocab.recover_from_ids(self.data['passage_title_token_ids_list'][p_idx])))
                listSelectedSet += self.data['passage_token_ids_list'][p_idx]

        pred_answer_str = ''

        str123_list = self.tfg.vocab.recover_from_ids(listSelectedSet)
        all_set_list = self.tfg.vocab.recover_from_ids(all_set)
        for s in str123_list:
            pred_answer_str += s
        selected_recall_score = 1
        selected_f1_score = 0
        all_racall_score = 1
        all_f1_score = 0
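        # recall/F1 of the selected passages and of all passages, measured against the segmented reference answers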
        if len(self.data['segmented_answers']) > 0 and len(str123_list) > 0:
            selected_recall_score = metric_max_over_ground_truths(
                recall, str123_list, self.data['segmented_answers'])
            selected_f1_score = metric_max_over_ground_truths(
                f1_score, str123_list, self.data['segmented_answers'])
        if len(self.data['segmented_answers']) > 0 and len(str123_list) > 0:
            all_racall_score = metric_max_over_ground_truths(
                recall, all_set_list, self.data['segmented_answers'])
            all_f1_score = metric_max_over_ground_truths(
                f1_score, all_set_list, self.data['segmented_answers'])

        # print pred_answer_str
        # print ('ref_answer',self.data['ref_answers'] )
        # print('fake_answers', self.data['fake_answers'])
        # print('pre_answer', [''.join(pred_answer_str)])
        pred_answer = {
            'question_id': self.data['question_id'],
            'question_type': self.data['question_type'],
            'answers': [''.join(pred_answer_str)]
        }
        pred_answers.append(pred_answer)
        fake_answer = {
            'question_id': self.data['question_id'],
            'question_type': self.data['question_type'],
            'answers': self.data['fake_answers']
        }
        fake_answers.append(fake_answer)
        # pre VS ref
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        value_with_mcts = bleu_rouge
        # print 'ref VS pre: '
        # print value_with_mcts

        # pre VS fac
        if len(ref_answers) > 0 and len(fake_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, fake_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        value_with_mcts = bleu_rouge
        # print 'pre VS fac: '
        # print value_with_mcts

        # fac VS ref
        if len(ref_answers) > 0 and len(self.data['fake_answers']) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(fake_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        value_with_mcts = bleu_rouge
        # print 'fac VS ref: '
        # print value_with_mcts
        match_score = [
            selected_recall_score, selected_f1_score, all_racall_score,
            all_f1_score
        ]
        #print ('match_score', match_score)
        return pred_answer, fake_answer, match_score
Example #11
    def evaluate(self,
                 eval_batches,
                 result_dir=None,
                 result_prefix=None,
                 result_name='',
                 save_full_info=False):
        """
        Evaluates the model performance on eval_batches; results are saved if specified
        Args:
            eval_batches: iterable batch data
            result_dir: directory to save predicted answers, answers will not be saved if None
            result_prefix: prefix of the file for saving predicted answers,
                           answers will not be saved if None
            save_full_info: if True, the pred_answers will be added to raw sample and saved
        """
        pred_answers, ref_answers = [], []
        total_loss, total_num = 0, 0
        n_batch_loss = 0.0
        n_batch = 0
        for b_itx, batch in enumerate(eval_batches, 1):
            feed_dict = {
                self.p: batch['passage_token_ids'],
                self.q: batch['question_token_ids'],
                self.p_length: batch['passage_length'],
                self.q_length: batch['question_length'],
                self.start_label: batch['start_id'],
                self.end_label: batch['end_id'],
                self.dropout_keep_prob: 1.0
            }
            if self.debug_print:
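                # in debug mode, fetch intermediate tensors as well and dump each of them via var_print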
                if self.simple_net in [0, 5]:
                    res = self.sess.run([
                        self.loss, self.p_emb, self.q_emb, self.sep_p_encodes,
                        self.sep_q_encodes, self.p, self.q, self.start_probs
                    ], feed_dict)
                    names = 'self.loss, self.p_emb, self.q_emb, self.sep_p_encodes, self.sep_q_encodes, self.p, self.q, self.start_probs'.split(
                        ',')
                if self.simple_net in [1, 2]:
                    res = self.sess.run([
                        self.loss, self.p_length, self.q_length, self.p_emb,
                        self.q_emb, self.sep_p_encodes, self.sep_q_encodes,
                        self.p, self.q, self.match_p_encodes,
                        self.fuse_p_encodes, self.gm1, self.gm2,
                        self.start_probs, self.sim_matrix,
                        self.context2question_attn, self.b,
                        self.question2context_attn
                    ], feed_dict)
                    names = 'self.loss, self.p_length, self.q_length, self.p_emb, self.q_emb, self.sep_p_encodes, self.sep_q_encodes, self.p, self.q, self.match_p_encodes, self.fuse_p_encodes, \
                         self.gm1, self.gm2, self.start_probs, self.sim_matrix, self.context2question_attn, self.b, self.question2context_attn'.split(
                        ',')
                if self.simple_net in [3, 4]:
                    res = self.sess.run([
                        self.loss, self.start_probs, self.end_probs,
                        self.loss2, self.start_label, self.end_label,
                        self.p_length, self.q_length, self.p_emb, self.q_emb,
                        self.sep_p_encodes, self.sep_q_encodes, self.p, self.q,
                        self.match_p_encodes, self.fuse_p_encodes,
                        self.sim_matrix, self.context2question_attn, self.b,
                        self.question2context_attn, self.pn_init_state,
                        self.pn_f0, self.pn_f1, self.pn_b0, self.pn_b1
                    ], feed_dict)
                    names = 'self.loss, self.start_probs, self.end_probs, self.loss2, self.start_label, self.end_label, self.p_length, self.q_length, self.p_emb, self.q_emb, self.sep_p_encodes, self.sep_q_encodes, self.p, self.q, self.match_p_encodes, self.fuse_p_encodes, \
                         self.sim_matrix, self.context2question_attn, self.b, self.question2context_attn, self.pn_init_state, self.pn_f0, self.pn_f1, self.pn_b0, self.pn_b1'.split(
                        ',')

                loss, start_probs, end_probs = res[0:3]
                for i in range(1, len(res)):
                    p_name = names[i]
                    p_array = res[i]
                    self.var_print('var', p_array, p_name, p_name)
            else:
                start_probs, end_probs, loss = self.sess.run(
                    [self.start_probs, self.end_probs, self.loss], feed_dict)
            total_loss += loss * len(batch['raw_data'])
            total_num += len(batch['raw_data'])
            n_batch_loss = loss * len(batch['raw_data'])
            n_batch += len(batch['raw_data'])
            if self.log_interval > 0 and b_itx % self.log_interval == 0:
                #self.print_num_of_total_parameters(True, True)
                self.logger.info(
                    'Average dev loss from batch {} to {} is {}'.format(
                        b_itx - self.log_interval + 1, b_itx,
                        "%.10f" % (n_batch_loss / n_batch)))
                n_batch_loss = 0.0
                n_batch = 0

            padded_p_len = len(batch['passage_token_ids'][0])
            for sample, start_prob, end_prob in zip(batch['raw_data'],
                                                    start_probs, end_probs):

                best_answer, best_span = self.find_best_answer(
                    sample, start_prob, end_prob, padded_p_len)
                if save_full_info:
                    sample['pred_answers'] = [best_answer]
                    pred_answers.append(sample)
                else:
                    pred = {
                        'question_id': sample['question_id'],
                        'question_type': sample['question_type'],
                        'answers': [best_answer],
                        'entity_answers': [[]],
                        'yesno_answers': [best_span]
                    }
                    pred_answers.append(pred)
                    if self.debug_print:
                        self.logger.info('pred=' +
                                         json.dumps(pred, ensure_ascii=False))
                if 'answers' in sample:
                    ref = {
                        'question_id': sample['question_id'],
                        'question_type': sample['question_type'],
                        'answers': sample['answers'],
                        'entity_answers': [[]],
                        'yesno_answers': [best_span]
                    }
                    ref_answers.append(ref)
                    if self.debug_print:
                        self.logger.info('ref=' +
                                         json.dumps(ref, ensure_ascii=False))

        if result_dir is not None and result_prefix is not None:
            result_file = os.path.join(result_dir,
                                       result_prefix + result_name + '.json')
            with open(result_file, 'w') as fout:
                for pred_answer in pred_answers:
                    fout.write(
                        json.dumps(pred_answer, ensure_ascii=False) + '\n')

            self.logger.info('Saving {} results to {}'.format(
                result_prefix, result_file))
            #exit()

        # this average loss is invalid on the test set, since we don't have the true start_id and end_id
        ave_loss = 1.0 * total_loss / total_num
        # compute the bleu and rouge scores if reference answers are provided
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        return ave_loss, bleu_rouge
Example #12
    def evaluate(self,
                 eval_batches,
                 result_dir=None,
                 result_prefix=None,
                 save_full_info=False):
        """
        Evaluates the model performance on eval_batches; results are saved
        if specified
        Args:
            eval_batches: iterable batch data
            result_dir: directory to save predicted answers,
            answers will not be saved if None
            result_prefix: prefix of the file for saving predicted answers,
                           answers will not be saved if None
            save_full_info: if True, the pred_answers will be added to
            raw sample and saved
        """
        pred_answers, ref_answers = [], []
        total_loss, total_num = 0, 0
        for b_itx, batch in enumerate(eval_batches):
            feed_dict = {
                self.p: batch['passage_token_ids'],
                self.q: batch['question_token_ids'],
                self.p_length: batch['passage_length'],
                self.q_length: batch['question_length'],
                self.start_label: batch['start_id'],
                self.end_label: batch['end_id'],
                self.dropout_keep_prob: 1.0
            }
            # evaluation is always run without dropout
            start_probs, end_probs, loss = self.sess.run(
                [self.start_probs, self.end_probs, self.loss], feed_dict)
            # self.logger.debug(start_probs)

            total_loss += loss * len(batch['raw_data'])
            total_num += len(batch['raw_data'])

            padded_p_len = len(batch['passage_token_ids'][0])
            # the length of the longest (padded) sample in self.p? batch['passage_token_ids'] should be a
            # multi-dimensional np.ndarray, but it may be handled as a list of lists, so this takes the first row,
            # i.e. the first sample

            for sample, start_prob, end_prob in zip(batch['raw_data'],
                                                    start_probs, end_probs):

                best_answer = self.find_best_answer(sample, start_prob,
                                                    end_prob, padded_p_len)
                # this is how start_prob and end_prob are used for inference during evaluation and testing
                if save_full_info:
                    sample['pred_answers'] = [best_answer]
                    pred_answers.append(sample)
                else:
                    pred_answers.append({
                        'question_id':
                        sample['question_id'],
                        'question_type':
                        sample['question_type'],
                        'question':
                        sample['question'],
                        'answers': [best_answer],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })
                if 'answers' in sample:
                    ref_answers.append({
                        'question_id': sample['question_id'],
                        'question_type': sample['question_type'],
                        'answers': sample['answers'],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })

        if result_dir is not None and result_prefix is not None:
            result_file = os.path.join(result_dir, result_prefix + '.json')
            with open(result_file, 'w', encoding='utf8') as fout:
                for pred_answer in pred_answers:
                    fout.write(
                        json.dumps(pred_answer, ensure_ascii=False) + '\n')

            self.logger.info('Saving {} results to {}'.format(
                result_prefix, result_file))

        # this average loss is invalid on the test set,
        # since we don't have the true start_id and end_id
        ave_loss = 1.0 * total_loss / total_num

        # compute the bleu and rouge scores if reference answers are provided
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    # use the utils package to normalize strings to space-joined chars
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
            # pred_dict holds the predictions, ref_dict holds the ground truth
            # use the utils package to calculate the bleu and rouge metrics
        else:
            bleu_rouge = None
        return ave_loss, bleu_rouge
Example #13
    def evaluate(self, eval_batches, result_dir=None, result_prefix=None):
        """
        Evaluates the model performance on the validation set; results are saved if specified
        """
        pred_answers, ref_answers = [], []
        total_loss, total_num = 0, 0
        count = 0
        for b_itx, batch in enumerate(eval_batches):
            count += 1
            if count % 100 == 0:
                self.logger.info(count)
            if batch['passage_length'][0] <= 0:
                continue
            feed_dict = {
                self.p: batch['passage_token_ids'],
                self.q: batch['question_token_ids'],
                self.p_length: batch['passage_length'],
                self.q_length: batch['question_length'],
                self.start_label: batch['start_id'],
                self.end_label: batch['end_id'],
                self.dropout_keep_prob: 1.0
            }
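            # also fetch attention/fusion diagnostics so they can be stored alongside each predicted answer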

            start_probs, end_probs, loss, fuse_value, context2question_attn, question2context_attn, match_value = self.sess.run(
                [
                    self.start_probs, self.end_probs, self.loss,
                    self.fuse_value, self.context2question_attn,
                    self.question2context_attn, self.match_value
                ], feed_dict)

            total_loss += loss * len(batch['raw_data'])
            total_num += len(batch['raw_data'])

            padded_p_len = len(batch['passage_token_ids'][0])
            for sample, start_prob, end_prob in zip(batch['raw_data'],
                                                    start_probs, end_probs):
                best_answer, seg_answers, best_score, q2c_match_score, c2q_match_score = self.find_best_answer(
                    sample, start_prob, end_prob, padded_p_len,
                    context2question_attn, question2context_attn)
                if 'answers' in sample:
                    ref_answers.append({
                        'question_id': sample['question_id'],
                        'question_type': sample['question_type'],
                        'answers': sample['answers'],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })
                    pred_answers.append({
                        'question_id':
                        sample['question_id'],
                        'question_type':
                        sample['question_type'],
                        'question_tokens':
                        sample['question'],
                        'ref_answers':
                        sample['answers'],
                        'best_score':
                        str(best_score),
                        'answers': [best_answer],
                        'entity_answers': [[]],
                        'yesno_answers': [],
                        'match_value':
                        str(match_value),
                        'fuse_value':
                        str(fuse_value),
                        'q2c_match_score':
                        str(q2c_match_score),
                        'c2q_match_score':
                        str(c2q_match_score),
                        'seg_answers': [seg_answers]
                    })
                else:
                    pred_answers.append({
                        'question_id':
                        sample['question_id'],
                        'question_type':
                        sample['question_type'],
                        'question_tokens':
                        sample['question'],
                        'answers': [best_answer],
                        'best_score':
                        str(best_score),
                        'match_value':
                        str(match_value),
                        'entity_answers': [[]],
                        'yesno_answers': [],
                        'fuse_value':
                        str(fuse_value),
                        'q2c_match_score':
                        str(q2c_match_score),
                        'c2q_match_score':
                        str(c2q_match_score),
                        'seg_answers': [seg_answers]
                    })

        if result_dir is not None and result_prefix is not None:
            result_file = os.path.join(result_dir, result_prefix + '.json')
            with open(result_file, 'w') as fout:
                for pred_answer in pred_answers:
                    fout.write(
                        json.dumps(
                            pred_answer, ensure_ascii=False) +
                        '\n')

            self.logger.info('Saving {} results to {}'.format(result_prefix, result_file))

        # this average loss is invalid on the test set, since the test set has no annotated answers
        ave_loss = 1.0 * total_loss / total_num
        # compute the bleu and rouge scores if reference answers are provided
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        return ave_loss, bleu_rouge
Example #14
    def evaluate(self,
                 eval_batches,
                 result_dir=None,
                 result_prefix=None,
                 save_full_info=False):
        """
        Evaluates the model performance on eval_batches; results are saved if specified
        Args:
            eval_batches: iterable batch data
            result_dir: directory to save predicted answers, answers will not be saved if None
            result_prefix: prefix of the file for saving predicted answers,
                           answers will not be saved if None
            save_full_info: if True, the pred_answers will be added to raw sample and saved
        """
        pred_answers, ref_answers = [], []
        total_loss, total_num = 0, 0
        for b_itx, batch in enumerate(eval_batches):
            #p_allennlpd = [get_str_num(ids) for ids in batch['passage_token_ids']]
            #q_allennlpd = [get_str_num(ids) for ids in batch['question_token_ids']]
            #p_allennlpd = get_allennlp_vec(p_allennlpd)
            #q_allennlpd = get_allennlp_vec(q_allennlpd)  # shape[batch,sequence_len , 3 * 1024 = 3072]
            # print(p_allennlpd.shape) # [32 * 5 = 160 , 500 , 3 * 1024 = 3072]
            # print(q_allennlpd.shape) # [32 * 5 , 500 , 3 * 1024 = 3072]

            feed_dict = {
                self.p:
                batch['passage_token_ids'],
                self.q:
                batch['question_token_ids'],
                #self.p_allennlp: p_allennlpd,
                #self.q_allennlp: q_allennlpd,
                self.p_length:
                batch['passage_length'],
                self.q_length:
                batch['question_length'],
                self.start_label:
                batch['start_id'],
                self.end_label:
                batch['end_id'],
                self.real_pass:
                batch['real_pass'],
                self.sequence_label:
                batch['sequence_label'],
                self.dropout_keep_prob:
                1.0
            }
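            # this variant additionally predicts yes/no probabilities and passage-selection probabilities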
            start_probs, end_probs, yes_probs, pred_pass_probs, loss = self.sess.run(
                [
                    self.start_probs, self.end_probs, self.yes_probs,
                    self.pred_pass_probs, self.loss
                ], feed_dict)

            total_loss += loss * len(batch['raw_data'])
            total_num += len(batch['raw_data'])

            padded_p_len = len(batch['passage_token_ids'][0])
            for sample, start_prob, end_prob, yes_prob, pred_pass_prob in zip(
                    batch['raw_data'], start_probs, end_probs, yes_probs,
                    pred_pass_probs):

                best_answer = self.find_best_answer(sample, start_prob,
                                                    end_prob, yes_prob,
                                                    pred_pass_prob,
                                                    padded_p_len)
                if save_full_info:
                    sample['pred_answers'] = [best_answer]
                    pred_answers.append(sample)
                else:
                    pred_answers.append({
                        'question_id':
                        sample['question_id'],
                        'question_type':
                        sample['question_type'],
                        'answers': [best_answer],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })
                if 'answers' in sample:
                    ref_answers.append({
                        'question_id': sample['question_id'],
                        'question_type': sample['question_type'],
                        'answers': sample['answers'],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })

        if result_dir is not None and result_prefix is not None:
            result_file = os.path.join(result_dir, result_prefix + '.json')
            with open(result_file, 'w') as fout:
                for pred_answer in pred_answers:
                    fout.write(
                        json.dumps(pred_answer, ensure_ascii=False) + '\n')

            self.logger.info('Saving {} results to {}'.format(
                result_prefix, result_file))

        # this average loss is invalid on the test set, since we don't have the true start_id and end_id
        ave_loss = 1.0 * total_loss / total_num
        # compute the bleu and rouge scores if reference answers are provided
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        return ave_loss, bleu_rouge
Example #15
    def search(self, start_node_id):
        #print '----tree search'
        tmp_node = self.tree.get_node(start_node_id)
        #print tmp_node.data.num
        has_visit_num = tmp_node.data.num - 1
        self.count = has_visit_num

        if int(self.max_search_time - has_visit_num) > 0:
            start_node_search_time = int(self.max_search_time - has_visit_num)
        else:
            start_node_search_time = 0

        for tm in range(start_node_search_time):
            if tm % 10 == 0:
                batch_start_time = time.time()
                #print ('search time',tm)
            search_list = [start_node_id]
            tmp_node = self.tree.get_node(start_node_id)
            #print 'search time :'+ str(time)

            while not tmp_node.is_leaf():
                max_score = float("-inf")
                max_id = -1
                for child_id in tmp_node.fpointer:
                    child_node = self.tree.get_node(child_id)
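                    # PUCT-style selection: prior p scaled by parent/child visit counts, plus the child's mean value Q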
                    score = self.beta * child_node.data.p * (
                        (tmp_node.data.num)**0.5 / (1 + child_node.data.num))

                    #print 'child_node.data.Q: '
                    #print child_node.data.Q
                    score += child_node.data.Q

                    #print 'score: '
                    #print score

                    #print '**************'

                    if score > max_score:
                        max_id = child_id
                        max_score = score
                search_list.append(max_id)
                tmp_node = self.tree.get_node(max_id)

            if tmp_node.data.value is not None:
                v = tmp_node.data.value
            else:
                if tmp_node.data.sen[-1] == str(self.l_passages - 1):
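                    # terminal node: the selected sentence sequence ends at the last passage sentence, so score it with weighted BLEU/ROUGE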
                    pred_answer = tmp_node.data.sen
                    # print 'search to end  pred_answer: '
                    # print pred_answer
                    # print 'listSelectedSet'
                    listSelectedSet_sens = []
                    listSelectedSet = map(eval, pred_answer)
                    # print listSelectedSet
                    for idx in listSelectedSet:
                        listSelectedSet_sens.append(self.p_sen_list[idx])
                    # print 'pred_answer '
                    pred_answer_str = ''
                    for sen in listSelectedSet_sens:
                        str123_list = self.carpe_diem.vocab.recover_from_ids(
                            sen, 0)
                        for s in str123_list:
                            pred_answer_str += s
                    # print 'pred_answer_str: '
                    # print pred_answer_str
                    # print 'ref_answer_str: '
                    # print list2string(self.ref_answer[0]['answers'])
                    pred_answers = []

                    pred_answers.append({
                        'question_id': [self.q_id],
                        'question_type': [],
                        'answers': [''.join(pred_answer_str)],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })
                    if len(self.ref_answer) > 0:
                        pred_dict, ref_dict = {}, {}
                        for pred, ref in zip(pred_answers, self.ref_answer):
                            question_id = ref['question_id']
                            if len(ref['answers']) > 0:
                                pred_dict[question_id] = normalize(
                                    pred['answers'])
                                ref_dict[question_id] = normalize(
                                    ref['answers'])
                                # print '========compare in tree======='
                                # print pred_dict[question_id]
                                # print '----------------------'
                                # print ref_dict[question_id]
                        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
                    else:
                        bleu_rouge = None
                    # print 'last words ++++++++++++++ '
                    # print bleu_rouge
                    v = input_v = bleu_rouge['Rouge-L'] * self.m_value['Rouge-L'] \
                                  + bleu_rouge['Bleu-4'] * self.m_value['Bleu-4'] \
                                  + bleu_rouge['Bleu-1'] * self.m_value['Bleu-1'] \
                                  + bleu_rouge['Bleu-3'] * self.m_value['Bleu-3'] \
                                  + bleu_rouge['Bleu-2'] * self.m_value['Bleu-2']
                else:
                    v = self.carpe_diem.value_function(tmp_node.data.sen)[0][0]
                tmp_node.data.value = v

            # if tmp_node.data.sen[-1] == str(self.l_passages - 1):
            #     pred_answer = tmp_node.data.sen
            #     listSelectedSet_sens = []
            #     listSelectedSet = map(eval, pred_answer)
            #     # print listSelectedSet
            #     for idx in listSelectedSet:
            #         listSelectedSet_sens.append(self.p_sen_list[idx])
            #         # print 'pred_answer '
            #     pred_answer_str = ''
            #     for sen in listSelectedSet_sens:
            #         str123_list = self.carpe_diem.vocab.recover_from_ids(sen, 0)
            #         for s in str123_list:
            #             pred_answer_str += s
            #
            #     pred_answers = []
            #
            #     pred_answers.append({'question_id': [self.q_id],
            #                                  'question_type': [],
            #                                  'answers': [''.join(pred_answer_str)],
            #                                  'entity_answers': [[]],
            #                                  'yesno_answers': []})
            #     if len(self.ref_answer) > 0:
            #         pred_dict, ref_dict = {}, {}
            #         for pred, ref in zip(pred_answers, self.ref_answer):
            #             question_id = ref['question_id']
            #             if len(ref['answers']) > 0:
            #                     pred_dict[question_id] = normalize(pred['answers'])
            #                     ref_dict[question_id] = normalize(ref['answers'])
            #         bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
            #     else:
            #         bleu_rouge = None
            #     v = bleu_rouge['Rouge-L'] * self.m_value['Rouge-L'] \
            #                           + bleu_rouge['Bleu-4'] * self.m_value['Bleu-4'] \
            #                           + bleu_rouge['Bleu-1'] * self.m_value['Bleu-1'] \
            #                           + bleu_rouge['Bleu-3'] * self.m_value['Bleu-3'] \
            #                           + bleu_rouge['Bleu-2'] * self.m_value['Bleu-2']
            # else:
            #     v = self.carpe_diem.value_function(tmp_node.data.sen)[0][0]

            self.update(search_list, v)
            self.count += 1

            if tmp_node.is_leaf() and (
                    self.tree.depth(tmp_node) < self.max_depth
            ) and tmp_node.data.sen[-1] != str(self.l_passages - 1):
                self.expand(tmp_node)

            # if tm %10 == 0:
            #     print ('==================== search 10  time = %3.2f s ====================' % (time.time() - batch_start_time))
            ###########
            '''
Example #16
#!/usr/bin/python3
Example #17
    def _analysis(self, step, train_batches, dropout_keep_prob):
        """
        Trains the model for a single epoch.
        Args:
            train_batches: iterable batch data for training
            dropout_keep_prob: float value indicating dropout keep probability
        """
        total_loss = 0
        num_loss = 0
        total_recall = [0.0, 0.0, 0.0, 0.0]
        num_recall = 0
        batch_start_time = 0
        batch_start_time = time.time()
        pred_answers, ref_answers = [], []
        fake_answers = []
        for fbitx, batch in enumerate(train_batches, 1):
            step += 1
            if fbitx % 1000 == 0:
                print '------ Batch Question: ' + str(fbitx)

            trees = []
            batch_tree_set = []
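            # build one search tree per question in the batch and run the tree-search analysis on each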

            batch_size = len(batch['question_ids'])
            #print ('batch_size)', batch_size)
            for bitx in range(batch_size):
                tree = {'question_id': batch['question_ids'][bitx],
                        'question_token_ids': batch['question_token_ids'][bitx],
                        'q_length': batch['question_length'][bitx],

                        'passage_token_ids_list': batch['passage_token_ids_list'][bitx],

                        'passage_title_token_ids_list': batch['passage_title_token_ids_list'][bitx],
                        'passage_title_length_list': batch['passage_title_length_list'][bitx],

                        'passage_sentence_token_ids_list': batch['passage_sentence_token_ids_list'][bitx],
                        'passage_sen_length': batch['passage_sen_length_list'][bitx],


                        #'p_length': batch['passage_length'][bitx],
                        'passage_is_selected_list': batch['passage_is_selected_list'][bitx],

                        'question_type': batch['question_types'][bitx],

                        'ref_answers': batch['ref_answers'][bitx],
                        'fake_answers': batch['fake_answers'][bitx],
                        'segmented_answers': batch['segmented_answers'][bitx]

                        }
                ref_answers.append({'question_id': tree['question_id'],
                                    'question_type': tree['question_type'],
                                    'answers': tree['ref_answers']})
                trees.append(tree)
                #print batch
                batch_tree = SearchTree(self.tfg, tree, self.max_a_len, self.search_time, self.beta, self.m_value, dropout_keep_prob)
                batch_tree_set.append(batch_tree)


            # for every data in batch do training process
            for idx, batch_tree in enumerate(batch_tree_set,1):
                pred_answer, fake_answer, recall = batch_tree.train_analysis(step)
                pred_answers.append(pred_answer)
                fake_answers.append(fake_answer)
                total_recall[0] += recall[0]
                total_recall[1] += recall[1]
                total_recall[2] += recall[2]
                total_recall[3] += recall[3]
                num_recall += 1
        print('ave select recall', total_recall[0] / num_recall)
        print('ave select f1', total_recall[1] / num_recall)
        print('ave all recall', total_recall[2] / num_recall)
        print('ave all f1', total_recall[3] / num_recall)
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    ref_dict[question_id] = normalize(ref['answers'])
                    pred_dict[question_id] = normalize(pred['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        value_with_mcts = bleu_rouge
        print('pred_score', value_with_mcts)


        #return 1.0 * total_loss / num_loss, step
        return 0, step
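All of the examples in this collection funnel predictions and references into normalize and compute_bleu_rouge from a DuReader-style utils package. The sketch below shows the expected shape of pred_dict and ref_dict (keyed by question_id, values are lists of normalized answer strings); the normalize and exact-match helpers here are simplified stand-ins for illustration only, not the real utils implementations.

# Hedged sketch: how pred_dict / ref_dict are expected to look before the
# metric call. The helpers below are simplified stand-ins, not the actual
# DuReader utils functions.

def normalize(answers):
    # Lower-case and strip each candidate answer string.
    return [a.strip().lower() for a in answers]

def compute_exact_match(pred_dict, ref_dict):
    # Toy metric standing in for compute_bleu_rouge: fraction of questions
    # whose first predicted answer exactly matches any reference answer.
    hit = 0
    for qid, preds in pred_dict.items():
        refs = ref_dict.get(qid, [])
        if preds and preds[0] in refs:
            hit += 1
    return {'ExactMatch': hit / max(len(pred_dict), 1)}

if __name__ == '__main__':
    pred_answers = [{'question_id': 1, 'answers': ['Beijing']}]
    ref_answers = [{'question_id': 1, 'answers': ['Beijing', 'beijing city']}]

    pred_dict, ref_dict = {}, {}
    for pred, ref in zip(pred_answers, ref_answers):
        qid = ref['question_id']
        if ref['answers']:
            pred_dict[qid] = normalize(pred['answers'])
            ref_dict[qid] = normalize(ref['answers'])

    print(compute_exact_match(pred_dict, ref_dict))  # {'ExactMatch': 1.0}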
Exemple #18
0
def validation(inference_program, avg_cost, s_probs, e_probs, match,
               feed_order, place, dev_count, vocab, brc_data, logger, args):
    """
    do inference with given inference_program
    """
    parallel_executor = fluid.ParallelExecutor(main_program=inference_program,
                                               use_cuda=bool(args.use_gpu),
                                               loss_name=avg_cost.name)
    print_para(inference_program, parallel_executor, logger, args)

    # Use test set as validation each pass
    total_loss = 0.0
    count = 0
    n_batch_cnt = 0
    n_batch_loss = 0.0
    pred_answers, ref_answers = [], []
    val_feed_list = [
        inference_program.global_block().var(var_name)
        for var_name in feed_order
    ]
    val_feeder = fluid.DataFeeder(val_feed_list, place)
    pad_id = vocab.get_id(vocab.pad_token)
    dev_reader = lambda: brc_data.gen_mini_batches(
        'dev', args.batch_size, pad_id, shuffle=False)
    dev_reader = read_multiple(dev_reader, dev_count)

    for batch_id, batch_list in enumerate(dev_reader(), 1):
        feed_data = batch_reader(batch_list, args)
        val_fetch_outs = parallel_executor.run(
            feed=list(val_feeder.feed_parallel(feed_data, dev_count)),
            fetch_list=[avg_cost.name, s_probs.name, e_probs.name, match.name],
            return_numpy=False)
        total_loss += np.array(val_fetch_outs[0]).sum()
        start_probs_m = LodTensor_Array(val_fetch_outs[1])
        end_probs_m = LodTensor_Array(val_fetch_outs[2])
        match_lod = val_fetch_outs[3].lod()
        count += len(np.array(val_fetch_outs[0]))

        n_batch_cnt += len(np.array(val_fetch_outs[0]))
        n_batch_loss += np.array(val_fetch_outs[0]).sum()
        log_every_n_batch = args.log_interval
        if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
            logger.info('Average dev loss from batch {} to {} is {}'.format(
                batch_id - log_every_n_batch + 1, batch_id,
                "%.10f" % (n_batch_loss / n_batch_cnt)))
            n_batch_loss = 0.0
            n_batch_cnt = 0
        batch_offset = 0
        for idx, batch in enumerate(batch_list):
            #one batch
            batch_size = len(batch['raw_data'])
            batch_range = match_lod[0][batch_offset:batch_offset + batch_size +
                                       1]
            batch_lod = [[batch_range[x], batch_range[x + 1]]
                         for x in range(len(batch_range[:-1]))]
            start_prob_batch = start_probs_m[batch_offset:batch_offset +
                                             batch_size + 1]
            end_prob_batch = end_probs_m[batch_offset:batch_offset +
                                         batch_size + 1]
            for sample, start_prob_inst, end_prob_inst, inst_range in zip(
                    batch['raw_data'], start_prob_batch, end_prob_batch,
                    batch_lod):
                #one instance
                inst_lod = match_lod[1][inst_range[0]:inst_range[1] + 1]
                best_answer, best_span = find_best_answer_for_inst(
                    sample, start_prob_inst, end_prob_inst, inst_lod)
                pred = {
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'answers': [best_answer],
                    'entity_answers': [[]],
                    'yesno_answers': []
                }
                pred_answers.append(pred)
                if 'answers' in sample:
                    ref = {
                        'question_id': sample['question_id'],
                        'question_type': sample['question_type'],
                        'answers': sample['answers'],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    }
                    ref_answers.append(ref)
            batch_offset = batch_offset + batch_size

    result_dir = args.result_dir
    result_prefix = args.result_name
    if result_dir is not None and result_prefix is not None:
        if not os.path.exists(args.result_dir):
            os.makedirs(args.result_dir)
        result_file = os.path.join(result_dir, result_prefix + '.json')
        with open(result_file, 'w') as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
        logger.info('Saving {} results to {}'.format(result_prefix,
                                                     result_file))

    ave_loss = 1.0 * total_loss / count
    # compute the bleu and rouge scores if reference answers is provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return ave_loss, bleu_rouge
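The validation loop above relies on find_best_answer_for_inst to turn per-token start/end probabilities into an answer span. A common way to do this is to maximize start_prob[s] * end_prob[e] over spans with s <= e and a bounded length; the hedged sketch below illustrates that idea only and is not the helper actually used above.

import numpy as np

def best_span_from_probs(start_probs, end_probs, max_span_len=50):
    # Hedged sketch: pick the (start, end) pair maximizing
    # start_probs[start] * end_probs[end], subject to start <= end and
    # end - start < max_span_len. Not the actual find_best_answer_for_inst.
    best_score, best_span = -1.0, (0, 0)
    for s, sp in enumerate(start_probs):
        # Only consider end positions within the allowed window after s.
        window = end_probs[s:s + max_span_len]
        if len(window) == 0:
            break
        e_rel = int(np.argmax(window))
        score = sp * window[e_rel]
        if score > best_score:
            best_score, best_span = score, (s, s + e_rel)
    return best_span, best_score

if __name__ == '__main__':
    start = np.array([0.1, 0.7, 0.1, 0.1])
    end = np.array([0.05, 0.1, 0.8, 0.05])
    span, score = best_span_from_probs(start, end)
    print(span, round(float(score), 3))  # (1, 2) 0.56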
Exemple #19
0
def do_eval(model, batcher, settings, result_dir=None, result_prefix=None, save_full_info=False):
    """
    Evaluates the model on batches from batcher, optionally saves the
    predicted answers, and computes BLEU/ROUGE when references are available.
    """
    pred_answers, ref_answers = [], []
    total_loss, total_num = 0, 0
    
    count = 0
    while True:
        #
        batch = batcher.get_next_batch()  
        if batch is None: break
        #
        results = model.run_eval_one_batch(batch)
        count += 1
        print(count)
        #
        loss = results["loss_optim"]
        idx_passage = results["idx_passage"]
        idx_start = results["idx_start"]
        idx_end = results["idx_end"]
        # pred_prob = results["pred_prob"]
        #
        batch_size = len(idx_passage)
        total_loss += loss * batch_size
        total_num += batch_size
        #
        for sidx in range(batch_size):
            #            
            sample = batch['data_raw'][sidx]
            idx_p_curr = idx_passage[sidx]
            idx_s_curr = idx_start[sidx]
            idx_e_curr = idx_end[sidx]
            # prob_curr = pred_prob[sidx]
            #
            pred_a = ''.join(sample['passages'][idx_p_curr]['passage_tokens'][idx_s_curr: idx_e_curr + 1])
            #
            if save_full_info:
                sample['pred_answers'] = [pred_a]
                pred_answers.append(sample)
            else:
                pred_answers.append({'question_id': sample['question_id'],
                                     'question_type': sample['question_type'],
                                     'answers': [ pred_a ],
                                     'entity_answers': [[]],
                                     'yesno_answers': []})
            if 'answers' in sample:
                ref_answers.append({'question_id': sample['question_id'],
                                    'question_type': sample['question_type'],
                                    'answers': sample['answers'],
                                    'entity_answers': [[]],
                                    'yesno_answers': []})
    #
    # saving
    if result_dir is not None and result_prefix is not None:
        result_file = os.path.join(result_dir, result_prefix + '.json')
        with open(result_file, 'w', encoding="utf-8") as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
        #
        model.logger.info('saving {} results to {}'.format(result_prefix, result_file))
        #
    
    # 
    # metric
    # this average loss is invalid on test set, since we don't have true start_id and end_id
    ave_loss = 1.0 * total_loss / total_num
    #
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        #
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    #
    print("ave_loss: %g" % ave_loss)
    print("bleu_rouge:")
    print(bleu_rouge)
    #
    model.logger.info('ave_loss: {}'.format(ave_loss))
    model.logger.info('bleu_rouge: {}'.format(bleu_rouge))
    #
    return ave_loss, bleu_rouge
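do_eval assumes a batcher whose get_next_batch() returns dict batches and None once the data is exhausted. The sketch below is a minimal illustration of that protocol, assuming only the 'data_raw' field seen in the example; it is not the project's real batcher class.

class ListBatcher:
    """Hedged sketch of the batcher protocol assumed by do_eval:
    get_next_batch() yields dict batches and returns None when exhausted.
    The 'data_raw' field mirrors the example above, but the class itself
    is illustrative only."""

    def __init__(self, samples, batch_size):
        self._samples = samples
        self._batch_size = batch_size
        self._pos = 0

    def get_next_batch(self):
        if self._pos >= len(self._samples):
            return None  # signals the end of the data
        chunk = self._samples[self._pos:self._pos + self._batch_size]
        self._pos += self._batch_size
        return {'data_raw': chunk}


if __name__ == '__main__':
    samples = [{'question_id': i} for i in range(5)]
    batcher = ListBatcher(samples, batch_size=2)
    while True:
        batch = batcher.get_next_batch()
        if batch is None:
            break
        print([s['question_id'] for s in batch['data_raw']])
    # [0, 1] / [2, 3] / [4]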
Exemple #20
0
    def evaluate(self,
                 eval_batches,
                 result_dir=None,
                 result_prefix=None,
                 save_full_info=False):
        """
        Evaluates the model performance on eval_batches and results are saved if specified
        Args:
            eval_batches: iterable batch data
            result_dir: directory to save predicted answers, answers will not be saved if None
            result_prefix: prefix of the file for saving predicted answers,
                           answers will not be saved if None
            save_full_info: if True, the pred_answers will be added to raw sample and saved
        """
        pred_answers, ref_answers = [], []
        total_loss, total_num = 0, 0
        for b_itx, batch in enumerate(eval_batches):
            passage_len = len(batch['passage_token_ids'][0])
            label_batch = len(batch['start_id'])
            all_passage = len(batch['passage_token_ids'])
            concat_passage_len = all_passage // label_batch * passage_len
            feed_dict = {
                self.p:
                batch['passage_token_ids'],
                self.q:
                batch['question_token_ids'],
                self.p_length:
                batch['passage_length'],
                self.q_length:
                batch['question_length'],
                self.start_label:
                batch['start_id'],
                self.end_label:
                batch['end_id'],
                self.start_label_probs:
                self._get_label_probs(batch['start_id'], concat_passage_len),
                self.end_label_probs:
                self._get_label_probs(batch['end_id'], concat_passage_len),
                self.dropout_keep_prob:
                1.0
            }
            if hasattr(self, 'char_vocab'):
                char_input = {
                    self.p_char: batch['passage_char_ids'],
                    self.q_char: batch['question_char_ids'],
                    self.p_char_length: batch['passage_char_length'],
                    self.q_char_length: batch['question_char_length'],
                }
                feed_dict.update(char_input)
            start_probs, end_probs, loss = self.sess.run(
                [self.start_probs, self.end_probs, self.loss], feed_dict)

            total_loss += loss * len(batch['raw_data'])
            total_num += len(batch['raw_data'])

            padded_p_len = len(batch['passage_token_ids'][0])
            for sample, start_prob, end_prob in zip(batch['raw_data'],
                                                    start_probs, end_probs):

                best_answer = self.find_best_answer(sample, start_prob,
                                                    end_prob, padded_p_len)
                if save_full_info:
                    sample['pred_answers'] = [best_answer]
                    pred_answers.append(sample)
                else:
                    pred_answers.append({
                        'question_id':
                        sample['question_id'],
                        'question_type':
                        sample['question_type'],
                        'answers': [best_answer],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })
                if 'answers' in sample:
                    ref_answers.append({
                        'question_id': sample['question_id'],
                        'question_type': sample['question_type'],
                        'answers': sample['answers'],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    })

        if result_dir is not None and result_prefix is not None:
            result_file = os.path.join(result_dir, result_prefix + '.json')
            with open(result_file, 'w') as fout:
                for pred_answer in pred_answers:
                    fout.write(
                        json.dumps(pred_answer, ensure_ascii=False) + '\n')

            self.logger.info('Saving {} results to {}'.format(
                result_prefix, result_file))

        # this average loss is invalid on test set, since we don't have true start_id and end_id
        ave_loss = 1.0 * total_loss / total_num
        # compute the bleu and rouge scores if reference answers is provided
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        return ave_loss, bleu_rouge
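The feed_dict above calls self._get_label_probs(batch['start_id'], concat_passage_len), which presumably turns integer answer positions into probability vectors over the concatenated passage. The sketch below is only a guess at that interface (one-hot rows), not the model's actual method.

import numpy as np

def get_label_probs(label_ids, concat_passage_len):
    # Hedged sketch of a _get_label_probs-style helper: turn integer
    # start/end positions into one-hot probability rows over the
    # concatenated, padded passage. This guesses at the interface used
    # in the feed_dict above; it is not the model's actual method.
    concat_passage_len = int(concat_passage_len)
    probs = np.zeros((len(label_ids), concat_passage_len), dtype=np.float32)
    for row, pos in enumerate(label_ids):
        probs[row, int(pos)] = 1.0
    return probs

if __name__ == '__main__':
    start_ids = [3, 0]
    print(get_label_probs(start_ids, 5))
    # [[0. 0. 0. 1. 0.]
    #  [1. 0. 0. 0. 0.]]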
Exemple #21
0
    def evaluate(self, eval_batches, result_dir=None, result_prefix=None, save_full_info=False):
        """
        Evaluates the model performance on eval_batches and results are saved if specified
        Args:
            eval_batches: iterable batch data
            result_dir: directory to save predicted answers, answers will not be saved if None
            result_prefix: prefix of the file for saving predicted answers,
                           answers will not be saved if None
            save_full_info: if True, the pred_answers will be added to raw sample and saved
        """
        pred_answers, ref_answers = [], []
        total_loss, total_num = 0, 0
        for b_itx, batch in enumerate(eval_batches):
            feed_dict = {self.p: batch['passage_token_ids'],
                         self.q: batch['question_token_ids'],
                         self.p_length: batch['passage_length'],
                         self.q_length: batch['question_length'],
                         self.start_label: batch['start_id'],
                         self.end_label: batch['end_id'],
                         self.dropout_keep_prob: 1.0,
                         self.em: batch['exact_match']}
            batch_size = len(batch['start_id'])
            padded_p_len = len(batch['passage_token_ids'][0])
            padded_p_num = len(batch['passage_token_ids']) // batch_size
            para_ids = []
            for start_id in batch['start_id']:
                para_ids.append(start_id // padded_p_len)
            feed_dict[self.para_label] = para_ids
            content_label = np.zeros([batch_size, padded_p_num * padded_p_len], dtype=int)
            for s_idx, (start_id, end_id) in enumerate(zip(batch['start_id'], batch['end_id'])):
                content_label[s_idx, start_id: end_id+1] = 1
            feed_dict[self.content_label] = content_label
            start_probs, end_probs, content_scores, verif_scores, loss = self.sess.run([self.start_probs, self.end_probs,
                                                                                        self.concat_content_score, self.reshaped_ans_verif_score,
                                                                                        self.loss], feed_dict)

            total_loss += loss * len(batch['raw_data'])
            total_num += len(batch['raw_data'])

            padded_p_len = len(batch['passage_token_ids'][0])
            for s_idx, sample in enumerate(batch['raw_data']):
                start_prob = start_probs[s_idx]
                end_prob = end_probs[s_idx]
                content_score = content_scores[s_idx]
                verif_score = verif_scores[s_idx]
                best_answer = self.find_best_answer_with_verif(sample, start_prob, end_prob,
                                                               content_score, verif_score, padded_p_len)
                if save_full_info:
                    sample['pred_answers'] = [best_answer]
                    pred_answers.append(sample)
                else:
                    pred_answers.append({'question_id': sample['question_id'],
                                         'question_type': sample['question_type'],
                                         'answers': [best_answer],
                                         'entity_answers': [[]],
                                         'yesno_answers': []})
                if 'answers' in sample:
                    ref_answers.append({'question_id': sample['question_id'],
                                        'question_type': sample['question_type'],
                                        'answers': sample['answers'],
                                        'entity_answers': [[]],
                                        'yesno_answers': []})

        if result_dir is not None and result_prefix is not None:
            result_file = os.path.join(result_dir, result_prefix + '.json')
            with open(result_file, 'w') as fout:
                for pred_answer in pred_answers:
                    fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')

            self.logger.info('Saving {} results to {}'.format(result_prefix, result_file))

        # this average loss is invalid on test set, since we don't have true start_id and end_id
        ave_loss = 1.0 * total_loss / total_num
        # compute the bleu and rouge scores if reference answers is provided
        if len(ref_answers) > 0:
            pred_dict, ref_dict = {}, {}
            for pred, ref in zip(pred_answers, ref_answers):
                question_id = ref['question_id']
                if len(ref['answers']) > 0:
                    pred_dict[question_id] = normalize(pred['answers'])
                    ref_dict[question_id] = normalize(ref['answers'])
            bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
        else:
            bleu_rouge = None
        return ave_loss, bleu_rouge
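The content_label construction above marks every token inside the gold answer span with 1, one row per sample, over the padded concatenated passage. The standalone illustration below uses made-up span positions and lengths purely to show that indexing.

import numpy as np

# Hedged illustration of the content-label matrix built in the feed_dict
# above. Span positions and the padded length are invented for the demo.
start_ids = [2, 0]
end_ids = [4, 1]
padded_len = 6  # stands in for padded_p_num * padded_p_len

content_label = np.zeros([len(start_ids), padded_len], dtype=int)
for s_idx, (start_id, end_id) in enumerate(zip(start_ids, end_ids)):
    # Flag every token from start_id to end_id (inclusive) as answer content.
    content_label[s_idx, start_id:end_id + 1] = 1

print(content_label)
# [[0 0 1 1 1 0]
#  [1 1 0 0 0 0]]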