Esempio n. 1
0
    def process_data(self, file_name, pkl_name):  # entry point
        """Convert raw QA samples into tokenized features and write them out.

        Reads ``file_name`` as JSON lines. For every sample the first five
        normalized ``recall_paras`` are cut into strided spans, each span is
        tokenized together with the question, and the answer position is
        looked up with ``self.find_answer``. One JSON object per sample is
        written to ``pkl_name`` (despite the name, the output is JSON lines
        text, not a pickle).

        Args:
            file_name: path of the input JSON-lines file. Each record must
                provide ``question``, ``answer`` and ``recall_paras``.
            pkl_name: path of the JSON-lines file to write.
        """
        samples = []
        normalized_answers = []
        no_answer_count = 0
        with open(file_name, 'r', encoding='utf-8') as f:
            for line in tqdm(f.readlines()):
                # A blank (e.g. trailing) line would make json.loads raise.
                if not line.strip():
                    continue

                sample = json.loads(line)
                question, answer = sample['question'], sample['answer']
                # The stored 'answers' field keeps the raw, un-normalized answer.
                pre_sample = {'question': question, 'docs': [], 'answers': answer}

                recall_paras = [normalized(n) for n in sample['recall_paras']]
                answer = normalized(answer)
                normalized_answers.append(answer)

                # Question and answer tokens are fixed for the whole sample,
                # so tokenize them once instead of once per paragraph/span.
                question_tokens = encode_pieces(self.sp_model, question)
                ref_ans_token = encode_pieces(self.sp_model, answer)
                # 512 total positions minus the question and a 5-token margin.
                max_c_len = 512 - len(question_tokens) - 5

                doc_spans = []
                for para in recall_paras[:5]:
                    doc_spans.extend(
                        self.get_doc_strides(para, max_c_len=max_c_len, ds=256))

                # Drop duplicate spans while keeping first-seen order, so the
                # output is reproducible across runs (a set would reorder them).
                doc_spans = list(dict.fromkeys(doc_spans))

                for doc_span in doc_spans:
                    doc_span_token = (['<cls>'] + question_tokens + ['<sep>'] +
                                      encode_pieces(self.sp_model, doc_span))

                    start, end = self.find_answer(doc_span_token, ref_ans_token)
                    if start[0] == 0:
                        # First start index 0 is treated as "no answer here".
                        ans_dict = {'is_impossible': True,
                                    'answers': [[0, 0, 0, 0]]}
                        no_answer_count += 1
                    else:
                        ans_dict = {'is_impossible': False,
                                    'answers': [[0, 0, answer, 0]],
                                    'muti_ans': [start, end]}
                    pre_sample['docs'].append(
                        {'content': doc_span_token, 'ans_dict': ans_dict})

                samples.append(pre_sample)

        print('no-ans:', no_answer_count)
        print('len(ans):', len(set(normalized_answers)))
        print('len(process_data): ', len(samples))
        with open(pkl_name, 'w', encoding="utf-8") as fout:
            for feature in samples:
                fout.write(json.dumps(feature, ensure_ascii=False) + '\n')
Esempio n. 2
0
    def train(json_path, seg, save_path, stopwords):
        """Build a BM25 model over the contexts in ``json_path`` and pickle it.

        Each non-empty line of ``json_path`` is a JSON object with a
        ``context`` field. The context is normalized, segmented with ``seg``,
        stripped of stopwords, and extended with those adjacent-word 2-grams
        that also occur in ``get_all_question_token()``. The resulting corpus
        is fed to ``BM25`` and the model is saved as
        ``<save_path>/bm25_2gram.Model``.

        Args:
            json_path: path of the JSON-lines context file.
            seg: callable turning a string into an iterable of words.
            save_path: directory the pickled model is written to.
            stopwords: container of words to drop (used with ``in``).
        """
        # Build the lookup set once; it does not change between documents.
        question_grams = set(get_all_question_token())

        print('===== loading data =====')
        corpus = []
        with open(json_path, 'r') as f:
            for data_piece in tqdm(f.read().split('\n')):
                # split('\n') yields '' after a trailing newline, which
                # json.loads cannot parse.
                if not data_piece.strip():
                    continue

                context = normalized(json.loads(data_piece)['context'])
                # Segment into words and drop stopwords.
                doc = [word for word in seg(context) if word not in stopwords]
                # 2-grams of adjacent remaining words.
                bigrams = {''.join((left, right))
                           for left, right in zip(doc, doc[1:])}
                # Keep only the 2-grams that also occur in the questions.
                doc.extend(bigrams & question_grams)

                corpus.append(doc)

        dictionary = corpora.Dictionary(corpus)
        print('document dictionary length: {}'.format(len(dictionary)))
        # The corpus is the full document collection.
        bm25Model = BM25(corpus)
        with open(os.path.join(save_path, 'bm25_2gram.Model'), 'wb') as f:
            pickle.dump(bm25Model, f)
Esempio n. 3
0
 def context_eval(seg, tfidf_model, sparse_result):
     """Measure how often TF-IDF retrieval returns a context's own entry.

     Every non-empty line of ``SAVE_PATH/context.json`` is a JSON object with
     ``context`` and ``id`` fields. The normalized context is used as the
     query for ``TFIDF.predict``; a hit is counted when the mapped id is in
     the returned result.

     Args:
         seg: segmentation callable forwarded to ``TFIDF.predict``.
         tfidf_model: model object forwarded to ``TFIDF.predict``.
         sparse_result: precomputed data forwarded to ``TFIDF.predict``.

     Returns:
         The hit rate ``acc / total``, or ``0.0`` when the file holds no
         records.
     """
     id2index, index2id = get_index_id_trans(
         os.path.join(SAVE_PATH, 'id2index.pkl'),
         os.path.join(SAVE_PATH, 'index2id.pkl'))
     print('===== loading data =====')
     acc = 0
     total = 0
     with open(os.path.join(SAVE_PATH, 'context.json'), 'r') as f:
         for data_piece in tqdm(f.read().split('\n')):
             # split('\n') yields '' after a trailing newline, which
             # json.loads cannot parse.
             if not data_piece.strip():
                 continue
             # Parse each record once and reuse it for both fields.
             record = json.loads(data_piece)
             context = normalized(record['context'])
             res = TFIDF.predict(context, seg, tfidf_model, sparse_result)
             total += 1
             # NOTE(review): a record id is looked up in `index2id`; confirm
             # this is the intended mapping rather than `id2index`.
             if index2id[record['id']] in res:
                 acc += 1
     # Guard against an empty file instead of raising ZeroDivisionError.
     rate = acc / total if total else 0.0
     print("total: {}, acc: {}, rate: {}".format(total, acc, rate))
     return rate
Esempio n. 4
0
    def predict(from_train, query, seg, bm25Model, stopwords):
        """Return up to 300 document indices, best BM25 score first.

        The query is normalized and segmented, adjacent-word 2-grams are
        appended to the word list, stopwords are removed, and the remaining
        terms are scored with ``bm25Model.get_scores``.

        Args:
            from_train: pair ``(train_question, question2id)``; unpacked but
                only referenced by the disabled boost noted below.
            query: raw query string.
            seg: callable turning a string into a list of words.
            bm25Model: object exposing ``get_scores(doc)``.
            stopwords: container of terms to drop (used with ``in``).
        """
        train_question, question2id = from_train
        terms = seg(normalized(query))
        # n-gram: join each word with its right-hand neighbour.
        bigrams = [''.join([left, right])
                   for left, right in zip(terms, terms[1:])]
        terms.extend(bigrams)

        doc = [term for term in terms if term not in stopwords]

        # Result is currently unused; the call is kept as in the original.
        avg_idf = nCoV_BM25.get_avg_idf(bm25Model)
        scores = np.array(bm25Model.get_scores(doc))
        # Disabled boost for questions found in the training set:
        # fak = find_from_train(query, train_question, question2id)
        # scores[fak] = (scores[fak]+1)*10
        descending = np.argsort(scores)[::-1]
        return list(descending[:300])
Esempio n. 5
0
def evaluate(model, one_or_more='one', path=args.split_dev_data, log_every=0):
    """Run the reader on a dev file and return the mean ROUGE-L score.

    For each JSON-lines sample the candidate texts are split into strided
    spans, every span is scored by ``model``, and the best-scoring answer
    span over all candidates is kept as the prediction. Samples whose chosen
    document is neither the gold document nor one containing the answer are
    dumped to ``look_data.json`` for inspection.

    Args:
        model: network called as ``model(input_ids, token_type_ids=...)`` and
            returning ``(start, end, verify_gate, cls_logit)``.
        one_or_more: ``'more'`` searches the first 15 ``recall_paras``; any
            other value searches only the sample's own ``text``.
        path: JSON-lines dev file. Each record provides ``docid``,
            ``question``, ``answer``, ``text``, ``recall_paras`` and ``top``.
        log_every: when positive, print the running hit-rate report every
            ``log_every`` samples. The default ``0`` prints only the final
            report.

    Returns:
        Mean ROUGE-L over all samples, or ``0.0`` when the file has none.
    """
    with open('./data/similarity/id2index.pkl', 'rb') as f:
        id2index = pickle.load(f)

    sp_model = spm.SentencePieceProcessor()
    sp_model.Load('data/spiece.model')

    rouge_L = RougeL(beta=1)

    device = torch.device("cuda", 0)
    tokenizer = XLNetTokenizer()

    model = model.to(device)

    predict_data = []
    hit_rate = {
        'docid_correct': 0,
        'ans_in_doc_correct': 0,
        'total': 0,
        'docid_score': 0,
        'ans_in_doc_score': 0,
        'wrong_score': 0
    }
    look_data = []

    def _format_hit_rate():
        """Render the running counters as the multi-line report string."""
        # max(..., 1) keeps the report printable when no sample was read.
        total = max(hit_rate['total'], 1)
        return ('docid_hit_rate:{}\n'
                'ans_in_doc_hit_rate:{}\n'
                'docid_score:{}\n'
                'ans_in_doc_score:{}\n'
                'wrong_score:{}\n'.format(
                    float(hit_rate['docid_correct']) / total,
                    float(hit_rate['ans_in_doc_correct']) / total,
                    float(hit_rate['docid_score']) / total,
                    float(hit_rate['ans_in_doc_score']) / total,
                    float(hit_rate['wrong_score']) / total))

    with torch.no_grad():
        model.eval()
        with open(path, 'r', encoding='utf-8') as f:
            for step, line in enumerate(tqdm(f.readlines())):
                # A blank (e.g. trailing) line would make json.loads raise.
                if not line.strip():
                    continue

                sample = json.loads(line)
                docid, question, answer, text, recall_paras = \
                    sample['docid'], sample['question'], sample['answer'], sample['text'], sample['recall_paras']
                top = [int(t) for t in sample['top']]

                recall_paras = [normalized(n) for n in recall_paras]
                ori_text = normalized(text)
                answer = normalized(answer)

                all_best_score = -999
                pre_ans = ''
                doc_index = top[0]

                # The question is fixed for the sample: tokenize it once and
                # reuse the tokens for every span.
                question_tokens = encode_pieces(sp_model, question)
                doc_head_len = len(question_tokens)
                # 512 total positions minus the question and a 5-token margin.
                max_c_len = 512 - doc_head_len - 5

                if one_or_more == 'more':
                    texts = recall_paras[:15]
                else:
                    # NOTE(review): this is the raw `text`, not the
                    # normalized `ori_text` - confirm that is intended.
                    texts = [text]

                for doc_num, text in enumerate(texts):
                    # Replace spaces with a placeholder character; it is
                    # mapped back to a space once the answer is extracted.
                    text = '凰'.join(
                        [x for x in text.split(' ') if len(x) != 0])

                    doc_strides = get_doc_strides(sp_model,
                                                  content=text,
                                                  max_c_len=max_c_len,
                                                  ds=256)

                    for doc_span in doc_strides:
                        tokens = (['<cls>'] + question_tokens + ['<sep>'] +
                                  encode_pieces(sp_model, doc_span))
                        input_ids = tokenizer.convert_tokens_to_ids(tokens)

                        # Segment 0 for '<cls>' + question, 1 for the rest.
                        tokentype_ids = [1] * len(input_ids)
                        tokentype_ids[:doc_head_len + 1] = \
                            [0] * (doc_head_len + 1)
                        assert len(tokentype_ids) == len(input_ids)

                        tokentype_ids = torch.tensor(tokentype_ids).unsqueeze(
                            0).to(device)
                        input_ids = torch.tensor(input_ids).unsqueeze(0).to(
                            device)

                        start, end, verify_gate, cls_logit = model(
                            input_ids, token_type_ids=tokentype_ids)

                        start = start.cpu().squeeze().tolist()
                        end = end.cpu().squeeze().tolist()
                        verify_gate = verify_gate.cpu().squeeze().tolist()
                        # Keep the second component of each per-token entry.
                        verify_gate = [i[1] for i in verify_gate]

                        # Scores at position 0 ('<cls>') are subtracted from
                        # every candidate span's score below.
                        is_ans = (start[0] + end[0])

                        # Only consider the 5 highest start and end scores.
                        start_g = sorted(start)[-5:][0]
                        end_g = sorted(end)[-5:][0]

                        for s, s_prob in enumerate(start[1:-2]):
                            # Do not search inside the question head.
                            if s < doc_head_len - 2:
                                continue
                            if s_prob < start_g:
                                continue
                            for e, e_prob in enumerate(end[s + 1:s + 1 + 280]):
                                if e_prob < end_g:
                                    continue

                                v_score = np.min(verify_gate[s + 1:e + s + 2])
                                h_score = (s_prob +
                                           e_prob) * 1 - is_ans + v_score
                                # Never triggers while `texts` is capped at
                                # 15 entries (doc_num is at most 14).
                                if doc_num > 14 and h_score > 0:
                                    h_score *= 0.7
                                if h_score > all_best_score:
                                    here_ans = ''.join(tokens[s + 1:e + s + 2])
                                    all_best_score = h_score
                                    pre_ans = here_ans
                                    doc_index = top[doc_num]

                pre_ans = pre_ans.replace('凰', ' ')
                sc = rouge_L.get_rouge_L(answer, pre_ans)
                if doc_index == id2index[docid]:
                    hit_rate['docid_correct'] += 1
                    hit_rate['ans_in_doc_correct'] += 1
                    hit_rate['docid_score'] += sc
                    hit_rate['ans_in_doc_score'] += sc
                elif answer in recall_paras[top.index(doc_index)]:
                    hit_rate['ans_in_doc_correct'] += 1
                    hit_rate['ans_in_doc_score'] += sc
                else:
                    is_docid_in_top15 = id2index[docid] in top[:15]
                    look_data.append({
                        'predict_ans': pre_ans,
                        'real_ans': answer,
                        'top15_docs': recall_paras[:15],
                        'model_choose_doc': recall_paras[top.index(doc_index)],
                        'real_doc': ori_text,
                        'is_docid_in_top15': is_docid_in_top15,
                        'question': question,
                        'rouge-L': sc
                    })

                    hit_rate['wrong_score'] += sc

                hit_rate['total'] += 1

                predict_data.append({'pre': pre_ans, 'rel': answer})
                if log_every > 0 and step % log_every == 0:
                    print(_format_hit_rate())

    # NOTE(review): ROUGE-L is called here as (prediction, reference) but as
    # (reference, prediction) inside the loop above - confirm the metric is
    # symmetric for beta=1.
    score = 0.0
    for n in predict_data:
        score += rouge_L.get_rouge_L(n['pre'], n['rel'])
    print(_format_hit_rate())
    # Guard against an empty dev file instead of raising ZeroDivisionError.
    mean_rouge = score / len(predict_data) if predict_data else 0.0
    print('rouge_L : ', mean_rouge)

    with open('look_data.json', 'w', encoding="utf-8") as fout:
        for feature in look_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')

    return mean_rouge
Esempio n. 6
0
def predict(model, paths):
    """Predict an answer for every sample in ``paths``; write ``result.json``.

    For each JSON-lines sample the first 15 normalized ``recall_paras`` are
    split into strided spans, every span is scored by ``model``, and the
    best-scoring answer span over all paragraphs is kept. One JSON object
    with ``pre``, ``qid`` and ``question`` is written per sample.

    Args:
        model: network called as ``model(input_ids, token_type_ids=...)`` and
            returning ``(start, end, verify_gate, cls_logit)``.
        paths: iterable of JSON-lines files. Each record provides ``qid``,
            ``question`` and ``recall_paras``.
    """
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load('data/spiece.model')

    device = torch.device("cuda", 1)
    tokenizer = XLNetTokenizer()

    model = model.to(device)

    predict_data = []

    with torch.no_grad():
        model.eval()
        for path in paths:
            with open(path, 'r', encoding='utf-8') as f:
                for line in tqdm(f.readlines()):
                    # A blank (e.g. trailing) line would make json.loads raise.
                    if not line.strip():
                        continue

                    sample = json.loads(line)
                    qid, question, recall_paras = \
                        sample['qid'], sample['question'], sample['recall_paras']

                    recall_paras = [normalized(d) for d in recall_paras]

                    # Tokenize the question once and reuse it for every span.
                    question_tokens = encode_pieces(sp_model, question)
                    doc_head_len = len(question_tokens)
                    # Budget the context by the question's token count (what
                    # the model input actually contains), not by its
                    # character length.
                    max_c_len = 512 - doc_head_len - 5
                    all_best_score = -999
                    pre_ans = ''

                    for text in recall_paras[:15]:
                        # Replace spaces with a placeholder character; it is
                        # mapped back to a space once the answer is extracted.
                        text = '凰'.join(
                            [x for x in text.split(' ') if len(x) != 0])
                        doc_strides = get_doc_strides(sp_model,
                                                      content=text,
                                                      max_c_len=max_c_len,
                                                      ds=256)

                        for doc_span in doc_strides:
                            tokens = (['<cls>'] + question_tokens +
                                      ['<sep>'] +
                                      encode_pieces(sp_model, doc_span))

                            input_ids = tokenizer.convert_tokens_to_ids(tokens)
                            # Segment 0 for '<cls>' + question, 1 for the rest.
                            tokentype_ids = [1] * len(input_ids)
                            tokentype_ids[:doc_head_len + 1] = \
                                [0] * (doc_head_len + 1)
                            assert len(tokentype_ids) == len(input_ids)

                            input_ids = torch.tensor(input_ids).unsqueeze(
                                0).to(device)
                            tokentype_ids = torch.tensor(
                                tokentype_ids).unsqueeze(0).to(device)

                            start, end, verify_gate, cls_logit = model(
                                input_ids, token_type_ids=tokentype_ids)

                            # Convert each output to a Python list once.
                            start = start.cpu().squeeze().tolist()
                            end = end.cpu().squeeze().tolist()
                            verify_gate = verify_gate.cpu().squeeze().tolist()
                            # Keep the second component of each per-token
                            # entry.
                            verify_gate = [i[1] for i in verify_gate]

                            # Scores at position 0 ('<cls>') are subtracted
                            # from every candidate span's score below.
                            is_ans = start[0] + end[0]

                            # Only consider the 5 highest start/end scores.
                            start_g = sorted(start)[-5:][0]
                            end_g = sorted(end)[-5:][0]

                            for s, s_prob in enumerate(start[1:-2]):
                                # Do not search inside the question head.
                                if s < doc_head_len - 2:
                                    continue
                                if s_prob < start_g:
                                    continue
                                for e, e_prob in enumerate(
                                        end[s + 1:s + 1 + 280]):
                                    if e_prob < end_g:
                                        continue

                                    v_score = np.min(
                                        verify_gate[s + 1:e + s + 2])
                                    h_score = (s_prob + e_prob) * 1 \
                                        - is_ans + v_score

                                    if h_score > all_best_score:
                                        here_ans = ''.join(
                                            tokens[s + 1:e + s + 2])
                                        all_best_score = h_score
                                        pre_ans = here_ans

                    pre_ans = pre_ans.replace('凰', ' ')
                    predict_data.append({
                        'pre': pre_ans,
                        'qid': qid,
                        'question': question
                    })

    with open('result.json', 'w', encoding="utf-8") as fout:
        for feature in predict_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')