Esempio n. 1
0
    def process_data(self, file_name, pkl_name):  # entry point
        """Convert a JSON-lines QA file into per-question training samples.

        Every non-blank input line is parsed as a JSON object from which the
        ``question``, ``answer`` and ``recall_paras`` fields are read.  The
        first five recalled paragraphs are normalized and cut into windows
        with ``self.get_doc_strides``; each distinct window is encoded as
        ``<cls> question <sep> window`` and searched for the normalized
        answer with ``self.find_answer``.

        Args:
            file_name: Path of the UTF-8 JSON-lines input file.
            pkl_name: Path of the output file.  Despite the parameter name,
                the output is written as UTF-8 JSON lines, not as a pickle.

        Returns:
            None.  One JSON object per question is written to ``pkl_name``
            and three summary counters are printed.
        """
        process_data = []
        ans = []
        no_answer_count = 0
        with open(file_name, 'r', encoding='utf-8') as f:
            for line in tqdm(f.readlines()):
                # A blank (e.g. trailing) line is not a sample; json.loads
                # would raise on it.
                if not line.strip():
                    continue

                sample = json.loads(line)
                question, answer = sample['question'], sample['answer']

                # The stored 'answers' field keeps the raw answer; only the
                # copy used for matching below is normalized.
                pre_sample = {'question': question, 'docs': [], 'answers': answer}

                recall_paras = [normalized(n) for n in sample['recall_paras']]
                answer = normalized(answer)
                ans.append(answer)

                # These depend only on the question/answer, so compute them
                # once per sample instead of once per window.
                question_tokens = encode_pieces(self.sp_model, question)
                ref_ans_token = encode_pieces(self.sp_model, answer)
                # Context budget: 512 minus the question tokens minus 5
                # (presumably room for special tokens - TODO confirm).
                max_c_len = 512 - len(question_tokens) - 5

                doc_stride = []
                for para in recall_paras[:5]:
                    doc_stride.extend(
                        self.get_doc_strides(para, max_c_len=max_c_len, ds=256))

                # Drop duplicate windows while keeping first-seen order, so
                # the output is identical from run to run (a plain set()
                # orders strings by their per-process hash).
                doc_stride = list(dict.fromkeys(doc_stride))

                for doc_span in doc_stride:
                    doc_span_token = (['<cls>'] + question_tokens + ['<sep>']
                                      + encode_pieces(self.sp_model, doc_span))

                    start, end = self.find_answer(doc_span_token, ref_ans_token)
                    if start[0] == 0:
                        # A first start index of 0 is treated as "answer not
                        # present in this window".
                        ans_dict = {'is_impossible': True, 'answers': [[0, 0, 0, 0]]}
                        no_answer_count += 1
                    else:
                        ans_dict = {'is_impossible': False,
                                    'answers': [[0, 0, answer, 0]],
                                    'muti_ans': [start, end]}
                    pre_sample['docs'].append(
                        {'content': doc_span_token, 'ans_dict': ans_dict})

                process_data.append(pre_sample)

        print('no-ans:', no_answer_count)
        print('len(ans):', len(set(ans)))
        print('len(process_data): ', len(process_data))
        with open(pkl_name, 'w', encoding="utf-8") as fout:
            for feature in process_data:
                fout.write(json.dumps(feature, ensure_ascii=False) + '\n')
Esempio n. 2
0
    def get_doc_strides(self, content, max_c_len=args.max_c_len, ds=256):
        """Cut ``content`` into windows of at most ``max_c_len`` tokens.

        The text is tokenized with ``encode_pieces``; a window starts every
        ``ds`` tokens and each window is returned as the concatenation of
        its token strings.

        Args:
            content: Text to split.
            max_c_len: Maximum number of tokens per window.
            ds: Step, in tokens, between consecutive window starts.

        Returns:
            A list of window strings: a single window (or an empty list for
            empty content) when everything fits in ``max_c_len`` tokens,
            otherwise at most 15 windows.

        Raises:
            ValueError: If ``ds`` is not positive (the window start would
                never advance).
        """
        if ds <= 0:
            raise ValueError('ds must be a positive number of tokens, got %r' % (ds,))

        c_tokens = encode_pieces(self.sp_model, content)
        all_strides = [
            ''.join(c_tokens[here_start:here_start + max_c_len])
            for here_start in range(0, len(c_tokens), ds)
        ]

        if len(c_tokens) <= max_c_len:
            return all_strides[:1]

        # Drop a final window whose text is already contained in the one
        # before it.  The length check matters: when ds >= len(c_tokens)
        # only one window exists and there is no previous window to compare.
        if len(all_strides) >= 2 and all_strides[-1] in all_strides[-2]:
            all_strides = all_strides[:-1]

        return all_strides[:15]
Esempio n. 3
0
 def train(json_path, seg, save_path):
     """Fit a TF-IDF model on a JSON-lines corpus and pickle it.

     Each non-blank line of ``json_path`` is parsed as a JSON object whose
     ``context`` field is tokenized with ``encode_pieces`` using the
     module-level ``sp_model``.  The tokens are joined with spaces to form
     the documents the vectorizer is fitted on.

     Args:
         json_path: Path of the UTF-8 JSON-lines corpus.
         seg: Unused; kept so existing callers keep working.
         save_path: Directory that receives ``tfidf_model.pkl`` (the fitted
             vectorizer) and ``document.pkl`` (the space-joined documents).

     Returns:
         None.
     """
     # Similarity.get_index_id_trans()
     print('===== loading data =====')
     real_documents = []
     with open(json_path, 'r', encoding='utf-8') as f:
         for data_piece in tqdm(f.read().split('\n')):
             # A file that ends with a newline yields a final empty piece,
             # which json.loads cannot parse.
             if not data_piece.strip():
                 continue
             context = json.loads(data_piece)['context']
             real_documents.append(encode_pieces(sp_model, context))  # seg(context)

     tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",
                                        max_df=0.7,
                                        min_df=1 / 3000)
     document = [' '.join(sent) for sent in real_documents]
     tfidf_model = tfidf_vectorizer.fit(document)

     # The dense [n_samples, n_features] TF-IDF matrix is intentionally not
     # built here: it was never saved (the original note says it could not
     # be pickled), so densifying it only cost memory.  It can be rebuilt
     # from the two pickles below with tfidf_model.transform(document).
     with open(os.path.join(save_path, 'tfidf_model.pkl'), 'wb') as f:
         pickle.dump(tfidf_model, f)
     with open(os.path.join(save_path, 'document.pkl'), 'wb') as f:
         pickle.dump(document, f)
Esempio n. 4
0
 def predict(query, seg, tfidf_model, sparse_result):
     """Rank documents against ``query`` by TF-IDF dot product.

     Args:
         query: Query text; tokenized with ``encode_pieces`` using the
             module-level ``sp_model``.
         seg: Unused.
         tfidf_model: Fitted vectorizer providing ``transform``.
         sparse_result: Document-term matrix with one row per document.
             Assumed to be a dense ``numpy.matrix`` so that ``*`` is a
             matrix product - TODO confirm against the caller.

     Returns:
         A list of up to 300 document indices, highest score first.
     """
     query_pieces = encode_pieces(sp_model, query)
     query_vec = tfidf_model.transform([' '.join(query_pieces)]).todense()
     # Row 0 holds the score of the single query against every document.
     scores = np.array(query_vec * sparse_result.T)[0]
     best_first = np.argsort(scores)[::-1]
     return list(best_first[:300])
Esempio n. 5
0
def predict(model, paths):
    """Extract one answer span per question and write them to ``result.json``.

    For every non-blank JSON line in every file of ``paths`` the ``qid``,
    ``question`` and ``recall_paras`` fields are read.  The first 15
    recalled paragraphs are normalized, split into windows with
    ``get_doc_strides`` and each window is scored by ``model``.  The
    best-scoring span over all windows becomes the prediction for that
    question.

    Args:
        model: Network called as ``model(input_ids, token_type_ids=...)``
            and returning ``(start, end, verify_gate, cls_logit)``.
        paths: Iterable of UTF-8 JSON-lines file paths.

    Returns:
        None.  One ``{'pre', 'qid', 'question'}`` object per question is
        written to ``result.json`` as JSON lines.
    """
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load('data/spiece.model')

    device = torch.device("cuda", 1)
    tokenizer = XLNetTokenizer()

    model = model.to(device)

    predict_data = []

    with torch.no_grad():
        model.eval()
        for path in paths:
            with open(path, 'r', encoding='utf-8') as f:
                for line in tqdm(f.readlines()):
                    # Skip blank (e.g. trailing) lines instead of letting
                    # json.loads raise on them.
                    if not line.strip():
                        continue

                    sample = json.loads(line)
                    qid, question, recall_paras = \
                        sample['qid'], sample['question'], sample['recall_paras']

                    recall_paras = [normalized(d) for d in recall_paras]
                    # NOTE(review): this budget uses the question's character
                    # count, not its token count - confirm that is intended.
                    max_c_len = 512 - len(question) - 5
                    all_best_score = -999
                    pre_ans = ''

                    # Tokenize the question once; every window reuses it.
                    question_tokens = encode_pieces(sp_model, question)
                    doc_head_len = len(question_tokens)
                    ques_len = doc_head_len

                    for text in recall_paras[:15]:
                        # Spaces are replaced by a placeholder character and
                        # restored after the answer has been extracted.
                        text = '凰'.join(
                            [x for x in text.split(' ') if len(x) != 0])
                        doc_strides = get_doc_strides(sp_model,
                                                      content=text,
                                                      max_c_len=max_c_len,
                                                      ds=256)

                        for doc_span in doc_strides:

                            tokens = ['<cls>'] + question_tokens + ['<sep>'] + \
                                encode_pieces(sp_model, doc_span)

                            input_ids = tokenizer.convert_tokens_to_ids(tokens)
                            # Segment 0 covers <cls> plus the question; the
                            # remaining positions are segment 1.
                            tokentype_ids = [1] * len(input_ids)
                            tokentype_ids[:ques_len + 1] = [0] * (ques_len + 1)
                            assert len(tokentype_ids) == len(input_ids)

                            input_ids = torch.tensor(input_ids).unsqueeze(
                                0).to(device)
                            tokentype_ids = torch.tensor(
                                tokentype_ids).unsqueeze(0).to(device)

                            start, end, verify_gate, cls_logit = model(
                                input_ids, token_type_ids=tokentype_ids)

                            start = start.cpu().squeeze().tolist()
                            end = end.cpu().squeeze().tolist()
                            # Sum of the start and end scores at position 0
                            # (<cls>); subtracted from every span score below.
                            is_ans = start[0] + end[0]

                            verify_gate = verify_gate.cpu().squeeze().tolist()
                            verify_gate = [i[1] for i in verify_gate]

                            # Only the five highest start / end scores are
                            # considered as span boundaries.
                            start_g = sorted(start)[-5:][0]
                            end_g = sorted(end)[-5:][0]

                            # s indexes start[1:-2], so the token position of
                            # a candidate start is s + 1.
                            for s, s_prob in enumerate(start[1:-2]):
                                if s < doc_head_len - 2:
                                    continue
                                if s_prob < start_g:
                                    continue
                                # e is the span length minus one, capped at
                                # 280 positions after the start.
                                for e, e_prob in enumerate(end[s + 1:s + 1 +
                                                               280]):
                                    if e_prob < end_g:
                                        continue

                                    v_score = np.min(verify_gate[s + 1:e + s +
                                                                 2])
                                    h_score = (
                                        s_prob + e_prob
                                    ) * 1 - is_ans + v_score

                                    if h_score > all_best_score:
                                        all_best_score = h_score
                                        pre_ans = ''.join(tokens[s + 1:e + s +
                                                                 2])

                    pre_ans = pre_ans.replace('凰', ' ')
                    predict_data.append({
                        'pre': pre_ans,
                        'qid': qid,
                        'question': question
                    })

    with open('result.json', 'w', encoding="utf-8") as fout:
        for feature in predict_data:
            fout.write(json.dumps(feature, ensure_ascii=False) + '\n')