def fast_qa_by_cmkb():
    """Answer the request's question with paragraphs picked by word matching.

    Reads the JSON body of the current request, keeps at most the first two
    documents returned by ``cmkb_elk_retriever.search_elk`` and lets
    ``WordMatchSelector`` choose paragraphs from them. When the retriever
    returns nothing, a canned sample from ``./data/docs/fake_db.jsonl`` is
    used instead. No answer span is extracted: every entry carries an empty
    ``answer`` and an ``answer_pos`` of ``[0, -1]``.

    Returns:
        The ``jsonify`` response. A failure payload when the request body is
        not JSON; otherwise the wrapped response holding up to two answers
        in random order.

    Raises:
        AssertionError: If the fallback file holds no sample whose question
            contains the fallback keyword.
    """
    from qa.para_select import WordMatchSelector
    import random

    if request.json is None:
        return jsonify({
            'result': 'failed',
            'message': 'request is not json'
        })

    req = RequestQA(request.json)
    res = cmkb_elk_retriever.search_elk(req.question)[0:2]
    print('retrieve %d docs' % (len(res)))

    if len(res) == 0:
        # Nothing retrieved: serve a canned sample chosen by keyword, with
        # '高血壓' as the default when the question does not mention '糖尿病'.
        keyword = '糖尿病' if '糖尿病' in req.question else '高血壓'
        target_sample = next(
            (sample for sample in jsonl_reader('./data/docs/fake_db.jsonl')
             if keyword in sample['question']),
            None,
        )
        assert target_sample is not None, (
            'no sample matching %r in ./data/docs/fake_db.jsonl' % (keyword,))
        SimpleParagraphTransform().transform(target_sample)
        records = DureaderRawExample(target_sample).flatten(
            ['question', 'qid'], ['url', 'title'])
        selector = WordMatchSelector(k=1)
    else:
        # One record per paragraph; the document index doubles as qid/doc_id
        # and is embedded in a synthetic question string.
        records = [
            {
                'qid': i,
                'doc_id': i,
                'question': 'q:%d' % (i),
                'passage': passage,
                'url': doc['url'],
                'title': doc['title']
            }
            for i, doc in enumerate(res)
            for passage in doc['paragraphs']
        ]
        # k=1 with two documents, k=2 with a single one.
        # NOTE(review): presumably k is "paragraphs kept per question", so
        # both cases aim at two results in total - confirm in WordMatchSelector.
        selector = WordMatchSelector(k=1 if len(res) > 1 else 2)

    results = selector.paragraph_selection(records)
    # random.sample raises ValueError when k exceeds the population size, so
    # clamp it: fewer than two selected paragraphs must not crash the request.
    results = random.sample(results, k=min(2, len(results)))

    response = {
        'question': req.question,
        'algo_version': req.algo_version,
        'answers': [
            {
                'paragraph': selected['passage'],
                'answer': '',
                'answer_pos': [0, -1],
                'title': selected['title'],
                'url': selected['url']
            }
            for selected in results
        ]
    }
    return jsonify(wrap_response(response))
def load_beta_file(path):
    """Read a JSON-lines file into a flat list of question/passage pairs.

    Each record read through ``jsonl_reader`` contributes one
    ``{'question': ..., 'passage': ...}`` dict per entry of its
    ``paragraphs`` list. Records whose ``question`` has length zero are
    skipped entirely.

    Args:
        path: Location of the file, passed straight to ``jsonl_reader``.

    Returns:
        The list of pair dicts, in file order.
    """
    print('load file %s' % (path))
    pairs = []
    for entry in jsonl_reader(path):
        question = entry['question']
        if len(question) > 0:
            pairs.extend(
                {'question': question, 'passage': passage}
                for passage in entry['paragraphs']
            )
    return pairs
def test_mrc_model():
    """Run the mock multi-document model on the first MRC test record.

    The record from ``./data/test/test_mrc.jsonl`` must pass the ``raw``
    format check, is then converted in place by ``SimpleParagraphTransform``
    and must pass the ``multi_mrc`` check before the model's answer list is
    printed.
    """
    from mrc_server import multi_doc_model_factory
    from preprocessing import SimpleParagraphTransform

    test_data = next(jsonl_reader('./data/test/test_mrc.jsonl'))
    model = multi_doc_model_factory({'model_type': 'mock'})

    assert util.check_input_format(test_data, 'raw')
    SimpleParagraphTransform().transform(test_data)
    assert util.check_input_format(test_data, 'multi_mrc')

    print(model.get_answer_list(test_data))
def test_mock_mrc_server():
    """Post one transformed record to ``/qa`` on an app built with the mock model.

    The first record of ``./data/test/test_mrc.jsonl`` is checked against
    the ``raw`` format, converted in place by ``SimpleParagraphTransform``
    and checked against ``multi_mrc`` before being sent through the Flask
    test client. The decoded JSON reply is printed, not asserted on.
    """
    from mrc_server import create_app
    # NOTE(review): RankerFactory is not used below; the import is kept in
    # case loading qa.ranker has side effects the app relies on - confirm.
    from qa.ranker import RankerFactory  # noqa: F401
    from preprocessing import SimpleParagraphTransform

    test_data = next(jsonl_reader('./data/test/test_mrc.jsonl'))
    test_config = {'model_type': 'mock'}

    assert util.check_input_format(test_data, 'raw')
    SimpleParagraphTransform().transform(test_data)
    assert util.check_input_format(test_data, 'multi_mrc')

    app = create_app(test_config)
    with app.test_client() as c:
        rv = c.post('/qa', json={
            'mrc_input': test_data,
            'answer_num': 3,
            'algo_version': 0
        })
        json_data = rv.get_json()
        print(json_data)
def load_examples_from_scratch(path,sample_stg=None,concat=False,attach_label=None):
    """Build (question, paragraph) examples from a JSON-lines file.

    Args:
        path: File handed to ``jsonl_reader``.
        sample_stg: Optional callable mapping a record to
            ``(pos_examples, neg_examples)``. Positives are labelled 1 and
            negatives 0. When given, ``attach_label`` is not consulted.
        concat: When labels were collected, return
            ``(question, paragraph, label)`` triples instead of two lists.
        attach_label: Labelling rule used when ``sample_stg`` is ``None``.
            ``None`` collects every paragraph of every document without
            labels. Any other value skips documents that lack a
            ``most_related_para`` key and emits one label per paragraph:
            with ``'most_related_para'`` the paragraph at that index gets 1;
            with ``'answer_docs'`` it gets 1 only if the document's index is
            listed in the record's ``answer_docs``; all other labels are 0.

    Returns:
        ``examples`` alone when no label was collected (this includes the
        case where labels were requested but nothing was loaded);
        otherwise the list of triples if ``concat`` is true, else the tuple
        ``(examples, labels)``.
    """
    examples = []
    labels = []
    for line_idx, json_obj in enumerate(jsonl_reader(path)):
        # Progress trace every 2000 records.
        if line_idx % 2000 == 0:
            print('load %dth line'%(line_idx))

        if sample_stg is not None:
            pos_examples, neg_examples = sample_stg(json_obj)
            labels.extend([1] * len(pos_examples) + [0] * len(neg_examples))
            paras = pos_examples + neg_examples
        else:
            paras = []
            para_labels = []
            for doc_idx, doc in enumerate(json_obj['documents']):
                if attach_label is None:
                    # Unlabelled mode: take every paragraph as is.
                    paras.extend(doc['paragraphs'])
                    continue
                if 'most_related_para' not in doc:
                    # Labelled mode cannot use a document without this key.
                    continue
                paras.extend(doc['paragraphs'])
                flags = [0] * len(doc['paragraphs'])
                is_positive = attach_label == 'most_related_para' or (
                    attach_label == 'answer_docs'
                    and 'answer_docs' in json_obj
                    and doc_idx in json_obj['answer_docs']
                )
                if is_positive:
                    flags[doc['most_related_para']] = 1
                assert sum(flags) <2
                para_labels.extend(flags)
            if attach_label is not None:
                assert len(para_labels) == len(paras)
            labels.extend(para_labels)

        # The question is only read when there is at least one paragraph.
        examples.extend([(json_obj['question'].strip(), para) for para in paras])

    print('total %d examples'%(len(examples)))
    if len(labels) == 0:
        return examples
    if concat:
        return [pair + (label,) for pair, label in zip(examples, labels)]
    return examples, labels
def test_mrc_server():
    """Post one transformed record to ``/qa`` on an app built with the pipeline model.

    The app is configured for CPU with the ranker and reader config files
    under ``./data/model``. The first record of
    ``./data/test/test_mrc.jsonl`` is checked against the ``raw`` format,
    converted in place by ``SimpleParagraphTransform`` and checked against
    ``multi_mrc`` before being sent through the Flask test client. The
    decoded JSON reply is printed, not asserted on.
    """
    from mrc_server import create_app
    from preprocessing import SimpleParagraphTransform

    test_data = next(jsonl_reader('./data/test/test_mrc.jsonl'))
    test_config = {
        'model_type': 'pipeline',
        'device': 'cpu',
        'ranker_config_path': './data/model/pointwise/answer_doc/config.json',
        'reader_config_path': './data/model/reader/bert_default/config.json'
    }

    assert util.check_input_format(test_data, 'raw')
    SimpleParagraphTransform().transform(test_data)
    assert util.check_input_format(test_data, 'multi_mrc')

    app = create_app(test_config)
    print('send data')
    with app.test_client() as c:
        rv = c.post('/qa', json={
            'mrc_input': test_data,
            'answer_num': 3,
            'algo_version': 0
        })
        json_data = rv.get_json()
        print(json_data)
def _load_file(self, path):
    """Return the paragraph selections of every sample in ``path``.

    Each raw sample read through ``jsonl_reader`` is passed to
    ``self._para_selection``; the items it yields are concatenated, in file
    order, into a single list.
    """
    return [
        selected
        for raw_sample in jsonl_reader(path)
        for selected in self._para_selection(raw_sample)
    ]
# Check of DureaderRawExample.flatten() against the first record of a
# fixture file, followed by two literal record lists.
from .dureader import DureaderRawExample, DureaderRawDocument, DureaderLoader
from common.util import jsonl_reader

# Only the first record of the fixture is read.
json_obj = next(jsonl_reader('./data/unittest/dureader_fake.json'))
example = DureaderRawExample(json_obj)
flatten_samples = example.flatten()

# The fixture record is expected to flatten into exactly five samples, and
# the first one must carry the question text and id of the source record.
assert len(flatten_samples) == 5
assert json_obj['question'] == flatten_samples[0]['question']
assert json_obj['question_id'] == flatten_samples[0]['question_id']

# NOTE(review): o1 and o2 are not referenced in the visible code; presumably
# they are expected outputs used by checks beyond this excerpt - confirm.
# o1: a single flattened record (doc 0, passage 0).
o1 = [{
    'passage': 'abcde',
    'passage_id': 0,
    'doc_id': 0,
    'question': 'wtf is this?',
    'question_id': 12345
}]
# o2: the same record plus a second one from doc 1, passage 2.
o2 = [{
    'passage': 'abcde',
    'passage_id': 0,
    'doc_id': 0,
    'question': 'wtf is this?',
    'question_id': 12345
}, {
    'passage': 'ccc',
    'passage_id': 2,
    'doc_id': 1,
    'question': 'wtf is this?',
    'question_id': 12345
}]
def test_check_format():
    """Check that an untransformed record is ``raw`` and not ``multi_mrc``.

    Uses the first record of ``./data/test/test_mrc.jsonl`` exactly as
    read, without applying any transform.
    """
    test_data = next(jsonl_reader('./data/test/test_mrc.jsonl'))
    assert util.check_input_format(test_data, 'raw')
    assert not util.check_input_format(test_data, 'multi_mrc')