def dataset_test():
    '''
    Use the entity + attribute of each training QA pair to query the knowledge
    base, measuring the upper bound on QA accuracy.
    :return:
    '''
    with open(file) as f:
        total = 0
        recall = 0
        correct = 0
        for line in f:
            question, entity, attribute, answer, ner = line.split("\t")
            ner = ner.replace("#", "").replace("[UNK]", "%")
            # case 1: entity and attribute exact match
            sql_e1_a1 = "select * from nlpccQA where entity='" + \
                entity + "' and attribute='" + attribute + "' limit 10"
            result_e1_a1 = upload_data(sql_e1_a1)
            # case 2: entity fuzzy match and attribute exact match
            sql_e0_a1 = "select * from nlpccQA where entity like '%" + \
                entity + "%' and attribute='" + attribute + "' limit 10"
            # case 3: entity exact match and attribute fuzzy match
            sql_e1_a0 = "select * from nlpccQA where entity='" + \
                entity + "' and attribute like '%" + attribute + "%' limit 10"
            if len(result_e1_a1) > 0:
                recall += 1
                for l in result_e1_a1:
                    if l[2] == answer:
                        correct += 1
            else:
                result_e0_a1 = upload_data(sql_e0_a1)
                if len(result_e0_a1) > 0:
                    recall += 1
                    for l in result_e0_a1:
                        if l[2] == answer:
                            correct += 1
                else:
                    result_e1_a0 = upload_data(sql_e1_a0)
                    if len(result_e1_a0) > 0:
                        recall += 1
                        for l in result_e1_a0:
                            if l[2] == answer:
                                correct += 1
                    else:
                        loginfo.logger.info(sql_e1_a0)
            if total > 100:
                break
            total += 1
            time.sleep(1)
            loginfo.logger.info(
                "total: {}, recall: {}, correct: {}, accuracy: {}%".format(
                    total, recall, correct,
                    correct * 100.0 / recall if recall else 0.0))
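
# `upload_data` is defined elsewhere in the project. A minimal sketch of what
# it presumably does -- run a SQL query against the MySQL knowledge base and
# return the fetched rows -- assuming pymysql; the connection settings
# (host/user/password/db) here are placeholders, not the project's actual ones:
def upload_data_sketch(sql):
    import pymysql  # assumption: the nlpccQA table lives in a MySQL database
    conn = pymysql.connect(host='localhost', user='root', password='',
                           db='KB_QA', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql)
            # each row is a (entity, attribute, answer) triple
            return cursor.fetchall()
    finally:
        conn.close()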
def kb_fuzzy_classify_test():
    '''
    Run the QA test:
    1. Entity retrieval: run NER on the input question to get the entity set,
       then retrieve from the database every triple related to the input entity.
    2. Attribute mapping -- BERT classification / text similarity
       + non-semantic matching: if a retrieved triple's attribute is a substring
         of the input question, compare that triple's answer with the gold
         answer; on a match, correct += 1.
       + semantic matching: use BERT to score the similarity between the input
         question and each retrieved attribute, take the answer of the most
         similar triple as the prediction, and compare it with the gold answer;
         on a match, correct += 1.
    3. Answer assembly.
    :return:
    '''
    with open(file, encoding='utf-8') as f:
        total = 0
        recall = 0
        correct = 0
        ambiguity = 0  # attribute matched correctly, but the answer did not
        for line in f:
            try:
                total += 1
                question, entity, attribute, answer, ner = line.split("\t")
                ner = ner.replace("#", "").replace("[UNK]", "%").replace("\n", "")
                # case: entity fuzzy match --
                # find every triple whose entity contains the NER result
                sql_e0_a1 = "select * from nlpccQA where entity like '%" + \
                    ner + "%' order by length(entity) asc limit 20"
                # the query returns a tuple; convert it to a list to avoid errors
                result_e0_a1 = list(upload_data(sql_e0_a1))
                if len(result_e0_a1) > 0:
                    recall += 1
                    flag_fuzzy = True
                    # try non-semantic matching first, to save time
                    # l[0]: entity, l[1]: attribute, l[2]: answer
                    flag_ambiguity = True
                    for l in result_e0_a1:
                        if l[1] in question or l[1].lower() in question \
                                or l[1].upper() in question:
                            flag_fuzzy = False
                            if estimate_answer(l[2], answer):
                                correct += 1
                                flag_ambiguity = False
                            else:
                                loginfo.logger.info("\t".join(l))
                    # non-semantic matching succeeded; move on to the next line
                    if not flag_fuzzy:
                        if flag_ambiguity:
                            ambiguity += 1
                        time.sleep(1)
                        loginfo.logger.info(
                            "total: {}, recall: {}, correct: {}, accuracy: {}%,"
                            " ambiguity: {}".format(
                                total, recall, correct,
                                correct * 100.0 / recall, ambiguity))
                        continue
                    # semantic matching
                    result_df = pd.DataFrame(
                        result_e0_a1, columns=['entity', 'attribute', 'value'])
                    # loginfo.logger.info(result_df.head(100))
                    attribute_candidate_sim = [
                        (k, bs.predict(question, k)[0][1])
                        for k in result_df['attribute'].tolist()
                    ]
                    attribute_candidate_sort = sorted(
                        attribute_candidate_sim,
                        key=lambda candidate: candidate[1],
                        reverse=True)
                    loginfo.logger.info("\n".join([
                        str(k) + " " + str(v)
                        for (k, v) in attribute_candidate_sort
                    ]))
                    answer_candidate_df = result_df[
                        result_df["attribute"] == attribute_candidate_sort[0][0]]
                    for row in answer_candidate_df.index:
                        if estimate_answer(
                                answer_candidate_df.loc[row, "value"], answer):
                            correct += 1
                        else:
                            loginfo.logger.info("\t".join(
                                answer_candidate_df.loc[row].tolist()))
                    time.sleep(1)
                    loginfo.logger.info(
                        "total: {}, recall: {}, correct: {}, accuracy: {}%,"
                        " ambiguity: {}".format(
                            total, recall, correct,
                            correct * 100.0 / recall, ambiguity))
            except Exception as e:
                loginfo.logger.info(
                    "the question id %d occurred error %s" % (total, repr(e)))
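
# `estimate_answer` is also defined elsewhere. Judging from how it is called
# above (predicted answer vs. gold answer, returning a truthy value), it is a
# lenient string comparison. A plausible minimal version, assuming only that
# leading/trailing whitespace and letter case should not count against a match:
def estimate_answer_sketch(predicted, gold):
    predicted = predicted.strip().lower()
    gold = gold.strip().lower()
    # accept exact matches and containment in either direction
    return predicted == gold or predicted in gold or gold in predicted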
def kbqa_api(sentence):
    """
    Do online prediction. Each call makes a prediction for one instance;
    you can change it to a batch if you want.
    :param sentence: the input question string
    :return: the predicted answer string, or None
    """
    def convert(line):
        feature = convert_single_example(0, line, label_list,
                                         FLAGS.max_seq_length, tokenizer, 'p')
        input_ids = np.reshape([feature.input_ids],
                               (batch_size, FLAGS.max_seq_length))
        input_mask = np.reshape([feature.input_mask],
                                (batch_size, FLAGS.max_seq_length))
        segment_ids = np.reshape([feature.segment_ids],
                                 (batch_size, FLAGS.max_seq_length))
        label_ids = np.reshape([feature.label_ids],
                               (batch_size, FLAGS.max_seq_length))
        return input_ids, input_mask, segment_ids, label_ids

    global graph
    with graph.as_default():
        print(id2label)
        sentence = str(sentence)
        start = datetime.now()
        if len(sentence) < 2:
            print(sentence)
            return None
        sentence = tokenizer.tokenize(sentence)
        # print('your input is: {}'.format(sentence))
        input_ids, input_mask, segment_ids, label_ids = convert(sentence)
        feed_dict = {input_ids_p: input_ids,
                     input_mask_p: input_mask,
                     segment_ids_p: segment_ids,
                     label_ids_p: label_ids}
        # run the session on the current feed_dict
        pred_ids_result = sess.run([pred_ids], feed_dict)
        pred_label_result = convert_id_to_label(pred_ids_result, id2label)
        print(pred_label_result)
        # todo: combination strategy
        result = strage_combined_link_org_loc(sentence,
                                              pred_label_result[0], True)
        print('The recognized entity is: {}'.format(''.join(result)))
        # print('Time used: {} sec'.format((datetime.now() - start).seconds))
        ner = ''.join(result)
        ner = ner.replace("#", "").replace("[UNK]", "%").replace("\n", "")
        if len(ner) == 0:
            print('cannot recognize an entity in this question')
            return None
        sql_e0_a1 = "select * from nlpccQA where entity like '%" + ner + \
            "%' order by length(entity) asc limit 20"
        result_e0_a1 = list(upload_data(sql_e0_a1))
        if len(result_e0_a1) == 0:
            print('cannot find this NE in the kb')
        else:
            result_df = pd.DataFrame(result_e0_a1,
                                     columns=['entity', 'attribute', 'value'])
            attribute_candidate_sim = [
                (k, bs.predict(sentence, k)[0][1])
                for k in result_df['attribute'].tolist()
            ]
            attribute_candidate_sort = sorted(
                attribute_candidate_sim,
                key=lambda candidate: candidate[1],
                reverse=True)
            print('\nRelated entities in the knowledge base: ', result_df)
            print('\nAttribute similarity ranking:')
            print("\n".join([str(k) + " " + str(v)
                             for (k, v) in attribute_candidate_sort]))
            print('\nThe question is:', sentence)
            answer_candidate_df = result_df[
                result_df["attribute"] == attribute_candidate_sort[0][0]]
            for row in answer_candidate_df.index:
                print('\nRecognized entity: ', ner,
                      'most similar attribute:', attribute_candidate_sort[0][0],
                      'answer to the question:',
                      answer_candidate_df.loc[row, 'value'])
                return answer_candidate_df.loc[row, 'value']
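
# The lookups above are built by string concatenation, so an entity containing
# a quote character breaks the query, and the "[UNK]" -> "%" rewrite works only
# because "%" happens to be the SQL LIKE wildcard. If upload_data is backed by
# a DB-API driver such as pymysql, the same fuzzy lookup can be written with
# placeholders. A sketch under that assumption, not the project's actual code:
def query_kb_safe(cursor, ner):
    # same query as sql_e0_a1, but parameterized: the driver escapes the value
    sql = ("select * from nlpccQA where entity like %s "
           "order by length(entity) asc limit 20")
    cursor.execute(sql, ('%' + ner + '%',))
    return list(cursor.fetchall())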
def kbqa_api(str_input, str_output):
    """
    Do online prediction, JSON in / JSON out. Each call makes a prediction
    for one instance; you can change it to a batch if you want.
    :param str_input: a JSON string such as {"question": "..."}
    :param str_output: overwritten with the JSON result string
    :return: the JSON result string
    """
    dict_output = {'status': 'yes'}
    dict_input = json.loads(str_input)
    print(dict_input, type(dict_input))
    if 'question' in dict_input:
        dict_output['question'] = dict_input['question']
        sentence = dict_input['question']
    else:
        dict_output['status'] = 'there is no question keyword in json format'
        str_output = json.dumps(dict_output)
        return str_output

    def convert(line):
        feature = convert_single_example(0, line, label_list,
                                         FLAGS.max_seq_length, tokenizer, 'p')
        input_ids = np.reshape([feature.input_ids],
                               (batch_size, FLAGS.max_seq_length))
        input_mask = np.reshape([feature.input_mask],
                                (batch_size, FLAGS.max_seq_length))
        segment_ids = np.reshape([feature.segment_ids],
                                 (batch_size, FLAGS.max_seq_length))
        label_ids = np.reshape([feature.label_ids],
                               (batch_size, FLAGS.max_seq_length))
        return input_ids, input_mask, segment_ids, label_ids

    global graph
    with graph.as_default():
        print(id2label)
        sentence = str(sentence)
        start = datetime.now()
        if len(sentence) < 2:
            print(sentence)
            dict_output['status'] = 'question value is too short'
            str_output = json.dumps(dict_output)
            return str_output
        sentence = tokenizer.tokenize(sentence)
        print('your input is: {}'.format(sentence))
        input_ids, input_mask, segment_ids, label_ids = convert(sentence)
        feed_dict = {
            input_ids_p: input_ids,
            input_mask_p: input_mask,
            segment_ids_p: segment_ids,
            label_ids_p: label_ids
        }
        pred_ids_result = sess.run([pred_ids], feed_dict)
        pred_label_result = convert_id_to_label(pred_ids_result, id2label)
        print(pred_label_result)
        # todo: combination strategy
        result = strage_combined_link_org_loc(sentence,
                                              pred_label_result[0], True)
        print('The recognized entity is: {}'.format(''.join(result)))
        ner = ''.join(result)
        ner = ner.replace("#", "").replace("[UNK]", "%").replace("\n", "")
        if len(ner) == 0:
            print('cannot recognize an entity in this question')
            dict_output['status'] = 'can not recognize entity'
            str_output = json.dumps(dict_output)
            return str_output
        else:
            dict_output['entity'] = ner
        sql_e0_a1 = "select * from nlpccQA where entity like '%" + ner + \
            "%' order by length(entity) asc limit 20"
        result_e0_a1 = list(upload_data(sql_e0_a1))
        # print('Time used: {} sec'.format((datetime.now() - start).seconds))
        dict_output['time'] = (datetime.now() - start).microseconds / 1000.0
        if len(result_e0_a1) == 0:
            print('cannot find this NE in the kb')
            dict_output['status'] = 'can not find this entity in kb'
            str_output = json.dumps(dict_output)
            return str_output
        else:
            result_df = pd.DataFrame(result_e0_a1,
                                     columns=['entity', 'attribute', 'value'])
            list_df = [[result_df.iloc[i]['entity'],
                        result_df.iloc[i]['attribute'],
                        result_df.iloc[i]['value']]
                       for i in range(len(result_df))]
            attribute_candidate_sim = [
                (k, bs.predict(sentence, k)[0][1])
                for k in result_df['attribute'].tolist()
            ]
            attribute_candidate_sort = sorted(
                attribute_candidate_sim,
                key=lambda candidate: candidate[1],
                reverse=True)
            print('\nRelated entities in the knowledge base: ', result_df)
            print('\nAttribute similarity ranking:')
            print("\n".join([
                str(k) + " " + str(v)
                for (k, v) in attribute_candidate_sort
            ]))
            print('\nThe question is:', sentence)
            answer_candidate_df = result_df[
                result_df["attribute"] == attribute_candidate_sort[0][0]]
            print(answer_candidate_df.index)
            for row in answer_candidate_df.index:
                print(row)
                print('\nRecognized entity: ', ner,
                      'most similar attribute:',
                      attribute_candidate_sort[0][0],
                      'answer to the question:',
                      answer_candidate_df.loc[row, 'value'])
                dict_output['status'] = 'ok'
                dict_output['answer'] = answer_candidate_df.loc[row, 'value']
                dict_output['kb'] = list_df
                dict_output['attribute'] = "\n".join([
                    str(k) + " " + str(v)
                    for (k, v) in attribute_candidate_sort
                ])
                str_output = json.dumps(dict_output)
                return str_output
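
# A quick usage sketch for the JSON variant above; the question is an arbitrary
# example, and the keys mirror the dict_output fields built in kbqa_api:
request = json.dumps({'question': '《高等数学》这本书的作者是谁?'})
reply = json.loads(kbqa_api(request, ''))
print(reply['status'], reply.get('answer'))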
from datetime import datetime

import tensorflow as tf

from run_similarity import BertSim
from global_config import Logger

loginfo = Logger("recommend_articles.log", "info")
bs = BertSim()
bs.set_mode(tf.estimator.ModeKeys.PREDICT)

while True:
    choice = {}
    question = input("question:")
    start1 = datetime.now()
    ner = predict_service(question)
    print("Recognized entity: {}".format(ner))
    sql_e1 = "select * from nlpccQA where entity='" + ner + \
        "' order by length(entity) asc"
    result_e1 = list(upload_data(sql_e1))
    print("Exact lookup found {} entities in the database".format(
        len(result_e1)))
    result = result_e1
    if len(result_e1) == 0:
        print("Exact lookup found no entity; falling back to fuzzy lookup")
        sql_e0 = "select * from nlpccQA where entity like '%" + ner + \
            "%' order by length(entity) asc"
        result_e0 = list(upload_data(sql_e0))
        print(result_e0)
        if len(result_e0) == 0:
            print("I don't know the answer to this one either~~")
            continue
        k = 1
        entity_candidate = [result_e0[0][0], 0]  # [entity, start, end]
        flag = 0
        for i in range(1, len(result_e0)):