# NOTE: json, flask.request, and the module-level globals used below (sess,
# fetches, sess_seq, fetches_seq, label_list, tokenizer, max_seq_length,
# token_label_id2label, string_tokenizer, sequence_tokenizer, getSegments,
# getTriples_v1) are assumed to be imported/initialized elsewhere in this module.

def predict_single(data):
    # `data` is currently unused: predictions run on the hard-coded samples below.
    response = []
    predict_test_data = [
        # "《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡",
        # "你是最爱词:许常德李素珍/曲:刘天健你的故事写到你离去后为止",
        # "《苏州商会档案丛编第二辑》是2012年华中师范大学出版社出版的图书,作者是马敏、祖苏、肖芃",
        "广州珂妃化妆品有限公司创立于2005年,是一家集研发、生产、销售、服务为一体的多元化化妆品集团公司",
        "吕雅堂(1907-)安徽寿县人,1926年10月南京中央军校第六期,后任国民革命军徐州剿总骑兵第一旅副旅长",
    ]
    # num_actual_predict_examples = len(predict_test_data)
    features = string_tokenizer(predict_test_data, label_list, max_seq_length, tokenizer)
    result = sess.run(fetches, feed_dict={
        'input_ids:0': features['input_ids'],
        'input_mask:0': features['input_mask'],
        'segment_ids:0': features['segment_ids']
    })
    for prediction in result:
        prediction = prediction.tolist()
        print("\n\n prediction:\n{}".format(prediction))
        # Print every predicate whose probability clears the 0.5 threshold.
        for idx, class_probability in enumerate(prediction):
            if class_probability > 0.5:
                print(label_list[idx])
        # The response keeps only the single most probable predicate.
        predicate_predict = [label_list[prediction.index(max(prediction))]]
        response.append({"relations": predicate_predict})
    return json.dumps(response)
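# For readers without the rest of this repo: a minimal sketch of the features
# string_tokenizer is assumed to produce, i.e. BERT-style rows padded to
# max_seq_length. This is an illustration under assumptions (CLS/SEP framing,
# zero padding, single-sentence segment ids), not the project's actual helper.
def string_tokenizer_sketch(texts, label_list, max_seq_length, tokenizer):
    # label_list is unused here; kept only to mirror the real signature.
    features = {'input_ids': [], 'input_mask': [], 'segment_ids': []}
    for text in texts:
        # Truncate so there is room for the [CLS] and [SEP] markers.
        tokens = ['[CLS]'] + tokenizer.tokenize(text)[:max_seq_length - 2] + ['[SEP]']
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        # Zero-pad every feature row out to max_seq_length.
        padding = [0] * (max_seq_length - len(input_ids))
        features['input_ids'].append(input_ids + padding)
        features['input_mask'].append(input_mask + padding)
        # Single-sentence input: all segment ids stay 0.
        features['segment_ids'].append([0] * max_seq_length)
    return features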
def predict_v1():
    data = request.get_data()
    json_data = json.loads(data.decode("utf-8"))
    # Split each document into segments shorter than max_raw_str_len.
    res = []
    for doc in json_data["data"]:
        segments = getSegments(doc)
        # raw seg / token / token_unk / predicate_ids
        print("Total {} segments:\n{}".format(len(segments), segments))
        # Stage 1: multi-label predicate classification over each segment.
        features = string_tokenizer(segments, label_list, max_seq_length, tokenizer)
        result = sess.run(fetches, feed_dict={
            'input_ids:0': features['input_ids'],
            'input_mask:0': features['input_mask'],
            'segment_ids:0': features['segment_ids']
        })
        predicate_ids = []
        for prediction in result:
            prediction = prediction.tolist()
            predicate_id = [idx for idx, class_probability in enumerate(prediction)
                            if class_probability > 0.5]
            predicate_ids.append(predicate_id)
        # Some segments may contain no predicate at all; drop them before
        # the sequence-labeling pass.
        seq_segments, seq_predicate_ids = [], []
        for seg, pred in zip(segments, predicate_ids):
            if len(pred) > 0:
                seq_segments.append(seg)
                seq_predicate_ids.append(pred)
        # Stage 2: token-level sequence labeling, one row per (segment, predicate).
        triples = []
        if len(seq_segments) > 0:
            features_seq = sequence_tokenizer(seq_segments, seq_predicate_ids,
                                              label_list, max_seq_length, tokenizer)
            # print("\nfeatures_seq:\n{}\n\n".format(features_seq))
            # TODO: could we run only the token-label fetch (fetches_seq[2])?
            seq_result = sess_seq.run(fetches_seq, feed_dict={
                'input_ids:0': features_seq['input_ids'],
                'input_mask:0': features_seq['input_mask'],
                'segment_ids:0': features_seq['segment_ids']
            })
            seq_label_result = seq_result[2]
            # for token_label_prediction in seq_label_result:
            #     token_label_output_line = " ".join(token_label_id2label[id] for id in token_label_prediction)
            #     print("raw out:\n{}".format(token_label_prediction))
            #     print("label out:\n{}".format(token_label_output_line))
            #     print("\n")
            res_index = 0
            for i, src in enumerate(seq_segments):
                for predicate_id in seq_predicate_ids[i]:
                    pred = label_list[predicate_id]
                    trps = getTriples_v1(seq_label_result[res_index], src, pred)
                    triples.extend(trps)
                    res_index += 1
        # print("triples: {}".format(triples))
        res.append(triples)
    return json.dumps(res, ensure_ascii=False)
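# Hypothetical client for predict_v1, assuming the handler is bound to a
# Flask route such as POST /predict_v1 (the URL, port, and route path below
# are assumptions, not part of this module):
#
#   import json
#   import requests
#
#   payload = {"data": ["广州珂妃化妆品有限公司创立于2005年,是一家集研发、生产、销售、服务为一体的多元化化妆品集团公司"]}
#   resp = requests.post("http://127.0.0.1:5000/predict_v1",
#                        data=json.dumps(payload).encode("utf-8"),
#                        headers={"Content-Type": "application/json"})
#   print(resp.json())  # one list of triples per input document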
def predict_pipline(data):
    # `data` is currently unused: the pipeline runs on the hard-coded samples below.
    # `response` collects the stage-1 relations per text (it is not returned).
    response = []
    predict_test_data = [
        # "《中国风水十讲》是2007年华夏出版社出版的图书,作者是杨文衡",
        # "你是最爱词:许常德李素珍/曲:刘天健你的故事写到你离去后为止",
        # "《苏州商会档案丛编第二辑》是2012年华中师范大学出版社出版的图书,作者是马敏、祖苏、肖芃",
        "广州珂妃化妆品有限公司创立于2005年,是一家集研发、生产、销售、服务为一体的多元化化妆品集团公司",
        "吕雅堂(1907-)安徽寿县人,1926年10月南京中央军校第六期,后任国民革命军徐州剿总骑兵第一旅副旅长",
    ]
    # num_actual_predict_examples = len(predict_test_data)
    # Stage 1: multi-label predicate classification.
    features = string_tokenizer(predict_test_data, label_list, max_seq_length, tokenizer)
    result = sess.run(fetches, feed_dict={
        'input_ids:0': features['input_ids'],
        'input_mask:0': features['input_mask'],
        'segment_ids:0': features['segment_ids']
    })
    predicate_ids = []
    for prediction in result:
        prediction = prediction.tolist()
        print("\n\n prediction:\n{}".format(prediction))
        predicate_id = []
        # Initialize once per prediction; resetting this inside the inner loop
        # would discard all but the last matching label.
        predicate_predict = []
        for idx, class_probability in enumerate(prediction):
            if class_probability > 0.5:
                predicate_predict.append(label_list[idx])
                predicate_id.append(idx)
        predicate_ids.append(predicate_id)
        response.append({"relations": predicate_predict})
    # Stage 2: token-level sequence labeling for every (text, predicate) pair.
    features_seq = sequence_tokenizer(predict_test_data, predicate_ids,
                                      label_list, max_seq_length, tokenizer)
    seq_result = sess_seq.run(fetches_seq, feed_dict={
        'input_ids:0': features_seq['input_ids'],
        'input_mask:0': features_seq['input_mask'],
        'segment_ids:0': features_seq['segment_ids']
    })
    seq_label_result = seq_result[2]
    for token_label_prediction in seq_label_result:
        token_label_output_line = " ".join(token_label_id2label[label_id]
                                           for label_id in token_label_prediction)
        print("raw out:\n{}".format(token_label_prediction))
        print("label out:\n{}".format(token_label_output_line))
        print("\n")
    res_index = 0
    triples = []
    for i, src in enumerate(predict_test_data):
        for predicate_id in predicate_ids[i]:
            pred = label_list[predicate_id]
            # sub, obj = getS
            # obj =
            trps = getTriples_v1(seq_label_result[res_index], src, pred)
            triples.extend(trps)
            res_index += 1
    # print("seq result: {}".format(seq_label_result))
    print("triples: {}".format(triples))
    return json.dumps(triples, ensure_ascii=False)
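# A minimal sketch of the BIO decoding that getTriples_v1 is assumed to
# perform: collect subject and object spans from the token labels and pair
# them under the given predicate. The 'B-SUB'/'I-SUB'/'B-OBJ'/'I-OBJ' tag
# names and the character-level alignment (one token per character after the
# leading [CLS], which usually holds for Chinese text) are assumptions about
# the label scheme, not this repo's actual decoder.
def get_triples_sketch(token_label_ids, src, predicate):
    labels = [token_label_id2label[i] for i in token_label_ids]

    def spans(prefix):
        found, start = [], None
        for pos, lab in enumerate(labels + ['O']):  # sentinel closes a trailing span
            if start is not None and lab != 'I-' + prefix:
                found.append(src[start - 1:pos - 1])  # -1 skips the leading [CLS]
                start = None
            if lab == 'B-' + prefix:
                start = pos
        return found

    # Pair every subject span with every object span under this predicate.
    return [(s, predicate, o) for s in spans('SUB') for o in spans('OBJ')]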