def __init__(self):
    """Initialize the globals this component needs.

    Loads the account-label mapping from the YAML file configured in
    SystemInfo and builds a reverse index from each identifying value
    back to its account label.
    """
    system_info = SystemInfo()
    entity = EntityCode()
    self.account_label = entity.get_account_label()
    self.entity_code = entity.get_entity_code()
    # Use a context manager so the file handle is closed deterministically;
    # the original passed a bare open() into yaml.load and leaked the handle.
    with open(system_info.get_account_label_path(), encoding='utf-8') as fh:
        self.account_identify = yaml.load(fh, Loader=yaml.SafeLoader)
    # Invert {label: [identifier, ...]} into {identifier: label}.
    self.identity_account = {
        value: k
        for k, v in self.account_identify.items()
        for value in v
    }
def __init__(self, test_mode=False):
    """Set up NER dependencies: config, model-serving client, label tables.

    :param test_mode: when True, SystemInfo and ModelServing load their
        test configuration instead of the production one.
    """
    self.system_info = SystemInfo(is_test=test_mode)
    # gRPC client for the BERT NER serving endpoint.
    self.client = ModelServing(self.system_info.MODE_NER, is_test=test_mode)
    self.config = self.system_info.get_config()
    self.entity_code = EntityCode()
    self.ner_entities = self.entity_code.get_ner_entities()
    self.code = self.entity_code.get_entity_code()
    # Filled by __init_specific_label_combine: concatenated label pairs and
    # their '#'-separated split forms (e.g. "NAMECOMPANY" -> "NAME#COMPANY").
    self.labels_list = []
    self.labels_list_split = []
    self.__init_specific_label_combine()
    # Register the domain labels with jieba so they segment as single words.
    self.__init_jieba()
def __init__(self):
    """Set up NER dependencies and the BIO-tag -> project-label mapping."""
    self.system_info = SystemInfo()
    # gRPC client for the BERT NER serving endpoint.
    self.client = ModelServing(self.system_info.MODE_NER)
    self.config = self.system_info.get_config()
    self.entity_code = EntityCode()
    self.ner_entities = self.entity_code.get_ner_entities()
    self.code = self.entity_code.get_entity_code()
    # Maps the NER model's tag names (ORG/PER/DATE/LOC) to project labels.
    self.entity_map_dic = {
        "ORG": "CPNY_NAME",
        "PER": "NAME",
        "DATE": "DATE",
        "LOC": "ADDR_VALUE"
    }
    # Filled by __init_specific_label_combine: concatenated label pairs and
    # their '#'-separated split forms (e.g. "NAMECOMPANY" -> "NAME#COMPANY").
    self.labels_list = []
    self.labels_list_split = []
    self.__init_specific_label_combine()
    # Register the domain labels with jieba so they segment as single words.
    self.__init_jieba()
def __init__(self):
    """Read the dependency-parser connection settings from the global config."""
    cfg = SystemInfo().get_config()
    self.config = cfg
    # Endpoint of the dependency-parse service.
    self.server_ip = cfg['dependence_parser_ip']
    self.server_port = cfg['dependence_parser_port']
    # Annotator set requested from the server (dependency parsing).
    self.annotators = cfg['server_type']['depparse']
def __init__(self, mode, is_test=False):
    """Build the serving client: load config, label tables, tokenizer, gRPC stub.

    :param mode: which serving endpoint to connect to; only
        SystemInfo.MODE_NER is handled here.
    :param is_test: when True, load config/labels/vocab from relative
        development paths instead of SystemInfo-resolved paths.
    """
    self.system_info = SystemInfo()
    if is_test:
        # In test mode, load the configuration from the local relative path.
        self.config = load_config('../../config/')
        self.time_out = self.config["grpc_request_timeout"]
        self.batch_size = self.config["pred_batch_size"]
        self.hidden_size = self.config["hidden_size"]
        # NOTE(review): this branch never sets self.max_seq_length, unlike
        # the production branch below — confirm nothing reads it in test mode.
        with open('../../output/labels/label_list.pkl', 'rb') as rf:
            self.label_list = pickle.load(rf)
        with open('../../output/labels/label2id.pkl', 'rb') as rf:
            self.label2id = pickle.load(rf)
        # Reverse mapping: id -> label.
        self.id2label = {
            value: key
            for key, value in self.label2id.items()
        }
        self.label_map = {}
        for (i, label) in enumerate(self.label_list, 1):
            self.label_map[label] = i
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file='../../chinese_L-12_H-768_A-12/vocab.txt',
            do_lower_case=self.config["do_lower_case"])
        channel = grpc.insecure_channel(self.config["model_ner_address"])
        self.stub = prediction_service_pb2_grpc.PredictionServiceStub(
            channel)
    else:
        self.config = self.system_info.get_config()
        self.time_out = self.config["grpc_request_timeout"]
        self.batch_size = self.config["pred_batch_size"]
        self.hidden_size = self.config["hidden_size"]
        self.max_seq_length = self.config["max_seq_length"]
        label_path = self.system_info.get_labels_path()
        with open(os.path.join(label_path, 'label_list.pkl'), 'rb') as rf:
            self.label_list = pickle.load(rf)
        with open(os.path.join(label_path, 'label2id.pkl'), 'rb') as rf:
            self.label2id = pickle.load(rf)
        # Reverse mapping: id -> label.
        self.id2label = {
            value: key
            for key, value in self.label2id.items()
        }
        self.label_map = {}
        # The start value 1 makes label indexing begin at 1.
        for (i, label) in enumerate(self.label_list, 1):
            self.label_map[label] = i
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=os.getcwd() + self.config["vocab_file"],
            do_lower_case=self.config["do_lower_case"])
        if mode == self.system_info.MODE_NER:
            channel = grpc.insecure_channel(
                self.config["model_ner_address"])
            self.stub = prediction_service_pb2_grpc.PredictionServiceStub(
                channel)
        else:
            logger.error('Please config ip address and port first.')
class ModelServing(object):
    """gRPC client for the TensorFlow Serving models deployed in Docker.

    Two services are exposed by the deployment:
    NER: recognise entities in a sentence.
    SEN: represent a sentence as a vector.
    """

    def __init__(self, mode, is_test=False):
        """Load config, label tables and tokenizer, then open the gRPC stub.

        :param mode: which serving endpoint to connect to; only
            SystemInfo.MODE_NER is handled here.
        :param is_test: when True, load config/labels/vocab from relative
            development paths instead of SystemInfo-resolved paths.
        """
        self.system_info = SystemInfo()
        if is_test:
            # In test mode, load the configuration from the local relative path.
            self.config = load_config('../../config/')
            self.time_out = self.config["grpc_request_timeout"]
            self.batch_size = self.config["pred_batch_size"]
            self.hidden_size = self.config["hidden_size"]
            # NOTE(review): this branch never sets self.max_seq_length, unlike
            # the production branch below; callers here read it from config.
            with open('../../output/labels/label_list.pkl', 'rb') as rf:
                self.label_list = pickle.load(rf)
            with open('../../output/labels/label2id.pkl', 'rb') as rf:
                self.label2id = pickle.load(rf)
            # Reverse mapping: id -> label.
            self.id2label = {
                value: key
                for key, value in self.label2id.items()
            }
            self.label_map = {}
            for (i, label) in enumerate(self.label_list, 1):
                self.label_map[label] = i
            self.tokenizer = tokenization.FullTokenizer(
                vocab_file='../../chinese_L-12_H-768_A-12/vocab.txt',
                do_lower_case=self.config["do_lower_case"])
            channel = grpc.insecure_channel(self.config["model_ner_address"])
            self.stub = prediction_service_pb2_grpc.PredictionServiceStub(
                channel)
        else:
            self.config = self.system_info.get_config()
            self.time_out = self.config["grpc_request_timeout"]
            self.batch_size = self.config["pred_batch_size"]
            self.hidden_size = self.config["hidden_size"]
            self.max_seq_length = self.config["max_seq_length"]
            label_path = self.system_info.get_labels_path()
            with open(os.path.join(label_path, 'label_list.pkl'), 'rb') as rf:
                self.label_list = pickle.load(rf)
            with open(os.path.join(label_path, 'label2id.pkl'), 'rb') as rf:
                self.label2id = pickle.load(rf)
            # Reverse mapping: id -> label.
            self.id2label = {
                value: key
                for key, value in self.label2id.items()
            }
            self.label_map = {}
            # The start value 1 makes label indexing begin at 1.
            for (i, label) in enumerate(self.label_list, 1):
                self.label_map[label] = i
            self.tokenizer = tokenization.FullTokenizer(
                vocab_file=os.getcwd() + self.config["vocab_file"],
                do_lower_case=self.config["do_lower_case"])
            if mode == self.system_info.MODE_NER:
                channel = grpc.insecure_channel(
                    self.config["model_ner_address"])
                self.stub = prediction_service_pb2_grpc.PredictionServiceStub(
                    channel)
            else:
                logger.error('Please config ip address and port first.')

    def convert_single_example(self, ex_index, example, max_seq_length,
                               tokenizer, mode):
        """Analyse one sample: convert characters and labels to ids and
        package them into an InputFeatures object.

        :param ex_index: index of the example (unused here)
        :param example: one sample — the token sequence
        :param max_seq_length: fixed sequence length after padding
        :param tokenizer: tokenizer (unused here; ids come from the caller's)
        :param mode: processing mode tag (unused here)
        :return: an InputFeatures with padded id/mask/segment/label lists
        """
        tokens = example
        # Truncate the sequence; -2 leaves room for the sentence start and
        # end markers that are added below.
        if len(tokens) >= max_seq_length - 1:
            tokens = tokens[0:(max_seq_length - 2)]
        ntokens = []
        segment_ids = []
        label_ids = []
        ntokens.append("[CLS]")  # [CLS] marks the start of the sentence
        segment_ids.append(0)
        label_ids.append(
            self.label_map["[CLS]"]
        )  # O or [CLS] makes little difference here; O would reduce the label
        # count, but head and tail use distinct markers, which is also fine.
        for i, token in enumerate(tokens):
            ntokens.append(token)
            segment_ids.append(0)
            label_ids.append(0)
        ntokens.append("[SEP]")  # append the [SEP] end-of-sentence marker
        segment_ids.append(0)
        label_ids.append(self.label_map["[SEP]"])
        input_ids = tokenizer.convert_tokens_to_ids(
            ntokens)  # convert the tokens (ntokens) into vocabulary ids
        input_mask = [1] * len(input_ids)
        # Pad every list up to max_seq_length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            label_ids.append(0)
            ntokens.append("**NULL**")
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length
        # Package into a structured object.
        feature = InputFeatures(
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            label_ids=label_ids,
        )
        return feature

    def convert(self, line, seq_length=128):
        """Convert one token sequence into batched model-input lists.

        :param line: token sequence for a single sentence
        :param seq_length: fixed sequence length
        :return: (input_ids, input_mask, segment_ids, label_ids), each a
            nested list of shape (batch_size, seq_length)
        """
        feature = self.convert_single_example(0, line, seq_length,
                                              self.tokenizer, 'p')
        # NOTE(review): reshaping a single example to (batch_size, seq_length)
        # only works when batch_size == 1 — confirm pred_batch_size in config.
        input_ids = np.reshape([feature.input_ids],
                               (self.batch_size, seq_length)).tolist()
        input_mask = np.reshape([feature.input_mask],
                                (self.batch_size, seq_length)).tolist()
        segment_ids = np.reshape([feature.segment_ids],
                                 (self.batch_size, seq_length)).tolist()
        label_ids = np.reshape([feature.label_ids],
                               (self.batch_size, seq_length)).tolist()
        return input_ids, input_mask, segment_ids, label_ids

    def send_grpc_request_ner(self, raw_sen):
        """Send a gRPC request to the server and get the NER result.

        :param raw_sen: the sentence to recognise
        :return: (tokenised sentence, predicted label list), or (None, None)
            when the remote call raised an exception
        """
        sentence = self.tokenizer.tokenize(raw_sen)
        input_ids, input_mask, segment_ids, label_ids = self.convert(
            sentence, self.config["max_seq_length"])
        # create the request object and set the name and signature_name params
        request = predict_pb2.PredictRequest()
        request.model_spec.name = MODEL_NAME_NER
        request.model_spec.signature_name = MODEL_SIGNATURE_NER
        # fill in the request object with the necessary data
        request.inputs['input_ids'].CopyFrom(
            tf.contrib.util.make_tensor_proto(input_ids))
        request.inputs['input_mask'].CopyFrom(
            tf.contrib.util.make_tensor_proto(input_mask))
        request.inputs['segment_ids'].CopyFrom(
            tf.contrib.util.make_tensor_proto(segment_ids))
        request.inputs['label_ids'].CopyFrom(
            tf.contrib.util.make_tensor_proto(label_ids))
        result_future = self.stub.Predict.future(request, self.time_out)
        exception = result_future.exception()
        if exception:
            logger.error('process sentence: {0}, raise exception: {1}'.format(
                raw_sen, exception))
            return None, None
        else:
            pred_ids_result = np.array(
                result_future.result().outputs['pred_ids'].int_val)
            pred_label_result = convert_id_to_label(pred_ids_result,
                                                    self.id2label)
            return sentence, pred_label_result

    def test_send_grpc_request_ner(self, raw_sen):
        """Check that the NER service works; prints the result.

        :param raw_sen: the sentence to recognise
        :return: True on success, False when the remote call failed
        """
        sentence = self.tokenizer.tokenize(raw_sen)
        input_ids, input_mask, segment_ids, label_ids = self.convert(
            sentence, self.config["max_seq_length"])
        # create the request object and set the name and signature_name params
        request = predict_pb2.PredictRequest()
        request.model_spec.name = MODEL_NAME_NER
        request.model_spec.signature_name = MODEL_SIGNATURE_NER
        # fill in the request object with the necessary data
        request.inputs['input_ids'].CopyFrom(
            tf.contrib.util.make_tensor_proto(input_ids))
        request.inputs['input_mask'].CopyFrom(
            tf.contrib.util.make_tensor_proto(input_mask))
        request.inputs['segment_ids'].CopyFrom(
            tf.contrib.util.make_tensor_proto(segment_ids))
        request.inputs['label_ids'].CopyFrom(
            tf.contrib.util.make_tensor_proto(label_ids))
        result_future = self.stub.Predict.future(request, self.time_out)
        exception = result_future.exception()
        if exception:
            print('process sentence: {0}, raise exception: {1}'.format(
                raw_sen, exception))
            return False
        else:
            pred_ids_result = np.array(
                result_future.result().outputs['pred_ids'].int_val)
            pred_label_result = convert_id_to_label(pred_ids_result,
                                                    self.id2label)
            print(sentence)
            print(pred_label_result)
            return True
class SemanticSearch(object):
    """Named-entity recognition over user queries.

    sentence_ner_entities recognises person names, organisation names,
    locations and dates via the BERT NER service and merges the result
    into the intent-recognition output.
    """

    def __init__(self):
        self.system_info = SystemInfo()
        # gRPC client for the BERT NER serving endpoint.
        self.client = ModelServing(self.system_info.MODE_NER)
        self.config = self.system_info.get_config()
        self.entity_code = EntityCode()
        self.ner_entities = self.entity_code.get_ner_entities()
        self.code = self.entity_code.get_entity_code()
        # Maps the NER model's tag names (ORG/PER/DATE/LOC) to project labels.
        self.entity_map_dic = {
            "ORG": "CPNY_NAME",
            "PER": "NAME",
            "DATE": "DATE",
            "LOC": "ADDR_VALUE"
        }
        self.labels_list = []
        self.labels_list_split = []
        self.__init_specific_label_combine()
        self.__init_jieba()

    def __init_specific_label_combine(self):
        """Initialise labels_list and labels_list_split.

        Used to split concatenated labels such as "NAMECOMPANY" into
        "NAME#COMPANY". Each ordered pair of distinct labels is added
        exactly once (the original double loop appended every pair twice).
        """
        entities = self.entity_code.get_entities()
        for i in range(len(entities)):
            for j in range(i + 1, len(entities)):
                self.labels_list.append(entities[i] + entities[j])
                self.labels_list_split.append(entities[i] + "#" + entities[j])
                self.labels_list.append(entities[j] + entities[i])
                self.labels_list_split.append(entities[j] + "#" + entities[i])

    def __init_jieba(self):
        """Register domain labels with jieba; adding company names notably
        improves segmentation accuracy for them."""
        entities = self.entity_code.get_entities()
        for label in entities:
            jieba.add_word(label)

    @staticmethod
    def __combine_label(entities, label=None):
        """Merge consecutive entities that carry the same target label.

        :param entities: list of [word, label] pairs
        :param label: the label whose consecutive runs are fused
        :return: a new list with adjacent `label` entries merged into one
        """
        pre_label = False
        first_label = None
        entities_copy = []
        for i in range(len(entities)):
            if entities[i][1] != label:
                pre_label = False
                if first_label is not None:
                    entities_copy.append(first_label)
                    first_label = None
                entities_copy.append(entities[i])
            elif pre_label is False and entities[i][1] == label:
                pre_label = True
                first_label = entities[i]
            elif pre_label and first_label is not None and entities[i][
                    1] == label:
                temp = first_label
                first_label = [temp[0] + entities[i][0], temp[1]]
        if first_label is not None:
            entities_copy.append(first_label)
        return entities_copy

    def __combine_com_add(self, entities):
        """Merge adjacent COMPANY+ADDR / ADDR+COMPANY pairs into one COMPANY.

        Mutates `entities` in place.
        """
        company_index = -1
        addr_index = -1
        for i, entity in enumerate(entities):
            if self.ner_entities['COMPANY'] == entity[1]:
                company_index = i
            if self.ner_entities['ADDR'] == entity[1]:
                addr_index = i
        if company_index != -1 and addr_index != -1:
            if company_index == addr_index + 1:
                entities[company_index][
                    0] = entities[addr_index][0] + entities[company_index][0]
                entities.remove(entities[addr_index])
            elif company_index == addr_index - 1:
                entities[company_index][
                    0] = entities[company_index][0] + entities[addr_index][0]
                entities.remove(entities[addr_index])

    def __split_diff_labels(self, template_sen):
        """Insert '#' between concatenated labels (e.g. "ADDRNAME" ->
        "ADDR#NAME") so the segmenter can split them correctly.

        :param template_sen: template sentence
        :return: template sentence with concatenated labels separated
        """
        for i, label in enumerate(self.labels_list):
            if label in template_sen:
                template_sen = template_sen.replace(label,
                                                    self.labels_list_split[i])
        return template_sen

    def __get_entities(self, sentence, pred_label_result):
        """Extract entities from a BIO-tagged prediction.

        :param sentence: tokenised sentence
        :param pred_label_result: predicted BIO tag for each token
        :return: list of [word, label] pairs; single-character words are
            dropped and WordPiece '##' continuation markers are stripped
        """
        word = ""
        label = ""
        entities = []
        for i in range(len(sentence)):
            temp_label = pred_label_result[i]
            if temp_label[0] == 'B':
                # Flush any entity in progress before starting a new one.
                if word != "":
                    if "##" in word:
                        word = word.replace('##', '')
                    if len(word) > 1:
                        entities.append([word, label])
                    word = ""
                label = self.entity_map_dic[temp_label[2:]]
                word += sentence[i]
            elif temp_label[0] == 'I' and word != "":
                word += sentence[i]
            elif temp_label == 'O' and word != "":
                if "##" in word:
                    word = word.replace('##', '')
                if len(word) > 1:
                    entities.append([word, label])
                word = ""
                label = ""
        # Flush the trailing entity, if any.
        if word != "":
            if "##" in word:
                word = word.replace('##', '')
            if len(word) > 1:
                entities.append([word, label])
        return entities

    def get_ner_result(self, query):
        """Send a gRPC request to the Docker service and run NER on `query`.

        :param query: the user question
        :return: (entity, entities) — the structured entity dicts and the raw
            [word, label] pairs. Both are empty lists when recognition fails
            (the original returned a bare None, which crashed callers that
            unpack two values).
        """
        sentence, pred_label_result = self.client.send_grpc_request_ner(query)
        if pred_label_result is None:
            logger.error("句子: {0}\t实体识别结果为空".format(query))
            return [], []
        entities = self.__get_entities(sentence, pred_label_result)
        entity = []
        for word, label in entities:
            begin = query.find(word)
            # Skip words not found verbatim in the query and pure digits.
            if begin != -1 and not word.isdigit():
                entity.append({
                    "type": label,
                    "value": word,
                    "code": self.code[label],
                    "begin": begin,
                    # NOTE(review): end is begin + len(word) + 1, one past the
                    # usual exclusive end; kept for caller compatibility.
                    "end": begin + len(word) + 1
                })
        return entity, entities

    def sentence_ner_entities(self, result_intent):
        """Run BERT NER over the query and merge results into the intent dict.

        :param result_intent: output of the intent-recognition module
        :return: (result_intent, unlabel_result); unlabel_result describes
            accounts whose type is unclear, or None when there are none
        """
        sentence = result_intent["query"]
        entity, entities = self.get_ner_result(sentence)
        result_intent["entity"] = entity
        # A word recognised as a named entity should not also be a relation.
        # Rebuild the list instead of popping while enumerating (the original
        # mutated the list during iteration, skipping elements).
        result_intent["relation"] = [
            rel for rel in result_intent["relation"]
            if all(word.find(rel["value"]) == -1 for word, _ in entities)
        ]
        # An entity already recognised as an account is more likely an
        # account; drop it from the entity list (again without mutating the
        # list during iteration).
        result_intent["entity"] = [
            ent for ent in result_intent["entity"]
            if all(account["value"].find(ent["value"]) == -1
                   for account in result_intent["accounts"])
        ]
        # Collect all UNLABEL tags produced by the account-labelling module.
        unlabels = [
            value["value"] for value in result_intent["accounts"]
            if value["type"] == "UNLABEL"
        ]
        if len(unlabels) == 0:
            unlabel_result = None
        else:
            unlabel_result = {
                "sentence": sentence,
                "unlabels": unlabels,
                "error": "账户类型不明确"
            }
        return result_intent, unlabel_result
class SemanticSearch(object):
    """Named-entity recognition over templated queries.

    sentence_ner_entities recognises person names, organisation names,
    locations and dates via the BERT NER service and rewrites the template
    sentence with the recognised labels.
    """

    def __init__(self, test_mode=False):
        """Set up NER dependencies: config, serving client, label tables.

        :param test_mode: when True, SystemInfo and ModelServing load their
            test configuration instead of the production one.
        """
        self.system_info = SystemInfo(is_test=test_mode)
        # gRPC client for the BERT NER serving endpoint.
        self.client = ModelServing(self.system_info.MODE_NER,
                                   is_test=test_mode)
        self.config = self.system_info.get_config()
        self.entity_code = EntityCode()
        self.ner_entities = self.entity_code.get_ner_entities()
        self.code = self.entity_code.get_entity_code()
        # Filled by __init_specific_label_combine: concatenated label pairs
        # and their '#'-separated split forms.
        self.labels_list = []
        self.labels_list_split = []
        self.__init_specific_label_combine()
        self.__init_jieba()

    def __init_specific_label_combine(self):
        """Initialise labels_list and labels_list_split.

        Used to split concatenated labels such as "NAMECOMPANY" into
        "NAME#COMPANY".
        """
        entities = self.entity_code.get_entities()
        # NOTE(review): the double loop visits every ordered pair twice, so
        # each concatenation is appended twice; harmless but redundant.
        for i in range(0, len(entities)):
            for j in range(0, len(entities)):
                if i != j:
                    self.labels_list.append(entities[i] + entities[j])
                    self.labels_list_split.append(
                        (entities[i] + "#" + entities[j]))
                    self.labels_list.append(entities[j] + entities[i])
                    self.labels_list_split.append(
                        (entities[j] + "#" + entities[i]))

    def __init_jieba(self):
        """Register domain labels with jieba; adding company names notably
        improves segmentation accuracy for them."""
        entities = self.entity_code.get_entities()
        for label in entities:
            jieba.add_word(label)

    def __combine_label(self, entities, label=None):
        """Merge consecutive entities that carry the same target label.

        :param entities: list of [word, label] pairs
        :param label: the label whose consecutive runs are fused
        :return: a new list with adjacent `label` entries merged into one
        """
        pre_label = False
        first_label = None
        entities_copy = []
        for i in range(len(entities)):
            if entities[i][1] != label:
                pre_label = False
                if first_label is not None:
                    entities_copy.append(first_label)
                    first_label = None
                entities_copy.append(entities[i])
            elif pre_label is False and entities[i][1] == label:
                pre_label = True
                first_label = entities[i]
            elif pre_label and first_label is not None and entities[i][
                    1] == label:
                temp = first_label
                first_label = [temp[0] + entities[i][0], temp[1]]
        if first_label is not None:
            entities_copy.append(first_label)
        return entities_copy

    def __combine_com_add(self, entities):
        """Merge adjacent COMPANY+ADDR / ADDR+COMPANY pairs into one COMPANY.

        Mutates `entities` in place.
        """
        company_index = -1
        addr_index = -1
        for i, entity in enumerate(entities):
            if self.ner_entities['COMPANY'] == entity[1]:
                company_index = i
            if self.ner_entities['ADDR'] == entity[1]:
                addr_index = i
        if company_index != -1 and addr_index != -1:
            if company_index == addr_index + 1:
                entities[company_index][
                    0] = entities[addr_index][0] + entities[company_index][0]
                entities.remove(entities[addr_index])
            elif company_index == addr_index - 1:
                entities[company_index][
                    0] = entities[company_index][0] + entities[addr_index][0]
                entities.remove(entities[addr_index])

    def __split_diff_labels(self, template_sen):
        """Insert '#' between concatenated labels (e.g. "ADDRNAME" ->
        "ADDR#NAME") so the segmenter can split them correctly.

        :param template_sen: template sentence
        :return: template sentence with concatenated labels separated
        """
        for i, label in enumerate(self.labels_list):
            if label in template_sen:
                template_sen = template_sen.replace(label,
                                                    self.labels_list_split[i])
        return template_sen

    def __convert_output_data_format(self, data_param):
        """Convert data_param into the format required by the QA-graph module.

        :param data_param: dict with raw_input, new_input and labels keys
        :return: dict with query, template and entity keys
        """
        output = defaultdict()
        output["query"] = data_param["raw_input"]
        output["template"] = data_param["new_input"]
        entity = []
        for key, values in data_param["labels"].items():
            for v in values:
                begin = data_param["raw_input"].find(v)
                entity.append({
                    "type": key,
                    "value": v,
                    "code": self.code[key],
                    "begin": begin,
                    # NOTE(review): end is begin + len(v) + 1, one past the
                    # usual exclusive end; kept as-is for compatibility.
                    "end": begin + len(v) + 1 if begin != -1 else -1
                })
        output["entity"] = entity
        return output

    def sentence_ner_entities(self, result):
        """Run BERT NER over the sentence and mark entities in the template.

        :param result: result of the account_label module
        :return: the converted output dict on success.
            NOTE(review): on NER failure this returns the tuple
            (entities, result) instead — inconsistent with the normal
            single-dict return; callers must handle both shapes.
        """
        sentence = result["new_input"]
        sentence, pred_label_result = self.client.send_grpc_request_ner(
            sentence)
        word = ""
        label = ""
        entities = []
        if sentence is None or pred_label_result is None:
            return entities, result
        for i in range(len(sentence)):
            temp_label = pred_label_result[i]
            if temp_label[0] == 'B':
                # Flush any entity in progress before starting a new one.
                if word != "":
                    if "##" in word:
                        word = word.replace('##', '')
                    entities.append([word, label])
                    word = ""
                # Map the model tag to the project label name.
                if temp_label[2:] == 'ORG':
                    label = self.ner_entities['COMPANY']
                elif temp_label[2:] == 'PER':
                    label = self.ner_entities['NAME']
                elif temp_label[2:] == 'DATE':
                    label = self.ner_entities['DATE']
                else:
                    label = self.ner_entities['ADDR']
                word += sentence[i]
            elif temp_label[0] == 'I' and word != "":
                word += sentence[i]
            elif temp_label == 'O' and word != "":
                if "##" in word:
                    word = word.replace('##', '')
                entities.append([word, label])
                word = ""
                label = ""
        # Flush the trailing entity, if any.
        if word != "":
            if "##" in word:
                word = word.replace('##', '')
            entities.append([word, label])
        if len(entities) != 0:
            # Fuse adjacent same-label runs, then merge COMPANY/ADDR pairs.
            entities = self.__combine_label(entities,
                                            label=self.ner_entities['ADDR'])
            entities = self.__combine_label(entities,
                                            label=self.ner_entities['COMPANY'])
            entities = self.__combine_label(entities,
                                            label=self.ner_entities['NAME'])
            entities = self.__combine_label(entities,
                                            label=self.ner_entities['DATE'])
            self.__combine_com_add(entities)
            entities = self.__combine_label(entities,
                                            label=self.ner_entities['COMPANY'])
        # Replace each recognised word with its label in the template and
        # record the word under that label.
        for (word, label) in entities:
            result["new_input"] = result["new_input"].replace(word, label)
            result["labels"].setdefault(label, []).append(word)
        result = self.__convert_output_data_format(result)
        return result