def _prepare(self):
    """Load the phone dictionaries and compile the phone-number patterns.

    Populates these attributes on ``self``:

    * ``zip_code_location`` / ``area_code_location`` - mappings returned
      by ``phone_location_loader()``.
    * ``cell_phone_location_trie`` - trie built from the cell-phone
      location mapping (one node per ``num -> loc`` item).
    * ``telecom_operator_trie`` - trie built from the mapping returned by
      ``telecom_operator_loader()``.
    * ``cell_phone_pattern``, ``landline_phone_pattern`` and
      ``landline_area_code_pattern`` - compiled regular expressions.

    Everything is built in local variables first and
    ``cell_phone_location_trie`` is assigned last. Callers use
    ``self.cell_phone_location_trie is None`` to decide whether loading is
    still required, so if any step raises, the object stays "not prepared"
    and the next call retries instead of running half-initialised.
    """
    cell_phone_location, zip_code_location, area_code_location = \
        phone_location_loader()

    location_trie = TrieTree()
    for num, loc in cell_phone_location.items():
        location_trie.add_node(num, loc)

    cell_phone_pattern = re.compile(CELL_PHONE_CHECK_PATTERN)
    landline_phone_pattern = re.compile(LANDLINE_PHONE_CHECK_PATTERN)
    landline_area_code_pattern = re.compile(
        LANDLINE_PHONE_AREA_CODE_PATTERN)

    # Telecom operator dictionary.
    operator_trie = TrieTree()
    for num, operator in telecom_operator_loader().items():
        operator_trie.add_node(num, operator)

    self.zip_code_location = zip_code_location
    self.area_code_location = area_code_location
    self.cell_phone_pattern = cell_phone_pattern
    self.landline_phone_pattern = landline_phone_pattern
    self.landline_area_code_pattern = landline_area_code_pattern
    self.telecom_operator_trie = operator_trie
    # Published last: this attribute doubles as the "already loaded" flag.
    self.cell_phone_location_trie = location_trie
class LexiconNER(object):
    """Entity recognition by forward maximum matching over a Trie lexicon.

    Args:
        entity_dicts(dict): entity lexicon for each entity type, e.g.
            {
                'Person': ['张大山', '岳灵珊', '岳不群'],
                'Organization': ['成都市第一人民医院', '四川省水利局']
            }
        text(str): the text to be searched (argument of ``__call__``).

    Return:
        entity_list: list of character-offset based entities.

    Examples:
        >>> import jionlp as jio
        >>> entity_dicts = {
                'Person': ['张大山', '岳灵珊', '岳不群'],
                'Organization': ['成都市第一人民医院', '四川省水利局']}
        >>> lexicon_ner = jio.ner.LexiconNER(entity_dicts)
        >>> text = '岳灵珊在四川省水利局上班。'
        >>> result = lexicon_ner(text)
        >>> print(result)

        [{'type': 'Person', 'text': '岳灵珊', 'offset': [0, 3]},
         {'type': 'Organization', 'text': '四川省水利局', 'offset': [4, 10]}]

    """

    def __init__(self, entity_dicts):
        """Build the lexicon Trie from ``entity_dicts``."""
        self.trie_tree_obj = TrieTree()
        for entity_type, entities in entity_dicts.items():
            self.trie_tree_obj.build_trie_tree(entities, entity_type)

    def __call__(self, text):
        """Annotate every lexicon entity found in ``text``.

        Args:
            text(str): the text to annotate.

        Return:
            entity_list: list of dicts with keys ``type``, ``text`` and
                ``offset`` (``[start, end]`` character positions in
                ``text``), in order of appearance.

        """
        trie = self.trie_tree_obj
        entities = []
        total = len(text)
        start = 0

        while start < total:
            # Look at no more characters than the trie's depth, and search
            # on the lower-cased window while reporting the original text.
            window = text[start:start + trie.depth]
            matched_len, label = trie.search(window.lower())
            end = start + matched_len

            if label is not None:
                entities.append({
                    'type': label,
                    'text': window[:matched_len],
                    'offset': [start, end],
                })

            # Advance by whatever step the trie search reported.
            start = end

        return entities
def __init__(self, entity_dicts):
    """Build the lexicon Trie from ``entity_dicts``.

    Each ``entity type -> entity list`` item of ``entity_dicts`` is added
    to a fresh ``TrieTree`` stored on ``self.trie_tree_obj``.
    """
    self.trie_tree_obj = TrieTree()
    add_entities = self.trie_tree_obj.build_trie_tree
    for entity_type, entities in entity_dicts.items():
        add_entities(entities, entity_type)
class PhoneLocation(object):
    """Return the location, area code and operator info of a phone number.

    Intended to be used together with ``jio.extract_phone_number``.

    Args:
        text(str): phone-number text; works best when it is an item
            returned by ``jio.extract_phone_number``. Pass only the number
            itself, e.g. "86-17309729105", "13499013052" or
            "021 60128421". A number such as "81203432" has no associated
            location. Text such as "343981217799212723" causes
            misrecognition: extract the phone number from it first, then
            look up its location, area code and operator.

    Returns:
        dict: the number's type, location and (for cell phones) operator.
            ``province`` / ``city`` / ``operator`` are ``None`` when they
            cannot be determined.

    Examples:
        >>> import jionlp as jio
        >>> text = '联系电话:13288568202. (021)32830431'
        >>> num_list = jio.extract_phone_number(text)
        >>> print(num_list)
        >>> res = [jio.phone_location(item['text']) for item in num_list]
        >>> print(res)

        # [{'text': '13288568202', 'offset': (5, 16), 'type': 'cell_phone'},
           {'text': '(021)32830431', 'offset': (18, 31), 'type': 'landline_phone'}]

        # {'number': '(021)32830431', 'province': '上海', 'city': '上海', 'type': 'landline_phone'}
        # {'number': '13288568202', 'province': '广东', 'city': '揭阳', 'type': 'cell_phone', 'operator': '中国联通'}

    """

    def __init__(self):
        # ``None`` means the dictionaries have not been loaded yet; they
        # are loaded on first use by ``_prepare``.
        self.cell_phone_location_trie = None

    def _prepare(self):
        """Load the dictionaries and compile the phone-number patterns.

        Everything is built in local variables and
        ``cell_phone_location_trie`` is assigned last, because that
        attribute doubles as the "already loaded" flag: if any step raises,
        the object stays unprepared and the next call retries.
        """
        cell_phone_location, zip_code_location, area_code_location = \
            phone_location_loader()

        location_trie = TrieTree()
        for num, loc in cell_phone_location.items():
            location_trie.add_node(num, loc)

        cell_phone_pattern = re.compile(CELL_PHONE_CHECK_PATTERN)
        landline_phone_pattern = re.compile(LANDLINE_PHONE_CHECK_PATTERN)
        landline_area_code_pattern = re.compile(
            LANDLINE_PHONE_AREA_CODE_PATTERN)

        # Telecom operator dictionary.
        operator_trie = TrieTree()
        for num, operator in telecom_operator_loader().items():
            operator_trie.add_node(num, operator)

        self.zip_code_location = zip_code_location
        self.area_code_location = area_code_location
        self.cell_phone_pattern = cell_phone_pattern
        self.landline_phone_pattern = landline_phone_pattern
        self.landline_area_code_pattern = landline_area_code_pattern
        self.telecom_operator_trie = operator_trie
        self.cell_phone_location_trie = location_trie

    @staticmethod
    def _split_location(location):
        """Split a ``'province city'`` string into ``(province, city)``.

        Returns ``(None, None)`` when ``location`` is missing/empty or its
        province part is empty.
        """
        if not location:
            return None, None

        province, city = location.split(' ')
        if province == '':
            return None, None
        return province, city

    def _cell_phone_result(self, number, cell_phone_number):
        """Build the result dict for a matched cell-phone number.

        ``number`` is the caller's original text; ``cell_phone_number`` is
        the substring matched by ``cell_phone_pattern``.
        """
        # Location is keyed by the first seven characters of the match,
        # the operator by the first four.
        _, location = self.cell_phone_location_trie.search(
            cell_phone_number[:7])
        # The trie yields ``None`` for a prefix it does not know.
        province, city = self._split_location(location)
        _, operator = self.telecom_operator_trie.search(
            cell_phone_number[:4])

        return {
            'number': number,
            'province': province,
            'city': city,
            'type': 'cell_phone',
            'operator': operator,
        }

    def _landline_result(self, number):
        """Build the result dict for a landline number via its area code."""
        province, city = None, None

        matched = self.landline_area_code_pattern.search(number)
        if matched is not None:
            area_code = matched.group(1)
            province, city = self._split_location(
                self.area_code_location.get(area_code))

        return {
            'number': number,
            'province': province,
            'city': city,
            'type': 'landline_phone',
        }

    def __call__(self, text):
        """Classify a phone-number text and return its location info.

        The cell-phone pattern is tried first, then the landline pattern;
        if neither matches, the type is ``'unknown'``.
        """
        if self.cell_phone_location_trie is None:
            self._prepare()

        matched = self.cell_phone_pattern.search(text)
        if matched is not None:  # matched a cell-phone number
            return self._cell_phone_result(text, matched.group())

        if self.landline_phone_pattern.search(text) is not None:
            # matched a landline number
            return self._landline_result(text)

        return {
            'number': text,
            'province': None,
            'city': None,
            'type': 'unknown',
        }

    def landline_phone_location(self, phone_num):
        """Look up the city of a number already known to be a landline."""
        if self.cell_phone_location_trie is None:
            self._prepare()

        return self._landline_result(phone_num)

    def cell_phone_location(self, phone_num):
        """Look up the location of a number already known to be a cell phone.

        If ``phone_num`` does not match the cell-phone pattern after all,
        a ``'cell_phone'`` result with ``None`` fields is returned instead
        of raising (mirrors ``landline_phone_location``).
        """
        if self.cell_phone_location_trie is None:
            self._prepare()

        matched = self.cell_phone_pattern.search(phone_num)
        if matched is None:
            return {
                'number': phone_num,
                'province': None,
                'city': None,
                'type': 'cell_phone',
                'operator': None,
            }

        return self._cell_phone_result(phone_num, matched.group())