def word_distribution_loader():
    """ Load the frequency distribution, over Chinese text, of words segmented by jieba.
    Returns each word's total count in the corpus, its probability, and the -log10
    of that probability.

    Returns:
        dict(dict): e.g. {'国家': {'total_num': 101930, 'prob': 0.0014539722,
                                    'log_prob': 3.2632870}, ...}
    """
    word_info = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'word_distribution.json'))

    word_info_dict = dict()
    total_num = sum([item[1] for item in word_info])
    for item in word_info:
        word_info_dict.update({
            item[0]: {
                'total_num': item[1],
                'prob': item[1] / total_num,
                'log_prob': -math.log10(item[1] / total_num)
            }
        })

    return word_info_dict

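# A minimal usage sketch (not part of the original module): it shows the shape of
# the returned mapping and how prob / log_prob relate; the example word '国家' is
# taken from the docstring above and is only assumed to be present.
def _demo_word_distribution():
    word_info = word_distribution_loader()
    info = word_info.get('国家')
    if info is not None:
        # log_prob is the negative base-10 logarithm of prob,
        # so 10 ** (-log_prob) recovers prob (up to float rounding).
        assert math.isclose(info['prob'], 10 ** (-info['log_prob']), rel_tol=1e-9)
        print(info['total_num'], info['prob'], info['log_prob'])
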
def char_radical_loader():
    """ Load the Chinese character glyph dictionary char_radical.txt """
    structure_dict = {
        0: '一体结构', 1: '左右结构', 2: '上下结构', 3: '左中右结构',
        4: '上中下结构', 5: '右上包围结构', 6: '左上包围结构',
        7: '左下包围结构', 8: '全包围结构', 9: '半包围结构'
    }
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'char_radical.txt'))

    map_dict = dict()
    for item in content:
        assert len(item.split('\t')) == 5
        char, radical, structure, four_corner, components = item.split('\t')
        map_dict.update(
            {char: [radical, int(structure), four_corner, components]})

    return map_dict, structure_dict

def chinese_char_dictionary_loader():
    """ Load the Xinhua character dictionary; each entry has three parts:
    character, explanation, detailed explanation.

    Since the Xinhua dictionary cannot keep pace with the times and contains a fair
    amount of outdated content, entries were pruned as follows:
    1. All Japanese-coined kanji were removed -> entries whose explanation contains
       "日本和字", e.g. "桛 ā 1.日本和字。";
    2. Characters with unknown meanings were removed -> entries whose explanation
       contains "义未详", e.g. "穝zuō## ⒈义未详。";
    3. Low-frequency characters were removed -> characters whose frequency is below
       one in a hundred million and which do not appear in char_distribution.json,
       e.g. "葨 葨wēi 1.见"葨芝"。".
    """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'chinese_char_dictionary.txt'),
        strip=False)

    char_dict = dict()
    for idx, line in enumerate(content):
        segs = line.split('\t')
        assert len(segs) == 3
        char_dict.update({
            segs[0]: {
                'explanation': segs[1],
                'more_details': segs[2].replace('\n', '') if segs[2] != '\n' else None
            }
        })

    return char_dict

def char_distribution_loader():
    """ Load the distribution of utf-8 encoded characters in Chinese text.
    Returns each character's total count in the corpus, its probability, and the
    -log10 of that probability.

    Returns:
        dict(dict): e.g. {'中': {'total_num': 61980430, 'prob': 0.0054539722,
                                  'log_prob': 2.2632870}, ...}
    """
    char_info = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'char_distribution.json'))

    char_info_dict = dict()
    total_num = sum([item[1] for item in char_info])
    for item in char_info:
        char_info_dict.update({
            item[0]: {
                'total_num': item[1],
                'prob': item[1] / total_num,
                'log_prob': -math.log10(item[1] / total_num)
            }
        })

    return char_info_dict

def world_location_loader():
    ''' Load the world place-name dictionary world_location.txt '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/world_location.txt'))

    result = dict()
    cur_continent = None
    for line in content:
        if '洲:' in line:
            cur_continent = line.replace(':', '')
            result.update({cur_continent: dict()})
            continue

        item_tup = line.split('\t')
        item_length = len(item_tup)
        if item_length == 3:
            result[cur_continent].update(
                {item_tup[0]: {'full_name': item_tup[1],
                               'capital': item_tup[2]}})
        elif item_length == 4:
            result[cur_continent].update(
                {item_tup[0]: {'full_name': item_tup[1],
                               'capital': item_tup[2],
                               'main_city': item_tup[3].split('/')}})
        else:
            pass

    return result

def stopwords_loader():
    """ Load the stopword dictionary stopwords.txt """
    res = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/stopwords.txt'))
    # the file itself usually omits a few whitespace characters
    res.extend(['', ' ', '\t'])
    return res

def chinese_char_dictionary_loader():
    """ Load the Baidu Chinese character dictionary, which largely overlaps with the
    Xinhua dictionary. Each entry includes: character, radical, glyph structure,
    four-corner code, stroke order, traditional form, wubi code, pinyin, explanation.
    The dictionary covers all Chinese characters in the utf-8 range "一~龥", with
    some deletions.

    Since the Baidu dictionary cannot keep pace with the times and contains a fair
    amount of outdated content, entries were pruned as follows:
    1. All Japanese-coined kanji were removed -> entries whose explanation contains
       "日本汉字/日本地名用字", e.g. "桛 ā 1.日本和字。";
    2. Characters with unknown meanings were removed -> entries whose explanation
       contains "义未详", or one of whose readings has an unknown meaning,
       e.g. "穝zuō## ⒈义未详。";
    3. Low-frequency characters were removed -> characters whose frequency is below
       one in a hundred million and which do not appear in char_distribution.json,
       e.g. "葨wēi 1.见"葨芝"。";
    4. Characters coined or used only in Korean were removed ->
       e.g. "櫷guī槐木的一种(韩国汉字)";
    5. Archaic characters and usages were removed -> entries whose explanation
       contains "古同~/古代~/古通~/古书~/古地名/古人名"; if an entry has several
       explanations and at least one is non-archaic, the character is kept,
       e.g. "鼃 wā 古同蛙"; common archaic characters such as "巙kuí" are kept.
    In total 3402 characters were removed.
    """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'chinese_char_dictionary.txt'),
        strip=False)

    pinyin_ptn = re.compile(r'\[[a-zàáāǎòóōǒèéēěìíīǐùúūǔǜǘǖǚǹńňüḿ]{1,8}\]')
    explanation_ptn = re.compile(r'\d{1,2}\.')

    char_dict = dict()
    for idx, line in enumerate(content):
        segs = line.split('\t')
        assert len(segs) == 8

        # split the explanation field into the meanings of each reading
        pinyin_list = [item[1:-1] for item in pinyin_ptn.findall(segs[-1])]
        explanation_list = [
            item for item in pinyin_ptn.split(
                segs[-1].replace('~', segs[0]).strip())
            if item != '']
        assert len(pinyin_list) == len(explanation_list)

        pinyin_explanation_dict = dict()
        for pinyin, explanations in zip(pinyin_list, explanation_list):
            explanations = [
                ex for ex in explanation_ptn.split(explanations) if ex != '']
            pinyin_explanation_dict.update({pinyin: explanations})

        char_dict.update({
            segs[0]: {
                'radical': segs[1],
                'structure': STRUCTURE_DICT[int(segs[2])],
                'corner_coding': segs[3],
                'stroke_order': segs[4],
                'traditional_version': segs[5],
                'wubi_coding': segs[6],
                'pinyin': pinyin_explanation_dict
            }
        })

    return char_dict

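# Hedged sketch (not part of the original module): it illustrates how the pinyin
# and explanation patterns above split an explanation field. The field string here
# is a hypothetical example, and the module-level `re` import is reused.
def _demo_char_entry_parsing():
    pinyin_ptn = re.compile(r'\[[a-zàáāǎòóōǒèéēěìíīǐùúūǔǜǘǖǚǹńňüḿ]{1,8}\]')
    explanation_ptn = re.compile(r'\d{1,2}\.')

    field = '[zhōng]1.中间。2.里面。[zhòng]1.正对上。'
    pinyins = [p[1:-1] for p in pinyin_ptn.findall(field)]          # ['zhōng', 'zhòng']
    explanations = [e for e in pinyin_ptn.split(field) if e != '']  # one chunk per pinyin
    for pinyin, chunk in zip(pinyins, explanations):
        meanings = [m for m in explanation_ptn.split(chunk) if m != '']
        print(pinyin, meanings)
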
def chinese_idiom_loader():
    ''' Load the idiom dictionary chinese_idiom.txt '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/chinese_idiom.txt'))

    result = dict()
    for line in content:
        item_tup = line.split('\t')
        result.update({item_tup[0]: int(item_tup[1])})

    return result

def traditional_simplified_loader(file_name):
    """ Load a traditional/simplified Chinese conversion dictionary """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', file_name))

    map_dict = dict()
    for item in content:
        key, value = item.split('\t')
        map_dict.update({key: value})

    return map_dict

def idf_loader():
    """ Load the idf file, used as part of the tf-idf algorithm """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'idf.txt'))

    idf_dict = dict()
    for item in content:
        word, idf_value = item.split('\t')
        idf_dict.update({word: float(idf_value)})

    return idf_dict

def pinyin_phrase_loader():
    """ Load the phrase-level pinyin dictionary pinyin_phrase.txt """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'pinyin_phrase.txt'))

    map_dict = dict()
    for item in content:
        key, value = item.split('\t')
        value = value.split('/')
        map_dict.update({key: value})

    return map_dict

def pkuseg_postag_loader():
    ''' Load the part-of-speech tag mapping table for the pkuseg tokenizer '''
    content = read_file_by_line(
        os.path.join(DIR_PATH, 'pkuseg_postag_map.txt'))

    pkuseg_postag_map = dict()
    for line in content:
        segs = line.split('\t')
        pkuseg_postag_map.update({segs[0]: segs[1]})

    return pkuseg_postag_map

def telecom_operator_loader():
    """ Load the dictionary for matching mobile phone numbers to telecom operators """
    telecom_operator = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'telecom_operator.txt'))

    telecom_operator_dict = dict()
    for line in telecom_operator:
        num, operator = line.strip().split(' ')
        telecom_operator_dict.update({num: operator})

    return telecom_operator_dict

def china_location_loader():
    ''' Load the Chinese place-name dictionary china_location.txt '''
    location_jio = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/china_location.txt'),
        strip=False)

    cur_province = None
    cur_city = None
    cur_county = None
    location_dict = dict()

    for item in location_jio:
        if not item.startswith('\t'):  # province
            if len(item.strip().split('\t')) != 3:
                continue
            province, admin_code, alias_name = item.strip().split('\t')
            cur_province = province
            location_dict.update({
                cur_province: {
                    '_full_name': province,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })
        elif item.startswith('\t\t'):  # county / district
            if len(item.strip().split('\t')) != 3:
                continue
            county, admin_code, alias_name = item.strip().split('\t')
            cur_county = county
            location_dict[cur_province][cur_city].update({
                cur_county: {
                    '_full_name': county,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })
        else:  # city
            if len(item.strip().split('\t')) != 3:
                continue
            city, admin_code, alias_name = item.strip().split('\t')
            cur_city = city
            location_dict[cur_province].update({
                cur_city: {
                    '_full_name': city,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })

    return location_dict

def xiehouyu_loader():
    """ Load the xiehouyu (two-part allegorical saying) dictionary, with more than
    17,000 entries. Some entries are near-duplicates, e.g.:
        一个模子出来的   一个样
        一个模子出来的   一模一样
    Such variants are kept as separate entries to make lookup easier.
    """
    xiehouyu = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'xiehouyu.txt'))
    xiehouyu = list(set(xiehouyu))
    xiehouyu = [item.split('\t') for item in xiehouyu]
    return xiehouyu

def pinyin_char_loader():
    """ Load the character-level pinyin dictionary pinyin_char.txt """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'pinyin_char.txt'))

    map_dict = dict()
    for item in content:
        assert len(item.split('\t')) == 2
        key, value = item.split('\t')
        multi_pinyin = value.split('/')
        map_dict.update({key: multi_pinyin})

    return map_dict

def chinese_word_dictionary_loader():
    """ Load the Xinhua word dictionary, containing more than 200,000 entries,
    each consisting of a word and its explanation.
    """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'chinese_word_dictionary.txt'))

    word_dict = dict()
    for idx, line in enumerate(content):
        segs = line.split('\t')
        assert len(segs) == 2
        word_dict.update({segs[0]: segs[1]})

    return word_dict

def sentiment_words_loader():
    """ Load the sentiment word dictionary together with each word's sentiment weight """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'sentiment_words.txt'))

    sentiment_words_dict = dict()
    for item in content:
        assert len(item.split('\t')) == 2
        key, value = item.split('\t')
        sentiment_words_dict.update({key: float(value)})

    return sentiment_words_dict

def chinese_word_dictionary_loader():
    ''' Load the Xinhua word dictionary, containing more than 200,000 entries,
    each consisting of a word and its explanation.
    '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'chinese_word_dictionary.txt'))

    word_list = list()
    for idx, line in enumerate(content):
        segs = line.split('\t')
        assert len(segs) == 2
        cur_item = {'word': segs[0], 'explanation': segs[1]}
        word_list.append(cur_item)

    return word_list

def chinese_char_dictionary_loader():
    ''' Load the Xinhua character dictionary; each entry has three parts:
    character, explanation, detailed explanation.
    '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'chinese_char_dictionary.txt'),
        strip=False)

    char_dict = dict()
    for line in content:
        segs = line.split('\t')
        assert len(segs) == 3
        char_dict.update({
            segs[0]: {
                'explanation': segs[1],
                'more_details': segs[2].replace('\n', '') if segs[2] != '\n' else None
            }
        })

    return char_dict

def chinese_word_dictionary_loader():
    """ Load the Xinhua word dictionary, containing word entries and their explanations.

    Since the Xinhua word dictionary cannot keep pace with the times and contains a
    fair amount of outdated content, entries were pruned as follows:
    1. All words that do not appear in word_distribution.json were removed.
    As a result the dictionary shrank from roughly 260,000 entries to 33,000,
    i.e. a large share of the original entries are obsolete, while many new words
    are still missing from the dictionary.
    """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'chinese_word_dictionary.txt'))

    word_dict = dict()
    for idx, line in enumerate(content):
        segs = line.split('\t')
        assert len(segs) == 2
        word_dict.update({segs[0]: segs[1]})

    return word_dict

def chinese_idiom_loader():
    ''' Load the idiom dictionary chinese_idiom.txt '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/chinese_idiom.txt'))

    result = dict()
    for line in content:
        item_tup = line.split('\t')
        assert len(item_tup) == 5

        example = item_tup[3] if item_tup[3] != '' else None
        cur_item = {'explanation': item_tup[1],
                    'derivation': item_tup[2],
                    'example': example,
                    'freq': int(item_tup[4])}
        result.update({item_tup[0]: cur_item})

    return result

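# Illustrative usage sketch (not part of the original module): each parsed idiom
# entry carries the 'explanation', 'derivation', 'example' and 'freq' keys built above.
def _demo_chinese_idiom():
    idioms = chinese_idiom_loader()
    for idiom, info in list(idioms.items())[:3]:
        print(idiom, info['freq'], info['explanation'])
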
def phone_location_loader():
    """ Load the dictionary for resolving phone numbers to locations and telecom operators """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'phone_location.txt'),
        strip=False)

    def return_all_num(line):
        """ Return all middle-four-digit strings of the mobile numbers on this line """
        front, info = line.strip().split('\t')
        num_string_list = info.split(',')
        result_list = list()
        for num_string in num_string_list:
            if '-' in num_string:
                start_num, end_num = num_string.split('-')
                for i in range(int(start_num), int(end_num) + 1):
                    result_list.append('{:0>4d}'.format(i))
            else:
                result_list.append(num_string)

        result_list = [front + res for res in result_list]
        return result_list

    phone_location_dict = dict()
    cur_location = ''
    zip_code_location_dict = dict()
    area_code_location_dict = dict()

    for line in content:
        if line.startswith('\t'):
            res = return_all_num(line)
            for i in res:
                phone_location_dict.update({i: cur_location})
        else:
            cur_location, area_code, zip_code = line.strip().split('\t')
            zip_code_location_dict.update({zip_code: cur_location})
            area_code_location_dict.update({area_code: cur_location})

    return phone_location_dict, zip_code_location_dict, area_code_location_dict

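# Hedged sketch (not part of the original module) of the range expansion performed
# by return_all_num above: a segment such as '0000-0003' (a hypothetical example)
# expands to zero-padded four-digit strings.
def _demo_expand_num_range(num_string='0000-0003'):
    start_num, end_num = num_string.split('-')
    return ['{:0>4d}'.format(i) for i in range(int(start_num), int(end_num) + 1)]
    # e.g. _demo_expand_num_range() -> ['0000', '0001', '0002', '0003']
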
def pinyin_char_loader():
    """ Load pinyin_char.txt and build a reverse mapping from character to pinyin """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'pinyin_char.txt'))

    map_dict = dict()
    for item in content:
        if len(item.split('\t')) != 2:
            # no characters listed under this pinyin
            continue
        key, value = item.split('\t')
        value = list(value)
        for val in value:
            if val not in map_dict:
                map_dict.update({val: key})
            else:
                # the character is a polyphone; keep the first pinyin encountered
                pass

    return map_dict

def chinese_char_dictionary_loader():
    ''' Load the Xinhua character dictionary, which contains more than two thousand
    polyphonic characters. Each entry has 7 parts: character, its old form,
    stroke count, pinyin, radical, explanation, detailed explanation.
    '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'chinese_char_dictionary.txt'))

    char_list = list()
    for line in content:
        segs = line.split('\t')
        assert len(segs) == 7
        cur_item = {
            'word': segs[0],
            'old_word': segs[1],
            'strokes': segs[2],
            'pinyin': segs[3],
            'radicals': segs[4],
            'explanation': segs[5],
            'more_details': segs[6]
        }
        char_list.append(cur_item)

    return char_list

def china_location_change_loader():
    """ Load the dictionary of Chinese place-name changes china_location_change.txt.
    It records government-approved renamings of county-level and higher place names
    from 2018 onwards. Only the removal side of a change is recorded; newly added
    names that do not replace an old one (e.g. 深圳市光明区) are not recorded,
    since they do not affect the tool.

    Args:
        None

    Returns:
        list(dict): the changed province / city / county names, together with the
            change date and the approving department; '国批' means approved by the
            State Council, '民批' by its Ministry of Civil Affairs, and '省批' by a
            provincial government or civil affairs department.
    """
    location_change_jio = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/china_location_change.txt'))

    location_change_list = list()
    for line in location_change_jio:
        location_change_dict = dict()
        line_seg = line.split('=>')
        orig_line_seg = line_seg[0].split('\t')
        new_line_seg = line_seg[1].split('\t')
        location_change_dict.update({
            'date': orig_line_seg[0],
            'department': orig_line_seg[1],
            'old_loc': [orig_line_seg[2:4], orig_line_seg[4:6], orig_line_seg[6:8]],
            'new_loc': new_line_seg
        })
        location_change_list.append(location_change_dict)

    return location_change_list

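# Minimal usage sketch (not part of the original module): each change record holds
# the change date, approving department, and the old/new location segments parsed above.
def _demo_location_change():
    for record in china_location_change_loader()[:2]:
        print(record['date'], record['department'],
              record['old_loc'], '=>', record['new_loc'])
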
def stopwords_loader():
    ''' Load the stopword dictionary stopwords.txt '''
    return read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/stopwords.txt'))

def pornography_loader():
    """ Load the pornography word dictionary pornography.txt """
    return read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/pornography.txt'))

def china_location_loader(detail=False):
    """ Load the Chinese place-name dictionary china_location.txt

    Args:
        detail(bool): if True, return five levels of information:
            province, city, county/district, township/street, village/community;
            if False, return only the province, city, county/district levels.
    """
    location_jio = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/china_location.txt'),
        strip=False)

    cur_province = None
    cur_city = None
    cur_county = None
    cur_town = None
    cur_village = None
    location_dict = dict()

    for item in location_jio:
        if not item.startswith('\t'):  # province
            if len(item.strip().split('\t')) != 3:
                continue
            province, admin_code, alias_name = item.strip().split('\t')
            cur_province = province
            location_dict.update({
                cur_province: {
                    '_full_name': province,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })
        elif item.startswith('\t\t\t\t'):  # village / community
            if not detail:
                continue
            cur_village = item.strip()
            location_dict[cur_province][cur_city][cur_county][cur_town].update(
                {cur_village: None})
        elif item.startswith('\t\t\t'):  # township / street
            if not detail:
                continue
            cur_town = item.strip()
            location_dict[cur_province][cur_city][cur_county].update(
                {cur_town: dict()})
        elif item.startswith('\t\t'):  # county / district
            if len(item.strip().split('\t')) != 3:
                continue
            county, admin_code, alias_name = item.strip().split('\t')
            cur_county = county
            location_dict[cur_province][cur_city].update({
                cur_county: {
                    '_full_name': county,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })
        else:  # city
            if len(item.strip().split('\t')) != 3:
                continue
            city, admin_code, alias_name = item.strip().split('\t')
            cur_city = city
            location_dict[cur_province].update({
                cur_city: {
                    '_full_name': city,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })

    return location_dict

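# Usage sketch (not part of the original module): the returned dict nests
# province -> city -> county, with metadata fields prefixed by '_' at every level.
def _demo_china_location():
    loc = china_location_loader(detail=False)
    for province, cities in loc.items():
        for city, counties in cities.items():
            if city.startswith('_'):
                continue  # skip '_full_name', '_alias', '_admin_code'
            county_names = [c for c in counties if not c.startswith('_')]
            print(province, city, len(county_names))
        break  # only the first province, to keep the output short
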
def negative_words_loader():
    """ Load the negation word dictionary negative_words.txt """
    res = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/negative_words.txt'))
    return res