def get_expand_words(test: str, num: int):
    """Collect up to *num* expansion candidates for *test* from the
    pronunciation dictionary ``word_dict``.

    Candidates sharing a pinyin-finals suffix with *test* are scored by
    their dictionary counts, preferring entries whose POS tag matches one
    of *test*'s own tags.

    :param test: query word/phrase to expand.
    :param num: maximum number of candidates to return.
    :return: list of at most *num* candidate words (may be empty).
    """
    # One pos_cut call is enough; the original called it twice.
    test_list = fool.pos_cut(test)
    pp = {p for _, p in test_list[0]}  # POS tags present in the query
    # Map each syllable's final to its rhyme class via the ``yun`` table.
    pinyin = [
        yun[tone]
        for tone in lazy_pinyin(
            test, style=Style.FINALS, strict=True, errors='ignore')
    ]
    expand = {}
    front = 0
    # Progressively shorten the (reversed) finals suffix until enough
    # candidates are found or the suffix is exhausted.
    while len(expand) + 1 < num and front < len(pinyin):
        yy = '_'.join(pinyin[front:][::-1])
        front += 1
        if yy in word_dict:
            # First pass: only candidates whose POS matches the query's.
            for p in pp:
                if p in word_dict[yy]:
                    for w, c in word_dict[yy][p].items():
                        expand[w] = expand.get(w, 0) + c
            # Second pass: if still short, accept any POS.
            if len(expand) < num:
                for p in word_dict[yy]:
                    for w, c in word_dict[yy][p].items():
                        expand[w] = expand.get(w, 0) + c
    # Uniform scaling; kept from the original (does not change ranking).
    for w in expand:
        expand[w] *= num
    # Never suggest the query itself.
    expand.pop(test, None)
    if not expand:
        return []
    counts = sorted(expand.values(), reverse=True)
    # BUG FIX: the original indexed counts[num], which raises IndexError
    # whenever fewer than num+1 candidates were collected. Clamp to the
    # smallest count so that all candidates qualify in that case; the
    # [:num] slice still caps the output size.
    mid_c = counts[min(num, len(counts) - 1)]
    out = [w for w, c in expand.items() if c >= mid_c][:num]
    return out
def n_answer(s):
    """Concatenate every noun ('n') token of *s* into a single string."""
    tagged = fool.pos_cut(s)[0]
    return ''.join(word for word, tag in tagged if tag == 'n')
def tcut():
    """Demo: print NER, POS-tagging and plain segmentation of a sample."""
    sample = "我在北京天安门"
    _, entities = fool.analysis(sample)
    print(entities)
    print(fool.pos_cut(sample))
    fool.delete_userdict()
    print(fool.cut(sample))
def processSentence(sentence):
    """Best-effort debug dump of FoolNLTK results for *sentence*.

    Prints the plain segmentation, the POS-tagged segmentation, and the
    word/entity analysis. Failures inside fool are deliberately skipped
    so batch processing can continue.
    """
    try:
        print(fool.cut(sentence))
        print(fool.pos_cut(sentence))
        words, ners = fool.analysis(sentence)
        print(words, ners)
    except Exception:
        # BUG FIX: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit; keep the best-effort intent but
        # only catch ordinary exceptions. (Dead commented-out duplicate
        # print calls removed.)
        pass
def how_much_answer(s):
    """Extract a quantity answer (numeral 'm' + measure word 'q' run) from *s*.

    A numeral token is kept when followed by another numeral or a measure
    word; a measure word is kept when preceded by a numeral. If no such run
    exists, the concatenation of all numeral tokens is returned instead.
    """
    ans = ''
    ans_temp = ''
    tokens = fool.pos_cut(s)[0]
    n = len(tokens)
    for i, (word, tag) in enumerate(tokens):
        if tag == 'm':
            ans_temp += word
            # BUG FIX: the original indexed tokens[i + 1] unguarded inside a
            # bare `except: pass`, so a trailing numeral raised IndexError and
            # silently aborted ALL remaining iterations.
            if i + 1 < n and tokens[i + 1][1] in ('m', 'q'):
                ans += word
        elif tag == 'q':
            # BUG FIX: at i == 0 the original's tokens[i - 1] wrapped around
            # to the LAST token; guard against that negative index.
            if i > 0 and tokens[i - 1][1] == 'm':
                ans += word
    if ans == '':
        return ans_temp
    return ans
def get_businessScope_list_with_fool(self, corp_name_list):
    """Extract a business-scope string from each corporate name in
    *corp_name_list* using FoolNLTK POS tags.

    Heuristic per name: everything after the last 'nz' (other proper noun)
    token, with a leading/bounding 'ns' (place name) span removed and any
    trailing company-type keyword cut off, is taken as the business scope.
    Names that do not fit the pattern are dropped.

    :param corp_name_list: list of corporate-name strings.
    :return: list of extracted scope strings (length <= input length).
    """
    businessScope_list = ["" for i in range(len(corp_name_list))]
    count = 0  # number of scopes actually extracted
    now_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(now_time + " 开始提取经营范围:")
    for corpName_str in tqdm(corp_name_list, ncols = 75):
        # (word, tag) tuples for this single name (pos_cut returns a batch).
        data_tuple_list_single = fool.pos_cut(corpName_str)[0]
        # Position of the LAST 'nz' tag in the token list.
        last_nz_index_single = tuple_list_find_key_index(tuple_list = data_tuple_list_single, axis = 1, key = "nz", appear_type = "last")
        # Proceed only if an 'nz' exists, is not the final token, and the
        # token right after it is longer than one character.
        if last_nz_index_single != -1 and last_nz_index_single + 1 < len(data_tuple_list_single) \
                and len(data_tuple_list_single[last_nz_index_single + 1][0]) > 1:
            # Tokens after the last 'nz'.
            tuple_list_withoutNZ = data_tuple_list_single[last_nz_index_single + 1 :]
            # Position of the FIRST 'ns' (place name) tag after the 'nz'.
            first_ns_index_single = tuple_list_find_key_index(tuple_list = tuple_list_withoutNZ, axis = 1, key = "ns", appear_type = "first")
            if first_ns_index_single == 0:
                # Leading place name: drop it.
                tuple_list_withoutNZ = tuple_list_withoutNZ[1 :]
            elif first_ns_index_single != -1:
                # Interior place name: keep only what precedes it.
                tuple_list_withoutNZ = tuple_list_withoutNZ[0: first_ns_index_single]
            # Join the remaining words back into one string (axis 0 = words).
            businessScope_temp = merge_tuple_list(tuple_list = tuple_list_withoutNZ, axis = 0)
            # Match company-type keywords (matrix keyword match);
            # result[1] presumably holds the matched key — TODO confirm.
            lawtype_key_result_temp = keyword_matrix_in_word(keyword_matrix = self.keyword_read_obj.lawtype_keyword_matrix, \
                string = businessScope_temp)
            # Strip the trailing company-type marker (cut from first match onward).
            businessScope_temp, _ = string_cut_off_from_key(string = businessScope_temp, key = lawtype_key_result_temp[1], \
                appear_type = "first", cut_type = "after", include = True)
            # Discard empty and single-character scopes.
            if businessScope_temp != "" and len(businessScope_temp) > 1:
                businessScope_list[count] = businessScope_temp
                count += 1
    return businessScope_list[: count]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert raw comment lines into one JSON record {"words": [...], "tag": [...]} per line."""
import json

import fool

source_path = "../../crawler/all_comment.txt"
outf_path = "input.json"

# BUG FIX: fool.pos_cut returns a batch — a list containing one list of
# (word, tag) tuples per input sentence. The original iterated the OUTER
# list, so item[0]/item[1] indexed tuples at the wrong nesting level;
# index [0] to get this single line's token list. Files are now managed
# with `with` so they are closed even on error; the shebang ("#/usr") and
# the coding cookie ("conding") were also broken.
with open(source_path) as input_f, open(outf_path, 'w') as outf:
    for line in input_f:
        line = line.strip("\n").strip()
        if not line:
            continue
        tag_words = fool.pos_cut(line)[0]
        tag_list = [item[1] for item in tag_words]
        words_list = [item[0] for item in tag_words]
        outf.write(
            json.dumps({
                "words": words_list,
                "tag": tag_list
            }, ensure_ascii=False) + "\n")
#!/usr/bin/env python
# -*-coding:utf-8-*-
"""Demo: FoolNLTK segmentation with/without a user dictionary, POS tags, NER."""
import fool

sentences = ["我在北京天安门看你难受香菇,一一千四百二十九",
             "我在北京晒太阳你在非洲看雪",
             "千年不变的是什么",
             "我在北京天安门。"]

# Segmentation before, during, and after loading the user dictionary.
print("no dict:", fool.cut(sentences, ignore=True))
fool.load_userdict("./test_dict.txt")
print("use dict: ", fool.cut(sentences))
fool.delete_userdict()
print("delete dict:", fool.cut(sentences))

# POS tagging.
print("pos result", fool.pos_cut(sentences))

# Entity recognition via analysis() and via ner() directly.
_, entity_spans = fool.analysis(sentences)
print("ners: ", entity_spans)
entity_spans = fool.ner(sentences)
print("ners:", entity_spans)
if er == "fool": return fool_cut(text) if er == "hanlp": return hanlp_cut(text) def recognize(text, er="fool"): if er == "fool": return fool_recognize(text) if er == "hanlp": return hanlp_recognize(text) if __name__ == "__main__": result = fool.pos_cut(text) print(result) q.d() print(cut(text)) print(recognize(text)) """ [[ (0, 5, 'company', '新浪科技'), (6, 9, 'location', '北京'), (10, 18, 'time', '4月29日晚间'), (20, 25, 'company', '搜狗公司'), (24, 27, 'time', '今天'), (31, 37, 'time', '3月31日'), (37, 47, 'time', '2019年第一季度'), (60, 65, 'time', '第一季度'), (91, 94, 'location', '美国'),
parser.add_argument("filename", nargs='?', help="input file")
args = parser.parse_args()

delim = args.delimiter      # separator between output tokens
plim = args.pos             # separator between a word and its POS tag
batch_size = args.batch_size  # FIX: was misspelled "batch_zize"

if args.user_dict:
    fool.load_userdict(args.user_dict)

# Read from the given file, or stdin when no filename was supplied.
fp = open(args.filename, 'r') if args.filename else sys.stdin
try:
    # readlines(hint) yields roughly batch_size bytes per batch.
    lines = fp.readlines(batch_size)
    while lines:
        lines = [ln.strip("\r\n") for ln in lines]
        if args.pos:
            result_list = fool.pos_cut(lines)
            for res in result_list:
                out_str = [plim.join(p) for p in res]
                print(delim.join(out_str))
        else:
            result_list = fool.cut(lines)
            for res in result_list:
                print(delim.join(res))
        lines = fp.readlines(batch_size)
finally:
    # BUG FIX: the original unconditionally called fp.close(), which closed
    # sys.stdin when no filename was given; also close on error paths now.
    if fp is not sys.stdin:
        fp.close()
#!/usr/bin/env python
# -*-coding:utf-8-*-
"""FoolNLTK walkthrough: user-dictionary handling, POS tagging and NER."""
import fool

batch = ["我在北京天安门看你难受香菇,一一千四百二十九",
         "我在北京晒太阳你在非洲看雪",
         "千年不变的是什么",
         "我在北京天安门。"]

# Cut once with no dictionary, once with the user dictionary loaded,
# and once more after deleting it.
print("no dict:", fool.cut(batch, ignore=True))
fool.load_userdict("./test_dict.txt")
print("use dict: ", fool.cut(batch))
fool.delete_userdict()
print("delete dict:", fool.cut(batch))

# Part-of-speech tagging of the whole batch.
tagged = fool.pos_cut(batch)
print("pos result", tagged)

# Named entities — via the combined analysis() and the dedicated ner().
_, found_entities = fool.analysis(batch)
print("ners: ", found_entities)
found_entities = fool.ner(batch)
print("ners:", found_entities)
def pos(self, sen):
    """Return FoolNLTK's (word, tag) list for the single sentence *sen*."""
    tagged_batch = fool.pos_cut(sen)
    # pos_cut returns one token list per input sentence; take the first.
    return tagged_batch[0]
"""
Original tutorial:
https://github.com/rockyzhengwu/FoolNLTK/blob/master/README_CH.md
"""
import fool

# Text file holding the local user dictionary; one "word weight" per line.
dict_path = r"C:\Users\lenvov\Desktop\my_diy_dic.txt"
# Load the custom dictionary. A dictionary entry only carries a weight,
# not a POS tag, so it does not help POS tagging.
fool.load_userdict(dict_path)
# fool.delete_userdict()  # would remove the user dictionary again

sentence = "习近平觉得张构架的趣多多比希斯罗机场的巧克力味的奥利奥要贵得多。"

# `tagged` holds POS-tagged segmentation from the built-in dictionary only;
# `entities` holds recognized named entities. Segmentation may be imperfect
# while NER is still correct; the user dictionary can correct the cut but
# does not influence the words list produced by analysis().
tagged, entities = fool.analysis(sentence)

print('文本切分:', fool.cut(sentence), '\n')
print('文本切分后进行词性标注:', fool.pos_cut(sentence), '\n')
print('words:', tagged, '\n')
print('实体识别', entities, '\n')
"""Demo: segmentation, POS tagging and NER on an ambiguity-rich sentence."""
import fool

sample = '这个把手该换了,我不喜欢日本和服,别把手放在我的肩膀上,工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'

# Plain segmentation.
print(fool.cut(sample))

# Segmentation with POS tags.
print(fool.pos_cut(sample))

# Named entities only; the word list from analysis() is discarded.
_, entities = fool.analysis(sample)
print(entities)
def fool_pos_cut_text(self, text):
    """Thin wrapper: return FoolNLTK's POS-tagged segmentation of *text*."""
    return fool.pos_cut(text)