def data_read_from_txt():
    fout = open('nvp.txt', 'w', encoding='utf8')
    for line in open('text.txt', 'r', encoding='utf8'):
        line = line.strip()
        grammer(cut_hanlp(line), fout)
    fout.close()
# -*- coding: utf8 -*-
import jieba
import re
from tokenizer import cut_hanlp

# jieba.load_userdict("dict.txt")

# # Raise the frequency of a single word:
# jieba.suggest_freq('台中', tune=True)

# Raise the frequency of every line in dict.txt:
# fp = open("dict.txt", 'r', encoding='utf8')
# for line in fp:
#     line = line.strip()
#     jieba.suggest_freq(line, tune=True)

# # Same thing as a one-liner:
# [jieba.suggest_freq(line.strip(), tune=True) for line in open("dict.txt", 'r', encoding='utf8')]

if __name__ == "__main__":
    string = "台中正确应该不会被切开。"
    words_jieba = " ".join(jieba.cut(string, HMM=False))
    words_hanlp = cut_hanlp(string)
    print("words_jieba:" + words_jieba, '\n', "words_hanlp:" + words_hanlp)
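The commented-out suggest_freq calls above are what keep a custom word such as 台中 from being split. A minimal before/after sketch of that effect, assuming no user dictionary has been loaded:

import jieba

sentence = "台中正确应该不会被切开。"

# Before tuning: "台中" may be split into two single characters.
print("/".join(jieba.cut(sentence, HMM=False)))

# Raise the internal frequency of "台中" so the whole word wins during path selection.
jieba.suggest_freq('台中', tune=True)

# After tuning: "台中" should stay together as one token.
print("/".join(jieba.cut(sentence, HMM=False)))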
for line in fp.readlines():
    result1 = p1.findall(line)          # list of matches for pattern 1
    print(result1)
    if result1:
        regex_re1 = result1
        line = p1.sub("FLAG1", line)    # replace each match with the placeholder FLAG1
        print(line)
    result2 = p2.findall(line)
    if result2:
        line = p2.sub("FLAG2", line)
        print(line)
    words = jieba.cut(line)             # jieba segmentation returns a generator
    result = " ".join(words)            # join the generator output into a single string
    words1 = cut_hanlp(line)            # HanLP segmentation returns a str
    if "FLAG1" in result:
        result = result.split("FLAG1")
        result = merge_two_list(result, result1)
        ss = result
        result = "".join(result)        # merge_two_list returns a list, so join it back into a str
    if "FLAG2" in result:
        result = result.split("FLAG2")
        result = merge_two_list(result, result2)
        result = "".join(result)
    # print(result)
    fout.write("jieba :" + result)
    fout.write("hanlp:" + str(words1))
fout.close()
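merge_two_list is used above but not defined in this section. A minimal sketch of an interleaving helper with that behavior, inferred from how it is called (split segments alternated with the original regex matches); the exact original implementation may differ:

def merge_two_list(a, b):
    # `a` is the segmented text split on the placeholder, `b` holds the
    # original regex matches; interleave them so each match goes back
    # exactly where its placeholder was.
    merged = []
    len_a, len_b = len(a), len(b)
    for i in range(max(len_a, len_b)):
        if i < len_a:
            merged.append(a[i])
        if i < len_b:
            merged.append(b[i])
    return merged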
def data_read():
    fout = open('nvp.txt', 'w', encoding='utf8')
    for line in open('text.txt', 'r', encoding='utf8'):
        line = line.strip()
        grammer(cut_hanlp(line), fout)  # segment with HanLP first, then merge phrases with grammer()
    fout.close()
regex1 = u'[^\u4e00-\u9fa5()*&……%¥$,,。.@! !]{1,5}期'  # "xxx期": up to 5 non-Chinese, non-special characters followed by 期
regex2 = r'[0-9]{1,3}[.]?[0-9]{1,3}%'                     # percentages such as 13.5%
p1 = re.compile(regex1)
p2 = re.compile(regex2)

for line in fp.readlines():
    result1 = p1.findall(line)
    if result1:
        line = p1.sub('Flag1', line)
    result2 = p2.findall(line)
    if result2:
        line = p2.sub('Flag2', line)
    words = jieba.cut(line)
    result = ' '.join(words)
    words1 = cut_hanlp(line)
    if 'Flag1' in result:
        result = result.split('Flag1')
        result = merge_two_list(result, result1)
        result = ''.join(result)
    if 'Flag2' in result:
        result = result.split('Flag2')
        result = merge_two_list(result, result2)
        result = ''.join(result)
    if 'Flag 1' in words1:              # HanLP may split the placeholder, so it shows up as "Flag 1" in the joined string
        words1 = words1.split('Flag 1')
        words1 = merge_two_list(words1, result1)
        words1 = ''.join(words1)
    if 'Flag 2' in words1:
        words1 = words1.split('Flag 2')
        words1 = merge_two_list(words1, result2)
        words1 = ''.join(words1)
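A quick illustration of the placeholder round trip: spans like "06期" or "13.5%" are first protected with a placeholder, segmented, and then spliced back with merge_two_list. A minimal sketch on a made-up sample line (the sample string and the example comments are assumptions, not output from the original data):

import re
import jieba

line = "项目06期完成率13.5%"           # assumed sample line for illustration only

p1 = re.compile(u'[^\u4e00-\u9fa5()*&……%¥$,,。.@! !]{1,5}期')
p2 = re.compile(r'[0-9]{1,3}[.]?[0-9]{1,3}%')

matches1 = p1.findall(line)            # e.g. ['06期']
line = p1.sub('Flag1', line)           # the matched span is now a single placeholder token
matches2 = p2.findall(line)            # e.g. ['13.5%']
line = p2.sub('Flag2', line)

# jieba now segments around the placeholders; the protected spans can be
# spliced back afterwards with merge_two_list().
print(" ".join(jieba.cut(line)))
print(matches1, matches2)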
# -*- coding: utf8 -*-
import jieba
import re
from tokenizer import cut_hanlp

# jieba.load_userdict("dict.txt")
# jieba.add_word(row[0].strip(), tag=row[1].strip())
# jieba.suggest_freq(segment)

# fp = open("dict.txt", 'r', encoding='utf8')
# for line in fp:
#     line = line.strip()
#     jieba.suggest_freq(line, tune=True)

# [jieba.suggest_freq(line.strip(), tune=True) for line in open("dict.txt", 'r', encoding='utf8')]

if __name__ == "__main__":
    string = "台中正确应该不会被切开。"
    words = cut_hanlp(string)
    # words = jieba.cut(string)
    print([i for i in words])
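cut_hanlp is imported from a local tokenizer module that is not shown in this section. A minimal sketch of what such a wrapper might look like on top of pyhanlp; the module name, function name, and the choice to return a space-joined string are assumptions inferred from how the function is used in the snippets above:

# tokenizer.py -- hypothetical sketch, not the original module
from pyhanlp import HanLP


def cut_hanlp(raw_sentence):
    # HanLP.segment returns a list of Term objects (word + nature);
    # join the words with spaces to mirror how cut_hanlp's return value
    # is consumed above.
    terms = HanLP.segment(raw_sentence)
    return " ".join(term.word for term in terms)

Note that the chunking script later in this section passes cut_hanlp's output straight to an NLTK chunk parser, so that variant would presumably return (word, POS) pairs, e.g. (term.word, str(term.nature)), rather than a joined string.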
import nltk
from tokenizer import cut_hanlp


# Tail of the tree-walking helper getNodes(parent, model_tagged_file): leaves whose
# POS is not in merge_pos are written out as plain "word/O" tokens.
        if node[1] not in merge_pos:
            text += node[0].strip() + '/O' + 3 * ' '
    model_tagged_file.write(text + '\n')


def grammer(sentence, model_tagged_file):
    # `sentence` is a list of (word, POS) tuples, e.g. [('工作', 'vn'), ('描述', 'v'), (':', 'w')]
    grammer1 = r"""NP: {<m|mg|Mg|mq|q|qg|qt|qv|s|>*<a|an|ag>*<s|g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|o|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<f>?<ude1>?<g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|o|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
                        {<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<cc>+<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
                        {<m|mg|Mg|mq|q|qg|qt|qv|s|>*<q|qg|qt|qv>*<f|b>*<vi|v|vn|vg|vd>+<ude1>+<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
                        {<g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<vi>?}
                    VP: {<v|vd|vg|vf|vl|vshi|vyou|vx|vi|vn>+}
                    """  # noun-phrase (NP) and verb-phrase (VP) chunk rules
    cp = nltk.RegexpParser(grammer1)
    try:
        result = cp.parse(sentence)            # parse into a tree whose NP/VP subtrees are the chunks defined in grammer1
    except Exception:
        pass
    else:
        getNodes(result, model_tagged_file)    # walk the tree with getNodes and write the tagged line


if __name__ == '__main__':
    with open('nvp.txt', 'w', encoding='utf-8') as fout:
        with open('text.txt', 'r', encoding='utf-8') as fp:
            for line in fp:
                line = line.strip()
                grammer(cut_hanlp(line), fout)
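Only the tail of getNodes appears in this section. A minimal sketch of what the full traversal might look like, reconstructed from that fragment and the call site; the body and the merge_pos placeholder are assumptions, not the original code:

import nltk

# Assumed placeholder: POS tags whose words are already covered by NP/VP chunks
# and therefore should not be written out again as plain "/O" tokens.
merge_pos = set()


def getNodes(parent, model_tagged_file):
    # Hypothetical reconstruction -- only the last three lines appear in the original.
    text = ''
    for node in parent:
        if isinstance(node, nltk.Tree):
            # A chunk subtree: join its words and tag the span with the chunk label (NP or VP).
            words = ''.join(word for word, pos in node.leaves())
            text += words + '/' + node.label() + 3 * ' '
        else:
            # A plain (word, POS) leaf outside any chunk.
            if node[1] not in merge_pos:
                text += node[0].strip() + '/O' + 3 * ' '
    model_tagged_file.write(text + '\n')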
p1 = re.compile(regex1)
p2 = re.compile(regex2)

for line in fp.readlines():
    result1 = p1.findall(line)              # list of "xxx期" matches
    if result1:
        regex_re1 = result1
        line = p1.sub("FLAG1", line)        # replace each match with the placeholder FLAG1
    result2 = p2.findall(line)
    if result2:
        line = p2.sub("FLAG2", line)
    words = jieba.cut(line)                 # jieba segmentation returns a generator
    result = " ".join(words)                # join the generator output into a single string
    words1 = tokenizer.cut_hanlp(line)      # HanLP segmentation returns a str
    if "FLAG1" in result:
        result = result.split("FLAG1")
        result = merge_two_list(result, result1)
        ss = result
        result = "".join(result)            # merge_two_list returns a list, so join it back into a str
    if "FLAG2" in result:
        result = result.split("FLAG2")
        result = merge_two_list(result, result2)
        result = "".join(result)
    # print(result)
    fout.write("jieba:" + result)
    fout.write("hanlp:" + words1)
fout.close()