# -*- coding: utf-8 -*-
# Segment every non-empty line of text.txt with seg_sentences and write the
# space-joined tokens to out.txt, one output line per input line.
import re

import jieba
from tokenizer import seg_sentences

if __name__ == "__main__":
    with open("text.txt", 'r', encoding='utf8') as fp, \
         open("out.txt", 'w', encoding='utf8') as fout:
        for line in fp:
            line = line.strip()
            if len(line) > 0:
                fout.write(' '.join(seg_sentences(line)) + "\n")
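The tokenizer module imported above is not shown in this snippet. Below is a minimal sketch of what seg_sentences could look like, assuming a plain jieba-based segmenter with an optional symbol filter; the original comments mention HanLP, so treat the body (and the with_filter behaviour used later) as an assumption, not the project's actual implementation.

# tokenizer.py -- minimal stand-in; jieba is assumed to be the underlying segmenter
import re

import jieba

# Assumed filter: tokens made up only of non-word characters (punctuation etc.)
_non_word = re.compile(r'^\W+$', re.UNICODE)

def seg_sentences(sentence, with_filter=False):
    """Segment a sentence into a list of words."""
    words = [w for w in jieba.cut(sentence) if w.strip()]
    if with_filter:
        # Drop pure-punctuation tokens when filtering is requested.
        words = [w for w in words if not _non_word.match(w)]
    return words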
intab = "" outtab = "" deltab = "\n\t " trantab=text.maketrans(intab, outtab,deltab) return text.translate(trantab) def generate_ngram(sentence, n=4, m=2): # 生成n-gram if len(sentence) < n: n = len(sentence) temp=[tuple(sentence[i - k:i]) for k in range(m, n + 1) for i in range(k, len(sentence) + 1) ] # 生成2个或者3个的gram return [item for item in temp if len(''.join(item).strip())>1 and len(pattern.findall(''.join(item).strip()))==0] # 去掉非字母汉字的符号 if __name__=="__main__": # 分字进行n-gram copus_character=[generate_ngram(line.strip()) for line in open('text.txt','r',encoding='utf8') if len(line.strip())>0 and "RESUMEDOCSSTARTFLAG" not in line] # 先用hanlp分词,在对词进行n-gram copus_word=[generate_ngram(seg_sentences(line.strip(),with_filter=True) ) for line in open('text.txt','r',encoding='utf8') if len(line.strip())>0 and "RESUMEDOCSSTARTFLAG" not in line] copus_word=chain.from_iterable(copus_word) copus_word=['_'.join(item) for item in copus_word] fout=open("ngram2_3.txt", "w", encoding='utf-8') dic_filter={} # 统计词频 for item in copus_word: if item in dic_filter: dic_filter[item]+=1 else: dic_filter[item]=1 sort_dic=dict(sorted(dic_filter.items(),key=lambda val:val[1],reverse=True)) #reverse=True为降序排列,返回list fout.write(json.dumps(sort_dic, ensure_ascii=False,cls=NumpyEncoder)) fout.close()
def tokenize_raw(text):
    # Split the raw text on sentence-level punctuation first, then segment
    # each clause with seg_sentences.
    split_sen = (i.strip()
                 for i in re.split('。|,|,|:|:|?|!|\t|\n', _replace_c(text))
                 if len(i.strip()) > 5)
    # A generator expression () is used here instead of a list comprehension []
    # to keep memory usage down; materialising the full list of clauses for a
    # large text could run out of memory.
    return [seg_sentences(sentence) for sentence in split_sen]
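A short illustration of the two-stage split, assuming the seg_sentences sketch above; the sample text is hypothetical and the exact tokens depend on the segmenter's dictionary:

# Hypothetical input: two clauses separated by punctuation, each longer than 5 characters.
text = "今天天气很好,我们去公园散步。明天继续学习自然语言处理!"
for clause in tokenize_raw(text):
    print(clause)
# e.g. ['今天', '天气', '很', '好']
#      ['我们', '去', '公园', '散步']
#      ['明天', '继续', '学习', '自然语言处理']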