Example #1
# -*- coding: utf-8 -*-
from tokenizer import seg_sentences

# Segment every non-empty line of text.txt and write the space-joined tokens to out.txt.
with open("text.txt", 'r', encoding='utf8') as fp, \
        open("out.txt", 'w', encoding='utf8') as fout:
    for line in fp:
        line = line.strip()
        if len(line) > 0:
            fout.write(' '.join(seg_sentences(line)) + "\n")
    
  
Example #2
import json
import re
from itertools import chain

from tokenizer import seg_sentences

# Assumed definition: the original snippet uses `pattern` without defining it; a regex that
# matches anything other than a Chinese character or an ASCII letter fits its use below.
pattern = re.compile(r'[^\u4e00-\u9fa5a-zA-Z]')


def _replace_c(text):
    # Remove newlines, tabs and spaces from the text.
    intab = ""
    outtab = ""
    deltab = "\n\t "
    trantab = text.maketrans(intab, outtab, deltab)
    return text.translate(trantab)


def generate_ngram(sentence, n=4, m=2):
    # Build all m- to n-gram tuples over the characters (or tokens) of `sentence`.
    if len(sentence) < n:
        n = len(sentence)
    temp = [tuple(sentence[i - k:i])
            for k in range(m, n + 1)
            for i in range(k, len(sentence) + 1)]
    # Keep only grams longer than one character that contain no non-letter/non-Chinese symbols.
    return [item for item in temp
            if len(''.join(item).strip()) > 1
            and len(pattern.findall(''.join(item).strip())) == 0]


if __name__ == "__main__":
    # Character-level n-grams.
    copus_character = [generate_ngram(line.strip())
                       for line in open('text.txt', 'r', encoding='utf8')
                       if len(line.strip()) > 0 and "RESUMEDOCSSTARTFLAG" not in line]
    # Segment with HanLP first, then build n-grams over the resulting words.
    copus_word = [generate_ngram(seg_sentences(line.strip(), with_filter=True))
                  for line in open('text.txt', 'r', encoding='utf8')
                  if len(line.strip()) > 0 and "RESUMEDOCSSTARTFLAG" not in line]
    copus_word = chain.from_iterable(copus_word)
    copus_word = ['_'.join(item) for item in copus_word]

    # Count how often each n-gram occurs.
    dic_filter = {}
    for item in copus_word:
        if item in dic_filter:
            dic_filter[item] += 1
        else:
            dic_filter[item] = 1
    # reverse=True sorts by frequency in descending order; sorted() returns a list of pairs.
    sort_dic = dict(sorted(dic_filter.items(), key=lambda val: val[1], reverse=True))
    # NumpyEncoder is assumed to be a custom json.JSONEncoder defined elsewhere in the project.
    with open("ngram2_3.txt", "w", encoding='utf-8') as fout:
        fout.write(json.dumps(sort_dic, ensure_ascii=False, cls=NumpyEncoder))
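
For orientation, here is a minimal usage sketch of generate_ngram, assuming the pattern regex defined above: on a short string it yields character tuples of length 2 to 4, which the main block later joins into underscore-separated keys.

# Minimal usage sketch (assumes the generate_ngram and pattern definitions above).
grams = generate_ngram("深度学习模型")          # default n=4, m=2
print(grams[:3])                                # [('深', '度'), ('度', '学'), ('学', '习')]
print('_'.join(grams[0]))                       # 深_度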
Example #3
def tokenize_raw(text):
    # Split on sentence punctuation first, then segment each piece with HanLP's seg_sentences.
    split_sen = (i.strip()
                 for i in re.split(r'[。，,：:？?！!\t\n]', _replace_c(text))
                 if len(i.strip()) > 5
                 )  # A generator expression () is used instead of a list [] to keep memory
                    # usage low; materialising the full list could run out of memory on large texts.
    return [seg_sentences(sentence) for sentence in split_sen]
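
A short usage sketch of tokenize_raw, assuming _replace_c and seg_sentences from Example #2 are available in the same module:

# Usage sketch (assumes _replace_c and seg_sentences from Example #2 are in scope).
raw = "深度学习是机器学习的一个分支。它利用多层神经网络从大量数据中学习特征表示。"
for tokens in tokenize_raw(raw):
    print(' '.join(tokens))    # one space-joined, segmented clause per line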
Example #4
def generate_ngram(sentence, n=4, m=2):
    # Build all m- to n-gram tuples over `sentence` (same helper as in Example #2).
    if len(sentence) < n:
        n = len(sentence)
    temp = [tuple(sentence[i - k:i])
            for k in range(m, n + 1)
            for i in range(k, len(sentence) + 1)]
    return [
        item for item in temp if len(''.join(item).strip()) > 1
        and len(pattern.findall(''.join(item).strip())) == 0
    ]  # drop grams containing non-letter/non-Chinese symbols


if __name__ == "__main__":
    # Character-level n-grams.
    copus_character = [
        generate_ngram(line.strip())
        for line in open('text.txt', 'r', encoding='utf8')
        if len(line.strip()) > 0 and "RESUMEDOCSSTARTFLAG" not in line
    ]
    # Segment with HanLP first, then build n-grams over the resulting words.
    copus_word = [
        generate_ngram(seg_sentences(line.strip(), with_filter=True))
        for line in open('text.txt', 'r', encoding='utf8')
        if len(line.strip()) > 0 and "RESUMEDOCSSTARTFLAG" not in line
    ]
    copus_word = chain.from_iterable(copus_word)
    copus_word = ['_'.join(item) for item in copus_word]
    fout = open("ngram2_3.txt", "w", encoding='utf-8')

    dic_filter = {}  # count n-gram frequencies
    for item in copus_word:
        if item in dic_filter:
            dic_filter[item] += 1
        else:
            dic_filter[item] = 1
    # reverse=True sorts by frequency in descending order.
    sort_dic = dict(
        sorted(dic_filter.items(), key=lambda val: val[1], reverse=True))
    fout.write(json.dumps(sort_dic, ensure_ascii=False, cls=NumpyEncoder))
    fout.close()
Example #5
def tokenize_raw(text):
    split_sen = (i.strip()
                 for i in re.split(r'[。，,：:？?！!\t\n]', _replace_c(text))
                 if len(i.strip()) > 5)
    return [seg_sentences(sentence) for sentence in split_sen]
Example #6
def tokenize_raw(text):
    # A generator expression () is used to keep memory usage low.
    split_sen = (i.strip()
                 for i in re.split(r'[。，,：:？?！!\t\n]', _replace_c(text))
                 if len(i.strip()) > 4)
    return [seg_sentences(sentence) for sentence in split_sen]
Example #7
# -*- coding: utf-8 -*-
from tokenizer import seg_sentences

# Segment every non-empty line of text.txt and write the space-joined tokens to out.txt.
with open('text.txt', 'r', encoding='utf-8') as fp, \
        open('out.txt', 'w', encoding='utf-8') as fout:
    for line in fp:
        line = line.strip()
        if len(line) > 0:
            fout.write(' '.join(seg_sentences(line)) + '\n')
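
All of the snippets above import seg_sentences from a project-local tokenizer module, which the comments describe as a HanLP-based segmenter. That module is not shown here; the following is a hypothetical stand-in built on jieba so the examples can be tried without it. The with_filter behaviour is an assumption (the original presumably removes stopwords and punctuation).

# Hypothetical stand-in for tokenizer.seg_sentences (the real one wraps HanLP).
import jieba


def seg_sentences(sentence, with_filter=False):
    # Basic segmentation with jieba; drop empty tokens.
    tokens = [tok.strip() for tok in jieba.cut(sentence) if tok.strip()]
    if with_filter:
        # Assumption: the original with_filter drops stopwords/punctuation.
        # Here we only drop single-character tokens as a rough approximation.
        tokens = [tok for tok in tokens if len(tok) > 1]
    return tokens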