Example #1
def data_read_from_txt():
    fout = open('nvp.txt', 'w', encoding='utf8')
    for line in open('text.txt', 'r', encoding='utf8'):
        line = line.strip()
        grammer(cut_hanlp(line), fout)
    fout.close()
Example #2
#-*- coding=utf8 -*-
import jieba
import re
from tokenizer import cut_hanlp

#jieba.load_userdict("dict.txt")

# # Set a high word frequency: for a single word
# jieba.suggest_freq('台中',tune=True)

# Set a high word frequency: do it for every line in dict.txt
# fp=open("dict.txt",'r',encoding='utf8')
# for line in fp:
#     line=line.strip()
#     jieba.suggest_freq(line, tune=True)

# # Set a high word frequency for every line in dict.txt: a faster one-liner
# [jieba.suggest_freq(line.strip(), tune=True) for line in open("dict.txt",'r',encoding='utf8')]

if __name__ == "__main__":
    string = "台中正确应该不会被切开。"

    words_jieba = " ".join(jieba.cut(string, HMM=False))

    words_hanlp = cut_hanlp(string)
    print("words_jieba:" + words_jieba, '\n', "words_hanlp:" + words_hanlp)
Example #3
    for line in fp.readlines():
        result1 = p1.findall(line)  # findall returns the list of matches
        print(result1)
        if result1:
            regex_re1 = result1
            line = p1.sub("FLAG1", line)  # replace each match with the placeholder FLAG1
            print(line)
        result2 = p2.findall(line)
        if result2:
            line = p2.sub("FLAG2", line)
            print(line)
        words = jieba.cut(line)  # jieba segmentation; returns a generator object
        result = " ".join(words)  # the jieba result is a generator, so join it into a str with " ".join()

        words1 = cut_hanlp(line)  # hanlp segmentation result, returned as a str

        if "FLAG1" in result:
            result = result.split("FLAG1")
            result = merge_two_list(result, result1)
            result = "".join(result)  # result is a list here, so join it back into a str
        if "FLAG2" in result:
            result = result.split("FLAG2")
            result = merge_two_list(result, result2)
            result = "".join(result)
        # print(result)

        fout.write("jieba :" + result)
        fout.write("hanlp:" + str(words1))
    fout.close()
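merge_two_list is called above but never defined in these examples. A plausible sketch, assuming it simply re-interleaves the fragments produced by str.split with the regex matches that the FLAG placeholder replaced:

def merge_two_list(a, b):
    # interleave the split fragments (a) with the original matches (b),
    # restoring the matched spans in their original positions
    merged = []
    min_len = min(len(a), len(b))
    for i in range(min_len):
        merged.append(a[i])
        merged.append(b[i])
    merged.extend(a[min_len:])  # at most one of the two tails is non-empty
    merged.extend(b[min_len:])
    return merged

For example, merge_two_list("a FLAG1 b".split("FLAG1"), ["第3期"]) gives ['a ', '第3期', ' b'].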
Example #4
def data_read():
    fout = open('nvp.txt', 'w', encoding='utf8')
    for line in open('text.txt', 'r', encoding='utf8'):
        line = line.strip()
        grammer(cut_hanlp(line), fout)  # segment with hanlp first, then use grammer to merge phrases
    fout.close()
Example #5
    regex1 = u'[^\u4e00-\u9fa5()*&……%¥$,,。.@! !]{1,5}期'  # "xxx期" where xxx is 1-5 non-Chinese, non-special characters
    regex2 = r'[0-9]{1,3}[.]?[0-9]{1,3}%'
    p1 = re.compile(regex1)
    p2 = re.compile(regex2)
    for line in fp.readlines():
        result1 = p1.findall(line)
        if result1:
            line = p1.sub('Flag1', line)
        result2 = p2.findall(line)
        if result2:
            line = p2.sub('Flag2', line)

        words = jieba.cut(line)
        result = ' '.join(words)

        words1 = cut_hanlp(line)
        if 'Flag1' in result:
            result = result.split('Flag1')
            result = merge_two_list(result, result1)
            result = ''.join(result)
        if 'Flag2' in result:
            result = result.split('Flag2')
            result = merge_two_list(result, result2)
            result = ''.join(result)
        if 'Flag 1' in words1:  # hanlp's space-joined output splits "Flag1" into "Flag 1", hence the space
            words1 = words1.split('Flag 1')
            words1 = merge_two_list(words1, result1)
            words1 = ''.join(words1)
        if 'Flag 2' in words1:
            words1 = words1.split('Flag 2')
            words1 = merge_two_list(words1, result2)
            words1 = ''.join(words1)
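A quick check of what the two patterns capture (the sample sentence and the expected output are illustrative only, not taken from the original data):

import re

p1 = re.compile(u'[^\u4e00-\u9fa5()*&……%¥$,,。.@! !]{1,5}期')
p2 = re.compile(r'[0-9]{1,3}[.]?[0-9]{1,3}%')

sample = "项目分为2019期两批,完成率达到87.5%。"
print(p1.findall(sample))  # expected: ['2019期']
print(p2.findall(sample))  # expected: ['87.5%']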
Example #6
#-*- coding=utf8 -*-
import jieba
import re
from tokenizer import cut_hanlp
#jieba.load_userdict("dict.txt")

#jieba.add_word(row[0].strip(),tag=row[1].strip())
#jieba.suggest_freq(segment)
#fp=open("dict.txt",'r',encoding='utf8')
#for line in fp:
#    line=line.strip()
#    jieba.suggest_freq(line, tune=True)
#[jieba.suggest_freq(line.strip(), tune=True) for line in open("dict.txt",'r',encoding='utf8')]
if __name__ == "__main__":
    string = "台中正确应该不会被切开。"

    words = cut_hanlp(string)
    #words  = jieba.cut(string)
    print([i for i in words])
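The commented-out suggest_freq lines above exist because jieba may split user-dictionary words such as 台中 with its default frequencies. A minimal before/after check (the outputs in the comments describe the typical behaviour, not a captured run):

import jieba

s = "台中正确应该不会被切开。"
print("/".join(jieba.cut(s, HMM=False)))  # with default frequencies 台中 may come out as 台/中
jieba.suggest_freq('台中', tune=True)      # tune=True raises the word's frequency so it survives segmentation
print("/".join(jieba.cut(s, HMM=False)))  # 台中 should now stay as a single token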
Example #7
            if node[1] not in merge_pos:
                text += node[0].strip() + '/O' + 3 * ' '
    model_tagged_file.write(text + '\n')


def grammer(sentence, model_tagged_file):
    # The input sentence is a list of (word, POS) pairs, e.g. [('工作', 'vn'), ('描述', 'v'), (':', 'w')]
    grammer1 = r"""NP:
        {<m|mg|Mg|mq|q|qg|qt|qv|s|>*<a|an|ag>*<s|g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|o|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<f>?<ude1>?<g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|o|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
        {<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<cc>+<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
        {<m|mg|Mg|mq|q|qg|qt|qv|s|>*<q|qg|qt|qv>*<f|b>*<vi|v|vn|vg|vd>+<ude1>+<n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+}
        {<g|gb|gbc|gc|gg|gm|gp|n|an|nr|ns|nt|nz|nb|nba|nbc|nbp|nf|ng|nh|nhd|nz|nx|ntu|nts|nto|nth|ntch|ntcf|ntcb|ntc|nt|nsf|ns|nrj|nrf|nr2|nr1|nr|nnt|nnd|nn|nmc|nm|nl|nit|nis|nic|ni|nhm|nhd>+<vi>?}
        VP:{<v|vd|vg|vf|vl|vshi|vyou|vx|vi|vn>+}
        """      # 动词短语块

    cp = nltk.RegexpParser(grammer1)
    try:
        result = cp.parse(sentence)  # returns a tree chunked by the NP/VP patterns in grammer1
    except Exception:
        pass  # skip sentences the chunker cannot parse
    else:
        getNodes(result, model_tagged_file)  # traverse the chunk tree with getNodes and write it out


if __name__ == '__main__':
    with open('nvp.txt', 'w', encoding='utf-8') as fout:
        with open('text.txt', 'r', encoding='utf-8') as fp:
            for line in fp:
                line = line.strip()
                grammer(cut_hanlp(line), fout)
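For grammer to receive the [('工作', 'vn'), …] pairs described in its comment, the cut_hanlp used here has to return (word, POS) tuples rather than the plain string seen in the regex examples. A minimal sketch of such a variant, assuming pyhanlp (the body is an assumption, not the original tokenizer module):

from pyhanlp import HanLP

def cut_hanlp(raw_sentence):
    # each HanLP term carries the surface word and its part-of-speech tag (term.nature)
    return [(term.word, str(term.nature)) for term in HanLP.segment(raw_sentence)]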
Example #8
    p1 = re.compile(regex1)
    p2 = re.compile(regex2)
    for line in fp.readlines():
        result1 = p1.findall(line)  # findall returns the list of matches, e.g. ["xxx期"]
        if result1:
            regex_re1 = result1
            line = p1.sub("FLAG1", line)  # replace each match with the placeholder FLAG1

        result2 = p2.findall(line)
        if result2:
            line = p2.sub("FLAG2", line)

        words = jieba.cut(line)  # jieba segmentation; returns a generator object
        result = " ".join(words)  # the jieba result is a generator, so join it into a str with " ".join()

        words1 = tokenizer.cut_hanlp(line)  # hanlp segmentation result, returned as a str
        if "FLAG1" in result:
            result = result.split("FLAG1")
            result = merge_two_list(result, result1)
            result = "".join(result)  # result is a list here, so join it back into a str
        if "FLAG2" in result:
            result = result.split("FLAG2")
            result = merge_two_list(result, result2)
            result = "".join(result)
            # print(result)
        fout.write("jieba:" + result)
        fout.write("hanlp:" + words1)
    fout.close()