Code Example #1
import re

import jieba


def fenci(one_string, discover_new_word=False):
    # isAllZh() is an external helper (not shown in this excerpt) that returns True
    # only when every character of the string is Chinese.
    one_string = re.sub(r'\s+', '', one_string)  # strip all whitespace
    final_result = []
    temp_list = jieba.lcut(one_string, HMM=discover_new_word)
    if not discover_new_word:  # HMM=False already gives a noticeably finer granularity
        for word in temp_list:
            if not isAllZh(word):
                continue
            if len(word) > 3:
                jieba.del_word(word)
                final_result.extend(jieba.lcut(word, HMM=discover_new_word))
            else:
                final_result.append(word)
    else:
        for word in temp_list:
            if not isAllZh(word):
                continue
            # if len(word) == 3:  # set a threshold based on word frequency
            #     print(word, jieba.get_FREQ(word))
            freq = jieba.get_FREQ(word)
            if freq is None \
                    or (len(word) > 1 and freq == 0) \
                    or len(word) > 3 \
                    or (len(word) == 3 and freq < 100):
                jieba.del_word(word)  # force this token to be re-segmented
                final_result.extend(jieba.lcut(word))
            else:
                final_result.append(word)
    return final_result
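
fenci() depends on an isAllZh() helper that the excerpt does not include. A minimal sketch of such a helper, assuming "all Chinese" means every character lies in the CJK Unified Ideographs block:

def isAllZh(text):
    # Hypothetical helper: True only when every character is a CJK ideograph.
    return bool(text) and all('\u4e00' <= ch <= '\u9fff' for ch in text)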
Code Example #2
import jieba


def frequency_tune():
    testlist = [
        ('今天天气不错', ('今天', '天气')),
        ('如果放到post中将出错。', ('中', '将')),
        ('我们中出了一个叛徒', ('中', '出')),
    ]

    for sent, seg in testlist:
        print('/'.join(jieba.cut(sent, HMM=False)))
        word = ''.join(seg)
        # Use suggest_freq() to tune a single word's frequency so it can (or cannot) be segmented out
        print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
        print('/'.join(jieba.cut(sent, HMM=False)))
        print('-' * 40)
Code Example #3
import jieba
import jieba.posseg as pseg


def jieba_test():
    """
    Demonstrate user dictionaries, dynamic add_word/del_word, and frequency tuning.
    """
    jieba.load_userdict("./dict/user_dict.txt")

    jieba.add_word('石墨烯')
    jieba.add_word('凱特琳')
    jieba.del_word('自定义词')

    test_sent = ("李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
                 "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
                 "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。")
    words = jieba.cut(test_sent)
    print('/'.join(words))

    print("=" * 40)

    result = pseg.cut(test_sent)

    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')

    print("\n" + "=" * 40)

    terms = jieba.cut('easy_install is great')
    print('/'.join(terms))
    terms = jieba.cut('python 的正则表达式是好用的')
    print('/'.join(terms))

    print("=" * 40)
    # test frequency tune
    testlist = [
        ('今天天气不错', ('今天', '天气')),
        ('如果放到post中将出错。', ('中', '将')),
        ('我们中出了一个叛徒', ('中', '出')),
    ]

    for sent, seg in testlist:
        print('/'.join(jieba.cut(sent, HMM=False)))
        word = ''.join(seg)
        print('%s Before: %s, After: %s' %
              (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
        print('/'.join(jieba.cut(sent, HMM=False)))
        print("-" * 40)

    return None
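
The ./dict/user_dict.txt loaded above is not shown. jieba user dictionaries are plain text, one entry per line in the form "word [freq] [pos_tag]", with frequency and tag optional; the entries below are only an illustration:

云计算 5
八一双鹿 3 nz
韩玉赏鉴 3 nz
台中
凱特琳 nz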
Code Example #4
File: text4words.py Project: minlogiciel/docutone
    def write_dictionary_line(self, w, dict_file=None, withFlag=True):
        """
        Write one dictionary entry (word, frequency, POS flag).

        w         : a pair from jieba.posseg with .word and .flag attributes
        dict_file : open file object to write to; falls back to stdout when None
        withFlag  : if True, write "word freq flag", otherwise write "word/ "
        """
        n = jieba.get_FREQ(w.word)
        if n:
            if dict_file is not None:
                if withFlag:
                    dict_file.write(w.word + " " + str(n) + " " + w.flag +
                                    "\n")
                else:
                    dict_file.write(w.word + "/ ")
            else:
                print(w.word + " " + str(n) + " " + w.flag + "\n")
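
The class that owns write_dictionary_line() is not part of the excerpt; the standalone sketch below shows the same logic driven by jieba.posseg pairs (the output file name and sample sentence are placeholders):

import jieba
import jieba.posseg as pseg

# Dump "word frequency flag" for every word jieba's dictionary already knows.
with open("word_dict_out.txt", "w", encoding="utf-8") as dict_file:
    for w in pseg.cut("李小福是创新办主任也是云计算方面的专家"):
        n = jieba.get_FREQ(w.word)
        if n:  # skip words that are not in jieba's dictionary
            dict_file.write("%s %d %s\n" % (w.word, n, w.flag))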
Code Example #5
import jieba


def main():
    # jieba.enable_parallel(4)
    text = "我们来到了北京饭店"
    print(" ".join(jieba.cut(text)))
    print(jieba.get_FREQ('北京饭店'), jieba.get_FREQ('北京'), jieba.get_FREQ('饭店'))
    print("=" * 10)
    # Switch to a custom main dictionary that splits 北京饭店 into 北京 / 饭店
    jieba.set_dictionary('my_dict.txt')
    print(" ".join(jieba.cut('今天天气不错')))
    print(jieba.get_FREQ('北京饭店'), jieba.get_FREQ('北京'), jieba.get_FREQ('饭店'))
    print(" ".join(jieba.cut(text)))
    print("=" * 10)
    print(" ".join(jieba.cut('藏宝阁太贵')))
    jieba.suggest_freq(('太', '贵'), True)
    print(" ".join(jieba.cut('藏宝阁太贵')))
Code Example #6
    def norm_document(self, document, test=True):
        """
        Arguments:

        document : converted document text

        Returns the normalized document as a string.
        """
        norm_sentences = document.split('\n')
        sentences = ""

        if test:
            # Segmentation is an external wrapper class (not shown in this excerpt)
            seg = Segmentation()
            words = seg.segment(norm_sentences)[0]
            for sentence in words:
                hasword = False
                for w in sentence:
                    if w in self.SEP:
                        sentences += w
                    elif len(w) == 1:
                        # drop infrequent single characters
                        n = jieba.get_FREQ(w)
                        if n is not None and n > 10:
                            sentences += w
                    else:
                        hasword = True
                        sentences += w
                if hasword:
                    sentences += '\n'

        else:
            for sentence in norm_sentences:
                s = sentence
                if s:
                    sentences += s + '\n'

        return sentences
Code Example #7
    def norm_document(self, document):
        norm_sentences = document.split('\n')
        sentences = ""

        seg = Segmentation()
        words = seg.segment(norm_sentences)[0]
        for sentence in words:
            hasword = False
            for w in sentence:
                if w in self.SEP:
                    sentences += w
                elif len(w) == 1:
                    # drop infrequent single characters
                    n = jieba.get_FREQ(w)
                    if n is not None and n > 10:
                        sentences += w
                else:
                    hasword = True
                    sentences += w
            if hasword:
                sentences += '\n'

        return sentences
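
Both norm_document() variants depend on a Segmentation class that the excerpts omit. A minimal hypothetical stand-in, assuming segment() accepts a list of sentences and returns the per-sentence token lists as the first element of its result:

import jieba


class Segmentation(object):
    """Hypothetical minimal stand-in for the project's Segmentation wrapper."""

    def segment(self, sentences):
        # Tokenize every sentence; callers use segment(...)[0], the token lists.
        words = [jieba.lcut(s) for s in sentences]
        return words, sentences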
Code Example #8
import jieba
import jieba.posseg as pseg


def fun5():
    jieba.load_userdict("userdict.txt")
    jieba.add_word("石墨烯")
    jieba.add_word("凯特琳")
    jieba.del_word("自定义词")
    test_send = (
        "李小福是创新办主任也是云计算方面的专家;什么是八一双鹿\n"
        "例如我输入一个带“韩玉鉴赏”的标题,在自定义词库中也增加了此词为N类\n"
        "「台中」正確應該不會被切開。mac上可以分出「石墨烯」;此時又可以分出來凱特琳了。"
    )
    words = jieba.cut(test_send)
    print("/".join(words))
    print("=" * 40)
    result = pseg.cut(test_send)
    for w in result:
        print(w.word, "/", w.flag, ", ", end="  ")
    print("\n" + "=" * 48)

    terms = jieba.cut("easy_install is great")
    print("/".join(terms))
    terms = jieba.cut("python 的正则表达式是好用的")
    print("/".join(terms))

    print("=" * 40)

    testlist = [
        ('今天天气不错', ('今天', '天气')),
        ('如果放到post中将出错。', ('中', '将')),
        ('我们中出了一个叛徒', ('中', '出'))
    ]
    for send, seg in testlist:
        print("/".join(jieba.cut(send, HMM=False)))
        word = ''.join(seg)
        print("%s Before: %s, After:  %s" % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
        print("/".join(jieba.cut(send, HMM=False)))
        print("-" * 40)
Code Example #9
File: test_jieba.py Project: kaifei-bianjie/DAP
    def test_user_dict(self):
        """
        2. Custom user dictionary (requires module-level imports: os, jieba)
        """
        topic = '添加自定义词典'
        split_line = self.get_split_line(topic)
        self.logger.debug(split_line)
        test_sent = """李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n
        例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n
        「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。
        """
        words = jieba.cut(test_sent)
        self.logger.debug('{topic}_原始: {msg}'.format(topic=topic,
                                                     msg='/'.join(words)))
        """
                调整词典:动态修改词典
                """
        userdict_path = os.path.dirname(__file__) + "/jieba_dict/dict.txt"
        jieba.add_word('石墨烯')
        jieba.add_word('凱特琳')
        jieba.del_word('自定义词')
        jieba.load_userdict(userdict_path)
        words = jieba.cut(test_sent)
        self.logger.debug('{topic}_自定义字典分词:{msg}'.format(topic=topic,
                                                         msg='/'.join(words)))

        self.logger.debug('test split words' + "=" * 40)
        terms = jieba.cut('easy_install is great')
        self.logger.debug('{topic}_字典分词: {msg}'.format(topic=topic,
                                                       msg='/'.join(terms)))
        jieba.del_word('easy_install')
        terms = jieba.cut('easy_install is great')
        self.logger.debug('{topic}_删除单词: {msg}'.format(topic=topic,
                                                       msg='/'.join(terms)))
        terms = jieba.cut('python 的正则表达式是好用的')
        self.logger.debug('{topic}_单词: {msg}'.format(topic=topic,
                                                     msg='/'.join(terms)))

        self.logger.debug('test frequency tune' + "=" * 40)
        word = '这里中将应该被切开'
        self.logger.debug('{topic}_调低词频之前: {msg}'.format(topic=topic,
                                                         msg='/'.join(
                                                             jieba.cut(word))))
        self.logger.debug('{topic}_调整词频: {msg}'.format(
            topic=topic,
            msg='before: {before}, after: {after}'.format(
                before=jieba.get_FREQ('中将'),
                after=jieba.suggest_freq(('中', '将'), True))))
        self.logger.debug('{topic}_调低词频之后: {msg}'.format(
            topic=topic, msg='/'.join(jieba.cut(word, HMM=False))))

        jieba.del_word('台中')
        word = '[台中]正确应该不会被切开'
        self.logger.debug('{topic}_调高词频之前: {msg}'.format(topic=topic,
                                                         msg='/'.join(
                                                             jieba.cut(word))))
        self.logger.debug('{topic}_调整词频: {msg}'.format(
            topic=topic,
            msg='before: {before}, after: {after}'.format(
                before=jieba.get_FREQ('台中'),
                after=jieba.suggest_freq('台中', True))))
        self.logger.debug('{topic}_调高词频之后: {msg}'.format(
            topic=topic, msg='/'.join(jieba.cut(word, HMM=False))))
Code Example #10
    def freq_tag(self, word):
        # Return the dictionary frequency of a word and, if it is known, its POS flag.
        freq = jieba.get_FREQ(word)
        tag = ""
        if freq is not None:
            tag = pseg.lcut(word, HMM=False)[0].flag
        return freq, tag
words = jieba.cut(test_sent)
#print('/'.join(words))

#print("="*40)

result = pseg.cut(test_sent)

#for w in result:
#    print(w.word, "/", w.flag, ", ", end=' ')

#print("\n" + "="*40)

terms = jieba.cut('easy_install is great')
#print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
#print('/'.join(terms))

#print("="*40)
# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
#    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
#    print("-"*40)
Code Example #12
File: test_userdict.py Project: WalkerWang731/jieba
# Excerpt from test_userdict.py: `words` and `test_sent` are defined earlier in that file.
print('/'.join(words))

print("=" * 40)

result = pseg.cut(test_sent)

for w in result:
    print(w.word, "/", w.flag, ", ", end=' ')

print("\n" + "=" * 40)

terms = jieba.cut('easy_install is great')
print('/'.join(terms))
terms = jieba.cut('python 的正则表达式是好用的')
print('/'.join(terms))

print("=" * 40)
# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' %
          (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-" * 40)
Code Example #13
/「/台中/」/正確/應該/不會/被/切開/。/mac/上/可/分出/「/石墨烯/」/;/此時/又/可以/分出/來/凱特琳/了/。
"""

# print("="*40)
#
# result = pseg.cut(test_sent)
#
# for w in result:
#     print(w.word, "/", w.flag, ", ", end=' ')
#
# print("\n" + "="*40)
#
# terms = jieba.cut('easy_install is great')
# print('/'.join(terms))
# terms = jieba.cut('python 的正则表达式是好用的')
# print('/'.join(terms))
#
# print("="*40)
# test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]

for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-"*40)
Code Example #14
#!/usr/bin/python3
# coding: utf-8
import jieba
##################################################################
## suggest_freq(segment, tune=True) tunes a single word's frequency so that it can (or cannot) be segmented out
# each call to suggest_freq() increases the frequency by 1
print(jieba.get_FREQ(('中', '将')))  # None
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))  # 如果/放到/post/中将/出错/。
print(jieba.suggest_freq(('中', '将'), True))  # 494; i.e. the two characters 中 / 将 should be kept apart
print(jieba.get_FREQ('中'), jieba.get_FREQ('将'))  # 243191 122305
print(jieba.get_FREQ('中', '将'))  # 243191; the second argument is only a default value, so this prints the frequency of 中
print(jieba.get_FREQ(('中', '将')))  # None; a tuple key is meaningless here
print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))  # 如果/放到/post/中/将/出错/。

print(jieba.get_FREQ('台中'))
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))  # 「/台/中/」/正确/应该/不会/被/切开
print(jieba.suggest_freq('台中', True))  # 69; the value keeps increasing on repeated calls
print(jieba.get_FREQ('台中'))
print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))  # 「/台中/」/正确/应该/不会/被/切开
##################################################################
## "台中"总是被切成"台 中"; P(台中) < P(台) x P(中), "台中"词频不够导致其成词概率较低
# 解决方法: 强制调高词频
# jieba.add_word('台中') 或者 jieba.suggest_freq('台中', True)
##################################################################
## test frequency tune
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]
for sent, seg in testlist:
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    print('/'.join(jieba.cut(sent, HMM=False)))
    print('-' * 40)
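
The comment block above compares unigram probabilities directly. A small hedged sketch for inspecting those numbers yourself, assuming jieba.dt.total, the total frequency mass the default tokenizer normalizes by:

import jieba

jieba.initialize()          # make sure the default dictionary is loaded
total = jieba.dt.total      # total frequency mass of the dictionary


def prob(word):
    # Unseen words fall back to a frequency of 1, mirroring jieba's own smoothing.
    return (jieba.get_FREQ(word) or 1) / total


# Before tuning, the unigram model prefers the split when P(台中) < P(台) * P(中).
print(prob('台中'), prob('台') * prob('中'))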