Ejemplo n.º 1
0
def cut_input(input):
    '''
    Segment ``input`` and return the surviving words as one UTF-8 string.

    The text is segmented with ``norm_seg``; tokens whose word or flag is
    blank after stripping are ignored.  A remaining word is then dropped
    when either

    * its stripped UTF-8 encoding appears in the ``stopwords`` file found
      under ``project_path``, or
    * its stripped UTF-8 byte length equals its stripped character length
      (i.e. it consists only of single-byte characters -- this assumes
      ``w.word`` is a text/unicode string).

    :param input: text to segment
    :return: the kept words joined by single spaces, encoded as UTF-8
    '''
    candidates = []
    for w in norm_seg(input):
        if w.word.strip() == '' or w.flag.strip() == '':
            continue
        candidates.append(w.word)

    # One stop word per line.  Blank lines are skipped instead of being
    # treated as end-of-file, so entries that follow an empty line are no
    # longer lost.  A set gives constant-time membership tests.
    with open(project_path + '/stopwords') as f:
        stop_words = set(
            entry for entry in (line.strip() for line in f) if entry
        )

    kept = []
    for word in candidates:
        encoded = word.encode('utf-8').strip()
        if encoded in stop_words:
            continue
        if len(encoded) == len(word.strip()):
            continue
        kept.append(word)

    return " ".join(kept).encode('utf-8')
Ejemplo n.º 2
0
 def cut_seg(self, sentence=''):
     '''
     Segment ``sentence`` by delegating directly to ``norm_seg``.

     :param sentence: text to segment (defaults to the empty string)
     :return: whatever ``norm_seg(sentence)`` returns, unchanged
     '''
     return norm_seg(sentence)
Ejemplo n.º 3
0
def cut_input(process_data):
    '''
    Segment ``process_data`` with ``norm_seg`` (jieba Chinese word
    segmentation) and return the words as one UTF-8 encoded string.

    Tokens whose word is blank after stripping are left out; the rest are
    joined with single spaces before encoding.
    '''
    kept = [
        token.word
        for token in norm_seg(process_data)
        if token.word.strip() != ''
    ]
    return " ".join(kept).encode('utf-8')
Ejemplo n.º 4
0
def cut_input(input):
    '''
    Segment ``input`` with ``norm_seg`` and return the words joined by
    single spaces as a UTF-8 encoded string.

    A token is left out when either its word or its flag is blank after
    stripping.
    '''
    kept = [
        token.word
        for token in norm_seg(input)
        if token.word.strip() != '' and token.flag.strip() != ''
    ]
    return " ".join(kept).encode('utf-8')
Ejemplo n.º 5
0
def cut_input(input, posFlag):
    '''
    Segment ``input`` and return the tokens joined by single spaces.

    :param input: text to segment
    :param posFlag: when equal to ``True`` each token is rendered as
        ``word_flag`` using ``norm_seg``; otherwise the plain output of
        ``norm_cut`` is joined
    :return: the joined string; it is returned as-is, without encoding
    '''
    # The equality test (rather than plain truthiness) is kept on purpose:
    # only True, or a value that compares equal to it, selects tagged output.
    if posFlag == True:
        tagged = [token.word + '_' + token.flag for token in norm_seg(input)]
        return " ".join(tagged)
    return " ".join(norm_cut(input))
Ejemplo n.º 6
0
def cut_input(input, flag=False):
    '''
    Segment ``input`` with ``norm_seg`` and return a UTF-8 encoded string
    of the tokens joined by single spaces.

    :param input: text to segment
    :param flag: when truthy each token is rendered as ``word_flag``;
        otherwise only the word is emitted
    :return: the space-joined tokens, encoded as UTF-8

    Tokens whose word or flag is blank after stripping are left out.
    '''
    pieces = [
        (token.word + '_' + token.flag) if flag else token.word
        for token in norm_seg(input)
        if token.word.strip() != '' and token.flag.strip() != ''
    ]
    return " ".join(pieces).encode('utf8')
Ejemplo n.º 7
0
 def count_words(self, lwords):
     """
     Count ``word#flag`` occurrences over every text in ``lwords``.

     Each text is segmented with ``norm_seg``.  A token is counted only
     when ``self.oGWF.isGeneralWord`` rejects its UTF-8 encoded word, the
     word is not blank, and the word has at least ``self.word_length``
     characters.

     :param lwords: iterable of texts to segment
     :return: dwords, a ``Counter`` mapping ``'word#flag'`` to its
         frequency
     """
     # Loop-invariant: convert the configured minimum length once.
     min_length = int(self.word_length)
     word_flags = []  # one 'word#flag' entry per counted token

     jieba.enable_parallel(10)  # parallel mode with 10 processes
     try:
         for context in lwords:
             for sub in norm_seg(context):
                 w = sub.word
                 if self.oGWF.isGeneralWord(w.encode('utf-8')) or w.strip() == '':
                     continue
                 if len(w) >= min_length:
                     word_flags.append('%s#%s' % (w, sub.flag))
         logger.info('count is starting')
     finally:
         # Always leave parallel mode, even when segmentation raises, so
         # a failure here does not leave it switched on for later callers.
         jieba.disable_parallel()

     return Counter(word_flags)
Ejemplo n.º 8
0
    def cut(self, sentence='', special_words=None, industrys=None):
        '''
        Segment ``sentence`` with ``norm_seg``, optionally using a temporary
        dictionary built from ``special_words``.

        When ``special_words`` is given and ``self.f_inner_dict`` is set, a
        new dictionary is produced by ``self.__set_new_dic`` and, if that
        path exists, installed with ``jieba.set_dictionary``.  The temporary
        dictionary is removed again before returning.

        :param sentence: text to segment
        :param special_words: entries such as ['美丽 a', '转发 v']
        :param industrys: industry dictionaries handed to
            ``load_industrydict``: 2 automotive, 7 cosmetics, 0 new words
        :return: the result of ``norm_seg(sentence)``
        '''
        # None replaces the former mutable ``[]`` default; callers that
        # omit the argument still hand an empty list to load_industrydict.
        if industrys is None:
            industrys = []

        f_special_dict = self.f_inner_dict
        if special_words and f_special_dict:
            f_special_dict = self.__set_new_dic(special_words)
            if os.path.exists(f_special_dict):
                jieba.set_dictionary(f_special_dict)

        try:
            load_industrydict(industrys)
            # Same output as the former print statement; this form is also
            # valid syntax on Python 3.
            print(sentence)
            # NOTE(review): if norm_seg is lazy, the temporary dictionary
            # may be removed below before segmentation runs -- confirm.
            words = norm_seg(sentence)
        finally:
            # Remove the temporary dictionary even when segmentation fails.
            # os.remove replaces the shell-built "rm -rf <path>" command, so
            # the path is never interpreted by a shell.
            if (special_words and f_special_dict
                    and self.f_inner_dict != f_special_dict
                    and os.path.exists(f_special_dict)):
                try:
                    os.remove(f_special_dict)
                except OSError:
                    # Best effort, as with "rm -f": a failed cleanup must
                    # not mask the segmentation result or its exception.
                    pass
        return words
Ejemplo n.º 9
0
def cut_input_plus(input, sBrand):
    '''
    Segment ``input`` with ``norm_seg`` and collect the tokens in several
    shapes, tracking whether the brand ``sBrand`` occurs among them.

    Tokens whose word or flag is blank are skipped.  For every other token
    the word and flag are stripped, UTF-8 encoded and written back onto
    the token.  A word containing spaces has them replaced by ``#kong#``.

    :param input: text to segment
    :param sBrand: brand string compared against each encoded word
    :return: a 5-tuple ``(bIsBrand, words, words_posFlag, posDict, sBrand)``:
        whether any word equalled ``sBrand``; the words joined by spaces;
        the ``word_flag`` pairs joined by spaces; a dict mapping each flag
        (plus ``'all'``) to its list of words; and the brand, replaced by
        the last ``#kong#``-joined word seen once the brand had matched,
        if there was one
    '''
    tokens = []
    tagged_tokens = []
    by_flag = {'all': []}
    brand_seen = False
    joined_brand = ""

    for w in norm_seg(input):
        if w.word.strip() == '' or w.flag.strip() == '':
            continue
        # The normalised values are stored back on the token itself.
        w.word = w.word.strip().encode('utf8')
        w.flag = w.flag.strip().encode('utf8')

        if sBrand == w.word:
            brand_seen = True

        parts = w.word.split(' ')
        if len(parts) > 1:
            w.word = "#kong#".join(parts)
            if brand_seen:
                joined_brand = w.word

        tokens.append(w.word)
        tagged_tokens.append(w.word + '_' + w.flag)
        by_flag.setdefault(w.flag, []).append(w.word)
        by_flag['all'].append(w.word)

    if joined_brand != "":
        sBrand = joined_brand

    return (brand_seen, " ".join(tokens), " ".join(tagged_tokens),
            by_flag, sBrand)
Ejemplo n.º 10
0
def cut_input(input):
    '''
    Segment ``input`` with ``norm_seg`` and return the tokens in three
    shapes.

    Tokens whose word or flag is blank after stripping are skipped.

    :param input: text to segment
    :return: a 3-tuple ``(words, words_posFlag, posDict)``: the words
        joined by spaces and UTF-8 encoded; the ``word_flag`` pairs joined
        by spaces and UTF-8 encoded; and a dict mapping each flag (plus
        ``'all'``) to the list of UTF-8 encoded words carrying it
    '''
    plain = []
    tagged = []
    by_flag = {'all': []}

    for token in norm_seg(input):
        text, tag = token.word, token.flag
        if text.strip() == '' or tag.strip() == '':
            continue
        plain.append(text)
        tagged.append(text + '_' + tag)
        utf8_text = text.encode('utf-8')
        by_flag.setdefault(tag, []).append(utf8_text)
        by_flag['all'].append(utf8_text)

    return (" ".join(plain).encode('utf-8'),
            " ".join(tagged).encode('utf-8'),
            by_flag)
Ejemplo n.º 11
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
1.test suggest_freq
2.test load industry_dict
3.test special word (If the word contains spaces, add to userword / specialword )
industry_dict = {2:"car_dict",7:"makeup.dict"}

"""
from jieba import suggest_freq
from jieba.norm import norm_seg, load_industrydict
#print suggest_freq('小黑瓶',True)

# Test 2: load industry dictionaries 2 and 7, then segment each sample and
# print every token's word and flag on its own line.
testword = ['长安欧尚', "睿骋cc", "行动力", "蓝水粉水小黑瓶"]
load_industrydict([2, 7])
for text in testword:
    for token in norm_seg(text):
        # Formatting first gives the same "word flag" output as the print
        # statement did, in a form that also parses on Python 3.
        print('%s %s' % (token.word, token.flag))