def __cut_DAG(sentence):
    """Segment `sentence` along the best route of its DAG and yield pair(word, tag).

    Runs of single characters are buffered and re-segmented with the HMM-based __cut.
    """
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            # collect consecutive single-character pieces into a buffer
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = u''
                else:
                    # let the HMM model re-segment the buffered run
                    recognized = __cut(buf)
                    for t in recognized:
                        yield t
                    buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y
    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        else:
            recognized = __cut(buf)
            for t in recognized:
                yield t
def __cut_DAG(sentence):
    """Same DAG-based segmentation, but buffered runs are handed to __cut_detail."""
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = u''
                else:
                    recognized = __cut_detail(buf)
                    for t in recognized:
                        yield t
                    buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y
    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        else:
            recognized = __cut_detail(buf)
            for t in recognized:
                yield t
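# A minimal usage sketch: in the stock jieba package the generators above are
# internals driven by jieba.posseg.cut, which yields pair objects exposing
# .word and .flag (the POS tag). The sample sentence is illustrative only and
# the snippet assumes Python 3 with jieba installed.
import jieba.posseg as pseg

for w in pseg.cut('我爱北京天安门'):
    print(w.word, w.flag)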
def __cut_DAG_NO_HMM(self, sentence):
    """No-HMM variant that yields plain word strings.

    When self.get_route returns more than one candidate route, the route with
    the highest 'freq' wins; otherwise fall back to jieba's own DAG route.
    """
    _DAG = jieba.get_DAG(sentence)
    # print('[__cut_DAG_NO_HMM] sentence: ', sentence)
    # print('[__cut_DAG_NO_HMM] DAG: ', _DAG)
    my_route = self.get_route(sentence, _DAG)
    # print('[__cut_DAG_NO_HMM] my_route: ', my_route)
    if len(my_route) > 1:
        # pick the candidate route with the highest frequency
        _tmp_freq = 0
        _list = []
        for _ in my_route:
            if _['freq'] > _tmp_freq:
                _tmp_freq = _['freq']
                _list = _['list']
        # print('[__cut_DAG_NO_HMM] max freq list: ', _list)
        if _list:
            for __ in _list:
                yield __
        else:
            print('[__cut_DAG_NO_HMM] sentence: ', sentence)
            print('[__cut_DAG_NO_HMM] my_route: ', my_route)
    else:
        # fall back to jieba's DAG route
        route = {}
        jieba.calc(sentence, _DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if self.re_eng.match(l_word) and len(l_word) == 1:
                # buffer consecutive single alphanumeric characters
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''
def __cut_DAG(sentence): DAG = jieba.get_DAG(sentence) route = {} jieba.calc(sentence, DAG, 0, route=route) x = 0 buf = u"" N = len(sentence) while x < N: y = route[x][1] + 1 l_word = sentence[x:y] if y - x == 1: buf += l_word else: if len(buf) > 0: if len(buf) == 1: yield pair(buf, word_tag_tab.get(buf, "x")) buf = u"" else: if buf not in jieba.FREQ: regognized = __cut_detail(buf) for t in regognized: yield t else: for elem in buf: yield pair(elem, word_tag_tab.get(elem, "x")) buf = u"" yield pair(l_word, word_tag_tab.get(l_word, "x")) x = y if len(buf) > 0: if len(buf) == 1: yield pair(buf, word_tag_tab.get(buf, "x")) else: if buf not in jieba.FREQ: regognized = __cut_detail(buf) for t in regognized: yield t else: for elem in buf: yield pair(elem, word_tag_tab.get(elem, "x"))
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    buf = ''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if buf:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = ''
                else:
                    if buf not in jieba.FREQ:
                        recognized = __cut_detail(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield pair(elem, word_tag_tab.get(elem, 'x'))
                    buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y
    if buf:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        elif buf not in jieba.FREQ:
            recognized = __cut_detail(buf)
            for t in recognized:
                yield t
        else:
            for elem in buf:
                yield pair(elem, word_tag_tab.get(elem, 'x'))
def __cut_DAG_NO_HMM(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, route)
    x = 0
    N = len(sentence)
    buf = ''
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if re_eng1.match(l_word):
            buf += l_word
            x = y
        else:
            if buf:
                yield pair(buf, 'eng')
                buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
            x = y
    if buf:
        yield pair(buf, 'eng')
        buf = ''
def __cut_DAG(sentence):
    """DAG-based segmentation with pair(word, tag) output.

    A buffered run of single characters with no dictionary frequency is re-cut
    by the HMM via __cut_detail; a known run is emitted character by character.
    """
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, route)
    x = 0
    buf = ''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if buf:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                elif not jieba.FREQ.get(buf):
                    recognized = __cut_detail(buf)
                    for t in recognized:
                        yield t
                else:
                    for elem in buf:
                        yield pair(elem, word_tag_tab.get(elem, 'x'))
                buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y
    if buf:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        elif not jieba.FREQ.get(buf):
            recognized = __cut_detail(buf)
            for t in recognized:
                yield t
        else:
            for elem in buf:
                yield pair(elem, word_tag_tab.get(elem, 'x'))
def __cut_DAG_NO_HMM(sentence):
    """No-HMM variant: consecutive alphanumeric characters are buffered and
    tagged 'eng'; everything else keeps its word_tag_tab tag."""
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    N = len(sentence)
    buf = u''
    # Python 2 style: the ur'' raw-unicode literal is a SyntaxError on Python 3
    re_eng = re.compile(ur'[a-zA-Z0-9]', re.U)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if re_eng.match(l_word) and len(l_word) == 1:
            buf += l_word
            x = y
        else:
            if len(buf) > 0:
                yield pair(buf, 'eng')
                buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
            x = y
    if len(buf) > 0:
        yield pair(buf, 'eng')
        buf = u''
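# A minimal usage sketch for the no-HMM path: in recent jieba releases,
# jieba.posseg.cut(sentence, HMM=False) routes Chinese spans through a
# __cut_DAG_NO_HMM generator like the one above. The sentence is illustrative
# only and the snippet assumes Python 3 with jieba installed.
import jieba.posseg as pseg

for w in pseg.cut('他来到了网易杭研大厦', HMM=False):
    print(w.word, w.flag)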
def cutc(self, content):
    # The original body was a bare `jieba.calc()` call, which raises a TypeError;
    # the DAG and route arguments below are an assumed completion.
    route = {}
    jieba.calc(content, jieba.get_DAG(content), route)
    return route
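# A small sketch of the two jieba internals all of the functions above lean on
# (recent jieba releases re-export them at module level as jieba.get_DAG and
# jieba.calc): get_DAG maps each character index to the list of possible word-end
# indices, and calc fills `route` so that route[x][1] is the end index of the best
# word starting at x, which is why the generators slice with route[x][1] + 1.
# The sentence is illustrative only.
import jieba

sentence = '我爱北京天安门'
DAG = jieba.get_DAG(sentence)     # e.g. {0: [0], 1: [1], 2: [2, 3], ...}
route = {}
jieba.calc(sentence, DAG, route)  # route[x] == (path log-probability, best end index)
print(DAG)
print(route)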