def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            # Buffer single characters so adjacent ones can be re-segmented
            # together by __cut_detail.
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = u''
                else:
                    recognized = __cut_detail(buf)
                    for t in recognized:
                        yield t
                    buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y
    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        else:
            recognized = __cut_detail(buf)
            for t in recognized:
                yield t
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = u''
                else:
                    recognized = __cut(buf)
                    for t in recognized:
                        yield t
                    buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y
    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        else:
            recognized = __cut(buf)
            for t in recognized:
                yield t
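# The two __cut_DAG variants above are jieba.posseg internals; callers reach
# them through the public jieba.posseg.cut API. A minimal usage sketch (the
# example sentence is arbitrary; tags depend on the loaded dictionary):
import jieba.posseg as pseg

for w in pseg.cut("设置股票预警"):
    print(w.word, w.flag)  # each yielded item is a pair(word, flag)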
def get_cut_all(self, sentence, min_length=1):
    _dag = jieba.get_DAG(sentence)
    _n = len(sentence)
    result = []
    for _idx in range(_n):
        # Only start candidates at the sentence head or right after a
        # split character.
        if _idx == 0 or sentence[_idx - 1] == self.split_character:
            _dag_list = _dag[_idx]
            for __x in _dag_list:
                _word = sentence[_idx:__x + 1]
                if _word.count(self.split_character) >= min_length:
                    result.append(_word)
    return result
def __cut_DAG_NO_HMM(self, sentence):
    _DAG = jieba.get_DAG(sentence)
    # print('[__cut_DAG_NO_HMM] sentence: ', sentence)
    # print('[__cut_DAG_NO_HMM] DAG: ', _DAG)
    my_route = self.get_route(sentence, _DAG)
    # print('[__cut_DAG_NO_HMM] my_route: ', my_route)
    if len(my_route) > 1:
        _tmp_freq = 0
        _list = []
        for _ in my_route:
            if _['freq'] > _tmp_freq:
                _tmp_freq = _['freq']
                _list = _['list']
        # print('[__cut_DAG_NO_HMM] max freq list: ', _list)
        if _list:
            for __ in _list:
                yield __
        else:
            print('[__cut_DAG_NO_HMM] sentence: ', sentence)
            print('[__cut_DAG_NO_HMM] my_route: ', my_route)
    else:
        route = {}
        jieba.calc(sentence, _DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if self.re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''
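# self.get_route is not shown here. Judging only from how __cut_DAG_NO_HMM
# consumes my_route, each candidate appears to be a dict carrying an aggregate
# 'freq' and a token 'list'. A hypothetical illustration of that shape (the
# keys come from the caller above; the values are invented for the example):
my_route = [
    {'freq': 1200, 'list': ['设置', '股票', '预警']},
    {'freq': 300, 'list': ['设', '置', '股票预警']},
]
best = max(my_route, key=lambda r: r['freq'])['list']  # mirrors the max-freq loop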
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u""
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, "x"))
                    buf = u""
                else:
                    if buf not in jieba.FREQ:
                        recognized = __cut_detail(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield pair(elem, word_tag_tab.get(elem, "x"))
                    buf = u""
            yield pair(l_word, word_tag_tab.get(l_word, "x"))
        x = y
    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, "x"))
        else:
            if buf not in jieba.FREQ:
                recognized = __cut_detail(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield pair(elem, word_tag_tab.get(elem, "x"))
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    buf = ''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if buf:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = ''
                else:
                    if buf not in jieba.FREQ:
                        recognized = __cut_detail(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield pair(elem, word_tag_tab.get(elem, 'x'))
                    buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y
    if buf:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        elif buf not in jieba.FREQ:
            recognized = __cut_detail(buf)
            for t in recognized:
                yield t
        else:
            for elem in buf:
                yield pair(elem, word_tag_tab.get(elem, 'x'))
def __cut_DAG_NO_HMM(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, route)
    x = 0
    N = len(sentence)
    buf = ''
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if re_eng1.match(l_word):
            # Merge consecutive ASCII alphanumeric tokens into one 'eng' token.
            buf += l_word
            x = y
        else:
            if buf:
                yield pair(buf, 'eng')
                buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
            x = y
    if buf:
        yield pair(buf, 'eng')
        buf = ''
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, route)
    x = 0
    buf = ''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if buf:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                elif not jieba.FREQ.get(buf):
                    recognized = __cut_detail(buf)
                    for t in recognized:
                        yield t
                else:
                    for elem in buf:
                        yield pair(elem, word_tag_tab.get(elem, 'x'))
                buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y
    if buf:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        elif not jieba.FREQ.get(buf):
            recognized = __cut_detail(buf)
            for t in recognized:
                yield t
        else:
            for elem in buf:
                yield pair(elem, word_tag_tab.get(elem, 'x'))
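# Every __cut_DAG variant above yields pair objects from jieba.posseg.
# A minimal stand-in with the same two fields, for readers without the
# source at hand (the real class adds __unicode__/encode helpers):
class pair(object):
    def __init__(self, word, flag):
        self.word = word  # the segmented token
        self.flag = flag  # its part-of-speech tag, e.g. 'x' for unknown

    def __repr__(self):
        return '%s/%s' % (self.word, self.flag)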
def __cut_DAG_NO_HMM(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    N = len(sentence)
    buf = u''
    re_eng = re.compile(ur'[a-zA-Z0-9]', re.U)  # ur'' literal: Python 2 only
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if re_eng.match(l_word) and len(l_word) == 1:
            buf += l_word
            x = y
        else:
            if len(buf) > 0:
                yield pair(buf, 'eng')
                buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
            x = y
    if len(buf) > 0:
        yield pair(buf, 'eng')
        buf = u''
def get_DAG(self, sentence):
    return jieba.get_DAG(sentence)
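# For context on what the wrapper returns: jieba.get_DAG maps each start
# index k to the end indices i for which sentence[k:i+1] is a dictionary
# word (single characters always qualify). A quick, runnable sketch:
import jieba

sentence = "设置股票预警"
dag = jieba.get_DAG(sentence)
for k, ends in dag.items():
    print(k, [sentence[k:i + 1] for i in ends])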
# coding=utf-8
import jieba

# query = "在北京举行的庆祝新中国成立70周年湖南专场新闻发布会"
query = "景区非常好,酒店设施非常新,温泉池非常舒服"
# query = "希望香港社会反对暴力、守护法治"

seg_list = jieba.cut(query, cut_all=True)
print("all: " + "/ ".join(seg_list))

seg_list = jieba.cut(query, cut_all=False, HMM=False)
print("sec: " + "/ ".join(seg_list))

seg_list = jieba.cut(query, cut_all=False, HMM=True)
print("HMM: " + "/ ".join(seg_list))

seg_list = jieba.cut_for_search(query, HMM=True)
print("search: " + "/ ".join(seg_list))

query = "设置股票预警"
seg_list = jieba.cut(query, cut_all=True)
print("all: " + "/ ".join(seg_list))
print(jieba.get_DAG(query))
# Append a synthetic frequency (10 ** word length) to each dictionary entry.
# The enclosing loop header is assumed from context: the fragment reads key
# and value from tag_dict_freq and writes the result back per key.
for key, value in tag_dict_freq.items():
    new_value = []
    for v in value:
        new_value.append(v + ' ' + str(10 ** len(v)))
    tag_dict_freq[key] = new_value

dict_for_jieba = [leaf for branch in list(tag_dict_freq.values()) for leaf in branch]
ftag = open(path + 'sentence_components_dict.txt', 'w')
ftag.write('\n'.join(dict_for_jieba))
ftag.close()

# DAG - directed acyclic graph
import jieba

jieba.set_dictionary(dictionary_path='.\\rule\\sentence_components_dict.txt')
jieba.get_dict_file()

sentence = '两个人上下摞起来是什么字'
jieba.get_DAG(sentence)
list(jieba.cut(sentence, cut_all=False, HMM=False))
# ['两个', '人', '上', '下', '摞起来', '是', '什么字']
list(jieba.cut(sentence, cut_all=False, HMM=True))
# ['两个', '人上', '下', '摞起来', '是', '什么字']
list(jieba.cut(sentence, cut_all=True, HMM=False))
# ['两个', '人', '上', '下', '摞起来', '是', '什么字']

# Maximum matching method
# Minimal segmentation
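# The closing comments name the maximum matching method without implementing
# it. A minimal forward-maximum-matching sketch; the toy dictionary and the
# max_len value are assumptions for illustration, not jieba's algorithm:
def fmm_cut(sentence, dictionary, max_len=4):
    result, i = [], 0
    while i < len(sentence):
        # Try the longest window first and shrink until a dictionary hit;
        # a single character always matches, so the scan always advances.
        for j in range(min(len(sentence), i + max_len), i, -1):
            word = sentence[i:j]
            if j - i == 1 or word in dictionary:
                result.append(word)
                i = j
                break
    return result

print(fmm_cut('两个人上下摞起来是什么字', {'两个', '起来', '什么'}))
# ['两个', '人', '上', '下', '摞', '起来', '是', '什么', '字']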