Example #1
def __cut_DAG(sentence):
    # Walk the most probable route through the word DAG, tagging each word.
    DAG = jieba.get_DAG(sentence)
    route = {}

    jieba.calc(sentence, DAG, 0, route=route)

    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            # Collect consecutive single characters into a buffer.
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = u''
                else:
                    # Multi-character buffers are re-cut by __cut_detail.
                    recognized = __cut_detail(buf)
                    for t in recognized:
                        yield t
                    buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y

    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        else:
            recognized = __cut_detail(buf)
            for t in recognized:
                yield t
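
For context, __cut_DAG above is an internal generator from jieba's posseg module: it walks the best route through the word DAG, buffers runs of single characters, and hands multi-character buffers to __cut_detail for finer segmentation. End users normally reach it through the public jieba.posseg.cut API; a minimal usage sketch (the sample sentence is illustrative, and the tags depend on the loaded dictionary and models):

import jieba.posseg as pseg

# posseg.cut yields pair objects exposing .word and .flag (the POS tag)
for p in pseg.cut(u'我爱北京天安门'):
    print(p.word, p.flag)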
Example #2
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    buf = u''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = u''
                else:
                    recognized = __cut(buf)
                    for t in recognized:
                        yield t
                    buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y

    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        else:
            recognized = __cut(buf)
            for t in recognized:
                yield t
    def get_cut_all(self, sentence, min_length=1):
        _dag = jieba.get_DAG(sentence)
        _n = len(sentence)
        result = []

        for _idx in range(_n):
            if _idx == 0 or sentence[_idx - 1] == self.split_character:
                _dag_list = _dag[_idx]
                for __x in _dag_list:
                    _word = sentence[_idx:__x + 1]
                    if _word.count(self.split_character) >= min_length:
                        result.append(_word)

        return result
    def __cut_DAG_NO_HMM(self, sentence):
        _DAG = jieba.get_DAG(sentence)
        # print('[__cut_DAG_NO_HMM] sentence: ', sentence)
        # print('[__cut_DAG_NO_HMM] DAG: ', _DAG)

        my_route = self.get_route(sentence, _DAG)
        # print('[__cut_DAG_NO_HMM] my_route: ', my_route)

        if len(my_route) > 1:
            _tmp_freq = 0
            _list = []

            for _ in my_route:
                if _['freq'] > _tmp_freq:
                    _tmp_freq = _['freq']
                    _list = _['list']

            # print('[__cut_DAG_NO_HMM] max freq list: ', _list)
            if _list:
                for __ in _list:
                    yield __
            else:
                print('[__cut_DAG_NO_HMM] sentence: ', sentence)
                print('[__cut_DAG_NO_HMM] my_route: ', my_route)

        else:
            route = {}
            jieba.calc(sentence, _DAG, route)

            x = 0
            N = len(sentence)
            buf = ''
            while x < N:
                y = route[x][1] + 1
                l_word = sentence[x:y]
                if self.re_eng.match(l_word) and len(l_word) == 1:
                    buf += l_word
                    x = y
                else:
                    if buf:
                        yield buf
                        buf = ''
                    yield l_word
                    x = y
            if buf:
                yield buf
                buf = ''
Example #5
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}

    jieba.calc(sentence, DAG, 0, route=route)

    x = 0
    buf = u""
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if len(buf) > 0:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, "x"))
                    buf = u""
                else:
                    if buf not in jieba.FREQ:
                        recognized = __cut_detail(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield pair(elem, word_tag_tab.get(elem, "x"))
                    buf = u""
            yield pair(l_word, word_tag_tab.get(l_word, "x"))
        x = y

    if len(buf) > 0:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, "x"))
        else:
            if buf not in jieba.FREQ:
                recognized = __cut_detail(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield pair(elem, word_tag_tab.get(elem, "x"))
Example #6
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}

    jieba.calc(sentence, DAG, 0, route=route)

    x = 0
    buf = ''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if buf:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                    buf = ''
                else:
                    if (buf not in jieba.FREQ):
                        recognized = __cut_detail(buf)
                        for t in recognized:
                            yield t
                    else:
                        for elem in buf:
                            yield pair(elem, word_tag_tab.get(elem, 'x'))
                    buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y

    if buf:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        elif (buf not in jieba.FREQ):
            recognized = __cut_detail(buf)
            for t in recognized:
                yield t
        else:
            for elem in buf:
                yield pair(elem, word_tag_tab.get(elem, 'x'))
Example #7
def __cut_DAG_NO_HMM(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, route)
    x = 0
    N = len(sentence)
    buf = ''
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if re_eng1.match(l_word):
            buf += l_word
            x = y
        else:
            if buf:
                yield pair(buf, 'eng')
                buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
            x = y
    if buf:
        yield pair(buf, 'eng')
        buf = ''
Example #9
def __cut_DAG(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}

    jieba.calc(sentence, DAG, route)

    x = 0
    buf = ''
    N = len(sentence)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if y - x == 1:
            buf += l_word
        else:
            if buf:
                if len(buf) == 1:
                    yield pair(buf, word_tag_tab.get(buf, 'x'))
                elif not jieba.FREQ.get(buf):
                    recognized = __cut_detail(buf)
                    for t in recognized:
                        yield t
                else:
                    for elem in buf:
                        yield pair(elem, word_tag_tab.get(elem, 'x'))
                buf = ''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
        x = y

    if buf:
        if len(buf) == 1:
            yield pair(buf, word_tag_tab.get(buf, 'x'))
        elif not jieba.FREQ.get(buf):
            recognized = __cut_detail(buf)
            for t in recognized:
                yield t
        else:
            for elem in buf:
                yield pair(elem, word_tag_tab.get(elem, 'x'))
Example #10
def __cut_DAG_NO_HMM(sentence):
    DAG = jieba.get_DAG(sentence)
    route = {}
    jieba.calc(sentence, DAG, 0, route=route)
    x = 0
    N = len(sentence)
    buf = u''
    re_eng = re.compile(u'[a-zA-Z0-9]', re.U)
    while x < N:
        y = route[x][1] + 1
        l_word = sentence[x:y]
        if re_eng.match(l_word) and len(l_word) == 1:
            # Merge consecutive ASCII letters/digits into one 'eng' token.
            buf += l_word
            x = y
        else:
            if len(buf) > 0:
                yield pair(buf, 'eng')
                buf = u''
            yield pair(l_word, word_tag_tab.get(l_word, 'x'))
            x = y
    if len(buf) > 0:
        yield pair(buf, 'eng')
        buf = u''
    def get_DAG(self, sentence):
        return jieba.get_DAG(sentence)
Example #13
#coding=utf-8
import jieba

#query = "在北京举行的庆祝新中国成立70周年湖南专场新闻发布会"

query = "景区非常好,酒店设施非常新,温泉池非常舒服"

#query = "希望香港社会反对暴力、守护法治"
seg_list = jieba.cut(query, cut_all=True)
print("all: " + "/ ".join(seg_list))

seg_list = jieba.cut(query, cut_all=False, HMM=False)
print("sec: " + "/ ".join(seg_list))

seg_list = jieba.cut(query, cut_all=False, HMM=True)
print("HMM: " + "/ ".join(seg_list))

seg_list = jieba.cut_for_search(query, HMM=True)
print("search: " + "/ ".join(seg_list))

query = "设置股票预警"
seg_list = jieba.cut(query, cut_all=True)
print("all: " + "/ ".join(seg_list))
print(jieba.get_DAG(query))
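
As the last line above shows, jieba.get_DAG returns a dict keyed by character position: each value lists the end indices of dictionary words (or the lone character itself) that start at that position. A small sketch of inspecting it, assuming the default dictionary is loaded; the exact indices depend on the dictionary in use:

import jieba

query = "设置股票预警"
dag = jieba.get_DAG(query)
# Each entry k: [e1, e2, ...] means query[k:e + 1] is a candidate word.
for k, ends in sorted(dag.items()):
    print(k, [query[k:e + 1] for e in ends])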
Example #14
    # (Snippet starts mid-loop: `key` and `value` come from iterating a
    #  tag -> word-list dict, presumably tag_dict_freq.items().)
    new_value = []
    for v in value:
        # Append an artificial frequency of 10 ** len(word) to each entry.
        new_value.append(v + ' ' + str(10 ** len(v)))
    tag_dict_freq[key] = new_value

# Flatten the per-tag word lists into one list of dictionary lines for jieba.
dict_for_jieba = [leaf for branch in list(tag_dict_freq.values()) for leaf in branch]
ftag = open(path + 'sentence_components_dict.txt', 'w')
ftag.write('\n'.join(dict_for_jieba))
ftag.close()

# DAG - directed acyclic graph
import jieba
jieba.set_dictionary(dictionary_path='.\\rule\\sentence_components_dict.txt')
jieba.get_dict_file()
sentence = '两个人上下摞起来是什么字'
jieba.get_DAG(sentence)
list(jieba.cut(sentence, cut_all=False, HMM=False))
# ['两个', '人', '上', '下', '摞起来', '是', '什么字']
list(jieba.cut(sentence, cut_all=False, HMM=True))
# ['两个', '人上', '下', '摞起来', '是', '什么字']
list(jieba.cut(sentence, cut_all=True, HMM=False))
# ['两个', '人', '上', '下', '摞起来', '是', '什么字']


# Maximum matching method
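The comment above was left as a stub; a minimal sketch of forward maximum matching against a plain word set (the demo word set and max_len are illustrative assumptions, not part of the original script):

def forward_max_match(sentence, word_set, max_len=4):
    # Greedily take the longest word in word_set that starts at the current
    # position, falling back to a single character when nothing matches.
    result, i = [], 0
    while i < len(sentence):
        matched = sentence[i]
        for j in range(min(len(sentence), i + max_len), i + 1, -1):
            if sentence[i:j] in word_set:
                matched = sentence[i:j]
                break
        result.append(matched)
        i += len(matched)
    return result

# forward_max_match('两个人上下摞起来是什么字', {'两个', '上下', '什么'})
# -> ['两个', '人', '上下', '摞', '起', '来', '是', '什么', '字']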


# Minimum segmentation