def topinyin(s):
    """
    s consists entirely of Chinese characters (hanzi)
    """
    s = util.as_text(s)
    py_list = pypinyin.lazy_pinyin(s)
    result = []
    for py in py_list:
        py = util.as_text(py)
        if py == '〇':
            result.append('ling')
        else:
            result.append(util.simplify_pinyin(py))

    return result
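# Usage sketch (not part of the original example; assumes pypinyin and the
# util helpers referenced above are importable). The output is illustrative.
if __name__ == '__main__':
    print(topinyin('拼音输入法'))   # e.g. ['pin', 'yin', 'shu', 'ru', 'fa']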
def read_from_sentence_txt(start, emission, transition):
    ## ./result/sentence.txt
    print('read from sentence.txt')
    for line in open(SENTENCE_FILE):
        line = util.as_text(line.strip())
        if len(line) < 2:
            continue
        if not util.is_chinese(line):
            continue

        ## for start
        start.setdefault(line[0], 0)
        start[line[0]] += 1

        ## for emission
        pinyin_list = topinyin(line)
        char_list = [c for c in line]

        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += 1

        ## for transition
        for f, t in zip(line[:-1], line[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += 1
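# Hypothetical driver (not in the original source): accumulate the HMM counts
# from SENTENCE_FILE into plain dicts, assuming SENTENCE_FILE and util exist
# as above.
#   start[hanzi]            -> how often hanzi starts a sentence
#   emission[hanzi][pinyin] -> how often hanzi is read as pinyin
#   transition[prev][next]  -> how often next follows prev
start, emission, transition = {}, {}, {}
read_from_sentence_txt(start, emission, transition)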
Example #3
    def segment(self, pic, pipe, text, lower=False, use_jieba=False):
        text = util.as_text(text)
        sentences = self.ss.segment(
            text)  # split the text into sentences on end-of-sentence delimiters
        words_no_filter = self.ws.segment_sentences(
            pic,
            pipe,
            sentences=sentences,
            lower=lower,
            use_stop_words=False,
            use_speech_tags_filter=False,
            use_jieba=use_jieba)
        words_no_stop_words = self.ws.segment_sentences(
            pic,
            pipe,
            sentences=sentences,
            lower=lower,
            use_stop_words=True,
            use_speech_tags_filter=False,
            use_jieba=use_jieba)

        words_all_filters = self.ws.segment_sentences(
            pic,
            pipe,
            sentences=sentences,
            lower=lower,
            use_stop_words=True,
            use_speech_tags_filter=True,
            use_jieba=use_jieba)

        return util.AttrDict(sentences=sentences,
                             words_no_filter=words_no_filter,
                             words_no_stop_words=words_no_stop_words,
                             words_all_filters=words_all_filters)
Example #4
def gen_emission():
    """
    base_emission   = {} #>   {'泥': {'ni':1.0}, '了':{'liao':0.5, 'le':0.5}}
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    for line in open(HZ2PY_FILE):
        line = util.as_text(line.strip())
        hanzi, pinyin_list = line.split('=')
        pinyin_list = [
            util.simplify_pinyin(item.strip())
            for item in pinyin_list.split(',')
        ]

        char_list = [hanzi] * len(pinyin_list)
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0.)
            emission[hanzi][pinyin] += 1.

    for hanzi in emission:
        num_sum = 0.
        for pinyin in emission[hanzi]:
            num_sum += emission[hanzi][pinyin]
        for pinyin in emission[hanzi]:
            emission[hanzi][pinyin] = round(
                math.log(emission[hanzi][pinyin] / num_sum), 6)
    data['default'] = round(math.log(1.e-200), 6)
    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
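# Worked illustration (not from the original source): per-hanzi counts are
# normalized and stored as natural-log probabilities, e.g. for counts
# {'le': 3, 'liao': 1} the stored values are log(3/4) and log(1/4).
import math
print(round(math.log(3 / 4.), 6), round(math.log(1 / 4.), 6))  # -0.287682 -1.386294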
Example #5
    def segment(self, text, lower = True, use_stop_words = True, use_speech_tags_filter = False):
        """对一段文本进行分词,返回list类型的分词结果

        Keyword arguments:
        lower                  -- 是否将单词小写(针对英文)
        use_stop_words         -- 若为True,则利用停止词集合来过滤(去掉停止词)
        use_speech_tags_filter -- 是否基于词性进行过滤。若为True,则使用self.default_speech_tag_filter过滤。否则,不过滤。    
        """
        text = util.as_text(text)
        jieba_result = pseg.cut(text)
        
        if use_speech_tags_filter:
            jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        # drop punctuation and other special symbols (POS flag 'x')
        word_list = [w.word.strip() for w in jieba_result if w.flag!='x']
        word_list = [word for word in word_list if len(word)>0]
        
        if lower:
            word_list = [word.lower() for word in word_list]

        if use_stop_words:
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

        return word_list
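# Usage sketch (hypothetical; `ws` stands for an instance of the word
# segmentation class this method belongs to):
#   words = ws.segment('这是一个测试句子', lower=True, use_stop_words=True)
#   print(words)   # a list of words, with stop words removed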
Example #6
    def segment(self, text):
        res = [util.as_text(text)]

        for sep in self.delimiters:
            text, res = res, []
            for seq in text:
                res += seq.split(sep)
        res = [s.strip() for s in res if len(s.strip()) > 0]
        return res
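# Standalone illustration of the delimiter-splitting loop above (the sample
# sentence and delimiter set are made up, not from the original source):
delimiters = {'。', '!', '?'}
res = ['今天天气很好。我们去公园吧!']
for sep in delimiters:
    text, res = res, []
    for seq in text:
        res += seq.split(sep)
print([s.strip() for s in res if len(s.strip()) > 0])  # ['今天天气很好', '我们去公园吧']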
Example #7
def topinyin(s):
    """
    s consists entirely of Chinese characters (hanzi)
    """
    s = util.as_text(s)
    py_list = PinyinHelper.convertToPinyinFromSentence(s)
    result = []
    for py in py_list:
        py = util.as_text(py)
        if py == '〇':
            result.append('ling')
        else:
            result.append(util.simplify_pinyin(py))

    if ',' in ''.join(result):
        print(s)
        print(''.join(result))
        sys.exit()
    return result
Example #8
 def __init__(self, stop_words_file=None, allowed_speech_tags=allowed_speech_tags):
     allowed_speech_tags = [util.as_text(item)
                            for item in allowed_speech_tags]
     self.default_tag_filter = allowed_speech_tags
     self.stop_words = set()
     self.stop_words_file = get_default_stop_words_file()
     if type(stop_words_file) is str:
         self.stop_words_file = stop_words_file
     for word in codecs.open(self.stop_words_file, 'r', 'utf-8', 'ignore'):
         self.stop_words.add(word.strip())
Example #9
def segment(self, text):
    res = [util.as_text(text)]  # the sentence list that will be returned

    util.debug(res)
    util.debug(self.delimiters)

    for sep in self.delimiters:
        text, res = res, []
        for seq in text:
            res += seq.split(sep)  # split() cuts the string on the given delimiter
    res = [s.strip() for s in res if len(s.strip()) > 0]
    return res
Example #10
def process_hanzipinyin(emission):
    ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    for line in open(HANZI2PINYIN_FILE):
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        hanzi, pinyins = line.split('=')
        pinyins = pinyins.split(',')
        pinyins = [util.simplify_pinyin(py) for py in pinyins]
        for pinyin in pinyins:
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += 1
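# Minimal in-memory illustration of the parsing above, with made-up lines
# (the HANZI2PINYIN_FILE format is inferred as "<hanzi>=<pinyin1>,<pinyin2>,..."):
emission = {}
for line in ['了=le,liao', '泥=ni']:
    hanzi, pinyins = line.split('=')
    for pinyin in pinyins.split(','):
        emission.setdefault(hanzi, {}).setdefault(pinyin, 0)
        emission[hanzi][pinyin] += 1
print(emission)  # {'了': {'le': 1, 'liao': 1}, '泥': {'ni': 1}}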
Example #11
def gen_py2hz():
    data = {}
    for line in open(PY2HZ_FILE):
        line = util.as_text(line.strip())
        ls = line.split('=')
        if len(ls) != 2:
            raise Exception('invalid format')
        py, chars = ls
        py = py.strip()
        chars = chars.strip()
        if len(py) > 0 and len(chars) > 0:
            data[py] = chars

    writejson2file(data, FIN_PY2HZ_FILE)
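# Minimal illustration with made-up lines (the PY2HZ_FILE format is inferred
# from the parser above: "<pinyin>=<candidate hanzi>"):
data = {}
for line in ['zhong=中种重众', 'a=啊阿']:
    py, chars = line.split('=')
    if len(py.strip()) > 0 and len(chars.strip()) > 0:
        data[py.strip()] = chars.strip()
print(data)  # {'zhong': '中种重众', 'a': '啊阿'}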
Example #12
def extract_chinese_sentences(content):
    content = util.as_text(content)
    content = content.replace(' ', '')
    content = content.replace('\t', '')
    sentences = []
    s = ''
    for c in content:
        if util.is_chinese(c):
            s += c
        else:
            sentences.append(s)
            s = ''
    sentences.append(s)

    return [s.strip() for s in sentences if len(s.strip()) > 1]
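# Usage sketch (not from the original source; assumes util.is_chinese accepts
# only hanzi): any non-hanzi character acts as a boundary and fragments
# shorter than two characters are dropped.
print(extract_chinese_sentences('你好world今天天气好,测试'))
# expected: ['你好', '今天天气好', '测试']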
Example #13
    def __init__(self, stop_words_file = None, allow_speech_tags = util.allow_speech_tags):
        """
        Keyword arguments:
        stop_words_file    -- path to a stop-word file (UTF-8, one stop word per line); if it is not a str, the default stop-word file is used
        allow_speech_tags  -- list of part-of-speech tags used for filtering
        """
        
        allow_speech_tags = [util.as_text(item) for item in allow_speech_tags]

        self.default_speech_tag_filter = allow_speech_tags
        self.stop_words = set()
        self.stop_words_file = get_default_stop_words_file()
        if type(stop_words_file) is str:
            self.stop_words_file = stop_words_file
        for word in codecs.open(self.stop_words_file, 'r', 'utf-8', 'ignore'):
            self.stop_words.add(word.strip())
Example #14
    def keywords(self, text, n):
        text = text.replace('\n', '')
        text = text.replace('\r', '')
        text = utils.as_text(text)
        tokens = utils.cut_sentences(text)
        sentences, sents = utils.psegcut_filter_words(tokens,
                                                      self.__stop_words,
                                                      self.__use_stopword)

        word_index, index_word, words_number = self.build_vocab(sents)
        graph = self.create_graph(sents, words_number,
                                  word_index, window=self.__window)
        scores = utils.weight_map_rank(graph, max_iter=self.__max_iter,
                                       tol=self.__tol)
        sent_selected = nlargest(n, zip(scores, count()))
        sent_index = []
        for i in range(n):
            sent_index.append(sent_selected[i][1])
        return [index_word[i] for i in sent_index]
Example #15
def read_from_word_txt(start, emission, transition):
    ## ! refinement based on word.txt
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    for line in open(WORD_FILE):
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        if len(line) < 3:
            continue
        ls = line.split('=')
        if len(ls) != 2:
            continue
        word, num = ls
        word = word.strip()
        num = num.strip()
        if len(num) == 0:
            continue
        num = float(num)
        num = max(_min_value, num / _base)

        if not util.is_chinese(word):
            continue

        ## for start
        start.setdefault(word[0], 0)
        start[word[0]] += num

        ## for emission
        pinyin_list = topinyin(word)
        char_list = [c for c in word]
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += num

        ## for transition
        for f, t in zip(word[:-1], word[1:]):
            transition.setdefault(f, {})
            transition[f].setdefault(t, 0)
            transition[f][t] += num
Example #16
    def get_word_list(self, text, lower=True, strip_stop_words=True, use_tag_filter=False):
        text = util.as_text(text)
        jieba_result = pseg.cut(text)

        if use_tag_filter:
            jieba_result = [
                w for w in jieba_result if w.flag in self.default_tag_filter]
        else:
            jieba_result = [w for w in jieba_result]

        word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
        word_list = [word for word in word_list if len(word) > 0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if strip_stop_words:
            word_list = [word.strip()
                         for word in word_list if word.strip() not in self.stop_words]

        return word_list
Example #17
    def segment(self, text, lower = False):
        text = util.as_text(text)
        sentences = self.ss.segment(text)
        words_no_filter = self.ws.segment_sentences(sentences=sentences, 
                                                    lower = lower, 
                                                    use_stop_words = False,
                                                    use_speech_tags_filter = False)
        words_no_stop_words = self.ws.segment_sentences(sentences=sentences, 
                                                    lower = lower, 
                                                    use_stop_words = True,
                                                    use_speech_tags_filter = False)

        words_all_filters = self.ws.segment_sentences(sentences=sentences, 
                                                    lower = lower, 
                                                    use_stop_words = True,
                                                    use_speech_tags_filter = True)

        return util.AttrDict(
                    sentences           = sentences, 
                    words_no_filter     = words_no_filter, 
                    words_no_stop_words = words_no_stop_words, 
                    words_all_filters   = words_all_filters
                )
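# Usage sketch (hypothetical; `seg` stands for an instance of the class this
# method belongs to). The returned AttrDict exposes all three views:
#   result = seg.segment('这是第一句。这是第二句!', lower=True)
#   result.sentences            # the sentence list
#   result.words_no_filter      # words per sentence, no filtering
#   result.words_all_filters    # stop words and POS filtering applied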
Example #18
 def summarize(self, text):
     text = text.replace('\n', '')
     text = text.replace('\r', '')
     text = util.as_text(text)  # normalize the text encoding
     tokens = util.cut_sentences(text)
     # sentences keeps the article's original sentences; sents is used for the computations
     sentences, sents = util.cut_filter_words(tokens, self.__stop_words,
                                              self.__use_stopword)
     if self.__use_w2v:
         sents = self.filter_dictword(sents)
     graph = self.create_graph_sentence(sents, self.__use_w2v)
     scores = util.weight_map_rank(graph, self.__max_iter, self.__tol)
     num = len(scores)
     n = 1 if num > 1 else num
     sent_selected = nlargest(n, zip(scores, count()))
     sent_index = []
     for i in range(n):
         sent_index.append(sent_selected[i][1])  # index of the selected sentence in the original article
     return [sentences[i] for i in sent_index if sentences[i] != '']
Example #19
import sys
import codecs
def segment(self, text):
    res = [util.as_text(text)]  # the sentence list that will be returned

    util.debug(res)
    util.debug(self.delimiters)

    for sep in self.delimiters:
        text, res = res, []
        for seq in text:
            res += seq.split(sep)  # split() cuts the string on the given delimiter
    res = [s.strip() for s in res if len(s.strip()) > 0]
    return res

text = codecs.open('../test/doc/01.txt', 'r', 'utf-8').read()
delimiters = set([util.as_text(item) for item in util.sentence_delimiters])
res = [util.as_text(text)]


for sep in delimiters:
    text, res = res, []
    for seq in text:  # the input article already contains newlines, so it is already partially split
        res += seq.split(sep)  # split() 通过指定分隔符对字符串进行切片
        #print(res)
res = [s.strip() for s in res if len(s.strip()) > 0]  # drop empty pieces left by newlines, giving a clean list of sentences




Example #20
    def segment(self,
                pic,
                pipe,
                text,
                lower=True,
                use_stop_words=True,
                use_speech_tags_filter=False,
                use_jieba=False):
        """对一段文本进行分词,返回list类型的分词结果

        Keyword arguments:
        lower                  -- 是否将单词小写(针对英文)
        use_stop_words         -- 若为True,则利用停止词集合来过滤(去掉停止词)
        use_speech_tags_filter -- 是否基于词性进行过滤。若为True,则使用self.default_speech_tag_filter过滤。否则,不过滤。    
        """
        text = util.as_text(text).split(",")
        if len(text) > 0:
            text = text[0]
        if use_jieba:
            jieba_result = pseg.cut(text)
            if use_speech_tags_filter:
                jieba_result = [
                    w for w in jieba_result
                    if w.flag in self.default_speech_tag_filter
                ]
            else:
                jieba_result = [w for w in jieba_result]

            # drop punctuation and other special symbols (POS flag 'x')
            word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
            word_list = [word for word in word_list if len(word) > 0]
        else:
            word2id_c, id2tag_c, word2id_p, id2tag_p, word2id_n, id2tag_n, zy = pic
            cws, pos = pipe.analyze(text, word2id_c, id2tag_c, zy, word2id_p,
                                    id2tag_p, word2id_n, id2tag_n)
            pos = [pos[i][1] for i in range(len(pos))]
            if use_speech_tags_filter:
                cws = [
                    cws[i] for i in range(len(pos))
                    if pos[i] in self.default_speech_tag_filter
                ]
                pos = [
                    pos[i] for i in range(len(pos))
                    if pos[i] in self.default_speech_tag_filter
                ]

            word_list = [
                cws[i].strip() for i in range(len(pos)) if pos[i] != 'x'
            ]
            word_list = [word for word in word_list if len(word) > 0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if use_stop_words:
            word_list = [
                word.strip() for word in word_list
                if word.strip() not in self.stop_words
            ]

        return word_list
Example #21
import sys

try:
    # Python 2-only: make utf-8 the default string encoding
    sys.setdefaultencoding('utf-8')
except:
    pass

SOURCE_FILE = '../data/train/original/hanzipinyin.txt'

ALL_STATES_FILE = '../data/train/result/all_states.txt'  # hanzi (hidden states)
ALL_OBSERVATIONS_FILE = '../data/train/result/all_observations.txt'  # pinyin (observations)
PINYIN2HANZI_FILE = '../data/train/result/pinyin2hanzi.txt'

states = set()
observations = set()
py2hz = {}

for line in open(SOURCE_FILE):
    line = util.as_text(line.strip())
    hanzi, pinyin_list = line.split('=')
    pinyin_list = [
        util.simplify_pinyin(item.strip()) for item in pinyin_list.split(',')
    ]

    states.add(hanzi)

    for pinyin in pinyin_list:
        observations.add(pinyin)
        py2hz.setdefault(pinyin, set())
        py2hz[pinyin].add(hanzi)
        # initial consonant (shengmu)
        shengmu = util.get_shengmu(pinyin)
        if shengmu is not None:
            py2hz.setdefault(shengmu, set())
            py2hz[shengmu].add(hanzi)
Example #22
 def __init__(self, delimiters=util.sentence_delimiters):
     """
     Keyword arguments:
     delimiters -- iterable of delimiter characters used to split text into sentences
     """
     self.delimiters = set([util.as_text(item) for item in delimiters])
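# Usage sketch (hypothetical class name; util.sentence_delimiters is assumed
# to contain the usual Chinese end-of-sentence marks such as '。' and '!'):
#   ss = SentenceSegmentation()
#   '。' in ss.delimiters   # True if '。' is among util.sentence_delimiters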