Example #1
    def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
        """对一段文本进行分词,返回list类型的分词结果

        Keyword arguments:
        lower                  -- 是否将单词小写(针对英文)
        use_stop_words         -- 若为True,则利用停止词集合来过滤(去掉停止词)
        use_speech_tags_filter -- 是否基于词性进行过滤。若为True,则使用self.default_speech_tag_filter过滤。否则,不过滤。
        """
        text = util.as_text(text)
        jieba_result = pseg.cut(text)

        if use_speech_tags_filter:
            jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
        else:
            jieba_result = list(jieba_result)

        # Remove special symbols (jieba tags punctuation and other non-words as 'x')
        word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
        word_list = [word for word in word_list if len(word) > 0]

        if lower:
            word_list = [word.lower() for word in word_list]

        if use_stop_words:
            word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

        return word_list
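
A minimal standalone sketch of the same pipeline, using jieba's part-of-speech tagger directly; the tag whitelist and stop words below are illustrative assumptions, not the class defaults:

import jieba.posseg as pseg

ALLOWED_TAGS = {'n', 'nr', 'ns', 'v', 'vn'}  # assumed POS whitelist
STOP_WORDS = {'的', '了', '和'}               # assumed stop-word set

def segment_demo(text, lower=True):
    # pseg.cut yields pairs with .word (the token) and .flag (its POS tag)
    words = [p.word.strip() for p in pseg.cut(text)
             if p.flag in ALLOWED_TAGS and p.flag != 'x']
    if lower:
        words = [w.lower() for w in words]
    return [w for w in words if w and w not in STOP_WORDS]

print(segment_demo('我爱自然语言处理'))

The order of operations mirrors the method above: POS filtering first, then symbol and empty-token removal, lowercasing, and finally stop-word removal.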
Example #2
    def segment(self, text, lower=False):
        '''Segment text into sentences and word lists at three filtering levels.

        :param text: the text to segment
        :param lower: whether to lowercase words (applies to English)
        :return: util.AttrDict with sentences, words_no_filter, words_no_stop_words, and words_all_filters
        '''
        text = util.as_text(text)
        sentences = self.ss.segment(text)
        words_no_filter = self.ws.segment_sentences(sentences=sentences,
                                                    lower=lower,
                                                    use_stop_words=False,
                                                    use_speech_tags_filter=False)
        words_no_stop_words = self.ws.segment_sentences(sentences=sentences,
                                                        lower=lower,
                                                        use_stop_words=True,
                                                        use_speech_tags_filter=False)

        words_all_filters = self.ws.segment_sentences(sentences=sentences,
                                                      lower=lower,
                                                      use_stop_words=True,
                                                      use_speech_tags_filter=True)

        return util.AttrDict(
            sentences=sentences,
            words_no_filter=words_no_filter,
            words_no_stop_words=words_no_stop_words,
            words_all_filters=words_all_filters
        )
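
The return value relies on util.AttrDict, a dict whose keys are also readable as attributes. A minimal sketch of such a container (the real util.AttrDict may differ in detail):

class AttrDict(dict):
    """A dict whose keys can also be accessed as attributes."""
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self  # attribute lookups and key lookups share storage

result = AttrDict(sentences=['第一句', '第二句'])
print(result.sentences is result['sentences'])  # True: same object either way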
Example #3
    def segment(self, text, lower=False):
        '''Segment text into sentences only.

        :param text: the text to segment
        :param lower: accepted for interface compatibility; unused here
        :return: util.AttrDict holding the sentence list
        '''
        text = util.as_text(text)
        sentences = self.ss.segment(text)
        return util.AttrDict(sentences=sentences)
Example #4
    def segment(self, text):
        """Split text into sentences using every delimiter in self.delimiters."""
        res = [util.as_text(text)]

        util.debug(res)
        util.debug(self.delimiters)

        for sep in self.delimiters:
            # Fragments from the previous pass become the input for this one.
            text, res = res, []
            for seq in text:
                res += seq.split(sep)
        # Drop empty fragments and surrounding whitespace.
        res = [s.strip() for s in res if len(s.strip()) > 0]
        return res
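
The loop rewrites the fragment list once per delimiter: each pass re-splits every fragment produced by the previous pass. A self-contained sketch of the same technique with assumed delimiters:

def split_by_delimiters(text, delimiters=('。', '!', '?')):  # assumed delimiters
    res = [text]
    for sep in delimiters:
        text, res = res, []          # previous results become the new input
        for seq in text:
            res += seq.split(sep)
    return [s.strip() for s in res if s.strip()]

print(split_by_delimiters('今天天气好。我们出去玩!好吗?'))
# ['今天天气好', '我们出去玩', '好吗']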
Example #5
    def __init__(self, stop_words_file=None, allow_speech_tags=util.allow_speech_tags):
        """
        Keyword arguments:
        stop_words_file    -- 保存停止词的文件路径,utf8编码,每行一个停止词。若不是str类型,则使用默认的停止词
        allow_speech_tags  -- 词性列表,用于过滤
        """

        allow_speech_tags = [util.as_text(item) for item in allow_speech_tags]

        self.default_speech_tag_filter = allow_speech_tags
        self.stop_words = set()
        self.stop_words_file = get_default_stop_words_file()
        if isinstance(stop_words_file, str):
            self.stop_words_file = stop_words_file
        with codecs.open(self.stop_words_file, 'r', 'utf-8', 'ignore') as f:
            for word in f:
                self.stop_words.add(word.strip())
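
The stop-word loading step in isolation, as a runnable sketch; 'stopwords.txt' is a placeholder path:

import codecs

stop_words = set()
# The 'ignore' error handler drops undecodable bytes instead of raising.
with codecs.open('stopwords.txt', 'r', 'utf-8', 'ignore') as f:  # assumed path
    for line in f:
        word = line.strip()
        if word:                     # skip blank lines
            stop_words.add(word)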
Example #6
    def __init__(self, delimiters=util.sentence_delimiters):
        """
        Keyword arguments:
        delimiters -- iterable of delimiters used to split sentences
        """
        self.delimiters = {util.as_text(item) for item in delimiters}
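
A hedged sketch of what this constructor sets up; as_text below is a minimal stand-in for util.as_text, and the delimiter values are typical choices for Chinese text rather than the library's actual defaults:

def as_text(item):
    # Minimal stand-in: normalize bytes/str to text
    return item.decode('utf-8') if isinstance(item, bytes) else str(item)

delimiters = ['。', '!', '?', ';', '…', '\n']  # assumed values
delimiter_set = {as_text(d) for d in delimiters}
print(delimiter_set)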