def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False): """对一段文本进行分词,返回list类型的分词结果 Keyword arguments: lower -- 是否将单词小写(针对英文) use_stop_words -- 若为True,则利用停止词集合来过滤(去掉停止词) use_speech_tags_filter -- 是否基于词性进行过滤。若为True,则使用self.default_speech_tag_filter过滤。否则,不过滤。 """ text = util.as_text(text) jieba_result = pseg.cut(text) if use_speech_tags_filter == True: jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter] else: jieba_result = [w for w in jieba_result] # 去除特殊符号 word_list = [w.word.strip() for w in jieba_result if w.flag != 'x'] word_list = [word for word in word_list if len(word) > 0] if lower: word_list = [word.lower() for word in word_list] if use_stop_words: word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words] return word_list
def segment(self, text, lower=False): ''' :param text: :param lower: :return: ''' text = util.as_text(text) sentences = self.ss.segment(text) words_no_filter = self.ws.segment_sentences(sentences=sentences, lower=lower, use_stop_words=False, use_speech_tags_filter=False) words_no_stop_words = self.ws.segment_sentences(sentences=sentences, lower=lower, use_stop_words=True, use_speech_tags_filter=False) words_all_filters = self.ws.segment_sentences(sentences=sentences, lower=lower, use_stop_words=True, use_speech_tags_filter=True) return util.AttrDict( sentences=sentences, words_no_filter=words_no_filter, words_no_stop_words=words_no_stop_words, words_all_filters=words_all_filters )
def segment(self, text, lower=False): ''' :param text: :param lower: :return: ''' text = util.as_text(text) sentences = self.ss.segment(text) return util.AttrDict(sentences=sentences, )
def segment(self, text): res = [util.as_text(text)] util.debug(res) util.debug(self.delimiters) for sep in self.delimiters: text, res = res, [] for seq in text: res += seq.split(sep) res = [s.strip() for s in res if len(s.strip()) > 0] return res
def __init__(self, stop_words_file=None, allow_speech_tags=util.allow_speech_tags): """ Keyword arguments: stop_words_file -- 保存停止词的文件路径,utf8编码,每行一个停止词。若不是str类型,则使用默认的停止词 allow_speech_tags -- 词性列表,用于过滤 """ allow_speech_tags = [util.as_text(item) for item in allow_speech_tags] self.default_speech_tag_filter = allow_speech_tags self.stop_words = set() self.stop_words_file = get_default_stop_words_file() if type(stop_words_file) is str: self.stop_words_file = stop_words_file for word in codecs.open(self.stop_words_file, 'r', 'utf-8', 'ignore'): self.stop_words.add(word.strip())
def __init__(self, delimiters=util.sentence_delimiters): """ Keyword arguments: delimiters -- 可迭代对象,用来拆分句子 """ self.delimiters = set([util.as_text(item) for item in delimiters])