def seg(hans):
    """Split ``hans`` into segments, re-cutting Chinese runs with mmseg.

    The input is first split by ``simple_seg``. Each resulting chunk is
    then either kept as-is or cut further with ``mmseg.seg.cut``.

    :param hans: text to segment; passed straight to ``simple_seg``.
    :return: list of segments in their original order.
    """
    pieces = []
    for chunk in simple_seg(hans):
        # A chunk is only cut a second time when it matches RE_HANS (i.e.
        # it has pinyin) and the phrase dictionary is enabled.  Chunks
        # without pinyin, or any chunk when the phrase dictionary is
        # disabled, are passed through untouched.
        if RE_HANS.match(chunk) and PHRASES_DICT:
            pieces.extend(mmseg.seg.cut(chunk))
        else:
            pieces.append(chunk)
    return pieces
def seg(hans):
    """Split ``hans`` into segments, using jieba when it can be imported.

    The jieba module is imported lazily on first use and cached on the
    function object as ``seg.jieba``.  If the import fails, the flag
    ``seg.no_jieba`` is set and this call (and every later one) returns
    the plain ``simple_seg`` result.

    :param hans: text to segment; passed straight to ``simple_seg``.
    :return: the ``simple_seg`` result when jieba is unavailable,
        otherwise a list in which every chunk matching ``RE_HANS`` has
        been replaced by the items produced by ``jieba.cut``.
    """
    # A previous call already found that jieba is not installed.
    if getattr(seg, 'no_jieba', None):
        return simple_seg(hans)

    # Read the cache with a default so the function also works when no
    # caller has initialised ``seg.jieba = None`` beforehand.
    jieba_mod = getattr(seg, 'jieba', None)
    if jieba_mod is None:
        try:
            import jieba as jieba_mod
        except ImportError:
            # Remember the failure so the import is not retried per call.
            seg.no_jieba = True
            return simple_seg(hans)
        seg.jieba = jieba_mod

    ret = []
    for x in simple_seg(hans):
        if RE_HANS.match(x):
            ret.extend(jieba_mod.cut(x))
        else:
            # Characters without pinyin are not segmented a second time.
            ret.append(x)
    return ret
def tag_pinyin(txt):
    """Pair every piece of ``txt`` with its pinyin, or ``None``.

    ``txt`` is split with ``simple_seg``.  Chunks matching ``RE_HANS``
    are expanded character by character into ``(char, pinyin)`` tuples
    using ``lazy_pinyin``.  Every other chunk is split around the listed
    punctuation marks (the marks themselves are kept as separate pieces)
    and each non-empty piece becomes ``(piece, None)``.

    :param txt: text to tag.
    :return: list of ``(text, pinyin_or_None)`` tuples in input order.
    """
    tagged = []
    for segment in simple_seg(txt):
        if not RE_HANS.match(segment):
            # The capturing group makes re.split keep the delimiters;
            # empty strings produced by the split are dropped.
            pieces = re.split(r'([,。?!?,])', segment)
            tagged.extend((piece, None) for piece in pieces if piece)
            continue
        tagged.extend(zip(segment, lazy_pinyin(segment)))
    return tagged
def _pinyin(words, style, heteronym, errors, strict=True):
    """Convert ``words`` to pinyin, handling chunks without pinyin.

    :param words: text to convert.
    :param style: forwarded to ``phrase_pinyin``.
    :param heteronym: forwarded to ``phrase_pinyin``.
    :param errors: forwarded to ``phrase_pinyin`` and ``handle_nopinyin``.
    :param strict: forwarded to ``phrase_pinyin``.
    :return: the ``phrase_pinyin`` result when ``words`` matches
        ``RE_HANS``; otherwise a list built chunk by chunk.
    """
    # Initial filter: text that matches RE_HANS goes straight to the
    # phrase converter.
    if RE_HANS.match(words):
        return phrase_pinyin(words, style=style, heteronym=heteronym,
                             errors=errors, strict=strict)

    result = []
    for chunk in simple_seg(words):
        if RE_HANS.match(chunk):
            result.extend(
                _pinyin(chunk, style, heteronym, errors, strict=strict))
            continue
        # Chunks without pinyin are delegated to the error handler; a
        # falsy result means the chunk is dropped from the output.
        fallback = handle_nopinyin(chunk, errors=errors)
        if fallback:
            result.append(fallback)
    return result
def _pinyin(words, style, heteronym, errors):
    """Convert ``words`` to pinyin, handling chunks without pinyin.

    :param words: text to convert.
    :param style: forwarded to ``phrases_pinyin``.
    :param heteronym: forwarded to ``phrases_pinyin``.
    :param errors: forwarded to ``phrases_pinyin`` and ``handle_nopinyin``.
    :return: the ``phrases_pinyin`` result when ``words`` matches
        ``RE_HANS``; otherwise a list built chunk by chunk.
    """
    # Initial filter: text that matches RE_HANS goes straight to the
    # phrase converter.
    if RE_HANS.match(words):
        return phrases_pinyin(words, style=style, heteronym=heteronym,
                              errors=errors)

    result = []
    for chunk in simple_seg(words):
        if RE_HANS.match(chunk):
            result.extend(_pinyin(chunk, style, heteronym, errors))
            continue
        # Chunks without pinyin are delegated to the error handler; a
        # falsy result means the chunk is dropped from the output.
        fallback = handle_nopinyin(chunk, errors=errors)
        if fallback:
            result.append(fallback)
    return result
def test_simple_seg():
    """Check ``simple_seg`` against a table of (input, expected) pairs.

    The cases cover pure Chinese text, mixed Chinese / non-Chinese text,
    list input, and text containing whitespace and punctuation runs.
    """
    cases = [
        ('啦啦', ['啦啦']),
        ('啦啦abc', ['啦啦', 'abc']),
        ('&##啦啦abc', ['&##', '啦啦', 'abc']),
        ('&#哦#啦啦abc', ['&#', '哦', '#', '啦啦', 'abc']),
        ('哦ほ#', ['哦', 'ほ#']),
        # List input is accepted as well as plain strings.
        (['啦啦'], ['啦啦']),
        (['啦啦', 'abc'], ['啦啦', 'abc']),
        ('哦ほ#哪', ['哦', 'ほ#', '哪']),
        ('哦ほ#哪#', ['哦', 'ほ#', '哪', '#']),
        # Whitespace stays attached to the non-Chinese run it belongs to.
        ('你好啊 --', ['你好啊', ' --']),
        ('啊 -- ', ['啊', ' -- ']),
        ('你好啊 -- 那', ['你好啊', ' -- ', '那']),
        ('啊 -- 你好那 ', ['啊', ' -- ', '你好那', ' ']),
        ('a 你好啊 -- 那 ', ['a ', '你好啊', ' -- ', '那', ' ']),
        ('a啊 -- 你好那 ', ['a', '啊', ' -- ', '你好那', ' ']),
    ]
    for text, expected in cases:
        assert simple_seg(text) == expected