def split_zh_en(zh_en_str):
    """Split a string into consecutive runs of "zh" and "en" characters.

    Each character is classified with ``Tokenizer().is_zh`` (presumably
    "is this a Chinese character" -- confirm against the Tokenizer class).
    Characters for which it is truthy are "zh"; every other character,
    including digits, spaces and punctuation, is "en".

    NOTE(review): a second definition of ``split_zh_en`` appears later in
    this file and replaces this one at import time; consider removing one.

    Args:
        zh_en_str: The string (or any iterable of characters) to split.

    Returns:
        A 3-tuple ``(zh_en_group, en, zh)``:
          * ``zh_en_group`` -- list of ``[mark, text]`` pairs in input order,
            where ``mark`` is 1 for an "en" run and 2 for a "zh" run;
          * ``en`` -- all "en" runs concatenated;
          * ``zh`` -- all "zh" runs concatenated.

    Side effects:
        Prints ``error`` to stdout when the input has no "en" characters
        (this includes the empty string). The result is still returned.
    """
    # Function-scope import so this block does not depend on the file's
    # import section.
    from itertools import groupby

    tokenizer = Tokenizer()
    mark = {"en": 1, "zh": 2}

    zh_en_group = []
    en_runs = []
    zh_runs = []

    # bool() normalises whatever is_zh returns, so groupby only ever sees
    # two distinct keys and adjacent characters of one kind form one run.
    for is_zh, chars in groupby(zh_en_str,
                                key=lambda c: bool(tokenizer.is_zh(c))):
        run = ''.join(chars)
        if is_zh:
            zh_en_group.append([mark["zh"], run])
            zh_runs.append(run)
        else:
            zh_en_group.append([mark["en"], run])
            en_runs.append(run)

    # Join once at the end instead of growing the strings with += per run.
    en = ''.join(en_runs)
    zh = ''.join(zh_runs)

    if en == "":
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print('error')
    return zh_en_group, en, zh
def split_zh_en(zh_en_str):
    """Split a string into consecutive runs of "zh" and "en" characters.

    Each character is classified with ``Tokenizer().is_zh`` (presumably
    "is this a Chinese character" -- confirm against the Tokenizer class).
    Characters for which it is truthy are "zh"; every other character,
    including digits, spaces and punctuation, is "en".

    NOTE(review): an earlier definition of ``split_zh_en`` exists in this
    file; this later one is the definition that takes effect at import
    time. Consider removing the duplicate.

    Args:
        zh_en_str: The string (or any iterable of characters) to split.

    Returns:
        A 3-tuple ``(zh_en_group, en, zh)``:
          * ``zh_en_group`` -- list of ``[mark, text]`` pairs in input order,
            where ``mark`` is 1 for an "en" run and 2 for a "zh" run;
          * ``en`` -- all "en" runs concatenated;
          * ``zh`` -- all "zh" runs concatenated.

    Side effects:
        Prints ``error`` to stdout when the input has no "en" characters
        (this includes the empty string). The result is still returned.
    """
    # Function-scope import so this block does not depend on the file's
    # import section.
    from itertools import groupby

    tokenizer = Tokenizer()
    mark = {"en": 1, "zh": 2}

    zh_en_group = []
    en_runs = []
    zh_runs = []

    # bool() normalises whatever is_zh returns, so groupby only ever sees
    # two distinct keys and adjacent characters of one kind form one run.
    for is_zh, chars in groupby(zh_en_str,
                                key=lambda c: bool(tokenizer.is_zh(c))):
        run = ''.join(chars)
        if is_zh:
            zh_en_group.append([mark["zh"], run])
            zh_runs.append(run)
        else:
            zh_en_group.append([mark["en"], run])
            en_runs.append(run)

    # Join once at the end instead of growing the strings with += per run.
    en = ''.join(en_runs)
    zh = ''.join(zh_runs)

    if en == "":
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print('error')
    return zh_en_group, en, zh