def split_zh_en (zh_en_str):

    tokenizer = Tokenizer()
    mark = {"en":1, "zh":2}
    zh_en_group = []
    zh_set = []
    en_set = []
    status = ""
    en = ""
    zh = ""
    for c in zh_en_str:
        if tokenizer.is_zh(c):
            if status == 'en':
                zh_en_group.append ([mark["en"], ''.join(en_set)])
                en += ''.join(en_set)
                en_set = []
            zh_set.append(c)
            status = 'zh'
        else:
            if status == 'zh':
                zh_en_group.append ([mark["zh"], ''.join(zh_set)])
                zh += ''.join(zh_set)
                zh_set = []
            en_set.append(c)
            status = 'en'
        
    if en_set:
        zh_en_group.append ([mark["en"], ''.join(en_set)])
        en += ''.join(en_set)
    elif zh_set:
        zh_en_group.append ([mark["zh"], ''.join(zh_set)])
        zh += ''.join(zh_set)
    if en == "":
        print 'error'

    return zh_en_group, en, zh
Esempio n. 2
0
def split_zh_en(zh_en_str):

    tokenizer = Tokenizer()
    mark = {"en": 1, "zh": 2}
    zh_en_group = []
    zh_set = []
    en_set = []
    status = ""
    en = ""
    zh = ""
    for c in zh_en_str:
        if tokenizer.is_zh(c):
            if status == 'en':
                zh_en_group.append([mark["en"], ''.join(en_set)])
                en += ''.join(en_set)
                en_set = []
            zh_set.append(c)
            status = 'zh'
        else:
            if status == 'zh':
                zh_en_group.append([mark["zh"], ''.join(zh_set)])
                zh += ''.join(zh_set)
                zh_set = []
            en_set.append(c)
            status = 'en'

    if en_set:
        zh_en_group.append([mark["en"], ''.join(en_set)])
        en += ''.join(en_set)
    elif zh_set:
        zh_en_group.append([mark["zh"], ''.join(zh_set)])
        zh += ''.join(zh_set)
    if en == "":
        print 'error'

    return zh_en_group, en, zh