Example #1
def record_boundary_sentences(sentences, text):
    sentence_start = 0

    text = re.sub(r'\n+', ' ', text)
    sentences = [re.sub(r'\n+', ' ', sentence) for sentence in sentences]

    for i, sentence in enumerate(sentences):
        boundary = re.search(r'^\s+', text[sentence_start + len(sentence):])

        if boundary is None:
            boundary = ''
        else:
            boundary = boundary.group()

        sentences[i] = wordless_text.Wordless_Token(sentences[i],
                                                    boundary=boundary)

        sentence_start += len(sentence) + len(boundary)

    return sentences
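
# The boundary bookkeeping above is easier to see in isolation. The sketch
# below is a minimal, self-contained re-implementation of the same idea: a
# hypothetical Token class stands in for wordless_text.Wordless_Token, and the
# newline normalization is omitted. It is an illustration, not Wordless code.
import re

class Token(str):
    """A str subclass that remembers the whitespace following it in the text."""
    def __new__(cls, text, boundary=''):
        token = super().__new__(cls, text)
        token.boundary = boundary
        return token

def record_boundaries(sentences, text):
    start = 0
    tokens = []

    for sentence in sentences:
        # Whitespace (if any) between this sentence and the next one
        match = re.search(r'^\s+', text[start + len(sentence):])
        boundary = match.group() if match else ''

        tokens.append(Token(sentence, boundary=boundary))
        start += len(sentence) + len(boundary)

    return tokens

text = 'First sentence.  Second sentence.'
for token in record_boundaries(['First sentence.', 'Second sentence.'], text):
    print(repr(str(token)), repr(token.boundary))
# 'First sentence.' '  '
# 'Second sentence.' ''
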
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           flat_tokens=True):
    tokens_hierarchical = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wordless_text_utils.check_word_tokenizers(
            main, lang=lang, word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_tokenizers(main,
                                             lang=lang,
                                             word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(
                    treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append(
                    [token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_hierarchical.append(
                [token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_hierarchical.append(
                        [token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_hierarchical.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_hierarchical.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='rus')

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(
            main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(
                str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [
            token.strip() for token in sentence if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary='', sentence_ending=True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary=' ', sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_hierarchical):
            tokens_hierarchical[i] = wordless_clause_tokenize(
                main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_hierarchical
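
# For orientation, a minimal, self-contained sketch of the shape of the result
# for the NLTK branch, with plain NLTK calls standing in for the Wordless
# plumbing (no main object, no Wordless_Token boundaries, no clause
# tokenization). It assumes nltk and its 'punkt' sentence model are installed;
# tokenize_eng is a hypothetical helper, not part of Wordless.
import nltk

def tokenize_eng(text, flat_tokens=True):
    treebank_tokenizer = nltk.TreebankWordTokenizer()

    # One token list per sentence, analogous to tokens_hierarchical above
    tokens_hierarchical = [
        treebank_tokenizer.tokenize(sentence)
        for sentence in nltk.sent_tokenize(text)
    ]

    if flat_tokens:
        # Flattened view, analogous to tokens_flat above
        return [token
                for sentence in tokens_hierarchical
                for token in sentence]
    else:
        return tokens_hierarchical

text = 'This is the first sentence. And this is the second one.'
print(tokenize_eng(text))                     # flat list of tokens
print(tokenize_eng(text, flat_tokens=False))  # per-sentence token lists
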
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    wordless_text_utils.check_word_tokenizers(main,
                                              lang=lang,
                                              word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [
                itertools.chain.from_iterable(tokens_sentences)
            ]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append(
                    [token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])

    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.append(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_gmd.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_pos.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr(
                'pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text for token in
                    main.pybo_tokenizer_tsikchen.tokenize(sentence)
                ])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [
            token.strip() for token in tokens if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary='',
                                                          sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary=' ',
                                                          sentence_ending=True)

    return tokens_sentences
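
# The most involved part of both versions above is the character-level handling
# of mixed Han and non-Han text. The sketch below shows the core splitting idea
# in isolation: is_han is a simplified, hypothetical stand-in for
# wordless_checking_unicode.is_han (it only checks the CJK Unified Ideographs
# block), and non-Han runs are kept whole instead of being re-tokenized via
# wordless_word_tokenize.
def is_han(char):
    return '\u4e00' <= char <= '\u9fff'

def split_han(sentence):
    tokens = []
    non_han = []

    for char in sentence:
        if is_han(char):
            # Flush any pending non-Han run, then emit the Han character itself
            if non_han:
                tokens.append(''.join(non_han))
                non_han = []

            tokens.append(char)
        else:
            non_han.append(char)

    if non_han:
        tokens.append(''.join(non_han))

    return tokens

print(split_han('汉语English测试'))
# ['汉', '语', 'English', '测', '试']
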
Example #4
                                                             word_detokenizer = word_detokenizer)

    print(f"\t{text}")

sentence_zho_cn = '汉语,又称中文、华文、唐话[2],或被视为汉藏语系汉语族下之语言,或被视为语族。'
sentence_zho_tw = '漢語,又稱中文、華文、唐話[2],或被視為漢藏語系漢語族下之語言,或被視為語族。'
sentence_eng = 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[4][5]'
sentence_jpn = '使用人口について正確な統計はないが、日本国内の人口、および日本国外に住む日本人や日系人、日本がかつて統治した地域の一部住民など、約1億3千万人以上と考えられている[7]。'
sentence_tha = 'ภาษาไทย หรือ ภาษาไทยกลาง เป็นภาษาราชการและภาษาประจำชาติของประเทศไทย'
sentence_bod = '༄༅། །རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྻ་ཨ་བ་ཏ་ར། བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ། །སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ། །བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང༌། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །'

tokens_zho_cn = ['汉语', ',', '又称', '中文', '、', '华文', '、', '唐话', '[', '2', ']', ',', '或', '被', '视为', '汉藏语系', '汉语', '族', '下', '之', '语言', ',', '或', '被', '视为', '语族', '。']
tokens_zho_tw = ['漢語', ',', '又', '稱', '中文', '、', '華文', '、', '唐話', '[', '2', ']', ',', '或', '被', '視為', '漢藏語', '系漢', '語族', '下', '之', '語言', ',', '或', '被', '視為', '語族', '。']
tokens_eng = ['English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua', 'franca.[4][5', ']']
tokens_jpn = ['使用', '人口', 'に', 'つい', 'て', '正確', 'な', '統計', 'は', 'ない', 'が', '、', '日本', '国', '内', 'の', '人口', '、', 'および', '日本', '国', '外', 'に', '住む', '日本', '人', 'や', '日系', '人', '、', '日本', 'が', 'かつて', '統治', 'し', 'た', '地域', 'の', '一部', '住民', 'など', '、', '約', '1', '億', '3千', '万', '人', '以上', 'と', '考え', 'られ', 'て', 'いる', '[', '7', ']', '。']
tokens_tha = [wordless_text.Wordless_Token('ภาษาไทย', boundary = ' '), wordless_text.Wordless_Token('หรือ', boundary = ' '), 'ภาษาไทย', wordless_text.Wordless_Token('กลาง', boundary = ' '), 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศไทย']
tokens_bod = ['༄༅། །', 'རྒྱ་གར་','སྐད་', 'དུ', '།', 'བོ་དྷི་སཏྭ་', 'ཙརྻ་', 'ཨ་བ་ཏ་ར', '།', 'བོད་སྐད་', 'དུ', '།', 'བྱང་ཆུབ་སེམས་དཔ', 'འི་', 'སྤྱོད་པ་', 'ལ་', 'འཇུག་པ', '། །', 'སངས་རྒྱས་', 'དང་', 'བྱང་ཆུབ་སེམས་དཔའ་', 'ཐམས་ཅད་', 'ལ་', 'ཕྱག་', 'འཚལ་', 'ལོ', '། །', 'བདེ་གཤེགས་', 'ཆོས་', 'ཀྱི་', 'སྐུ་', 'མངའ་', 'སྲས་', 'བཅས་', 'དང༌', '། །', 'ཕྱག་འོས་', 'ཀུན་', 'ལ', 'འང་', 'གུས་པ', 'ར་', 'ཕྱག་', 'འཚལ་', 'ཏེ', '། །', 'བདེ་གཤེགས་', 'སྲས་', 'ཀྱི་', 'སྡོམ་', 'ལ་', 'འཇུག་པ་', 'ནི', '། །', 'ལུང་', 'བཞིན་', 'མདོར་བསྡུས་ན', 'ས་', 'ནི་', 'བརྗོད་པ', 'ར་', 'བྱ', '། །']

testing_word_detokenize(lang = 'zho_cn',
                        word_detokenizer = 'Wordless - Chinese Word Detokenizer')
testing_word_detokenize(lang = 'zho_tw',
                        word_detokenizer = 'Wordless - Chinese Word Detokenizer')

testing_word_detokenize(lang = 'eng',
                        word_detokenizer = 'NLTK - Penn Treebank Detokenizer')
testing_word_detokenize(lang = 'eng',
                        word_detokenizer = 'Sacremoses - Moses Detokenizer')

testing_word_detokenize(lang = 'jpn',
                        word_detokenizer = 'Wordless - Japanese Word Detokenizer')