    def run(self):
        preview_results = []

        preview_lang = self.main.settings_custom['pos_tagging']['preview_lang']
        preview_samples = self.main.settings_custom['pos_tagging'][
            'preview_samples']

        for line in preview_samples.split('\n'):
            line = line.strip()

            if line:
                tokens = wl_word_tokenization.wl_word_tokenize(
                    self.main, line, lang=preview_lang)
                tokens = list(wl_misc.flatten_list(tokens))

                tokens_tagged = wl_pos_tagging.wl_pos_tag(
                    self.main,
                    tokens,
                    lang=preview_lang,
                    pos_tagger=self.pos_tagger,
                    tagset=self.tagset)

                preview_results.append(' '.join(
                    [f'{token}_{tag}' for token, tag in tokens_tagged]))
            else:
                preview_results.append('')

        self.worker_done.emit(preview_samples, preview_results)
    def run(self):
        preview_results = []

        preview_lang = self.main.settings_custom['word_tokenization'][
            'preview_lang']
        preview_samples = self.main.settings_custom['word_tokenization'][
            'preview_samples']

        for line in preview_samples.split('\n'):
            line = line.strip()

            if line:
                tokens = wl_word_tokenization.wl_word_tokenize(
                    self.main,
                    line,
                    lang=preview_lang,
                    word_tokenizer=self.word_tokenizer)
                tokens = wl_misc.flatten_list(tokens)

                # Vietnamese
                if preview_lang == 'vie':
                    tokens = [re.sub(r'\s+', r'_', token) for token in tokens]

                preview_results.append(' '.join(tokens))
            else:
                preview_results.append('')

        self.worker_done.emit(preview_samples, preview_results)
    def run(self):
        preview_results = []

        preview_lang = self.main.settings_custom['lemmatization'][
            'preview_lang']
        preview_samples = self.main.settings_custom['lemmatization'][
            'preview_samples']

        for line in preview_samples.split('\n'):
            line = line.strip()

            if line:
                tokens = wl_word_tokenization.wl_word_tokenize(
                    self.main, line, lang=preview_lang)
                tokens = wl_misc.flatten_list(tokens)

                lemmas = wl_lemmatization.wl_lemmatize(
                    self.main,
                    tokens,
                    lang=preview_lang,
                    lemmatizer=self.lemmatizer)

                text = wl_word_detokenization.wl_word_detokenize(
                    self.main, lemmas, lang=preview_lang)

                preview_results.append(text)
            else:
                preview_results.append('')

        self.worker_done.emit(preview_samples, preview_results)
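
The three worker methods above all finish by emitting a worker_done signal. As a hedged sketch only (the class, slot, and signal signature below are assumptions, not Wordless's actual classes), this is how such a signal could be declared and consumed with PyQt5:

from PyQt5.QtCore import QObject, pyqtSignal

class Preview_Worker(QObject):
    # Signal signature matching the two-argument emit() calls above:
    # (preview_samples, preview_results)
    worker_done = pyqtSignal(str, list)

    def __init__(self, main):
        super().__init__()
        self.main = main

    # run() would look like one of the preview methods above

def update_preview(preview_samples, preview_results):
    # Hypothetical slot: print each sample line next to its processed preview
    for sample, result in zip(preview_samples.splitlines(), preview_results):
        print(f'{sample} -> {result}')

# worker = Preview_Worker(main)
# worker.worker_done.connect(update_preview)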
Example #4
    def tokenize_text(self, text):
        if text:
            tokens = wl_word_tokenization.wl_word_tokenize(self.main,
                                                           text,
                                                           lang=self.lang)

            self.tags.extend([[]] * len(list(wl_misc.flatten_list(tokens))))
Example #5
def get_counts(main, text):
    # Count of sentences
    if 'count_sentences' not in text.__dict__:
        text.words_multilevel = []

        for para in text.tokens_multilevel:
            text.words_multilevel.append([])

            for sentence in para:
                text.words_multilevel[-1].append([
                    token for token in sentence
                    if wl_checking_tokens.is_word_alphanumeric(token)
                ])

        text.sentences = [
            sentence for para in text.words_multilevel for sentence in para
        ]
        text.count_sentences = len(text.sentences)

    # Count of words with at least one letter or numeral
    if 'count_words' not in text.__dict__:
        text.words_flat = list(wl_misc.flatten_list(text.words_multilevel))
        text.count_words = len(text.words_flat)

    # Count of syllables
    if 'count_syls' not in text.__dict__:
        text.syls_words = wl_syl_tokenization.wl_syl_tokenize(main,
                                                              text.words_flat,
                                                              lang=text.lang)
        text.count_syls = sum([len(syls) for syls in text.syls_words])

    # Count of characters
    if 'count_chars_all' not in text.__dict__:
        text.count_chars_all = 0
        text.count_chars_alphanumeric = 0
        text.count_chars_alphabetic = 0

        for token in text.words_flat:
            for char in token:
                text.count_chars_all += 1

                if char.isalpha():
                    text.count_chars_alphanumeric += 1
                    text.count_chars_alphabetic += 1
                elif char.isalnum():
                    text.count_chars_alphanumeric += 1
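
A brief usage sketch follows; it assumes a Wl_Main instance main and a prepared text object as in the surrounding examples, and derives simple averages only from the attributes get_counts sets.

# Hypothetical usage: `main` and `text` are assumed to exist already.
get_counts(main, text)

words_per_sentence = (
    text.count_words / text.count_sentences if text.count_sentences else 0)
syls_per_word = (
    text.count_syls / text.count_words if text.count_words else 0)

print(f'Sentences: {text.count_sentences}')
print(f'Words per sentence: {words_per_sentence:.2f}')
print(f'Syllables per word: {syls_per_word:.2f}')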
Example #6
    def run(self):
        preview_results = []

        preview_lang = self.main.settings_custom['word_tokenization']['preview_lang']
        preview_samples = self.main.settings_custom['word_tokenization']['preview_samples']

        tokens_multilevel = wl_word_tokenization.wl_word_tokenize(
            main = self.main,
            text = preview_samples,
            lang = preview_lang,
            word_tokenizer = self.word_tokenizer
        )

        for para in tokens_multilevel:
            tokens = wl_misc.flatten_list(para)

            # Vietnamese
            if preview_lang == 'vie':
                tokens = [re.sub(r'\s+', r'_', token) for token in tokens]

            preview_results.append(' '.join(tokens))

        self.worker_done.emit(preview_results)
Example #7
class Wl_Text:
    def __init__(self, main, file):
        self.main = main
        self.lang = file['lang']
        self.tokenized = file['tokenized']
        self.tagged = file['tagged']

        self.offsets_paras = []
        self.offsets_sentences = []

        self.tokens_multilevel = []
        self.tokens_flat = []
        self.tags = []

        file_ext = os.path.splitext(file['path'])[1].lower()
        re_tags = re.compile(
            wl_matching.get_re_tags(self.main, tag_type='body'))
        re_tags_start = re.compile(
            fr"\s*({wl_matching.get_re_tags(self.main, tag_type = 'body')})")

        if (file_ext == '.txt'
                # Treat untagged XML files as untagged text files
                or (file_ext == '.xml' and not self.tagged)):
            with open(file['path'],
                      'r',
                      encoding=file['encoding'],
                      errors='replace') as f:
                text = f.read()

            # Untokenized & Untagged
            if not self.tokenized and not self.tagged:
                tokens = wl_word_tokenization.wl_word_tokenize(self.main,
                                                               text,
                                                               lang=self.lang)

                self.tokens_multilevel.extend(tokens)
            # Untokenized & Tagged
            elif not self.tokenized and self.tagged:
                # Replace all tags with a whitespace to ensure no words run together
                text_no_tags = re.sub(re_tags, ' ', text)

                tokens = wl_word_tokenization.wl_word_tokenize(self.main,
                                                               text_no_tags,
                                                               lang=self.lang)

                self.tokens_multilevel.extend(tokens)

                # Check if the first token in the text is a tag
                if re.match(re_tags_start, text):
                    # Check if the first paragraph is empty
                    if not self.tokens_multilevel[0]:
                        self.tokens_multilevel[0].append([])

                    self.tokens_multilevel[0][0].insert(0, '')
                    self.tags.append([])

                # Extract tags
                tag_end = 0

                for tag in re.finditer(re_tags, text):
                    self.add_tags_tokenization(text[tag_end:tag.start()])
                    self.tags[-1].append(tag.group())

                    tag_end = tag.end()

                # The last part of the text
                if (text := text[tag_end:]):
                    self.add_tags_tokenization(text)
            # Tokenized & Untagged
            elif self.tokenized and not self.tagged:
                for para in text.splitlines():
                    self.tokens_multilevel.append([])

                    if para:
                        for sentence in wl_sentence_tokenization.wl_sentence_split(
                                self.main, para):
                            self.tokens_multilevel[-1].append(sentence.split())
            # Tokenized & Tagged
            elif self.tokenized and self.tagged:
                for i, para in enumerate(text.splitlines()):
                    self.tokens_multilevel.append([])

                    if para:
                        # Replace all tags with a whitespace to ensure no words run together
                        text_no_tags = re.sub(re_tags, ' ', para)

                        for sentence in wl_sentence_tokenization.wl_sentence_split(
                                self.main, text_no_tags):
                            self.tokens_multilevel[-1].append(sentence.split())

                        # Check if the first token in the text is a tag
                        if i == 0 and re.match(re_tags_start, para):
                            # Check if the first paragraph is empty
                            if not self.tokens_multilevel[0]:
                                self.tokens_multilevel[0].append([])

                            self.tokens_multilevel[0][0].insert(0, '')

                            self.tags.append([])

                        # Extract tags
                        tag_end = 0

                        for tag in re.finditer(re_tags, para):
                            self.add_tags_splitting(para[tag_end:tag.start()])
                            self.tags[-1].append(tag.group())

                            tag_end = tag.end()

                        # The last part of the text
                        if (para := para[tag_end:]):
                            self.add_tags_splitting(para)

            # Add empty tags for untagged files
            if not self.tagged:
                self.tags.extend(
                    [[] for _ in wl_misc.flatten_list(self.tokens_multilevel)])
Example #8
                            self.tokens_multilevel[-1].append([])

                            for word in sentence.select(css_word):
                                self.tokens_multilevel[-1][-1].append(
                                    word.get_text().strip())
                # XML tags unfound or unspecified
                else:
                    text = soup.get_text()
                    tokens = wl_word_tokenization.wl_word_tokenize(
                        self.main, text, lang=self.lang)

                    self.tokens_multilevel.extend(tokens)

            # Add empty tags
            self.tags.extend(
                [[] for _ in wl_misc.flatten_list(self.tokens_multilevel)])

        # Paragraph and sentence offsets
        for para in self.tokens_multilevel:
            self.offsets_paras.append(len(self.tokens_flat))

            for sentence in para:
                self.offsets_sentences.append(len(self.tokens_flat))

                self.tokens_flat.extend(sentence)

        # Remove whitespace around all tags
        self.tags = [[tag.strip() for tag in tags] for tags in self.tags]

        # Remove Wl_Main object from the text since it cannot be pickled
        del self.main
Example #9
def wl_process_tokens(text, token_settings):
    main = text.main
    settings = copy.deepcopy(token_settings)

    # Token Settings
    if settings['use_tags']:
        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Punctuations
    if not settings['puncs']:
        i_tokens = 0

        # Mark tokens to be removed
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wl_checking_token.is_token_punc(token):
                            clause[i] = ''

                            text.tags_pos[i_tokens + i] = ''
                            text.tags_non_pos[i_tokens + i] = ''
                            text.tags_all[i_tokens + i] = ''

                    i_tokens += len(clause)

        # Remove punctuations
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = [token for token in clause if token]

        text.tags_pos = [tags for tags in text.tags_pos if tags != '']
        text.tags_non_pos = [tags for tags in text.tags_non_pos if tags != '']
        text.tags_all = [tags for tags in text.tags_all if tags != '']

    # Lemmatize all tokens
    if not settings['use_tags'] and settings['lemmatize_tokens']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = wl_lemmatization.wl_lemmatize(main,
                                                                clause,
                                                                lang=text.lang)

    # Treat as all lowercase
    if settings['treat_as_lowercase']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = [token.lower() for token in clause]

        text.tags_pos = [[tag.lower() for tag in tags]
                         for tags in text.tags_pos]
        text.tags_non_pos = [[tag.lower() for tag in tags]
                             for tags in text.tags_non_pos]
        text.tags_all = [[tag.lower() for tag in tags]
                         for tags in text.tags_all]

    # Words
    if settings['words']:
        # Lowercase
        if not settings['lowercase']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wl_checking_token.is_token_word_lowercase(
                                    token):
                                clause[i] = ''
        # Uppercase
        if not settings['uppercase']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wl_checking_token.is_token_word_uppercase(
                                    token):
                                clause[i] = ''
        # Title Case
        if not settings['title_case']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wl_checking_token.is_token_word_title_case(
                                    token):
                                clause[i] = ''
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wl_checking_token.is_token_word(token):
                            clause[i] = ''

    # Numerals
    if not settings['nums']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wl_checking_token.is_token_num(token):
                            clause[i] = ''

    # Filter stop words
    if settings['filter_stop_words']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = wl_stop_word_lists.wl_filter_stop_words(
                        main, clause, lang=text.lang)

    # Ignore tags
    i_token = 0

    if settings['ignore_tags']:
        # Ignore all tags
        if settings['ignore_tags_type'] == main.tr('all'):
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, [])
        # Ignore POS tags
        elif settings['ignore_tags_type'] == main.tr('POS'):
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, text.tags_non_pos[i_token + i])

                        i_token += len(clause)

        # Ignore non-POS tags
        elif settings['ignore_tags_type'] == main.tr('non-POS'):
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, text.tags_pos[i_token + i])

                        i_token += len(clause)
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = (token, text.tags_all[i_token + i])

                    i_token += len(clause)

    # Use tags only
    if settings['use_tags']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = clause[i][1]
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = f"{clause[i][0]}{''.join(clause[i][1])}"

    text.tokens_flat = list(wl_misc.flatten_list(text.tokens_multilevel))

    return text
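
For orientation, here is a hypothetical token_settings dictionary listing every key that wl_process_tokens reads; the values are illustrative assumptions rather than the application's shipped defaults.

# Illustrative only: every key below is referenced in wl_process_tokens(),
# but the default values shown are assumptions.
token_settings_example = {
    # Token filters
    'words': True,
    'lowercase': True,
    'uppercase': True,
    'title_case': True,
    'nums': True,
    'puncs': False,
    # Transformations
    'treat_as_lowercase': False,
    'lemmatize_tokens': False,
    'filter_stop_words': False,
    # Tag handling
    'use_tags': False,
    'ignore_tags': False,
    'ignore_tags_tags': False,
    'ignore_tags_type': 'all',
    'ignore_tags_type_tags': 'all',
}

# processed = wl_process_tokens(text, token_settings_example)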
Example #10
def test_word_detokenize(lang, word_detokenizer, show_results=False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text=getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)
    tokens = list(wl_misc.flatten_list(tokens))

    text = wl_word_detokenization.wl_word_detokenize(
        main, tokens=tokens, lang=lang, word_detokenizer=word_detokenizer)

    if show_results:
        print(f'{lang} / {word_detokenizer}:')
        print(text)

    if lang == 'cat':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11 ]"
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11]"
    elif lang == 'zho_cn':
        assert text == '汉语,又称汉文、中文、中国话、中国语、华语、华文、唐话[2],或被视为一个语族,或被视为隶属于汉藏语系汉语族之一种语言。'
    elif lang == 'zho_tw':
        assert text == '漢語,又稱漢文、中文、中國話、中國語、華語、華文、唐話[2],或被視為一個語族,或被視為隸屬於漢藏語系漢語族之一種語言。'
    elif lang == 'ces':
        assert text == 'Čeština neboli český jazyk je západoslovanský jazyk, nejbližší slovenštině, poté lužické srbštině a polštině.'
    elif lang == 'nld':
        assert text == 'Het Nederlands is een West-Germaanse taal en de moedertaal van de meeste inwoners van Nederland, België en Suriname.'
    elif lang == 'eng':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[4][5 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[4][5]'
    elif lang == 'fin':
        assert text == 'Suomen kieli (suomi) on uralilaisten kielten itämerensuomalaiseen ryhmään kuuluva kieli.'
    elif lang == 'fra':
        assert text == 'Le français est une langue indo-européenne de la famille des langues romanes.'
    elif lang == 'deu':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([ dɔʏ̯t͡ʃ];abgekürzt dt.oder dtsch . )ist eine westgermanische Sprache.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ];abgekürzt dt.oder dtsch.)ist eine westgermanische Sprache.'
    elif lang == 'ell':
        assert text == 'Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια[9] και συγκεκριμένα στον ελληνικό κλάδο, μαζί με την τσακωνική, ενώ είναι η επίσημη γλώσσα της Ελλάδος και της Κύπρου.'
    elif lang == 'hun':
        assert text == 'A magyar nyelv az uráli nyelvcsalád tagja, a finnugor nyelvek közé tartozó ugor nyelvek egyike.'
    elif lang == 'isl':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[ 4 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[4]'
    elif lang == 'gle':
        assert text == 'Is ceann de na teangacha Ceilteacha í an Ghaeilge (nó Gaeilge na hÉireann mar a thugtar uirthi corruair), agus ceann den dtrí cinn de theangacha Ceilteacha ar a dtugtar na teangacha Gaelacha (.i. an Ghaeilge, Gaeilge na hAlban agus Gaeilge Mhanann) go háirithe.'
    elif lang == 'ita':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "L' italiano ([ itaˈljaːno][Nota 1] ascolta[?·info] ) è una lingua romanza parlata principalmente in Italia."
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "L'italiano ([itaˈljaːno][Nota 1] ascolta[?·info]) è una lingua romanza parlata principalmente in Italia."
    elif lang == 'jpn':
        assert text == '日本語(にほんご、にっぽんご[注1])は、主に日本国内や日本人同士の間で使用されている言語である。'
    elif lang == 'lav':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda.[3 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda.[3]'
    elif lang == 'lit':
        assert text == 'Lietuvių kalba – iš baltų prokalbės kilusi lietuvių tautos kalba, kuri Lietuvoje yra valstybinė, o Europos Sąjungoje – viena iš oficialiųjų kalbų.'
    elif lang == 'pol':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Język polski, polszczyzna, skrót :pol . –język naturalny należący do grupy języków zachodniosłowiańskich (do której należą również czeski, słowacki, kaszubski, dolnołużycki, górnołużycki i wymarły połabski), stanowiącej część rodziny języków indoeuropejskich.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Język polski, polszczyzna, skrót: pol. – język naturalny należący do grupy języków zachodniosłowiańskich (do której należą również czeski, słowacki, kaszubski, dolnołużycki, górnołużycki i wymarły połabski), stanowiącej część rodziny języków indoeuropejskich.'
    elif lang == 'por':
        assert text == 'A língua portuguesa, também designada português, é uma língua românica flexiva ocidental originada no galego-português falado no Reino da Galiza e no norte de Portugal.'
    elif lang == 'ron':
        assert text == 'Limba română este o limbă indo-europeană, din grupul italic și din subgrupul oriental al limbilor romanice.'
    elif lang == 'rus':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Ру́сский язы́к ([ ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Ру́сский язы́к ([ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
    elif lang == 'slk':
        assert text == 'Slovenčina patrí do skupiny západoslovanských jazykov (spolu s češtinou, poľštinou, hornou a dolnou lužickou srbčinou a kašubčinou).'
    elif lang == 'slv':
        assert text == 'Slovenščina [slovénščina] / [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.'
    elif lang == 'spa':
        assert text == 'El español o castellano es una lengua romance procedente del latín hablado.'
    elif lang == 'swe':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Svenska (svenska (info) ) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Svenska (svenska (info)) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
    elif lang == 'tam':
        assert text == 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும்.'
    elif lang == 'tha':
        assert text == 'ภาษาไทยหรือภาษาไทยกลางเป็นภาษาราชการและภาษาประจำชาติของประเทศไทย'
    elif lang == 'bod':
        assert text == 'བོད་ཀྱི་སྐད་ཡིག་ནི་བོད་ཡུལ་དང་དེའི་ཉེ་འཁོར་གྱི་ས་ཁུལ་ཏེ།'
Example #11
    def __init__(self, main, file):
        self.main = main
        self.lang = file['lang']
        self.tokenized = file['tokenized']
        self.tagged = file['tagged']

        self.offsets_paras = []
        self.offsets_sentences = []

        self.tokens_multilevel = []
        self.tokens_flat = []
        self.tags = []

        re_tags = wl_matching.get_re_tags(main)

        if re.search(r'\.txt', file['path'], flags=re.IGNORECASE):
            with open(file['path'], 'r', encoding=file['encoding']) as f:
                # Untokenized & Untagged
                if self.tokenized == 'No' and self.tagged == 'No':
                    for line in f:
                        text = line.rstrip()

                        if text:
                            tokens = wl_word_tokenization.wl_word_tokenize(
                                main, text, lang=self.lang)

                            self.tokens_multilevel.append(tokens)
                            self.tags.extend(
                                [[]] * len(list(wl_misc.flatten_list(tokens))))
                # Untokenized & Tagged
                elif self.tokenized == 'No' and self.tagged == 'Yes':
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            tokens = wl_word_tokenization.wl_word_tokenize(
                                main, text_no_tags, lang=self.lang)

                            self.tokens_multilevel.append(tokens)

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags, text):
                                self.tokens_multilevel[0][0].insert(0, '')
                                self.tags.append([])

                            # Extract tags
                            for tag in re.findall(re_tags, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
                # Tokenized & Untagged
                elif self.tokenized == 'Yes' and self.tagged == 'No':
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_multilevel.append([])

                            for sentence in wl_sentence_tokenization.wl_sentence_split(
                                    main, text):
                                self.tokens_multilevel[-1].append(
                                    sentence.split())
                # Tokenized & Tagged
                elif self.tokenized == 'Yes' and self.tagged == 'Yes':
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_multilevel.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wl_sentence_tokenization.wl_sentence_split(
                                    main, text_no_tags):
                                self.tokens_multilevel[-1].append(
                                    sentence.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags, text):
                                self.tokens_multilevel[0][0].insert(0, '')

                                self.tags.append([])

                            # Extract tags
                            for tag in re.findall(re_tags, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
        elif re.search(r'\.xml', file['path'], flags=re.IGNORECASE):
            text = ''

            with open(file['path'], 'r', encoding=file['encoding']) as f:
                for line in f:
                    text += line

            soup = bs4.BeautifulSoup(text, features='lxml-xml')

            tags_para = []
            tags_sentence = []
            tags_word = []

            for _, level, opening_tag, _ in self.main.settings_custom['tags'][
                    'tags_xml']:
                if level == 'Paragraph':
                    tags_para.append(opening_tag[1:-1])
                elif level == 'Sentence':
                    tags_sentence.append(opening_tag[1:-1])
                elif level == 'Word':
                    tags_word.append(opening_tag[1:-1])

            for para in soup.select(','.join(tags_para)):
                self.tokens_multilevel.append([])

                for sentence in para.select(','.join(tags_sentence)):
                    self.tokens_multilevel[-1].append([])

                    for word in sentence.select(','.join(tags_word)):
                        self.tokens_multilevel[-1][-1].append(word.get_text())

                        self.tags.append([])

        # Paragraph and sentence offsets
        for para in self.tokens_multilevel:
            self.offsets_paras.append(len(self.tokens_flat))

            for sentence in para:
                self.offsets_sentences.append(len(self.tokens_flat))

                self.tokens_flat.extend(sentence)

        # Remove whitespace around all tags
        self.tags = [[tag.strip() for tag in tags] for tags in self.tags]
Example #12
def test_flatten_list():
    assert list(wl_misc.flatten_list([1, 2, [3, 4, [5, 6]]])) == [1, 2, 3, 4, 5, 6]
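
A minimal sketch of the behaviour wl_misc.flatten_list is expected to have, inferred only from the test above; the real implementation may handle more container types.

# Assumed behaviour, inferred from test_flatten_list() above:
# recursively yield items from arbitrarily nested lists.
def flatten_list_sketch(items):
    for item in items:
        if isinstance(item, list):
            yield from flatten_list_sketch(item)
        else:
            yield item

assert list(flatten_list_sketch([1, 2, [3, 4, [5, 6]]])) == [1, 2, 3, 4, 5, 6]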
Example #13
def test_pos_tag(lang, pos_tagger, show_results=False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text=getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)
    tokens = list(wl_misc.flatten_list(tokens))

    tokens_tagged = wl_pos_tagging.wl_pos_tag(main,
                                              tokens=tokens,
                                              lang=lang,
                                              pos_tagger=pos_tagger)
    tokens_tagged_universal = wl_pos_tagging.wl_pos_tag(main,
                                                        tokens=tokens,
                                                        lang=lang,
                                                        pos_tagger=pos_tagger,
                                                        tagset='universal')

    if show_results:
        print(f'{lang} / {pos_tagger}:')
        print(tokens_tagged)
        print(tokens_tagged_universal)

    if lang == 'zho_cn':
        assert tokens_tagged == [('汉语', 'nz'), (',', 'x'), ('又', 'd'),
                                 ('称', 'v'), ('汉文', 'nz'), ('、', 'x'),
                                 ('中文', 'nz'), ('、', 'x'), ('中国', 'ns'),
                                 ('话', 'n'), ('、', 'x'), ('中国', 'ns'),
                                 ('语', 'ng'), ('、', 'x'), ('华语', 'nz'),
                                 ('、', 'x'), ('华文', 'nz'), ('、', 'x'),
                                 ('唐', 'nr'), ('话', 'n'), ('[', 'x'),
                                 ('2', 'x'), (']', 'x'), (',', 'x'),
                                 ('或', 'c'), ('被', 'p'), ('视为', 'v'),
                                 ('一个', 'm'), ('语族', 'n'), (',', 'x'),
                                 ('或', 'c'), ('被', 'p'), ('视为', 'v'),
                                 ('隶属于', 'n'), ('汉藏', 'ns'), ('语系', 'n'),
                                 ('汉语', 'nz'), ('族', 'ng'), ('之一', 'r'),
                                 ('种', 'm'), ('语言', 'n'), ('。', 'x')]
        assert tokens_tagged_universal == [('汉语', 'PROPN'), (',', 'PUNCT/SYM'),
                                           ('又', 'ADV'), ('称', 'VERB'),
                                           ('汉文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中国', 'PROPN'), ('话', 'NOUN'),
                                           ('、', 'PUNCT/SYM'), ('中国', 'PROPN'),
                                           ('语', 'NOUN'), ('、', 'PUNCT/SYM'),
                                           ('华语', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('华文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('唐', 'PRONP'), ('话', 'NOUN'),
                                           ('[', 'PUNCT/SYM'),
                                           ('2', 'PUNCT/SYM'),
                                           (']', 'PUNCT/SYM'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('视为', 'VERB'),
                                           ('一个', 'NUM'), ('语族', 'NOUN'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('视为', 'VERB'),
                                           ('隶属于', 'NOUN'), ('汉藏', 'PROPN'),
                                           ('语系', 'NOUN'), ('汉语', 'PROPN'),
                                           ('族', 'NOUN'), ('之一', 'PRON'),
                                           ('种', 'NUM'), ('语言', 'NOUN'),
                                           ('。', 'PUNCT/SYM')]
    elif lang == 'zho_tw':
        assert tokens_tagged == [
            ('漢語', 'nz'), (',', 'x'), ('又', 'd'), ('稱', 'zg'), ('漢文', 'nz'),
            ('、', 'x'), ('中文', 'nz'), ('、', 'x'), ('中', 'f'), ('國話', 'n'),
            ('、', 'x'), ('中國', 'ns'), ('語', 'n'), ('、', 'x'), ('華語', 'nz'),
            ('、', 'x'), ('華文', 'nz'), ('、', 'x'), ('唐', 'nr'), ('話', 'x'),
            ('[', 'x'), ('2', 'x'), (']', 'x'), (',', 'x'), ('或', 'c'),
            ('被', 'p'), ('視為', 'v'), ('一', 'm'), ('個', 'zg'), ('語族', 'n'),
            (',', 'x'), ('或', 'c'), ('被', 'p'), ('視', 'x'), ('為', 'p'),
            ('隸', 'j'), ('屬', 'v'), ('於', 'nr'), ('漢', 'j'), ('藏', 'j'),
            ('語系', 'n'), ('漢語', 'nz'), ('族', 'ng'), ('之一', 'r'), ('種', 'x'),
            ('語言', 'n'), ('。', 'x')
        ]
        assert tokens_tagged_universal == [('漢語', 'PROPN'), (',', 'PUNCT/SYM'),
                                           ('又', 'ADV'), ('稱', 'PART'),
                                           ('漢文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中', 'ADP'), ('國話', 'NOUN'),
                                           ('、', 'PUNCT/SYM'), ('中國', 'PROPN'),
                                           ('語', 'NOUN'), ('、', 'PUNCT/SYM'),
                                           ('華語', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('華文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('唐', 'PRONP'), ('話', 'PUNCT/SYM'),
                                           ('[', 'PUNCT/SYM'),
                                           ('2', 'PUNCT/SYM'),
                                           (']', 'PUNCT/SYM'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('視為', 'VERB'),
                                           ('一', 'NUM'), ('個', 'PART'),
                                           ('語族', 'NOUN'), (',', 'PUNCT/SYM'),
                                           ('或', 'CONJ'), ('被', 'ADP'),
                                           ('視', 'PUNCT/SYM'), ('為', 'ADP'),
                                           ('隸', 'X'), ('屬', 'VERB'),
                                           ('於', 'PRONP'), ('漢', 'X'),
                                           ('藏', 'X'), ('語系', 'NOUN'),
                                           ('漢語', 'PROPN'), ('族', 'NOUN'),
                                           ('之一', 'PRON'), ('種', 'PUNCT/SYM'),
                                           ('語言', 'NOUN'), ('。', 'PUNCT/SYM')]
    elif lang == 'dan':
        assert tokens_tagged == [
            ('Dansk', 'ADJ__Definite=Ind|Degree=Pos|Number=Sing'),
            ('er', 'AUX__Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act'),
            ('et', 'DET__Gender=Neut|Number=Sing|PronType=Ind'),
            ('nordgermansk', 'ADJ__Definite=Ind|Degree=Pos|Number=Sing'),
            ('sprog', 'NOUN__Definite=Ind|Gender=Neut|Number=Sing'),
            ('af', 'ADP__AdpType=Prep'),
            ('den', 'DET__Gender=Com|Number=Sing|PronType=Dem'),
            ('østnordiske', 'ADJ__Definite=Def|Degree=Pos|Number=Sing'),
            ('(', 'PUNCT'), ('kontinentale', 'ADJ__Degree=Pos|Number=Plur'),
            (')', 'PUNCT'),
            ('gruppe', 'NOUN__Definite=Ind|Gender=Com|Number=Sing'),
            (',', 'PUNCT'), ('der', 'PRON__PartType=Inf'),
            ('tales', 'VERB__Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Pass'),
            ('af', 'ADP__AdpType=Prep'), ('ca.', 'ADV'),
            ('seks', 'NUM__NumType=Card'),
            ('millioner', 'NOUN__Definite=Ind|Gender=Com|Number=Plur'),
            ('mennesker', 'NOUN__Definite=Ind|Gender=Neut|Number=Plur'),
            ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [('Dansk', 'ADJ'), ('er', 'AUX'),
                                           ('et', 'DET'),
                                           ('nordgermansk', 'ADJ'),
                                           ('sprog', 'NOUN'), ('af', 'ADP'),
                                           ('den', 'DET'),
                                           ('østnordiske', 'ADJ'),
                                           ('(', 'PUNCT'),
                                           ('kontinentale', 'ADJ'),
                                           (')', 'PUNCT'), ('gruppe', 'NOUN'),
                                           (',', 'PUNCT'), ('der', 'PRON'),
                                           ('tales', 'VERB'), ('af', 'ADP'),
                                           ('ca.', 'ADV'), ('seks', 'NUM'),
                                           ('millioner', 'NOUN'),
                                           ('mennesker', 'NOUN'),
                                           ('.', 'PUNCT')]
    elif lang == 'nld':
        assert tokens_tagged == [
            ('Het', 'LID|bep|stan|evon__Definite=Def'),
            ('Nederlands',
             'N|eigen|ev|basis|onz|stan__Gender=Neut|Number=Sing'),
            ('is', 'WW|pv|tgw|ev__Number=Sing|Tense=Pres|VerbForm=Fin'),
            ('een', 'LID|onbep|stan|agr__Definite=Ind'),
            ('West-Germaanse', 'ADJ|prenom|basis|met-e|stan__Degree=Pos'),
            ('taal', 'N|soort|ev|basis|zijd|stan__Gender=Com|Number=Sing'),
            ('en', 'VG|neven'), ('de', 'LID|bep|stan|rest__Definite=Def'),
            ('moedertaal',
             'N|soort|ev|basis|zijd|stan__Gender=Com|Number=Sing'),
            ('van', 'VZ|init'), ('de', 'LID|bep|stan|rest__Definite=Def'),
            ('meeste', 'VNW|onbep|grad|stan|prenom|met-e|agr|sup'),
            ('inwoners', 'N|soort|mv|basis__Number=Plur'), ('van', 'VZ|init'),
            ('Nederland',
             'N|eigen|ev|basis|onz|stan__Gender=Neut|Number=Sing'),
            (',', 'LET'),
            ('België', 'N|eigen|ev|basis|onz|stan__Gender=Neut|Number=Sing'),
            ('en', 'VG|neven'),
            ('Suriname', 'N|eigen|ev|basis|onz|stan__Gender=Neut|Number=Sing'),
            ('.', 'LET')
        ]
        assert tokens_tagged_universal == [
            ('Het', 'DET'), ('Nederlands', 'PROPN'), ('is', 'VERB'),
            ('een', 'DET'), ('West-Germaanse', 'ADJ'), ('taal', 'NOUN'),
            ('en', 'CCONJ'), ('de', 'DET'), ('moedertaal', 'NOUN'),
            ('van', 'ADP'), ('de', 'DET'), ('meeste', 'ADV'),
            ('inwoners', 'NOUN'), ('van', 'ADP'), ('Nederland', 'PROPN'),
            (',', 'SYM'), ('België', 'PROPN'), ('en', 'CCONJ'),
            ('Suriname', 'PROPN'), ('.', 'SYM')
        ]
    elif lang == 'eng':
        if pos_tagger == 'NLTK - Perceptron POS Tagger':
            assert tokens_tagged == [('English', 'NNP'), ('is', 'VBZ'),
                                     ('a', 'DT'), ('West', 'NNP'),
                                     ('Germanic', 'NNP'), ('language', 'NN'),
                                     ('that', 'WDT'), ('was', 'VBD'),
                                     ('first', 'RB'), ('spoken', 'VBN'),
                                     ('in', 'IN'), ('early', 'JJ'),
                                     ('medieval', 'NN'), ('England', 'NNP'),
                                     ('and', 'CC'), ('eventually', 'RB'),
                                     ('became', 'VBD'), ('a', 'DT'),
                                     ('global', 'JJ'), ('lingua', 'NN'),
                                     ('franca.[4][5', 'NN'), (']', 'NN')]
            assert tokens_tagged_universal == [('English', 'PROPN'),
                                               ('is', 'VERB'), ('a', 'DET'),
                                               ('West', 'PROPN'),
                                               ('Germanic', 'PROPN'),
                                               ('language', 'NOUN'),
                                               ('that', 'DET'), ('was',
                                                                 'VERB'),
                                               ('first', 'ADV'),
                                               ('spoken', 'VERB'),
                                               ('in', 'ADP/SCONJ'),
                                               ('early', 'ADJ'),
                                               ('medieval', 'NOUN'),
                                               ('England', 'PROPN'),
                                               ('and', 'CCONJ'),
                                               ('eventually', 'ADV'),
                                               ('became', 'VERB'),
                                               ('a', 'DET'), ('global', 'ADJ'),
                                               ('lingua', 'NOUN'),
                                               ('franca.[4][5', 'NOUN'),
                                               (']', 'NOUN')]
        elif pos_tagger == 'spaCy - English POS Tagger':
            assert tokens_tagged == [('English', 'NNP'), ('is', 'VBZ'),
                                     ('a', 'DT'), ('West', 'NNP'),
                                     ('Germanic', 'NNP'), ('language', 'NN'),
                                     ('that', 'WDT'), ('was', 'VBD'),
                                     ('first', 'RB'), ('spoken', 'VBN'),
                                     ('in', 'IN'), ('early', 'JJ'),
                                     ('medieval', 'JJ'), ('England', 'NNP'),
                                     ('and', 'CC'), ('eventually', 'RB'),
                                     ('became', 'VBD'), ('a', 'DT'),
                                     ('global', 'JJ'), ('lingua', 'NN'),
                                     ('franca.[4][5', 'NNP'), (']', '-RRB-')]
            assert tokens_tagged_universal == [('English', 'PROPN'),
                                               ('is', 'AUX'), ('a', 'DET'),
                                               ('West', 'PROPN'),
                                               ('Germanic', 'PROPN'),
                                               ('language', 'NOUN'),
                                               ('that', 'DET'), ('was', 'AUX'),
                                               ('first', 'ADV'),
                                               ('spoken', 'VERB'),
                                               ('in', 'ADP'), ('early', 'ADJ'),
                                               ('medieval', 'ADJ'),
                                               ('England', 'PROPN'),
                                               ('and', 'CCONJ'),
                                               ('eventually', 'ADV'),
                                               ('became', 'VERB'),
                                               ('a', 'DET'), ('global', 'ADJ'),
                                               ('lingua', 'NOUN'),
                                               ('franca.[4][5', 'PROPN'),
                                               (']', 'PUNCT')]
    elif lang == 'fra':
        assert tokens_tagged == [
            ('Le', 'DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('français', 'ADJ__Gender=Masc'),
            ('est',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('une', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('langue', 'NOUN__Gender=Fem|Number=Sing'),
            ('indo-européenne', 'ADJ__Gender=Fem|Number=Sing'), ('de', 'ADP'),
            ('la', 'DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('famille', 'NOUN__Gender=Fem|Number=Sing'),
            ('des', 'ADP_DET__Definite=Def|Number=Plur|PronType=Art'),
            ('langues', 'NOUN__Gender=Fem|Number=Plur'),
            ('romanes', 'ADJ__Gender=Fem|Number=Plur'), ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [('Le', 'DET'), ('français', 'ADJ'),
                                           ('est', 'AUX'), ('une', 'DET'),
                                           ('langue', 'NOUN'),
                                           ('indo-européenne', 'ADJ'),
                                           ('de', 'ADP'), ('la', 'DET'),
                                           ('famille', 'NOUN'), ('des', 'ADP'),
                                           ('langues', 'NOUN'),
                                           ('romanes', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'deu':
        assert tokens_tagged == [('Die', 'ART'), ('deutsche', 'ADJA'),
                                 ('Sprache', 'NN'), ('bzw.', 'ADJA'),
                                 ('Deutsch', 'NN'), ('(', '$('), ('[', 'NE'),
                                 ('dɔʏ̯t͡ʃ', 'NE'), (']', 'NE'), (';', '$.'),
                                 ('abgekürzt', 'VVFIN'), ('dt', 'NE'),
                                 ('.', '$.'), ('oder', 'KON'),
                                 ('dtsch', 'ADJD'), ('.', '$.'), (')', '$('),
                                 ('ist', 'VAFIN'), ('eine', 'ART'),
                                 ('westgermanische', 'ADJA'),
                                 ('Sprache', 'NN'), ('.', '$.')]
        assert tokens_tagged_universal == [('Die', 'DET'), ('deutsche', 'ADJ'),
                                           ('Sprache', 'NOUN'), ('bzw.',
                                                                 'ADJ'),
                                           ('Deutsch', 'NOUN'), ('(', 'PUNCT'),
                                           ('[', 'PROPN'),
                                           ('dɔʏ̯t͡ʃ', 'PROPN'),
                                           (']', 'PROPN'), (';', 'PUNCT'),
                                           ('abgekürzt', 'VERB'),
                                           ('dt', 'PROPN'), ('.', 'PUNCT'),
                                           ('oder', 'CCONJ'), ('dtsch', 'ADJ'),
                                           ('.', 'PUNCT'), (')', 'PUNCT'),
                                           ('ist', 'AUX'), ('eine', 'DET'),
                                           ('westgermanische', 'ADJ'),
                                           ('Sprache', 'NOUN'), ('.', 'PUNCT')]
    elif lang == 'ell':
        assert tokens_tagged == [
            ('Η',
             'DET__Case=Nom|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('ελληνική', 'ADJ__Case=Nom|Gender=Fem|Number=Sing'),
            ('γλώσσα', 'NOUN__Case=Nom|Gender=Fem|Number=Sing'),
            ('ανήκει',
             'VERB__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act'
             ), ('στην', 'AsPpSp_AtDf__Case=Acc|Gender=Fem|Number=Sing'),
            ('ινδοευρωπαϊκή', 'ADJ__Case=Acc|Gender=Fem|Number=Sing'),
            ('οικογένεια[9', 'NOUN__Case=Acc|Gender=Fem|Number=Sing'),
            (']',
             'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Tense=Past|VerbForm=Fin|Voice=Pass'
             ), ('και', 'CCONJ'), ('συγκεκριμένα', 'ADV'),
            ('στον', 'AsPpSp_AtDf__Case=Acc|Gender=Masc|Number=Sing'),
            ('ελληνικό', 'ADJ__Case=Acc|Gender=Masc|Number=Sing'),
            ('κλάδο', 'NOUN__Case=Acc|Gender=Masc|Number=Sing'),
            (',', 'PUNCT'), ('μαζί', 'ADV'), ('με', 'ADP'),
            ('την',
             'DET__Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('τσακωνική', 'NOUN__Case=Acc|Gender=Fem|Number=Sing'),
            (',', 'PUNCT'), ('ενώ', 'SCONJ'),
            ('είναι',
             'AUX__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Pass'
             ),
            ('η',
             'DET__Case=Nom|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('επίσημη', 'ADJ__Case=Nom|Gender=Fem|Number=Sing'),
            ('γλώσσα', 'NOUN__Case=Nom|Gender=Fem|Number=Sing'),
            ('της',
             'DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('Ελλάδος', 'PROPN__Case=Gen|Gender=Fem|Number=Sing'),
            ('και', 'CCONJ'),
            ('της',
             'DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('Κύπρου', 'PROPN__Case=Gen|Gender=Fem|Number=Sing'),
            ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [('Η', 'DET'), ('ελληνική', 'ADJ'),
                                           ('γλώσσα', 'NOUN'),
                                           ('ανήκει', 'VERB'), ('στην', 'ADP'),
                                           ('ινδοευρωπαϊκή', 'ADJ'),
                                           ('οικογένεια[9', 'NOUN'),
                                           (']', 'VERB'), ('και', 'CCONJ'),
                                           ('συγκεκριμένα', 'ADV'),
                                           ('στον', 'ADP'),
                                           ('ελληνικό', 'ADJ'),
                                           ('κλάδο', 'NOUN'), (',', 'PUNCT'),
                                           ('μαζί', 'ADV'), ('με', 'ADP'),
                                           ('την', 'DET'), ('τσακωνική',
                                                            'NOUN'),
                                           (',', 'PUNCT'), ('ενώ', 'SCONJ'),
                                           ('είναι', 'AUX'), ('η', 'DET'),
                                           ('επίσημη', 'ADJ'),
                                           ('γλώσσα', 'NOUN'), ('της', 'DET'),
                                           ('Ελλάδος', 'PROPN'),
                                           ('και', 'CCONJ'), ('της', 'DET'),
                                           ('Κύπρου', 'PROPN'), ('.', 'PUNCT')]
    elif lang == 'ita':
        assert tokens_tagged == [
            ("L'", 'RD__Definite=Def|Number=Sing|PronType=Art'),
            ('italiano', 'S__Gender=Masc|Number=Sing'), ('(', 'FB'),
            ('[', 'FB'), ('itaˈljaːno][Nota', 'S__Gender=Masc|Number=Sing'),
            ('1', 'N__NumType=Card'), (']', 'FB'), ('ascolta[?·info', 'S'),
            (']', 'FB'), (')', 'FB'),
            ('è', 'V__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('una', 'RI__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('lingua', 'S__Gender=Fem|Number=Sing'),
            ('romanza', 'S__Gender=Fem|Number=Sing'),
            ('parlata', 'A__Gender=Fem|Number=Sing'), ('principalmente', 'B'),
            ('in', 'E'), ('Italia', 'SP'), ('.', 'FS')
        ]
        assert tokens_tagged_universal == [("L'", 'DET'), ('italiano', 'NOUN'),
                                           ('(', 'PUNCT'), ('[', 'PUNCT'),
                                           ('itaˈljaːno][Nota', 'NOUN'),
                                           ('1', 'NUM'), (']', 'PUNCT'),
                                           ('ascolta[?·info', 'NOUN'),
                                           (']', 'PUNCT'), (')', 'PUNCT'),
                                           ('è', 'VERB'), ('una', 'DET'),
                                           ('lingua', 'NOUN'),
                                           ('romanza', 'NOUN'),
                                           ('parlata', 'ADJ'),
                                           ('principalmente', 'ADV'),
                                           ('in', 'ADP'), ('Italia', 'PROPN'),
                                           ('.', 'PUNCT')]
    elif lang == 'jpn':
        assert tokens_tagged == [('日本', '名詞'), ('語', '名詞'), ('(', '補助記号'),
                                 ('にほんご', '名詞'), ('、', '補助記号'), ('にっぽん', '名詞'),
                                 ('ご', '接尾辞'), ('[', '補助記号'), ('注', '名詞'),
                                 ('1', '名詞'), (']', '補助記号'), (')', '補助記号'),
                                 ('は', '助詞'), ('、', '補助記号'), ('主に', '副詞'),
                                 ('日本', '名詞'), ('国', '接尾辞'), ('内', '接尾辞'),
                                 ('や', '助詞'), ('日本', '名詞'), ('人', '接尾辞'),
                                 ('同士', '接尾辞'), ('の', '助詞'), ('間', '名詞'),
                                 ('で', '助詞'), ('使用', '名詞'), ('さ', '動詞'),
                                 ('れ', '助動詞'), ('て', '助詞'), ('いる', '動詞'),
                                 ('言語', '名詞'), ('で', '助動詞'), ('ある', '動詞'),
                                 ('。', '補助記号')]
        assert tokens_tagged_universal == [('日本', 'NOUN'), ('語', 'NOUN'),
                                           ('(', 'PUNCT/SYM'), ('にほんご',
                                                                'NOUN'),
                                           ('、', 'PUNCT/SYM'),
                                           ('にっぽん', 'NOUN'), ('ご', 'PART'),
                                           ('[', 'PUNCT/SYM'), ('注', 'NOUN'),
                                           ('1', 'NOUN'), (']', 'PUNCT/SYM'),
                                           (')', 'PUNCT/SYM'), ('は', 'PART'),
                                           ('、', 'PUNCT/SYM'), ('主に', 'ADV'),
                                           ('日本', 'NOUN'), ('国', 'PART'),
                                           ('内', 'PART'), ('や', 'PART'),
                                           ('日本', 'NOUN'), ('人', 'PART'),
                                           ('同士', 'PART'), ('の', 'PART'),
                                           ('間', 'NOUN'), ('で', 'PART'),
                                           ('使用', 'NOUN'), ('さ', 'VERB'),
                                           ('れ', 'AUX'), ('て', 'PART'),
                                           ('いる', 'VERB'), ('言語', 'NOUN'),
                                           ('で', 'AUX'), ('ある', 'VERB'),
                                           ('。', 'PUNCT/SYM')]
    elif lang == 'lit':
        assert tokens_tagged == [
            ('Lietuvių', 'dkt.vyr.dgs.K.__Case=Gen|Gender=Masc|Number=Plur'),
            ('kalba', 'dkt.mot.vns.Įn.__Case=Ins|Gender=Fem|Number=Sing'),
            ('–', 'skyr.'), ('iš', 'prl.K.__AdpType=Prep|Case=Gen'),
            ('baltų',
             'bdv.nelygin.mot.vns.K.__Case=Gen|Definite=Ind|Degree=Pos|Gender=Fem|Number=Sing'
             ),
            ('prokalbės', 'dkt.mot.vns.K.__Case=Gen|Gender=Fem|Number=Sing'),
            ('kilusi',
             'bdv.aukšč.vyr.dgs.V.__Case=Nom|Definite=Ind|Degree=Sup|Gender=Masc|Number=Plur'
             ),
            ('lietuvių', 'dkt.vyr.dgs.K.__Case=Gen|Gender=Masc|Number=Plur'),
            ('tautos', 'dkt.mot.vns.K.__Case=Gen|Gender=Fem|Number=Sing'),
            ('kalba', 'dkt.mot.vns.Įn.__Case=Ins|Gender=Fem|Number=Sing'),
            (',', 'skyr.'),
            ('kuri',
             'įv.mot.vns.V.__Case=Nom|Definite=Ind|Gender=Fem|Number=Sing|PronType=Int'
             ),
            ('Lietuvoje',
             'dkt.tikr.mot.vns.Vt.__Case=Loc|Gender=Fem|Number=Sing'),
            ('yra',
             'vksm.asm.tiesiog.es.vns.3.__Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin'
             ),
            ('valstybinė',
             'bdv.nelygin.mot.vns.V.__Case=Nom|Definite=Ind|Degree=Pos|Gender=Fem|Number=Sing'
             ), (',', 'skyr.'), ('o', 'jng.'),
            ('Europos',
             'dkt.tikr.mot.vns.K.__Case=Gen|Gender=Fem|Number=Sing'),
            ('Sąjungoje', 'dkt.mot.vns.Vt.__Case=Loc|Gender=Fem|Number=Sing'),
            ('–', 'skyr.'),
            ('viena',
             'įv.mot.vns.V.__Case=Nom|Definite=Ind|Gender=Fem|Number=Sing|PronType=Ind'
             ), ('iš', 'prl.K.__AdpType=Prep|Case=Gen'),
            ('oficialiųjų',
             'bdv.nelygin.įvardž.vyr.dgs.K.__Case=Gen|Definite=Def|Degree=Pos|Gender=Masc|Number=Plur'
             ), ('kalbų', 'dkt.vyr.dgs.V.__Case=Nom|Gender=Masc|Number=Plur'),
            ('.', 'skyr.')
        ]
        assert tokens_tagged_universal == [
            ('Lietuvių', 'NOUN'), ('kalba', 'NOUN'), ('–', 'PUNCT'),
            ('iš', 'ADP'), ('baltų', 'ADJ'), ('prokalbės', 'NOUN'),
            ('kilusi', 'ADJ'), ('lietuvių', 'NOUN'), ('tautos', 'NOUN'),
            ('kalba', 'NOUN'), (',', 'PUNCT'), ('kuri', 'DET'),
            ('Lietuvoje', 'PROPN'), ('yra', 'AUX'), ('valstybinė', 'ADJ'),
            (',', 'PUNCT'), ('o', 'CCONJ'), ('Europos', 'PROPN'),
            ('Sąjungoje', 'NOUN'), ('–', 'PUNCT'), ('viena', 'PRON'),
            ('iš', 'ADP'), ('oficialiųjų', 'ADJ'), ('kalbų', 'NOUN'),
            ('.', 'PUNCT')
        ]
    elif lang == 'nob':
        assert tokens_tagged == [
            ('Bokmål', 'PROPN'),
            ('er', 'AUX__Mood=Ind|Tense=Pres|VerbForm=Fin'),
            ('en', 'DET__Gender=Masc|Number=Sing|PronType=Art'),
            ('varietet', 'NOUN__Definite=Ind|Gender=Masc|Number=Sing'),
            ('av', 'ADP'),
            ('norsk', 'ADJ__Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing'),
            ('språk', 'NOUN__Definite=Ind|Gender=Neut|Number=Sing'),
            ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [('Bokmål', 'PROPN'), ('er', 'AUX'),
                                           ('en', 'DET'), ('varietet', 'NOUN'),
                                           ('av', 'ADP'), ('norsk', 'ADJ'),
                                           ('språk', 'NOUN'), ('.', 'PUNCT')]
    elif lang == 'pol':
        assert tokens_tagged == [('Język', 'SUBST'), ('polski', 'ADJ'),
                                 (',', 'INTERP'), ('polszczyzna', 'SUBST'),
                                 (',', 'INTERP'), ('skrót', 'SUBST'),
                                 (':', 'INTERP'), ('pol', 'BREV'),
                                 ('.', 'INTERP'), ('–', 'INTERP'),
                                 ('język', 'SUBST'), ('naturalny', 'ADJ'),
                                 ('należący', 'PACT'), ('do', 'PREP'),
                                 ('grupy', 'SUBST'), ('języków', 'SUBST'),
                                 ('zachodniosłowiańskich', 'ADJ'),
                                 ('(', 'INTERP'), ('do', 'PREP'),
                                 ('której', 'ADJ'), ('należą', 'FIN'),
                                 ('również', 'QUB'), ('czeski', 'ADJ'),
                                 (',', 'INTERP'), ('słowacki', 'ADJ'),
                                 (',', 'INTERP'), ('kaszubski', 'ADJ'),
                                 (',', 'INTERP'), ('dolnołużycki', 'ADJ'),
                                 (',', 'INTERP'), ('górnołużycki', 'SUBST'),
                                 ('i', 'CONJ'), ('wymarły', 'SUBST'),
                                 ('połabski', 'ADJ'), (')', 'INTERP'),
                                 (',', 'INTERP'), ('stanowiącej', 'PACT'),
                                 ('część', 'SUBST'), ('rodziny', 'SUBST'),
                                 ('języków', 'SUBST'),
                                 ('indoeuropejskich', 'ADJ'), ('.', 'INTERP')]
        assert tokens_tagged_universal == [
            ('Język', 'NOUN'), ('polski', 'ADJ'), (',', 'PUNCT'),
            ('polszczyzna', 'NOUN'), (',', 'PUNCT'), ('skrót', 'NOUN'),
            (':', 'PUNCT'), ('pol', 'X'), ('.', 'PUNCT'), ('–', 'PUNCT'),
            ('język', 'NOUN'), ('naturalny', 'ADJ'), ('należący', 'VERB'),
            ('do', 'ADP'), ('grupy', 'NOUN'), ('języków', 'NOUN'),
            ('zachodniosłowiańskich', 'ADJ'), ('(', 'PUNCT'), ('do', 'ADP'),
            ('której', 'ADJ'), ('należą', 'VERB'), ('również', 'PART'),
            ('czeski', 'ADJ'), (',', 'PUNCT'), ('słowacki', 'ADJ'),
            (',', 'PUNCT'), ('kaszubski', 'ADJ'), (',', 'PUNCT'),
            ('dolnołużycki', 'ADJ'), (',', 'PUNCT'), ('górnołużycki', 'NOUN'),
            ('i', 'CCONJ'), ('wymarły', 'NOUN'), ('połabski', 'ADJ'),
            (')', 'PUNCT'), (',', 'PUNCT'), ('stanowiącej', 'VERB'),
            ('część', 'NOUN'), ('rodziny', 'NOUN'), ('języków', 'NOUN'),
            ('indoeuropejskich', 'ADJ'), ('.', 'PUNCT')
        ]
    elif lang == 'por':
        assert tokens_tagged == [
            ('A', 'DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('língua', 'NOUN__Gender=Fem|Number=Sing'),
            ('portuguesa', 'ADJ__Gender=Fem|Number=Sing'), (',', 'PUNCT'),
            ('também', 'ADV'),
            ('designada', 'VERB__Gender=Fem|Number=Sing|VerbForm=Part'),
            ('português', 'NOUN__Gender=Masc|Number=Sing'), (',', 'PUNCT'),
            ('é',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('uma', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('língua', 'NOUN__Gender=Fem|Number=Sing'),
            ('românica', 'ADJ__Gender=Fem|Number=Sing'),
            ('flexiva', 'ADJ__Gender=Fem|Number=Sing'),
            ('ocidental', 'ADJ__Gender=Fem|Number=Sing'),
            ('originada', 'VERB__Gender=Fem|Number=Sing|VerbForm=Part'),
            ('no',
             'ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('galego-português', 'NOUN__Gender=Masc|Number=Sing'),
            ('falado', 'VERB__Gender=Masc|Number=Sing|VerbForm=Part'),
            ('no',
             'ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('Reino', 'PROPN__Gender=Masc|Number=Sing'),
            ('da',
             'ADP_DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('Galiza', 'PROPN__Number=Sing'), ('e', 'CCONJ'),
            ('no',
             'ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('norte', 'NOUN__Gender=Masc|Number=Sing'), ('de', 'ADP'),
            ('Portugal', 'PROPN__Gender=Masc|Number=Sing'), ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [
            ('A', 'DET'), ('língua', 'NOUN'), ('portuguesa', 'ADJ'),
            (',', 'PUNCT'), ('também', 'ADV'), ('designada', 'VERB'),
            ('português', 'NOUN'), (',', 'PUNCT'), ('é', 'AUX'),
            ('uma', 'DET'), ('língua', 'NOUN'), ('românica', 'ADJ'),
            ('flexiva', 'ADJ'), ('ocidental', 'ADJ'), ('originada', 'VERB'),
            ('no', 'DET'), ('galego-português', 'NOUN'), ('falado', 'VERB'),
            ('no', 'DET'), ('Reino', 'PROPN'), ('da', 'DET'),
            ('Galiza', 'PROPN'), ('e', 'CCONJ'), ('no', 'DET'),
            ('norte', 'NOUN'), ('de', 'ADP'), ('Portugal', 'PROPN'),
            ('.', 'PUNCT')
        ]
    elif lang == 'ron':
        assert tokens_tagged == [('Limba', 'Ncfsry'), ('română', 'Afpfsrn'),
                                 ('este', 'Vmip3s'), ('o', 'Tifsr'),
                                 ('limbă', 'Ncfsrn'),
                                 ('indo-europeană', 'Afpfsrn'), (',', 'COMMA'),
                                 ('din', 'Spsa'), ('grupul', 'Ncmsry'),
                                 ('italic', 'Afpms-n'), ('și', 'Crssp'),
                                 ('din', 'Spsa'), ('subgrupul', 'Ncmsry'),
                                 ('oriental', 'Afpms-n'), ('al', 'Tsms'),
                                 ('limbilor', 'Ncfpoy'),
                                 ('romanice', 'Afpfp-n'), ('.', 'PERIOD')]
        assert tokens_tagged_universal == [('Limba', 'NOUN'),
                                           ('română', 'ADJ'), ('este', 'AUX'),
                                           ('o', 'DET'), ('limbă', 'NOUN'),
                                           ('indo-europeană', 'ADJ'),
                                           (',', 'PUNCT'), ('din', 'ADP'),
                                           ('grupul', 'NOUN'), ('italic',
                                                                'ADJ'),
                                           ('și', 'CCONJ'), ('din', 'ADP'),
                                           ('subgrupul', 'NOUN'),
                                           ('oriental', 'ADJ'), ('al', 'DET'),
                                           ('limbilor', 'NOUN'),
                                           ('romanice', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'rus':
        if pos_tagger == 'NLTK - Perceptron POS Tagger':
            assert tokens_tagged == [('Ру́сский', 'A=m'), ('язы́к', 'S'),
                                     ('(', 'NONLEX'), ('[', 'NONLEX'),
                                     ('ˈruskʲɪi̯', 'NONLEX'),
                                     ('jɪˈzɨk', 'NONLEX'), (']', 'NONLEX'),
                                     ('Информация', 'S'), ('о', 'PR'),
                                     ('файле', 'S'), ('слушать', 'V'),
                                     (')', 'NONLEX'), ('[', 'NONLEX'),
                                     ('~', 'NONLEX'), ('3', 'NUM=ciph'),
                                     (']', 'NONLEX'), ('[', 'NONLEX'),
                                     ('⇨', 'NONLEX'), (']', 'NONLEX'),
                                     ('—', 'NONLEX'), ('один', 'A-PRO=m'),
                                     ('из', 'PR'),
                                     ('восточнославянских', 'A=pl'),
                                     ('языков', 'S'), (',', 'NONLEX'),
                                     ('национальный', 'A=m'), ('язык', 'S'),
                                     ('русского', 'A=m'), ('народа', 'S'),
                                     ('.', 'NONLEX')]
            assert tokens_tagged_universal == [('Ру́сский', 'ADJ'),
                                               ('язы́к', 'NOUN'),
                                               ('(', 'PUNCT'), ('[', 'PUNCT'),
                                               ('ˈruskʲɪi̯', 'PUNCT'),
                                               ('jɪˈzɨk', 'PUNCT'),
                                               (']', 'PUNCT'),
                                               ('Информация', 'NOUN'),
                                               ('о', 'ADP'), ('файле', 'NOUN'),
                                               ('слушать', 'VERB'),
                                               (')', 'PUNCT'), ('[', 'PUNCT'),
                                               ('~', 'PUNCT'), ('3', 'NUM'),
                                               (']', 'PUNCT'), ('[', 'PUNCT'),
                                               ('⇨', 'PUNCT'), (']', 'PUNCT'),
                                               ('—', 'PUNCT'),
                                               ('один', 'PRON'), ('из', 'ADP'),
                                               ('восточнославянских', 'ADJ'),
                                               ('языков', 'NOUN'),
                                               (',', 'PUNCT'),
                                               ('национальный', 'ADJ'),
                                               ('язык', 'NOUN'),
                                               ('русского', 'ADJ'),
                                               ('народа', 'NOUN'),
                                               ('.', 'PUNCT')]
        elif pos_tagger == 'pymorphy2 - Morphological Analyzer':
            assert tokens_tagged == [
                ('Ру́сский', 'NOUN'), ('язы́к', 'NOUN'), ('(', 'PNCT'),
                ('[', 'PNCT'), ('ˈruskʲɪi̯', 'UNKN'), ('jɪˈzɨk', 'UNKN'),
                (']', 'PNCT'), ('Информация', 'NOUN'), ('о', 'PREP'),
                ('файле', 'NOUN'), ('слушать', 'INFN'), (')', 'PNCT'),
                ('[', 'PNCT'), ('~', 'UNKN'), ('3', 'NUMB'), (']', 'PNCT'),
                ('[', 'PNCT'), ('⇨', 'UNKN'), (']', 'PNCT'), ('—', 'PNCT'),
                ('один', 'ADJF'), ('из', 'PREP'),
                ('восточнославянских', 'ADJF'), ('языков', 'NOUN'),
                (',', 'PNCT'), ('национальный', 'ADJF'), ('язык', 'NOUN'),
                ('русского', 'ADJF'), ('народа', 'NOUN'), ('.', 'PNCT')
            ]
            assert tokens_tagged_universal == [('Ру́сский', 'NOUN'),
                                               ('язы́к', 'NOUN'),
                                               ('(', 'PUNCT'), ('[', 'PUNCT'),
                                               ('ˈruskʲɪi̯', 'SYM/X'),
                                               ('jɪˈzɨk', 'SYM/X'),
                                               (']', 'PUNCT'),
                                               ('Информация', 'NOUN'),
                                               ('о', 'ADP'), ('файле', 'NOUN'),
                                               ('слушать', 'VERB'),
                                               (')', 'PUNCT'), ('[', 'PUNCT'),
                                               ('~', 'SYM/X'), ('3', 'NUM'),
                                               (']', 'PUNCT'), ('[', 'PUNCT'),
                                               ('⇨', 'SYM/X'), (']', 'PUNCT'),
                                               ('—', 'PUNCT'), ('один', 'ADJ'),
                                               ('из', 'ADP'),
                                               ('восточнославянских', 'ADJ'),
                                               ('языков', 'NOUN'),
                                               (',', 'PUNCT'),
                                               ('национальный', 'ADJ'),
                                               ('язык', 'NOUN'),
                                               ('русского', 'ADJ'),
                                               ('народа', 'NOUN'),
                                               ('.', 'PUNCT')]
    elif lang == 'spa':
        assert tokens_tagged == [
            ('El', 'DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('español', 'NOUN__Gender=Masc|Number=Sing'), ('o', 'CCONJ'),
            ('castellano', 'NOUN__Gender=Masc|Number=Sing'),
            ('es',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('una', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('lengua', 'NOUN__Gender=Fem|Number=Sing'),
            ('romance', 'NOUN__Gender=Masc|Number=Sing'),
            ('procedente', 'ADJ__Number=Sing'),
            ('del', 'ADP__AdpType=Preppron'),
            ('latín', 'NOUN__Gender=Masc|Number=Sing'),
            ('hablado', 'ADJ__Gender=Masc|Number=Sing|VerbForm=Part'),
            ('.', 'PUNCT__PunctType=Peri')
        ]
        assert tokens_tagged_universal == [('El', 'DET'), ('español', 'NOUN'),
                                           ('o', 'CCONJ'),
                                           ('castellano', 'NOUN'),
                                           ('es', 'AUX'), ('una', 'DET'),
                                           ('lengua', 'NOUN'),
                                           ('romance', 'NOUN'),
                                           ('procedente', 'ADJ'),
                                           ('del', 'ADP'), ('latín', 'NOUN'),
                                           ('hablado', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'tha':
        if pos_tagger == 'PyThaiNLP - Perceptron Tagger (ORCHID)':
            assert tokens_tagged == [('ภาษา', 'NCMN'), ('ไทย', 'NPRP'),
                                     ('หรือ', 'JCRG'), ('ภาษา', 'NCMN'),
                                     ('ไทย', 'NPRP'), ('กลาง', 'VATT'),
                                     ('เป็น', 'VSTA'), ('ภาษา', 'NCMN'),
                                     ('ราชการ', 'NCMN'), ('และ', 'JCRG'),
                                     ('ภาษา', 'NCMN'), ('ประจำ', 'RPRE'),
                                     ('ชาติ', 'NCMN'), ('ของ', 'RPRE'),
                                     ('ประเทศไทย', 'NPRP')]
            assert tokens_tagged_universal == [('ภาษา', 'NOUN'),
                                               ('ไทย', 'PROPN'),
                                               ('หรือ', 'CCONJ'),
                                               ('ภาษา', 'NOUN'),
                                               ('ไทย', 'PROPN'),
                                               ('กลาง', 'VERB'),
                                               ('เป็น', 'VERB'),
                                               ('ภาษา', 'NOUN'),
                                               ('ราชการ', 'NOUN'),
                                               ('และ', 'CCONJ'),
                                               ('ภาษา', 'NOUN'),
                                               ('ประจำ', 'ADP'),
                                               ('ชาติ', 'NOUN'),
                                               ('ของ', 'ADP'),
                                               ('ประเทศไทย', 'PROPN')]
        elif pos_tagger == 'PyThaiNLP - Perceptron Tagger (PUD)':
            assert tokens_tagged == [('ภาษา', 'NOUN'), ('ไทย', 'PROPN'),
                                     ('หรือ', 'CCONJ'), ('ภาษา', 'NOUN'),
                                     ('ไทย', 'PROPN'), ('กลาง', 'NOUN'),
                                     ('เป็น', 'AUX'), ('ภาษา', 'NOUN'),
                                     ('ราชการ', 'NOUN'), ('และ', 'CCONJ'),
                                     ('ภาษา', 'NOUN'), ('ประจำ', 'VERB'),
                                     ('ชาติ', 'NOUN'), ('ของ', 'ADP'),
                                     ('ประเทศไทย', 'PROPN')]
            assert tokens_tagged_universal == [
                ('ภาษา', 'NOUN'), ('ไทย', 'PROPN'), ('หรือ', 'CCONJ'),
                ('ภาษา', 'NOUN'), ('ไทย', 'PROPN'), ('กลาง', 'NOUN'),
                ('เป็น', 'AUX'), ('ภาษา', 'NOUN'), ('ราชการ', 'NOUN'),
                ('และ', 'CCONJ'), ('ภาษา', 'NOUN'), ('ประจำ', 'VERB'),
                ('ชาติ', 'NOUN'), ('ของ', 'ADP'), ('ประเทศไทย', 'PROPN')
            ]
    elif lang == 'bod':
        assert tokens_tagged == [('བོད་', 'PROPN'), ('ཀྱི་', 'NO_POS'),
                                 ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'NO_POS'),
                                 ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'NO_POS'),
                                 ('དེ', 'DET'), ('འི་', 'PART'),
                                 ('ཉེ་འཁོར་', 'NOUN'), ('གྱི་', 'NO_POS'),
                                 ('ས་ཁུལ་', 'OTHER'), ('ཏེ', 'NO_POS'),
                                 ('།', 'PUNCT')]
        assert tokens_tagged_universal == [('བོད་', 'PROPN'), ('ཀྱི་', 'X'),
                                           ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'X'),
                                           ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'X'),
                                           ('དེ', 'DET'), ('འི་', 'PART'),
                                           ('ཉེ་འཁོར་', 'NOUN'), ('གྱི་', 'X'),
                                           ('ས་ཁུལ་', 'X'), ('ཏེ', 'X'),
                                           ('།', 'PUNCT')]
    elif lang == 'ukr':
        assert tokens_tagged == [('Украї́нська', 'ADJF'), ('мо́ва', 'ADJF'),
                                 ('(', 'PNCT'), ('МФА', 'UNKN'), (':', 'PNCT'),
                                 ('[', 'PNCT'), ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'UNKN'),
                                 ('ˈmɔwɑ̽', 'UNKN'), (']', 'PNCT'),
                                 (',', 'PNCT'), ('історичні', 'ADJF'),
                                 ('назви', 'NOUN'), ('—', 'PNCT'),
                                 ('ру́ська', 'ADJF'), (',', 'PNCT'),
                                 ('руси́нська[9][10][11', 'UNKN'),
                                 (']', 'PNCT'), ('[', 'PNCT'), ('*', 'PNCT'),
                                 ('2', 'NUMB'), (']', 'PNCT'), (')', 'PNCT'),
                                 ('—', 'PNCT'), ('національна', 'ADJF'),
                                 ('мова', 'NOUN'), ('українців', 'NOUN'),
                                 ('.', 'PNCT')]
        assert tokens_tagged_universal == [('Украї́нська', 'ADJ'),
                                           ('мо́ва', 'ADJ'), ('(', 'PUNCT'),
                                           ('МФА', 'SYM/X'), (':', 'PUNCT'),
                                           ('[', 'PUNCT'),
                                           ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'SYM/X'),
                                           ('ˈmɔwɑ̽', 'SYM/X'), (']', 'PUNCT'),
                                           (',', 'PUNCT'),
                                           ('історичні', 'ADJ'),
                                           ('назви', 'NOUN'), ('—', 'PUNCT'),
                                           ('ру́ська', 'ADJ'), (',', 'PUNCT'),
                                           ('руси́нська[9][10][11', 'SYM/X'),
                                           (']', 'PUNCT'), ('[', 'PUNCT'),
                                           ('*', 'PUNCT'), ('2', 'NUM'),
                                           (']', 'PUNCT'), (')', 'PUNCT'),
                                           ('—', 'PUNCT'),
                                           ('національна', 'ADJ'),
                                           ('мова', 'NOUN'),
                                           ('українців', 'NOUN'),
                                           ('.', 'PUNCT')]
    elif lang == 'vie':
        assert tokens_tagged == [('Tiếng', 'N'), ('Việt', 'Np'), (',', 'CH'),
                                 ('còn', 'C'), ('gọi', 'V'), ('tiếng', 'N'),
                                 ('Việt Nam', 'Np'), ('[', 'V'), ('5', 'M'),
                                 (']', 'CH'), (',', 'CH'), ('tiếng Kinh', 'N'),
                                 ('hay', 'C'), ('Việt ngữ', 'V'), (',', 'CH'),
                                 ('là', 'V'), ('ngôn ngữ', 'N'), ('của', 'E'),
                                 ('người', 'Nc'), ('Việt', 'Np'), ('(', 'CH'),
                                 ('dân tộc', 'N'), ('Kinh', 'Np'), (')', 'CH'),
                                 ('và', 'C'), ('là', 'V'), ('ngôn ngữ', 'N'),
                                 ('chính thức', 'A'), ('tại', 'E'),
                                 ('Việt Nam', 'Np'), ('.', 'CH')]
        assert tokens_tagged_universal == [
            ('Tiếng', 'NOUN'), ('Việt', 'PROPN'), (',', 'PUNCT'),
            ('còn', 'CCONJ'), ('gọi', 'VERB'), ('tiếng', 'NOUN'),
            ('Việt Nam', 'PROPN'), ('[', 'VERB'), ('5', 'NUM'), (']', 'PUNCT'),
            (',', 'PUNCT'), ('tiếng Kinh', 'NOUN'), ('hay', 'CCONJ'),
            ('Việt ngữ', 'VERB'), (',', 'PUNCT'), ('là', 'VERB'),
            ('ngôn ngữ', 'NOUN'), ('của', 'ADP'), ('người', 'NOUN'),
            ('Việt', 'PROPN'), ('(', 'PUNCT'), ('dân tộc', 'NOUN'),
            ('Kinh', 'PROPN'), (')', 'PUNCT'), ('và', 'CCONJ'), ('là', 'VERB'),
            ('ngôn ngữ', 'NOUN'), ('chính thức', 'ADJ'), ('tại', 'ADP'),
            ('Việt Nam', 'PROPN'), ('.', 'PUNCT')
        ]
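The paired assertions above check each tagger's language-specific tags against their Universal POS mappings. As a minimal, self-contained sketch of where such (token, tag) pairs can come from, the snippet below queries spaCy directly for both tag sets; it assumes the el_core_news_sm model is installed and is purely illustrative, not part of the test suite above.

import spacy

# Illustrative only: spaCy exposes a fine-grained tag (token.tag_) and a
# Universal POS tag (token.pos_) for every token.
nlp = spacy.load('el_core_news_sm')
doc = nlp('Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια.')

tokens_tagged = [(token.text, token.tag_) for token in doc]
tokens_tagged_universal = [(token.text, token.pos_) for token in doc]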
def wl_process_tokens(main, text, token_settings):
    settings = copy.deepcopy(token_settings)

    # Remove empty paragraphs
    text.tokens_multilevel = [
        para
        for para in text.tokens_multilevel
        if para
    ]

    # Punctuation
    if not settings['puncs']:
        i_tokens = 0

        # Mark tokens to be removed
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, token in enumerate(sentence):
                    if wl_checking_tokens.is_punc(token):
                        sentence[i] = ''

                        text.tags[i_tokens + i] = ''

                i_tokens += len(sentence)

        # Remove punctuation marks
        for para in text.tokens_multilevel:
            for i, sentence in enumerate(para):
                para[i] = [token for token in sentence if token]

        text.tags = [tags for tags in text.tags if tags != '']

        # Update offsets
        i_sentences = 0
        i_tokens = 0

        for i, para in enumerate(text.tokens_multilevel):
            text.offsets_paras[i] = i_tokens

            for j, sentence in enumerate(para):
                text.offsets_sentences[i_sentences + j] = i_tokens

                i_tokens += len(sentence)

            i_sentences += len(para)

    # Lemmatize all tokens
    if not settings['use_tags'] and settings['lemmatize_tokens']:
        for para in text.tokens_multilevel:
            for i, sentence in enumerate(para):
                para[i] = wl_lemmatization.wl_lemmatize(
                    main, sentence,
                    lang = text.lang
                )

    # Treat as all lowercase
    if settings['treat_as_lowercase']:
        for para in text.tokens_multilevel:
            for i, sentence in enumerate(para):
                para[i] = [token.lower() for token in sentence]

        text.tags = [
            [tag.lower() for tag in tags]
            for tags in text.tags
        ]

    # Words
    if settings['words']:
        # Lowercase
        if not settings['lowercase']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for i, token in enumerate(sentence):
                        if wl_checking_tokens.is_word_lowercase(token):
                            sentence[i] = ''
        # Uppercase
        if not settings['uppercase']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for i, token in enumerate(sentence):
                        if wl_checking_tokens.is_word_uppercase(token):
                            sentence[i] = ''
        # Title Case
        if not settings['title_case']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for i, token in enumerate(sentence):
                        if wl_checking_tokens.is_word_title_case(token):
                            sentence[i] = ''
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, token in enumerate(sentence):
                    if wl_checking_tokens.is_word_alphabetic(token):
                        sentence[i] = ''

    # Numerals
    if not settings['nums']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, token in enumerate(sentence):
                    if wl_checking_tokens.is_num(token):
                        sentence[i] = ''

    # Filter stop words
    if settings['filter_stop_words']:
        # Fetch the stop word list once instead of once per sentence
        stop_words = wl_stop_word_lists.wl_get_stop_word_list(main, lang = text.lang)

        for para in text.tokens_multilevel:
            for i, sentence in enumerate(para):
                para[i] = [
                    token if token not in stop_words else ''
                    for token in sentence
                ]

    # Ignore tags
    i_token = 0

    if settings['ignore_tags']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, token in enumerate(sentence):
                    sentence[i] = (token, [])
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, token in enumerate(sentence):
                    sentence[i] = (token, text.tags[i_token + i])

                i_token += len(sentence)

    # Use tags only
    if settings['use_tags']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, token in enumerate(sentence):
                    sentence[i] = sentence[i][1]
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, token in enumerate(sentence):
                    sentence[i] = f"{sentence[i][0]}{''.join(sentence[i][1])}"

    text.tokens_flat = list(wl_misc.flatten_list(text.tokens_multilevel))

    return text
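A minimal sketch of how wl_process_tokens might be driven. The keys mirror those read by the function above; the values shown are illustrative, and main and text are assumed to be a Wordless application instance and a text object already populated with tokens_multilevel, tags, and offset lists.

# Hypothetical settings dictionary; every key below is accessed in
# wl_process_tokens above.
token_settings = {
    'puncs': False,               # drop punctuation tokens and their tags
    'nums': True,                 # keep numerals
    'words': True,                # keep words, with per-case toggles below
    'lowercase': True,
    'uppercase': True,
    'title_case': True,
    'treat_as_lowercase': False,
    'lemmatize_tokens': False,
    'filter_stop_words': False,
    'ignore_tags': True,          # attach empty tag lists instead of text.tags
    'use_tags': False             # keep tokens rather than bare tags
}

text = wl_process_tokens(main, text, token_settings)
tokens = text.tokens_flat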
def wl_word_tokenize(main, text, lang, word_tokenizer='default'):
    tokens_multilevel = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    wl_nlp_utils.init_word_tokenizers(main,
                                      lang=lang,
                                      word_tokenizer=word_tokenizer)

    if word_tokenizer.startswith('spacy_'):
        # The input of SudachiPy cannot exceed 49,149 bytes
        if word_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
            # Assuming around 300 tokens per line, 4 characters per token, and
            # 4 bytes per character: 49149 / 4 / 4 / 300 ≈ 10 lines per section
            sections = wl_nlp_utils.split_into_chunks_text(text,
                                                           section_size=10)
        else:
            sections = wl_nlp_utils.split_into_chunks_text(
                text,
                section_size=main.settings_custom['files']['misc']
                ['read_files_in_chunks'])
    else:
        sections = wl_nlp_utils.split_into_chunks_text(text, 1)

    for section in sections:
        # spaCy
        if word_tokenizer.startswith('spacy_'):
            # Remove language code suffixes (Chinese, English, German, Portuguese, etc.), but not for Serbian
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            tokens_multilevel.append([])

            len_sents = len(list(doc.sents))

            for i, sentence in enumerate(doc.sents):
                tokens_sentence = []

                tokens = [token.text for token in sentence]
                len_tokens = len(tokens)

                for j, token in enumerate(tokens):
                    # Split paragraphs by new line character
                    len_lines = len(re.findall(r'\n', token))

                    if len_lines:
                        # Avoid appending a trailing empty paragraph when the text ends with a newline
                        if i == len_sents - 1 and j == len_tokens - 1 and token.endswith(
                                '\n'):
                            len_lines -= 1

                        if tokens_sentence:
                            tokens_multilevel[-1].append(tokens_sentence)

                            tokens_sentence = []

                        tokens_multilevel.extend([[]
                                                  for j in range(len_lines)])
                    else:
                        if token.strip():
                            tokens_sentence.append(token)

                if tokens_sentence:
                    tokens_multilevel[-1].append(tokens_sentence)
        else:
            tokens_multilevel.append([])

            if section.strip():
                # NLTK
                if word_tokenizer.startswith('nltk_'):
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang)

                    if word_tokenizer == 'nltk_nist':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_nist_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_nltk':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_nltk_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_penn_treebank':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_treebank_tokenizer.tokenize(
                                    sentence))
                    elif word_tokenizer == 'nltk_tok_tok':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_toktok_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_twitter':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_tweet_tokenizer.tokenize(sentence))
                # Sacremoses
                elif word_tokenizer == 'sacremoses_moses':
                    lang = wl_conversion.remove_lang_code_suffixes(main, lang)
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            main.__dict__[f'sacremoses_moses_tokenizer_{lang}']
                            .tokenize(sentence, escape=False))
                # Chinese
                elif word_tokenizer == 'jieba_zho':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(jieba.lcut(sentence))
                elif word_tokenizer == 'pkuseg_zho':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            main.pkuseg_word_tokenizer.cut(sentence))
                elif word_tokenizer == 'wordless_zho_char':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens = []
                        non_han_start = 0

                        for i, char in enumerate(sentence):
                            if i >= non_han_start:
                                if wl_checking_unicode.is_han(char):
                                    tokens.append(char)

                                    non_han_start += 1
                                else:
                                    # English
                                    if wl_checking_unicode.is_eng(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or not wl_checking_unicode.is_eng(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='eng_us'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # Other Languages
                                    else:
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or wl_checking_unicode.is_han(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='other'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break

                        tokens_multilevel[-1].append(tokens)
                # Japanese
                elif word_tokenizer == 'nagisa_jpn':
                    import nagisa

                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            nagisa.tagging(str(sentence)).words)
                elif word_tokenizer.startswith('sudachipy_jpn'):
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    if word_tokenizer == 'sudachipy_jpn_split_mode_a':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.
                                tokenize(sentence, sudachipy.SplitMode.A)
                            ])
                    elif word_tokenizer == 'sudachipy_jpn_split_mode_b':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.
                                tokenize(sentence, sudachipy.SplitMode.B)
                            ])
                    elif word_tokenizer == 'sudachipy_jpn_split_mode_c':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.
                                tokenize(sentence, sudachipy.SplitMode.C)
                            ])
                elif word_tokenizer == 'wordless_jpn_kanji':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens = []
                        non_han_start = 0

                        for i, char in enumerate(sentence):
                            if i >= non_han_start:
                                if wl_checking_unicode.is_han(char):
                                    tokens.append(char)

                                    non_han_start += 1
                                else:
                                    # Japanese Kana
                                    if wl_checking_unicode.is_kana(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or not wl_checking_unicode.is_kana(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='jpn'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # English
                                    elif wl_checking_unicode.is_eng(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or not wl_checking_unicode.is_eng(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='eng_us'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # Other Languages
                                    else:
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or wl_checking_unicode.is_han(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='other'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break

                        tokens_multilevel[-1].append(tokens)
                # Icelandic
                elif word_tokenizer == 'tokenizer_isl':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main,
                        section,
                        lang='isl',
                        sentence_tokenizer='tokenizer_isl')

                    for sentence in sentences:
                        tokens_multilevel[-1].append([
                            token for kind, token, val in tokenizer.tokenize(
                                sentence) if token
                        ])
                # Thai
                elif word_tokenizer.startswith('pythainlp_'):
                    # Preserve sentence boundaries
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang='tha')

                    if word_tokenizer == 'pythainlp_longest_matching':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='longest'))
                    elif word_tokenizer == 'pythainlp_max_matching':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence, engine='mm'))
                    elif word_tokenizer == 'pythainlp_max_matching_tcc':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='newmm'))
                    elif word_tokenizer == 'pythainlp_max_matching_tcc_safe_mode':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='newmm-safe'))
                    elif word_tokenizer == 'pythainlp_nercut':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='nercut'))
                # Tibetan
                elif word_tokenizer == 'botok_bod':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang='bod')

                    for sentence in sentences:
                        tokens_multilevel[-1].append([
                            token.text for token in
                            main.botok_word_tokenizer.tokenize(sentence)
                        ])
                # Vietnamese
                elif word_tokenizer == 'underthesea_vie':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main,
                        section,
                        lang='vie',
                        sentence_tokenizer='underthesea_vie')

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for para in tokens_multilevel:
        for i, sentence in enumerate(para):
            para[i] = [token.strip() for token in sentence if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for para in tokens_multilevel:
            for sentence in para:
                if sentence:
                    sentence[-1] = wl_texts.Wl_Token(sentence[-1],
                                                     boundary='',
                                                     sentence_ending=True)
    else:
        for para in tokens_multilevel:
            for sentence in para:
                if sentence:
                    sentence[-1] = wl_texts.Wl_Token(sentence[-1],
                                                     boundary=' ',
                                                     sentence_ending=True)

    return tokens_multilevel
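For reference, a short, hypothetical usage sketch of the tokenizer above. main is assumed to be a Wordless application instance with its settings loaded; the return value is a list of paragraphs, each a list of sentences, each a list of tokens, and every sentence-final token is a wl_texts.Wl_Token carrying its boundary (a space for space-delimited languages, an empty string for Chinese and Japanese).

# Illustrative only: iterate over the nested result.
tokens_multilevel = wl_word_tokenize(main, 'Hello world. Goodbye world.',
                                     lang='eng_us')

for para in tokens_multilevel:
    for sentence in para:
        print(sentence)

# wl_word_tokenize_flat below collapses the same structure into a flat token list.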
def wl_word_tokenize_flat(main, text, lang, word_tokenizer='default'):
    tokens_multilevel = wl_word_tokenize(main, text, lang, word_tokenizer)

    return list(wl_misc.flatten_list(tokens_multilevel))
def wl_word_tokenize(main,
                     text,
                     lang,
                     word_tokenizer='default',
                     flat_tokens=True):
    tokens_multilevel = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wl_text_utils.check_word_tokenizers(main,
                                            lang=lang,
                                            word_tokenizer=word_tokenizer)
    else:
        wl_text_utils.check_tokenizers(main,
                                       lang=lang,
                                       word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wl_sentence_tokenization.wl_sentence_tokenize(
            main, text, lang)

        if word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NLTK Tokenizer'):
            nltk_tokenizer = nltk.NLTKWordTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(nltk_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(toktok_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(tweet_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang)

        moses_tokenizer = sacremoses.MosesTokenizer(
            lang=wl_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            tokens_multilevel.append(
                moses_tokenizer.tokenize(sentence, escape=False))

    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_multilevel.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_multilevel.append(
                    [token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_multilevel.append(
                [token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_multilevel.append(
                        [token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_multilevel.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wl_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wl_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wl_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wl_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_multilevel.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_multilevel.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0
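                # Same character-scanning strategy as the Chinese character
                # tokenizer above, with an extra case: runs of kana are
                # grouped and re-tokenized as Japanese (lang='jpn') rather
                # than split into single characters.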

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wl_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wl_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wl_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wl_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wl_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wl_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_multilevel.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang='rus')

        for sentence in sentences:
            tokens_multilevel.append(
                [token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
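        # (unlike the branches above, there is no flat_tokens shortcut here,
        # so the text is always split into sentences first)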
        sentences = wl_sentence_tokenization.wl_sentence_tokenize(main,
                                                                  text,
                                                                  lang='tha')

        if word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='longest'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching + TCC'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching + TCC (Safe Mode)'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='newmm-safe'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang='bod')

        for sentence in sentences:
            tokens_multilevel.append([
                token.text
                for token in main.botok_word_tokenizer.tokenize(sentence)
            ])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )

        for sentence in sentences:
            tokens_multilevel.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_multilevel):
        tokens_multilevel[i] = [
            token.strip() for token in sentence if token.strip()
        ]

    # Record token boundaries
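    # Sentence-final tokens in Chinese and Japanese carry an empty boundary,
    # since these scripts are written without spaces between tokens; all
    # other languages keep a single space as the boundary.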
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_multilevel:
            if sentence:
                sentence[-1] = wl_text.Wl_Token(sentence[-1],
                                                boundary='',
                                                sentence_ending=True)
    else:
        for sentence in tokens_multilevel:
            if sentence:
                sentence[-1] = wl_text.Wl_Token(sentence[-1],
                                                boundary=' ',
                                                sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_multilevel):
            tokens_multilevel[i] = wl_sentence_tokenization.wl_clause_tokenize(
                main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wl_misc.flatten_list(tokens_multilevel))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_multilevel
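
# ---------------------------------------------------------------------------
# A minimal, self-contained sketch (an illustration, not part of Wordless):
# it mirrors the NLTK Tok-tok branch above and shows the multilevel versus
# the flattened return shapes without needing a configured `main` object.
# The split on '. ' is a naive stand-in for proper sentence tokenization.
def _demo_toktok_word_tokenize():
    import nltk

    text = ('English is a West Germanic language. '
            'It eventually became a global lingua franca.')
    tokens_multilevel = []

    toktok_tokenizer = nltk.ToktokTokenizer()

    # One token list per (naively split) sentence
    for sentence in text.split('. '):
        tokens_multilevel.append(toktok_tokenizer.tokenize(sentence))

    # Flattened view, analogous to requesting flat tokens above
    tokens_flat = [token for sentence in tokens_multilevel for token in sentence]

    return tokens_multilevel, tokens_flat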
def test_word_tokenize(lang, word_tokenizer, show_results = False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang = lang,
        word_tokenizer = word_tokenizer
    )
    tokens = list(wl_misc.flatten_list(tokens))

    if show_results:
        print(f'{lang} / {word_tokenizer}:')
        print(tokens)

    if lang == 'afr':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['Afrikaans', 'is', 'tipologies', 'gesien', "'n", 'Indo-Europese', ',', 'Wes-Germaanse', ',', 'Nederfrankiese', 'taal', ',', '[', '2', ']', 'wat', 'sy', 'ontstaan', 'aan', 'die', 'suidpunt', 'van', 'Afrika', 'gehad', 'het', 'onder', 'invloed', 'van', 'verskeie', 'ander', 'tale', 'en', 'taalgroepe', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Twitter Tokenizer']:
            assert tokens == ['Afrikaans', 'is', 'tipologies', 'gesien', "'", 'n', 'Indo-Europese', ',', 'Wes-Germaanse', ',', 'Nederfrankiese', 'taal', ',', '[', '2', ']', 'wat', 'sy', 'ontstaan', 'aan', 'die', 'suidpunt', 'van', 'Afrika', 'gehad', 'het', 'onder', 'invloed', 'van', 'verskeie', 'ander', 'tale', 'en', 'taalgroepe', '.']
        elif word_tokenizer == 'spaCy - Afrikaans Word Tokenizer':
            assert tokens == ['Afrikaans', 'is', 'tipologies', 'gesien', "'", 'n', 'Indo', '-', 'Europese', ',', 'Wes', '-', 'Germaanse', ',', 'Nederfrankiese', 'taal,[2', ']', 'wat', 'sy', 'ontstaan', 'aan', 'die', 'suidpunt', 'van', 'Afrika', 'gehad', 'het', 'onder', 'invloed', 'van', 'verskeie', 'ander', 'tale', 'en', 'taalgroepe', '.']
    elif lang == 'sqi':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Gjuha', 'shqipe', '(', 'ose', 'thjeshtë', 'shqipja', ')', 'është', 'gjuhë', 'dhe', 'degë', 'e', 'veçantë', 'e', 'familjes', 'indo-evropiane', 'të', 'folur', 'nga', 'më', 'shumë', 'se', '6', 'milionë', 'njerëz', '[', '4', ']', ',', 'kryesisht', 'në', 'Shqipëri', ',', 'Kosovë', 'dhe', 'Republikën', 'e', 'Maqedonisë', ',', 'por', 'edhe', 'në', 'zona', 'të', 'tjera', 'të', 'Evropës', 'Jugore', 'ku', 'ka', 'një', 'popullsi', 'shqiptare', ',', 'duke', 'përfshirë', 'Malin', 'e', 'Zi', 'dhe', 'Luginën', 'e', 'Preshevës', '.']
        elif word_tokenizer == 'spaCy - Albanian Word Tokenizer':
            assert tokens == ['Gjuha', 'shqipe', '(', 'ose', 'thjeshtë', 'shqipja', ')', 'është', 'gjuhë', 'dhe', 'degë', 'e', 'veçantë', 'e', 'familjes', 'indo', '-', 'evropiane', 'të', 'folur', 'nga', 'më', 'shumë', 'se', '6', 'milionë', 'njerëz[4', ']', ',', 'kryesisht', 'në', 'Shqipëri', ',', 'Kosovë', 'dhe', 'Republikën', 'e', 'Maqedonisë', ',', 'por', 'edhe', 'në', 'zona', 'të', 'tjera', 'të', 'Evropës', 'Jugore', 'ku', 'ka', 'një', 'popullsi', 'shqiptare', ',', 'duke', 'përfshirë', 'Malin', 'e', 'Zi', 'dhe', 'Luginën', 'e', 'Preshevës', '.']
    elif lang == 'ara':
        assert tokens == ['اللُّغَة', 'العَرَبِيّة', 'هي', 'أكثر', 'اللغات', 'تحدثاً', 'ونطقاً', 'ضمن', 'مجموعة', 'اللغات', 'السامية', '،', 'وإحدى', 'أكثر', 'اللغات', 'انتشاراً', 'في', 'العالم', '،', 'يتحدثها', 'أكثر', 'من', '467', 'مليون', 'نسمة،(1', ')', 'ويتوزع', 'متحدثوها', 'في', 'الوطن', 'العربي', '،', 'بالإضافة', 'إلى', 'العديد', 'من', 'المناطق', 'الأخرى', 'المجاورة', 'كالأحواز', 'وتركيا', 'وتشاد', 'ومالي', 'والسنغال', 'وإرتيريا', 'وإثيوبيا', 'وجنوب', 'السودان', 'وإيران', '.']
    elif lang == 'hye':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'spaCy - Armenian Word Tokenizer']:
            assert tokens == ['Հայոց', 'լեզվով', 'ստեղծվել', 'է', 'մեծ', 'գրականություն', ':', 'Գրաբարով', 'է', 'ավանդված', 'հայ', 'հին', 'պատմագրությունը', ',', 'գիտափիլիսոփայական', ',', 'մաթեմատիկական', ',', 'բժշկագիտական', ',', 'աստվածաբանական-դավանաբանական', 'գրականությունը։']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['Հայոց', 'լեզվով', 'ստեղծվել', 'է', 'մեծ', 'գրականություն', ':', 'Գրաբարով', 'է', 'ավանդված', 'հայ', 'հին', 'պատմագրությունը', ',', 'գիտափիլիսոփայական', ',', 'մաթեմատիկական', ',', 'բժշկագիտական', ',', 'աստվածաբանական-դավանաբանական', 'գրականությունը', '։']
    elif lang == 'eus':
        assert tokens == ['Euskara', 'Euskal', 'Herriko', 'hizkuntza', 'da.[5', ']']
    elif lang == 'ben':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামগুলোতেও', 'পরিচিত', ')', 'একটি', 'ইন্দো-আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা।']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামগুলোতেও', 'পরিচিত', ')', 'একটি', 'ইন্দো-আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।']
        elif word_tokenizer == 'spaCy - Arabic Word Tokenizer':
            assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামগুলোতেও', 'পরিচিত', ')', 'একটি', 'ইন্দো', '-', 'আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।']
    elif lang == 'bul':
        assert tokens == ['Бъ̀лгарският', 'езѝк', 'е', 'индоевропейски', 'език', 'от', 'групата', 'на', 'южнославянските', 'езици', '.']
    elif lang == 'cat':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', "l'Alguer", 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', "d'algunes", 'comarques', 'i', 'localitats', 'de', "l'interior", ')', ',', 'les', 'Illes', 'Balears', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', "l'Aragó", ')', ',', 'la', 'ciutat', 'de', "l'Alguer", '(', 'a', "l'illa", 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord', ',', '[', '8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrats', 'valencians', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'petites', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', "l'Argentina", ',', 'amb', '195.000', 'parlants', ')', '.', '[', '11', ']']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', "l'Alguer", 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', "d'algunes", 'comarques', 'i', 'localitats', 'de', "l'interior", ')', ',', 'les', 'Illes', 'Balears', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', "l'Aragó", ')', ',', 'la', 'ciutat', 'de', "l'Alguer", '(', 'a', "l'illa", 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord', ',', '[8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrats', 'valencians', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'petites', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', "l'Argentina", ',', 'amb', '195.000', 'parlants', ')', '.', '[', '11', ']']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', 'l', "'", 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', 'd', "'", 'algunes', 'comarques', 'i', 'localitats', 'de', 'l', "'", 'interior', ')', ',', 'les', 'Illes', 'Balears', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', 'l', "'", 'Aragó', ')', ',', 'la', 'ciutat', 'de', 'l', "'", 'Alguer', '(', 'a', 'l', "'", 'illa', 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord', ',', '[', '8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrats', 'valencians', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'petites', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', 'l', "'", 'Argentina', ',', 'amb', '195.000', 'parlants', ')', '.', '[', '11', ']']
        elif word_tokenizer == 'spaCy - Catalan Word Tokenizer':
            assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', "l'", 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', "d'", 'algunes', 'comarques', 'i', 'localitats', 'de', "l'", 'interior', ')', ',', 'les', 'Illes', 'Balears', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', "l'", 'Aragó', ')', ',', 'la', 'ciutat', 'de', "l'", 'Alguer', '(', 'a', "l'", 'illa', 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord,[8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrats', 'valencians),[9][10', ']', 'i', 'en', 'petites', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', "l'", 'Argentina', ',', 'amb', '195.000', 'parlants).[11', ']']
    elif lang == 'zho_cn':
        if word_tokenizer == 'jieba - Chinese Word Tokenizer':
            assert tokens == ['汉语', ',', '又称', '汉文', '、', '中文', '、', '中国', '话', '、', '中国', '语', '、', '华语', '、', '华文', '、', '唐话', '[', '2', ']', ',', '或', '被', '视为', '一个', '语族', ',', '或', '被', '视为', '隶属于', '汉藏语系', '汉语', '族', '之', '一种', '语言', '。']
        elif word_tokenizer == 'pkuseg - Chinese Word Tokenizer':
            assert tokens == ['汉语', ',', '又', '称', '汉文', '、', '中文', '、', '中国话', '、', '中国语', '、', '华语', '、', '华文', '、', '唐', '话[', '2', ']', ',', '或', '被', '视为', '一个', '语族', ',', '或', '被', '视为', '隶属于', '汉藏', '语系', '汉语族', '之一', '种', '语言', '。']
        elif word_tokenizer == 'Wordless - Chinese Character Tokenizer':
            assert tokens == ['汉', '语', ',', '又', '称', '汉', '文', '、', '中', '文', '、', '中', '国', '话', '、', '中', '国', '语', '、', '华', '语', '、', '华', '文', '、', '唐', '话', '[', '2', ']', ',', '或', '被', '视', '为', '一', '个', '语', '族', ',', '或', '被', '视', '为', '隶', '属', '于', '汉', '藏', '语', '系', '汉', '语', '族', '之', '一', '种', '语', '言', '。']
    elif lang == 'zho_tw':
        if word_tokenizer == 'jieba - Chinese Word Tokenizer':
            assert tokens == ['漢語', ',', '又', '稱漢文', '、', '中文', '、', '中國話', '、', '中國語', '、', '華語', '、', '華文', '、', '唐話', '[', '2', ']', ',', '或', '被', '視為', '一個', '語族', ',', '或', '被', '視為', '隸屬', '於', '漢藏語', '系漢', '語族', '之一', '種語', '言', '。']
        elif word_tokenizer == 'pkuseg - Chinese Word Tokenizer':
            assert tokens == ['漢語', ',', '又', '稱', '漢文', '、', '中文', '、', '中', '國話', '、', '中國語', '、', '華語', '、', '華文', '、', '唐', '話[', '2', ']', ',', '或', '被', '視為', '一', '個', '語族', ',', '或', '被', '視', '為隸', '屬於', '漢藏', '語系', '漢語族', '之一', '種', '語言', '。']
        elif word_tokenizer == 'Wordless - Chinese Character Tokenizer':
            assert tokens == ['漢', '語', ',', '又', '稱', '漢', '文', '、', '中', '文', '、', '中', '國', '話', '、', '中', '國', '語', '、', '華', '語', '、', '華', '文', '、', '唐', '話', '[', '2', ']', ',', '或', '被', '視', '為', '一', '個', '語', '族', ',', '或', '被', '視', '為', '隸', '屬', '於', '漢', '藏', '語', '系', '漢', '語', '族', '之', '一', '種', '語', '言', '。']
    elif lang == 'hrv':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'spaCy - Croatian Word Tokenizer']:
            assert tokens == ['Hrvatski', 'jezik', '(', 'ISO', '639', '-', '3', ':', 'hrv', ')', 'skupni', 'je', 'naziv', 'za', 'nacionalni', 'standardni', 'jezik', 'Hrvata', ',', 'te', 'za', 'skup', 'narječja', 'i', 'govora', 'kojima', 'govore', 'ili', 'su', 'nekada', 'govorili', 'Hrvati', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'NLTK - Twitter Tokenizer']:
            assert tokens == ['Hrvatski', 'jezik', '(', 'ISO', '639-3', ':', 'hrv', ')', 'skupni', 'je', 'naziv', 'za', 'nacionalni', 'standardni', 'jezik', 'Hrvata', ',', 'te', 'za', 'skup', 'narječja', 'i', 'govora', 'kojima', 'govore', 'ili', 'su', 'nekada', 'govorili', 'Hrvati', '.']
    elif lang == 'ces':
        assert tokens == ['Čeština', 'neboli', 'český', 'jazyk', 'je', 'západoslovanský', 'jazyk', ',', 'nejbližší', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.']
    elif lang == 'dan':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Dansk', 'er', 'et', 'nordgermansk', 'sprog', 'af', 'den', 'østnordiske', '(', 'kontinentale', ')', 'gruppe', ',', 'der', 'tales', 'af', 'ca', '.', 'seks', 'millioner', 'mennesker', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'spaCy - Danish Word Tokenizer']:
            assert tokens == ['Dansk', 'er', 'et', 'nordgermansk', 'sprog', 'af', 'den', 'østnordiske', '(', 'kontinentale', ')', 'gruppe', ',', 'der', 'tales', 'af', 'ca.', 'seks', 'millioner', 'mennesker', '.']
    elif lang == 'nld':
        assert tokens == ['Het', 'Nederlands', 'is', 'een', 'West-Germaanse', 'taal', 'en', 'de', 'moedertaal', 'van', 'de', 'meeste', 'inwoners', 'van', 'Nederland', ',', 'België', 'en', 'Suriname', '.']
    elif lang == 'eng':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'Sacremoses - Moses Tokenizer']:
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua', 'franca', '.', '[', '4', ']', '[', '5', ']']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'syntok - Word Tokenizer']:
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua', 'franca.', '[', '4', ']', '[', '5', ']']
        elif word_tokenizer == 'NLTK - Tok-tok Tokenizer':
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua', 'franca.[', '4', ']', '[', '5', ']']
        elif word_tokenizer == 'spaCy - English Word Tokenizer':
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua', 'franca.[4][5', ']']
    elif lang == 'est':
        assert tokens == ['Eesti', 'keel', '(', 'varasem', 'nimetus', 'maakeel', ')', 'on', 'läänemeresoome', 'lõunarühma', 'kuuluv', 'keel', '.']
    elif lang == 'fin':
        assert tokens == ['Suomen', 'kieli', '(', 'suomi', ')', 'on', 'uralilaisten', 'kielten', 'itämerensuomalaiseen', 'ryhmään', 'kuuluva', 'kieli', '.']
    elif lang == 'fra':
        assert tokens == ['Le', 'français', 'est', 'une', 'langue', 'indo-européenne', 'de', 'la', 'famille', 'des', 'langues', 'romanes', '.']
    elif lang == 'deu':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'syntok - Word Tokenizer']:
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw', '.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abgekürzt', 'dt', '.', 'oder', 'dtsch', '.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'NLTK - Tok-tok Tokenizer']:
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abgekürzt', 'dt.', 'oder', 'dtsch.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw', '.', 'Deutsch', '(', '[', 'dɔʏ', '̯', 't', '͡', 'ʃ', '];', 'abgekürzt', 'dt', '.', 'oder', 'dtsch', '.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ', '̯', 't', '͡', 'ʃ', ']', ';', 'abgekürzt', 'dt.', 'oder', 'dtsch', '.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
        elif word_tokenizer == 'spaCy - German Word Tokenizer':
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abgekürzt', 'dt', '.', 'oder', 'dtsch', '.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
    elif lang == 'ell':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'Sacremoses - Moses Tokenizer']:
            assert tokens == ['Η', 'ελληνική', 'γλώσσα', 'ανήκει', 'στην', 'ινδοευρωπαϊκή', 'οικογένεια', '[', '9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό', 'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνική', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της', 'Κύπρου', '.']
        elif word_tokenizer == 'spaCy - Greek (Modern) Word Tokenizer':
            assert tokens == ['Η', 'ελληνική', 'γλώσσα', 'ανήκει', 'στην', 'ινδοευρωπαϊκή', 'οικογένεια[9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό', 'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνική', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της', 'Κύπρου', '.']
    elif lang == 'guj':
        assert tokens == ['ગુજરાતી', '\u200d(/ɡʊdʒəˈrɑːti/[૭', ']', ',', 'રોમન', 'લિપિમાં', ':', 'Gujarātī', ',', 'ઉચ્ચાર', ':', '[', 'ɡudʒəˈɾɑːtiː', ']', ')', 'ભારત', 'દેશના', 'ગુજરાત', 'રાજ્યની', 'ઇન્ડો-આર્યન', 'ભાષા', 'છે', ',', 'અને', 'મુખ્યત્વે', 'ગુજરાતી', 'લોકો', 'દ્વારા', 'બોલાય', 'છે.']
    elif lang == 'heb':
        assert tokens == ['עִבְרִית', 'היא', 'שפה', 'שמית', ',', 'ממשפחת', 'השפות', 'האפרו', '-', 'אסיאתיות', ',', 'הידועה', 'כשפתם', 'של', 'היהודים', 'ושל', 'השומרונים', ',', 'אשר', 'ניב', 'מודרני', 'שלה', '(', 'עברית', 'ישראלית', ')', 'הוא', 'שפתה', 'הרשמית', 'של', 'מדינת', 'ישראל', ',', 'מעמד', 'שעוגן', 'בשנת', '2018', 'בחוק', 'יסוד', ':', 'ישראל', '–', 'מדינת', 'הלאום', 'של', 'העם', 'היהודי', '.']
    elif lang == 'hin':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['हिन्दी', 'विश्व', 'की', 'एक', 'प्रमुख', 'भाषा', 'है', 'एवं', 'भारत', 'की', 'राजभाषा', 'है।']
        elif word_tokenizer in ['NLTK - Twitter Tokenizer',
                                'spaCy - Hindi Word Tokenizer']:
            assert tokens == ['हिन्दी', 'विश्व', 'की', 'एक', 'प्रमुख', 'भाषा', 'है', 'एवं', 'भारत', 'की', 'राजभाषा', 'है', '।']
    elif lang == 'hun':
        assert tokens == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tagja', ',', 'a', 'finnugor', 'nyelvek', 'közé', 'tartozó', 'ugor', 'nyelvek', 'egyike', '.']
    elif lang == 'isl':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'Sacremoses - Moses Tokenizer',
                              'Tokenizer - Icelandic Word Tokenizer']:
            assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga', '.', '[', '4', ']']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga.', '[', '4', ']']
        elif word_tokenizer == 'spaCy - Icelandic Word Tokenizer':
            assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga.[4', ']']
    elif lang == 'ind':
        assert tokens == ['Bahasa', 'Indonesia', 'adalah', 'bahasa', 'Melayu', 'baku', 'yang', 'dijadikan', 'sebagai', 'bahasa', 'resmi', 'Republik', 'Indonesia[1', ']', 'dan', 'bahasa', 'persatuan', 'bangsa', 'Indonesia.[2', ']']
    elif lang == 'gle':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Is', 'ceann', 'de', 'na', 'teangacha', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'dtrí', 'cinn', 'de', 'theangacha', 'Ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'Gaelacha', '(', '.', 'i', '.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'háirithe', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'Sacremoses - Moses Tokenizer',
                                'spaCy - Irish Word Tokenizer']:
            assert tokens == ['Is', 'ceann', 'de', 'na', 'teangacha', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'dtrí', 'cinn', 'de', 'theangacha', 'Ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'Gaelacha', '(', '.i.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'háirithe', '.']
    elif lang == 'ita':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ["L'italiano", '(', '[', 'itaˈljaːno', ']', '[', 'Nota', '1', ']', 'ascolta', '[', '?', '·info', ']', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ["L'italiano", '(', '[', 'itaˈljaːno', ']', '[', 'Nota', '1', ']', 'ascolta', '[', '?', '·', 'info', ']', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ["L'", 'italiano', '(', '[', 'itaˈljaːno', ']', '[', 'Nota', '1', ']', 'ascolta', '[', '?', '·', 'info', ']', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.']
        elif word_tokenizer == 'spaCy - Italian Word Tokenizer':
            assert tokens == ["L'", 'italiano', '(', '[', 'itaˈljaːno][Nota', '1', ']', 'ascolta[?·info', ']', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.']
    elif lang == 'jpn':
        if word_tokenizer == 'nagisa - Japanese Word Tokenizer':
            assert tokens == ['日本', '語', '(', 'にほんご', '、', 'にっぽん', 'ご', '[', '注', '1', ']', ')', 'は', '、', '主に', '日本', '国', '内', 'や', '日本', '人', '同士', 'の', '間', 'で', '使用', 'さ', 'れ', 'て', 'いる', '言語', 'で', 'ある', '。']
        elif word_tokenizer == 'Wordless - Japanese Kanji Tokenizer':
            assert tokens == ['日', '本', '語', '(', 'にほんご', '、', 'にっぽん', 'ご', '[', '注', '1', ']', ')', 'は', '、', '主', 'に', '日', '本', '国', '内', 'や', '日', '本', '人', '同', '士', 'の', '間', 'で', '使', '用', 'さ', 'れ', 'て', 'いる', '言', '語', 'で', 'ある', '。']
    elif lang == 'kan':
        assert tokens == ['ದ್ರಾವಿಡ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಪ್ರಾಮುಖ್ಯವುಳ್ಳ', 'ಭಾಷೆಯೂ', 'ಭಾರತದ', 'ಪುರಾತನವಾದ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಒಂದೂ', 'ಆಗಿರುವ', 'ಕನ್ನಡ', 'ಭಾಷೆಯನ್ನು', 'ಅದರ', 'ವಿವಿಧ', 'ರೂಪಗಳಲ್ಲಿ', 'ಸುಮಾರು', '೪೫', 'ದಶಲಕ್ಷ', 'ಜನರು', 'ಆಡು', 'ನುಡಿಯಾಗಿ', 'ಬಳಸುತ್ತಲಿದ್ದಾರೆ', '.']
    elif lang == 'lav':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'Sacremoses - Moses Tokenizer']:
            assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,7', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda', '.', '[', '3', ']']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,7', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda.', '[', '3', ']']
        elif word_tokenizer == 'spaCy - Latvian Word Tokenizer':
            assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,7', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda.[3', ']']
    elif lang == 'lij':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['O', 'Lìgure', '(', 'in', 'monegasco', ':', 'lenga', 'ligüra', 'e', 'lenga', 'lìgura', ')', 'o', "l'é", "'na", 'lengoa', '[', '1', ']', 'do', 'gruppo', 'lengoìstego', 'itàlico', 'oçidentâ', 'parlâ', 'in', 'Italia', '(', 'Liguria', ',', 'Piemonte', ',', 'Emilia-Romagna', 'e', 'Sardegna', ')', ',', 'into', 'sud', 'da', 'Fransa', ',', 'in', 'Còrsega', ',', 'e', 'into', 'Prinçipato', 'de', 'Monego', '.']
        elif word_tokenizer == 'NLTK - NLTK Tokenizer':
            assert tokens == ['O', 'Lìgure', '(', 'in', 'monegasco', ':', 'lenga', 'ligüra', 'e', 'lenga', 'lìgura', ')', 'o', 'l', "'", 'é', "'na", 'lengoa', '[', '1', ']', 'do', 'gruppo', 'lengoìstego', 'itàlico', 'oçidentâ', 'parlâ', 'in', 'Italia', '(', 'Liguria', ',', 'Piemonte', ',', 'Emilia-Romagna', 'e', 'Sardegna', ')', ',', 'into', 'sud', 'da', 'Fransa', ',', 'in', 'Còrsega', ',', 'e', 'into', 'Prinçipato', 'de', 'Monego', '.']
        elif word_tokenizer == 'spaCy - Ligurian Word Tokenizer':
            assert tokens == ['O', 'Lìgure', '(', 'in', 'monegasco', ':', 'lenga', 'ligüra', 'e', 'lenga', 'lìgura', ')', 'o', "l'", 'é', "'", 'na', 'lengoa[1', ']', 'do', 'gruppo', 'lengoìstego', 'itàlico', 'oçidentâ', 'parlâ', 'in', 'Italia', '(', 'Liguria', ',', 'Piemonte', ',', 'Emilia', '-', 'Romagna', 'e', 'Sardegna', ')', ',', 'into', 'sud', 'da', 'Fransa', ',', 'in', 'Còrsega', ',', 'e', 'into', 'Prinçipato', 'de', 'Monego', '.']
    elif lang == 'lit':
        assert tokens == ['Lietuvių', 'kalba', '–', 'iš', 'baltų', 'prokalbės', 'kilusi', 'lietuvių', 'tautos', 'kalba', ',', 'kuri', 'Lietuvoje', 'yra', 'valstybinė', ',', 'o', 'Europos', 'Sąjungoje', '–', 'viena', 'iš', 'oficialiųjų', 'kalbų', '.']
    elif lang == 'ltz':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ["D'Lëtzebuergesch", 'gëtt', 'an', 'der', 'däitscher', 'Dialektologie', 'als', 'ee', 'westgermaneschen', ',', 'mëtteldäitschen', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéiert', '.']
        elif word_tokenizer == 'spaCy - Luxembourgish Word Tokenizer':
            assert tokens == ["D'", 'Lëtzebuergesch', 'gëtt', 'an', 'der', 'däitscher', 'Dialektologie', 'als', 'ee', 'westgermaneschen', ',', 'mëtteldäitschen', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéiert', '.']
    elif lang == 'mal':
        assert tokens == ['ഇന്ത്യയിൽ', 'പ്രധാനമായും', 'കേരള', 'സംസ്ഥാനത്തിലും', 'ലക്ഷദ്വീപിലും', 'പുതുച്ചേരിയുടെ', 'ഭാഗമായ', 'മയ്യഴിയിലും', 'സംസാരിക്കപ്പെടുന്ന', 'ഭാഷയാണ്', 'മലയാളം.']
    elif lang == 'mar':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['मराठीभाषा', 'ही', 'इंडो-युरोपीय', 'भाषाकुलातील', 'एक', 'भाषा', 'आहे', '.']
        elif word_tokenizer == 'spaCy - Marathi Word Tokenizer':
            assert tokens == ['मराठीभाषा', 'ही', 'इंडो', '-', 'युरोपीय', 'भाषाकुलातील', 'एक', 'भाषा', 'आहे', '.']
    elif lang == 'nep':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['नेपाली', 'भाषा', '(', 'अन्तर्राष्ट्रिय', 'ध्वन्यात्मक', 'वर्णमाला', '[', 'neˈpali', 'bʱaʂa', ']', ')', 'नेपालको', 'सम्पर्क', 'भाषा', 'तथा', 'भारत', ',', 'भुटान', 'र', 'म्यानमारको', 'केही', 'भागमा', 'मातृभाषाको', 'रूपमा', 'बोलिने', 'भाषा', 'हो।']
        elif word_tokenizer in ['NLTK - Twitter Tokenizer',
                                'spaCy - Nepali Word Tokenizer']:
            assert tokens == ['नेपाली', 'भाषा', '(', 'अन्तर्राष्ट्रिय', 'ध्वन्यात्मक', 'वर्णमाला', '[', 'neˈpali', 'bʱaʂa', ']', ')', 'नेपालको', 'सम्पर्क', 'भाषा', 'तथा', 'भारत', ',', 'भुटान', 'र', 'म्यानमारको', 'केही', 'भागमा', 'मातृभाषाको', 'रूपमा', 'बोलिने', 'भाषा', 'हो', '।']
    elif lang == 'nob':
        assert tokens == ['Bokmål', 'er', 'en', 'varietet', 'av', 'norsk', 'språk', '.']
    elif lang == 'fas':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران،', 'افغانستان،', '[', '۳', ']', 'تاجیکستان', '[', '۴', ']', 'و', 'ازبکستان', '[', '۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
        elif word_tokenizer == 'NLTK - Tok-tok Tokenizer':
            assert tokens == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان', '،', '[', '۳', ']', 'تاجیکستان[', '۴', ']', 'و', 'ازبکستان[', '۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان', '،', '[', '۳', ']', 'تاجیکستان', '[', '۴', ']', 'و', 'ازبکستان', '[', '۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
        elif word_tokenizer == 'spaCy - Persian Word Tokenizer':
            assert tokens == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳', ']', 'تاجیکستان[۴', ']', 'و', 'ازبکستان[۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
    elif lang == 'pol':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'spaCy - Polish Word Tokenizer']:
            assert tokens == ['Język', 'polski', ',', 'polszczyzna', ',', 'skrót', ':', 'pol', '.', '–', 'język', 'naturalny', 'należący', 'do', 'grupy', 'języków', 'zachodniosłowiańskich', '(', 'do', 'której', 'należą', 'również', 'czeski', ',', 'słowacki', ',', 'kaszubski', ',', 'dolnołużycki', ',', 'górnołużycki', 'i', 'wymarły', 'połabski', ')', ',', 'stanowiącej', 'część', 'rodziny', 'języków', 'indoeuropejskich', '.']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['Język', 'polski', ',', 'polszczyzna', ',', 'skrót', ':', 'pol.', '–', 'język', 'naturalny', 'należący', 'do', 'grupy', 'języków', 'zachodniosłowiańskich', '(', 'do', 'której', 'należą', 'również', 'czeski', ',', 'słowacki', ',', 'kaszubski', ',', 'dolnołużycki', ',', 'górnołużycki', 'i', 'wymarły', 'połabski', ')', ',', 'stanowiącej', 'część', 'rodziny', 'języków', 'indoeuropejskich', '.']
    elif lang == 'por':
        assert tokens == ['A', 'língua', 'portuguesa', ',', 'também', 'designada', 'português', ',', 'é', 'uma', 'língua', 'românica', 'flexiva', 'ocidental', 'originada', 'no', 'galego-português', 'falado', 'no', 'Reino', 'da', 'Galiza', 'e', 'no', 'norte', 'de', 'Portugal', '.']
    elif lang == 'ron':
        assert tokens == ['Limba', 'română', 'este', 'o', 'limbă', 'indo-europeană', ',', 'din', 'grupul', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbilor', 'romanice', '.']
    elif lang == 'rus':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Tok-tok Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'razdel - Russian Word Tokenizer']:
            assert tokens == ['Ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'Информация', 'о', 'файле', 'слушать', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянских', 'языков', ',', 'национальный', 'язык', 'русского', 'народа', '.']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['Ру', '́', 'сский', 'язы', '́', 'к', '(', '[', 'ˈruskʲɪi', '̯', 'jɪˈzɨk', ']', 'Информация', 'о', 'файле', 'слушать', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянских', 'языков', ',', 'национальный', 'язык', 'русского', 'народа', '.']
        elif word_tokenizer == 'spaCy - Russian Word Tokenizer':
            assert tokens == ['Ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'Информация', 'о', 'файле', 'слушать)[~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянских', 'языков', ',', 'национальный', 'язык', 'русского', 'народа', '.']
    elif lang == 'srp_cyrl':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Српски', 'језик', 'припада', 'словенској', 'групи', 'језика', 'породице', 'индоевропских', 'језика', '.', '[', '12', ']']
        elif word_tokenizer == 'spaCy - Serbian Word Tokenizer':
            assert tokens == ['Српски', 'језик', 'припада', 'словенској', 'групи', 'језика', 'породице', 'индоевропских', 'језика.[12', ']']
    elif lang == 'srp_latn':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Srpski', 'jezik', 'pripada', 'slovenskoj', 'grupi', 'jezika', 'porodice', 'indoevropskih', 'jezika', '.', '[', '12', ']']
        elif word_tokenizer == 'spaCy - Serbian Word Tokenizer':
            assert tokens == ['Srpski', 'jezik', 'pripada', 'slovenskoj', 'grupi', 'jezika', 'porodice', 'indoevropskih', 'jezika.[12', ']']
    elif lang == 'sin':
        assert tokens == ['ශ්\u200dරී', 'ලංකාවේ', 'ප්\u200dරධාන', 'ජාතිය', 'වන', 'සිංහල', 'ජනයාගේ', 'මව්', 'බස', 'සිංහල', 'වෙයි', '.']
    elif lang == 'slk':
        assert tokens == ['Slovenčina', 'patrí', 'do', 'skupiny', 'západoslovanských', 'jazykov', '(', 'spolu', 's', 'češtinou', ',', 'poľštinou', ',', 'hornou', 'a', 'dolnou', 'lužickou', 'srbčinou', 'a', 'kašubčinou', ')', '.']
    elif lang == 'slv':
        assert tokens == ['Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina', ']', 'je', 'združeni', 'naziv', 'za', 'uradni', 'knjižni', 'jezik', 'Slovencev', 'in', 'skupno', 'ime', 'za', 'narečja', 'in', 'govore', ',', 'ki', 'jih', 'govorijo', 'ali', 'so', 'jih', 'nekoč', 'govorili', 'Slovenci', '.']
    elif lang == 'spa':
        assert tokens == ['El', 'español', 'o', 'castellano', 'es', 'una', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablado', '.']
    elif lang == 'swe':
        assert tokens == ['Svenska', '(', 'svenska', '(', 'info', ')', ')', 'är', 'ett', 'östnordiskt', 'språk', 'som', 'talas', 'av', 'ungefär', 'tio', 'miljoner', 'personer', 'främst', 'i', 'Sverige', 'där', 'språket', 'har', 'en', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'men', 'även', 'som', 'det', 'ena', 'nationalspråket', 'i', 'Finland', 'och', 'som', 'enda', 'officiella', 'språk', 'på', 'Åland', '.']
    elif lang == 'tgl':
        assert tokens == ['Ang', 'Wikang', 'Tagalog[2', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜅ᜔', 'ᜆᜄᜎᜓᜄ᜔', ')', ',', 'na', 'kilala', 'rin', 'sa', 'payak', 'na', 'pangalang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pangunahing', 'wika', 'ng', 'Pilipinas', 'at', 'sinasabing', 'ito', 'ang', 'de', 'facto', '(', '"', 'sa', 'katunayan', '"', ')', 'ngunit', 'hindî', 'de', 'jure', '(', '"', 'sa', 'batas', '"', ')', 'na', 'batayan', 'na', 'siyang', 'pambansang', 'Wikang', 'Filipino', '(', 'mula', '1961', 'hanggang', '1987', ':', 'Pilipino).[2', ']']
    elif lang == 'tgk':
        assert tokens == ['Забони', 'тоҷикӣ', '—', 'забоне', ',', 'ки', 'дар', 'Эрон', ':', 'форсӣ', ',', 'ва', 'дар', 'Афғонистон', 'дарӣ', 'номида', 'мешавад', ',', 'забони', 'давлатии', 'кишварҳои', 'Тоҷикистон', ',', 'Эрон', 'ва', 'Афғонистон', 'мебошад', '.']
    elif lang == 'tam':
        if word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['தமிழ', '்', 'மொழி', '(', 'Tamil', 'language', ')', 'தமிழர', '்', 'களினதும', '்', ',', 'தமிழ', '்', 'பேசும', '்', 'பலரதும', '்', 'தாய', '்', 'மொழி', 'ஆகும', '்', '.']
        elif word_tokenizer == 'spaCy - Tamil Word Tokenizer':
            assert tokens == ['தமிழ்', 'மொழி', '(', 'Tamil', 'language', ')', 'தமிழர்களினதும்', ',', 'தமிழ்', 'பேசும்', 'பலரதும்', 'தாய்மொழி', 'ஆகும்', '.']
    elif lang == 'tat':
        assert tokens == ['Татар', 'теле', '—', 'татарларның', 'милли', 'теле', ',', 'Татарстанның', 'дәүләт', 'теле', ',', 'таралышы', 'буенча', 'Русиядә', 'икенче', 'тел', '.']
    elif lang == 'tel':
        assert tokens == ['ఆంధ్ర', 'ప్రదేశ్', ',', 'తెలంగాణ', 'రాష్ట్రాల', 'అధికార', 'భాష', 'తెలుగు', '.']
    elif lang == 'tha':
        if word_tokenizer == 'AttaCut - Thai Word Tokenizer':
            assert tokens == ['ภาษา', 'ไทย', 'หรือ', 'ภาษา', 'ไทย', 'กลาง', 'เป็น', 'ภาษา', 'ราชการ', 'และ', 'ภาษา', 'ประจำ', 'ชาติ', 'ของ', 'ประเทศไทย']
        elif word_tokenizer in ['PyThaiNLP - Longest Matching',
                                'PyThaiNLP - Maximum Matching + TCC',
                                'PyThaiNLP - Maximum Matching + TCC (Safe Mode)']:
            assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทย', 'กลาง', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย']
        elif word_tokenizer == 'PyThaiNLP - Maximum Matching':
            assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทยกลาง', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย']
    elif lang == 'bod':
        assert tokens == ['བོད་', 'ཀྱི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'དེ', 'འི་', 'ཉེ་འཁོར་', 'གྱི་', 'ས་ཁུལ་', 'ཏེ', '།']
    elif lang == 'tur':
        assert tokens == ['Türkçe', 'ya', 'da', 'Türk', 'dili', ',', 'batıda', 'Balkanlar’dan', 'başlayıp', 'doğuda', 'Hazar', 'Denizi', 'sahasına', 'kadar', 'konuşulan', 'Türkî', 'diller', 'dil', 'ailesine', 'ait', 'sondan', 'eklemeli', 'bir', 'dil.[12', ']']
    elif lang == 'ukr':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичні', 'назви', '—', 'ру́ська', ',', 'руси́нська', '[', '9', ']', '[', '10', ']', '[', '11', ']', '[', '*', '2', ']', ')', '—', 'національна', 'мова', 'українців', '.']
        elif word_tokenizer == 'spaCy - Ukrainian Word Tokenizer':
            assert tokens == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичні', 'назви', '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національна', 'мова', 'українців', '.']
    elif lang == 'urd':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['اُردُو', 'لشکری', 'زبان', '[', '8', ']', '(', 'یا', 'جدید', 'معیاری', 'اردو', ')', 'برصغیر', 'کی', 'معیاری', 'زبانوں', 'میں', 'سے', 'ایک', 'ہے۔']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['اُردُو', 'لشکری', 'زبان', '[8', ']', '(', 'یا', 'جدید', 'معیاری', 'اردو', ')', 'برصغیر', 'کی', 'معیاری', 'زبانوں', 'میں', 'سے', 'ایک', 'ہے', '۔']
        elif word_tokenizer == 'spaCy - Urdu Word Tokenizer':
            assert tokens == ['اُردُو', 'لشکری', 'زبان[8', ']', '(', 'یا', 'جدید', 'معیاری', 'اردو', ')', 'برصغیر', 'کی', 'معیاری', 'زبانوں', 'میں', 'سے', 'ایک', 'ہے', '۔']
    elif lang == 'vie':
        if word_tokenizer == 'NLTK - Tok-tok Tokenizer':
            assert tokens == ['Tiếng', 'Việt', ',', 'còn', 'gọi', 'tiếng', 'Việt', 'Nam[', '5', ']', ',', 'tiếng', 'Kinh', 'hay', 'Việt', 'ngữ', ',', 'là', 'ngôn', 'ngữ', 'của', 'người', 'Việt', '(', 'dân', 'tộc', 'Kinh', ')', 'và', 'là', 'ngôn', 'ngữ', 'chính', 'thức', 'tại', 'Việt', 'Nam', '.']
        elif word_tokenizer == 'Underthesea - Vietnamese Word Tokenizer':
            assert tokens == ['Tiếng', 'Việt', ',', 'còn', 'gọi', 'tiếng', 'Việt Nam', '[', '5', ']', ',', 'tiếng Kinh', 'hay', 'Việt ngữ', ',', 'là', 'ngôn ngữ', 'của', 'người', 'Việt', '(', 'dân tộc', 'Kinh', ')', 'và', 'là', 'ngôn ngữ', 'chính thức', 'tại', 'Việt Nam', '.']
    elif lang == 'yor':
        assert tokens == ['Èdè', 'Yorùbá', 'Ni', 'èdè', 'tí', 'ó', 'ṣàkójọ', 'pọ̀', 'gbogbo', 'kú', 'oótu', 'o', '-', 'ò', '-', 'jíire', 'bí', ',', 'níapá', 'ìwọ̀', 'Oòrùn', 'ilẹ̀', 'Nàìjíríà', ',', 'tí', 'a', 'bá', 'wo', 'èdè', 'Yorùbá', ',', 'àwọn', 'onímọ̀', 'pín', 'èdè', 'náà', 'sábẹ́', 'ẹ̀yà', 'Kwa', 'nínú', 'ẹbí', 'èdè', 'Niger', '-', 'Congo', '.']
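
# A hypothetical driver for the checks above (the pairs below are picked only
# for illustration and are an assumption, not part of the original test
# suite): it simply calls test_word_tokenize for a few combinations.
def _run_selected_word_tokenization_tests():
    for lang, word_tokenizer in [
        ('eng', 'NLTK - Penn Treebank Tokenizer'),
        ('zho_cn', 'Wordless - Chinese Character Tokenizer'),
        ('tha', 'PyThaiNLP - Maximum Matching + TCC'),
    ]:
        test_word_tokenize(lang, word_tokenizer, show_results=True)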
Example #19
def test_lemmatize(lang, lemmatizer, show_results = False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang = lang
    )
    
    lemmas = wl_lemmatization.wl_lemmatize(
        main,
        tokens = wl_misc.flatten_list(tokens),
        lang = lang,
        lemmatizer = lemmatizer
    )

    if show_results:
        print(f'{lang} / {lemmatizer}:')
        print(lemmas)

    if lang == 'ast':
        assert lemmas == ["L'asturianu", 'ser', 'unu', 'llingua', 'romance', 'propiu', "d'Asturies,[1", ']', 'perteneciente', 'al', 'subgrupu', 'asturllionés', '.']
    elif lang == 'bul':
        assert lemmas == ['Бъ̀лгарският', 'езѝк', 'съм', 'индоевропейски', 'език', 'от', 'група', 'на', 'южнославянските', 'език', '.']
    elif lang == 'cat':
        assert lemmas == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'ell', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'ell', 'ciutat', 'de', 'ell', 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlar', 'a', 'Catalunya', ',', 'ell', 'País', 'Valencià', '(', 'treure', 'de', 'algun', 'comarca', 'i', 'localitat', 'de', 'ell', 'interior', ')', ',', 'ell', 'Illes', 'Balears', ',', 'Andorra', ',', 'ell', 'Franja', 'de', 'Ponent', '(', 'a', 'ell', 'Aragó', ')', ',', 'ell', 'ciutat', 'de', 'ell', 'Alguer', '(', 'a', 'ell', 'illa', 'de', 'Sardenya', ')', ',', 'ell', 'Catalunya', 'del', 'Nord,[8', ']', 'ell', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblar', 'per', 'immigrar', 'valencians),[9][10', ']', 'i', 'en', 'petita', 'comunitat', 'arreu', 'del', 'món', '(', 'entrar', 'ell', 'qual', 'destacar', 'ell', 'de', 'ell', 'Argentina', ',', 'amb', '195.000', 'parlants).[11', ']']
    elif lang == 'ces':
        assert lemmas == ['Čeština', 'neboli', 'český', 'jazyk', 'on', 'západoslovanský', 'jazyk', ',', 'blízký', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.']
    elif lang == 'dan':
        assert lemmas == ['Dansk', 'være', 'en', 'nordgermansk', 'sprog', 'af', 'den', 'østnordiske', '(', 'kontinental', ')', 'gruppe', ',', 'der', 'tale', 'af', 'ca.', 'seks', 'million', 'menneske', '.']
    elif lang == 'nld':
        assert lemmas == ['het', 'nederlands', 'zijn', 'een', 'west-germaans', 'taal', 'en', 'de', 'moedertaal', 'van', 'de', 'veel', 'inwoner', 'van', 'nederland', ',', 'belgië', 'en', 'suriname', '.']
    elif lang == 'eng':
        if lemmatizer == 'Lemmatization Lists - English Lemma List':
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'that', 'be', '1', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
        elif lemmatizer in ['NLTK - WordNet Lemmatizer',
                            'spaCy - English Lemmatizer']:
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'that', 'be', 'first', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
    elif lang == 'est':
        assert lemmas == ['Eesti', 'kee', '(', 'varasem', 'nimetu', 'maakeel', ')', 'olema', 'läänemeresoome', 'lõunarühma', 'kuuluma', 'kee', '.']
    elif lang == 'fra':
        if lemmatizer == 'Lemmatization Lists - French Lemma List':
            assert lemmas == ['Le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', '.']
        elif lemmatizer == 'spaCy - French Lemmatizer':
            assert lemmas == ['le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'de', 'langue', 'roman', '.']
    elif lang == 'glg':
        assert lemmas == ['O', 'galego', '(', '[', 'ɡaˈleɣo̝', ']', ')', 'ser', 'un', 'lingua', 'indoeuropeo', 'que', 'pertencer', 'á', 'póla', 'de', 'lingua', 'románico', '.']
    elif lang == 'deu':
        if lemmatizer == 'Lemmatization Lists - German Lemma List':
            assert lemmas == ['Die', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.']
        elif lemmatizer == 'spaCy - German Lemmatizer':
            assert lemmas == ['der', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.']
    elif lang == 'grc':
        assert lemmas == ['Με', 'τον', 'όρο', 'αρχαία', 'ελληνική', 'γλώσσα', 'εννοείται', 'μια', 'μορφή', 'της', 'ελληνικής', 'γλώσσας', ',', 'πού', 'ομιλούνταν', 'κατά', 'τους', 'αρχαϊκούς', 'χρόνους', 'και', 'την', 'κλασική', 'αρχαιότητα', '.']
    elif lang == 'ell':
        assert lemmas == ['η', 'ελληνικός', 'γλώσσα', 'ανήκω', 'στην', 'ινδοευρωπαϊκός', 'οικογένεια[9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό', 'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνική', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της', 'Κύπρου', '.']
    elif lang == 'hun':
        assert lemmas == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',', 'a', 'finnugor', 'nyelv', 'köz', 'tartozó', 'ugor', 'nyelv', 'egyik', '.']
    elif lang == 'gle':
        assert lemmas == ['Is', 'ceann', 'de', 'na', 'teangach', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'tabhair', 'ar', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'trí', 'ceann', 'de', 'teangach', 'Ceilteacha', 'air', 'a', 'tabhair', 'na', 'teangach', 'Gaelacha', '(', '.i.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'áirithe', '.']
    elif lang == 'ita':
        assert lemmas == ["L'", 'italiano', '(', '[', 'itaˈljaːno][Nota', '1', ']', 'ascolta[?·info', ']', ')', 'essere', 'una', 'lingua', 'romanzo', 'parlato', 'principalmente', 'in', 'Italia', '.']
    elif lang == 'lit':
        assert lemmas == ['lietuvė', 'kalbėti', '–', 'ižti', 'baltas', 'prokalbės', 'kilęs', 'lietuvė', 'tauta', 'kalbėti', ',', 'kuri', 'Lietuvoje', 'irti', 'valstybinis', ',', 'o', 'Europos', 'sąjunga', '–', 'viena', 'ižti', 'oficialus', 'kalbus', '.']
    elif lang == 'glv':
        assert lemmas == ['She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey', 'Gaelagh', 'Mannin', '.']
    elif lang == 'nob':
        assert lemmas == ['Bokmål', 'er', 'en', 'varietet', 'av', 'norsk', 'språk', '.']
    elif lang == 'fas':
        assert lemmas == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳', ']', 'تاجیکستان[۴', ']', 'را', 'ازبکستان[۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
    elif lang == 'pol':
        assert lemmas == ['język', 'polski', ',', 'polszczyzna', ',', 'skrót', ':', 'pol', '.', '–', 'język', 'naturalny', 'należeć', 'do', 'grupa', 'język', 'zachodniosłowiańskich', '(', 'do', 'który', 'należeć', 'również', 'czeski', ',', 'słowacki', ',', 'kaszubski', ',', 'dolnołużycki', ',', 'górnołużycki', 'i', 'wymarły', 'połabski', ')', ',', 'stanowić', 'część', 'rodzina', 'język', 'indoeuropejski', '.']
    elif lang == 'por':
        assert lemmas == ['A', 'língua', 'portuguesar', ',', 'também', 'designar', 'português', ',', 'ser', 'umar', 'língua', 'românico', 'flexivo', 'ocidental', 'originar', 'o', 'galego-português', 'falar', 'o', 'Reino', 'da', 'Galiza', 'e', 'o', 'norte', 'de', 'Portugal', '.']
    elif lang == 'ron':
        assert lemmas == ['Limba', 'român', 'fi', 'vrea', 'limbă', 'indo-european', ',', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbă', 'romanice', '.']
    elif lang == 'rus':
        assert lemmas == ['ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'информация', 'о', 'файл', 'слушать', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянский', 'язык', ',', 'национальный', 'язык', 'русский', 'народ', '.']
    elif lang == 'gla':
        assert lemmas == ["'S", 'i', 'cànan', 'dùthchasach', 'na', 'h', '-', 'Alba', 'a', 'th', "'", 'anns', 'a', "'", 'Ghàidhlig', '.']
    elif lang == 'slk':
        assert lemmas == ['Slovenčina', 'patriť', 'do', 'skupina', 'západoslovanský', 'jazyk', '(', 'spolu', 's', 'čeština', ',', 'poľština', ',', 'horný', 'as', 'dolný', 'lužickou', 'srbčina', 'as', 'kašubčinou', ')', '.']
    elif lang == 'slv':
        assert lemmas == ['Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina', ']', 'onbiti', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govoriti', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govoriti', 'Slovenec', '.']
    elif lang == 'spa':
        assert lemmas == ['El', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablar', '.']
    elif lang == 'swe':
        assert lemmas == ['Svenska', '(', 'svensk', '(', 'info', ')', ')', 'vara', 'en', 'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk', 'hare', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'mena', 'även', 'som', 'en', 'en', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.']
    elif lang == 'bod':
        assert lemmas == ['བོད་', 'ཀྱི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'དེ་', 'གི་', 'ཉེ་འཁོར་', 'གྱི་', 'ས་ཁུལ་', 'ཏེ་', ' །']
    elif lang == 'ukr':
        if lemmatizer == 'Lemmatization Lists - Ukrainian Lemma List':
            assert lemmas == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назвати', '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
        elif lemmatizer == 'pymorphy2 - Morphological Analyzer':
            assert lemmas == ['украї́нський', 'мо́вий', '(', 'мфа', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва', '—', 'ру́ський', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
    elif lang == 'cym':
        assert lemmas == ['Aelod', "o'r", 'cangen', 'Frythonaidd', "o'r", 'iaith', 'Celtaidd', 'a', 'siarad', 'bod', 'brodorol', 'yn', 'Nghymru', ',', 'can', 'Gymry', 'a', 'pobl', 'arall', 'aredig', 'gwasgar', 'bod', 'Lloegr', ',', 'a', 'can', 'cymuno', 'bechan', 'bod', 'Y', 'Wladfa', ',', 'gwybod', 'Ariannin[7', ']', "yw'r", 'Gymraeg', '(', 'hefyd', 'Cymraeg', 'heb', 'yr', 'bannod', ')', '.']