    def run(self):
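        # Lemmatization preview: for each non-empty sample line, tokenize it,
        # lemmatize the tokens, and detokenize the lemmas back into text.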
        preview_results = []

        preview_lang = self.main.settings_custom['lemmatization'][
            'preview_lang']
        preview_samples = self.main.settings_custom['lemmatization'][
            'preview_samples']

        for line in preview_samples.split('\n'):
            line = line.strip()

            if line:
                tokens = wl_word_tokenization.wl_word_tokenize(
                    self.main, line, lang=preview_lang)
                tokens = wl_misc.flatten_list(tokens)

                lemmas = wl_lemmatization.wl_lemmatize(
                    self.main,
                    tokens,
                    lang=preview_lang,
                    lemmatizer=self.lemmatizer)

                text = wl_word_detokenization.wl_word_detokenize(
                    self.main, lemmas, lang=preview_lang)

                preview_results.append(text)
            else:
                preview_results.append('')

        self.worker_done.emit(preview_samples, preview_results)
    def run(self):
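        # Word tokenization preview: tokenize each non-empty sample line and
        # join the tokens with spaces; for Vietnamese, spaces inside
        # multi-word tokens are replaced with underscores.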
        preview_results = []

        preview_lang = self.main.settings_custom['word_tokenization'][
            'preview_lang']
        preview_samples = self.main.settings_custom['word_tokenization'][
            'preview_samples']

        for line in preview_samples.split('\n'):
            line = line.strip()

            if line:
                tokens = wl_word_tokenization.wl_word_tokenize(
                    self.main,
                    line,
                    lang=preview_lang,
                    word_tokenizer=self.word_tokenizer)

                # Vietnamese
                if preview_lang == 'vie':
                    tokens = [re.sub(r'\s+', r'_', token) for token in tokens]

                preview_results.append(' '.join(tokens))
            else:
                preview_results.append('')

        self.worker_done.emit(preview_samples, preview_results)
    def tokenize_text(self, text):
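        # Tokenize the text and append one empty tag list per token.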
        if text:
            tokens = wl_word_tokenization.wl_word_tokenize(self.main,
                                                           text,
                                                           lang=self.lang)

            self.tags.extend([[]] * len(list(wl_misc.flatten_list(tokens))))
    def run(self):
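        # POS tagging preview: tokenize each non-empty sample line, tag the
        # tokens, and join them as token_TAG pairs.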
        preview_results = []

        preview_lang = self.main.settings_custom['pos_tagging']['preview_lang']
        preview_samples = self.main.settings_custom['pos_tagging'][
            'preview_samples']

        for line in preview_samples.split('\n'):
            line = line.strip()

            if line:
                tokens = wl_word_tokenization.wl_word_tokenize(
                    self.main, line, lang=preview_lang)
                tokens = list(wl_misc.flatten_list(tokens))

                tokens_tagged = wl_pos_tagging.wl_pos_tag(
                    self.main,
                    tokens,
                    lang=preview_lang,
                    pos_tagger=self.pos_tagger,
                    tagset=self.tagset)

                preview_results.append(' '.join(
                    [f'{token}_{tag}' for token, tag in tokens_tagged]))
            else:
                preview_results.append('')

        self.worker_done.emit(preview_samples, preview_results)
    def tokenize_text(self, text):
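        # Tokenize the text and add an empty list to each tag container
        # (all / POS / non-POS) for every element of the result.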
        if text:
            tokens = wl_word_tokenization.wl_word_tokenize(self.main,
                                                           text,
                                                           lang=self.lang)

            for i in range(len(tokens)):
                self.tags_all.append([])
                self.tags_pos.append([])
                self.tags_non_pos.append([])
def test_lemmatize(lang, lemmatizer, show_results = False):
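    # Tokenize a sample sentence for the given language, lemmatize it with the
    # given lemmatizer, and check the lemmas against the expected output.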
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang = lang
    )
    
    lemmas = wl_lemmatization.wl_lemmatize(
        main,
        tokens = tokens,
        lang = lang,
        lemmatizer = lemmatizer
    )

    if show_results:
        print(f'{lang} / {lemmatizer}:')
        print(lemmas)

    if lang == 'ast':
        assert lemmas == ["L'asturianu", 'ser', 'unu', 'llingua', 'romance', 'propiu', "d'Asturies,[1", ']', 'perteneciente', 'al', 'subgrupu', 'asturllionés', '.']
    elif lang == 'bul':
        assert lemmas == ['Бъ̀лгарският', 'езѝк', 'съм', 'индоевропейски', 'език', 'от', 'група', 'на', 'южнославянските', 'език', '.']
    elif lang == 'cat':
        assert lemmas == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'ell', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'ell', 'ciutat', 'de', 'ell', 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlar', 'a', 'Catalunya', ',', 'ell', 'País', 'Valencià', '(', 'treure', 'de', 'algun', 'comarca', 'i', 'localitat', 'de', 'ell', 'interior', ')', ',', 'ell', 'Illes', 'Balears', ',', 'Andorra', ',', 'ell', 'Franja', 'de', 'Ponent', '(', 'a', 'ell', 'Aragó', ')', ',', 'ell', 'ciutat', 'de', 'ell', 'Alguer', '(', 'a', 'ell', 'illa', 'de', 'Sardenya', ')', ',', 'ell', 'Catalunya', 'del', 'Nord,[8', ']', 'ell', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblar', 'per', 'immigrar', 'valencians),[9][10', ']', 'i', 'en', 'petita', 'comunitat', 'arreu', 'del', 'món', '(', 'entrar', 'ell', 'qual', 'destacar', 'ell', 'de', 'ell', 'Argentina', ',', 'amb', '195.000', 'parlants).[11', ']']
    elif lang == 'ces':
        assert lemmas == ['Čeština', 'neboli', 'český', 'jazyk', 'on', 'západoslovanský', 'jazyk', ',', 'blízký', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.']
    elif lang == 'nld':
        assert lemmas == ['het', 'nederlands', 'zijn', 'een', 'west-germaans', 'taal', 'en', 'de', 'moedertaal', 'van', 'de', 'veel', 'inwoner', 'van', 'nederland', ',', 'belgië', 'en', 'suriname', '.']
    elif lang == 'eng':
        if lemmatizer == 'Lemmatization Lists - English Lemma List':
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'that', 'be', '1', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
        elif lemmatizer == 'NLTK - WordNet Lemmatizer':
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'that', 'be', 'first', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
        elif lemmatizer == 'spaCy - English Lemmatizer':
            assert lemmas == ['English', 'be', 'a', 'West', 'germanic', 'language', 'that', 'be', 'first', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
    elif lang == 'est':
        assert lemmas == ['Eesti', 'kee', '(', 'varasem', 'nimetu', ':', 'maakeel', ')', 'olema', 'läänemeresoome', 'lõunarühma', 'kuuluma', 'kee', '.']
    elif lang == 'fra':
        if lemmatizer == 'Lemmatization Lists - French Lemma List':
            assert lemmas == ['Le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', '.']
        elif lemmatizer == 'spaCy - French Lemmatizer':
            assert lemmas == ['le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', '.']
    elif lang == 'glg':
        assert lemmas == ['O', 'galego', '(', '[', 'ɡaˈleɣo̝', ']', ')', 'ser', 'un', 'lingua', 'indoeuropeo', 'que', 'pertencer', 'á', 'póla', 'de', 'lingua', 'románico', '.']
    elif lang == 'deu':
        if lemmatizer == 'Lemmatization Lists - German Lemma List':
            assert lemmas == ['Die', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.']
        elif lemmatizer == 'spaCy - German Lemmatizer':
            assert lemmas == ['der', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.']
    elif lang == 'grc':
        assert lemmas == ['Με', 'τον', 'όρο', 'αρχαία', 'ελληνική', 'γλώσσα', 'εννοείται', 'μια', 'μορφή', 'της', 'ελληνικής', 'γλώσσας', ',', 'πού', 'ομιλούνταν', 'κατά', 'τους', 'αρχαϊκούς', 'χρόνους', 'και', 'την', 'κλασική', 'αρχαιότητα', '.']
    elif lang == 'ell':
        assert lemmas == ['η', 'ελληνικός', 'γλώσσα', 'ανήκω', 'στην', 'ινδοευρωπαϊκός', 'οικογένεια[9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό', 'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνικός', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της', 'Κύπρου', '.']
    elif lang == 'hun':
        assert lemmas == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',', 'a', 'finnugor', 'nyelv', 'köz', 'tartozó', 'ugor', 'nyelv', 'egyik', '.']
    elif lang == 'gle':
        assert lemmas == ['Is', 'ceann', 'de', 'na', 'teangach', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'tabhair', 'ar', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'trí', 'ceann', 'de', 'teangach', 'Ceilteacha', 'air', 'a', 'tabhair', 'na', 'teangach', 'Gaelacha', '(', '.i.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'áirithe', '.']
    elif lang == 'ita':
        assert lemmas == ["L'", 'italiano', '(', '[', 'itaˈljaːno][Nota', '1', ']', 'ascolta[?·info', ']', ')', 'essere', 'una', 'lingua', 'romanzo', 'parlato', 'principalmente', 'in', 'Italia', '.']
    elif lang == 'lit':
        assert lemmas == ['lietuvė', 'kalbėti', '–', 'ižti', 'baltas', 'prokalbės', 'kilęs', 'lietuvė', 'tauta', 'kalbėti', ',', '-PRON-', 'Lietuvoje', 'būti', 'valstybinis', ',', 'o', 'Europos', 'sąjunga', '–', 'viena', 'ižti', 'oficialus', 'kalbus', '.']
    elif lang == 'glv':
        assert lemmas == ['She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey', 'Gaelagh', 'Mannin', '.']
    elif lang == 'nob':
        assert lemmas == ['bokmål', 'være', 'en', 'varietet', 'av', 'norsk', 'språk', '.']
    elif lang == 'fas':
        assert lemmas == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳', ']', 'تاجیکستان[۴', ']', 'را', 'ازبکستان[۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
    elif lang == 'por':
        assert lemmas == ['A', 'língua', 'portuguesar', ',', 'também', 'designar', 'português', ',', 'ser', 'umar', 'língua', 'românico', 'flexivo', 'ocidental', 'originar', 'o', 'galego-português', 'falar', 'o', 'Reino', 'da', 'Galiza', 'e', 'o', 'norte', 'de', 'Portugal', '.']
    elif lang == 'ron':
        assert lemmas == ['Limba', 'român', 'fi', 'vrea', 'limbă', 'indo', '-', 'european', ',', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbă', 'romanice', '.']
    elif lang == 'rus':
        assert lemmas == ['ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'информация', 'о', 'файл', 'слушать', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянский', 'язык', ',', 'национальный', 'язык', 'русский', 'народ', '.']
    elif lang == 'gla':
        assert lemmas == ["'S", 'i', 'cànan', 'dùthchasach', 'na', 'h', '-', 'Alba', 'a', 'th', "'", 'anns', 'a', "'", 'Ghàidhlig', '.']
    elif lang == 'slk':
        assert lemmas == ['Slovenčina', 'patriť', 'do', 'skupina', 'západoslovanský', 'jazyk', '(', 'spolu', 's', 'čeština', ',', 'poľština', ',', 'horný', 'as', 'dolný', 'lužickou', 'srbčina', 'as', 'kašubčinou', ')', '.']
    elif lang == 'slv':
        assert lemmas == ['Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina', ']', 'onbiti', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govoriti', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govoriti', 'Slovenec', '.']
    elif lang == 'spa':
        assert lemmas == ['El', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablar', '.']
    elif lang == 'swe':
        assert lemmas == ['Svenska', '(', 'svensk', '(', 'info', ')', ')', 'vara', 'en', 'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk', 'hare', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'mena', 'även', 'som', 'en', 'en', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.']
    elif lang == 'bod':
        assert lemmas == ['བོད་', 'ཀྱི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'དེ་', 'གི་', 'ཉེ་འཁོར་', 'གྱི་', 'ས་ཁུལ་', 'ཏེ་', ' །']
    elif lang == 'ukr':
        if lemmatizer == 'Lemmatization Lists - Ukrainian Lemma List':
            assert lemmas == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назвати', '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
        elif lemmatizer == 'pymorphy2 - Morphological Analyzer':
            assert lemmas == ['украї́нський', 'мо́вий', '(', 'мфа', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва', '—', 'ру́ський', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
    elif lang == 'cym':
        assert lemmas == ['Aelod', "o'r", 'cangen', 'Frythonaidd', "o'r", 'iaith', 'Celtaidd', 'a', 'siarad', 'bod', 'brodorol', 'yn', 'Nghymru', ',', 'can', 'Gymry', 'a', 'pobl', 'arall', 'aredig', 'gwasgar', 'bod', 'Lloegr', ',', 'a', 'can', 'cymuno', 'bechan', 'bod', 'Y', 'Wladfa', ',', 'gwybod', 'Ariannin[7', ']', "yw'r", 'Gymraeg', '(', 'hefyd', 'Cymraeg', 'heb', 'yr', 'bannod', ')', '.']
def test_word_tokenize(lang, word_tokenizer, show_results = False):
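    # Tokenize a sample sentence for the given language with the given word
    # tokenizer and check the tokens against the expected output.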
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang = lang,
        word_tokenizer = word_tokenizer
    )
    tokens = list(wl_misc.flatten_list(tokens))

    if show_results:
        print(f'{lang} / {word_tokenizer}:')
        print(tokens)

    if lang == 'afr':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['Afrikaans', 'is', 'tipologies', 'gesien', "'n", 'Indo-Europese', ',', 'Wes-Germaanse', ',', 'Nederfrankiese', 'taal', ',', '[', '2', ']', 'wat', 'sy', 'ontstaan', 'aan', 'die', 'suidpunt', 'van', 'Afrika', 'gehad', 'het', 'onder', 'invloed', 'van', 'verskeie', 'ander', 'tale', 'en', 'taalgroepe', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Twitter Tokenizer']:
            assert tokens == ['Afrikaans', 'is', 'tipologies', 'gesien', "'", 'n', 'Indo-Europese', ',', 'Wes-Germaanse', ',', 'Nederfrankiese', 'taal', ',', '[', '2', ']', 'wat', 'sy', 'ontstaan', 'aan', 'die', 'suidpunt', 'van', 'Afrika', 'gehad', 'het', 'onder', 'invloed', 'van', 'verskeie', 'ander', 'tale', 'en', 'taalgroepe', '.']
        elif word_tokenizer == 'spaCy - Afrikaans Word Tokenizer':
            assert tokens == ['Afrikaans', 'is', 'tipologies', 'gesien', "'", 'n', 'Indo', '-', 'Europese', ',', 'Wes', '-', 'Germaanse', ',', 'Nederfrankiese', 'taal,[2', ']', 'wat', 'sy', 'ontstaan', 'aan', 'die', 'suidpunt', 'van', 'Afrika', 'gehad', 'het', 'onder', 'invloed', 'van', 'verskeie', 'ander', 'tale', 'en', 'taalgroepe', '.']
    elif lang == 'sqi':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Gjuha', 'shqipe', '(', 'ose', 'thjeshtë', 'shqipja', ')', 'është', 'gjuhë', 'dhe', 'degë', 'e', 'veçantë', 'e', 'familjes', 'indo-evropiane', 'të', 'folur', 'nga', 'më', 'shumë', 'se', '6', 'milionë', 'njerëz', '[', '4', ']', ',', 'kryesisht', 'në', 'Shqipëri', ',', 'Kosovë', 'dhe', 'Republikën', 'e', 'Maqedonisë', ',', 'por', 'edhe', 'në', 'zona', 'të', 'tjera', 'të', 'Evropës', 'Jugore', 'ku', 'ka', 'një', 'popullsi', 'shqiptare', ',', 'duke', 'përfshirë', 'Malin', 'e', 'Zi', 'dhe', 'Luginën', 'e', 'Preshevës', '.']
        elif word_tokenizer == 'spaCy - Albanian Word Tokenizer':
            assert tokens == ['Gjuha', 'shqipe', '(', 'ose', 'thjeshtë', 'shqipja', ')', 'është', 'gjuhë', 'dhe', 'degë', 'e', 'veçantë', 'e', 'familjes', 'indo', '-', 'evropiane', 'të', 'folur', 'nga', 'më', 'shumë', 'se', '6', 'milionë', 'njerëz[4', ']', ',', 'kryesisht', 'në', 'Shqipëri', ',', 'Kosovë', 'dhe', 'Republikën', 'e', 'Maqedonisë', ',', 'por', 'edhe', 'në', 'zona', 'të', 'tjera', 'të', 'Evropës', 'Jugore', 'ku', 'ka', 'një', 'popullsi', 'shqiptare', ',', 'duke', 'përfshirë', 'Malin', 'e', 'Zi', 'dhe', 'Luginën', 'e', 'Preshevës', '.']
    elif lang == 'ara':
        assert tokens == ['اللُّغَة', 'العَرَبِيّة', 'هي', 'أكثر', 'اللغات', 'تحدثاً', 'ونطقاً', 'ضمن', 'مجموعة', 'اللغات', 'السامية', '،', 'وإحدى', 'أكثر', 'اللغات', 'انتشاراً', 'في', 'العالم', '،', 'يتحدثها', 'أكثر', 'من', '467', 'مليون', 'نسمة،(1', ')', 'ويتوزع', 'متحدثوها', 'في', 'الوطن', 'العربي', '،', 'بالإضافة', 'إلى', 'العديد', 'من', 'المناطق', 'الأخرى', 'المجاورة', 'كالأحواز', 'وتركيا', 'وتشاد', 'ومالي', 'والسنغال', 'وإرتيريا', 'وإثيوبيا', 'وجنوب', 'السودان', 'وإيران', '.']
    elif lang == 'hye':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'spaCy - Armenian Word Tokenizer']:
            assert tokens == ['Հայոց', 'լեզվով', 'ստեղծվել', 'է', 'մեծ', 'գրականություն', ':', 'Գրաբարով', 'է', 'ավանդված', 'հայ', 'հին', 'պատմագրությունը', ',', 'գիտափիլիսոփայական', ',', 'մաթեմատիկական', ',', 'բժշկագիտական', ',', 'աստվածաբանական-դավանաբանական', 'գրականությունը։']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['Հայոց', 'լեզվով', 'ստեղծվել', 'է', 'մեծ', 'գրականություն', ':', 'Գրաբարով', 'է', 'ավանդված', 'հայ', 'հին', 'պատմագրությունը', ',', 'գիտափիլիսոփայական', ',', 'մաթեմատիկական', ',', 'բժշկագիտական', ',', 'աստվածաբանական-դավանաբանական', 'գրականությունը', '։']
    elif lang == 'eus':
        assert tokens == ['Euskara', 'Euskal', 'Herriko', 'hizkuntza', 'da.[5', ']']
    elif lang == 'ben':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামগুলোতেও', 'পরিচিত', ')', 'একটি', 'ইন্দো-আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা।']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামগুলোতেও', 'পরিচিত', ')', 'একটি', 'ইন্দো-আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।']
        elif word_tokenizer == 'spaCy - Bengali Word Tokenizer':
            assert tokens == ['বাংলা', 'ভাষা', '(', 'বাঙলা', ',', 'বাঙ্গলা', ',', 'তথা', 'বাঙ্গালা', 'নামগুলোতেও', 'পরিচিত', ')', 'একটি', 'ইন্দো', '-', 'আর্য', 'ভাষা', ',', 'যা', 'দক্ষিণ', 'এশিয়ার', 'বাঙালি', 'জাতির', 'প্রধান', 'কথ্য', 'ও', 'লেখ্য', 'ভাষা', '।']
    elif lang == 'bul':
        assert tokens == ['Бъ̀лгарският', 'езѝк', 'е', 'индоевропейски', 'език', 'от', 'групата', 'на', 'южнославянските', 'езици', '.']
    elif lang == 'cat':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', "l'Alguer", 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', "d'algunes", 'comarques', 'i', 'localitats', 'de', "l'interior", ')', ',', 'les', 'Illes', 'Balears', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', "l'Aragó", ')', ',', 'la', 'ciutat', 'de', "l'Alguer", '(', 'a', "l'illa", 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord', ',', '[', '8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrats', 'valencians', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'petites', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', "l'Argentina", ',', 'amb', '195.000', 'parlants', ')', '.', '[', '11', ']']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', "l'Alguer", 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', "d'algunes", 'comarques', 'i', 'localitats', 'de', "l'interior", ')', ',', 'les', 'Illes', 'Balears', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', "l'Aragó", ')', ',', 'la', 'ciutat', 'de', "l'Alguer", '(', 'a', "l'illa", 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord', ',', '[8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrats', 'valencians', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'petites', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', "l'Argentina", ',', 'amb', '195.000', 'parlants', ')', '.', '[', '11', ']']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', 'l', "'", 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', 'd', "'", 'algunes', 'comarques', 'i', 'localitats', 'de', 'l', "'", 'interior', ')', ',', 'les', 'Illes', 'Balears', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', 'l', "'", 'Aragó', ')', ',', 'la', 'ciutat', 'de', 'l', "'", 'Alguer', '(', 'a', 'l', "'", 'illa', 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord', ',', '[', '8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrats', 'valencians', ')', ',', '[', '9', ']', '[', '10', ']', 'i', 'en', 'petites', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', 'l', "'", 'Argentina', ',', 'amb', '195.000', 'parlants', ')', '.', '[', '11', ']']
        elif word_tokenizer == 'spaCy - Catalan Word Tokenizer':
            assert tokens == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'les', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'la', 'ciutat', 'de', "l'", 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'és', 'una', 'llengua', 'romànica', 'parlada', 'a', 'Catalunya', ',', 'el', 'País', 'Valencià', '(', 'tret', "d'", 'algunes', 'comarques', 'i', 'localitats', 'de', "l'", 'interior', ')', ',', 'les', 'Illes', 'Balears', ',', 'Andorra', ',', 'la', 'Franja', 'de', 'Ponent', '(', 'a', "l'", 'Aragó', ')', ',', 'la', 'ciutat', 'de', "l'", 'Alguer', '(', 'a', "l'", 'illa', 'de', 'Sardenya', ')', ',', 'la', 'Catalunya', 'del', 'Nord,[8', ']', 'el', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblat', 'per', 'immigrats', 'valencians),[9][10', ']', 'i', 'en', 'petites', 'comunitats', 'arreu', 'del', 'món', '(', 'entre', 'les', 'quals', 'destaca', 'la', 'de', "l'", 'Argentina', ',', 'amb', '195.000', 'parlants).[11', ']']
    elif lang == 'zho_cn':
        if word_tokenizer == 'jieba - Chinese Word Tokenizer':
            assert tokens == ['汉语', ',', '又称', '汉文', '、', '中文', '、', '中国', '话', '、', '中国', '语', '、', '华语', '、', '华文', '、', '唐话', '[', '2', ']', ',', '或', '被', '视为', '一个', '语族', ',', '或', '被', '视为', '隶属于', '汉藏语系', '汉语', '族', '之', '一种', '语言', '。']
        elif word_tokenizer == 'pkuseg - Chinese Word Tokenizer':
            assert tokens == ['汉语', ',', '又', '称', '汉文', '、', '中文', '、', '中国话', '、', '中国语', '、', '华语', '、', '华文', '、', '唐', '话[', '2', ']', ',', '或', '被', '视为', '一个', '语族', ',', '或', '被', '视为', '隶属于', '汉藏', '语系', '汉语族', '之一', '种', '语言', '。']
        elif word_tokenizer == 'Wordless - Chinese Character Tokenizer':
            assert tokens == ['汉', '语', ',', '又', '称', '汉', '文', '、', '中', '文', '、', '中', '国', '话', '、', '中', '国', '语', '、', '华', '语', '、', '华', '文', '、', '唐', '话', '[', '2', ']', ',', '或', '被', '视', '为', '一', '个', '语', '族', ',', '或', '被', '视', '为', '隶', '属', '于', '汉', '藏', '语', '系', '汉', '语', '族', '之', '一', '种', '语', '言', '。']
    elif lang == 'zho_tw':
        if word_tokenizer == 'jieba - Chinese Word Tokenizer':
            assert tokens == ['漢語', ',', '又', '稱漢文', '、', '中文', '、', '中國話', '、', '中國語', '、', '華語', '、', '華文', '、', '唐話', '[', '2', ']', ',', '或', '被', '視為', '一個', '語族', ',', '或', '被', '視為', '隸屬', '於', '漢藏語', '系漢', '語族', '之一', '種語', '言', '。']
        elif word_tokenizer == 'pkuseg - Chinese Word Tokenizer':
            assert tokens == ['漢語', ',', '又', '稱', '漢文', '、', '中文', '、', '中', '國話', '、', '中國語', '、', '華語', '、', '華文', '、', '唐', '話[', '2', ']', ',', '或', '被', '視為', '一', '個', '語族', ',', '或', '被', '視', '為隸', '屬於', '漢藏', '語系', '漢語族', '之一', '種', '語言', '。']
        elif word_tokenizer == 'Wordless - Chinese Character Tokenizer':
            assert tokens == ['漢', '語', ',', '又', '稱', '漢', '文', '、', '中', '文', '、', '中', '國', '話', '、', '中', '國', '語', '、', '華', '語', '、', '華', '文', '、', '唐', '話', '[', '2', ']', ',', '或', '被', '視', '為', '一', '個', '語', '族', ',', '或', '被', '視', '為', '隸', '屬', '於', '漢', '藏', '語', '系', '漢', '語', '族', '之', '一', '種', '語', '言', '。']
    elif lang == 'hrv':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'spaCy - Croatian Word Tokenizer']:
            assert tokens == ['Hrvatski', 'jezik', '(', 'ISO', '639', '-', '3', ':', 'hrv', ')', 'skupni', 'je', 'naziv', 'za', 'nacionalni', 'standardni', 'jezik', 'Hrvata', ',', 'te', 'za', 'skup', 'narječja', 'i', 'govora', 'kojima', 'govore', 'ili', 'su', 'nekada', 'govorili', 'Hrvati', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'NLTK - Twitter Tokenizer']:
            assert tokens == ['Hrvatski', 'jezik', '(', 'ISO', '639-3', ':', 'hrv', ')', 'skupni', 'je', 'naziv', 'za', 'nacionalni', 'standardni', 'jezik', 'Hrvata', ',', 'te', 'za', 'skup', 'narječja', 'i', 'govora', 'kojima', 'govore', 'ili', 'su', 'nekada', 'govorili', 'Hrvati', '.']
        
    elif lang == 'ces':
        assert tokens == ['Čeština', 'neboli', 'český', 'jazyk', 'je', 'západoslovanský', 'jazyk', ',', 'nejbližší', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.']
    elif lang == 'dan':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Dansk', 'er', 'et', 'nordgermansk', 'sprog', 'af', 'den', 'østnordiske', '(', 'kontinentale', ')', 'gruppe', ',', 'der', 'tales', 'af', 'ca', '.', 'seks', 'millioner', 'mennesker', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'spaCy - Danish Word Tokenizer']:
            assert tokens == ['Dansk', 'er', 'et', 'nordgermansk', 'sprog', 'af', 'den', 'østnordiske', '(', 'kontinentale', ')', 'gruppe', ',', 'der', 'tales', 'af', 'ca.', 'seks', 'millioner', 'mennesker', '.']
        
    elif lang == 'nld':
        assert tokens == ['Het', 'Nederlands', 'is', 'een', 'West-Germaanse', 'taal', 'en', 'de', 'moedertaal', 'van', 'de', 'meeste', 'inwoners', 'van', 'Nederland', ',', 'België', 'en', 'Suriname', '.']
    elif lang == 'eng':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'Sacremoses - Moses Tokenizer']:
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua', 'franca', '.', '[', '4', ']', '[', '5', ']']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'syntok - Word Tokenizer']:
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua', 'franca.', '[', '4', ']', '[', '5', ']']
        elif word_tokenizer == 'NLTK - Tok-tok Tokenizer':
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua', 'franca.[', '4', ']', '[', '5', ']']
        elif word_tokenizer == 'spaCy - English Word Tokenizer':
            assert tokens == ['English', 'is', 'a', 'West', 'Germanic', 'language', 'that', 'was', 'first', 'spoken', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'became', 'a', 'global', 'lingua', 'franca.[4][5', ']']
    elif lang == 'est':
        assert tokens == ['Eesti', 'keel', '(', 'varasem', 'nimetus', 'maakeel', ')', 'on', 'läänemeresoome', 'lõunarühma', 'kuuluv', 'keel', '.']
    elif lang == 'fin':
        assert tokens == ['Suomen', 'kieli', '(', 'suomi', ')', 'on', 'uralilaisten', 'kielten', 'itämerensuomalaiseen', 'ryhmään', 'kuuluva', 'kieli', '.']
    elif lang == 'fra':
        assert tokens == ['Le', 'français', 'est', 'une', 'langue', 'indo-européenne', 'de', 'la', 'famille', 'des', 'langues', 'romanes', '.']
    elif lang == 'deu':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'syntok - Word Tokenizer']:
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw', '.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abgekürzt', 'dt', '.', 'oder', 'dtsch', '.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'NLTK - Tok-tok Tokenizer']:
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abgekürzt', 'dt.', 'oder', 'dtsch.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw', '.', 'Deutsch', '(', '[', 'dɔʏ', '̯', 't', '͡', 'ʃ', '];', 'abgekürzt', 'dt', '.', 'oder', 'dtsch', '.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ', '̯', 't', '͡', 'ʃ', ']', ';', 'abgekürzt', 'dt.', 'oder', 'dtsch', '.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
        elif word_tokenizer == 'spaCy - German Word Tokenizer':
            assert tokens == ['Die', 'deutsche', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abgekürzt', 'dt', '.', 'oder', 'dtsch', '.', ')', 'ist', 'eine', 'westgermanische', 'Sprache', '.']
    elif lang == 'ell':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'Sacremoses - Moses Tokenizer']:
            assert tokens == ['Η', 'ελληνική', 'γλώσσα', 'ανήκει', 'στην', 'ινδοευρωπαϊκή', 'οικογένεια', '[', '9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό', 'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνική', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της', 'Κύπρου', '.']
        elif word_tokenizer == 'spaCy - Greek (Modern) Word Tokenizer':
            assert tokens == ['Η', 'ελληνική', 'γλώσσα', 'ανήκει', 'στην', 'ινδοευρωπαϊκή', 'οικογένεια[9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό', 'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνική', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της', 'Κύπρου', '.']
    elif lang == 'guj':
        assert tokens == ['ગુજરાતી', '\u200d(/ɡʊdʒəˈrɑːti/[૭', ']', ',', 'રોમન', 'લિપિમાં', ':', 'Gujarātī', ',', 'ઉચ્ચાર', ':', '[', 'ɡudʒəˈɾɑːtiː', ']', ')', 'ભારત', 'દેશના', 'ગુજરાત', 'રાજ્યની', 'ઇન્ડો-આર્યન', 'ભાષા', 'છે', ',', 'અને', 'મુખ્યત્વે', 'ગુજરાતી', 'લોકો', 'દ્વારા', 'બોલાય', 'છે.']
    elif lang == 'heb':
        assert tokens == ['עִבְרִית', 'היא', 'שפה', 'שמית', ',', 'ממשפחת', 'השפות', 'האפרו', '-', 'אסיאתיות', ',', 'הידועה', 'כשפתם', 'של', 'היהודים', 'ושל', 'השומרונים', ',', 'אשר', 'ניב', 'מודרני', 'שלה', '(', 'עברית', 'ישראלית', ')', 'הוא', 'שפתה', 'הרשמית', 'של', 'מדינת', 'ישראל', ',', 'מעמד', 'שעוגן', 'בשנת', '2018', 'בחוק', 'יסוד', ':', 'ישראל', '–', 'מדינת', 'הלאום', 'של', 'העם', 'היהודי', '.']
    elif lang == 'hin':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['हिन्दी', 'विश्व', 'की', 'एक', 'प्रमुख', 'भाषा', 'है', 'एवं', 'भारत', 'की', 'राजभाषा', 'है।']
        elif word_tokenizer in ['NLTK - Twitter Tokenizer',
                                'spaCy - Hindi Word Tokenizer']:
            assert tokens == ['हिन्दी', 'विश्व', 'की', 'एक', 'प्रमुख', 'भाषा', 'है', 'एवं', 'भारत', 'की', 'राजभाषा', 'है', '।']
    elif lang == 'hun':
        assert tokens == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tagja', ',', 'a', 'finnugor', 'nyelvek', 'közé', 'tartozó', 'ugor', 'nyelvek', 'egyike', '.']
    elif lang == 'isl':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'Sacremoses - Moses Tokenizer',
                              'Tokenizer - Icelandic Word Tokenizer']:
            assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga', '.', '[', '4', ']']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga.', '[', '4', ']']
        elif word_tokenizer == 'spaCy - Icelandic Word Tokenizer':
            assert tokens == ['Íslenska', 'er', 'vesturnorrænt', ',', 'germanskt', 'og', 'indóevrópskt', 'tungumál', 'sem', 'er', 'einkum', 'talað', 'og', 'ritað', 'á', 'Íslandi', 'og', 'er', 'móðurmál', 'langflestra', 'Íslendinga.[4', ']']
    elif lang == 'ind':
        assert tokens == ['Bahasa', 'Indonesia', 'adalah', 'bahasa', 'Melayu', 'baku', 'yang', 'dijadikan', 'sebagai', 'bahasa', 'resmi', 'Republik', 'Indonesia[1', ']', 'dan', 'bahasa', 'persatuan', 'bangsa', 'Indonesia.[2', ']']
    elif lang == 'gle':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Is', 'ceann', 'de', 'na', 'teangacha', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'dtrí', 'cinn', 'de', 'theangacha', 'Ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'Gaelacha', '(', '.', 'i', '.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'háirithe', '.']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer',
                                'Sacremoses - Moses Tokenizer',
                                'spaCy - Irish Word Tokenizer']:
            assert tokens == ['Is', 'ceann', 'de', 'na', 'teangacha', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'thugtar', 'uirthi', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'dtrí', 'cinn', 'de', 'theangacha', 'Ceilteacha', 'ar', 'a', 'dtugtar', 'na', 'teangacha', 'Gaelacha', '(', '.i.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'háirithe', '.']
    elif lang == 'ita':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ["L'italiano", '(', '[', 'itaˈljaːno', ']', '[', 'Nota', '1', ']', 'ascolta', '[', '?', '·info', ']', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ["L'italiano", '(', '[', 'itaˈljaːno', ']', '[', 'Nota', '1', ']', 'ascolta', '[', '?', '·', 'info', ']', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ["L'", 'italiano', '(', '[', 'itaˈljaːno', ']', '[', 'Nota', '1', ']', 'ascolta', '[', '?', '·', 'info', ']', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.']
        elif word_tokenizer == 'spaCy - Italian Word Tokenizer':
            assert tokens == ["L'", 'italiano', '(', '[', 'itaˈljaːno][Nota', '1', ']', 'ascolta[?·info', ']', ')', 'è', 'una', 'lingua', 'romanza', 'parlata', 'principalmente', 'in', 'Italia', '.']
    elif lang == 'jpn':
        if word_tokenizer == 'nagisa - Japanese Word Tokenizer':
            assert tokens == ['日本', '語', '(', 'にほんご', '、', 'にっぽん', 'ご', '[', '注', '1', ']', ')', 'は', '、', '主に', '日本', '国', '内', 'や', '日本', '人', '同士', 'の', '間', 'で', '使用', 'さ', 'れ', 'て', 'いる', '言語', 'で', 'ある', '。']
        elif word_tokenizer == 'Wordless - Japanese Kanji Tokenizer':
            assert tokens == ['日', '本', '語', '(', 'にほんご', '、', 'にっぽん', 'ご', '[', '注', '1', ']', ')', 'は', '、', '主', 'に', '日', '本', '国', '内', 'や', '日', '本', '人', '同', '士', 'の', '間', 'で', '使', '用', 'さ', 'れ', 'て', 'いる', '言', '語', 'で', 'ある', '。']
    elif lang == 'kan':
        assert tokens == ['ದ್ರಾವಿಡ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಪ್ರಾಮುಖ್ಯವುಳ್ಳ', 'ಭಾಷೆಯೂ', 'ಭಾರತದ', 'ಪುರಾತನವಾದ', 'ಭಾಷೆಗಳಲ್ಲಿ', 'ಒಂದೂ', 'ಆಗಿರುವ', 'ಕನ್ನಡ', 'ಭಾಷೆಯನ್ನು', 'ಅದರ', 'ವಿವಿಧ', 'ರೂಪಗಳಲ್ಲಿ', 'ಸುಮಾರು', '೪೫', 'ದಶಲಕ್ಷ', 'ಜನರು', 'ಆಡು', 'ನುಡಿಯಾಗಿ', 'ಬಳಸುತ್ತಲಿದ್ದಾರೆ', '.']
    elif lang == 'lav':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'Sacremoses - Moses Tokenizer']:
            assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,7', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda', '.', '[', '3', ']']
        elif word_tokenizer in ['NLTK - NLTK Tokenizer',
                                'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,7', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda.', '[', '3', ']']
        elif word_tokenizer == 'spaCy - Latvian Word Tokenizer':
            assert tokens == ['Latviešu', 'valoda', 'ir', 'dzimtā', 'valoda', 'apmēram', '1,7', 'miljoniem', 'cilvēku', ',', 'galvenokārt', 'Latvijā', ',', 'kur', 'tā', 'ir', 'vienīgā', 'valsts', 'valoda.[3', ']']
    elif lang == 'lij':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['O', 'Lìgure', '(', 'in', 'monegasco', ':', 'lenga', 'ligüra', 'e', 'lenga', 'lìgura', ')', 'o', "l'é", "'na", 'lengoa', '[', '1', ']', 'do', 'gruppo', 'lengoìstego', 'itàlico', 'oçidentâ', 'parlâ', 'in', 'Italia', '(', 'Liguria', ',', 'Piemonte', ',', 'Emilia-Romagna', 'e', 'Sardegna', ')', ',', 'into', 'sud', 'da', 'Fransa', ',', 'in', 'Còrsega', ',', 'e', 'into', 'Prinçipato', 'de', 'Monego', '.']
        elif word_tokenizer == 'NLTK - NLTK Tokenizer':
            assert tokens == ['O', 'Lìgure', '(', 'in', 'monegasco', ':', 'lenga', 'ligüra', 'e', 'lenga', 'lìgura', ')', 'o', 'l', "'", 'é', "'na", 'lengoa', '[', '1', ']', 'do', 'gruppo', 'lengoìstego', 'itàlico', 'oçidentâ', 'parlâ', 'in', 'Italia', '(', 'Liguria', ',', 'Piemonte', ',', 'Emilia-Romagna', 'e', 'Sardegna', ')', ',', 'into', 'sud', 'da', 'Fransa', ',', 'in', 'Còrsega', ',', 'e', 'into', 'Prinçipato', 'de', 'Monego', '.']
        elif word_tokenizer == 'spaCy - Ligurian Word Tokenizer':
            assert tokens == ['O', 'Lìgure', '(', 'in', 'monegasco', ':', 'lenga', 'ligüra', 'e', 'lenga', 'lìgura', ')', 'o', "l'", 'é', "'", 'na', 'lengoa[1', ']', 'do', 'gruppo', 'lengoìstego', 'itàlico', 'oçidentâ', 'parlâ', 'in', 'Italia', '(', 'Liguria', ',', 'Piemonte', ',', 'Emilia', '-', 'Romagna', 'e', 'Sardegna', ')', ',', 'into', 'sud', 'da', 'Fransa', ',', 'in', 'Còrsega', ',', 'e', 'into', 'Prinçipato', 'de', 'Monego', '.']
    elif lang == 'lit':
        assert tokens == ['Lietuvių', 'kalba', '–', 'iš', 'baltų', 'prokalbės', 'kilusi', 'lietuvių', 'tautos', 'kalba', ',', 'kuri', 'Lietuvoje', 'yra', 'valstybinė', ',', 'o', 'Europos', 'Sąjungoje', '–', 'viena', 'iš', 'oficialiųjų', 'kalbų', '.']
    elif lang == 'ltz':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ["D'Lëtzebuergesch", 'gëtt', 'an', 'der', 'däitscher', 'Dialektologie', 'als', 'ee', 'westgermaneschen', ',', 'mëtteldäitschen', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéiert', '.']
        elif word_tokenizer == 'spaCy - Luxembourgish Word Tokenizer':
            assert tokens == ["D'", 'Lëtzebuergesch', 'gëtt', 'an', 'der', 'däitscher', 'Dialektologie', 'als', 'ee', 'westgermaneschen', ',', 'mëtteldäitschen', 'Dialekt', 'aklasséiert', ',', 'deen', 'zum', 'Muselfränkesche', 'gehéiert', '.']
    elif lang == 'mal':
        assert tokens == ['ഇന്ത്യയിൽ', 'പ്രധാനമായും', 'കേരള', 'സംസ്ഥാനത്തിലും', 'ലക്ഷദ്വീപിലും', 'പുതുച്ചേരിയുടെ', 'ഭാഗമായ', 'മയ്യഴിയിലും', 'സംസാരിക്കപ്പെടുന്ന', 'ഭാഷയാണ്', 'മലയാളം.']
    elif lang == 'mar':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['मराठीभाषा', 'ही', 'इंडो-युरोपीय', 'भाषाकुलातील', 'एक', 'भाषा', 'आहे', '.']
        elif word_tokenizer == 'spaCy - Marathi Word Tokenizer':
            assert tokens == ['मराठीभाषा', 'ही', 'इंडो', '-', 'युरोपीय', 'भाषाकुलातील', 'एक', 'भाषा', 'आहे', '.']
    elif lang == 'nep':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['नेपाली', 'भाषा', '(', 'अन्तर्राष्ट्रिय', 'ध्वन्यात्मक', 'वर्णमाला', '[', 'neˈpali', 'bʱaʂa', ']', ')', 'नेपालको', 'सम्पर्क', 'भाषा', 'तथा', 'भारत', ',', 'भुटान', 'र', 'म्यानमारको', 'केही', 'भागमा', 'मातृभाषाको', 'रूपमा', 'बोलिने', 'भाषा', 'हो।']
        elif word_tokenizer in ['NLTK - Twitter Tokenizer',
                                'spaCy - Nepali Word Tokenizer']:
            assert tokens == ['नेपाली', 'भाषा', '(', 'अन्तर्राष्ट्रिय', 'ध्वन्यात्मक', 'वर्णमाला', '[', 'neˈpali', 'bʱaʂa', ']', ')', 'नेपालको', 'सम्पर्क', 'भाषा', 'तथा', 'भारत', ',', 'भुटान', 'र', 'म्यानमारको', 'केही', 'भागमा', 'मातृभाषाको', 'रूपमा', 'बोलिने', 'भाषा', 'हो', '।']
    elif lang == 'nob':
        assert tokens == ['Bokmål', 'er', 'en', 'varietet', 'av', 'norsk', 'språk', '.']
    elif lang == 'fas':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران،', 'افغانستان،', '[', '۳', ']', 'تاجیکستان', '[', '۴', ']', 'و', 'ازبکستان', '[', '۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
        elif word_tokenizer == 'NLTK - Tok-tok Tokenizer':
            assert tokens == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان', '،', '[', '۳', ']', 'تاجیکستان[', '۴', ']', 'و', 'ازبکستان[', '۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان', '،', '[', '۳', ']', 'تاجیکستان', '[', '۴', ']', 'و', 'ازبکستان', '[', '۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
        elif word_tokenizer == 'spaCy - Persian Word Tokenizer':
            assert tokens == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳', ']', 'تاجیکستان[۴', ']', 'و', 'ازبکستان[۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
    elif lang == 'pol':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'spaCy - Polish Word Tokenizer']:
            assert tokens == ['Język', 'polski', ',', 'polszczyzna', ',', 'skrót', ':', 'pol', '.', '–', 'język', 'naturalny', 'należący', 'do', 'grupy', 'języków', 'zachodniosłowiańskich', '(', 'do', 'której', 'należą', 'również', 'czeski', ',', 'słowacki', ',', 'kaszubski', ',', 'dolnołużycki', ',', 'górnołużycki', 'i', 'wymarły', 'połabski', ')', ',', 'stanowiącej', 'część', 'rodziny', 'języków', 'indoeuropejskich', '.']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['Język', 'polski', ',', 'polszczyzna', ',', 'skrót', ':', 'pol.', '–', 'język', 'naturalny', 'należący', 'do', 'grupy', 'języków', 'zachodniosłowiańskich', '(', 'do', 'której', 'należą', 'również', 'czeski', ',', 'słowacki', ',', 'kaszubski', ',', 'dolnołużycki', ',', 'górnołużycki', 'i', 'wymarły', 'połabski', ')', ',', 'stanowiącej', 'część', 'rodziny', 'języków', 'indoeuropejskich', '.']
    elif lang == 'por':
        assert tokens == ['A', 'língua', 'portuguesa', ',', 'também', 'designada', 'português', ',', 'é', 'uma', 'língua', 'românica', 'flexiva', 'ocidental', 'originada', 'no', 'galego-português', 'falado', 'no', 'Reino', 'da', 'Galiza', 'e', 'no', 'norte', 'de', 'Portugal', '.']
    elif lang == 'ron':
        assert tokens == ['Limba', 'română', 'este', 'o', 'limbă', 'indo-europeană', ',', 'din', 'grupul', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbilor', 'romanice', '.']
    elif lang == 'rus':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Tok-tok Tokenizer',
                              'NLTK - Twitter Tokenizer',
                              'razdel - Russian Word Tokenizer']:
            assert tokens == ['Ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'Информация', 'о', 'файле', 'слушать', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянских', 'языков', ',', 'национальный', 'язык', 'русского', 'народа', '.']
        elif word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['Ру', '́', 'сский', 'язы', '́', 'к', '(', '[', 'ˈruskʲɪi', '̯', 'jɪˈzɨk', ']', 'Информация', 'о', 'файле', 'слушать', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянских', 'языков', ',', 'национальный', 'язык', 'русского', 'народа', '.']
        elif word_tokenizer == 'spaCy - Russian Word Tokenizer':
            assert tokens == ['Ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'Информация', 'о', 'файле', 'слушать)[~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянских', 'языков', ',', 'национальный', 'язык', 'русского', 'народа', '.']
    elif lang == 'srp_cyrl':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Српски', 'језик', 'припада', 'словенској', 'групи', 'језика', 'породице', 'индоевропских', 'језика', '.', '[', '12', ']']
        elif word_tokenizer == 'spaCy - Serbian Word Tokenizer':
            assert tokens == ['Српски', 'језик', 'припада', 'словенској', 'групи', 'језика', 'породице', 'индоевропских', 'језика.[12', ']']
    elif lang == 'srp_latn':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Srpski', 'jezik', 'pripada', 'slovenskoj', 'grupi', 'jezika', 'porodice', 'indoevropskih', 'jezika', '.', '[', '12', ']']
        elif word_tokenizer == 'spaCy - Serbian Word Tokenizer':
            assert tokens == ['Srpski', 'jezik', 'pripada', 'slovenskoj', 'grupi', 'jezika', 'porodice', 'indoevropskih', 'jezika.[12', ']']
    elif lang == 'sin':
        assert tokens == ['ශ්\u200dරී', 'ලංකාවේ', 'ප්\u200dරධාන', 'ජාතිය', 'වන', 'සිංහල', 'ජනයාගේ', 'මව්', 'බස', 'සිංහල', 'වෙයි', '.']
    elif lang == 'slk':
        assert tokens == ['Slovenčina', 'patrí', 'do', 'skupiny', 'západoslovanských', 'jazykov', '(', 'spolu', 's', 'češtinou', ',', 'poľštinou', ',', 'hornou', 'a', 'dolnou', 'lužickou', 'srbčinou', 'a', 'kašubčinou', ')', '.']
    elif lang == 'slv':
        assert tokens == ['Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina', ']', 'je', 'združeni', 'naziv', 'za', 'uradni', 'knjižni', 'jezik', 'Slovencev', 'in', 'skupno', 'ime', 'za', 'narečja', 'in', 'govore', ',', 'ki', 'jih', 'govorijo', 'ali', 'so', 'jih', 'nekoč', 'govorili', 'Slovenci', '.']
    elif lang == 'spa':
        assert tokens == ['El', 'español', 'o', 'castellano', 'es', 'una', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablado', '.']
    elif lang == 'swe':
        assert tokens == ['Svenska', '(', 'svenska', '(', 'info', ')', ')', 'är', 'ett', 'östnordiskt', 'språk', 'som', 'talas', 'av', 'ungefär', 'tio', 'miljoner', 'personer', 'främst', 'i', 'Sverige', 'där', 'språket', 'har', 'en', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'men', 'även', 'som', 'det', 'ena', 'nationalspråket', 'i', 'Finland', 'och', 'som', 'enda', 'officiella', 'språk', 'på', 'Åland', '.']
    elif lang == 'tgl':
        assert tokens == ['Ang', 'Wikang', 'Tagalog[2', ']', '(', 'Baybayin', ':', 'ᜏᜒᜃᜅ᜔', 'ᜆᜄᜎᜓᜄ᜔', ')', ',', 'na', 'kilala', 'rin', 'sa', 'payak', 'na', 'pangalang', 'Tagalog', ',', 'ay', 'isa', 'sa', 'mga', 'pangunahing', 'wika', 'ng', 'Pilipinas', 'at', 'sinasabing', 'ito', 'ang', 'de', 'facto', '(', '"', 'sa', 'katunayan', '"', ')', 'ngunit', 'hindî', 'de', 'jure', '(', '"', 'sa', 'batas', '"', ')', 'na', 'batayan', 'na', 'siyang', 'pambansang', 'Wikang', 'Filipino', '(', 'mula', '1961', 'hanggang', '1987', ':', 'Pilipino).[2', ']']
    elif lang == 'tgk':
        assert tokens == ['Забони', 'тоҷикӣ', '—', 'забоне', ',', 'ки', 'дар', 'Эрон', ':', 'форсӣ', ',', 'ва', 'дар', 'Афғонистон', 'дарӣ', 'номида', 'мешавад', ',', 'забони', 'давлатии', 'кишварҳои', 'Тоҷикистон', ',', 'Эрон', 'ва', 'Афғонистон', 'мебошад', '.']
    elif lang == 'tam':
        if word_tokenizer == 'Sacremoses - Moses Tokenizer':
            assert tokens == ['தமிழ', '்', 'மொழி', '(', 'Tamil', 'language', ')', 'தமிழர', '்', 'களினதும', '்', ',', 'தமிழ', '்', 'பேசும', '்', 'பலரதும', '்', 'தாய', '்', 'மொழி', 'ஆகும', '்', '.']
        elif word_tokenizer == 'spaCy - Tamil Word Tokenizer':
            assert tokens == ['தமிழ்', 'மொழி', '(', 'Tamil', 'language', ')', 'தமிழர்களினதும்', ',', 'தமிழ்', 'பேசும்', 'பலரதும்', 'தாய்மொழி', 'ஆகும்', '.']
    elif lang == 'tat':
        assert tokens == ['Татар', 'теле', '—', 'татарларның', 'милли', 'теле', ',', 'Татарстанның', 'дәүләт', 'теле', ',', 'таралышы', 'буенча', 'Русиядә', 'икенче', 'тел', '.']
    elif lang == 'tel':
        assert tokens == ['ఆంధ్ర', 'ప్రదేశ్', ',', 'తెలంగాణ', 'రాష్ట్రాల', 'అధికార', 'భాష', 'తెలుగు', '.']
    elif lang == 'tha':
        if word_tokenizer == 'AttaCut - Thai Word Tokenizer':
            assert tokens == ['ภาษา', 'ไทย', 'หรือ', 'ภาษา', 'ไทย', 'กลาง', 'เป็น', 'ภาษา', 'ราชการ', 'และ', 'ภาษา', 'ประจำ', 'ชาติ', 'ของ', 'ประเทศไทย']
        elif word_tokenizer in ['PyThaiNLP - Longest Matching',
                                'PyThaiNLP - Maximum Matching + TCC',
                                'PyThaiNLP - Maximum Matching + TCC (Safe Mode)']:
            assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทย', 'กลาง', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย']
        elif word_tokenizer == 'PyThaiNLP - Maximum Matching':
            assert tokens == ['ภาษาไทย', 'หรือ', 'ภาษาไทยกลาง', 'เป็น', 'ภาษาราชการ', 'และ', 'ภาษาประจำชาติ', 'ของ', 'ประเทศ', 'ไทย']
    elif lang == 'bod':
        assert tokens == ['བོད་', 'ཀྱི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'དེ', 'འི་', 'ཉེ་འཁོར་', 'གྱི་', 'ས་ཁུལ་', 'ཏེ', '།']
    elif lang == 'tur':
        assert tokens == ['Türkçe', 'ya', 'da', 'Türk', 'dili', ',', 'batıda', 'Balkanlar’dan', 'başlayıp', 'doğuda', 'Hazar', 'Denizi', 'sahasına', 'kadar', 'konuşulan', 'Türkî', 'diller', 'dil', 'ailesine', 'ait', 'sondan', 'eklemeli', 'bir', 'dil.[12', ']']
    elif lang == 'ukr':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer',
                              'NLTK - Twitter Tokenizer']:
            assert tokens == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичні', 'назви', '—', 'ру́ська', ',', 'руси́нська', '[', '9', ']', '[', '10', ']', '[', '11', ']', '[', '*', '2', ']', ')', '—', 'національна', 'мова', 'українців', '.']
        elif word_tokenizer == 'spaCy - Ukrainian Word Tokenizer':
            assert tokens == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичні', 'назви', '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національна', 'мова', 'українців', '.']
    elif lang == 'urd':
        if word_tokenizer in ['NLTK - NIST Tokenizer',
                              'NLTK - NLTK Tokenizer',
                              'NLTK - Penn Treebank Tokenizer']:
            assert tokens == ['اُردُو', 'لشکری', 'زبان', '[', '8', ']', '(', 'یا', 'جدید', 'معیاری', 'اردو', ')', 'برصغیر', 'کی', 'معیاری', 'زبانوں', 'میں', 'سے', 'ایک', 'ہے۔']
        elif word_tokenizer == 'NLTK - Twitter Tokenizer':
            assert tokens == ['اُردُو', 'لشکری', 'زبان', '[8', ']', '(', 'یا', 'جدید', 'معیاری', 'اردو', ')', 'برصغیر', 'کی', 'معیاری', 'زبانوں', 'میں', 'سے', 'ایک', 'ہے', '۔']
        elif word_tokenizer == 'spaCy - Urdu Word Tokenizer':
            assert tokens == ['اُردُو', 'لشکری', 'زبان[8', ']', '(', 'یا', 'جدید', 'معیاری', 'اردو', ')', 'برصغیر', 'کی', 'معیاری', 'زبانوں', 'میں', 'سے', 'ایک', 'ہے', '۔']
    elif lang == 'vie':
        if word_tokenizer == 'NLTK - Tok-tok Tokenizer':
            assert tokens == ['Tiếng', 'Việt', ',', 'còn', 'gọi', 'tiếng', 'Việt', 'Nam[', '5', ']', ',', 'tiếng', 'Kinh', 'hay', 'Việt', 'ngữ', ',', 'là', 'ngôn', 'ngữ', 'của', 'người', 'Việt', '(', 'dân', 'tộc', 'Kinh', ')', 'và', 'là', 'ngôn', 'ngữ', 'chính', 'thức', 'tại', 'Việt', 'Nam', '.']
        elif word_tokenizer == 'Underthesea - Vietnamese Word Tokenizer':
            assert tokens == ['Tiếng', 'Việt', ',', 'còn', 'gọi', 'tiếng', 'Việt Nam', '[', '5', ']', ',', 'tiếng Kinh', 'hay', 'Việt ngữ', ',', 'là', 'ngôn ngữ', 'của', 'người', 'Việt', '(', 'dân tộc', 'Kinh', ')', 'và', 'là', 'ngôn ngữ', 'chính thức', 'tại', 'Việt Nam', '.']
    elif lang == 'yor':
        assert tokens == ['Èdè', 'Yorùbá', 'Ni', 'èdè', 'tí', 'ó', 'ṣàkójọ', 'pọ̀', 'gbogbo', 'kú', 'oótu', 'o', '-', 'ò', '-', 'jíire', 'bí', ',', 'níapá', 'ìwọ̀', 'Oòrùn', 'ilẹ̀', 'Nàìjíríà', ',', 'tí', 'a', 'bá', 'wo', 'èdè', 'Yorùbá', ',', 'àwọn', 'onímọ̀', 'pín', 'èdè', 'náà', 'sábẹ́', 'ẹ̀yà', 'Kwa', 'nínú', 'ẹbí', 'èdè', 'Niger', '-', 'Congo', '.']

def test_word_detokenize(lang, word_detokenizer, show_results=False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text=getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)
    text = wl_word_detokenization.wl_word_detokenize(
        main, tokens=tokens, lang=lang, word_detokenizer=word_detokenizer)

    if show_results:
        print(f'{lang} / {word_detokenizer}:')
        print(text)

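    # Each branch below checks the detokenized text against the expected
    # result for that language; where detokenizers disagree, a separate
    # expectation is kept per detokenizer.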
    if lang == 'cat':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11 ]"
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11]"
    elif lang == 'zho_cn':
        assert text == '汉语,又称汉文、中文、中国话、中国语、华语、华文、唐话[2],或被视为一个语族,或被视为隶属于汉藏语系汉语族之一种语言。'
    elif lang == 'zho_tw':
        assert text == '漢語,又稱漢文、中文、中國話、中國語、華語、華文、唐話[2],或被視為一個語族,或被視為隸屬於漢藏語系漢語族之一種語言。'
    elif lang == 'ces':
        assert text == 'Čeština neboli český jazyk je západoslovanský jazyk, nejbližší slovenštině, poté lužické srbštině a polštině.'
    elif lang == 'nld':
        assert text == 'Het Nederlands is een West-Germaanse taal en de moedertaal van de meeste inwoners van Nederland, België en Suriname.'
    elif lang == 'eng':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[4][5 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[4][5]'
    elif lang == 'fin':
        assert text == 'Suomen kieli (suomi) on uralilaisten kielten itämerensuomalaiseen ryhmään kuuluva kieli.'
    elif lang == 'fra':
        assert text == 'Le français est une langue indo-européenne de la famille des langues romanes.'
    elif lang == 'deu':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([ dɔʏ̯t͡ʃ]; abgekürzt dt . oder dtsch .) ist eine westgermanische Sprache.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache.'
    elif lang == 'ell':
        assert text == 'Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια[9] και συγκεκριμένα στον ελληνικό κλάδο, μαζί με την τσακωνική, ενώ είναι η επίσημη γλώσσα της Ελλάδος και της Κύπρου.'
    elif lang == 'hun':
        assert text == 'A magyar nyelv az uráli nyelvcsalád tagja, a finnugor nyelvek közé tartozó ugor nyelvek egyike.'
    elif lang == 'isl':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[4 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[4]'
    elif lang == 'gle':
        assert text == 'Is ceann de na teangacha Ceilteacha í an Ghaeilge (nó Gaeilge na hÉireann mar a thugtar uirthi corruair), agus ceann den dtrí cinn de theangacha Ceilteacha ar a dtugtar na teangacha Gaelacha (.i. an Ghaeilge, Gaeilge na hAlban agus Gaeilge Mhanann) go háirithe.'
    elif lang == 'ita':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "L' italiano ([ itaˈljaːno][Nota 1] ascolta[?·info] ) è una lingua romanza parlata principalmente in Italia."
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "L'italiano ([itaˈljaːno][Nota 1] ascolta[?·info]) è una lingua romanza parlata principalmente in Italia."
    elif lang == 'jpn':
        assert text == '日本語(にほんご、にっぽんご[注1])は、主に日本国内や日本人同士の間で使用されている言語である。'
    elif lang == 'lav':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda . [3 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda. [3]'
    elif lang == 'lit':
        assert text == 'Lietuvių kalba – iš baltų prokalbės kilusi lietuvių tautos kalba, kuri Lietuvoje yra valstybinė, o Europos Sąjungoje – viena iš oficialiųjų kalbų.'
    elif lang == 'pol':
        assert text == 'Język polski, polszczyzna, skrót: pol. – język naturalny należący do grupy języków zachodniosłowiańskich (do której należą również czeski, słowacki, kaszubski, dolnołużycki, górnołużycki i wymarły połabski), stanowiącej część rodziny języków indoeuropejskich.'
    elif lang == 'por':
        assert text == 'A língua portuguesa, também designada português, é uma língua românica flexiva ocidental originada no galego-português falado no Reino da Galiza e no norte de Portugal.'
    elif lang == 'ron':
        assert text == 'Limba română este o limbă indo - europeană, din grupul italic și din subgrupul oriental al limbilor romanice.'
    elif lang == 'rus':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Ру́сский язы́к ([ ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Ру́сский язы́к ([ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
    elif lang == 'slk':
        assert text == 'Slovenčina patrí do skupiny západoslovanských jazykov (spolu s češtinou, poľštinou, hornou a dolnou lužickou srbčinou a kašubčinou).'
    elif lang == 'slv':
        assert text == 'Slovenščina [slovénščina] / [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.'
    elif lang == 'spa':
        assert text == 'El español o castellano es una lengua romance procedente del latín hablado.'
    elif lang == 'swe':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Svenska (svenska (info) ) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Svenska (svenska (info)) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
    elif lang == 'tam':
        assert text == 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும்.'
    elif lang == 'tha':
        assert text == 'ภาษาไทยหรือภาษาไทยกลางเป็นภาษาราชการและภาษาประจำชาติของประเทศไทย'
    elif lang == 'bod':
        assert text == 'བོད་ཀྱི་སྐད་ཡིག་ནི་བོད་ཡུལ་དང་དེའི་ཉེ་འཁོར་གྱི་ས་ཁུལ་ཏེ།'
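
# Minimal manual driver (illustrative sketch only; the actual suite presumably
# parametrizes this test over every supported language/detokenizer pair,
# e.g. via pytest.mark.parametrize):
if __name__ == '__main__':
    for lang, word_detokenizer in [
            ('eng', 'NLTK - Penn Treebank Detokenizer'),
            ('eng', 'Sacremoses - Moses Detokenizer')]:
        test_word_detokenize(lang, word_detokenizer, show_results=True)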
Exemple #9
def test_pos_tag(lang, pos_tagger, show_results=False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text=getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)

    tokens_tagged = wl_pos_tagging.wl_pos_tag(main,
                                              tokens=tokens,
                                              lang=lang,
                                              pos_tagger=pos_tagger)
    tokens_tagged_universal = wl_pos_tagging.wl_pos_tag(main,
                                                        tokens=tokens,
                                                        lang=lang,
                                                        pos_tagger=pos_tagger,
                                                        tagset='universal')

    if show_results:
        print(f'{lang} / {pos_tagger}:')
        print(tokens_tagged)
        print(tokens_tagged_universal)

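    # Each branch below checks the tagged tokens against the expected
    # (token, tag) pairs for that language, in both the tagger's native
    # tagset and the universal tagset.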
    if lang == 'zho_cn':
        assert tokens_tagged == [('汉语', 'nz'), (',', 'x'), ('又称', 'n'),
                                 ('汉文', 'nz'), ('、', 'x'), ('中文', 'nz'),
                                 ('、', 'x'), ('中国', 'ns'), ('话', 'n'),
                                 ('、', 'x'), ('中国', 'ns'), ('语', 'ng'),
                                 ('、', 'x'), ('华语', 'nz'), ('、', 'x'),
                                 ('华文', 'nz'), ('、', 'x'), ('唐', 'nr'),
                                 ('话', 'n'), ('[', 'x'), ('2', 'x'),
                                 (']', 'x'), (',', 'x'),
                                 ('或', 'c'), ('被', 'p'), ('视为', 'v'),
                                 ('一个', 'm'), ('语族', 'n'), (',', 'x'),
                                 ('或', 'c'), ('被', 'p'), ('视为', 'v'),
                                 ('隶属于', 'n'), ('汉藏语系', 'nz'), ('汉语', 'nz'),
                                 ('族', 'ng'), ('之', 'u'), ('一种', 'm'),
                                 ('语言', 'n'), ('。', 'x')]
        assert tokens_tagged_universal == [('汉语', 'PROPN'), (',', 'PUNCT/SYM'),
                                           ('又称', 'NOUN'), ('汉文', 'PROPN'),
                                           ('、', 'PUNCT/SYM'), ('中文', 'PROPN'),
                                           ('、', 'PUNCT/SYM'), ('中国', 'PROPN'),
                                           ('话', 'NOUN'), ('、', 'PUNCT/SYM'),
                                           ('中国', 'PROPN'), ('语', 'NOUN'),
                                           ('、', 'PUNCT/SYM'), ('华语', 'PROPN'),
                                           ('、', 'PUNCT/SYM'), ('华文', 'PROPN'),
                                           ('、', 'PUNCT/SYM'), ('唐', 'PRONP'),
                                           ('话', 'NOUN'), ('[', 'PUNCT/SYM'),
                                           ('2', 'PUNCT/SYM'),
                                           (']', 'PUNCT/SYM'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('视为', 'VERB'),
                                           ('一个', 'NUM'), ('语族', 'NOUN'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('视为', 'VERB'),
                                           ('隶属于', 'NOUN'), ('汉藏语系', 'PROPN'),
                                           ('汉语', 'PROPN'), ('族', 'NOUN'),
                                           ('之', 'PART'), ('一种', 'NUM'),
                                           ('语言', 'NOUN'), ('。', 'PUNCT/SYM')]
    elif lang == 'zho_tw':
        assert tokens_tagged == [
            ('漢語', 'nz'), (',', 'x'), ('又', 'd'), ('稱', 'v'), ('漢文', 'nz'),
            ('、', 'x'), ('中文', 'nz'), ('、', 'x'), ('中國', 'ns'), ('話', 'n'),
            ('、', 'x'), ('中國', 'ns'), ('語', 'n'), ('、', 'x'), ('華語', 'nz'),
            ('、', 'x'), ('華文', 'nz'), ('、', 'x'), ('唐', 'nr'), ('話', 'n'),
            ('[', 'x'), ('2', 'x'), (']', 'x'), (',', 'x'), ('或', 'c'),
            ('被', 'p'), ('視為', 'v'), ('一個', 'm'), ('語族', 'n'), (',', 'x'),
            ('或', 'c'), ('被', 'p'), ('視為', 'v'), ('隸', 'j'), ('屬', 'v'),
            ('於', 'nr'), ('漢藏語', 'nz'), ('系漢', 'n'), ('語族', 'n'), ('之一', 'r'),
            ('種語', 'n'), ('言', 'vg'), ('。', 'x')
        ]
        assert tokens_tagged_universal == [('漢語', 'PROPN'), (',', 'PUNCT/SYM'),
                                           ('又', 'ADV'), ('稱', 'VERB'),
                                           ('漢文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中國', 'PROPN'), ('話', 'NOUN'),
                                           ('、', 'PUNCT/SYM'), ('中國', 'PROPN'),
                                           ('語', 'NOUN'), ('、', 'PUNCT/SYM'),
                                           ('華語', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('華文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('唐', 'PRONP'), ('話', 'NOUN'),
                                           ('[', 'PUNCT/SYM'),
                                           ('2', 'PUNCT/SYM'),
                                           (']', 'PUNCT/SYM'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('視為', 'VERB'),
                                           ('一個', 'NUM'), ('語族', 'NOUN'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('視為', 'VERB'),
                                           ('隸', 'X'), ('屬', 'VERB'),
                                           ('於', 'PRONP'), ('漢藏語', 'PROPN'),
                                           ('系漢', 'NOUN'), ('語族', 'NOUN'),
                                           ('之一', 'PRON'), ('種語', 'NOUN'),
                                           ('言', 'VERB'), ('。', 'PUNCT/SYM')]
    elif lang == 'nld':
        assert tokens_tagged == [
            ('Het',
             'Art|bep|onzijd|neut__Definite=Def|Gender=Neut|PronType=Art'),
            ('Nederlands', 'Adj|zelfst|stell|onverv__Degree=Pos'),
            ('is',
             'V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'
             ),
            ('een',
             'Art|onbep|zijdofonzijd|neut__Definite=Ind|Number=Sing|PronType=Art'
             ),
            ('West-Germaanse', 'Adj|attr|stell|vervneut__Case=Nom|Degree=Pos'),
            ('taal', 'N|soort|ev|neut__Number=Sing'), ('en', 'Conj|neven___'),
            ('de', 'Art|bep|zijdofmv|neut__Definite=Def|PronType=Art'),
            ('moedertaal', 'N|soort|ev|neut__Number=Sing'),
            ('van', 'Prep|voor__AdpType=Prep'),
            ('de', 'Art|bep|zijdofmv|neut__Definite=Def|PronType=Art'),
            ('meeste', 'Num__Case=Nom|Degree=Sup|NumType=Card|PronType=Ind'),
            ('inwoners', 'N|soort|mv|neut__Number=Plur'),
            ('van', 'Prep|voor__AdpType=Prep'),
            ('Nederland', 'N|eigen|ev|neut__Number=Sing'),
            (',', 'Punc|komma__PunctType=Comm'),
            ('België', 'N|eigen|ev|neut__Number=Sing'),
            ('en', 'Conj|neven___'),
            ('Suriname', 'N|eigen|ev|neut__Number=Sing'),
            ('.', 'Punc|punt__PunctType=Peri')
        ]
        assert tokens_tagged_universal == [
            ('Het', 'DET'), ('Nederlands', 'ADJ'), ('is', 'VERB'),
            ('een', 'DET'), ('West-Germaanse', 'ADJ'), ('taal', 'NOUN'),
            ('en', 'CONJ'), ('de', 'DET'), ('moedertaal', 'NOUN'),
            ('van', 'ADP'), ('de', 'DET'), ('meeste', 'NUM'),
            ('inwoners', 'NOUN'), ('van', 'ADP'), ('Nederland', 'NOUN'),
            (',', 'PUNCT'), ('België', 'NOUN'), ('en', 'CONJ'),
            ('Suriname', 'NOUN'), ('.', 'PUNCT')
        ]
    elif lang == 'eng':
        if pos_tagger == 'NLTK - Perceptron POS Tagger':
            assert tokens_tagged == [('English', 'NNP'), ('is', 'VBZ'),
                                     ('a', 'DT'), ('West', 'NNP'),
                                     ('Germanic', 'NNP'), ('language', 'NN'),
                                     ('that', 'WDT'), ('was', 'VBD'),
                                     ('first', 'RB'), ('spoken', 'VBN'),
                                     ('in', 'IN'), ('early', 'JJ'),
                                     ('medieval', 'NN'), ('England', 'NNP'),
                                     ('and', 'CC'), ('eventually', 'RB'),
                                     ('became', 'VBD'), ('a', 'DT'),
                                     ('global', 'JJ'), ('lingua', 'NN'),
                                     ('franca.[4][5', 'NN'), (']', 'NN')]
            assert tokens_tagged_universal == [('English', 'PROPN'),
                                               ('is', 'VERB'), ('a', 'DET'),
                                               ('West', 'PROPN'),
                                               ('Germanic', 'PROPN'),
                                               ('language', 'NOUN'),
                                               ('that', 'DET'), ('was',
                                                                 'VERB'),
                                               ('first', 'ADV'),
                                               ('spoken', 'VERB'),
                                               ('in', 'ADP/SCONJ'),
                                               ('early', 'ADJ'),
                                               ('medieval', 'NOUN'),
                                               ('England', 'PROPN'),
                                               ('and', 'CCONJ'),
                                               ('eventually', 'ADV'),
                                               ('became', 'VERB'),
                                               ('a', 'DET'), ('global', 'ADJ'),
                                               ('lingua', 'NOUN'),
                                               ('franca.[4][5', 'NOUN'),
                                               (']', 'NOUN')]
        elif pos_tagger == 'spaCy - English POS Tagger':
            assert tokens_tagged == [('English', 'NNP'), ('is', 'VBZ'),
                                     ('a', 'DT'), ('West', 'NNP'),
                                     ('Germanic', 'JJ'), ('language', 'NN'),
                                     ('that', 'WDT'), ('was', 'VBD'),
                                     ('first', 'RB'), ('spoken', 'VBN'),
                                     ('in', 'IN'), ('early', 'JJ'),
                                     ('medieval', 'NN'), ('England', 'NNP'),
                                     ('and', 'CC'), ('eventually', 'RB'),
                                     ('became', 'VBD'), ('a', 'DT'),
                                     ('global', 'JJ'), ('lingua', 'NN'),
                                     ('franca.[4][5', 'NN'), (']', '-RRB-')]
            assert tokens_tagged_universal == [('English', 'PROPN'),
                                               ('is', 'VERB'), ('a', 'DET'),
                                               ('West', 'PROPN'),
                                               ('Germanic', 'ADJ'),
                                               ('language', 'NOUN'),
                                               ('that', 'DET'), ('was',
                                                                 'VERB'),
                                               ('first', 'ADV'),
                                               ('spoken', 'VERB'),
                                               ('in', 'ADP/SCONJ'),
                                               ('early', 'ADJ'),
                                               ('medieval', 'NOUN'),
                                               ('England', 'PROPN'),
                                               ('and', 'CCONJ'),
                                               ('eventually', 'ADV'),
                                               ('became', 'VERB'),
                                               ('a', 'DET'), ('global', 'ADJ'),
                                               ('lingua', 'NOUN'),
                                               ('franca.[4][5', 'NOUN'),
                                               (']', 'PUNCT')]
    elif lang == 'fra':
        assert tokens_tagged == [
            ('Le', 'DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('français', 'NOUN__Gender=Masc|Number=Sing'),
            ('est',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('une', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('langue', 'NOUN__Gender=Fem|Number=Sing'),
            ('indo-européenne', 'ADJ___'), ('de', 'ADP___'),
            ('la', 'DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('famille', 'NOUN__Gender=Fem|Number=Sing'),
            ('des', 'DET__Definite=Ind|Number=Plur|PronType=Art'),
            ('langues', 'ADJ__Number=Plur'),
            ('romanes', 'ADJ__Gender=Fem|Number=Plur'), ('.', 'PUNCT___')
        ]
        assert tokens_tagged_universal == [('Le', 'DET'), ('français', 'NOUN'),
                                           ('est', 'AUX'), ('une', 'DET'),
                                           ('langue', 'NOUN'),
                                           ('indo-européenne', 'ADJ'),
                                           ('de', 'ADP'), ('la', 'DET'),
                                           ('famille', 'NOUN'), ('des', 'DET'),
                                           ('langues', 'ADJ'),
                                           ('romanes', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'deu':
        assert tokens_tagged == [('Die', 'ART'), ('deutsche', 'ADJA'),
                                 ('Sprache', 'NN'), ('bzw.', 'VVFIN'),
                                 ('Deutsch', 'NN'), ('(', '$('), ('[', 'NN'),
                                 ('dɔʏ̯t͡ʃ', 'NE'), (']', 'PTKVZ'),
                                 (';', '$.'), ('abgekürzt', 'VVFIN'),
                                 ('dt', 'XY'), ('.', '$.'), ('oder', 'KON'),
                                 ('dtsch', 'ADJD'), ('.', '$.'), (')', '$('),
                                 ('ist', 'VAFIN'), ('eine', 'ART'),
                                 ('westgermanische', 'ADJA'),
                                 ('Sprache', 'NN'), ('.', '$.')]
        assert tokens_tagged_universal == [('Die', 'DET'), ('deutsche', 'ADJ'),
                                           ('Sprache', 'NOUN'),
                                           ('bzw.', 'VERB'),
                                           ('Deutsch', 'NOUN'), ('(', 'PUNCT'),
                                           ('[', 'NOUN'), ('dɔʏ̯t͡ʃ', 'PROPN'),
                                           (']', 'PART'), (';', 'PUNCT'),
                                           ('abgekürzt', 'VERB'), ('dt', 'X'),
                                           ('.', 'PUNCT'), ('oder', 'CCONJ'),
                                           ('dtsch', 'ADJ'), ('.', 'PUNCT'),
                                           (')', 'PUNCT'), ('ist', 'AUX'),
                                           ('eine', 'DET'),
                                           ('westgermanische', 'ADJ'),
                                           ('Sprache', 'NOUN'), ('.', 'PUNCT')]
    elif lang == 'ell':
        assert tokens_tagged == [('Η', 'DET'), ('ελληνική', 'ADJ'),
                                 ('γλώσσα', 'NOUN'), ('ανήκει', 'VERB'),
                                 ('στην', 'ADJ'), ('ινδοευρωπαϊκή', 'ADJ'),
                                 ('οικογένεια[9', 'NOUN'), (']', 'NUM'),
                                 ('και', 'CCONJ'), ('συγκεκριμένα', 'ADV'),
                                 ('στον', 'ADV'), ('ελληνικό', 'ADJ'),
                                 ('κλάδο', 'NOUN'), (',', 'PUNCT'),
                                 ('μαζί', 'ADV'), ('με', 'ADP'),
                                 ('την', 'DET'), ('τσακωνική', 'ADJ'),
                                 (',', 'PUNCT'), ('ενώ', 'SCONJ'),
                                 ('είναι', 'AUX'), ('η', 'DET'),
                                 ('επίσημη', 'ADJ'), ('γλώσσα', 'NOUN'),
                                 ('της', 'DET'), ('Ελλάδος', 'PROPN'),
                                 ('και', 'CCONJ'), ('της', 'DET'),
                                 ('Κύπρου', 'PROPN'), ('.', 'PUNCT')]
        assert tokens_tagged_universal == [('Η', 'DET'), ('ελληνική', 'ADJ'),
                                           ('γλώσσα', 'NOUN'),
                                           ('ανήκει', 'VERB'), ('στην', 'ADJ'),
                                           ('ινδοευρωπαϊκή', 'ADJ'),
                                           ('οικογένεια[9', 'NOUN'),
                                           (']', 'NUM'), ('και', 'CCONJ'),
                                           ('συγκεκριμένα', 'ADV'),
                                           ('στον', 'ADV'),
                                           ('ελληνικό', 'ADJ'),
                                           ('κλάδο', 'NOUN'), (',', 'PUNCT'),
                                           ('μαζί', 'ADV'), ('με', 'ADP'),
                                           ('την', 'DET'), ('τσακωνική',
                                                            'ADJ'),
                                           (',', 'PUNCT'), ('ενώ', 'SCONJ'),
                                           ('είναι', 'AUX'), ('η', 'DET'),
                                           ('επίσημη', 'ADJ'),
                                           ('γλώσσα', 'NOUN'), ('της', 'DET'),
                                           ('Ελλάδος', 'PROPN'),
                                           ('και', 'CCONJ'), ('της', 'DET'),
                                           ('Κύπρου', 'PROPN'), ('.', 'PUNCT')]
    elif lang == 'ita':
        assert tokens_tagged == [
            ("L'", 'RD__Definite=Def|Number=Sing|PronType=Art'),
            ('italiano', 'S__Gender=Masc|Number=Sing'), ('(', 'FB___'),
            ('[', 'FB___'), ('itaˈljaːno][Nota', 'A__Gender=Fem|Number=Sing'),
            ('1', 'N__NumType=Card'), (']', 'FB___'),
            ('ascolta[?·info', 'SP___'), (']', 'FB___'), (')', 'FB___'),
            ('è', 'V__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('una', 'RI__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('lingua', 'S__Gender=Fem|Number=Sing'),
            ('romanza', 'S__Gender=Fem|Number=Sing'),
            ('parlata', 'V__Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part'),
            ('principalmente', 'B___'), ('in', 'E___'), ('Italia', 'SP___'),
            ('.', 'FS___')
        ]
        assert tokens_tagged_universal == [("L'", 'DET'), ('italiano', 'NOUN'),
                                           ('(', 'PUNCT'), ('[', 'PUNCT'),
                                           ('itaˈljaːno][Nota', 'ADJ'),
                                           ('1', 'NUM'), (']', 'PUNCT'),
                                           ('ascolta[?·info', 'PROPN'),
                                           (']', 'PUNCT'), (')', 'PUNCT'),
                                           ('è', 'VERB'), ('una', 'DET'),
                                           ('lingua', 'NOUN'),
                                           ('romanza', 'NOUN'),
                                           ('parlata', 'VERB'),
                                           ('principalmente', 'ADV'),
                                           ('in', 'ADP'), ('Italia', 'PROPN'),
                                           ('.', 'PUNCT')]
    elif lang == 'jpn':
        assert tokens_tagged == [('日本', '名詞'), ('語', '名詞'), ('(', '補助記号'),
                                 ('にほんご', '名詞'), ('、', '補助記号'), ('にっぽん', '名詞'),
                                 ('ご', '接尾辞'), ('[', '補助記号'), ('注', '名詞'),
                                 ('1', '名詞'), (']', '補助記号'), (')', '補助記号'),
                                 ('は', '助詞'), ('、', '補助記号'), ('主に', '副詞'),
                                 ('日本', '名詞'), ('国', '接尾辞'), ('内', '接尾辞'),
                                 ('や', '助詞'), ('日本', '名詞'), ('人', '接尾辞'),
                                 ('同士', '接尾辞'), ('の', '助詞'), ('間', '名詞'),
                                 ('で', '助詞'), ('使用', '名詞'), ('さ', '動詞'),
                                 ('れ', '助動詞'), ('て', '助詞'), ('いる', '動詞'),
                                 ('言語', '名詞'), ('で', '助動詞'), ('ある', '動詞'),
                                 ('。', '補助記号')]
        assert tokens_tagged_universal == [('日本', 'NOUN'), ('語', 'NOUN'),
                                           ('(', 'PUNCT/SYM'), ('にほんご',
                                                                'NOUN'),
                                           ('、', 'PUNCT/SYM'),
                                           ('にっぽん', 'NOUN'), ('ご', 'PART'),
                                           ('[', 'PUNCT/SYM'), ('注', 'NOUN'),
                                           ('1', 'NOUN'), (']', 'PUNCT/SYM'),
                                           (')', 'PUNCT/SYM'), ('は', 'PART'),
                                           ('、', 'PUNCT/SYM'), ('主に', 'ADV'),
                                           ('日本', 'NOUN'), ('国', 'PART'),
                                           ('内', 'PART'), ('や', 'PART'),
                                           ('日本', 'NOUN'), ('人', 'PART'),
                                           ('同士', 'PART'), ('の', 'PART'),
                                           ('間', 'NOUN'), ('で', 'PART'),
                                           ('使用', 'NOUN'), ('さ', 'VERB'),
                                           ('れ', 'AUX'), ('て', 'PART'),
                                           ('いる', 'VERB'), ('言語', 'NOUN'),
                                           ('で', 'AUX'), ('ある', 'VERB'),
                                           ('。', 'PUNCT/SYM')]
    elif lang == 'lit':
        assert tokens_tagged == [
            ('Lietuvių', 'Ncmpgn-'), ('kalba', 'Ncfsnn-'), ('–', 'Z'),
            ('iš', 'Sgg'), ('baltų', 'Agpmpgy'), ('prokalbės', 'Ncfsgn-'),
            ('kilusi', 'Agpmsin'), ('lietuvių', 'Ncmpgn-'),
            ('tautos', 'Ncfsgn-'), ('kalba', 'Ncfsin-'), (',', 'Z'),
            ('kuri', 'Pgfsnn'), ('Lietuvoje', 'Npfslng'),
            ('yra', 'Vgmp3s--n--ni-'), ('valstybinė', 'Agpfsnn'), (',', 'Z'),
            ('o', 'Cg'), ('Europos', 'Npfsgng'), ('Sąjungoje', 'Npfslng'),
            ('–', 'Z'), ('viena', 'Pgn--n'), ('iš', 'Sgg'),
            ('oficialiųjų', 'Agpmpgy'), ('kalbų', 'Ncmsnn-'), ('.', 'Z')
        ]
        assert tokens_tagged_universal == [
            ('Lietuvių', 'NOUN'), ('kalba', 'NOUN'), ('–', 'PUNCT'),
            ('iš', 'ADP'), ('baltų', 'ADJ'), ('prokalbės', 'NOUN'),
            ('kilusi', 'ADJ'), ('lietuvių', 'NOUN'), ('tautos', 'NOUN'),
            ('kalba', 'NOUN'), (',', 'PUNCT'), ('kuri', 'PRON'),
            ('Lietuvoje', 'PROPN'), ('yra', 'VERB'), ('valstybinė', 'ADJ'),
            (',', 'PUNCT'), ('o', 'CONJ'), ('Europos', 'PROPN'),
            ('Sąjungoje', 'PROPN'), ('–', 'PUNCT'), ('viena', 'PRON'),
            ('iš', 'ADP'), ('oficialiųjų', 'ADJ'), ('kalbų', 'NOUN'),
            ('.', 'PUNCT')
        ]
    elif lang == 'nob':
        assert tokens_tagged == [
            ('Bokmål', 'NOUN__Definite=Ind|Gender=Neut|Number=Sing'),
            ('er', 'VERB__Mood=Ind|Tense=Pres|VerbForm=Fin'),
            ('en', 'DET__Gender=Masc|Number=Sing'),
            ('varietet', 'ADJ__Definite=Ind|Number=Sing'), ('av', 'ADP___'),
            ('norsk', 'ADJ__Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing'),
            ('språk', 'NOUN__Definite=Ind|Gender=Neut|Number=Sing'),
            ('.', 'PUNCT___')
        ]
        assert tokens_tagged_universal == [('Bokmål', 'NOUN'), ('er', 'VERB'),
                                           ('en', 'DET'), ('varietet', 'ADJ'),
                                           ('av', 'ADP'), ('norsk', 'ADJ'),
                                           ('språk', 'NOUN'), ('.', 'PUNCT')]
    elif lang == 'por':
        assert tokens_tagged == [('A', '<artd>|ART|F|S|@>N'),
                                 ('língua', '<np-def>|N|F|S|@SUBJ>'),
                                 ('portuguesa', 'ADJ|F|S|@N<'),
                                 (',', 'PU|@PU'), ('também', 'ADV|@ADVL>'),
                                 ('designada', '<mv>|V|PCP|F|S|@ICL-N<PRED'),
                                 ('português', 'ADJ|F|S|@N<'), (',', 'PU|@PU'),
                                 ('é', '<mv>|V|PR|3S|IND|@FS-STA'),
                                 ('uma', '<arti>|ART|F|S|@>N'),
                                 ('língua', '<np-idf>|N|F|S|@<SC'),
                                 ('românica', 'ADJ|F|S|@N<'),
                                 ('flexiva', 'ADJ|F|S|@N<'),
                                 ('ocidental', 'ADJ|F|S|@N<'),
                                 ('originada', '<mv>|V|PCP|F|S|@ICL-N<'),
                                 ('no', 'PRP|@<OC'),
                                 ('galego-português', '<np-def>|N|M|S|@<ACC'),
                                 ('falado', '<mv>|V|PCP|M|S|@ICL-N<'),
                                 ('no', '<artd>|ART|M|S|@>N'),
                                 ('Reino', 'PROP|M|S|@P<'), ('da', 'PRP|@N<'),
                                 ('Galiza', 'PROPN'),
                                 ('e', '<co-prparg>|KC|@CO'),
                                 ('no', '<cjt>|PRP|@N<'), ('norte', 'N|@P<'),
                                 ('de', 'PRP|@N<'),
                                 ('Portugal', 'PROP|M|S|@P<'), ('.', 'PU|@PU')]
        assert tokens_tagged_universal == [
            ('A', 'DET'), ('língua', 'NOUN'), ('portuguesa', 'ADJ'),
            (',', 'PUNCT'), ('também', 'ADV'), ('designada', 'VERB'),
            ('português', 'ADJ'), (',', 'PUNCT'), ('é', 'VERB'),
            ('uma', 'DET'), ('língua', 'NOUN'), ('românica', 'ADJ'),
            ('flexiva', 'ADJ'), ('ocidental', 'ADJ'), ('originada', 'VERB'),
            ('no', 'ADP'), ('galego-português', 'NOUN'), ('falado', 'VERB'),
            ('no', 'DET'), ('Reino', 'PROPN'), ('da', 'ADP'),
            ('Galiza', 'PROPN'), ('e', 'CCONJ'), ('no', 'ADP'),
            ('norte', 'NOUN'), ('de', 'ADP'), ('Portugal', 'PROPN'),
            ('.', 'PUNCT')
        ]
    elif lang == 'rus':
        if pos_tagger == 'NLTK - Perceptron POS Tagger':
            assert tokens_tagged == [('Ру́сский', 'A=m'), ('язы́к', 'S'),
                                     ('(', 'NONLEX'), ('[', 'NONLEX'),
                                     ('ˈruskʲɪi̯', 'NONLEX'),
                                     ('jɪˈzɨk', 'NONLEX'), (']', 'NONLEX'),
                                     ('Информация', 'S'), ('о', 'PR'),
                                     ('файле', 'S'), ('слушать', 'V'),
                                     (')', 'NONLEX'), ('[', 'NONLEX'),
                                     ('~', 'NONLEX'), ('3', 'NUM=ciph'),
                                     (']', 'NONLEX'), ('[', 'NONLEX'),
                                     ('⇨', 'NONLEX'), (']', 'NONLEX'),
                                     ('—', 'NONLEX'), ('один', 'A-PRO=m'),
                                     ('из', 'PR'),
                                     ('восточнославянских', 'A=pl'),
                                     ('языков', 'S'), (',', 'NONLEX'),
                                     ('национальный', 'A=m'), ('язык', 'S'),
                                     ('русского', 'A=m'), ('народа', 'S'),
                                     ('.', 'NONLEX')]
            assert tokens_tagged_universal == [('Ру́сский', 'ADJ'),
                                               ('язы́к', 'NOUN'),
                                               ('(', 'PUNCT'), ('[', 'PUNCT'),
                                               ('ˈruskʲɪi̯', 'PUNCT'),
                                               ('jɪˈzɨk', 'PUNCT'),
                                               (']', 'PUNCT'),
                                               ('Информация', 'NOUN'),
                                               ('о', 'ADP'), ('файле', 'NOUN'),
                                               ('слушать', 'VERB'),
                                               (')', 'PUNCT'), ('[', 'PUNCT'),
                                               ('~', 'PUNCT'), ('3', 'NUM'),
                                               (']', 'PUNCT'), ('[', 'PUNCT'),
                                               ('⇨', 'PUNCT'), (']', 'PUNCT'),
                                               ('—', 'PUNCT'),
                                               ('один', 'PRON'), ('из', 'ADP'),
                                               ('восточнославянских', 'ADJ'),
                                               ('языков', 'NOUN'),
                                               (',', 'PUNCT'),
                                               ('национальный', 'ADJ'),
                                               ('язык', 'NOUN'),
                                               ('русского', 'ADJ'),
                                               ('народа', 'NOUN'),
                                               ('.', 'PUNCT')]
        elif pos_tagger == 'pymorphy2 - Morphological Analyzer':
            assert tokens_tagged == [
                ('Ру́сский', 'NOUN'), ('язы́к', 'NOUN'), ('(', 'PNCT'),
                ('[', 'PNCT'), ('ˈruskʲɪi̯', 'UNKN'), ('jɪˈzɨk', 'UNKN'),
                (']', 'PNCT'), ('Информация', 'NOUN'), ('о', 'PREP'),
                ('файле', 'NOUN'), ('слушать', 'INFN'), (')', 'PNCT'),
                ('[', 'PNCT'), ('~', 'UNKN'), ('3', 'NUMB'), (']', 'PNCT'),
                ('[', 'PNCT'), ('⇨', 'UNKN'), (']', 'PNCT'), ('—', 'PNCT'),
                ('один', 'ADJF'), ('из', 'PREP'),
                ('восточнославянских', 'ADJF'), ('языков', 'NOUN'),
                (',', 'PNCT'), ('национальный', 'ADJF'), ('язык', 'NOUN'),
                ('русского', 'ADJF'), ('народа', 'NOUN'), ('.', 'PNCT')
            ]
            assert tokens_tagged_universal == [('Ру́сский', 'NOUN'),
                                               ('язы́к', 'NOUN'),
                                               ('(', 'PUNCT'), ('[', 'PUNCT'),
                                               ('ˈruskʲɪi̯', 'SYM/X'),
                                               ('jɪˈzɨk', 'SYM/X'),
                                               (']', 'PUNCT'),
                                               ('Информация', 'NOUN'),
                                               ('о', 'ADP'), ('файле', 'NOUN'),
                                               ('слушать', 'VERB'),
                                               (')', 'PUNCT'), ('[', 'PUNCT'),
                                               ('~', 'SYM/X'), ('3', 'NUM'),
                                               (']', 'PUNCT'), ('[', 'PUNCT'),
                                               ('⇨', 'SYM/X'), (']', 'PUNCT'),
                                               ('—', 'PUNCT'), ('один', 'ADJ'),
                                               ('из', 'ADP'),
                                               ('восточнославянских', 'ADJ'),
                                               ('языков', 'NOUN'),
                                               (',', 'PUNCT'),
                                               ('национальный', 'ADJ'),
                                               ('язык', 'NOUN'),
                                               ('русского', 'ADJ'),
                                               ('народа', 'NOUN'),
                                               ('.', 'PUNCT')]
    elif lang == 'spa':
        assert tokens_tagged == [
            ('El', 'DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('español', 'NOUN__Gender=Masc|Number=Sing'), ('o', 'CCONJ___'),
            ('castellano', 'NOUN__Gender=Masc|Number=Sing'),
            ('es',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('una', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('lengua', 'NOUN__Gender=Fem|Number=Sing'),
            ('romance', 'NOUN__Gender=Masc|Number=Sing'),
            ('procedente', 'ADJ__Number=Sing'),
            ('del', 'ADP__AdpType=Preppron|Gender=Masc|Number=Sing'),
            ('latín', 'NOUN__Gender=Masc|Number=Sing'),
            ('hablado', 'ADJ__Gender=Masc|Number=Sing|VerbForm=Part'),
            ('.', 'PUNCT__PunctType=Peri')
        ]
        assert tokens_tagged_universal == [('El', 'DET'), ('español', 'NOUN'),
                                           ('o', 'CCONJ'),
                                           ('castellano', 'NOUN'),
                                           ('es', 'AUX'), ('una', 'DET'),
                                           ('lengua', 'NOUN'),
                                           ('romance', 'NOUN'),
                                           ('procedente', 'ADJ'),
                                           ('del', 'ADP'), ('latín', 'NOUN'),
                                           ('hablado', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'tha':
        if pos_tagger == 'PyThaiNLP - Perceptron Tagger (ORCHID)':
            assert tokens_tagged == [('ภาษาไทย', 'NPRP'), ('หรือ', 'JCRG'),
                                     ('ภาษาไทย', 'NPRP'), ('กลาง', 'VATT'),
                                     ('เป็น', 'VSTA'), ('ภาษาราชการ', 'NCMN'),
                                     ('และ', 'JCRG'),
                                     ('ภาษาประจำชาติ', 'NCMN'),
                                     ('ของ', 'RPRE'), ('ประเทศ', 'NCMN'),
                                     ('ไทย', 'NPRP')]
            assert tokens_tagged_universal == [('ภาษาไทย', 'PROPN'),
                                               ('หรือ', 'CCONJ'),
                                               ('ภาษาไทย', 'PROPN'),
                                               ('กลาง', 'VERB'),
                                               ('เป็น', 'VERB'),
                                               ('ภาษาราชการ', 'NOUN'),
                                               ('และ', 'CCONJ'),
                                               ('ภาษาประจำชาติ', 'NOUN'),
                                               ('ของ', 'ADP'),
                                               ('ประเทศ', 'NOUN'),
                                               ('ไทย', 'PROPN')]
        elif pos_tagger == 'PyThaiNLP - Perceptron Tagger (PUD)':
            assert tokens_tagged == [('ภาษาไทย', 'NOUN'), ('หรือ', 'CCONJ'),
                                     ('ภาษาไทย', 'NOUN'), ('กลาง', 'NOUN'),
                                     ('เป็น', 'AUX'), ('ภาษาราชการ', 'NOUN'),
                                     ('และ', 'CCONJ'),
                                     ('ภาษาประจำชาติ', 'NOUN'), ('ของ', 'ADP'),
                                     ('ประเทศ', 'NOUN'), ('ไทย', 'PROPN')]
            assert tokens_tagged_universal == [('ภาษาไทย', 'NOUN'),
                                               ('หรือ', 'CCONJ'),
                                               ('ภาษาไทย', 'NOUN'),
                                               ('กลาง', 'NOUN'),
                                               ('เป็น', 'AUX'),
                                               ('ภาษาราชการ', 'NOUN'),
                                               ('และ', 'CCONJ'),
                                               ('ภาษาประจำชาติ', 'NOUN'),
                                               ('ของ', 'ADP'),
                                               ('ประเทศ', 'NOUN'),
                                               ('ไทย', 'PROPN')]
    elif lang == 'bod':
        assert tokens_tagged == [('བོད་', 'PROPN'), ('ཀྱི་', 'NO_POS'),
                                 ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'NO_POS'),
                                 ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'NO_POS'),
                                 ('དེ', 'DET'), ('འི་', 'PART'),
                                 ('ཉེ་འཁོར་', 'NOUN'), ('གྱི་', 'NO_POS'),
                                 ('ས་ཁུལ་', 'OTHER'), ('ཏེ', 'NO_POS'),
                                 ('།', 'PUNCT')]
        assert tokens_tagged_universal == [('བོད་', 'PROPN'), ('ཀྱི་', 'X'),
                                           ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'X'),
                                           ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'X'),
                                           ('དེ', 'DET'), ('འི་', 'PART'),
                                           ('ཉེ་འཁོར་', 'NOUN'), ('གྱི་', 'X'),
                                           ('ས་ཁུལ་', 'X'), ('ཏེ', 'X'),
                                           ('།', 'PUNCT')]
    elif lang == 'ukr':
        assert tokens_tagged == [('Украї́нська', 'ADJF'), ('мо́ва', 'ADJF'),
                                 ('(', 'PNCT'), ('МФА', 'UNKN'), (':', 'PNCT'),
                                 ('[', 'PNCT'), ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'UNKN'),
                                 ('ˈmɔwɑ̽', 'UNKN'), (']', 'PNCT'),
                                 (',', 'PNCT'), ('історичні', 'ADJF'),
                                 ('назви', 'NOUN'), ('—', 'PNCT'),
                                 ('ру́ська', 'ADJF'), (',', 'PNCT'),
                                 ('руси́нська[9][10][11', 'UNKN'),
                                 (']', 'PNCT'), ('[', 'PNCT'), ('*', 'PNCT'),
                                 ('2', 'NUMB'), (']', 'PNCT'), (')', 'PNCT'),
                                 ('—', 'PNCT'), ('національна', 'ADJF'),
                                 ('мова', 'NOUN'), ('українців', 'NOUN'),
                                 ('.', 'PNCT')]
        assert tokens_tagged_universal == [('Украї́нська', 'ADJ'),
                                           ('мо́ва', 'ADJ'), ('(', 'PUNCT'),
                                           ('МФА', 'SYM/X'), (':', 'PUNCT'),
                                           ('[', 'PUNCT'),
                                           ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'SYM/X'),
                                           ('ˈmɔwɑ̽', 'SYM/X'), (']', 'PUNCT'),
                                           (',', 'PUNCT'),
                                           ('історичні', 'ADJ'),
                                           ('назви', 'NOUN'), ('—', 'PUNCT'),
                                           ('ру́ська', 'ADJ'), (',', 'PUNCT'),
                                           ('руси́нська[9][10][11', 'SYM/X'),
                                           (']', 'PUNCT'), ('[', 'PUNCT'),
                                           ('*', 'PUNCT'), ('2', 'NUM'),
                                           (']', 'PUNCT'), (')', 'PUNCT'),
                                           ('—', 'PUNCT'),
                                           ('національна', 'ADJ'),
                                           ('мова', 'NOUN'),
                                           ('українців', 'NOUN'),
                                           ('.', 'PUNCT')]
    elif lang == 'vie':
        assert tokens_tagged == [('Tiếng', 'N'), ('Việt', 'Np'), (',', 'CH'),
                                 ('còn', 'C'), ('gọi', 'V'), ('tiếng', 'N'),
                                 ('Việt Nam', 'Np'), ('[', 'V'), ('5', 'M'),
                                 (']', 'CH'), (',', 'CH'), ('tiếng Kinh', 'N'),
                                 ('hay', 'C'), ('Việt ngữ', 'V'), (',', 'CH'),
                                 ('là', 'V'), ('ngôn ngữ', 'N'), ('của', 'E'),
                                 ('người', 'Nc'), ('Việt', 'Np'), ('(', 'CH'),
                                 ('dân tộc', 'N'), ('Kinh', 'Np'), (')', 'CH'),
                                 ('và', 'C'), ('là', 'V'), ('ngôn ngữ', 'N'),
                                 ('chính thức', 'A'), ('tại', 'E'),
                                 ('Việt Nam', 'Np'), ('.', 'CH')]
        assert tokens_tagged_universal == [
            ('Tiếng', 'NOUN'), ('Việt', 'PROPN'), (',', 'PUNCT'),
            ('còn', 'CCONJ'), ('gọi', 'VERB'), ('tiếng', 'NOUN'),
            ('Việt Nam', 'PROPN'), ('[', 'VERB'), ('5', 'NUM'), (']', 'PUNCT'),
            (',', 'PUNCT'), ('tiếng Kinh', 'NOUN'), ('hay', 'CCONJ'),
            ('Việt ngữ', 'VERB'), (',', 'PUNCT'), ('là', 'VERB'),
            ('ngôn ngữ', 'NOUN'), ('của', 'ADP'), ('người', 'NOUN'),
            ('Việt', 'PROPN'), ('(', 'PUNCT'), ('dân tộc', 'NOUN'),
            ('Kinh', 'PROPN'), (')', 'PUNCT'), ('và', 'CCONJ'), ('là', 'VERB'),
            ('ngôn ngữ', 'NOUN'), ('chính thức', 'ADJ'), ('tại', 'ADP'),
            ('Việt Nam', 'PROPN'), ('.', 'PUNCT')
        ]
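
# Minimal manual driver (illustrative sketch only; the actual suite presumably
# parametrizes this test over every supported language/POS tagger pair):
if __name__ == '__main__':
    for lang, pos_tagger in [
            ('eng', 'NLTK - Perceptron POS Tagger'),
            ('eng', 'spaCy - English POS Tagger')]:
        test_pos_tag(lang, pos_tagger, show_results=True)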
Exemple #10
    def __init__(self, main, file):
        self.main = main
        self.lang = file['lang']
        self.tokenized = file['tokenized']
        self.tagged = file['tagged']

        self.offsets_paras = []
        self.offsets_sentences = []

        self.tokens_multilevel = []
        self.tokens_flat = []
        self.tags = []

        re_tags = wl_matching.get_re_tags(main)

        if re.search(r'\.txt', file['path'], flags=re.IGNORECASE):
            with open(file['path'], 'r', encoding=file['encoding']) as f:
                # Untokenized & Untagged
                if self.tokenized == 'No' and self.tagged == 'No':
                    for line in f:
                        text = line.rstrip()

                        if text:
                            tokens = wl_word_tokenization.wl_word_tokenize(
                                main, text, lang=self.lang)

                            self.tokens_multilevel.append(tokens)
                            self.tags.extend(
                                [[]] * len(list(wl_misc.flatten_list(tokens))))
                # Untokenized & Tagged
                elif self.tokenized == 'No' and self.tagged == 'Yes':
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            tokens = wl_word_tokenization.wl_word_tokenize(
                                main, text_no_tags, lang=self.lang)

                            self.tokens_multilevel.append(tokens)

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags, text):
                                self.tokens_multilevel[0][0].insert(0, '')
                                self.tags.append([])

                            # Extract tags
                            for tag in re.findall(re_tags, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
                # Tokenized & Untagged
                elif self.tokenized == 'Yes' and self.tagged == 'No':
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_multilevel.append([])

                            for sentence in wl_sentence_tokenization.wl_sentence_split(
                                    main, text):
                                self.tokens_multilevel[-1].append(
                                    sentence.split())
                # Tokenized & Tagged
                elif self.tokenized == 'Yes' and self.tagged == 'Yes':
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_multilevel.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wl_sentence_tokenization.wl_sentence_split(
                                    main, text_no_tags):
                                self.tokens_multilevel[-1].append(
                                    sentence.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags, text):
                                self.tokens_multilevel[0][0].insert(0, '')

                                self.tags.append([])

                            # Extract tags
                            for tag in re.findall(re_tags, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
        elif re.search(r'\.xml', file['path'], flags=re.IGNORECASE):
            text = ''

            with open(file['path'], 'r', encoding=file['encoding']) as f:
                for line in f:
                    text += line

            soup = bs4.BeautifulSoup(text, features='lxml-xml')

            tags_para = []
            tags_sentence = []
            tags_word = []

            for _, level, opening_tag, _ in self.main.settings_custom['tags'][
                    'tags_xml']:
                if level == 'Paragraph':
                    tags_para.append(opening_tag[1:-1])
                elif level == 'Sentence':
                    tags_sentence.append(opening_tag[1:-1])
                elif level == 'Word':
                    tags_word.append(opening_tag[1:-1])

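            # Walk the XML hierarchy: paragraph tags, then sentence tags, then word tags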
            for para in soup.select(','.join(tags_para)):
                self.tokens_multilevel.append([])

                for sentence in para.select(','.join(tags_sentence)):
                    self.tokens_multilevel[-1].append([])

                    for word in sentence.select(','.join(tags_word)):
                        self.tokens_multilevel[-1][-1].append(word.get_text())

                        self.tags.append([])

        # Paragraph and sentence offsets
        for para in self.tokens_multilevel:
            self.offsets_paras.append(len(self.tokens_flat))

            for sentence in para:
                self.offsets_sentences.append(len(self.tokens_flat))

                self.tokens_flat.extend(sentence)

        # Remove whitespace around all tags
        self.tags = [[tag.strip() for tag in tags] for tags in self.tags]
Example #11
def test_pos_tag(lang, pos_tagger, show_results=False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text=getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)
    tokens = list(wl_misc.flatten_list(tokens))

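    # Tag the tokens with the tagger's native tagset and again with the Universal POS tagset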
    tokens_tagged = wl_pos_tagging.wl_pos_tag(main,
                                              tokens=tokens,
                                              lang=lang,
                                              pos_tagger=pos_tagger)
    tokens_tagged_universal = wl_pos_tagging.wl_pos_tag(main,
                                                        tokens=tokens,
                                                        lang=lang,
                                                        pos_tagger=pos_tagger,
                                                        tagset='universal')

    if show_results:
        print(f'{lang_text} / {pos_tagger}:')
        print(tokens_tagged)
        print(tokens_tagged_universal)

    if lang == 'zho_cn':
        assert tokens_tagged == [('汉语', 'nz'), (',', 'x'), ('又', 'd'),
                                 ('称', 'v'), ('汉文', 'nz'), ('、', 'x'),
                                 ('中文', 'nz'), ('、', 'x'), ('中国', 'ns'),
                                 ('话', 'n'), ('、', 'x'), ('中国', 'ns'),
                                 ('语', 'ng'), ('、', 'x'), ('华语', 'nz'),
                                 ('、', 'x'), ('华文', 'nz'), ('、', 'x'),
                                 ('唐', 'nr'), ('话', 'n'), ('[', 'x'),
                                 ('2', 'x'), (']', 'x'), (',', 'x'),
                                 ('或', 'c'), ('被', 'p'), ('视为', 'v'),
                                 ('一个', 'm'), ('语族', 'n'), (',', 'x'),
                                 ('或', 'c'), ('被', 'p'), ('视为', 'v'),
                                 ('隶属于', 'n'), ('汉藏', 'ns'), ('语系', 'n'),
                                 ('汉语', 'nz'), ('族', 'ng'), ('之一', 'r'),
                                 ('种', 'm'), ('语言', 'n'), ('。', 'x')]
        assert tokens_tagged_universal == [('汉语', 'PROPN'), (',', 'PUNCT/SYM'),
                                           ('又', 'ADV'), ('称', 'VERB'),
                                           ('汉文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中国', 'PROPN'), ('话', 'NOUN'),
                                           ('、', 'PUNCT/SYM'), ('中国', 'PROPN'),
                                           ('语', 'NOUN'), ('、', 'PUNCT/SYM'),
                                           ('华语', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('华文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('唐', 'PRONP'), ('话', 'NOUN'),
                                           ('[', 'PUNCT/SYM'),
                                           ('2', 'PUNCT/SYM'),
                                           (']', 'PUNCT/SYM'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('视为', 'VERB'),
                                           ('一个', 'NUM'), ('语族', 'NOUN'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('视为', 'VERB'),
                                           ('隶属于', 'NOUN'), ('汉藏', 'PROPN'),
                                           ('语系', 'NOUN'), ('汉语', 'PROPN'),
                                           ('族', 'NOUN'), ('之一', 'PRON'),
                                           ('种', 'NUM'), ('语言', 'NOUN'),
                                           ('。', 'PUNCT/SYM')]
    elif lang == 'zho_tw':
        assert tokens_tagged == [
            ('漢語', 'nz'), (',', 'x'), ('又', 'd'), ('稱', 'zg'), ('漢文', 'nz'),
            ('、', 'x'), ('中文', 'nz'), ('、', 'x'), ('中', 'f'), ('國話', 'n'),
            ('、', 'x'), ('中國', 'ns'), ('語', 'n'), ('、', 'x'), ('華語', 'nz'),
            ('、', 'x'), ('華文', 'nz'), ('、', 'x'), ('唐', 'nr'), ('話', 'x'),
            ('[', 'x'), ('2', 'x'), (']', 'x'), (',', 'x'), ('或', 'c'),
            ('被', 'p'), ('視為', 'v'), ('一', 'm'), ('個', 'zg'), ('語族', 'n'),
            (',', 'x'), ('或', 'c'), ('被', 'p'), ('視', 'x'), ('為', 'p'),
            ('隸', 'j'), ('屬', 'v'), ('於', 'nr'), ('漢', 'j'), ('藏', 'j'),
            ('語系', 'n'), ('漢語', 'nz'), ('族', 'ng'), ('之一', 'r'), ('種', 'x'),
            ('語言', 'n'), ('。', 'x')
        ]
        assert tokens_tagged_universal == [('漢語', 'PROPN'), (',', 'PUNCT/SYM'),
                                           ('又', 'ADV'), ('稱', 'PART'),
                                           ('漢文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中', 'ADP'), ('國話', 'NOUN'),
                                           ('、', 'PUNCT/SYM'), ('中國', 'PROPN'),
                                           ('語', 'NOUN'), ('、', 'PUNCT/SYM'),
                                           ('華語', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('華文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('唐', 'PRONP'), ('話', 'PUNCT/SYM'),
                                           ('[', 'PUNCT/SYM'),
                                           ('2', 'PUNCT/SYM'),
                                           (']', 'PUNCT/SYM'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('視為', 'VERB'),
                                           ('一', 'NUM'), ('個', 'PART'),
                                           ('語族', 'NOUN'), (',', 'PUNCT/SYM'),
                                           ('或', 'CONJ'), ('被', 'ADP'),
                                           ('視', 'PUNCT/SYM'), ('為', 'ADP'),
                                           ('隸', 'X'), ('屬', 'VERB'),
                                           ('於', 'PRONP'), ('漢', 'X'),
                                           ('藏', 'X'), ('語系', 'NOUN'),
                                           ('漢語', 'PROPN'), ('族', 'NOUN'),
                                           ('之一', 'PRON'), ('種', 'PUNCT/SYM'),
                                           ('語言', 'NOUN'), ('。', 'PUNCT/SYM')]
    elif lang == 'dan':
        assert tokens_tagged == [
            ('Dansk', 'ADJ__Definite=Ind|Degree=Pos|Number=Sing'),
            ('er', 'AUX__Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act'),
            ('et', 'DET__Gender=Neut|Number=Sing|PronType=Ind'),
            ('nordgermansk', 'ADJ__Definite=Ind|Degree=Pos|Number=Sing'),
            ('sprog', 'NOUN__Definite=Ind|Gender=Neut|Number=Sing'),
            ('af', 'ADP__AdpType=Prep'),
            ('den', 'DET__Gender=Com|Number=Sing|PronType=Dem'),
            ('østnordiske', 'ADJ__Definite=Def|Degree=Pos|Number=Sing'),
            ('(', 'PUNCT'), ('kontinentale', 'ADJ__Degree=Pos|Number=Plur'),
            (')', 'PUNCT'),
            ('gruppe', 'NOUN__Definite=Ind|Gender=Com|Number=Sing'),
            (',', 'PUNCT'), ('der', 'PRON__PartType=Inf'),
            ('tales', 'VERB__Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Pass'),
            ('af', 'ADP__AdpType=Prep'), ('ca.', 'ADV'),
            ('seks', 'NUM__NumType=Card'),
            ('millioner', 'NOUN__Definite=Ind|Gender=Com|Number=Plur'),
            ('mennesker', 'NOUN__Definite=Ind|Gender=Neut|Number=Plur'),
            ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [('Dansk', 'ADJ'), ('er', 'AUX'),
                                           ('et', 'DET'),
                                           ('nordgermansk', 'ADJ'),
                                           ('sprog', 'NOUN'), ('af', 'ADP'),
                                           ('den', 'DET'),
                                           ('østnordiske', 'ADJ'),
                                           ('(', 'PUNCT'),
                                           ('kontinentale', 'ADJ'),
                                           (')', 'PUNCT'), ('gruppe', 'NOUN'),
                                           (',', 'PUNCT'), ('der', 'PRON'),
                                           ('tales', 'VERB'), ('af', 'ADP'),
                                           ('ca.', 'ADV'), ('seks', 'NUM'),
                                           ('millioner', 'NOUN'),
                                           ('mennesker', 'NOUN'),
                                           ('.', 'PUNCT')]
    elif lang == 'nld':
        assert tokens_tagged == [
            ('Het', 'LID|bep|stan|evon__Definite=Def'),
            ('Nederlands',
             'N|eigen|ev|basis|onz|stan__Gender=Neut|Number=Sing'),
            ('is', 'WW|pv|tgw|ev__Number=Sing|Tense=Pres|VerbForm=Fin'),
            ('een', 'LID|onbep|stan|agr__Definite=Ind'),
            ('West-Germaanse', 'ADJ|prenom|basis|met-e|stan__Degree=Pos'),
            ('taal', 'N|soort|ev|basis|zijd|stan__Gender=Com|Number=Sing'),
            ('en', 'VG|neven'), ('de', 'LID|bep|stan|rest__Definite=Def'),
            ('moedertaal',
             'N|soort|ev|basis|zijd|stan__Gender=Com|Number=Sing'),
            ('van', 'VZ|init'), ('de', 'LID|bep|stan|rest__Definite=Def'),
            ('meeste', 'VNW|onbep|grad|stan|prenom|met-e|agr|sup'),
            ('inwoners', 'N|soort|mv|basis__Number=Plur'), ('van', 'VZ|init'),
            ('Nederland',
             'N|eigen|ev|basis|onz|stan__Gender=Neut|Number=Sing'),
            (',', 'LET'),
            ('België', 'N|eigen|ev|basis|onz|stan__Gender=Neut|Number=Sing'),
            ('en', 'VG|neven'),
            ('Suriname', 'N|eigen|ev|basis|onz|stan__Gender=Neut|Number=Sing'),
            ('.', 'LET')
        ]
        assert tokens_tagged_universal == [
            ('Het', 'DET'), ('Nederlands', 'PROPN'), ('is', 'VERB'),
            ('een', 'DET'), ('West-Germaanse', 'ADJ'), ('taal', 'NOUN'),
            ('en', 'CCONJ'), ('de', 'DET'), ('moedertaal', 'NOUN'),
            ('van', 'ADP'), ('de', 'DET'), ('meeste', 'ADV'),
            ('inwoners', 'NOUN'), ('van', 'ADP'), ('Nederland', 'PROPN'),
            (',', 'SYM'), ('België', 'PROPN'), ('en', 'CCONJ'),
            ('Suriname', 'PROPN'), ('.', 'SYM')
        ]
    elif lang == 'eng':
        if pos_tagger == 'NLTK - Perceptron POS Tagger':
            assert tokens_tagged == [('English', 'NNP'), ('is', 'VBZ'),
                                     ('a', 'DT'), ('West', 'NNP'),
                                     ('Germanic', 'NNP'), ('language', 'NN'),
                                     ('that', 'WDT'), ('was', 'VBD'),
                                     ('first', 'RB'), ('spoken', 'VBN'),
                                     ('in', 'IN'), ('early', 'JJ'),
                                     ('medieval', 'NN'), ('England', 'NNP'),
                                     ('and', 'CC'), ('eventually', 'RB'),
                                     ('became', 'VBD'), ('a', 'DT'),
                                     ('global', 'JJ'), ('lingua', 'NN'),
                                     ('franca.[4][5', 'NN'), (']', 'NN')]
            assert tokens_tagged_universal == [('English', 'PROPN'),
                                               ('is', 'VERB'), ('a', 'DET'),
                                               ('West', 'PROPN'),
                                               ('Germanic', 'PROPN'),
                                               ('language', 'NOUN'),
                                               ('that', 'DET'), ('was',
                                                                 'VERB'),
                                               ('first', 'ADV'),
                                               ('spoken', 'VERB'),
                                               ('in', 'ADP/SCONJ'),
                                               ('early', 'ADJ'),
                                               ('medieval', 'NOUN'),
                                               ('England', 'PROPN'),
                                               ('and', 'CCONJ'),
                                               ('eventually', 'ADV'),
                                               ('became', 'VERB'),
                                               ('a', 'DET'), ('global', 'ADJ'),
                                               ('lingua', 'NOUN'),
                                               ('franca.[4][5', 'NOUN'),
                                               (']', 'NOUN')]
        elif pos_tagger == 'spaCy - English POS Tagger':
            assert tokens_tagged == [('English', 'NNP'), ('is', 'VBZ'),
                                     ('a', 'DT'), ('West', 'NNP'),
                                     ('Germanic', 'NNP'), ('language', 'NN'),
                                     ('that', 'WDT'), ('was', 'VBD'),
                                     ('first', 'RB'), ('spoken', 'VBN'),
                                     ('in', 'IN'), ('early', 'JJ'),
                                     ('medieval', 'JJ'), ('England', 'NNP'),
                                     ('and', 'CC'), ('eventually', 'RB'),
                                     ('became', 'VBD'), ('a', 'DT'),
                                     ('global', 'JJ'), ('lingua', 'NN'),
                                     ('franca.[4][5', 'NNP'), (']', '-RRB-')]
            assert tokens_tagged_universal == [('English', 'PROPN'),
                                               ('is', 'AUX'), ('a', 'DET'),
                                               ('West', 'PROPN'),
                                               ('Germanic', 'PROPN'),
                                               ('language', 'NOUN'),
                                               ('that', 'DET'), ('was', 'AUX'),
                                               ('first', 'ADV'),
                                               ('spoken', 'VERB'),
                                               ('in', 'ADP'), ('early', 'ADJ'),
                                               ('medieval', 'ADJ'),
                                               ('England', 'PROPN'),
                                               ('and', 'CCONJ'),
                                               ('eventually', 'ADV'),
                                               ('became', 'VERB'),
                                               ('a', 'DET'), ('global', 'ADJ'),
                                               ('lingua', 'NOUN'),
                                               ('franca.[4][5', 'PROPN'),
                                               (']', 'PUNCT')]
    elif lang == 'fra':
        assert tokens_tagged == [
            ('Le', 'DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('français', 'ADJ__Gender=Masc'),
            ('est',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('une', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('langue', 'NOUN__Gender=Fem|Number=Sing'),
            ('indo-européenne', 'ADJ__Gender=Fem|Number=Sing'), ('de', 'ADP'),
            ('la', 'DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('famille', 'NOUN__Gender=Fem|Number=Sing'),
            ('des', 'ADP_DET__Definite=Def|Number=Plur|PronType=Art'),
            ('langues', 'NOUN__Gender=Fem|Number=Plur'),
            ('romanes', 'ADJ__Gender=Fem|Number=Plur'), ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [('Le', 'DET'), ('français', 'ADJ'),
                                           ('est', 'AUX'), ('une', 'DET'),
                                           ('langue', 'NOUN'),
                                           ('indo-européenne', 'ADJ'),
                                           ('de', 'ADP'), ('la', 'DET'),
                                           ('famille', 'NOUN'), ('des', 'ADP'),
                                           ('langues', 'NOUN'),
                                           ('romanes', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'deu':
        assert tokens_tagged == [('Die', 'ART'), ('deutsche', 'ADJA'),
                                 ('Sprache', 'NN'), ('bzw.', 'ADJA'),
                                 ('Deutsch', 'NN'), ('(', '$('), ('[', 'NE'),
                                 ('dɔʏ̯t͡ʃ', 'NE'), (']', 'NE'), (';', '$.'),
                                 ('abgekürzt', 'VVFIN'), ('dt', 'NE'),
                                 ('.', '$.'), ('oder', 'KON'),
                                 ('dtsch', 'ADJD'), ('.', '$.'), (')', '$('),
                                 ('ist', 'VAFIN'), ('eine', 'ART'),
                                 ('westgermanische', 'ADJA'),
                                 ('Sprache', 'NN'), ('.', '$.')]
        assert tokens_tagged_universal == [('Die', 'DET'), ('deutsche', 'ADJ'),
                                           ('Sprache', 'NOUN'), ('bzw.',
                                                                 'ADJ'),
                                           ('Deutsch', 'NOUN'), ('(', 'PUNCT'),
                                           ('[', 'PROPN'),
                                           ('dɔʏ̯t͡ʃ', 'PROPN'),
                                           (']', 'PROPN'), (';', 'PUNCT'),
                                           ('abgekürzt', 'VERB'),
                                           ('dt', 'PROPN'), ('.', 'PUNCT'),
                                           ('oder', 'CCONJ'), ('dtsch', 'ADJ'),
                                           ('.', 'PUNCT'), (')', 'PUNCT'),
                                           ('ist', 'AUX'), ('eine', 'DET'),
                                           ('westgermanische', 'ADJ'),
                                           ('Sprache', 'NOUN'), ('.', 'PUNCT')]
    elif lang == 'ell':
        assert tokens_tagged == [
            ('Η',
             'DET__Case=Nom|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('ελληνική', 'ADJ__Case=Nom|Gender=Fem|Number=Sing'),
            ('γλώσσα', 'NOUN__Case=Nom|Gender=Fem|Number=Sing'),
            ('ανήκει',
             'VERB__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act'
             ), ('στην', 'AsPpSp_AtDf__Case=Acc|Gender=Fem|Number=Sing'),
            ('ινδοευρωπαϊκή', 'ADJ__Case=Acc|Gender=Fem|Number=Sing'),
            ('οικογένεια[9', 'NOUN__Case=Acc|Gender=Fem|Number=Sing'),
            (']',
             'VERB__Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Tense=Past|VerbForm=Fin|Voice=Pass'
             ), ('και', 'CCONJ'), ('συγκεκριμένα', 'ADV'),
            ('στον', 'AsPpSp_AtDf__Case=Acc|Gender=Masc|Number=Sing'),
            ('ελληνικό', 'ADJ__Case=Acc|Gender=Masc|Number=Sing'),
            ('κλάδο', 'NOUN__Case=Acc|Gender=Masc|Number=Sing'),
            (',', 'PUNCT'), ('μαζί', 'ADV'), ('με', 'ADP'),
            ('την',
             'DET__Case=Acc|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('τσακωνική', 'NOUN__Case=Acc|Gender=Fem|Number=Sing'),
            (',', 'PUNCT'), ('ενώ', 'SCONJ'),
            ('είναι',
             'AUX__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Pass'
             ),
            ('η',
             'DET__Case=Nom|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('επίσημη', 'ADJ__Case=Nom|Gender=Fem|Number=Sing'),
            ('γλώσσα', 'NOUN__Case=Nom|Gender=Fem|Number=Sing'),
            ('της',
             'DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('Ελλάδος', 'PROPN__Case=Gen|Gender=Fem|Number=Sing'),
            ('και', 'CCONJ'),
            ('της',
             'DET__Case=Gen|Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('Κύπρου', 'PROPN__Case=Gen|Gender=Fem|Number=Sing'),
            ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [('Η', 'DET'), ('ελληνική', 'ADJ'),
                                           ('γλώσσα', 'NOUN'),
                                           ('ανήκει', 'VERB'), ('στην', 'ADP'),
                                           ('ινδοευρωπαϊκή', 'ADJ'),
                                           ('οικογένεια[9', 'NOUN'),
                                           (']', 'VERB'), ('και', 'CCONJ'),
                                           ('συγκεκριμένα', 'ADV'),
                                           ('στον', 'ADP'),
                                           ('ελληνικό', 'ADJ'),
                                           ('κλάδο', 'NOUN'), (',', 'PUNCT'),
                                           ('μαζί', 'ADV'), ('με', 'ADP'),
                                           ('την', 'DET'), ('τσακωνική',
                                                            'NOUN'),
                                           (',', 'PUNCT'), ('ενώ', 'SCONJ'),
                                           ('είναι', 'AUX'), ('η', 'DET'),
                                           ('επίσημη', 'ADJ'),
                                           ('γλώσσα', 'NOUN'), ('της', 'DET'),
                                           ('Ελλάδος', 'PROPN'),
                                           ('και', 'CCONJ'), ('της', 'DET'),
                                           ('Κύπρου', 'PROPN'), ('.', 'PUNCT')]
    elif lang == 'ita':
        assert tokens_tagged == [
            ("L'", 'RD__Definite=Def|Number=Sing|PronType=Art'),
            ('italiano', 'S__Gender=Masc|Number=Sing'), ('(', 'FB'),
            ('[', 'FB'), ('itaˈljaːno][Nota', 'S__Gender=Masc|Number=Sing'),
            ('1', 'N__NumType=Card'), (']', 'FB'), ('ascolta[?·info', 'S'),
            (']', 'FB'), (')', 'FB'),
            ('è', 'V__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('una', 'RI__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('lingua', 'S__Gender=Fem|Number=Sing'),
            ('romanza', 'S__Gender=Fem|Number=Sing'),
            ('parlata', 'A__Gender=Fem|Number=Sing'), ('principalmente', 'B'),
            ('in', 'E'), ('Italia', 'SP'), ('.', 'FS')
        ]
        assert tokens_tagged_universal == [("L'", 'DET'), ('italiano', 'NOUN'),
                                           ('(', 'PUNCT'), ('[', 'PUNCT'),
                                           ('itaˈljaːno][Nota', 'NOUN'),
                                           ('1', 'NUM'), (']', 'PUNCT'),
                                           ('ascolta[?·info', 'NOUN'),
                                           (']', 'PUNCT'), (')', 'PUNCT'),
                                           ('è', 'VERB'), ('una', 'DET'),
                                           ('lingua', 'NOUN'),
                                           ('romanza', 'NOUN'),
                                           ('parlata', 'ADJ'),
                                           ('principalmente', 'ADV'),
                                           ('in', 'ADP'), ('Italia', 'PROPN'),
                                           ('.', 'PUNCT')]
    elif lang == 'jpn':
        assert tokens_tagged == [('日本', '名詞'), ('語', '名詞'), ('(', '補助記号'),
                                 ('にほんご', '名詞'), ('、', '補助記号'), ('にっぽん', '名詞'),
                                 ('ご', '接尾辞'), ('[', '補助記号'), ('注', '名詞'),
                                 ('1', '名詞'), (']', '補助記号'), (')', '補助記号'),
                                 ('は', '助詞'), ('、', '補助記号'), ('主に', '副詞'),
                                 ('日本', '名詞'), ('国', '接尾辞'), ('内', '接尾辞'),
                                 ('や', '助詞'), ('日本', '名詞'), ('人', '接尾辞'),
                                 ('同士', '接尾辞'), ('の', '助詞'), ('間', '名詞'),
                                 ('で', '助詞'), ('使用', '名詞'), ('さ', '動詞'),
                                 ('れ', '助動詞'), ('て', '助詞'), ('いる', '動詞'),
                                 ('言語', '名詞'), ('で', '助動詞'), ('ある', '動詞'),
                                 ('。', '補助記号')]
        assert tokens_tagged_universal == [('日本', 'NOUN'), ('語', 'NOUN'),
                                           ('(', 'PUNCT/SYM'), ('にほんご',
                                                                'NOUN'),
                                           ('、', 'PUNCT/SYM'),
                                           ('にっぽん', 'NOUN'), ('ご', 'PART'),
                                           ('[', 'PUNCT/SYM'), ('注', 'NOUN'),
                                           ('1', 'NOUN'), (']', 'PUNCT/SYM'),
                                           (')', 'PUNCT/SYM'), ('は', 'PART'),
                                           ('、', 'PUNCT/SYM'), ('主に', 'ADV'),
                                           ('日本', 'NOUN'), ('国', 'PART'),
                                           ('内', 'PART'), ('や', 'PART'),
                                           ('日本', 'NOUN'), ('人', 'PART'),
                                           ('同士', 'PART'), ('の', 'PART'),
                                           ('間', 'NOUN'), ('で', 'PART'),
                                           ('使用', 'NOUN'), ('さ', 'VERB'),
                                           ('れ', 'AUX'), ('て', 'PART'),
                                           ('いる', 'VERB'), ('言語', 'NOUN'),
                                           ('で', 'AUX'), ('ある', 'VERB'),
                                           ('。', 'PUNCT/SYM')]
    elif lang == 'lit':
        assert tokens_tagged == [
            ('Lietuvių', 'dkt.vyr.dgs.K.__Case=Gen|Gender=Masc|Number=Plur'),
            ('kalba', 'dkt.mot.vns.Įn.__Case=Ins|Gender=Fem|Number=Sing'),
            ('–', 'skyr.'), ('iš', 'prl.K.__AdpType=Prep|Case=Gen'),
            ('baltų',
             'bdv.nelygin.mot.vns.K.__Case=Gen|Definite=Ind|Degree=Pos|Gender=Fem|Number=Sing'
             ),
            ('prokalbės', 'dkt.mot.vns.K.__Case=Gen|Gender=Fem|Number=Sing'),
            ('kilusi',
             'bdv.aukšč.vyr.dgs.V.__Case=Nom|Definite=Ind|Degree=Sup|Gender=Masc|Number=Plur'
             ),
            ('lietuvių', 'dkt.vyr.dgs.K.__Case=Gen|Gender=Masc|Number=Plur'),
            ('tautos', 'dkt.mot.vns.K.__Case=Gen|Gender=Fem|Number=Sing'),
            ('kalba', 'dkt.mot.vns.Įn.__Case=Ins|Gender=Fem|Number=Sing'),
            (',', 'skyr.'),
            ('kuri',
             'įv.mot.vns.V.__Case=Nom|Definite=Ind|Gender=Fem|Number=Sing|PronType=Int'
             ),
            ('Lietuvoje',
             'dkt.tikr.mot.vns.Vt.__Case=Loc|Gender=Fem|Number=Sing'),
            ('yra',
             'vksm.asm.tiesiog.es.vns.3.__Mood=Ind|Number=Sing|Person=3|Polarity=Pos|Tense=Pres|VerbForm=Fin'
             ),
            ('valstybinė',
             'bdv.nelygin.mot.vns.V.__Case=Nom|Definite=Ind|Degree=Pos|Gender=Fem|Number=Sing'
             ), (',', 'skyr.'), ('o', 'jng.'),
            ('Europos',
             'dkt.tikr.mot.vns.K.__Case=Gen|Gender=Fem|Number=Sing'),
            ('Sąjungoje', 'dkt.mot.vns.Vt.__Case=Loc|Gender=Fem|Number=Sing'),
            ('–', 'skyr.'),
            ('viena',
             'įv.mot.vns.V.__Case=Nom|Definite=Ind|Gender=Fem|Number=Sing|PronType=Ind'
             ), ('iš', 'prl.K.__AdpType=Prep|Case=Gen'),
            ('oficialiųjų',
             'bdv.nelygin.įvardž.vyr.dgs.K.__Case=Gen|Definite=Def|Degree=Pos|Gender=Masc|Number=Plur'
             ), ('kalbų', 'dkt.vyr.dgs.V.__Case=Nom|Gender=Masc|Number=Plur'),
            ('.', 'skyr.')
        ]
        assert tokens_tagged_universal == [
            ('Lietuvių', 'NOUN'), ('kalba', 'NOUN'), ('–', 'PUNCT'),
            ('iš', 'ADP'), ('baltų', 'ADJ'), ('prokalbės', 'NOUN'),
            ('kilusi', 'ADJ'), ('lietuvių', 'NOUN'), ('tautos', 'NOUN'),
            ('kalba', 'NOUN'), (',', 'PUNCT'), ('kuri', 'DET'),
            ('Lietuvoje', 'PROPN'), ('yra', 'AUX'), ('valstybinė', 'ADJ'),
            (',', 'PUNCT'), ('o', 'CCONJ'), ('Europos', 'PROPN'),
            ('Sąjungoje', 'NOUN'), ('–', 'PUNCT'), ('viena', 'PRON'),
            ('iš', 'ADP'), ('oficialiųjų', 'ADJ'), ('kalbų', 'NOUN'),
            ('.', 'PUNCT')
        ]
    elif lang == 'nob':
        assert tokens_tagged == [
            ('Bokmål', 'PROPN'),
            ('er', 'AUX__Mood=Ind|Tense=Pres|VerbForm=Fin'),
            ('en', 'DET__Gender=Masc|Number=Sing|PronType=Art'),
            ('varietet', 'NOUN__Definite=Ind|Gender=Masc|Number=Sing'),
            ('av', 'ADP'),
            ('norsk', 'ADJ__Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing'),
            ('språk', 'NOUN__Definite=Ind|Gender=Neut|Number=Sing'),
            ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [('Bokmål', 'PROPN'), ('er', 'AUX'),
                                           ('en', 'DET'), ('varietet', 'NOUN'),
                                           ('av', 'ADP'), ('norsk', 'ADJ'),
                                           ('språk', 'NOUN'), ('.', 'PUNCT')]
    elif lang == 'pol':
        assert tokens_tagged == [('Język', 'SUBST'), ('polski', 'ADJ'),
                                 (',', 'INTERP'), ('polszczyzna', 'SUBST'),
                                 (',', 'INTERP'), ('skrót', 'SUBST'),
                                 (':', 'INTERP'), ('pol', 'BREV'),
                                 ('.', 'INTERP'), ('–', 'INTERP'),
                                 ('język', 'SUBST'), ('naturalny', 'ADJ'),
                                 ('należący', 'PACT'), ('do', 'PREP'),
                                 ('grupy', 'SUBST'), ('języków', 'SUBST'),
                                 ('zachodniosłowiańskich', 'ADJ'),
                                 ('(', 'INTERP'), ('do', 'PREP'),
                                 ('której', 'ADJ'), ('należą', 'FIN'),
                                 ('również', 'QUB'), ('czeski', 'ADJ'),
                                 (',', 'INTERP'), ('słowacki', 'ADJ'),
                                 (',', 'INTERP'), ('kaszubski', 'ADJ'),
                                 (',', 'INTERP'), ('dolnołużycki', 'ADJ'),
                                 (',', 'INTERP'), ('górnołużycki', 'SUBST'),
                                 ('i', 'CONJ'), ('wymarły', 'SUBST'),
                                 ('połabski', 'ADJ'), (')', 'INTERP'),
                                 (',', 'INTERP'), ('stanowiącej', 'PACT'),
                                 ('część', 'SUBST'), ('rodziny', 'SUBST'),
                                 ('języków', 'SUBST'),
                                 ('indoeuropejskich', 'ADJ'), ('.', 'INTERP')]
        assert tokens_tagged_universal == [
            ('Język', 'NOUN'), ('polski', 'ADJ'), (',', 'PUNCT'),
            ('polszczyzna', 'NOUN'), (',', 'PUNCT'), ('skrót', 'NOUN'),
            (':', 'PUNCT'), ('pol', 'X'), ('.', 'PUNCT'), ('–', 'PUNCT'),
            ('język', 'NOUN'), ('naturalny', 'ADJ'), ('należący', 'VERB'),
            ('do', 'ADP'), ('grupy', 'NOUN'), ('języków', 'NOUN'),
            ('zachodniosłowiańskich', 'ADJ'), ('(', 'PUNCT'), ('do', 'ADP'),
            ('której', 'ADJ'), ('należą', 'VERB'), ('również', 'PART'),
            ('czeski', 'ADJ'), (',', 'PUNCT'), ('słowacki', 'ADJ'),
            (',', 'PUNCT'), ('kaszubski', 'ADJ'), (',', 'PUNCT'),
            ('dolnołużycki', 'ADJ'), (',', 'PUNCT'), ('górnołużycki', 'NOUN'),
            ('i', 'CCONJ'), ('wymarły', 'NOUN'), ('połabski', 'ADJ'),
            (')', 'PUNCT'), (',', 'PUNCT'), ('stanowiącej', 'VERB'),
            ('część', 'NOUN'), ('rodziny', 'NOUN'), ('języków', 'NOUN'),
            ('indoeuropejskich', 'ADJ'), ('.', 'PUNCT')
        ]
    elif lang == 'por':
        assert tokens_tagged == [
            ('A', 'DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('língua', 'NOUN__Gender=Fem|Number=Sing'),
            ('portuguesa', 'ADJ__Gender=Fem|Number=Sing'), (',', 'PUNCT'),
            ('também', 'ADV'),
            ('designada', 'VERB__Gender=Fem|Number=Sing|VerbForm=Part'),
            ('português', 'NOUN__Gender=Masc|Number=Sing'), (',', 'PUNCT'),
            ('é',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('uma', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('língua', 'NOUN__Gender=Fem|Number=Sing'),
            ('românica', 'ADJ__Gender=Fem|Number=Sing'),
            ('flexiva', 'ADJ__Gender=Fem|Number=Sing'),
            ('ocidental', 'ADJ__Gender=Fem|Number=Sing'),
            ('originada', 'VERB__Gender=Fem|Number=Sing|VerbForm=Part'),
            ('no',
             'ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('galego-português', 'NOUN__Gender=Masc|Number=Sing'),
            ('falado', 'VERB__Gender=Masc|Number=Sing|VerbForm=Part'),
            ('no',
             'ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('Reino', 'PROPN__Gender=Masc|Number=Sing'),
            ('da',
             'ADP_DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('Galiza', 'PROPN__Number=Sing'), ('e', 'CCONJ'),
            ('no',
             'ADP_DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('norte', 'NOUN__Gender=Masc|Number=Sing'), ('de', 'ADP'),
            ('Portugal', 'PROPN__Gender=Masc|Number=Sing'), ('.', 'PUNCT')
        ]
        assert tokens_tagged_universal == [
            ('A', 'DET'), ('língua', 'NOUN'), ('portuguesa', 'ADJ'),
            (',', 'PUNCT'), ('também', 'ADV'), ('designada', 'VERB'),
            ('português', 'NOUN'), (',', 'PUNCT'), ('é', 'AUX'),
            ('uma', 'DET'), ('língua', 'NOUN'), ('românica', 'ADJ'),
            ('flexiva', 'ADJ'), ('ocidental', 'ADJ'), ('originada', 'VERB'),
            ('no', 'DET'), ('galego-português', 'NOUN'), ('falado', 'VERB'),
            ('no', 'DET'), ('Reino', 'PROPN'), ('da', 'DET'),
            ('Galiza', 'PROPN'), ('e', 'CCONJ'), ('no', 'DET'),
            ('norte', 'NOUN'), ('de', 'ADP'), ('Portugal', 'PROPN'),
            ('.', 'PUNCT')
        ]
    elif lang == 'ron':
        assert tokens_tagged == [('Limba', 'Ncfsry'), ('română', 'Afpfsrn'),
                                 ('este', 'Vmip3s'), ('o', 'Tifsr'),
                                 ('limbă', 'Ncfsrn'),
                                 ('indo-europeană', 'Afpfsrn'), (',', 'COMMA'),
                                 ('din', 'Spsa'), ('grupul', 'Ncmsry'),
                                 ('italic', 'Afpms-n'), ('și', 'Crssp'),
                                 ('din', 'Spsa'), ('subgrupul', 'Ncmsry'),
                                 ('oriental', 'Afpms-n'), ('al', 'Tsms'),
                                 ('limbilor', 'Ncfpoy'),
                                 ('romanice', 'Afpfp-n'), ('.', 'PERIOD')]
        assert tokens_tagged_universal == [('Limba', 'NOUN'),
                                           ('română', 'ADJ'), ('este', 'AUX'),
                                           ('o', 'DET'), ('limbă', 'NOUN'),
                                           ('indo-europeană', 'ADJ'),
                                           (',', 'PUNCT'), ('din', 'ADP'),
                                           ('grupul', 'NOUN'), ('italic',
                                                                'ADJ'),
                                           ('și', 'CCONJ'), ('din', 'ADP'),
                                           ('subgrupul', 'NOUN'),
                                           ('oriental', 'ADJ'), ('al', 'DET'),
                                           ('limbilor', 'NOUN'),
                                           ('romanice', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'rus':
        if pos_tagger == 'NLTK - Perceptron POS Tagger':
            assert tokens_tagged == [('Ру́сский', 'A=m'), ('язы́к', 'S'),
                                     ('(', 'NONLEX'), ('[', 'NONLEX'),
                                     ('ˈruskʲɪi̯', 'NONLEX'),
                                     ('jɪˈzɨk', 'NONLEX'), (']', 'NONLEX'),
                                     ('Информация', 'S'), ('о', 'PR'),
                                     ('файле', 'S'), ('слушать', 'V'),
                                     (')', 'NONLEX'), ('[', 'NONLEX'),
                                     ('~', 'NONLEX'), ('3', 'NUM=ciph'),
                                     (']', 'NONLEX'), ('[', 'NONLEX'),
                                     ('⇨', 'NONLEX'), (']', 'NONLEX'),
                                     ('—', 'NONLEX'), ('один', 'A-PRO=m'),
                                     ('из', 'PR'),
                                     ('восточнославянских', 'A=pl'),
                                     ('языков', 'S'), (',', 'NONLEX'),
                                     ('национальный', 'A=m'), ('язык', 'S'),
                                     ('русского', 'A=m'), ('народа', 'S'),
                                     ('.', 'NONLEX')]
            assert tokens_tagged_universal == [('Ру́сский', 'ADJ'),
                                               ('язы́к', 'NOUN'),
                                               ('(', 'PUNCT'), ('[', 'PUNCT'),
                                               ('ˈruskʲɪi̯', 'PUNCT'),
                                               ('jɪˈzɨk', 'PUNCT'),
                                               (']', 'PUNCT'),
                                               ('Информация', 'NOUN'),
                                               ('о', 'ADP'), ('файле', 'NOUN'),
                                               ('слушать', 'VERB'),
                                               (')', 'PUNCT'), ('[', 'PUNCT'),
                                               ('~', 'PUNCT'), ('3', 'NUM'),
                                               (']', 'PUNCT'), ('[', 'PUNCT'),
                                               ('⇨', 'PUNCT'), (']', 'PUNCT'),
                                               ('—', 'PUNCT'),
                                               ('один', 'PRON'), ('из', 'ADP'),
                                               ('восточнославянских', 'ADJ'),
                                               ('языков', 'NOUN'),
                                               (',', 'PUNCT'),
                                               ('национальный', 'ADJ'),
                                               ('язык', 'NOUN'),
                                               ('русского', 'ADJ'),
                                               ('народа', 'NOUN'),
                                               ('.', 'PUNCT')]
        elif pos_tagger == 'pymorphy2 - Morphological Analyzer':
            assert tokens_tagged == [
                ('Ру́сский', 'NOUN'), ('язы́к', 'NOUN'), ('(', 'PNCT'),
                ('[', 'PNCT'), ('ˈruskʲɪi̯', 'UNKN'), ('jɪˈzɨk', 'UNKN'),
                (']', 'PNCT'), ('Информация', 'NOUN'), ('о', 'PREP'),
                ('файле', 'NOUN'), ('слушать', 'INFN'), (')', 'PNCT'),
                ('[', 'PNCT'), ('~', 'UNKN'), ('3', 'NUMB'), (']', 'PNCT'),
                ('[', 'PNCT'), ('⇨', 'UNKN'), (']', 'PNCT'), ('—', 'PNCT'),
                ('один', 'ADJF'), ('из', 'PREP'),
                ('восточнославянских', 'ADJF'), ('языков', 'NOUN'),
                (',', 'PNCT'), ('национальный', 'ADJF'), ('язык', 'NOUN'),
                ('русского', 'ADJF'), ('народа', 'NOUN'), ('.', 'PNCT')
            ]
            assert tokens_tagged_universal == [('Ру́сский', 'NOUN'),
                                               ('язы́к', 'NOUN'),
                                               ('(', 'PUNCT'), ('[', 'PUNCT'),
                                               ('ˈruskʲɪi̯', 'SYM/X'),
                                               ('jɪˈzɨk', 'SYM/X'),
                                               (']', 'PUNCT'),
                                               ('Информация', 'NOUN'),
                                               ('о', 'ADP'), ('файле', 'NOUN'),
                                               ('слушать', 'VERB'),
                                               (')', 'PUNCT'), ('[', 'PUNCT'),
                                               ('~', 'SYM/X'), ('3', 'NUM'),
                                               (']', 'PUNCT'), ('[', 'PUNCT'),
                                               ('⇨', 'SYM/X'), (']', 'PUNCT'),
                                               ('—', 'PUNCT'), ('один', 'ADJ'),
                                               ('из', 'ADP'),
                                               ('восточнославянских', 'ADJ'),
                                               ('языков', 'NOUN'),
                                               (',', 'PUNCT'),
                                               ('национальный', 'ADJ'),
                                               ('язык', 'NOUN'),
                                               ('русского', 'ADJ'),
                                               ('народа', 'NOUN'),
                                               ('.', 'PUNCT')]
    elif lang == 'spa':
        assert tokens_tagged == [
            ('El', 'DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('español', 'NOUN__Gender=Masc|Number=Sing'), ('o', 'CCONJ'),
            ('castellano', 'NOUN__Gender=Masc|Number=Sing'),
            ('es',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('una', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('lengua', 'NOUN__Gender=Fem|Number=Sing'),
            ('romance', 'NOUN__Gender=Masc|Number=Sing'),
            ('procedente', 'ADJ__Number=Sing'),
            ('del', 'ADP__AdpType=Preppron'),
            ('latín', 'NOUN__Gender=Masc|Number=Sing'),
            ('hablado', 'ADJ__Gender=Masc|Number=Sing|VerbForm=Part'),
            ('.', 'PUNCT__PunctType=Peri')
        ]
        assert tokens_tagged_universal == [('El', 'DET'), ('español', 'NOUN'),
                                           ('o', 'CCONJ'),
                                           ('castellano', 'NOUN'),
                                           ('es', 'AUX'), ('una', 'DET'),
                                           ('lengua', 'NOUN'),
                                           ('romance', 'NOUN'),
                                           ('procedente', 'ADJ'),
                                           ('del', 'ADP'), ('latín', 'NOUN'),
                                           ('hablado', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'tha':
        if pos_tagger == 'PyThaiNLP - Perceptron Tagger (ORCHID)':
            assert tokens_tagged == [('ภาษา', 'NCMN'), ('ไทย', 'NPRP'),
                                     ('หรือ', 'JCRG'), ('ภาษา', 'NCMN'),
                                     ('ไทย', 'NPRP'), ('กลาง', 'VATT'),
                                     ('เป็น', 'VSTA'), ('ภาษา', 'NCMN'),
                                     ('ราชการ', 'NCMN'), ('และ', 'JCRG'),
                                     ('ภาษา', 'NCMN'), ('ประจำ', 'RPRE'),
                                     ('ชาติ', 'NCMN'), ('ของ', 'RPRE'),
                                     ('ประเทศไทย', 'NPRP')]
            assert tokens_tagged_universal == [('ภาษา', 'NOUN'),
                                               ('ไทย', 'PROPN'),
                                               ('หรือ', 'CCONJ'),
                                               ('ภาษา', 'NOUN'),
                                               ('ไทย', 'PROPN'),
                                               ('กลาง', 'VERB'),
                                               ('เป็น', 'VERB'),
                                               ('ภาษา', 'NOUN'),
                                               ('ราชการ', 'NOUN'),
                                               ('และ', 'CCONJ'),
                                               ('ภาษา', 'NOUN'),
                                               ('ประจำ', 'ADP'),
                                               ('ชาติ', 'NOUN'),
                                               ('ของ', 'ADP'),
                                               ('ประเทศไทย', 'PROPN')]
        elif pos_tagger == 'PyThaiNLP - Perceptron Tagger (PUD)':
            assert tokens_tagged == [('ภาษา', 'NOUN'), ('ไทย', 'PROPN'),
                                     ('หรือ', 'CCONJ'), ('ภาษา', 'NOUN'),
                                     ('ไทย', 'PROPN'), ('กลาง', 'NOUN'),
                                     ('เป็น', 'AUX'), ('ภาษา', 'NOUN'),
                                     ('ราชการ', 'NOUN'), ('และ', 'CCONJ'),
                                     ('ภาษา', 'NOUN'), ('ประจำ', 'VERB'),
                                     ('ชาติ', 'NOUN'), ('ของ', 'ADP'),
                                     ('ประเทศไทย', 'PROPN')]
            assert tokens_tagged_universal == [
                ('ภาษา', 'NOUN'), ('ไทย', 'PROPN'), ('หรือ', 'CCONJ'),
                ('ภาษา', 'NOUN'), ('ไทย', 'PROPN'), ('กลาง', 'NOUN'),
                ('เป็น', 'AUX'), ('ภาษา', 'NOUN'), ('ราชการ', 'NOUN'),
                ('และ', 'CCONJ'), ('ภาษา', 'NOUN'), ('ประจำ', 'VERB'),
                ('ชาติ', 'NOUN'), ('ของ', 'ADP'), ('ประเทศไทย', 'PROPN')
            ]
    elif lang == 'bod':
        assert tokens_tagged == [('བོད་', 'PROPN'), ('ཀྱི་', 'NO_POS'),
                                 ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'NO_POS'),
                                 ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'NO_POS'),
                                 ('དེ', 'DET'), ('འི་', 'PART'),
                                 ('ཉེ་འཁོར་', 'NOUN'), ('གྱི་', 'NO_POS'),
                                 ('ས་ཁུལ་', 'OTHER'), ('ཏེ', 'NO_POS'),
                                 ('།', 'PUNCT')]
        assert tokens_tagged_universal == [('བོད་', 'PROPN'), ('ཀྱི་', 'X'),
                                           ('སྐད་ཡིག་', 'NOUN'), ('ནི་', 'X'),
                                           ('བོད་ཡུལ་', 'PROPN'), ('དང་', 'X'),
                                           ('དེ', 'DET'), ('འི་', 'PART'),
                                           ('ཉེ་འཁོར་', 'NOUN'), ('གྱི་', 'X'),
                                           ('ས་ཁུལ་', 'X'), ('ཏེ', 'X'),
                                           ('།', 'PUNCT')]
    elif lang == 'ukr':
        assert tokens_tagged == [('Украї́нська', 'ADJF'), ('мо́ва', 'ADJF'),
                                 ('(', 'PNCT'), ('МФА', 'UNKN'), (':', 'PNCT'),
                                 ('[', 'PNCT'), ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'UNKN'),
                                 ('ˈmɔwɑ̽', 'UNKN'), (']', 'PNCT'),
                                 (',', 'PNCT'), ('історичні', 'ADJF'),
                                 ('назви', 'NOUN'), ('—', 'PNCT'),
                                 ('ру́ська', 'ADJF'), (',', 'PNCT'),
                                 ('руси́нська[9][10][11', 'UNKN'),
                                 (']', 'PNCT'), ('[', 'PNCT'), ('*', 'PNCT'),
                                 ('2', 'NUMB'), (']', 'PNCT'), (')', 'PNCT'),
                                 ('—', 'PNCT'), ('національна', 'ADJF'),
                                 ('мова', 'NOUN'), ('українців', 'NOUN'),
                                 ('.', 'PNCT')]
        assert tokens_tagged_universal == [('Украї́нська', 'ADJ'),
                                           ('мо́ва', 'ADJ'), ('(', 'PUNCT'),
                                           ('МФА', 'SYM/X'), (':', 'PUNCT'),
                                           ('[', 'PUNCT'),
                                           ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'SYM/X'),
                                           ('ˈmɔwɑ̽', 'SYM/X'), (']', 'PUNCT'),
                                           (',', 'PUNCT'),
                                           ('історичні', 'ADJ'),
                                           ('назви', 'NOUN'), ('—', 'PUNCT'),
                                           ('ру́ська', 'ADJ'), (',', 'PUNCT'),
                                           ('руси́нська[9][10][11', 'SYM/X'),
                                           (']', 'PUNCT'), ('[', 'PUNCT'),
                                           ('*', 'PUNCT'), ('2', 'NUM'),
                                           (']', 'PUNCT'), (')', 'PUNCT'),
                                           ('—', 'PUNCT'),
                                           ('національна', 'ADJ'),
                                           ('мова', 'NOUN'),
                                           ('українців', 'NOUN'),
                                           ('.', 'PUNCT')]
    elif lang == 'vie':
        assert tokens_tagged == [('Tiếng', 'N'), ('Việt', 'Np'), (',', 'CH'),
                                 ('còn', 'C'), ('gọi', 'V'), ('tiếng', 'N'),
                                 ('Việt Nam', 'Np'), ('[', 'V'), ('5', 'M'),
                                 (']', 'CH'), (',', 'CH'), ('tiếng Kinh', 'N'),
                                 ('hay', 'C'), ('Việt ngữ', 'V'), (',', 'CH'),
                                 ('là', 'V'), ('ngôn ngữ', 'N'), ('của', 'E'),
                                 ('người', 'Nc'), ('Việt', 'Np'), ('(', 'CH'),
                                 ('dân tộc', 'N'), ('Kinh', 'Np'), (')', 'CH'),
                                 ('và', 'C'), ('là', 'V'), ('ngôn ngữ', 'N'),
                                 ('chính thức', 'A'), ('tại', 'E'),
                                 ('Việt Nam', 'Np'), ('.', 'CH')]
        assert tokens_tagged_universal == [
            ('Tiếng', 'NOUN'), ('Việt', 'PROPN'), (',', 'PUNCT'),
            ('còn', 'CCONJ'), ('gọi', 'VERB'), ('tiếng', 'NOUN'),
            ('Việt Nam', 'PROPN'), ('[', 'VERB'), ('5', 'NUM'), (']', 'PUNCT'),
            (',', 'PUNCT'), ('tiếng Kinh', 'NOUN'), ('hay', 'CCONJ'),
            ('Việt ngữ', 'VERB'), (',', 'PUNCT'), ('là', 'VERB'),
            ('ngôn ngữ', 'NOUN'), ('của', 'ADP'), ('người', 'NOUN'),
            ('Việt', 'PROPN'), ('(', 'PUNCT'), ('dân tộc', 'NOUN'),
            ('Kinh', 'PROPN'), (')', 'PUNCT'), ('và', 'CCONJ'), ('là', 'VERB'),
            ('ngôn ngữ', 'NOUN'), ('chính thức', 'ADJ'), ('tại', 'ADP'),
            ('Việt Nam', 'PROPN'), ('.', 'PUNCT')
        ]
Example #12
    def __init__(self, main, file, flat_tokens=True):
        self.main = main
        self.lang = file['lang']
        self.text_type = file['text_type']

        self.offsets_paras = []
        self.offsets_sentences = []
        self.offsets_clauses = []

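        # With flat_tokens, all tokens are extended into a single pre-created
        # paragraph/sentence/clause shell; otherwise the hierarchy is built per paragraph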
        if flat_tokens:
            self.tokens_multilevel = [[[[]]]]
        else:
            self.tokens_multilevel = []

        self.tokens_flat = []

        self.tags_all = []
        self.tags_pos = []
        self.tags_non_pos = []

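        # Regular expressions matching all tags, POS tags only, and non-POS tags only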
        re_tags_all = wl_matching.get_re_tags(main, tags='all')
        re_tags_pos = wl_matching.get_re_tags(main, tags='pos')
        re_tags_non_pos = wl_matching.get_re_tags(main, tags='non_pos')

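        # Processing depends on whether the text is already tokenized and on the type of tags it carries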
        with open(file['path'], 'r', encoding=file['encoding']) as f:
            # Untokenized / Untagged
            if self.text_type == ('untokenized', 'untagged'):
                if flat_tokens:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            tokens = wl_word_tokenization.wl_word_tokenize(
                                main, text, lang=self.lang, flat_tokens=True)

                            self.tokens_multilevel[0][0][0].extend(tokens)
                else:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            tokens = wl_word_tokenization.wl_word_tokenize(
                                main, text, lang=self.lang, flat_tokens=False)

                            self.tokens_multilevel.append(tokens)

            # Untokenized / Tagged (Non-POS)
            elif self.text_type == ('untokenized', 'tagged_non_pos'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            tokens = wl_word_tokenization.wl_word_tokenize(
                                main,
                                text_no_tags,
                                lang=self.lang,
                                flat_tokens=True)

                            self.tokens_multilevel[0][0][0].extend(tokens)

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_multilevel[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            tokens = wl_word_tokenization.wl_word_tokenize(
                                main,
                                text_no_tags,
                                lang=self.lang,
                                flat_tokens=False)

                            self.tokens_multilevel.append(tokens)

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_multilevel[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
            # Tokenized / Untagged
            elif self.text_type == ('tokenized', 'untagged'):
                if flat_tokens:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_multilevel[0][0][0].extend(
                                text.split())
                else:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_multilevel.append([])

                            for sentence in wl_sentence_tokenization.wl_sentence_split(
                                    main, text):
                                self.tokens_multilevel[-1].append([])

                                for clause in wl_sentence_tokenization.wl_clause_split(
                                        main, sentence):
                                    self.tokens_multilevel[-1][-1].append(
                                        clause.split())
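                            # Note (illustrative assumption, not from the
                            # original source): the nesting built here is
                            # paragraph -> sentences -> clauses -> tokens, so
                            # a line like 'Hello world. How are you?' would be
                            # appended roughly as
                            #     [[['Hello', 'world.']], [['How', 'are', 'you?']]]
                            # i.e. one token list per clause and one clause
                            # list per sentence.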
            # Tokenized / Tagged (POS)
            elif self.text_type == ('tokenized', 'tagged_pos'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_multilevel[0][0][0].extend(
                                text_no_tags.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_pos, text):
                                self.tokens_multilevel[0][0][0].insert(0, '')
                                self.tags_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_multilevel.append([])

                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wl_sentence_tokenization.wl_sentence_split(
                                    main, text_no_tags):
                                self.tokens_multilevel[-1].append([])

                                for clause in wl_sentence_tokenization.wl_clause_split(
                                        main, sentence):
                                    self.tokens_multilevel[-1][-1].append(
                                        clause.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_pos, text):
                                self.tokens_multilevel[0][0][0].insert(0, '')
                                self.tags_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Non-POS)
            elif self.text_type == ('tokenized', 'tagged_non_pos'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_multilevel[0][0][0].extend(
                                text_no_tags.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_multilevel[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_multilevel.append([])

                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wl_sentence_tokenization.wl_sentence_split(
                                    main, text_no_tags):
                                self.tokens_multilevel[-1].append([])

                                for clause in wl_sentence_tokenization.wl_clause_split(
                                        main, sentence):
                                    self.tokens_multilevel[-1][-1].append(
                                        clause.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_multilevel[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Both)
            elif self.text_type == ('tokenized', 'tagged_both'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_multilevel[0][0][0].extend(
                                text_no_tags.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and (re.match(re_tags_pos, text)
                                           or re.match(re_tags_non_pos, text)):
                                self.tokens_multilevel[0][0][0].insert(0, '')

                                self.tags_all.append([])
                                self.tags_pos.append([])
                                self.tags_non_pos.append([])

                            # Extract tags
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(
                                        tag_non_pos.group())

                                if ((tag_pos and tag_non_pos
                                        and i_tag_pos < i_tag_non_pos)
                                        or (tag_pos and not tag_non_pos)):
                                    self.split_text(text[:i_tag_pos])

                                    self.tags_all[-1].append(tag_pos.group())
                                    self.tags_pos[-1].append(tag_pos.group())

                                    text = text[i_tag_pos +
                                                len(tag_pos.group()):]
                                elif ((tag_pos and tag_non_pos
                                       and i_tag_pos > i_tag_non_pos)
                                      or (not tag_pos and tag_non_pos)):
                                    self.split_text(text[:i_tag_non_pos])

                                    self.tags_all[-1].append(
                                        tag_non_pos.group())
                                    self.tags_non_pos[-1].append(
                                        tag_non_pos.group())

                                    text = text[i_tag_non_pos +
                                                len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break
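                            # Note (illustrative assumption, not from the
                            # original source): whichever tag, POS or non-POS,
                            # starts earliest in the remaining text is
                            # consumed first, so tags_all preserves the
                            # left-to-right order in which the two kinds of
                            # tags appear in the line.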
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_multilevel.append([])

                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wl_sentence_tokenization.wl_sentence_split(
                                    main, text_no_tags):
                                self.tokens_multilevel[-1].append([])

                                for clause in wl_sentence_tokenization.wl_clause_split(
                                        main, sentence):
                                    self.tokens_multilevel[-1][-1].append(
                                        clause.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and (re.match(re_tags_pos, text)
                                           or re.match(re_tags_non_pos, text)):
                                self.tokens_multilevel[0][0][0].insert(0, '')

                                self.tags_all.append([])
                                self.tags_pos.append([])
                                self.tags_non_pos.append([])

                            # Extract tags
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(
                                        tag_non_pos.group())

                                if ((tag_pos and tag_non_pos
                                        and i_tag_pos < i_tag_non_pos)
                                        or (tag_pos and not tag_non_pos)):
                                    self.split_text(text[:i_tag_pos])

                                    self.tags_all[-1].append(tag_pos.group())
                                    self.tags_pos[-1].append(tag_pos.group())

                                    text = text[i_tag_pos +
                                                len(tag_pos.group()):]
                                elif ((tag_pos and tag_non_pos
                                       and i_tag_pos > i_tag_non_pos)
                                      or (not tag_pos and tag_non_pos)):
                                    self.split_text(text[:i_tag_non_pos])

                                    self.tags_all[-1].append(
                                        tag_non_pos.group())
                                    self.tags_non_pos[-1].append(
                                        tag_non_pos.group())

                                    text = text[i_tag_non_pos +
                                                len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break

        # Paragraph, sentence and clause offsets
        for para in self.tokens_multilevel:
            self.offsets_paras.append(len(self.tokens_flat))

            for sentence in para:
                self.offsets_sentences.append(len(self.tokens_flat))

                for clause in sentence:
                    self.offsets_clauses.append(len(self.tokens_flat))

                    self.tokens_flat.extend(clause)
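        # Illustrative sketch (assumption, not part of the original source):
        # each offset records the length of tokens_flat before the
        # corresponding unit is added, so the tokens of paragraph i could be
        # recovered with something like
        #     start = self.offsets_paras[i]
        #     end = (self.offsets_paras[i + 1]
        #            if i + 1 < len(self.offsets_paras)
        #            else len(self.tokens_flat))
        #     tokens_para_i = self.tokens_flat[start:end]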

        # Tags
        if self.text_type[1] == 'tagged_pos':
            self.tags_non_pos = [[] for _ in range(len(self.tokens_flat))]
            self.tags_all = copy.deepcopy(self.tags_pos)
        elif self.text_type[1] == 'tagged_non_pos':
            self.tags_pos = [[] for _ in range(len(self.tokens_flat))]
            self.tags_all = copy.deepcopy(self.tags_non_pos)
        elif self.text_type[1] == 'untagged':
            self.tags_all = [[] for _ in range(len(self.tokens_flat))]
            self.tags_pos = [[] for _ in range(len(self.tokens_flat))]
            self.tags_non_pos = [[] for _ in range(len(self.tokens_flat))]

        # Remove whitespace around all tags
        self.tags_all = [[tag.strip() for tag in tags]
                         for tags in self.tags_all]
        self.tags_pos = [[tag.strip() for tag in tags]
                         for tags in self.tags_pos]
        self.tags_non_pos = [[tag.strip() for tag in tags]
                             for tags in self.tags_non_pos]
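After this point, tokens_flat and the three tag lists (tags_pos, tags_non_pos, tags_all) are intended to be parallel: the list at each index holds whatever tags were attached to that token, or is empty. The snippet below is a minimal sketch, not part of the source, of how a caller might rejoin them for display; the helper name join_tokens_tags and the simple concatenated output format are assumptions for illustration.

def join_tokens_tags(tokens_flat, tags_all):
    # Pair each token with the tags recorded at the same index and
    # concatenate them; tags are assumed to carry their own delimiters
    # (e.g. '_NN' or '<i>'), so plain string concatenation is enough here.
    return ' '.join(
        token + ''.join(tags)
        for token, tags in zip(tokens_flat, tags_all))

# Hypothetical usage:
#     join_tokens_tags(['The', 'dog', 'barks'], [['_DT'], ['_NN'], ['_VBZ']])
# would return 'The_DT dog_NN barks_VBZ'.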