Example #1
    def tokenize_text(self, text):
        if text:
            tokens = wordless_text_processing.wordless_word_tokenize(self.main, text,
                                                                     lang = self.lang)

            for i in range(len(tokens)):
                self.tags_all.append([])
                self.tags_pos.append([])
                self.tags_non_pos.append([])
Example #2
    def tokenize_text(self, text):
        if text:
            tokens_sentences = wordless_text_processing.wordless_word_tokenize(self.main, text,
                                                                               lang = self.lang,
                                                                               keep_sentences = False)

            for i in range(len([token for tokens in tokens_sentences for token in tokens])):
                self.tags_all.append([])
                self.tags_pos.append([])
                self.tags_non_pos.append([])
Example #3
def testing_word_detokenize(lang, word_detokenizer):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    print(f'{lang_text} / {word_detokenizer}:')

    tokens_sentences = wordless_text_processing.wordless_word_tokenize(
        main, globals()[f'SENTENCE_{lang.upper()}'], lang=lang)
    tokens = [token for tokens in tokens_sentences for token in tokens]

    text = wordless_text_processing.wordless_word_detokenize(
        main, tokens, lang=lang, word_detokenizer=word_detokenizer)

    print(f"\t{text}")
Example #4
def testing_word_tokenize(lang, word_tokenizer):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    print(f'{lang_text} / {word_tokenizer}:')

    tokens_sentences = wordless_text_processing.wordless_word_tokenize(
        main,
        globals()[f'SENTENCE_{lang.upper()}'],
        lang=lang,
        word_tokenizer=word_tokenizer)
    tokens = itertools.chain.from_iterable(tokens_sentences)

    print(f"\t{' '.join(tokens)}")
Example #5
def testing_pos_tag(lang, pos_tagger):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    print(f'{lang_text} / {pos_tagger}:')

    tokens_sentences = wordless_text_processing.wordless_word_tokenize(
        main, globals()[f'SENTENCE_{lang.upper()}'], lang=lang)
    tokens = [token for tokens in tokens_sentences for token in tokens]

    tokens_tagged = wordless_text_processing.wordless_pos_tag(
        main, tokens, lang=lang, pos_tagger=pos_tagger)
    tokens_tagged_universal = wordless_text_processing.wordless_pos_tag(
        main, tokens, lang=lang, pos_tagger=pos_tagger, tagset='universal')

    print(f"\t{tokens_tagged}")
    print(f"\t{tokens_tagged_universal}")
Example #6
def test_word_detokenize(lang, word_detokenizer):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    tokens = wordless_text_processing.wordless_word_tokenize(main, globals()[f'SENTENCE_{lang.upper()}'],
                                                             lang = lang)

    text = wordless_text_processing.wordless_word_detokenize(main, tokens,
                                                             lang = lang,
                                                             word_detokenizer = word_detokenizer)

    # print(text)

    if lang == 'cat':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11 ]"
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11]"
    elif lang == 'zho_cn':
        assert text == '汉语,又称汉文、中文、中国话、中国语、华语、华文、唐话[2],或被视为一个语族,或被视为隶属于汉藏语系汉语族之一种语言。'
    elif lang == 'zho_tw':
        assert text == '漢語,又稱漢文、中文、中國話、中國語、華語、華文、唐話[2],或被視為一個語族,或被視為隸屬於漢藏語系漢語族之一種語言。'
    elif lang == 'ces':
        assert text == 'Čeština neboli český jazyk je západoslovanský jazyk, nejbližší slovenštině, poté lužické srbštině a polštině.'
    elif lang == 'nld':
        assert text == 'Het Nederlands is een West-Germaanse taal en de moedertaal van de meeste inwoners van Nederland, België en Suriname.'
    elif lang == 'eng':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[5][6 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[5][6]'
    elif lang == 'fin':
        assert text == 'Suomen kieli (suomi) on uralilaisten kielten itämerensuomalaiseen ryhmään kuuluva kieli.'
    elif lang == 'fra':
        assert text == 'Le français est une langue indo-européenne de la famille des langues romanes.'
    elif lang == 'deu':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([ dɔʏ̯t͡ʃ]; abgekürzt dt . oder dtsch .) ist eine westgermanische Sprache.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache.'
    elif lang == 'ell':
        assert text == 'Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια[9] και συγκεκριμένα στον ελληνικό κλάδο, μαζί με την τσακωνική, ενώ είναι η επίσημη γλώσσα της Ελλάδος και της Κύπρου.'
    elif lang == 'hun':
        assert text == 'A magyar nyelv az uráli nyelvcsalád tagja, a finnugor nyelvek közé tartozó ugor nyelvek egyike.'
    elif lang == 'isl':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[4 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[4]'
    elif lang == 'gle':
        assert text == 'Is ceann de na teangacha Ceilteacha í an Ghaeilge (nó Gaeilge na hÉireann mar a thugtar uirthi corruair), agus ceann den dtrí cinn de theangacha Ceilteacha ar a dtugtar na teangacha Gaelacha (.i. an Ghaeilge, Gaeilge na hAlban agus Gaeilge Mhanann) go háirithe.'
    elif lang == 'ita':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "L' italiano ([ itaˈljaːno][Nota 1] ascolta[?·info] ) è una lingua romanza parlata principalmente in Italia."
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "L'italiano ([itaˈljaːno][Nota 1] ascolta[?·info]) è una lingua romanza parlata principalmente in Italia."
    elif lang == 'jpn':
        assert text == '日本語(にほんご、にっぽんご[注1])は、主に日本国内や日本人同士の間で使用されている言語である。'
    elif lang == 'lav':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda . [3 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda. [3]'
    elif lang == 'lit':
        assert text == 'Lietuvių kalba – iš baltų prokalbės kilusi lietuvių tautos kalba, kuri Lietuvoje yra valstybinė, o Europos Sąjungoje – viena iš oficialiųjų kalbų.'
    elif lang == 'pol':
        assert text == 'Język polski, polszczyzna, skrót: pol. – język naturalny należący do grupy języków zachodniosłowiańskich (do której należą również czeski, słowacki, kaszubski, dolnołużycki, górnołużycki i wymarły połabski), stanowiącej część rodziny języków indoeuropejskich.'
    elif lang == 'por':
        assert text == 'A língua portuguesa, também designada português, é uma língua românica flexiva ocidental originada no galego-português falado no Reino da Galiza e no norte de Portugal.'
    elif lang == 'ron':
        assert text == 'Limba română este o limbă indo - europeană, din grupul italic și din subgrupul oriental al limbilor romanice.'
    elif lang == 'rus':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Ру́сский язы́к ([ ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Ру́сский язы́к ([ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
    elif lang == 'slk':
        assert text == 'Slovenčina patrí do skupiny západoslovanských jazykov (spolu s češtinou, poľštinou, hornou a dolnou lužickou srbčinou a kašubčinou).'
    elif lang == 'slv':
        assert text == 'Slovenščina [slovénščina] / [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.'
    elif lang == 'spa':
        assert text == 'El español o castellano es una lengua romance procedente del latín hablado.'
    elif lang == 'swe':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Svenska (svenska (info) ) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Svenska (svenska (info)) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
    elif lang == 'tam':
        assert text == 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும்.'
    elif lang == 'tha':
        assert text == 'ภาษาไทย หรือ ภาษาไทยกลาง เป็นภาษาราชการและภาษาประจำชาติของประเทศไทย'
    elif lang == 'bod':
        assert text == '༄༅། །རྒྱ་གར་སྐད་དུ།བོ་དྷི་སཏྭ་ཙརྻ་ཨ་བ་ཏ་ར།བོད་སྐད་དུ།བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ། །སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ། །བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང༌། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །'
Example #7
    def __init__(self, main, file, flat_tokens = True):
        self.main = main
        self.lang = file['lang']
        self.text_type = file['text_type']

        self.offsets_paras = []
        self.offsets_sentences = []
        self.offsets_clauses = []

        if flat_tokens:
            self.tokens_hierarchical = [[[[]]]]
        else:
            self.tokens_hierarchical = []

        self.tokens_flat = []

        self.tags_all = []
        self.tags_pos = []
        self.tags_non_pos = []

        re_tags_all = wordless_matching.get_re_tags(main, tags = 'all')
        re_tags_pos = wordless_matching.get_re_tags(main, tags = 'pos')
        re_tags_non_pos = wordless_matching.get_re_tags(main, tags = 'non_pos')

        with open(file['path'], 'r', encoding = file['encoding']) as f:
            # Untokenized / Untagged
            if self.text_type == ('untokenized', 'untagged'):
                if flat_tokens:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            tokens = wordless_text_processing.wordless_word_tokenize(main, text,
                                                                                     lang = self.lang,
                                                                                     flat_tokens = True)

                            self.tokens_hierarchical[0][0][0].extend(tokens)
                else:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            tokens = wordless_text_processing.wordless_word_tokenize(main, text,
                                                                                     lang = self.lang,
                                                                                     flat_tokens = False)

                            self.tokens_hierarchical.append(tokens)

            # Untokenized / Tagged (Non-POS)
            elif self.text_type == ('untokenized', 'tagged_non_pos'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            tokens = wordless_text_processing.wordless_word_tokenize(main, text_no_tags,
                                                                                     lang = self.lang,
                                                                                     flat_tokens = True)

                            self.tokens_hierarchical[0][0][0].extend(tokens)

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            tokens = wordless_text_processing.wordless_word_tokenize(main, text_no_tags,
                                                                                     lang = self.lang,
                                                                                     flat_tokens = False)

                            self.tokens_hierarchical.append(tokens)

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
            # Tokenized / Untagged
            elif self.text_type == ('tokenized', 'untagged'):
                if flat_tokens:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical[0][0][0].extend(text.split())
                else:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical.append([])

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text,
                                                                                                lang = self.lang):
                                self.tokens_hierarchical[-1].append([])

                                for clause in wordless_text_processing.wordless_clause_tokenize(main, sentence,
                                                                                                lang = self.lang):
                                    self.tokens_hierarchical[-1][-1].append(clause.split())
            # Tokenized / Tagged (POS)
            elif self.text_type == ('tokenized', 'tagged_pos'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_hierarchical[0][0][0].extend(text_no_tags.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                                lang = self.lang):
                                self.tokens_hierarchical[-1].append([])

                                for clause in wordless_text_processing.wordless_clause_tokenize(main, sentence,
                                                                                                lang = self.lang):
                                    self.tokens_hierarchical[-1][-1].append(clause.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Non-POS)
            elif self.text_type == ('tokenized', 'tagged_non_pos'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_hierarchical[0][0][0].extend(text_no_tags.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                                lang = self.lang):
                                self.tokens_hierarchical[-1].append([])

                                for clause in wordless_text_processing.wordless_clause_tokenize(main, sentence,
                                                                                                lang = self.lang):
                                    self.tokens_hierarchical[-1][-1].append(clause.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Both)
            elif self.text_type == ('tokenized', 'tagged_both'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_hierarchical[0][0][0].extend(text_no_tags.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and (re.match(re_tags_pos, text) or re.match(re_tags_non_pos, text)):
                                self.tokens_hierarchical[0][0][0].insert(0, '')

                                self.tags_all.append([])
                                self.tags_pos.append([])
                                self.tags_non_pos.append([])

                            # Extract tags
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(tag_non_pos.group())

                                if (tag_pos and tag_non_pos and i_tag_pos < i_tag_non_pos or
                                    tag_pos and not tag_non_pos):
                                    self.split_text(text[:i_tag_pos])

                                    self.tags_pos[-1].append(tag_pos.group())
                                    self.tags_all[-1].append(tag_pos.group())

                                    text = text[i_tag_pos + len(tag_pos.group()):]
                                elif (tag_pos and tag_non_pos and i_tag_pos > i_tag_non_pos or
                                      not tag_pos and tag_non_pos):
                                    self.split_text(text[:i_tag_non_pos])

                                    self.tags_all[-1].append(tag_non_pos.group())
                                    self.tags_non_pos[-1].append(tag_non_pos.group())

                                    text = text[i_tag_non_pos + len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                                lang = self.lang):
                                self.tokens_hierarchical[-1].append([])

                                for clause in wordless_text_processing.wordless_clause_tokenize(main, sentence,
                                                                                                lang = self.lang):
                                    self.tokens_hierarchical[-1][-1].append(clause.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and (re.match(re_tags_pos, text) or re.match(re_tags_non_pos, text)):
                                self.tokens_hierarchical[0][0][0].insert(0, '')

                                self.tags_all.append([])
                                self.tags_pos.append([])
                                self.tags_non_pos.append([])

                            # Extract tags
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(tag_non_pos.group())

                                if (tag_pos and tag_non_pos and i_tag_pos < i_tag_non_pos or
                                    tag_pos and not tag_non_pos):
                                    self.split_text(text[:i_tag_pos])

                                    self.tags_all[-1].append(tag_pos.group())
                                    self.tags_pos[-1].append(tag_pos.group())

                                    text = text[i_tag_pos + len(tag_pos.group()):]
                                elif (tag_pos and tag_non_pos and i_tag_pos > i_tag_non_pos or
                                      not tag_pos and tag_non_pos):
                                    self.split_text(text[:i_tag_non_pos])

                                    self.tags_all[-1].append(tag_non_pos.group())
                                    self.tags_non_pos[-1].append(tag_non_pos.group())

                                    text = text[i_tag_non_pos + len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break

        # Paragraph, sentence and clause offsets
        for para in self.tokens_hierarchical:
            self.offsets_paras.append(len(self.tokens_flat))

            for sentence in para:
                self.offsets_sentences.append(len(self.tokens_flat))

                for clause in sentence:
                    self.offsets_clauses.append(len(self.tokens_flat))

                    self.tokens_flat.extend(clause)

        # Tags
        if self.text_type[1] == 'tagged_pos':
            self.tags_non_pos = [[] for i in range(len(self.tokens_flat))]
            self.tags_all = copy.deepcopy(self.tags_pos)
        elif self.text_type[1] == 'tagged_non_pos':
            self.tags_pos = [[] for i in range(len(self.tokens_flat))]
            self.tags_all = copy.deepcopy(self.tags_non_pos)
        elif self.text_type[1] == 'untagged':
            self.tags_all = [[] for i in range(len(self.tokens_flat))]
            self.tags_pos = [[] for i in range(len(self.tokens_flat))]
            self.tags_non_pos = [[] for i in range(len(self.tokens_flat))]

        # Remove whitespace around all tags
        self.tags_all = [[tag.strip() for tag in tags] for tags in self.tags_all]
        self.tags_pos = [[tag.strip() for tag in tags] for tags in self.tags_pos]
        self.tags_non_pos = [[tag.strip() for tag in tags] for tags in self.tags_non_pos]
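
The offset bookkeeping at the end of the constructor can be illustrated in isolation. The sketch below uses a hand-made hierarchical token list (paragraphs > sentences > clauses > tokens) instead of a real file and reproduces only the paragraph/sentence/clause offset loop; everything else about the class is left out.

# Standalone sketch of the offset computation: record where each paragraph,
# sentence and clause starts in the flat token stream.
tokens_hierarchical = [
    [[['This', 'is', 'a', 'clause'], ['and', 'another', 'one']]],  # paragraph 1
    [[['Second', 'paragraph']]],                                   # paragraph 2
]

offsets_paras = []
offsets_sentences = []
offsets_clauses = []
tokens_flat = []

for para in tokens_hierarchical:
    offsets_paras.append(len(tokens_flat))

    for sentence in para:
        offsets_sentences.append(len(tokens_flat))

        for clause in sentence:
            offsets_clauses.append(len(tokens_flat))

            tokens_flat.extend(clause)

print(offsets_paras)      # [0, 7]
print(offsets_sentences)  # [0, 7]
print(offsets_clauses)    # [0, 4, 7]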
Example #8
def test_pos_tag(lang, pos_tagger, show_results=False):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    tokens = wordless_text_processing.wordless_word_tokenize(
        main,
        text=getattr(wordless_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)

    tokens_tagged = wordless_text_processing.wordless_pos_tag(
        main, tokens=tokens, lang=lang, pos_tagger=pos_tagger)
    tokens_tagged_universal = wordless_text_processing.wordless_pos_tag(
        main,
        tokens=tokens,
        lang=lang,
        pos_tagger=pos_tagger,
        tagset='universal')

    if show_results:
        print(tokens_tagged)
        print(tokens_tagged_universal)

    if lang == 'zho_cn':
        assert tokens_tagged == [('汉语', 'nz'), (',', 'x'), ('又称', 'n'),
                                 ('汉文', 'nz'), ('、', 'x'), ('中文', 'nz'),
                                 ('、', 'x'), ('中国', 'ns'), ('话', 'n'),
                                 ('、', 'x'), ('中国', 'ns'), ('语', 'ng'),
                                 ('、', 'x'), ('华语', 'nz'), ('、', 'x'),
                                 ('华文', 'nz'), ('、', 'x'), ('唐', 'nr'),
                                 ('话', 'n'), ('[', 'x'), ('2', 'x'),
                                 (']', 'x'), (',', 'x'),
                                 ('或', 'c'), ('被', 'p'), ('视为', 'v'),
                                 ('一个', 'm'), ('语族', 'n'), (',', 'x'),
                                 ('或', 'c'), ('被', 'p'), ('视为', 'v'),
                                 ('隶属于', 'n'), ('汉藏语系', 'nz'), ('汉语', 'nz'),
                                 ('族', 'ng'), ('之', 'u'), ('一种', 'm'),
                                 ('语言', 'n'), ('。', 'x')]
        assert tokens_tagged_universal == [('汉语', 'PROPN'), (',', 'PUNCT/SYM'),
                                           ('又称', 'NOUN'), ('汉文', 'PROPN'),
                                           ('、', 'PUNCT/SYM'), ('中文', 'PROPN'),
                                           ('、', 'PUNCT/SYM'), ('中国', 'PROPN'),
                                           ('话', 'NOUN'), ('、', 'PUNCT/SYM'),
                                           ('中国', 'PROPN'), ('语', 'NOUN'),
                                           ('、', 'PUNCT/SYM'), ('华语', 'PROPN'),
                                           ('、', 'PUNCT/SYM'), ('华文', 'PROPN'),
                                           ('、', 'PUNCT/SYM'), ('唐', 'PRONP'),
                                           ('话', 'NOUN'), ('[', 'PUNCT/SYM'),
                                           ('2', 'PUNCT/SYM'),
                                           (']', 'PUNCT/SYM'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('视为', 'VERB'),
                                           ('一个', 'NUM'), ('语族', 'NOUN'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('视为', 'VERB'),
                                           ('隶属于', 'NOUN'), ('汉藏语系', 'PROPN'),
                                           ('汉语', 'PROPN'), ('族', 'NOUN'),
                                           ('之', 'PART'), ('一种', 'NUM'),
                                           ('语言', 'NOUN'), ('。', 'PUNCT/SYM')]
    elif lang == 'zho_tw':
        assert tokens_tagged == [
            ('漢語', 'nz'), (',', 'x'), ('又', 'd'), ('稱', 'v'), ('漢文', 'nz'),
            ('、', 'x'), ('中文', 'nz'), ('、', 'x'), ('中國', 'ns'), ('話', 'n'),
            ('、', 'x'), ('中國', 'ns'), ('語', 'n'), ('、', 'x'), ('華語', 'nz'),
            ('、', 'x'), ('華文', 'nz'), ('、', 'x'), ('唐', 'nr'), ('話', 'n'),
            ('[', 'x'), ('2', 'x'), (']', 'x'), (',', 'x'), ('或', 'c'),
            ('被', 'p'), ('視為', 'v'), ('一個', 'm'), ('語族', 'n'), (',', 'x'),
            ('或', 'c'), ('被', 'p'), ('視為', 'v'), ('隸', 'j'), ('屬', 'v'),
            ('於', 'nr'), ('漢藏語', 'nz'), ('系漢', 'n'), ('語族', 'n'), ('之一', 'r'),
            ('種語', 'n'), ('言', 'vg'), ('。', 'x')
        ]
        assert tokens_tagged_universal == [('漢語', 'PROPN'), (',', 'PUNCT/SYM'),
                                           ('又', 'ADV'), ('稱', 'VERB'),
                                           ('漢文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('中國', 'PROPN'), ('話', 'NOUN'),
                                           ('、', 'PUNCT/SYM'), ('中國', 'PROPN'),
                                           ('語', 'NOUN'), ('、', 'PUNCT/SYM'),
                                           ('華語', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('華文', 'PROPN'), ('、', 'PUNCT/SYM'),
                                           ('唐', 'PRONP'), ('話', 'NOUN'),
                                           ('[', 'PUNCT/SYM'),
                                           ('2', 'PUNCT/SYM'),
                                           (']', 'PUNCT/SYM'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('視為', 'VERB'),
                                           ('一個', 'NUM'), ('語族', 'NOUN'),
                                           (',', 'PUNCT/SYM'), ('或', 'CONJ'),
                                           ('被', 'ADP'), ('視為', 'VERB'),
                                           ('隸', 'X'), ('屬', 'VERB'),
                                           ('於', 'PRONP'), ('漢藏語', 'PROPN'),
                                           ('系漢', 'NOUN'), ('語族', 'NOUN'),
                                           ('之一', 'PRON'), ('種語', 'NOUN'),
                                           ('言', 'VERB'), ('。', 'PUNCT/SYM')]
    elif lang == 'nld':
        assert tokens_tagged == [
            ('Het',
             'Art|bep|onzijd|neut__Definite=Def|Gender=Neut|PronType=Art'),
            ('Nederlands', 'Adj|zelfst|stell|onverv__Degree=Pos'),
            ('is',
             'V|hulpofkopp|ott|3|ev__Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'
             ),
            ('een',
             'Art|onbep|zijdofonzijd|neut__Definite=Ind|Number=Sing|PronType=Art'
             ),
            ('West-Germaanse', 'Adj|attr|stell|vervneut__Case=Nom|Degree=Pos'),
            ('taal', 'N|soort|ev|neut__Number=Sing'), ('en', 'Conj|neven___'),
            ('de', 'Art|bep|zijdofmv|neut__Definite=Def|PronType=Art'),
            ('moedertaal', 'N|soort|ev|neut__Number=Sing'),
            ('van', 'Prep|voor__AdpType=Prep'),
            ('de', 'Art|bep|zijdofmv|neut__Definite=Def|PronType=Art'),
            ('meeste', 'Num__Case=Nom|Degree=Sup|NumType=Card|PronType=Ind'),
            ('inwoners', 'N|soort|mv|neut__Number=Plur'),
            ('van', 'Prep|voor__AdpType=Prep'),
            ('Nederland', 'N|eigen|ev|neut__Number=Sing'),
            (',', 'Punc|komma__PunctType=Comm'),
            ('België', 'N|eigen|ev|neut__Number=Sing'),
            ('en', 'Conj|neven___'),
            ('Suriname', 'N|eigen|ev|neut__Number=Sing'),
            ('.', 'Punc|punt__PunctType=Peri')
        ]
        assert tokens_tagged_universal == [
            ('Het', 'DET'), ('Nederlands', 'ADJ'), ('is', 'VERB'),
            ('een', 'DET'), ('West-Germaanse', 'ADJ'), ('taal', 'NOUN'),
            ('en', 'CONJ'), ('de', 'DET'), ('moedertaal', 'NOUN'),
            ('van', 'ADP'), ('de', 'DET'), ('meeste', 'NUM'),
            ('inwoners', 'NOUN'), ('van', 'ADP'), ('Nederland', 'NOUN'),
            (',', 'PUNCT'), ('België', 'NOUN'), ('en', 'CONJ'),
            ('Suriname', 'NOUN'), ('.', 'PUNCT')
        ]
    elif lang == 'eng':
        if pos_tagger == 'NLTK - Perceptron POS Tagger':
            assert tokens_tagged == [('English', 'NNP'), ('is', 'VBZ'),
                                     ('a', 'DT'), ('West', 'NNP'),
                                     ('Germanic', 'NNP'), ('language', 'NN'),
                                     ('that', 'WDT'), ('was', 'VBD'),
                                     ('first', 'RB'), ('spoken', 'VBN'),
                                     ('in', 'IN'), ('early', 'JJ'),
                                     ('medieval', 'NN'), ('England', 'NNP'),
                                     ('and', 'CC'), ('eventually', 'RB'),
                                     ('became', 'VBD'), ('a', 'DT'),
                                     ('global', 'JJ'), ('lingua', 'NN'),
                                     ('franca.[4][5', 'NN'), (']', 'NN')]
            assert tokens_tagged_universal == [('English', 'PROPN'),
                                               ('is', 'VERB'), ('a', 'DET'),
                                               ('West', 'PROPN'),
                                               ('Germanic', 'PROPN'),
                                               ('language', 'NOUN'),
                                               ('that', 'DET'), ('was',
                                                                 'VERB'),
                                               ('first', 'ADV'),
                                               ('spoken', 'VERB'),
                                               ('in', 'ADP/SCONJ'),
                                               ('early', 'ADJ'),
                                               ('medieval', 'NOUN'),
                                               ('England', 'PROPN'),
                                               ('and', 'CCONJ'),
                                               ('eventually', 'ADV'),
                                               ('became', 'VERB'),
                                               ('a', 'DET'), ('global', 'ADJ'),
                                               ('lingua', 'NOUN'),
                                               ('franca.[4][5', 'NOUN'),
                                               (']', 'NOUN')]
        elif pos_tagger == 'spaCy - English POS Tagger':
            assert tokens_tagged == [('English', 'NNP'), ('is', 'VBZ'),
                                     ('a', 'DT'), ('West', 'NNP'),
                                     ('Germanic', 'JJ'), ('language', 'NN'),
                                     ('that', 'WDT'), ('was', 'VBD'),
                                     ('first', 'RB'), ('spoken', 'VBN'),
                                     ('in', 'IN'), ('early', 'JJ'),
                                     ('medieval', 'NN'), ('England', 'NNP'),
                                     ('and', 'CC'), ('eventually', 'RB'),
                                     ('became', 'VBD'), ('a', 'DT'),
                                     ('global', 'JJ'), ('lingua', 'NN'),
                                     ('franca.[4][5', 'NN'), (']', '-RRB-')]
            assert tokens_tagged_universal == [('English', 'PROPN'),
                                               ('is', 'VERB'), ('a', 'DET'),
                                               ('West', 'PROPN'),
                                               ('Germanic', 'ADJ'),
                                               ('language', 'NOUN'),
                                               ('that', 'DET'), ('was',
                                                                 'VERB'),
                                               ('first', 'ADV'),
                                               ('spoken', 'VERB'),
                                               ('in', 'ADP/SCONJ'),
                                               ('early', 'ADJ'),
                                               ('medieval', 'NOUN'),
                                               ('England', 'PROPN'),
                                               ('and', 'CCONJ'),
                                               ('eventually', 'ADV'),
                                               ('became', 'VERB'),
                                               ('a', 'DET'), ('global', 'ADJ'),
                                               ('lingua', 'NOUN'),
                                               ('franca.[4][5', 'NOUN'),
                                               (']', 'PUNCT')]
    elif lang == 'fra':
        assert tokens_tagged == [
            ('Le', 'DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('français', 'NOUN__Gender=Masc|Number=Sing'),
            ('est',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('une', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('langue', 'NOUN__Gender=Fem|Number=Sing'),
            ('indo-européenne', 'ADJ___'), ('de', 'ADP___'),
            ('la', 'DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art'),
            ('famille', 'NOUN__Gender=Fem|Number=Sing'),
            ('des', 'DET__Definite=Ind|Number=Plur|PronType=Art'),
            ('langues', 'ADJ__Number=Plur'),
            ('romanes', 'ADJ__Gender=Fem|Number=Plur'), ('.', 'PUNCT___')
        ]
        assert tokens_tagged_universal == [('Le', 'DET'), ('français', 'NOUN'),
                                           ('est', 'AUX'), ('une', 'DET'),
                                           ('langue', 'NOUN'),
                                           ('indo-européenne', 'ADJ'),
                                           ('de', 'ADP'), ('la', 'DET'),
                                           ('famille', 'NOUN'), ('des', 'DET'),
                                           ('langues', 'ADJ'),
                                           ('romanes', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'deu':
        assert tokens_tagged == [('Die', 'ART'), ('deutsche', 'ADJA'),
                                 ('Sprache', 'NN'), ('bzw.', 'VVFIN'),
                                 ('Deutsch', 'NN'), ('(', '$('), ('[', 'NN'),
                                 ('dɔʏ̯t͡ʃ', 'NE'), (']', 'PTKVZ'),
                                 (';', '$.'), ('abgekürzt', 'VVFIN'),
                                 ('dt', 'XY'), ('.', '$.'), ('oder', 'KON'),
                                 ('dtsch', 'ADJD'), ('.', '$.'), (')', '$('),
                                 ('ist', 'VAFIN'), ('eine', 'ART'),
                                 ('westgermanische', 'ADJA'),
                                 ('Sprache', 'NN'), ('.', '$.')]
        assert tokens_tagged_universal == [('Die', 'DET'), ('deutsche', 'ADJ'),
                                           ('Sprache', 'NOUN'),
                                           ('bzw.', 'VERB'),
                                           ('Deutsch', 'NOUN'), ('(', 'PUNCT'),
                                           ('[', 'NOUN'), ('dɔʏ̯t͡ʃ', 'PROPN'),
                                           (']', 'PART'), (';', 'PUNCT'),
                                           ('abgekürzt', 'VERB'), ('dt', 'X'),
                                           ('.', 'PUNCT'), ('oder', 'CCONJ'),
                                           ('dtsch', 'ADJ'), ('.', 'PUNCT'),
                                           (')', 'PUNCT'), ('ist', 'AUX'),
                                           ('eine', 'DET'),
                                           ('westgermanische', 'ADJ'),
                                           ('Sprache', 'NOUN'), ('.', 'PUNCT')]
    elif lang == 'ell':
        assert tokens_tagged == [('Η', 'DET'), ('ελληνική', 'ADJ'),
                                 ('γλώσσα', 'NOUN'), ('ανήκει', 'VERB'),
                                 ('στην', 'ADJ'), ('ινδοευρωπαϊκή', 'ADJ'),
                                 ('οικογένεια[9', 'NOUN'), (']', 'NUM'),
                                 ('και', 'CCONJ'), ('συγκεκριμένα', 'ADV'),
                                 ('στον', 'ADV'), ('ελληνικό', 'ADJ'),
                                 ('κλάδο', 'NOUN'), (',', 'PUNCT'),
                                 ('μαζί', 'ADV'), ('με', 'ADP'),
                                 ('την', 'DET'), ('τσακωνική', 'ADJ'),
                                 (',', 'PUNCT'), ('ενώ', 'SCONJ'),
                                 ('είναι', 'AUX'), ('η', 'DET'),
                                 ('επίσημη', 'ADJ'), ('γλώσσα', 'NOUN'),
                                 ('της', 'DET'), ('Ελλάδος', 'PROPN'),
                                 ('και', 'CCONJ'), ('της', 'DET'),
                                 ('Κύπρου', 'PROPN'), ('.', 'PUNCT')]
        assert tokens_tagged_universal == [('Η', 'DET'), ('ελληνική', 'ADJ'),
                                           ('γλώσσα', 'NOUN'),
                                           ('ανήκει', 'VERB'), ('στην', 'ADJ'),
                                           ('ινδοευρωπαϊκή', 'ADJ'),
                                           ('οικογένεια[9', 'NOUN'),
                                           (']', 'NUM'), ('και', 'CCONJ'),
                                           ('συγκεκριμένα', 'ADV'),
                                           ('στον', 'ADV'),
                                           ('ελληνικό', 'ADJ'),
                                           ('κλάδο', 'NOUN'), (',', 'PUNCT'),
                                           ('μαζί', 'ADV'), ('με', 'ADP'),
                                           ('την', 'DET'), ('τσακωνική',
                                                            'ADJ'),
                                           (',', 'PUNCT'), ('ενώ', 'SCONJ'),
                                           ('είναι', 'AUX'), ('η', 'DET'),
                                           ('επίσημη', 'ADJ'),
                                           ('γλώσσα', 'NOUN'), ('της', 'DET'),
                                           ('Ελλάδος', 'PROPN'),
                                           ('και', 'CCONJ'), ('της', 'DET'),
                                           ('Κύπρου', 'PROPN'), ('.', 'PUNCT')]
    elif lang == 'ita':
        assert tokens_tagged == [
            ("L'", 'RD__Definite=Def|Number=Sing|PronType=Art'),
            ('italiano', 'S__Gender=Masc|Number=Sing'), ('(', 'FB___'),
            ('[', 'FB___'), ('itaˈljaːno][Nota', 'A__Gender=Fem|Number=Sing'),
            ('1', 'N__NumType=Card'), (']', 'FB___'),
            ('ascolta[?·info', 'SP___'), (']', 'FB___'), (')', 'FB___'),
            ('è', 'V__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('una', 'RI__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('lingua', 'S__Gender=Fem|Number=Sing'),
            ('romanza', 'S__Gender=Fem|Number=Sing'),
            ('parlata', 'V__Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part'),
            ('principalmente', 'B___'), ('in', 'E___'), ('Italia', 'SP___'),
            ('.', 'FS___')
        ]
        assert tokens_tagged_universal == [("L'", 'DET'), ('italiano', 'NOUN'),
                                           ('(', 'PUNCT'), ('[', 'PUNCT'),
                                           ('itaˈljaːno][Nota', 'ADJ'),
                                           ('1', 'NUM'), (']', 'PUNCT'),
                                           ('ascolta[?·info', 'PROPN'),
                                           (']', 'PUNCT'), (')', 'PUNCT'),
                                           ('è', 'VERB'), ('una', 'DET'),
                                           ('lingua', 'NOUN'),
                                           ('romanza', 'NOUN'),
                                           ('parlata', 'VERB'),
                                           ('principalmente', 'ADV'),
                                           ('in', 'ADP'), ('Italia', 'PROPN'),
                                           ('.', 'PUNCT')]
    elif lang == 'jpn':
        assert tokens_tagged == [('日本', '名詞'), ('語', '名詞'), ('(', '補助記号'),
                                 ('にほんご', '名詞'), ('、', '補助記号'), ('にっぽん', '名詞'),
                                 ('ご', '接尾辞'), ('[', '補助記号'), ('注', '名詞'),
                                 ('1', '名詞'), (']', '補助記号'), (')', '補助記号'),
                                 ('は', '助詞'), ('、', '補助記号'), ('主に', '副詞'),
                                 ('日本', '名詞'), ('国', '接尾辞'), ('内', '接尾辞'),
                                 ('や', '助詞'), ('日本', '名詞'), ('人', '接尾辞'),
                                 ('同士', '接尾辞'), ('の', '助詞'), ('間', '名詞'),
                                 ('で', '助詞'), ('使用', '名詞'), ('さ', '動詞'),
                                 ('れ', '助動詞'), ('て', '助詞'), ('いる', '動詞'),
                                 ('言語', '名詞'), ('で', '助動詞'), ('ある', '動詞'),
                                 ('。', '補助記号')]
        assert tokens_tagged_universal == [('日本', 'NOUN'), ('語', 'NOUN'),
                                           ('(', 'PUNCT/SYM'), ('にほんご',
                                                                'NOUN'),
                                           ('、', 'PUNCT/SYM'),
                                           ('にっぽん', 'NOUN'), ('ご', 'PART'),
                                           ('[', 'PUNCT/SYM'), ('注', 'NOUN'),
                                           ('1', 'NOUN'), (']', 'PUNCT/SYM'),
                                           (')', 'PUNCT/SYM'), ('は', 'PART'),
                                           ('、', 'PUNCT/SYM'), ('主に', 'ADV'),
                                           ('日本', 'NOUN'), ('国', 'PART'),
                                           ('内', 'PART'), ('や', 'PART'),
                                           ('日本', 'NOUN'), ('人', 'PART'),
                                           ('同士', 'PART'), ('の', 'PART'),
                                           ('間', 'NOUN'), ('で', 'PART'),
                                           ('使用', 'NOUN'), ('さ', 'VERB'),
                                           ('れ', 'AUX'), ('て', 'PART'),
                                           ('いる', 'VERB'), ('言語', 'NOUN'),
                                           ('で', 'AUX'), ('ある', 'VERB'),
                                           ('。', 'PUNCT/SYM')]
    elif lang == 'lit':
        assert tokens_tagged == [
            ('Lietuvių', 'Ncmpgn-'), ('kalba', 'Ncfsnn-'), ('–', 'Z'),
            ('iš', 'Sgg'), ('baltų', 'Agpmpgy'), ('prokalbės', 'Ncfsgn-'),
            ('kilusi', 'Agpmsin'), ('lietuvių', 'Ncmpgn-'),
            ('tautos', 'Ncfsgn-'), ('kalba', 'Ncfsin-'), (',', 'Z'),
            ('kuri', 'Pgfsnn'), ('Lietuvoje', 'Npfslng'),
            ('yra', 'Vgmp3s--n--ni-'), ('valstybinė', 'Agpfsnn'), (',', 'Z'),
            ('o', 'Cg'), ('Europos', 'Npfsgng'), ('Sąjungoje', 'Npfslng'),
            ('–', 'Z'), ('viena', 'Pgn--n'), ('iš', 'Sgg'),
            ('oficialiųjų', 'Agpmpgy'), ('kalbų', 'Ncmsnn-'), ('.', 'Z')
        ]
        assert tokens_tagged_universal == [
            ('Lietuvių', 'NOUN'), ('kalba', 'NOUN'), ('–', 'PUNCT'),
            ('iš', 'ADP'), ('baltų', 'ADJ'), ('prokalbės', 'NOUN'),
            ('kilusi', 'ADJ'), ('lietuvių', 'NOUN'), ('tautos', 'NOUN'),
            ('kalba', 'NOUN'), (',', 'PUNCT'), ('kuri', 'PRON'),
            ('Lietuvoje', 'PROPN'), ('yra', 'VERB'), ('valstybinė', 'ADJ'),
            (',', 'PUNCT'), ('o', 'CONJ'), ('Europos', 'PROPN'),
            ('Sąjungoje', 'PROPN'), ('–', 'PUNCT'), ('viena', 'PRON'),
            ('iš', 'ADP'), ('oficialiųjų', 'ADJ'), ('kalbų', 'NOUN'),
            ('.', 'PUNCT')
        ]
    elif lang == 'nob':
        assert tokens_tagged == [
            ('Bokmål', 'NOUN__Definite=Ind|Gender=Neut|Number=Sing'),
            ('er', 'VERB__Mood=Ind|Tense=Pres|VerbForm=Fin'),
            ('en', 'DET__Gender=Masc|Number=Sing'),
            ('varietet', 'ADJ__Definite=Ind|Number=Sing'), ('av', 'ADP___'),
            ('norsk', 'ADJ__Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing'),
            ('språk', 'NOUN__Definite=Ind|Gender=Neut|Number=Sing'),
            ('.', 'PUNCT___')
        ]
        assert tokens_tagged_universal == [('Bokmål', 'NOUN'), ('er', 'VERB'),
                                           ('en', 'DET'), ('varietet', 'ADJ'),
                                           ('av', 'ADP'), ('norsk', 'ADJ'),
                                           ('språk', 'NOUN'), ('.', 'PUNCT')]
    elif lang == 'por':
        assert tokens_tagged == [('A', '<artd>|ART|F|S|@>N'),
                                 ('língua', '<np-def>|N|F|S|@SUBJ>'),
                                 ('portuguesa', 'ADJ|F|S|@N<'),
                                 (',', 'PU|@PU'), ('também', 'ADV|@ADVL>'),
                                 ('designada', '<mv>|V|PCP|F|S|@ICL-N<PRED'),
                                 ('português', 'ADJ|F|S|@N<'), (',', 'PU|@PU'),
                                 ('é', '<mv>|V|PR|3S|IND|@FS-STA'),
                                 ('uma', '<arti>|ART|F|S|@>N'),
                                 ('língua', '<np-idf>|N|F|S|@<SC'),
                                 ('românica', 'ADJ|F|S|@N<'),
                                 ('flexiva', 'ADJ|F|S|@N<'),
                                 ('ocidental', 'ADJ|F|S|@N<'),
                                 ('originada', '<mv>|V|PCP|F|S|@ICL-N<'),
                                 ('no', 'PRP|@<OC'),
                                 ('galego-português', '<np-def>|N|M|S|@<ACC'),
                                 ('falado', '<mv>|V|PCP|M|S|@ICL-N<'),
                                 ('no', '<artd>|ART|M|S|@>N'),
                                 ('Reino', 'PROP|M|S|@P<'), ('da', 'PRP|@N<'),
                                 ('Galiza', 'PROPN'),
                                 ('e', '<co-prparg>|KC|@CO'),
                                 ('no', '<cjt>|PRP|@N<'), ('norte', 'N|@P<'),
                                 ('de', 'PRP|@N<'),
                                 ('Portugal', 'PROP|M|S|@P<'), ('.', 'PU|@PU')]
        assert tokens_tagged_universal == [
            ('A', 'DET'), ('língua', 'NOUN'), ('portuguesa', 'ADJ'),
            (',', 'PUNCT'), ('também', 'ADV'), ('designada', 'VERB'),
            ('português', 'ADJ'), (',', 'PUNCT'), ('é', 'VERB'),
            ('uma', 'DET'), ('língua', 'NOUN'), ('românica', 'ADJ'),
            ('flexiva', 'ADJ'), ('ocidental', 'ADJ'), ('originada', 'VERB'),
            ('no', 'ADP'), ('galego-português', 'NOUN'), ('falado', 'VERB'),
            ('no', 'DET'), ('Reino', 'PROPN'), ('da', 'ADP'),
            ('Galiza', 'PROPN'), ('e', 'CCONJ'), ('no', 'ADP'),
            ('norte', 'NOUN'), ('de', 'ADP'), ('Portugal', 'PROPN'),
            ('.', 'PUNCT')
        ]
    elif lang == 'rus':
        if pos_tagger == 'NLTK - Perceptron POS Tagger':
            assert tokens_tagged == [('Ру́сский', 'A=m'), ('язы́к', 'S'),
                                     ('(', 'NONLEX'), ('[', 'NONLEX'),
                                     ('ˈruskʲɪi̯', 'NONLEX'),
                                     ('jɪˈzɨk', 'NONLEX'), (']', 'NONLEX'),
                                     ('Информация', 'S'), ('о', 'PR'),
                                     ('файле', 'S'), ('слушать', 'V'),
                                     (')', 'NONLEX'), ('[', 'NONLEX'),
                                     ('~', 'NONLEX'), ('3', 'NUM=ciph'),
                                     (']', 'NONLEX'), ('[', 'NONLEX'),
                                     ('⇨', 'NONLEX'), (']', 'NONLEX'),
                                     ('—', 'NONLEX'), ('один', 'A-PRO=m'),
                                     ('из', 'PR'),
                                     ('восточнославянских', 'A=pl'),
                                     ('языков', 'S'), (',', 'NONLEX'),
                                     ('национальный', 'A=m'), ('язык', 'S'),
                                     ('русского', 'A=m'), ('народа', 'S'),
                                     ('.', 'NONLEX')]
            assert tokens_tagged_universal == [('Ру́сский', 'ADJ'),
                                               ('язы́к', 'NOUN'),
                                               ('(', 'PUNCT'), ('[', 'PUNCT'),
                                               ('ˈruskʲɪi̯', 'PUNCT'),
                                               ('jɪˈzɨk', 'PUNCT'),
                                               (']', 'PUNCT'),
                                               ('Информация', 'NOUN'),
                                               ('о', 'ADP'), ('файле', 'NOUN'),
                                               ('слушать', 'VERB'),
                                               (')', 'PUNCT'), ('[', 'PUNCT'),
                                               ('~', 'PUNCT'), ('3', 'NUM'),
                                               (']', 'PUNCT'), ('[', 'PUNCT'),
                                               ('⇨', 'PUNCT'), (']', 'PUNCT'),
                                               ('—', 'PUNCT'),
                                               ('один', 'PRON'), ('из', 'ADP'),
                                               ('восточнославянских', 'ADJ'),
                                               ('языков', 'NOUN'),
                                               (',', 'PUNCT'),
                                               ('национальный', 'ADJ'),
                                               ('язык', 'NOUN'),
                                               ('русского', 'ADJ'),
                                               ('народа', 'NOUN'),
                                               ('.', 'PUNCT')]
        elif pos_tagger == 'pymorphy2 - Morphological Analyzer':
            assert tokens_tagged == [
                ('Ру́сский', 'NOUN'), ('язы́к', 'NOUN'), ('(', 'PNCT'),
                ('[', 'PNCT'), ('ˈruskʲɪi̯', 'UNKN'), ('jɪˈzɨk', 'UNKN'),
                (']', 'PNCT'), ('Информация', 'NOUN'), ('о', 'PREP'),
                ('файле', 'NOUN'), ('слушать', 'INFN'), (')', 'PNCT'),
                ('[', 'PNCT'), ('~', 'UNKN'), ('3', 'NUMB'), (']', 'PNCT'),
                ('[', 'PNCT'), ('⇨', 'UNKN'), (']', 'PNCT'), ('—', 'PNCT'),
                ('один', 'ADJF'), ('из', 'PREP'),
                ('восточнославянских', 'ADJF'), ('языков', 'NOUN'),
                (',', 'PNCT'), ('национальный', 'ADJF'), ('язык', 'NOUN'),
                ('русского', 'ADJF'), ('народа', 'NOUN'), ('.', 'PNCT')
            ]
            assert tokens_tagged_universal == [('Ру́сский', 'NOUN'),
                                               ('язы́к', 'NOUN'),
                                               ('(', 'PUNCT'), ('[', 'PUNCT'),
                                               ('ˈruskʲɪi̯', 'SYM/X'),
                                               ('jɪˈzɨk', 'SYM/X'),
                                               (']', 'PUNCT'),
                                               ('Информация', 'NOUN'),
                                               ('о', 'ADP'), ('файле', 'NOUN'),
                                               ('слушать', 'VERB'),
                                               (')', 'PUNCT'), ('[', 'PUNCT'),
                                               ('~', 'SYM/X'), ('3', 'NUM'),
                                               (']', 'PUNCT'), ('[', 'PUNCT'),
                                               ('⇨', 'SYM/X'), (']', 'PUNCT'),
                                               ('—', 'PUNCT'), ('один', 'ADJ'),
                                               ('из', 'ADP'),
                                               ('восточнославянских', 'ADJ'),
                                               ('языков', 'NOUN'),
                                               (',', 'PUNCT'),
                                               ('национальный', 'ADJ'),
                                               ('язык', 'NOUN'),
                                               ('русского', 'ADJ'),
                                               ('народа', 'NOUN'),
                                               ('.', 'PUNCT')]
    elif lang == 'spa':
        assert tokens_tagged == [
            ('El', 'DET__Definite=Def|Gender=Masc|Number=Sing|PronType=Art'),
            ('español', 'NOUN__Gender=Masc|Number=Sing'), ('o', 'CCONJ___'),
            ('castellano', 'NOUN__Gender=Masc|Number=Sing'),
            ('es',
             'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'),
            ('una', 'DET__Definite=Ind|Gender=Fem|Number=Sing|PronType=Art'),
            ('lengua', 'NOUN__Gender=Fem|Number=Sing'),
            ('romance', 'NOUN__Gender=Masc|Number=Sing'),
            ('procedente', 'ADJ__Number=Sing'),
            ('del', 'ADP__AdpType=Preppron|Gender=Masc|Number=Sing'),
            ('latín', 'NOUN__Gender=Masc|Number=Sing'),
            ('hablado', 'ADJ__Gender=Masc|Number=Sing|VerbForm=Part'),
            ('.', 'PUNCT__PunctType=Peri')
        ]
        assert tokens_tagged_universal == [('El', 'DET'), ('español', 'NOUN'),
                                           ('o', 'CCONJ'),
                                           ('castellano', 'NOUN'),
                                           ('es', 'AUX'), ('una', 'DET'),
                                           ('lengua', 'NOUN'),
                                           ('romance', 'NOUN'),
                                           ('procedente', 'ADJ'),
                                           ('del', 'ADP'), ('latín', 'NOUN'),
                                           ('hablado', 'ADJ'), ('.', 'PUNCT')]
    elif lang == 'tha':
        if pos_tagger == 'PyThaiNLP - Perceptron POS Tagger - ORCHID Corpus':
            assert tokens_tagged == [('ภาษาไทย', 'NPRP'), ('หรือ', 'JCRG'),
                                     ('ภาษาไทย', 'NPRP'), ('กลาง', 'VATT'),
                                     ('เป็น', 'VSTA'), ('ภาษาราชการ', 'NCMN'),
                                     ('และ', 'JCRG'),
                                     ('ภาษาประจำชาติ', 'NCMN'),
                                     ('ของ', 'RPRE'), ('ประเทศไทย', 'NPRP')]
            assert tokens_tagged_universal == [('ภาษาไทย', 'PROPN'),
                                               ('หรือ', 'CCONJ'),
                                               ('ภาษาไทย', 'PROPN'),
                                               ('กลาง', 'VERB'),
                                               ('เป็น', 'VERB'),
                                               ('ภาษาราชการ', 'NOUN'),
                                               ('และ', 'CCONJ'),
                                               ('ภาษาประจำชาติ', 'NOUN'),
                                               ('ของ', 'ADP'),
                                               ('ประเทศไทย', 'PROPN')]
        elif pos_tagger == 'PyThaiNLP - Perceptron POS Tagger - PUD Corpus':
            assert tokens_tagged == [('ภาษาไทย', 'NOUN'), ('หรือ', 'CCONJ'),
                                     ('ภาษาไทย', 'NOUN'), ('กลาง', 'NOUN'),
                                     ('เป็น', 'AUX'), ('ภาษาราชการ', 'NOUN'),
                                     ('และ', 'CCONJ'),
                                     ('ภาษาประจำชาติ', 'NOUN'), ('ของ', 'ADP'),
                                     ('ประเทศไทย', 'PROPN')]
            assert tokens_tagged_universal == [('ภาษาไทย', 'NOUN'),
                                               ('หรือ', 'CCONJ'),
                                               ('ภาษาไทย', 'NOUN'),
                                               ('กลาง', 'NOUN'),
                                               ('เป็น', 'AUX'),
                                               ('ภาษาราชการ', 'NOUN'),
                                               ('และ', 'CCONJ'),
                                               ('ภาษาประจำชาติ', 'NOUN'),
                                               ('ของ', 'ADP'),
                                               ('ประเทศไทย', 'PROPN')]
    elif lang == 'bod':
        assert tokens_tagged == [('༄༅། །', 'PUNCT'), ('རྒྱ་གར་', 'PROPN'),
                                 ('སྐད་', 'VERB'), ('དུ', 'ADP'),
                                 ('།', 'PUNCT'), ('བོ་', 'PART'),
                                 ('དྷི་', 'NON_WORD'), ('སཏྭ་', 'NON_WORD'),
                                 ('ཙརྻ་', 'NON_WORD'), ('ཨ་བ་', 'OOV'),
                                 ('ཏ་', 'OTHER'), ('ར', 'ADP'), ('།', 'PUNCT'),
                                 ('བོད་སྐད་', 'PROPN'), ('དུ', 'ADP'),
                                 ('།', 'PUNCT'), ('བྱང་ཆུབ་', 'NOUN'),
                                 ('སེམས་དཔ', 'NOUN'), ('འི་', 'PART'),
                                 ('སྤྱོད་པ་', 'VERB'), ('ལ་', 'ADP'),
                                 ('འཇུག་པ', 'VERB'), ('། །', 'PUNCT'),
                                 ('སངས་རྒྱས་', 'NOUN'), ('དང་', 'ADP'),
                                 ('བྱང་ཆུབ་', 'NOUN'), ('སེམས་དཔའ་', 'NOUN'),
                                 ('ཐམས་ཅད་', 'DET'), ('ལ་', 'ADP'),
                                 ('ཕྱག་', 'NOUN'), ('འཚལ་', 'VERB'),
                                 ('ལོ', 'PART'), ('། །', 'PUNCT'),
                                 ('བདེ་གཤེགས་', 'NOUN'), ('ཆོས་', 'NOUN'),
                                 ('ཀྱི་', 'ADP'), ('སྐུ་', 'NOUN'),
                                 ('མངའ་', 'VERB'), ('སྲས་', 'NOUN'),
                                 ('བཅས་', 'VERB'), ('དང༌', 'ADP'),
                                 ('། །', 'PUNCT'), ('ཕྱག་འོས་', 'OOV'),
                                 ('ཀུན་', 'DET'), ('ལ', 'ADP'),
                                 ('འང་', 'PART'), ('གུས་པ', 'VERB'),
                                 ('ར་', 'PART'), ('ཕྱག་', 'NOUN'),
                                 ('འཚལ་', 'VERB'), ('ཏེ', 'SCONJ'),
                                 ('། །', 'PUNCT'), ('བདེ་གཤེགས་', 'NOUN'),
                                 ('སྲས་', 'NOUN'), ('ཀྱི་', 'ADP'),
                                 ('སྡོམ་', 'NOUN'), ('ལ་', 'ADP'),
                                 ('འཇུག་པ་', 'VERB'), ('ནི', 'PART'),
                                 ('། །', 'PUNCT'), ('ལུང་', 'NOUN'),
                                 ('བཞིན་', 'NOUN'), ('མདོར་བསྡུས་', 'ADJ'),
                                 ('ནས་', 'SCONJ'), ('ནི་', 'PART'),
                                 ('བརྗོད་པ', 'VERB'), ('ར་', 'PART'),
                                 ('བྱ', 'VERB'), ('། །', 'PUNCT')]
        assert tokens_tagged_universal == [
            ('༄༅། །', 'PUNCT'), ('རྒྱ་གར་', 'PROPN'), ('སྐད་', 'VERB'),
            ('དུ', 'ADP'), ('།', 'PUNCT'), ('བོ་', 'PART'), ('དྷི་', 'X'),
            ('སཏྭ་', 'X'), ('ཙརྻ་', 'X'), ('ཨ་བ་', 'X'), ('ཏ་', 'X'),
            ('ར', 'ADP'), ('།', 'PUNCT'), ('བོད་སྐད་', 'PROPN'), ('དུ', 'ADP'),
            ('།', 'PUNCT'), ('བྱང་ཆུབ་', 'NOUN'), ('སེམས་དཔ', 'NOUN'),
            ('འི་', 'PART'), ('སྤྱོད་པ་', 'VERB'), ('ལ་', 'ADP'),
            ('འཇུག་པ', 'VERB'), ('། །', 'PUNCT'), ('སངས་རྒྱས་', 'NOUN'),
            ('དང་', 'ADP'), ('བྱང་ཆུབ་', 'NOUN'), ('སེམས་དཔའ་', 'NOUN'),
            ('ཐམས་ཅད་', 'DET'), ('ལ་', 'ADP'), ('ཕྱག་', 'NOUN'),
            ('འཚལ་', 'VERB'), ('ལོ', 'PART'), ('། །', 'PUNCT'),
            ('བདེ་གཤེགས་', 'NOUN'), ('ཆོས་', 'NOUN'), ('ཀྱི་', 'ADP'),
            ('སྐུ་', 'NOUN'), ('མངའ་', 'VERB'), ('སྲས་', 'NOUN'),
            ('བཅས་', 'VERB'), ('དང༌', 'ADP'), ('། །', 'PUNCT'),
            ('ཕྱག་འོས་', 'X'), ('ཀུན་', 'DET'), ('ལ', 'ADP'), ('འང་', 'PART'),
            ('གུས་པ', 'VERB'), ('ར་', 'PART'), ('ཕྱག་', 'NOUN'),
            ('འཚལ་', 'VERB'), ('ཏེ', 'SCONJ'), ('། །', 'PUNCT'),
            ('བདེ་གཤེགས་', 'NOUN'), ('སྲས་', 'NOUN'), ('ཀྱི་', 'ADP'),
            ('སྡོམ་', 'NOUN'), ('ལ་', 'ADP'), ('འཇུག་པ་', 'VERB'),
            ('ནི', 'PART'), ('། །', 'PUNCT'), ('ལུང་', 'NOUN'),
            ('བཞིན་', 'NOUN'), ('མདོར་བསྡུས་', 'ADJ'), ('ནས་', 'SCONJ'),
            ('ནི་', 'PART'), ('བརྗོད་པ', 'VERB'), ('ར་', 'PART'),
            ('བྱ', 'VERB'), ('། །', 'PUNCT')
        ]
    elif lang == 'ukr':
        assert tokens_tagged == [('Украї́нська', 'ADJF'), ('мо́ва', 'ADJF'),
                                 ('(', 'PNCT'), ('МФА', 'UNKN'), (':', 'PNCT'),
                                 ('[', 'PNCT'), ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'UNKN'),
                                 ('ˈmɔwɑ̽', 'UNKN'), (']', 'PNCT'),
                                 (',', 'PNCT'), ('історичні', 'ADJF'),
                                 ('назви', 'NOUN'), ('—', 'PNCT'),
                                 ('ру́ська', 'ADJF'), (',', 'PNCT'),
                                 ('руси́нська[9][10][11', 'UNKN'),
                                 (']', 'PNCT'), ('[', 'PNCT'), ('*', 'PNCT'),
                                 ('2', 'NUMB'), (']', 'PNCT'), (')', 'PNCT'),
                                 ('—', 'PNCT'), ('національна', 'ADJF'),
                                 ('мова', 'NOUN'), ('українців', 'NOUN'),
                                 ('.', 'PNCT')]
        assert tokens_tagged_universal == [('Украї́нська', 'ADJ'),
                                           ('мо́ва', 'ADJ'), ('(', 'PUNCT'),
                                           ('МФА', 'SYM/X'), (':', 'PUNCT'),
                                           ('[', 'PUNCT'),
                                           ('ukrɑ̽ˈjɪnʲsʲkɑ̽', 'SYM/X'),
                                           ('ˈmɔwɑ̽', 'SYM/X'), (']', 'PUNCT'),
                                           (',', 'PUNCT'),
                                           ('історичні', 'ADJ'),
                                           ('назви', 'NOUN'), ('—', 'PUNCT'),
                                           ('ру́ська', 'ADJ'), (',', 'PUNCT'),
                                           ('руси́нська[9][10][11', 'SYM/X'),
                                           (']', 'PUNCT'), ('[', 'PUNCT'),
                                           ('*', 'PUNCT'), ('2', 'NUM'),
                                           (']', 'PUNCT'), (')', 'PUNCT'),
                                           ('—', 'PUNCT'),
                                           ('національна', 'ADJ'),
                                           ('мова', 'NOUN'),
                                           ('українців', 'NOUN'),
                                           ('.', 'PUNCT')]
    elif lang == 'vie':
        assert tokens_tagged == [('Tiếng', 'N'), ('Việt', 'Np'), (',', 'CH'),
                                 ('còn', 'C'), ('gọi', 'V'), ('tiếng', 'N'),
                                 ('Việt Nam', 'Np'), ('[', 'V'), ('5', 'M'),
                                 (']', 'CH'), (',', 'CH'), ('tiếng Kinh', 'N'),
                                 ('hay', 'C'), ('Việt ngữ', 'V'), (',', 'CH'),
                                 ('là', 'V'), ('ngôn ngữ', 'N'), ('của', 'E'),
                                 ('người', 'Nc'), ('Việt', 'Np'), ('(', 'CH'),
                                 ('dân tộc', 'N'), ('Kinh', 'Np'), (')', 'CH'),
                                 ('và', 'C'), ('là', 'V'), ('ngôn ngữ', 'N'),
                                 ('chính thức', 'A'), ('tại', 'E'),
                                 ('Việt Nam', 'Np'), ('.', 'CH')]
        assert tokens_tagged_universal == [
            ('Tiếng', 'NOUN'), ('Việt', 'PROPN'), (',', 'PUNCT'),
            ('còn', 'CCONJ'), ('gọi', 'VERB'), ('tiếng', 'NOUN'),
            ('Việt Nam', 'PROPN'), ('[', 'VERB'), ('5', 'NUM'), (']', 'PUNCT'),
            (',', 'PUNCT'), ('tiếng Kinh', 'NOUN'), ('hay', 'CCONJ'),
            ('Việt ngữ', 'VERB'), (',', 'PUNCT'), ('là', 'VERB'),
            ('ngôn ngữ', 'NOUN'), ('của', 'ADP'), ('người', 'NOUN'),
            ('Việt', 'PROPN'), ('(', 'PUNCT'), ('dân tộc', 'NOUN'),
            ('Kinh', 'PROPN'), (')', 'PUNCT'), ('và', 'CCONJ'), ('là', 'VERB'),
            ('ngôn ngữ', 'NOUN'), ('chính thức', 'ADJ'), ('tại', 'ADP'),
            ('Việt Nam', 'PROPN'), ('.', 'PUNCT')
        ]
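The assertions above check one (lang, pos_tagger) pair per call. A minimal driver sketch, not part of the original test module, assuming the function these assertions belong to is named test_pos_tag, that main is the module-level Wordless instance used by the other snippets, and that main.settings_global['pos_taggers'] maps each language code to its available POS taggers:

def run_all_pos_tag_tests():
    # Hypothetical helper: exercise every supported (language, POS tagger) pair.
    # Assumes main and test_pos_tag are defined at module level.
    for lang, pos_taggers in main.settings_global['pos_taggers'].items():
        for pos_tagger in pos_taggers:
            test_pos_tag(lang, pos_tagger)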
def test_lemmatize(lang, lemmatizer, show_results=False):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    tokens = wordless_text_processing.wordless_word_tokenize(
        main,
        text=getattr(wordless_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang)

    lemmas = wordless_text_processing.wordless_lemmatize(main,
                                                         tokens=tokens,
                                                         lang=lang,
                                                         lemmatizer=lemmatizer)

    if show_results:
        print(lemmas)

    if lang == 'ast':
        assert lemmas == [
            "L'asturianu", 'ser', 'unu', 'llingua', 'romance', 'propiu',
            "d'Asturies,[1", ']', 'perteneciente', 'al', 'subgrupu',
            'asturllionés', '.'
        ]
    elif lang == 'bul':
        assert lemmas == [
            'Бъ̀лгарският', 'езѝк', 'съм', 'индоевропейски', 'език', 'от',
            'група', 'на', 'южнославянските', 'език', '.'
        ]
    elif lang == 'cat':
        assert lemmas == [
            'El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya',
            ',', 'a', 'ell', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a',
            'ell', 'ciutat', 'de', 'ell', 'Alguer', 'i', 'tradicional', 'a',
            'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació',
            'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al',
            'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlar', 'a',
            'Catalunya', ',', 'ell', 'País', 'Valencià', '(', 'treure', 'de',
            'algun', 'comarca', 'i', 'localitat', 'de', 'ell', 'interior', ')',
            ',', 'ell', 'Illes', 'Balears', ',', 'Andorra', ',', 'ell',
            'Franja', 'de', 'Ponent', '(', 'a', 'ell', 'Aragó', ')', ',',
            'ell', 'ciutat', 'de', 'ell', 'Alguer', '(', 'a', 'ell', 'illa',
            'de', 'Sardenya', ')', ',', 'ell', 'Catalunya', 'del', 'Nord,[8',
            ']', 'ell', 'Carxe', '(', 'un', 'petit', 'territori', 'de',
            'Múrcia', 'poblar', 'per', 'immigrar', 'valencians),[9][10', ']',
            'i', 'en', 'petita', 'comunitat', 'arreu', 'del', 'món', '(',
            'entrar', 'ell', 'qual', 'destacar', 'ell', 'de', 'ell',
            'Argentina', ',', 'amb', '195.000', 'parlants).[11', ']'
        ]
    elif lang == 'ces':
        assert lemmas == [
            'Čeština', 'neboli', 'český', 'jazyk', 'on', 'západoslovanský',
            'jazyk', ',', 'blízký', 'slovenštině', ',', 'poté', 'lužické',
            'srbštině', 'a', 'polštině', '.'
        ]
    elif lang == 'nld':
        assert lemmas == [
            'het', 'nederlands', 'zijn', 'een', 'west-germaans', 'taal', 'en',
            'de', 'moedertaal', 'van', 'de', 'veel', 'inwoner', 'van',
            'nederland', ',', 'belgië', 'en', 'suriname', '.'
        ]
    elif lang == 'eng':
        if lemmatizer == 'Lemmatization Lists - English Lemma List':
            assert lemmas == [
                'English', 'be', 'a', 'West', 'Germanic', 'language', 'that',
                'be', '1', 'speak', 'in', 'early', 'medieval', 'England',
                'and', 'eventually', 'become', 'a', 'global', 'lingua',
                'franca.[4][5', ']'
            ]
        elif lemmatizer == 'NLTK - WordNet Lemmatizer':
            assert lemmas == [
                'English', 'be', 'a', 'West', 'Germanic', 'language', 'that',
                'be', 'first', 'speak', 'in', 'early', 'medieval', 'England',
                'and', 'eventually', 'become', 'a', 'global', 'lingua',
                'franca.[4][5', ']'
            ]
        elif lemmatizer == 'spaCy - English Lemmatizer':
            assert lemmas == [
                'English', 'be', 'a', 'West', 'germanic', 'language', 'that',
                'be', 'first', 'speak', 'in', 'early', 'medieval', 'England',
                'and', 'eventually', 'become', 'a', 'global', 'lingua',
                'franca.[4][5', ']'
            ]
    elif lang == 'est':
        assert lemmas == [
            'Eesti', 'kee', '(', 'varasem', 'nimetu', ':', 'maakeel', ')',
            'olema', 'läänemeresoome', 'lõunarühma', 'kuuluma', 'kee', '.'
        ]
    elif lang == 'fra':
        if lemmatizer == 'Lemmatization Lists - French Lemma List':
            assert lemmas == [
                'Le', 'français', 'être', 'un', 'langue', 'indo-européen',
                'de', 'le', 'famille', 'un', 'langue', 'roman', '.'
            ]
        elif lemmatizer == 'spaCy - French Lemmatizer':
            assert lemmas == [
                'le', 'français', 'être', 'un', 'langue', 'indo-européen',
                'de', 'le', 'famille', 'un', 'langue', 'roman', '.'
            ]
    elif lang == 'glg':
        assert lemmas == [
            'O', 'galego', '(', '[', 'ɡaˈleɣo̝', ']', ')', 'ser', 'un',
            'lingua', 'indoeuropeo', 'que', 'pertencer', 'á', 'póla', 'de',
            'lingua', 'románico', '.'
        ]
    elif lang == 'deu':
        if lemmatizer == 'Lemmatization Lists - German Lemma List':
            assert lemmas == [
                'Die', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[',
                'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch',
                '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.'
            ]
        elif lemmatizer == 'spaCy - German Lemmatizer':
            assert lemmas == [
                'der', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[',
                'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch',
                '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.'
            ]
    elif lang == 'grc':
        assert lemmas == [
            'Με', 'τον', 'όρο', 'αρχαία', 'ελληνική', 'γλώσσα', 'εννοείται',
            'μια', 'μορφή', 'της', 'ελληνικής', 'γλώσσας', ',', 'πού',
            'ομιλούνταν', 'κατά', 'τους', 'αρχαϊκούς', 'χρόνους', 'και', 'την',
            'κλασική', 'αρχαιότητα', '.'
        ]
    elif lang == 'ell':
        assert lemmas == [
            'η', 'ελληνικός', 'γλώσσα', 'ανήκω', 'στην', 'ινδοευρωπαϊκός',
            'οικογένεια[9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό',
            'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνικός', ',', 'ενώ',
            'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της',
            'Κύπρου', '.'
        ]
    elif lang == 'hun':
        assert lemmas == [
            'A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',',
            'a', 'finnugor', 'nyelv', 'köz', 'tartozó', 'ugor', 'nyelv',
            'egyik', '.'
        ]
    elif lang == 'gle':
        assert lemmas == [
            'Is', 'ceann', 'de', 'na', 'teangach', 'Ceilteacha', 'í', 'an',
            'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a',
            'tabhair', 'ar', 'corruair', ')', ',', 'agus', 'ceann', 'den',
            'trí', 'ceann', 'de', 'teangach', 'Ceilteacha', 'air', 'a',
            'tabhair', 'na', 'teangach', 'Gaelacha', '(', '.i.', 'an',
            'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge',
            'Mhanann', ')', 'go', 'áirithe', '.'
        ]
    elif lang == 'ita':
        assert lemmas == [
            "L'", 'italiano', '(', '[', 'itaˈljaːno][Nota', '1', ']',
            'ascolta[?·info', ']', ')', 'essere', 'una', 'lingua', 'romanzo',
            'parlato', 'principalmente', 'in', 'Italia', '.'
        ]
    elif lang == 'lit':
        assert lemmas == [
            'lietuvė', 'kalbėti', '–', 'ižti', 'baltas', 'prokalbės', 'kilęs',
            'lietuvė', 'tauta', 'kalbėti', ',', '-PRON-', 'Lietuvoje', 'būti',
            'valstybinis', ',', 'o', 'Europos', 'sąjunga', '–', 'viena',
            'ižti', 'oficialus', 'kalbus', '.'
        ]
    elif lang == 'glv':
        assert lemmas == [
            'She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey',
            'Gaelagh', 'Mannin', '.'
        ]
    elif lang == 'nob':
        assert lemmas == [
            'bokmål', 'være', 'en', 'varietet', 'av', 'norsk', 'språk', '.'
        ]
    elif lang == 'fas':
        assert lemmas == [
            'فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای',
            'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب',
            'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳',
            ']', 'تاجیکستان[۴', ']', 'را', 'ازبکستان[۵', ']', 'به', 'آن',
            'سخن', 'می\u200cگویند', '.'
        ]
    elif lang == 'por':
        assert lemmas == [
            'A', 'língua', 'portuguesar', ',', 'também', 'designar',
            'português', ',', 'ser', 'umar', 'língua', 'românico', 'flexivo',
            'ocidental', 'originar', 'o', 'galego-português', 'falar', 'o',
            'Reino', 'da', 'Galiza', 'e', 'o', 'norte', 'de', 'Portugal', '.'
        ]
    elif lang == 'ron':
        assert lemmas == [
            'Limba', 'român', 'fi', 'vrea', 'limbă', 'indo', '-', 'european',
            ',', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental',
            'al', 'limbă', 'romanice', '.'
        ]
    elif lang == 'rus':
        assert lemmas == [
            'ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']',
            'информация', 'о', 'файл', 'слушать', ')', '[', '~', '3', ']', '[',
            '⇨', ']', '—', 'один', 'из', 'восточнославянский', 'язык', ',',
            'национальный', 'язык', 'русский', 'народ', '.'
        ]
    elif lang == 'gla':
        assert lemmas == [
            "'S", 'i', 'cànan', 'dùthchasach', 'na', 'h', '-', 'Alba', 'a',
            'th', "'", 'anns', 'a', "'", 'Ghàidhlig', '.'
        ]
    elif lang == 'slk':
        assert lemmas == [
            'Slovenčina', 'patriť', 'do', 'skupina', 'západoslovanský',
            'jazyk', '(', 'spolu', 's', 'čeština', ',', 'poľština', ',',
            'horný', 'as', 'dolný', 'lužickou', 'srbčina', 'as', 'kašubčinou',
            ')', '.'
        ]
    elif lang == 'slv':
        assert lemmas == [
            'Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina',
            ']', 'onbiti', 'združen', 'naziv', 'za', 'uraden', 'knjižen',
            'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in',
            'govoriti', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on',
            'nekoč', 'govoriti', 'Slovenec', '.'
        ]
    elif lang == 'spa':
        assert lemmas == [
            'El', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua',
            'romance', 'procedente', 'del', 'latín', 'hablar', '.'
        ]
    elif lang == 'swe':
        assert lemmas == [
            'Svenska', '(', 'svensk', '(', 'info', ')', ')', 'vara', 'en',
            'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio',
            'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk',
            'hare', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',',
            'mena', 'även', 'som', 'en', 'en', 'nationalspråk', 'i', 'Finland',
            'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.'
        ]
    elif lang == 'bod':
        assert lemmas == [
            '༄༅། ། ', 'རྒྱ་གར་', 'སྐད་', 'དུ་', ' ། ', 'བོ་', ' དྷི་', ' སཏྭ་',
            ' ཙརྻ་', 'ཨ་བ་', 'ཏ་', 'ར་', ' ། ', 'བོད་སྐད་', 'དུ་', ' ། ',
            'བྱང་ཆུབ་', 'སེམས་དཔའ་', 'གི་', 'སྤྱོད་པ་', 'ལ་', 'འཇུག་པ་',
            ' ། ། ', 'སངས་རྒྱས་', 'དང་', 'བྱང་ཆུབ་', 'སེམས་དཔའ་', 'ཐམས་ཅད་',
            'ལ་', 'ཕྱག་', 'འཚལ་', 'ལོ་', ' ། ། ', 'བདེ་གཤེགས་', 'ཆོ་', 'ཀྱི་',
            'སྐུ་', 'མངའ་', 'སྲ་', 'བཅའ་', 'དང་', ' ། ། ', 'ཕྱག་འོས་', 'ཀུན་',
            'ལ་', 'ཀྱང་', 'གུས་པ་', 'ལ་', 'ཕྱག་', 'འཚལ་', 'ཏེ་', ' ། ། ',
            'བདེ་གཤེགས་', 'སྲ་', 'ཀྱི་', 'སྡོམ་', 'ལ་', 'འཇུག་པ་', 'ནི་',
            ' ། ། ', 'ལུང་', 'བཞིན་', 'མདོར་བསྡུས་', 'ན་', 'ནི་', 'བརྗོད་པ་',
            'ལ་', 'བྱ་', ' ། །'
        ]
    elif lang == 'ukr':
        if lemmatizer == 'Lemmatization Lists - Ukrainian Lemma List':
            assert lemmas == [
                'Украї́нська', 'мо́ва', '(', 'МФА', ':', '[',
                'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назвати',
                '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*',
                '2', ']', ')', '—', 'національний', 'мова', 'українець', '.'
            ]
        elif lemmatizer == 'pymorphy2 - Morphological Analyzer':
            assert lemmas == [
                'украї́нський', 'мо́вий', '(', 'мфа', ':', '[',
                'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва',
                '—', 'ру́ський', ',', 'руси́нська[9][10][11', ']', '[', '*',
                '2', ']', ')', '—', 'національний', 'мова', 'українець', '.'
            ]
    elif lang == 'cym':
        assert lemmas == [
            'Aelod', "o'r", 'cangen', 'Frythonaidd', "o'r", 'iaith',
            'Celtaidd', 'a', 'siarad', 'bod', 'brodorol', 'yn', 'Nghymru', ',',
            'can', 'Gymry', 'a', 'pobl', 'arall', 'aredig', 'gwasgar', 'bod',
            'Lloegr', ',', 'a', 'can', 'cymuno', 'bechan', 'bod', 'Y',
            'Wladfa', ',', 'gwybod', 'Ariannin[7', ']', "yw'r", 'Gymraeg', '(',
            'hefyd', 'Cymraeg', 'heb', 'yr', 'bannod', ')', '.'
        ]
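The same pattern works for the lemmatization assertions above, again as a sketch only, assuming main.settings_global['lemmatizers'] maps each language code to its available lemmatizers:

def run_all_lemmatize_tests():
    # Hypothetical helper: run the per-language lemma assertions for every lemmatizer.
    # Pass show_results=True to test_lemmatize to print the lemmas when debugging a failing language.
    for lang, lemmatizers in main.settings_global['lemmatizers'].items():
        for lemmatizer in lemmatizers:
            test_lemmatize(lang, lemmatizer)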
Beispiel #10
0
    def __init__(self, main, file, tokens_only = True):
        self.main = main
        self.lang = file['lang']
        self.text_type = file['text_type']

        self.para_offsets = []
        self.sentence_offsets = []
        self.tokens_sentences_paras = []
        self.tokens = []

        # Tag lists kept parallel to self.tokens (one list of tags per token)
        self.tags_all = []
        self.tags_pos = []
        self.tags_non_pos = []

        # Patterns for locating tags embedded in the raw text (all tags / POS tags only / non-POS tags only)
        re_tags_all = wordless_matching.get_re_tags(main, tags = 'all')
        re_tags_pos = wordless_matching.get_re_tags(main, tags = 'pos')
        re_tags_non_pos = wordless_matching.get_re_tags(main, tags = 'non_pos')

        # Sentence boundaries are only needed when more than a flat token list is requested
        keep_sentences = not tokens_only

        with open(file['path'], 'r', encoding = file['encoding']) as f:
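            # The file's text_type is a (tokenization, tagging) pair;
            # each combination below fills tokens_sentences_paras and the tag lists in its own way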
            # Untokenized / Untagged
            if self.text_type == ('untokenized', 'untagged'):
                for line in f:
                    text = line.rstrip()

                    if text:
                        tokens_sentences = wordless_text_processing.wordless_word_tokenize(main, text,
                                                                                           lang = self.lang,
                                                                                           keep_sentences = keep_sentences)

                        self.tokens_sentences_paras.append(tokens_sentences)

            # Untokenized / Tagged (Non-POS)
            elif self.text_type == ('untokenized', 'tagged_non_pos'):
                for i, line in enumerate(f):
                    text = line.rstrip()

                    if text:
                        # Replace all tags with a space to ensure no words run together
                        text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                        text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                        tokens_sentences = wordless_text_processing.wordless_word_tokenize(main, text_no_tags,
                                                                                           lang = self.lang,
                                                                                           keep_sentences = keep_sentences)

                        self.tokens_sentences_paras.append(tokens_sentences)

                        # Extract tags
                        for tag in re.findall(re_tags_non_pos, text):
                            i_tag = text.index(tag)

                            if i == 0 and i_tag == 0 and not self.tags_non_pos:
                                self.tokens_sentences_paras[0][0].insert(0, '')

                                self.tags_non_pos.append([tag])
                            else:
                                self.tokenize_text(text[:i_tag])

                                self.tags_non_pos[-1].append(tag)

                            text = text[i_tag + len(tag):]

                        # The last part of the text
                        if text:
                            self.tokenize_text(text)
            # Tokenized / Untagged
            elif self.text_type == ('tokenized', 'untagged'):
                if tokens_only:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([text.split()])
                else:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([])

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text,
                                                                                                lang = self.lang):
                                self.tokens_sentences_paras[-1].append(sentence.split())
            # Tokenized / Tagged (POS)
            elif self.text_type == ('tokenized', 'tagged_pos'):
                if tokens_only:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_sentences_paras.append([text_no_tags.split()])

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                if i == 0 and i_tag == 0 and not self.tags_pos:
                                    self.tokens_sentences_paras[0][0].insert(0, '')

                                    self.tags_pos.append([tag])
                                else:
                                    self.split_text(text[:i_tag])

                                    self.tags_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([])

                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            sentences = wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                            lang = self.lang)

                            for sentence in sentences:
                                self.tokens_sentences_paras[-1].append(sentence.split())

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                if i == 0 and i_tag == 0 and not self.tags_pos:
                                    self.tokens_sentences_paras[0][0].insert(0, '')

                                    self.tags_pos.append([tag])
                                else:
                                    self.split_text(text[:i_tag])

                                    self.tags_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Non-POS)
            elif self.text_type == ('tokenized', 'tagged_non_pos'):
                if tokens_only:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_sentences_paras.append([text_no_tags.split()])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                if i == 0 and i_tag == 0 and not self.tags_non_pos:
                                    self.tokens_sentences_paras[0][0].insert(0, '')

                                    self.tags_non_pos.append([tag])
                                else:
                                    self.split_text(text[:i_tag])

                                    self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([])

                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            sentences = wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                            lang = self.lang)

                            for sentence in sentences:
                                self.tokens_sentences_paras[-1].append(sentence.split())

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                if i == 0 and i_tag == 0 and not self.tags_non_pos:
                                    self.tokens_sentences_paras[0][0].insert(0, '')

                                    self.tags_non_pos.append([tag])
                                else:
                                    self.split_text(text[:i_tag])

                                    self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Both)
            elif self.text_type == ('tokenized', 'tagged_both'):
                if tokens_only:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_sentences_paras.append([text_no_tags.split()])

                            # Extract tags
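                            # For text carrying both POS and non-POS tags, repeatedly consume
                            # whichever tag occurs first in the remaining text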
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(tag_non_pos.group())

                                if (tag_pos and tag_non_pos and i_tag_pos < i_tag_non_pos or
                                    tag_pos and not tag_non_pos):
                                    if i == 0 and i_tag_pos == 0 and not self.tags_all:
                                        self.tokens_sentences_paras[0][0].insert(0, '')

                                        self.tags_all.append([tag_pos.group()])
                                        self.tags_pos.append([tag_pos.group()])
                                        self.tags_non_pos.append([])
                                    else:
                                        self.split_text(text[:i_tag_pos])

                                        self.tags_pos[-1].append(tag_pos.group())
                                        self.tags_all[-1].append(tag_pos.group())

                                    text = text[i_tag_pos + len(tag_pos.group()):]
                                elif (tag_pos and tag_non_pos and i_tag_pos > i_tag_non_pos or
                                      not tag_pos and tag_non_pos):
                                    if i == 0 and i_tag_non_pos == 0 and not self.tags_all:
                                        self.tokens_sentences_paras[0][0].insert(0, '')

                                        self.tags_all.append([tag_non_pos.group()])
                                        self.tags_pos.append([])
                                        self.tags_non_pos.append([tag_non_pos.group()])
                                    else:
                                        self.split_text(text[:i_tag_non_pos])

                                        self.tags_all[-1].append(tag_non_pos.group())
                                        self.tags_non_pos[-1].append(tag_non_pos.group())

                                    text = text[i_tag_non_pos + len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([])

                            # Replace all tags with a space to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            sentences = wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                            lang = self.lang)

                            for sentence in sentences:
                                self.tokens_sentences_paras[-1].append(sentence.split())

                            # Extract tags
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(tag_non_pos.group())

                                if (tag_pos and tag_non_pos and i_tag_pos < i_tag_non_pos or
                                    tag_pos and not tag_non_pos):
                                    if i == 0 and i_tag_pos == 0 and not self.tags_all:
                                        self.tokens_sentences_paras[0][0].insert(0, '')

                                        self.tags_all.append([tag_pos.group()])
                                        self.tags_pos.append([tag_pos.group()])
                                        self.tags_non_pos.append([])
                                    else:
                                        self.split_text(text[:i_tag_pos])

                                        self.tags_all[-1].append(tag_pos.group())
                                        self.tags_pos[-1].append(tag_pos.group())

                                    text = text[i_tag_pos + len(tag_pos.group()):]
                                elif (tag_pos and tag_non_pos and i_tag_pos > i_tag_non_pos or
                                      not tag_pos and tag_non_pos):
                                    if i == 0 and i_tag_non_pos == 0 and not self.tags_all:
                                        self.tokens_sentences_paras[0][0].insert(0, '')

                                        self.tags_all.append([tag_non_pos.group()])
                                        self.tags_pos.append([])
                                        self.tags_non_pos.append([tag_non_pos.group()])
                                    else:
                                        self.split_text(text[:i_tag_non_pos])

                                        self.tags_all[-1].append(tag_non_pos.group())
                                        self.tags_non_pos[-1].append(tag_non_pos.group())

                                    text = text[i_tag_non_pos + len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break

        # Record paragraph and sentence offsets
        for tokens_sentences in self.tokens_sentences_paras:
            self.para_offsets.append(len(self.tokens))

            for tokens in tokens_sentences:
                self.sentence_offsets.append(len(self.tokens))

                self.tokens.extend(tokens)

        # Backfill the tag lists that were not populated for this text type so that
        # tags_all, tags_pos and tags_non_pos all end up with one entry per token
        if self.text_type[1] == 'tagged_pos':
            self.tags_non_pos = [[] for i in range(len(self.tokens))]
            self.tags_all = copy.deepcopy(self.tags_pos)
        elif self.text_type[1] == 'tagged_non_pos':
            self.tags_pos = [[] for i in range(len(self.tokens))]
            self.tags_all = copy.deepcopy(self.tags_non_pos)
        elif self.text_type[1] == 'untagged':
            self.tags_all = [[] for i in range(len(self.tokens))]
            self.tags_pos = [[] for i in range(len(self.tokens))]
            self.tags_non_pos = [[] for i in range(len(self.tokens))]

        # Remove whitespace around all tags
        self.tags_all = [[tag.strip() for tag in tags] for tags in self.tags_all]
        self.tags_pos = [[tag.strip() for tag in tags] for tags in self.tags_pos]
        self.tags_non_pos = [[tag.strip() for tag in tags] for tags in self.tags_non_pos]
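
To show how this constructor is typically driven, here is a hypothetical instantiation. It assumes the enclosing class is named Wordless_Text (the class statement is not part of this snippet) and that main is an initialized Wordless instance; the keys of the file dict are exactly the ones the constructor reads.

# Hypothetical usage sketch (the file path and settings are made up for illustration)
file = {
    'path': 'corpus.txt',
    'encoding': 'utf_8',
    'lang': 'eng',
    'text_type': ('untokenized', 'untagged'),
}

text = Wordless_Text(main, file, tokens_only = True)

# Flat token list plus the recorded paragraph / sentence offsets
print(len(text.tokens), text.para_offsets, text.sentence_offsets)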