Example #1
def detect_text_type(main, file):
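    # Guess the file's text type by scanning it for POS tags and non-POS tags;
    # the number of lines scanned is capped by the auto-detection settings
    # unless the "no limit" option is enabled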
    tagged_pos = False
    tagged_non_pos = False

    try:
        with open(file['path'], 'r', encoding = file['encoding']) as f:
            re_tags_pos = wordless_matching.get_re_tags(main, tags = 'pos')
            re_tags_non_pos = wordless_matching.get_re_tags(main, tags = 'non_pos')

            if main.settings_custom['auto_detection']['detection_settings']['number_lines_no_limit']:
                for line in f:
                    if re.search(re_tags_pos, line):
                        tagged_pos = True

                        break

                f.seek(0)

                for line in f:
                    if re.search(re_tags_non_pos, line):
                        tagged_non_pos = True

                        break
            else:
                for i, line in enumerate(f):
                    if i >= main.settings_custom['auto_detection']['detection_settings']['number_lines']:
                        break

                    if re.search(re_tags_pos, line):
                        tagged_pos = True

                        break

                f.seek(0)

                for i, line in enumerate(f):
                    if i >= main.settings_custom['auto_detection']['detection_settings']['number_lines']:
                        break

                    if re.search(re_tags_non_pos, line):
                        tagged_non_pos = True

                        break

        if tagged_pos and tagged_non_pos:
            text_type = ('tokenized', 'tagged_both')
        elif tagged_pos:
            text_type = ('tokenized', 'tagged_pos')
        elif tagged_non_pos:
            text_type = ('untokenized', 'tagged_non_pos')
        else:
            text_type = ('untokenized', 'untagged')

        success = True
    except:
        text_type = main.settings_custom['auto_detection']['default_settings']['default_text_type']

        success = False

    return text_type, success
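The detection above boils down to two regular-expression scans over the file. Here is a minimal, self-contained sketch of the same idea, with hard-coded stand-ins for the tag patterns that wordless_matching.get_re_tags would normally build from the user's tag settings (the patterns and the num_lines cap below are illustrative assumptions, not the real Wordless defaults):

import re

# Illustrative stand-ins for the configurable tag patterns
RE_TAGS_POS = r'_\S+'         # e.g. "dog_NN"
RE_TAGS_NON_POS = r'<[^>]+>'  # e.g. "<p>", "<head>"

def classify(lines, num_lines=100):
    # Scan at most num_lines lines for each kind of tag
    tagged_pos = any(re.search(RE_TAGS_POS, line) for line in lines[:num_lines])
    tagged_non_pos = any(re.search(RE_TAGS_NON_POS, line) for line in lines[:num_lines])

    if tagged_pos and tagged_non_pos:
        return ('tokenized', 'tagged_both')
    elif tagged_pos:
        return ('tokenized', 'tagged_pos')
    elif tagged_non_pos:
        return ('untokenized', 'tagged_non_pos')
    else:
        return ('untokenized', 'untagged')

print(classify(['The_DT dog_NN barks_VBZ .', '<p>Hello world</p>']))
# ('tokenized', 'tagged_both')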
Example #2
    def __init__(self, main, file, flat_tokens = True):
        self.main = main
        self.lang = file['lang']
        self.text_type = file['text_type']

        self.offsets_paras = []
        self.offsets_sentences = []
        self.offsets_clauses = []

        if flat_tokens:
            self.tokens_hierarchical = [[[[]]]]
        else:
            self.tokens_hierarchical = []

        self.tokens_flat = []

        self.tags_all = []
        self.tags_pos = []
        self.tags_non_pos = []

        re_tags_all = wordless_matching.get_re_tags(main, tags = 'all')
        re_tags_pos = wordless_matching.get_re_tags(main, tags = 'pos')
        re_tags_non_pos = wordless_matching.get_re_tags(main, tags = 'non_pos')

        with open(file['path'], 'r', encoding = file['encoding']) as f:
            # Untokenized / Untagged
            if self.text_type == ('untokenized', 'untagged'):
                if flat_tokens:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            tokens = wordless_text_processing.wordless_word_tokenize(main, text,
                                                                                     lang = self.lang,
                                                                                     flat_tokens = True)

                            self.tokens_hierarchical[0][0][0].extend(tokens)
                else:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            tokens = wordless_text_processing.wordless_word_tokenize(main, text,
                                                                                     lang = self.lang,
                                                                                     flat_tokens = False)

                            self.tokens_hierarchical.append(tokens)

            # Untokenized / Tagged (Non-POS)
            elif self.text_type == ('untokenized', 'tagged_non_pos'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            tokens = wordless_text_processing.wordless_word_tokenize(main, text_no_tags,
                                                                                     lang = self.lang,
                                                                                     flat_tokens = True)

                            self.tokens_hierarchical[0][0][0].extend(tokens)

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            tokens = wordless_text_processing.wordless_word_tokenize(main, text_no_tags,
                                                                                     lang = self.lang,
                                                                                     flat_tokens = False)

                            self.tokens_hierarchical.append(tokens)

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.tokenize_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.tokenize_text(text)
            # Tokenized / Untagged
            elif self.text_type == ('tokenized', 'untagged'):
                if flat_tokens:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical[0][0][0].extend(text.split())
                else:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical.append([])

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text,
                                                                                                lang = self.lang):
                                self.tokens_hierarchical[-1].append([])

                                for clause in wordless_text_processing.wordless_clause_tokenize(main, sentence,
                                                                                                lang = self.lang):
                                    self.tokens_hierarchical[-1][-1].append(clause.split())
            # Tokenized / Tagged (POS)
            elif self.text_type == ('tokenized', 'tagged_pos'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_hierarchical[0][0][0].extend(text_no_tags.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                                lang = self.lang):
                                self.tokens_hierarchical[-1].append([])

                                for clause in wordless_text_processing.wordless_clause_tokenize(main, sentence,
                                                                                                lang = self.lang):
                                    self.tokens_hierarchical[-1][-1].append(clause.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Non-POS)
            elif self.text_type == ('tokenized', 'tagged_non_pos'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_hierarchical[0][0][0].extend(text_no_tags.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                                lang = self.lang):
                                self.tokens_hierarchical[-1].append([])

                                for clause in wordless_text_processing.wordless_clause_tokenize(main, sentence,
                                                                                                lang = self.lang):
                                    self.tokens_hierarchical[-1][-1].append(clause.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and re.match(re_tags_non_pos, text):
                                self.tokens_hierarchical[0][0][0].insert(0, '')
                                self.tags_non_pos.append([])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                self.split_text(text[:i_tag])
                                self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Both)
            elif self.text_type == ('tokenized', 'tagged_both'):
                if flat_tokens:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_hierarchical[0][0][0].extend(text_no_tags.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and (re.match(re_tags_pos, text) or re.match(re_tags_non_pos, text)):
                                self.tokens_hierarchical[0][0][0].insert(0, '')

                                self.tags_all.append([])
                                self.tags_pos.append([])
                                self.tags_non_pos.append([])

                            # Extract tags
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(tag_non_pos.group())

                                if (tag_pos and tag_non_pos and i_tag_pos < i_tag_non_pos or
                                    tag_pos and not tag_non_pos):
                                    self.split_text(text[:i_tag_pos])

                                    self.tags_pos[-1].append(tag_pos.group())
                                    self.tags_all[-1].append(tag_pos.group())

                                    text = text[i_tag_pos + len(tag_pos.group()):]
                                elif (tag_pos and tag_non_pos and i_tag_pos > i_tag_non_pos or
                                      not tag_pos and tag_non_pos):
                                    self.split_text(text[:i_tag_non_pos])

                                    self.tags_all[-1].append(tag_non_pos.group())
                                    self.tags_non_pos[-1].append(tag_non_pos.group())

                                    text = text[i_tag_non_pos + len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_hierarchical.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                                lang = self.lang):
                                self.tokens_hierarchical[-1].append([])

                                for clause in wordless_text_processing.wordless_clause_tokenize(main, sentence,
                                                                                                lang = self.lang):
                                    self.tokens_hierarchical[-1][-1].append(clause.split())

                            # Check if the first token in the text is a tag
                            if i == 0 and (re.match(re_tags_pos, text) or re.match(re_tags_non_pos, text)):
                                self.tokens_hierarchical[0][0][0].insert(0, '')

                                self.tags_all.append([])
                                self.tags_pos.append([])
                                self.tags_non_pos.append([])

                            # Extract tags
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(tag_non_pos.group())

                                if (tag_pos and tag_non_pos and i_tag_pos < i_tag_non_pos or
                                    tag_pos and not tag_non_pos):
                                    self.split_text(text[:i_tag_pos])

                                    self.tags_all[-1].append(tag_pos.group())
                                    self.tags_pos[-1].append(tag_pos.group())

                                    text = text[i_tag_pos + len(tag_pos.group()):]
                                elif (tag_pos and tag_non_pos and i_tag_pos > i_tag_non_pos or
                                      not tag_pos and tag_non_pos):
                                    self.split_text(text[:i_tag_non_pos])

                                    self.tags_all[-1].append(tag_non_pos.group())
                                    self.tags_non_pos[-1].append(tag_non_pos.group())

                                    text = text[i_tag_non_pos + len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break

        # Paragraph, sentence and clause offsets
        for para in self.tokens_hierarchical:
            self.offsets_paras.append(len(self.tokens_flat))

            for sentence in para:
                self.offsets_sentences.append(len(self.tokens_flat))

                for clause in sentence:
                    self.offsets_clauses.append(len(self.tokens_flat))

                    self.tokens_flat.extend(clause)

        # Tags: fill in the categories that were not populated while parsing,
        # so that tags_all, tags_pos and tags_non_pos each hold one (possibly
        # empty) list per token
        if self.text_type[1] == 'tagged_pos':
            self.tags_non_pos = [[] for i in range(len(self.tokens_flat))]
            self.tags_all = copy.deepcopy(self.tags_pos)
        elif self.text_type[1] == 'tagged_non_pos':
            self.tags_pos = [[] for i in range(len(self.tokens_flat))]
            self.tags_all = copy.deepcopy(self.tags_non_pos)
        elif self.text_type[1] == 'untagged':
            self.tags_all = [[] for i in range(len(self.tokens_flat))]
            self.tags_pos = [[] for i in range(len(self.tokens_flat))]
            self.tags_non_pos = [[] for i in range(len(self.tokens_flat))]

        # Remove whitespace around all tags
        self.tags_all = [[tag.strip() for tag in tags] for tags in self.tags_all]
        self.tags_pos = [[tag.strip() for tag in tags] for tags in self.tags_pos]
        self.tags_non_pos = [[tag.strip() for tag in tags] for tags in self.tags_non_pos]
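The offset bookkeeping at the end of __init__ flattens the paragraph > sentence > clause > token hierarchy while recording where each unit starts in the flat token list. The same loop, run on a toy hierarchy in place of tokens_hierarchical, shows the resulting offsets:

# Toy stand-in for tokens_hierarchical: paragraphs > sentences > clauses > tokens
tokens_hierarchical = [
    [[['The', 'dog'], ['barks', '.']]],        # 1 paragraph, 1 sentence, 2 clauses
    [[['It', 'runs', '.']], [['Fast', '.']]],  # 1 paragraph, 2 sentences
]

offsets_paras, offsets_sentences, offsets_clauses, tokens_flat = [], [], [], []

for para in tokens_hierarchical:
    offsets_paras.append(len(tokens_flat))

    for sentence in para:
        offsets_sentences.append(len(tokens_flat))

        for clause in sentence:
            offsets_clauses.append(len(tokens_flat))

            tokens_flat.extend(clause)

print(tokens_flat)        # ['The', 'dog', 'barks', '.', 'It', 'runs', '.', 'Fast', '.']
print(offsets_paras)      # [0, 4]
print(offsets_sentences)  # [0, 4, 7]
print(offsets_clauses)    # [0, 2, 4, 7]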
Example #3
def wordless_lemmatize(main,
                       tokens,
                       lang,
                       text_type=('untokenized', 'untagged'),
                       lemmatizer='default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

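    # Make sure all tokens are plain strings before applying the tag regexes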
    tokens = [str(token) for token in tokens]

    re_tags_all = wordless_matching.get_re_tags(main, tags='all')
    re_tags_pos = wordless_matching.get_re_tags(main, tags='pos')
    re_tags_non_pos = wordless_matching.get_re_tags(main, tags='non_pos')

    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [
            ''.join(re.findall(re_tags_non_pos, token)) for token in tokens
        ]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            tokens.remove(token)

            empty_offsets.append(i)

    wordless_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][
                lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wordless_pos_tag(
                    main,
                    tokens,
                    lang='eng',
                    pos_tagger='NLTK - Perceptron POS Tagger',
                    tagset='universal'):
                if pos == 'ADJ':
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr(
                'lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wordless_misc.get_abs_path(
                    'lemmatization/lemmalist-greek/lemmalist-greek.txt'),
                      'r',
                      encoding='utf_8') as f:
                for line in f.readlines():
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

            for token in tokens:
                lemmas.append(
                    morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('pybo - Tibetan Lemmatizer'):
            word_tokenizer = main.settings_custom['word_tokenization'][
                'word_tokenizers'][lang]

            wordless_text_utils.check_pybo_tokenizers(
                main, word_tokenizer=word_tokenizer)

            if word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (GMD)'):
                tokens = main.pybo_tokenizer_gmd.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (POS)'):
                tokens = main.pybo_tokenizer_pos.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (tsikchen)'):
                tokens = main.pybo_tokenizer_tsikchen.tokenize(
                    ' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wordless_conversion.to_iso_639_1(main, lang)

            with open(wordless_misc.get_abs_path(
                    f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'
            ),
                      'r',
                      encoding='utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    except:
                        pass
    else:
        lemmas = tokens

    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Insert empty lemmas
    for empty_offset in empty_offsets:
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
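The NLTK - WordNet Lemmatizer branch maps universal POS tags onto WordNet word classes before lemmatizing. A standalone sketch of that mapping, using nltk.pos_tag directly instead of the wordless_pos_tag wrapper (assumes the averaged_perceptron_tagger, universal_tagset and wordnet NLTK data are installed; the example output is indicative only):

import nltk

# Map universal POS tags to WordNet word classes; anything else falls back to
# the lemmatizer's default (noun)
WORDNET_POS = {
    'ADJ': nltk.corpus.wordnet.ADJ,
    'NOUN': nltk.corpus.wordnet.NOUN,
    'ADV': nltk.corpus.wordnet.ADV,
    'VERB': nltk.corpus.wordnet.VERB,
}

def lemmatize_eng(tokens):
    lemmatizer = nltk.WordNetLemmatizer()
    lemmas = []

    for token, pos in nltk.pos_tag(tokens, tagset='universal'):
        wordnet_pos = WORDNET_POS.get(pos)

        if wordnet_pos:
            lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
        else:
            lemmas.append(lemmatizer.lemmatize(token))

    return lemmas

print(lemmatize_eng(['The', 'dogs', 'were', 'barking', 'loudly']))
# e.g. ['The', 'dog', 'be', 'bark', 'loudly']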
Example #4
    def __init__(self, main, file, tokens_only = True):
        self.main = main
        self.lang = file['lang']
        self.text_type = file['text_type']

        self.para_offsets = []
        self.sentence_offsets = []
        self.tokens_sentences_paras = []
        self.tokens = []

        self.tags_all = []
        self.tags_pos = []
        self.tags_non_pos = []

        re_tags_all = wordless_matching.get_re_tags(main, tags = 'all')
        re_tags_pos = wordless_matching.get_re_tags(main, tags = 'pos')
        re_tags_non_pos = wordless_matching.get_re_tags(main, tags = 'non_pos')

        if tokens_only:
            keep_sentences = False
        else:
            keep_sentences = True

        with open(file['path'], 'r', encoding = file['encoding']) as f:
            # Untokenized / Untagged
            if self.text_type == ('untokenized', 'untagged'):
                for line in f:
                    text = line.rstrip()

                    if text:
                        tokens_sentences = wordless_text_processing.wordless_word_tokenize(main, text,
                                                                                           lang = self.lang,
                                                                                           keep_sentences = keep_sentences)

                        self.tokens_sentences_paras.append(tokens_sentences)

            # Untokenized / Tagged (Non-POS)
            elif self.text_type == ('untokenized', 'tagged_non_pos'):
                for i, line in enumerate(f):
                    text = line.rstrip()

                    if text:
                        # Replace all tags with a whitespace to ensure no words run together
                        text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                        text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                        tokens_sentences = wordless_text_processing.wordless_word_tokenize(main, text_no_tags,
                                                                                           lang = self.lang,
                                                                                           keep_sentences = keep_sentences)

                        self.tokens_sentences_paras.append(tokens_sentences)

                        # Extract tags
                        for tag in re.findall(re_tags_non_pos, text):
                            i_tag = text.index(tag)

                            if i == 0 and i_tag == 0 and not self.tags_non_pos:
                                self.tokens_sentences_paras[0][0].insert(0, '')

                                self.tags_non_pos.append([tag])
                            else:
                                self.tokenize_text(text[:i_tag])

                                self.tags_non_pos[-1].append(tag)

                            text = text[i_tag + len(tag):]

                        # The last part of the text
                        if text:
                            self.tokenize_text(text)
            # Tokenized / Untagged
            elif self.text_type == ('tokenized', 'untagged'):
                if tokens_only:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([text.split()])
                else:
                    for line in f:
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([])

                            for sentence in wordless_text_processing.wordless_sentence_tokenize(main, text,
                                                                                                lang = self.lang):
                                self.tokens_sentences_paras[-1].append(sentence.split())
            # Tokenized / Tagged (POS)
            elif self.text_type == ('tokenized', 'tagged_pos'):
                if tokens_only:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_sentences_paras.append([text_no_tags.split()])

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                if self.tokens_sentences_paras:
                                    self.split_text(text[:i_tag])

                                    self.tags_pos[-1].append(tag)
                                else:
                                    self.tokens_sentences_paras[0][0].insert(0, '')

                                    self.tags_pos.append([tag])

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            sentences = wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                            lang = self.lang)

                            for sentence in sentences:
                                self.tokens_sentences_paras[-1].append(sentence.split())

                            # Extract tags
                            for tag in re.findall(re_tags_pos, text):
                                i_tag = text.index(tag)

                                if self.tokens_sentences_paras:
                                    self.split_text(text[:i_tag])

                                    self.tags_pos[-1].append(tag)
                                else:
                                    self.tokens_sentences_paras[0][0].insert(0, '')

                                    self.tags_pos.append([tag])

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Non-POS)
            elif self.text_type == ('tokenized', 'tagged_non_pos'):
                if tokens_only:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_sentences_paras.append([text_no_tags.split()])

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                if i == 0 and i_tag == 0 and not self.tags_non_pos:
                                    self.tokens_sentences_paras[0][0].insert(0, '')

                                    self.tags_non_pos.append([tag])
                                else:
                                    self.split_text(text[:i_tag])

                                    self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_non_pos, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            sentences = wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                            lang = self.lang)

                            for sentence in sentences:
                                self.tokens_sentences_paras[-1].append(sentence.split())

                            # Extract tags
                            for tag in re.findall(re_tags_non_pos, text):
                                i_tag = text.index(tag)

                                if i == 0 and i_tag == 0 and not self.tags_non_pos:
                                    self.tokens_sentences_paras[0][0].insert(0, '')

                                    self.tags_non_pos.append([tag])
                                else:
                                    self.split_text(text[:i_tag])

                                    self.tags_non_pos[-1].append(tag)

                                text = text[i_tag + len(tag):]

                            # The last part of the text
                            if text:
                                self.split_text(text)
            # Tokenized / Tagged (Both)
            elif self.text_type == ('tokenized', 'tagged_both'):
                if tokens_only:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            self.tokens_sentences_paras.append([text_no_tags.split()])

                            # Extract tags
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(tag_non_pos.group())

                                if (tag_pos and tag_non_pos and i_tag_pos < i_tag_non_pos or
                                    tag_pos and not tag_non_pos):
                                    if i == 0 and i_tag_pos == 0 and not self.tags_all:
                                        self.tokens_sentences_paras[0][0].insert(0, '')

                                        self.tags_all.append([tag_pos.group()])
                                        self.tags_pos.append([tag_pos.group()])
                                        self.tags_non_pos.append([])
                                    else:
                                        self.split_text(text[:i_tag_pos])

                                        self.tags_pos[-1].append(tag_pos.group())
                                        self.tags_all[-1].append(tag_pos.group())

                                    text = text[i_tag_pos + len(tag_pos.group()):]
                                elif (tag_pos and tag_non_pos and i_tag_pos > i_tag_non_pos or
                                      not tag_pos and tag_non_pos):
                                    if i == 0 and i_tag_non_pos == 0 and not self.tags_all:
                                        self.tokens_sentences_paras[0][0].insert(0, '')

                                        self.tags_all.append([tag_non_pos.group()])
                                        self.tags_pos.append([])
                                        self.tags_non_pos.append([tag_non_pos.group()])
                                    else:
                                        self.split_text(text[:i_tag_non_pos])

                                        self.tags_all[-1].append(tag_non_pos.group())
                                        self.tags_non_pos[-1].append(tag_non_pos.group())

                                    text = text[i_tag_non_pos + len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break
                else:
                    for i, line in enumerate(f):
                        text = line.rstrip()

                        if text:
                            self.tokens_sentences_paras.append([])

                            # Replace all tags with a whitespace to ensure no words run together
                            text_no_tags = re.sub(re_tags_all, ' ', text)
                            text_no_tags = re.sub(r'\s+', ' ', text_no_tags)

                            sentences = wordless_text_processing.wordless_sentence_tokenize(main, text_no_tags,
                                                                                            lang = self.lang)

                            for sentence in sentences:
                                self.tokens_sentences_paras[-1].append(sentence.split())

                            # Extract tags
                            while text:
                                tag_pos = re.search(re_tags_pos, text)
                                tag_non_pos = re.search(re_tags_non_pos, text)

                                if tag_pos:
                                    i_tag_pos = text.index(tag_pos.group())

                                if tag_non_pos:
                                    i_tag_non_pos = text.index(tag_non_pos.group())

                                if (tag_pos and tag_non_pos and i_tag_pos < i_tag_non_pos or
                                    tag_pos and not tag_non_pos):
                                    if i == 0 and i_tag_pos == 0 and not self.tags_all:
                                        self.tokens_sentences_paras[0][0].insert(0, '')

                                        self.tags_all.append([tag_pos.group()])
                                        self.tags_pos.append([tag_pos.group()])
                                        self.tags_non_pos.append([])
                                    else:
                                        self.split_text(text[:i_tag_pos])

                                        self.tags_all[-1].append(tag_pos.group())
                                        self.tags_pos[-1].append(tag_pos.group())

                                    text = text[i_tag_pos + len(tag_pos.group()):]
                                elif (tag_pos and tag_non_pos and i_tag_pos > i_tag_non_pos or
                                      not tag_pos and tag_non_pos):
                                    if i == 0 and i_tag_non_pos == 0 and not self.tags_all:
                                        self.tokens_sentences_paras[0][0].insert(0, '')

                                        self.tags_all.append([tag_non_pos.group()])
                                        self.tags_pos.append([])
                                        self.tags_non_pos.append([tag_non_pos.group()])
                                    else:
                                        self.split_text(text[:i_tag_non_pos])

                                        self.tags_all[-1].append(tag_non_pos.group())
                                        self.tags_non_pos[-1].append(tag_non_pos.group())

                                    text = text[i_tag_non_pos + len(tag_non_pos.group()):]
                                else:
                                    self.split_text(text)

                                    break

        # Record paragraph and sentence offsets
        for tokens_sentences in self.tokens_sentences_paras:
            self.para_offsets.append(len(self.tokens))

            for tokens in tokens_sentences:
                self.sentence_offsets.append(len(self.tokens))

                self.tokens.extend(tokens)

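        # Tags: fill in the categories that were not populated while parsing,
        # so that each holds one (possibly empty) list per token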
        if self.text_type[1] == 'tagged_pos':
            self.tags_non_pos = [[] for i in range(len(self.tokens))]
            self.tags_all = copy.deepcopy(self.tags_pos)
        elif self.text_type[1] == 'tagged_non_pos':
            self.tags_pos = [[] for i in range(len(self.tokens))]
            self.tags_all = copy.deepcopy(self.tags_non_pos)
        elif self.text_type[1] == 'untagged':
            self.tags_all = [[] for i in range(len(self.tokens))]
            self.tags_pos = [[] for i in range(len(self.tokens))]
            self.tags_non_pos = [[] for i in range(len(self.tokens))]

        # Remove whitespace around all tags
        self.tags_all = [[tag.strip() for tag in tags] for tags in self.tags_all]
        self.tags_pos = [[tag.strip() for tag in tags] for tags in self.tags_pos]
        self.tags_non_pos = [[tag.strip() for tag in tags] for tags in self.tags_non_pos]