def testing_to_iso_639_1(lang_code):
    len_iso_639_3 = max(
        len(code) for code in main.settings_global['lang_codes'])
    iso_639_1 = wordless_conversion.to_iso_639_1(main, lang_code)

    print(f'{lang_code:{len_iso_639_3}} -> {iso_639_1}')

    assert iso_639_1 == main.settings_global['lang_codes'][lang_code]
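# A minimal sketch (not part of the original test module) of how the same
# check could be parametrized with pytest so that every language code is
# exercised; it assumes `main` and `wordless_conversion` are set up at module
# level, as in the test above.
import pytest

@pytest.mark.parametrize('lang_code', list(main.settings_global['lang_codes']))
def test_to_iso_639_1_all_codes(lang_code):
    assert (wordless_conversion.to_iso_639_1(main, lang_code) ==
            main.settings_global['lang_codes'][lang_code])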
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in [
            'nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other'
    ]:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(
                    disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(
                    disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(
                    disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(
                    disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(
                    disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(
                    disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(
                    disable=nlp_disable)
            # Other Languages
            elif lang == 'other':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(
                    disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            # spaCy's language code for Serbian is 'sr' ('rs' is the country code)
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
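# Usage sketch for check_spacy_models (illustrative, not from the original
# module). The loaded spaCy 2.x pipeline is cached on the application object
# and can be reused for sentence splitting; `_Main` below is a bare stand-in
# for the Wordless main window, and the en_core_web_sm model package is
# assumed to be installed.
class _Main:
    pass

_main = _Main()

# Load the fallback English model with only a sentencizer in the pipeline
check_spacy_models(_main, lang='other', pipeline='sentence_tokenization')

nlp = _main.__dict__['spacy_nlp_other']
doc = nlp('This is one sentence. This is another.')

print([sentence.text for sentence in doc.sents])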
def wordless_get_stop_words(main, lang, list_stop_words='default'):
    if list_stop_words == 'default':
        list_stop_words = main.settings_custom['stop_words']['stop_words'][
            lang]

    lang_639_1 = wordless_conversion.to_iso_639_1(main, lang)

    # Chinese (Simplified)
    if lang_639_1 == 'zh_cn':
        lang_639_1 = 'zh'

    if 'Stopwords ISO' in list_stop_words:
        # Norwegian Bokmål & Norwegian Nynorsk
        if lang_639_1 in ['nb', 'nn']:
            lang_639_1 = 'no'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_abs_path(
                    'stop_words/Stopwords ISO/stop_words_zh_tw.txt'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            with open(wordless_misc.get_abs_path(
                    'stop_words/Stopwords ISO/stopwords_iso.json'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = json.load(f)[lang_639_1]
    elif 'spaCy' in list_stop_words:
        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_abs_path(
                    'stop_words/spaCy/stop_words_zh_tw.txt'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            spacy_lang = importlib.import_module(f'spacy.lang.{lang_639_1}')

            stop_words = spacy_lang.STOP_WORDS
    elif 'NLTK' in list_stop_words:
        lang_texts = {
            'ara': 'arabic',
            'aze': 'azerbaijani',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            'ell': 'greek',
            'hun': 'hungarian',
            'ind': 'indonesian',
            'ita': 'italian',
            'kaz': 'kazakh',
            'nep': 'nepali',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'por': 'portuguese',
            'ron': 'romanian',
            'rus': 'russian',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish'
        }

        stop_words = nltk.corpus.stopwords.words(lang_texts[lang])
    # Greek (Ancient)
    elif list_stop_words == main.tr(
            'grk-stoplist - Greek (Ancient) Stop Words'):
        with open(wordless_misc.get_abs_path(
                'stop_words/grk-stoplist/stoplist-greek.txt'),
                  'r',
                  encoding='utf_8') as f:
            stop_words = [line.rstrip() for line in f.readlines()]
    # Thai
    elif list_stop_words == main.tr('PyThaiNLP - Thai Stop Words'):
        stop_words = pythainlp.corpus.common.thai_stopwords()
    # Custom Lists
    elif list_stop_words == main.tr('Custom List'):
        stop_words = main.settings_custom['stop_words']['custom_lists'][lang]

    return sorted(stop_words)
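# Standalone illustration of what the NLTK branch above does internally
# (runnable if NLTK's 'stopwords' corpus has been downloaded); in Wordless the
# same list would be obtained via wordless_get_stop_words(main, 'eng', ...).
import nltk

stop_words_eng = set(nltk.corpus.stopwords.words('english'))
tokens = ['The', 'mice', 'ran', 'over', 'the', 'hill']

print([token for token in tokens if token.lower() not in stop_words_eng])
# ['mice', 'ran', 'hill']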
def wordless_lemmatize(main,
                       tokens,
                       lang,
                       text_type=('untokenized', 'untagged'),
                       lemmatizer='default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags_all = wordless_matching.get_re_tags(main, tags='all')
    re_tags_pos = wordless_matching.get_re_tags(main, tags='pos')
    re_tags_non_pos = wordless_matching.get_re_tags(main, tags='non_pos')

    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [
            ''.join(re.findall(re_tags_non_pos, token)) for token in tokens
        ]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            del tokens[i]

            empty_offsets.append(i)

    wordless_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][
                lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wordless_pos_tag(
                    main,
                    tokens,
                    lang='eng',
                    pos_tagger='NLTK - Perceptron POS Tagger',
                    tagset='universal'):
                if pos == 'ADJ':
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr(
                'lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wordless_misc.get_abs_path(
                    'lemmatization/lemmalist-greek/lemmalist-greek.txt'),
                      'r',
                      encoding='utf_8') as f:
                for line in f.readlines():
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

            for token in tokens:
                lemmas.append(
                    morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('pybo - Tibetan Lemmatizer'):
            word_tokenizer = main.settings_custom['word_tokenization'][
                'word_tokenizers'][lang]

            wordless_text_utils.check_pybo_tokenizers(
                main, word_tokenizer=word_tokenizer)

            if word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (GMD)'):
                tokens = main.pybo_tokenizer_gmd.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (POS)'):
                tokens = main.pybo_tokenizer_pos.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (tsikchen)'):
                tokens = main.pybo_tokenizer_tsikchen.tokenize(
                    ' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wordless_conversion.to_iso_639_1(main, lang)

            with open(wordless_misc.get_abs_path(
                    f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'
            ),
                      'r',
                      encoding='utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    except:
                        pass
    else:
        lemmas = tokens

    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Insert empty lemmas
    for empty_offset in empty_offsets:
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
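# Standalone illustration of the WordNet branch above (runnable if NLTK's
# WordNet data has been downloaded); wordless_lemmatize wraps the same calls
# and additionally restores tags and empty tokens in the output.
import nltk

word_net_lemmatizer = nltk.WordNetLemmatizer()

print(word_net_lemmatizer.lemmatize('mice', pos=nltk.corpus.wordnet.NOUN))
# mouse
print(word_net_lemmatizer.lemmatize('running', pos=nltk.corpus.wordnet.VERB))
# run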
def wordless_word_detokenize(main, tokens, lang, word_detokenizer='default'):
    sentence_start = 0
    sentences = []
    text = ''

    if lang not in main.settings_global['word_detokenizers']:
        lang = 'other'

    if word_detokenizer == 'default':
        word_detokenizer = main.settings_custom['word_detokenization'][
            'word_detokenizers'][lang]

    for i, token in enumerate(tokens):
        if (type(token) == wordless_text.Wordless_Token
                and token.sentence_ending):
            sentences.append(tokens[sentence_start:i + 1])

            sentence_start = i + 1
        elif i == len(tokens) - 1:
            sentences.append(tokens[sentence_start:])

    # English & Other Languages
    if word_detokenizer == main.tr('NLTK - Penn Treebank Detokenizer'):
        treebank_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()

        for sentence in sentences:
            text += treebank_detokenizer.detokenize(sentence)
    elif word_detokenizer == main.tr('Sacremoses - Moses Detokenizer'):
        moses_detokenizer = sacremoses.MosesDetokenizer(
            lang=wordless_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            text += moses_detokenizer.detokenize(sentence)
    # Chinese
    elif word_detokenizer == main.tr('Wordless - Chinese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i >= non_cjk_start:
                if (wordless_checking_unicode.has_han(token)
                        or all(map(str.isnumeric, token))):
                    text += token

                    non_cjk_start += 1
                else:
                    # English
                    if wordless_checking_unicode.is_eng_token(token):
                        for j, token in enumerate(tokens[i:]):
                            if i + j + 1 == len(
                                    tokens
                            ) or not wordless_checking_unicode.is_eng_token(
                                    tokens[i + j + 1]):
                                text += wordless_word_detokenize(
                                    main,
                                    tokens[non_cjk_start:i + j + 1],
                                    lang='eng')

                                non_cjk_start = i + j + 1

                                break
                    # Other Languages
                    else:
                        for j, token in enumerate(tokens[i:]):
                            if (i + j + 1 == len(tokens)
                                    or wordless_checking_unicode.has_han(
                                        tokens[i + j + 1])):
                                text += wordless_word_detokenize(
                                    main,
                                    tokens[non_cjk_start:i + j + 1],
                                    lang='other')

                                non_cjk_start = i + j + 1

                                break
    elif word_detokenizer == main.tr('Wordless - Japanese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i < non_cjk_start:
                continue

            if (wordless_checking_unicode.has_han(token)
                    or wordless_checking_unicode.has_kana(token)
                    or all(map(str.isnumeric, token))):
                text += token

                non_cjk_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(
                                tokens
                        ) or not wordless_checking_unicode.is_eng_token(
                                tokens[i + j + 1]):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_cjk_start:i + j + 1],
                                lang='eng')

                            non_cjk_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens)
                                or wordless_checking_unicode.has_han(
                                    tokens[i + j + 1])
                                or wordless_checking_unicode.has_kana(
                                    tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_cjk_start:i + j + 1],
                                lang='other')

                            non_cjk_start = i + j + 1

                            break
    # Thai
    elif word_detokenizer == main.tr('Wordless - Thai Word Detokenizer'):
        non_thai_start = 0

        for i, token in enumerate(tokens):
            if i < non_thai_start:
                continue

            if wordless_checking_unicode.has_thai(token):
                if type(token) == wordless_text.Wordless_Token:
                    text += token + token.boundary
                else:
                    text += token

                non_thai_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(
                                tokens
                        ) or not wordless_checking_unicode.is_eng_token(
                                tokens[i + j + 1]):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_thai_start:i + j + 1],
                                lang='eng')

                            non_thai_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens)
                                or wordless_checking_unicode.has_thai(
                                    tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_thai_start:i + j + 1],
                                lang='other')

                            non_thai_start = i + j + 1

                            break
    # Tibetan
    elif word_detokenizer == main.tr('Wordless - Tibetan Word Detokenizer'):
        non_tibetan_start = 0

        for i, token in enumerate(tokens):
            if i < non_tibetan_start:
                continue

            if wordless_checking_unicode.has_tibetan(token):
                # Check for Tibetan Mark Shad
                # See: https://w3c.github.io/tlreq/#section_breaks
                if i > 0 and token[0] == '།':
                    text += token
                else:
                    text += token

                non_tibetan_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(
                                tokens
                        ) or not wordless_checking_unicode.is_eng_token(
                                tokens[i + j + 1]):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_tibetan_start:i + j + 1],
                                lang='eng')

                            non_tibetan_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens)
                                or wordless_checking_unicode.has_tibetan(
                                    tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main,
                                tokens[non_tibetan_start:i + j + 1],
                                lang='other')

                            non_tibetan_start = i + j + 1

                            break

    return re.sub(r'\s{2,}', ' ', text)
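# Standalone illustration of the NLTK branch above: TreebankWordDetokenizer is
# what wordless_word_detokenize uses for English and other space-delimited
# languages, one sentence at a time.
import nltk.tokenize.treebank

treebank_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()

print(treebank_detokenizer.detokenize(['It', "'s", 'a', 'test', 'sentence', '.']))
# It's a test sentence.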
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    wordless_text_utils.check_word_tokenizers(main,
                                              lang=lang,
                                              word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [
                itertools.chain.from_iterable(tokens_sentences)
            ]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append(
                    [token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])

    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.append(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_gmd.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_pos.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr(
                'pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text for token in
                    main.pybo_tokenizer_tsikchen.tokenize(sentence)
                ])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [
            token.strip() for token in tokens if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary='',
                                                          sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary=' ',
                                                          sentence_ending=True)

    return tokens_sentences
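# Standalone illustration of the jieba branch above (runnable if jieba is
# installed): jieba.cut() returns a generator of words, which the code above
# materializes later when stripping empty tokens.
import jieba

print(list(jieba.cut('我喜欢读书')))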
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           flat_tokens=True):
    tokens_hierarchical = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wordless_text_utils.check_word_tokenizers(
            main, lang=lang, word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_tokenizers(main,
                                             lang=lang,
                                             word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(
                    treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append(
                    [token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_hierarchical.append(
                [token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_hierarchical.append(
                        [token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_hierarchical.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_hierarchical.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='rus')

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(
            main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(
                str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [
            token.strip() for token in sentence if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary='', sentence_ending=True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary=' ', sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_hierarchical):
            tokens_hierarchical[i] = wordless_clause_tokenize(
                main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_hierarchical
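# Standalone illustration of the syntok branch above (runnable if syntok is
# installed): syntok.segmenter.analyze() yields paragraphs of sentences of
# tokens, which is where the hierarchical (per-sentence) token lists come from
# when flat_tokens is False.
import syntok.segmenter

for paragraph in syntok.segmenter.analyze('One sentence here. Another one there.'):
    for sentence in paragraph:
        print([token.value for token in sentence])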
def wordless_get_stop_words(main, lang, list_stop_words='default'):
    if list_stop_words == 'default':
        list_stop_words = main.settings_custom['stop_words']['stop_words'][
            lang]

    lang_639_1 = wordless_conversion.to_iso_639_1(main, lang)

    # Chinese (Simplified)
    if lang_639_1 == 'zh_cn':
        lang_639_1 = 'zh'

    # extra-stopwords
    if 'extra-stopwords' in list_stop_words:
        LANG_TEXTS = {
            'sqi': 'albanian',
            'ara': 'arabic',
            'hye': 'armenian',
            'eus': 'basque',
            'bel': 'belarusian',
            'ben': 'bengali',
            'bul': 'bulgarian',
            'cat': 'catalan',
            'zho_cn': 'chinese',
            # Chinese (Traditional)
            'zho_tw': 'chinese-traditional',
            'hrv': 'croatian',
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'glg': 'galician',
            'deu': 'german',
            'ell': 'greek',
            'hau': 'hausa',
            'heb': 'hebrew',
            'hin': 'hindi',
            'hun': 'hungarian',
            'isl': 'icelandic',
            'ind': 'indonesian',
            'gle': 'irish',
            'ita': 'italian',
            'jpn': 'japanese',
            'kor': 'korean',
            'kur': 'kurdish',
            'lav': 'latvian',
            'lit': 'lithuanian',
            'msa': 'malay',
            'mar': 'marathi',
            'mon': 'mongolian',
            'nep': 'nepali',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'fas': 'persian',
            'pol': 'polish',
            'por': 'portuguese',
            'ron': 'romanian',
            'rus': 'russian',
            'srp_cyrl': 'serbian-cyrillic',
            'srp_latn': 'serbian',
            'slk': 'slovak',
            'slv': 'slovenian',
            'spa': 'spanish',
            'swa': 'swahili',
            'swe': 'swedish',
            'tgl': 'tagalog',
            'tel': 'telugu',
            'tha': 'thai',
            'tur': 'turkish',
            'ukr': 'ukranian',
            'urd': 'urdu',
            'vie': 'vietnamese',
            'yor': 'yoruba'
        }

        with open(wordless_misc.get_normalized_path(
                f'stop_words/extra-stopwords/{LANG_TEXTS[lang]}'),
                  'r',
                  encoding='utf_8') as f:
            stop_words = [
                line.rstrip() for line in f if not line.startswith('#')
            ]
    # NLTK
    elif 'NLTK' in list_stop_words:
        LANG_TEXTS = {
            'ara': 'arabic',
            'aze': 'azerbaijani',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            'ell': 'greek',
            'hun': 'hungarian',
            'ind': 'indonesian',
            'ita': 'italian',
            'kaz': 'kazakh',
            'nep': 'nepali',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'por': 'portuguese',
            'ron': 'romanian',
            'rus': 'russian',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tgk': 'tajik',
            'tur': 'turkish'
        }

        stop_words = nltk.corpus.stopwords.words(LANG_TEXTS[lang])
    # spaCy
    elif 'spaCy' in list_stop_words:
        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_normalized_path(
                    'stop_words/spaCy/stop_words_zh_tw.txt'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            # Serbian (Cyrillic) & Serbian (Latin)
            if lang_639_1 == 'sr_cyrl':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
            elif lang_639_1 == 'sr_latn':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
                stop_words = wordless_text_utils.to_srp_latn(stop_words)
            else:
                spacy_lang = importlib.import_module(
                    f'spacy.lang.{lang_639_1}')

                stop_words = spacy_lang.STOP_WORDS
    # Stopwords ISO
    elif 'Stopwords ISO' in list_stop_words:
        # Norwegian Bokmål & Norwegian Nynorsk
        if lang_639_1 in ['nb', 'nn']:
            lang_639_1 = 'no'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_normalized_path(
                    'stop_words/Stopwords ISO/stop_words_zh_tw.txt'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            with open(wordless_misc.get_normalized_path(
                    'stop_words/Stopwords ISO/stopwords_iso.json'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = json.load(f)[lang_639_1]
    # Greek (Ancient)
    elif list_stop_words == main.tr(
            'grk-stoplist - Greek (Ancient) Stop Words'):
        with open(wordless_misc.get_normalized_path(
                'stop_words/grk-stoplist/stoplist-greek.txt'),
                  'r',
                  encoding='utf_8') as f:
            stop_words = [line.rstrip() for line in f.readlines()]
    # Thai
    elif list_stop_words == main.tr('PyThaiNLP - Thai Stop Words'):
        stop_words = pythainlp.corpus.common.thai_stopwords()
    # Custom Lists
    elif list_stop_words == main.tr('Custom List'):
        stop_words = main.settings_custom['stop_words']['custom_lists'][lang]

    # Remove empty tokens
    stop_words = [stop_word for stop_word in stop_words if stop_word]

    return sorted(set(stop_words))
def test_to_iso_639_1(lang_code):
    len_iso_639_3 = max(
        [len(lang_code) for lang_code in main.settings_global['lang_codes']])
    iso_639_1 = wordless_conversion.to_iso_639_1(main, lang_code)

    assert iso_639_1 == main.settings_global['lang_codes'][lang_code]