Example #1
def init_syl_tokenizers(main, lang, syl_tokenizer):
    # Pyphen
    if syl_tokenizer.startswith('pyphen_'):
        if f'pyphen_syl_tokenizer_{lang}' not in main.__dict__:
            lang_pyphen = wl_conversion.to_iso_639_1(main, lang)

            if lang.find('_') > -1:
                lang_pyphen = f"{lang_pyphen.split('_')[0]}_{lang_pyphen.split('_')[1].upper()}"

            main.__dict__[f'pyphen_syl_tokenizer_{lang}'] = pyphen.Pyphen(lang = lang_pyphen)
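
A minimal usage sketch (not part of the example above; the language code 'de_DE' is chosen only for illustration): once the pyphen.Pyphen instance is cached, syllable boundaries are queried with inserted() or positions().

import pyphen

dic = pyphen.Pyphen(lang = 'de_DE')

print(dic.inserted('Silbentrennung'))   # hyphenation points inserted, e.g. 'Sil-ben-tren-nung'
print(dic.positions('Silbentrennung'))  # offsets of the possible hyphenation points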
Example #2
def init_spacy_models(main, lang):
    # Chinese, English, German, Portuguese
    if not lang.startswith('srp_'):
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

    if f'spacy_nlp_{lang}' not in main.__dict__:
        # Languages with models
        if lang in SPACY_LANGS:
            model = importlib.import_module(SPACY_LANGS[lang])

            main.__dict__[f'spacy_nlp_{lang}'] = model.load(disable = ['parser', 'ner'])
            # Add senter
            main.__dict__[f'spacy_nlp_{lang}'].enable_pipe('senter')
        # Languages without models
        else:
            # Serbian
            if lang == 'srp_cyrl':
                main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            elif lang == 'srp_latn':
                main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
            else:
                main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(wl_conversion.to_iso_639_1(main, lang))

            # Add sentencizer and lemmatizer
            main.__dict__[f'spacy_nlp_{lang}'].add_pipe('sentencizer')

            if lang in SPACY_LANGS_LEMMATIZERS:
                main.__dict__[f'spacy_nlp_{lang}'].add_pipe('lemmatizer')

                main.__dict__[f'spacy_nlp_{lang}'].initialize()
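
A short sketch of the blank-pipeline fallback used above, assuming spaCy v3 and an arbitrary sample text: a blank model plus the rule-based sentencizer is enough to obtain sentence boundaries for languages without a packaged model.

import spacy

nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

doc = nlp('This is one sentence. This is another one.')
print([sent.text for sent in doc.sents])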
Example #3
def init_word_detokenizers(main, lang):
    if lang not in ['zho_cn', 'zho_tw', 'jpn', 'tha', 'bod']:
        # Sacremoses
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_detokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_detokenizer_{lang}'] = sacremoses.MosesDetokenizer(lang = lang_sacremoses)
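
For reference, a hedged sketch of how the cached detokenizer would be used; the token list is made up for illustration.

import sacremoses

moses_detokenizer = sacremoses.MosesDetokenizer(lang = 'en')
print(moses_detokenizer.detokenize(['Hello', ',', 'world', '!']))  # -> 'Hello, world!'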
Example #4
def check_spacy_models(main, lang, pipeline):
    spacy_langs = {
        'dan': 'da_core_news_sm',
        'nld': 'nl_core_news_sm',
        'eng': 'en_core_web_sm',
        'fra': 'fr_core_news_sm',
        'deu': 'de_core_news_sm',
        'ell': 'el_core_news_sm',
        'ita': 'it_core_news_sm',
        'lit': 'lt_core_news_sm',
        'nob': 'nb_core_news_sm',
        'pol': 'pl_core_news_sm',
        'por': 'pt_core_news_sm',
        'ron': 'ro_core_news_sm',
        'spa': 'es_core_news_sm',
        'other': 'en_core_web_sm'
    }

    # Remove unused pipelines to boost speed
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in spacy_langs:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            model = importlib.import_module(spacy_langs[lang])

            main.__dict__[f'spacy_nlp_{lang}'] = model.load(
                disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wl_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
Example #5
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
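
A minimal sketch of calling the NLTK tokenizers initialized above; the sample sentence is arbitrary.

import nltk

text = 'Good muffins cost $3.88 in New York.'

print(nltk.NLTKWordTokenizer().tokenize(text))
print(nltk.TreebankWordTokenizer().tokenize(text))
print(nltk.ToktokTokenizer().tokenize(text))
print(nltk.TweetTokenizer().tokenize(text))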
Example #6
def check_missing_extra_langs(langs_supported, langs_global, msg):
    global lang_missing
    global lang_extra

    for lang_code in langs_supported:
        lang_code_639_3 = wl_conversion.to_iso_639_3(main, lang_code)

        if lang_code_639_3 not in langs_global:
            print(
                f'''Missing language code "{lang_code_639_3}/{lang_code}" found for {msg}!'''
            )

            lang_missing = True

    for lang_code in langs_global:
        lang_code_639_1 = wl_conversion.to_iso_639_1(main, lang_code)

        if lang_code_639_1 not in langs_supported:
            print(
                f'''Extra language code "{lang_code}/{lang_code_639_1}" found for {msg}!'''
            )

            lang_extra = True
Example #7
def wl_word_detokenize(main, tokens, lang, word_detokenizer='default'):
    sentence_start = 0
    sentences = []
    text = ''

    if lang not in main.settings_global['word_detokenizers']:
        lang = 'other'

    if word_detokenizer == 'default':
        word_detokenizer = main.settings_custom['word_detokenization'][
            'word_detokenizers'][lang]

    for i, token in enumerate(tokens):
        if type(token) == wl_text.Wl_Token and token.sentence_ending:
            sentences.append(tokens[sentence_start:i + 1])

            sentence_start = i + 1
        elif i == len(tokens) - 1:
            sentences.append(tokens[sentence_start:])

    # English & Other Languages
    if word_detokenizer == main.tr('NLTK - Penn Treebank Detokenizer'):
        treebank_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()

        for sentence in sentences:
            text += treebank_detokenizer.detokenize(sentence)
    elif word_detokenizer == main.tr('Sacremoses - Moses Detokenizer'):
        moses_detokenizer = sacremoses.MosesDetokenizer(
            lang=wl_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            text += moses_detokenizer.detokenize(sentence)
    # Chinese
    elif word_detokenizer == main.tr('Wordless - Chinese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i >= non_cjk_start:
                if (wl_checking_unicode.has_han(token)
                        or all(map(str.isnumeric, token))):
                    text += token

                    non_cjk_start += 1
                else:
                    # English
                    if wl_checking_unicode.is_eng_token(token):
                        for j, token in enumerate(tokens[i:]):
                            if i + j + 1 == len(
                                    tokens
                            ) or not wl_checking_unicode.is_eng_token(
                                    tokens[i + j + 1]):
                                text += wl_word_detokenize(
                                    main,
                                    tokens[non_cjk_start:i + j + 1],
                                    lang='eng')

                                non_cjk_start = i + j + 1

                                break
                    # Other Languages
                    else:
                        for j, token in enumerate(tokens[i:]):
                            if (i + j + 1 == len(tokens)
                                    or wl_checking_unicode.has_han(
                                        tokens[i + j + 1])):
                                text += wl_word_detokenize(
                                    main,
                                    tokens[non_cjk_start:i + j + 1],
                                    lang='other')

                                non_cjk_start = i + j + 1

                                break
    elif word_detokenizer == main.tr('Wordless - Japanese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i < non_cjk_start:
                continue

            if (wl_checking_unicode.has_han(token)
                    or wl_checking_unicode.has_kana(token)
                    or all(map(str.isnumeric, token))):
                text += token

                non_cjk_start = i + 1
            else:
                # English
                if wl_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(
                                tokens
                        ) or not wl_checking_unicode.is_eng_token(
                                tokens[i + j + 1]):
                            text += wl_word_detokenize(main,
                                                       tokens[non_cjk_start:i +
                                                              j + 1],
                                                       lang='eng')

                            non_cjk_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                wl_checking_unicode.has_han(tokens[i + j + 1])
                                or wl_checking_unicode.has_kana(
                                    tokens[i + j + 1])):
                            text += wl_word_detokenize(main,
                                                       tokens[non_cjk_start:i +
                                                              j + 1],
                                                       lang='other')

                            non_cjk_start = i + j + 1

                            break
    # Thai
    elif word_detokenizer == main.tr('Wordless - Thai Word Detokenizer'):
        non_thai_start = 0

        for i, token in enumerate(tokens):
            if i < non_thai_start:
                continue

            if wl_checking_unicode.has_thai(token):
                if type(token) == wl_text.Wl_Token:
                    text += token + token.boundary
                else:
                    text += token

                non_thai_start = i + 1
            else:
                # English
                if wl_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(
                                tokens
                        ) or not wl_checking_unicode.is_eng_token(
                                tokens[i + j + 1]):
                            text += wl_word_detokenize(
                                main,
                                tokens[non_thai_start:i + j + 1],
                                lang='eng')

                            non_thai_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens)
                                or wl_checking_unicode.has_thai(
                                    tokens[i + j + 1])):
                            text += wl_word_detokenize(
                                main,
                                tokens[non_thai_start:i + j + 1],
                                lang='other')

                            non_thai_start = i + j + 1

                            break
    # Tibetan
    elif word_detokenizer == main.tr('Wordless - Tibetan Word Detokenizer'):
        non_tibetan_start = 0

        for i, token in enumerate(tokens):
            if i < non_tibetan_start:
                continue

            if wl_checking_unicode.has_tibetan(token):
                # Tibetan tokens, including those starting with a Tibetan Mark Shad,
                # are joined without an extra space
                # See: https://w3c.github.io/tlreq/#section_breaks
                text += token

                non_tibetan_start = i + 1
            else:
                # English
                if wl_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if i + j + 1 == len(
                                tokens
                        ) or not wl_checking_unicode.is_eng_token(
                                tokens[i + j + 1]):
                            text += wl_word_detokenize(
                                main,
                                tokens[non_tibetan_start:i + j + 1],
                                lang='eng')

                            non_tibetan_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens)
                                or wl_checking_unicode.has_tibetan(
                                    tokens[i + j + 1])):
                            text += wl_word_detokenize(
                                main,
                                tokens[non_tibetan_start:i + j + 1],
                                lang='other')

                            non_tibetan_start = i + j + 1

                            break

    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()
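
A small sketch of the NLTK detokenization branch in isolation; the token list is illustrative only.

import nltk.tokenize.treebank

treebank_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
print(treebank_detokenizer.detokenize(['She', "'s", 'right', ',', 'is', "n't", 'she', '?']))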
Example #8
def wl_get_stop_word_list(main, lang, stop_word_list = 'default'):
    if stop_word_list == 'default':
        stop_word_list = main.settings_custom['stop_word_lists']['stop_word_lists'][lang]

    if stop_word_list == main.tr('Custom List'):
        stop_word_list = main.settings_custom['stop_word_lists']['custom_lists'][lang]
    else:
        lang_639_1 = wl_conversion.to_iso_639_1(main, lang)

        # Chinese (Simplified)
        if lang_639_1 == 'zh_cn':
            lang_639_1 = 'zh'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            cc = opencc.OpenCC('s2tw')

            stop_word_list_zho_cn = wl_get_stop_word_list(
                main,
                lang = 'zho_cn',
                stop_word_list = stop_word_list.replace('Chinese (Traditional)', 'Chinese (Simplified)'))
            stop_word_list = [cc.convert(stop_word) for stop_word in stop_word_list_zho_cn]
        # extra-stopwords
        elif 'extra-stopwords' in stop_word_list:
            LANG_TEXTS = {
                'sqi': 'albanian',
                'ara': 'arabic',
                'hye': 'armenian',
                'eus': 'basque',
                'bel': 'belarusian',
                'ben': 'bengali',
                'bul': 'bulgarian',
                'cat': 'catalan',
                'zho_cn': 'chinese',
                'hrv': 'croatian',
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                'eng': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                'glg': 'galician',
                'deu': 'german',
                'ell': 'greek',
                'hau': 'hausa',
                'heb': 'hebrew',
                'hin': 'hindi',
                'hun': 'hungarian',
                'isl': 'icelandic',
                'ind': 'indonesian',
                'gle': 'irish',
                'ita': 'italian',
                'jpn': 'japanese',
                'kor': 'korean',
                'kur': 'kurdish',
                'lav': 'latvian',
                'lit': 'lithuanian',
                'msa': 'malay',
                'mar': 'marathi',
                'mon': 'mongolian',
                'nep': 'nepali',
                # Norwegian Bokmål & Norwegian Nynorsk
                'nob': 'norwegian',
                'nno': 'norwegian',
                'fas': 'persian',
                'pol': 'polish',
                'por': 'portuguese',
                'ron': 'romanian',
                'rus': 'russian',
                'srp_cyrl': 'serbian-cyrillic',
                'srp_latn': 'serbian',
                'slk': 'slovak',
                'slv': 'slovenian',
                'spa': 'spanish',
                'swa': 'swahili',
                'swe': 'swedish',
                'tgl': 'tagalog',
                'tel': 'telugu',
                'tha': 'thai',
                'tur': 'turkish',
                'ukr': 'ukranian',
                'urd': 'urdu',
                'vie': 'vietnamese',
                'yor': 'yoruba'
            }

            with open(wl_misc.get_normalized_path(f'stop_word_lists/extra-stopwords/{LANG_TEXTS[lang]}'), 'r', encoding = 'utf_8') as f:
                stop_word_list = [line.rstrip() for line in f if not line.startswith('#')]
        # NLTK
        elif 'NLTK' in stop_word_list:
            LANG_TEXTS = {
                'ara': 'arabic',
                'aze': 'azerbaijani',
                'dan': 'danish',
                'nld': 'dutch',
                'eng': 'english',
                'fin': 'finnish',
                'fra': 'french',
                'deu': 'german',
                'ell': 'greek',
                'hun': 'hungarian',
                'ind': 'indonesian',
                'ita': 'italian',
                'kaz': 'kazakh',
                'nep': 'nepali',
                # Norwegian Bokmål & Norwegian Nynorsk
                'nob': 'norwegian',
                'nno': 'norwegian',
                'por': 'portuguese',
                'ron': 'romanian',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tgk': 'tajik',
                'tur': 'turkish'
            }

            stop_word_list = nltk.corpus.stopwords.words(LANG_TEXTS[lang])
        # spaCy
        elif 'spaCy' in stop_word_list:
            # Serbian (Cyrillic) & Serbian (Latin)
            if lang_639_1 == 'sr_cyrl':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_word_list = spacy_lang.STOP_WORDS
            elif lang_639_1 == 'sr_latn':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_word_list = spacy_lang.STOP_WORDS
                stop_word_list = wl_text_utils.to_srp_latn(stop_word_list)
            else:
                spacy_lang = importlib.import_module(f'spacy.lang.{lang_639_1}')

                stop_word_list = spacy_lang.STOP_WORDS
        # Stopwords ISO
        elif 'Stopwords ISO' in stop_word_list:
            # Greek (Ancient)
            if lang_639_1 == 'grc':
                lang_639_1 = 'el'

            # Norwegian Bokmål & Norwegian Nynorsk
            if lang_639_1 in ['nb', 'nn']:
                lang_639_1 = 'no'

            with open(wl_misc.get_normalized_path('stop_word_lists/Stopwords ISO/stopwords_iso.json'), 'r', encoding = 'utf_8') as f:
                stop_word_list = json.load(f)[lang_639_1]
        # Thai
        elif stop_word_list == main.tr('PyThaiNLP - Thai Stop Word List'):
            stop_word_list = pythainlp.corpus.common.thai_stopwords()

    # Remove empty tokens
    stop_word_list = [stop_word for stop_word in stop_word_list if stop_word]

    return sorted(set(stop_word_list))
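
For comparison, a minimal sketch of the NLTK and spaCy stop word sources consulted above; the NLTK call assumes the 'stopwords' corpus has already been downloaded.

from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS

print(len(stopwords.words('english')))  # NLTK's English stop words
print(sorted(STOP_WORDS)[:10])          # spaCy's English stop words (a plain set)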
Example #9
def wl_lemmatize(main, tokens, lang, text_type = ('untokenized', 'untagged'), lemmatizer = 'default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags_all = wl_matching.get_re_tags(main, tags = 'all')
    re_tags_pos = wl_matching.get_re_tags(main, tags = 'pos')
    re_tags_non_pos = wl_matching.get_re_tags(main, tags = 'non_pos')

    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            del tokens[i]

    wl_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            doc = spacy.tokens.Doc(nlp.vocab, words = tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wl_pos_tagging.wl_pos_tag(
                main, tokens,
                lang = 'eng',
                pos_tagger = 'NLTK - Perceptron POS Tagger',
                tagset = 'universal'
            ):
                if pos == 'ADJ':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr('lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wl_misc.get_normalized_path('lemmatization/lemmalist-greek/lemmalist-greek.txt'), 'r', encoding = 'utf_8') as f:
                for line in f.readlines():
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang = 'ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang = 'uk')

            for token in tokens:
                lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('botok - Tibetan Lemmatizer'):
            wl_text_utils.check_word_tokenizers(main, lang = 'bod')
            tokens = main.botok_word_tokenizer.tokenize(' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wl_conversion.to_iso_639_1(main, lang)

            with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    except ValueError:
                        pass
    else:
        lemmas = tokens

    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Insert empty lemmas
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
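
A brief sketch of the WordNet branch on its own, assuming the NLTK 'wordnet' corpus is available; the POS constants mirror the mapping used above.

import nltk
from nltk.corpus import wordnet

word_net_lemmatizer = nltk.WordNetLemmatizer()

print(word_net_lemmatizer.lemmatize('better', pos = wordnet.ADJ))   # -> 'good'
print(word_net_lemmatizer.lemmatize('running', pos = wordnet.VERB)) # -> 'run'
print(word_net_lemmatizer.lemmatize('cars'))                        # defaults to nouns -> 'car'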
Example #10
def test_to_iso_639_1(lang_code):
    len_iso_639_3 = max(
        [len(lang_code) for lang_code in main.settings_global['lang_codes']])
    iso_639_1 = wl_conversion.to_iso_639_1(main, lang_code)

    assert iso_639_1 == main.settings_global['lang_codes'][lang_code]
Example #11
def wl_word_tokenize(main,
                     text,
                     lang,
                     word_tokenizer='default',
                     flat_tokens=True):
    tokens_multilevel = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wl_text_utils.check_word_tokenizers(main,
                                            lang=lang,
                                            word_tokenizer=word_tokenizer)
    else:
        wl_text_utils.check_tokenizers(main,
                                       lang=lang,
                                       word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wl_sentence_tokenization.wl_sentence_tokenize(
            main, text, lang)

        if word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NLTK Tokenizer'):
            nltk_tokenizer = nltk.NLTKWordTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(nltk_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(toktok_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_multilevel.append(tweet_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang)

        moses_tokenizer = sacremoses.MosesTokenizer(
            lang=wl_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            tokens_multilevel.append(
                moses_tokenizer.tokenize(sentence, escape=False))

    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_multilevel.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_multilevel.append(
                    [token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_multilevel.append(
                [token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_multilevel.append(
                        [token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_multilevel.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wl_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wl_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wl_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wl_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_multilevel.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_multilevel.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wl_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wl_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wl_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wl_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wl_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wl_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wl_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_multilevel.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang='rus')

        for sentence in sentences:
            tokens_multilevel.append(
                [token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wl_sentence_tokenization.wl_sentence_tokenize(main,
                                                                  text,
                                                                  lang='tha')

        if word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='longest'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching + TCC'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching + TCC (Safe Mode)'):
            for sentence in sentences:
                tokens_multilevel.append(
                    pythainlp.word_tokenize(sentence, engine='newmm-safe'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main, text, lang='bod')

        for sentence in sentences:
            tokens_multilevel.append([
                token.text
                for token in main.botok_word_tokenizer.tokenize(sentence)
            ])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )

        for sentence in sentences:
            tokens_multilevel.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_multilevel):
        tokens_multilevel[i] = [
            token.strip() for token in sentence if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_multilevel:
            if sentence:
                sentence[-1] = wl_text.Wl_Token(sentence[-1],
                                                boundary='',
                                                sentence_ending=True)
    else:
        for sentence in tokens_multilevel:
            if sentence:
                sentence[-1] = wl_text.Wl_Token(sentence[-1],
                                                boundary=' ',
                                                sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_multilevel):
            tokens_multilevel[i] = wl_sentence_tokenization.wl_clause_tokenize(
                main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wl_misc.flatten_list(tokens_multilevel))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_multilevel
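
A minimal sketch of the razdel branch, assuming the razdel package is installed; the Russian sample sentence is arbitrary.

import razdel

print([token.text for token in razdel.tokenize('Съешь ещё этих мягких французских булок.')])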
Example #12
def test_to_iso_639_1():
    for lang_code in TO_ISO_639_1.keys():
        lang_code_639_1 = wl_conversion.to_iso_639_1(main, lang_code)

        assert lang_code_639_1 == TO_ISO_639_1[lang_code]
Example #13
def wl_lemmatize_text(main, text, lang, tokenized, tagged, lemmatizer):
    lemmas = []

    # spaCy
    if lemmatizer.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, text,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        lemmas = [
            token.dictionary_form()
            for token in main.sudachipy_word_tokenizer.tokenize(text)
        ]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
    # Tibetan
    elif lemmatizer == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)
    # Lemmatization Lists
    elif lemmatizer.startswith('lemmatization_lists_'):
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    lemmas = [
        str(lemma).strip()
        for lemma in lemmas
        if str(lemma).strip()
    ]

    return lemmas
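
A short sketch of the pymorphy2 branch, assuming the Russian dictionaries bundled with pymorphy2 (Ukrainian additionally requires pymorphy2-dicts-uk); the sample word is arbitrary.

import pymorphy2

morphological_analyzer = pymorphy2.MorphAnalyzer(lang = 'ru')
print(morphological_analyzer.parse('стали')[0].normal_form)  # most probable lemma, e.g. 'стать'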
Example #14
def wl_lemmatize_tokens(main, tokens, lang, tokenized, tagged, lemmatizer):
    empty_offsets = []
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags = wl_matching.get_re_tags(main, tag_type = 'body')

    if tagged == _tr('wl_lemmatize_tokens', 'Yes'):
        tags = [''.join(re.findall(re_tags, token)) for token in tokens]
        tokens = [re.sub(re_tags, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens with their tags
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            del tokens[i]
            del tags[i]

    # spaCy
    if 'spacy' in lemmatizer:
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese models do not have a lemmatizer component; Japanese lemmas are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1923647
        else:
            doc = nlp(''.join(tokens))

        lemma_tokens = [token.text for token in doc]
        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, tokens,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))

        lemma_tokens = tokens.copy()
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        tokens_retokenized = main.sudachipy_word_tokenizer.tokenize(''.join(tokens))

        lemma_tokens = [token.surface() for token in tokens_retokenized]
        lemmas = [token.dictionary_form() for token in tokens_retokenized]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)

        lemma_tokens = tokens.copy()
    # Tibetan
    elif lemmatizer == 'botok_bod':
        lemma_tokens = []
        tokens_retokenized = main.botok_word_tokenizer.tokenize(''.join(tokens))

        for token in tokens_retokenized:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)

            lemma_tokens.append(token.text)
    # Lemmatization Lists
    elif 'lemmatization_lists' in lemmatizer:
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        lemma_tokens = tokens.copy()
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    for i, lemma in reversed(list(enumerate(lemmas))):
        lemma_tokens[i] = lemma_tokens[i].strip()
        lemmas[i] = lemma.strip()

        if not lemmas[i]:
            del lemmas[i]
            del lemma_tokens[i]

    # Make sure that tokenization is not modified during lemmatization
    i_tokens = 0
    i_lemmas = 0

    len_tokens = len(tokens)
    len_lemmas = len(lemmas)

    if len_tokens != len_lemmas:
        tags_modified = []
        lemmas_modified = []

        while i_tokens < len_tokens and i_lemmas < len_lemmas:
            # Different token
            if len(tokens[i_tokens]) != len(lemma_tokens[i_lemmas]):
                tokens_temp = [tokens[i_tokens]]
                tags_temp = [tags[i_tokens]]
                lemma_tokens_temp = [lemma_tokens[i_lemmas]]
                lemmas_temp = [lemmas[i_lemmas]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_lemmas < len_lemmas - 1:
                    len_tokens_temp = sum([len(token) for token in tokens_temp])
                    len_lemma_tokens_temp = sum([len(token) for token in lemma_tokens_temp])

                    if len_tokens_temp > len_lemma_tokens_temp:
                        lemma_tokens_temp.append(lemma_tokens[i_lemmas + 1])
                        lemmas_temp.append(lemmas[i_lemmas + 1])

                        i_lemmas += 1
                    elif len_tokens_temp < len_lemma_tokens_temp:
                        tokens_temp.append(tokens[i_tokens + 1])
                        tags_temp.append(tags[i_tokens + 1])

                        i_tokens += 1
                    else:
                        # Use lemmas in one-to-one
                        if len(tokens_temp) == len(lemma_tokens_temp):
                            tags_modified.extend(tags_temp)
                            lemmas_modified.extend(lemmas_temp)
                        # Use original tokens in many-to-one or one-to-many
                        else:
                            tags_modified.extend(tags_temp)
                            lemmas_modified.extend(tokens_temp)

                        tokens_temp = []
                        tags_temp = []
                        lemma_tokens_temp = []
                        lemmas_temp = []

                        break

                if tokens_temp:
                    # Use lemmas in one-to-one
                    if len(tokens_temp) == len(lemma_tokens_temp):
                        tags_modified.extend(tags_temp)
                        lemmas_modified.extend(lemmas_temp)
                    # Use original tokens in many-to-one or one-to-many
                    else:
                        tags_modified.extend(tags_temp)
                        lemmas_modified.extend(tokens_temp)
            # Same token
            else:
                tags_modified.append(tags[i_tokens])
                lemmas_modified.append(lemmas[i_lemmas])

            i_tokens += 1
            i_lemmas += 1

        len_lemmas_modified = len(lemmas_modified)

        if len_tokens < len_lemmas_modified:
            tags = tags_modified[:len_tokens]
            lemmas = lemmas_modified[:len_tokens]
        elif len_tokens > len_lemmas_modified:
            tags = tags_modified + [tags_modified[-1]] * (len_tokens - len_lemmas_modified)
            lemmas = lemmas_modified + [lemmas_modified[-1]] * (len_tokens - len_lemmas_modified)
        else:
            tags = tags_modified.copy()
            lemmas = lemmas_modified.copy()

    # Insert empty lemmas and their tags after alignment of input and output
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')
        tags.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
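
A minimal sketch of the pre-tokenized Doc technique used in the spaCy branch above, assuming spaCy v3 with the en_core_web_sm model installed: the Doc is built from existing tokens and the remaining pipeline components are applied to it, so spaCy does not re-tokenize the text.

import spacy

nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])

words = ['The', 'mice', 'were', 'eating', 'the', 'cheese']
doc = spacy.tokens.Doc(nlp.vocab, words = words, spaces = [True] * (len(words) - 1) + [False])

# Apply the remaining pipeline components to the pre-built Doc
for pipe_name in nlp.pipe_names:
    nlp.get_pipe(pipe_name)(doc)

print([token.lemma_ for token in doc])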