Example No. 1
def init_word_detokenizers(main, lang):
    if lang not in ['zho_cn', 'zho_tw', 'jpn', 'tha', 'bod']:
        # Sacremoses
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_detokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_detokenizer_{lang}'] = sacremoses.MosesDetokenizer(lang = lang_sacremoses)
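For reference, a minimal standalone sketch of the Sacremoses detokenizer cached above, assuming only that the sacremoses package is installed ('en' stands in for the ISO 639-1 code that wl_conversion computes):

import sacremoses

# Hypothetical standalone usage; the function above caches the instance on main.__dict__ instead
moses_detokenizer = sacremoses.MosesDetokenizer(lang = 'en')

print(moses_detokenizer.detokenize(['Hello', ',', 'world', '!']))
# Prints something like: Hello, world!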
Example No. 2
def test_remove_lang_code_suffixes():
    for lang_code_639_3, lang_code_639_1 in TO_ISO_639_1.items():
        if lang_code_639_3.find('_') > -1:
            lang_code_639_3 = wl_conversion.remove_lang_code_suffixes(main, lang_code_639_3)

            assert lang_code_639_3.find('_') == -1

        if lang_code_639_1.find('_') > -1:
            lang_code_639_1 = wl_conversion.remove_lang_code_suffixes(main, lang_code_639_1)

            assert lang_code_639_1.find('_') == -1
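remove_lang_code_suffixes itself is not shown here; a minimal sketch of the property the test asserts, using a hypothetical stand-in that simply drops everything after the first underscore:

def remove_lang_code_suffixes_sketch(lang_code):
    # Hypothetical stand-in: 'eng_us' -> 'eng', 'zh_cn' -> 'zh'
    return lang_code.split('_')[0]

for lang_code in ['eng_us', 'zho_cn', 'en_us', 'zh_cn']:
    assert '_' not in remove_lang_code_suffixes_sketch(lang_code)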
Example No. 3
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
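As a rough illustration, two of the NLTK tokenizers initialized above can be used directly as follows (assuming nltk is installed; neither class needs extra corpus downloads):

import nltk

toktok_tokenizer = nltk.ToktokTokenizer()
tweet_tokenizer = nltk.TweetTokenizer()

print(toktok_tokenizer.tokenize('Is 9.5 or 525,600 my favorite number?'))
print(tweet_tokenizer.tokenize('@user This is a #test tweet :-)'))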
Example No. 4
def init_spacy_models(main, lang):
    # Chinese, English, German, Portuguese
    if not lang.startswith('srp_'):
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

    if f'spacy_nlp_{lang}' not in main.__dict__:
        # Languages with models
        if lang in SPACY_LANGS:
            model = importlib.import_module(SPACY_LANGS[lang])

            main.__dict__[f'spacy_nlp_{lang}'] = model.load(disable = ['parser', 'ner'])
            # Add senter
            main.__dict__[f'spacy_nlp_{lang}'].enable_pipe('senter')
        # Languages without models
        else:
            # Serbian
            if lang == 'srp_cyrl':
                main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            elif lang == 'srp_latn':
                main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
            else:
                main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(wl_conversion.to_iso_639_1(main, lang))

            # Add sentencizer and lemmatizer
            main.__dict__[f'spacy_nlp_{lang}'].add_pipe('sentencizer')

            if lang in SPACY_LANGS_LEMMATIZERS:
                main.__dict__[f'spacy_nlp_{lang}'].add_pipe('lemmatizer')

                main.__dict__[f'spacy_nlp_{lang}'].initialize()
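A minimal sketch of the fallback path for languages without a packaged model, assuming spaCy 3.x is installed:

import spacy

# Blank pipeline plus a rule-based sentencizer, mirroring the branch above for languages without models
nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')

doc = nlp('This is one sentence. This is another.')

print([sentence.text for sentence in doc.sents])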
Example No. 5
def wl_word_tokenize(main, text, lang, word_tokenizer='default'):
    tokens_multilevel = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    wl_nlp_utils.init_word_tokenizers(main,
                                      lang=lang,
                                      word_tokenizer=word_tokenizer)

    if word_tokenizer.startswith('spacy_'):
        # The input to SudachiPy cannot exceed 49,149 bytes
        if word_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
            # Around 300 tokens per line, 4 characters per token, and 4 bytes per character (section_size ≈ 49149 / 4 / 4 / 300)
            sections = wl_nlp_utils.split_into_chunks_text(text,
                                                           section_size=10)
        else:
            sections = wl_nlp_utils.split_into_chunks_text(
                text,
                section_size=main.settings_custom['files']['misc']
                ['read_files_in_chunks'])
    else:
        sections = wl_nlp_utils.split_into_chunks_text(text, 1)

    for section in sections:
        # spaCy
        if word_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            tokens_multilevel.append([])

            len_sents = len(list(doc.sents))

            for i, sentence in enumerate(doc.sents):
                tokens_sentence = []

                tokens = [token.text for token in sentence]
                len_tokens = len(tokens)

                for j, token in enumerate(tokens):
                    # Split paragraphs by new line character
                    len_lines = len(re.findall(r'\n', token))

                    if len_lines:
                        # Check if the last paragraph is empty
                        if i == len_sents - 1 and j == len_tokens - 1 and token.endswith(
                                '\n'):
                            len_lines -= 1

                        if tokens_sentence:
                            tokens_multilevel[-1].append(tokens_sentence)

                            tokens_sentence = []

                        tokens_multilevel.extend([[]
                                                  for j in range(len_lines)])
                    else:
                        if token.strip():
                            tokens_sentence.append(token)

                if tokens_sentence:
                    tokens_multilevel[-1].append(tokens_sentence)
        else:
            tokens_multilevel.append([])

            if section.strip():
                # NLTK
                if word_tokenizer.startswith('nltk_'):
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang)

                    if word_tokenizer == 'nltk_nist':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_nist_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_nltk':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_nltk_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_penn_treebank':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_treebank_tokenizer.tokenize(
                                    sentence))
                    elif word_tokenizer == 'nltk_tok_tok':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_toktok_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_twitter':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_tweet_tokenizer.tokenize(sentence))
                # Sacremoses
                elif word_tokenizer == 'sacremoses_moses':
                    lang = wl_conversion.remove_lang_code_suffixes(main, lang)
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            main.__dict__[f'sacremoses_moses_tokenizer_{lang}']
                            .tokenize(sentence, escape=False))
                # Chinese
                elif word_tokenizer == 'jieba_zho':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(jieba.lcut(sentence))
                elif word_tokenizer == 'pkuseg_zho':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            main.pkuseg_word_tokenizer.cut(sentence))
                elif word_tokenizer == 'wordless_zho_char':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens = []
                        non_han_start = 0

                        for i, char in enumerate(sentence):
                            if i >= non_han_start:
                                if wl_checking_unicode.is_han(char):
                                    tokens.append(char)

                                    non_han_start += 1
                                else:
                                    # English
                                    if wl_checking_unicode.is_eng(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or not wl_checking_unicode.is_eng(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='eng_us'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # Other Languages
                                    else:
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or wl_checking_unicode.is_han(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='other'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break

                        tokens_multilevel[-1].append(tokens)
                # Japanese
                elif word_tokenizer == 'nagisa_jpn':
                    import nagisa

                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            nagisa.tagging(str(sentence)).words)
                elif word_tokenizer.startswith('sudachipy_jpn'):
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    if word_tokenizer == 'sudachipy_jpn_split_mode_a':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.
                                tokenize(sentence, sudachipy.SplitMode.A)
                            ])
                    elif word_tokenizer == 'sudachipy_jpn_split_mode_b':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.
                                tokenize(sentence, sudachipy.SplitMode.B)
                            ])
                    elif word_tokenizer == 'sudachipy_jpn_split_mode_c':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.
                                tokenize(sentence, sudachipy.SplitMode.C)
                            ])
                elif word_tokenizer == 'wordless_jpn_kanji':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens = []
                        non_han_start = 0

                        for i, char in enumerate(sentence):
                            if i >= non_han_start:
                                if wl_checking_unicode.is_han(char):
                                    tokens.append(char)

                                    non_han_start += 1
                                else:
                                    # Japanese Kana
                                    if wl_checking_unicode.is_kana(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or not wl_checking_unicode.is_kana(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='jpn'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # English
                                    elif wl_checking_unicode.is_eng(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or not wl_checking_unicode.is_eng(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='eng_us'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # Other Languages
                                    else:
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or wl_checking_unicode.is_han(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='other'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break

                        tokens_multilevel[-1].append(tokens)
                # Icelandic
                elif word_tokenizer == 'tokenizer_isl':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main,
                        section,
                        lang='isl',
                        sentence_tokenizer='tokenizer_isl')

                    for sentence in sentences:
                        tokens_multilevel[-1].append([
                            token for kind, token, val in tokenizer.tokenize(
                                sentence) if token
                        ])
                # Thai
                elif word_tokenizer.startswith('pythainlp_'):
                    # Preserve sentence boundaries
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang='tha')

                    if word_tokenizer == 'pythainlp_longest_matching':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='longest'))
                    elif word_tokenizer == 'pythainlp_max_matching':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence, engine='mm'))
                    elif word_tokenizer == 'pythainlp_max_matching_tcc':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='newmm'))
                    elif word_tokenizer == 'pythainlp_max_matching_tcc_safe_mode':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='newmm-safe'))
                    elif word_tokenizer == 'pythainlp_nercut':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='nercut'))
                # Tibetan
                elif word_tokenizer == 'botok_bod':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang='bod')

                    for sentence in sentences:
                        tokens_multilevel[-1].append([
                            token.text for token in
                            main.botok_word_tokenizer.tokenize(sentence)
                        ])
                # Vietnamese
                elif word_tokenizer == 'underthesea_vie':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main,
                        section,
                        lang='vie',
                        sentence_tokenizer='underthesea_vie')

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for para in tokens_multilevel:
        for i, sentence in enumerate(para):
            para[i] = [token.strip() for token in sentence if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for para in tokens_multilevel:
            for sentence in para:
                if sentence:
                    sentence[-1] = wl_texts.Wl_Token(sentence[-1],
                                                     boundary='',
                                                     sentence_ending=True)
    else:
        for para in tokens_multilevel:
            for sentence in para:
                if sentence:
                    sentence[-1] = wl_texts.Wl_Token(sentence[-1],
                                                     boundary=' ',
                                                     sentence_ending=True)

    return tokens_multilevel
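The function returns a three-level list: paragraphs, then sentences, then tokens. A rough, library-free sketch of that shape, using a hypothetical helper that splits on newlines, periods, and whitespace only:

def word_tokenize_sketch(text):
    # Hypothetical simplification of the structure built above
    return [
        [sentence.split() for sentence in para.split('.') if sentence.strip()]
        for para in text.splitlines()
    ]

print(word_tokenize_sketch('First sentence. Second sentence.\nNew paragraph here.'))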
Example No. 6
def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization'][
            'sentence_tokenizers'][lang]

    wl_nlp_utils.init_sentence_tokenizers(
        main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # The input to SudachiPy cannot exceed 49,149 bytes
    if sentence_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
        # Around 300 tokens per line, 4 characters per token, and 4 bytes per character (section_size ≈ 49149 / 4 / 4 / 300)
        sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10)
    else:
        sections = wl_nlp_utils.split_into_chunks_text(
            text,
            section_size=main.settings_custom['files']['misc']
            ['read_files_in_chunks'])

    for section in sections:
        # NLTK
        if sentence_tokenizer == 'nltk_punkt':
            lang_texts = {
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                # English
                'eng_gb': 'english',
                'eng_us': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                # German
                'deu_at': 'german',
                'deu_de': 'german',
                'deu_ch': 'german',
                'ell': 'greek',
                'ita': 'italian',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'pol': 'polish',
                # Portuguese
                'por_br': 'portuguese',
                'por_pt': 'portuguese',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tur': 'turkish',
                # Other languages
                'other': 'english'
            }

            sentences.extend(
                nltk.sent_tokenize(section, language=lang_texts[lang]))
        # spaCy
        elif sentence_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            sentences.extend([sentence.text for sentence in doc.sents])
        # Chinese & Japanese
        elif sentence_tokenizer in ['wordless_zho', 'wordless_jpn']:
            for line in section.splitlines():
                sentence_start = 0

                for i, char in enumerate(line):
                    if i >= sentence_start and char in [
                            '。', '!', '?', '!', '?'
                    ]:
                        for j, char_next in enumerate(line):
                            if j > i and char_next not in [
                                    '。', '!', '?', '!', '?', '’', '”', ')', ')'
                            ]:
                                sentences.append(line[sentence_start:j])

                                sentence_start = j

                                break

                if sentence_start <= len(line):
                    sentences.append(line[sentence_start:])
        # Icelandic
        elif sentence_tokenizer == 'tokenizer_isl':
            for sentence in tokenizer.split_into_sentences(section):
                sentences.append(
                    wl_word_detokenization.wl_word_detokenize(
                        main, tokens=sentence.split(), lang='isl'))
        # Thai
        elif sentence_tokenizer == 'pythainlp_crfcut':
            sentences.extend(pythainlp.sent_tokenize(section))
        # Tibetan
        elif sentence_tokenizer == 'botok_bod':
            wl_nlp_utils.init_word_tokenizers(main, lang='bod')

            tokens = main.botok_word_tokenizer.tokenize(section)

            for sentence_tokens in botok.sentence_tokenizer(tokens):
                sentences.append(''.join([
                    sentence_token.text
                    for sentence_token in sentence_tokens['tokens']
                ]))
        # Vietnamese
        elif sentence_tokenizer == 'underthesea_vie':
            sentences.extend(underthesea.sent_tokenize(section))

    # Strip spaces and discard empty sentences
    sentences = [
        sentence_non_empty for sentence in sentences
        if (sentence_non_empty := sentence.strip())
    ]

    return sentences
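A small standalone example of the NLTK Punkt path above, assuming nltk is installed and its punkt data has been downloaded:

import nltk

# nltk.download('punkt')  # Uncomment on first run
print(nltk.sent_tokenize('Dr. Smith went to Washington. He arrived on Monday.', language='english'))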
Example No. 7
def wl_pos_tag_tokens(main, tokens, lang, pos_tagger, tagset):
    tokens_tagged = []

    if pos_tagger == 'nagisa_jpn':
        # Defer import to save loading time
        import nagisa

    lang = wl_conversion.remove_lang_code_suffixes(main, lang)

    # spaCy
    if pos_tagger.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab,
                                   words=tokens,
                                   spaces=[False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese model does not have a tagger component, and Japanese POS tags are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1910117
        else:
            doc = nlp(''.join(tokens))

        if tagset == 'default':
            tokens_tagged = [(token.text, token.tag_) for token in doc]
        elif tagset == 'universal':
            tokens_tagged = [(token.text, token.pos_) for token in doc]
    # Chinese
    elif pos_tagger == 'jieba_zho':
        tokens_tagged = jieba.posseg.cut(''.join(tokens))
    # English & Russian
    elif pos_tagger == 'nltk_perceptron':
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == 'nagisa_jpn':
        tokens_tagged = zip(tokens, nagisa.postagging(tokens))
    elif pos_tagger == 'sudachipy_jpn':
        tokens_tagged = [(token.surface(), '-'.join(
            [pos for pos in token.part_of_speech()[:4] if pos != '*']))
                         for token in main.sudachipy_word_tokenizer.tokenize(
                             ''.join(tokens))]
    # Russian & Ukrainian
    elif pos_tagger == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            tokens_tagged.append(
                (token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger == 'pythainlp_perceptron_lst20':
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='lst20')
    elif pos_tagger == 'pythainlp_perceptron_orchid':
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='orchid')
    elif pos_tagger == 'pythainlp_perceptron_pud':
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='pud')
    # Tibetan
    elif pos_tagger == 'botok_bod':
        tokens_retokenized = main.botok_word_tokenizer.tokenize(
            ''.join(tokens))

        for token in tokens_retokenized:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))

    # Vietnamese
    elif pos_tagger == 'underthesea_vie':
        tokens_tagged = underthesea.pos_tag(' '.join(tokens))

    # Remove empty tokens and strip whitespace in tokens
    tokens_tagged = [(str(token).strip(), tag) for token, tag in tokens_tagged
                     if str(token).strip()]

    # Make sure that tokenization is not modified during POS tagging
    i_tokens = 0
    i_tokens_tagged = 0

    len_tokens = len(tokens)
    len_tokens_tagged = len(tokens_tagged)

    if len_tokens != len_tokens_tagged:
        tokens_tagged_modified = []

        while i_tokens < len_tokens and i_tokens_tagged < len_tokens_tagged:
            # Different token
            if len(tokens[i_tokens]) != len(tokens_tagged[i_tokens_tagged][0]):
                tokens_temp = [tokens[i_tokens]]
                tokens_tagged_temp = [tokens_tagged[i_tokens_tagged][0]]
                tags_temp = [tokens_tagged[i_tokens_tagged][1]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_tokens_tagged < len_tokens_tagged - 1:
                    len_tokens_temp = sum(
                        [len(token) for token in tokens_temp])
                    len_tokens_tagged_temp = sum(
                        [len(token) for token in tokens_tagged_temp])

                    if len_tokens_temp > len_tokens_tagged_temp:
                        tokens_tagged_temp.append(
                            tokens_tagged[i_tokens_tagged + 1][0])
                        tags_temp.append(tokens_tagged[i_tokens_tagged + 1][1])

                        i_tokens_tagged += 1
                    elif len_tokens_temp < len_tokens_tagged_temp:
                        tokens_temp.append(tokens[i_tokens + 1])

                        i_tokens += 1
                    else:
                        if len(tokens_temp) == len(tokens_tagged_temp):
                            tokens_tagged_modified.extend([
                                (token, tag)
                                for token, tag in zip(tokens_temp, tags_temp)
                            ])
                        elif len(tokens_temp) > len(tokens_tagged_temp):
                            tokens_tagged_modified.extend([
                                (token, tags_temp[0]) for token in tokens_temp
                            ])
                        else:
                            tokens_tagged_modified.append(
                                (tokens_temp[0], tags_temp[0]))

                        tokens_temp = []
                        tokens_tagged_temp = []
                        tags_temp = []

                        break

                if tokens_temp:
                    if len(tokens_temp) == len(tokens_tagged_temp):
                        tokens_tagged_modified.extend([
                            (token, tag)
                            for token, tag in zip(tokens_temp, tags_temp)
                        ])
                    elif len(tokens_temp) > len(tokens_tagged_temp):
                        tokens_tagged_modified.extend([
                            (token, tags_temp[0]) for token in tokens_temp
                        ])
                    else:
                        tokens_tagged_modified.append(
                            (tokens_temp[0], tags_temp[0]))
            else:
                tokens_tagged_modified.append(
                    (tokens[i_tokens], tokens_tagged[i_tokens_tagged][1]))

            i_tokens += 1
            i_tokens_tagged += 1

        len_tokens_tagged_modified = len(tokens_tagged_modified)

        if len_tokens < len_tokens_tagged_modified:
            tokens_tagged = tokens_tagged_modified[:len_tokens]
        elif len_tokens > len_tokens_tagged_modified:
            tokens_tagged = tokens_tagged_modified + [
                tokens_tagged_modified[-1]
            ] * (len_tokens - len_tokens_tagged_modified)
        else:
            tokens_tagged = tokens_tagged_modified.copy()
    else:
        tokens_tagged = [(tokens[i], tokens_tagged[i][1])
                         for i in range(len(tokens))]

    return tokens_tagged
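For comparison, the nltk_perceptron branch above reduces to a call like the following, assuming nltk and its averaged perceptron tagger data are installed:

import nltk

# nltk.download('averaged_perceptron_tagger')  # Uncomment on first run
tokens = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

print(nltk.pos_tag(tokens, lang='eng'))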
Example No. 8
def wl_pos_tag_text(main, text, lang, pos_tagger, tagset):
    tokens_tagged = []

    if pos_tagger == 'nagisa_jpn':
        # Defer import to save loading time
        import nagisa

    # spaCy
    if pos_tagger.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        if tagset == 'default':
            tokens_tagged = [(token.text, token.tag_) for token in doc]
        elif tagset == 'universal':
            tokens_tagged = [(token.text, token.pos_) for token in doc]
    # Chinese
    elif pos_tagger == 'jieba_zho':
        tokens_tagged = jieba.posseg.cut(text)
    # English & Russian
    elif pos_tagger == 'nltk_perceptron':
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main,
                                                            text,
                                                            lang=lang)
        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == 'nagisa_jpn':
        tokens_tagged = nagisa.tagging(text)
        tokens_tagged = zip(tokens_tagged.words, tokens_tagged.postags)
    elif pos_tagger == 'sudachipy_jpn':
        tokens_tagged = [(token.surface(), '-'.join([
            pos for pos in token.part_of_speech()[:4] if pos != '*'
        ])) for token in main.sudachipy_word_tokenizer.tokenize(text)]
    # Russian & Ukrainian
    elif pos_tagger == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main,
                                                            text,
                                                            lang=lang)

        for token in tokens:
            tokens_tagged.append(
                (token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger.startswith('pythainlp_'):
        tokens = wl_word_tokenization.wl_word_tokenize_flat(main,
                                                            text,
                                                            lang=lang)

        if pos_tagger == 'pythainlp_perceptron_lst20':
            tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                                  engine='perceptron',
                                                  corpus='lst20')
        elif pos_tagger == 'pythainlp_perceptron_orchid':
            tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                                  engine='perceptron',
                                                  corpus='orchid')
        elif pos_tagger == 'pythainlp_perceptron_pud':
            tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                                  engine='perceptron',
                                                  corpus='pud')
    # Tibetan
    elif pos_tagger == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))
    # Vietnamese
    elif pos_tagger == 'underthesea_vie':
        tokens_tagged = underthesea.pos_tag(text)

    # Remove empty tokens and strip whitespace in tokens
    tokens_tagged = [(str(token).strip(), tag) for token, tag in tokens_tagged
                     if str(token).strip()]

    return tokens_tagged
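The jieba_zho branch delegates to jieba's part-of-speech mode; a minimal standalone example, assuming jieba is installed:

import jieba.posseg

# Each item is a (word, flag) pair, which the code above consumes as tokens_tagged
for word, flag in jieba.posseg.cut('我爱自然语言处理'):
    print(word, flag)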
Example No. 9
def wl_get_stop_word_list(main, lang, stop_word_list='default'):
    if lang not in main.settings_global['stop_word_lists']:
        lang = 'other'

    if stop_word_list == 'default':
        stop_word_list = main.settings_custom['stop_word_lists'][
            'stop_word_lists'][lang]

    stop_words = []

    if stop_word_list == 'custom':
        stop_words = main.settings_custom['stop_word_lists']['custom_lists'][
            lang]
    else:
        lang_639_1 = wl_conversion.to_iso_639_1(main, lang)

        # Chinese (Simplified), English, German, Portuguese
        if lang != 'zho_tw' and not lang.startswith('srp_'):
            lang_639_1 = wl_conversion.remove_lang_code_suffixes(
                main, wl_conversion.to_iso_639_1(main, lang))
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            cc = opencc.OpenCC('s2twp')

            stop_words_zho_cn = wl_get_stop_word_list(
                main,
                lang='zho_cn',
                stop_word_list=stop_word_list.replace('zho_tw', 'zho_cn'))
            stop_words = [
                cc.convert(stop_word) for stop_word in stop_words_zho_cn
            ]
        elif stop_word_list.startswith('cltk_'):
            stop_words = importlib.import_module(
                f'stop_word_lists.cltk.{lang}').STOPS
        # extra-stopwords
        elif stop_word_list.startswith('extra_stopwords_'):
            LANG_TEXTS = {
                'sqi': 'albanian',
                'ara': 'arabic',
                'hye': 'armenian',
                'eus': 'basque',
                'bel': 'belarusian',
                'ben': 'bengali',
                'bul': 'bulgarian',
                'cat': 'catalan',
                'zho': 'chinese',
                'hrv': 'croatian',
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                'eng': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                'glg': 'galician',
                'deu': 'german',
                'ell': 'greek',
                'hau': 'hausa',
                'heb': 'hebrew',
                'hin': 'hindi',
                'hun': 'hungarian',
                'isl': 'icelandic',
                'ind': 'indonesian',
                'gle': 'irish',
                'ita': 'italian',
                'jpn': 'japanese',
                'kor': 'korean',
                'kur': 'kurdish',
                'lav': 'latvian',
                'lit': 'lithuanian',
                'msa': 'malay',
                'mar': 'marathi',
                'mon': 'mongolian',
                'nep': 'nepali',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'fas': 'persian',
                'pol': 'polish',
                'por': 'portuguese',
                'ron': 'romanian',
                'rus': 'russian',
                # Serbian
                'srp_cyrl': 'serbian-cyrillic',
                'srp_latn': 'serbian',
                'slk': 'slovak',
                'slv': 'slovenian',
                'spa': 'spanish',
                'swa': 'swahili',
                'swe': 'swedish',
                'tgl': 'tagalog',
                'tel': 'telugu',
                'tha': 'thai',
                'tur': 'turkish',
                'ukr': 'ukranian',
                'urd': 'urdu',
                'vie': 'vietnamese',
                'yor': 'yoruba'
            }

            with open(wl_misc.get_normalized_path(
                    f'stop_word_lists/extra-stopwords/{LANG_TEXTS[lang]}'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = [
                    line.rstrip() for line in f if not line.startswith('#')
                ]
        # NLTK
        elif stop_word_list.startswith('nltk_'):
            LANG_TEXTS = {
                'ara': 'arabic',
                'aze': 'azerbaijani',
                'dan': 'danish',
                'nld': 'dutch',
                'eng': 'english',
                'fin': 'finnish',
                'fra': 'french',
                'deu': 'german',
                'ell': 'greek',
                'hun': 'hungarian',
                'ind': 'indonesian',
                'ita': 'italian',
                'kaz': 'kazakh',
                'nep': 'nepali',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'por': 'portuguese',
                'ron': 'romanian',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tgk': 'tajik',
                'tur': 'turkish'
            }

            stop_words = nltk.corpus.stopwords.words(LANG_TEXTS[lang])
        # spaCy
        elif stop_word_list.startswith('spacy_'):
            # Serbian
            if lang_639_1 == 'sr_cyrl':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
            elif lang_639_1 == 'sr_latn':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
                stop_words = wl_nlp_utils.to_srp_latn(stop_words)
            else:
                spacy_lang = importlib.import_module(
                    f'spacy.lang.{lang_639_1}')

                stop_words = spacy_lang.STOP_WORDS
        # Stopwords ISO
        elif stop_word_list.startswith('stopwords_iso_'):
            # Greek (Ancient)
            if lang_639_1 == 'grc':
                lang_639_1 = 'el'

            # Norwegian
            if lang_639_1 in ['nb', 'nn']:
                lang_639_1 = 'no'

            with open(wl_misc.get_normalized_path(
                    'stop_word_lists/Stopwords ISO/stopwords_iso.json'),
                      'r',
                      encoding='utf_8') as f:
                stop_words = json.load(f)[lang_639_1]
        # Thai
        elif stop_word_list == 'pythainlp_tha':
            stop_words = pythainlp.corpus.common.thai_stopwords()

    # Remove empty tokens
    stop_words = [stop_word for stop_word in stop_words if stop_word.strip()]

    return sorted(set(stop_words))
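Two of the stop word sources used above can also be queried directly; a sketch assuming nltk (with its stopwords corpus downloaded) and spaCy are installed:

import nltk
from nltk.corpus import stopwords
from spacy.lang.en import STOP_WORDS

# nltk.download('stopwords')  # Uncomment on first run
print(len(stopwords.words('english')), len(STOP_WORDS))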
Example No. 10
def wl_lemmatize_text(main, text, lang, tokenized, tagged, lemmatizer):
    lemmas = []

    # spaCy
    if lemmatizer.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, text,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        lemmas = [
            token.dictionary_form()
            for token in main.sudachipy_word_tokenizer.tokenize(text)
        ]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
    # Tibetan
    elif lemmatizer == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)
    # Lemmatization Lists
    elif lemmatizer.startswith('lemmatization_lists_'):
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    lemmas = [
        str(lemma).strip()
        for lemma in lemmas
        if str(lemma).strip()
    ]

    return lemmas
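The nltk_wordnet branch relies on WordNet-aware lemmatization; a small standalone example, assuming nltk and its wordnet data are installed:

import nltk
from nltk.corpus import wordnet

# nltk.download('wordnet')  # Uncomment on first run
word_net_lemmatizer = nltk.WordNetLemmatizer()

print(word_net_lemmatizer.lemmatize('running', pos = wordnet.VERB))
print(word_net_lemmatizer.lemmatize('better', pos = wordnet.ADJ))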
Example No. 11
def wl_lemmatize_tokens(main, tokens, lang, tokenized, tagged, lemmatizer):
    empty_offsets = []
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags = wl_matching.get_re_tags(main, tag_type = 'body')

    if tagged == _tr('wl_lemmatize_tokens', 'Yes'):
        tags = [''.join(re.findall(re_tags, token)) for token in tokens]
        tokens = [re.sub(re_tags, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens with their tags
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            del tokens[i]
            del tags[i]

    # spaCy
    if 'spacy' in lemmatizer:
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese model does not have a lemmatizer component, and Japanese lemmas are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1923647
        else:
            doc = nlp(''.join(tokens))

        lemma_tokens = [token.text for token in doc]
        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, tokens,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))

        lemma_tokens = tokens.copy()
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        tokens_retokenized = main.sudachipy_word_tokenizer.tokenize(''.join(tokens))

        lemma_tokens = [token.surface() for token in tokens_retokenized]
        lemmas = [token.dictionary_form() for token in tokens_retokenized]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)

        lemma_tokens = tokens.copy()
    # Tibetan
    elif lemmatizer == 'botok_bod':
        lemma_tokens = []
        tokens_retokenized = main.botok_word_tokenizer.tokenize(''.join(tokens))

        for token in tokens_retokenized:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)

            lemma_tokens.append(token.text)
    # Lemmatization Lists
    elif 'lemmatization_lists' in lemmatizer:
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        lemma_tokens = tokens.copy()
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    for i, lemma in reversed(list(enumerate(lemmas))):
        lemma_tokens[i] = lemma_tokens[i].strip()
        lemmas[i] = lemma.strip()

        if not lemmas[i]:
            del lemmas[i]
            del lemma_tokens[i]

    # Make sure that tokenization is not modified during lemmatization
    i_tokens = 0
    i_lemmas = 0

    len_tokens = len(tokens)
    len_lemmas = len(lemmas)

    if len_tokens != len_lemmas:
        tags_modified = []
        lemmas_modified = []

        while i_tokens < len_tokens and i_lemmas < len_lemmas:
            # Different token
            if len(tokens[i_tokens]) != len(lemma_tokens[i_lemmas]):
                tokens_temp = [tokens[i_tokens]]
                tags_temp = [tags[i_tokens]]
                lemma_tokens_temp = [lemma_tokens[i_lemmas]]
                lemmas_temp = [lemmas[i_lemmas]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_lemmas < len_lemmas - 1:
                    len_tokens_temp = sum([len(token) for token in tokens_temp])
                    len_lemma_tokens_temp = sum([len(token) for token in lemma_tokens_temp])

                    if len_tokens_temp > len_lemma_tokens_temp:
                        lemma_tokens_temp.append(lemma_tokens[i_lemmas + 1])
                        lemmas_temp.append(lemmas[i_lemmas + 1])

                        i_lemmas += 1
                    elif len_tokens_temp < len_lemma_tokens_temp:
                        tokens_temp.append(tokens[i_tokens + 1])
                        tags_temp.append(tags[i_tokens + 1])

                        i_tokens += 1
                    else:
                        # Use lemmas in one-to-one
                        if len(tokens_temp) == len(lemma_tokens_temp):
                            tags_modified.extend(tags_temp)
                            lemmas_modified.extend(lemmas_temp)
                        # Use original tokens in many-to-one or one-to-many
                        else:
                            tags_modified.extend(tags_temp)
                            lemmas_modified.extend(tokens_temp)

                        tokens_temp = []
                        tags_temp = []
                        lemma_tokens_temp = []
                        lemmas_temp = []

                        break

                if tokens_temp:
                    # Use lemmas in one-to-one
                    if len(tokens_temp) == len(lemma_tokens_temp):
                        tags_modified.extend(tags_temp)
                        lemmas_modified.extend(lemmas_temp)
                    # Use original tokens in many-to-one or one-to-many
                    else:
                        tags_modified.extend(tags_temp)
                        lemmas_modified.extend(tokens_temp)
            else:
                tags_modified.append(tags[i_tokens])
                lemmas_modified.append(lemmas[i_lemmas])

            i_tokens += 1
            i_lemmas += 1

        len_lemmas_modified = len(lemmas_modified)

        if len_tokens < len_lemmas_modified:
            tags = tags_modified[:len_tokens]
            lemmas = lemmas_modified[:len_tokens]
        elif len_tokens > len_lemmas_modified:
            tags = tags_modified + [tags_modified[-1]] * (len_tokens - len_lemmas_modified)
            lemmas = lemmas_modified + [lemmas_modified[-1]] * (len_tokens - len_lemmas_modified)
        else:
            tags = tags_modified.copy()
            lemmas = lemmas_modified.copy()

    # Insert empty lemmas and their tags after alignment of input and output
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')
        tags.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
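The Lemmatization Lists branch builds a plain word-to-lemma dictionary from tab-separated lines; a self-contained sketch using a couple of hypothetical entries in place of the bundled files:

# Hypothetical lines in the 'lemma<TAB>word' format parsed above
lines = ['be\twas', 'be\twere', 'good\tbetter']

mapping_lemmas = {}

for line in lines:
    lemma, word = line.rstrip().split('\t')

    mapping_lemmas[word] = lemma

print([mapping_lemmas.get(token, token) for token in ['she', 'was', 'better']])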
Example No. 12
def wl_word_detokenize(main, tokens, lang):
    text = ''

    if lang == 'other':
        lang = 'eng_gb'

    wl_nlp_utils.init_word_detokenizers(main, lang=lang)

    # Chinese
    if lang.startswith('zho'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i >= non_cjk_start:
                if (wl_checking_unicode.has_han(token)
                        or all(map(str.isnumeric, token))):
                    text += token

                    non_cjk_start += 1
                else:
                    # Non-Chinese
                    for j, _ in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens)
                                or wl_checking_unicode.has_han(
                                    tokens[i + j + 1])):
                            text += wl_word_detokenize(
                                main,
                                tokens=tokens[non_cjk_start:i + j + 1],
                                lang='other')

                            non_cjk_start = i + j + 1

                            break
    # Japanese
    elif lang == 'jpn':
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i < non_cjk_start:
                continue

            if (wl_checking_unicode.has_han(token)
                    or wl_checking_unicode.has_kana(token)
                    or all(map(str.isnumeric, token))):
                text += token

                non_cjk_start = i + 1
            else:
                # Non-Japanese
                for j, _ in enumerate(tokens[i:]):
                    if (i + j + 1 == len(tokens)
                            or wl_checking_unicode.has_han(tokens[i + j + 1])
                            or wl_checking_unicode.has_kana(
                                tokens[i + j + 1])):
                        text += wl_word_detokenize(
                            main,
                            tokens=tokens[non_cjk_start:i + j + 1],
                            lang='other')

                        non_cjk_start = i + j + 1

                        break
    # Thai
    elif lang == 'tha':
        non_thai_start = 0

        for i, token in enumerate(tokens):
            if i < non_thai_start:
                continue

            if wl_checking_unicode.has_thai(token):
                if type(token) == wl_texts.Wl_Token:
                    text += token + token.boundary
                else:
                    text += token

                non_thai_start = i + 1
            else:
                # Non-Thai
                for j, _ in enumerate(tokens[i:]):
                    if (i + j + 1 == len(tokens) or
                            wl_checking_unicode.has_thai(tokens[i + j + 1])):
                        text += wl_word_detokenize(
                            main,
                            tokens=tokens[non_thai_start:i + j + 1],
                            lang='other')

                        non_thai_start = i + j + 1

                        break
    # Tibetan
    elif lang == 'bod':
        non_tibetan_start = 0

        for i, token in enumerate(tokens):
            if i < non_tibetan_start:
                continue

            if wl_checking_unicode.has_tibetan(token):
                # Check for Tibetan Mark Shad
                # See: https://w3c.github.io/tlreq/#section_breaks
                if i > 0 and text[-1] == '།' and token[0] == '།':
                    text += ' ' + token
                else:
                    text += token

                non_tibetan_start = i + 1
            else:
                # Non-Tibetan
                for j, _ in enumerate(tokens[i:]):
                    if (i + j + 1 == len(tokens)
                            or wl_checking_unicode.has_tibetan(
                                tokens[i + j + 1])):
                        text += wl_word_detokenize(
                            main,
                            tokens=tokens[non_tibetan_start:i + j + 1],
                            lang='other')

                        non_tibetan_start = i + j + 1

                        break
    else:
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        sentence_start = 0
        sentences = []

        for i, token in enumerate(tokens):
            if type(token) == wl_texts.Wl_Token and token.sentence_ending:
                sentences.append(tokens[sentence_start:i + 1])

                sentence_start = i + 1
            elif i == len(tokens) - 1:
                sentences.append(tokens[sentence_start:])

        for sentence in sentences:
            text += main.__dict__[
                f'sacremoses_moses_detokenizer_{lang}'].detokenize(sentence)

    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()
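The CJK branches above rely on helpers such as wl_checking_unicode.has_han, which are not shown here; a rough stand-in that only covers the basic CJK Unified Ideographs block (the real helper likely covers more ranges):

def has_han_sketch(token):
    # Hypothetical helper: True if any character falls in the basic CJK Unified Ideographs block
    return any('\u4e00' <= char <= '\u9fff' for char in token)

print(has_han_sketch('汉字'))   # True
print(has_han_sketch('hello'))  # False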