Example no. 1
    def test_tagging(self):
        # test_1
        text = 'Pythonで簡単に使えるツールです'
        output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_2
        output = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
        words = nagisa.tagging(text, lower=True)
        self.assertEqual(output, str(words))

        # test_3
        text = 'ニューラルネットワークを使ってます。'
        output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(nagisa.tagging(text)))

        # test_4
        tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', "ニューラルネット"])
        output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(tagger_nn.tagging(text)))

        # test_5
        text = "3月に見た「3月のライオン」"
        new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
        output = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
        self.assertEqual(output, str(new_tagger.tagging(text)))

        # test_6
        text = '(人•ᴗ•♡)こんばんは♪'
        output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_7
        url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
        output = 'コード/名詞 公開/名詞 中/接尾辞'
        words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
        self.assertEqual(output, str(words))

        # test_8
        output = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑¯ω¯๑)/補助記号'
        words = nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])
        self.assertEqual(output, str(words))

        # test_9
        words = [" (人•ᴗ•♡)", "こんばんは", "♪"]
        output = ['補助記号', '感動詞', '補助記号']
        postags = nagisa.postagging(words)
        self.assertEqual(output, postags)

        # test_10
        postags = nagisa.decode(words)
        self.assertEqual(output, postags)
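
For orientation, the calls exercised above follow nagisa's documented usage: tagging() returns an object whose words and postags attributes hold parallel lists, and extract()/filter() keep or drop tokens by POS tag. A minimal standalone sketch (the sample sentence is arbitrary):

import nagisa

# Tag a sentence; the result exposes parallel lists of words and POS tags
words = nagisa.tagging('Pythonで簡単に使えるツールです')
print(words.words)    # ['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']
print(words.postags)  # ['名詞', '助詞', '形状詞', '助動詞', '動詞', '名詞', '助動詞']

# Keep only nouns, dropping everything else
nouns = nagisa.extract('Pythonで簡単に使えるツールです', extract_postags=['名詞'])
print(nouns.words)    # ['Python', 'ツール']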
Example no. 2
def wl_pos_tag(main, tokens, lang, pos_tagger='default', tagset='custom'):
    tokens_tagged = []

    # Check if the first token is empty
    if tokens and tokens[0] == '':
        first_token_empty = True
    else:
        first_token_empty = False

    tokens = [str(token) for token in tokens if token]

    if pos_tagger == 'default':
        pos_tagger = main.settings_custom['pos_tagging']['pos_taggers'][lang]

    wl_text_utils.check_pos_taggers(main, lang=lang, pos_tagger=pos_tagger)

    # Chinese
    if pos_tagger == main.tr('jieba - Chinese POS Tagger'):
        tokens_tagged = jieba.posseg.cut(' '.join(tokens))

    # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
    elif 'spaCy' in pos_tagger:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
        nlp.tagger(doc)

        tokens_tagged = [(token.text, token.tag_) for token in doc]

    # English & Russian
    elif pos_tagger == main.tr('NLTK - Perceptron POS Tagger'):
        tokens_tagged = nltk.pos_tag(tokens, lang=lang)

    # Japanese
    elif pos_tagger == main.tr('nagisa - Japanese POS Tagger'):
        import nagisa

        tokens_tagged = zip(tokens, nagisa.postagging(tokens))

    # Russian & Ukrainian
    elif pos_tagger == main.tr('pymorphy2 - Morphological Analyzer'):
        if lang == 'rus':
            morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
        elif lang == 'ukr':
            morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

        for token in tokens:
            tokens_tagged.append(
                (token, morphological_analyzer.parse(token)[0].tag._POS))

    # Thai
    elif pos_tagger == main.tr('PyThaiNLP - Perceptron Tagger (ORCHID)'):
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='orchid')
    elif pos_tagger == main.tr('PyThaiNLP - Perceptron Tagger (PUD)'):
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='pud')

    # Tibetan
    elif pos_tagger == main.tr('botok - Tibetan POS Tagger'):
        wl_text_utils.check_word_tokenizers(main, lang='bod')
        tokens = main.botok_word_tokenizer.tokenize(' '.join(tokens))

        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))

    # Vietnamese
    elif pos_tagger == main.tr('Underthesea - Vietnamese POS Tagger'):
        tokens_tagged = underthesea.pos_tag(' '.join(tokens))

    # Convert to Universal Tagset
    if (tagset == 'custom'
            and main.settings_custom['pos_tagging']['to_universal_pos_tags']
            or tagset == 'universal'):

        mappings = {
            tag: tag_universal
            for tag, tag_universal, _, _ in main.settings_custom['tagsets']
            ['mappings'][lang][pos_tagger]
        }
        tokens_tagged = list(tokens_tagged)

        # Issue warnings if any tag is missing from the mapping table
        for _, tag in tokens_tagged:
            if tag not in mappings:
                print(
                    f'Warning: tag "{tag}" is missing from the {wl_conversion.to_lang_text(main, lang)} mapping table!'
                )

        tokens_tagged = [(token, mappings.get(tag, 'X'))
                         for token, tag in tokens_tagged]

    # Strip empty tokens and strip whitespace in tokens
    tokens_tagged = [(token.strip(), tag) for token, tag in tokens_tagged
                     if token.strip()]

    # Add the first empty token (if any)
    if first_token_empty:
        tokens_tagged.insert(0, ('', ''))

    return tokens_tagged
Example no. 3
def wordless_pos_tag(main,
                     tokens,
                     lang,
                     pos_tagger='default',
                     tagset='custom'):
    tokens_tagged = []

    tokens = [str(token) for token in tokens]

    if pos_tagger == 'default':
        pos_tagger = main.settings_custom['pos_tagging']['pos_taggers'][lang]

    wordless_text_utils.check_pos_taggers(main,
                                          lang=lang,
                                          pos_tagger=pos_tagger)

    # Chinese
    if pos_tagger == main.tr('jieba - Chinese POS Tagger'):
        tokens_tagged = jieba.posseg.cut(' '.join(tokens))

    # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
    elif 'spaCy' in pos_tagger:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
        nlp.tagger(doc)

        tokens_tagged = [(token.text, token.tag_) for token in doc]

    # English & Russian
    elif pos_tagger == main.tr('NLTK - Perceptron POS Tagger'):
        tokens_tagged = nltk.pos_tag(tokens, lang=lang)

    # Japanese
    elif pos_tagger == main.tr('nagisa - Japanese POS Tagger'):
        import nagisa

        tokens_tagged = zip(tokens, nagisa.postagging(tokens))

    # Russian & Ukrainian
    elif pos_tagger == main.tr('pymorphy2 - Morphological Analyzer'):
        if lang == 'rus':
            morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
        elif lang == 'ukr':
            morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

        for token in tokens:
            tokens_tagged.append(
                (token, morphological_analyzer.parse(token)[0].tag._POS))

    # Thai
    elif pos_tagger == main.tr(
            'PyThaiNLP - Perceptron POS Tagger - ORCHID Corpus'):
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='orchid')
    elif pos_tagger == main.tr(
            'PyThaiNLP - Perceptron POS Tagger - PUD Corpus'):
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='pud')

    # Tibetan
    elif pos_tagger == main.tr('pybo - Tibetan POS Tagger'):
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

        wordless_text_utils.check_pybo_tokenizers(
            main, word_tokenizer=word_tokenizer)

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            tokens = main.pybo_tokenizer_gmd.tokenize(' '.join(tokens))
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            tokens = main.pybo_tokenizer_pos.tokenize(' '.join(tokens))
        elif word_tokenizer == main.tr(
                'pybo - Tibetan Word Tokenizer (tsikchen)'):
            tokens = main.pybo_tokenizer_tsikchen.tokenize(' '.join(tokens))

        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))

    # Vietnamese
    elif pos_tagger == main.tr('Underthesea - Vietnamese POS Tagger'):
        tokens_tagged = underthesea.pos_tag(' '.join(tokens))

    # Convert to Universal Tagset
    if (tagset == 'custom'
            and main.settings_custom['pos_tagging']['to_universal_pos_tags']
            or tagset == 'universal'):

        mappings = {
            tag: tag_universal
            for tag, tag_universal, _, _ in main.settings_custom['tagsets']
            ['mappings'][lang][pos_tagger]
        }

        tokens_tagged = [(token, mappings[tag])
                         for token, tag in tokens_tagged]

    # Strip empty tokens and strip whitespace in tokens
    tokens_tagged = [(token.strip(), tag) for token, tag in tokens_tagged
                     if token.strip()]

    # Check if the first token is empty
    if tokens[0] == '':
        tokens_tagged.insert(0, ('', ''))

    return tokens_tagged
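
Both versions above convert tagger-specific tags to the Universal tagset through a per-tagger mapping table; the newer wl_pos_tag warns about unmapped tags and falls back to 'X', whereas the older wordless_pos_tag indexes the table directly. A self-contained sketch of that mapping step, using a made-up mapping table and tags purely for illustration:

# Hypothetical mapping rows in the same (tag, universal_tag, ...) shape as the
# Wordless settings; the real tables live in main.settings_custom['tagsets']
mapping_rows = [
    ('NN', 'NOUN', '', ''),
    ('VBZ', 'VERB', '', ''),
    ('JJ', 'ADJ', '', ''),
]
mappings = {tag: tag_universal for tag, tag_universal, _, _ in mapping_rows}

tokens_tagged = [('cat', 'NN'), ('sits', 'VBZ'), ('quietly', 'RB')]

# Warn about tags missing from the table, then fall back to 'X' as wl_pos_tag does
for _, tag in tokens_tagged:
    if tag not in mappings:
        print(f'Warning: tag "{tag}" is missing from the mapping table!')

tokens_tagged = [(token, mappings.get(tag, 'X')) for token, tag in tokens_tagged]
print(tokens_tagged)  # [('cat', 'NOUN'), ('sits', 'VERB'), ('quietly', 'X')]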
Example no. 4
def wl_pos_tag_tokens(main, tokens, lang, pos_tagger, tagset):
    tokens_tagged = []

    if pos_tagger == 'nagisa_jpn':
        # Defer import to save loading time
        import nagisa

    lang = wl_conversion.remove_lang_code_suffixes(main, lang)

    # spaCy
    if pos_tagger.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab,
                                   words=tokens,
                                   spaces=[False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese model does not have a tagger component; Japanese POS tags are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1910117
        else:
            doc = nlp(''.join(tokens))

        if tagset == 'default':
            tokens_tagged = [(token.text, token.tag_) for token in doc]
        elif tagset == 'universal':
            tokens_tagged = [(token.text, token.pos_) for token in doc]
    # Chinese
    elif pos_tagger == 'jieba_zho':
        tokens_tagged = jieba.posseg.cut(''.join(tokens))
    # English & Russian
    elif pos_tagger == 'nltk_perceptron':
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == 'nagisa_jpn':
        tokens_tagged = zip(tokens, nagisa.postagging(tokens))
    elif pos_tagger == 'sudachipy_jpn':
        tokens_tagged = [(token.surface(), '-'.join(
            [pos for pos in token.part_of_speech()[:4] if pos != '*']))
                         for token in main.sudachipy_word_tokenizer.tokenize(
                             ''.join(tokens))]
    # Russian & Ukrainian
    elif pos_tagger == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            tokens_tagged.append(
                (token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger == 'pythainlp_perceptron_lst20':
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='lst20')
    elif pos_tagger == 'pythainlp_perceptron_orchid':
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='orchid')
    elif pos_tagger == 'pythainlp_perceptron_pud':
        tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                              engine='perceptron',
                                              corpus='pud')
    # Tibetan
    elif pos_tagger == 'botok_bod':
        tokens_retokenized = main.botok_word_tokenizer.tokenize(
            ''.join(tokens))

        for token in tokens_retokenized:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))

    # Vietnamese
    elif pos_tagger == 'underthesea_vie':
        tokens_tagged = underthesea.pos_tag(' '.join(tokens))

    # Remove empty tokens and strip whitespace in tokens
    tokens_tagged = [(str(token).strip(), tag) for token, tag in tokens_tagged
                     if str(token).strip()]

    # Make sure that tokenization is not modified during POS tagging
    i_tokens = 0
    i_tokens_tagged = 0

    len_tokens = len(tokens)
    len_tokens_tagged = len(tokens_tagged)

    if len_tokens != len_tokens_tagged:
        tokens_tagged_modified = []

        while i_tokens < len_tokens and i_tokens_tagged < len_tokens_tagged:
            # Different token
            if len(tokens[i_tokens]) != len(tokens_tagged[i_tokens_tagged][0]):
                tokens_temp = [tokens[i_tokens]]
                tokens_tagged_temp = [tokens_tagged[i_tokens_tagged][0]]
                tags_temp = [tokens_tagged[i_tokens_tagged][1]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_tokens_tagged < len_tokens_tagged - 1:
                    len_tokens_temp = sum(
                        [len(token) for token in tokens_temp])
                    len_tokens_tagged_temp = sum(
                        [len(token) for token in tokens_tagged_temp])

                    if len_tokens_temp > len_tokens_tagged_temp:
                        tokens_tagged_temp.append(
                            tokens_tagged[i_tokens_tagged + 1][0])
                        tags_temp.append(tokens_tagged[i_tokens_tagged + 1][1])

                        i_tokens_tagged += 1
                    elif len_tokens_temp < len_tokens_tagged_temp:
                        tokens_temp.append(tokens[i_tokens + 1])

                        i_tokens += 1
                    else:
                        if len(tokens_temp) == len(tokens_tagged_temp):
                            tokens_tagged_modified.extend([
                                (token, tag)
                                for token, tag in zip(tokens_temp, tags_temp)
                            ])
                        elif len(tokens_temp) > len(tokens_tagged_temp):
                            tokens_tagged_modified.extend([
                                (token, tags_temp[0]) for token in tokens_temp
                            ])
                        else:
                            tokens_tagged_modified.append(
                                (tokens_temp[0], tags_temp[0]))

                        tokens_temp = []
                        tokens_tagged_temp = []
                        tags_temp = []

                        break

                if tokens_temp:
                    if len(tokens_temp) == len(tokens_tagged_temp):
                        tokens_tagged_modified.extend([
                            (token, tag)
                            for token, tag in zip(tokens_temp, tags_temp)
                        ])
                    elif len(tokens_temp) > len(tokens_tagged_temp):
                        tokens_tagged_modified.extend([
                            (token, tags_temp[0]) for token in tokens_temp
                        ])
                    else:
                        tokens_tagged_modified.append(
                            (tokens_temp[0], tags_temp[0]))
            else:
                tokens_tagged_modified.append(
                    (tokens[i_tokens], tokens_tagged[i_tokens_tagged][1]))

            i_tokens += 1
            i_tokens_tagged += 1

        len_tokens_tagged_modified = len(tokens_tagged_modified)

        if len_tokens < len_tokens_tagged_modified:
            tokens_tagged = tokens_tagged_modified[:len_tokens]
        elif len_tokens > len_tokens_tagged_modified:
            tokens_tagged = tokens_tagged_modified + [
                tokens_tagged_modified[-1]
            ] * (len_tokens - len_tokens_tagged_modified)
        else:
            tokens_tagged = tokens_tagged_modified.copy()
    else:
        tokens_tagged = [(tokens[i], tokens_tagged[i][1])
                         for i in range(len(tokens))]

    return tokens_tagged
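
The closing block of wl_pos_tag_tokens restores a one-tag-per-original-token result when a tagger re-tokenizes its input. A much-simplified sketch of the same idea (not the Wordless implementation) that aligns the two tokenizations by cumulative character offsets, assuming both cover the same text:

def realign_tags(tokens, tokens_tagged):
    # Assign each original token the tag of the tagged token covering its start
    tags = []
    i_tagged = 0       # index into tokens_tagged
    offset_tagged = 0  # characters consumed on the tagged side
    offset = 0         # characters consumed on the original side

    for token in tokens:
        # Advance the tagged side until it reaches the start of this token
        while (i_tagged < len(tokens_tagged) - 1
               and offset_tagged + len(tokens_tagged[i_tagged][0]) <= offset):
            offset_tagged += len(tokens_tagged[i_tagged][0])
            i_tagged += 1

        tags.append(tokens_tagged[i_tagged][1])
        offset += len(token)

    return list(zip(tokens, tags))

# Example: the tagger split 'ニューラルネットワーク' into two tokens
tokens = ['ニューラルネットワーク', 'を', '使う']
tagged = [('ニューラル', '名詞'), ('ネットワーク', '名詞'), ('を', '助詞'), ('使う', '動詞')]
print(realign_tags(tokens, tagged))
# [('ニューラルネットワーク', '名詞'), ('を', '助詞'), ('使う', '動詞')]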
Example no. 5
    def test_tagging(self):
        # test_1
        text = 'Pythonで簡単に使えるツールです'
        output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_2
        output = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
        words = nagisa.tagging(text, lower=True)
        self.assertEqual(output, str(words))

        # test_3
        text = 'ニューラルネットワークを使ってます。'
        output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(nagisa.tagging(text)))

        # test_4
        tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', "ニューラルネット"])
        output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(tagger_nn.tagging(text)))

        # test_5
        text = "3月に見た「3月のライオン」"
        new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
        output = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
        self.assertEqual(output, str(new_tagger.tagging(text)))

        # test_6
        text = "それが、iPhone XSです。"
        output = "それ/代名詞 が/助詞 、/補助記号 iPhone XS/名詞 です/助動詞 。/補助記号"
        new_tagger = nagisa.Tagger(single_word_list=["iPhone[a-zA-Z0-9 ]+"])

        self.assertEqual(output, str(new_tagger.tagging(text)))

        # test_7
        text = "1234abc ABC"
        output = "1234/名詞 abc ABC/名詞"
        new_tagger = nagisa.Tagger(single_word_list=["[a-zA-Z ]+", "[0-9]+"])

        self.assertEqual(output, str(new_tagger.tagging(text)))

        # test_8
        text = '(人•ᴗ•♡)こんばんは♪'
        output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_9
        url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
        output = 'コード/名詞 公開/名詞 中/接尾辞'
        words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
        self.assertEqual(output, str(words))

        # test_10
        output = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑¯ω¯๑)/補助記号'
        words = nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])
        self.assertEqual(output, str(words))

        # test_11
        words = [" (人•ᴗ•♡)", "こんばんは", "♪"]
        output = ['補助記号', '感動詞', '補助記号']
        postags = nagisa.postagging(words)
        self.assertEqual(output, postags)

        # test_12
        postags = nagisa.decode(words)
        self.assertEqual(output, postags)

        # test_13
        words = [" (人•ᴗ•♡)", " ", "こんばんは", "♪"]
        output = ['補助記号', "空白", '感動詞', '補助記号']
        postags = nagisa.postagging(words)

        self.assertEqual(output, postags)

        # test_14
        postags = nagisa.decode(words)
        self.assertEqual(output, postags)

        # test_15
        words = [" (人•ᴗ•♡)", " ", "こんばんは", "♪"]
        output = ['補助記号', "空白", '感動詞', '補助記号']
        postags = nagisa.postagging(words)

        self.assertEqual(output, postags)

        # test_16
        postags = nagisa.decode(words)
        self.assertEqual(output, postags)

        # test_17
        text = "こんばんは😀"
        output = "こんばんは/感動詞 😀/補助記号"
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_18
        text = "コンバンハ12345"
        output = "コンバンハ/名詞 1/名詞 2/名詞 3/名詞 4/名詞 5/名詞"
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_19
        text = "𪗱𪘂𪘚𪚲"
        output = "𪗱/補助記号 𪘂/補助記号 𪘚/補助記号 𪚲/補助記号"
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))