Example No. 1
    def test_tagging(self):
        # test_1
        text = 'Pythonで簡単に使えるツールです'
        output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_2
        text = 'ニューラルネットワークを使ってます。'
        output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(nagisa.tagging(text)))

        # test_3
        tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク'])
        output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(tagger_nn.tagging(text)))

        # test_4
        text = '(人•ᴗ•♡)こんばんは♪'
        output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_5
        url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
        output = 'コード/名詞 公開/名詞 中/接尾辞'
        words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
        self.assertEqual(output, str(words))
Example No. 2
    def test_tagging(self):
        # test_1
        text = 'Pythonで簡単に使えるツールです'
        output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_2
        output = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
        words = nagisa.tagging(text, lower=True)
        self.assertEqual(output, str(words))

        # test_3
        text = 'ニューラルネットワークを使ってます。'
        output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(nagisa.tagging(text)))

        # test_4
        tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', "ニューラルネット"])
        output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(tagger_nn.tagging(text)))

        # test_5
        text = "3月に見た「3月のライオン」"
        new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
        output = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
        self.assertEqual(output, str(new_tagger.tagging(text)))

        # test_6
        text = '(人•ᴗ•♡)こんばんは♪'
        output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_7
        url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
        output = 'コード/名詞 公開/名詞 中/接尾辞'
        words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
        self.assertEqual(output, str(words))

        # test_8
        output = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑¯ω¯๑)/補助記号'
        words = nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])
        self.assertEqual(output, str(words))

        # test_9
        words = [" (人•ᴗ•♡)", "こんばんは", "♪"]
        output = ['補助記号', '感動詞', '補助記号']
        postags = nagisa.postagging(words)
        self.assertEqual(output, postags)

        # test_10
        postags = nagisa.decode(words)
        self.assertEqual(output, postags)
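
As a quick reference for the API surface these tests exercise, here is a minimal standalone sketch; the expected outputs in the comments are taken from the assertions above.

import nagisa

text = 'Pythonで簡単に使えるツールです'

# Word segmentation + POS tagging
words = nagisa.tagging(text)
print(words.words)    # ['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']
print(words.postags)  # ['名詞', '助詞', '形状詞', '助動詞', '動詞', '名詞', '助動詞']

# Keep everything except the listed POS tags
print(nagisa.filter(text, filter_postags=['助詞', '助動詞']))

# Keep only the listed POS tags
print(nagisa.extract(text, extract_postags=['名詞']))

# POS-tag an already segmented word list
print(nagisa.postagging(['Python', 'で', '簡単']))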
Example No. 3
    def filter_sentence_by_pos_tag_japanese(
        self,
        # string or word list
        word_list,
        keep_tags=DEFAULT_KEEP_TAGS_JAP,
    ):
        try:
            import nagisa
        except Exception as ex:
            raise Exception(
                str(__name__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Unable to load nagisa: ' + str(ex))
        if type(word_list) in [list, tuple]:
            text = ' '.join(word_list)
        else:
            text = word_list
        words_postags_obj = nagisa.tagging(text)
        txt_sym_tok = words_postags_obj.words
        txt_sym_postags = words_postags_obj.postags
        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Japanese segmentation ' + str(txt_sym_tok) +
            ', word & POS tags: ' + str(txt_sym_postags))

        words_postags = list(zip(txt_sym_tok, txt_sym_postags))
        sent_filtered = [w for w, t in words_postags if (t in keep_tags)]
        Log.debugdebug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': POS TAGs: ' + str(words_postags))
        Log.debugdebug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': Filtered sentence: ' + str(sent_filtered))
        return sent_filtered
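
DEFAULT_KEEP_TAGS_JAP is defined elsewhere in that code base, so as a rough illustration of the same idea, here is a hedged standalone sketch with an assumed tag set (nouns, verbs, adjectives), not the original default.

import nagisa

def keep_by_postag(text, keep_tags=('名詞', '動詞', '形容詞')):  # assumed tag set, not the original default
    # Tag the text, then keep only the words whose POS tag is in keep_tags
    tagged = nagisa.tagging(text)
    return [w for w, t in zip(tagged.words, tagged.postags) if t in keep_tags]

print(keep_by_postag('ニューラルネットワークを使ってます。'))
# -> ['ニューラル', 'ネットワーク', '使っ'] (per the tagging output shown in the tests above)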
Example No. 4
  def classify(self,link):
    # Morphological analysis of the target article
    html = urllib.request.urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')

    text = soup.find_all("p")
    text = [t.text for t in text]
    text = ','.join(text)

    words = nagisa.tagging(text)
    words = nagisa.filter(text, filter_postags=['助詞', '助動詞', '接頭辞', '接尾辞', '補助記号', 'URL', '空白']).words
    words = list(filter(lambda x: len(x) != 1, words))  # drop single-character words
    words = list(filter(lambda x: x != '', [re.sub(r'[0-9]', "", s) for s in words]))  # strip digits and drop empty tokens

    # Load the trained data
    self.word_ct = np.load('./classify_app/data/word_ct.npy',allow_pickle=True).tolist()
    self.category_ct = np.load('./classify_app/data/category_ct.npy',allow_pickle=True).tolist()
    self.vocabularies = np.load('./classify_app/data/vocabularies.npy',allow_pickle=True).tolist()

    best_category = None  # the most likely category

    max_prob = -sys.maxsize  # start from the smallest integer value

    # For each category, compute the category probability P(C|W) of the document's words
    for category in self.category_ct.keys():
      # Compute P(C|W) for this category
      prob = self.score(words, category)
      if prob > max_prob:
        max_prob = prob
        best_category = category

    return best_category
Example No. 5
def tag_and_write(line):
    # Expects a comma-separated record; tags the second field (or the first if the
    # second is empty) and appends the first POS tag to the line written to `out`,
    # an output file object defined elsewhere.
    line = line.replace('\n', '')
    line = line.replace('〜', '')
    split = line.split(',')
    tagged = nagisa.tagging(split[1] if split[1] else split[0])
    if tagged.postags:
        out.write(line + ',' + tagged.postags[0] + '\n')
    else:
        out.write(line + '\n')
Example No. 6
 def tokenize_jp(sentences_list):
     tokens_tags_list = []
     tokens_list = []
     tags_list = []
     for s in sentences_list:
         words_tags = nagisa.tagging(s)
         tokens_tags_list.append(str(words_tags))
         tokens_list.append(words_tags.words)
         tags_list.append(words_tags.postags)
     return tokens_tags_list, tokens_list, tags_list
Example No. 7
def detect_verb(input):
    texts = input.splitlines()
    flag = False
    for text in texts:
        words = nagisa.tagging(text)
        print("___________________________________")

        if '動詞' in words.postags:
            flag = True
            return flag
    return flag
Example No. 8
def translate_from_japanese(text):
    if (text is None):
        return "入力されていません"
    else:
        preds = model(
            torch.LongTensor(
                [ja_vocabulary.encode(nagisa.tagging(text).words)]))
        _out = preds.view(-1).tolist()
        out = ' '.join(th_vocabulary.decode(_out))
        begin = out.find('<s>')
        end = out.find('</s>')
        return out[begin + 3:end]
Example No. 9
def get_kanji(nodeList, kanjiCounter):
    #filter out only text events
    textEvents = [
        event for node in nodeList for event in node
        if event[_param_dict['code']] == 401
    ]

    for each in textEvents:
        japText = each[_param_dict['parameters']][0]
        words = nagisa.tagging(japText)

        for word in words.words:
            if not_kanji(word):
                continue
            # add to list of kanji
            kanjiCounter[word] += 1
Example No. 10
def original_usage(text):
    """
    Return the analysis results by nagisa.

    Parameters
    ----------
    text : str
        An input text

    Returns
    -------
    tokens : nagisa.tagger.Tagger._Token
        The analysis results by nagisa
    """
    tokens = nagisa.tagging(text)
    return tokens
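
A short usage sketch for this helper; the printed form in the comments matches the tagging output shown in the test examples earlier in this collection.

tokens = original_usage('Pythonで簡単に使えるツールです')
print(tokens)          # Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞
print(tokens.words)    # surface forms as a list
print(tokens.postags)  # POS tags as a list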
Example No. 11
    async def nagisa(self, context: commands.Context, *, message: str):
        """指定した文章を形態素解析します。"""
        stock = []
        character_message = self.bot.storage.lexicon.get_character_message_for_command_nagisa(
        )
        result = f"{character_message}\n"

        await context.trigger_typing()

        words = nagisa.tagging(message)
        for index in range(len(words.words)):
            word = words.words[index]
            postag = words.postags[index]
            result += " " + word + "`[" + postag + "]`"

        stock.append(f'{result}')
        await self.bot.send("\n".join(stock))
Example No. 12
def Morphological_analysis(link):
    html = urllib.request.urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')

    text = soup.find_all("p")
    text = [t.text for t in text]
    text = ','.join(text)

    words = nagisa.tagging(text)
    words = nagisa.filter(
        text, filter_postags=['助詞', '助動詞', '接頭辞', '接尾辞', '補助記号', 'URL',
                              '空白']).words
    words = list(filter(lambda x: len(x) != 1, words))  # drop single-character words
    words = list(
        filter(lambda x: x != '',
               [re.sub(r'[0-9]', "", s) for s in words]))  # strip digits and drop empty tokens

    return words
Example No. 13
 def segment_ko_ja(
         self,
         text,
         return_array_of_split_words = False
 ):
     try:
         if self.lang in [lf.LangFeatures.LANG_JA]:
             words_postags = nagisa.tagging(text)
             txt_sym_tok = words_postags.words
             txt_sym_postags = words_postags.postags
             Log.debug(
                 str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                 + ': Japanese segmentation "' + str(txt_sym_tok) + '", word & POS tags: ' + str(words_postags)
             )
             if return_array_of_split_words:
                 return txt_sym_tok
             else:
                 return BasicPreprocessor.get_word_separator(lang=self.lang).join(txt_sym_tok)
         elif self.lang in [lf.LangFeatures.LANG_KO]:
             self.warn_korean()
             words_postags = self.kkma.pos(
                 phrase = text
             )
             txt_sym_tok = [wp[0] for wp in words_postags]
             txt_sym_postags = [wp[1] for wp in words_postags]
             Log.debug(
                 str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                 + ': Korean segmentation "' + str(txt_sym_tok) + '", word & POS tags: ' + str(words_postags)
             )
             if return_array_of_split_words:
                 return txt_sym_tok
             else:
                 return BasicPreprocessor.get_word_separator(lang=self.lang).join(txt_sym_tok)
         else:
             raise Exception(
                 str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                 + ': No external library supported for language "' + str(self.lang) + '"'
             )
     except Exception as ex:
         errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': Error segmenting lang "' + str(self.lang) + '", text "' + str(text) \
                  + '", exception: ' + str(ex)
         Log.error(errmsg)
         raise Exception(errmsg)
Example No. 14
def tagging_document(file_name, tokens_only = False):
    with smart_open.smart_open(file_name) as f :
        for i, line in enumerate(f):
            text = nagisa.tagging(line)
            text_list = text.words
            text_postags = text.postags
            listed_text = []
            for text_list_elements, text_postags_element in zip(text_list, text_postags):
                if text_postags_element == "名詞" :
                    listed_text.append(text_list_elements)
                elif text_postags_element == "動詞" : 
                    listed_text.append(text_list_elements)
                elif text_postags_element == "形状詞" :
                    listed_text.append(text_list_elements)
            
            if tokens_only :
                yield listed_text
            else :
                yield gensim.models.doc2vec.TaggedDocument(listed_text, [i])
Example No. 15
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    wordless_text_utils.check_word_tokenizers(main,
                                              lang=lang,
                                              word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [
                itertools.chain.from_iterable(tokens_sentences)
            ]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append(
                    [token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])

    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_gmd.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_pos.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr(
                'pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text for token in
                    main.pybo_tokenizer_tsikchen.tokenize(sentence)
                ])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [
            token.strip() for token in tokens if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary='',
                                                          sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary=' ',
                                                          sentence_ending=True)

    return tokens_sentences
Example No. 16
def printDefs(outputs, seconds):
    if not hasattr(printDefs, "counter"):
        printDefs.counter = 0
    if not hasattr(printDefs, "tokenization_counter"):
        printDefs.tokenization_counter = 0
    tokens = set()
    with open("tout3.txt") as lines:
        for line in lines:
            #tinysegmenter
            tinysegmenter_tokens = tinysegmenter.tokenize(line.strip())
            #nagisa
            nagisa_tokens = nagisa.tagging(line.strip()).words
            if printDefs.tokenization_counter % 2 == 0:
                print("tinysegmenter:")
                tokenized_statement = tinysegmenter_tokens
            else:
                print("nagisa:")
                tokenized_statement = nagisa_tokens
            print(tokenized_statement)
            printDefs.tokenization_counter += 1
            for token in tokenized_statement:
                t = token.strip()
                if t not in banlist and t != "":
                    tokens.add(t)

    if len(tokens) == 0:
        time.sleep(2)
    translated = []
    for token in tokens:
        try:
            definition = subprocess.check_output(["myougiden", "-f", "--human", "-c", "-e", "whole", token.replace("|","!")]).decode("utf-8", "ignore")
            if len(outputs) == 0:
                print("")
                print("----------------------------------------------------------------------------")
                print("")
                print(token)
                print(definition)
            else:
                print("outputting " + token + " to " + outputs[printDefs.counter % len(outputs)])
                f = open(outputs[printDefs.counter % len(outputs)], "a")
                f.write("\n\n\n---------------------------------------------------\n\n\n\n")
                f.write(token)
                f.write("\n")
                f.write(definition)
                f.close()
                printDefs.counter += 1
            time.sleep(seconds)
        except:
            try:
                definition = subprocess.check_output(["myougiden", "--human", "-c", "-e", "whole", token.replace("|","!")]).decode("utf-8", "ignore")
                if len(outputs) == 0:
                    print("")
                    print("----------------------------------------------------------------------------")
                    print("")
                    print(token)
                    print(definition)
                else:
                    print("outputting " + token + " to " + outputs[printDefs.counter % len(outputs)])
                    f = open(outputs[printDefs.counter % len(outputs)], "a")
                    f.write("\n\n\n---------------------------------------------------\n\n\n\n")
                    f.write(token)
                    f.write("\n")
                    f.write(definition)
                    f.close()
                    printDefs.counter += 1
                time.sleep(seconds)
            except:
                time.sleep(0.01)
            #print(token + " not found in dictionary")
    return
Example No. 17
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           flat_tokens=True):
    tokens_hierarchical = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wordless_text_utils.check_word_tokenizers(
            main, lang=lang, word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_tokenizers(main,
                                             lang=lang,
                                             word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(
                    treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append(
                    [token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_hierarchical.append(
                [token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_hierarchical.append(
                        [token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_hierarchical.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_hierarchical.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='rus')

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(
            main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(
                str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [
            token.strip() for token in sentence if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary='', sentence_ending=True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary=' ', sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_hierarchical):
            tokens_hierarchical[i] = wordless_clause_tokenize(
                main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_hierarchical
Example No. 18
print("Loading n5 list")
with open("n5.list") as n5:
    for word in n5:
        n5words.add(word.strip())

print("Loading n4 list")
with open("n4.list") as n4:
    for word in n4:
        n4words.add(word.strip())

occurrences = {}
sentences = {}

print("Counting words")
with codecs.open(sys.argv[1], 'r', encoding='utf-8',
                 errors='ignore') as infile:
    for line in infile:
        #print(line)
        tokenizedLine = tinysegmenter.tokenize(line.strip()) + nagisa.tagging(
            line.strip()).words
        for token in tokenizedLine:
            if token not in banlist and token not in n5words and token not in n4words and token.strip(
            ) != "":
                if token in occurrences.keys():
                    occurrences[token] += 1
                else:
                    occurrences[token] = 1
                if token not in sentences.keys():
                    sentences[token] = line.replace("\n", "").replace("|", "!")

print("Sorting words")
sortedOccurrences = []
for character in occurrences.keys():
    sortedOccurrences.append([character, occurrences[character]])
Example No. 19
import pandas as pd
import numpy as np
import nagisa  # Japanese word segmentation
from sklearn.feature_extraction.text import TfidfVectorizer  # text feature extraction
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.pipeline import make_pipeline  # pipeline composition

# Load the data
train_cn = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/中文_trian.xlsx')
train_ja = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/日语_train.xlsx')
train_en = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/英文_train.xlsx')

test_ja = pd.read_excel('testA.xlsx', sheet_name='日语_testA')
test_en = pd.read_excel('testA.xlsx', sheet_name='英文_testA')

# Tokenize the text
train_ja['words'] = train_ja['原始文本'].apply(
    lambda x: ' '.join(nagisa.tagging(x).words))
train_en['words'] = train_en['原始文本'].apply(lambda x: x.lower())

test_ja['words'] = test_ja['原始文本'].apply(
    lambda x: ' '.join(nagisa.tagging(x).words))
test_en['words'] = test_en['原始文本'].apply(lambda x: x.lower())

# Train TF-IDF + logistic regression
pipline = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipline.fit(train_ja['words'].tolist() + train_en['words'].tolist(),
            train_ja['意图'].tolist() + train_en['意图'].tolist())

# Predict with the model
test_ja['意图'] = pipline.predict(test_ja['words'])
test_en['意图'] = pipline.predict(test_en['words'])
test_en['槽值1'] = np.nan
Example No. 20
#coding:utf-8
import django
from django.conf import settings
# Japanese text segmentation
import nagisa
string='何がちゃうねん、言うてみ'
string00='明日東京いかたん'
word=nagisa.tagging(string)
word00=nagisa.tagging(string00)
print(word.words)
print(word00.words)
Example No. 21
def Nagisa_Run(text):
    # Tag the input text and print the word/POS pairs
    tokens = nagisa.tagging(text)
    print(tokens)
Example No. 22
 def tokenize(self, text):
     words = nagisa.tagging(text)
     return dict(words=words.words, postags=words.postags)
Example No. 23
#!/usr/bin/env python
# coding: utf-8

import nagisa
import collections

text1 = "水を含んだ湿った空気が流れてくる"
text2 = "時代を漂う湿った空気が読みとれる"

doc = nagisa.tagging(text1 + text2)


def tokenize_jp(doc):
    doc = nagisa.tagging(doc)
    return doc.words


text1 = tokenize_jp(text1)
text2 = tokenize_jp(text2)

count_air1 = 426
count_air2 = 455
percent_air1 = count_air1 / (count_air1 + count_air2)
percent_air2 = count_air2 / (count_air1 + count_air2)

c1 = collections.Counter(湿っ=52, 水=34, 流れ=11, 感じ=14, 時代=6, 漂う=5, 読み=2)
c2 = collections.Counter(湿っ=1, 水=3, 流れ=22, 感じ=34, 時代=32, 漂う=33, 読み=89)

text1_c1_probability = 1
text1_c2_probability = 1
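
The snippet ends here; presumably a loop multiplying per-word likelihoods into the accumulators follows. A hedged sketch of one way that could look (this is an assumption, not the original author's code; the add-one smoothing and the use of the priors are illustrative only):

def category_probability(tokens, counter, prior):
    # Multiply the prior by a smoothed relative frequency for each token (add-one smoothing assumed)
    total = sum(counter.values())
    prob = prior
    for w in tokens:
        prob *= (counter[w] + 1) / (total + len(counter))
    return prob

text1_c1_probability = category_probability(text1, c1, percent_air1)
text1_c2_probability = category_probability(text1, c2, percent_air2)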
Example No. 24
def extract_emoji(text):
    results = nagisa.tagging(text)  # morphological analysis
    words = results.words
    return [w for w in words if w in emoji.UNICODE_EMOJI]
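
Note that emoji.UNICODE_EMOJI comes from older releases of the emoji package; newer releases reportedly removed it in favour of emoji.is_emoji(). If that applies to your installed version, a variant along these lines may be needed (an assumption to verify against your environment):

import emoji
import nagisa

def extract_emoji(text):
    # emoji.is_emoji() is assumed available in newer emoji releases; verify against your version
    return [w for w in nagisa.tagging(text).words if emoji.is_emoji(w)]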
Example No. 25
def tokenize(doc):
    doc = nagisa.tagging(doc)
    return doc.words
Example No. 26
 def on_paste(self,event):
     self.words = nagisa.tagging(self.master.clipboard_get())
     print(self.words.words)
     self.idx = [0,0]
     self.arranging(self.words.words)
Example No. 27
def wl_word_tokenize(main, text, lang, word_tokenizer='default'):
    tokens_multilevel = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    wl_nlp_utils.init_word_tokenizers(main,
                                      lang=lang,
                                      word_tokenizer=word_tokenizer)

    if word_tokenizer.startswith('spacy_'):
        # Input of SudachiPy cannot be more than 49149 BYTES
        if word_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
            # Around 300 tokens per line 4 characters per token and 4 bytes per character (≈ 49149 / 4 / 4 / 300)
            sections = wl_nlp_utils.split_into_chunks_text(text,
                                                           section_size=10)
        else:
            sections = wl_nlp_utils.split_into_chunks_text(
                text,
                section_size=main.settings_custom['files']['misc']
                ['read_files_in_chunks'])
    else:
        sections = wl_nlp_utils.split_into_chunks_text(text, 1)

    for section in sections:
        # spaCy
        if word_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            tokens_multilevel.append([])

            len_sents = len(list(doc.sents))

            for i, sentence in enumerate(doc.sents):
                tokens_sentence = []

                tokens = [token.text for token in sentence]
                len_tokens = len(tokens)

                for j, token in enumerate(tokens):
                    # Split paragraphs by new line character
                    len_lines = len(re.findall(r'\n', token))

                    if len_lines:
                        # Check if the last paragraph is empty
                        if i == len_sents - 1 and j == len_tokens - 1 and token.endswith(
                                '\n'):
                            len_lines -= 1

                        if tokens_sentence:
                            tokens_multilevel[-1].append(tokens_sentence)

                            tokens_sentence = []

                        tokens_multilevel.extend([[]
                                                  for j in range(len_lines)])
                    else:
                        if token.strip():
                            tokens_sentence.append(token)

                if tokens_sentence:
                    tokens_multilevel[-1].append(tokens_sentence)
        else:
            tokens_multilevel.append([])

            if section.strip():
                # NLTK
                if word_tokenizer.startswith('nltk_'):
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang)

                    if word_tokenizer == 'nltk_nist':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_nist_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_nltk':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_nltk_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_penn_treebank':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_treebank_tokenizer.tokenize(
                                    sentence))
                    elif word_tokenizer == 'nltk_tok_tok':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_toktok_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_twitter':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                main.nltk_tweet_tokenizer.tokenize(sentence))
                # Sacremoses
                elif word_tokenizer == 'sacremoses_moses':
                    lang = wl_conversion.remove_lang_code_suffixes(main, lang)
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            main.__dict__[f'sacremoses_moses_tokenizer_{lang}']
                            .tokenize(sentence, escape=False))
                # Chinese
                elif word_tokenizer == 'jieba_zho':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(jieba.lcut(sentence))
                elif word_tokenizer == 'pkuseg_zho':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            main.pkuseg_word_tokenizer.cut(sentence))
                elif word_tokenizer == 'wordless_zho_char':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens = []
                        non_han_start = 0

                        for i, char in enumerate(sentence):
                            if i >= non_han_start:
                                if wl_checking_unicode.is_han(char):
                                    tokens.append(char)

                                    non_han_start += 1
                                else:
                                    # English
                                    if wl_checking_unicode.is_eng(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or not wl_checking_unicode.is_eng(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='eng_us'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # Other Languages
                                    else:
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or wl_checking_unicode.is_han(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='other'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break

                        tokens_multilevel[-1].append(tokens)
                # Japanese
                elif word_tokenizer == 'nagisa_jpn':
                    import nagisa

                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            nagisa.tagging(str(sentence)).words)
                elif word_tokenizer.startswith('sudachipy_jpn'):
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    if word_tokenizer == 'sudachipy_jpn_split_mode_a':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.
                                tokenize(sentence, sudachipy.SplitMode.A)
                            ])
                    elif word_tokenizer == 'sudachipy_jpn_split_mode_b':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.
                                tokenize(sentence, sudachipy.SplitMode.B)
                            ])
                    elif word_tokenizer == 'sudachipy_jpn_split_mode_c':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.
                                tokenize(sentence, sudachipy.SplitMode.C)
                            ])
                elif word_tokenizer == 'wordless_jpn_kanji':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang=lang)

                    for sentence in sentences:
                        tokens = []
                        non_han_start = 0

                        for i, char in enumerate(sentence):
                            if i >= non_han_start:
                                if wl_checking_unicode.is_han(char):
                                    tokens.append(char)

                                    non_han_start += 1
                                else:
                                    # Japanese Kana
                                    if wl_checking_unicode.is_kana(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or not wl_checking_unicode.is_kana(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='jpn'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # English
                                    elif wl_checking_unicode.is_eng(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or not wl_checking_unicode.is_eng(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='eng_us'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # Other Languages
                                    else:
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(
                                                    sentence
                                            ) or wl_checking_unicode.is_han(
                                                    sentence[i + j + 1]):
                                                tokens.extend(
                                                    wl_word_tokenize(
                                                        main,
                                                        sentence[
                                                            non_han_start:i +
                                                            j + 1],
                                                        lang='other'))
                                                tokens = list(
                                                    wl_misc.flatten_list(
                                                        tokens))

                                                non_han_start = i + j + 1

                                                break

                        tokens_multilevel[-1].append(tokens)
                # Icelandic
                elif word_tokenizer == 'tokenizer_isl':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main,
                        section,
                        lang='isl',
                        sentence_tokenizer='tokenizer_isl')

                    for sentence in sentences:
                        tokens_multilevel[-1].append([
                            token for kind, token, val in tokenizer.tokenize(
                                sentence) if token
                        ])
                # Thai
                elif word_tokenizer.startswith('pythainlp_'):
                    # Preserve sentence boundaries
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang='tha')

                    if word_tokenizer == 'pythainlp_longest_matching':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='longest'))
                    elif word_tokenizer == 'pythainlp_max_matching':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence, engine='mm'))
                    elif word_tokenizer == 'pythainlp_max_matching_tcc':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='newmm'))
                    elif word_tokenizer == 'pythainlp_max_matching_tcc_safe_mode':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='newmm-safe'))
                    elif word_tokenizer == 'pythainlp_nercut':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(
                                pythainlp.word_tokenize(sentence,
                                                        engine='nercut'))
                # Tibetan
                elif word_tokenizer == 'botok_bod':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main, section, lang='bod')

                    for sentence in sentences:
                        tokens_multilevel[-1].append([
                            token.text for token in
                            main.botok_word_tokenizer.tokenize(sentence)
                        ])
                # Vietnamese
                elif word_tokenizer == 'underthesea_vie':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(
                        main,
                        section,
                        lang='vie',
                        sentence_tokenizer='underthesea_vie')

                    for sentence in sentences:
                        tokens_multilevel[-1].append(
                            underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for para in tokens_multilevel:
        for i, sentence in enumerate(para):
            para[i] = [token.strip() for token in sentence if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for para in tokens_multilevel:
            for sentence in para:
                if sentence:
                    sentence[-1] = wl_texts.Wl_Token(sentence[-1],
                                                     boundary='',
                                                     sentence_ending=True)
    else:
        for para in tokens_multilevel:
            for sentence in para:
                if sentence:
                    sentence[-1] = wl_texts.Wl_Token(sentence[-1],
                                                     boundary=' ',
                                                     sentence_ending=True)

    return tokens_multilevel
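
For clarity, the Japanese nagisa branch of the tokenizer above reduces to a short pattern: split the section into sentences first, then let nagisa.tagging produce the word list for each sentence. The following is a minimal, self-contained sketch that uses nagisa directly; the Wordless helpers (main, wl_sentence_tokenization, etc.) are omitted and the sentence list is hard-coded for illustration.

import nagisa

# Stand-ins for the sentences produced by wl_sentence_tokenization
sentences = ['Pythonで簡単に使えるツールです', 'ニューラルネットワークを使ってます。']

tokens_multilevel = [[]]

for sentence in sentences:
    # nagisa.tagging returns an object exposing .words and .postags
    tokens_multilevel[-1].append(nagisa.tagging(str(sentence)).words)

print(tokens_multilevel[-1][0])
#=> ['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']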
Esempio n. 28
0
def wl_pos_tag_text(main, text, lang, pos_tagger, tagset):
    tokens_tagged = []

    if pos_tagger == 'nagisa_jpn':
        # Defer import to save loading time
        import nagisa

    # spaCy
    if pos_tagger.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        if tagset == 'default':
            tokens_tagged = [(token.text, token.tag_) for token in doc]
        elif tagset == 'universal':
            tokens_tagged = [(token.text, token.pos_) for token in doc]
    # Chinese
    elif pos_tagger == 'jieba_zho':
        tokens_tagged = jieba.posseg.cut(text)
    # English & Russian
    elif pos_tagger == 'nltk_perceptron':
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main,
                                                            text,
                                                            lang=lang)
        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == 'nagisa_jpn':
        tokens_tagged = nagisa.tagging(text)
        tokens_tagged = zip(tokens_tagged.words, tokens_tagged.postags)
    elif pos_tagger == 'sudachipy_jpn':
        tokens_tagged = [(token.surface(), '-'.join([
            pos for pos in token.part_of_speech()[:4] if pos != '*'
        ])) for token in main.sudachipy_word_tokenizer.tokenize(text)]
    # Russian & Ukrainian
    elif pos_tagger == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main,
                                                            text,
                                                            lang=lang)

        for token in tokens:
            tokens_tagged.append(
                (token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger.startswith('pythainlp_'):
        tokens = wl_word_tokenization.wl_word_tokenize_flat(main,
                                                            text,
                                                            lang=lang)

        if pos_tagger == 'pythainlp_perceptron_lst20':
            tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                                  engine='perceptron',
                                                  corpus='lst20')
        elif pos_tagger == 'pythainlp_perceptron_orchid':
            tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                                  engine='perceptron',
                                                  corpus='orchid')
        elif pos_tagger == 'pythainlp_perceptron_pud':
            tokens_tagged = pythainlp.tag.pos_tag(tokens,
                                                  engine='perceptron',
                                                  corpus='pud')
    # Tibetan
    elif pos_tagger == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))
    # Vietnamese
    elif pos_tagger == 'underthesea_vie':
        tokens_tagged = underthesea.pos_tag(text)

    # Remove empty tokens and strip whitespace in tokens
    tokens_tagged = [(str(token).strip(), tag) for token, tag in tokens_tagged
                     if str(token).strip()]

    return tokens_tagged
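
The nagisa path of wl_pos_tag_text, combined with the final clean-up step, amounts to zipping surface forms with their POS tags and dropping empty tokens. Below is a minimal sketch assuming only nagisa itself, with no Wordless plumbing and no tagset mapping.

import nagisa

text = 'Pythonで簡単に使えるツールです'
result = nagisa.tagging(text)

# Pair each surface form with its POS tag, then strip whitespace and drop empty tokens
tokens_tagged = [
    (str(token).strip(), tag)
    for token, tag in zip(result.words, result.postags)
    if str(token).strip()
]

print(tokens_tagged)
#=> [('Python', '名詞'), ('で', '助詞'), ('簡単', '形状詞'), ('に', '助動詞'),
#    ('使える', '動詞'), ('ツール', '名詞'), ('です', '助動詞')]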
Esempio n. 29
0
import nagisa

# Sample e-mail text: the tail of a message body followed by a signature block
text = """ https://goo.gl/dMHvwc

================================================
クラウドエース株式会社 第一営業部
" 信実さを持ち、応えていく "
茨城県出身  石塚 健斗 〈KENTO ISHIZUKA〉 
e-mail [email protected]
web  https://www.cloud-ace.jp?xx=1

tel    03-6280-5939 (本社)
tel         080-3668-6458   (携帯)
fax   03-6800-3954 
address 〒100-0004 
              東京都千代田区大手町2丁目6番2号 日本ビルヂング11F
"""
texts = text.splitlines()
for line in texts:
    words = nagisa.tagging(line)
    print("___________________________________")

    # Tagged line in word/POS format,
    # e.g. Python/名詞 で/助詞 簡単/形状詞 ... for 'Pythonで簡単に使えるツールです'
    print(words)

    # Get the list of surface words for the current line
    print(words.words)

    # Get the list of corresponding POS tags
    print(words.postags)

    # A line that contains no verb (動詞) is treated as part of the signature
    if '動詞' not in words.postags:
        print("signature")
Esempio n. 30
0
    def test_tagging(self):
        # test_1
        text = 'Pythonで簡単に使えるツールです'
        output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_2
        output = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
        words = nagisa.tagging(text, lower=True)
        self.assertEqual(output, str(words))

        # test_3
        text = 'ニューラルネットワークを使ってます。'
        output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(nagisa.tagging(text)))

        # test_4
        tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', "ニューラルネット"])
        output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
        self.assertEqual(output, str(tagger_nn.tagging(text)))

        # test_5
        text = "3月に見た「3月のライオン」"
        new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
        output = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
        self.assertEqual(output, str(new_tagger.tagging(text)))

        # test_6
        text = "それが、iPhone XSです。"
        output = "それ/代名詞 が/助詞 、/補助記号 iPhone XS/名詞 です/助動詞 。/補助記号"
        new_tagger = nagisa.Tagger(single_word_list=["iPhone[a-zA-Z0-9 ]+"])

        self.assertEqual(output, str(new_tagger.tagging(text)))

        # test_7
        text = "1234abc ABC"
        output = "1234/名詞 abc ABC/名詞"
        new_tagger = nagisa.Tagger(single_word_list=["[a-zA-Z ]+", "[0-9]+"])

        self.assertEqual(output, str(new_tagger.tagging(text)))

        # test_8
        text = '(人•ᴗ•♡)こんばんは♪'
        output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_9
        url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
        output = 'コード/名詞 公開/名詞 中/接尾辞'
        words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
        self.assertEqual(output, str(words))

        # test_10
        output = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑ ̄ω ̄๑)/補助記号'
        words = nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])
        self.assertEqual(output, str(words))

        # test_11
        words = [" (人•ᴗ•♡)", "こんばんは", "♪"]
        output = ['補助記号', '感動詞', '補助記号']
        postags = nagisa.postagging(words)
        self.assertEqual(output, postags)

        # test_12
        postags = nagisa.decode(words)
        self.assertEqual(output, postags)

        # test_13
        words = [" (人•ᴗ•♡)", " ", "こんばんは", "♪"]
        output = ['補助記号', "空白", '感動詞', '補助記号']
        postags = nagisa.postagging(words)

        self.assertEqual(output, postags)

        # test_14
        postags = nagisa.decode(words)
        self.assertEqual(output, postags)

        # test_15
        words = [" (人•ᴗ•♡)", " ", "こんばんは", "♪"]
        output = ['補助記号', "空白", '感動詞', '補助記号']
        postags = nagisa.postagging(words)

        self.assertEqual(output, postags)

        # test_16
        postags = nagisa.decode(words)
        self.assertEqual(output, postags)

        # test_17
        text = "こんばんは😀"
        output = "こんばんは/感動詞 😀/補助記号"
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_18
        text = "コンバンハ12345"
        output = "コンバンハ/名詞 1/名詞 2/名詞 3/名詞 4/名詞 5/名詞"
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))

        # test_19
        text = "𪗱𪘂𪘚𪚲"
        output = "𪗱/補助記号 𪘂/補助記号 𪘚/補助記号 𪚲/補助記号"
        words = nagisa.tagging(text)
        self.assertEqual(output, str(words))
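
As test_6 and test_7 demonstrate, entries in single_word_list may be regular-expression patterns as well as literal words, which lets a custom Tagger keep multi-character spans such as product names as single tokens. A short usage sketch reusing the strings from those tests:

import nagisa

# A pattern entry keeps spans like 'iPhone XS' as a single token
tagger = nagisa.Tagger(single_word_list=["iPhone[a-zA-Z0-9 ]+"])

print(tagger.tagging("それが、iPhone XSです。"))
#=> それ/代名詞 が/助詞 、/補助記号 iPhone XS/名詞 です/助動詞 。/補助記号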