def test_tagging(self):
    # test_1
    text = 'Pythonで簡単に使えるツールです'
    output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_2
    text = 'ニューラルネットワークを使ってます。'
    output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(nagisa.tagging(text)))

    # test_3
    tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク'])
    output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(tagger_nn.tagging(text)))

    # test_4
    text = '(人•ᴗ•♡)こんばんは♪'
    output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_5
    url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
    output = 'コード/名詞 公開/名詞 中/接尾辞'
    words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))
def test_tagging(self):
    # test_1
    text = 'Pythonで簡単に使えるツールです'
    output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_2
    output = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text, lower=True)
    self.assertEqual(output, str(words))

    # test_3
    text = 'ニューラルネットワークを使ってます。'
    output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(nagisa.tagging(text)))

    # test_4
    tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', "ニューラルネット"])
    output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(tagger_nn.tagging(text)))

    # test_5
    text = "3月に見た「3月のライオン」"
    new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
    output = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
    self.assertEqual(output, str(new_tagger.tagging(text)))

    # test_6
    text = '(人•ᴗ•♡)こんばんは♪'
    output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_7
    url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
    output = 'コード/名詞 公開/名詞 中/接尾辞'
    words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))

    # test_8
    output = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑ ̄ω ̄๑)/補助記号'
    words = nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))

    # test_9
    words = [" (人•ᴗ•♡)", "こんばんは", "♪"]
    output = ['補助記号', '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)

    # test_10
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)
def filter_sentence_by_pos_tag_japanese(
        self,
        # string or word list
        word_list,
        keep_tags = DEFAULT_KEEP_TAGS_JAP,
):
    try:
        import nagisa
    except Exception as ex:
        raise Exception(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unable to load nagisa: ' + str(ex))

    if type(word_list) in [list, tuple]:
        text = ' '.join(word_list)
    else:
        text = word_list

    words_postags_obj = nagisa.tagging(text)
    txt_sym_tok = words_postags_obj.words
    txt_sym_postags = words_postags_obj.postags
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Japanese segmentation ' + str(txt_sym_tok)
        + ', word & POS tags: ' + str(txt_sym_postags))

    words_postags = list(zip(txt_sym_tok, txt_sym_postags))
    sent_filtered = [w for w, t in words_postags if (t in keep_tags)]
    Log.debugdebug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': POS TAGs: ' + str(words_postags))
    Log.debugdebug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Filtered sentence: ' + str(sent_filtered))
    return sent_filtered
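# A minimal standalone sketch of the same keep-tags idea, using nagisa directly
# and without the class/logging machinery above. DEFAULT_KEEP_TAGS_JAP is not
# shown in the snippet, so the tag set below (nouns, verbs, adjectives) is only
# an assumed example.
import nagisa

def keep_content_words(text, keep_tags=('名詞', '動詞', '形容詞')):
    # Tag the sentence, then keep only the words whose POS tag is wanted.
    tagged = nagisa.tagging(text)
    return [w for w, t in zip(tagged.words, tagged.postags) if t in keep_tags]

# Example: keep_content_words('Pythonで簡単に使えるツールです')
# -> ['Python', '使える', 'ツール'] (approximately; output depends on the model)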
def classify(self, link):
    # Morphological analysis of the target article
    html = urllib.request.urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.find_all("p")
    text = [t.text for t in text]
    text = ','.join(text)
    words = nagisa.tagging(text)
    words = nagisa.filter(text, filter_postags=['助詞', '助動詞', '接頭辞', '接尾辞', '補助記号', 'URL', '空白']).words
    words = list(filter(lambda x: len(x) != 1, words))  # drop one-character words
    words = list(filter(lambda x: x != '', [re.sub(r'[0-9]', "", s) for s in words]))  # drop digits

    # Load the training data
    self.word_ct = np.load('./classify_app/data/word_ct.npy', allow_pickle=True).tolist()
    self.category_ct = np.load('./classify_app/data/category_ct.npy', allow_pickle=True).tolist()
    self.vocabularies = np.load('./classify_app/data/vocabularies.npy', allow_pickle=True).tolist()

    best_category = None       # the closest category
    max_prob = -sys.maxsize    # start from the smallest integer value

    # For each category, compute the score P(C|W) of the document (its words)
    for category in self.category_ct.keys():
        prob = self.score(words, category)
        if prob > max_prob:
            max_prob = prob
            best_category = category
    return best_category
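# The score() method called above is not shown in this snippet. A plausible
# naive-Bayes style sketch is given below, meant to live on the same classifier
# class. It assumes word_ct[category] maps words to counts, category_ct maps
# categories to document counts, and vocabularies is the vocabulary set; these
# are assumptions about the saved training data, not facts from the original.
import math

def score(self, words, category):
    # log P(C): relative frequency of the category over all documents
    total_docs = sum(self.category_ct.values())
    log_prob = math.log(self.category_ct[category] / total_docs)
    # log P(w|C) for each word, with add-one (Laplace) smoothing over the vocabulary
    denom = sum(self.word_ct[category].values()) + len(self.vocabularies)
    for word in words:
        count = self.word_ct[category].get(word, 0)
        log_prob += math.log((count + 1) / denom)
    return log_prob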
def tag_and_write(line):
    line = line.replace('\n', '')
    line = line.replace('〜', '')
    split = line.split(',')
    tagged = nagisa.tagging(split[1] if split[1] else split[0])
    if tagged.postags:
        out.write(line + ',' + tagged.postags[0] + '\n')
    else:
        out.write(line + '\n')
def tokenize_jp(sentences_list):
    tokens_tags_list = []
    tokens_list = []
    tags_list = []
    for s in sentences_list:
        words_tags = nagisa.tagging(s)
        tokens_tags_list.append(str(words_tags))
        tokens_list.append(words_tags.words)
        tags_list.append(words_tags.postags)
    return tokens_tags_list, tokens_list, tags_list
def detect_verb(input):
    texts = input.splitlines()
    flag = False
    for text in texts:
        words = nagisa.tagging(text)
        print("___________________________________")
        if '動詞' in words.postags:
            flag = True
            return flag
    return flag
def translate_from_japanese(text):
    if text is None:
        return "入力されていません"
    else:
        preds = model(
            torch.LongTensor(
                [ja_vocabulary.encode(nagisa.tagging(text).words)]))
        _out = preds.view(-1).tolist()
        out = ' '.join(th_vocabulary.decode(_out))
        begin = out.find('<s>')
        end = out.find('</s>')
        return out[begin + 3:end]
def get_kanji(nodeList, kanjiCounter):
    # filter out only text events
    textEvents = [
        event for node in nodeList for event in node
        if event[_param_dict['code']] == 401
    ]
    for each in textEvents:
        japText = each[_param_dict['parameters']][0]
        words = nagisa.tagging(japText)
        for word in words.words:
            if not_kanji(word):
                continue
            # add to list of kanji
            kanjiCounter[word] += 1
def original_usage(text):
    """ Return the analysis results by nagisa.

    Parameters
    ----------
    text : str
        An input text

    Returns
    -------
    tokens : nagisa.tagger.Tagger._Token
        The analysis results by nagisa
    """
    tokens = nagisa.tagging(text)
    return tokens
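# Usage sketch for original_usage: the returned object exposes the surface
# forms and POS tags of the analysis (the sample sentence and expected output
# follow the nagisa examples shown elsewhere on this page).
if __name__ == '__main__':
    tokens = original_usage('Pythonで簡単に使えるツールです')
    print(tokens.words)    # ['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']
    print(tokens.postags)  # ['名詞', '助詞', '形状詞', '助動詞', '動詞', '名詞', '助動詞']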
async def nagisa(self, context: commands.Context, *, message: str):
    """指定した文章を形態素解析します。"""
    stock = []
    character_message = self.bot.storage.lexicon.get_character_message_for_command_nagisa()
    result = f"{character_message}\n"
    await context.trigger_typing()
    words = nagisa.tagging(message)
    for index in range(len(words.words)):
        word = words.words[index]
        postag = words.postags[index]
        result += " " + word + "`[" + postag + "]`"
    stock.append(f'{result}')
    await self.bot.send("\n".join(stock))
def Morphological_analysis(link):
    html = urllib.request.urlopen(link)
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.find_all("p")
    text = [t.text for t in text]
    text = ','.join(text)
    words = nagisa.tagging(text)
    words = nagisa.filter(
        text,
        filter_postags=['助詞', '助動詞', '接頭辞', '接尾辞', '補助記号', 'URL', '空白']).words
    words = list(filter(lambda x: len(x) != 1, words))  # drop one-character words
    words = list(
        filter(lambda x: x != '', [re.sub(r'[0-9]', "", s) for s in words]))  # drop digits
    return words
def segment_ko_ja(
        self,
        text,
        return_array_of_split_words = False
):
    try:
        if self.lang in [lf.LangFeatures.LANG_JA]:
            words_postags = nagisa.tagging(text)
            txt_sym_tok = words_postags.words
            txt_sym_postags = words_postags.postags
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Japanese segmentation "' + str(txt_sym_tok)
                + '", word & POS tags: ' + str(words_postags)
            )
            if return_array_of_split_words:
                return txt_sym_tok
            else:
                return BasicPreprocessor.get_word_separator(lang=self.lang).join(txt_sym_tok)
        elif self.lang in [lf.LangFeatures.LANG_KO]:
            self.warn_korean()
            words_postags = self.kkma.pos(
                phrase = text
            )
            txt_sym_tok = [wp[0] for wp in words_postags]
            txt_sym_postags = [wp[1] for wp in words_postags]
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Korean segmentation "' + str(txt_sym_tok)
                + '", word & POS tags: ' + str(words_postags)
            )
            if return_array_of_split_words:
                return txt_sym_tok
            else:
                return BasicPreprocessor.get_word_separator(lang=self.lang).join(txt_sym_tok)
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': No external library supported for language "' + str(self.lang) + '"'
            )
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error segmenting lang "' + str(self.lang) + '", text "' + str(text) \
                 + '", exception: ' + str(ex)
        Log.error(errmsg)
        raise Exception(errmsg)
def tagging_document(file_name, tokens_only = False):
    with smart_open.smart_open(file_name) as f:
        for i, line in enumerate(f):
            text = nagisa.tagging(line)
            text_list = text.words
            text_postags = text.postags
            listed_text = []
            for text_list_elements, text_postags_element in zip(text_list, text_postags):
                if text_postags_element == "名詞":
                    listed_text.append(text_list_elements)
                elif text_postags_element == "動詞":
                    listed_text.append(text_list_elements)
                elif text_postags_element == "形状詞":
                    listed_text.append(text_list_elements)
            if tokens_only:
                yield listed_text
            else:
                yield gensim.models.doc2vec.TaggedDocument(listed_text, [i])
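# The generator above mirrors the gensim Doc2Vec tutorial: each line becomes a
# TaggedDocument containing only nouns, verbs and adjectival nouns. A minimal
# training sketch is shown below; 'corpus_ja.txt' is a hypothetical corpus file
# and the hyperparameters are illustrative, not taken from the original code.
import gensim

train_corpus = list(tagging_document('corpus_ja.txt'))

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=20)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

# Infer a vector for an unseen, already-filtered token list
vector = model.infer_vector(['水', '含ん', '空気'])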
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', keep_sentences=False): tokens_sentences = [] if lang not in main.settings_global['word_tokenizers']: lang = 'other' if word_tokenizer == 'default': word_tokenizer = main.settings_custom['word_tokenization'][ 'word_tokenizers'][lang] wordless_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer) if 'NLTK' in word_tokenizer: sentences = wordless_sentence_tokenize(main, text, lang) if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'): treebank_tokenizer = nltk.TreebankWordTokenizer() for sentence in sentences: tokens_sentences.append(treebank_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'): tweet_tokenizer = nltk.TweetTokenizer() for sentence in sentences: tokens_sentences.append(tweet_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'): nist_tokenizer = nltk.tokenize.nist.NISTTokenizer() for sentence in sentences: tokens_sentences.append(nist_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'): toktok_tokenizer = nltk.ToktokTokenizer() for sentence in sentences: tokens_sentences.append(toktok_tokenizer.tokenize(sentence)) if not keep_sentences: tokens_sentences = [ itertools.chain.from_iterable(tokens_sentences) ] elif 'Sacremoses' in word_tokenizer: if keep_sentences: sentences = wordless_sentence_tokenize(main, text, lang) else: sentences = [text] if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'): moses_tokenizer = sacremoses.MosesTokenizer( lang=wordless_conversion.to_iso_639_1(main, lang)) for sentence in sentences: tokens_sentences.append( moses_tokenizer.tokenize(sentence, escape=False)) elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'): moses_tokenizer = sacremoses.MosesTokenizer( lang=wordless_conversion.to_iso_639_1(main, lang)) for sentence in sentences: tokens_sentences.append( moses_tokenizer.penn_tokenize(sentence)) elif 'spaCy' in word_tokenizer: nlp = main.__dict__[f'spacy_nlp_{lang}'] doc = nlp(text) # See Issue #3479: https://github.com/explosion/spaCy/issues/3479 doc.is_parsed = True if keep_sentences: for sentence in doc.sents: tokens_sentences.append( [token.text for token in sentence.as_doc()]) else: tokens_sentences.append([token.text for token in doc]) # Chinese & Japanese elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer or 'Wordless' in word_tokenizer): if keep_sentences: sentences = wordless_sentence_tokenize(main, text, lang=lang) else: sentences = [text] # Chinese if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'): for sentence in sentences: tokens_sentences.append(jieba.cut(sentence)) elif word_tokenizer == main.tr( 'Wordless - Chinese Character Tokenizer'): for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wordless_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # English if wordless_checking_unicode.is_eng(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wordless_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='eng')) non_han_start = i + j + 1 break # Other Languages else: for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or wordless_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 
1], lang='other')) non_han_start = i + j + 1 break tokens_sentences.extend(tokens) # Japanese elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'): import nagisa for sentence in sentences: tokens_sentences.append(nagisa.tagging(str(sentence)).words) elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'): for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wordless_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # Japanese Kana if wordless_checking_unicode.is_kana(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wordless_checking_unicode.is_kana( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='jpn')) non_han_start = i + j + 1 break # English elif wordless_checking_unicode.is_eng(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wordless_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='eng')) non_han_start = i + j + 1 break # Other Languages else: for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or wordless_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='other')) non_han_start = i + j + 1 break tokens_sentences.extend(tokens) # Thai elif 'PyThaiNLP' in word_tokenizer: sentences = wordless_sentence_tokenize( main, text, lang='tha', sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer') if word_tokenizer == main.tr( 'PyThaiNLP - Maximum Matching Algorithm + TCC'): for sentence in sentences: tokens_sentences.append( pythainlp.tokenize.word_tokenize(sentence, engine='newmm')) elif word_tokenizer == main.tr( 'PyThaiNLP - Maximum Matching Algorithm'): for sentence in sentences: tokens_sentences.append( pythainlp.tokenize.word_tokenize(sentence, engine='mm')) elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'): for sentence in sentences: tokens_sentences.append( pythainlp.tokenize.word_tokenize( sentence, engine='longest-matching')) # Tibetan elif 'pybo' in word_tokenizer: if keep_sentences: sentences = wordless_sentence_tokenize(main, text, lang='bod') else: sentences = [text] if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'): for sentence in sentences: tokens_sentences.append([ token.text for token in main.pybo_tokenizer_gmd.tokenize(sentence) ]) elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'): for sentence in sentences: tokens_sentences.append([ token.text for token in main.pybo_tokenizer_pos.tokenize(sentence) ]) elif word_tokenizer == main.tr( 'pybo - Tibetan Word Tokenizer (tsikchen)'): for sentence in sentences: tokens_sentences.append([ token.text for token in main.pybo_tokenizer_tsikchen.tokenize(sentence) ]) # Vietnamese elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'): if keep_sentences: sentences = wordless_sentence_tokenize( main, text, lang='vie', sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer' ) else: sentences = [text] for sentence in sentences: tokens_sentences.append(underthesea.word_tokenize(str(sentence))) # Remove empty tokens and strip whitespace for i, tokens in enumerate(tokens_sentences): tokens_sentences[i] = [ token.strip() for token in tokens if token.strip() ] # Record token boundaries if lang in ['zho_cn', 'zho_tw', 'jpn']: for tokens 
in tokens_sentences: if tokens: tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary='', sentence_ending=True) else: for tokens in tokens_sentences: if tokens: tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary=' ', sentence_ending=True) return tokens_sentences
def printDefs(outputs, seconds):
    if not hasattr(printDefs, "counter"):
        printDefs.counter = 0
    if not hasattr(printDefs, "tokenization_counter"):
        printDefs.tokenization_counter = 0
    tokens = set()
    with open("tout3.txt") as lines:
        for line in lines:
            # tinysegmenter
            tinysegmenter_tokens = tinysegmenter.tokenize(line.strip())
            # nagisa
            nagisa_tokens = nagisa.tagging(line.strip()).words
            if printDefs.tokenization_counter % 2 == 0:
                print("tinysegmenter:")
                tokenized_statement = tinysegmenter_tokens
            else:
                print("nagisa:")
                tokenized_statement = nagisa_tokens
            print(tokenized_statement)
            printDefs.tokenization_counter += 1
            for token in tokenized_statement:
                t = token.strip()
                if t not in banlist and t != "":
                    tokens.add(t)
    if len(tokens) == 0:
        time.sleep(2)
    translated = []
    for token in tokens:
        try:
            definition = subprocess.check_output(
                ["myougiden", "-f", "--human", "-c", "-e", "whole",
                 token.replace("|", "!")]).decode("utf-8", "ignore")
            if len(outputs) == 0:
                print("")
                print("----------------------------------------------------------------------------")
                print("")
                print(token)
                print(definition)
            else:
                print("outputting " + token + " to " + outputs[printDefs.counter % len(outputs)])
                f = open(outputs[printDefs.counter % len(outputs)], "a")
                f.write("\n\n\n---------------------------------------------------\n\n\n\n")
                f.write(token)
                f.write("\n")
                f.write(definition)
                f.close()
            printDefs.counter += 1
            time.sleep(seconds)
        except:
            try:
                definition = subprocess.check_output(
                    ["myougiden", "--human", "-c", "-e", "whole",
                     token.replace("|", "!")]).decode("utf-8", "ignore")
                if len(outputs) == 0:
                    print("")
                    print("----------------------------------------------------------------------------")
                    print("")
                    print(token)
                    print(definition)
                else:
                    print("outputting " + token + " to " + outputs[printDefs.counter % len(outputs)])
                    f = open(outputs[printDefs.counter % len(outputs)], "a")
                    f.write("\n\n\n---------------------------------------------------\n\n\n\n")
                    f.write(token)
                    f.write("\n")
                    f.write(definition)
                    f.close()
                printDefs.counter += 1
                time.sleep(seconds)
            except:
                time.sleep(0.01)
                # print(token + " not found in dictionary")
    return
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', flat_tokens=True): tokens_hierarchical = [] if lang not in main.settings_global['word_tokenizers']: lang = 'other' if word_tokenizer == 'default': word_tokenizer = main.settings_custom['word_tokenization'][ 'word_tokenizers'][lang] # Check initialization status of word (and sentence) tokenizers if flat_tokens: wordless_text_utils.check_word_tokenizers( main, lang=lang, word_tokenizer=word_tokenizer) else: wordless_text_utils.check_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer) # NLTK if 'NLTK' in word_tokenizer: sentences = wordless_sentence_tokenize(main, text, lang) if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'): treebank_tokenizer = nltk.TreebankWordTokenizer() for sentence in sentences: tokens_hierarchical.append( treebank_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'): tweet_tokenizer = nltk.TweetTokenizer() for sentence in sentences: tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'): nist_tokenizer = nltk.tokenize.nist.NISTTokenizer() for sentence in sentences: tokens_hierarchical.append(nist_tokenizer.tokenize(sentence)) elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'): toktok_tokenizer = nltk.ToktokTokenizer() for sentence in sentences: tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence)) # Sacremoses elif 'Sacremoses' in word_tokenizer: if flat_tokens: sentences = [text] else: sentences = wordless_sentence_tokenize(main, text, lang) if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'): moses_tokenizer = sacremoses.MosesTokenizer( lang=wordless_conversion.to_iso_639_1(main, lang)) for sentence in sentences: tokens_hierarchical.append( moses_tokenizer.tokenize(sentence, escape=False)) elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'): moses_tokenizer = sacremoses.MosesTokenizer( lang=wordless_conversion.to_iso_639_1(main, lang)) for sentence in sentences: tokens_hierarchical.append( moses_tokenizer.penn_tokenize(sentence)) # spaCy elif 'spaCy' in word_tokenizer: nlp = main.__dict__[f'spacy_nlp_{lang}'] doc = nlp(text) # See Issue #3479: https://github.com/explosion/spaCy/issues/3479 doc.is_parsed = True if flat_tokens: tokens_hierarchical.append([token.text for token in doc]) else: for sentence in doc.sents: tokens_hierarchical.append( [token.text for token in sentence.as_doc()]) # syntok elif word_tokenizer == 'syntok - Word Tokenizer': syntok_tokenizer = syntok.tokenizer.Tokenizer() if flat_tokens: tokens_hierarchical.append( [token.value for token in syntok_tokenizer.tokenize(text)]) else: for para in syntok.segmenter.analyze(text): for sentence in para: tokens_hierarchical.append( [token.value for token in sentence]) # Chinese & Japanese elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer or 'Wordless' in word_tokenizer): if flat_tokens: sentences = [text] else: sentences = wordless_sentence_tokenize(main, text, lang=lang) # Chinese if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'): for sentence in sentences: tokens_hierarchical.append(jieba.cut(sentence)) elif word_tokenizer == main.tr( 'Wordless - Chinese Character Tokenizer'): for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wordless_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # English if wordless_checking_unicode.is_eng(char): for j, char in 
enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wordless_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='eng')) non_han_start = i + j + 1 break # Other Languages else: for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or wordless_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='other')) non_han_start = i + j + 1 break tokens_hierarchical.append(tokens) # Japanese elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'): import nagisa for sentence in sentences: tokens_hierarchical.append(nagisa.tagging(str(sentence)).words) elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'): for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wordless_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # Japanese Kana if wordless_checking_unicode.is_kana(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wordless_checking_unicode.is_kana( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='jpn')) non_han_start = i + j + 1 break # English elif wordless_checking_unicode.is_eng(char): for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wordless_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='eng')) non_han_start = i + j + 1 break # Other Languages else: for j, char in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or wordless_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wordless_word_tokenize( main, sentence[non_han_start:i + j + 1], lang='other')) non_han_start = i + j + 1 break tokens_hierarchical.append(tokens) # Russian elif word_tokenizer == 'razdel - Russian Word Tokenizer': if flat_tokens: sentences = [text] else: sentences = wordless_sentence_tokenize(main, text, lang='rus') for sentence in sentences: tokens_hierarchical.append( [token.text for token in razdel.tokenize(sentence)]) # Thai elif 'PyThaiNLP' in word_tokenizer: # Preserve sentence boundaries sentences = wordless_sentence_tokenize( main, text, lang='tha', sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer') if word_tokenizer == main.tr( 'PyThaiNLP - Maximum Matching Algorithm + TCC'): for sentence in sentences: tokens_hierarchical.append( pythainlp.tokenize.word_tokenize(sentence, engine='newmm')) elif word_tokenizer == main.tr( 'PyThaiNLP - Maximum Matching Algorithm'): for sentence in sentences: tokens_hierarchical.append( pythainlp.tokenize.word_tokenize(sentence, engine='mm')) elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'): for sentence in sentences: tokens_hierarchical.append( pythainlp.tokenize.word_tokenize( sentence, engine='longest-matching')) # Tibetan elif 'botok' in word_tokenizer: if flat_tokens: sentences = [text] else: sentences = wordless_sentence_tokenize(main, text, lang='bod') botok_tokenizer = wordless_text_utils.check_botok_tokenizers( main, word_tokenizer) for sentence in sentences: tokens_hierarchical.append( [token.text for token in botok_tokenizer.tokenize(sentence)]) # Vietnamese elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'): if flat_tokens: sentences = [text] else: sentences = wordless_sentence_tokenize( main, text, 
lang='vie', sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer' ) for sentence in sentences: tokens_hierarchical.append(underthesea.word_tokenize( str(sentence))) # Remove empty tokens and strip whitespace for i, sentence in enumerate(tokens_hierarchical): tokens_hierarchical[i] = [ token.strip() for token in sentence if token.strip() ] # Record token boundaries if lang in ['zho_cn', 'zho_tw', 'jpn']: for sentence in tokens_hierarchical: if sentence: sentence[-1] = wordless_text.Wordless_Token( sentence[-1], boundary='', sentence_ending=True) else: for sentence in tokens_hierarchical: if sentence: sentence[-1] = wordless_text.Wordless_Token( sentence[-1], boundary=' ', sentence_ending=True) # Clause tokenization if not flat_tokens: for i, sentence in enumerate(tokens_hierarchical): tokens_hierarchical[i] = wordless_clause_tokenize( main, sentence, lang) # Flatten tokens tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical)) if flat_tokens: return tokens_flat else: return tokens_hierarchical
        n5words.add(word.strip())

print("Loading n4 list")
with open("n4.list") as n4:
    for word in n4:
        n4words.add(word.strip())

occurrences = {}
sentences = {}

print("Counting words")
with codecs.open(sys.argv[1], 'r', encoding='utf-8', errors='ignore') as infile:
    for line in infile:
        # print(line)
        tokenizedLine = tinysegmenter.tokenize(line.strip()) + nagisa.tagging(line.strip()).words
        for token in tokenizedLine:
            if token not in banlist and token not in n5words and token not in n4words and token.strip() != "":
                if token in occurrences.keys():
                    occurrences[token] += 1
                else:
                    occurrences[token] = 1
                if token not in sentences.keys():
                    sentences[token] = line.replace("\n", "").replace("|", "!")

print("Sorting words")
sortedOccurrences = []
for character in occurrences.keys():
    sortedOccurrences.append([character, occurrences[character]])
import nagisa                                                 # Japanese word segmentation
from sklearn.feature_extraction.text import TfidfVectorizer  # text feature extraction
from sklearn.linear_model import LogisticRegression          # logistic regression
from sklearn.pipeline import make_pipeline                   # pipeline composition

# Load the data
train_cn = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/中文_trian.xlsx')
train_ja = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/日语_train.xlsx')
train_en = pd.read_excel('汽车领域多语种迁移学习挑战赛初赛训练集/英文_train.xlsx')

test_ja = pd.read_excel('testA.xlsx', sheet_name='日语_testA')
test_en = pd.read_excel('testA.xlsx', sheet_name='英文_testA')

# Tokenize the text
train_ja['words'] = train_ja['原始文本'].apply(
    lambda x: ' '.join(nagisa.tagging(x).words))
train_en['words'] = train_en['原始文本'].apply(lambda x: x.lower())

test_ja['words'] = test_ja['原始文本'].apply(
    lambda x: ' '.join(nagisa.tagging(x).words))
test_en['words'] = test_en['原始文本'].apply(lambda x: x.lower())

# Train TF-IDF and logistic regression
pipline = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipline.fit(train_ja['words'].tolist() + train_en['words'].tolist(),
            train_ja['意图'].tolist() + train_en['意图'].tolist())

# Model prediction
test_ja['意图'] = pipline.predict(test_ja['words'])
test_en['意图'] = pipline.predict(test_en['words'])
test_en['槽值1'] = np.nan
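# Once the pipeline above is fitted, a single new utterance can be scored by
# applying the same preprocessing (nagisa segmentation for Japanese, lower-
# casing for English). This helper and the sample sentence are illustrative
# additions, not part of the original script.
def predict_intent(text, lang='ja'):
    if lang == 'ja':
        words = ' '.join(nagisa.tagging(text).words)
    else:
        words = text.lower()
    return pipline.predict([words])[0]

# predict_intent('エアコンをつけてください')  # returns one of the intent labels ('意图')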
# coding: utf-8
import django
from django.conf import settings

# Japanese text segmentation
import nagisa

string = '何がちゃうねん、言うてみ'
string00 = '明日東京いかたん'

word = nagisa.tagging(string)
word00 = nagisa.tagging(string00)

print(word.words)
print(word00.words)
def Nagisa_Run(str):
    tokens = nagisa.tagging(str)
    print(tokens)
def tokenize(self, text):
    words = nagisa.tagging(text)
    return dict(words=words.words, postags=words.postags)
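# Usage sketch equivalent to calling the tokenize() method above, written with
# nagisa directly so it runs standalone; the sentence and expected output follow
# the nagisa test cases shown elsewhere on this page.
import nagisa

words = nagisa.tagging('ニューラルネットワークを使ってます。')
result = dict(words=words.words, postags=words.postags)
print(result['words'])    # ['ニューラル', 'ネットワーク', 'を', '使っ', 'て', 'ます', '。']
print(result['postags'])  # ['名詞', '名詞', '助詞', '動詞', '助動詞', '助動詞', '補助記号']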
#!/usr/bin/env python
# coding: utf-8

import nagisa
import collections

text1 = "水を含んだ湿った空気が流れてくる"
text2 = "時代を漂う湿った空気が読みとれる"

doc = nagisa.tagging(text1 + text2)


def tokenize_jp(doc):
    doc = nagisa.tagging(doc)
    return doc.words


text1 = tokenize_jp(text1)
text2 = tokenize_jp(text2)

count_air1 = 426
count_air2 = 455

percent_air1 = count_air1 / (count_air1 + count_air2)
percent_air2 = count_air2 / (count_air1 + count_air2)

c1 = collections.Counter(湿っ=52, 水=34, 流れ=11, 感じ=14, 時代=6, 漂う=5, 読み=2)
c2 = collections.Counter(湿っ=1, 水=3, 流れ=22, 感じ=34, 時代=32, 漂う=33, 読み=89)

text1_c1_probability = 1
text1_c2_probability = 1
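# The snippet above stops right after initialising the two probabilities to 1.
# A plausible continuation of this naive-Bayes style comparison multiplies the
# smoothed relative frequency of each token of text1 under each counter, then
# weights by the class priors (percent_air1 / percent_air2). This is only a
# sketch of how the calculation might be finished, not part of the original code.
vocab = set(c1) | set(c2)
for word in text1:
    if word in vocab:
        text1_c1_probability *= (c1[word] + 1) / (sum(c1.values()) + len(vocab))
        text1_c2_probability *= (c2[word] + 1) / (sum(c2.values()) + len(vocab))

score_c1 = percent_air1 * text1_c1_probability
score_c2 = percent_air2 * text1_c2_probability
print('class 1' if score_c1 > score_c2 else 'class 2')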
def extract_emoji(text):
    results = nagisa.tagging(text)  # morphological analysis
    words = results.words
    return [w for w in words if w in emoji.UNICODE_EMOJI]
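# Usage sketch for extract_emoji. Note that emoji.UNICODE_EMOJI is the lookup
# table from emoji library releases before 2.0, so that version is an assumption
# here; the sample text is only an illustration.
#
#   extract_emoji('こんばんは😀')
#   # -> ['😀'] is expected, provided nagisa keeps the emoji as a single token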
def tokenize(doc):
    doc = nagisa.tagging(doc)
    return doc.words
def on_paste(self, event):
    self.words = nagisa.tagging(self.master.clipboard_get())
    print(self.words.words)
    self.idx = [0, 0]
    self.arranging(self.words.words)
def wl_word_tokenize(main, text, lang, word_tokenizer='default'): tokens_multilevel = [] if lang not in main.settings_global['word_tokenizers']: lang = 'other' if word_tokenizer == 'default': word_tokenizer = main.settings_custom['word_tokenization'][ 'word_tokenizers'][lang] wl_nlp_utils.init_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer) if word_tokenizer.startswith('spacy_'): # Input of SudachiPy cannot be more than 49149 BYTES if word_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4: # Around 300 tokens per line 4 characters per token and 4 bytes per character (≈ 49149 / 4 / 4 / 300) sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10) else: sections = wl_nlp_utils.split_into_chunks_text( text, section_size=main.settings_custom['files']['misc'] ['read_files_in_chunks']) else: sections = wl_nlp_utils.split_into_chunks_text(text, 1) for section in sections: # spaCy if word_tokenizer.startswith('spacy_'): # Chinese, English, German, Portuguese if not lang.startswith('srp_'): lang = wl_conversion.remove_lang_code_suffixes(main, lang) nlp = main.__dict__[f'spacy_nlp_{lang}'] doc = nlp(section) tokens_multilevel.append([]) len_sents = len(list(doc.sents)) for i, sentence in enumerate(doc.sents): tokens_sentence = [] tokens = [token.text for token in sentence] len_tokens = len(tokens) for j, token in enumerate(tokens): # Split paragraphs by new line character len_lines = len(re.findall(r'\n', token)) if len_lines: # Check if the last paragraph is empty if i == len_sents - 1 and j == len_tokens - 1 and token.endswith( '\n'): len_lines -= 1 if tokens_sentence: tokens_multilevel[-1].append(tokens_sentence) tokens_sentence = [] tokens_multilevel.extend([[] for j in range(len_lines)]) else: if token.strip(): tokens_sentence.append(token) if tokens_sentence: tokens_multilevel[-1].append(tokens_sentence) else: tokens_multilevel.append([]) if section.strip(): # NLTK if word_tokenizer.startswith('nltk_'): sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang) if word_tokenizer == 'nltk_nist': for sentence in sentences: tokens_multilevel[-1].append( main.nltk_nist_tokenizer.tokenize(sentence)) elif word_tokenizer == 'nltk_nltk': for sentence in sentences: tokens_multilevel[-1].append( main.nltk_nltk_tokenizer.tokenize(sentence)) elif word_tokenizer == 'nltk_penn_treebank': for sentence in sentences: tokens_multilevel[-1].append( main.nltk_treebank_tokenizer.tokenize( sentence)) elif word_tokenizer == 'nltk_tok_tok': for sentence in sentences: tokens_multilevel[-1].append( main.nltk_toktok_tokenizer.tokenize(sentence)) elif word_tokenizer == 'nltk_twitter': for sentence in sentences: tokens_multilevel[-1].append( main.nltk_tweet_tokenizer.tokenize(sentence)) # Sacremoses elif word_tokenizer == 'sacremoses_moses': lang = wl_conversion.remove_lang_code_suffixes(main, lang) sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang) for sentence in sentences: tokens_multilevel[-1].append( main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] .tokenize(sentence, escape=False)) # Chinese elif word_tokenizer == 'jieba_zho': sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang=lang) for sentence in sentences: tokens_multilevel[-1].append(jieba.lcut(sentence)) elif word_tokenizer == 'pkuseg_zho': sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang=lang) for sentence in sentences: tokens_multilevel[-1].append( main.pkuseg_word_tokenizer.cut(sentence)) elif word_tokenizer == 
'wordless_zho_char': sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang=lang) for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wl_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # English if wl_checking_unicode.is_eng(char): for j, _ in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wl_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( wl_word_tokenize( main, sentence[ non_han_start:i + j + 1], lang='eng_us')) tokens = list( wl_misc.flatten_list( tokens)) non_han_start = i + j + 1 break # Other Languages else: for j, _ in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or wl_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wl_word_tokenize( main, sentence[ non_han_start:i + j + 1], lang='other')) tokens = list( wl_misc.flatten_list( tokens)) non_han_start = i + j + 1 break tokens_multilevel[-1].append(tokens) # Japanese elif word_tokenizer == 'nagisa_jpn': import nagisa sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang=lang) for sentence in sentences: tokens_multilevel[-1].append( nagisa.tagging(str(sentence)).words) elif word_tokenizer.startswith('sudachipy_jpn'): sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang=lang) if word_tokenizer == 'sudachipy_jpn_split_mode_a': for sentence in sentences: tokens_multilevel[-1].append([ token.surface() for token in main.sudachipy_word_tokenizer. tokenize(sentence, sudachipy.SplitMode.A) ]) elif word_tokenizer == 'sudachipy_jpn_split_mode_b': for sentence in sentences: tokens_multilevel[-1].append([ token.surface() for token in main.sudachipy_word_tokenizer. tokenize(sentence, sudachipy.SplitMode.B) ]) elif word_tokenizer == 'sudachipy_jpn_split_mode_c': for sentence in sentences: tokens_multilevel[-1].append([ token.surface() for token in main.sudachipy_word_tokenizer. 
tokenize(sentence, sudachipy.SplitMode.C) ]) elif word_tokenizer == 'wordless_jpn_kanji': sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang=lang) for sentence in sentences: tokens = [] non_han_start = 0 for i, char in enumerate(sentence): if i >= non_han_start: if wl_checking_unicode.is_han(char): tokens.append(char) non_han_start += 1 else: # Japanese Kana if wl_checking_unicode.is_kana(char): for j, _ in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wl_checking_unicode.is_kana( sentence[i + j + 1]): tokens.extend( wl_word_tokenize( main, sentence[ non_han_start:i + j + 1], lang='jpn')) tokens = list( wl_misc.flatten_list( tokens)) non_han_start = i + j + 1 break # English elif wl_checking_unicode.is_eng(char): for j, _ in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or not wl_checking_unicode.is_eng( sentence[i + j + 1]): tokens.extend( wl_word_tokenize( main, sentence[ non_han_start:i + j + 1], lang='eng_us')) tokens = list( wl_misc.flatten_list( tokens)) non_han_start = i + j + 1 break # Other Languages else: for j, _ in enumerate(sentence[i:]): if i + j + 1 == len( sentence ) or wl_checking_unicode.is_han( sentence[i + j + 1]): tokens.extend( wl_word_tokenize( main, sentence[ non_han_start:i + j + 1], lang='other')) tokens = list( wl_misc.flatten_list( tokens)) non_han_start = i + j + 1 break tokens_multilevel[-1].append(tokens) # Icelandic elif word_tokenizer == 'tokenizer_isl': sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang='isl', sentence_tokenizer='tokenizer_isl') for sentence in sentences: tokens_multilevel[-1].append([ token for kind, token, val in tokenizer.tokenize( sentence) if token ]) # Thai elif word_tokenizer.startswith('pythainlp_'): # Preserve sentence boundaries sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang='tha') if word_tokenizer == 'pythainlp_longest_matching': for sentence in sentences: tokens_multilevel[-1].append( pythainlp.word_tokenize(sentence, engine='longest')) elif word_tokenizer == 'pythainlp_max_matching': for sentence in sentences: tokens_multilevel[-1].append( pythainlp.word_tokenize(sentence, engine='mm')) elif word_tokenizer == 'pythainlp_max_matching_tcc': for sentence in sentences: tokens_multilevel[-1].append( pythainlp.word_tokenize(sentence, engine='newmm')) elif word_tokenizer == 'pythainlp_max_matching_tcc_safe_mode': for sentence in sentences: tokens_multilevel[-1].append( pythainlp.word_tokenize(sentence, engine='newmm-safe')) elif word_tokenizer == 'pythainlp_nercut': for sentence in sentences: tokens_multilevel[-1].append( pythainlp.word_tokenize(sentence, engine='nercut')) # Tibetan elif word_tokenizer == 'botok_bod': sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang='bod') for sentence in sentences: tokens_multilevel[-1].append([ token.text for token in main.botok_word_tokenizer.tokenize(sentence) ]) # Vietnamese elif word_tokenizer == 'underthesea_vie': sentences = wl_sentence_tokenization.wl_sentence_tokenize( main, section, lang='vie', sentence_tokenizer='underthesea_vie') for sentence in sentences: tokens_multilevel[-1].append( underthesea.word_tokenize(str(sentence))) # Remove empty tokens and strip whitespace for para in tokens_multilevel: for i, sentence in enumerate(para): para[i] = [token.strip() for token in sentence if token.strip()] # Record token boundaries if lang in ['zho_cn', 'zho_tw', 'jpn']: for para in tokens_multilevel: for sentence in para: if sentence: sentence[-1] = 
wl_texts.Wl_Token(sentence[-1], boundary='', sentence_ending=True) else: for para in tokens_multilevel: for sentence in para: if sentence: sentence[-1] = wl_texts.Wl_Token(sentence[-1], boundary=' ', sentence_ending=True) return tokens_multilevel
def wl_pos_tag_text(main, text, lang, pos_tagger, tagset):
    tokens_tagged = []

    if pos_tagger == 'nagisa_jpn':
        # Defer import to save loading time
        import nagisa

    # spaCy
    if pos_tagger.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        if tagset == 'default':
            tokens_tagged = [(token.text, token.tag_) for token in doc]
        elif tagset == 'universal':
            tokens_tagged = [(token.text, token.pos_) for token in doc]
    # Chinese
    elif pos_tagger == 'jieba_zho':
        tokens_tagged = jieba.posseg.cut(text)
    # English & Russian
    elif pos_tagger == 'nltk_perceptron':
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)
        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang=lang)
        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == 'nagisa_jpn':
        tokens_tagged = nagisa.tagging(text)
        tokens_tagged = zip(tokens_tagged.words, tokens_tagged.postags)
    elif pos_tagger == 'sudachipy_jpn':
        tokens_tagged = [
            (token.surface(),
             '-'.join([pos for pos in token.part_of_speech()[:4] if pos != '*']))
            for token in main.sudachipy_word_tokenizer.tokenize(text)
        ]
    # Russian & Ukrainian
    elif pos_tagger == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr
        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang=lang)
        for token in tokens:
            tokens_tagged.append(
                (token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger.startswith('pythainlp_'):
        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang=lang)
        if pos_tagger == 'pythainlp_perceptron_lst20':
            tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='lst20')
        elif pos_tagger == 'pythainlp_perceptron_orchid':
            tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='orchid')
        elif pos_tagger == 'pythainlp_perceptron_pud':
            tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='pud')
    # Tibetan
    elif pos_tagger == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)
        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))
    # Vietnamese
    elif pos_tagger == 'underthesea_vie':
        tokens_tagged = underthesea.pos_tag(text)

    # Remove empty tokens and strip whitespace in tokens
    tokens_tagged = [
        (str(token).strip(), tag)
        for token, tag in tokens_tagged
        if str(token).strip()
    ]

    return tokens_tagged
https://goo.gl/dMHvwc
================================================
クラウドエース株式会社
第一営業部
" 信実さを持ち、応えていく "
茨城県出身
石塚 健斗 〈KENTO ISHIZUKA〉
e-mail   [email protected]
web      https://www.cloud-ace.jp?xx=1
tel      03-6280-5939 (本社)
tel      080-3668-6458 (携帯)
fax      03-6800-3954
address  〒100-0004
         東京都千代田区大手町2丁目6番2号
         日本ビルヂング11F
"""

texts = text.splitlines()
for text in texts:
    words = nagisa.tagging(text)
    print("___________________________________")
    print(words)
    #=> Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞

    # Get a list of words
    print(words.words)
    #=> ['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']

    # Get a list of POS-tags
    print(words.postags)
    #=> ['名詞', '助詞', '形状詞', '助動詞', '動詞', '名詞', '助動詞']

    if '動詞' not in words.postags:
        print("signature")
def test_tagging(self):
    # test_1
    text = 'Pythonで簡単に使えるツールです'
    output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_2
    output = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text, lower=True)
    self.assertEqual(output, str(words))

    # test_3
    text = 'ニューラルネットワークを使ってます。'
    output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(nagisa.tagging(text)))

    # test_4
    tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', "ニューラルネット"])
    output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(tagger_nn.tagging(text)))

    # test_5
    text = "3月に見た「3月のライオン」"
    new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
    output = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
    self.assertEqual(output, str(new_tagger.tagging(text)))

    # test_6
    text = "それが、iPhone XSです。"
    output = "それ/代名詞 が/助詞 、/補助記号 iPhone XS/名詞 です/助動詞 。/補助記号"
    new_tagger = nagisa.Tagger(single_word_list=["iPhone[a-zA-Z0-9 ]+"])
    self.assertEqual(output, str(new_tagger.tagging(text)))

    # test_7
    text = "1234abc ABC"
    output = "1234/名詞 abc ABC/名詞"
    new_tagger = nagisa.Tagger(single_word_list=["[a-zA-Z ]+", "[0-9]+"])
    self.assertEqual(output, str(new_tagger.tagging(text)))

    # test_8
    text = '(人•ᴗ•♡)こんばんは♪'
    output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_9
    url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
    output = 'コード/名詞 公開/名詞 中/接尾辞'
    words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))

    # test_10
    output = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑ ̄ω ̄๑)/補助記号'
    words = nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))

    # test_11
    words = [" (人•ᴗ•♡)", "こんばんは", "♪"]
    output = ['補助記号', '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)

    # test_12
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)

    # test_13
    words = [" (人•ᴗ•♡)", " ", "こんばんは", "♪"]
    output = ['補助記号', "空白", '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)

    # test_14
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)

    # test_15
    words = [" (人•ᴗ•♡)", " ", "こんばんは", "♪"]
    output = ['補助記号', "空白", '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)

    # test_16
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)

    # test_17
    text = "こんばんは😀"
    output = "こんばんは/感動詞 😀/補助記号"
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_18
    text = "コンバンハ12345"
    output = "コンバンハ/名詞 1/名詞 2/名詞 3/名詞 4/名詞 5/名詞"
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_19
    text = "𪗱𪘂𪘚𪚲"
    output = "𪗱/補助記号 𪘂/補助記号 𪘚/補助記号 𪚲/補助記号"
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))