def test_tagging(self):
    """Exercise nagisa's tagging, filter/extract, and POS-tagging APIs
    against fixed expected outputs."""
    # test_1: default segmentation + POS tagging keeps the surface form.
    text = 'Pythonで簡単に使えるツールです'
    expected = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    self.assertEqual(expected, str(nagisa.tagging(text)))
    # test_2: lower=True downcases ASCII letters before tagging.
    expected = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    self.assertEqual(expected, str(nagisa.tagging(text, lower=True)))
    # test_3: the default model splits this compound noun.
    text = 'ニューラルネットワークを使ってます。'
    expected = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(expected, str(nagisa.tagging(text)))
    # test_4: single_word_list keeps the listed words as one token each.
    tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', "ニューラルネット"])
    expected = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(expected, str(tagger_nn.tagging(text)))
    # test_5: a single-word entry also matches inside brackets.
    text = "3月に見た「3月のライオン」"
    new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
    expected = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
    self.assertEqual(expected, str(new_tagger.tagging(text)))
    # test_6: emoticons come out as supplementary symbols (補助記号).
    text = '(人•ᴗ•♡)こんばんは♪'
    expected = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
    self.assertEqual(expected, str(nagisa.tagging(text)))
    # test_7: filter() drops tokens whose POS is in filter_postags.
    url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
    expected = 'コード/名詞 公開/名詞 中/接尾辞'
    self.assertEqual(
        expected, str(nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])))
    # test_8: extract() keeps only tokens whose POS is in extract_postags.
    expected = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑ ̄ω ̄๑)/補助記号'
    self.assertEqual(
        expected, str(nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])))
    # test_9: postagging() tags an already-segmented token list.
    tokens = [" (人•ᴗ•♡)", "こんばんは", "♪"]
    expected_tags = ['補助記号', '感動詞', '補助記号']
    self.assertEqual(expected_tags, nagisa.postagging(tokens))
    # test_10: decode() yields the same tag sequence for the same tokens.
    self.assertEqual(expected_tags, nagisa.decode(tokens))
def main():
    """Evaluate the trained KWDLC NER tagger on the held-out test set
    and print accuracy, macro-F1, and a per-label report."""
    # Load the test set: word sequences and their gold tag sequences.
    test_X, test_Y = nagisa.utils.load_file("kwdlc.test")

    # Build the tagger from the trained KWDLC model files.
    ner_tagger = nagisa.Tagger(vocabs='kwdlc_ner_model.vocabs',
                               params='kwdlc_ner_model.params',
                               hp='kwdlc_ner_model.hp')

    # Predict one tag sequence per sentence, kept aligned with the gold
    # tags (zip truncates to the shortest of words/pred/gold, as before).
    true_Y = []
    pred_Y = []
    for words, gold_tags in zip(test_X, test_Y):
        decoded = ner_tagger.decode(words)
        aligned = [(gold, pred) for _w, pred, gold in zip(words, decoded, gold_tags)]
        true_Y.append([gold for gold, _ in aligned])
        pred_Y.append([pred for _, pred in aligned])

    # Report sequence-labelling metrics.
    accuracy = accuracy_score(true_Y, pred_Y)
    print("accuracy: {}".format(accuracy))
    f1 = f1_score(true_Y, pred_Y)
    print("macro-f1: {}".format(f1))
    report = classification_report(true_Y, pred_Y)
    print(report)
def test_tagging(self):
    """Smoke-test the core nagisa APIs: tagging, custom single-word
    dictionaries, and POS-based filtering."""
    # test_1: default segmentation + POS tagging.
    text = 'Pythonで簡単に使えるツールです'
    expected = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    self.assertEqual(expected, str(nagisa.tagging(text)))
    # test_2: the default model splits this compound noun.
    text = 'ニューラルネットワークを使ってます。'
    expected = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(expected, str(nagisa.tagging(text)))
    # test_3: single_word_list keeps the listed word as one token.
    tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク'])
    expected = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(expected, str(tagger_nn.tagging(text)))
    # test_4: emoticons come out as supplementary symbols (補助記号).
    text = '(人•ᴗ•♡)こんばんは♪'
    expected = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
    self.assertEqual(expected, str(nagisa.tagging(text)))
    # test_5: filter() drops tokens whose POS is in filter_postags.
    url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
    expected = 'コード/名詞 公開/名詞 中/接尾辞'
    self.assertEqual(
        expected, str(nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])))
def __init__(self, data, single_words, stop_words, extract_postags, word_num,
             parser, parse_func):
    """Load *data*, tokenize every document, and keep per-document word
    counts.

    parser     -- an existing tagger to reuse; when falsy, a nagisa.Tagger
                  built from *single_words* is created instead.
    parse_func -- optional tokenizer callable; when falsy, self.parse is
                  used.  NOTE(review): semantics of data/stop_words are
                  assumed from usage elsewhere in the class — confirm.
    """
    self.words, self.names = self._init_data(data)
    self.word_num = word_num
    self.single_words = single_words
    self.extract_postags = extract_postags
    self.stop_words = stop_words
    # Reuse the caller-supplied tagger when given; otherwise build one that
    # keeps each entry of single_words as a single token.
    self.parser = parser or nagisa.Tagger(single_word_list=self.single_words)
    self.num_regex = re.compile('^[0-9]+$')
    # Tokenize each document with the chosen callable, then count tokens.
    tokenize = parse_func or self.parse
    self.words = [self.count(tokenize(x)) for x in self.words]
def main():
    """Run the trained KWDLC NER tagger over the test set and print a
    per-label classification report."""
    # Build the tagger from the trained model files.
    ner_tagger = nagisa.Tagger(
        vocabs='data/kwdlc_ner_model.vocabs',
        params='data/kwdlc_ner_model.params',
        hp='data/kwdlc_ner_model.hp'
    )

    # Load the test set and flatten gold/predicted tags into single lists.
    fn_in_test = "data/kwdlc.test"
    test_X, test_Y = nagisa.utils.load_file(fn_in_test)
    true_Y = []
    pred_Y = []
    for sentence, gold_tags in zip(test_X, test_Y):
        predicted = ner_tagger.decode(sentence)
        true_Y.extend(gold_tags)
        pred_Y.extend(predicted)

    print(classification_report(true_Y, pred_Y))
import requests
import json
import jieba

from flask import Flask, request, Response

app = Flask(__name__, static_url_path='')

import nagisa

# Load the trained Cantonese segmentation/tagging model once at startup.
tagger = nagisa.Tagger(
    vocabs='cantonese/model.vocabs',
    params='cantonese/model.params',
    hp='cantonese/model.hp')


def q(s):
    """Return the tagged representation of *s* as a plain string."""
    return "{}".format(tagger.tagging(s))


@app.route("/api")
def get():
    """Tag the `text` query parameter and return the result as JSON."""
    r = request.args.get('text', '')
    if r == "":
        # Fix: the error branch previously returned JSON with Flask's
        # default text/html mimetype and ASCII-escaped output; serve it
        # consistently with the success path below.
        return Response(
            json.dumps({'status': "error", 'message': "empty input"},
                       ensure_ascii=False),
            mimetype="application/json")
    return Response(
        json.dumps({'status': "ok", 'message': q(r), 'request': r},
                   ensure_ascii=False),
        mimetype="application/json")


@app.route('/')
def index():
    """Serve the single-page front end."""
    return app.send_static_file('index.html')


if __name__ == "__main__":
    app.run(host='127.0.0.1', port=5003, debug=True)
import nagisa # Build the tagger by loading the trained model files. sample_tagger = nagisa.Tagger(vocabs='cn/sample.vocabs', params='cn/sample.params', hp='cn/sample.hp') while True: text = input(">>") words = sample_tagger.tagging(text) print(words)
dt_now = datetime.datetime.now() start_year_to_filter = st.sidebar.slider('開始年', 2018, 2020, dt_now.year) start_month_to_filter = st.sidebar.slider('開始月', 1, 12, dt_now.month) end_year_to_filter = st.sidebar.slider('終了年', 2018, 2020, dt_now.year) end_month_to_filter = st.sidebar.slider('終了月', 1, 12, dt_now.month) condition = start_year_to_filter * 100 + start_month_to_filter <= end_year_to_filter * 100 + end_month_to_filter user_input = st.text_input("Search", '') data_load_state = st.text('Loading data...') if condition: data = concat_data(start_year_to_filter, start_month_to_filter, end_year_to_filter, end_month_to_filter) if st.checkbox('人気キーワードを表示'): import nagisa tagger = nagisa.Tagger() tags = [ tagger.extract(text, extract_postags=['名詞']).words for text in data['message'].sample(n=100, random_state=7) ] tags = [w for w in list(itertools.chain(*tags)) if len(w) > 2] c = Counter(tags) st.write(', '.join([d[0] for d in c.most_common(10)])) if user_input != '': data = data[data['message'].str.contains( user_input, case=False)].reset_index(drop=True) data_load_state.subheader(f"{len(data)}件中、最新{min(len(data), 5)}件を表示") st.write(data.tail(5).to_html(escape=False), unsafe_allow_html=True) else:
if true_tags != pred_tags: for true_tag, pred_tag in zip(true_tags, pred_tags): if true_tag != pred_tag: if true_tag not in label2id: label2id[true_tag] = len(label2id) if pred_tag not in label2id: label2id[pred_tag] = len(label2id) true_cm.append(label2id[true_tag]) pred_cm.append(label2id[pred_tag]) cm = confusion_matrix(true_cm, pred_cm) labels = list(label2id.keys()) cm_labeled = pd.DataFrame(cm, columns=labels, index=labels) return cm_labeled if __name__ == "__main__": # load the testset test_X, test_Y = load_file("ja_gsd_ud.test") # build the tagger for UD ud_tagger = nagisa.Tagger(vocabs='ja_gsd_ud.vocabs', params='ja_gsd_ud.params', hp='ja_gsd_ud.hp') # create a confusion matrix if tagger make a mistake in prediction. cm_labeled = create_confusion_matrix(ud_tagger, test_X, test_Y) print(cm_labeled)
def handle_message(event):
    """LINE-bot message handler: manages a password-protected debug mode
    backed by Postgres, and otherwise tries to answer a question of the
    form "<noun><particle><pronoun>" from the `questions`/`answers` tables.

    NOTE(review): DATABASE_URL, DEBUG_MODE_PASSWORD, pp_list, pronoun_list,
    line_bot_api and TextSendMessage are module-level names defined outside
    this view — confirm their shapes against the rest of the file.
    """
    text = event.message.text
    user_id = event.source.user_id
    debug_mode_login = False
    verified = False
    # Default reply when no rule below matches.
    reply_text = "ごめんなさい、その文章は理解できないの\n使い方を見るには'help'って送信してみて"
    conn = psycopg2.connect(DATABASE_URL, sslmode='require')
    conn.autocommit = True
    cur = conn.cursor()
    # A row in `admins` means this user started the debug-mode handshake;
    # verified=False means they still owe the password.
    cur.execute("SELECT verified FROM admins WHERE user_id = %s", [user_id])
    result = cur.fetchone()
    if result is not None:
        (verified, ) = result
        if not verified:
            debug_mode_login = True
    if debug_mode_login:
        # User is mid-handshake: this message must be the password.
        if text == DEBUG_MODE_PASSWORD:
            cur.execute("UPDATE admins SET verified = TRUE WHERE user_id = %s",
                        [user_id])
            reply_text = "デバッグモードに入りました\n終了するには'exit'と打って送信してください"
        else:
            # Wrong password: abort the handshake.
            cur.execute("DELETE FROM admins WHERE user_id = %s", [user_id])
            reply_text = '合言葉が違います'
    else:
        if text == 'debug mode':
            if verified:
                reply_text = 'すでにデバッグモードですよ'
            else:
                # Start the handshake; the next message is checked above.
                cur.execute("INSERT INTO admins ( user_id ) values ( %s )",
                            [user_id])
                reply_text = '合言葉を言ってね'
        elif text == 'help':
            f = open('help.txt', 'r')
            reply_text = f.read()
            f.close()
        elif (verified and text == 'exit'):
            cur.execute("DELETE FROM admins WHERE user_id = %s", [user_id])
            reply_text = 'デバッグモードを終了しました'
        else:
            # Question answering: tag the message, treating every known
            # question word as a single token.
            cur.execute("SELECT question FROM questions")
            word_list = [i[0] for i in cur.fetchall()]
            tagger = nagisa.Tagger(single_word_list=word_list)
            words = tagger.tagging(text)
            if '助詞' in words.postags:
                pp_index = words.postags.index('助詞')
                pp = words.words[pp_index]
                # Require: noun before the particle, pronoun after it.
                if (pp in pp_list and '名詞' in words.postags[0:pp_index]
                        and '代名詞' in words.postags[pp_index + 1:len(words.postags)]):
                    noun_index = words.postags.index('名詞', 0, pp_index)
                    noun = words.words[noun_index]
                    pronoun_index = words.postags.index(
                        '代名詞', pp_index + 1, len(words.postags))
                    pronoun = words.words[pronoun_index]
                    for i in pronoun_list:
                        if i['pronoun'] == pronoun:
                            pronoun_type = i['type']
                            cur.execute(
                                "SELECT answer_id FROM questions WHERE question = %s AND type = %s",
                                [noun, pronoun_type])
                            result = cur.fetchone()
                            if result is not None:
                                (answer_id, ) = result
                                cur.execute(
                                    "SELECT answer, sentence FROM answers WHERE id = %s",
                                    [answer_id])
                                (answer, sentence) = cur.fetchone()
                                # NULL sentence falls back to a template.
                                if sentence is None:
                                    sentence = '{0[question]}は{0[answer]}よ'
                                v = dict(question=noun, answer=answer)
                                reply_text = sentence.format(v)
                            break
    cur.close()
    conn.close()
    line_bot_api.reply_message(event.reply_token,
                               TextSendMessage(text=reply_text))
def test_tagging(self):
    """End-to-end checks of nagisa's tagging, filtering, extraction and
    POS-tagging APIs against fixed expected outputs."""
    # test_1: default segmentation + POS tagging keeps the surface form.
    text = 'Pythonで簡単に使えるツールです'
    output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))
    # test_2: lower=True downcases ASCII letters before tagging.
    output = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text, lower=True)
    self.assertEqual(output, str(words))
    # test_3: the default model splits this compound noun.
    text = 'ニューラルネットワークを使ってます。'
    output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(nagisa.tagging(text)))
    # test_4: single_word_list keeps the listed words as one token each.
    tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', "ニューラルネット"])
    output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(tagger_nn.tagging(text)))
    # test_5: a single-word entry also matches inside brackets.
    text = "3月に見た「3月のライオン」"
    new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
    output = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
    self.assertEqual(output, str(new_tagger.tagging(text)))
    # test_6: a single_word_list entry given as a regex pattern matches.
    text = "それが、iPhone XSです。"
    output = "それ/代名詞 が/助詞 、/補助記号 iPhone XS/名詞 です/助動詞 。/補助記号"
    new_tagger = nagisa.Tagger(single_word_list=["iPhone[a-zA-Z0-9 ]+"])
    self.assertEqual(output, str(new_tagger.tagging(text)))
    # test_7: multiple regex patterns in single_word_list.
    text = "1234abc ABC"
    output = "1234/名詞 abc ABC/名詞"
    new_tagger = nagisa.Tagger(single_word_list=["[a-zA-Z ]+", "[0-9]+"])
    self.assertEqual(output, str(new_tagger.tagging(text)))
    # test_8: emoticons come out as supplementary symbols (補助記号).
    text = '(人•ᴗ•♡)こんばんは♪'
    output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))
    # test_9: filter() drops tokens whose POS is in filter_postags.
    url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
    output = 'コード/名詞 公開/名詞 中/接尾辞'
    words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))
    # test_10: extract() keeps only tokens whose POS is in extract_postags.
    output = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑ ̄ω ̄๑)/補助記号'
    words = nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))
    # test_11: postagging() tags an already-segmented token list.
    words = [" (人•ᴗ•♡)", "こんばんは", "♪"]
    output = ['補助記号', '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)
    # test_12: decode() yields the same tag sequence for the same tokens.
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)
    # test_13: a whitespace-only token is tagged 空白 (blank).
    words = [" (人•ᴗ•♡)", " ", "こんばんは", "♪"]
    output = ['補助記号', "空白", '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)
    # test_14: decode() agrees on the whitespace token as well.
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)
    # test_15: repeated whitespace fixture via postagging().
    words = [" (人•ᴗ•♡)", " ", "こんばんは", "♪"]
    output = ['補助記号', "空白", '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)
    # test_16: repeated whitespace fixture via decode().
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)
    # test_17: emoji are tagged as supplementary symbols.
    text = "こんばんは😀"
    output = "こんばんは/感動詞 😀/補助記号"
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))
    # test_18: each ASCII digit is segmented as its own token.
    text = "コンバンハ12345"
    output = "コンバンハ/名詞 1/名詞 2/名詞 3/名詞 4/名詞 5/名詞"
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))
    # test_19: characters outside the Basic Multilingual Plane are handled.
    text = "𪗱𪘂𪘚𪚲"
    output = "𪗱/補助記号 𪘂/補助記号 𪘚/補助記号 𪚲/補助記号"
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))
def make_nagisa_tagger(single_words: List[str]):
    """Build a nagisa Tagger that keeps each entry of *single_words* as a
    single token when tagging."""
    tagger = nagisa.Tagger(single_word_list=single_words)
    return tagger
import nagisa

# Build the NER tagger by loading the trained KWDLC model files.
ner_tagger = nagisa.Tagger(
    vocabs="data/kwdlc_ner_model.vocabs",
    params="data/kwdlc_ner_model.params",
    hp="data/kwdlc_ner_model.hp"
)

# Tag a sample sentence and print the token/tag pairs.
text = "FacebookのAIラボ所長でもあるヤン・ルカン博士"
tokens = ner_tagger.tagging(text)
print(tokens)