def __init__(
        self,
        incTags=None,
        dbConfig=None,
        isLogOut=True):
    """Initialize the chatbot module: DB connection, Komoran tagger, user dict.

    Parameters
    ----------
    incTags : list[str] | None
        POS-tag prefixes to keep when tokenizing; None selects the default set.
    dbConfig : dict | None
        MySQL connection settings handed to dbConMysql; None selects the
        built-in (masked) defaults.
    isLogOut : bool
        When True, progress messages are printed to stdout.
    """
    # Fix: the original used mutable default arguments (a list and a dict),
    # which are shared across every call. Build fresh defaults per call.
    if incTags is None:
        incTags = ['NN', 'NR', 'NP', 'VA', 'VV', 'SF', 'MA', 'MD', 'XS']
    if dbConfig is None:
        dbConfig = {
            'user': '******',
            'password': '******',
            'host': 'mthx.cafe24.com',
            'database': 'chatbot',
            'raise_on_warnings': True,
        }
    self.isLogOut = isLogOut
    if self.isLogOut:
        print("[Chatbot mod] created")
    self.lIncTags = incTags
    self.db_config = dbConfig
    self.db = dbConMysql(self.db_config)
    if self.isLogOut:
        print("[Chatbot mod] db is created.")
    # Refresh the user dictionary file, then load it into the tagger.
    self.komohelper = komoHelper()
    self.komohelper.UpdateTextFile("userDict.txt")
    self.komoran = Komoran()
    self.komoran.set_user_dictionary("userDict.txt")
    # 0 = not trained yet (presumably flipped after learning) — TODO confirm.
    self.state = 0
"""Smoke test for the Komoran tagger and the crawler text reader."""
import sys
import time
import pprint

sys.path.append("../Common/Crawler")
sys.path.append("../Common/Util")
sys.path.append("../Common/NLP")

from komoran.komoran3py import Komoran
from mod_crawler_base import craw_file_reader as txt_reader
from mod_crawler_base import craw_history_logger as logger

# 1. Komoran test: tokenize a sample sentence with the user dictionary loaded.
komoran = Komoran()
komoran.set_user_dictionary('../Data/KomoranDic/user_dictionary.txt')
result = komoran.getTokensList('청하는아이오아이멤버입니다')
pprint.pprint(result)

# 2. txt_reader: iterate the corpus document by document.
txtreader = txt_reader("../Data/Text/Joins/Sasul.txt", False)
list_words = []
for i, doc in enumerate(txtreader):
    # if i == 10: break  # uncomment to limit the run while debugging
    if False:  # debug dump of the first five tab-separated columns
        columns = doc.split("\t")
        for field in columns[:5]:
            print(field)
class mod_chatbot_dialog:
    """Naive-Bayes chatbot.

    Learns sentence -> intent mappings from the ``tDlgTest`` DB table using
    one-hot word_TAG features, then answers queries with the stored reply of
    the predicted intent class.
    """

    def __init__(
            self,
            incTags=None,
            dbConfig=None,
            isLogOut=True):
        """Set up the DB connection and the Komoran tagger.

        Parameters
        ----------
        incTags : list[str] | None
            POS-tag prefixes kept during tokenization; None -> default set.
        dbConfig : dict | None
            MySQL settings for dbConMysql; None -> built-in (masked) defaults.
        isLogOut : bool
            When True, progress messages are printed.
        """
        # Fix: the original used mutable default arguments (list and dict),
        # which are shared across calls. Rebuild fresh defaults per call.
        if incTags is None:
            incTags = ['NN', 'NR', 'NP', 'VA', 'VV', 'SF', 'MA', 'MD', 'XS']
        if dbConfig is None:
            dbConfig = {
                'user': '******',
                'password': '******',
                'host': 'mthx.cafe24.com',
                'database': 'chatbot',
                'raise_on_warnings': True,
            }
        self.isLogOut = isLogOut
        if self.isLogOut:
            print("[Chatbot mod] created")
        self.lIncTags = incTags
        self.db_config = dbConfig
        self.db = dbConMysql(self.db_config)
        if self.isLogOut:
            print("[Chatbot mod] db is created.")
        self.komohelper = komoHelper()
        self.komohelper.UpdateTextFile("userDict.txt")
        self.komoran = Komoran()
        self.komoran.set_user_dictionary("userDict.txt")
        self.state = 0  # 0 = not trained, 1 = trained (set by learning())

    def learning(self):
        """Fetch dialog rows, build one-hot features, and fit GaussianNB.

        Leaves the trained classifier in ``self.clf``, the training matrix in
        ``self.train`` and the word->column map in ``self.dicEntry``.
        Returns early (untrained) when the table is empty.
        """
        self.lTokens_tagged = []
        self.db_ret = self.db.selectQuery("select * from tDlgTest")
        if len(self.db_ret) == 0:
            print("[Chatbot mod] Dialog Data is empty.")
            return
        self.df = pd.DataFrame(self.db_ret)
        self.ds = dp.mod_ds_helper(self.df)
        # Tokenize every sentence into its filtered "morph_TAG" list.
        lWordTags = [self._cvtSentToWordTagList(self.df.loc[i, 'sentence'])
                     for i in range(self.df.shape[0])]
        self.df['Tags'] = lWordTags
        # listField -> counter -> dict (dicTags kept for its helper side
        # effects — presumably none, verify against mod_ds_helper).
        dicTags = self.ds.get_cntDict_from_listField("Tags")
        self.dicEntry = self.ds.get_entryDict_from_listField(
            "Tags")  # word_TAG -> feature index, reused by _sentToVector
        self.ds.cvt_klistTovlist("Tags", "TagsVal", self.dicEntry)
        self.train = self.ds.get_onehotcoding_df('TagsVal')
        dicIntent = self.ds.get_entryDict_from_Field("sentID")
        self.ds.cvtWithMap("sentID", "sentID", dicIntent)
        self.target = self.ds.df['sentID']
        self.clf = GaussianNB()
        self.clf.fit(self.train, self.target)
        self.state = 1

    # "명사" ("noun") -> "명사_NN"
    def _cvtSentToWordTagList(self, sent):
        """Tokenize *sent*; keep tags whose 2-char prefix is in lIncTags."""
        listWordTag = []
        for elem in self.komoran.getTokensList(sent):
            tag = elem['pos'][:2]
            if tag in self.lIncTags:  # idiom fix: no .keys()/index juggling
                listWordTag.append(elem['morph'] + "_" + tag)
        return listWordTag

    def _sentToVector(self, sent):
        """One-hot encode *sent* against the learned vocabulary.

        Returns a 1-row DataFrame with one column per dicEntry feature.
        """
        listWordTag = self._cvtSentToWordTagList(sent)
        # Start every known feature column at 0, then mark present words.
        dict_df = dict.fromkeys(self.dicEntry.values(), 0)
        for wordTag in listWordTag:
            if wordTag in self.dicEntry:  # idiom fix: was `in ....keys()`
                dict_df[self.dicEntry[wordTag]] = 1
        return pd.DataFrame([dict_df])

    def reply(self, sent):
        """Predict the intent of *sent* and print the stored reply.

        Falls back to a canned Korean "I don't understand yet" message when
        the query shares no vocabulary with the predicted class.
        """
        ret_df = self._sentToVector(sent)
        # Fix: index [0] before int() — converting a 1-element ndarray with
        # int() directly is deprecated in NumPy.
        class_id = int(self.clf.predict(ret_df)[0])
        # Overlap check: zero inner product means no shared feature at all.
        inner_product = np.inner(self.train.loc[class_id], ret_df)
        if inner_product[0] != 0:
            print("%s" % (self.db_ret[int(class_id)]['reply']))
        else:
            print("언젠가는 네가 무슨 말을 하는지 알수 있겠지?")

    def dump(self):
        """Persist the trained classifier to ``chatbot_clf.pkl``."""
        with open('chatbot_clf.pkl', 'wb') as f:
            pk.dump(self.clf, f)
        if False:  # disabled: entry-dict / tag-list persistence (kept as-is)
            with open('chatbot_dicEntry.pkl', 'wb') as f:
                pk.dump(self.dicEntry, f)
            with open('chatbot_lncTags.pkl', 'wb') as f:
                pk.dump(self.lIncTags, f)
"""Komoran tokenizer smoke tests plus a user-dictionary update check."""
import sys
import time
import pprint

sys.path.append("../Common/Crawler")
sys.path.append("../Common/Util")
sys.path.append("../Common/NLP")

from komoran.komoran3py import Komoran

komoran = Komoran()
komoran.set_user_dictionary('../Data/KomoranDic/user_dictionary.txt')

# Unspaced sentence: relies on the user dictionary for correct segmentation.
result = komoran.getTokensList('청하는아이오아이멤버입니다')
pprint.pprint(result)

Sentence = "사냥과 채집을 하던 시절에 공간을 기억하는 능력은 생존과 직결되었다."
result = komoran.getTokensList(Sentence)
pprint.pprint(result)

Sentence = "운동은 신체의 건강 유지는 물론 전두엽이 담당하는 기억과 학습 능력의 향상을 위해서도 매우 중요한 생활 습관이다"
result = komoran.getTokensList(Sentence)
pprint.pprint(result)

Sentence = "빛의자녀들교회는하나님을사랑하는교회입니다."
result = komoran.getTokensList(Sentence)
pprint.pprint(result)

# Add a new word to the library's user dictionary, to test that a freshly
# added word is applied.
# Fix: use a context manager so the handle is closed even if write() raises
# (the original used bare open/write/close).
with open('../Data/KomoranDic/user_dictionary.txt', "a+", encoding="UTF-8") as f:
    f.write("\n%s\t%s" % ("빛의자녀들교회", "NNP"))
# Environment wiring for the chatbot experiment: module search paths,
# NLP/visualization helpers, and a live DB handle on the chatbot schema.
sys.path.append("../../Common/Crawler")
sys.path.append("../../Common/Util")
sys.path.append("../../Common/NLP")
sys.path.append("../../Common/Mysql")
sys.path.append("../../Common")

from komoran.komoran3py import Komoran
from Visualizer import mod_viz_helper as viz
from DataScience import mod_ds_helper as dp
from mod_nlp_helper import komoHelper
from libmysql import dbConMysql

komohelper = komoHelper()
komoran = Komoran()

# POS-tag prefixes retained when tokenizing.
listIncTag = ['NN', 'NR', 'NP', 'VA', 'VV', 'SF', 'MA', 'MD', 'XS']

# Connection settings for the chatbot schema (credentials masked in source).
config = {
    'user': '******',
    'password': '******',
    'host': 'mthx.cafe24.com',
    'database': 'chatbot',
    'raise_on_warnings': True,
}

db = dbConMysql(config)
dlg_ret = db.selectQuery("select * from tDlgTest")
# Tagger and bigram tooling setup for dialog analysis.
sys.path.append("../Common")

from komoran.komoran3py import Komoran
from Visualizer import mod_viz_helper as viz
from DataScience import mod_ds_helper as dp
from mod_nlp_helper import komoHelper

komohelper = komoHelper()

# for Bigram
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# Create Tagger with komoran
komoran = Komoran()

# POS-tag prefixes to include when filtering tokens.
listIncTag = ['NN', 'NR', 'NP', 'VA', 'VV', 'SF', 'MA', 'MD', 'XS']

# Komoran test (disabled debug branch — reads a CSV dialog dump).
if False:
    df_dlg = pd.read_csv("../Data/Dialog/Dialog1.csv", delimiter=",", encoding="euckr")
    print(df_dlg)

listDic = []


def apply_df_komoran(x):
    """Tokenize *x* with Komoran and collect the token list into listDic."""
    tokens = komoran.getTokensList(x)
    listDic.append(tokens)