Esempio n. 1
0
    def learning(self):
        """Fit the classifier from the rows of the ``tDlgTest`` table.

        Steps, in order:

        1. Run ``select * from tDlgTest`` through ``self.db.selectQuery``.
           When the result has length 0, print a notice and return without
           training.
        2. Wrap the rows in a ``pandas.DataFrame`` (``self.df``) and in a
           ``dp.mod_ds_helper`` (``self.ds``).
        3. Add a ``'Tags'`` column holding, for each row, the result of
           ``self._cvtSentToWordTagList`` applied to its ``'sentence'`` value.
        4. Use the ``self.ds`` helper methods to derive ``self.dicEntry``,
           the ``'TagsVal'`` field, the feature frame ``self.train`` and the
           label column ``self.target`` (taken from ``'sentID'``).
        5. Fit a ``GaussianNB`` on ``self.train`` / ``self.target``, store it
           as ``self.clf`` and set ``self.state`` to 1.

        Returns:
            None.
        """
        self.lTokens_tagged = []
        self.db_ret = self.db.selectQuery("select * from tDlgTest")
        if not len(self.db_ret):
            print("[Chatbot mod] Dialog Data is empty.")
            return

        self.df = pd.DataFrame(self.db_ret)
        self.ds = dp.mod_ds_helper(self.df)

        # One word-tag list per row; rows are addressed by the labels
        # 0 .. row_count - 1.
        row_count = self.df.shape[0]
        self.df['Tags'] = [
            self._cvtSentToWordTagList(self.df.loc[row, 'sentence'])
            for row in range(row_count)
        ]

        # listField -> counter -> Dict
        # The returned dict is not used here; the call is kept in case the
        # helper has side effects.
        self.ds.get_cntDict_from_listField("Tags")
        entry_map = self.ds.get_entryDict_from_listField("Tags")
        self.dicEntry = entry_map
        self.ds.cvt_klistTovlist("Tags", "TagsVal", entry_map)

        # Features come from 'TagsVal'; labels come from the remapped 'sentID'.
        self.train = self.ds.get_onehotcoding_df('TagsVal')
        intent_map = self.ds.get_entryDict_from_Field("sentID")
        self.ds.cvtWithMap("sentID", "sentID", intent_map)
        self.target = self.ds.df['sentID']

        model = GaussianNB()
        self.clf = model
        model.fit(self.train, self.target)
        self.state = 1
Esempio n. 2
0
import sys
sys.path.append("../../Common")
from Visualizer import mod_viz_helper as viz
from DataScience import mod_ds_helper as dp

# Movie recommendation engine code,
# based on similarity
#https://www.kaggle.com/fabiendaniel/film-recommendation-engine/notebook
#
#https://www.kaggle.com/sirpunch/exploring-the-movies-dataset/data

# 1. Load CSV
# Both files are read via relative paths, i.e. from the current working
# directory.
# NOTE(review): `pd` is not imported in the visible import lines — confirm
# that `import pandas as pd` exists elsewhere in the file.
df_movie = pd.read_csv("tmdb_5000_movies.csv")
df_cred = pd.read_csv("tmdb_5000_credits.csv")

# Wrap each DataFrame in the project's mod_ds_helper class.
ds_movie = dp.mod_ds_helper(df_movie)
ds_cred = dp.mod_ds_helper(df_cred)

# 2. Show information about both wrapped datasets
ds_movie.info()
ds_cred.info()

# Processing JSON fields
# 3. Convert the JSON "genres" field to lists
# NOTE(review): presumably pulls the "name" / "id" value of every entry in the
# "genres" column into the new columns "gen_name" / "gen_id" — confirm against
# mod_ds_helper.json_to_list_withField.
ds_movie.json_to_list_withField("genres", "gen_name", "name")
ds_movie.json_to_list_withField("genres", "gen_id", "id")

# Disabled debug output: change the condition to True to print the two
# columns created above.
if False:  # checked
    print(ds_movie.df['gen_name'])
    print(ds_movie.df['gen_id'])
        if (i in dicVectorWord.keys()):
            dict_df[dicVectorWord[i]] = 1
    list_dict_df = [dict_df]
    tmp_df = pd.DataFrame(list_dict_df)
    return tmp_df


# Convert every sentence of df_dlg with cvtSentToWordTagList and collect the
# results, one per row, in listTokens_s.
# NOTE(review): nShape, df_dlg, listTokens_s, komoran and listIncTag are
# defined outside the visible part of the file.
for i in range(0, nShape):
    # NOTE(review): listTokens is reset on every iteration but never read in
    # the visible code.
    listTokens = []
    sent   = df_dlg.loc[i, 'sentence']
    listTokens_s.append( cvtSentToWordTagList( sent, komoran, listIncTag ) )

# Store the per-sentence results as a new 'TAGS' column.
df_dlg['TAGS'] = listTokens_s

print( df_dlg )
ds_dialog = dp.mod_ds_helper(df_dlg )
# NOTE(review): presumably a count dictionary over the values in the "TAGS"
# lists — confirm against mod_ds_helper.get_cntDict_from_listField.
dicDatas = ds_dialog.get_cntDict_from_listField("TAGS")
print( dicDatas )

dicEntry = ds_dialog.get_entryDict_from_listField("TAGS")
print( dicEntry )

# Derive the 'TAGS_V' field from 'TAGS' using dicEntry.
ds_dialog.cvt_klistTovlist("TAGS", "TAGS_V", dicEntry)
print(ds_dialog.df['TAGS_V'])

# Decimal values showed up in the one-hot encoding, so they were removed.
train = ds_dialog.get_onehotcoding_df('TAGS_V')
print(ds_dialog.df.sentID)

dicIntent = ds_dialog.get_entryDict_from_Field("sentID")
print("dicIntent={}".format(dicIntent))