def EmolexRes():
    """Load the Emolex 63-min non-retweet prediction CSV and analyze it.

    Reads ``Predict-Emolex-63min-NonRT.csv`` from the game-info
    directory, coerces score columns to int and pn columns to float,
    drops games without Twitter data (marked by ``pn_home == -1``),
    prints the analysis via ``EmolexAnalyze`` and returns the frame.

    Returns:
        The filtered, type-coerced DataFrame.
    """
    # Read as DF
    os.chdir(paths.READ_PATH_GAME_INFO)
    dfEmolex = useful_methods.csv_dic_df("Predict-Emolex-63min-NonRT.csv")

    # Convert string columns to their numeric types (CSV loads as str).
    for col in ('score_ht_home', 'score_ht_away',
                'score_ft_home', 'score_ft_away'):
        dfEmolex[col] = [int(st) for st in dfEmolex[col]]
    for col in ('pn_home', 'pn_away'):
        dfEmolex[col] = [float(st) for st in dfEmolex[col]]

    # Use only games that have Twitter data (-1 marks "no data").
    dfEmolex = dfEmolex[(dfEmolex['pn_home'] != -1)].copy().reset_index(
        drop=True)

    # Show results
    EmolexAnalyze(dfEmolex)
    return dfEmolex
def ReviewRes():
    """Load the Naive-Bayes 63-min prediction CSV and analyze it.

    Reads ``Predict-NB-63min.csv`` from the game-info directory,
    coerces score columns to int and NB probability columns to float,
    drops games without Twitter data (marked by ``nb_neg_home == -1``),
    prints the analysis via ``ReviewAnalyze`` and returns the frame.

    Returns:
        The filtered, type-coerced DataFrame.
    """
    # Read as DF
    os.chdir(paths.READ_PATH_GAME_INFO)
    dfNB = useful_methods.csv_dic_df("Predict-NB-63min.csv")

    # Convert string columns to their numeric types (CSV loads as str).
    for col in ('score_ht_home', 'score_ht_away',
                'score_ft_home', 'score_ft_away'):
        dfNB[col] = [int(st) for st in dfNB[col]]
    for col in ('nb_pos_home', 'nb_neg_home', 'nb_pos_away', 'nb_neg_away'):
        dfNB[col] = [float(st) for st in dfNB[col]]

    # Use only games that have Twitter data (-1 marks "no data").
    dfNB = dfNB[(dfNB['nb_neg_home'] != -1)].copy().reset_index(drop=True)

    # Print results
    ReviewAnalyze(dfNB)
    return dfNB
# *******************************************************
# *******************************************************
# Limitations
TIME_LIMIT = 63
RETWEET_STATUS = False
FILTER_STATUS = True
START_TIME = 1
END_TIME = 63

# *******************************************************
# *******************************************************
# Game Infos: load the master game table and drop incomplete rows.
os.chdir(paths.READ_PATH_GAME_INFO)
dfGameInfos = useful_methods.csv_dic_df('game_infos.csv')
dfGameInfos = useful_methods.DropNanGames(dfGameInfos)

# Restrict to the game-week number read from stdin.
WEEK_NUM = input()
dfGameInfos = dfGameInfos[dfGameInfos.GW == int(WEEK_NUM)].copy().reset_index(
    drop=True)

# CSV loading yields strings; coerce the numeric columns to int.
for _col in ('GW', 'score_ht_home', 'score_ht_away'):
    dfGameInfos[_col] = [int(value) for value in dfGameInfos[_col]]
# *******************************************************
# Limitations
TIME_LIMIT = 63
RETWEET_STATUS = False
FILTER_STATUS = True
START_TIME = 1
END_TIME = 63

# *******************************************************
# *******************************************************
# Game Infos: load the master game table and drop incomplete rows.
os.chdir(paths.READ_PATH_GAME_INFO)
dfGameInfos = useful_methods.csv_dic_df('game_infos.csv')
dfGameInfos = useful_methods.DropNanGames(dfGameInfos)

# CSV loading yields strings; coerce the numeric columns to int.
for _col in ('GW', 'score_ht_home', 'score_ht_away',
             'score_ft_home', 'score_ft_away'):
    dfGameInfos[_col] = [int(value) for value in dfGameInfos[_col]]

# Read Emotion-Lexicon-Soccer as Dictionary.
# NOTE(review): elsewhere in this file EmolexSoccerDic() is unpacked into
# two values — confirm its return arity matches this single assignment.
dic_emolex_soccer = emolex.EmolexSoccerDic()
def ClassifierTrain(save=False):
    """Train the hashtag-emolex SVM sentiment detecter.

    Loads the labelled tweet CSV (positive: 1, negative: 0), splits it
    80/20, fits the grid-searched SVM on the training part, prints the
    tuned parameters, timing and held-out metrics, and — when *save* is
    true — pickles the fitted model under ``paths.DETECTER_HOME``.

    Args:
        save: persist the trained classifier to disk when True.
    """
    date_now = strftime("%Y-%m-%d %H:%M:%S", gmtime()).replace(" ", "_")

    # [Step 1]: data load — hashtags-in-emotion-words tweets.
    df = useful_methods.csv_dic_df(
        paths.DATA_HOME + "TweetsPN/tweet_hash_emolex_pn.csv")
    df['label'] = [int(label) for label in df['label']]

    # [Step 2]: split into train (0.8) and test (0.2) sets.
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=0.2)
    print("\n\n### DATA ##################################\n",
          "\n\tTrain data: \t", len(X_train),
          "\n\tTest data: \t", len(X_test),
          "\n\tAll data: \t", len(y_train) + len(y_test))

    # [Step 3]: define the classifier (grid-search wrapper).
    grid_search = SVM(y_train)

    # [Step 4]: fit the training set, timing the run.
    start_time = time()
    grid_search.fit(X_train, y_train)

    # [Step 5]: report tuned parameters, timing, and test metrics.
    DetecterParams(grid_search, title="SVM")
    print("\n\n### COMPUTED TIME #########################\n")
    taken_time = time() - start_time
    print("[Started Time]: ", date_now)
    print("\n[Taken Time]: ", str(datetime.timedelta(seconds=taken_time)))
    DetecterMetrics(X_test, y_test, grid_search, title="Test")

    # [Step 6]: optionally persist the fitted classifier.
    if save:
        filename = "dtr_hash_svn_" + date_now + ".pkl"
        with open(paths.DETECTER_HOME + filename, 'wb') as fout:
            pickle.dump(grid_search, fout)
        print("\n\n[Saved in]: ", paths.DETECTER_HOME + filename)
def HashEmolexAllCreate():
    """Collect every non-retweet tweet whose hashtags contain an Emolex
    soccer word, one row per unique tweet text, and save the result.

    Walks all games listed in the module-level ``weeks`` /
    ``home_teams`` / ``away_teams`` arrays, filters each game's tweets,
    tags them with hashtag-emolex words, and writes a CSV with columns
    ``text``, ``hash_emolex_word`` and ``sentiments``.

    Side effects: changes the working directory per game and writes
    ``hash_emolex_all.csv`` under the TweetsPN data directory.
    """
    # Emolex dic
    dic_emolex_soccer, dic_emolex_stemmed_soccer = emolex.EmolexSoccerDic()

    # Define: All Hash Emolex DF
    dfNewHashEmolex = pd.DataFrame(
        columns=['text', 'hash_emolex_word', 'sentiments'])
    texts = []
    hash_emolexs = []
    sentiments = []
    # O(1) duplicate check instead of scanning the texts list each time.
    seen_texts = set()
    start_time = time.time()

    # Start All Here
    for index in range(len(weeks)):
        try:
            # Read Single Game as DF
            filename = home_teams[index] + '_vs_' + away_teams[index] + '.csv'
            os.chdir(paths.READ_PATH_EXTRACTED_CSV + 'GW' +
                     str(weeks[index]) + '/SingleGames/')
            df = useful_methods.csv_dic_df(filename)

            # Filter DF: remove tweets of stream, bots ...
            dfFilter = useful_methods.FilterDF(df[df.status != 'retweet'])

            # Add column: hash_emolex
            dfFilter['hash_emolex'] = [
                tokenizer.HashEmolex(tags, dic_emolex_soccer)
                for tags in dfFilter.tags
            ]

            # Emolex in Hashtags
            dfHashEmolex = dfFilter[(dfFilter.hash_emolex != '')]

            # Check through each Tweet
            for i in range(len(dfHashEmolex)):
                # hash tags
                hash_emolex = dfHashEmolex.iloc[i]['hash_emolex'].split(',')
                # each tag; first matching emo wins for a given text
                for emo in set(hash_emolex):
                    if emo in dic_emolex_soccer:
                        text = (dfHashEmolex.iloc[i]['text'])
                        if text not in seen_texts:
                            # appendings
                            seen_texts.add(text)
                            texts.append(text)
                            hash_emolexs.append(emo)
                            sentiment = ",".join([
                                key
                                for key, value in
                                dic_emolex_soccer[emo].items()
                                if (value > 0)
                            ])
                            sentiments.append(sentiment)
        except Exception:
            # Best-effort per game: a missing/broken game CSV is skipped.
            # (Narrowed from a bare `except:` so Ctrl-C still works.)
            continue

    print("[Done]: %.2f" % (time.time() - start_time))

    # Result
    dfNewHashEmolex['text'] = texts
    dfNewHashEmolex['hash_emolex_word'] = hash_emolexs
    dfNewHashEmolex['sentiments'] = sentiments

    # Save
    useful_methods.DFtoCSV(dfNewHashEmolex,
                           "/Users/Bya/Dropbox/Research/datas/TweetsPN/",
                           "hash_emolex_all", index=False)
    print(
        "[Saved in]: /Users/Bya/Dropbox/Research/datas/TweetsPN/hash_emolex_all.csv"
    )
def TweetPnEqualRead():
    """Return the equalised positive/negative tweet set as a DataFrame.

    Side effect: changes the process working directory to the TweetsPN
    data folder before reading ``tweets_pn_eq.csv``.
    """
    os.chdir("/Users/Bya/Dropbox/Research/datas/TweetsPN/")
    return useful_methods.csv_dic_df('tweets_pn_eq.csv')
def HashEmolexAllRead():
    """Return the full hashtag-emolex tweet table as a DataFrame.

    Side effect: changes the process working directory to the TweetsPN
    data folder before reading ``hash_emolex_all.csv``.
    """
    os.chdir("/Users/Bya/Dropbox/Research/datas/TweetsPN/")
    return useful_methods.csv_dic_df('hash_emolex_all.csv')
def ClassifierTrain(save=False):
    """Train the hashtag-emolex SVM sentiment detecter.

    NOTE(review): this definition duplicates an earlier
    ``ClassifierTrain`` in the file; the later one takes effect at
    import time — confirm the duplication is intentional.

    Args:
        save: when True, pickle the fitted model to paths.DETECTER_HOME.
    """
    date_now = strftime("%Y-%m-%d %H:%M:%S", gmtime()).replace(" ", "_")

    # [Step 1]: load labelled tweets (positive: 1, negative: 0).
    df = useful_methods.csv_dic_df(
        paths.DATA_HOME + "TweetsPN/tweet_hash_emolex_pn.csv")
    df['label'] = [int(label) for label in df['label']]

    # [Step 2]: split data — train 0.8, test 0.2.
    split = train_test_split(df['text'], df['label'], test_size=0.2)
    X_train, X_test, y_train, y_test = split
    print("\n\n### DATA ##################################\n",
          "\n\tTrain data: \t", len(X_train),
          "\n\tTest data: \t", len(X_test),
          "\n\tAll data: \t", len(y_train) + len(y_test))

    # [Step 3]: define the classifier.
    grid_search = SVM(y_train)

    # [Step 4]: fit the training set and measure elapsed time.
    start_time = time()
    grid_search.fit(X_train, y_train)

    # [Step 5]: print tuned parameters, timing, and test metrics.
    DetecterParams(grid_search, title="SVM")
    print("\n\n### COMPUTED TIME #########################\n")
    taken_time = time() - start_time
    print("[Started Time]: ", date_now)
    print("\n[Taken Time]: ", str(datetime.timedelta(seconds=taken_time)))
    DetecterMetrics(X_test, y_test, grid_search, title="Test")

    # [Step 6]: optionally persist the fitted classifier.
    if save:
        filename = "dtr_hash_svn_" + date_now + ".pkl"
        with open(paths.DETECTER_HOME + filename, 'wb') as fout:
            pickle.dump(grid_search, fout)
        print("\n\n[Saved in]: ", paths.DETECTER_HOME + filename)