def dataset(type, polarity_zone, subjectivity_zone):
    """Build a per-user zone-count dataset for the chosen corpus.

    Parameters:
        type: 'old' loads ld.all_post() and merges the golden-standard CSV
            for the ActiveInterests label; 'new' loads the three *_new.json
            files, which already carry ActiveInterests.
            NOTE: the name shadows the builtin `type`; kept unchanged for
            backward compatibility with existing callers.
        polarity_zone, subjectivity_zone: zone boundaries forwarded to
            threshold_zones().

    Returns:
        (aggr_df, column_zone_dummies): the per-user aggregated DataFrame and
        the list of zone-dummy column names.

    Raises:
        ValueError: if `type` is neither 'old' nor 'new' (previously this
            fell through and crashed with NameError on the return).
    """
    if type == 'old':
        # Load the text dataset, drop rows missing text features, derive UserId
        all_df = ld.all_post()
        all_df = all_df[['postId', 'LikesCount', 'SharesCount', 'CommentsCount',
                         'PostTextLength', 'PostTextPolarity',
                         'PostTextSubjectivity']].dropna()
        # postId looks like '<user>_<post>'; the prefix before '_' is the user id
        all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)
        all_df['zone'] = 'a'  # default zone before threshold assignment
        all_df = threshold_zones(all_df, polarity_zone, subjectivity_zone)
        # One indicator column per zone; summing per user = posts per zone
        zone_dummies = pd.get_dummies(all_df['zone'])
        column_zone_dummies = zone_dummies.columns.tolist()
        dict_aggr = {x: np.sum for x in column_zone_dummies}
        all_df = all_df.join(zone_dummies)
        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
        # Load golden standard file (labels keyed by UserNum)
        gs_df = pd.read_csv('data/userlevel_all_features_1007.csv', header=0)
        # Merge zone counts with the golden-standard labels
        aggr_df = pd.merge(aggr_df, gs_df, how='inner',
                           left_on='UserId', right_on='UserNum')
        aggr_df = aggr_df[['UserId', 'ActiveInterests'] + column_zone_dummies]
        # (translated) should be merged first, then ActiveInterests turned into dummies
    elif type == 'new':
        # Load the new dataset; ActiveInterests is already a column here
        ds_file_array = ['data/english_foodgroup_new.json',
                         'data/english_TEDtranslate_new.json',
                         'data/english_traveladdiction_new.json']
        all_df = ld.new_dataset(ds_file_array)
        all_df = all_df[['UserID', 'LikesCount', 'SharesCount', 'CommentsCount',
                         'PostTextLength', 'PostTextPolarity',
                         'PostTextSubjectivity', 'ActiveInterests']].dropna()
        all_df['UserId'] = all_df['UserID']
        all_df['zone'] = 'a'
        all_df = threshold_zones(all_df, polarity_zone, subjectivity_zone)
        zone_dummies = pd.get_dummies(all_df['zone'])
        column_zone_dummies = zone_dummies.columns.tolist()
        dict_aggr = {x: np.sum for x in column_zone_dummies}
        # The label is constant per user, so min() just picks it
        dict_aggr.update({'ActiveInterests': np.min})
        all_df = all_df.join(zone_dummies)
        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
    else:
        raise ValueError("type must be 'old' or 'new', got %r" % (type,))
    return aggr_df, column_zone_dummies
def dataset_allpost(type):
    """Return raw per-post text features plus a UserId column.

    Parameters:
        type: 'old' loads ld.all_post() and derives UserId from the postId
            prefix; 'new' loads the three *_new.json files and renames
            UserID -> UserId.  (Name shadows the builtin; kept for callers.)

    Returns:
        all_df: one row per post with PostTextLength, PostTextPolarity,
        PostTextSubjectivity and UserId.

    Raises:
        ValueError: for an unknown `type` (previously this fell through and
            crashed with NameError on the return).
    """
    if type == 'old':
        # Load the text dataset, drop rows missing text features, derive UserId
        all_df = ld.all_post()
        all_df = all_df[['postId', 'PostTextLength', 'PostTextPolarity',
                         'PostTextSubjectivity']].dropna()
        # postId looks like '<user>_<post>'; the prefix before '_' is the user id
        all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)
        all_df.drop('postId', axis=1, inplace=True)
    elif type == 'new':
        # Load the new dataset, drop rows missing text features, rename the key.
        # (Unused photo/friends file lists from an earlier revision removed.)
        ds_file_array = ['data/english_foodgroup_new.json',
                         'data/english_TEDtranslate_new.json',
                         'data/english_traveladdiction_new.json']
        all_df = ld.new_dataset(ds_file_array)
        all_df = all_df[['UserID', 'PostTextLength', 'PostTextPolarity',
                         'PostTextSubjectivity']].dropna()
        all_df['UserId'] = all_df['UserID']
        all_df.drop('UserID', axis=1, inplace=True)
    else:
        raise ValueError("type must be 'old' or 'new', got %r" % (type,))
    return all_df
# Cluster posts into 3 polarity levels and attach the post-cluster result.
import load_all_dataset as ld
import pandas as pd
import csv

# LOAD THE DATASET
# all dataset
all_df = ld.all_post()
post_cluster_df = ld.post_cluster_result()

# Merge with the per-post clustering result (inner join on postId)
all_df_clustered = pd.merge(all_df, post_cluster_df, on='postId', how='inner')
all_df_clustered = all_df_clustered[[
    'postId', 'LikesCount', 'SharesCount', 'CommentsCount', 'PostTextLength',
    'PostTextSubjectivity', 'PostTextPolarity', 'resultCluster'
]]
# Total engagement per post
all_df_clustered['Interactions'] = (all_df_clustered['LikesCount'] +
                                    all_df_clustered['SharesCount'] +
                                    all_df_clustered['CommentsCount'])
all_df_clustered['resultCluster'] = all_df_clustered['resultCluster'].astype(
    'category', ordered=False)

# (translated) bucket polarity into 3 levels: -1 negative, 0 neutral, 1 positive.
# FIX: use df.loc[mask, col] = value instead of the chained
# df[col].loc[mask] = value, which triggers SettingWithCopyWarning and may
# silently fail to write back to the DataFrame.
all_df_clustered['3polarityLevel'] = 0
all_df_clustered.loc[
    all_df_clustered['PostTextPolarity'] > 0, '3polarityLevel'] = 1
all_df_clustered.loc[
    all_df_clustered['PostTextPolarity'] < 0, '3polarityLevel'] = -1
all_df_clustered['3polarityLevel'] = all_df_clustered['3polarityLevel'].astype(
    'category')
# print all_df_clustered.dtypes
def dataset(type, polarity_zone, subjectivity_zone):
    """Build the enriched per-user feature set: zone counts, entropy ratios,
    frequent day-part, and (for 'new') profile/photo/friend features.

    type: 'old' -> ld.all_post() + golden-standard CSV labels;
          'new' -> the three *_new.json corpora (ActiveInterests included).
    polarity_zone, subjectivity_zone: boundaries forwarded to threshold_zones().

    Returns (aggr_df, column_zone_dummies).

    NOTE(review): an unknown `type` falls through both branches and raises
    NameError at the return; callers only ever pass 'old' or 'new'.
    """
    if type == 'old':
        # Load the text dataset, drop null non-text, assign UserId
        all_df = ld.all_post()
        all_df = all_df[['postId', 'LikesCount', 'SharesCount', 'CommentsCount',
                         'PostTextLength', 'PostTextPolarity',
                         'PostTextSubjectivity', 'PostTime']].dropna()
        # postId prefix before '_' is the user id
        all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)
        all_df['zone'] = 'a'  # default zone before threshold assignment
        all_df = threshold_zones(all_df, polarity_zone, subjectivity_zone)
        zone_dummies = pd.get_dummies(all_df['zone'])
        column_zone_dummies = zone_dummies.columns.tolist()
        # One '<zone>_ratio' name per zone column, averaged per user below.
        ent_ = [x + '_ratio' for x in column_zone_dummies]
        # NOTE(review): 'entropy_all' is presumably added by threshold_zones(),
        # and its dummy columns are assumed to be named '<zone>_ratio' so they
        # match ent_ in dict_aggr — verify in threshold_zones.
        ent_dummies = pd.get_dummies(all_df['entropy_all'])
        all_df = all_df.join(ent_dummies)
        all_df = all_df.join(zone_dummies)
        dict_aggr = {x: np.sum for x in column_zone_dummies}
        dict_aggr.update({x: np.mean for x in ent_})
        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
        # add day part
        day_part_df = ld.func_day_part(all_df)
        aggr_df = pd.merge(aggr_df, day_part_df, how='inner',
                           left_on='UserId', right_on='UserId')
        # Load golden standard file
        gs_df = pd.read_csv('data/userlevel_all_features_1007.csv', header=0)
        # Merge zone file and golden standard
        aggr_df = pd.merge(aggr_df, gs_df, how='inner',
                           left_on='UserId', right_on='UserNum')
        # aggr_df = aggr_df[['UserId', 'ActiveInterests']+column_zone_dummies+ent_]
        # (translated) should be merged first, then ActiveInterests turned into dummies
    elif type == 'new':
        # Load the new dataset, drop null non-text, assign UserId
        ds_file_array = ['data/english_foodgroup_new.json',
                         'data/english_TEDtranslate_new.json',
                         'data/english_traveladdiction_new.json']
        photo_file = ['data/album_english_foodgroups.json',
                      'data/album_english_TEDtranslate.json',
                      'data/album_english_traveladdiction.json']
        friendsnum_file = ['data/english_foodgroups_friendsnum.json',
                           'data/english_TEDtranslate_friendsnum.json',
                           'data/english_traveladdiction_friendsnum.json']
        all_df = ld.new_dataset(ds_file_array)
        # print all_df.dtypes
        all_df = all_df[['UserID', 'LikesCount', 'SharesCount', 'CommentsCount',
                         'PostTextLength', 'PostTextPolarity',
                         'PostTextSubjectivity', 'ActiveInterests',
                         'PostTime']].dropna()
        all_df['UserId'] = all_df['UserID']
        all_df['zone'] = 'a'
        all_df = threshold_zones(all_df, polarity_zone, subjectivity_zone)
        zone_dummies = pd.get_dummies(all_df['zone'])
        column_zone_dummies = zone_dummies.columns.tolist()
        ent_ = [x + '_ratio' for x in column_zone_dummies]
        # NOTE(review): same assumption as the 'old' branch about the
        # 'entropy_all' dummy column names matching ent_.
        ent_dummies = pd.get_dummies(all_df['entropy_all'])
        all_df = all_df.join(ent_dummies)
        all_df = all_df.join(zone_dummies)
        dict_aggr = {x: np.sum for x in column_zone_dummies}
        # Label is constant per user, so min() just picks it
        dict_aggr.update({'ActiveInterests': np.min})
        dict_aggr.update({x: np.mean for x in ent_})
        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
        # obtain NoPosts, SharedNewsSum, UploadVideoSum
        df_1 = ld.aggr_feature_user(ds_file_array)
        aggr_df = pd.merge(aggr_df, df_1, how='inner',
                           left_on='UserId', right_on='UserID')
        # obtain about
        df_2 = ld.about_dataset(ds_file_array)
        aggr_df = pd.merge(aggr_df, df_2, how='inner',
                           left_on='UserId', right_on='UserID')
        # UserID, NoProfilePhotos, NoCoverPhotos, NoUploadedPhotos, NoPhotos
        df_3 = ld.photo_dataset(photo_file)
        aggr_df = pd.merge(aggr_df, df_3, how='inner',
                           left_on='UserId', right_on='UserID')
        # NumOfFriends
        df_4 = ld.friendsnum_dataset(friendsnum_file)
        aggr_df = pd.merge(aggr_df, df_4, how='inner',
                           left_on='UserId', right_on='UserID')
        # day_part
        df_5 = ld.func_day_part(all_df)
        aggr_df = pd.merge(aggr_df, df_5, how='inner',
                           left_on='UserId', right_on='UserId')
        # Drop key columns duplicated by the merges.
        # NOTE(review): 'userId' (lower-case d) is presumably produced by one
        # of the ld.* frames above — confirm against load_all_dataset.
        aggr_df.drop(['userId', 'UserID_y', 'UserID_x'], axis=1, inplace=True)
        # print 'data baru kolom', aggr_df.dtypes
        # Encode the categorical day-part as an ordinal int
        aggr_df['frequent_day_part'] = aggr_df['frequent_day_part'].map(
            {'Early_Morning': 0, 'Morning': 1, 'Afternoon': 2, 'Evening': 3})
        # aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
        # add entropy features
        aggr_df = entropy_features_(aggr_df, ent_)
    return aggr_df, column_zone_dummies  #, target
def dataset_raw(type):
    """Build a minimal per-user dataset: ActiveInterests label + text-length level.

    Parameters:
        type: 'old' uses ld.all_post() plus the golden-standard CSV for the
            label; 'new' uses the three *_new.json files, which already carry
            ActiveInterests.  (Name shadows the builtin; kept for callers.)

    Returns:
        aggr_df: one row per UserId with the label and the PostTextLength
        level columns produced by ld.separate_postTextLength().

    Raises:
        ValueError: for an unknown `type` (previously this fell through and
            crashed with NameError on the return).
    """
    if type == 'old':
        all_df = ld.all_post()
        all_df = all_df[['postId', 'PostTextLength', 'PostTextPolarity',
                         'PostTextSubjectivity']].dropna()
        # postId prefix before '_' is the user id
        all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)
        # Golden standard supplies the ActiveInterests label, keyed by UserNum.
        # (The dead 'gs_df["UserId"] = 0' store was removed.)
        gs_df = pd.read_csv('data/userlevel_all_features_1007.csv', header=0)
        gs_df['UserId'] = gs_df['UserNum']
        gs_df = gs_df[['UserId', 'ActiveInterests']]
        # add PostTextLengthLevel
        postTextLengthLevel_df = ld.separate_postTextLength(all_df)
        # Debug dump; print() with a single argument behaves the same on Py2/Py3
        print(postTextLengthLevel_df)
        aggr_df = pd.merge(gs_df, postTextLengthLevel_df, how='inner',
                           left_on='UserId', right_on='UserId')
    elif type == 'new':
        # Load the new dataset; ActiveInterests is already a column.
        # (Unused photo/friends file lists and a large commented-out merge
        # pipeline from an earlier revision were removed.)
        ds_file_array = ['data/english_foodgroup_new.json',
                         'data/english_TEDtranslate_new.json',
                         'data/english_traveladdiction_new.json']
        all_df = ld.new_dataset(ds_file_array)
        all_df['UserId'] = all_df['UserID']
        all_df = all_df[['UserId', 'ActiveInterests', 'PostTextLength']].dropna()
        # Label is constant per user, so min() just picks it
        dict_aggr = {'ActiveInterests': np.min}
        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
        # add PostTextLengthLevel
        postTextLengthLevel_df = ld.separate_postTextLength(all_df)
        aggr_df = pd.merge(aggr_df, postTextLengthLevel_df, how='inner',
                           left_on='UserId', right_on='UserId')
    else:
        raise ValueError("type must be 'old' or 'new', got %r" % (type,))
    return aggr_df
def fitness_old(individu):
    """GA fitness: mean macro-F1 of GaussianNB over per-user zone counts.

    Parameters:
        individu: candidate polarity-zone boundaries — a sequence of
            (lower, upper) pairs; upper < 0 means a negative zone,
            upper == 0.0 the exact-neutral zone, otherwise a positive zone.

    Returns:
        Mean 5-fold macro-F1 across all non-'Random' ActiveInterests labels,
        rounded to 3 decimals.
    """
    polarity_zone = individu
    subjectivity_zone = [0.0, 0.5, 1.0]
    # Load the text dataset, drop rows missing text features, derive UserId
    all_df = ld.all_post()
    all_df = all_df[['postId', 'LikesCount', 'SharesCount', 'CommentsCount',
                     'PostTextLength', 'PostTextPolarity',
                     'PostTextSubjectivity']].dropna()
    # postId prefix before '_' is the user id
    all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)
    all_df['zone'] = 'a'  # default before zone assignment
    # Assign each post a zone label '<sub-band>_<pol-band>'.
    for index_sub in range(len(subjectivity_zone) - 1):
        sub_mask = ((all_df['PostTextSubjectivity'] >= subjectivity_zone[index_sub]) &
                    (all_df['PostTextSubjectivity'] <= subjectivity_zone[index_sub + 1]))
        for index_pol, polarity in enumerate(polarity_zone):
            nama_zone = str(index_sub) + '_' + str(index_pol)
            if polarity[1] < 0.0:
                # negative zone: polarity[1] <= p < polarity[0]
                pol_mask = ((all_df['PostTextPolarity'] < polarity[0]) &
                            (all_df['PostTextPolarity'] >= polarity[1]))
            elif polarity[1] == 0.0:
                # exact-neutral zone
                pol_mask = all_df['PostTextPolarity'] == 0.0
            else:
                # positive zone: polarity[0] < p <= polarity[1]
                pol_mask = ((all_df['PostTextPolarity'] > polarity[0]) &
                            (all_df['PostTextPolarity'] <= polarity[1]))
            # FIX: df.loc[mask, col] = value instead of the chained
            # df[col].loc[mask] = value, which triggers SettingWithCopyWarning
            # and may silently fail to write back.
            all_df.loc[pol_mask & sub_mask, 'zone'] = nama_zone
    # Per-user post counts per zone
    zone_dummies = pd.get_dummies(all_df['zone'])
    column_zone_dummies = zone_dummies.columns.tolist()
    dict_aggr = {x: np.sum for x in column_zone_dummies}
    all_df = all_df.join(zone_dummies)
    aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
    # Load golden standard file and attach labels
    gs_df = pd.read_csv('data/userlevel_all_features_1007.csv', header=0)
    aggr_df = pd.merge(aggr_df, gs_df, how='inner',
                       left_on='UserId', right_on='UserNum')
    # One binary target column per ActiveInterests value, excluding 'Random'
    target_dummies = pd.get_dummies(aggr_df['ActiveInterests'])
    aggr_df = aggr_df.join(target_dummies)
    target = sorted(list(set(target_dummies.columns.tolist()) - set(['Random'])))
    models = [GaussianNB()]
    names = ["Gaussian Naive Bayes"]
    numCV = 5
    fscore_arr = []
    for model, name in zip(models, names):
        for ai in target:
            # One-vs-rest macro-F1 per label; aggr_df[ai] replaces the
            # equivalent chained read aggr_df[target][ai]
            fscore_arr.append(
                cross_val_score(model, aggr_df[column_zone_dummies],
                                aggr_df[ai], cv=numCV, scoring='f1_macro'))
    rerataf1 = round(np.mean(fscore_arr), 3)  # (translated) rerata = average
    return rerataf1