Example 1
import numpy as np
import pandas as pd

import load_all_dataset as ld


# NB: threshold_zones is a module-level helper not shown in this listing.
def dataset(type, polarity_zone, subjectivity_zone):
    if type == 'old':
        # Load the text dataset, drop rows with null text features, assign UserId
        all_df = ld.all_post()
        all_df = all_df[['postId', 'LikesCount', 'SharesCount', 'CommentsCount', 'PostTextLength', 'PostTextPolarity', 'PostTextSubjectivity']].dropna()
        all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)
        all_df['zone'] = 'a'

        all_df = threshold_zones(all_df, polarity_zone, subjectivity_zone)

        zone_dummies = pd.get_dummies(all_df['zone'])
        column_zone_dummies = zone_dummies.columns.tolist()
        dict_aggr = {x: np.sum for x in column_zone_dummies}
        all_df = all_df.join(zone_dummies)
        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()

        # Load the gold-standard file
        gs_df = pd.read_csv('data/userlevel_all_features_1007.csv', header=0)

        # Merge the zone features with the gold standard
        aggr_df = pd.merge(aggr_df, gs_df, how='inner', left_on='UserId', right_on='UserNum')
        aggr_df = aggr_df[['UserId', 'ActiveInterests']+column_zone_dummies]

        # should merge first, then turn ActiveInterests into dummies
    elif type == 'new':
        # Load the new dataset, drop rows with null text features, assign UserId
        ds_file_array = ['data/english_foodgroup_new.json', 'data/english_TEDtranslate_new.json',
                         'data/english_traveladdiction_new.json']
        all_df = ld.new_dataset(ds_file_array)
        # print all_df.dtypes
        all_df = all_df[['UserID', 'LikesCount', 'SharesCount', 'CommentsCount', 'PostTextLength', 'PostTextPolarity','PostTextSubjectivity', 'ActiveInterests']].dropna()
        all_df['UserId'] = all_df['UserID']
        all_df['zone'] = 'a'
        all_df = threshold_zones(all_df, polarity_zone, subjectivity_zone)

        zone_dummies = pd.get_dummies(all_df['zone'])
        column_zone_dummies = zone_dummies.columns.tolist()
        dict_aggr = {x: np.sum for x in column_zone_dummies}
        dict_aggr.update({'ActiveInterests': np.min})
        all_df = all_df.join(zone_dummies)
        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
    """
    target_dummies = pd.get_dummies(aggr_df['ActiveInterests'])
    aggr_df = aggr_df.join(target_dummies)
    target = sorted(list(set(target_dummies.columns.tolist()) - set(['Random'])))
    """

    return aggr_df, column_zone_dummies #, target


def dataset_allpost(type):
    if type == 'old':
        # Load the text dataset, drop rows with null text features, assign UserId
        all_df = ld.all_post()
        all_df = all_df[['postId', 'PostTextLength', 'PostTextPolarity', 'PostTextSubjectivity']].dropna()
        all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)
        all_df.drop('postId', axis=1, inplace=True)

    elif type == 'new':
        # Load the new dataset, drop rows with null text features, assign UserId
        ds_file_array = ['data/english_foodgroup_new.json', 'data/english_TEDtranslate_new.json',
                         'data/english_traveladdiction_new.json']
        photo_file = ['data/album_english_foodgroups.json', 'data/album_english_TEDtranslate.json',
                      'data/album_english_traveladdiction.json']
        friendsnum_file = ['data/english_foodgroups_friendsnum.json', 'data/english_TEDtranslate_friendsnum.json',
                           'data/english_traveladdiction_friendsnum.json']

        all_df = ld.new_dataset(ds_file_array)
        # print all_df.dtypes
        all_df = all_df[['UserID', 'PostTextLength', 'PostTextPolarity','PostTextSubjectivity']].dropna()
        all_df['UserId'] = all_df['UserID']
        all_df.drop('UserID', axis=1, inplace=True)

    return all_df
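A minimal usage sketch for the two functions above. The zone values are illustrative only: polarity_zone is assumed to be a list of two-element intervals whose second element drives the negative/zero/positive branching (the convention visible in fitness_old, Example 6), and threshold_zones is a helper not shown in this listing.

# Hypothetical call; zone boundaries are assumptions, not values from the source.
polarity_zone = [[0.0, -1.0],   # second element < 0  -> negative band [-1.0, 0.0)
                 [0.0, 0.0],    # second element == 0 -> exactly zero
                 [0.0, 1.0]]    # otherwise           -> positive band (0.0, 1.0]
subjectivity_zone = [0.0, 0.5, 1.0]

aggr_df, zone_columns = dataset('old', polarity_zone, subjectivity_zone)
print(aggr_df.head())            # per-user zone counts plus ActiveInterests
raw_df = dataset_allpost('old')  # post-level text features only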
Example 3
# Cluster posts into 3 polarity levels
import load_all_dataset as ld
import pandas as pd

# LOAD THE DATASET
# full post-level dataset
all_df = ld.all_post()
post_cluster_df = ld.post_cluster_result()
# merge with the per-post cluster assignments
all_df_clustered = pd.merge(all_df, post_cluster_df, on='postId', how='inner')
all_df_clustered = all_df_clustered[[
    'postId', 'LikesCount', 'SharesCount', 'CommentsCount', 'PostTextLength',
    'PostTextSubjectivity', 'PostTextPolarity', 'resultCluster'
]]
all_df_clustered['Interactions'] = (all_df_clustered['LikesCount'] +
                                    all_df_clustered['SharesCount'] +
                                    all_df_clustered['CommentsCount'])
all_df_clustered['resultCluster'] = all_df_clustered['resultCluster'].astype('category')

# bucket polarity into 3 levels
all_df_clustered['3polarityLevel'] = 0
all_df_clustered.loc[all_df_clustered['PostTextPolarity'] > 0, '3polarityLevel'] = 1
all_df_clustered.loc[all_df_clustered['PostTextPolarity'] < 0, '3polarityLevel'] = -1
all_df_clustered['3polarityLevel'] = all_df_clustered['3polarityLevel'].astype('category')

# print all_df_clustered.dtypes
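The masked assignments above can also be written as one vectorised expression; a sketch using np.sign, which maps negative, zero, and positive polarity to -1, 0, and 1 respectively:

import numpy as np

# Equivalent, mask-free 3-level polarity bucketing.
all_df_clustered['3polarityLevel'] = (
    np.sign(all_df_clustered['PostTextPolarity'])
    .astype(int)
    .astype('category')
)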
Example 4
import numpy as np
import pandas as pd

import load_all_dataset as ld


# NB: threshold_zones and entropy_features_ are module-level helpers not shown in this listing.
def dataset(type, polarity_zone, subjectivity_zone):
    if type == 'old':
        # Load the text dataset, drop rows with null text features, assign UserId
        all_df = ld.all_post()
        all_df = all_df[['postId', 'LikesCount', 'SharesCount', 'CommentsCount', 'PostTextLength', 'PostTextPolarity', 'PostTextSubjectivity', 'PostTime']].dropna()
        all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)
        all_df['zone'] = 'a'

        all_df = threshold_zones(all_df, polarity_zone, subjectivity_zone)

        zone_dummies = pd.get_dummies(all_df['zone'])
        column_zone_dummies = zone_dummies.columns.tolist()

        ent_ = [x + '_ratio' for x in column_zone_dummies]
        ent_dummies = pd.get_dummies(all_df['entropy_all'])

        all_df = all_df.join(ent_dummies)
        all_df = all_df.join(zone_dummies)

        dict_aggr = {x: np.sum for x in column_zone_dummies}
        dict_aggr.update({x: np.mean for x in ent_})

        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()

        # add day part
        day_part_df = ld.func_day_part(all_df)
        aggr_df = pd.merge(aggr_df, day_part_df, how='inner', left_on='UserId', right_on='UserId')

        # Load the gold-standard file
        gs_df = pd.read_csv('data/userlevel_all_features_1007.csv', header=0)

        # Merge the zone features with the gold standard
        aggr_df = pd.merge(aggr_df, gs_df, how='inner', left_on='UserId', right_on='UserNum')
        # aggr_df = aggr_df[['UserId', 'ActiveInterests']+column_zone_dummies+ent_]

        # should merge first, then turn ActiveInterests into dummies
    elif type == 'new':
        # Load the new dataset, drop rows with null text features, assign UserId
        ds_file_array = ['data/english_foodgroup_new.json', 'data/english_TEDtranslate_new.json',
                         'data/english_traveladdiction_new.json']
        photo_file = ['data/album_english_foodgroups.json', 'data/album_english_TEDtranslate.json',
                      'data/album_english_traveladdiction.json']
        friendsnum_file = ['data/english_foodgroups_friendsnum.json', 'data/english_TEDtranslate_friendsnum.json',
                           'data/english_traveladdiction_friendsnum.json']

        all_df = ld.new_dataset(ds_file_array)
        # print all_df.dtypes
        all_df = all_df[['UserID', 'LikesCount', 'SharesCount', 'CommentsCount', 'PostTextLength', 'PostTextPolarity','PostTextSubjectivity', 'ActiveInterests', 'PostTime']].dropna()
        all_df['UserId'] = all_df['UserID']

        all_df['zone'] = 'a'
        all_df = threshold_zones(all_df, polarity_zone, subjectivity_zone)

        zone_dummies = pd.get_dummies(all_df['zone'])
        column_zone_dummies = zone_dummies.columns.tolist()

        ent_ = [x + '_ratio' for x in column_zone_dummies]
        ent_dummies = pd.get_dummies(all_df['entropy_all'])

        all_df = all_df.join(ent_dummies)
        all_df = all_df.join(zone_dummies)

        dict_aggr = {x: np.sum for x in column_zone_dummies}
        dict_aggr.update({'ActiveInterests': np.min})
        dict_aggr.update({x: np.mean for x in ent_})

        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()

        # obtain NoPosts, SharedNewsSum, UploadVideoSum
        df_1 = ld.aggr_feature_user(ds_file_array)
        aggr_df = pd.merge(aggr_df, df_1, how='inner', left_on='UserId', right_on='UserID')

        # obtain about
        df_2 = ld.about_dataset(ds_file_array)
        aggr_df = pd.merge(aggr_df, df_2, how='inner', left_on='UserId', right_on='UserID')

        # UserID, NoProfilePhotos, NoCoverPhotos, NoUploadedPhotos, NoPhotos
        df_3 = ld.photo_dataset(photo_file)
        aggr_df = pd.merge(aggr_df, df_3, how='inner', left_on='UserId', right_on='UserID')

        # NumOfFriends
        df_4 = ld.friendsnum_dataset(friendsnum_file)
        aggr_df = pd.merge(aggr_df, df_4, how='inner', left_on='UserId', right_on='UserID')

        # day_part
        df_5 = ld.func_day_part(all_df)
        aggr_df = pd.merge(aggr_df, df_5, how='inner', left_on='UserId', right_on='UserId')

        aggr_df.drop(['userId', 'UserID_y', 'UserID_x'], axis=1, inplace=True)
        # print('new dataset columns', aggr_df.dtypes)

    aggr_df['frequent_day_part'] = aggr_df['frequent_day_part'].map(
        {'Early_Morning': 0, 'Morning': 1, 'Afternoon': 2, 'Evening': 3})
    # aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
    # add entropy features
    aggr_df = entropy_features_(aggr_df, ent_)
    return aggr_df, column_zone_dummies #, target
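entropy_features_ is called above but not defined anywhere in this listing. A plausible sketch of such a helper, assuming it derives a Shannon-entropy feature from the per-zone *_ratio columns; this is a guess at the intent, not the author's implementation:

import numpy as np

def entropy_features_(df, ratio_columns):
    # Hypothetical helper: treat each row's zone ratios as a distribution
    # and attach its Shannon entropy as a new feature.
    probs = df[ratio_columns].to_numpy(dtype=float)
    row_sums = probs.sum(axis=1, keepdims=True)
    probs = np.divide(probs, row_sums, out=np.zeros_like(probs), where=row_sums > 0)
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = np.where(probs > 0, -probs * np.log2(probs), 0.0)
    df = df.copy()
    df['zone_entropy'] = terms.sum(axis=1)
    return df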
Example 5
import numpy as np
import pandas as pd

import load_all_dataset as ld


def dataset_raw(type):
    if type == 'old':
        all_df = ld.all_post()
        all_df = all_df[['postId', 'PostTextLength', 'PostTextPolarity','PostTextSubjectivity']].dropna()
        all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)

        # Load the gold-standard file
        gs_df = pd.read_csv('data/userlevel_all_features_1007.csv', header=0)
        gs_df['UserId'] = gs_df['UserNum']
        gs_df = gs_df[['UserId', 'ActiveInterests']]

        # add PostTextLengthLevel
        postTextLengthLevel_df = ld.separate_postTextLength(all_df)
        print(postTextLengthLevel_df)
        aggr_df = pd.merge(gs_df, postTextLengthLevel_df, how='inner', left_on='UserId', right_on='UserId')

        """
        # add day part
        day_part_df = ld.func_day_part(all_df)
        aggr_df = pd.merge(aggr_df, day_part_df, how='inner', left_on='UserId', right_on='UserId')
        """
        # should merge first, then turn ActiveInterests into dummies
    elif type == 'new':
        # Load the new dataset, drop rows with null text features, assign UserId
        ds_file_array = ['data/english_foodgroup_new.json', 'data/english_TEDtranslate_new.json',
                         'data/english_traveladdiction_new.json']
        photo_file = ['data/album_english_foodgroups.json', 'data/album_english_TEDtranslate.json',
                      'data/album_english_traveladdiction.json']
        friendsnum_file = ['data/english_foodgroups_friendsnum.json', 'data/english_TEDtranslate_friendsnum.json',
                           'data/english_traveladdiction_friendsnum.json']

        all_df = ld.new_dataset(ds_file_array)
        # print all_df.dtypes
        # all_df = all_df[['UserID', 'LikesCount', 'SharesCount', 'CommentsCount', 'PostTextLength', 'PostTextPolarity','PostTextSubjectivity', 'ActiveInterests', 'PostTime']].dropna()
        all_df['UserId'] = all_df['UserID']
        all_df = all_df[['UserId', 'ActiveInterests', 'PostTextLength']].dropna()

        dict_aggr = {'ActiveInterests': np.min}
        aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()

        # add PostTextLengthLevel
        postTextLengthLevel_df = ld.separate_postTextLength(all_df)
        aggr_df = pd.merge(aggr_df, postTextLengthLevel_df, how='inner', left_on='UserId', right_on='UserId')

        # aggr_df = pd.merge(aggr_df, postTextLengthLevel_df, how='inner', left_on='UserId', right_on='UserID')

        """
        # obtain NoPosts, SharedNewsSum, UploadVideoSum
        df_1 = ld.aggr_feature_user(ds_file_array)
        aggr_df = pd.merge(aggr_df, df_1, how='inner', left_on='UserId', right_on='UserID')

        # obtain about
        df_2 = ld.about_dataset(ds_file_array)
        aggr_df = pd.merge(aggr_df, df_2, how='inner', left_on='UserId', right_on='UserID')

        # UserID, NoProfilePhotos, NoCoverPhotos, NoUploadedPhotos, NoPhotos
        df_3 = ld.photo_dataset(photo_file)
        aggr_df = pd.merge(aggr_df, df_3, how='inner', left_on='UserId', right_on='UserID')

        # NumOfFriends
        df_4 = ld.friendsnum_dataset(friendsnum_file)
        aggr_df = pd.merge(aggr_df, df_4, how='inner', left_on='UserId', right_on='UserID')

        # day_part
        df_5 = ld.func_day_part(all_df)
        aggr_df = pd.merge(aggr_df, df_5, how='inner', left_on='UserId', right_on='UserId')

        aggr_df.drop(['userId', 'UserID', 'UserID_y', 'UserID_x'], axis=1, inplace=True)
        # print('new dataset columns', aggr_df.dtypes)

        aggr_df['frequent_day_part'] = aggr_df['frequent_day_part'].map(
            {'Early_Morning': 0, 'Morning': 1, 'Afternoon': 2, 'Evening': 3})
        # aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()
        # add entropy features
        aggr_df = entropy_features_(aggr_df, ent_)
        """
    # return all_df
    return aggr_df
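ld.separate_postTextLength is likewise external to this listing. A hedged stand-in showing one compatible shape, binning each user's mean post length into three ordinal levels with pd.cut; the column names and bin count are assumptions, not the real loader:

import pandas as pd

def separate_postTextLength(df):
    # Hypothetical: 0 = short, 1 = medium, 2 = long average post length per user.
    mean_len = df.groupby('UserId')['PostTextLength'].mean()
    levels = pd.cut(mean_len, bins=3, labels=[0, 1, 2]).astype(int)
    return pd.DataFrame({'UserId': mean_len.index, 'PostTextLengthLevel': levels.values})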
Example 6
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

import load_all_dataset as ld


def fitness_old(individu):
    # fitness = mean macro-F1 of Gaussian Naive Bayes over the zone features
    # print individu
    polarity_zone = individu
    subjectivity_zone = [0.0, 0.5, 1.0]

    # Load the text dataset, drop null non-text, assign UserId
    all_df = ld.all_post()
    all_df = all_df[['postId', 'LikesCount', 'SharesCount', 'CommentsCount', 'PostTextLength', 'PostTextPolarity', 'PostTextSubjectivity']].dropna()
    all_df['UserId'] = all_df['postId'].str.split('_').str.get(0).astype(int)
    all_df['zone'] = 'a'

    for index_sub, subjectivity in enumerate(subjectivity_zone):
        # print subjectivity
        if index_sub < len(subjectivity_zone) - 1:
            # posts whose subjectivity falls inside the current band
            in_subjectivity_band = ((all_df['PostTextSubjectivity'] >= subjectivity_zone[index_sub]) &
                                    (all_df['PostTextSubjectivity'] <= subjectivity_zone[index_sub + 1]))
            for index_pol, polarity in enumerate(polarity_zone):
                # print index_pol, polarity
                nama_zone = str(index_sub) + '_' + str(index_pol)
                if polarity[1] < 0.0:
                    # negative-polarity interval
                    all_df.loc[(all_df['PostTextPolarity'] < polarity[0]) &
                               (all_df['PostTextPolarity'] >= polarity[1]) &
                               in_subjectivity_band, 'zone'] = nama_zone
                elif polarity[1] == 0.0:
                    # zero-polarity interval
                    all_df.loc[(all_df['PostTextPolarity'] == 0.0) &
                               in_subjectivity_band, 'zone'] = nama_zone
                else:
                    # positive-polarity interval
                    all_df.loc[(all_df['PostTextPolarity'] > polarity[0]) &
                               (all_df['PostTextPolarity'] <= polarity[1]) &
                               in_subjectivity_band, 'zone'] = nama_zone

    zone_dummies = pd.get_dummies(all_df['zone'])
    column_zone_dummies = zone_dummies.columns.tolist()
    dict_aggr = {x: np.sum for x in column_zone_dummies}
    all_df = all_df.join(zone_dummies)
    aggr_df = all_df.groupby(['UserId'], sort=True).agg(dict_aggr).reset_index()

    # Load the gold-standard file
    gs_df = pd.read_csv('data/userlevel_all_features_1007.csv', header=0)

    # Merge the zone features with the gold standard
    aggr_df = pd.merge(aggr_df, gs_df, how='inner', left_on='UserId', right_on='UserNum')

    # Split out the gold-standard target classes
    target_dummies = pd.get_dummies(aggr_df['ActiveInterests'])
    aggr_df = aggr_df.join(target_dummies)
    target = sorted(set(target_dummies.columns.tolist()) - {'Random'})

    models = [GaussianNB()]
    names = ["Gaussian Naive Bayes"]
    gabung = zip(models, names)
    numCV = 5
    fscore_arr = []
    for model, name in gabung:
        # print name
        for ai in target:
            # compute the cross-validated macro-F1 for this target class
            # print ai
            fscore_arr.append(cross_val_score(model, aggr_df[column_zone_dummies],
                                              aggr_df[ai], cv=numCV, scoring='f1_macro'))
        # print fscore_arr
    rerataf1 = round(np.mean(fscore_arr), 3)
    return rerataf1
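A hypothetical caller, e.g. one evaluation inside a genetic-algorithm loop; the candidate intervals follow the convention that the branches above test against polarity[1]:

# Illustrative individual: negative / zero / positive polarity zones.
candidate = [[0.0, -1.0],   # polarity[1] < 0   -> negative branch
             [0.0, 0.0],    # polarity[1] == 0  -> zero branch
             [0.0, 1.0]]    # otherwise         -> positive branch
score = fitness_old(candidate)
print('mean 5-fold macro-F1:', score)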