Esempio n. 1
0
def extract_pgt(config, p, k, t, d):
    """Extract the PGT factors (personal, global, temporal) for one
    dataset/mode/time/distance combination, merge them per user pair,
    label each pair with its social tie, and write the result to CSV.

    Args:
        config: parsed configuration dict (directories, intermediate file
            name templates, and the ``kwargs`` run flags).
        p: dataset index into ``config['dataset']``.
        k: mode index.
        t: time-threshold parameter.
        d: distance-threshold parameter.

    The function is a no-op when the intermediate output file already exists.
    """
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    pgt_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    ### Pick output name template and compression from the config flag
    if compressed:
        pgt_name = config['intermediate']['pgt']['pgt_output_compressed']
        compression = 'bz2'
    else:
        pgt_name = config['intermediate']['pgt']['pgt_output']
        compression = None
    intermediate_file = '/'.join(
        [pgt_root, dataset_names[p],
         pgt_name.format(p, k, t, d)])
    ### Skip the whole extraction when the output already exists
    if not is_file_exists(intermediate_file):
        ### Extracting each feature
        if config['kwargs']['pgt']['extract_pgt']['run']:
            g1 = g2 = g3 = g4 = None
            if config['kwargs']['pgt']['extract_pgt']['personal']:
                g1, g2 = personal_factor(config, p, k, t, d)  ### P in PGT
                debug('Finished loading personal factor', 'p', p, 'k', k, 't',
                      t, 'd', d)
            if config['kwargs']['pgt']['extract_pgt']['global']:
                g3 = global_factor(config, p, k, t, d, g2)  ### PG in PGT
                debug('Finished loading global factor', 'p', p, 'k', k, 't', t,
                      'd', d)
            if config['kwargs']['pgt']['extract_pgt']['temporal']:
                g4 = temporal_factor(config, p, k, t, d, g2)  ### PGT in PGT
                debug('Finished loading temporal factor', 'p', p, 'k', k, 't',
                      t, 'd', d)
            ### Merging all together -- only when every factor was produced
            if config['kwargs']['pgt']['extract_pgt']['merge']:
                if all(g is not None for g in (g1, g2, g3, g4)):
                    ### Join the four factor frames on the user pair
                    df = g1[['user1', 'user2',
                             'g1']].merge(g2[['user1', 'user2', 'g2']],
                                          on=['user1', 'user2'])
                    df = df.merge(g3[['user1', 'user2', 'g3']],
                                  on=['user1', 'user2'])
                    df = df.merge(g4[['user1', 'user2', 'g4']],
                                  on=['user1', 'user2'])
                    friend_df = extract_friendships(dataset_names[p], config)
                    df = determine_social_tie(df, friend_df)
                    df.to_csv(intermediate_file,
                              header=True,
                              index=False,
                              compression=compression)
Esempio n. 2
0
def test_colocation_stats():
    """Report co-location statistics (total co-locations, distinct user
    pairs, and pairs that are friends) for every active dataset / mode /
    time / distance combination in the test config.
    """
    config_name = 'config_test.json'
    ### Started the program
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config (once; the original read it twice with a hard-coded name)
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        friend_df = extract_friendships(dataset_name, config)
        for mode in modes:
            k = all_modes.index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    total_user = 0
                    total_friend = 0
                    total_colocation = 0
                    i = 0
                    ### Stream the co-location file in chunks to bound memory
                    for colocation_df in read_colocation_file(
                            config,
                            p,
                            k,
                            t,
                            d,
                            chunksize=10**6,
                            usecols=['user1', 'user2']):
                        colocation_df = determine_social_tie(
                            colocation_df, friend_df)
                        total_colocation += len(colocation_df)
                        ### Count each user pair once for the user/friend tallies
                        colocation_df = colocation_df.drop_duplicates(
                            ['user1', 'user2'])
                        total_user += len(colocation_df)
                        total_friend += sum(colocation_df['link'])
                        i += 1
                    debug(total_colocation, total_user, total_friend, p, k, t,
                          d)
                    gc.collect()
    debug('Finished Test on SCI+')
Esempio n. 3
0
def generating_walk2friend_data():
    """Export check-ins and friendships in the walk2friend input format.

    For each active dataset/mode pair this writes two CSVs into the
    intermediate directory: ``<dataset>_<mode>_10.checkin`` with columns
    (mid, uid, locid) and ``<dataset>_<mode>_10.friends`` with columns
    (u1, u2).
    """
    config_name = 'config_test.json'
    ### Started the program
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config (once; the original read it twice)
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    directory = config['directory']['intermediate']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run Test on Dataset', dataset_name, p, 'Mode', mode, k)
            ### Export check-ins: assign a 1-based message id per row,
            ### then rename to the walk2friend column names.
            checkins, _ = extract_checkins_all(dataset_name, mode, config)
            checkins.sort_values(["user", "timestamp"], inplace=True)
            checkins['mid'] = range(1, len(checkins) + 1)
            ### BUGFIX: the target column must be "uid" -- the select below
            ### uses ['mid', 'uid', 'locid'] and would otherwise KeyError.
            checkins.rename(columns={
                "user": "uid",
                "location": "locid"
            },
                            inplace=True)
            checkins = checkins[['mid', 'uid', 'locid']]
            checkins.to_csv('/'.join(
                [directory,
                 '%s_%s_10.checkin' % (dataset_name, mode)]),
                            index=False,
                            header=True)
            ### Export friendships (sorted once; the original sorted twice)
            friend_df = extract_friendships(dataset_name, config)
            friend_df.sort_values(["user1", "user2"], inplace=True)
            friend_df.rename(columns={
                "user1": "u1",
                "user2": "u2"
            },
                             inplace=True)
            friend_df.to_csv('/'.join(
                [directory,
                 '%s_%s_10.friends' % (dataset_name, mode)]),
                             index=False,
                             header=True)
Esempio n. 4
0
def extract_colocation_features(stat_lp, config, p, k, t, d):
    """Compute SCI statistics for every co-located user pair and persist them.

    Reads the co-location file for (p, k, t, d), labels each user pair as
    friend or stranger, aggregates per-pair stability statistics, and writes
    the result via ``write_statistics``. Skips all work when the intermediate
    SCI file already exists.
    """
    debug('p', p, 'k', k, 't', t, 'd', d)
    ### Resolve the intermediate SCI file path for this parameter combination
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    sci_root = config['directory']['sci']
    make_sure_path_exists('/'.join([sci_root, dataset_names[p]]))
    template_key = 'evaluation_compressed' if compressed is True else 'evaluation'
    base_name = config['intermediate']['sci'][template_key]
    sci_name = '/'.join(
        [sci_root, dataset_names[p], base_name.format(p, k, t, d)])
    if is_file_exists(sci_name):
        debug('File %s exists' % sci_name)
    else:
        ### Load friendships and co-locations, then label each pair's tie
        friend_df = extract_friendships(dataset_names[p], config)
        colocation_df = read_colocation_file(config, p, k, t, d)
        colocation_df = determine_social_tie(colocation_df, friend_df)
        debug('#colocations', len(colocation_df), 'p', p, 'k', k, 't', t, 'd',
              d)
        ### Aggregate stability statistics per (user1, user2, link) group
        pair_groups = colocation_df.groupby(['user1', 'user2', 'link'])
        grouped = aggregate_stats(pair_groups, stat_lp, p, k, t, d)
        ### Write the result into a csv output
        write_statistics(grouped, config, p, k, t, d)

        ### Release the large frames before returning
        del friend_df, colocation_df, grouped
    debug('Finished extract_colocation_features', 'p', p, 'k', k, 't', t, 'd',
          d)
Esempio n. 5
0
def test_checkin_stats():
    """Report check-in statistics for every active dataset/mode pair:
    user, check-in, friendship and location counts for (a) the unfiltered
    data, (b) the filtered data, and (c) users that have at least one
    friend, plus the average check-ins per user in each population.
    """
    config_name = 'config_test.json'
    ### Started the program
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    n_core = kwargs['n_core']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run Test on Dataset', dataset_name, p, 'Mode', mode, k,
                  '#Core', n_core)
            ### Filtered check-ins, with a per-user check-in count column
            checkins, _ = extract_checkins_all(dataset_name, mode, config)
            checkins['u_count'] = checkins.groupby('user')['user'].transform(
                'count')
            ### Unfiltered check-ins, restricted to users with >1 check-in
            df, _ = extract_checkins_all(dataset_name,
                                         mode,
                                         config,
                                         filter=False)
            df['u_count'] = df.groupby('user')['user'].transform('count')
            df = df[(df['u_count'] > 1)]
            n_user_ori = len(df['user'].unique())
            n_checkins_ori = len(df)
            n_checkins_filter = len(checkins)
            ### Friendships, restricted to pairs where both users check in
            friend_df = extract_friendships(dataset_name, config)
            n_friend = len(friend_df)
            uids = checkins['user'].unique()
            n_user_filter = len(uids)
            locs = checkins['location'].unique()
            n_locs = len(locs)
            friend_match_checkin = friend_df.isin(uids)
            friend_df = friend_df[friend_match_checkin['user1']
                                  & friend_match_checkin['user2']]
            ### Users appearing on either side of a retained friendship
            friend_ids = np.unique(
                np.concatenate(
                    (friend_df['user1'].values, friend_df['user2'].values)))
            checkins = df.loc[df['user'].isin(friend_ids)]
            n_user_friend = len(checkins['user'].unique())
            n_checkins_friend = len(checkins)
            avg_checkin_ori = n_checkins_ori / n_user_ori
            avg_checkin_filter = n_checkins_filter / n_user_filter
            avg_checkin_friend = n_checkins_friend / n_user_friend
            debug(n_user_ori, n_user_friend, n_user_filter, n_checkins_ori,
                  n_checkins_friend, n_checkins_filter, n_friend, n_locs,
                  avg_checkin_ori, avg_checkin_friend, avg_checkin_filter)
    debug('Finished Test on SCI+')