Example #1
def generate_colocation_single(checkins, config, p, k, t_diff, s_diff):
    dataset_name = config['dataset'][p]
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    start = kwargs['colocation']['start']
    finish = kwargs['colocation']['finish']
    order = kwargs['colocation']['order']
    working_directory = '/'.join(
        [config['directory']['intermediate'], dataset_name])
    make_sure_path_exists(working_directory)
    kdtree_intermediate = '/'.join([
        working_directory,
        config['intermediate']['colocation']['kdtree'].format(p, k)
    ])
    if is_file_exists(kdtree_intermediate):
        st_tree = sk_joblib.load(kdtree_intermediate)
    else:
        st_tree = create_spatiotemporal_kd_tree(checkins, kdtree_intermediate,
                                                t_diff, s_diff)
    other = extract_spatiotemporal_normalized(checkins, t_diff, s_diff)
    begins, ends = init_begin_end(n_core,
                                  len(checkins),
                                  start=start,
                                  finish=finish)
    ### Generate colocations from the extracted check-ins
    prepare_colocation(config, p, k, t_diff, s_diff, begins, ends)
    ### Dispatch the parallel workers in the configured chunk order
    if order == 'ascending':
        Parallel(n_jobs=n_core)(delayed(execute_parallel_st_tree_single)(checkins, config, st_tree, other[begins[i]:ends[i]], \
          p, k, t_diff, s_diff, begins[i], ends[i]) \
          for i in range(len(begins)))
    else:
        Parallel(n_jobs=n_core)(delayed(execute_parallel_st_tree_single)(checkins, config, st_tree, other[begins[i-1]:ends[i-1]], \
          p, k, t_diff, s_diff, begins[i-1], ends[i-1]) \
          for i in range(len(begins), 0, -1))
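
A minimal stand-alone sketch of the chunk-and-dispatch pattern used above; split_range and process_chunk are illustrative stand-ins for the project's init_begin_end and execute_parallel_st_tree_single helpers, not part of it.

from joblib import Parallel, delayed

def split_range(n_items, n_chunks):
    ### Split [0, n_items) into roughly equal, contiguous [begin, end) slices
    size = -(-n_items // n_chunks)  ### ceiling division
    return [(i, min(i + size, n_items)) for i in range(0, n_items, size)]

def process_chunk(begin, end):
    ### Stand-in worker: just report the slice it would handle
    return begin, end, end - begin

if __name__ == '__main__':
    slices = split_range(10, 4)
    results = Parallel(n_jobs=2)(delayed(process_chunk)(b, e) for b, e in slices)
    print(results)
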
Example #2
def extract_popularity(checkins, config, p, k):
    intermediate_root = config['directory']['intermediate']
    dataset_names = config['dataset']
    modes = config['mode']
    popularity_intermediate_file = config['intermediate']['sci']['popularity']
    pickle_directory = '/'.join([intermediate_root, dataset_names[p]])
    make_sure_path_exists(pickle_directory)
    pickle_filename = '/'.join(
        [pickle_directory,
         popularity_intermediate_file.format(modes[k])])
    if not is_file_exists(pickle_filename):
        stat_lp = {}  ### Popularity score for location l
        visit_per_venue, p_l = extract_visit_per_venue(checkins, config)
        p_ul = extract_aggregated_visit(visit_per_venue, p_l)
        ### Evaluate the weight for each venue
        for vid, arr in p_ul.items():
            if len(arr) > 0:
                ent = entropy(arr)
                stat_lp[vid] = ent
            else:
                stat_lp[vid] = 0.0
        ### Memory management
        del p_l[:]
        del p_l
        visit_per_venue.clear()
        p_ul.clear()
        del visit_per_venue, p_ul
        ### Write to pickle intermediate file
        with open(pickle_filename, 'wb') as handle:
            pickle.dump(stat_lp, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(pickle_filename, 'rb') as handle:
            stat_lp = pickle.load(handle)
    ### Return the result
    return stat_lp
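
A compact, self-contained version of the cache-or-compute pattern above: per-venue visit distributions are scored with Shannon entropy and cached with pickle. The toy distributions and the /tmp path are illustrative, and scipy.stats.entropy stands in for the project's entropy helper.

import os
import pickle
from scipy.stats import entropy

def popularity_scores(p_ul, cache_path):
    ### Reuse the cached scores when the pickle already exists
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as handle:
            return pickle.load(handle)
    ### Otherwise score each venue by the entropy of its visit distribution
    scores = {vid: entropy(arr) if len(arr) > 0 else 0.0
              for vid, arr in p_ul.items()}
    with open(cache_path, 'wb') as handle:
        pickle.dump(scores, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return scores

print(popularity_scores({'v1': [0.5, 0.5], 'v2': [1.0]}, '/tmp/popularity.pickle'))
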
Example #3
def temporal_factor(config, p, k, t, d, g2):
    g4 = None
    ### Intermediate file -- check if exists
    pgt_root = config['directory']['pgt']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    g4_file = '/'.join([pgt_root, dataset_names[p], \
        config['intermediate']['pgt']['pgt_g4'].format(modes[k], t, d)])
    if is_file_exists(g4_file) is True:
        g4 = pd.read_csv(g4_file)
    else:
        global_df = transform_colocation_pgt(config, p, k, t, d, 'global')
        colocation_df = read_colocation_file(config, p, k, t, d, \
            usecols=['user1', 'user2', 'time1', 'time2'])
        colocation_df['wg'] = global_df['wg'].values
        colocation_df['time'] = (colocation_df['time1'] +
                                 colocation_df['time2']) / 2
        colocation_df.drop(columns=['time1', 'time2'], inplace=True)
        groups = colocation_df.groupby(['user1', 'user2'])
        g4 = applyParallel(config, groups, lambda_temporal)
        g4 = g4.groupby(['user1', 'user2'])['wt'].agg(['sum'])
        g4.reset_index(inplace=True)
        g4.sort_values(['user1', 'user2'], inplace=True)
        g4['g4'] = g2['g2'] * g4['sum']
        g4[g4 < 0] = 0.0  ### Prevent negatives in values
        g4['g4'] = g4['g4'] / max(g4['g4'])
        g4.to_csv(g4_file, header=True, index=False, compression='bz2')
        del colocation_df, groups
    return g4
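
Stripped of the file handling, the aggregation above is a per-pair groupby-sum, a rescaling by the personal score g2, a clamp at zero, and a max-normalisation; a toy, self-contained version (column names mirror the code, the data is made up).

import pandas as pd

pairs = pd.DataFrame({'user1': [1, 1, 2], 'user2': [2, 2, 3],
                      'wt': [0.4, 0.6, 0.2]})
g2 = pd.DataFrame({'user1': [1, 2], 'user2': [2, 3], 'g2': [0.8, 0.5]})

g4 = pairs.groupby(['user1', 'user2'])['wt'].agg(['sum'])
g4.reset_index(inplace=True)
g4.sort_values(['user1', 'user2'], inplace=True)
g4['g4'] = (g2['g2'] * g4['sum']).clip(lower=0.0)  ### clamp negatives to zero
g4['g4'] = g4['g4'] / g4['g4'].max()  ### normalise to [0, 1]
print(g4)
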
Example #4
def extract_pi_loc_k(checkins, grouped, venues, user_visit, config, p, k,
                     start, finish, feature):
    pgt_part_root = config['directory']['pgt_temp']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_part_root, dataset_names[p]]))
    pgt_file_part = '/'.join([pgt_part_root, dataset_names[p], \
        config['intermediate']['pgt']['%s_part' % feature].format(modes[k], start, finish)])
    if is_file_exists(pgt_file_part) is True:
        pass
    else:
        user_visit = user_visit[(user_visit['visit_count'] > 0)]
        #debug('start', start, 'finish', finish)
        if feature == 'personal':
            ids = grouped['user'].values
            grouping = 'user'
            result = pd.DataFrame(columns=['user', 'location', 'p_i'])
        elif feature == 'global':
            ids = grouped['location'].values
            grouping = 'location'
            result = pd.DataFrame(columns=['location', 'p_i'])
        t0 = time.time()
        for i in range(start, finish):
            u_i = ids[i]
            df = df_uid(checkins, u_i, config, grouping)
            visit_match = user_visit.isin({grouping: df[grouping].unique()})
            visit_temp = user_visit[visit_match[grouping]]
            if len(visit_temp) > 0:
                if feature == 'personal':
                    ### Extract the p_i of each user's visit
                    visit_temp['p_i'] = visit_temp.apply(
                        lambda x: calculate_density(x, cd, df, venues), axis=1)
                    visit_temp = visit_temp[['user', 'location', 'p_i']]
                    result = result.append(visit_temp, ignore_index=True)
                elif feature == 'global':
                    ### Aggregate visit on each location
                    aggregations = {
                        'user_count': {
                            'entropy': lambda x: calculate_entropy(x)
                        },
                    }
                    grouped = visit_temp.groupby(['location']) \
                        .agg(aggregations)
                    grouped.columns = [
                        "_".join(x) for x in grouped.columns.ravel()
                    ]
                    grouped.rename(columns={"user_count_entropy": "p_i"},
                                   inplace=True)
                    grouped.reset_index(inplace=True)
                    # debug(grouped.columns.values)
                    # debug(grouped.head())
                    grouped = grouped[['location', 'p_i']]
                    result = result.append(grouped, ignore_index=True)
        t1 = time.time()
        ### Writing to temp file
        if feature == 'personal':
            result.drop_duplicates(subset=['user', 'location'], inplace=True)
        result.to_csv(pgt_file_part, index=False, header=True)
        debug('Finished density calculation into %s in %s seconds' %
              (pgt_file_part, str(t1 - t0)))
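
The row filtering above relies on DataFrame.isin with a dict of allowed values per column; a small self-contained illustration of that step with toy frames (column names as in the code).

import pandas as pd

user_visit = pd.DataFrame({'location': [10, 11, 12], 'visit_count': [3, 0, 5]})
df = pd.DataFrame({'location': [10, 12, 12]})  ### current user's check-ins

user_visit = user_visit[user_visit['visit_count'] > 0]
visit_match = user_visit.isin({'location': df['location'].unique()})
visit_temp = user_visit[visit_match['location']]
print(visit_temp)
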
Example #5
def parallel_sampling(config, p, k, t, d):
    debug('Start sampling', 'p', p, 'k', k, 't', t, 'd', d)
    kwargs = config['kwargs']
    is_read_compressed = kwargs['read_compressed']
    colocation_root = config['directory']['colocation']
    make_sure_path_exists(colocation_root)
    if is_read_compressed is False:
        sample_name = config['intermediate']['colocation']['sample_csv']
        compression = None
    else:
        sample_name = config['intermediate']['colocation']['sample_compressed']
        compression = 'bz2'
    sample_rate = kwargs['preprocessing']['sampling']['rate']
    sample_fullname = '/'.join(
        [colocation_root,
         sample_name.format(p, k, t, d, sample_rate)])
    df = read_colocation_file(config, p, k, t, d)
    df = df.sample(frac=sample_rate, random_state=1)
    df.to_csv(sample_fullname,
              header=True,
              index=False,
              compression=compression,
              mode='w')
    debug('Finished sampling', 'p', p, 'k', k, 't', t, 'd', d, '#sample: ',
          len(df))
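
Without the configuration lookups, the sampling step is a reproducible pandas sample written back to CSV; a minimal sketch with made-up data and an illustrative output path.

import pandas as pd

df = pd.DataFrame({'user1': range(100), 'user2': range(100, 200)})
sample = df.sample(frac=0.1, random_state=1)  ### reproducible 10% sample
sample.to_csv('/tmp/colocation_sample.csv.bz2', header=True, index=False,
              compression='bz2', mode='w')
print('#sample:', len(sample))
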
Example #6
def run_pgt_evaluation(config):
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    report_directory = config['directory']['report']
    make_sure_path_exists(report_directory)
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run PGT Evaluation on Dataset', dataset_name, p, 'Mode',
                  mode, k, '#Core', n_core)
            ### Creating the report file
            result_filename = '/'.join(
                [report_directory, 'PGT_result_p{}_k{}.csv'.format(p, k)])
            remove_file_if_exists(result_filename)
            with open(result_filename, 'a') as fw:
                fw.write(
                    'p,k,t,d,auc,precision,recall,f1,#friends,#data,feature_set,preprocessing\n'
                )
            for t_diff in t_diffs:
                for s_diff in s_diffs:
                    pgt_evaluation(config, p, k, t_diff, s_diff)
                    gc.collect()
Example #7
def write_statistics(df, config, p, k, t, d):
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    sci_root = config['directory']['sci']
    make_sure_path_exists('/'.join([sci_root, dataset_names[p]]))
    if compressed is True:
        sci_name = config['intermediate']['sci']['evaluation_compressed']
        compression = 'bz2'
    else:
        sci_name = config['intermediate']['sci']['evaluation']
        compression = None
    df.to_csv('/'.join([sci_root, dataset_names[p], sci_name.format(p, k, t, d)]), \
        header=True, index=False, compression=compression)
Example #8
def extract_pgt(config, p, k, t, d):
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    pgt_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    if compressed is True:
        pgt_name = config['intermediate']['pgt']['pgt_output_compressed']
        compression = 'bz2'
    else:
        pgt_name = config['intermediate']['pgt']['pgt_output']
        compression = None
    intermediate_file = '/'.join(
        [pgt_root, dataset_names[p],
         pgt_name.format(p, k, t, d)])
    if is_file_exists(intermediate_file) is False:
        ### Extracting each feature
        if config['kwargs']['pgt']['extract_pgt']['run'] is True:
            g1 = None
            g2 = None
            g3 = None
            g4 = None
            if config['kwargs']['pgt']['extract_pgt']['personal'] is True:
                g1, g2 = personal_factor(config, p, k, t, d)  ### P in PGT
                debug('Finished loading personal factor', 'p', p, 'k', k, 't',
                      t, 'd', d)
            if config['kwargs']['pgt']['extract_pgt']['global'] is True:
                g3 = global_factor(config, p, k, t, d, g2)  ### PG in PGT
                debug('Finished loading global factor', 'p', p, 'k', k, 't', t,
                      'd', d)
            if config['kwargs']['pgt']['extract_pgt']['temporal'] is True:
                g4 = temporal_factor(config, p, k, t, d, g2)  ### PGT in PGT
                debug('Finished loading temporal factor', 'p', p, 'k', k, 't',
                      t, 'd', d)
            ### Merging all together
            if config['kwargs']['pgt']['extract_pgt']['merge'] is True:
                if g1 is not None and g2 is not None and g3 is not None and g4 is not None:
                    df = g1[['user1', 'user2',
                             'g1']].merge(g2[['user1', 'user2', 'g2']],
                                          on=['user1', 'user2'])
                    df = df.merge(g3[['user1', 'user2', 'g3']],
                                  on=['user1', 'user2'])
                    df = df.merge(g4[['user1', 'user2', 'g4']],
                                  on=['user1', 'user2'])
                    friend_df = extract_friendships(dataset_names[p], config)
                    df = determine_social_tie(df, friend_df)
                    df.to_csv(intermediate_file,
                              header=True,
                              index=False,
                              compression=compression)
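
The merge step above joins the four factor frames on the (user1, user2) key; a toy, self-contained version of that chain of merges (single made-up pair).

import pandas as pd

g1 = pd.DataFrame({'user1': [1], 'user2': [2], 'g1': [0.3]})
g2 = pd.DataFrame({'user1': [1], 'user2': [2], 'g2': [0.5]})
g3 = pd.DataFrame({'user1': [1], 'user2': [2], 'g3': [0.1]})
g4 = pd.DataFrame({'user1': [1], 'user2': [2], 'g4': [0.7]})

df = g1[['user1', 'user2', 'g1']].merge(g2[['user1', 'user2', 'g2']],
                                        on=['user1', 'user2'])
df = df.merge(g3[['user1', 'user2', 'g3']], on=['user1', 'user2'])
df = df.merge(g4[['user1', 'user2', 'g4']], on=['user1', 'user2'])
print(df)  ### one row per pair with columns g1..g4
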
Example #9
def prepare_colocation(config, p, k, t_diff, s_diff, begins, ends):
    working_directory = config['directory']['colocation']
    filename = config['intermediate']['colocation']['part']
    dataset_name = config['dataset'][p]
    make_sure_path_exists('/'.join([working_directory, dataset_name]))
    clear_dir = config['kwargs']['colocation']['clear_dir']
    if clear_dir is True:
        remove_all_files('/'.join([working_directory, dataset_name]))
    ### Prepare the files
    for i in range(len(begins)):
        with open(
                '/'.join([
                    working_directory, dataset_name,
                    filename.format(p, k, t_diff, s_diff, begins[i], ends[i])
                ]), 'wb'):
            pass
    debug('Each colocation part file has been created')
Example #10
def pgt_evaluation(config, p, k, t, d):
    debug('Evaluating PGT for p{}, k{}, t{}, d{}'.format(p, k, t, d))
    dataset_names = config['dataset']
    compressed = config['kwargs']['read_compressed']
    pgt_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    if compressed is True:
        pgt_name = config['intermediate']['pgt']['pgt_output_compressed']
    else:
        pgt_name = config['intermediate']['pgt']['pgt_output']
    evaluation_name = '/'.join(
        [pgt_root, dataset_names[p],
         pgt_name.format(p, k, t, d)])
    if is_file_exists(evaluation_name) is True:
        dataset = pd.read_csv(evaluation_name)
        # Format: 'user1', 'user2', 'g1', 'g2', 'g3', 'g4', 'link'
        X = dataset[['g1', 'g2', 'g3', 'g4']].values
        y = dataset[['link']].values
        ### Normalize unexpected values
        X[np.isinf(X)] = 0
        X[np.isnan(X)] = 0
        y[np.isinf(y)] = 0
        y[np.isnan(y)] = 0
        selected_feature_set = config['kwargs']['pgt_eval']['features']
        if selected_feature_set == 'all':
            notes = ["PGT+", "PGT", "P0", "P", "PG"]
            assign = [[0, 1, 2, 3], [3], [0], [1], [2]]
        else:  ### Summary only
            notes = ["PGT+", "PGT"]
            assign = [[0, 1, 2, 3], [3]]
        debug(notes, assign)
        texts = generate_report(config, X, y, assign, notes, p, k, t, d)
        del X, y
        report_directory = config['directory']['report']
        result_filename = '/'.join(
            [report_directory, 'PGT_result_p{}_k{}.csv'.format(p, k)])
        for text in texts:
            if text is not None:
                with open(result_filename, 'a') as fw:
                    fw.write(text + '\n')
    else:
        debug('File not found', evaluation_name)
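
Before reporting, the feature matrix is sanitised by zeroing inf and NaN entries; the same step in isolation on a toy array.

import numpy as np

X = np.array([[0.2, np.inf], [np.nan, 0.9]])
X[np.isinf(X)] = 0
X[np.isnan(X)] = 0
print(X)  ### [[0.2, 0.0], [0.0, 0.9]]
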
Example #11
def personal_factor(config, p, k, t, d):
    g1 = None
    g2 = None
    ### Intermediate file -- check if exists
    pgt_root = config['directory']['pgt']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    g1_file = '/'.join([pgt_root, dataset_names[p], \
        config['intermediate']['pgt']['pgt_g1'].format(modes[k], t, d)])
    g2_file = '/'.join([pgt_root, dataset_names[p], \
        config['intermediate']['pgt']['pgt_g2'].format(modes[k], t, d)])
    if is_file_exists(g1_file) is True and is_file_exists(g2_file) is True:
        g1 = pd.read_csv(g1_file)
        g2 = pd.read_csv(g2_file)
    else:
        ### If it does not exist
        feature = 'personal'
        colocation_df = transform_colocation_pgt(config, p, k, t, d, feature)
        ### Aggregate the weight for each user pair
        g1 = colocation_df.groupby(['user1',
                                    'user2'])['wp'].agg(['mean', 'count'])
        g2 = colocation_df.groupby(['user1',
                                    'user2'])['wp'].agg(['max', 'count'])
        g1.reset_index(inplace=True)
        g2.reset_index(inplace=True)
        g1.sort_values(['user1', 'user2'], inplace=True)
        g2.sort_values(['user1', 'user2'], inplace=True)
        g1['g1'] = g1['mean'] * g1['count']
        g2['g2'] = g2['max'] * g2['count']
        g1['g1'] = g1['g1'] / max(g1['g1'])
        g2['g2'] = g2['g2'] / max(g2['g2'])
        g1.to_csv(g1_file, header=True, index=False, compression='bz2')
        g2.to_csv(g2_file, header=True, index=False, compression='bz2')
        # debug(g1.head())
        # debug(g2.head())
        del colocation_df
    return g1, g2
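
In isolation, the personal factor is g1 = mean(wp) * count and g2 = max(wp) * count per user pair, each normalised by its maximum; a toy, self-contained version with made-up weights.

import pandas as pd

colocations = pd.DataFrame({'user1': [1, 1, 2], 'user2': [2, 2, 3],
                            'wp': [0.2, 0.4, 0.9]})
g1 = colocations.groupby(['user1', 'user2'])['wp'].agg(['mean', 'count']).reset_index()
g2 = colocations.groupby(['user1', 'user2'])['wp'].agg(['max', 'count']).reset_index()
g1['g1'] = g1['mean'] * g1['count']
g2['g2'] = g2['max'] * g2['count']
g1['g1'] = g1['g1'] / g1['g1'].max()
g2['g2'] = g2['g2'] / g2['g2'].max()
print(g1[['user1', 'user2', 'g1']])
print(g2[['user1', 'user2', 'g2']])
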
Example #12
def global_factor(config, p, k, t, d, g2):
    g3 = None
    ### Intermediate file -- check if exists
    pgt_root = config['directory']['pgt']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    g3_file = '/'.join([pgt_root, dataset_names[p], \
        config['intermediate']['pgt']['pgt_g3'].format(modes[k], t, d)])
    if is_file_exists(g3_file) is True:
        g3 = pd.read_csv(g3_file)
    else:
        feature = 'global'
        colocation_df = transform_colocation_pgt(config, p, k, t, d, feature)
        g3 = colocation_df.groupby(['user1', 'user2'])['wg'].agg(['sum'])
        g3.reset_index(inplace=True)
        g3.sort_values(['user1', 'user2'], inplace=True)
        g3['g3'] = g2['g2'] * g3['sum']
        g3['g3'] = g3['g3'] / max(g3['g3'])
        g3.to_csv(g3_file, header=True, index=False, compression='bz2')
        del colocation_df
    return g3
Example #13
def extract_colocation_features(stat_lp, config, p, k, t, d):
    debug('p', p, 'k', k, 't', t, 'd', d)
    ### Check if SCI intermediate exists
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    sci_root = config['directory']['sci']
    make_sure_path_exists('/'.join([sci_root, dataset_names[p]]))
    if compressed is True:
        sci_name = config['intermediate']['sci']['evaluation_compressed']
    else:
        sci_name = config['intermediate']['sci']['evaluation']
    sci_name = '/'.join(
        [sci_root, dataset_names[p],
         sci_name.format(p, k, t, d)])
    if is_file_exists(sci_name):
        debug('File %s exists' % sci_name)
    else:
        ### Read (original) friendship from file
        friend_df = extract_friendships(dataset_names[p], config)
        colocation_df = read_colocation_file(config, p, k, t, d)
        ### Check whether the two users in each co-located check-in are friends or strangers
        colocation_df = determine_social_tie(colocation_df, friend_df)
        debug('#colocations', len(colocation_df), 'p', p, 'k', k, 't', t, 'd',
              d)
        ### Find the stability value for each co-location pair
        groups = colocation_df.groupby(['user1', 'user2', 'link'])
        grouped = aggregate_stats(groups, stat_lp, p, k, t, d)
        ### Write the result into a csv output
        write_statistics(grouped, config, p, k, t, d)

        ### Memory management
        del friend_df
        del colocation_df
        del grouped
    debug('Finished extract_colocation_features', 'p', p, 'k', k, 't', t, 'd',
          d)
Example #14
def process_reduce(config, p, k, t_diff, s_diff):
    out_format = config['intermediate']['colocation']['csv']
    re_format = config['intermediate']['colocation']['re']
    working_directory = config['directory']['colocation']
    dataset_name = config['dataset'][p]
    make_sure_path_exists('/'.join([working_directory, dataset_name]))
    pattern = re.compile(re_format.format(p, k, t_diff, s_diff))
    file_list = []
    for fname in os.listdir('/'.join([working_directory, dataset_name])):
        if fname.endswith(".csv"):
            if pattern.match(fname):
                file_list.append('/'.join(
                    [working_directory, dataset_name, fname]))
    output = '/'.join([
        working_directory, dataset_name,
        out_format.format(p, k, t_diff, s_diff)
    ])
    with open(output, 'w') as fw:
        fw.write('%s' % colocation_header)
    with open(output, 'ab') as wfd:
        for f in file_list:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
                # 10 MB per write chunk to avoid loading a whole part file into memory.
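
The reduce step boils down to writing one header line and then streaming the matching part files into the output in fixed-size chunks; a self-contained sketch (the paths and header below are illustrative, not the project's colocation_header).

import glob
import shutil

output = '/tmp/colocation_merged.csv'
header = 'user1,user2,time1,time2\n'  ### illustrative header

with open(output, 'w') as fw:
    fw.write(header)
with open(output, 'ab') as wfd:
    for part in sorted(glob.glob('/tmp/colocation_part_*.csv')):
        with open(part, 'rb') as fd:
            shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)  ### 10 MB chunks
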
Example #15
def prepare_extraction(config, feature, p, k):
    ### Check if PGT intermediate exists
    dataset_names = config['dataset']
    modes = config['mode']
    pgt_root = config['directory']['pgt']
    pgt_part_root = config['directory']['pgt_temp']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    pgt_file = '/'.join([pgt_root, dataset_names[p], \
        config['intermediate']['pgt'][feature].format(modes[k])])
    if is_file_exists(pgt_file) is True:
        debug('PGT %s exists' % feature)
    else:
        if feature == 'personal':
            checkins, grouped = extract_checkins_per_user(
                dataset_names[p], modes[k], config)
        elif feature == 'global':
            checkins, grouped = extract_checkins_per_venue(
                dataset_names[p], modes[k], config)
        user_visit_dir = '/'.join(
            [config['directory']['intermediate'], config['dataset'][p]])
        user_visit_name = config['intermediate']['pgt']['user_visit'].format(
            config['mode'][k])
        final_name = '/'.join([user_visit_dir, user_visit_name])
        ### Using user visit database
        if is_file_exists(final_name) is True:
            user_visit = pd.read_csv(final_name, compression='bz2')
            venues = checkins[['location', 'latitude', 'longitude'
                               ]].drop_duplicates(subset=['location'])
            debug('#Venues', len(venues), 'p', p, 'k', k)
            kwargs = config['kwargs']
            n_core = kwargs['n_core']
            start = kwargs['pgt'][feature]['start']
            finish = kwargs['pgt'][feature]['finish']
            begins, ends = init_begin_end(n_core,
                                          len(grouped),
                                          start=start,
                                          finish=finish)
            if feature == 'personal':
                function = density_location
            elif feature == 'global':
                function = entropy_location
            ### Map step
            Parallel(n_jobs=n_core)(delayed(function)(checkins, grouped, venues, user_visit, \
                config, p, k, begins[i-1], ends[i-1]) \
                for i in range(len(begins), 0, -1))
            ### Reduce step
            if feature == 'personal':
                result = pd.DataFrame(columns=['user', 'location', 'p_i'])
            elif feature == 'global':
                result = pd.DataFrame(columns=['location', 'p_i'])
            for i in range(len(begins)):
                start = begins[i]
                finish = ends[i]
                pgt_file_part = '/'.join([pgt_part_root, dataset_names[p], \
                    config['intermediate']['pgt']['%s_part' % feature].format(modes[k], start, finish)])
                temp = pd.read_csv(pgt_file_part)
                result = result.append(temp, ignore_index=True)
            if feature == 'personal':
                result.drop_duplicates(subset=['user', 'location'],
                                       inplace=True)
                result.sort_values(['user', 'location'], inplace=True)
            elif feature == 'global':
                result.sort_values(['location'], inplace=True)
            debug('#User Visits', len(result))
            result.to_csv(pgt_file, index=False, header=True)
            ### Clean up mess if needed
            if config['kwargs']['pgt'][feature]['clean_temp'] is True:
                for i in range(len(begins)):
                    start = begins[i]
                    finish = ends[i]
                    pgt_file_part = '/'.join([pgt_part_root, dataset_names[p], \
                        config['intermediate']['pgt']['%s_part' % feature].format(modes[k], start, finish)])
                    remove_file_if_exists(pgt_file_part)
        else:
            debug(
                'Please generate the user visit first through preprocessing/read.py',
                '(function: generate_user_visit)')
Example #16
def transform_colocation_pgt(config, p, k, t, d, feature):
    dataset_names = config['dataset']
    modes = config['mode']
    pgt_root = config['directory']['pgt']
    pgt_part_root = config['directory']['pgt_temp']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    make_sure_path_exists('/'.join([pgt_part_root, dataset_names[p]]))
    pgt_file = '/'.join([pgt_root, dataset_names[p], \
        config['intermediate']['pgt'][feature].format(modes[k])])
    ### Check if PGT intermediate exists
    if is_file_exists(pgt_file) is False:
        debug('PGT %s does not exist' % feature)
        debug('Please run PGT %s factor extraction first' % feature)
        return None
    else:
        g0 = '/'.join([pgt_root, dataset_names[p], \
            config['intermediate']['pgt']['pgt_g0_%s' % feature].format(modes[k], t, d)])
        if is_file_exists(g0) is False:
            if feature == 'personal':
                ### columns=['user', 'location', 'p_i']
                personal_density = pd.read_csv(pgt_file)
                col_name = 'wp'
            elif feature == 'global':
                ### columns=['location', 'p_i']
                entropy_location = pd.read_csv(pgt_file)
                col_name = 'wg'
            ### user1,user2,location1,location2,time1,time2,lat1,lon1,lat2,lon2,t_diff,s_diff
            ### Evaluate the weight for each colocation
            ### Map step
            i = 0
            chunksize = 10**5
            debug('chunksize for transform_colocation_pgt', chunksize)
            for colocation_df in read_colocation_file(
                    config,
                    p,
                    k,
                    t,
                    d,
                    chunksize=chunksize,
                    usecols=['user1', 'user2', 'location1', 'location2']):
                g0_part = '/'.join([pgt_part_root, dataset_names[p], \
                    config['intermediate']['pgt']['pgt_g0_%s_part' % feature].format(modes[k], t, d, i)])
                debug('Processing', feature, 'part', g0_part)
                if is_file_exists(g0_part) is False:
                    if feature == 'personal':
                        colocation_df[col_name] = colocation_df.apply(
                            lambda x: calculate_personal(x, personal_density),
                            axis=1)
                    elif feature == 'global':
                        colocation_df[col_name] = colocation_df.apply(
                            lambda x: calculate_global(x, entropy_location),
                            axis=1)
                    colocation_df.to_csv(g0_part,
                                         index=False,
                                         header=True,
                                         compression='bz2')
                i += 1
            ### Reduce step
            colocation_df = pd.DataFrame(
                columns=['user1', 'user2', 'location1', 'location2', col_name])
            condition = True
            i = 0
            while (condition is True):
                ### Iterate over all chunks
                g0_part = '/'.join([pgt_part_root, dataset_names[p], \
                    config['intermediate']['pgt']['pgt_g0_%s_part' % feature].format(modes[k], t, d, i)])
                if is_file_exists(g0_part) is False:
                    condition = False
                    break
                temp = pd.read_csv(g0_part)
                colocation_df = colocation_df.append(temp, ignore_index=True)
                i += 1
            if config['kwargs']['pgt'][feature]['clean_temp'] is True:
                condition = True
                i = 0
                while (condition is True):
                    g0_part = '/'.join([pgt_part_root, dataset_names[p], \
                        config['intermediate']['pgt']['pgt_g0_%s_part' % feature].format(modes[k], t, d, i)])
                    if is_file_exists(g0_part) is False:
                        condition = False
                        break
                    remove_file_if_exists(g0_part)
                    i += 1
            colocation_df.replace([np.inf, -np.inf], np.nan, inplace=True)
            colocation_df.fillna(0, inplace=True)
            colocation_df.to_csv(g0,
                                 index=False,
                                 header=True,
                                 compression='bz2')
            gc.collect()
        else:
            colocation_df = pd.read_csv(g0)
            debug('Loaded g0 %s successfully [%s]' % (feature, g0))
        return colocation_df
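
The map/reduce over the colocation file is a chunked pd.read_csv pass that persists each transformed chunk and then concatenates the parts; a small runnable sketch with a toy CSV and an illustrative per-row weight in place of calculate_personal/calculate_global.

import pandas as pd

### Build a toy colocation file so the sketch runs end to end
pd.DataFrame({'user1': range(6), 'user2': range(6, 12)}).to_csv(
    '/tmp/toy_colocations.csv', index=False)

### Map step: transform each chunk and persist it as a part file
parts = []
for i, chunk in enumerate(pd.read_csv('/tmp/toy_colocations.csv', chunksize=2,
                                      usecols=['user1', 'user2'])):
    chunk['w'] = 1.0 / (1 + chunk['user1'])  ### toy per-row weight
    part_file = '/tmp/toy_colocations_part_{}.csv'.format(i)
    chunk.to_csv(part_file, index=False, header=True)
    parts.append(part_file)

### Reduce step: concatenate the parts and clean up unexpected values
merged = pd.concat((pd.read_csv(f) for f in parts), ignore_index=True)
merged.replace([float('inf'), float('-inf')], float('nan'), inplace=True)
merged.fillna(0, inplace=True)
print(merged)
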
Example #17
def sci_evaluation(config, p, k, t, d):
    debug('Evaluating SCI for p{}, k{}, t{}, d{}'.format(p, k, t, d))
    dataset_names = config['dataset']
    compressed = config['kwargs']['read_compressed']
    sci_root = config['directory']['sci']
    make_sure_path_exists('/'.join([sci_root, dataset_names[p]]))
    if compressed is True:
        sci_name = config['intermediate']['sci']['evaluation_compressed']
    else:
        sci_name = config['intermediate']['sci']['evaluation']
    evaluation_name = '/'.join(
        [sci_root, dataset_names[p],
         sci_name.format(p, k, t, d)])
    if is_file_exists(evaluation_name) is True:
        dataset = pd.read_csv(evaluation_name)
        # Columns include: 'uid1', 'uid2', 'frequency', 'diversity', 'duration', 'stability_std', 'stability_avg', 'stability_old', 'popularity', 'link'
        X = dataset[[
            'frequency', 'diversity', 'duration', 'stability_std',
            'popularity', 'stability_avg', 'stability_old'
        ]].values
        y = dataset[['link']].values
        ### Selecting the feature set
        selected_feature_set = config['kwargs']['sci_eval']['features']
        ### PAKDD 2017 Submission (all feature combinations)
        if selected_feature_set == 'pakdd_2017_all':
            notes = [
                "SCI", "frequency", "diversity", "duration", "stability",
                "F+D", "F+TD", "F+TS", "D+TD", "D+TS", "TD+TS", "F+D+TD",
                "F+D+TS", "F+TD+TS", "D+TD+TS"
            ]
            assign = [[0, 1, 2, 6], [0], [1], [2], [6], [0, 1], [0, 2], [0, 6],
                      [1, 2], [1, 6], [2, 6], [0, 1, 2], [0, 1, 6], [0, 2, 6],
                      [1, 2, 6]]
        ### PAKDD 2017 Submission (summary only)
        elif selected_feature_set == 'pakdd_2017_summary':
            notes = ['SCI']
            assign = [[0, 1, 2, 6]]
        ### New feature added (Popularity)
        elif selected_feature_set == 'all_features':
            notes = [
                'SCI+', 'Frequency', 'Diversity', 'Duration', 'Stability',
                'Popularity', 'F+D', 'F+TD', 'F+TS', 'F+P', 'D+TD', 'D+TS',
                'D+P', 'TD+TS', 'TD+P', 'TS+P', 'F+D+TD', 'F+D+TS', 'F+D+P',
                'F+TD+TS', 'F+TD+P', 'F+TS+P', 'D+TD+TS', 'D+TD+P', 'D+TS+P',
                'TD+TS+P', 'F+D+TD+TS', 'F+D+TD+P', 'F+D+TS+P', 'F+TD+TS+P',
                'D+TD+TS+P', 'SCI'
            ]
            assign = [[0, 1, 2, 3, 4], [0], [1], [2], [3], [4], [0, 1], [0, 2],
                      [0, 3], [0, 4], [1, 2], [1, 3], [1, 4], [2, 3], [2, 4],
                      [3, 4], [0, 1, 2], [0, 1, 3], [0, 1, 4], [0, 2, 3],
                      [0, 2, 4], [0, 3, 4], [1, 2, 3], [1, 2, 4], [1, 3, 4],
                      [2, 3, 4], [0, 1, 2, 3], [0, 1, 2, 4], [0, 1, 3, 4],
                      [0, 2, 3, 4], [1, 2, 3, 4], [0, 1, 2, 3]]
        ### Full feature set only (summary)
        elif selected_feature_set == 'summary':
            notes = ['SCI+']
            assign = [[0, 1, 2, 3, 4]]
        ### Extended feature set (with Popularity): all feature combinations
        elif selected_feature_set == 'sci_plus_all':
            notes = [
                'All', 'F', 'D', 'TD', 'TSD', 'P', 'TSA', 'TS', 'F+D', 'F+TD',
                'F+TSD', 'F+P', 'F+TSA', 'F+TS', 'D+TD', 'D+TSD', 'D+P',
                'D+TSA', 'D+TS', 'TD+TSD', 'TD+P', 'TD+TSA', 'TD+TS', 'TSD+P',
                'TSD+TSA', 'TSD+TS', 'P+TSA', 'P+TS', 'TSA+TS', 'F+D+TD',
                'F+D+TSD', 'F+D+P', 'F+D+TSA', 'F+D+TS', 'F+TD+TSD', 'F+TD+P',
                'F+TD+TSA', 'F+TD+TS', 'F+TSD+P', 'F+TSD+TSA', 'F+TSD+TS',
                'F+P+TSA', 'F+P+TS', 'F+TSA+TS', 'D+TD+TSD', 'D+TD+P',
                'D+TD+TSA', 'D+TD+TS', 'D+TSD+P', 'D+TSD+TSA', 'D+TSD+TS',
                'D+P+TSA', 'D+P+TS', 'D+TSA+TS', 'TD+TSD+P', 'TD+TSD+TSA',
                'TD+TSD+TS', 'TD+P+TSA', 'TD+P+TS', 'TD+TSA+TS', 'TSD+P+TSA',
                'TSD+P+TS', 'TSD+TSA+TS', 'P+TSA+TS', 'F+D+TD+TSD', 'F+D+TD+P',
                'F+D+TD+TSA', 'F+D+TD+TS', 'F+D+TSD+P', 'F+D+TSD+TSA',
                'F+D+TSD+TS', 'F+D+P+TSA', 'F+D+P+TS', 'F+D+TSA+TS',
                'F+TD+TSD+P', 'F+TD+TSD+TSA', 'F+TD+TSD+TS', 'F+TD+P+TSA',
                'F+TD+P+TS', 'F+TD+TSA+TS', 'F+TSD+P+TSA', 'F+TSD+P+TS',
                'F+TSD+TSA+TS', 'F+P+TSA+TS', 'D+TD+TSD+P', 'D+TD+TSD+TSA',
                'D+TD+TSD+TS', 'D+TD+P+TSA', 'D+TD+P+TS', 'D+TD+TSA+TS',
                'D+TSD+P+TSA', 'D+TSD+P+TS', 'D+TSD+TSA+TS', 'D+P+TSA+TS',
                'TD+TSD+P+TSA', 'TD+TSD+P+TS', 'TD+TSD+TSA+TS', 'TD+P+TSA+TS',
                'TSD+P+TSA+TS', 'F+D+TD+TSD+P', 'F+D+TD+TSD+TSA',
                'F+D+TD+TSD+TS', 'F+D+TD+P+TSA', 'F+D+TD+P+TS',
                'F+D+TD+TSA+TS', 'F+D+TSD+P+TSA', 'F+D+TSD+P+TS',
                'F+D+TSD+TSA+TS', 'F+D+P+TSA+TS', 'F+TD+TSD+P+TSA',
                'F+TD+TSD+P+TS', 'F+TD+TSD+TSA+TS', 'F+TD+P+TSA+TS',
                'F+TSD+P+TSA+TS', 'D+TD+TSD+P+TSA', 'D+TD+TSD+P+TS',
                'D+TD+TSD+TSA+TS', 'D+TD+P+TSA+TS', 'D+TSD+P+TSA+TS',
                'TD+TSD+P+TSA+TS', 'F+D+TD+TSD+P+TSA', 'F+D+TD+TSD+P+TS',
                'F+D+TD+TSD+TSA+TS', 'F+D+TD+P+TSA+TS', 'F+D+TSD+P+TSA+TS',
                'F+TD+TSD+P+TSA+TS', 'D+TD+TSD+P+TSA+TS'
            ]
            assign = [[0, 1, 2, 3, 4, 5, 6], [0], [1], [2], [3], [4], [5], [6],
                      [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [1, 2],
                      [1, 3], [1, 4], [1, 5], [1, 6], [2, 3], [2, 4], [2, 5],
                      [2, 6], [3, 4], [3, 5], [3, 6], [4, 5], [4, 6], [5, 6],
                      [0, 1, 2], [0, 1, 3], [0, 1, 4], [0, 1, 5], [0, 1, 6],
                      [0, 2, 3], [0, 2, 4], [0, 2, 5], [0, 2, 6], [0, 3, 4],
                      [0, 3, 5], [0, 3, 6], [0, 4, 5], [0, 4, 6], [0, 5, 6],
                      [1, 2, 3], [1, 2, 4], [1, 2, 5], [1, 2, 6], [1, 3, 4],
                      [1, 3, 5], [1, 3, 6], [1, 4, 5], [1, 4, 6], [1, 5, 6],
                      [2, 3, 4], [2, 3, 5], [2, 3, 6], [2, 4, 5], [2, 4, 6],
                      [2, 5, 6], [3, 4, 5], [3, 4, 6], [3, 5, 6], [4, 5, 6],
                      [0, 1, 2, 3], [0, 1, 2, 4], [0, 1, 2, 5], [0, 1, 2, 6],
                      [0, 1, 3, 4], [0, 1, 3, 5], [0, 1, 3, 6], [0, 1, 4, 5],
                      [0, 1, 4, 6], [0, 1, 5, 6], [0, 2, 3, 4], [0, 2, 3, 5],
                      [0, 2, 3, 6], [0, 2, 4, 5], [0, 2, 4, 6], [0, 2, 5, 6],
                      [0, 3, 4, 5], [0, 3, 4, 6], [0, 3, 5, 6], [0, 4, 5, 6],
                      [1, 2, 3, 4], [1, 2, 3, 5], [1, 2, 3, 6], [1, 2, 4, 5],
                      [1, 2, 4, 6], [1, 2, 5, 6], [1, 3, 4, 5], [1, 3, 4, 6],
                      [1, 3, 5, 6], [1, 4, 5, 6], [2, 3, 4, 5], [2, 3, 4, 6],
                      [2, 3, 5, 6], [2, 4, 5, 6],
                      [3, 4, 5, 6], [0, 1, 2, 3, 4], [0, 1, 2, 3, 5],
                      [0, 1, 2, 3, 6], [0, 1, 2, 4, 5], [0, 1, 2, 4, 6],
                      [0, 1, 2, 5, 6], [0, 1, 3, 4, 5], [0, 1, 3, 4, 6],
                      [0, 1, 3, 5, 6], [0, 1, 4, 5, 6], [0, 2, 3, 4, 5],
                      [0, 2, 3, 4, 6], [0, 2, 3, 5, 6], [0, 2, 4, 5, 6],
                      [0, 3, 4, 5, 6], [1, 2, 3, 4, 5], [1, 2, 3, 4, 6],
                      [1, 2, 3, 5, 6], [1, 2, 4, 5, 6], [1, 3, 4, 5, 6],
                      [2, 3, 4, 5, 6], [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 6],
                      [0, 1, 2, 3, 5, 6], [0, 1, 2, 4, 5,
                                           6], [0, 1, 3, 4, 5, 6],
                      [0, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]]
        ### SCI and SCI+
        else:  ### 'summary_old_new'
            notes = ['SCI+', 'SCI']
            assign = [[0, 1, 2, 3, 4], [0, 1, 2, 6]]
        ### Generate the report
        debug(notes, assign)
        texts = generate_report(config, X, y, assign, notes, p, k, t, d)
        del X, y
        report_directory = config['directory']['report']
        result_filename = '/'.join(
            [report_directory, 'SCI_result_p{}_k{}.csv'.format(p, k)])
        for text in texts:
            if text is not None:
                with open(result_filename, 'a') as fw:
                    fw.write(text + '\n')
    else:
        debug('File not found', evaluation_name)
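
Each entry of assign selects a column subset of X, and every subset is evaluated as its own feature combination; a self-contained illustration of that mechanism with synthetic data and a plain scikit-learn classifier standing in for generate_report.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(1)
X = rng.rand(200, 5)
y = (X[:, 0] + X[:, 3] > 1.0).astype(int)  ### toy link labels

notes = ['all features', 'first feature only']
assign = [[0, 1, 2, 3, 4], [0]]
for note, columns in zip(notes, assign):
    scores = cross_val_score(LogisticRegression(), X[:, columns], y, cv=5)
    print(note, round(scores.mean(), 3))
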
Example #18
def generate_walk2friend(config):
    kwargs = config['kwargs']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = config['dataset'].index(dataset_name)
        for mode in modes:
            k = config['mode'].index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    output_dir = config['directory']['walk2friend']
                    make_sure_path_exists(output_dir)
                    debug('p', p, 'k', k, 't', t, 'd', d)
                    checkin_name = '/'.join([
                        output_dir,
                        '{}_{}_t{}_d{}.checkin'.format(dataset_name, mode, t,
                                                       d)
                    ])
                    friends_name = '/'.join([
                        output_dir,
                        '{}_{}_t{}_d{}.friends'.format(dataset_name, mode, t,
                                                       d)
                    ])
                    if is_file_exists(checkin_name) is False or is_file_exists(
                            friends_name) is False:
                        checkins, _ = extract_checkins_all(
                            dataset_name, mode, config)
                        friends = extract_friendships(dataset_name, config)
                        user_unique = []
                        for colocations in read_colocation_file(
                                config,
                                p,
                                k,
                                t,
                                d,
                                chunksize=10**6,
                                usecols=['user1', 'user2']):
                            user_unique.append(colocations['user1'].unique())
                            user_unique.append(colocations['user2'].unique())
                        # user_unique = np.array(user_unique)
                        user_unique = np.ravel(user_unique)
                        debug(user_unique)
                        user_unique = np.unique(user_unique)
                        debug('Before', '#checkins', len(checkins), '#friends',
                              len(friends))
                        checkins = checkins.loc[(
                            checkins['user'].isin(user_unique))]
                        friends = friends.loc[
                            (friends['user1'].isin(user_unique))
                            & (friends['user2'].isin(user_unique))]
                        debug('After', '#checkins', len(checkins), '#friends',
                              len(friends))
                        checkins.sort_values(['user', 'location'],
                                             inplace=True)
                        checkins.rename(columns={
                            "user": "******",
                            "location": "locid"
                        },
                                        inplace=True)
                        checkins['mid'] = range(len(checkins))
                        checkins = checkins[['mid', 'uid', 'locid']]
                        checkins.to_csv(checkin_name, index=False, header=True)
                        friends.rename(columns={
                            "user1": "u1",
                            "user2": "u2"
                        },
                                       inplace=True)
                        friends.sort_values(['u1', 'u2'], inplace=True)
                        friends = friends[['u1', 'u2']]
                        friends.to_csv(friends_name, index=False, header=True)
                        del user_unique
            gc.collect()
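
A condensed, self-contained version of the export above: restrict check-ins and friendships to users that appear in at least one co-location, then rename the columns to the walk2friend input schema (toy frames; the real data comes from the extractors and the colocation files).

import numpy as np
import pandas as pd

checkins = pd.DataFrame({'user': [1, 2, 3], 'location': [10, 11, 12]})
friends = pd.DataFrame({'user1': [1, 3], 'user2': [2, 4]})
user_unique = np.unique(np.ravel([[1, 2], [2, 1]]))  ### users seen in colocations

checkins = checkins[checkins['user'].isin(user_unique)].copy()
friends = friends[friends['user1'].isin(user_unique)
                  & friends['user2'].isin(user_unique)].copy()

checkins = checkins.rename(columns={'user': 'uid', 'location': 'locid'})
checkins['mid'] = range(len(checkins))
print(checkins[['mid', 'uid', 'locid']])
print(friends.rename(columns={'user1': 'u1', 'user2': 'u2'}))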