Example #1
def main(config_name='config.json'):
    ### Started the program
    debug('Started SCI+', config_name)
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']
    ### Co-location
    is_run = kwargs['colocation']['run']
    run_by = kwargs['colocation']['run_by']
    if is_run is not None and is_run is True:
        ### Co-location generation
        run_colocation(config, run_by)
    ### SCI
    is_run = kwargs['sci']['run']
    if is_run is not None and is_run is True:
        run_sci(config)
    ### SCI Evaluation
    is_run = kwargs['sci_eval']['run']
    if is_run is not None and is_run is True:
        run_sci_eval(config)
    ### PGT
    is_run = kwargs['pgt']['run']
    if is_run is not None and is_run is True:
        run_pgt(config)
    ### PGT Evaluation
    is_run = kwargs['pgt_eval']['run']
    if is_run is not None and is_run is True:
        run_pgt_evaluation(config)
    ### Finished the program
    debug('Finished SCI+')
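A minimal entry-point sketch for the driver above (hypothetical: it assumes `main` and its helpers live in the same module and that a `config.json` sits next to the script, mirroring the argument handling of Example #9):

if __name__ == '__main__':
    import sys
    # Optional config path on the command line, falling back to config.json.
    main(sys.argv[1] if len(sys.argv) > 1 else 'config.json')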
Example #2
def sort_colocation(config):
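    """Sort every active co-location file by user, time, and location columns,
    then rewrite it in the co-location directory (plain CSV or bz2, following
    the read_compressed flag)."""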
    kwargs = config['kwargs']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    is_read_compressed = config['kwargs']['read_compressed']
    colocation_root = config['directory']['colocation']
    if is_read_compressed is False:
        colocation_name = config['intermediate']['colocation']['csv']
        compression = None
    else:
        colocation_name = config['intermediate']['colocation']['compressed']
        compression = 'bz2'
    for dataset_name in datasets:
        p = config['dataset'].index(dataset_name)
        for mode in modes:
            k = config['mode'].index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    colocation_df = read_colocation_file(config, p, k, t, d)
                    colocation_df.sort_values([
                        'user1', 'user2', 'time1', 'time2', 'location1',
                        'location2'
                    ],
                                              inplace=True)
                    colocation_fullname = '/'.join(
                        [colocation_root,
                         colocation_name.format(p, k, t, d)])
                    remove_file_if_exists(colocation_fullname)
                    colocation_df.to_csv(colocation_fullname,
                                         index=False,
                                         header=True,
                                         compression=compression)
                    debug('Finished sorting %s' % colocation_fullname)
Example #3
def extract_pi_loc_k(checkins, grouped, venues, user_visit, config, p, k,
                     start, finish, feature):
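    """Compute the p_i values for ids[start:finish]: a density score per
    (user, location) visit when feature == 'personal', or an entropy per
    location when feature == 'global'. Results go to a per-chunk temp CSV and
    the chunk is skipped if that file already exists. (Note: `cd` in the
    density lambda is assumed to be defined at module level.)"""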
    pgt_part_root = config['directory']['pgt_temp']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_part_root, dataset_names[p]]))
    pgt_file_part = '/'.join([pgt_part_root, dataset_names[p], \
        config['intermediate']['pgt']['%s_part' % feature].format(modes[k], start, finish)])
    if is_file_exists(pgt_file_part) is True:
        pass
    else:
        user_visit = user_visit[(user_visit['visit_count'] > 0)]
        #debug('start', start, 'finish', finish)
        if feature == 'personal':
            ids = grouped['user'].values
            grouping = 'user'
            result = pd.DataFrame(columns=['user', 'location', 'p_i'])
        elif feature == 'global':
            ids = grouped['location'].values
            grouping = 'location'
            result = pd.DataFrame(columns=['location', 'p_i'])
        t0 = time.time()
        for i in range(start, finish):
            u_i = ids[i]
            df = df_uid(checkins, u_i, config, grouping)
            visit_match = user_visit.isin({grouping: df[grouping].unique()})
            visit_temp = user_visit[visit_match[grouping]]
            if len(visit_temp) > 0:
                if feature == 'personal':
                    ### Extract the p_i of each user's visit
                    visit_temp['p_i'] = visit_temp.apply(
                        lambda x: calculate_density(x, cd, df, venues), axis=1)
                    visit_temp = visit_temp[['user', 'location', 'p_i']]
                    result = result.append(visit_temp, ignore_index=True)
                elif feature == 'global':
                    ### Aggregate visit on each location
                    aggregations = {
                        'user_count': {
                            'entropy': lambda x: calculate_entropy(x)
                        },
                    }
                    grouped = visit_temp.groupby(['location']) \
                        .agg(aggregations)
                    grouped.columns = [
                        "_".join(x) for x in grouped.columns.ravel()
                    ]
                    grouped.rename(columns={"user_count_entropy": "p_i"},
                                   inplace=True)
                    grouped.reset_index(inplace=True)
                    # debug(grouped.columns.values)
                    # debug(grouped.head())
                    grouped = grouped[['location', 'p_i']]
                    result = result.append(grouped, ignore_index=True)
        t1 = time.time()
        ### Writing to temp file
        if feature == 'personal':
            result.drop_duplicates(subset=['user', 'location'], inplace=True)
        result.to_csv(pgt_file_part, index=False, header=True)
        debug('Finished density calculation into %s in %s seconds' %
              (pgt_file_part, str(t1 - t0)))
Example #4
def process_map(checkins,
                grouped,
                config,
                start,
                finish,
                p,
                k,
                t_diff=1800,
                s_diff=0,
                write_per_instance=True):
    ### Execute the mapping process
    debug('Process map [p%d, k%d, t%d, d%.3f, start%d, finish%d] has started' %
          (p, k, t_diff, s_diff, start, finish))
    t0 = time.time()
    colocations = generate_colocation(checkins, grouped, config, p, k, t_diff,
                                      s_diff, start, finish,
                                      write_per_instance)
    if write_per_instance is False:
        write_colocation(colocations, config, p, k, t_diff, s_diff, start,
                         finish)
        if colocations is not None:
            del colocations[:]
            del colocations
            _ = gc.collect()
    elapsed = time.time() - t0
    debug(
        'Process map [p%d, k%d, t%d, d%.3f, start%d, finish%d] finished in %s seconds'
        % (p, k, t_diff, s_diff, start, finish, elapsed))
Example #5
def run_pgt_evaluation(config):
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    report_directory = config['directory']['report']
    make_sure_path_exists(report_directory)
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run PGT Evaluation on Dataset', dataset_name, p, 'Mode',
                  mode, k, '#Core', n_core)
            ### Creating the report file
            result_filename = '/'.join(
                [report_directory, 'PGT_result_p{}_k{}.csv'.format(p, k)])
            remove_file_if_exists(result_filename)
            with open(result_filename, 'a') as fw:
                fw.write(
                    'p,k,t,d,auc,precision,recall,f1,#friends,#data,feature_set,preprocessing\n'
                )
            for t_diff in t_diffs:
                for s_diff in s_diffs:
                    pgt_evaluation(config, p, k, t_diff, s_diff)
                    gc.collect()
Example #6
def parallel_sampling(config, p, k, t, d):
    debug('Start sampling', 'p', p, 'k', k, 't', t, 'd', d)
    kwargs = config['kwargs']
    is_read_compressed = kwargs['read_compressed']
    colocation_root = config['directory']['colocation']
    make_sure_path_exists(colocation_root)
    if is_read_compressed is False:
        sample_name = config['intermediate']['colocation']['sample_csv']
        compression = None
    else:
        sample_name = config['intermediate']['colocation']['sample_compressed']
        compression = 'bz2'
    sample_rate = kwargs['preprocessing']['sampling']['rate']
    sample_fullname = '/'.join(
        [colocation_root,
         sample_name.format(p, k, t, d, sample_rate)])
    df = read_colocation_file(config, p, k, t, d)
    df = df.sample(frac=sample_rate, random_state=1)
    df.to_csv(sample_fullname,
              header=True,
              index=False,
              compression=compression,
              mode='w')
    debug('Finished sampling', 'p', p, 'k', k, 't', t, 'd', d, '#sample: ',
          len(df))
Example #7
def run_pgt(config):
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run PGT Extraction on Dataset', dataset_name, p, 'Mode',
                  mode, k, '#Core', n_core)
            if kwargs['pgt']['personal']['run']:
                extract_personal_pgt(config, p, k)
            if kwargs['pgt']['global']['run']:
                extract_global_pgt(config, p, k)
            if config['kwargs']['pgt']['extract_pgt']['temporal'] is False:
                Parallel(n_jobs=n_core)(delayed(extract_pgt)(config, p, k, t_diff, s_diff) \
                  for t_diff in t_diffs for s_diff in s_diffs)
                gc.collect()
            else:
                for t_diff in t_diffs:
                    for s_diff in s_diffs:
                        extract_pgt(config, p, k, t_diff, s_diff)
                        gc.collect()
Example #8
def auc_score(config, X, y, ptype='original'):
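    """Evaluate friendship inference with a RandomForest under stratified k-fold CV.
    Each training fold is resampled via sampling(ptype); returns the fold-averaged
    AUC, weighted precision/recall/F1, and the total number of positive labels."""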
    kfold = config['kwargs']['sci_eval']['kfold']
    n_core = config['kwargs']['n_core']
    cv = StratifiedKFold(n_splits=kfold)
    clf = RandomForestClassifier(n_jobs=n_core)

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)

    mean_precision = 0.0
    mean_recall = 0.0
    mean_f1 = 0.0
    mean_auc = 0.0

    total_ytrue = sum(y)

    i = 0
    success = 0
    for (train, test) in cv.split(X, y):
        X_pp, y_pp = sampling(X[train], y[train], ptype)
        fit = clf.fit(X_pp, y_pp)
        probas_ = fit.predict_proba(X[test])
        inference = fit.predict(X[test])
        try:
            # Compute ROC curve and area the curve
            # fpr, tpr, thresholds
            fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
            mean_tpr += interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0
            roc_auc = auc(fpr, tpr)
            mean_auc += roc_auc

            # precision, recall, thresholds = precision_recall_curve(y[test], probas_[:, 1])
            average = 'weighted'
            precision = precision_score(y[test], inference, average=average)
            recall = recall_score(y[test], inference, average=average)
            f1 = f1_score(y[test], inference, average=average)

            mean_precision += precision
            mean_recall += recall
            mean_f1 += f1

            success += 1
        except:
            pass
        i += 1
    mean_tpr /= success
    mean_tpr[-1] = 1.0
    # mean_auc = auc(mean_fpr, mean_tpr)

    mean_precision /= success
    mean_recall /= success
    mean_f1 /= success
    mean_auc /= success

    debug('{:.3f} {:.3f} {:.3f} {:.3f} {} {}'.format(mean_auc, mean_precision,
                                                     mean_recall, mean_f1,
                                                     int(total_ytrue), len(y)))

    return mean_auc, mean_precision, mean_recall, mean_f1, total_ytrue[0]
Example #9
def main():
    n_args = len(sys.argv)
    config_name = 'config.json'
    if n_args > 1:
        config_name = sys.argv[1]
    if is_file_exists(config_name) is False:
        config_name = 'config.json'
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']

    debug('Started Preprocessing', config_name)

    ### Read original data and generate standardized data
    if kwargs['preprocessing']['run_extraction'] is True:
        if kwargs['preprocessing']['read_original'] is True:
            dataset_root = config['directory']['dataset']
            preprocess_data(dataset_root)
    ### Extract user visit from co-location
    if kwargs['preprocessing']['user_visit'] is True:
        generate_user_visit(config)
    ### Sorting co-location based on several criteria
    if kwargs['preprocessing']['sort_colocation'] is True:
        sort_colocation(config)
    ### Generating check-ins based on co-locations -- for walk2friend evaluation
    if kwargs['preprocessing']['walk2friend'] is True:
        generate_walk2friend(config)
    ### Generating sampled co-location (for testing purpose)
    if kwargs['preprocessing']['sampling']['run'] is True:
        sampling_colocation(config)
Example #10
def run_colocation(config, run_by):
    ### Read standardized data and perform preprocessing
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    n_core = kwargs['n_core']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    skip_tolerance = kwargs['colocation']['early_stop']
    debug('early_stop', skip_tolerance)
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run co-location on Dataset', dataset_name, p, 'Mode', mode,
                  k, '#Core', n_core)
            ### Extracting checkins
            checkins, grouped = extract_checkins(config, dataset_name, mode,
                                                 run_by)
            for t_diff in t_diffs:
                for s_diff in s_diffs:
                    if run_by == 'user' or run_by == 'location':
                        map_reduce_colocation(config, checkins, grouped, p, k,
                                              t_diff, s_diff)
                    else:
                        map_reduce_colocation_kdtree(checkins, config, p, k,
                                                     t_diff, s_diff)
            checkins.drop(checkins.index, inplace=True)
            del checkins
            if grouped is not None:
                grouped.drop(grouped.index, inplace=True)
                del grouped
            gc.collect()
Example #11
def read_colocation_file(config, p, k, t, d, chunksize=None, usecols=None):
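    """Resolve the co-location file for (p, k, t, d) -- preferring the sampled
    variant when use_sampling is set and that file exists, otherwise the full
    CSV or bz2 file -- and read it with fixed dtypes, optionally in chunks."""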
    ### Read co-location from file
    colocation_root = config['directory']['colocation']
    colocation_fullname = None
    is_read_compressed = config['kwargs']['read_compressed']
    is_read_sampled = config['kwargs']['colocation']['sampling'][
        'use_sampling']
    if is_read_sampled is True:
        sample_rate = config['kwargs']['colocation']['sampling']['rate']
        if is_read_compressed is False:
            colocation_name = config['intermediate']['colocation'][
                'sample_csv']
        else:
            colocation_name = config['intermediate']['colocation'][
                'sample_compressed']
        colocation_fullname = '/'.join(
            [colocation_root,
             colocation_name.format(p, k, t, d, sample_rate)])
        if is_file_exists(colocation_fullname) is False:
            colocation_fullname = None
    if colocation_fullname is None:
        if is_read_compressed is False:
            colocation_name = config['intermediate']['colocation']['csv']
        else:
            colocation_name = config['intermediate']['colocation'][
                'compressed']
        colocation_fullname = '/'.join(
            [colocation_root,
             colocation_name.format(p, k, t, d)])
    colocation_dtypes = {
        'user1': np.int_,
        'user2': np.int_,
        'location1': np.int_,
        'location2': np.int_,
        'time1': np.int_,
        'time2': np.int_,
        'lat1': np.float_,
        'lon1': np.float_,
        'lat2': np.float_,
        'lon2': np.float_,
        't_diff': np.int_,
        's_diff': np.float_
    }
    debug('Read colocation file', colocation_fullname)
    if chunksize is None:
        colocation_df = pd.read_csv(colocation_fullname,
                                    dtype=colocation_dtypes,
                                    usecols=usecols)
    else:
        colocation_df = pd.read_csv(colocation_fullname,
                                    dtype=colocation_dtypes,
                                    chunksize=chunksize,
                                    usecols=usecols)
    return colocation_df
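A hedged usage sketch for the reader above (the p/k/t/d values are illustrative; `read_config`, `debug`, and the config keys are assumed to come from the same project, and the chunked form mirrors how Examples #23 and #28 consume it):

config = read_config('config.json')
# Stream a large co-location file in one-million-row chunks, keeping only the user pair.
for chunk in read_colocation_file(config, 0, 0, 1800, 0,
                                  chunksize=10**6, usecols=['user1', 'user2']):
    debug('chunk rows', len(chunk))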
Example #12
def main():
    debug('Started extracting partial colocation')
    ### Read config
    config = read_config()
    config_partial = config['kwargs']['partial_colocation']
    ps = config_partial['p']
    ks = config_partial['k']
    t_input = config_partial['t_input']
    d_input = config_partial['d_input']
    t_targets = config_partial['t_target']
    d_targets = config_partial['d_target']
    extract_colocation(config, ps, ks, t_input, d_input, t_targets, d_targets)
    debug('Finished extracting partial colocation')
Example #13
def generating_walk2friend_data():
    config_name = 'config_test.json'
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    directory = config['directory']['intermediate']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run Test on Dataset', dataset_name, p, 'Mode', mode, k)
            ### Test extract check-ins
            checkins, _ = extract_checkins_all(dataset_name, mode, config)
            checkins.sort_values(["user", "timestamp"], inplace=True)
            checkins['mid'] = range(1, len(checkins) + 1)
            checkins.rename(columns={
                "user": "******",
                "location": "locid"
            },
                            inplace=True)
            checkins = checkins[['mid', 'uid', 'locid']]
            checkins.to_csv('/'.join(
                [directory,
                 '%s_%s_10.checkin' % (dataset_name, mode)]),
                            index=False,
                            header=True)
            ### Test extract friendships
            friend_df = extract_friendships(dataset_name, config)
            friend_df.sort_values(["user1", "user2"], inplace=True)
            friend_df.rename(columns={
                "user1": "u1",
                "user2": "u2"
            },
                             inplace=True)
            friend_df.to_csv('/'.join(
                [directory,
                 '%s_%s_10.friends' % (dataset_name, mode)]),
                             index=False,
                             header=True)
Example #14
def prepare_colocation(config, p, k, t_diff, s_diff, begins, ends):
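    """Create one empty part file per (begin, end) chunk in the dataset's
    co-location directory, optionally clearing the directory first."""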
    working_directory = config['directory']['colocation']
    filename = config['intermediate']['colocation']['part']
    dataset_name = config['dataset'][p]
    make_sure_path_exists('/'.join([working_directory, dataset_name]))
    clear_dir = config['kwargs']['colocation']['clear_dir']
    if clear_dir is True:
        remove_all_files('/'.join([working_directory, dataset_name]))
    ### Prepare the files
    for i in range(len(begins)):
        with open(
                '/'.join([
                    working_directory, dataset_name,
                    filename.format(p, k, t_diff, s_diff, begins[i], ends[i])
                ]), 'wb'):
            pass
    debug('Each colocation part file has been created')
Example #15
def execute_parallel_st_tree_single(checkins, config, st_tree, data, p, k,
                                    t_diff, s_diff, start, finish):
    t0 = time.time()
    idx = st_tree.query_radius(data, 1)
    count = sum(len(x) for x in idx)
    if count > 0:
        colocations = extract_spatiotemporal_search_results(
            checkins, idx, start)
        write_colocation(colocations, config, p, k, t_diff, s_diff, start,
                         finish)
        if colocations is not None:
            del colocations
    elapsed = time.time() - t0
    del idx
    gc.collect()
    debug(
        'Process map [p%d, k%d, t%d, d%.3f, start%d, finish%d] finished in %s seconds'
        % (p, k, t_diff, s_diff, start, finish, elapsed))
Example #16
def generate_report(config, X, y, assign, notes, p, k, t, d):
    texts = []
    names = config['kwargs']['sci_eval']['sampling']
    for i in range(len(names)):
        Xs = []
        for arr in assign:
            X_indexed = X[:, arr]
            Xs.append(X_indexed)
        name = names[i]
        debug('Evaluating {}'.format(name))
        for idx in range(len(Xs)):
            Xi = Xs[idx]
            debug('Feature {}'.format(notes[idx]))
            mean_auc, mean_precision, mean_recall, mean_f1, total_ytrue = auc_score(
                config, Xi, y, name)
            text = '{},{},{},{},{:.9f},{:.9f},{:.9f},{:.9f},{},{},{},{}'.format(
                p, k, t, d, mean_auc, mean_precision, mean_recall, mean_f1,
                total_ytrue, len(y), notes[idx], name)
            texts.append(text)
    return texts
Example #17
def map_reduce_colocation(config, checkins, grouped, p, k, t_diff, s_diff):
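    """Split the grouped ids into per-core chunks, run process_map on each chunk
    in parallel (ascending or descending order), then merge the part files via
    process_reduce."""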
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    start = kwargs['colocation']['start']
    finish = kwargs['colocation']['finish']
    order = kwargs['colocation']['order']
    ### For the sake of parallelization
    begins, ends = init_begin_end(n_core,
                                  len(grouped),
                                  start=start,
                                  finish=finish)
    debug('Begins', begins)
    debug('Ends', ends)
    ### Generate colocation based on extracted checkins
    prepare_colocation(config, p, k, t_diff, s_diff, begins, ends)
    ### Start from bottom
    if order == 'ascending':
        Parallel(n_jobs=n_core)(delayed(process_map)(checkins, grouped, config, begins[i], ends[i], \
          p, k, t_diff, s_diff) for i in range(len(begins)))
    else:
        Parallel(n_jobs=n_core)(delayed(process_map)(checkins, grouped, config, begins[i-1], ends[i-1], \
          p, k, t_diff, s_diff) for i in range(len(begins), 0, -1))
    process_reduce(config, p, k, t_diff, s_diff)
    debug('Finished map-reduce for [p%d, k%d, t%d, d%.3f]' %
          (p, k, t_diff, s_diff))
Example #18
def sampling(X, y, ptype='original'):
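    """Resample (X, y) according to ptype: SMOTE oversampling ('over'), ENN
    undersampling ('under'), SMOTE+ENN ('combo'), or no change otherwise."""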
    if ptype == 'original':
        return (X, y)
    ### oversampling
    elif ptype == 'over':
        query_time = time.time()
        pp = SMOTE(kind='regular')
        X_pp, y_pp = pp.fit_sample(X, y)
        process_time = int(time.time() - query_time)
        debug('Finished sampling SMOTE in {} seconds'.format(process_time))
        return (X_pp, y_pp)
    ### undersampling
    elif ptype == 'under':
        query_time = time.time()
        pp = EditedNearestNeighbours()
        X_pp, y_pp = pp.fit_sample(X, y)
        process_time = int(time.time() - query_time)
        debug('Finished sampling ENN in {} seconds'.format(process_time))
        return (X_pp, y_pp)
    ### oversampling + undersampling
    elif ptype == 'combo':
        query_time = time.time()
        pp = SMOTEENN()
        X_pp, y_pp = pp.fit_sample(X, y)
        process_time = int(time.time() - query_time)
        debug('Finished sampling SMOTE-ENN in {} seconds'.format(process_time))
        return (X_pp, y_pp)
    return (X, y)
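A small self-contained check of the resampler above (toy data; it assumes an older imbalanced-learn release that still exposes `SMOTE(kind='regular')` and `fit_sample`, which is what the function calls):

import numpy as np
X = np.random.rand(100, 4)
y = np.array([0] * 90 + [1] * 10)               # imbalanced binary labels
X_over, y_over = sampling(X, y, ptype='over')   # SMOTE oversampling
debug('#samples before', len(y), 'after', len(y_over))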
Example #19
def extract_checkins(dataset_name, mode, config, id, filter):
    debug('Processing %s [%s] for each %s [filter=%s]' %
          (dataset_name, mode, id, filter))
    dataset_root = config['directory']['dataset']
    df, grouped = read_processed(dataset_root, dataset_name, mode, id, filter)
    debug('#checkins', len(df))
    if grouped is not None:
        debug('#%ss' % id, len(grouped))
    return df, grouped
Example #20
def run_sci(config):
    ### Read standardized data and perform preprocessing
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run SCI on Dataset', dataset_name, p, 'Mode', mode, k,
                  '#Core', n_core)
            ### Extracting checkins
            checkins, _ = extract_checkins(config, dataset_name, mode, 'user')
            stat_lp = extract_popularity(checkins, config, p, k)
            Parallel(n_jobs=n_core)(delayed(extract_colocation_features)(stat_lp, config, \
              p, k, t_diff, s_diff) for t_diff in t_diffs for s_diff in s_diffs)
            checkins.drop(checkins.index, inplace=True)
            del checkins
            gc.collect()
Example #21
def read_snap_stanford_checkin(root, dataset='gowalla', write=True):
    debug('Read SNAP Stanford Checkin %s' % dataset)
    df = pd.read_csv(
        '/'.join([root, dataset, RAW_CHECKIN_FILE]),
        header=None,
        names=['user', 'timestamp', 'latitude', 'longitude', 'location'])
    debug(df.describe(include='all'))
    debug(df.head())
    ### Create a datetime column as the index
    df['time'] = pd.to_datetime(df['timestamp'], unit='s')
    df = df.set_index('time')
    debug(df.head())
    ### Reordering columns
    df = df[final_column]
    ### Writing results to files
    if write is True:
        generate_results(root, dataset, df)
Example #22
def extract_pgt(config, p, k, t, d):
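    """Merge the personal (g1, g2), global (g3), and temporal (g4) PGT factors
    for (p, k, t, d) into one per-pair file labelled with the social tie, unless
    the output already exists; which factors are built is controlled by
    kwargs['pgt']['extract_pgt']."""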
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    pgt_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    if compressed is True:
        pgt_name = config['intermediate']['pgt']['pgt_output_compressed']
        compression = 'bz2'
    else:
        pgt_name = config['intermediate']['pgt']['pgt_output']
        compression = None
    intermediate_file = '/'.join(
        [pgt_root, dataset_names[p],
         pgt_name.format(p, k, t, d)])
    if is_file_exists(intermediate_file) is False:
        ### Extracting each feature
        if config['kwargs']['pgt']['extract_pgt']['run'] is True:
            g1 = None
            g2 = None
            g3 = None
            g4 = None
            if config['kwargs']['pgt']['extract_pgt']['personal'] is True:
                g1, g2 = personal_factor(config, p, k, t, d)  ### P in PGT
                debug('Finished loading personal factor', 'p', p, 'k', k, 't',
                      t, 'd', d)
            if config['kwargs']['pgt']['extract_pgt']['global'] is True:
                g3 = global_factor(config, p, k, t, d, g2)  ### PG in PGT
                debug('Finished loading global factor', 'p', p, 'k', k, 't', t,
                      'd', d)
            if config['kwargs']['pgt']['extract_pgt']['temporal'] is True:
                g4 = temporal_factor(config, p, k, t, d, g2)  ### PGT in PGT
                debug('Finished loading temporal factor', 'p', p, 'k', k, 't',
                      t, 'd', d)
            ### Merging all together
            if config['kwargs']['pgt']['extract_pgt']['merge'] is True:
                if g1 is not None and g2 is not None and g3 is not None and g4 is not None:
                    df = g1[['user1', 'user2',
                             'g1']].merge(g2[['user1', 'user2', 'g2']],
                                          on=['user1', 'user2'])
                    df = df.merge(g3[['user1', 'user2', 'g3']],
                                  on=['user1', 'user2'])
                    df = df.merge(g4[['user1', 'user2', 'g4']],
                                  on=['user1', 'user2'])
                    friend_df = extract_friendships(dataset_names[p], config)
                    df = determine_social_tie(df, friend_df)
                    df.to_csv(intermediate_file,
                              header=True,
                              index=False,
                              compression=compression)
Example #23
def test_colocation_stats():
    config_name = 'config_test.json'
    ### Started the program
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        friend_df = extract_friendships(dataset_name, config)
        for mode in modes:
            k = all_modes.index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    total_user = 0
                    total_friend = 0
                    total_colocation = 0
                    i = 0
                    for colocation_df in read_colocation_file(
                            config,
                            p,
                            k,
                            t,
                            d,
                            chunksize=10**6,
                            usecols=['user1', 'user2']):
                        colocation_df = determine_social_tie(
                            colocation_df, friend_df)
                        total_colocation += len(colocation_df)
                        colocation_df = colocation_df.drop_duplicates(
                            ['user1', 'user2'])
                        total_user += len(colocation_df)
                        total_friend += sum(colocation_df['link'])
                        i += 1
                        # debug('Processing chunks #%d' % i)
                    # debug('#colocations', total_colocation, '#total_user', total_user, '#total_friend', total_friend, 'p', p, 'k', k, 't', t, 'd', d)
                    debug(total_colocation, total_user, total_friend, p, k, t,
                          d)
                    gc.collect()
    debug('Finished Test on SCI+')
Example #24
def extract_colocation_features(stat_lp, config, p, k, t, d):
    debug('p', p, 'k', k, 't', t, 'd', d)
    ### Check if SCI intermediate exists
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    sci_root = config['directory']['sci']
    make_sure_path_exists('/'.join([sci_root, dataset_names[p]]))
    if compressed is True:
        sci_name = config['intermediate']['sci']['evaluation_compressed']
    else:
        sci_name = config['intermediate']['sci']['evaluation']
    sci_name = '/'.join(
        [sci_root, dataset_names[p],
         sci_name.format(p, k, t, d)])
    if is_file_exists(sci_name):
        debug('File %s exists' % sci_name)
    else:
        ### Read (original) friendship from file
        friend_df = extract_friendships(dataset_names[p], config)
        colocation_df = read_colocation_file(config, p, k, t, d)
        ### Find if the two users in the colocated check-ins are friends / stranger
        colocation_df = determine_social_tie(colocation_df, friend_df)
        debug('#colocations', len(colocation_df), 'p', p, 'k', k, 't', t, 'd',
              d)
        ### Find the stability value for each co-location pairs
        groups = colocation_df.groupby(['user1', 'user2', 'link'])
        grouped = aggregate_stats(groups, stat_lp, p, k, t, d)
        ### Write the result into a csv output
        write_statistics(grouped, config, p, k, t, d)

        ### Memory management
        del friend_df
        del colocation_df
        del grouped
    debug('Finished extract_colocation_features', 'p', p, 'k', k, 't', t, 'd',
          d)
Example #25
def read_foursquare2012_checkin(root, write=True):
    dataset = 'foursquare'
    debug('Read Checkin %s' % dataset)
    df = pd.read_csv('/'.join([root, dataset, RAW_CHECKIN_FILE]),
                     parse_dates=['time'])
    debug(df.describe(include='all'))
    debug(df.head())
    ### Create a UNIX timestamp column from the datetime format
    df['timestamp'] = df['time'].values.astype(np.int64) // 10**9
    ### Set the datetime as the index
    df = df.set_index('time')
    ### Reordering columns
    df = df[final_column]
    ### Error checking
    # odd = df.loc[df.longitude>-80, ['longitude', 'latitude']]
    ### Writing results to files
    if write is True:
        generate_results(root, dataset, df)
Example #26
def pgt_evaluation(config, p, k, t, d):
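    """Load the merged PGT feature file for (p, k, t, d), zero out inf/NaN
    values, evaluate the selected feature subsets via generate_report, and
    append the resulting rows to the PGT report CSV."""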
    debug('Evaluating PGT for p{}, k{}, t{}, d{}'.format(p, k, t, d))
    dataset_names = config['dataset']
    compressed = config['kwargs']['read_compressed']
    pgt_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    if compressed is True:
        pgt_name = config['intermediate']['pgt']['pgt_output_compressed']
    else:
        pgt_name = config['intermediate']['pgt']['pgt_output']
    evaluation_name = '/'.join(
        [pgt_root, dataset_names[p],
         pgt_name.format(p, k, t, d)])
    if is_file_exists(evaluation_name) is True:
        dataset = pd.read_csv(evaluation_name)
        # Format: 'user1', 'user2', 'g1', 'g2', 'g3', 'g4', 'link'
        X = dataset[['g1', 'g2', 'g3', 'g4']].values
        y = dataset[['link']].values
        ### Normalize unexpected values
        X[np.isinf(X)] = 0
        X[np.isnan(X)] = 0
        y[np.isinf(y)] = 0
        y[np.isnan(y)] = 0
        selected_feature_set = config['kwargs']['pgt_eval']['features']
        if selected_feature_set == 'all':
            notes = ["PGT+", "PGT", "P0", "P", "PG"]
            assign = [[0, 1, 2, 3], [3], [0], [1], [2]]
        else:  ### Summary only
            notes = ["PGT+", "PGT"]
            assign = [[0, 1, 2, 3], [3]]
        debug(notes, assign)
        texts = generate_report(config, X, y, assign, notes, p, k, t, d)
        del X, y
        report_directory = config['directory']['report']
        result_filename = '/'.join(
            [report_directory, 'PGT_result_p{}_k{}.csv'.format(p, k)])
        for text in texts:
            if text is not None:
                with open(result_filename, 'a') as fw:
                    fw.write(text + '\n')
    else:
        debug('File not found', evaluation_name)
Example #27
def generate_user_visit(config):
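    """Build a (user, location, visit_count) table joined with per-user and
    per-location check-in totals for every active dataset/mode and write it as
    a bz2 CSV, unless the output already exists."""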
    kwargs = config['kwargs']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    for dataset_name in datasets:
        p = config['dataset'].index(dataset_name)
        for mode in modes:
            k = config['mode'].index(mode)
            out_dir = '/'.join(
                [config['directory']['intermediate'], config['dataset'][p]])
            out_name = config['intermediate']['pgt']['user_visit'].format(
                config['mode'][k])
            final_name = '/'.join([out_dir, out_name])
            if is_file_exists(final_name):
                debug('File %s already exists' % final_name)
            else:
                df, _ = extract_checkins_all(dataset_name,
                                             mode,
                                             config,
                                             filter=True)
                visits = df.groupby(['user', 'location'
                                     ])['timestamp'].count().reset_index()
                visits.rename(columns={"timestamp": "visit_count"},
                              inplace=True)
                u_count = df.groupby('user')['timestamp'].count().reset_index()
                u_count.rename(columns={"timestamp": "user_count"},
                               inplace=True)
                v_count = df.groupby(
                    'location')['timestamp'].count().reset_index()
                v_count.rename(columns={"timestamp": "location_count"},
                               inplace=True)
                visits = visits.join(u_count,
                                     on='user',
                                     how='outer',
                                     rsuffix='r')
                visits = visits.join(v_count,
                                     on='location',
                                     how='outer',
                                     rsuffix='r')
                visits = visits[[
                    'user', 'location', 'visit_count', 'user_count',
                    'location_count'
                ]]
                visits.fillna(0, inplace=True)
                ### All of these must have the same amount
                debug('Total #Checkins', len(df))
                debug('#Total user visits', int(visits['visit_count'].sum()))
                debug(
                    '#Total user counts',
                    int(visits.drop_duplicates(['user'])['user_count'].sum()))
                debug(
                    '#Total location counts',
                    int(
                        visits.drop_duplicates(['location'
                                                ])['location_count'].sum()))
                visits.to_csv(final_name,
                              header=True,
                              index=False,
                              compression='bz2')
                del visits, df
                gc.collect()
Example #28
def generate_walk2friend(config):
    kwargs = config['kwargs']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = config['dataset'].index(dataset_name)
        for mode in modes:
            k = config['mode'].index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    output_dir = config['directory']['walk2friend']
                    make_sure_path_exists(output_dir)
                    debug('p', p, 'k', k, 't', t, 'd', d)
                    checkin_name = '/'.join([
                        output_dir,
                        '{}_{}_t{}_d{}.checkin'.format(dataset_name, mode, t,
                                                       d)
                    ])
                    friends_name = '/'.join([
                        output_dir,
                        '{}_{}_t{}_d{}.friends'.format(dataset_name, mode, t,
                                                       d)
                    ])
                    if is_file_exists(checkin_name) is False or is_file_exists(
                            friends_name) is False:
                        checkins, _ = extract_checkins_all(
                            dataset_name, mode, config)
                        friends = extract_friendships(dataset_name, config)
                        user_unique = []
                        for colocations in read_colocation_file(
                                config,
                                p,
                                k,
                                t,
                                d,
                                chunksize=10**6,
                                usecols=['user1', 'user2']):
                            user_unique.append(colocations['user1'].unique())
                            user_unique.append(colocations['user2'].unique())
                        # user_unique = np.array(user_unique)
                        user_unique = np.ravel(user_unique)
                        debug(user_unique)
                        user_unique = np.unique(user_unique)
                        debug('Before', '#checkins', len(checkins), '#friends',
                              len(friends))
                        checkins = checkins.loc[(
                            checkins['user'].isin(user_unique))]
                        friends = friends.loc[
                            (friends['user1'].isin(user_unique))
                            & (friends['user2'].isin(user_unique))]
                        debug('After', '#checkins', len(checkins), '#friends',
                              len(friends))
                        checkins.sort_values(['user', 'location'],
                                             inplace=True)
                        checkins.rename(columns={
                            "user": "******",
                            "location": "locid"
                        },
                                        inplace=True)
                        checkins['mid'] = range(len(checkins))
                        checkins = checkins[['mid', 'uid', 'locid']]
                        checkins.to_csv(checkin_name, index=False, header=True)
                        friends.rename(columns={
                            "user1": "u1",
                            "user2": "u2"
                        },
                                       inplace=True)
                        friends.sort_values(['u1', 'u2'], inplace=True)
                        friends = friends[['u1', 'u2']]
                        friends.to_csv(friends_name, index=False, header=True)
                        del user_unique
            gc.collect()
Example #29
def generate_colocation(checkins, grouped, config, p, k, t_diff, s_diff, start,
                        finish, write_per_instance):
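    """Pairwise co-location search over ids[start:finish]: candidate pairs are
    pruned with the pre-computed time ranges and bounding boxes in `grouped`
    (early-stopping after `skip_tolerance` consecutive skips), then temporal and
    spatial KD-tree radius queries are intersected to collect co-occurring
    check-ins. Returns the list of co-locations, or None when results are
    written per instance."""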
    colocations = []
    run_by = config['kwargs']['colocation']['run_by']
    if grouped is not None:
        ids = grouped[run_by].values
    counter = 0
    total_skip = 0
    is_debugging_colocation = config['kwargs']['colocation']['debug']
    skip_tolerance = config['kwargs']['colocation']['early_stop']
    for i in range(start, finish):
        consecutive_skip = 0
        if i < 0 or i >= len(ids):
            break
        u_i = ids[i]
        df_i = df_uid(checkins, u_i, config)
        if grouped is not None:
            stats_i = df_uid(grouped, u_i, config)
        si_tree = create_spatial_kd_tree(df_i)
        ti_tree = create_temporal_kd_tree(df_i)
        for j in range(i + 1, len(ids)):
            u_j = ids[j]
            if grouped is not None:
                stats_j = df_uid(grouped, u_j, config)
                ### If there are no intersections between two users' timestamp, then skip
                if stats_i['t_max'].values[0]+t_diff < stats_j['t_min'].values[0] \
                  or stats_j['t_max'].values[0]+t_diff < stats_i['t_min'].values[0]:
                    total_skip += 1
                    consecutive_skip += 1
                    del u_j, stats_j
                    if consecutive_skip > skip_tolerance and skip_tolerance > 0:
                        total_skip += len(ids) - j - 1
                        break
                    else:
                        continue
            df_j = df_uid(checkins, u_j, config)
            if grouped is not None:
                ### If the GPS coordinates have no intersections
                if( stats_i['lat_min'].values[0] > stats_j['lat_max'].values[0]+s_diff or \
                    stats_i['lat_max'].values[0]+s_diff < stats_j['lat_min'].values[0] or \
                    stats_i['lon_min'].values[0] > stats_j['lon_max'].values[0]+s_diff or \
                    stats_i['lon_max'].values[0]+s_diff < stats_j['lon_min'].values[0]
                ):
                    total_skip += 1
                    consecutive_skip += 1
                    del df_j, u_j, stats_j
                    if consecutive_skip > skip_tolerance and skip_tolerance > 0:
                        total_skip += len(ids) - j - 1
                        break
                    else:
                        continue
                else:
                    consecutive_skip = 0
            tj_tree = create_temporal_kd_tree(df_j)
            ### temporal co-occurrence
            t_idx = ti_tree.query_ball_tree(tj_tree, t_diff)
            t_count = sum(len(x) for x in t_idx)
            if t_count > 0:
                ### spatial co-occurrence
                sj_tree = create_spatial_kd_tree(df_j)
                s_idx = si_tree.query_ball_tree(sj_tree, s_diff)
                s_count = sum(len(x) for x in s_idx)
                ### Only if both temporal and spatial co-occurrence > 0
                if s_count > 0:
                    ### Finding the intersection and adding colocations to the list
                    result = extract_radius_search_results(
                        df_i, df_j, s_idx, t_idx)
                    if result is not None and len(result) > 0:
                        colocations.extend(result)
                        del result[:]
                        del result
                del s_idx, sj_tree
            del tj_tree, t_idx, df_j, u_j, stats_j
            ### For testing purpose
            if is_debugging_colocation is True and j > i + 11:
                break
        ### Prepare for the next iteration
        counter += 1
        if write_per_instance is True:
            if colocations is not None:
                if len(colocations) > 0:
                    write_colocation(colocations, config, p, k, t_diff, s_diff,
                                     start, finish)
                del colocations[:]
        ### Clear-up memory
        del u_i, df_i, si_tree, ti_tree, stats_i
        _ = gc.collect()
    del ids
    debug('Skipped', total_skip,
          'user pairs due to the missing time / spatial intersections')
    if write_per_instance is True:
        ### Delete the last colocations set if it is per-user
        if colocations is not None:
            del colocations[:]
            del colocations
            _ = gc.collect()
        return None
    else:
        return colocations
Example #30
def extract_colocation(config, ps, ks, t_diff_input, s_diff_input,
                       t_diff_targets, s_diff_targets):
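    """Derive tighter co-location files from an existing one: read the
    (t_diff_input, s_diff_input) CSV and, for each pair of target thresholds,
    keep the rows whose t_diff/s_diff fall below the targets and write them out
    bz2-compressed."""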
    ### Format: user1,user2,location1,location2,time1,time2,lat1,lon1,lat2,lon2,t_diff,s_diff
    working_directory = config['directory']['colocation']
    in_filename = config['intermediate']['colocation']['csv']
    out_filename = config['intermediate']['colocation']['compressed']
    for p in ps:
        for k in ks:
            debug(
                'Reading colocation file', '/'.join([
                    working_directory,
                    in_filename.format(p, k, t_diff_input, s_diff_input)
                ]))
            df = pd.read_csv('/'.join([
                working_directory,
                in_filename.format(p, k, t_diff_input, s_diff_input)
            ]))
            debug('Original colocation size', len(df))
            debug('t_diff', df['t_diff'].max(), 's_diff', df['s_diff'].max())
            for t_diff_target in t_diff_targets:
                df = df[(df['t_diff'] <= t_diff_target)]
                for s_diff_target in s_diff_targets:
                    output_final_name = '/'.join([
                        working_directory,
                        out_filename.format(p, k, t_diff_target, s_diff_target)
                    ])
                    df_temp = df[(df['s_diff'] <= s_diff_target)]
                    debug('Filtered colocation size', len(df_temp))
                    debug('t_diff', df_temp['t_diff'].max(), 's_diff',
                          df_temp['s_diff'].max())
                    debug('Writing colocation file', output_final_name)
                    df_temp.to_csv(output_final_name,
                                   index=False,
                                   compression='bz2')
                    del df_temp