def main(config_name='config.json'):
    ### Started the program
    debug('Started SCI+', config_name)
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']
    ### Co-location
    is_run = kwargs['colocation']['run']
    run_by = kwargs['colocation']['run_by']
    if is_run is True:
        ### Co-location generation
        run_colocation(config, run_by)
    ### SCI
    is_run = kwargs['sci']['run']
    if is_run is True:
        run_sci(config)
    ### SCI Evaluation
    is_run = kwargs['sci_eval']['run']
    if is_run is True:
        run_sci_eval(config)
    ### PGT
    is_run = kwargs['pgt']['run']
    if is_run is True:
        run_pgt(config)
    ### PGT Evaluation
    is_run = kwargs['pgt_eval']['run']
    if is_run is True:
        run_pgt_evaluation(config)
    ### Finished the program
    debug('Finished SCI+')
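### Illustrative sketch of the config.json fragment that main() and the run_* helpers
### read (hedged: the key names below come from this code, but the values shown are
### placeholders, not the project's actual configuration):
###   {
###     "kwargs": {
###       "n_core": 4,
###       "active_dataset": ["gowalla"],
###       "active_mode": ["<mode_name>"],
###       "ts": [1800],
###       "ds": [0],
###       "colocation": {"run": true, "run_by": "user"},
###       "sci": {"run": false},
###       "sci_eval": {"run": false},
###       "pgt": {"run": false},
###       "pgt_eval": {"run": false}
###     }
###   }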
def sort_colocation(config):
    kwargs = config['kwargs']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    is_read_compressed = kwargs['read_compressed']
    colocation_root = config['directory']['colocation']
    if is_read_compressed is False:
        colocation_name = config['intermediate']['colocation']['csv']
        compression = None
    else:
        colocation_name = config['intermediate']['colocation']['compressed']
        compression = 'bz2'
    for dataset_name in datasets:
        p = config['dataset'].index(dataset_name)
        for mode in modes:
            k = config['mode'].index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    colocation_df = read_colocation_file(config, p, k, t, d)
                    colocation_df.sort_values(
                        ['user1', 'user2', 'time1', 'time2', 'location1', 'location2'],
                        inplace=True)
                    colocation_fullname = '/'.join(
                        [colocation_root, colocation_name.format(p, k, t, d)])
                    remove_file_if_exists(colocation_fullname)
                    colocation_df.to_csv(colocation_fullname, index=False,
                                         header=True, compression=compression)
                    debug('Finished sorting %s' % colocation_fullname)
def extract_pi_loc_k(checkins, grouped, venues, user_visit, config, p, k, start,
                     finish, feature):
    ### Compute the p_i score (visit density for 'personal', location entropy for
    ### 'global') for the ids in the range [start, finish)
    pgt_part_root = config['directory']['pgt_temp']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_part_root, dataset_names[p]]))
    pgt_file_part = '/'.join([
        pgt_part_root, dataset_names[p],
        config['intermediate']['pgt']['%s_part' % feature].format(modes[k], start, finish)
    ])
    if is_file_exists(pgt_file_part) is True:
        return
    user_visit = user_visit[(user_visit['visit_count'] > 0)]
    # debug('start', start, 'finish', finish)
    if feature == 'personal':
        ids = grouped['user'].values
        grouping = 'user'
        result = pd.DataFrame(columns=['user', 'location', 'p_i'])
    elif feature == 'global':
        ids = grouped['location'].values
        grouping = 'location'
        result = pd.DataFrame(columns=['location', 'p_i'])
    t0 = time.time()
    for i in range(start, finish):
        u_i = ids[i]
        df = df_uid(checkins, u_i, config, grouping)
        visit_match = user_visit.isin({grouping: df[grouping].unique()})
        visit_temp = user_visit[visit_match[grouping]]
        if len(visit_temp) > 0:
            if feature == 'personal':
                ### Extract the p_i of each user's visit
                ### ('cd' is not a parameter of this function; it must exist at module scope)
                visit_temp['p_i'] = visit_temp.apply(
                    lambda x: calculate_density(x, cd, df, venues), axis=1)
                visit_temp = visit_temp[['user', 'location', 'p_i']]
                result = result.append(visit_temp, ignore_index=True)
            elif feature == 'global':
                ### Aggregate visits on each location
                aggregations = {
                    'user_count': {
                        'entropy': lambda x: calculate_entropy(x)
                    },
                }
                grouped = visit_temp.groupby(['location']).agg(aggregations)
                grouped.columns = ["_".join(x) for x in grouped.columns.ravel()]
                grouped.rename(columns={"user_count_entropy": "p_i"}, inplace=True)
                grouped.reset_index(inplace=True)
                # debug(grouped.columns.values)
                # debug(grouped.head())
                grouped = grouped[['location', 'p_i']]
                result = result.append(grouped, ignore_index=True)
    t1 = time.time()
    ### Writing to temp file
    if feature == 'personal':
        result.drop_duplicates(subset=['user', 'location'], inplace=True)
    result.to_csv(pgt_file_part, index=False, header=True)
    debug('Finished density calculation into %s in %s seconds' %
          (pgt_file_part, str(t1 - t0)))
def process_map(checkins, grouped, config, start, finish, p, k, t_diff=1800,
                s_diff=0, write_per_instance=True):
    ### Execute the mapping process
    debug('Process map [p%d, k%d, t%d, d%.3f, start%d, finish%d] has started' %
          (p, k, t_diff, s_diff, start, finish))
    t0 = time.time()
    colocations = generate_colocation(checkins, grouped, config, p, k, t_diff,
                                      s_diff, start, finish, write_per_instance)
    if write_per_instance is False:
        write_colocation(colocations, config, p, k, t_diff, s_diff, start, finish)
    if colocations is not None:
        del colocations[:]
        del colocations
    _ = gc.collect()
    elapsed = time.time() - t0
    debug('Process map [p%d, k%d, t%d, d%.3f, start%d, finish%d] finished in %s seconds' %
          (p, k, t_diff, s_diff, start, finish, elapsed))
def run_pgt_evaluation(config):
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    report_directory = config['directory']['report']
    make_sure_path_exists(report_directory)
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run PGT Evaluation on Dataset', dataset_name, p, 'Mode', mode, k,
                  '#Core', n_core)
            ### Creating the report file (text mode, since we write str lines)
            result_filename = '/'.join(
                [report_directory, 'PGT_result_p{}_k{}.csv'.format(p, k)])
            remove_file_if_exists(result_filename)
            with open(result_filename, 'a') as fw:
                fw.write('p,k,t,d,auc,precision,recall,f1,#friends,#data,feature_set,preprocessing\n')
            for t_diff in t_diffs:
                for s_diff in s_diffs:
                    pgt_evaluation(config, p, k, t_diff, s_diff)
                    gc.collect()
def parallel_sampling(config, p, k, t, d):
    debug('Start sampling', 'p', p, 'k', k, 't', t, 'd', d)
    kwargs = config['kwargs']
    is_read_compressed = kwargs['read_compressed']
    colocation_root = config['directory']['colocation']
    make_sure_path_exists(colocation_root)
    if is_read_compressed is False:
        sample_name = config['intermediate']['colocation']['sample_csv']
        compression = None
    else:
        sample_name = config['intermediate']['colocation']['sample_compressed']
        compression = 'bz2'
    sample_rate = kwargs['preprocessing']['sampling']['rate']
    sample_fullname = '/'.join(
        [colocation_root, sample_name.format(p, k, t, d, sample_rate)])
    df = read_colocation_file(config, p, k, t, d)
    df = df.sample(frac=sample_rate, random_state=1)
    df.to_csv(sample_fullname, header=True, index=False,
              compression=compression, mode='w')
    debug('Finished sampling', 'p', p, 'k', k, 't', t, 'd', d, '#sample: ', len(df))
def run_pgt(config):
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run PGT Extraction on Dataset', dataset_name, p, 'Mode', mode, k,
                  '#Core', n_core)
            if kwargs['pgt']['personal']['run']:
                extract_personal_pgt(config, p, k)
            if kwargs['pgt']['global']['run']:
                extract_global_pgt(config, p, k)
            if kwargs['pgt']['extract_pgt']['temporal'] is False:
                Parallel(n_jobs=n_core)(
                    delayed(extract_pgt)(config, p, k, t_diff, s_diff)
                    for t_diff in t_diffs for s_diff in s_diffs)
                gc.collect()
            else:
                for t_diff in t_diffs:
                    for s_diff in s_diffs:
                        extract_pgt(config, p, k, t_diff, s_diff)
                        gc.collect()
def auc_score(config, X, y, ptype='original'):
    kfold = config['kwargs']['sci_eval']['kfold']
    n_core = config['kwargs']['n_core']
    cv = StratifiedKFold(n_splits=kfold)
    clf = RandomForestClassifier(n_jobs=n_core)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    mean_precision = 0.0
    mean_recall = 0.0
    mean_f1 = 0.0
    mean_auc = 0.0
    total_ytrue = int(np.sum(y))  ### number of positive (friend) samples
    i = 0
    success = 0
    for (train, test) in cv.split(X, y):
        X_pp, y_pp = sampling(X[train], y[train], ptype)
        fit = clf.fit(X_pp, y_pp)
        probas_ = fit.predict_proba(X[test])
        inference = fit.predict(X[test])
        try:
            # Compute ROC curve and the area under the curve
            # fpr, tpr, thresholds
            fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
            mean_tpr += interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0
            roc_auc = auc(fpr, tpr)
            mean_auc += roc_auc
            # precision, recall, thresholds = precision_recall_curve(y[test], probas_[:, 1])
            average = 'weighted'
            precision = precision_score(y[test], inference, average=average)
            recall = recall_score(y[test], inference, average=average)
            f1 = f1_score(y[test], inference, average=average)
            mean_precision += precision
            mean_recall += recall
            mean_f1 += f1
            success += 1
        except Exception:
            pass
        i += 1
    mean_tpr /= success
    mean_tpr[-1] = 1.0
    # mean_auc = auc(mean_fpr, mean_tpr)
    mean_precision /= success
    mean_recall /= success
    mean_f1 /= success
    mean_auc /= success
    debug('{:.3f} {:.3f} {:.3f} {:.3f} {} {}'.format(mean_auc, mean_precision,
                                                     mean_recall, mean_f1,
                                                     total_ytrue, len(y)))
    return mean_auc, mean_precision, mean_recall, mean_f1, total_ytrue
def main():
    n_args = len(sys.argv)
    config_name = 'config.json'
    if n_args > 1:
        config_name = sys.argv[1]
        if is_file_exists(config_name) is False:
            config_name = 'config.json'
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']
    debug('Started Preprocessing', config_name)
    ### Read original data and generate standardized data
    if kwargs['preprocessing']['run_extraction'] is True:
        if kwargs['preprocessing']['read_original'] is True:
            dataset_root = config['directory']['dataset']
            preprocess_data(dataset_root)
    ### Extract user visit from co-location
    if kwargs['preprocessing']['user_visit'] is True:
        generate_user_visit(config)
    ### Sorting co-location based on several criteria
    if kwargs['preprocessing']['sort_colocation'] is True:
        sort_colocation(config)
    ### Generating check-ins based on co-locations -- for walk2friend evaluation
    if kwargs['preprocessing']['walk2friend'] is True:
        generate_walk2friend(config)
    ### Generating sampled co-location (for testing purpose)
    if kwargs['preprocessing']['sampling']['run'] is True:
        sampling_colocation(config)
def run_colocation(config, run_by):
    ### Read standardized data and perform preprocessing
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    n_core = kwargs['n_core']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    skip_tolerance = kwargs['colocation']['early_stop']
    debug('early_stop', skip_tolerance)
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run co-location on Dataset', dataset_name, p, 'Mode', mode, k,
                  '#Core', n_core)
            ### Extracting checkins
            checkins, grouped = extract_checkins(config, dataset_name, mode, run_by)
            for t_diff in t_diffs:
                for s_diff in s_diffs:
                    if run_by in ('user', 'location'):
                        map_reduce_colocation(config, checkins, grouped, p, k,
                                              t_diff, s_diff)
                    else:
                        map_reduce_colocation_kdtree(checkins, config, p, k,
                                                     t_diff, s_diff)
            checkins.drop(checkins.index, inplace=True)
            del checkins
            if grouped is not None:
                grouped.drop(grouped.index, inplace=True)
                del grouped
            gc.collect()
def read_colocation_file(config, p, k, t, d, chunksize=None, usecols=None):
    ### Read co-location from file
    colocation_root = config['directory']['colocation']
    colocation_fullname = None
    is_read_compressed = config['kwargs']['read_compressed']
    is_read_sampled = config['kwargs']['colocation']['sampling']['use_sampling']
    if is_read_sampled is True:
        sample_rate = config['kwargs']['colocation']['sampling']['rate']
        if is_read_compressed is False:
            colocation_name = config['intermediate']['colocation']['sample_csv']
        else:
            colocation_name = config['intermediate']['colocation']['sample_compressed']
        colocation_fullname = '/'.join(
            [colocation_root, colocation_name.format(p, k, t, d, sample_rate)])
        if is_file_exists(colocation_fullname) is False:
            colocation_fullname = None
    if colocation_fullname is None:
        if is_read_compressed is False:
            colocation_name = config['intermediate']['colocation']['csv']
        else:
            colocation_name = config['intermediate']['colocation']['compressed']
        colocation_fullname = '/'.join(
            [colocation_root, colocation_name.format(p, k, t, d)])
    colocation_dtypes = {
        'user1': np.int_, 'user2': np.int_,
        'location1': np.int_, 'location2': np.int_,
        'time1': np.int_, 'time2': np.int_,
        'lat1': np.float_, 'lon1': np.float_,
        'lat2': np.float_, 'lon2': np.float_,
        't_diff': np.int_, 's_diff': np.float_
    }
    debug('Read colocation file', colocation_fullname)
    if chunksize is None:
        colocation_df = pd.read_csv(colocation_fullname, dtype=colocation_dtypes,
                                    usecols=usecols)
    else:
        colocation_df = pd.read_csv(colocation_fullname, dtype=colocation_dtypes,
                                    chunksize=chunksize, usecols=usecols)
    return colocation_df
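### Usage sketch (illustrative, assuming a config object loaded via read_config):
### the co-location file can be streamed in chunks to keep memory bounded, as done in
### test_colocation_stats and generate_walk2friend below:
###     for chunk in read_colocation_file(config, p, k, t, d,
###                                       chunksize=10**6,
###                                       usecols=['user1', 'user2']):
###         handle_chunk(chunk)   # 'handle_chunk' is a placeholder for the per-chunk work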
def main():
    debug('Started extracting partial colocation')
    ### Read config
    config = read_config()
    config_partial = config['kwargs']['partial_colocation']
    ps = config_partial['p']
    ks = config_partial['k']
    t_input = config_partial['t_input']
    d_input = config_partial['d_input']
    t_targets = config_partial['t_target']
    d_targets = config_partial['d_target']
    extract_colocation(config, ps, ks, t_input, d_input, t_targets, d_targets)
    debug('Finished extracting partial colocation')
def generating_walk2friend_data():
    ### Started the program
    config_name = 'config_test.json'
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    directory = config['directory']['intermediate']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run Test on Dataset', dataset_name, p, 'Mode', mode, k)
            ### Test extract check-ins
            checkins, _ = extract_checkins_all(dataset_name, mode, config)
            checkins.sort_values(["user", "timestamp"], inplace=True)
            checkins['mid'] = range(1, len(checkins) + 1)
            checkins.rename(columns={"user": "uid", "location": "locid"}, inplace=True)
            checkins = checkins[['mid', 'uid', 'locid']]
            checkins.to_csv('/'.join(
                [directory, '%s_%s_10.checkin' % (dataset_name, mode)]),
                index=False, header=True)
            ### Test extract friendships
            friend_df = extract_friendships(dataset_name, config)
            friend_df.sort_values(["user1", "user2"], inplace=True)
            friend_df.rename(columns={"user1": "u1", "user2": "u2"}, inplace=True)
            friend_df.to_csv('/'.join(
                [directory, '%s_%s_10.friends' % (dataset_name, mode)]),
                index=False, header=True)
def prepare_colocation(config, p, k, t_diff, s_diff, begins, ends):
    working_directory = config['directory']['colocation']
    filename = config['intermediate']['colocation']['part']
    dataset_name = config['dataset'][p]
    make_sure_path_exists('/'.join([working_directory, dataset_name]))
    clear_dir = config['kwargs']['colocation']['clear_dir']
    if clear_dir is True:
        remove_all_files('/'.join([working_directory, dataset_name]))
    ### Prepare the files
    for i in range(len(begins)):
        with open('/'.join([
                working_directory, dataset_name,
                filename.format(p, k, t_diff, s_diff, begins[i], ends[i])
        ]), 'wb'):
            pass
    debug('Each colocation part file has been created')
def execute_parallel_st_tree_single(checkins, config, st_tree, data, p, k, t_diff,
                                    s_diff, start, finish):
    t0 = time.time()
    idx = st_tree.query_radius(data, 1)
    count = sum(len(x) for x in idx)
    if count > 0:
        colocations = extract_spatiotemporal_search_results(checkins, idx, start)
        write_colocation(colocations, config, p, k, t_diff, s_diff, start, finish)
        if colocations is not None:
            del colocations
    elapsed = time.time() - t0
    del idx
    gc.collect()
    debug('Process map [p%d, k%d, t%d, d%.3f, start%d, finish%d] finished in %s seconds' %
          (p, k, t_diff, s_diff, start, finish, elapsed))
def generate_report(config, X, y, assign, notes, p, k, t, d):
    texts = []
    names = config['kwargs']['sci_eval']['sampling']
    ### Build each feature subset once; the column selections are identical for every
    ### sampling name
    Xs = []
    for arr in assign:
        X_indexed = X[:, arr]
        Xs.append(X_indexed)
    for i in range(len(names)):
        name = names[i]
        debug('Evaluating {}'.format(name))
        for idx in range(len(Xs)):
            Xi = Xs[idx]
            debug('Feature {}'.format(notes[idx]))
            mean_auc, mean_precision, mean_recall, mean_f1, total_ytrue = auc_score(
                config, Xi, y, name)
            text = '{},{},{},{},{:.9f},{:.9f},{:.9f},{:.9f},{},{},{},{}'.format(
                p, k, t, d, mean_auc, mean_precision, mean_recall, mean_f1,
                total_ytrue, len(y), notes[idx], name)
            texts.append(text)
    return texts
def map_reduce_colocation(config, checkins, grouped, p, k, t_diff, s_diff):
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    start = kwargs['colocation']['start']
    finish = kwargs['colocation']['finish']
    order = kwargs['colocation']['order']
    ### For the sake of parallelization
    begins, ends = init_begin_end(n_core, len(grouped), start=start, finish=finish)
    debug('Begins', begins)
    debug('Ends', ends)
    ### Generate colocation based on extracted checkins
    prepare_colocation(config, p, k, t_diff, s_diff, begins, ends)
    if order == 'ascending':
        ### Start from the bottom
        Parallel(n_jobs=n_core)(
            delayed(process_map)(checkins, grouped, config, begins[i], ends[i],
                                 p, k, t_diff, s_diff)
            for i in range(len(begins)))
    else:
        ### Start from the top
        Parallel(n_jobs=n_core)(
            delayed(process_map)(checkins, grouped, config, begins[i - 1], ends[i - 1],
                                 p, k, t_diff, s_diff)
            for i in range(len(begins), 0, -1))
    process_reduce(config, p, k, t_diff, s_diff)
    debug('Finished map-reduce for [p%d, k%d, t%d, d%.3f]' % (p, k, t_diff, s_diff))
def sampling(X, y, ptype='original'):
    if ptype == 'original':
        return (X, y)
    ### oversampling
    elif ptype == 'over':
        query_time = time.time()
        pp = SMOTE(kind='regular')
        X_pp, y_pp = pp.fit_sample(X, y)
        process_time = int(time.time() - query_time)
        debug('Finished sampling SMOTE in {} seconds'.format(process_time))
        return (X_pp, y_pp)
    ### undersampling
    elif ptype == 'under':
        query_time = time.time()
        pp = EditedNearestNeighbours()
        X_pp, y_pp = pp.fit_sample(X, y)
        process_time = int(time.time() - query_time)
        debug('Finished sampling ENN in {} seconds'.format(process_time))
        return (X_pp, y_pp)
    ### oversampling + undersampling
    elif ptype == 'combo':
        query_time = time.time()
        pp = SMOTEENN()
        X_pp, y_pp = pp.fit_sample(X, y)
        process_time = int(time.time() - query_time)
        debug('Finished sampling SMOTE-ENN in {} seconds'.format(process_time))
        return (X_pp, y_pp)
    return (X, y)
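### Usage sketch (illustrative): the ptype names map to the imbalanced-learn resamplers
### constructed above; 'original' returns the data untouched. X_train/y_train are
### placeholder names for a training fold.
###     X_res, y_res = sampling(X_train, y_train, ptype='over')   # SMOTE oversampling
###     X_res, y_res = sampling(X_train, y_train, ptype='under')  # EditedNearestNeighbours
###     X_res, y_res = sampling(X_train, y_train, ptype='combo')  # SMOTEENN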
def extract_checkins(dataset_name, mode, config, id, filter):
    debug('Processing %s [%s] for each %s [filter=%s]' %
          (dataset_name, mode, id, filter))
    dataset_root = config['directory']['dataset']
    df, grouped = read_processed(dataset_root, dataset_name, mode, id, filter)
    debug('#checkins', len(df))
    if grouped is not None:
        debug('#%ss' % id, len(grouped))
    return df, grouped
def run_sci(config):
    ### Read standardized data and perform preprocessing
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run SCI on Dataset', dataset_name, p, 'Mode', mode, k,
                  '#Core', n_core)
            ### Extracting checkins
            checkins, _ = extract_checkins(config, dataset_name, mode, 'user')
            stat_lp = extract_popularity(checkins, config, p, k)
            Parallel(n_jobs=n_core)(
                delayed(extract_colocation_features)(stat_lp, config, p, k, t_diff, s_diff)
                for t_diff in t_diffs for s_diff in s_diffs)
            checkins.drop(checkins.index, inplace=True)
            del checkins
            gc.collect()
def read_snap_stanford_checkin(root, dataset='gowalla', write=True):
    debug('Read SNAP Stanford Checkin %s' % dataset)
    df = pd.read_csv(
        '/'.join([root, dataset, RAW_CHECKIN_FILE]),
        header=None,
        names=['user', 'timestamp', 'latitude', 'longitude', 'location'])
    debug(df.describe(include='all'))
    debug(df.head())
    ### Create a datetime column as the index
    df['time'] = pd.to_datetime(df['timestamp'], unit='s')
    df = df.set_index('time')
    debug(df.head())
    ### Reordering columns
    df = df[final_column]
    ### Writing results to files
    if write is True:
        generate_results(root, dataset, df)
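### Illustrative input row for RAW_CHECKIN_FILE (hedged: the values are made up; what
### the code above expects is a headerless CSV whose columns are read as
### user, timestamp, latitude, longitude, location, with the timestamp in UNIX seconds):
###     196514,1287530127,39.747652,-104.992510,22847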
def extract_pgt(config, p, k, t, d):
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    pgt_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    if compressed is True:
        pgt_name = config['intermediate']['pgt']['pgt_output_compressed']
        compression = 'bz2'
    else:
        pgt_name = config['intermediate']['pgt']['pgt_output']
        compression = None
    intermediate_file = '/'.join(
        [pgt_root, dataset_names[p], pgt_name.format(p, k, t, d)])
    if is_file_exists(intermediate_file) is False:
        ### Extracting each feature
        if config['kwargs']['pgt']['extract_pgt']['run'] is True:
            g1 = None
            g2 = None
            g3 = None
            g4 = None
            if config['kwargs']['pgt']['extract_pgt']['personal'] is True:
                g1, g2 = personal_factor(config, p, k, t, d)  ### P in PGT
                debug('Finished loading personal factor', 'p', p, 'k', k, 't', t, 'd', d)
            if config['kwargs']['pgt']['extract_pgt']['global'] is True:
                g3 = global_factor(config, p, k, t, d, g2)  ### PG in PGT
                debug('Finished loading global factor', 'p', p, 'k', k, 't', t, 'd', d)
            if config['kwargs']['pgt']['extract_pgt']['temporal'] is True:
                g4 = temporal_factor(config, p, k, t, d, g2)  ### PGT in PGT
                debug('Finished loading temporal factor', 'p', p, 'k', k, 't', t, 'd', d)
            ### Merging all together
            if config['kwargs']['pgt']['extract_pgt']['merge'] is True:
                if g1 is not None and g2 is not None and g3 is not None and g4 is not None:
                    df = g1[['user1', 'user2', 'g1']].merge(
                        g2[['user1', 'user2', 'g2']], on=['user1', 'user2'])
                    df = df.merge(g3[['user1', 'user2', 'g3']], on=['user1', 'user2'])
                    df = df.merge(g4[['user1', 'user2', 'g4']], on=['user1', 'user2'])
                    friend_df = extract_friendships(dataset_names[p], config)
                    df = determine_social_tie(df, friend_df)
                    df.to_csv(intermediate_file, header=True, index=False,
                              compression=compression)
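### Note: the merged PGT intermediate written above has the columns
### user1, user2, g1, g2, g3, g4, link -- pgt_evaluation below reads this file back.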
def test_colocation_stats():
    ### Started the program
    config_name = 'config_test.json'
    debug('Started Test test_checkin_stats on SCI+', config_name)
    ### Read config
    config = read_config(config_name)
    kwargs = config['kwargs']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        friend_df = extract_friendships(dataset_name, config)
        for mode in modes:
            k = all_modes.index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    total_user = 0
                    total_friend = 0
                    total_colocation = 0
                    i = 0
                    for colocation_df in read_colocation_file(
                            config, p, k, t, d, chunksize=10**6,
                            usecols=['user1', 'user2']):
                        colocation_df = determine_social_tie(colocation_df, friend_df)
                        total_colocation += len(colocation_df)
                        colocation_df = colocation_df.drop_duplicates(['user1', 'user2'])
                        total_user += len(colocation_df)
                        total_friend += sum(colocation_df['link'])
                        i += 1
                        # debug('Processing chunks #%d' % i)
                    # debug('#colocations', total_colocation, '#total_user', total_user,
                    #       '#total_friend', total_friend, 'p', p, 'k', k, 't', t, 'd', d)
                    debug(total_colocation, total_user, total_friend, p, k, t, d)
                    gc.collect()
    debug('Finished Test on SCI+')
def extract_colocation_features(stat_lp, config, p, k, t, d):
    debug('p', p, 'k', k, 't', t, 'd', d)
    ### Check if SCI intermediate exists
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    sci_root = config['directory']['sci']
    make_sure_path_exists('/'.join([sci_root, dataset_names[p]]))
    if compressed is True:
        sci_name = config['intermediate']['sci']['evaluation_compressed']
    else:
        sci_name = config['intermediate']['sci']['evaluation']
    sci_name = '/'.join([sci_root, dataset_names[p], sci_name.format(p, k, t, d)])
    if is_file_exists(sci_name):
        debug('File %s exists' % sci_name)
    else:
        ### Read (original) friendship from file
        friend_df = extract_friendships(dataset_names[p], config)
        colocation_df = read_colocation_file(config, p, k, t, d)
        ### Find whether the two users in the colocated check-ins are friends / strangers
        colocation_df = determine_social_tie(colocation_df, friend_df)
        debug('#colocations', len(colocation_df), 'p', p, 'k', k, 't', t, 'd', d)
        ### Find the stability value for each co-location pair
        groups = colocation_df.groupby(['user1', 'user2', 'link'])
        grouped = aggregate_stats(groups, stat_lp, p, k, t, d)
        ### Write the result into a csv output
        write_statistics(grouped, config, p, k, t, d)
        ### Memory management
        del friend_df
        del colocation_df
        del grouped
    debug('Finished extract_colocation_features', 'p', p, 'k', k, 't', t, 'd', d)
def read_foursquare2012_checkin(root, write=True):
    dataset = 'foursquare'
    debug('Read Checkin %s' % dataset)
    df = pd.read_csv('/'.join([root, dataset, RAW_CHECKIN_FILE]),
                     parse_dates=['time'])
    debug(df.describe(include='all'))
    debug(df.head())
    ### Create a UNIX timestamp column from the datetime format
    df['timestamp'] = df['time'].values.astype(np.int64) // 10**9
    ### Set the datetime as the index
    df = df.set_index('time')
    ### Reordering columns
    df = df[final_column]
    ### Error checking
    # odd = df.loc[df.longitude>-80, ['longitude', 'latitude']]
    ### Writing results to files
    if write is True:
        generate_results(root, dataset, df)
def pgt_evaluation(config, p, k, t, d):
    debug('Evaluating PGT for p{}, k{}, t{}, d{}'.format(p, k, t, d))
    dataset_names = config['dataset']
    compressed = config['kwargs']['read_compressed']
    pgt_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    if compressed is True:
        pgt_name = config['intermediate']['pgt']['pgt_output_compressed']
    else:
        pgt_name = config['intermediate']['pgt']['pgt_output']
    evaluation_name = '/'.join(
        [pgt_root, dataset_names[p], pgt_name.format(p, k, t, d)])
    if is_file_exists(evaluation_name) is True:
        dataset = pd.read_csv(evaluation_name)
        # Format: 'user1', 'user2', 'g1', 'g2', 'g3', 'g4', 'link'
        X = dataset[['g1', 'g2', 'g3', 'g4']].values
        y = dataset[['link']].values
        ### Normalize unexpected values
        X[np.isinf(X)] = 0
        X[np.isnan(X)] = 0
        y[np.isinf(y)] = 0
        y[np.isnan(y)] = 0
        selected_feature_set = config['kwargs']['pgt_eval']['features']
        if selected_feature_set == 'all':
            notes = ["PGT+", "PGT", "P0", "P", "PG"]
            assign = [[0, 1, 2, 3], [3], [0], [1], [2]]
        else:
            ### Summary only
            notes = ["PGT+", "PGT"]
            assign = [[0, 1, 2, 3], [3]]
        debug(notes, assign)
        texts = generate_report(config, X, y, assign, notes, p, k, t, d)
        del X, y
        report_directory = config['directory']['report']
        result_filename = '/'.join(
            [report_directory, 'PGT_result_p{}_k{}.csv'.format(p, k)])
        for text in texts:
            if text is not None:
                with open(result_filename, 'a') as fw:
                    fw.write(text + '\n')
    else:
        debug('File not found', evaluation_name)
def generate_user_visit(config):
    kwargs = config['kwargs']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    for dataset_name in datasets:
        p = config['dataset'].index(dataset_name)
        for mode in modes:
            k = config['mode'].index(mode)
            out_dir = '/'.join(
                [config['directory']['intermediate'], config['dataset'][p]])
            out_name = config['intermediate']['pgt']['user_visit'].format(
                config['mode'][k])
            final_name = '/'.join([out_dir, out_name])
            if is_file_exists(final_name):
                debug('File %s already exists' % final_name)
            else:
                df, _ = extract_checkins_all(dataset_name, mode, config, filter=True)
                visits = df.groupby(['user', 'location'])['timestamp'].count().reset_index()
                visits.rename(columns={"timestamp": "visit_count"}, inplace=True)
                u_count = df.groupby('user')['timestamp'].count().reset_index()
                u_count.rename(columns={"timestamp": "user_count"}, inplace=True)
                v_count = df.groupby('location')['timestamp'].count().reset_index()
                v_count.rename(columns={"timestamp": "location_count"}, inplace=True)
                visits = visits.join(u_count, on='user', how='outer', rsuffix='r')
                visits = visits.join(v_count, on='location', how='outer', rsuffix='r')
                visits = visits[['user', 'location', 'visit_count', 'user_count',
                                 'location_count']]
                visits.fillna(0, inplace=True)
                ### All of these totals must be identical
                debug('Total #Checkins', len(df))
                debug('#Total user visits', int(visits['visit_count'].sum()))
                debug('#Total user counts',
                      int(visits.drop_duplicates(['user'])['user_count'].sum()))
                debug('#Total location counts',
                      int(visits.drop_duplicates(['location'])['location_count'].sum()))
                visits.to_csv(final_name, header=True, index=False, compression='bz2')
                del visits, df
                gc.collect()
def generate_walk2friend(config):
    kwargs = config['kwargs']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = config['dataset'].index(dataset_name)
        for mode in modes:
            k = config['mode'].index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    output_dir = config['directory']['walk2friend']
                    make_sure_path_exists(output_dir)
                    debug('p', p, 'k', k, 't', t, 'd', d)
                    checkin_name = '/'.join([
                        output_dir,
                        '{}_{}_t{}_d{}.checkin'.format(dataset_name, mode, t, d)
                    ])
                    friends_name = '/'.join([
                        output_dir,
                        '{}_{}_t{}_d{}.friends'.format(dataset_name, mode, t, d)
                    ])
                    if is_file_exists(checkin_name) is False or \
                            is_file_exists(friends_name) is False:
                        checkins, _ = extract_checkins_all(dataset_name, mode, config)
                        friends = extract_friendships(dataset_name, config)
                        user_unique = []
                        for colocations in read_colocation_file(
                                config, p, k, t, d, chunksize=10**6,
                                usecols=['user1', 'user2']):
                            user_unique.append(colocations['user1'].unique())
                            user_unique.append(colocations['user2'].unique())
                        # user_unique = np.array(user_unique)
                        user_unique = np.ravel(user_unique)
                        debug(user_unique)
                        user_unique = np.unique(user_unique)
                        debug('Before', '#checkins', len(checkins),
                              '#friends', len(friends))
                        checkins = checkins.loc[(checkins['user'].isin(user_unique))]
                        friends = friends.loc[(friends['user1'].isin(user_unique)) &
                                              (friends['user2'].isin(user_unique))]
                        debug('After', '#checkins', len(checkins),
                              '#friends', len(friends))
                        checkins.sort_values(['user', 'location'], inplace=True)
                        checkins.rename(columns={"user": "uid", "location": "locid"},
                                        inplace=True)
                        checkins['mid'] = range(len(checkins))
                        checkins = checkins[['mid', 'uid', 'locid']]
                        checkins.to_csv(checkin_name, index=False, header=True)
                        friends.rename(columns={"user1": "u1", "user2": "u2"},
                                       inplace=True)
                        friends.sort_values(['u1', 'u2'], inplace=True)
                        friends = friends[['u1', 'u2']]
                        friends.to_csv(friends_name, index=False, header=True)
                        del user_unique
                        gc.collect()
def generate_colocation(checkins, grouped, config, p, k, t_diff, s_diff, start,
                        finish, write_per_instance):
    ### Generate co-location pairs for the ids in [start, finish), pruning pairs whose
    ### time ranges or bounding boxes cannot overlap before running the kd-tree queries
    colocations = []
    run_by = config['kwargs']['colocation']['run_by']
    if grouped is not None:
        ids = grouped[run_by].values
    counter = 0
    total_skip = 0
    is_debugging_colocation = config['kwargs']['colocation']['debug']
    skip_tolerance = config['kwargs']['colocation']['early_stop']
    for i in range(start, finish):
        consecutive_skip = 0
        if i < 0 or i >= len(ids):
            break
        u_i = ids[i]
        df_i = df_uid(checkins, u_i, config)
        if grouped is not None:
            stats_i = df_uid(grouped, u_i, config)
        si_tree = create_spatial_kd_tree(df_i)
        ti_tree = create_temporal_kd_tree(df_i)
        for j in range(i + 1, len(ids)):
            u_j = ids[j]
            if grouped is not None:
                stats_j = df_uid(grouped, u_j, config)
                ### If there is no intersection between the two users' timestamps, then skip
                if stats_i['t_max'].values[0] + t_diff < stats_j['t_min'].values[0] \
                        or stats_j['t_max'].values[0] + t_diff < stats_i['t_min'].values[0]:
                    total_skip += 1
                    consecutive_skip += 1
                    del u_j, stats_j
                    if consecutive_skip > skip_tolerance and skip_tolerance > 0:
                        total_skip += len(ids) - j - 1
                        break
                    else:
                        continue
            df_j = df_uid(checkins, u_j, config)
            if grouped is not None:
                ### If the GPS bounding boxes have no intersection, then skip
                if stats_i['lat_min'].values[0] > stats_j['lat_max'].values[0] + s_diff or \
                        stats_i['lat_max'].values[0] + s_diff < stats_j['lat_min'].values[0] or \
                        stats_i['lon_min'].values[0] > stats_j['lon_max'].values[0] + s_diff or \
                        stats_i['lon_max'].values[0] + s_diff < stats_j['lon_min'].values[0]:
                    total_skip += 1
                    consecutive_skip += 1
                    del df_j, u_j, stats_j
                    if consecutive_skip > skip_tolerance and skip_tolerance > 0:
                        total_skip += len(ids) - j - 1
                        break
                    else:
                        continue
                else:
                    consecutive_skip = 0
            tj_tree = create_temporal_kd_tree(df_j)
            ### temporal co-occurrence
            t_idx = ti_tree.query_ball_tree(tj_tree, t_diff)
            t_count = sum(len(x) for x in t_idx)
            if t_count > 0:
                ### spatial co-occurrence
                sj_tree = create_spatial_kd_tree(df_j)
                s_idx = si_tree.query_ball_tree(sj_tree, s_diff)
                s_count = sum(len(x) for x in s_idx)
                ### Only if both temporal and spatial co-occurrence > 0
                if s_count > 0:
                    ### Finding the intersection and adding colocations to the list
                    result = extract_radius_search_results(df_i, df_j, s_idx, t_idx)
                    if result is not None and len(result) > 0:
                        colocations.extend(result)
                        del result[:]
                        del result
                del s_idx, sj_tree
            del tj_tree, t_idx, df_j, u_j, stats_j
            ### For testing purpose
            if is_debugging_colocation is True and j > i + 11:
                break
        ### Prepare for the next iteration
        counter += 1
        if write_per_instance is True:
            if colocations is not None:
                if len(colocations) > 0:
                    write_colocation(colocations, config, p, k, t_diff, s_diff,
                                     start, finish)
                del colocations[:]
        ### Clear-up memory
        del u_i, df_i, si_tree, ti_tree, stats_i
        _ = gc.collect()
    del ids
    debug('Skipped', total_skip,
          'user pairs due to missing time / spatial intersections')
    if write_per_instance is True:
        ### Delete the last colocations set if it is written per instance
        if colocations is not None:
            del colocations[:]
            del colocations
        _ = gc.collect()
        return None
    else:
        return colocations
def extract_colocation(config, ps, ks, t_diff_input, s_diff_input, t_diff_targets,
                       s_diff_targets):
    ### Format: user1,user2,location1,location2,time1,time2,lat1,lon1,lat2,lon2,t_diff,s_diff
    working_directory = config['directory']['colocation']
    in_filename = config['intermediate']['colocation']['csv']
    out_filename = config['intermediate']['colocation']['compressed']
    for p in ps:
        for k in ks:
            debug('Reading colocation file',
                  '/'.join([working_directory,
                            in_filename.format(p, k, t_diff_input, s_diff_input)]))
            df = pd.read_csv('/'.join(
                [working_directory,
                 in_filename.format(p, k, t_diff_input, s_diff_input)]))
            debug('Original colocation size', len(df))
            debug('t_diff', df['t_diff'].max(), 's_diff', df['s_diff'].max())
            for t_diff_target in t_diff_targets:
                ### Filter from the full input so each target is independent of the
                ### previously processed targets
                df_t = df[(df['t_diff'] <= t_diff_target)]
                for s_diff_target in s_diff_targets:
                    output_final_name = '/'.join([
                        working_directory,
                        out_filename.format(p, k, t_diff_target, s_diff_target)
                    ])
                    df_temp = df_t[(df_t['s_diff'] <= s_diff_target)]
                    debug('Filtered colocation size', len(df_temp))
                    debug('t_diff', df_temp['t_diff'].max(),
                          's_diff', df_temp['s_diff'].max())
                    debug('Writing colocation file', output_final_name)
                    df_temp.to_csv(output_final_name, index=False, compression='bz2')
                    del df_temp