def generate_colocation_single(checkins, config, p, k, t_diff, s_diff):
    dataset_name = config['dataset'][p]
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    start = kwargs['colocation']['start']
    finish = kwargs['colocation']['finish']
    order = kwargs['colocation']['order']
    working_directory = '/'.join(
        [config['directory']['intermediate'], dataset_name])
    make_sure_path_exists(working_directory)
    kdtree_intermediate = '/'.join([
        working_directory,
        config['intermediate']['colocation']['kdtree'].format(p, k)
    ])
    if is_file_exists(kdtree_intermediate):
        st_tree = sk_joblib.load(kdtree_intermediate)
    else:
        st_tree = create_spatiotemporal_kd_tree(checkins, kdtree_intermediate,
                                                t_diff, s_diff)
    other = extract_spatiotemporal_normalized(checkins, t_diff, s_diff)
    begins, ends = init_begin_end(n_core, len(checkins),
                                  start=start, finish=finish)
    ### Generate colocation based on extracted checkins
    prepare_colocation(config, p, k, t_diff, s_diff, begins, ends)
    ### Process the partitions in ascending or descending order
    if order == 'ascending':
        Parallel(n_jobs=n_core)(delayed(execute_parallel_st_tree_single)(
            checkins, config, st_tree, other[begins[i]:ends[i]],
            p, k, t_diff, s_diff, begins[i], ends[i])
            for i in range(len(begins)))
    else:
        Parallel(n_jobs=n_core)(delayed(execute_parallel_st_tree_single)(
            checkins, config, st_tree, other[begins[i - 1]:ends[i - 1]],
            p, k, t_diff, s_diff, begins[i - 1], ends[i - 1])
            for i in xrange(len(begins), 0, -1))
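### Illustrative only: `init_begin_end` is defined elsewhere in the project.
### The sketch below shows one plausible way such [begin, end) partitions could
### be built, splitting the index range [start, finish) into n_core contiguous
### chunks so each joblib worker above receives one slice of check-ins. The
### helper name and behaviour are assumptions, not the project's implementation.
def _example_partition_indices(n_core, n_rows, start=0, finish=None):
    if finish is None or finish > n_rows:
        finish = n_rows
    total = finish - start
    chunk = max(1, (total + n_core - 1) // n_core)  # ceiling division
    begins = list(range(start, finish, chunk))
    ends = [min(b + chunk, finish) for b in begins]
    return begins, ends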
def extract_popularity(checkins, config, p, k):
    intermediate_root = config['directory']['intermediate']
    dataset_names = config['dataset']
    modes = config['mode']
    popularity_intermediate_file = config['intermediate']['sci']['popularity']
    pickle_directory = '/'.join([intermediate_root, dataset_names[p]])
    make_sure_path_exists(pickle_directory)
    pickle_filename = '/'.join(
        [pickle_directory, popularity_intermediate_file.format(modes[k])])
    if not is_file_exists(pickle_filename):
        stat_lp = {}  ### Popularity score for location l
        visit_per_venue, p_l = extract_visit_per_venue(checkins, config)
        p_ul = extract_aggregated_visit(visit_per_venue, p_l)
        ### Evaluate the weight for each venue
        for vid, arr in p_ul.items():
            if len(arr) > 0:
                ent = entropy(arr)
                stat_lp[vid] = ent
            else:
                stat_lp[vid] = 0.0
        ### Memory management
        del p_l[:]
        del p_l
        visit_per_venue.clear()
        p_ul.clear()
        del visit_per_venue, p_ul
        ### Write to pickle intermediate file
        with open(pickle_filename, 'wb') as handle:
            pickle.dump(stat_lp, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(pickle_filename, 'rb') as handle:
            stat_lp = pickle.load(handle)
    ### Return the result
    return stat_lp
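### A minimal, self-contained sketch of the popularity score above: a venue's
### popularity is modeled as the entropy of its visit distribution across
### users, so venues visited evenly by many users score higher than venues
### dominated by a single user. This assumes `entropy` is scipy.stats.entropy;
### the toy per-venue visit counts below are hypothetical.
def _popularity_entropy_example():
    from scipy.stats import entropy
    toy_p_ul = {
        'venue_a': [10, 9, 11, 10],  # visited evenly -> high entropy
        'venue_b': [40, 1, 1],       # dominated by one user -> low entropy
        'venue_c': [],               # no aggregated visits -> defined as 0.0
    }
    stat_lp = {}
    for vid, arr in toy_p_ul.items():
        stat_lp[vid] = entropy(arr) if len(arr) > 0 else 0.0
    return stat_lp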
def temporal_factor(config, p, k, t, d, g2):
    g4 = None
    ### Intermediate file -- check if exists
    pgt_root = config['directory']['pgt']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    g4_file = '/'.join([pgt_root, dataset_names[p],
                        config['intermediate']['pgt']['pgt_g4'].format(modes[k], t, d)])
    if is_file_exists(g4_file) is True:
        g4 = pd.read_csv(g4_file)
    else:
        global_df = transform_colocation_pgt(config, p, k, t, d, 'global')
        colocation_df = read_colocation_file(config, p, k, t, d,
                                             usecols=['user1', 'user2', 'time1', 'time2'])
        colocation_df['wg'] = global_df['wg'].values
        colocation_df['time'] = (colocation_df['time1'] + colocation_df['time2']) / 2
        colocation_df.drop(columns=['time1', 'time2'], inplace=True)
        groups = colocation_df.groupby(['user1', 'user2'])
        g4 = applyParallel(config, groups, lambda_temporal)
        g4 = g4.groupby(['user1', 'user2'])['wt'].agg(['sum'])
        g4.reset_index(inplace=True)
        g4.sort_values(['user1', 'user2'], inplace=True)
        g4['g4'] = g2['g2'] * g4['sum']
        g4[g4 < 0] = 0.0  ### Prevent negative values
        g4['g4'] = g4['g4'] / max(g4['g4'])
        g4.to_csv(g4_file, header=True, index=False, compression='bz2')
        del colocation_df, groups
    return g4
def extract_pi_loc_k(checkins, grouped, venues, user_visit, config, p, k,
                     start, finish, feature):
    pgt_part_root = config['directory']['pgt_temp']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_part_root, dataset_names[p]]))
    pgt_file_part = '/'.join([pgt_part_root, dataset_names[p],
                              config['intermediate']['pgt']['%s_part' % feature].format(modes[k], start, finish)])
    if not is_file_exists(pgt_file_part):
        user_visit = user_visit[(user_visit['visit_count'] > 0)]
        # debug('start', start, 'finish', finish)
        if feature == 'personal':
            ids = grouped['user'].values
            grouping = 'user'
            result = pd.DataFrame(columns=['user', 'location', 'p_i'])
        elif feature == 'global':
            ids = grouped['location'].values
            grouping = 'location'
            result = pd.DataFrame(columns=['location', 'p_i'])
        t0 = time.time()
        for i in range(start, finish):
            u_i = ids[i]
            df = df_uid(checkins, u_i, config, grouping)
            visit_match = user_visit.isin({grouping: df[grouping].unique()})
            visit_temp = user_visit[visit_match[grouping]]
            if len(visit_temp) > 0:
                if feature == 'personal':
                    ### Extract the p_i of each user's visit
                    visit_temp['p_i'] = visit_temp.apply(
                        lambda x: calculate_density(x, cd, df, venues), axis=1)
                    visit_temp = visit_temp[['user', 'location', 'p_i']]
                    result = result.append(visit_temp, ignore_index=True)
                elif feature == 'global':
                    ### Aggregate visits on each location
                    aggregations = {
                        'user_count': {
                            'entropy': lambda x: calculate_entropy(x)
                        },
                    }
                    ### Renamed from `grouped` to avoid shadowing the parameter
                    agg_df = visit_temp.groupby(['location']).agg(aggregations)
                    agg_df.columns = ["_".join(x) for x in agg_df.columns.ravel()]
                    agg_df.rename(columns={"user_count_entropy": "p_i"},
                                  inplace=True)
                    agg_df.reset_index(inplace=True)
                    # debug(agg_df.columns.values)
                    # debug(agg_df.head())
                    agg_df = agg_df[['location', 'p_i']]
                    result = result.append(agg_df, ignore_index=True)
        t1 = time.time()
        ### Writing to temp file
        if feature == 'personal':
            result.drop_duplicates(subset=['user', 'location'], inplace=True)
        result.to_csv(pgt_file_part, index=False, header=True)
        debug('Finished density calculation into %s in %s seconds' %
              (pgt_file_part, str(t1 - t0)))
def parallel_sampling(config, p, k, t, d):
    debug('Start sampling', 'p', p, 'k', k, 't', t, 'd', d)
    kwargs = config['kwargs']
    is_read_compressed = kwargs['read_compressed']
    colocation_root = config['directory']['colocation']
    make_sure_path_exists(colocation_root)
    if is_read_compressed is False:
        sample_name = config['intermediate']['colocation']['sample_csv']
        compression = None
    else:
        sample_name = config['intermediate']['colocation']['sample_compressed']
        compression = 'bz2'
    sample_rate = kwargs['preprocessing']['sampling']['rate']
    sample_fullname = '/'.join(
        [colocation_root, sample_name.format(p, k, t, d, sample_rate)])
    df = read_colocation_file(config, p, k, t, d)
    df = df.sample(frac=sample_rate, random_state=1)
    df.to_csv(sample_fullname, header=True, index=False,
              compression=compression, mode='w')
    debug('Finished sampling', 'p', p, 'k', k, 't', t, 'd', d,
          '#sample: ', len(df))
def run_pgt_evaluation(config):
    kwargs = config['kwargs']
    n_core = kwargs['n_core']
    all_datasets = config['dataset']
    all_modes = config['mode']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    report_directory = config['directory']['report']
    make_sure_path_exists(report_directory)
    for dataset_name in datasets:
        p = all_datasets.index(dataset_name)
        for mode in modes:
            k = all_modes.index(mode)
            debug('Run PGT Evaluation on Dataset', dataset_name, p,
                  'Mode', mode, k, '#Core', n_core)
            ### Creating the report file
            result_filename = '/'.join(
                [report_directory, 'PGT_result_p{}_k{}.csv'.format(p, k)])
            remove_file_if_exists(result_filename)
            with open(result_filename, 'ab') as fw:
                fw.write(
                    'p,k,t,d,auc,precision,recall,f1,#friends,#data,feature_set,preprocessing\n'
                )
            for t_diff in t_diffs:
                for s_diff in s_diffs:
                    pgt_evaluation(config, p, k, t_diff, s_diff)
                    gc.collect()
def write_statistics(df, config, p, k, t, d):
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    sci_root = config['directory']['sci']
    make_sure_path_exists('/'.join([sci_root, dataset_names[p]]))
    if compressed is True:
        sci_name = config['intermediate']['sci']['evaluation_compressed']
        compression = 'bz2'
    else:
        sci_name = config['intermediate']['sci']['evaluation']
        compression = None
    df.to_csv('/'.join([sci_root, dataset_names[p],
                        sci_name.format(p, k, t, d)]),
              header=True, index=False, compression=compression)
def extract_pgt(config, p, k, t, d):
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    pgt_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    if compressed is True:
        pgt_name = config['intermediate']['pgt']['pgt_output_compressed']
        compression = 'bz2'
    else:
        pgt_name = config['intermediate']['pgt']['pgt_output']
        compression = None
    intermediate_file = '/'.join(
        [pgt_root, dataset_names[p], pgt_name.format(p, k, t, d)])
    if is_file_exists(intermediate_file) is False:
        ### Extracting each feature
        if config['kwargs']['pgt']['extract_pgt']['run'] is True:
            g1 = None
            g2 = None
            g3 = None
            g4 = None
            if config['kwargs']['pgt']['extract_pgt']['personal'] is True:
                g1, g2 = personal_factor(config, p, k, t, d)  ### P in PGT
                debug('Finished loading personal factor',
                      'p', p, 'k', k, 't', t, 'd', d)
            if config['kwargs']['pgt']['extract_pgt']['global'] is True:
                g3 = global_factor(config, p, k, t, d, g2)  ### PG in PGT
                debug('Finished loading global factor',
                      'p', p, 'k', k, 't', t, 'd', d)
            if config['kwargs']['pgt']['extract_pgt']['temporal'] is True:
                g4 = temporal_factor(config, p, k, t, d, g2)  ### PGT in PGT
                debug('Finished loading temporal factor',
                      'p', p, 'k', k, 't', t, 'd', d)
            ### Merging all together
            if config['kwargs']['pgt']['extract_pgt']['merge'] is True:
                if g1 is not None and g2 is not None \
                        and g3 is not None and g4 is not None:
                    df = g1[['user1', 'user2', 'g1']].merge(
                        g2[['user1', 'user2', 'g2']], on=['user1', 'user2'])
                    df = df.merge(g3[['user1', 'user2', 'g3']],
                                  on=['user1', 'user2'])
                    df = df.merge(g4[['user1', 'user2', 'g4']],
                                  on=['user1', 'user2'])
                    friend_df = extract_friendships(dataset_names[p], config)
                    df = determine_social_tie(df, friend_df)
                    df.to_csv(intermediate_file, header=True, index=False,
                              compression=compression)
def prepare_colocation(config, p, k, t_diff, s_diff, begins, ends):
    working_directory = config['directory']['colocation']
    filename = config['intermediate']['colocation']['part']
    dataset_name = config['dataset'][p]
    make_sure_path_exists('/'.join([working_directory, dataset_name]))
    clear_dir = config['kwargs']['colocation']['clear_dir']
    if clear_dir is True:
        remove_all_files('/'.join([working_directory, dataset_name]))
    ### Prepare the files
    for i in range(len(begins)):
        with open(
                '/'.join([
                    working_directory, dataset_name,
                    filename.format(p, k, t_diff, s_diff, begins[i], ends[i])
                ]), 'wb'):
            pass
    debug('Each colocation part file has been created')
def pgt_evaluation(config, p, k, t, d):
    debug('Evaluating PGT for p{}, k{}, t{}, d{}'.format(p, k, t, d))
    dataset_names = config['dataset']
    compressed = config['kwargs']['read_compressed']
    pgt_root = config['directory']['pgt']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    if compressed is True:
        pgt_name = config['intermediate']['pgt']['pgt_output_compressed']
    else:
        pgt_name = config['intermediate']['pgt']['pgt_output']
    evaluation_name = '/'.join(
        [pgt_root, dataset_names[p], pgt_name.format(p, k, t, d)])
    if is_file_exists(evaluation_name) is True:
        dataset = pd.read_csv(evaluation_name)
        # Format: 'user1', 'user2', 'g1', 'g2', 'g3', 'g4', 'link'
        X = dataset[['g1', 'g2', 'g3', 'g4']].values
        y = dataset[['link']].values
        ### Normalize unexpected values
        X[np.isinf(X)] = 0
        X[np.isnan(X)] = 0
        y[np.isinf(y)] = 0
        y[np.isnan(y)] = 0
        selected_feature_set = config['kwargs']['pgt_eval']['features']
        if selected_feature_set == 'all':
            notes = ["PGT+", "PGT", "P0", "P", "PG"]
            assign = [[0, 1, 2, 3], [3], [0], [1], [2]]
        else:
            ### Summary only
            notes = ["PGT+", "PGT"]
            assign = [[0, 1, 2, 3], [3]]
        debug(notes, assign)
        texts = generate_report(config, X, y, assign, notes, p, k, t, d)
        del X, y
        report_directory = config['directory']['report']
        result_filename = '/'.join(
            [report_directory, 'PGT_result_p{}_k{}.csv'.format(p, k)])
        for text in texts:
            if text is not None:
                with open(result_filename, 'ab') as fw:
                    fw.write(text + '\n')
    else:
        debug('File not found', evaluation_name)
def personal_factor(config, p, k, t, d):
    g1 = None
    g2 = None
    ### Intermediate file -- check if exists
    pgt_root = config['directory']['pgt']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    g1_file = '/'.join([pgt_root, dataset_names[p],
                        config['intermediate']['pgt']['pgt_g1'].format(modes[k], t, d)])
    g2_file = '/'.join([pgt_root, dataset_names[p],
                        config['intermediate']['pgt']['pgt_g2'].format(modes[k], t, d)])
    if is_file_exists(g1_file) is True and is_file_exists(g2_file) is True:
        g1 = pd.read_csv(g1_file)
        g2 = pd.read_csv(g2_file)
    else:
        ### If it does not exist
        feature = 'personal'
        colocation_df = transform_colocation_pgt(config, p, k, t, d, feature)
        ### Aggregate the weight for each user pair
        g1 = colocation_df.groupby(['user1', 'user2'])['wp'].agg(['mean', 'count'])
        g2 = colocation_df.groupby(['user1', 'user2'])['wp'].agg(['max', 'count'])
        g1.reset_index(inplace=True)
        g2.reset_index(inplace=True)
        g1.sort_values(['user1', 'user2'], inplace=True)
        g2.sort_values(['user1', 'user2'], inplace=True)
        g1['g1'] = g1['mean'] * g1['count']
        g2['g2'] = g2['max'] * g2['count']
        g1['g1'] = g1['g1'] / max(g1['g1'])
        g2['g2'] = g2['g2'] / max(g2['g2'])
        g1.to_csv(g1_file, header=True, index=False, compression='bz2')
        g2.to_csv(g2_file, header=True, index=False, compression='bz2')
        # debug(g1.head())
        # debug(g2.head())
        del colocation_df
    return g1, g2
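### A minimal, self-contained sketch of the aggregation above on toy data:
### g1 scales the mean personal weight of a user pair by its number of
### co-locations, g2 does the same with the maximum weight, and both are
### normalized by their global maximum. The toy co-location frame below is
### hypothetical and only illustrates the pandas operations involved.
def _personal_factor_toy_example():
    import pandas as pd
    toy = pd.DataFrame({
        'user1': [1, 1, 1, 2, 2],
        'user2': [2, 2, 2, 3, 3],
        'wp':    [0.2, 0.4, 0.9, 0.1, 0.3],
    })
    g1 = toy.groupby(['user1', 'user2'])['wp'].agg(['mean', 'count']).reset_index()
    g2 = toy.groupby(['user1', 'user2'])['wp'].agg(['max', 'count']).reset_index()
    g1['g1'] = g1['mean'] * g1['count']
    g2['g2'] = g2['max'] * g2['count']
    g1['g1'] = g1['g1'] / g1['g1'].max()
    g2['g2'] = g2['g2'] / g2['g2'].max()
    return g1, g2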
def global_factor(config, p, k, t, d, g2):
    g3 = None
    ### Intermediate file -- check if exists
    pgt_root = config['directory']['pgt']
    dataset_names = config['dataset']
    modes = config['mode']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    g3_file = '/'.join([pgt_root, dataset_names[p],
                        config['intermediate']['pgt']['pgt_g3'].format(modes[k], t, d)])
    if is_file_exists(g3_file) is True:
        g3 = pd.read_csv(g3_file)
    else:
        feature = 'global'
        colocation_df = transform_colocation_pgt(config, p, k, t, d, feature)
        g3 = colocation_df.groupby(['user1', 'user2'])['wg'].agg(['sum'])
        g3.reset_index(inplace=True)
        g3.sort_values(['user1', 'user2'], inplace=True)
        g3['g3'] = g2['g2'] * g3['sum']
        g3['g3'] = g3['g3'] / max(g3['g3'])
        g3.to_csv(g3_file, header=True, index=False, compression='bz2')
        del colocation_df
    return g3
def extract_colocation_features(stat_lp, config, p, k, t, d):
    debug('p', p, 'k', k, 't', t, 'd', d)
    ### Check if SCI intermediate exists
    dataset_names = config['dataset']
    compressed = config['kwargs']['compress_output']
    sci_root = config['directory']['sci']
    make_sure_path_exists('/'.join([sci_root, dataset_names[p]]))
    if compressed is True:
        sci_name = config['intermediate']['sci']['evaluation_compressed']
    else:
        sci_name = config['intermediate']['sci']['evaluation']
    sci_name = '/'.join(
        [sci_root, dataset_names[p], sci_name.format(p, k, t, d)])
    if is_file_exists(sci_name):
        debug('File %s exists' % sci_name)
    else:
        ### Read (original) friendship from file
        friend_df = extract_friendships(dataset_names[p], config)
        colocation_df = read_colocation_file(config, p, k, t, d)
        ### Determine whether the two users in each co-location are friends or strangers
        colocation_df = determine_social_tie(colocation_df, friend_df)
        debug('#colocations', len(colocation_df),
              'p', p, 'k', k, 't', t, 'd', d)
        ### Compute the stability value for each co-location pair
        groups = colocation_df.groupby(['user1', 'user2', 'link'])
        grouped = aggregate_stats(groups, stat_lp, p, k, t, d)
        ### Write the result into a csv output
        write_statistics(grouped, config, p, k, t, d)
        ### Memory management
        del friend_df
        del colocation_df
        del grouped
    debug('Finished extract_colocation_features',
          'p', p, 'k', k, 't', t, 'd', d)
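### `determine_social_tie` is defined elsewhere in the project; the
### self-contained sketch below only illustrates the general idea of labelling
### each co-located pair as friend (1) or stranger (0) via a left merge with
### the friendship list. The project's actual function may differ in details.
def _social_tie_toy_example():
    import pandas as pd
    colocations = pd.DataFrame({'user1': [1, 1, 2], 'user2': [2, 3, 3]})
    friends = pd.DataFrame({'user1': [1], 'user2': [2]}).assign(link=1)
    labelled = colocations.merge(friends, on=['user1', 'user2'], how='left')
    labelled['link'] = labelled['link'].fillna(0).astype(int)
    return labelled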
def process_reduce(config, p, k, t_diff, s_diff):
    out_format = config['intermediate']['colocation']['csv']
    re_format = config['intermediate']['colocation']['re']
    working_directory = config['directory']['colocation']
    dataset_name = config['dataset'][p]
    make_sure_path_exists('/'.join([working_directory, dataset_name]))
    pattern = re.compile(re_format.format(p, k, t_diff, s_diff))
    file_list = []
    for fname in os.listdir('/'.join([working_directory, dataset_name])):
        if fname.endswith(".csv") and pattern.match(fname):
            file_list.append('/'.join(
                [working_directory, dataset_name, fname]))
    output = '/'.join([
        working_directory, dataset_name,
        out_format.format(p, k, t_diff, s_diff)
    ])
    with open(output, 'wb') as fw:
        fw.write('%s' % colocation_header)
    with open(output, 'ab') as wfd:
        for f in file_list:
            with open(f, 'rb') as fd:
                ### 10 MB per writing chunk to avoid reading a big file into memory
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
def prepare_extraction(config, feature, p, k):
    ### Check if PGT intermediate exists
    dataset_names = config['dataset']
    modes = config['mode']
    pgt_root = config['directory']['pgt']
    pgt_part_root = config['directory']['pgt_temp']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    pgt_file = '/'.join([pgt_root, dataset_names[p],
                         config['intermediate']['pgt'][feature].format(modes[k])])
    if is_file_exists(pgt_file) is True:
        debug('PGT %s exists' % feature)
    else:
        if feature == 'personal':
            checkins, grouped = extract_checkins_per_user(
                dataset_names[p], modes[k], config)
        elif feature == 'global':
            checkins, grouped = extract_checkins_per_venue(
                dataset_names[p], modes[k], config)
        user_visit_dir = '/'.join(
            [config['directory']['intermediate'], config['dataset'][p]])
        user_visit_name = config['intermediate']['pgt']['user_visit'].format(
            config['mode'][k])
        final_name = '/'.join([user_visit_dir, user_visit_name])
        ### Using user visit database
        if is_file_exists(final_name) is True:
            user_visit = pd.read_csv(final_name, compression='bz2')
            venues = checkins[['location', 'latitude', 'longitude'
                               ]].drop_duplicates(subset=['location'])
            debug('#Venues', len(venues), 'p', p, 'k', k)
            kwargs = config['kwargs']
            n_core = kwargs['n_core']
            start = kwargs['pgt'][feature]['start']
            finish = kwargs['pgt'][feature]['finish']
            begins, ends = init_begin_end(n_core, len(grouped),
                                          start=start, finish=finish)
            if feature == 'personal':
                function = density_location
            elif feature == 'global':
                function = entropy_location
            ### Map step
            Parallel(n_jobs=n_core)(delayed(function)(
                checkins, grouped, venues, user_visit,
                config, p, k, begins[i - 1], ends[i - 1])
                for i in xrange(len(begins), 0, -1))
            ### Reduce step
            if feature == 'personal':
                result = pd.DataFrame(columns=['user', 'location', 'p_i'])
            elif feature == 'global':
                result = pd.DataFrame(columns=['location', 'p_i'])
            for i in range(len(begins)):
                start = begins[i]
                finish = ends[i]
                pgt_file_part = '/'.join([pgt_part_root, dataset_names[p],
                                          config['intermediate']['pgt']['%s_part' % feature].format(modes[k], start, finish)])
                temp = pd.read_csv(pgt_file_part)
                result = result.append(temp, ignore_index=True)
            if feature == 'personal':
                result.drop_duplicates(subset=['user', 'location'], inplace=True)
                result.sort_values(['user', 'location'], inplace=True)
            elif feature == 'global':
                result.sort_values(['location'], inplace=True)
            debug('#User Visits', len(result))
            result.to_csv(pgt_file, index=False, header=True)
            ### Clean up mess if needed
            if config['kwargs']['pgt'][feature]['clean_temp'] is True:
                for i in range(len(begins)):
                    start = begins[i]
                    finish = ends[i]
                    pgt_file_part = '/'.join([pgt_part_root, dataset_names[p],
                                              config['intermediate']['pgt']['%s_part' % feature].format(modes[k], start, finish)])
                    remove_file_if_exists(pgt_file_part)
        else:
            debug('Please generate the user visit first through preprocessing/read.py',
                  '(function: generate_user_visit)')
def transform_colocation_pgt(config, p, k, t, d, feature):
    dataset_names = config['dataset']
    modes = config['mode']
    pgt_root = config['directory']['pgt']
    pgt_part_root = config['directory']['pgt_temp']
    make_sure_path_exists('/'.join([pgt_root, dataset_names[p]]))
    make_sure_path_exists('/'.join([pgt_part_root, dataset_names[p]]))
    pgt_file = '/'.join([pgt_root, dataset_names[p],
                         config['intermediate']['pgt'][feature].format(modes[k])])
    ### Check if PGT intermediate exists
    if is_file_exists(pgt_file) is False:
        debug('PGT %s does not exist' % feature)
        debug('Please run PGT %s factor extraction first' % feature)
        return None
    else:
        g0 = '/'.join([pgt_root, dataset_names[p],
                       config['intermediate']['pgt']['pgt_g0_%s' % feature].format(modes[k], t, d)])
        if is_file_exists(g0) is False:
            if feature == 'personal':
                ### columns=['user', 'location', 'p_i']
                personal_density = pd.read_csv(pgt_file)
                col_name = 'wp'
            elif feature == 'global':
                ### columns=['location', 'p_i']
                entropy_location = pd.read_csv(pgt_file)
                col_name = 'wg'
            ### user1,user2,location1,location2,time1,time2,lat1,lon1,lat2,lon2,t_diff,s_diff
            ### Evaluate the weight for each colocation
            ### Map step
            i = 0
            chunksize = 10 ** 5
            debug('chunksize for transform_colocation_pgt', chunksize)
            for colocation_df in read_colocation_file(
                    config, p, k, t, d, chunksize=chunksize,
                    usecols=['user1', 'user2', 'location1', 'location2']):
                g0_part = '/'.join([pgt_part_root, dataset_names[p],
                                    config['intermediate']['pgt']['pgt_g0_%s_part' % feature].format(modes[k], t, d, i)])
                debug('Processing', feature, 'part', g0_part)
                if is_file_exists(g0_part) is False:
                    if feature == 'personal':
                        colocation_df[col_name] = colocation_df.apply(
                            lambda x: calculate_personal(x, personal_density), axis=1)
                    elif feature == 'global':
                        colocation_df[col_name] = colocation_df.apply(
                            lambda x: calculate_global(x, entropy_location), axis=1)
                    colocation_df.to_csv(g0_part, index=False, header=True,
                                         compression='bz2')
                i += 1
            ### Reduce step
            colocation_df = pd.DataFrame(
                columns=['user1', 'user2', 'location1', 'location2', col_name])
            condition = True
            i = 0
            while condition is True:
                ### Iterate over all chunks
                g0_part = '/'.join([pgt_part_root, dataset_names[p],
                                    config['intermediate']['pgt']['pgt_g0_%s_part' % feature].format(modes[k], t, d, i)])
                if is_file_exists(g0_part) is False:
                    condition = False
                    break
                temp = pd.read_csv(g0_part)
                colocation_df = colocation_df.append(temp, ignore_index=True)
                i += 1
            if config['kwargs']['pgt'][feature]['clean_temp'] is True:
                condition = True
                i = 0
                while condition is True:
                    g0_part = '/'.join([pgt_part_root, dataset_names[p],
                                        config['intermediate']['pgt']['pgt_g0_%s_part' % feature].format(modes[k], t, d, i)])
                    if is_file_exists(g0_part) is False:
                        condition = False
                        break
                    remove_file_if_exists(g0_part)
                    i += 1
            colocation_df.replace([np.inf, -np.inf], np.nan, inplace=True)
            colocation_df.fillna(0, inplace=True)
            colocation_df.to_csv(g0, index=False, header=True, compression='bz2')
            gc.collect()
        else:
            colocation_df = pd.read_csv(g0)
            debug('Loaded g0 %s successfully [%s]' % (feature, g0))
        return colocation_df
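### A minimal sketch of the chunked map step used above: pandas can stream a
### large CSV in fixed-size chunks so each chunk is weighted and written to its
### own part file instead of being loaded at once. The file names and the
### per-row weighting function below are hypothetical placeholders.
def _chunked_weighting_example(input_csv, part_template, weight_fn,
                               chunksize=10 ** 5):
    import pandas as pd
    for i, chunk in enumerate(pd.read_csv(input_csv, chunksize=chunksize)):
        chunk['w'] = chunk.apply(weight_fn, axis=1)  # weight each row
        chunk.to_csv(part_template.format(i), index=False, header=True,
                     compression='bz2')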
def sci_evaluation(config, p, k, t, d):
    debug('Evaluating SCI for p{}, k{}, t{}, d{}'.format(p, k, t, d))
    dataset_names = config['dataset']
    compressed = config['kwargs']['read_compressed']
    sci_root = config['directory']['sci']
    make_sure_path_exists('/'.join([sci_root, dataset_names[p]]))
    if compressed is True:
        sci_name = config['intermediate']['sci']['evaluation_compressed']
    else:
        sci_name = config['intermediate']['sci']['evaluation']
    evaluation_name = '/'.join(
        [sci_root, dataset_names[p], sci_name.format(p, k, t, d)])
    if is_file_exists(evaluation_name) is True:
        dataset = pd.read_csv(evaluation_name)
        # Format: 'uid1', 'uid2', 'frequency', 'diversity', 'duration',
        #         'stability', 'popularity', 'link'
        X = dataset[[
            'frequency', 'diversity', 'duration', 'stability_std',
            'popularity', 'stability_avg', 'stability_old'
        ]].values
        y = dataset[['link']].values
        ### Selecting the feature set
        selected_feature_set = config['kwargs']['sci_eval']['features']
        if selected_feature_set == 'pakdd_2017_all':
            ### PAKDD 2017 submission (all combinations of the original features)
            notes = [
                "SCI", "frequency", "diversity", "duration", "stability",
                "F+D", "F+TD", "F+TS", "D+TD", "D+TS", "TD+TS",
                "F+D+TD", "F+D+TS", "F+TD+TS", "D+TD+TS"
            ]
            assign = [[0, 1, 2, 6], [0], [1], [2], [6], [0, 1], [0, 2],
                      [0, 6], [1, 2], [1, 6], [2, 6], [0, 1, 2], [0, 1, 6],
                      [0, 2, 6], [1, 2, 6]]
        elif selected_feature_set == 'pakdd_2017_summary':
            ### PAKDD 2017 submission (summary only)
            notes = ['SCI']
            assign = [[0, 1, 2, 6]]
        elif selected_feature_set == 'all_features':
            ### New feature added (popularity)
            notes = [
                'SCI+', 'Frequency', 'Diversity', 'Duration', 'Stability',
                'Popularity', 'F+D', 'F+TD', 'F+TS', 'F+P', 'D+TD', 'D+TS',
                'D+P', 'TD+TS', 'TD+P', 'TS+P', 'F+D+TD', 'F+D+TS', 'F+D+P',
                'F+TD+TS', 'F+TD+P', 'F+TS+P', 'D+TD+TS', 'D+TD+P', 'D+TS+P',
                'TD+TS+P', 'F+D+TD+TS', 'F+D+TD+P', 'F+D+TS+P', 'F+TD+TS+P',
                'D+TD+TS+P', 'SCI'
            ]
            assign = [[0, 1, 2, 3, 4], [0], [1], [2], [3], [4], [0, 1],
                      [0, 2], [0, 3], [0, 4], [1, 2], [1, 3], [1, 4], [2, 3],
                      [2, 4], [3, 4], [0, 1, 2], [0, 1, 3], [0, 1, 4],
                      [0, 2, 3], [0, 2, 4], [0, 3, 4], [1, 2, 3], [1, 2, 4],
                      [1, 3, 4], [2, 3, 4], [0, 1, 2, 3], [0, 1, 2, 4],
                      [0, 1, 3, 4], [0, 2, 3, 4], [1, 2, 3, 4], [0, 1, 2, 3]]
        elif selected_feature_set == 'summary':
            ### Only all features
            notes = ['SCI+']
            assign = [[0, 1, 2, 3, 4]]
        elif selected_feature_set == 'sci_plus_all':
            ### Added popularity: every subset of the seven features
            notes = [
                'All', 'F', 'D', 'TD', 'TSD', 'P', 'TSA', 'TS',
                'F+D', 'F+TD', 'F+TSD', 'F+P', 'F+TSA', 'F+TS',
                'D+TD', 'D+TSD', 'D+P', 'D+TSA', 'D+TS',
                'TD+TSD', 'TD+P', 'TD+TSA', 'TD+TS',
                'TSD+P', 'TSD+TSA', 'TSD+TS', 'P+TSA', 'P+TS', 'TSA+TS',
                'F+D+TD', 'F+D+TSD', 'F+D+P', 'F+D+TSA', 'F+D+TS',
                'F+TD+TSD', 'F+TD+P', 'F+TD+TSA', 'F+TD+TS',
                'F+TSD+P', 'F+TSD+TSA', 'F+TSD+TS',
                'F+P+TSA', 'F+P+TS', 'F+TSA+TS',
                'D+TD+TSD', 'D+TD+P', 'D+TD+TSA', 'D+TD+TS',
                'D+TSD+P', 'D+TSD+TSA', 'D+TSD+TS',
                'D+P+TSA', 'D+P+TS', 'D+TSA+TS',
                'TD+TSD+P', 'TD+TSD+TSA', 'TD+TSD+TS',
                'TD+P+TSA', 'TD+P+TS', 'TD+TSA+TS',
                'TSD+P+TSA', 'TSD+P+TS', 'TSD+TSA+TS', 'P+TSA+TS',
                'F+D+TD+TSD', 'F+D+TD+P', 'F+D+TD+TSA', 'F+D+TD+TS',
                'F+D+TSD+P', 'F+D+TSD+TSA', 'F+D+TSD+TS',
                'F+D+P+TSA', 'F+D+P+TS', 'F+D+TSA+TS',
                'F+TD+TSD+P', 'F+TD+TSD+TSA', 'F+TD+TSD+TS',
                'F+TD+P+TSA', 'F+TD+P+TS', 'F+TD+TSA+TS',
                'F+TSD+P+TSA', 'F+TSD+P+TS', 'F+TSD+TSA+TS', 'F+P+TSA+TS',
                'D+TD+TSD+P', 'D+TD+TSD+TSA', 'D+TD+TSD+TS',
                'D+TD+P+TSA', 'D+TD+P+TS', 'D+TD+TSA+TS',
                'D+TSD+P+TSA', 'D+TSD+P+TS', 'D+TSD+TSA+TS', 'D+P+TSA+TS',
                'TD+TSD+P+TSA', 'TD+TSD+P+TS', 'TD+TSD+TSA+TS',
                'TD+P+TSA+TS', 'TSD+P+TSA+TS',
                'F+D+TD+TSD+P', 'F+D+TD+TSD+TSA', 'F+D+TD+TSD+TS',
                'F+D+TD+P+TSA', 'F+D+TD+P+TS', 'F+D+TD+TSA+TS',
                'F+D+TSD+P+TSA', 'F+D+TSD+P+TS', 'F+D+TSD+TSA+TS',
                'F+D+P+TSA+TS',
                'F+TD+TSD+P+TSA', 'F+TD+TSD+P+TS', 'F+TD+TSD+TSA+TS',
                'F+TD+P+TSA+TS', 'F+TSD+P+TSA+TS',
                'D+TD+TSD+P+TSA', 'D+TD+TSD+P+TS', 'D+TD+TSD+TSA+TS',
                'D+TD+P+TSA+TS', 'D+TSD+P+TSA+TS', 'TD+TSD+P+TSA+TS',
                'F+D+TD+TSD+P+TSA', 'F+D+TD+TSD+P+TS', 'F+D+TD+TSD+TSA+TS',
                'F+D+TD+P+TSA+TS', 'F+D+TSD+P+TSA+TS', 'F+TD+TSD+P+TSA+TS',
                'D+TD+TSD+P+TSA+TS'
            ]
            assign = [
                [0, 1, 2, 3, 4, 5, 6],
                [0], [1], [2], [3], [4], [5], [6],
                [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6],
                [1, 2], [1, 3], [1, 4], [1, 5], [1, 6],
                [2, 3], [2, 4], [2, 5], [2, 6],
                [3, 4], [3, 5], [3, 6], [4, 5], [4, 6], [5, 6],
                [0, 1, 2], [0, 1, 3], [0, 1, 4], [0, 1, 5], [0, 1, 6],
                [0, 2, 3], [0, 2, 4], [0, 2, 5], [0, 2, 6],
                [0, 3, 4], [0, 3, 5], [0, 3, 6],
                [0, 4, 5], [0, 4, 6], [0, 5, 6],
                [1, 2, 3], [1, 2, 4], [1, 2, 5], [1, 2, 6],
                [1, 3, 4], [1, 3, 5], [1, 3, 6],
                [1, 4, 5], [1, 4, 6], [1, 5, 6],
                [2, 3, 4], [2, 3, 5], [2, 3, 6],
                [2, 4, 5], [2, 4, 6], [2, 5, 6],
                [3, 4, 5], [3, 4, 6], [3, 5, 6], [4, 5, 6],
                [0, 1, 2, 3], [0, 1, 2, 4], [0, 1, 2, 5], [0, 1, 2, 6],
                [0, 1, 3, 4], [0, 1, 3, 5], [0, 1, 3, 6],
                [0, 1, 4, 5], [0, 1, 4, 6], [0, 1, 5, 6],
                [0, 2, 3, 4], [0, 2, 3, 5], [0, 2, 3, 6],
                [0, 2, 4, 5], [0, 2, 4, 6], [0, 2, 5, 6],
                [0, 3, 4, 5], [0, 3, 4, 6], [0, 3, 5, 6], [0, 4, 5, 6],
                [1, 2, 3, 4], [1, 2, 3, 5], [1, 2, 3, 6],
                [1, 2, 4, 5], [1, 2, 4, 6], [1, 2, 5, 6],
                [1, 3, 4, 5], [1, 3, 4, 6], [1, 3, 5, 6], [1, 4, 5, 6],
                [2, 3, 4, 5], [2, 3, 4, 6], [2, 3, 5, 6], [2, 4, 5, 6],
                [3, 4, 5, 6],
                [0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
                [0, 1, 2, 4, 5], [0, 1, 2, 4, 6], [0, 1, 2, 5, 6],
                [0, 1, 3, 4, 5], [0, 1, 3, 4, 6], [0, 1, 3, 5, 6],
                [0, 1, 4, 5, 6],
                [0, 2, 3, 4, 5], [0, 2, 3, 4, 6], [0, 2, 3, 5, 6],
                [0, 2, 4, 5, 6], [0, 3, 4, 5, 6],
                [1, 2, 3, 4, 5], [1, 2, 3, 4, 6], [1, 2, 3, 5, 6],
                [1, 2, 4, 5, 6], [1, 3, 4, 5, 6], [2, 3, 4, 5, 6],
                [0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 6], [0, 1, 2, 3, 5, 6],
                [0, 1, 2, 4, 5, 6], [0, 1, 3, 4, 5, 6], [0, 2, 3, 4, 5, 6],
                [1, 2, 3, 4, 5, 6]
            ]
        else:
            ### 'summary_old_new': SCI and SCI+
            notes = ['SCI+', 'SCI']
            assign = [[0, 1, 2, 3, 4], [0, 1, 2, 6]]
        ### Generate the report
        debug(notes, assign)
        texts = generate_report(config, X, y, assign, notes, p, k, t, d)
        del X, y
        report_directory = config['directory']['report']
        result_filename = '/'.join(
            [report_directory, 'SCI_result_p{}_k{}.csv'.format(p, k)])
        for text in texts:
            if text is not None:
                with open(result_filename, 'ab') as fw:
                    fw.write(text + '\n')
    else:
        debug('File not found', evaluation_name)
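### The hand-written `notes`/`assign` lists above enumerate every subset of the
### feature columns. As a side note, the same enumeration could be generated
### programmatically; the sketch below illustrates that idea with
### itertools.combinations. It is not the project's code, and the default
### feature labels are assumptions taken from the list above.
def _enumerate_feature_subsets(labels=('F', 'D', 'TD', 'TSD', 'P', 'TSA', 'TS')):
    from itertools import combinations
    notes = ['All']
    assign = [list(range(len(labels)))]
    for r in range(1, len(labels)):
        for combo in combinations(range(len(labels)), r):
            notes.append('+'.join(labels[i] for i in combo))
            assign.append(list(combo))
    return notes, assign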
def generate_walk2friend(config):
    kwargs = config['kwargs']
    datasets = kwargs['active_dataset']
    modes = kwargs['active_mode']
    t_diffs = kwargs['ts']
    s_diffs = kwargs['ds']
    for dataset_name in datasets:
        p = config['dataset'].index(dataset_name)
        for mode in modes:
            k = config['mode'].index(mode)
            for t in t_diffs:
                for d in s_diffs:
                    output_dir = config['directory']['walk2friend']
                    make_sure_path_exists(output_dir)
                    debug('p', p, 'k', k, 't', t, 'd', d)
                    checkin_name = '/'.join([
                        output_dir,
                        '{}_{}_t{}_d{}.checkin'.format(dataset_name, mode, t, d)
                    ])
                    friends_name = '/'.join([
                        output_dir,
                        '{}_{}_t{}_d{}.friends'.format(dataset_name, mode, t, d)
                    ])
                    if is_file_exists(checkin_name) is False or \
                            is_file_exists(friends_name) is False:
                        checkins, _ = extract_checkins_all(
                            dataset_name, mode, config)
                        friends = extract_friendships(dataset_name, config)
                        user_unique = []
                        for colocations in read_colocation_file(
                                config, p, k, t, d, chunksize=10 ** 6,
                                usecols=['user1', 'user2']):
                            user_unique.append(colocations['user1'].unique())
                            user_unique.append(colocations['user2'].unique())
                        # user_unique = np.array(user_unique)
                        user_unique = np.ravel(user_unique)
                        debug(user_unique)
                        user_unique = np.unique(user_unique)
                        debug('Before', '#checkins', len(checkins),
                              '#friends', len(friends))
                        checkins = checkins.loc[
                            (checkins['user'].isin(user_unique))]
                        friends = friends.loc[
                            (friends['user1'].isin(user_unique)) &
                            (friends['user2'].isin(user_unique))]
                        debug('After', '#checkins', len(checkins),
                              '#friends', len(friends))
                        checkins.sort_values(['user', 'location'], inplace=True)
                        checkins.rename(columns={
                            "user": "uid",
                            "location": "locid"
                        }, inplace=True)
                        checkins['mid'] = range(len(checkins))
                        checkins = checkins[['mid', 'uid', 'locid']]
                        checkins.to_csv(checkin_name, index=False, header=True)
                        friends.rename(columns={
                            "user1": "u1",
                            "user2": "u2"
                        }, inplace=True)
                        friends.sort_values(['u1', 'u2'], inplace=True)
                        friends = friends[['u1', 'u2']]
                        friends.to_csv(friends_name, index=False, header=True)
                        del user_unique
                        gc.collect()