def test_save_results(self):
    results = utils.load_results()
    temp_file_name = utils.get_data_folder() + 'temp.json'
    utils.save_results(results, file_name=temp_file_name)
    temp = utils.load_results(file_name=temp_file_name)
    self.assertEqual(results, temp)
def multiple_floors_train_predict(
    config, df, debug_floor, reference_submission, use_multiprocessing,
    models_group_name, mode, holdout_df, test_floors, recompute_grouped_data,
    overwrite_models, test_type_mapping, only_public_test_preds,
    test_waypoint_times, store_all_wifi_predictions,
    store_full_wifi_predictions, debug_fn=None, verbose=True):
  data_folder = utils.get_data_folder()
  model_folder = data_folder.parent / 'Models' / models_group_name
  site_floors = df.iloc[df.test_site.values].groupby(
      ['site_id', 'text_level']).size().reset_index()
  if debug_floor is not None:
    site_floors = site_floors.iloc[debug_floor:(debug_floor + 1)]
  sites = site_floors.site_id.values
  floors = site_floors.text_level.values
  ref_scores = get_reference_scores(
      site_floors,
      reference_submission) if reference_submission is not None else (
          [None] * floors.size)

  if use_multiprocessing:
    with mp.Pool(processes=mp.cpu_count() - 1) as pool:
      floor_ids = np.arange(floors.size)
      results = [
          pool.apply_async(
              floor_train_predict,
              args=(config, s, f, i, r, data_folder, model_folder, df, mode,
                    holdout_df, test_floors, recompute_grouped_data,
                    overwrite_models, test_type_mapping,
                    only_public_test_preds, test_waypoint_times,
                    store_all_wifi_predictions, store_full_wifi_predictions,
                    debug_fn, verbose))
          for (s, f, i, r) in zip(sites, floors, floor_ids, ref_scores)
      ]
      all_outputs = [p.get() for p in results]
  else:
    all_outputs = []
    for floor_id, (analysis_site, floor, ref_score) in enumerate(
        zip(sites, floors, ref_scores)):
      all_outputs.append(
          floor_train_predict(config, analysis_site, floor, floor_id,
                              ref_score, data_folder, model_folder, df, mode,
                              holdout_df, test_floors, recompute_grouped_data,
                              overwrite_models, test_type_mapping,
                              only_public_test_preds, test_waypoint_times,
                              store_all_wifi_predictions,
                              store_full_wifi_predictions, debug_fn, verbose))

  return all_outputs
def run(only_process_test_sites=True,
        overwrite_existing_processed=False,
        write_separate_wifi=False):
  print("Reshaping raw data")
  data_folder = utils.get_data_folder()
  parquet_folder = data_folder / "reference_preprocessed"
  summary_path = data_folder / "file_summary.csv"
  df = pd.read_csv(summary_path)
  source_submission = 'submission_cost_minimization.csv'
  submission_folder = data_folder / 'submissions'
  submission = pd.read_csv(submission_folder / source_submission)
  sample_sub_fns = np.array(
      [sps.split('_')[1] for sps in submission.site_path_timestamp])
  sample_sub_times = np.array(
      [int(sps.split('_')[2]) for sps in submission.site_path_timestamp])

  # First check for the last file and abort if it already exists
  last_pickle_path = data_folder / (
      str(Path(df.ext_path.values[-1]).with_suffix("")) + "_reshaped.pickle")
  if last_pickle_path.exists() and (not overwrite_existing_processed):
    return

  # Loop over all file paths and compare the parquet and pickle files one by
  # one
  for i in range(df.shape[0]):
    print(f"{i+1} of {df.shape[0]}")
    if not only_process_test_sites or df.test_site[i]:
      ext_path = Path(df.ext_path[i])
      mode = ext_path.parts[0]
      pickle_path = data_folder / (
          str(ext_path.with_suffix("")) + "_reshaped.pickle")
      parquet_path = parquet_folder / mode / ext_path.with_suffix(".parquet")
      pathlib.Path(parquet_path.parent).mkdir(parents=True, exist_ok=True)
      if not pickle_path.exists() or overwrite_existing_processed:
        reshape_parquet(
            pickle_path,
            parquet_path,
            df.iloc[i],
            i,
            write_separate_wifi,
            data_folder,
            sample_sub_fns,
            sample_sub_times,
        )
def multiple_floors_train_predict(config, df, models_group_name, mode,
                                  holdout_df, test_floors,
                                  recompute_grouped_data, overwrite_models,
                                  test_type_mapping, only_public_test_preds,
                                  test_waypoint_times, debug_fn,
                                  verbose=True):
  data_folder = utils.get_data_folder()
  model_folder = data_folder.parent / 'Models' / models_group_name
  site_floors = df.iloc[df.test_site.values].groupby(
      ['site_id', 'text_level']).size().reset_index()
  sites = site_floors.site_id.values
  floors = site_floors.text_level.values
  if debug_fn is not None:
    target_row = np.where(df.fn == debug_fn)[0][0]
    sites = [df.site_id.values[target_row]]
    floors = [df.text_level.values[target_row]]

  all_distances = []
  all_magnetometer = []
  for floor_id, (analysis_site, floor) in enumerate(zip(sites, floors)):
    if verbose:
      print(f"Processing floor {floor_id+1} of {site_floors.shape[0]}")
    distances, magnetometer_pos = floor_train_predict(
        config, analysis_site, floor, floor_id, data_folder, model_folder, df,
        mode, holdout_df, test_floors, recompute_grouped_data,
        overwrite_models, test_type_mapping, only_public_test_preds,
        test_waypoint_times, debug_fn)
    all_distances.append(distances)
    all_magnetometer.append(magnetometer_pos)

  return all_distances, all_magnetometer
def run(mode): print("Preparing features for the sensor uncertainty models") fn_mode = ['mean', 'first_middle_last'][0] num_neighbors = 10 additional_feature_cols = ['rel_fraction', 'time_offset'] feature_cols = ['total_dist', 'num_waypoints'] stem_target_cols = [ 'mean_rel_dist_error', 'mean_abs_rel_dist_error', 'mean_angle_error', 'mean_abs_angle_error' ] if fn_mode == 'mean': target_cols = copy.copy(stem_target_cols) else: target_cols = [] for ext in ['first_', 'middle_', 'last_']: for c in stem_target_cols: if ext == 'middle_': target_cols += [ext + c] else: target_cols += [ext + c[5:]] feature_cols += copy.copy(target_cols) data_folder = utils.get_data_folder() model_folder = data_folder.parent / 'Models' / 'correct_sensor_preds' pathlib.Path(model_folder).mkdir(parents=True, exist_ok=True) save_ext = '' if fn_mode == 'mean' else ' first_middle_last' save_path = model_folder / (mode + save_ext + '.csv') if save_path.is_file(): return data_path = data_folder / 'fn_device_errors.csv' data = pd.read_csv(data_path) num_add_features = len(additional_feature_cols) all_feature_cols = additional_feature_cols + feature_cols num_keep_first_data_cols = 7 num_shifts = num_neighbors * 2 + 1 num_rows = data.shape[0] num_features = len(feature_cols) all_features = np.zeros( (num_shifts, num_rows, num_features + num_add_features)) padded_feature_vals = np.full((num_rows + num_shifts - 1, num_features), np.nan) padded_feature_vals[num_neighbors:( -num_neighbors)] = data[feature_cols].values device_ids = data.device_id.values padded_device_ids = np.full((num_rows + num_shifts - 1), np.nan) padded_device_ids[num_neighbors:(-num_neighbors)] = device_ids times = data.plot_time.values padded_times = np.full((num_rows + num_shifts - 1), np.nan) padded_times[num_neighbors:(-num_neighbors)] = times modes = data['mode'].values can_use_mask = np.concatenate([ np.zeros(num_neighbors, dtype=bool), (modes != 'test') & ((mode == 'test') | (modes != 'valid')), np.zeros(num_neighbors, dtype=bool), ]) for shift_id, shift in enumerate(range(-num_neighbors, num_neighbors + 1)): start_row = shift + num_neighbors end_row = shift + num_neighbors + num_rows shifted_features = np.copy(padded_feature_vals[start_row:end_row]) shifted_device_ids = padded_device_ids[start_row:end_row] if shift == 0: step_can_use_mask = np.ones_like(can_use_mask[start_row:end_row]) else: step_can_use_mask = np.copy(can_use_mask[start_row:end_row]) shift_mask = step_can_use_mask & (shifted_device_ids == device_ids) shifted_features[~shift_mask] = np.nan time_offsets = padded_times[start_row:end_row] - times sign_log_time_offsets = np.sign(time_offsets) * np.log10( np.abs(time_offsets)) sign_log_time_offsets[~shift_mask] = np.nan all_features[shift_id, :, 1] = sign_log_time_offsets all_features[shift_id, :, num_add_features:] = shifted_features # Add the weighted mean distance within the window (excluding the centered fn) all_features[num_neighbors, :, 2] = np.nan dist_surrounding_sum = np.nansum(all_features[:, :, 2], 0, keepdims=True) all_features[:, :, 0] = all_features[:, :, 2] / dist_surrounding_sum # Convert the features and targets to a flat dataframe df_cols = {} for i in range(num_keep_first_data_cols): col = data.columns[i] df_cols[col] = data[col].values df_cols['device_id'] = data.device_id.values df_cols['plot_time'] = data.plot_time.values no_middle_segments = data.num_waypoints.values <= 3 for k in target_cols: target_col_id = np.where( np.array(feature_cols) == k)[0][0] + (num_add_features) target_vals = 
np.copy(all_features[num_neighbors, :, target_col_id]) if fn_mode == 'first_middle_last' and k[:6] == 'middle': target_vals[no_middle_segments] = np.nan df_cols[k + '_target'] = target_vals for c_id, c in enumerate(all_feature_cols): for shift_id, shift in enumerate( range(-num_neighbors, num_neighbors + 1)): if shift != 0: col_name = c + str(shift) df_cols[col_name] = all_features[shift_id, :, c_id] combined = pd.DataFrame(df_cols) combined.to_csv(save_path, index=False)
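
# The sign-log transform above compresses neighbor time offsets that span
# many orders of magnitude while preserving their direction. A minimal,
# self-contained sketch of that transform (illustrative only; the helper
# name is made up and not part of this module):
def _demo_sign_log_offset():
  import numpy as np
  offsets = np.array([-3600000., -200., 500., 86400000.])  # ms before/after
  compressed = np.sign(offsets) * np.log10(np.abs(offsets))
  # Past observations map to negative values, future ones to positive values;
  # e.g. an offset of one hour (3.6e6 ms) becomes roughly -6.56 or +6.56.
  return compressed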
def run(mode="test", consider_multiprocessing=True, overwrite_output=False): print("Non-parametric WiFi model") models_group_name = 'non_parametric_wifi' overwrite_models = True recompute_grouped_data = not True # config = { # 'min_train_points': 10, # Ignore bssid with few observations # 'min_train_fns': 1, # Ignore bssid with few trajectories # 'delay_decay_penalty_exp_base': 0.62, # Base for bssid weight decay as a f of delay to compute the shared bssid fraction # 'inv_fn_count_penalty_exp': 0.1, # Exponent to give more weight to rare bssids to compute the shared bssid fraction # 'non_shared_penalty_start': 1.0, # Threshold below which the shared wifi fraction gets penalized in the distance calculation # 'non_shared_penalty_exponent': 2.2, # Exponent to penalize the non shared wifi fraction # 'non_shared_penalty_constant': 75, # Multiplicative constant to penalize the non shared wifi fraction # 'delay_decay_exp_base': 0.925, # Base for shared bssid weight decay as a f of delay # 'inv_fn_count_distance_exp': 0.1, # Exponent to give more weight to rare bssids to compute the weighted mean distance # 'unique_model_frequencies': False, # Discard bssid's with changing freqs # 'time_range_max_strength': 3, # Group wifi observations before and after each observation and retain the max strength # 'limit_train_near_waypoints': not True, # Similar to "snap to grid" - You likely want to set this to False eventually to get more granular predictions # } config = { 'min_train_points': 5, # Ignore bssid with few observations 'min_train_fns': 1, # Ignore bssid with few trajectories 'delay_decay_penalty_exp_base': 0.8, # Base for bssid weight decay as a f of delay to compute the shared bssid fraction 'inv_fn_count_penalty_exp': 0.0, # Exponent to give more weight to rare bssids to compute the shared bssid fraction 'non_shared_penalty_start': 1.0, # Threshold below which the shared wifi fraction gets penalized in the distance calculation 'non_shared_penalty_exponent': 2.0, # Exponent to penalize the non shared wifi fraction 'non_shared_penalty_constant': 50, # Multiplicative constant to penalize the non shared wifi fraction 'delay_decay_exp_base': 0.92, # Base for shared bssid weight decay as a f of delay 'inv_fn_count_distance_exp': 0.2, # Exponent to give more weight to rare bssids to compute the weighted mean distance 'unique_model_frequencies': False, # Discard bssid's with changing freqs 'time_range_max_strength': 1e-5, # Group wifi observations before and after each observation and retain the max strength 'limit_train_near_waypoints': False # Similar to "snap to grid" - You likely want to set this to False eventually to get more granular predictions } debug_floor = [None, 16][0] debug_fn = [None, '5dd374df44333f00067aa198'][0] store_all_wifi_predictions = False store_full_wifi_predictions = not config[ 'limit_train_near_waypoints'] # Required for the current combined optimization only_public_test_preds = False reference_submission_ext = 'non_parametric_wifi - valid - 2021-03-30 091444.csv' bogus_test_floors_to_train_all_test_models = False test_override_floors = False data_folder = utils.get_data_folder() summary_path = data_folder / 'file_summary.csv' stratified_holdout_path = data_folder / 'holdout_ids.csv' leaderboard_types_path = data_folder / 'leaderboard_type.csv' preds_folder = data_folder.parent / 'Models' / models_group_name / ( 'predictions') pathlib.Path(preds_folder).mkdir(parents=True, exist_ok=True) if store_full_wifi_predictions: file_ext = models_group_name + ' - ' + mode + ' - full 
distances.pickle' full_predictions_path = preds_folder / file_ext if full_predictions_path.is_file() and (not overwrite_output): return reference_submission_path = data_folder / reference_submission_ext df = pd.read_csv(summary_path) holdout_df = pd.read_csv(stratified_holdout_path) test_waypoint_times = utils.get_test_waypoint_times(data_folder) test_floors = utils.get_test_floors( data_folder, debug_test_floor_override=test_override_floors) leaderboard_types = pd.read_csv(leaderboard_types_path) test_type_mapping = { fn: t for (fn, t) in zip(leaderboard_types.fn, leaderboard_types['type']) } reference_submission = pd.read_csv(reference_submission_path) assert store_full_wifi_predictions == ( not config['limit_train_near_waypoints']) if bogus_test_floors_to_train_all_test_models and mode == 'test': print( "WARNING: bogus shuffling of test floors to train all floor models" ) test_floors = utils.get_test_floors(data_folder) site_floors = df.iloc[df.test_site.values].groupby( ['site_id', 'text_level']).size().reset_index() site_floors['level'] = [ utils.TEST_FLOOR_MAPPING[t] for t in (site_floors.text_level) ] site_floors['num_test_counts'] = 0 first_floor_fns = {s: [] for s in np.unique(site_floors.site_id)} repeated_floor_fns = {s: [] for s in np.unique(site_floors.site_id)} for fn in test_floors: site = df.site_id[df.fn == fn].values[0] increment_row = np.where((site_floors.site_id == site) & ( site_floors.level == test_floors[fn]))[0][0] site_floors.loc[increment_row, 'num_test_counts'] += 1 if site_floors.num_test_counts.values[increment_row] > 1: repeated_floor_fns[site].append(fn) else: first_floor_fns[site].append(fn) non_visited_floor_ids = np.where(site_floors.num_test_counts == 0)[0] for i, non_visited_id in enumerate(non_visited_floor_ids): site = site_floors.site_id.values[non_visited_id] if repeated_floor_fns[site]: override_fn = repeated_floor_fns[site].pop() else: override_fn = first_floor_fns[site].pop() test_floors[override_fn] = site_floors.level.values[non_visited_id] # Verify that now all floors contain at least one test fn site_floors['num_test_counts'] = 0 for fn in test_floors: site = df.site_id[df.fn == fn].values[0] increment_row = np.where((site_floors.site_id == site) & ( site_floors.level == test_floors[fn]))[0][0] site_floors.loc[increment_row, 'num_test_counts'] += 1 if debug_fn is not None: debug_fn_row = np.where(df.fn.values == debug_fn)[0][0] debug_fn_site = df.site_id.values[debug_fn_row] debug_fn_level = df.text_level.values[debug_fn_row] site_floors = df.iloc[df.test_site.values].groupby( ['site_id', 'text_level']).size().reset_index() debug_floor = np.where((site_floors.site_id == debug_fn_site) & ( site_floors.text_level == debug_fn_level))[0][0] use_multiprocessing = consider_multiprocessing and (debug_fn is None) and ( debug_floor is None) all_outputs = non_parametric_wifi_utils.multiple_floors_train_predict( config, df, debug_floor, reference_submission, use_multiprocessing, models_group_name, mode, holdout_df, test_floors, recompute_grouped_data, overwrite_models, test_type_mapping, only_public_test_preds, test_waypoint_times, store_all_wifi_predictions, store_full_wifi_predictions, debug_fn) test_preds = { k: v for d in [o[0] for o in all_outputs] for k, v in d.items() } valid_preds = [r for l in [o[1] for o in all_outputs] for r in l] all_wifi_predictions = [r for l in [o[2] for o in all_outputs] for r in l] full_wifi_predictions = dict(ChainMap(*[o[3] for o in all_outputs if o[3]])) Path(preds_folder).mkdir(parents=True, exist_ok=True) if 
store_full_wifi_predictions: with open(full_predictions_path, 'wb') as handle: pickle.dump(full_wifi_predictions, handle, protocol=pickle.HIGHEST_PROTOCOL) if mode == 'test': submission = utils.convert_to_submission(data_folder, test_preds) submission_ext = models_group_name + ' - test.csv' submission.to_csv(preds_folder / submission_ext, index=False) elif debug_floor is None: preds_df = pd.DataFrame(valid_preds) print(f"Mean validation error: {preds_df.error.values.mean():.2f}") preds_path = preds_folder / (models_group_name + ' - valid.csv') preds_df.to_csv(preds_path, index=False) if store_all_wifi_predictions: all_wifi_preds_df = pd.DataFrame(all_wifi_predictions) all_wifi_preds_df.sort_values(["site", "fn", "time"], inplace=True) preds_path = preds_folder / (models_group_name + ' - all wifi validation.csv') all_wifi_preds_df.to_csv(preds_path, index=False) holdout_unweighted = np.sqrt(preds_df.squared_error.values).mean() print(f"Holdout unweighted aggregate loss: {holdout_unweighted:.2f}")
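
# Illustrative sketch (not part of the pipeline): how an exponential delay
# decay like 'delay_decay_exp_base' above behaves. The exact formula lives
# in non_parametric_wifi_utils; this only assumes the common
# weight = base ** delay_in_seconds form suggested by the config comments.
def _demo_delay_decay(base=0.92):
  import numpy as np
  delays_s = np.array([0., 1., 5., 10., 30.])  # observation age in seconds
  weights = base**delays_s
  # With base 0.92, a 10 s old observation keeps ~43% of its weight and a
  # 30 s old one ~8%, so stale bssid readings contribute little.
  return weights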
def get_data_folder():
    data_folder = './dataset'
    hostname = socket.gethostname()
    if hostname.startswith('ubuntu'):
        data_folder = '/dev/shm/finegrained/'
        args.checkpoint_dir = '/home/user/winycg/accv_checkpoint/'
    elif hostname.startswith('winycgv1'):
        data_folder = '/dev/shm/'
        args.checkpoint_dir = '/home/user/hhd/winycg/accv_checkpoint/'
    args.checkpoint_dir = os.path.join(args.checkpoint_dir, log_dir)
    if not os.path.isdir(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)
    return data_folder


loaders = get_dataloaders(get_data_folder(), args)
trainloader = loaders['train']
valloader = loaders['val']
testloader = loaders['test']
print('Number of train dataset: ', len(trainloader.dataset))
print('Number of validation dataset: ', len(valloader.dataset))
print('Number of test dataset: ', len(testloader.dataset))
num_classes = trainloader.dataset.num_classes
print('Number of classes: ', num_classes)
C, H, W = trainloader.dataset[0][0].size()

# ---------------------------------------------------------------------------
# Model
print('==> Building model..')
def run(): print("Summarizing all waypoint locations") only_process_test_sites = True write_all_wifi_data = False data_folder = utils.get_data_folder() summary_path = data_folder / "file_summary.csv" combined_waypoints_path = data_folder / "train_waypoints_timed.csv" if combined_waypoints_path.is_file(): return # combined_train_wifi_times_path = data_folder / "train_wifi_times.csv" # combined_test_wifi_times_path = data_folder / "test_wifi_times.csv" stratified_holdout_path = data_folder / 'holdout_ids.csv' combined_all_wifi_folder = data_folder / 'train' df = pd.read_csv(summary_path) holdout = pd.read_csv(stratified_holdout_path) # Loop over all file paths and compare the parquet and pickle files one by one all_waypoints = [] all_train_wifi_times = [] all_test_wifi_times = [] all_wifi_data = [] for i in range(df.shape[0]): # if i < 26900: # continue print(f"Trajectory {i+1} of {df.shape[0]}") if (not only_process_test_sites or df.test_site[i]) and df.num_wifi[i] > 0: pickle_path = data_folder / ( str(Path(df.ext_path[i]).with_suffix("")) + "_reshaped.pickle") with open(pickle_path, "rb") as f: trajectory = pickle.load(f) if df['mode'][i] != 'test': waypoints = trajectory['waypoint'] num_waypoints = waypoints.shape[0] # Add meta columns for c in ['site_id', 'mode', 'fn', 'text_level']: waypoints[c] = df[c][i] # Add whether it is a train or validation trajectory waypoints['mode'] = holdout['mode'][holdout.fn == df.fn[i]].values[0] # Add the waypoint type waypoint_types = np.repeat('middle', num_waypoints) waypoint_types[0] = 'first' waypoint_types[num_waypoints - 1] = 'last' waypoints['type'] = waypoint_types waypoints['id'] = np.arange(num_waypoints) waypoints['num_waypoints'] = num_waypoints # Add the most recent wifi times that are closest to the waypoint # timestamps wifi_t1_times = np.unique(trajectory['wifi'].t1_wifi) assert np.all(np.diff(wifi_t1_times) > 0) wifi_last_t2_times = trajectory['wifi'].groupby( 't1_wifi')['t2_wifi'].aggregate("max").values num_wifi_obs = trajectory['wifi'].groupby( 't1_wifi')['t1_wifi'].aggregate("count").values try: assert wifi_t1_times.size == wifi_last_t2_times.size assert np.sum(np.diff(wifi_last_t2_times) < -1) <= 1 assert np.all(wifi_last_t2_times < wifi_t1_times) or ( df['mode'][i] == 'test') except: import pdb pdb.set_trace() x = 1 if df['mode'][i] != 'test': waypoint_wifi_times = np.zeros(num_waypoints, dtype=np.int64) for j in range(num_waypoints): wifi_id = max( 0, (wifi_last_t2_times <= waypoints.time[j]).sum() - 1) waypoint_wifi_times[j] = wifi_last_t2_times[wifi_id] waypoints['last_wifi_t2_time'] = waypoint_wifi_times waypoints['trajectory_wifi_time'] = waypoint_wifi_times - ( wifi_t1_times[0]) waypoint_times = waypoints.time.values waypoints['trajectory_waypoint_time'] = waypoint_times - ( waypoint_times[0]) waypoints['first_waypoint_time'] = waypoint_times[0] # Reorder the columns cols = waypoints.columns.tolist() reordered_cols = cols[:1] + cols[4:] + cols[1:4] waypoints = waypoints[reordered_cols] all_waypoints.append(waypoints) if write_all_wifi_data: wifi_data = trajectory['wifi'].copy() for c in ['site_id', 'mode', 'fn', 'level']: wifi_data[c] = df[c][i] cols = wifi_data.columns.tolist() reordered_cols = cols[6:] + cols[:6] wifi_data = wifi_data[reordered_cols] if 'wifi_waypoints' in trajectory: wifi_wp = trajectory['wifi_waypoints'] wifi_wp.sort_values(["t1_wifi", "t2_wifi"], ascending=[True, False], inplace=True) wifi_wp_map = wifi_wp.groupby(['t1_wifi' ]).first().reset_index()[[ 't1_wifi', 'waypoint_interp_x', 'waypoint_interp_y' 
]] wifi_data = wifi_data.merge(wifi_wp_map, on='t1_wifi') else: wifi_data['waypoint_interp_x'] = np.nan wifi_data['waypoint_interp_y'] = np.nan all_wifi_data.append(wifi_data) wifi_times = pd.DataFrame({ 'site_id': df['site_id'][i], 'mode': df['mode'][i], 'fn': df['fn'][i], 'level': df['level'][i], 'wifi_t1_times': wifi_t1_times, 'wifi_last_t2_times': wifi_last_t2_times, 'trajectory_index': np.arange(wifi_last_t2_times.size), 'num_wifi_obs': num_wifi_obs, }) if df['mode'][i] == 'test': wifi_times['first_last_t2_time'] = wifi_last_t2_times[0] all_test_wifi_times.append(wifi_times) else: wifi_times['first_waypoint_time'] = waypoint_times[0] all_train_wifi_times.append(wifi_times) # Write the combined waypoints to disk combined_waypoints = pd.concat(all_waypoints) combined_waypoints.sort_values(["site_id", "first_waypoint_time", "time"], inplace=True) combined_waypoints.to_csv(combined_waypoints_path, index=False) # # Write the combined wifi times to disk # combined_train_wifi_times = pd.concat(all_train_wifi_times) # combined_train_wifi_times.sort_values( # ["site_id", "first_waypoint_time", "wifi_t1_times"], inplace=True) # combined_train_wifi_times.to_csv(combined_train_wifi_times_path, index=False) # combined_test_wifi_times = pd.concat(all_test_wifi_times) # combined_test_wifi_times.sort_values( # ["site_id", "first_last_t2_time", "wifi_t1_times"], inplace=True) # combined_test_wifi_times.to_csv(combined_test_wifi_times_path, index=False) # Write the raw wifi data to disk if write_all_wifi_data: test_floors = utils.get_test_floors(data_folder) combined_all_wifi = pd.concat(all_wifi_data) combined_all_wifi.sort_values(["site_id", "fn", "mode"], inplace=True) all_levels = [ l if m != 'test' else test_floors[fn] for (l, m, fn) in zip(combined_all_wifi.level, combined_all_wifi['mode'], combined_all_wifi.fn) ] combined_all_wifi['level'] = np.array(all_levels) sites = np.sort(np.unique(combined_all_wifi.site_id.values)) for site_id, site in enumerate(sites): print(f"Site {site_id+1} of {len(sites)}") combined_all_wifi_site = combined_all_wifi[ combined_all_wifi.site_id.values == site] # Map the levels from a reference submission for the test data levels = np.sort(np.unique(combined_all_wifi_site.level.values)) for l in levels: combined_all_wifi_floor = combined_all_wifi_site[ combined_all_wifi_site.level.values == l] combined_all_wifi_floor.sort_values(["mode", "t1_wifi"], inplace=True) text_level = df.text_level[ df.fn == combined_all_wifi_floor.fn.values[-1]].values[0] combined_all_wifi_path = combined_all_wifi_folder / site / text_level / ( 'all_wifi.csv') combined_all_wifi_floor.to_csv(combined_all_wifi_path, index=False)
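
# The waypoint-to-wifi matching above ((wifi_last_t2_times <= t).sum() - 1)
# picks the most recent wifi scan at or before each waypoint time. A minimal
# sketch of that lookup on toy data (illustrative only; the helper name is
# made up):
def _demo_last_scan_lookup():
  import numpy as np
  scan_end_times = np.array([1000, 3000, 5000])  # sorted t2 times (ms)
  waypoint_time = 3500
  wifi_id = max(0, (scan_end_times <= waypoint_time).sum() - 1)
  # Equivalent to np.searchsorted(scan_end_times, waypoint_time, 'right') - 1
  # clipped at 0; here it selects index 1, i.e. the scan ending at 3000 ms.
  return scan_end_times[wifi_id]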
def run(mode): print("Processing time leak (edge trajectories)") debug_site = [None, 0][0] use_multiprocessing = False test_preds_source = 'test - 2021-05-15 051944.csv' test_override_floors = False data_folder = utils.get_data_folder() test_override_ext = '_floor_override' if (mode == 'test' and test_override_floors) else '' save_path = data_folder / (mode + '_edge_positions_v3' + test_override_ext + '.csv') if save_path.is_file(): return summary_path = data_folder / 'file_summary.csv' test_preds_path = data_folder / 'submissions' / test_preds_source stratified_holdout_path = data_folder / 'holdout_ids.csv' device_id_path = data_folder / 'device_ids.pickle' ordered_device_time_path = data_folder / 'inferred_device_ids.csv' with open(device_id_path, 'rb') as f: device_ids = pickle.load(f) public_private_test_leaks = { 'ff141af01177f34e9caa7a12': ('start', 3, 203.11885, 97.310814), 'f973ee415265be4addc457b1': ('start', -1, 20.062187, 99.66188), '23b4c8eb4b41d75946285461': ('end', 2, 60.205635, 102.28055), '5582270fcaee1f580de9006f': ('end', 0, 97.8957, 28.9133), 'b51a662297b90657f0b03b44': ('start', 1, 112.39258, 233.72379), } df = pd.read_csv(summary_path) holdout_df = pd.read_csv(stratified_holdout_path) test_floors = utils.get_test_floors( data_folder, debug_test_floor_override=test_override_floors) test_preds = pd.read_csv(test_preds_path) test_preds = utils.override_test_floor_errors( test_preds, debug_test_floor_override=test_override_floors) test_preds['fn'] = [ spt.split('_')[1] for spt in test_preds.site_path_timestamp ] test_preds['timestamp'] = [ int(spt.split('_')[2]) for spt in test_preds.site_path_timestamp ] for test_fn in test_floors: assert test_preds.floor[test_preds.fn == test_fn].values[0] == test_floors[test_fn] device_time_path = pd.read_csv(ordered_device_time_path) device_time_path['time'] = device_time_path['start_time'] test_rows = np.where(device_time_path['mode'].values == "test")[0] device_time_path.loc[ test_rows, 'time'] = device_time_path['first_last_wifi_time'].values[test_rows] device_time_path.sort_values(['device_id', 'time'], inplace=True) sites = df.iloc[df.test_site.values].groupby(['site_id' ]).size().reset_index() if debug_site is not None: sites = sites.iloc[debug_site:(debug_site + 1)] sites = sites.site_id.values if use_multiprocessing: with mp.Pool(processes=mp.cpu_count() - 1) as pool: results = [ pool.apply_async(extract_floor_start_end, args=(data_folder, s, df, holdout_df, test_preds, device_time_path, mode, device_ids, public_private_test_leaks)) for s in sites ] all_outputs = [p.get() for p in results] else: all_outputs = [] for site_id, analysis_site in enumerate(sites): print(f"Processing site {site_id+1} of {len(sites)}") all_outputs.append( extract_floor_start_end(data_folder, analysis_site, df, holdout_df, test_preds, device_time_path, mode, device_ids, public_private_test_leaks)) # Save the combined results combined = pd.concat(all_outputs) combined.to_csv(save_path, index=False)
def run(): print("Combining sensor data") only_process_test_sites = True sensor_cols = ['time', 'acce', 'gyro', 'ahrs'] data_folder = utils.get_data_folder() save_folder = data_folder / 'sensor_data' pathlib.Path(save_folder).mkdir(parents=True, exist_ok=True) summary_path = data_folder / 'file_summary.csv' source_submission = 'submission_cost_minimization.csv' submission_folder = data_folder / 'submissions' submission = pd.read_csv(submission_folder / source_submission) submission = utils.override_test_floor_errors(submission) sample_sub_fns = np.array( [sps.split('_')[1] for sps in (submission.site_path_timestamp)]) sample_sub_times = np.array( [int(sps.split('_')[2]) for sps in (submission.site_path_timestamp)]) holdout_df = pd.read_csv(data_folder / 'holdout_ids.csv') df = pd.read_csv(summary_path) # Overwrite the validation data mode validation_fns = set(holdout_df.fn.values[holdout_df.holdout]) df['mode'] = [ 'valid' if fn in validation_fns else m for (fn, m) in zip(df['fn'].values, df['mode'].values) ] def rotate_by_angles(orig, angles): rotated = np.zeros_like(orig) for i, theta in enumerate(angles): o = orig[i] c, s = np.cos(theta), np.sin(theta) r = np.matrix([[c, s], [-s, c]]) rotated[i] = np.dot(r, o) return rotated # Combine all the between waypoint sub-trajectories into a single file for each # data mode target_sensor_cols = None for mode in np.unique(df['mode'].values): # mode = 'valid' waypoint_mapping = {} sub_df = df.iloc[np.where(df['mode'] == mode)[0]] if only_process_test_sites: sub_df = sub_df[sub_df.test_site.values] save_ext = '' else: save_ext = '_all_sites' save_path = save_folder / (mode + save_ext + '.pickle') if save_path.is_file(): continue print(mode) for fn_id, (fn, site, floor) in enumerate( zip(sub_df.fn, sub_df.site_id, sub_df.text_level)): #print(fn_id) path_ext = fn + '_reshaped.pickle' if mode == 'test': data_path = data_folder / mode / path_ext sub_fn_ids = np.where(sample_sub_fns == fn)[0] waypoint_times = sample_sub_times[sub_fn_ids] floor_int = submission.floor.values[sub_fn_ids[0]] waypoints = None relative_waypoint_movement_1 = None relative_waypoint_distances = None relative_waypoint_movement_2 = None else: try: floor_int = utils.TEST_FLOOR_MAPPING[floor] except: print(f"Failed {fn_id}") continue data_path = data_folder / 'train' / site / floor / path_ext try: with open(data_path, 'rb') as f: file_data = pickle.load(f) except: print(f"Failed {fn_id}") continue if mode != 'test': waypoint_times = file_data['waypoint'].time.values waypoints = file_data['waypoint'] waypoint_pos = waypoints[['x_waypoint', 'y_waypoint']].values relative_waypoint_movement_1 = np.diff(waypoint_pos, axis=0) rel_angles = np.angle(relative_waypoint_movement_1[:, 0] + 1j * (relative_waypoint_movement_1[:, 1])) relative_waypoint_movement_2 = rotate_by_angles( relative_waypoint_movement_1[1:], rel_angles[:-1]) relative_waypoint_distances = np.sqrt( (relative_waypoint_movement_1**2).sum(1)) num_waypoints = waypoint_times.size # Chunk out the waypoint segments waypoint_segments = [] fractions_time_covered = [] shared_time = file_data['shared_time'] share_time_vals = shared_time.time.values for i in range(num_waypoints - 1): start_time = waypoint_times[i] end_time = waypoint_times[i + 1] if target_sensor_cols is None: target_sensor_cols = [ c for c in shared_time.columns if any([sc in c for sc in sensor_cols]) ] start_row = max(0, (share_time_vals <= start_time).sum() - 1) end_row = min(share_time_vals.size, (share_time_vals < end_time).sum() + 1) time_range = end_time - 
start_time covered_time = min(end_time, share_time_vals[end_row - 1]) - max( start_time, share_time_vals[start_row]) fractions_time_covered.append(covered_time / time_range) # if fn == '5dc8e91a17ffdd0006f12ce0' and i == 0: # import pdb; pdb.set_trace() # x=1 waypoint_segments.append(shared_time.iloc[np.arange( start_row, end_row)]) # if fn == '5dc8e91a17ffdd0006f12ce0': # import pdb; pdb.set_trace() # x=1 # import pdb; pdb.set_trace() waypoint_mapping[fn] = { 'site': site, 'floor': floor_int, 'num_waypoints': num_waypoints, 'waypoints': waypoints, 'waypoint_times': waypoint_times, 'fractions_time_covered': np.array(fractions_time_covered), 'waypoint_segments': waypoint_segments, 'relative_waypoint_movement_1': relative_waypoint_movement_1, 'relative_waypoint_distances': relative_waypoint_distances, 'relative_waypoint_movement_2': relative_waypoint_movement_2, } # Save the combined mapping to disk with open(save_path, 'wb') as handle: pickle.dump(waypoint_mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
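
# rotate_by_angles above expresses each movement step in the frame of the
# previous step's heading, so relative_waypoint_movement_2 captures turns
# rather than absolute directions. A small self-contained check of that
# rotation convention (illustrative only; the helper name is made up):
def _demo_rotate_by_angles():
  import numpy as np
  step = np.array([0., 2.])        # movement straight "north"
  heading = np.pi / 2              # previous step also pointed north
  c, s = np.cos(heading), np.sin(heading)
  rotated = np.array([[c, s], [-s, c]]).dot(step)
  # Rotating by the previous heading maps the step onto ~[2, 0]: pure
  # forward motion, i.e. no turn between the two steps.
  return rotated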
def run():
  only_process_test_sites = True

  data_folder = utils.get_data_folder()
  sensor_folder = data_folder / 'sensor_data'
  device_id_path = data_folder / 'device_ids.pickle'
  try:
    with open(device_id_path, 'rb') as f:
      device_ids = pickle.load(f)
    print("Extracting segment meta data (2/2)")
  except FileNotFoundError:
    device_ids = None
    print("Extracting segment meta data (1/2)")

  device_ext = '_no_device' if device_ids is None else ''
  save_ext = '' if only_process_test_sites else '_all_sites'
  save_path = sensor_folder / ('meta' + save_ext + device_ext + '.csv')
  if save_path.is_file():
    return device_ids is None

  summary_path = data_folder / 'file_summary.csv'
  df = pd.read_csv(summary_path)
  leaderboard_types_path = data_folder / 'leaderboard_type.csv'
  leaderboard_types = pd.read_csv(leaderboard_types_path)
  test_type_mapping = {
      fn: t
      for (fn, t) in zip(leaderboard_types.fn, leaderboard_types['type'])
  }

  # Combine all the sub-trajectory meta data
  all_sub_trajectories = []
  for mode in ['test', 'train', 'valid']:
    print(mode)
    load_path = sensor_folder / (mode + save_ext + '.pickle')
    with open(load_path, 'rb') as f:
      combined_mode = pickle.load(f)

    for fn in combined_mode:
      t = combined_mode[fn]
      site = t['site']
      level = t['floor']
      text_level = df.text_level.values[np.where(
          (df.site_id == site) & (df.level == level))[0][0]]
      num_waypoints = t['num_waypoints']
      waypoint_times = t['waypoint_times']
      sub_durations = np.diff(waypoint_times)
      waypoint_segments = t['waypoint_segments']
      relative_movements = t['relative_waypoint_movement_1']
      for i in range(num_waypoints - 1):
        segment_time = waypoint_segments[i].time.values
        sensor_time_diff = np.diff(segment_time)
        start_time_offset = segment_time[0] - waypoint_times[i]
        end_time_offset = segment_time[-1] - waypoint_times[i + 1]
        mean_robust_sensor_time_diff = sensor_time_diff[
            (sensor_time_diff >= 19) & (sensor_time_diff <= 21)].mean()
        if mode == 'test':
          distance_covered = None
          test_type = test_type_mapping[fn]
          plot_time = df.first_last_wifi_time.values[np.where(
              df.fn.values == fn)[0][0]]
        else:
          distance_covered = np.sqrt((relative_movements[i]**2).sum())
          test_type = ''
          plot_time = waypoint_times[i]
        all_sub_trajectories.append({
            'mode': mode,
            'site': site,
            'level': level,
            'text_level': text_level,
            'fn': fn,
            'device_id': None if device_ids is None else device_ids[fn][0],
            'device_id_merged': None if device_ids is None else (
                device_ids[fn][2]),
            'test_type': test_type,
            'plot_time': plot_time,
            'start_time': waypoint_times[i],
            'end_time': waypoint_times[i + 1],
            'sub_trajectory_id': i,
            'num_waypoints': num_waypoints,
            'duration': sub_durations[i],
            'num_obs': segment_time.size,
            'start_time_offset': start_time_offset,
            'end_time_offset': end_time_offset,
            'mean_sensor_time_diff': sensor_time_diff.mean(),
            'mean_robust_sensor_time_diff': mean_robust_sensor_time_diff,
            'min_sensor_time_diff': sensor_time_diff.min(),
            'max_sensor_time_diff': sensor_time_diff.max(),
            'distance_covered': distance_covered,
        })

  combined = pd.DataFrame(all_sub_trajectories)
  combined.to_csv(save_path, index=False)

  return device_ids is None
def run(mode): print("Model the sensor uncertainty") fn_mode = ['mean', 'joined_middle_last', 'first_middle_last'][0] skip_unbias_models = not True overwrite_models = True max_train_folds = [1, None][1] additional_feature_cols = ['num_waypoints'] params = { 'objective': 'regression', 'learning_rate': 0.005, 'extra_trees': True, 'num_leaves': 40, 'n_estimators': int(1e3), 'max_depth': -1, 'min_child_samples': 1, 'colsample_bynode': 0.4, 'subsample_freq': 1, 'subsample': 0.8, 'metric': 'rmse', 'verbose': -1, 'n_jobs': 1 } data_folder = utils.get_data_folder() model_folder = data_folder.parent / 'Models' / 'correct_sensor_preds' save_ext = '' if fn_mode == 'mean' else ' ' + fn_mode predict_ext = mode + save_ext + '.csv' save_folder = model_folder / 'predictions' pathlib.Path(save_folder).mkdir(parents=True, exist_ok=True) save_path = save_folder / predict_ext if save_path.is_file(): return load_ext = '' if fn_mode == 'mean' else ' first_middle_last' data_path = model_folder / (mode + load_ext + '.csv') data = pd.read_csv(data_path) target_cols = [c for c in data.columns if c[-7:] == '_target'] last_non_feature_col = target_cols[-1] if fn_mode == 'joined_middle_last': target_cols = [c[6:] for c in target_cols if c[:5] == 'first'] last_non_feature_id = np.where(data.columns == last_non_feature_col)[0][0] feature_cols = additional_feature_cols + data.columns.tolist()[ (last_non_feature_id + 1):] non_feature_cols = data.columns.tolist()[:(last_non_feature_id + 1)] def prepare_data(data, mode, fn_mode, target_cols, feature_cols=None): sub = data[(data['mode'] == mode)] if fn_mode == 'joined_middle_last': nrow = sub.shape[0] orig_sub = sub.copy() sub = pd.concat([sub, sub, sub]) sub.index = np.arange(sub.shape[0]) for c in target_cols: sub[c] = np.concatenate([ orig_sub['first_' + c].values, orig_sub['middle_mean_' + c].values, orig_sub['last_' + c].values, ]) sub['segment_type'] = np.repeat(np.arange(3), nrow) if feature_cols is not None: feature_cols.append('segment_type') return sub folds = data.train_fold.values[data['mode'] == 'train'].astype(np.int32) unique_folds = np.sort(np.unique(folds)) if max_train_folds is not None: unique_folds = unique_folds[:max_train_folds] num_folds = unique_folds.size predict_rows = np.where(data['mode'].values == mode)[0] predict_data = prepare_data(data, mode, fn_mode, target_cols, feature_cols) predict_features = predict_data[feature_cols].values combined = {k: data[k].values[predict_rows] for k in non_feature_cols} for target_col_id, target_col in enumerate(target_cols): unbias_target = not 'abs_' in target_col if not unbias_target or not skip_unbias_models: print( f'\nTarget {target_col_id+1} of {len(target_cols)}: {target_col}' ) predict_targets = predict_data[target_col].values predict_fold_preds = {} predict_fold_preds_l = [] for f_id, f in enumerate(unique_folds): print(f"Fold {f_id+1} of {num_folds}") model_path = model_folder / (target_col + ' - fold ' + str( int(f)) + save_ext + '.pickle') if mode == 'valid' and (not model_path.is_file() or overwrite_models): train_data = prepare_data(data, 'train', fn_mode, target_cols) train_data = train_data[train_data['train_fold'] != f] train_features = train_data[feature_cols].values train_targets = train_data[target_col].values non_nan_train_targets = ~np.isnan(train_targets) model = lgb.LGBMRegressor(**params) model = model.fit(train_features[non_nan_train_targets], train_targets[non_nan_train_targets], verbose=1) with open(model_path, "wb") as handle: pickle.dump(model, handle, 
protocol=pickle.HIGHEST_PROTOCOL) else: with open(model_path, "rb") as file: model = pickle.load(file) preds = model.predict(predict_features) predict_fold_preds[target_col + '_fold_' + str(f)] = preds predict_fold_preds_l.append(preds) avg_fold_pred = np.stack(predict_fold_preds_l, -1).mean(-1) predict_fold_preds[target_col + '_avg_fold'] = avg_fold_pred if mode == 'valid': if unbias_target: original_avg_abs_error = np.abs(predict_targets).mean() debiased_avg_abs_error = np.abs(predict_targets - avg_fold_pred).mean() print(f"Orig abs error: {original_avg_abs_error:.3f};\ Debiased abs error: {debiased_avg_abs_error:.3f}") else: abs_err_correlation = np.corrcoef( np.stack([predict_targets, avg_fold_pred]))[0, 1] print(f"Abs error correlation: {abs_err_correlation:.3f}") if fn_mode == 'joined_middle_last': orig_keys = list(predict_fold_preds.keys()) for k in orig_keys: for st_id, st in enumerate( ['first_', 'middle_mean_', 'last_']): start_index = st_id * predict_rows.size end_index = (st_id + 1) * predict_rows.size predict_fold_preds[ st + k] = predict_fold_preds[k][start_index:end_index] for k in orig_keys: del predict_fold_preds[k] combined.update(predict_fold_preds) combined_df = pd.DataFrame(combined) combined_df.to_csv(save_path, index=False)
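
# The validation metric above compares the raw mean absolute target (no
# correction) with the mean absolute residual after subtracting the
# fold-averaged predictions. A toy illustration of that debiasing check
# (made-up numbers, not from the pipeline):
def _demo_debiasing_metric():
  import numpy as np
  targets = np.array([0.10, 0.12, 0.08])        # systematic positive bias
  avg_fold_pred = np.array([0.09, 0.11, 0.10])  # fold-averaged estimates
  original = np.abs(targets).mean()                  # 0.100
  debiased = np.abs(targets - avg_fold_pred).mean()  # ~0.013
  # A useful bias model drives the debiased error well below the original.
  return original, debiased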
def run(overwrite_summary=False):
  print("Extracting metadata")
  data_folder = utils.get_data_folder()
  file_summaries = []
  sample_submission = pd.read_csv(
      data_folder / "submissions" / "sample_submission.csv")
  sample_submission_counts = {}
  all_sites = []
  for s in sample_submission.site_path_timestamp:
    all_sites.append(s.split("_")[0])
    file_name = s.split("_")[1]
    if file_name in sample_submission_counts:
      sample_submission_counts[file_name] += 1
    else:
      sample_submission_counts[file_name] = 1
  test_sites = list(set(all_sites))

  summary_path = data_folder / "file_summary.csv"
  if summary_path.is_file() and not overwrite_summary:
    df = pd.read_csv(summary_path)
  else:
    for mode in ["train", "test"]:
      main_folder = data_folder / mode
      main_data_folders_or_files = sorted(main_folder.iterdir())
      if mode == "train":
        # Loop over all train data and extract the site ID
        for sub_folder in main_data_folders_or_files:
          sub_sub_folders = sorted(sub_folder.iterdir())
          sub_sub_folders = [
              s for s in sub_sub_folders if not s.suffix == ".pickle"
          ]
          site_id = None
          for sub_sub_path in sub_sub_folders:
            sub_sub_files = sorted(sub_sub_path.iterdir())
            sub_sub_files = [s for s in sub_sub_files if s.suffix == ".txt"]
            for file_path in sub_sub_files:
              print(f"{len(file_summaries)+1} of 27549")
              file_summary, site_id, complete_file = get_file_summary(
                  file_path,
                  site_id,
                  mode,
                  sub_folder,
                  sub_sub_path,
                  file_path,
                  None,
                  test_sites,
                  data_folder,
              )
              if complete_file:
                # The file train/5cd56b83e2acfd2d33b5cab0/B2/
                # 5cf72539e9d9c9000852f45b.txt seems cut short
                file_summaries.append(file_summary)
      else:
        main_data_folders_or_files = [
            s for s in main_data_folders_or_files if s.suffix == ".txt"
        ]
        for file_path in main_data_folders_or_files:
          site_id = None
          print(f"{len(file_summaries)+1} of 27549")
          file_summary, site_id, _ = get_file_summary(
              file_path, site_id, mode, None, None, file_path,
              sample_submission_counts, test_sites, data_folder)
          file_summaries.append(file_summary)

    df = pd.DataFrame(file_summaries)
    df = df.astype({
        "num_test_waypoints": "Int64",
        "num_train_waypoints": "Int64",
        "level": "Int64",
        "first_last_wifi_time": "Int64",
    })

  # # Potential subsequent run of the script
  # if not 'text_level' in df.columns:
  #   df['text_level'] = None
  #   for i in range(df.shape[0]):
  #     print(i)
  #     if df['mode'][i] == 'train':
  #       text_level = df['ext_path'][i].split('/')[2]
  #       df.loc[i, 'text_level'] = text_level

  df.to_csv(summary_path, index=False)
def test_get_data_folder(self):
    data_folder = utils.get_data_folder()
    self.assertEqual(data_folder, "data/")
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras import layers, callbacks
from tensorflow.python.keras.utils.vis_utils import plot_model

from utils import get_loss
from utils import TEST_SITES
from utils import get_data_folder

N_SPLITS = 3
SITE_IDX = None
N_TOP_BSSIDS = 20
BATCH_SIZE = 128
OVERWRITE = True

data_folder = get_data_folder()
summary_path = data_folder / "file_summary.csv"
stratified_holdout_path = data_folder / "holdout_ids.csv"
# Using https://www.kaggle.com/hiro5299834/indoor-navigation-and-location-wifi-features
pivot_data_folder = data_folder / "pivot"

holdout_df = pd.read_csv(stratified_holdout_path)
if SITE_IDX is None:
  sites = TEST_SITES
  pivot_paths = [pivot_data_folder / f"{site}_train.csv" for site in sites]
  holdout_df = holdout_df[holdout_df["test_site"]]
else:
  analysis_site = TEST_SITES[SITE_IDX]
  sites = [analysis_site]
  holdout_df = holdout_df[holdout_df["site_id"] == analysis_site]
def run(): print("Creating validation set") require_valid_waypoints_in_train = True max_valid_unique_fraction = 0.4 max_valid_unique_count_per_trajectory = 15 prob_allow_hardest_trajectories = 0.15 min_waypoints_holdout = 6 # Reflect the test data - only put long trajectories in the holdout set holdout_fraction = 0.08 np.random.seed(14) data_folder = utils.get_data_folder() summary_path = data_folder / 'file_summary.csv' stratified_holdout_path = data_folder / 'holdout_ids.csv' if not stratified_holdout_path.is_file(): df = pd.read_csv(summary_path) if require_valid_waypoints_in_train: train_waypoints, waypoint_counts = utils.get_train_waypoints( data_folder, df) sites = sorted(set(df.site_id)) valid_trajectory_seen_train = [] floor_dfs = [] for s in sites: floors = sorted( set(df.text_level[(df.site_id == s).values & (df['mode'] == 'train').values])) for f in floors: floor_df = df.iloc[(df.site_id == s).values & (df.text_level == f).values] if require_valid_waypoints_in_train: try: floor_int = utils.TEST_FLOOR_MAPPING[f] except: floor_int = utils.NON_TEST_FLOOR_MAPPING[f] floor_waypoints = train_waypoints.iloc[ (train_waypoints.site_id.values == s) & (train_waypoints.level.values.astype(np.float32) == floor_int)] floor_waypoint_counts = { (str(k[2]), str(k[3])): v for k, v in waypoint_counts.items() if k[0] == s and (k[1] == floor_int) } considered_seq_ids = [] fn_sorted_ids = np.argsort( -floor_df.num_train_waypoints.values) for i, fn in enumerate(floor_df.fn.values[fn_sorted_ids]): fn_waypoints = floor_waypoints[floor_waypoints.fn == fn] if fn_waypoints.shape[0] == 0: continue if fn_waypoints.shape[ 0] < min_waypoints_holdout and len( considered_seq_ids) > 0: break waypoint_counts_fn = np.array([ floor_waypoint_counts[(str(x), str(y))] for x, y in zip(fn_waypoints.x.values, fn_waypoints.y.values) ]) this_waypoint_counts = np.zeros(fn_waypoints.shape[0]) waypoint_vals = np.stack( [fn_waypoints.x.values, fn_waypoints.y.values]) for j in range(fn_waypoints.shape[0]): this_waypoint_counts[j] = ( (waypoint_vals[0] == waypoint_vals[0, j]) & (waypoint_vals[1] == waypoint_vals[1, j])).sum() unseen_normalized = (waypoint_counts_fn > this_waypoint_counts) / ( this_waypoint_counts) non_unique_count = unseen_normalized.sum() total_count = (1 / this_waypoint_counts).sum() unique_fraction = 1 - (non_unique_count / total_count) unique_count = np.round(total_count - non_unique_count) # if (unique_fraction <= max_valid_unique_fraction) and not ( # unique_count <= max_valid_unique_count_per_trajectory): # import pdb; pdb.set_trace() # x=1 if (unique_fraction <= max_valid_unique_fraction) and ( unique_count <= max_valid_unique_count_per_trajectory) or ( np.random.uniform() < prob_allow_hardest_trajectories): considered_seq_ids.append(fn_sorted_ids[i]) for x, y in zip(fn_waypoints.x.values, fn_waypoints.y.values): floor_waypoint_counts[(str(x), str(y))] -= 1 if len(considered_seq_ids) == 0: if floor_df.test_site.values[0]: raise ValueError( "No valid validation trajectories selected") considered_seq_ids = [fn_sorted_ids[0]] considered_seq_ids = np.array(considered_seq_ids) else: considered_seq_ids = np.where( floor_df.num_train_waypoints.values >= ( min(min_waypoints_holdout, floor_df.num_train_waypoints.max())))[0] # if (s, f) == ('5d2709a003f801723c3251bf', '3F'): # import pdb; pdb.set_trace() num_holdout_trajectories = min( considered_seq_ids.size, max(1, int(floor_df.shape[0] * holdout_fraction))) if num_holdout_trajectories == considered_seq_ids.size: holdout_ids = considered_seq_ids else: probs = 
floor_df.num_train_waypoints.values[ considered_seq_ids]**1 probs = probs / probs.sum() holdout_ids = np.random.choice(considered_seq_ids, num_holdout_trajectories, replace=False, p=probs) floor_waypoint_counts = {(str(k[2]), str(k[3])): v for k, v in waypoint_counts.items() if k[0] == s and (k[1] == floor_int)} for fn in floor_df.fn.values[holdout_ids]: fn_waypoints = floor_waypoints[floor_waypoints.fn == fn] for x, y in zip(fn_waypoints.x.values, fn_waypoints.y.values): floor_waypoint_counts[(str(x), str(y))] -= 1 num_train_waypoints = floor_df.num_train_waypoints.values test_site = floor_df.test_site.values floor_df = floor_df.iloc[:, :4] floor_df.index = np.arange(floor_df.shape[0]) floor_df['text_level'] = f floor_df['holdout'] = False floor_df.loc[holdout_ids, 'holdout'] = True floor_df.loc[holdout_ids, 'mode'] = 'valid' floor_df['num_train_waypoints'] = num_train_waypoints floor_df['test_site'] = test_site for valid_fn in floor_df.fn.values[floor_df.holdout.values]: fn_waypoints = floor_waypoints[floor_waypoints.fn == valid_fn][['x', 'y']].values if df.test_site.values[(df.site_id == s)][0]: waypoints_in_train = np.array([ floor_waypoint_counts[(str( fn_waypoints[i, 0]), str(fn_waypoints[i, 1]))] > 0 for i in (range(fn_waypoints.shape[0])) ]) valid_trajectory_seen_train.append( (waypoints_in_train.shape[0], waypoints_in_train.sum())) # if s == '5d2709c303f801723c3299ee' and f == '1F': # import pdb; pdb.set_trace() # x=1 floor_dfs.append(floor_df) valid_trajectory_seen_train = np.array(valid_trajectory_seen_train) entire_traj_in_train = ( valid_trajectory_seen_train[:, 0] == valid_trajectory_seen_train[:, 1]) entire_waypoints_fraction = valid_trajectory_seen_train[ entire_traj_in_train, 0].sum() / valid_trajectory_seen_train[:, 0].sum() print( entire_traj_in_train.mean(), entire_waypoints_fraction.mean(), valid_trajectory_seen_train[:, 1].sum() / valid_trajectory_seen_train[:, 0].sum()) combined_df = pd.concat(floor_dfs) combined_df.to_csv(stratified_holdout_path, index=False)
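
# The holdout filter above keeps a trajectory only if most of its waypoints
# also occur in other trajectories on the floor. A toy computation of
# unique_fraction / unique_count for a 3-waypoint trajectory, mirroring the
# code above (counts are made up, illustrative only):
def _demo_unique_fraction():
  import numpy as np
  counts_all = np.array([3., 1., 2.])   # floor-wide occurrences per waypoint
  counts_this = np.array([1., 1., 1.])  # occurrences within this trajectory
  unseen_normalized = (counts_all > counts_this) / counts_this
  non_unique_count = unseen_normalized.sum()  # 2.0 waypoints are shared
  total_count = (1 / counts_this).sum()       # 3.0 effective waypoints
  unique_fraction = 1 - non_unique_count / total_count      # ~0.33
  unique_count = np.round(total_count - non_unique_count)   # 1 unique waypoint
  return unique_fraction, unique_count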
def get_embedded_description_file_name():
    embedded_description_file_name = (
        get_data_folder() + get_embedding_file_name_prefix() + 'features.npy')
    return embedded_description_file_name
def get_steam_inventory_file_name(profile_id):
    steam_inventory_file_name = (
        get_data_folder() + 'inventory_' + str(profile_id) + '.json')
    return steam_inventory_file_name
def get_embedding_app_id_file_name():
    embedding_app_id_file_name = (
        get_data_folder() + get_embedding_file_name_prefix() + 'appids.txt')
    return embedding_app_id_file_name
def read_all_recs(df):
    base_path = utils.get_data_folder()
    return [
        read_one_rec(rec, base_path)
        for _, rec in tqdm(df.iterrows(), total=len(df))
    ]
"5da1382d4db8ce0c98bbe92e", # 14 - Well aligned, no waypoints inside buildings "5da138314db8ce0c98bbf3a0", # 15 - Well aligned, occasional waypoints at edge of buildings "5da138364db8ce0c98bc00f1", # 16 - Well aligned, occasional waypoints at edge of buildings "5da1383b4db8ce0c98bc11ab", # 17 - Well aligned, no waypoints inside buildings "5da138754db8ce0c98bca82f", # 18 - Well aligned, waypoints often at edges of buildings, sometimes inside buildings "5da138764db8ce0c98bcaa46", # 19 - Well aligned, no waypoints inside buildings "5da1389e4db8ce0c98bd0547", # 20 - Well aligned, no waypoints inside buildings. Some open areas seem unaccessible "5da138b74db8ce0c98bd4774", # 21 - Well aligned, no waypoints inside buildings "5da958dd46f8266d0737457b", # 22 - Well aligned, rare waypoints inside buildings "5dbc1d84c1eb61796cf7c010", # 23 - Well aligned, no waypoints inside buildings "5dc8cea7659e181adb076a3f", # 24 - Well aligned, no waypoints inside buildings ][4] level_filter = [None, "F1", -1][2] target_fn = [None, "5dd5290c50e04e0006f5651e", "5dd5216b50e04e0006f56476"][0] data_folder = utils.get_data_folder() summary_path = data_folder / "file_summary.csv" if not "df" in locals(): df = pd.read_csv(summary_path) considered_ids = np.where(df["mode"] == "train")[0] if only_consider_test_sites: considered_ids = considered_ids[df.test_site[considered_ids]] if site_filter is not None: considered_ids = considered_ids[df.site_id[considered_ids] == site_filter] if level_filter is not None: if isinstance(level_filter, int): considered_ids = considered_ids[df.level[considered_ids] == level_filter] else: considered_ids = considered_ids[df.text_level[considered_ids] ==
def run(mode): print("Calculating the sensor uncertainty for unseen trajectories") save_uncertainties = True distance_model_dist_estimates = True apply_angle_correction_threshold = 0 # >= 0.4 means no corrections apply_distance_correction_threshold = 0 # >= 0.4 means no corrections predictions_ext = mode + '.csv' source_valid_test_predictions = [ 'relative_movement_v3_valid.csv', 'relative_movement_v3_test.csv' ] source_valid_test_dist_predictions = [ 'distance_valid.csv', 'distance_test.csv' ] fn_mode = 'first_middle_last' if 'first_middle_last' in predictions_ext else ( 'joined_middle_last' if 'joined_middle_last' in predictions_ext else 'mean') data_folder = utils.get_data_folder() model_folder = data_folder.parent / 'Models' / 'correct_sensor_preds' uncertainty_path = model_folder / 'predictions' / ('uncertainty - ' + predictions_ext) if uncertainty_path.is_file(): return source_pred_folder = data_folder.parent / 'Models' / ( 'sensor_absolute_movement') / 'predictions' source_ext = source_valid_test_predictions[int(mode == 'test')] source_path = source_pred_folder / source_ext preds = pd.read_csv(source_path) source_dist_pred_folder = data_folder.parent / 'Models' / ( 'sensor_distance') / 'predictions' source_dist_ext = source_valid_test_dist_predictions[int(mode == 'test')] source_dist_path = source_dist_pred_folder / source_dist_ext dist_preds = pd.read_csv(source_dist_path) corrections_path = model_folder / 'predictions' / predictions_ext corrections = pd.read_csv(corrections_path) uncertainties = pd.read_csv(source_path) fns = np.sort(np.unique(preds.fn.values)) preds['distance'] = np.sqrt(preds['x']**2 + preds['y']**2) if distance_model_dist_estimates: assert np.all(dist_preds.fn.values == preds.fn.values) preds['pred_distance'] = dist_preds.pred.values else: preds['pred_distance'] = np.sqrt(preds['x_pred']**2 + preds['y_pred']**2) preds['pred_distance_corrected'] = preds['pred_distance'] preds['x_pred_corrected'] = preds['x_pred'] preds['y_pred_corrected'] = preds['y_pred'] uncertainties['distance_uncertainty'] = np.nan uncertainties['pred_distance_uncertainty'] = np.nan uncertainties['angle_uncertainty'] = np.nan uncertainties['pred_angle_uncertainty'] = np.nan uncertainty_map_mean = [ ('mean_abs_rel_dist_error_target', 'distance_uncertainty'), ('mean_abs_rel_dist_error_target_avg_fold', 'pred_distance_uncertainty'), ('mean_abs_angle_error_target', 'angle_uncertainty'), ('mean_abs_angle_error_target_avg_fold', 'pred_angle_uncertainty'), ] uncertainty_map_first = [ ('first_abs_rel_dist_error_target', 'distance_uncertainty'), ('first_abs_rel_dist_error_target_avg_fold', 'pred_distance_uncertainty'), ('first_abs_angle_error_target', 'angle_uncertainty'), ('first_abs_angle_error_target_avg_fold', 'pred_angle_uncertainty'), ] uncertainty_map_middle = [ ('middle_mean_abs_rel_dist_error_target', 'distance_uncertainty'), ('middle_mean_abs_rel_dist_error_target_avg_fold', 'pred_distance_uncertainty'), ('middle_mean_abs_angle_error_target', 'angle_uncertainty'), ('middle_mean_abs_angle_error_target_avg_fold', 'pred_angle_uncertainty'), ] uncertainty_map_last = [ ('last_abs_rel_dist_error_target', 'distance_uncertainty'), ('last_abs_rel_dist_error_target_avg_fold', 'pred_distance_uncertainty'), ('last_abs_angle_error_target', 'angle_uncertainty'), ('last_abs_angle_error_target_avg_fold', 'pred_angle_uncertainty'), ] for fn in fns: pred_rows = np.where(preds.fn.values == fn)[0] uncertainty_rows = np.where(uncertainties.fn.values == fn)[0] correct_row = np.where(corrections.fn.values == 
fn)[0][0] if fn_mode == 'mean': if 'mean_rel_dist_error_target_avg_fold' in corrections.columns: mean_correction = corrections[ 'mean_rel_dist_error_target_avg_fold'].values[correct_row] if np.abs( mean_correction) > apply_distance_correction_threshold: preds.loc[pred_rows, 'pred_distance_corrected'] = preds[ 'pred_distance'].values[pred_rows] * (1 + mean_correction) if 'mean_angle_error_target_avg_fold' in corrections.columns: angle_correction = corrections[ 'mean_angle_error_target_avg_fold'].values[correct_row] if np.abs(angle_correction) > apply_angle_correction_threshold: x = preds.x_pred.values[pred_rows] y = preds.y_pred.values[pred_rows] preds.loc[pred_rows, 'x_pred_corrected'] = np.cos( angle_correction) * x + np.sin(angle_correction) * y preds.loc[pred_rows, 'y_pred_corrected'] = -np.sin( angle_correction) * x + np.cos(angle_correction) * y for k1, k2 in uncertainty_map_mean: v = corrections[k1].values[correct_row] uncertainties.loc[uncertainty_rows, k2] = v else: first_correction = corrections[ 'first_rel_dist_error_target_avg_fold'].values[correct_row] middle_correction = corrections[ 'middle_mean_rel_dist_error_target_avg_fold'].values[ correct_row] last_correction = corrections[ 'last_rel_dist_error_target_avg_fold'].values[correct_row] if np.abs(first_correction) > apply_distance_correction_threshold: preds.loc[ pred_rows[0], 'pred_distance_corrected'] = preds['pred_distance'].values[ pred_rows[0]] * (1 + first_correction) if np.abs(middle_correction) > apply_distance_correction_threshold: preds.loc[ pred_rows[1:-1], 'pred_distance_corrected'] = preds['pred_distance'].values[ pred_rows[1:-1]] * (1 + middle_correction) if np.abs(last_correction) > apply_distance_correction_threshold: preds.loc[ pred_rows[-1], 'pred_distance_corrected'] = preds['pred_distance'].values[ pred_rows[-1]] * (1 + last_correction) for k1, k2 in uncertainty_map_first: v = corrections[k1].values[correct_row] uncertainties.loc[uncertainty_rows[0], k2] = v for k1, k2 in uncertainty_map_middle: v = corrections[k1].values[correct_row] uncertainties.loc[uncertainty_rows[1:-1], k2] = v for k1, k2 in uncertainty_map_last: v = corrections[k1].values[correct_row] uncertainties.loc[uncertainty_rows[-1], k2] = v preds['pred_error'] = np.abs((preds['distance'] - preds['pred_distance'])) preds['corrected_error'] = np.abs( (preds['distance'] - preds['pred_distance_corrected'])) preds['rel_pred_dist_error'] = ( preds['distance'] - preds['pred_distance']) / (preds['pred_distance']) preds['rel_corrected_dist_error'] = (preds['distance'] - preds['pred_distance_corrected']) / ( preds['pred_distance_corrected']) orig_abs_err = (np.abs(preds['x'] - preds['x_pred']) + np.abs(preds['y'] - preds['y_pred'])).mean() / 2 corrected_abs_err = ( np.abs(preds['x'] - preds['x_pred_corrected']) + np.abs(preds['y'] - preds['y_pred_corrected'])).mean() / 2 mean_orig_rel_dist_err = np.abs(preds['rel_pred_dist_error'].values).mean() mean_corrected_rel_dist_err = np.abs( preds['rel_corrected_dist_error'].values).mean() rel_err = mean_corrected_rel_dist_err / mean_orig_rel_dist_err print(f'Corrected relative error rate: {rel_err:.3f}') orig_dist_mae = preds.pred_error.values.mean() corrected_dist_mae = preds.corrected_error.values.mean() print(f'Original MAE: {orig_dist_mae:.3f}; Corrected MAE:\ {corrected_dist_mae:.3f}') print(f'Rel move original MAE: {orig_abs_err:.3f}; Corrected rel move MAE:\ {corrected_abs_err:.3f}') changed_fraction = (preds['pred_error'] != preds['corrected_error']).mean() improved_fraction = (preds['pred_error'] > 
preds['corrected_error'] ).mean() / ((changed_fraction + 1e-9)) print(f'Improved distance pred fraction: {improved_fraction:.3f}') dist_uncertainty_cor = np.corrcoef( np.stack([ uncertainties['pred_distance_uncertainty'].values, uncertainties['distance_uncertainty'].values, ]))[0, 1] angle_uncertainty_cor = np.corrcoef( np.stack([ uncertainties['pred_angle_uncertainty'].values, uncertainties['angle_uncertainty'].values, ]))[0, 1] print(f'Distance uncertainty correlation: {dist_uncertainty_cor:.3f}') print(f'Angle uncertainty correlation: {angle_uncertainty_cor:.3f}') uncertainties.plot.scatter('pred_distance_uncertainty', 'distance_uncertainty') uncertainties = uncertainties[[ 'site', 'floor', 'fn', 'sub_trajectory_id', 'num_waypoints', 'distance_uncertainty', 'pred_distance_uncertainty', 'angle_uncertainty', 'pred_angle_uncertainty', ]] if save_uncertainties: uncertainty_path = model_folder / 'predictions' / ('uncertainty - ' + predictions_ext) uncertainties.to_csv(uncertainty_path, index=False)
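
# Note on the angle correction in run() above: a positive angle_correction
# rotates the predicted relative movement (x, y) clockwise, i.e. by
# -angle_correction in the usual counter-clockwise convention:
#   x' =  cos(a) * x + sin(a) * y
#   y' = -sin(a) * x + cos(a) * y
# A minimal, self-contained sketch of that rotation; the helper name is
# illustrative and not part of the pipeline:
def _rotate_movement_example(x, y, angle_correction):
  """Apply the same clockwise rotation used for the *_pred_corrected columns."""
  x_rot = np.cos(angle_correction) * x + np.sin(angle_correction) * y
  y_rot = -np.sin(angle_correction) * x + np.cos(angle_correction) * y
  return x_rot, y_rot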
import itertools
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from scipy.interpolate import interp2d

try:
  from utils import get_data_folder, TEST_FLOOR_MAPPING
except ImportError:
  import sys
  sys.path.append('..')
  from utils import get_data_folder, TEST_FLOOR_MAPPING

DEFAULT_WAYPOINT_PATH: Path = get_data_folder(
) / "waypoints" / "waypoint_by_hand.csv"


def generate_grid_points(site, floor, bottom_left, top_left, bottom_right,
                         top_right, N_h, N_v):
  # Bilinearly interpolate between the four floor corners: (u, v) in the
  # unit square maps to (x, y) in floor coordinates.
  X = np.array([0, 0, 1, 1], dtype=np.float32)
  Y = np.array([0, 1, 0, 1], dtype=np.float32)
  Zx, Zy = np.stack([bottom_left, top_left, bottom_right, top_right], axis=1)
  fx = interp2d(X, Y, Zx)
  fy = interp2d(X, Y, Zy)

  U = np.linspace(0, 1, N_h)
  V = np.linspace(0, 1, N_v)
  W = np.array([[fx(u, v), fy(u, v)] for u, v in itertools.product(U, V)])
  W = np.squeeze(W)
  return W
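
# scipy.interpolate.interp2d is deprecated in recent SciPy releases (removed
# in 1.14). A hedged sketch of the same bilinear corner interpolation with
# RegularGridInterpolator, assuming the corner convention used above; the
# helper name is illustrative only:
from scipy.interpolate import RegularGridInterpolator


def _bilinear_corner_interp_example(bottom_left, top_left, bottom_right,
                                    top_right, u, v):
  """Map (u, v) in the unit square to floor (x, y) coordinates."""
  # Shape (2, 2, 2): first axis is u (left -> right), second is v
  # (bottom -> top), last holds the (x, y) corner coordinates.
  corners = np.array([[bottom_left, top_left], [bottom_right, top_right]],
                     dtype=np.float64)
  grid = (np.array([0., 1.]), np.array([0., 1.]))
  fx = RegularGridInterpolator(grid, corners[..., 0])
  fy = RegularGridInterpolator(grid, corners[..., 1])
  pt = np.array([[u, v]])
  return float(fx(pt)), float(fy(pt))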
def get_all_floor_preds(mode, config, site, site_id, data_folder,
                        model_folder, df, analysis_floors, debug_fn,
                        verbose=True):
  site_floors = np.sort(
      np.unique(df.text_level[(df.site_id == site)
                              & (df['mode'] != 'test')].values))
  if debug_fn is not None and df.site_id.values[np.where(
      df.fn.values == debug_fn)[0][0]] != site:
    return None

  # Load the wifi models for all site floors
  models = {}
  for floor in site_floors:
    mode_prefix = 'test-' if mode == 'test' else ''
    model_path = model_folder / site / (mode_prefix + floor + '.pickle')
    with open(model_path, 'rb') as f:
      models[floor] = pickle.load(f)

  # Load all prediction inputs
  trajectories = []
  for floor_id, floor in enumerate(site_floors):
    if verbose:
      print(f"Load floor {floor_id+1} of {site_floors.size}")
    numeric_floor = utils.TEST_FLOOR_MAPPING[floor]
    site_df = df[(df.site_id == site) & (df.num_wifi > 0)]
    analysis_df = site_df[site_df['mode'] == mode]
    target_floors = np.array(
        [analysis_floors[fn] for fn in analysis_df['fn'].values])
    correct_floor = target_floors == numeric_floor
    analysis_df_floor = analysis_df[correct_floor]
    if analysis_df_floor.shape[0] > 0:
      test_floor = floor if mode == 'test' else None
      trajectories.extend(
          utils.load_site_floor(analysis_df_floor,
                                recompute_grouped_data=False,
                                test_floor=test_floor))

  # Generate predictions for all trajectories for all floors
  make_predict_efforts = [True for t in trajectories]
  analysis_preds = []
  for j, t in enumerate(trajectories):
    if verbose:
      print(f"Trajectory {j+1} of {len(trajectories)}")
    fn = t['file_meta'].fn
    if (debug_fn is not None) and fn != debug_fn:
      continue
    debug_floor_distances = {}
    for floor in site_floors:
      # Locate all unique wifi time observations
      _, full_pos_pred = predict_trajectory(t, make_predict_efforts[j],
                                            models[floor], True, config)
      distances = full_pos_pred.values[:, 2:]
      min_distances = distances.min(0)
      if (debug_fn is not None) and fn == debug_fn:
        print(debug_fn, floor)
        debug_floor_distances[floor] = min_distances
      analysis_preds.append({
          'site': site,
          'floor': floor,
          'numeric_floor': utils.TEST_FLOOR_MAPPING[floor],
          'reference_floor_label': analysis_floors[fn],
          'fn': fn,
          'min_min_distance': min_distances.min(),
          'mean_min_distance': min_distances.mean(),
          'max_min_distance': min_distances.max(),
          'min_distance_q0.1': np.quantile(min_distances, 0.1),
          'min_distance_q0.2': np.quantile(min_distances, 0.2),
          'min_distance_q0.3': np.quantile(min_distances, 0.3),
          'min_distance_q0.4': np.quantile(min_distances, 0.4),
          'min_distance_q0.5': np.quantile(min_distances, 0.5),
          'min_distance_q0.6': np.quantile(min_distances, 0.6),
          'min_distance_q0.7': np.quantile(min_distances, 0.7),
          'min_distance_q0.8': np.quantile(min_distances, 0.8),
          'min_distance_q0.9': np.quantile(min_distances, 0.9),
      })
    if (debug_fn is not None) and fn == debug_fn:
      combined_debug = pd.DataFrame(debug_floor_distances)
      debug_save_path = utils.get_data_folder() / (
          "all_floor_wifi_distances " + debug_fn + '.csv')
      combined_debug.to_csv(debug_save_path, index=True)

  print(f"Done with site {site_id+1} of 24: {site}")
  return pd.DataFrame(analysis_preds)
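
# The per-floor summary statistics above can be reduced to a floor estimate
# per trajectory, e.g. by picking the floor whose wifi model yields the
# smallest mean minimum distance. A hedged sketch of one plausible way to
# consume the frame returned by get_all_floor_preds (the helper name is
# illustrative, not part of the pipeline):
def _predict_floor_example(floor_preds: pd.DataFrame) -> pd.Series:
  """Return the best-scoring floor label for each fn."""
  best = floor_preds.loc[
      floor_preds.groupby('fn')['mean_min_distance'].idxmin()]
  return best.set_index('fn')['floor']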
def run(): print("Computing aggregate statistics") data_folder = utils.get_data_folder() processed_folder = data_folder / 'processed' pathlib.Path(processed_folder).mkdir(parents=True, exist_ok=True) last_processed_path = processed_folder / 'tests_level_id.p' if last_processed_path.is_file(): return df = pd.read_csv(data_folder / 'file_summary.csv') # start_time = time.time() recs = read_all_recs( #df[(df['test_site']) & (~df['num_train_waypoints'].isnull()) & (df['num_wifi'] > 0)] df[(df['test_site'])]) #print(len(recs)) #print(f"Done in {time.time()-start_time:8.5f}s") agg = {} for col in tqdm([ 'x_acce', 'y_acce', 'z_acce', 'x_gyro', 'y_gyro', 'z_gyro', 'x_magn', 'y_magn', 'z_magn', 'x_ahrs', 'y_ahrs', 'z_ahrs' ]): x = np.concatenate([r['shared_time'][col] for r in recs]) agg[col] = {"mean": x.mean(), "std": x.std()} for col in tqdm(['x_waypoint', 'y_waypoint']): x = np.concatenate( [r['waypoint'][col] for r in recs if 'waypoint' in r]) agg[col] = {"mean": x.mean(), "std": x.std()} x = np.concatenate([r['wifi']['rssid_wifi'] for r in recs if 'wifi' in r]) agg['wifi'] = { 'mean': x.mean(), "std": x.std(), "min": x.min(), "max": x.max() } for col in tqdm(['power_beac', 'rssi_beac']): x = np.concatenate([r['ibeacon'][col] for r in recs if 'ibeacon' in r]) agg[col] = { 'mean': x.mean(), "std": x.std(), "min": x.min(), "max": x.max() } agg['wifi']['max_records_per_t1'] = np.max([ r['wifi'].groupby('t1_wifi')['bssid_wifi'].size().max() for r in recs if 'wifi' in r ]) agg['wifi']['max_records_per_t2'] = np.max([ r['wifi'].groupby('t2_wifi')['bssid_wifi'].size().max() for r in recs if 'wifi' in r ]) agg['wifi']['max_unique_t1'] = np.max( [r['wifi']['t1_wifi'].nunique() for r in recs if 'wifi' in r]) agg['wifi']['max_unique_t2'] = np.max( [r['wifi']['t2_wifi'].nunique() for r in recs if 'wifi' in r]) agg['max_seq_len'] = np.max([len(r['shared_time']['time']) for r in recs]) with open(processed_folder / 'tests_stats.p', 'wb') as handle: pickle.dump(agg, handle, protocol=4) with open(processed_folder / 'tests_ssid_wifi.p', 'wb') as handle: pickle.dump(set( np.concatenate( [r['wifi']['ssid_wifi'] for r in recs if 'wifi' in r])), handle, protocol=4) with open(processed_folder / 'tests_bssid_wifi.p', 'wb') as handle: pickle.dump(set( np.concatenate( [r['wifi']['bssid_wifi'] for r in recs if 'wifi' in r])), handle, protocol=4) with open(processed_folder / 'tests_id_beac_1.p', 'wb') as handle: pickle.dump(set( np.concatenate( [r['ibeacon']['id_beac_1'] for r in recs if 'ibeacon' in r])), handle, protocol=4) with open(processed_folder / 'tests_id_beac_2.p', 'wb') as handle: pickle.dump(set( np.concatenate( [r['ibeacon']['id_beac_2'] for r in recs if 'ibeacon' in r])), handle, protocol=4) with open(processed_folder / 'tests_id_beac_3.p', 'wb') as handle: pickle.dump(set( np.concatenate( [r['ibeacon']['id_beac_3'] for r in recs if 'ibeacon' in r])), handle, protocol=4) with open(processed_folder / 'tests_mac_beac.p', 'wb') as handle: pickle.dump(set( np.concatenate( [r['ibeacon']['mac_beac'] for r in recs if 'ibeacon' in r])), handle, protocol=4) with open(processed_folder / 'tests_site_id.p', 'wb') as handle: pickle.dump(set([r['site_id'] for r in recs]), handle, protocol=4) with open(last_processed_path, 'wb') as handle: pickle.dump(set( [r['site_id'] + '_' + str(r['text_level']) for r in recs]), handle, protocol=4)
def test_get_data_folder(self):
  folder_name = utils.get_data_folder()
  self.assertGreater(len(folder_name), 0)
def run(mode, grid_type, consider_multiprocessing):
  print(f"Optimizing predictions for grid type {grid_type}")
  store_valid_submission = True
  store_extended_test = True
  debug_fn = [None, '58279d6ab8c2213722f2ef6b'][0]
  extensive_search = True
  additional_grid_multiprocessing = consider_multiprocessing
  consider_ignore_private_test = False
  grid_mode = ["standard", "dense"][int(grid_type == "dense_inner")]
  grid_settings = {
      "standard": {
          "min_distance_to_known": 3.0,
          "wall_point_distance_multiplier": 0.4,
          "inner_point_distance_multiplier": 0.7,
      },
      "dense": {
          "min_distance_to_known": 1.5,
          "wall_point_distance_multiplier": 0.2,
          "inner_point_distance_multiplier": 0.35,
      },
  }
  # V3: no inner grid fill; V4: inner grid fill + fixes
  grid_version = [3, 4][int(grid_type != "walls_only_old")]
  cheat_valid_waypoints = False

  config = {
      'top_distance_pos_wifi': 20,
      'weighted_pos_exponent': 4,
      'waypoint_weighted_wifi_penalties_mult': 0.8,
      'nn_wifi_exp': 1.5,
      'wifi_penalties_exp': 0.8,
      'time_leak_delay_cutoff': 15,
      'time_leak_time_decay_constant': 20,
      'time_leak_nearby_constant': 2,
      'time_leak_exact_constant': 5,
      'time_leak_distance_pen_limit_constant': 0.7,
      'time_leak_dissimilarity_decay': 15,
      'time_leak_max_penalty': 30,
      'distance_pen_constant': 30,
      'rel_movement_pos_constant': 0,  # Angle penalties are better!
      'rel_movement_angle_constant': 9,
      'abs_movement_pos_constant': 1.5,
      'cum_abs_movement_pos_constant': 1.0,
      'abs_movement_angle_constant': 0,  # Position penalties are better!
      'distance_uncertainty_exponent': 1.0,
      'abs_move_uncertainty_exponent': 1.0,
      'wifi_dir_constant': 0.5,
      'inject_waypoints': not cheat_valid_waypoints or mode == 'test',
      'off_grid_waypoint_penalty': 8,
      'off_grid_no_penalty_distance': 10,
      'addit_grid_density_penalty': 4,
      'min_distance_to_known':
          grid_settings[grid_mode]["min_distance_to_known"],
      'max_distance_to_known': 30.0,
      'generate_inner_waypoints': True,
      'generate_edge_waypoints': False,
      'wall_point_distance_multiplier':
          grid_settings[grid_mode]["wall_point_distance_multiplier"],
      'inner_point_distance_multiplier':
          grid_settings[grid_mode]["inner_point_distance_multiplier"],
      'considered_sensor_sig_keys_scale': [('z_ahrs', 0.1), ('z_magn', 1.0)],
      'top_distance_pos_sensor': 20,
      'magnetometer_penalty_constant': 0 * 1.0,
      'wall_penalty_constant': 0 * 5,
      # Expect a small boost of 0 to 0.05 when doing extensive_search
      'beam_1_width': (4000 if grid_mode == "very_dense" else 2000)
                      if extensive_search else 200,
      'beam_2_width_wifi': 20 if extensive_search else 10,
      'beam_2_width_abs_movement': 80 if extensive_search else 40,
  }

  unbias_distance_predictions = True
  drop_mislabeled_fn_list_valid = []
  test_override_floors = False
  # Multiprocessing can be used with large (> 1000) beam 1 widths
  use_multiprocessing = consider_multiprocessing and extensive_search and (
      debug_fn is None) and (grid_mode != "very_dense")
  ignore_private_test = consider_ignore_private_test and (debug_fn is None)
  valid_mode = mode == 'valid'

  if valid_mode:
    wifi_source = 'non_parametric_wifi - valid - full distances.pickle'
    sensor_distance_source = 'distance_valid.csv'
    sensor_relative_movement_source = 'relative_movement_v2_valid.csv'
    sensor_absolute_movement_source = 'relative_movement_v3_valid.csv'
    sensor_uncertainties_source = 'uncertainty - valid.csv'
    time_leak_source = 'valid_edge_positions_v3.csv'
  else:
    wifi_source = 'non_parametric_wifi - test - full distances.pickle'
    sensor_distance_source = 'distance_test.csv'
    sensor_relative_movement_source = 'relative_movement_v2_test.csv'
    sensor_absolute_movement_source = 'relative_movement_v3_test.csv'
    sensor_uncertainties_source = 'uncertainty - test.csv'
    time_leak_source = 'test_edge_positions_v3.csv'

  wifi_ref_source = wifi_source.replace(' - full distances',
                                        '').replace('pickle', 'csv')

  data_folder = utils.get_data_folder()
  waypoints_path = data_folder / 'train_waypoints_timed.csv'
  models_folder = Path(data_folder).parent / 'Models'
  wifi_preds_folder = models_folder / 'non_parametric_wifi' / 'predictions'
  storage_folder = Path(data_folder).parent / 'Combined predictions'
  pathlib.Path(storage_folder).mkdir(parents=True, exist_ok=True)
  submission_path = storage_folder / (mode + ' - ' + grid_type + '.csv')
  if submission_path.is_file():
    return

  wifi_preds_path = wifi_preds_folder / wifi_source
  source_preds_path = wifi_preds_folder / wifi_ref_source
  sensor_distance_folder = models_folder / 'sensor_distance' / 'predictions'
  sensor_distance_path = sensor_distance_folder / sensor_distance_source
  sensor_rel_movement_folder = models_folder / (
      'sensor_relative_movement') / 'predictions'
  sensor_abs_movement_folder = models_folder / (
      'sensor_absolute_movement') / 'predictions'
  sensor_rel_movement_path = sensor_rel_movement_folder / (
      sensor_relative_movement_source)
  sensor_abs_movement_path = sensor_abs_movement_folder / (
      sensor_absolute_movement_source)
  time_leak_source_path = data_folder / time_leak_source
  leaderboard_types_path = data_folder / 'leaderboard_type.csv'
  correct_sensor_preds_folder = models_folder / (
      'correct_sensor_preds') / 'predictions'
  sensor_uncertainties_path = correct_sensor_preds_folder / (
      sensor_uncertainties_source)
  sensor_segment_stats_source = data_folder / 'sensor_data' / 'meta.csv'
  walls_folder = data_folder / 'stashed_walls_intersection_count'
  waypoints_folder = data_folder / 'stashed_floor_additional_waypoints'
  pathlib.Path(waypoints_folder).mkdir(parents=True, exist_ok=True)

  # Load the raw data upon changing the data mode
  (loaded_mode, orig_source_preds, source_preds, sites, floors,
   unique_floor_waypoints, floor_waypoint_rel_pos_distances,
   floor_waypoint_wifi_distances, floor_waypoint_wifi_distances_order,
   leaderboard_types, time_leaks, wifi_preds_flat, original_preds,
   distance_preds, relative_movement_preds, absolute_movement_preds,
   sensor_preds_uncertainties, sensor_segment_stats, source_actual, fn_ids,
   w) = combine_predictions_beamsearch_utils.preprocess(
       config, mode, wifi_preds_path, source_preds_path, valid_mode,
       sensor_distance_path, sensor_rel_movement_path,
       sensor_abs_movement_path, time_leak_source_path, waypoints_path,
       leaderboard_types_path, cheat_valid_waypoints,
       sensor_uncertainties_path, sensor_segment_stats_source,
       waypoints_folder, additional_grid_multiprocessing,
       test_override_floors, grid_version)

  optimized_predictions, optimized_test_predictions = (
      combine_predictions_beamsearch_utils.combined_predictions_all_floors(
          mode, config, use_multiprocessing, distance_preds,
          relative_movement_preds, absolute_movement_preds,
          sensor_preds_uncertainties, source_preds, original_preds,
          source_actual, sensor_segment_stats, fn_ids, sites, floors,
          time_leaks, wifi_preds_flat, unique_floor_waypoints,
          floor_waypoint_rel_pos_distances, floor_waypoint_wifi_distances,
          floor_waypoint_wifi_distances_order, leaderboard_types,
          ignore_private_test, debug_fn, drop_mislabeled_fn_list_valid, w,
          walls_folder, unbias_distance_predictions))

  if valid_mode:
    optimized_predictions.sort_values(
        ["site", "floor", "fn", "waypoint_time"], inplace=True)
    err = optimized_predictions.after_optim_error.values
    optimized_error = err.mean()
    print(f"Optimized validation error: {optimized_error:.2f}")
    if debug_fn is None:
      best_opt_err = utils.get_best_opt_error(optimized_predictions)
      tr_mask = optimized_predictions.all_targets_on_waypoints.values
      tr_traj_opt_error = err[tr_mask].mean()
      tr_best_opt_error = best_opt_err[tr_mask].mean()
      non_tr_traj_opt_error = np.nan if cheat_valid_waypoints else (
          err[~tr_mask].mean())
      print(f"Group stats: {tr_traj_opt_error:.2f} "
            f"({tr_best_opt_error:.2f}); {non_tr_traj_opt_error:.2f}")
  else:
    non_predicted_ids = np.where(np.abs(original_preds).sum(1) == 0)[0]
    optimized_test_predictions = pd.concat([
        optimized_test_predictions,
        orig_source_preds.iloc[non_predicted_ids]
    ])
    original_rows = np.array([
        np.where(optimized_test_predictions.site_path_timestamp.values ==
                 sps)[0][0]
        for sps in orig_source_preds.site_path_timestamp
    ])
    optimized_test_predictions = optimized_test_predictions.iloc[
        original_rows]
    optimized_test_predictions.index = np.arange(
        optimized_test_predictions.shape[0])

  if (store_valid_submission or mode == 'test') and debug_fn is None:
    if valid_mode:
      optimized_predictions.to_csv(submission_path, index=False)
    else:
      optimized_test_predictions.to_csv(submission_path, index=False)
      if store_extended_test:
        submission_path_extended = storage_folder / (
            mode + ' - ' + grid_type + ' - extended.csv')
        optimized_predictions.to_csv(submission_path_extended, index=False)
def test_get_data_folder_v2(self):
  folder_name = utils.get_data_folder(version=2)
  self.assertEqual(folder_name, 'data_v2/')
def run(): print("Combining sensor models with device ids") data_folder = utils.get_data_folder() summary_path = data_folder / 'file_summary.csv' df = pd.read_csv(summary_path) model_folder = data_folder.parent / 'Models' / 'sensor_absolute_movement' absolute_fold_folder = model_folder / 'cv' distance_folder = model_folder.parent / 'sensor_distance' valid_path = model_folder / 'predictions' / 'relative_movement_v3_valid.csv' device_id_path = data_folder / 'device_ids.pickle' meta_sensor_path = data_folder / 'sensor_data' / 'meta.csv' with open(device_id_path, 'rb') as f: device_ids = pickle.load(f) meta_data = pd.read_csv(meta_sensor_path, dtype={'test_type': object}) ################################################# # A: Combine statistics at the trajectory level # ################################################# device_ids_path = data_folder / 'inferred_device_ids.csv' if not device_ids_path.is_file(): device_id_vals, device_drifts, device_id_merged_vals = zip( *list(device_ids.values())) device_ids_df = pd.DataFrame({ 'mode': [meta_data['mode'].values[np.where(meta_data.fn == fn)[0][ 0]] for fn in list(device_ids.keys())], 'test_type': [meta_data['test_type'].values[np.where(meta_data.fn == fn)[ 0][0]] for fn in list(device_ids.keys())], 'fn': list(device_ids.keys()), 'device_id': list(device_id_vals), 'device_id_drift': list(device_drifts), 'device_id_merged': list(device_id_merged_vals), 'site': [df.site_id.values[np.where(df.fn == fn)[0][0]] for fn in list( device_ids.keys())], 'floor': [df.level.values[np.where(df.fn == fn)[0][0]] for fn in list( device_ids.keys())], 'start_time': [meta_data.start_time.values[np.where(meta_data.fn == fn)[ 0][ 0]] for fn in list(device_ids.keys())], 'end_time': [meta_data.end_time.values[np.where(meta_data.fn == fn)[0][ -1]] for fn in list(device_ids.keys())], 'first_last_wifi_time': [df.first_last_wifi_time.values[ np.where(df.fn == fn)[0][0]] for fn in list(device_ids.keys())], }) device_ids_df.sort_values(['first_last_wifi_time', 'start_time'], inplace=True) device_ids_df.to_csv(device_ids_path, index=False) ############################################## # B: Combine statistics at the segment level # ############################################## save_path_device_errors = data_folder / 'sensor_model_device_errors.csv' if not save_path_device_errors.is_file(): train_preds_list = [] for i in range(5): preds = pd.read_csv(absolute_fold_folder / f'preds_bag_fold_{i}.csv') preds['train_fold'] = i train_preds_list.append(preds) train_preds = pd.concat(train_preds_list) train_preds['mode'] = 'train' valid_preds = pd.read_csv(valid_path) valid_preds['train_fold'] = np.nan valid_preds['mode'] = 'valid' test_preds = meta_data[meta_data['mode'] == 'test'] with pd.option_context('mode.chained_assignment', None): test_preds['x'] = np.nan test_preds['y'] = np.nan test_preds['x_pred'] = np.nan test_preds['y_pred'] = np.nan test_preds['train_fold'] = np.nan test_preds.rename(columns={"level": "floor"}, inplace=True) test_preds = test_preds.loc[:, valid_preds.columns] all_preds = pd.concat([train_preds, valid_preds, test_preds]) all_preds['device_id'] = -3 all_preds['device_id_drift'] = False all_preds['device_id_merged'] = -3 all_preds['error'] = -2 all_preds['start_time'] = -1 all_preds['end_time'] = -1 all_preds.index = np.arange(all_preds.shape[0]) for i in tqdm(range(all_preds.shape[0])): fn = all_preds.fn.values[i] mode = all_preds['mode'].values[i] if mode == 'test': error = np.nan else: error = np.sqrt((all_preds.x.values[i]-all_preds.x_pred.values[i])**2 
+ ( all_preds.y.values[i]-all_preds.y_pred.values[i])**2) sub_trajectory_id = all_preds.sub_trajectory_id.values[i] meta_row = np.where((meta_data.fn.values == fn) & ( meta_data.sub_trajectory_id.values == sub_trajectory_id))[0][0] start_time = meta_data.start_time.values[meta_row] end_time = meta_data.end_time.values[meta_row] df_row = np.where(df.fn == fn)[0][0] first_last_wifi_time = df.first_last_wifi_time.values[df_row] if np.isnan(first_last_wifi_time): assert mode == 'train' first_last_wifi_time = df.start_time.values[df_row] all_preds.loc[i, 'device_id'] = device_ids[fn][0] all_preds.loc[i, 'device_id_drift'] = device_ids[fn][1] all_preds.loc[i, 'device_id_merged'] = device_ids[fn][2] all_preds.loc[i, 'error'] = error all_preds.loc[i, 'start_time'] = start_time all_preds.loc[i, 'end_time'] = end_time all_preds.loc[i, 'first_last_wifi_time'] = first_last_wifi_time all_preds.sort_values([ 'device_id', 'first_last_wifi_time', 'sub_trajectory_id'], inplace=True) all_preds.to_csv(save_path_device_errors, index=False) save_path_fn_errors = data_folder / "fn_device_errors.csv" if not save_path_fn_errors.is_file(): device_errors = pd.read_csv(save_path_device_errors) device_errors.sort_values( ['site', 'fn', 'sub_trajectory_id'], inplace=True) device_errors.index = np.arange(device_errors.shape[0]) device_errors['new_device_id'] = [ True] + (device_errors.device_id.values[:-1] != ( device_errors.device_id.values[1:])).tolist() device_errors['dist'] = np.sqrt( device_errors.x.values**2 + device_errors.y.values**2) distance_cv_folder = distance_folder / 'cv' folds = [] for i in range(5): f = pd.read_csv(distance_cv_folder / ( "preds_bag_fold_" + str(i) + ".csv")) f['fold'] = i folds.append(f) combined_train_folds = pd.concat(folds) valid_preds = pd.read_csv(distance_folder / 'predictions' / ( "distance_valid.csv")) valid_preds.drop(valid_preds.columns[0], axis=1, inplace=True) valid_preds['fold'] = np.nan all_preds = pd.concat([combined_train_folds, valid_preds]) all_preds.sort_values( ['site', 'fn', 'sub_trajectory_id'], inplace=True) all_preds.index = np.arange(all_preds.shape[0]) device_errors['dist_pred'] = np.nan device_errors.loc[np.where(~np.isnan(device_errors.dist.values))[0], 'dist_pred'] = all_preds.pred.values device_errors['dist_error'] = device_errors.dist.values-( device_errors.dist_pred.values) device_errors['rel_dist_error'] = device_errors.dist_error.values/( device_errors.dist_pred.values) device_errors.sort_values(['fn', 'sub_trajectory_id'], inplace=True) device_errors.index = np.arange(device_errors.shape[0]) device_errors['rel_weight'] = np.concatenate( device_errors.groupby('fn').apply( lambda x: np.abs(x.dist.values)/np.abs(x.dist.values).sum())) device_errors['section'] = "Middle" device_errors.loc[np.where(device_errors.sub_trajectory_id.values == ( device_errors.num_waypoints.values-2))[0], 'section'] = "Last" device_errors.loc[np.where( device_errors.sub_trajectory_id.values == 0)[0], 'section'] = "First" device_errors['middle_weight_sums'] = np.concatenate( device_errors.groupby('fn').apply( lambda x: np.repeat(( x.rel_weight.values[x.section.values=="Middle"]).sum(), x.shape[0]).reshape(-1))) device_errors.sort_values( ['device_id', 'first_last_wifi_time', 'sub_trajectory_id'], inplace=True) device_errors.index = np.arange(device_errors.shape[0]) device_errors['rel_middle_weight'] = 0 middle_rows = np.where(device_errors.section.values == "Middle")[0] device_errors.loc[middle_rows, "rel_middle_weight"] = ( device_errors.rel_weight.values[middle_rows]/( 
device_errors.middle_weight_sums.values[middle_rows])) device_errors['angle_error'] = np.arctan2( device_errors.y_pred.values, device_errors.x_pred.values) - np.arctan2( device_errors.y.values, device_errors.x.values) change_rows = np.where((~np.isnan(device_errors.angle_error.values)) & ( device_errors.angle_error.values < np.pi))[0] device_errors.loc[change_rows, 'angle_error'] = ( device_errors.angle_error.values[change_rows] + 2*np.pi) change_rows = np.where((~np.isnan(device_errors.angle_error.values)) & ( device_errors.angle_error.values > np.pi))[0] device_errors.loc[change_rows, 'angle_error'] = ( device_errors.angle_error.values[change_rows] - 2*np.pi) def f(x): d = {} d['site'] = x['site'].values[0] d['floor'] = x['floor'].values[0] d['mode'] = x['mode'].values[0] d['train_fold'] = x['train_fold'].values[0] d['num_waypoints'] = x['num_waypoints'].values[0] d['total_dist'] = x['dist'].values.sum() d['mean_rel_dist_error'] = (x['rel_dist_error'].values*( x['rel_weight'].values)).sum() d['mean_abs_rel_dist_error'] = (np.abs(x['rel_dist_error'].values)*( x['rel_weight'].values)).sum() d['mean_angle_error'] = (x['angle_error'].values*( x['rel_weight'].values)).sum() d['mean_abs_angle_error'] = (np.abs(x['angle_error'].values)*( x['rel_weight'].values)).sum() d['first_rel_dist_error'] = x['rel_dist_error'].values[0] d['first_abs_rel_dist_error'] = np.abs(x['rel_dist_error'].values)[0] d['first_angle_error'] = x['angle_error'].values[0] d['first_abs_angle_error'] = np.abs(x['angle_error'].values)[0] d['middle_mean_rel_dist_error'] = (x['rel_dist_error'].values*( x['rel_weight'].values))[1:-1].sum() d['middle_mean_abs_rel_dist_error'] = (np.abs(x[ 'rel_dist_error'].values)*(x['rel_weight'].values))[1:-1].sum() d['middle_mean_angle_error'] = (x['angle_error'].values*( x['rel_weight'].values))[1:-1].sum() d['middle_mean_abs_angle_error'] = (np.abs(x['angle_error'].values)*( x['rel_weight'].values))[1:-1].sum() d['last_rel_dist_error'] = x['rel_dist_error'].values[-1] d['last_abs_rel_dist_error'] = np.abs(x['rel_dist_error'].values)[-1] d['last_angle_error'] = x['angle_error'].values[-1] d['last_abs_angle_error'] = np.abs(x['angle_error'].values)[-1] d['first_first_last_wifi_time'] = ( x['first_last_wifi_time'].values).min() d['time'] = (x['start_time'].values).min() d['device_id'] = (x['device_id'].values).min() return pd.Series(d, index=list(d.keys())) fn_dev_errors = device_errors.groupby('fn').apply(f).reset_index() fn_dev_errors['plot_time'] = fn_dev_errors['time'] fn_dev_errors.loc[np.where( fn_dev_errors['mode'].values == "test")[0], 'plot_time'] = ( fn_dev_errors.first_first_last_wifi_time.values[ np.where(fn_dev_errors['mode'] == "test")[0]]) fn_dev_errors['row'] = 1+np.arange(fn_dev_errors.shape[0]) fn_dev_errors.sort_values(['device_id', 'plot_time'], inplace=True) fn_dev_errors.to_csv(save_path_fn_errors, index=False)
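
# The two-sided angle correction above is equivalent to a single modular
# wrap into [-pi, pi). A minimal sketch (NaNs pass through unchanged); the
# helper name is illustrative only:
def _wrap_angle_example(angle_error: np.ndarray) -> np.ndarray:
  """Wrap angle differences into the interval [-pi, pi)."""
  return (angle_error + np.pi) % (2 * np.pi) - np.pi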
def run(): print("Inferring device ids") only_process_test_sites = True signature_dist_threshold = 0.5 dist_scaler = np.array([ 2.4831865e-03, 1.8569984e-03, 1.5326408e-03, 3.5197838e+01, 4.1837849e+01, 3.4933647e+01 ], dtype=np.float32) sig_cols = [ 'x2_gyro_uncali', 'y2_gyro_uncali', 'z2_gyro_uncali', 'x2_magn_uncali', 'y2_magn_uncali', 'z2_magn_uncali', ] data_folder = utils.get_data_folder() sensor_folder = data_folder / 'sensor_data' summary_path = data_folder / 'file_summary.csv' device_id_path = data_folder / 'device_ids.pickle' if device_id_path.is_file(): return save_ext = '' if only_process_test_sites else '_all_sites' meta_sensor_path = sensor_folder / ('meta' + save_ext + '_no_device.csv') df = pd.read_csv(summary_path) meta_sensor = pd.read_csv(meta_sensor_path, dtype={'test_type': object}) if only_process_test_sites: df = df[df.test_site] df.index = np.arange(df.shape[0]) with pd.option_context('mode.chained_assignment', None): df['first_last_wifi_replaced_time'] = df['first_last_wifi_time'] no_wifi_rows = np.where(df.num_wifi == 0)[0] assert np.all(df['mode'].values[no_wifi_rows] == 'train') df.loc[no_wifi_rows, 'first_last_wifi_replaced_time'] = ( df.start_time.values[no_wifi_rows]) df.sort_values(by=['first_last_wifi_replaced_time'], axis=0, inplace=True) df.index = np.arange(df.shape[0]) all_sensor = {} for m in ['valid', 'test', 'train']: print(m) with open(sensor_folder / (m + save_ext + '.pickle'), 'rb') as f: sensor_data = pickle.load(f) all_sensor.update(sensor_data) fns = df.fn.values modes = df['mode'].values num_fn = df.shape[0] unique_sites = np.sort(np.unique(df.site_id.values)) device_ids = {} device_ids_ordered = [] active_device_signatures = [] act_sig_recent_ids = [] next_signature_id = 0 dev_stats = [] for i in range(num_fn): if (i+1) % 1000 == 0: print(i+1) fn = fns[i] mode = modes[i] # if fn in ['5daec763aa1d300006faafcd', '5daece4eaa1d300006fab032']: # import pdb; pdb.set_trace() # x=1 first_uncal_vals = all_sensor[fn]['waypoint_segments'][0][sig_cols].values last_uncal_vals = all_sensor[fn]['waypoint_segments'][-1][sig_cols].values signature_absent = np.isnan(first_uncal_vals[0, 0]) or np.isnan( last_uncal_vals[0, 0]) this_first_signature = first_uncal_vals[0] this_last_signature = last_uncal_vals[-1] signature_change_this_step = not np.all( np.isclose(this_first_signature, this_last_signature)) plot_time = df.first_last_wifi_time.values[i] if mode == 'test' else ( df.start_time.values[i]) meta_rows = np.where(meta_sensor.fn.values == fn)[0] mean_robust_sensor_time_diff = np.median( meta_sensor.mean_robust_sensor_time_diff.values[meta_rows]) site_id = np.where(df.site_id.values[i] == unique_sites)[0][0] if signature_absent: device_ids[fn] = (-1, None) device_ids_ordered.append((fn, -1, None)) dev_stats.append({ 'fn': fn, 'device_id': -1, 'site_id': site_id, 'plot_time': plot_time, 'mean_robust_sensor_time_diff': mean_robust_sensor_time_diff, }) continue # Compute when the next trajectory can use the same device if mode == 'test': corrected_start_time = df.first_last_wifi_time.values[i] + 5000 this_min_next_available_time = df.duration.values[i] + ( df.first_last_wifi_time.values[i]) - 5000 else: this_min_next_available_time = df.end_time.values[i] corrected_start_time = df.start_time.values[i] found_signature = False for j in act_sig_recent_ids: (signature, signature_id, min_available_time, prev_sig_mode, prev_fn, prev_row, prev_drift) = active_device_signatures[j] signature_dist = (np.abs(this_first_signature - signature)/( dist_scaler)).sum() 
num_shared_nz = (np.isclose(signature, this_first_signature) & ( signature != 0)).sum() # if signature_dist > 0: # print(signature_dist, num_shared_nz) same_signature = signature_dist <= signature_dist_threshold or ( num_shared_nz > 1) if same_signature: if corrected_start_time < min_available_time: print(i, corrected_start_time, min_available_time, signature_dist, num_shared_nz) print("This should not happen - signature time inconsistency") # if signature_change_this_step: # import pdb; pdb.set_trace() device_ids[fn] = (signature_id, signature_change_this_step) device_ids_ordered.append((fn, signature_id, signature_change_this_step)) dev_stats.append({ 'fn': fn, 'device_id': signature_id, 'site_id': site_id, 'plot_time': plot_time, 'mean_robust_sensor_time_diff': mean_robust_sensor_time_diff, }) found_signature = True active_device_signatures[j] = ( this_last_signature, signature_id, this_min_next_available_time, mode, fn, i, signature_change_this_step) act_sig_recent_ids.remove(j) act_sig_recent_ids = [j] + act_sig_recent_ids break if not found_signature: signature_id = next_signature_id # if signature_id == 52: # import pdb; pdb.set_trace() # x=1 device_ids[fn] = (signature_id, signature_change_this_step) device_ids_ordered.append((fn, signature_id, signature_change_this_step)) dev_stats.append({ 'fn': fn, 'device_id': signature_id, 'site_id': site_id, 'plot_time': plot_time, 'mean_robust_sensor_time_diff': mean_robust_sensor_time_diff, }) active_device_signatures.append(( this_last_signature, signature_id, this_min_next_available_time, mode, fn, i, signature_change_this_step)) act_sig_recent_ids = [next_signature_id] + act_sig_recent_ids next_signature_id += 1 combined_signatures = pd.DataFrame( np.stack([s[0] for s in active_device_signatures])) # Stitch device ids back together using time, mean time between sensor # observations and the site id. 
# Also split the -1 device ids based on mean time between sensor observations dev_stats_df = pd.DataFrame(dev_stats) dev_stats_df.loc[(dev_stats_df.device_id.values == -1) & ( dev_stats_df.mean_robust_sensor_time_diff < 20), 'device_id'] = -2 predecessors = { -2: [], -1: [], } stats_device_ids = dev_stats_df.device_id.values site_ids = dev_stats_df.site_id.values plot_times = dev_stats_df.plot_time.values rtds = dev_stats_df.mean_robust_sensor_time_diff.values for i in range(dev_stats_df.device_id.values.max()+1): first_row = np.where(stats_device_ids == i)[0][0] first_rtd = rtds[first_row] this_site_id = site_ids[first_row] pred_candidates = [] for c in predecessors: last_chain_device = c if not len(predecessors[c]) else ( predecessors[c][-1]) pred_last_row = np.where(stats_device_ids == last_chain_device)[0][-1] pred_last_rtd = rtds[pred_last_row] pred_site_id = site_ids[pred_last_row] time_gap = plot_times[first_row] - plot_times[pred_last_row] print(i, last_chain_device, time_gap) if time_gap > 0 and time_gap <= 86400000 and ( this_site_id == pred_site_id) and np.abs( first_rtd - pred_last_rtd) < 0.02: pred_candidates.append(c) if len(pred_candidates): assert len(pred_candidates) == 1 predecessors[pred_candidates[0]].append(i) else: predecessors[i] = [] merged_device_ids = {} for k_id, k in enumerate(list(predecessors.keys())): merged_device_ids[k] = k_id for v in predecessors[k]: merged_device_ids[v] = k_id combined_device_ids = {} for fn in device_ids: dev_id, drift = device_ids[fn] rtd_fn = rtds[np.where(dev_stats_df.fn.values == fn)[0][0]] if dev_id == -1 and rtd_fn < 20: dev_id = -2 combined_device_ids[fn] = (dev_id, drift, merged_device_ids[dev_id]) #np.array([v[2] for k, v in combined_device_ids.items()]) with open(device_id_path, 'wb') as handle: pickle.dump(combined_device_ids, handle, protocol=pickle.HIGHEST_PROTOCOL)
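
# The device matching above hinges on one observation: uncalibrated gyro and
# magnetometer biases are per-device and nearly constant within a recording,
# so the last sample of one trajectory matches the first sample of the next
# trajectory recorded on the same device. A minimal sketch of the matching
# test, assuming the sig_cols/dist_scaler conventions above; the helper name
# is illustrative only:
def _same_device_example(signature_a, signature_b, dist_scaler,
                         signature_dist_threshold=0.5):
  """Return True when two uncalibrated-sensor signatures look like the same
  physical device: either a small scaled L1 distance, or more than one
  exactly shared non-zero bias component."""
  signature_dist = (np.abs(signature_a - signature_b) / dist_scaler).sum()
  num_shared_nz = (np.isclose(signature_a, signature_b) &
                   (signature_a != 0)).sum()
  return signature_dist <= signature_dist_threshold or num_shared_nz > 1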