Example #1
    def test_save_results(self):
        results = utils.load_results()

        temp_file_name = utils.get_data_folder() + 'temp.json'
        utils.save_results(results, file_name=temp_file_name)

        temp = utils.load_results(file_name=temp_file_name)
        self.assertEqual(results, temp)
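The utils helpers exercised by this test are not shown here; below is only a minimal sketch of what utils.save_results and utils.load_results could look like, assuming a plain JSON round trip (the default results file name is a guess, not taken from the original):

import json

def save_results(results, file_name):
    # Hypothetical helper: serialize the results to JSON.
    with open(file_name, 'w') as f:
        json.dump(results, f)

def load_results(file_name=None):
    # Hypothetical helper: read the results back from JSON. The default
    # location is an assumption, mirroring the test's use of get_data_folder().
    if file_name is None:
        file_name = get_data_folder() + 'results.json'
    with open(file_name) as f:
        return json.load(f)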
Example #2
def multiple_floors_train_predict(config,
                                  df,
                                  debug_floor,
                                  reference_submission,
                                  use_multiprocessing,
                                  models_group_name,
                                  mode,
                                  holdout_df,
                                  test_floors,
                                  recompute_grouped_data,
                                  overwrite_models,
                                  test_type_mapping,
                                  only_public_test_preds,
                                  test_waypoint_times,
                                  store_all_wifi_predictions,
                                  store_full_wifi_predictions,
                                  debug_fn=None,
                                  verbose=True):
    data_folder = utils.get_data_folder()
    model_folder = data_folder.parent / 'Models' / models_group_name
    site_floors = df.iloc[df.test_site.values].groupby(
        ['site_id', 'text_level']).size().reset_index()
    if debug_floor is not None:
        site_floors = site_floors.iloc[debug_floor:(debug_floor + 1)]
    sites = site_floors.site_id.values
    floors = site_floors.text_level.values
    ref_scores = get_reference_scores(site_floors, reference_submission) if (
        reference_submission is not None) else [None] * floors.size

    if use_multiprocessing:
        with mp.Pool(processes=mp.cpu_count() - 1) as pool:
            floor_ids = np.arange(floors.size)
            results = [
                pool.apply_async(
                    floor_train_predict,
                    args=(config, s, f, i, r, data_folder, model_folder, df,
                          mode, holdout_df, test_floors,
                          recompute_grouped_data, overwrite_models,
                          test_type_mapping, only_public_test_preds,
                          test_waypoint_times, store_all_wifi_predictions,
                          store_full_wifi_predictions, debug_fn, verbose))
                for (s, f, i, r) in zip(sites, floors, floor_ids, ref_scores)
            ]
            all_outputs = [p.get() for p in results]
    else:
        all_outputs = []
        for floor_id, (analysis_site, floor,
                       ref_score) in enumerate(zip(sites, floors, ref_scores)):
            all_outputs.append(
                floor_train_predict(
                    config, analysis_site, floor, floor_id, ref_score,
                    data_folder, model_folder, df, mode, holdout_df,
                    test_floors, recompute_grouped_data, overwrite_models,
                    test_type_mapping, only_public_test_preds,
                    test_waypoint_times, store_all_wifi_predictions,
                    store_full_wifi_predictions, debug_fn, verbose))

    return all_outputs
Example #3
def run(only_process_test_sites=True,
        overwrite_existing_processed=False,
        write_separate_wifi=False):
    print("Reshaping raw data")
    data_folder = utils.get_data_folder()
    parquet_folder = data_folder / "reference_preprocessed"
    summary_path = data_folder / "file_summary.csv"
    df = pd.read_csv(summary_path)
    source_submission = 'submission_cost_minimization.csv'
    submission_folder = data_folder / 'submissions'
    submission = pd.read_csv(submission_folder / source_submission)
    sample_sub_fns = np.array(
        [sps.split('_')[1] for sps in (submission.site_path_timestamp)])
    sample_sub_times = np.array(
        [int(sps.split('_')[2]) for sps in (submission.site_path_timestamp)])

    # First check for the last file and abort if it already exists
    last_pickle_path = data_folder / (
        str(Path(df.ext_path.values[-1]).with_suffix("")) + "_reshaped.pickle")
    if last_pickle_path.exists() and (not overwrite_existing_processed):
        return

    # Loop over all file paths and compare the parquet and pickle files one by
    # one
    for i in range(df.shape[0]):
        # for i in np.arange(26924, 28000):
        print(f"{i+1} of {df.shape[0]}")
        if not only_process_test_sites or df.test_site[i]:
            ext_path = Path(df.ext_path[i])
            mode = ext_path.parts[0]
            pickle_path = data_folder / (str(ext_path.with_suffix("")) +
                                         "_reshaped.pickle")
            parquet_path = parquet_folder / mode / ext_path.with_suffix(
                ".parquet")
            pathlib.Path(parquet_path.parent).mkdir(parents=True,
                                                    exist_ok=True)
            if not pickle_path.exists() or overwrite_existing_processed:
                reshape_parquet(
                    pickle_path,
                    parquet_path,
                    df.iloc[i],
                    i,
                    write_separate_wifi,
                    data_folder,
                    sample_sub_fns,
                    sample_sub_times,
                )
Example #4
def multiple_floors_train_predict(config,
                                  df,
                                  models_group_name,
                                  mode,
                                  holdout_df,
                                  test_floors,
                                  recompute_grouped_data,
                                  overwrite_models,
                                  test_type_mapping,
                                  only_public_test_preds,
                                  test_waypoint_times,
                                  debug_fn,
                                  verbose=True):
    data_folder = utils.get_data_folder()
    model_folder = data_folder.parent / 'Models' / models_group_name
    site_floors = df.iloc[df.test_site.values].groupby(
        ['site_id', 'text_level']).size().reset_index()
    sites = site_floors.site_id.values
    floors = site_floors.text_level.values

    if debug_fn is not None:
        target_row = np.where(df.fn == debug_fn)[0][0]
        sites = [df.site_id.values[target_row]]
        floors = [df.text_level.values[target_row]]

    all_distances = []
    all_magnetometer = []
    for floor_id, (analysis_site, floor) in enumerate(zip(sites, floors)):
        if verbose:
            print(f"Processing floor {floor_id+1} of {site_floors.shape[0]}")
        distances, magnetometer_pos = floor_train_predict(
            config, analysis_site, floor, floor_id, data_folder, model_folder,
            df, mode, holdout_df, test_floors, recompute_grouped_data,
            overwrite_models, test_type_mapping, only_public_test_preds,
            test_waypoint_times, debug_fn)
        all_distances.append(distances)
        all_magnetometer.append(magnetometer_pos)

    return all_distances, all_magnetometer
Example #5
def run(mode):
    print("Preparing features for the sensor uncertainty models")
    fn_mode = ['mean', 'first_middle_last'][0]
    num_neighbors = 10
    additional_feature_cols = ['rel_fraction', 'time_offset']
    feature_cols = ['total_dist', 'num_waypoints']
    stem_target_cols = [
        'mean_rel_dist_error', 'mean_abs_rel_dist_error', 'mean_angle_error',
        'mean_abs_angle_error'
    ]

    if fn_mode == 'mean':
        target_cols = copy.copy(stem_target_cols)
    else:
        target_cols = []
        for ext in ['first_', 'middle_', 'last_']:
            for c in stem_target_cols:
                if ext == 'middle_':
                    target_cols += [ext + c]
                else:
                    target_cols += [ext + c[5:]]
    feature_cols += copy.copy(target_cols)

    data_folder = utils.get_data_folder()
    model_folder = data_folder.parent / 'Models' / 'correct_sensor_preds'
    pathlib.Path(model_folder).mkdir(parents=True, exist_ok=True)
    save_ext = '' if fn_mode == 'mean' else ' first_middle_last'
    save_path = model_folder / (mode + save_ext + '.csv')

    if save_path.is_file():
        return

    data_path = data_folder / 'fn_device_errors.csv'
    data = pd.read_csv(data_path)
    num_add_features = len(additional_feature_cols)
    all_feature_cols = additional_feature_cols + feature_cols
    num_keep_first_data_cols = 7

    num_shifts = num_neighbors * 2 + 1
    num_rows = data.shape[0]
    num_features = len(feature_cols)
    all_features = np.zeros(
        (num_shifts, num_rows, num_features + num_add_features))
    padded_feature_vals = np.full((num_rows + num_shifts - 1, num_features),
                                  np.nan)
    padded_feature_vals[num_neighbors:(
        -num_neighbors)] = data[feature_cols].values
    device_ids = data.device_id.values
    padded_device_ids = np.full((num_rows + num_shifts - 1), np.nan)
    padded_device_ids[num_neighbors:(-num_neighbors)] = device_ids
    times = data.plot_time.values
    padded_times = np.full((num_rows + num_shifts - 1), np.nan)
    padded_times[num_neighbors:(-num_neighbors)] = times
    modes = data['mode'].values
    can_use_mask = np.concatenate([
        np.zeros(num_neighbors, dtype=bool),
        (modes != 'test') & ((mode == 'test') | (modes != 'valid')),
        np.zeros(num_neighbors, dtype=bool),
    ])
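    # Note on the mask above (added reading, not in the original): a row can
    # only serve as a neighbor if it is not a test row and, unless test
    # features are being built, not a validation row either, which avoids
    # leaking validation targets into validation-time neighbor features. The
    # zero padding keeps the shifted windows aligned at the array edges.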
    for shift_id, shift in enumerate(range(-num_neighbors, num_neighbors + 1)):
        start_row = shift + num_neighbors
        end_row = shift + num_neighbors + num_rows
        shifted_features = np.copy(padded_feature_vals[start_row:end_row])
        shifted_device_ids = padded_device_ids[start_row:end_row]
        if shift == 0:
            step_can_use_mask = np.ones_like(can_use_mask[start_row:end_row])
        else:
            step_can_use_mask = np.copy(can_use_mask[start_row:end_row])
        shift_mask = step_can_use_mask & (shifted_device_ids == device_ids)
        shifted_features[~shift_mask] = np.nan
        time_offsets = padded_times[start_row:end_row] - times
        sign_log_time_offsets = np.sign(time_offsets) * np.log10(
            np.abs(time_offsets))
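        # Illustration (added): this signed log transform maps an offset of
        # +1000 ms to +3.0 and -100 ms to -2.0, compressing the dynamic range
        # while preserving the direction of the time offset.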
        sign_log_time_offsets[~shift_mask] = np.nan

        all_features[shift_id, :, 1] = sign_log_time_offsets
        all_features[shift_id, :, num_add_features:] = shifted_features

    # Add the weighted mean distance within the window (excluding the centered fn)
    all_features[num_neighbors, :, 2] = np.nan
    dist_surrounding_sum = np.nansum(all_features[:, :, 2], 0, keepdims=True)
    all_features[:, :, 0] = all_features[:, :, 2] / dist_surrounding_sum

    # Convert the features and targets to a flat dataframe
    df_cols = {}
    for i in range(num_keep_first_data_cols):
        col = data.columns[i]
        df_cols[col] = data[col].values
    df_cols['device_id'] = data.device_id.values
    df_cols['plot_time'] = data.plot_time.values
    no_middle_segments = data.num_waypoints.values <= 3
    for k in target_cols:
        target_col_id = np.where(
            np.array(feature_cols) == k)[0][0] + (num_add_features)
        target_vals = np.copy(all_features[num_neighbors, :, target_col_id])
        if fn_mode == 'first_middle_last' and k[:6] == 'middle':
            target_vals[no_middle_segments] = np.nan
        df_cols[k + '_target'] = target_vals
    for c_id, c in enumerate(all_feature_cols):
        for shift_id, shift in enumerate(
                range(-num_neighbors, num_neighbors + 1)):
            if shift != 0:
                col_name = c + str(shift)
                df_cols[col_name] = all_features[shift_id, :, c_id]

    combined = pd.DataFrame(df_cols)
    combined.to_csv(save_path, index=False)
Example #6
def run(mode="test", consider_multiprocessing=True, overwrite_output=False):
    print("Non-parametric WiFi model")
    models_group_name = 'non_parametric_wifi'
    overwrite_models = True
    recompute_grouped_data = False
    # config = {
    #   'min_train_points': 10, # Ignore bssid with few observations
    #   'min_train_fns': 1, # Ignore bssid with few trajectories
    #   'delay_decay_penalty_exp_base': 0.62, # Base for bssid weight decay as a f of delay to compute the shared bssid fraction
    #   'inv_fn_count_penalty_exp': 0.1, # Exponent to give more weight to rare bssids to compute the shared bssid fraction
    #   'non_shared_penalty_start': 1.0, # Threshold below which the shared wifi fraction gets penalized in the distance calculation
    #   'non_shared_penalty_exponent': 2.2, # Exponent to penalize the non shared wifi fraction
    #   'non_shared_penalty_constant': 75, # Multiplicative constant to penalize the non shared wifi fraction
    #   'delay_decay_exp_base': 0.925, # Base for shared bssid weight decay as a f of delay
    #   'inv_fn_count_distance_exp': 0.1, # Exponent to give more weight to rare bssids to compute the weighted mean distance
    #   'unique_model_frequencies': False, # Discard bssid's with changing freqs
    #   'time_range_max_strength': 3, # Group wifi observations before and after each observation and retain the max strength
    #   'limit_train_near_waypoints': not True, # Similar to "snap to grid" - You likely want to set this to False eventually to get more granular predictions
    #   }
    config = {
        'min_train_points': 5,  # Ignore bssids with few observations
        'min_train_fns': 1,  # Ignore bssids with few trajectories
        'delay_decay_penalty_exp_base': 0.8,  # Base for bssid weight decay as a function of delay, used to compute the shared bssid fraction
        'inv_fn_count_penalty_exp': 0.0,  # Exponent to give more weight to rare bssids when computing the shared bssid fraction
        'non_shared_penalty_start': 1.0,  # Threshold below which the shared wifi fraction gets penalized in the distance calculation
        'non_shared_penalty_exponent': 2.0,  # Exponent to penalize the non shared wifi fraction
        'non_shared_penalty_constant': 50,  # Multiplicative constant to penalize the non shared wifi fraction
        'delay_decay_exp_base': 0.92,  # Base for shared bssid weight decay as a function of delay
        'inv_fn_count_distance_exp': 0.2,  # Exponent to give more weight to rare bssids when computing the weighted mean distance
        'unique_model_frequencies': False,  # Discard bssids with changing frequencies
        'time_range_max_strength': 1e-5,  # Group wifi observations before and after each observation and retain the max strength
        'limit_train_near_waypoints': False,  # Similar to "snap to grid" - keep this False to get more granular predictions
    }

    debug_floor = [None, 16][0]
    debug_fn = [None, '5dd374df44333f00067aa198'][0]
    store_all_wifi_predictions = False
    store_full_wifi_predictions = not config[
        'limit_train_near_waypoints']  # Required for the current combined optimization
    only_public_test_preds = False
    reference_submission_ext = 'non_parametric_wifi - valid - 2021-03-30 091444.csv'
    bogus_test_floors_to_train_all_test_models = False
    test_override_floors = False

    data_folder = utils.get_data_folder()
    summary_path = data_folder / 'file_summary.csv'
    stratified_holdout_path = data_folder / 'holdout_ids.csv'
    leaderboard_types_path = data_folder / 'leaderboard_type.csv'
    preds_folder = data_folder.parent / 'Models' / models_group_name / 'predictions'
    pathlib.Path(preds_folder).mkdir(parents=True, exist_ok=True)
    if store_full_wifi_predictions:
        file_ext = models_group_name + ' - ' + mode + ' - full distances.pickle'
        full_predictions_path = preds_folder / file_ext

        if full_predictions_path.is_file() and (not overwrite_output):
            return

    reference_submission_path = data_folder / reference_submission_ext
    df = pd.read_csv(summary_path)
    holdout_df = pd.read_csv(stratified_holdout_path)
    test_waypoint_times = utils.get_test_waypoint_times(data_folder)
    test_floors = utils.get_test_floors(
        data_folder, debug_test_floor_override=test_override_floors)
    leaderboard_types = pd.read_csv(leaderboard_types_path)
    test_type_mapping = {
        fn: t
        for (fn, t) in zip(leaderboard_types.fn, leaderboard_types['type'])
    }
    reference_submission = pd.read_csv(reference_submission_path)

    assert store_full_wifi_predictions == (
        not config['limit_train_near_waypoints'])

    if bogus_test_floors_to_train_all_test_models and mode == 'test':
        print(
            "WARNING: bogus shuffling of test floors to train all floor models"
        )
        test_floors = utils.get_test_floors(data_folder)
        site_floors = df.iloc[df.test_site.values].groupby(
            ['site_id', 'text_level']).size().reset_index()
        site_floors['level'] = [
            utils.TEST_FLOOR_MAPPING[t] for t in (site_floors.text_level)
        ]
        site_floors['num_test_counts'] = 0
        first_floor_fns = {s: [] for s in np.unique(site_floors.site_id)}
        repeated_floor_fns = {s: [] for s in np.unique(site_floors.site_id)}
        for fn in test_floors:
            site = df.site_id[df.fn == fn].values[0]
            increment_row = np.where((site_floors.site_id == site) & (
                site_floors.level == test_floors[fn]))[0][0]
            site_floors.loc[increment_row, 'num_test_counts'] += 1
            if site_floors.num_test_counts.values[increment_row] > 1:
                repeated_floor_fns[site].append(fn)
            else:
                first_floor_fns[site].append(fn)

        non_visited_floor_ids = np.where(site_floors.num_test_counts == 0)[0]
        for i, non_visited_id in enumerate(non_visited_floor_ids):
            site = site_floors.site_id.values[non_visited_id]
            if repeated_floor_fns[site]:
                override_fn = repeated_floor_fns[site].pop()
            else:
                override_fn = first_floor_fns[site].pop()
            test_floors[override_fn] = site_floors.level.values[non_visited_id]

        # Verify that now all floors contain at least one test fn
        site_floors['num_test_counts'] = 0
        for fn in test_floors:
            site = df.site_id[df.fn == fn].values[0]
            increment_row = np.where((site_floors.site_id == site) & (
                site_floors.level == test_floors[fn]))[0][0]
            site_floors.loc[increment_row, 'num_test_counts'] += 1

    if debug_fn is not None:
        debug_fn_row = np.where(df.fn.values == debug_fn)[0][0]
        debug_fn_site = df.site_id.values[debug_fn_row]
        debug_fn_level = df.text_level.values[debug_fn_row]
        site_floors = df.iloc[df.test_site.values].groupby(
            ['site_id', 'text_level']).size().reset_index()
        debug_floor = np.where((site_floors.site_id == debug_fn_site) & (
            site_floors.text_level == debug_fn_level))[0][0]

    use_multiprocessing = consider_multiprocessing and (debug_fn is None) and (
        debug_floor is None)
    all_outputs = non_parametric_wifi_utils.multiple_floors_train_predict(
        config, df, debug_floor, reference_submission, use_multiprocessing,
        models_group_name, mode, holdout_df, test_floors,
        recompute_grouped_data, overwrite_models, test_type_mapping,
        only_public_test_preds, test_waypoint_times,
        store_all_wifi_predictions, store_full_wifi_predictions, debug_fn)

    test_preds = {
        k: v
        for d in [o[0] for o in all_outputs] for k, v in d.items()
    }
    valid_preds = [r for l in [o[1] for o in all_outputs] for r in l]
    all_wifi_predictions = [r for l in [o[2] for o in all_outputs] for r in l]
    full_wifi_predictions = dict(ChainMap(*[o[3] for o in all_outputs
                                            if o[3]]))

    Path(preds_folder).mkdir(parents=True, exist_ok=True)
    if store_full_wifi_predictions:
        with open(full_predictions_path, 'wb') as handle:
            pickle.dump(full_wifi_predictions,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
    if mode == 'test':
        submission = utils.convert_to_submission(data_folder, test_preds)
        submission_ext = models_group_name + ' - test.csv'
        submission.to_csv(preds_folder / submission_ext, index=False)
    elif debug_floor is None:
        preds_df = pd.DataFrame(valid_preds)
        print(f"Mean validation error: {preds_df.error.values.mean():.2f}")
        preds_path = preds_folder / (models_group_name + ' - valid.csv')
        preds_df.to_csv(preds_path, index=False)

        if store_all_wifi_predictions:
            all_wifi_preds_df = pd.DataFrame(all_wifi_predictions)
            all_wifi_preds_df.sort_values(["site", "fn", "time"], inplace=True)
            preds_path = preds_folder / (models_group_name +
                                         ' - all wifi validation.csv')
            all_wifi_preds_df.to_csv(preds_path, index=False)

        holdout_unweighted = np.sqrt(preds_df.squared_error.values).mean()
        print(f"Holdout unweighted aggregate loss: {holdout_unweighted:.2f}")
Example #7
    data_folder = './dataset'
    hostname = socket.gethostname()
    if hostname.startswith('ubuntu'):
        data_folder = '/dev/shm/finegrained/' 
        args.checkpoint_dir = '/home/user/winycg/accv_checkpoint/'
    elif hostname.startswith('winycgv1'):
        data_folder = '/dev/shm/'
        args.checkpoint_dir = '/home/user/hhd/winycg/accv_checkpoint/'

    args.checkpoint_dir = os.path.join(args.checkpoint_dir, log_dir)
    if not os.path.isdir(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)
    return data_folder


loaders = get_dataloaders(get_data_folder(), args)
trainloader = loaders['train']
valloader = loaders['val']
testloader = loaders['test']

print('Number of train dataset: ', len(trainloader.dataset))
print('Number of validation dataset: ', len(valloader.dataset))
print('Number of test dataset: ', len(testloader.dataset))
num_classes = trainloader.dataset.num_classes
print('Number of classes: ', num_classes)
C, H, W = trainloader.dataset[0][0].size()

# --------------------------------------------------------------------------------------------

# Model
print('==> Building model..')
Example #8
def run():
    print("Summarizing all waypoint locations")
    only_process_test_sites = True
    write_all_wifi_data = False

    data_folder = utils.get_data_folder()
    summary_path = data_folder / "file_summary.csv"
    combined_waypoints_path = data_folder / "train_waypoints_timed.csv"
    if combined_waypoints_path.is_file():
        return
    # combined_train_wifi_times_path = data_folder / "train_wifi_times.csv"
    # combined_test_wifi_times_path = data_folder / "test_wifi_times.csv"
    stratified_holdout_path = data_folder / 'holdout_ids.csv'
    combined_all_wifi_folder = data_folder / 'train'
    df = pd.read_csv(summary_path)
    holdout = pd.read_csv(stratified_holdout_path)

    # Loop over all file paths and compare the parquet and pickle files one by one
    all_waypoints = []
    all_train_wifi_times = []
    all_test_wifi_times = []
    all_wifi_data = []
    for i in range(df.shape[0]):
        # if i < 26900:
        #   continue

        print(f"Trajectory {i+1} of {df.shape[0]}")
        if (not only_process_test_sites
                or df.test_site[i]) and df.num_wifi[i] > 0:
            pickle_path = data_folder / (
                str(Path(df.ext_path[i]).with_suffix("")) + "_reshaped.pickle")
            with open(pickle_path, "rb") as f:
                trajectory = pickle.load(f)

            if df['mode'][i] != 'test':
                waypoints = trajectory['waypoint']
                num_waypoints = waypoints.shape[0]

                # Add meta columns
                for c in ['site_id', 'mode', 'fn', 'text_level']:
                    waypoints[c] = df[c][i]

                # Add whether it is a train or validation trajectory
                waypoints['mode'] = holdout['mode'][holdout.fn ==
                                                    df.fn[i]].values[0]

                # Add the waypoint type
                waypoint_types = np.repeat('middle', num_waypoints)
                waypoint_types[0] = 'first'
                waypoint_types[num_waypoints - 1] = 'last'
                waypoints['type'] = waypoint_types
                waypoints['id'] = np.arange(num_waypoints)
                waypoints['num_waypoints'] = num_waypoints

            # Add the most recent wifi times that are closest to the waypoint
            # timestamps
            wifi_t1_times = np.unique(trajectory['wifi'].t1_wifi)
            assert np.all(np.diff(wifi_t1_times) > 0)
            wifi_last_t2_times = trajectory['wifi'].groupby(
                't1_wifi')['t2_wifi'].aggregate("max").values
            num_wifi_obs = trajectory['wifi'].groupby(
                't1_wifi')['t1_wifi'].aggregate("count").values
            try:
                assert wifi_t1_times.size == wifi_last_t2_times.size
                assert np.sum(np.diff(wifi_last_t2_times) < -1) <= 1
                assert np.all(wifi_last_t2_times < wifi_t1_times) or (
                    df['mode'][i] == 'test')
            except:
                import pdb
                pdb.set_trace()
                x = 1

            if df['mode'][i] != 'test':
                waypoint_wifi_times = np.zeros(num_waypoints, dtype=np.int64)
                for j in range(num_waypoints):
                    wifi_id = max(
                        0, (wifi_last_t2_times <= waypoints.time[j]).sum() - 1)
                    waypoint_wifi_times[j] = wifi_last_t2_times[wifi_id]
                waypoints['last_wifi_t2_time'] = waypoint_wifi_times
                waypoints['trajectory_wifi_time'] = waypoint_wifi_times - (
                    wifi_t1_times[0])
                waypoint_times = waypoints.time.values
                waypoints['trajectory_waypoint_time'] = waypoint_times - (
                    waypoint_times[0])
                waypoints['first_waypoint_time'] = waypoint_times[0]

                # Reorder the columns
                cols = waypoints.columns.tolist()
                reordered_cols = cols[:1] + cols[4:] + cols[1:4]
                waypoints = waypoints[reordered_cols]

                all_waypoints.append(waypoints)

            if write_all_wifi_data:
                wifi_data = trajectory['wifi'].copy()
                for c in ['site_id', 'mode', 'fn', 'level']:
                    wifi_data[c] = df[c][i]
                cols = wifi_data.columns.tolist()
                reordered_cols = cols[6:] + cols[:6]
                wifi_data = wifi_data[reordered_cols]
                if 'wifi_waypoints' in trajectory:
                    wifi_wp = trajectory['wifi_waypoints']
                    wifi_wp.sort_values(["t1_wifi", "t2_wifi"],
                                        ascending=[True, False],
                                        inplace=True)
                    wifi_wp_map = wifi_wp.groupby(
                        ['t1_wifi']).first().reset_index()[
                            ['t1_wifi', 'waypoint_interp_x', 'waypoint_interp_y']]

                    wifi_data = wifi_data.merge(wifi_wp_map, on='t1_wifi')
                else:
                    wifi_data['waypoint_interp_x'] = np.nan
                    wifi_data['waypoint_interp_y'] = np.nan
                all_wifi_data.append(wifi_data)

            wifi_times = pd.DataFrame({
                'site_id': df['site_id'][i],
                'mode': df['mode'][i],
                'fn': df['fn'][i],
                'level': df['level'][i],
                'wifi_t1_times': wifi_t1_times,
                'wifi_last_t2_times': wifi_last_t2_times,
                'trajectory_index': np.arange(wifi_last_t2_times.size),
                'num_wifi_obs': num_wifi_obs,
            })

            if df['mode'][i] == 'test':
                wifi_times['first_last_t2_time'] = wifi_last_t2_times[0]
                all_test_wifi_times.append(wifi_times)
            else:
                wifi_times['first_waypoint_time'] = waypoint_times[0]
                all_train_wifi_times.append(wifi_times)

    # Write the combined waypoints to disk
    combined_waypoints = pd.concat(all_waypoints)
    combined_waypoints.sort_values(["site_id", "first_waypoint_time", "time"],
                                   inplace=True)
    combined_waypoints.to_csv(combined_waypoints_path, index=False)

    # # Write the combined wifi times to disk
    # combined_train_wifi_times = pd.concat(all_train_wifi_times)
    # combined_train_wifi_times.sort_values(
    #   ["site_id", "first_waypoint_time", "wifi_t1_times"], inplace=True)
    # combined_train_wifi_times.to_csv(combined_train_wifi_times_path, index=False)

    # combined_test_wifi_times = pd.concat(all_test_wifi_times)
    # combined_test_wifi_times.sort_values(
    #   ["site_id", "first_last_t2_time", "wifi_t1_times"], inplace=True)
    # combined_test_wifi_times.to_csv(combined_test_wifi_times_path, index=False)

    # Write the raw wifi data to disk
    if write_all_wifi_data:
        test_floors = utils.get_test_floors(data_folder)
        combined_all_wifi = pd.concat(all_wifi_data)
        combined_all_wifi.sort_values(["site_id", "fn", "mode"], inplace=True)
        all_levels = [
            l if m != 'test' else test_floors[fn]
            for (l, m,
                 fn) in zip(combined_all_wifi.level, combined_all_wifi['mode'],
                            combined_all_wifi.fn)
        ]
        combined_all_wifi['level'] = np.array(all_levels)
        sites = np.sort(np.unique(combined_all_wifi.site_id.values))
        for site_id, site in enumerate(sites):
            print(f"Site {site_id+1} of {len(sites)}")
            combined_all_wifi_site = combined_all_wifi[
                combined_all_wifi.site_id.values == site]

            # Map the levels from a reference submission for the test data
            levels = np.sort(np.unique(combined_all_wifi_site.level.values))
            for l in levels:
                combined_all_wifi_floor = combined_all_wifi_site[
                    combined_all_wifi_site.level.values == l]
                combined_all_wifi_floor.sort_values(["mode", "t1_wifi"],
                                                    inplace=True)
                text_level = df.text_level[
                    df.fn == combined_all_wifi_floor.fn.values[-1]].values[0]

                combined_all_wifi_path = (combined_all_wifi_folder / site /
                                          text_level / 'all_wifi.csv')
                combined_all_wifi_floor.to_csv(combined_all_wifi_path,
                                               index=False)
Example #9
def run(mode):
    print("Processing time leak (edge trajectories)")
    debug_site = [None, 0][0]
    use_multiprocessing = False
    test_preds_source = 'test - 2021-05-15 051944.csv'
    test_override_floors = False

    data_folder = utils.get_data_folder()
    test_override_ext = '_floor_override' if (mode == 'test'
                                              and test_override_floors) else ''
    save_path = data_folder / (mode + '_edge_positions_v3' +
                               test_override_ext + '.csv')
    if save_path.is_file():
        return

    summary_path = data_folder / 'file_summary.csv'
    test_preds_path = data_folder / 'submissions' / test_preds_source
    stratified_holdout_path = data_folder / 'holdout_ids.csv'
    device_id_path = data_folder / 'device_ids.pickle'
    ordered_device_time_path = data_folder / 'inferred_device_ids.csv'
    with open(device_id_path, 'rb') as f:
        device_ids = pickle.load(f)
    public_private_test_leaks = {
        'ff141af01177f34e9caa7a12': ('start', 3, 203.11885, 97.310814),
        'f973ee415265be4addc457b1': ('start', -1, 20.062187, 99.66188),
        '23b4c8eb4b41d75946285461': ('end', 2, 60.205635, 102.28055),
        '5582270fcaee1f580de9006f': ('end', 0, 97.8957, 28.9133),
        'b51a662297b90657f0b03b44': ('start', 1, 112.39258, 233.72379),
    }
    df = pd.read_csv(summary_path)
    holdout_df = pd.read_csv(stratified_holdout_path)
    test_floors = utils.get_test_floors(
        data_folder, debug_test_floor_override=test_override_floors)
    test_preds = pd.read_csv(test_preds_path)
    test_preds = utils.override_test_floor_errors(
        test_preds, debug_test_floor_override=test_override_floors)
    test_preds['fn'] = [
        spt.split('_')[1] for spt in test_preds.site_path_timestamp
    ]
    test_preds['timestamp'] = [
        int(spt.split('_')[2]) for spt in test_preds.site_path_timestamp
    ]
    for test_fn in test_floors:
        assert test_preds.floor[test_preds.fn ==
                                test_fn].values[0] == test_floors[test_fn]

    device_time_path = pd.read_csv(ordered_device_time_path)
    device_time_path['time'] = device_time_path['start_time']
    test_rows = np.where(device_time_path['mode'].values == "test")[0]
    device_time_path.loc[
        test_rows,
        'time'] = device_time_path['first_last_wifi_time'].values[test_rows]
    device_time_path.sort_values(['device_id', 'time'], inplace=True)

    sites = df.iloc[df.test_site.values].groupby(
        ['site_id']).size().reset_index()
    if debug_site is not None:
        sites = sites.iloc[debug_site:(debug_site + 1)]
    sites = sites.site_id.values

    if use_multiprocessing:
        with mp.Pool(processes=mp.cpu_count() - 1) as pool:
            results = [
                pool.apply_async(extract_floor_start_end,
                                 args=(data_folder, s, df, holdout_df,
                                       test_preds, device_time_path, mode,
                                       device_ids, public_private_test_leaks))
                for s in sites
            ]
            all_outputs = [p.get() for p in results]
    else:
        all_outputs = []
        for site_id, analysis_site in enumerate(sites):
            print(f"Processing site {site_id+1} of {len(sites)}")
            all_outputs.append(
                extract_floor_start_end(data_folder, analysis_site, df,
                                        holdout_df, test_preds,
                                        device_time_path, mode, device_ids,
                                        public_private_test_leaks))

    # Save the combined results
    combined = pd.concat(all_outputs)
    combined.to_csv(save_path, index=False)
Example #10
def run():
    print("Combining sensor data")
    only_process_test_sites = True
    sensor_cols = ['time', 'acce', 'gyro', 'ahrs']

    data_folder = utils.get_data_folder()
    save_folder = data_folder / 'sensor_data'
    pathlib.Path(save_folder).mkdir(parents=True, exist_ok=True)
    summary_path = data_folder / 'file_summary.csv'
    source_submission = 'submission_cost_minimization.csv'
    submission_folder = data_folder / 'submissions'
    submission = pd.read_csv(submission_folder / source_submission)
    submission = utils.override_test_floor_errors(submission)
    sample_sub_fns = np.array(
        [sps.split('_')[1] for sps in (submission.site_path_timestamp)])
    sample_sub_times = np.array(
        [int(sps.split('_')[2]) for sps in (submission.site_path_timestamp)])
    holdout_df = pd.read_csv(data_folder / 'holdout_ids.csv')
    df = pd.read_csv(summary_path)

    # Overwrite the validation data mode
    validation_fns = set(holdout_df.fn.values[holdout_df.holdout])
    df['mode'] = [
        'valid' if fn in validation_fns else m
        for (fn, m) in zip(df['fn'].values, df['mode'].values)
    ]

    def rotate_by_angles(orig, angles):
        rotated = np.zeros_like(orig)

        for i, theta in enumerate(angles):
            o = orig[i]
            c, s = np.cos(theta), np.sin(theta)
            r = np.array([[c, s], [-s, c]])
            rotated[i] = np.dot(r, o)

        return rotated
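    # Illustration of rotate_by_angles (added): rotating the step (0, 1) by a
    # previous heading of pi/2 yields (1, 0), i.e. each relative movement is
    # re-expressed in the frame of the preceding segment's direction.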

    # Combine all the between waypoint sub-trajectories into a single file for each
    # data mode
    target_sensor_cols = None
    for mode in np.unique(df['mode'].values):
        # mode = 'valid'
        waypoint_mapping = {}

        sub_df = df.iloc[np.where(df['mode'] == mode)[0]]

        if only_process_test_sites:
            sub_df = sub_df[sub_df.test_site.values]
            save_ext = ''
        else:
            save_ext = '_all_sites'
        save_path = save_folder / (mode + save_ext + '.pickle')

        if save_path.is_file():
            continue
        print(mode)

        for fn_id, (fn, site, floor) in enumerate(
                zip(sub_df.fn, sub_df.site_id, sub_df.text_level)):
            #print(fn_id)
            path_ext = fn + '_reshaped.pickle'
            if mode == 'test':
                data_path = data_folder / mode / path_ext
                sub_fn_ids = np.where(sample_sub_fns == fn)[0]
                waypoint_times = sample_sub_times[sub_fn_ids]
                floor_int = submission.floor.values[sub_fn_ids[0]]
                waypoints = None
                relative_waypoint_movement_1 = None
                relative_waypoint_distances = None
                relative_waypoint_movement_2 = None
            else:
                try:
                    floor_int = utils.TEST_FLOOR_MAPPING[floor]
                except:
                    print(f"Failed {fn_id}")
                    continue
                data_path = data_folder / 'train' / site / floor / path_ext

            try:
                with open(data_path, 'rb') as f:
                    file_data = pickle.load(f)
            except:
                print(f"Failed {fn_id}")
                continue

            if mode != 'test':
                waypoint_times = file_data['waypoint'].time.values
                waypoints = file_data['waypoint']
                waypoint_pos = waypoints[['x_waypoint', 'y_waypoint']].values
                relative_waypoint_movement_1 = np.diff(waypoint_pos, axis=0)
                rel_angles = np.angle(relative_waypoint_movement_1[:, 0] + 1j *
                                      (relative_waypoint_movement_1[:, 1]))
                relative_waypoint_movement_2 = rotate_by_angles(
                    relative_waypoint_movement_1[1:], rel_angles[:-1])
                relative_waypoint_distances = np.sqrt(
                    (relative_waypoint_movement_1**2).sum(1))

            num_waypoints = waypoint_times.size

            # Chunk out the waypoint segments
            waypoint_segments = []
            fractions_time_covered = []
            shared_time = file_data['shared_time']
            share_time_vals = shared_time.time.values
            for i in range(num_waypoints - 1):
                start_time = waypoint_times[i]
                end_time = waypoint_times[i + 1]

                if target_sensor_cols is None:
                    target_sensor_cols = [
                        c for c in shared_time.columns
                        if any([sc in c for sc in sensor_cols])
                    ]

                start_row = max(0, (share_time_vals <= start_time).sum() - 1)
                end_row = min(share_time_vals.size,
                              (share_time_vals < end_time).sum() + 1)

                time_range = end_time - start_time
                covered_time = min(end_time,
                                   share_time_vals[end_row - 1]) - max(
                                       start_time, share_time_vals[start_row])
                fractions_time_covered.append(covered_time / time_range)

                # if fn == '5dc8e91a17ffdd0006f12ce0' and i == 0:
                #   import pdb; pdb.set_trace()
                #   x=1

                waypoint_segments.append(shared_time.iloc[np.arange(
                    start_row, end_row)])

            # if fn == '5dc8e91a17ffdd0006f12ce0':
            #   import pdb; pdb.set_trace()
            #   x=1

            # import pdb; pdb.set_trace()
            waypoint_mapping[fn] = {
                'site': site,
                'floor': floor_int,
                'num_waypoints': num_waypoints,
                'waypoints': waypoints,
                'waypoint_times': waypoint_times,
                'fractions_time_covered': np.array(fractions_time_covered),
                'waypoint_segments': waypoint_segments,
                'relative_waypoint_movement_1': relative_waypoint_movement_1,
                'relative_waypoint_distances': relative_waypoint_distances,
                'relative_waypoint_movement_2': relative_waypoint_movement_2,
            }

        # Save the combined mapping to disk
        with open(save_path, 'wb') as handle:
            pickle.dump(waypoint_mapping,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
Example #11
def run():
  only_process_test_sites = True
  data_folder = utils.get_data_folder()
  sensor_folder = data_folder / 'sensor_data'
  device_id_path = data_folder / 'device_ids.pickle'
  try:
    with open(device_id_path, 'rb') as f:
      device_ids = pickle.load(f)
    print("Extracting segment meta data (2/2)")
  except:
    device_ids = None
    print("Extracting segment meta data (1/2)")
  device_ext = '_no_device' if device_ids is None else ''
  save_ext = '' if only_process_test_sites else '_all_sites'
  save_path = sensor_folder / ('meta' + save_ext + device_ext + '.csv')
  if save_path.is_file():
    return device_ids is None
  summary_path = data_folder / 'file_summary.csv'
  df = pd.read_csv(summary_path)
  leaderboard_types_path = data_folder / 'leaderboard_type.csv'
  leaderboard_types = pd.read_csv(leaderboard_types_path)
  test_type_mapping = {fn: t for (fn, t) in zip(
    leaderboard_types.fn, leaderboard_types['type'])}
  
  # Combine all the sub-trajectory meta data
  all_sub_trajectories = []
  for mode in ['test', 'train', 'valid']:
    print(mode)
    load_path = sensor_folder / (mode + save_ext + '.pickle')
    with open(load_path, 'rb') as f:
      combined_mode = pickle.load(f)
      
    for fn in combined_mode:
      t = combined_mode[fn]
      
      site = t['site']
      level = t['floor']
      text_level = df.text_level.values[np.where(
        (df.site_id == site) & (df.level == level))[0][0]]
      num_waypoints = t['num_waypoints']
      waypoint_times = t['waypoint_times']
      sub_durations = np.diff(waypoint_times)
      
      waypoint_segments = t['waypoint_segments']
      waypoint_times = t['waypoint_times']
      relative_movements = t['relative_waypoint_movement_1']
      for i in range(num_waypoints-1):
        segment_time = waypoint_segments[i].time.values
        sensor_time_diff = np.diff(segment_time)
        start_time_offset = segment_time[0] - waypoint_times[i] 
        end_time_offset = segment_time[-1] - waypoint_times[i+1] 
        mean_robust_sensor_time_diff = sensor_time_diff[
          (sensor_time_diff >= 19) & (sensor_time_diff <= 21)].mean()
        
        if mode == 'test':
          distance_covered = None
          test_type = test_type_mapping[fn]
          plot_time = df.first_last_wifi_time.values[
            np.where(df.fn.values == fn)[0][0]]
        else:
          distance_covered = np.sqrt((relative_movements[i]**2).sum())
          test_type = ''
          plot_time = waypoint_times[i]
        
        all_sub_trajectories.append({
          'mode': mode,
          'site': site,
          'level': level,
          'text_level': text_level,
          'fn': fn,
          'device_id': None if device_ids is None else device_ids[fn][0],
          'device_id_merged': None if device_ids is None else (
            device_ids[fn][2]),
          'test_type': test_type,
          
          'plot_time': plot_time,
          'start_time': waypoint_times[i],
          'end_time': waypoint_times[i+1],
          'sub_trajectory_id': i,
          'num_waypoints': num_waypoints,
          'duration': sub_durations[i],
          'num_obs': segment_time.size,
          'start_time_offset': start_time_offset,
          'end_time_offset': end_time_offset,
          'mean_sensor_time_diff': sensor_time_diff.mean(),
          'mean_robust_sensor_time_diff': mean_robust_sensor_time_diff,
          'min_sensor_time_diff': sensor_time_diff.min(),
          'max_sensor_time_diff': sensor_time_diff.max(),
          'distance_covered': distance_covered,
          })
        
  combined = pd.DataFrame(all_sub_trajectories)
  combined.to_csv(save_path, index=False)
  
  return device_ids is None
Example #12
def run(mode):
    print("Model the sensor uncertainty")
    fn_mode = ['mean', 'joined_middle_last', 'first_middle_last'][0]
    skip_unbias_models = False
    overwrite_models = True
    max_train_folds = [1, None][1]
    additional_feature_cols = ['num_waypoints']

    params = {
        'objective': 'regression',
        'learning_rate': 0.005,
        'extra_trees': True,
        'num_leaves': 40,
        'n_estimators': int(1e3),
        'max_depth': -1,
        'min_child_samples': 1,
        'colsample_bynode': 0.4,
        'subsample_freq': 1,
        'subsample': 0.8,
        'metric': 'rmse',
        'verbose': -1,
        'n_jobs': 1
    }

    data_folder = utils.get_data_folder()
    model_folder = data_folder.parent / 'Models' / 'correct_sensor_preds'
    save_ext = '' if fn_mode == 'mean' else ' ' + fn_mode
    predict_ext = mode + save_ext + '.csv'
    save_folder = model_folder / 'predictions'
    pathlib.Path(save_folder).mkdir(parents=True, exist_ok=True)
    save_path = save_folder / predict_ext
    if save_path.is_file():
        return
    load_ext = '' if fn_mode == 'mean' else ' first_middle_last'
    data_path = model_folder / (mode + load_ext + '.csv')
    data = pd.read_csv(data_path)
    target_cols = [c for c in data.columns if c[-7:] == '_target']
    last_non_feature_col = target_cols[-1]

    if fn_mode == 'joined_middle_last':
        target_cols = [c[6:] for c in target_cols if c[:5] == 'first']

    last_non_feature_id = np.where(data.columns == last_non_feature_col)[0][0]
    feature_cols = additional_feature_cols + data.columns.tolist()[
        (last_non_feature_id + 1):]
    non_feature_cols = data.columns.tolist()[:(last_non_feature_id + 1)]

    def prepare_data(data, mode, fn_mode, target_cols, feature_cols=None):
        sub = data[(data['mode'] == mode)]

        if fn_mode == 'joined_middle_last':
            nrow = sub.shape[0]
            orig_sub = sub.copy()
            sub = pd.concat([sub, sub, sub])
            sub.index = np.arange(sub.shape[0])
            for c in target_cols:
                sub[c] = np.concatenate([
                    orig_sub['first_' + c].values,
                    orig_sub['middle_mean_' + c].values,
                    orig_sub['last_' + c].values,
                ])
            sub['segment_type'] = np.repeat(np.arange(3), nrow)

            if feature_cols is not None:
                feature_cols.append('segment_type')

        return sub

    folds = data.train_fold.values[data['mode'] == 'train'].astype(np.int32)
    unique_folds = np.sort(np.unique(folds))
    if max_train_folds is not None:
        unique_folds = unique_folds[:max_train_folds]
    num_folds = unique_folds.size
    predict_rows = np.where(data['mode'].values == mode)[0]
    predict_data = prepare_data(data, mode, fn_mode, target_cols, feature_cols)
    predict_features = predict_data[feature_cols].values
    combined = {k: data[k].values[predict_rows] for k in non_feature_cols}

    for target_col_id, target_col in enumerate(target_cols):
        unbias_target = 'abs_' not in target_col
        if not unbias_target or not skip_unbias_models:
            print(
                f'\nTarget {target_col_id+1} of {len(target_cols)}: {target_col}'
            )
            predict_targets = predict_data[target_col].values
            predict_fold_preds = {}
            predict_fold_preds_l = []
            for f_id, f in enumerate(unique_folds):
                print(f"Fold {f_id+1} of {num_folds}")
                model_path = model_folder / (target_col + ' - fold ' + str(
                    int(f)) + save_ext + '.pickle')
                if mode == 'valid' and (not model_path.is_file()
                                        or overwrite_models):
                    train_data = prepare_data(data, 'train', fn_mode,
                                              target_cols)
                    train_data = train_data[train_data['train_fold'] != f]
                    train_features = train_data[feature_cols].values
                    train_targets = train_data[target_col].values
                    non_nan_train_targets = ~np.isnan(train_targets)

                    model = lgb.LGBMRegressor(**params)
                    model = model.fit(train_features[non_nan_train_targets],
                                      train_targets[non_nan_train_targets],
                                      verbose=1)
                    with open(model_path, "wb") as handle:
                        pickle.dump(model,
                                    handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
                else:
                    with open(model_path, "rb") as file:
                        model = pickle.load(file)

                preds = model.predict(predict_features)
                predict_fold_preds[target_col + '_fold_' + str(f)] = preds
                predict_fold_preds_l.append(preds)

            avg_fold_pred = np.stack(predict_fold_preds_l, -1).mean(-1)
            predict_fold_preds[target_col + '_avg_fold'] = avg_fold_pred

            if mode == 'valid':
                if unbias_target:
                    original_avg_abs_error = np.abs(predict_targets).mean()
                    debiased_avg_abs_error = np.abs(predict_targets -
                                                    avg_fold_pred).mean()
                    print(f"Orig abs error: {original_avg_abs_error:.3f};\
     Debiased abs error: {debiased_avg_abs_error:.3f}")
                else:
                    abs_err_correlation = np.corrcoef(
                        np.stack([predict_targets, avg_fold_pred]))[0, 1]
                    print(f"Abs error correlation: {abs_err_correlation:.3f}")

            if fn_mode == 'joined_middle_last':
                orig_keys = list(predict_fold_preds.keys())
                for k in orig_keys:
                    for st_id, st in enumerate(
                        ['first_', 'middle_mean_', 'last_']):
                        start_index = st_id * predict_rows.size
                        end_index = (st_id + 1) * predict_rows.size
                        predict_fold_preds[
                            st +
                            k] = predict_fold_preds[k][start_index:end_index]
                for k in orig_keys:
                    del predict_fold_preds[k]

            combined.update(predict_fold_preds)

    combined_df = pd.DataFrame(combined)
    combined_df.to_csv(save_path, index=False)
Example #13
def run(overwrite_summary=False):
  print("Extracting metadata")
  data_folder = utils.get_data_folder()
  file_summaries = []
  
  sample_submission = pd.read_csv(data_folder / "submissions" / (
    "sample_submission.csv"))
  sample_submission_counts = {}
  all_sites = []
  for s in sample_submission.site_path_timestamp:
    all_sites.append(s.split("_")[0])
    file_name = s.split("_")[1]
    if file_name in sample_submission_counts:
      sample_submission_counts[file_name] += 1
    else:
      sample_submission_counts[file_name] = 1
  test_sites = list(set(all_sites))
  
  summary_path = data_folder / "file_summary.csv"
  if summary_path.is_file() and not overwrite_summary:
    df = pd.read_csv(summary_path)
  else:
    for mode in ["train", "test"]:
      main_folder = data_folder / mode
      main_data_folders_or_files = sorted(main_folder.iterdir())
      if mode == "train":
        # Loop over all train data and extract the site ID
        for f in main_data_folders_or_files:
          # sub_folder = main_folder / f
          sub_folder = f
          sub_sub_folders = sorted(sub_folder.iterdir())
          sub_sub_folders = [
              s for s in sub_sub_folders if not s.suffix == ".pickle"
          ]
          site_id = None
          for sub_sub_ext in sub_sub_folders:
            # sub_sub_path = sub_folder / sub_sub_ext
            sub_sub_path = sub_sub_ext
            sub_sub_files = sorted(sub_sub_path.iterdir())
            sub_sub_files = [s for s in sub_sub_files if s.suffix == ".txt"]
            for e in sub_sub_files:
              print(f"{len(file_summaries)+1} of 27549")
              # file_path = sub_sub_path / e
              file_path = e
              file_summary, site_id, complete_file = get_file_summary(
                  file_path,
                  site_id,
                  mode,
                  f,
                  sub_sub_ext,
                  e,
                  None,
                  test_sites,
                  data_folder,
              )
  
              if complete_file:
                # The file train/5cd56b83e2acfd2d33b5cab0/B2/5cf72539e9d9c9000852f45b.txt seems cut short
                file_summaries.append(file_summary)
      else:
        main_data_folders_or_files = [
            s for s in main_data_folders_or_files if s.suffix == ".txt"
        ]
        for e in main_data_folders_or_files:
          site_id = None
          print(f"{len(file_summaries)+1} of 27549")
          # file_path = main_folder / e
          file_path = e
          file_summary, site_id, _ = get_file_summary(
              file_path,
              site_id,
              mode,
              None,
              None,
              e,
              sample_submission_counts,
              test_sites,
              data_folder
          )
  
          file_summaries.append(file_summary)
  
    df = pd.DataFrame(file_summaries)
    df = df.astype({
        "num_test_waypoints": "Int64",
        "num_train_waypoints": "Int64",
        "level": "Int64",
        "first_last_wifi_time": "Int64",
    })
  
  # # Potential subsequent run of the script
  # if not 'text_level' in df.columns:
  #   df['text_level'] = None
  #   for i in range(df.shape[0]):
  #     print(i)
  #     if df['mode'][i] == 'train':
  #       text_level = df['ext_path'][i].split('/')[2]
  #       df.loc[i, 'text_level'] = text_level
  
  df.to_csv(summary_path, index=False)
Example #14
    def test_get_data_folder(self):
        data_folder = utils.get_data_folder()
        self.assertEqual(data_folder, "data/")
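A minimal get_data_folder consistent with this particular test would simply return a hard-coded location. Note that the other examples above treat the return value as a pathlib.Path, so the real helper is presumably richer; this is only an assumed stub:

def get_data_folder():
    # Hypothetical stub: return the repository-relative data location.
    return "data/"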
Example #15
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras import layers, callbacks
from tensorflow.python.keras.utils.vis_utils import plot_model

from utils import get_loss
from utils import TEST_SITES
from utils import get_data_folder

N_SPLITS = 3
SITE_IDX = None
N_TOP_BSSIDS = 20
BATCH_SIZE = 128
OVERWRITE = True

data_folder = get_data_folder()
summary_path = data_folder / "file_summary.csv"
stratified_holdout_path = data_folder / "holdout_ids.csv"

# Using https://www.kaggle.com/hiro5299834/indoor-navigation-and-location-wifi-features
pivot_data_folder = data_folder / "pivot"

holdout_df = pd.read_csv(stratified_holdout_path)
if SITE_IDX is None:
  sites = TEST_SITES
  pivot_paths = [pivot_data_folder / f"{site}_train.csv" for site in sites]
  holdout_df = holdout_df[holdout_df["test_site"]]
else:
  analysis_site = TEST_SITES[SITE_IDX]
  sites = [analysis_site]
  holdout_df = holdout_df[holdout_df["site_id"] == analysis_site]
Example #16
def run():
    print("Creating validation set")
    require_valid_waypoints_in_train = True
    max_valid_unique_fraction = 0.4
    max_valid_unique_count_per_trajectory = 15
    prob_allow_hardest_trajectories = 0.15
    min_waypoints_holdout = 6  # Reflect the test data - only put long trajectories in the holdout set
    holdout_fraction = 0.08
    np.random.seed(14)
    data_folder = utils.get_data_folder()
    summary_path = data_folder / 'file_summary.csv'
    stratified_holdout_path = data_folder / 'holdout_ids.csv'
    if not stratified_holdout_path.is_file():
        df = pd.read_csv(summary_path)

        if require_valid_waypoints_in_train:
            train_waypoints, waypoint_counts = utils.get_train_waypoints(
                data_folder, df)

        sites = sorted(set(df.site_id))
        valid_trajectory_seen_train = []
        floor_dfs = []
        for s in sites:
            floors = sorted(
                set(df.text_level[(df.site_id == s).values
                                  & (df['mode'] == 'train').values]))
            for f in floors:
                floor_df = df.iloc[(df.site_id == s).values
                                   & (df.text_level == f).values]
                if require_valid_waypoints_in_train:
                    try:
                        floor_int = utils.TEST_FLOOR_MAPPING[f]
                    except KeyError:
                        floor_int = utils.NON_TEST_FLOOR_MAPPING[f]
                    floor_waypoints = train_waypoints.iloc[
                        (train_waypoints.site_id.values == s)
                        & (train_waypoints.level.values.astype(np.float32) ==
                           floor_int)]
                    floor_waypoint_counts = {
                        (str(k[2]), str(k[3])): v
                        for k, v in waypoint_counts.items()
                        if k[0] == s and (k[1] == floor_int)
                    }

                    considered_seq_ids = []
                    fn_sorted_ids = np.argsort(
                        -floor_df.num_train_waypoints.values)
                    for i, fn in enumerate(floor_df.fn.values[fn_sorted_ids]):
                        fn_waypoints = floor_waypoints[floor_waypoints.fn ==
                                                       fn]
                        if fn_waypoints.shape[0] == 0:
                            continue
                        if fn_waypoints.shape[
                                0] < min_waypoints_holdout and len(
                                    considered_seq_ids) > 0:
                            break
                        waypoint_counts_fn = np.array([
                            floor_waypoint_counts[(str(x), str(y))] for x, y in
                            zip(fn_waypoints.x.values, fn_waypoints.y.values)
                        ])
                        this_waypoint_counts = np.zeros(fn_waypoints.shape[0])
                        waypoint_vals = np.stack(
                            [fn_waypoints.x.values, fn_waypoints.y.values])
                        for j in range(fn_waypoints.shape[0]):
                            this_waypoint_counts[j] = (
                                (waypoint_vals[0] == waypoint_vals[0, j]) &
                                (waypoint_vals[1] == waypoint_vals[1,
                                                                   j])).sum()

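                        # Estimate how often each waypoint of this trajectory
                        # also occurs in other trajectories on the same floor;
                        # this drives the unique_fraction / unique_count
                        # criteria for selecting validation trajectories.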
                        unseen_normalized = (waypoint_counts_fn >
                                             this_waypoint_counts) / (
                                                 this_waypoint_counts)
                        non_unique_count = unseen_normalized.sum()
                        total_count = (1 / this_waypoint_counts).sum()
                        unique_fraction = 1 - (non_unique_count / total_count)
                        unique_count = np.round(total_count - non_unique_count)

                        # if (unique_fraction <= max_valid_unique_fraction) and not (
                        #     unique_count <= max_valid_unique_count_per_trajectory):
                        #   import pdb; pdb.set_trace()
                        #   x=1

                        if (unique_fraction <= max_valid_unique_fraction) and (
                                unique_count <=
                                max_valid_unique_count_per_trajectory) or (
                                    np.random.uniform() <
                                    prob_allow_hardest_trajectories):
                            considered_seq_ids.append(fn_sorted_ids[i])
                            for x, y in zip(fn_waypoints.x.values,
                                            fn_waypoints.y.values):
                                floor_waypoint_counts[(str(x), str(y))] -= 1

                    if len(considered_seq_ids) == 0:
                        if floor_df.test_site.values[0]:
                            raise ValueError(
                                "No valid validation trajectories selected")
                        considered_seq_ids = [fn_sorted_ids[0]]

                    considered_seq_ids = np.array(considered_seq_ids)
                else:
                    considered_seq_ids = np.where(
                        floor_df.num_train_waypoints.values >= (
                            min(min_waypoints_holdout,
                                floor_df.num_train_waypoints.max())))[0]
                # if (s, f) == ('5d2709a003f801723c3251bf', '3F'):
                #   import pdb; pdb.set_trace()
                num_holdout_trajectories = min(
                    considered_seq_ids.size,
                    max(1, int(floor_df.shape[0] * holdout_fraction)))
                if num_holdout_trajectories == considered_seq_ids.size:
                    holdout_ids = considered_seq_ids
                else:
                    probs = floor_df.num_train_waypoints.values[
                        considered_seq_ids]**1
                    probs = probs / probs.sum()
                    holdout_ids = np.random.choice(considered_seq_ids,
                                                   num_holdout_trajectories,
                                                   replace=False,
                                                   p=probs)
                floor_waypoint_counts = {(str(k[2]), str(k[3])): v
                                         for k, v in waypoint_counts.items()
                                         if k[0] == s and (k[1] == floor_int)}
                for fn in floor_df.fn.values[holdout_ids]:
                    fn_waypoints = floor_waypoints[floor_waypoints.fn == fn]
                    for x, y in zip(fn_waypoints.x.values,
                                    fn_waypoints.y.values):
                        floor_waypoint_counts[(str(x), str(y))] -= 1

                num_train_waypoints = floor_df.num_train_waypoints.values
                test_site = floor_df.test_site.values
                floor_df = floor_df.iloc[:, :4]
                floor_df.index = np.arange(floor_df.shape[0])
                floor_df['text_level'] = f
                floor_df['holdout'] = False
                floor_df.loc[holdout_ids, 'holdout'] = True
                floor_df.loc[holdout_ids, 'mode'] = 'valid'
                floor_df['num_train_waypoints'] = num_train_waypoints
                floor_df['test_site'] = test_site

                for valid_fn in floor_df.fn.values[floor_df.holdout.values]:
                    fn_waypoints = floor_waypoints[floor_waypoints.fn ==
                                                   valid_fn][['x', 'y']].values
                    if df.test_site.values[(df.site_id == s)][0]:
                        waypoints_in_train = np.array([
                            floor_waypoint_counts[(str(
                                fn_waypoints[i, 0]), str(fn_waypoints[i, 1]))]
                            > 0 for i in (range(fn_waypoints.shape[0]))
                        ])
                        valid_trajectory_seen_train.append(
                            (waypoints_in_train.shape[0],
                             waypoints_in_train.sum()))

                # if s == '5d2709c303f801723c3299ee' and f == '1F':
                #   import pdb; pdb.set_trace()
                #   x=1

                floor_dfs.append(floor_df)

        valid_trajectory_seen_train = np.array(valid_trajectory_seen_train)
        entire_traj_in_train = (
            valid_trajectory_seen_train[:,
                                        0] == valid_trajectory_seen_train[:,
                                                                          1])
        entire_waypoints_fraction = valid_trajectory_seen_train[
            entire_traj_in_train,
            0].sum() / valid_trajectory_seen_train[:, 0].sum()
        print(
            entire_traj_in_train.mean(), entire_waypoints_fraction.mean(),
            valid_trajectory_seen_train[:, 1].sum() /
            valid_trajectory_seen_train[:, 0].sum())

        combined_df = pd.concat(floor_dfs)
        combined_df.to_csv(stratified_holdout_path, index=False)
def get_embedded_description_file_name():
    embedded_description_file_name = (get_data_folder() +
                                      get_embedding_file_name_prefix() +
                                      'features.npy')
    return embedded_description_file_name
def get_steam_inventory_file_name(profile_id):
    steam_inventory_file_name = get_data_folder() + 'inventory_' + str(profile_id) + '.json'

    return steam_inventory_file_name
def get_embedding_app_id_file_name():
    embedding_app_id_file_name = (get_data_folder() +
                                  get_embedding_file_name_prefix() +
                                  'appids.txt')
    return embedding_app_id_file_name
def read_all_recs(df):
    base_path = utils.get_data_folder()
    return [
        read_one_rec(rec, base_path)
        for _, rec in tqdm(df.iterrows(), total=len(df))
    ]
    "5da1382d4db8ce0c98bbe92e",  # 14 - Well aligned, no waypoints inside buildings
    "5da138314db8ce0c98bbf3a0",  # 15 - Well aligned, occasional waypoints at edge of buildings
    "5da138364db8ce0c98bc00f1",  # 16 - Well aligned, occasional waypoints at edge of buildings
    "5da1383b4db8ce0c98bc11ab",  # 17 - Well aligned, no waypoints inside buildings
    "5da138754db8ce0c98bca82f",  # 18 - Well aligned, waypoints often at edges of buildings, sometimes inside buildings
    "5da138764db8ce0c98bcaa46",  # 19 - Well aligned, no waypoints inside buildings
    "5da1389e4db8ce0c98bd0547",  # 20 - Well aligned, no waypoints inside buildings. Some open areas seem unaccessible
    "5da138b74db8ce0c98bd4774",  # 21 - Well aligned, no waypoints inside buildings
    "5da958dd46f8266d0737457b",  # 22 - Well aligned, rare waypoints inside buildings
    "5dbc1d84c1eb61796cf7c010",  # 23 - Well aligned, no waypoints inside buildings
    "5dc8cea7659e181adb076a3f",  # 24 - Well aligned, no waypoints inside buildings
][4]
level_filter = [None, "F1", -1][2]
target_fn = [None, "5dd5290c50e04e0006f5651e", "5dd5216b50e04e0006f56476"][0]

data_folder = utils.get_data_folder()
summary_path = data_folder / "file_summary.csv"
if not "df" in locals():
    df = pd.read_csv(summary_path)

considered_ids = np.where(df["mode"] == "train")[0]
if only_consider_test_sites:
    considered_ids = considered_ids[df.test_site[considered_ids]]
if site_filter is not None:
    considered_ids = considered_ids[df.site_id[considered_ids] == site_filter]
if level_filter is not None:
    if isinstance(level_filter, int):
        considered_ids = considered_ids[df.level[considered_ids] ==
                                        level_filter]
    else:
        considered_ids = considered_ids[df.text_level[considered_ids] ==
                                        level_filter]
Beispiel #22
0
def run(mode):
    print("Calculating the sensor uncertainty for unseen trajectories")
    save_uncertainties = True
    distance_model_dist_estimates = True
    apply_angle_correction_threshold = 0  # >= 0.4 means no corrections
    apply_distance_correction_threshold = 0  # >= 0.4 means no corrections
    predictions_ext = mode + '.csv'

    source_valid_test_predictions = [
        'relative_movement_v3_valid.csv', 'relative_movement_v3_test.csv'
    ]
    source_valid_test_dist_predictions = [
        'distance_valid.csv', 'distance_test.csv'
    ]
    fn_mode = 'first_middle_last' if 'first_middle_last' in predictions_ext else (
        'joined_middle_last'
        if 'joined_middle_last' in predictions_ext else 'mean')

    data_folder = utils.get_data_folder()
    model_folder = data_folder.parent / 'Models' / 'correct_sensor_preds'
    uncertainty_path = model_folder / 'predictions' / ('uncertainty - ' +
                                                       predictions_ext)
    if uncertainty_path.is_file():
        return

    source_pred_folder = data_folder.parent / 'Models' / (
        'sensor_absolute_movement') / 'predictions'
    source_ext = source_valid_test_predictions[int(mode == 'test')]
    source_path = source_pred_folder / source_ext
    preds = pd.read_csv(source_path)

    source_dist_pred_folder = data_folder.parent / 'Models' / (
        'sensor_distance') / 'predictions'
    source_dist_ext = source_valid_test_dist_predictions[int(mode == 'test')]
    source_dist_path = source_dist_pred_folder / source_dist_ext
    dist_preds = pd.read_csv(source_dist_path)

    corrections_path = model_folder / 'predictions' / predictions_ext
    corrections = pd.read_csv(corrections_path)
    uncertainties = pd.read_csv(source_path)

    fns = np.sort(np.unique(preds.fn.values))
    preds['distance'] = np.sqrt(preds['x']**2 + preds['y']**2)
    if distance_model_dist_estimates:
        assert np.all(dist_preds.fn.values == preds.fn.values)
        preds['pred_distance'] = dist_preds.pred.values
    else:
        preds['pred_distance'] = np.sqrt(preds['x_pred']**2 +
                                         preds['y_pred']**2)
    preds['pred_distance_corrected'] = preds['pred_distance']
    preds['x_pred_corrected'] = preds['x_pred']
    preds['y_pred_corrected'] = preds['y_pred']
    uncertainties['distance_uncertainty'] = np.nan
    uncertainties['pred_distance_uncertainty'] = np.nan
    uncertainties['angle_uncertainty'] = np.nan
    uncertainties['pred_angle_uncertainty'] = np.nan

    uncertainty_map_mean = [
        ('mean_abs_rel_dist_error_target', 'distance_uncertainty'),
        ('mean_abs_rel_dist_error_target_avg_fold',
         'pred_distance_uncertainty'),
        ('mean_abs_angle_error_target', 'angle_uncertainty'),
        ('mean_abs_angle_error_target_avg_fold', 'pred_angle_uncertainty'),
    ]
    uncertainty_map_first = [
        ('first_abs_rel_dist_error_target', 'distance_uncertainty'),
        ('first_abs_rel_dist_error_target_avg_fold',
         'pred_distance_uncertainty'),
        ('first_abs_angle_error_target', 'angle_uncertainty'),
        ('first_abs_angle_error_target_avg_fold', 'pred_angle_uncertainty'),
    ]
    uncertainty_map_middle = [
        ('middle_mean_abs_rel_dist_error_target', 'distance_uncertainty'),
        ('middle_mean_abs_rel_dist_error_target_avg_fold',
         'pred_distance_uncertainty'),
        ('middle_mean_abs_angle_error_target', 'angle_uncertainty'),
        ('middle_mean_abs_angle_error_target_avg_fold',
         'pred_angle_uncertainty'),
    ]
    uncertainty_map_last = [
        ('last_abs_rel_dist_error_target', 'distance_uncertainty'),
        ('last_abs_rel_dist_error_target_avg_fold',
         'pred_distance_uncertainty'),
        ('last_abs_angle_error_target', 'angle_uncertainty'),
        ('last_abs_angle_error_target_avg_fold', 'pred_angle_uncertainty'),
    ]

    for fn in fns:
        pred_rows = np.where(preds.fn.values == fn)[0]
        uncertainty_rows = np.where(uncertainties.fn.values == fn)[0]
        correct_row = np.where(corrections.fn.values == fn)[0][0]

        if fn_mode == 'mean':
            if 'mean_rel_dist_error_target_avg_fold' in corrections.columns:
                mean_correction = corrections[
                    'mean_rel_dist_error_target_avg_fold'].values[correct_row]
                if np.abs(
                        mean_correction) > apply_distance_correction_threshold:
                    preds.loc[pred_rows, 'pred_distance_corrected'] = preds[
                        'pred_distance'].values[pred_rows] * (1 +
                                                              mean_correction)

            if 'mean_angle_error_target_avg_fold' in corrections.columns:
                angle_correction = corrections[
                    'mean_angle_error_target_avg_fold'].values[correct_row]
                if np.abs(angle_correction) > apply_angle_correction_threshold:
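                    # Apply the estimated angle correction by rotating the
                    # predicted relative movement vector (x_pred, y_pred).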
                    x = preds.x_pred.values[pred_rows]
                    y = preds.y_pred.values[pred_rows]
                    preds.loc[pred_rows, 'x_pred_corrected'] = np.cos(
                        angle_correction) * x + np.sin(angle_correction) * y
                    preds.loc[pred_rows, 'y_pred_corrected'] = -np.sin(
                        angle_correction) * x + np.cos(angle_correction) * y

            for k1, k2 in uncertainty_map_mean:
                v = corrections[k1].values[correct_row]
                uncertainties.loc[uncertainty_rows, k2] = v
        else:
            first_correction = corrections[
                'first_rel_dist_error_target_avg_fold'].values[correct_row]
            middle_correction = corrections[
                'middle_mean_rel_dist_error_target_avg_fold'].values[
                    correct_row]
            last_correction = corrections[
                'last_rel_dist_error_target_avg_fold'].values[correct_row]

            if np.abs(first_correction) > apply_distance_correction_threshold:
                preds.loc[
                    pred_rows[0],
                    'pred_distance_corrected'] = preds['pred_distance'].values[
                        pred_rows[0]] * (1 + first_correction)
            if np.abs(middle_correction) > apply_distance_correction_threshold:
                preds.loc[
                    pred_rows[1:-1],
                    'pred_distance_corrected'] = preds['pred_distance'].values[
                        pred_rows[1:-1]] * (1 + middle_correction)
            if np.abs(last_correction) > apply_distance_correction_threshold:
                preds.loc[
                    pred_rows[-1],
                    'pred_distance_corrected'] = preds['pred_distance'].values[
                        pred_rows[-1]] * (1 + last_correction)

            for k1, k2 in uncertainty_map_first:
                v = corrections[k1].values[correct_row]
                uncertainties.loc[uncertainty_rows[0], k2] = v

            for k1, k2 in uncertainty_map_middle:
                v = corrections[k1].values[correct_row]
                uncertainties.loc[uncertainty_rows[1:-1], k2] = v

            for k1, k2 in uncertainty_map_last:
                v = corrections[k1].values[correct_row]
                uncertainties.loc[uncertainty_rows[-1], k2] = v

    preds['pred_error'] = np.abs((preds['distance'] - preds['pred_distance']))
    preds['corrected_error'] = np.abs(
        (preds['distance'] - preds['pred_distance_corrected']))
    preds['rel_pred_dist_error'] = (
        preds['distance'] - preds['pred_distance']) / (preds['pred_distance'])
    preds['rel_corrected_dist_error'] = (preds['distance'] -
                                         preds['pred_distance_corrected']) / (
                                             preds['pred_distance_corrected'])
    orig_abs_err = (np.abs(preds['x'] - preds['x_pred']) +
                    np.abs(preds['y'] - preds['y_pred'])).mean() / 2
    corrected_abs_err = (
        np.abs(preds['x'] - preds['x_pred_corrected']) +
        np.abs(preds['y'] - preds['y_pred_corrected'])).mean() / 2

    mean_orig_rel_dist_err = np.abs(preds['rel_pred_dist_error'].values).mean()
    mean_corrected_rel_dist_err = np.abs(
        preds['rel_corrected_dist_error'].values).mean()
    rel_err = mean_corrected_rel_dist_err / mean_orig_rel_dist_err
    print(f'Corrected relative error rate: {rel_err:.3f}')

    orig_dist_mae = preds.pred_error.values.mean()
    corrected_dist_mae = preds.corrected_error.values.mean()
    print(f'Original MAE: {orig_dist_mae:.3f}; '
          f'Corrected MAE: {corrected_dist_mae:.3f}')
    print(f'Rel move original MAE: {orig_abs_err:.3f}; '
          f'Corrected rel move MAE: {corrected_abs_err:.3f}')

    changed_fraction = (preds['pred_error'] != preds['corrected_error']).mean()
    improved_fraction = (preds['pred_error'] > preds['corrected_error']
                         ).mean() / ((changed_fraction + 1e-9))
    print(f'Improved distance pred fraction: {improved_fraction:.3f}')

    dist_uncertainty_cor = np.corrcoef(
        np.stack([
            uncertainties['pred_distance_uncertainty'].values,
            uncertainties['distance_uncertainty'].values,
        ]))[0, 1]
    angle_uncertainty_cor = np.corrcoef(
        np.stack([
            uncertainties['pred_angle_uncertainty'].values,
            uncertainties['angle_uncertainty'].values,
        ]))[0, 1]
    print(f'Distance uncertainty correlation: {dist_uncertainty_cor:.3f}')
    print(f'Angle uncertainty correlation: {angle_uncertainty_cor:.3f}')
    uncertainties.plot.scatter('pred_distance_uncertainty',
                               'distance_uncertainty')

    uncertainties = uncertainties[[
        'site',
        'floor',
        'fn',
        'sub_trajectory_id',
        'num_waypoints',
        'distance_uncertainty',
        'pred_distance_uncertainty',
        'angle_uncertainty',
        'pred_angle_uncertainty',
    ]]
    if save_uncertainties:
        uncertainty_path = model_folder / 'predictions' / ('uncertainty - ' +
                                                           predictions_ext)
        uncertainties.to_csv(uncertainty_path, index=False)
Beispiel #23
0
import itertools
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from scipy.interpolate import interp2d

try:
    from utils import get_data_folder, TEST_FLOOR_MAPPING
except ImportError:
    import sys
    sys.path.append('..')
    from utils import get_data_folder, TEST_FLOOR_MAPPING

DEFAULT_WAYPOINT_PATH: Path = get_data_folder(
) / "waypoints" / "waypoint_by_hand.csv"


def generate_grid_points(site, floor, bottom_left, top_left, bottom_right,
                         top_right, N_h, N_v):
    X = np.array([0, 0, 1, 1], dtype=np.float32)
    Y = np.array([0, 1, 0, 1], dtype=np.float32)
    Zx, Zy = np.stack([bottom_left, top_left, bottom_right, top_right], axis=1)
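    # Bilinearly interpolate from the unit square (u, v) in [0, 1]^2 to the
    # quadrilateral spanned by the four corner points.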
    fx = interp2d(X, Y, Zx)
    fy = interp2d(X, Y, Zy)

    U = np.linspace(0, 1, N_h)
    V = np.linspace(0, 1, N_v)
    W = np.array(list([fx(u, v), fy(u, v)]
                      for u, v in itertools.product(U, V)))
    W = np.squeeze(W)
def get_all_floor_preds(mode,
                        config,
                        site,
                        site_id,
                        data_folder,
                        model_folder,
                        df,
                        analysis_floors,
                        debug_fn,
                        verbose=True):
    site_floors = np.sort(
        np.unique(df.text_level[(df.site_id == site)
                                & (df['mode'] != 'test')].values))

    if debug_fn is not None and df.site_id.values[np.where(
            df.fn.values == debug_fn)[0][0]] != site:
        return None

    # Load the wifi models for all site floors
    models = {}
    for floor in site_floors:
        mode_prefix = 'test-' if mode == 'test' else ''
        model_path = model_folder / site / (mode_prefix + floor + '.pickle')

        with open(model_path, 'rb') as f:
            models[floor] = pickle.load(f)

    # Load all prediction inputs
    trajectories = []
    for floor_id, floor in enumerate(site_floors):
        if verbose:
            print(f"Load floor {floor_id+1} of {site_floors.size}")
        numeric_floor = utils.TEST_FLOOR_MAPPING[floor]
        site_df = df[(df.site_id == site) & (df.num_wifi > 0)]
        analysis_df = site_df[(site_df['mode'] == mode)]

        target_floors = np.array(
            [analysis_floors[fn] for fn in analysis_df['fn'].values])
        correct_floor = target_floors == numeric_floor
        analysis_df_floor = analysis_df[correct_floor]

        if analysis_df_floor.shape[0] > 0:
            test_floor = floor if mode == 'test' else None
            trajectories.extend(
                utils.load_site_floor(analysis_df_floor,
                                      recompute_grouped_data=False,
                                      test_floor=test_floor))

    # Generate predictions for all trajectories for all floors
    make_predict_efforts = [True for t in trajectories]
    analysis_preds = []
    for j, t in enumerate(trajectories):
        if verbose:
            print(f"Trajectory {j+1} of {len(trajectories)}")
        fn = t['file_meta'].fn

        if (debug_fn is not None) and fn != debug_fn:
            continue

        debug_floor_distances = {}
        for floor in site_floors:
            # Locate all unique wifi time observations
            _, full_pos_pred = predict_trajectory(t, make_predict_efforts[j],
                                                  models[floor], True, config)

            distances = full_pos_pred.values[:, 2:]
            min_distances = distances.min(0)

            if (debug_fn is not None) and fn == debug_fn:
                print(debug_fn, floor)
                debug_floor_distances[floor] = min_distances

            analysis_preds.append({
                'site': site,
                'floor': floor,
                'numeric_floor': utils.TEST_FLOOR_MAPPING[floor],
                'reference_floor_label': analysis_floors[fn],
                'fn': fn,
                'min_min_distance': min_distances.min(),
                'mean_min_distance': min_distances.mean(),
                'max_min_distance': min_distances.max(),
                'min_distance_q0.1': np.quantile(min_distances, 0.1),
                'min_distance_q0.2': np.quantile(min_distances, 0.2),
                'min_distance_q0.3': np.quantile(min_distances, 0.3),
                'min_distance_q0.4': np.quantile(min_distances, 0.4),
                'min_distance_q0.5': np.quantile(min_distances, 0.5),
                'min_distance_q0.6': np.quantile(min_distances, 0.6),
                'min_distance_q0.7': np.quantile(min_distances, 0.7),
                'min_distance_q0.8': np.quantile(min_distances, 0.8),
                'min_distance_q0.9': np.quantile(min_distances, 0.9),
            })

        if (debug_fn is not None) and fn == debug_fn:
            combined_debug = pd.DataFrame(debug_floor_distances)
            debug_save_path = utils.get_data_folder() / (
                "all_floor_wifi_distances " + debug_fn + '.csv')
            combined_debug.to_csv(debug_save_path, index=True)

    print(f"Done with site {site_id+1} of 24: {site}")

    return pd.DataFrame(analysis_preds)
def run():
    print("Computing aggregate statistics")
    data_folder = utils.get_data_folder()
    processed_folder = data_folder / 'processed'
    pathlib.Path(processed_folder).mkdir(parents=True, exist_ok=True)
    last_processed_path = processed_folder / 'tests_level_id.p'
    if last_processed_path.is_file():
        return

    df = pd.read_csv(data_folder / 'file_summary.csv')
    # start_time = time.time()
    recs = read_all_recs(
        #df[(df['test_site']) & (~df['num_train_waypoints'].isnull()) & (df['num_wifi'] > 0)]
        df[(df['test_site'])])
    #print(len(recs))
    #print(f"Done in {time.time()-start_time:8.5f}s")

    agg = {}

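    # Aggregate per-channel mean/std of the IMU signals over all test-site
    # trajectories.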
    for col in tqdm([
            'x_acce', 'y_acce', 'z_acce', 'x_gyro', 'y_gyro', 'z_gyro',
            'x_magn', 'y_magn', 'z_magn', 'x_ahrs', 'y_ahrs', 'z_ahrs'
    ]):
        x = np.concatenate([r['shared_time'][col] for r in recs])
        agg[col] = {"mean": x.mean(), "std": x.std()}

    for col in tqdm(['x_waypoint', 'y_waypoint']):
        x = np.concatenate(
            [r['waypoint'][col] for r in recs if 'waypoint' in r])
        agg[col] = {"mean": x.mean(), "std": x.std()}

    x = np.concatenate([r['wifi']['rssid_wifi'] for r in recs if 'wifi' in r])
    agg['wifi'] = {
        'mean': x.mean(),
        "std": x.std(),
        "min": x.min(),
        "max": x.max()
    }

    for col in tqdm(['power_beac', 'rssi_beac']):
        x = np.concatenate([r['ibeacon'][col] for r in recs if 'ibeacon' in r])
        agg[col] = {
            'mean': x.mean(),
            "std": x.std(),
            "min": x.min(),
            "max": x.max()
        }

    agg['wifi']['max_records_per_t1'] = np.max([
        r['wifi'].groupby('t1_wifi')['bssid_wifi'].size().max() for r in recs
        if 'wifi' in r
    ])
    agg['wifi']['max_records_per_t2'] = np.max([
        r['wifi'].groupby('t2_wifi')['bssid_wifi'].size().max() for r in recs
        if 'wifi' in r
    ])

    agg['wifi']['max_unique_t1'] = np.max(
        [r['wifi']['t1_wifi'].nunique() for r in recs if 'wifi' in r])
    agg['wifi']['max_unique_t2'] = np.max(
        [r['wifi']['t2_wifi'].nunique() for r in recs if 'wifi' in r])

    agg['max_seq_len'] = np.max([len(r['shared_time']['time']) for r in recs])

    with open(processed_folder / 'tests_stats.p', 'wb') as handle:
        pickle.dump(agg, handle, protocol=4)
    with open(processed_folder / 'tests_ssid_wifi.p', 'wb') as handle:
        pickle.dump(set(
            np.concatenate(
                [r['wifi']['ssid_wifi'] for r in recs if 'wifi' in r])),
                    handle,
                    protocol=4)
    with open(processed_folder / 'tests_bssid_wifi.p', 'wb') as handle:
        pickle.dump(set(
            np.concatenate(
                [r['wifi']['bssid_wifi'] for r in recs if 'wifi' in r])),
                    handle,
                    protocol=4)
    with open(processed_folder / 'tests_id_beac_1.p', 'wb') as handle:
        pickle.dump(set(
            np.concatenate(
                [r['ibeacon']['id_beac_1'] for r in recs if 'ibeacon' in r])),
                    handle,
                    protocol=4)
    with open(processed_folder / 'tests_id_beac_2.p', 'wb') as handle:
        pickle.dump(set(
            np.concatenate(
                [r['ibeacon']['id_beac_2'] for r in recs if 'ibeacon' in r])),
                    handle,
                    protocol=4)
    with open(processed_folder / 'tests_id_beac_3.p', 'wb') as handle:
        pickle.dump(set(
            np.concatenate(
                [r['ibeacon']['id_beac_3'] for r in recs if 'ibeacon' in r])),
                    handle,
                    protocol=4)
    with open(processed_folder / 'tests_mac_beac.p', 'wb') as handle:
        pickle.dump(set(
            np.concatenate(
                [r['ibeacon']['mac_beac'] for r in recs if 'ibeacon' in r])),
                    handle,
                    protocol=4)
    with open(processed_folder / 'tests_site_id.p', 'wb') as handle:
        pickle.dump(set([r['site_id'] for r in recs]), handle, protocol=4)
    with open(last_processed_path, 'wb') as handle:
        pickle.dump(set(
            [r['site_id'] + '_' + str(r['text_level']) for r in recs]),
                    handle,
                    protocol=4)
Beispiel #26
0
 def test_get_data_folder(self):
     folder_name = utils.get_data_folder()
     self.assertGreater(len(folder_name), 0)
def run(mode, grid_type, consider_multiprocessing):
    print(f"Optimizing predictions for grid type {grid_type}")
    store_valid_submission = True
    store_extended_test = True
    debug_fn = [None, '58279d6ab8c2213722f2ef6b'][0]
    extensive_search = True
    additional_grid_multiprocessing = consider_multiprocessing
    consider_ignore_private_test = False
    grid_mode = ["standard", "dense"][int(grid_type == "dense_inner")]
    grid_settings = {
        "standard": {
            "min_distance_to_known": 3.0,
            "wall_point_distance_multiplier": 0.4,
            "inner_point_distance_multiplier": 0.7,
        },
        "dense": {
            "min_distance_to_known": 1.5,
            "wall_point_distance_multiplier": 0.2,
            "inner_point_distance_multiplier": 0.35,
        },
    }

    # V3: no inner grid fill; V4: inner grid fill + fixes
    grid_version = [3, 4][int(grid_type != "walls_only_old")]
    cheat_valid_waypoints = not True

    config = {
        'top_distance_pos_wifi': 20,
        'weighted_pos_exponent': 4,
        'waypoint_weighted_wifi_penalties_mult': 0.8,
        'nn_wifi_exp': 1.5,
        'wifi_penalties_exp': 0.8,
        'time_leak_delay_cutoff': 15,
        'time_leak_time_decay_constant': 20,
        'time_leak_nearby_constant': 2,
        'time_leak_exact_constant': 5,
        'time_leak_distance_pen_limit_constant': 0.7,
        'time_leak_dissimilarity_decay': 15,
        'time_leak_max_penalty': 30,
        'distance_pen_constant': 30,
        'rel_movement_pos_constant': 0,  # Angle penalties are better!
        'rel_movement_angle_constant': 9,
        'abs_movement_pos_constant': 1.5,
        'cum_abs_movement_pos_constant': 1.0,
        'abs_movement_angle_constant': 0,  # Position penalties are better!
        'distance_uncertainty_exponent': 1.0,
        'abs_move_uncertainty_exponent': 1.0,
        'wifi_dir_constant': 0.5,
        'inject_waypoints': not cheat_valid_waypoints or mode == 'test',
        'off_grid_waypoint_penalty': 8,
        'off_grid_no_penalty_distance': 10,
        'addit_grid_density_penalty': 4,
        'min_distance_to_known':
            grid_settings[grid_mode]["min_distance_to_known"],
        'max_distance_to_known': 30.0,
        'generate_inner_waypoints': True,
        'generate_edge_waypoints': False,
        'wall_point_distance_multiplier':
            grid_settings[grid_mode]["wall_point_distance_multiplier"],
        'inner_point_distance_multiplier':
            grid_settings[grid_mode]["inner_point_distance_multiplier"],
        'considered_sensor_sig_keys_scale': [('z_ahrs', 0.1), ('z_magn', 1.0)],
        'top_distance_pos_sensor': 20,
        'magnetometer_penalty_constant': 0 * 1.0,
        'wall_penalty_constant': 0 * 5,
        # Expect a small boost of 0 to 0.05 when doing extensive_search
        'beam_1_width': (4000 if grid_mode == "very_dense" else 2000)
                        if extensive_search else 200,
        'beam_2_width_wifi': 20 if extensive_search else 10,
        'beam_2_width_abs_movement': 80 if extensive_search else 40,
    }

    unbias_distance_predictions = True
    drop_mislabeled_fn_list_valid = []
    test_override_floors = False
    use_multiprocessing = consider_multiprocessing and (
        extensive_search) and debug_fn is None and (
            grid_mode != "very_dense"
        )  # Can be used with large (> 1000) beam 1 widths
    ignore_private_test = consider_ignore_private_test and (debug_fn is None)

    valid_mode = mode == 'valid'
    if valid_mode:
        wifi_source = 'non_parametric_wifi - valid - full distances.pickle'
        sensor_distance_source = 'distance_valid.csv'
        sensor_relative_movement_source = 'relative_movement_v2_valid.csv'
        sensor_absolute_movement_source = 'relative_movement_v3_valid.csv'
        sensor_uncertainties_source = 'uncertainty - valid.csv'
        time_leak_source = 'valid_edge_positions_v3.csv'
    else:
        wifi_source = 'non_parametric_wifi - test - full distances.pickle'
        sensor_distance_source = 'distance_test.csv'
        sensor_relative_movement_source = 'relative_movement_v2_test.csv'
        sensor_absolute_movement_source = 'relative_movement_v3_test.csv'
        sensor_uncertainties_source = 'uncertainty - test.csv'
        time_leak_source = 'test_edge_positions_v3.csv'

    wifi_ref_source = wifi_source.replace(' - full distances',
                                          '').replace('pickle', 'csv')
    data_folder = utils.get_data_folder()
    waypoints_path = data_folder / 'train_waypoints_timed.csv'
    models_folder = Path(data_folder).parent / 'Models'
    wifi_preds_folder = models_folder / 'non_parametric_wifi' / 'predictions'
    storage_folder = Path(data_folder).parent / 'Combined predictions'
    pathlib.Path(storage_folder).mkdir(parents=True, exist_ok=True)
    submission_path = storage_folder / (mode + ' - ' + grid_type + '.csv')
    if submission_path.is_file():
        return

    wifi_preds_path = wifi_preds_folder / wifi_source
    source_preds_path = wifi_preds_folder / wifi_ref_source
    sensor_distance_folder = models_folder / 'sensor_distance' / 'predictions'
    sensor_distance_path = sensor_distance_folder / sensor_distance_source
    sensor_rel_movement_folder = models_folder / 'sensor_relative_movement' / (
        'predictions')
    sensor_abs_movement_folder = models_folder / 'sensor_absolute_movement' / (
        'predictions')
    sensor_rel_movement_path = sensor_rel_movement_folder / (
        sensor_relative_movement_source)
    sensor_abs_movement_path = sensor_abs_movement_folder / (
        sensor_absolute_movement_source)
    time_leak_source_path = data_folder / time_leak_source
    leaderboard_types_path = data_folder / 'leaderboard_type.csv'
    correct_sensor_preds_folder = models_folder / 'correct_sensor_preds' / (
        'predictions')
    sensor_uncertainties_path = correct_sensor_preds_folder / (
        sensor_uncertainties_source)
    sensor_segment_stats_source = data_folder / 'sensor_data' / 'meta.csv'
    walls_folder = data_folder / 'stashed_walls_intersection_count'
    waypoints_folder = data_folder / 'stashed_floor_additional_waypoints'
    pathlib.Path(waypoints_folder).mkdir(parents=True, exist_ok=True)

    # Load the raw data upon changing the data mode
    (loaded_mode, orig_source_preds, source_preds, sites, floors,
     unique_floor_waypoints, floor_waypoint_rel_pos_distances,
     floor_waypoint_wifi_distances, floor_waypoint_wifi_distances_order,
     leaderboard_types, time_leaks, wifi_preds_flat, original_preds,
     distance_preds, relative_movement_preds, absolute_movement_preds,
     sensor_preds_uncertainties, sensor_segment_stats, source_actual, fn_ids,
     w) = combine_predictions_beamsearch_utils.preprocess(
         config, mode, wifi_preds_path, source_preds_path, valid_mode,
         sensor_distance_path, sensor_rel_movement_path,
         sensor_abs_movement_path, time_leak_source_path, waypoints_path,
         leaderboard_types_path, cheat_valid_waypoints,
         sensor_uncertainties_path, sensor_segment_stats_source,
         waypoints_folder, additional_grid_multiprocessing,
         test_override_floors, grid_version)

    optimized_predictions, optimized_test_predictions = (
        combine_predictions_beamsearch_utils.combined_predictions_all_floors(
            mode, config, use_multiprocessing, distance_preds,
            relative_movement_preds, absolute_movement_preds,
            sensor_preds_uncertainties, source_preds, original_preds,
            source_actual, sensor_segment_stats, fn_ids, sites, floors,
            time_leaks, wifi_preds_flat, unique_floor_waypoints,
            floor_waypoint_rel_pos_distances, floor_waypoint_wifi_distances,
            floor_waypoint_wifi_distances_order, leaderboard_types,
            ignore_private_test, debug_fn, drop_mislabeled_fn_list_valid, w,
            walls_folder, unbias_distance_predictions))

    if valid_mode:
        optimized_predictions.sort_values(
            ["site", "floor", "fn", "waypoint_time"], inplace=True)
        err = optimized_predictions.after_optim_error.values
        optimized_error = err.mean()
        print(f"Optimized validation error: {optimized_error:.2f}")
        if debug_fn is None:
            best_opt_err = utils.get_best_opt_error(optimized_predictions)
            tr_mask = optimized_predictions.all_targets_on_waypoints.values
            tr_traj_opt_error = err[tr_mask].mean()
            tr_best_opt_error = best_opt_err[tr_mask].mean()
            non_tr_traj_opt_error = np.nan if cheat_valid_waypoints else (
                err[~tr_mask].mean())
            print(f"Group stats: {tr_traj_opt_error:.2f} "
                  f"({tr_best_opt_error:.2f}); {non_tr_traj_opt_error:.2f}")
    else:
        non_predicted_ids = np.where(np.abs(original_preds).sum(1) == 0)[0]
        optimized_test_predictions = pd.concat([
            optimized_test_predictions,
            orig_source_preds.iloc[non_predicted_ids]
        ])
        original_rows = np.array([
            np.where(optimized_test_predictions.site_path_timestamp.values ==
                     sps)[0][0]
            for sps in orig_source_preds.site_path_timestamp
        ])

        optimized_test_predictions = optimized_test_predictions.iloc[
            original_rows]
        optimized_test_predictions.index = np.arange(
            optimized_test_predictions.shape[0])

    if (store_valid_submission or mode == 'test') and debug_fn is None:
        if valid_mode:
            optimized_predictions.to_csv(submission_path, index=False)
        else:
            optimized_test_predictions.to_csv(submission_path, index=False)
            if store_extended_test:
                submission_path_extended = storage_folder / (
                    mode + ' - ' + grid_type + ' - extended.csv')
                optimized_predictions.to_csv(submission_path_extended,
                                             index=False)
Beispiel #28
0
 def test_get_data_folder_v2(self):
     folder_name = utils.get_data_folder(version=2)
     self.assertEqual(folder_name, 'data_v2/')
Beispiel #29
0
def run():
  print("Combining sensor models with device ids")
  data_folder = utils.get_data_folder()
  summary_path = data_folder / 'file_summary.csv'
  df = pd.read_csv(summary_path)
  model_folder = data_folder.parent / 'Models' / 'sensor_absolute_movement'
  absolute_fold_folder = model_folder / 'cv'
  distance_folder = model_folder.parent / 'sensor_distance'
  valid_path = model_folder / 'predictions' / 'relative_movement_v3_valid.csv'
  device_id_path = data_folder / 'device_ids.pickle'
  meta_sensor_path = data_folder / 'sensor_data' / 'meta.csv'
  
  with open(device_id_path, 'rb') as f:
    device_ids = pickle.load(f)
  meta_data = pd.read_csv(meta_sensor_path, dtype={'test_type': object})
  
  
  #################################################
  # A: Combine statistics at the trajectory level #
  #################################################
  device_ids_path = data_folder / 'inferred_device_ids.csv'
  if not device_ids_path.is_file():
    device_id_vals, device_drifts, device_id_merged_vals = zip(
      *list(device_ids.values()))
    device_ids_df = pd.DataFrame({
      'mode': [meta_data['mode'].values[np.where(meta_data.fn == fn)[0][
        0]] for fn in list(device_ids.keys())],
      'test_type': [meta_data['test_type'].values[np.where(meta_data.fn == fn)[
        0][0]] for fn in list(device_ids.keys())],
      'fn': list(device_ids.keys()),
      'device_id': list(device_id_vals),
      'device_id_drift': list(device_drifts),
      'device_id_merged': list(device_id_merged_vals),
      'site': [df.site_id.values[np.where(df.fn == fn)[0][0]] for fn in list(
        device_ids.keys())],
      'floor': [df.level.values[np.where(df.fn == fn)[0][0]] for fn in list(
        device_ids.keys())],
      'start_time': [meta_data.start_time.values[np.where(meta_data.fn == fn)[
        0][0]] for fn in list(device_ids.keys())],
      'end_time': [meta_data.end_time.values[np.where(meta_data.fn == fn)[0][
        -1]] for fn in list(device_ids.keys())],
      'first_last_wifi_time': [df.first_last_wifi_time.values[
        np.where(df.fn == fn)[0][0]] for fn in list(device_ids.keys())],
      })
    device_ids_df.sort_values(['first_last_wifi_time', 'start_time'],
                              inplace=True)
    device_ids_df.to_csv(device_ids_path, index=False)
  
  
  ##############################################
  # B: Combine statistics at the segment level #
  ##############################################
  save_path_device_errors = data_folder / 'sensor_model_device_errors.csv'
  if not save_path_device_errors.is_file():
    train_preds_list = []
    for i in range(5):
      preds = pd.read_csv(absolute_fold_folder / f'preds_bag_fold_{i}.csv')
      preds['train_fold'] = i
      train_preds_list.append(preds)
      
    train_preds = pd.concat(train_preds_list)
      
    train_preds['mode'] = 'train'
    valid_preds = pd.read_csv(valid_path)
    valid_preds['train_fold'] = np.nan
    valid_preds['mode'] = 'valid'
    test_preds = meta_data[meta_data['mode'] == 'test']
    with pd.option_context('mode.chained_assignment', None):
      test_preds['x'] = np.nan
      test_preds['y'] = np.nan
      test_preds['x_pred'] = np.nan
      test_preds['y_pred'] = np.nan
      test_preds['train_fold'] = np.nan
      test_preds.rename(columns={"level": "floor"}, inplace=True)
      test_preds = test_preds.loc[:, valid_preds.columns]
    
    all_preds = pd.concat([train_preds, valid_preds, test_preds])
    
    all_preds['device_id'] = -3
    all_preds['device_id_drift'] = False
    all_preds['device_id_merged'] = -3
    all_preds['error'] = -2
    all_preds['start_time'] = -1
    all_preds['end_time'] = -1
    all_preds.index = np.arange(all_preds.shape[0])
    for i in tqdm(range(all_preds.shape[0])):
      fn = all_preds.fn.values[i]
      mode = all_preds['mode'].values[i]
      if mode == 'test':
        error = np.nan
      else:
        error = np.sqrt((all_preds.x.values[i]-all_preds.x_pred.values[i])**2 + (
          all_preds.y.values[i]-all_preds.y_pred.values[i])**2)
      sub_trajectory_id = all_preds.sub_trajectory_id.values[i]
      meta_row = np.where((meta_data.fn.values == fn) & (
        meta_data.sub_trajectory_id.values == sub_trajectory_id))[0][0]
      start_time = meta_data.start_time.values[meta_row]
      end_time = meta_data.end_time.values[meta_row]
      df_row = np.where(df.fn == fn)[0][0]
      first_last_wifi_time = df.first_last_wifi_time.values[df_row]
      if np.isnan(first_last_wifi_time):
        assert mode == 'train'
        first_last_wifi_time = df.start_time.values[df_row]
      
      all_preds.loc[i, 'device_id'] = device_ids[fn][0]
      all_preds.loc[i, 'device_id_drift'] = device_ids[fn][1]
      all_preds.loc[i, 'device_id_merged'] = device_ids[fn][2]
      all_preds.loc[i, 'error'] = error
      all_preds.loc[i, 'start_time'] = start_time
      all_preds.loc[i, 'end_time'] = end_time
      all_preds.loc[i, 'first_last_wifi_time'] = first_last_wifi_time
      
    all_preds.sort_values([
      'device_id', 'first_last_wifi_time', 'sub_trajectory_id'], inplace=True)
    all_preds.to_csv(save_path_device_errors, index=False)
  
  save_path_fn_errors = data_folder / "fn_device_errors.csv"
  if not save_path_fn_errors.is_file():
    device_errors = pd.read_csv(save_path_device_errors)
    device_errors.sort_values(
      ['site', 'fn', 'sub_trajectory_id'], inplace=True)
    device_errors.index = np.arange(device_errors.shape[0])
    device_errors['new_device_id'] = [
      True] + (device_errors.device_id.values[:-1] != (
        device_errors.device_id.values[1:])).tolist()
    device_errors['dist'] = np.sqrt(
      device_errors.x.values**2 + device_errors.y.values**2)
    distance_cv_folder = distance_folder / 'cv'
    folds = []
    for i in range(5):
      f = pd.read_csv(distance_cv_folder / (
        "preds_bag_fold_" + str(i) + ".csv"))
      f['fold'] = i
      folds.append(f)
    combined_train_folds = pd.concat(folds)
    
    valid_preds = pd.read_csv(distance_folder / 'predictions' / (
      "distance_valid.csv"))
    valid_preds.drop(valid_preds.columns[0], axis=1, inplace=True)
    valid_preds['fold'] = np.nan
    
    all_preds = pd.concat([combined_train_folds, valid_preds])
    all_preds.sort_values(
      ['site', 'fn', 'sub_trajectory_id'], inplace=True)
    all_preds.index = np.arange(all_preds.shape[0])
    device_errors['dist_pred'] = np.nan
    device_errors.loc[np.where(~np.isnan(device_errors.dist.values))[0],
                      'dist_pred'] = all_preds.pred.values
    device_errors['dist_error'] = device_errors.dist.values-(
      device_errors.dist_pred.values)
    device_errors['rel_dist_error'] = device_errors.dist_error.values/(
      device_errors.dist_pred.values)
    device_errors.sort_values(['fn', 'sub_trajectory_id'], inplace=True)
    device_errors.index = np.arange(device_errors.shape[0])
    device_errors['rel_weight'] = np.concatenate(
      device_errors.groupby('fn').apply(
        lambda x: np.abs(x.dist.values)/np.abs(x.dist.values).sum()))
    device_errors['section'] = "Middle"
    device_errors.loc[np.where(device_errors.sub_trajectory_id.values == (
      device_errors.num_waypoints.values-2))[0], 'section'] = "Last"
    device_errors.loc[np.where(
      device_errors.sub_trajectory_id.values == 0)[0], 'section'] = "First"
    device_errors['middle_weight_sums'] = np.concatenate(
      device_errors.groupby('fn').apply(
        lambda x: np.repeat((
          x.rel_weight.values[x.section.values=="Middle"]).sum(),
          x.shape[0]).reshape(-1)))
    device_errors.sort_values(
      ['device_id', 'first_last_wifi_time', 'sub_trajectory_id'], inplace=True)
    device_errors.index = np.arange(device_errors.shape[0])
    device_errors['rel_middle_weight'] = 0
    middle_rows = np.where(device_errors.section.values == "Middle")[0]
    device_errors.loc[middle_rows, "rel_middle_weight"] = (
      device_errors.rel_weight.values[middle_rows]/(
        device_errors.middle_weight_sums.values[middle_rows]))
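    # Signed angle between predicted and actual relative movement, wrapped
    # into the [-pi, pi] range by the two correction passes below.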
    device_errors['angle_error'] = np.arctan2(
      device_errors.y_pred.values, device_errors.x_pred.values) - np.arctan2(
        device_errors.y.values, device_errors.x.values)
    change_rows = np.where((~np.isnan(device_errors.angle_error.values)) & (
      device_errors.angle_error.values < np.pi))[0]
    device_errors.loc[change_rows, 'angle_error'] = (
      device_errors.angle_error.values[change_rows] + 2*np.pi)
    change_rows = np.where((~np.isnan(device_errors.angle_error.values)) & (
      device_errors.angle_error.values > np.pi))[0]
    device_errors.loc[change_rows, 'angle_error'] = (
      device_errors.angle_error.values[change_rows] - 2*np.pi)
    
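    # Aggregate the segment-level errors into one summary row per trajectory
    # (fn), weighting segments by their share of the total travelled distance.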
    def f(x):
      d = {}
      d['site'] = x['site'].values[0]
      d['floor'] = x['floor'].values[0]
      d['mode'] = x['mode'].values[0]
      d['train_fold'] = x['train_fold'].values[0]
      d['num_waypoints'] = x['num_waypoints'].values[0]
      d['total_dist'] = x['dist'].values.sum()
      
      d['mean_rel_dist_error'] = (x['rel_dist_error'].values*(
        x['rel_weight'].values)).sum()
      d['mean_abs_rel_dist_error'] = (np.abs(x['rel_dist_error'].values)*(
        x['rel_weight'].values)).sum()
      d['mean_angle_error'] = (x['angle_error'].values*(
        x['rel_weight'].values)).sum()
      d['mean_abs_angle_error'] = (np.abs(x['angle_error'].values)*(
        x['rel_weight'].values)).sum()
      d['first_rel_dist_error'] = x['rel_dist_error'].values[0]
      d['first_abs_rel_dist_error'] = np.abs(x['rel_dist_error'].values)[0]
      d['first_angle_error'] = x['angle_error'].values[0]
      d['first_abs_angle_error'] = np.abs(x['angle_error'].values)[0]
      d['middle_mean_rel_dist_error'] = (x['rel_dist_error'].values*(
        x['rel_weight'].values))[1:-1].sum()
      d['middle_mean_abs_rel_dist_error'] = (np.abs(x[
        'rel_dist_error'].values)*(x['rel_weight'].values))[1:-1].sum()
      d['middle_mean_angle_error'] = (x['angle_error'].values*(
        x['rel_weight'].values))[1:-1].sum()
      d['middle_mean_abs_angle_error'] = (np.abs(x['angle_error'].values)*(
        x['rel_weight'].values))[1:-1].sum()      
      d['last_rel_dist_error'] = x['rel_dist_error'].values[-1]
      d['last_abs_rel_dist_error'] = np.abs(x['rel_dist_error'].values)[-1]
      d['last_angle_error'] = x['angle_error'].values[-1]
      d['last_abs_angle_error'] = np.abs(x['angle_error'].values)[-1]
      
      d['first_first_last_wifi_time'] = (
        x['first_last_wifi_time'].values).min()
      d['time'] = (x['start_time'].values).min()
      d['device_id'] = (x['device_id'].values).min()
      
      return pd.Series(d, index=list(d.keys()))

    fn_dev_errors = device_errors.groupby('fn').apply(f).reset_index()
    
    fn_dev_errors['plot_time'] = fn_dev_errors['time']
    fn_dev_errors.loc[np.where(
      fn_dev_errors['mode'].values == "test")[0], 'plot_time'] = (
      fn_dev_errors.first_first_last_wifi_time.values[
        np.where(fn_dev_errors['mode'] == "test")[0]])
    fn_dev_errors['row'] = 1+np.arange(fn_dev_errors.shape[0])
    fn_dev_errors.sort_values(['device_id', 'plot_time'], inplace=True)
    fn_dev_errors.to_csv(save_path_fn_errors, index=False)
Beispiel #30
0
def run():
  print("Inferring device ids")
  only_process_test_sites = True
  signature_dist_threshold = 0.5
  dist_scaler = np.array([
    2.4831865e-03, 1.8569984e-03, 1.5326408e-03,
    3.5197838e+01, 4.1837849e+01, 3.4933647e+01
    ], dtype=np.float32)
  
  sig_cols = [
    'x2_gyro_uncali', 'y2_gyro_uncali', 'z2_gyro_uncali',
    'x2_magn_uncali', 'y2_magn_uncali', 'z2_magn_uncali',
    ]
  data_folder = utils.get_data_folder()
  sensor_folder = data_folder / 'sensor_data'
  summary_path = data_folder / 'file_summary.csv'
  device_id_path = data_folder / 'device_ids.pickle'
  if device_id_path.is_file():
    return
  save_ext = '' if only_process_test_sites else '_all_sites'
  meta_sensor_path = sensor_folder / ('meta' + save_ext + '_no_device.csv')
  df = pd.read_csv(summary_path)
  meta_sensor = pd.read_csv(meta_sensor_path, dtype={'test_type': object})
  if only_process_test_sites:
    df = df[df.test_site]
  
  df.index = np.arange(df.shape[0])
  with pd.option_context('mode.chained_assignment', None):
    df['first_last_wifi_replaced_time'] = df['first_last_wifi_time']
    no_wifi_rows = np.where(df.num_wifi == 0)[0]
    assert np.all(df['mode'].values[no_wifi_rows] == 'train')
    df.loc[no_wifi_rows, 'first_last_wifi_replaced_time'] = (
      df.start_time.values[no_wifi_rows])
  
  df.sort_values(by=['first_last_wifi_replaced_time'], axis=0, inplace=True)
  df.index = np.arange(df.shape[0])
  
  all_sensor = {}
  for m in ['valid', 'test', 'train']:
    print(m)
    with open(sensor_folder / (m + save_ext + '.pickle'), 'rb') as f:
      sensor_data = pickle.load(f)
    all_sensor.update(sensor_data)
  
  fns = df.fn.values
  modes = df['mode'].values
  num_fn = df.shape[0]
  unique_sites = np.sort(np.unique(df.site_id.values))
  
  device_ids = {}
  device_ids_ordered = []
  active_device_signatures = []
  act_sig_recent_ids = []
  next_signature_id = 0
  dev_stats = []
  for i in range(num_fn):
    if (i+1) % 1000 == 0:
      print(i+1)
    fn = fns[i]
    mode = modes[i]
    
    # if fn in ['5daec763aa1d300006faafcd', '5daece4eaa1d300006fab032']:
    #   import pdb; pdb.set_trace()
    #   x=1
    
    first_uncal_vals = all_sensor[fn]['waypoint_segments'][0][sig_cols].values
    last_uncal_vals = all_sensor[fn]['waypoint_segments'][-1][sig_cols].values
    signature_absent = np.isnan(first_uncal_vals[0, 0]) or np.isnan(
      last_uncal_vals[0, 0])
    this_first_signature = first_uncal_vals[0]
    this_last_signature = last_uncal_vals[-1]
    signature_change_this_step = not np.all(
      np.isclose(this_first_signature, this_last_signature))
    plot_time = df.first_last_wifi_time.values[i] if mode == 'test' else (
      df.start_time.values[i])
    meta_rows = np.where(meta_sensor.fn.values == fn)[0]
    mean_robust_sensor_time_diff = np.median(
      meta_sensor.mean_robust_sensor_time_diff.values[meta_rows])
    site_id = np.where(df.site_id.values[i] == unique_sites)[0][0]
    
    if signature_absent:
      device_ids[fn] = (-1, None)
      device_ids_ordered.append((fn, -1, None))
      dev_stats.append({
        'fn': fn, 'device_id': -1, 'site_id': site_id, 'plot_time': plot_time,
        'mean_robust_sensor_time_diff': mean_robust_sensor_time_diff,
        })
      continue
    
    # Compute when the next trajectory can use the same device
    if mode == 'test':
      corrected_start_time = df.first_last_wifi_time.values[i] + 5000
      this_min_next_available_time = df.duration.values[i] + (
        df.first_last_wifi_time.values[i]) - 5000
    else:
      this_min_next_available_time = df.end_time.values[i]
      corrected_start_time = df.start_time.values[i]
    
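    # Try to match this trajectory's first uncalibrated gyro/magnetometer
    # signature against the most recent signature of known devices; the
    # uncalibrated bias values effectively act as a device fingerprint.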
    found_signature = False
    for j in act_sig_recent_ids:
      (signature, signature_id, min_available_time, prev_sig_mode, prev_fn,
       prev_row, prev_drift) = active_device_signatures[j]
      signature_dist = (np.abs(this_first_signature - signature)/(
        dist_scaler)).sum()
      num_shared_nz = (np.isclose(signature, this_first_signature) & (
        signature != 0)).sum()
      same_signature = signature_dist <= signature_dist_threshold or (
        num_shared_nz > 1)
      
      if same_signature:
        if corrected_start_time < min_available_time:
          print(i, corrected_start_time, min_available_time, signature_dist,
                num_shared_nz)
          print("This should not happen - signature time inconsistency")
        
        device_ids[fn] = (signature_id, signature_change_this_step)
        device_ids_ordered.append((fn, signature_id, signature_change_this_step))
        dev_stats.append({
          'fn': fn, 'device_id': signature_id, 'site_id': site_id,
          'plot_time': plot_time,
          'mean_robust_sensor_time_diff': mean_robust_sensor_time_diff,
        })
        found_signature = True
        active_device_signatures[j] = (
          this_last_signature, signature_id, this_min_next_available_time, mode,
          fn, i, signature_change_this_step)
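        # Move the matched device to the front so that recently used devices
        # are checked first on the next iteration.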
        act_sig_recent_ids.remove(j)
        act_sig_recent_ids = [j] + act_sig_recent_ids
        break
      
    if not found_signature:
      signature_id = next_signature_id
      device_ids[fn] = (signature_id, signature_change_this_step)
      device_ids_ordered.append((fn, signature_id, signature_change_this_step))
      dev_stats.append({
          'fn': fn, 'device_id': signature_id, 'site_id': site_id,
          'plot_time': plot_time,
          'mean_robust_sensor_time_diff': mean_robust_sensor_time_diff,
        })
      active_device_signatures.append((
        this_last_signature, signature_id, this_min_next_available_time, mode,
        fn, i, signature_change_this_step))
      act_sig_recent_ids = [next_signature_id] + act_sig_recent_ids
      next_signature_id += 1
  
  combined_signatures = pd.DataFrame(
    np.stack([s[0] for s in active_device_signatures]))
  
  # Stitch device ids back together using the observation time, the mean time
  # between sensor observations and the site id.
  # Also split the -1 (missing signature) device ids based on the mean time
  # between sensor observations.
  dev_stats_df = pd.DataFrame(dev_stats)
  dev_stats_df.loc[(dev_stats_df.device_id.values == -1) & (
    dev_stats_df.mean_robust_sensor_time_diff < 20), 'device_id'] = -2
  predecessors = {
    -2: [],
    -1: [],
    }
  
  stats_device_ids = dev_stats_df.device_id.values
  site_ids = dev_stats_df.site_id.values
  plot_times = dev_stats_df.plot_time.values
  rtds = dev_stats_df.mean_robust_sensor_time_diff.values
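  # Chain a device id to an earlier chain when its first observation follows
  # the chain's last observation within 24 hours (86400000 ms), at the same
  # site and with a similar mean robust sensor time difference.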
  for i in range(dev_stats_df.device_id.values.max()+1):
    first_row = np.where(stats_device_ids == i)[0][0]
    first_rtd = rtds[first_row]
    this_site_id = site_ids[first_row]
    
    pred_candidates = []
    for c in predecessors:
      last_chain_device = c if not len(predecessors[c]) else (
        predecessors[c][-1])
      pred_last_row = np.where(stats_device_ids == last_chain_device)[0][-1]
      pred_last_rtd = rtds[pred_last_row]
      pred_site_id = site_ids[pred_last_row]
      time_gap = plot_times[first_row] - plot_times[pred_last_row]
      print(i, last_chain_device, time_gap)
      
      if time_gap > 0 and time_gap <= 86400000 and (
          this_site_id == pred_site_id) and np.abs(
            first_rtd - pred_last_rtd) < 0.02:
        pred_candidates.append(c)
      
    if len(pred_candidates):
      assert len(pred_candidates) == 1
      predecessors[pred_candidates[0]].append(i)
    else:
      predecessors[i] = []
  
  merged_device_ids = {}
  for k_id, k in enumerate(list(predecessors.keys())):
    merged_device_ids[k] = k_id
    for v in predecessors[k]:
      merged_device_ids[v] = k_id
  
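  # Final mapping: fn -> (raw device id, signature drift flag, merged
  # device id).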
  combined_device_ids = {}
  for fn in device_ids:
    dev_id, drift = device_ids[fn]
    rtd_fn = rtds[np.where(dev_stats_df.fn.values == fn)[0][0]]
    if dev_id == -1 and rtd_fn < 20:
      dev_id = -2
    combined_device_ids[fn] = (dev_id, drift, merged_device_ids[dev_id])
  
  with open(device_id_path, 'wb') as handle:
    pickle.dump(combined_device_ids, handle, protocol=pickle.HIGHEST_PROTOCOL)
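

# Minimal inspection sketch (an assumption, not part of the original
# pipeline): reload the generated device id mapping and print one entry.
if __name__ == '__main__':
  with open(utils.get_data_folder() / 'device_ids.pickle', 'rb') as f:
    loaded_device_ids = pickle.load(f)
  # Each value is a (raw device id, signature drift flag, merged device id)
  # tuple.
  example_fn = next(iter(loaded_device_ids))
  print(example_fn, loaded_device_ids[example_fn])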