# NOTE(review): this section had been collapsed onto a single physical line, so
# everything after the first '#' was parsed as one long comment and none of the
# statements ran. The line structure is restored below.

# impute_columns = ['acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z', 'gyr_mobile_x','gyr_mobile_y','gyr_mobile_z','mag_mobile_x','mag_mobile_y','mag_mobile_z','prox_mobile_distance',
#                   'loc_mobile_latitude','loc_mobile_longitude','loc_mobile_height','loc_mobile_velocity','loc_mobile_direction','loc_mobile_horizontalAccuracy','loc_mobile_verticalAccuracy']
#
# Let us impute the missing values and plot an example.
# Each imputation works on its own deep copy so `dataset` itself is left untouched here.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'acc_mobile_x')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'acc_mobile_x')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset), 'acc_mobile_x')
DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'],
                            'acc_mobile_x',
                            imputed_mean_dataset['acc_mobile_x'],
                            imputed_interpolation_dataset['acc_mobile_x'])

# Now, let us carry out that operation over all columns except for the label.
for col in [c for c in dataset.columns if 'label' not in c]:
    dataset = MisVal.impute_interpolate(dataset, col)

# Using the result from Chapter 2, let us try the Kalman filter on the
# acc_mobile_x attribute and study the result.
original_dataset = pd.read_csv(DATA_PATH / ORIG_DATASET_FNAME, index_col=0)
original_dataset.index = pd.to_datetime(original_dataset.index)
KalFilter = KalmanFilters()
kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_mobile_x')
dataset.index[0]).microseconds / 1000 # Step 2: Let us impute the missing values. MisVal = ImputationMissingValues() imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate') imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate') imputed_interpolation_dataset = MisVal.impute_interpolate( copy.deepcopy(dataset), 'hr_watch_rate') # DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate']) print imputed_interpolation_dataset['hr_watch_rate'] DataViz.plot_imputed_values(dataset, ['original', 'interpolation'], 'hr_watch_rate', imputed_interpolation_dataset['hr_watch_rate']) # And we impute for all columns except for the label in the selected way (interpolation) for col in [c for c in dataset.columns if not 'label' in c]: dataset = MisVal.impute_interpolate(dataset, col) # Let us try the Kalman filter on the light_phone_lux attribute and study the result. original_dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0) original_dataset.index = original_dataset.index.to_datetime() KalFilter = KalmanFilters() kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_phone_x') DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'],
c for c in dataset.columns if not 'label' and not 'light_phone_lux' in c ]: dataset = MisVal.impute_interpolate(dataset, col) dataset = MisVal.impute_median(dataset, 'light_phone_lux') # Let us try the Kalman filter on the light_phone_lux attribute and study the result. original_dataset = pd.read_csv(dataset_path + 'chapter2_result-own.csv', index_col=0) original_dataset.index = original_dataset.index.to_datetime() KalFilter = KalmanFilters() kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_phone_x') #kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'light_phone_lux') DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'], 'acc_phone_x', kalman_dataset['acc_phone_x_kalman']) #DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'], 'light_phone_lux', kalman_dataset['light_phone_lux_kalman']) DataViz.plot_dataset(kalman_dataset, ['acc_phone_x', 'acc_phone_x_kalman'], ['exact', 'exact'], ['line', 'line']) #DataViz.plot_dataset(kalman_dataset, ['light_phone_lux', 'light_phone_lux_kalman'], ['exact','exact'], ['line', 'line']) # We ignore the Kalman filter output for now... # Let us apply a lowpass filter and reduce the importance of the data above 1.5 Hz LowPass = LowPassFilter() # Determine the sampling frequency. fs = float(1000) / milliseconds_per_instance cutoff = 1.5
# NOTE(review): this section had been collapsed onto a single physical line, so
# everything after the first statement was swallowed by the trailing comment.
# The line structure is restored below.

# pd.to_datetime(...) replaces Index.to_datetime(), which current pandas no longer provides.
dataset_cs.index = pd.to_datetime(dataset_cs.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# total_seconds() is used instead of .microseconds because .microseconds only holds the
# sub-second component of the difference (it is 0 for a step of exactly one second).
milliseconds_per_instance = (dataset_own.index[1] -
                             dataset_own.index[0]).total_seconds() * 1000

# Step 2: Let us impute the missing values for CS (plot only).
# Each imputation runs on a deep copy, so dataset_cs itself is not modified.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset_cs), 'hr_watch_rate')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset_cs), 'hr_watch_rate')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset_cs), 'hr_watch_rate')
DataViz.plot_imputed_values(dataset_cs, ['original', 'mean', 'interpolation'],
                            'hr_watch_rate',
                            imputed_mean_dataset['hr_watch_rate'],
                            imputed_interpolation_dataset['hr_watch_rate'])

# Make plot for own dataset
# The imputed_* names are rebound here; the CS results above are no longer reachable.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset_own), 'press_phone_pressure')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset_own), 'press_phone_pressure')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset_own), 'press_phone_pressure')
DataViz.plot_imputed_values(
    dataset_own, ['original', 'mean', 'interpolation'],
    'press_phone_pressure',
    imputed_mean_dataset['press_phone_pressure'],
    imputed_interpolation_dataset['press_phone_pressure'])
def _load_indexed_csv(path):
    """Read a CSV using its first column as index and parse that index as datetimes.

    Prints a hint and re-raises when the file cannot be read (``IOError``).
    """
    try:
        table = pd.read_csv(path, index_col=0)
    except IOError:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        # Bare raise keeps the original traceback intact.
        raise
    table.index = pd.to_datetime(table.index)
    return table


def _predictor_columns(dataset):
    """Return every column that is neither a label column nor the heart rate."""
    return [
        c for c in dataset.columns
        if 'label' not in c and c != 'hr_watch_rate'
    ]


def _plot_overview(DataViz, dataset):
    """Plot all sensor groups, the principal components and the labels."""
    columns = [
        'acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
        'press_phone_', 'pca_', 'label'
    ]
    # One match/display entry per column pattern. The previous call passed nine
    # entries for these eight patterns; the surplus trailing entry is dropped.
    # NOTE(review): assumes plot_dataset pairs the lists by position - confirm.
    match = ['like'] * len(columns)
    display = ['line'] * (len(columns) - 1) + ['points']
    DataViz.plot_dataset(dataset, columns, match, display)


def main():
    """Run the processing step selected by ``FLAGS.mode`` on the stored dataset.

    The dataset is read from ``DATA_PATH / DATASET_FNAME``. Supported modes:

    * ``'imputation'`` - plot mean, median and interpolation imputation of
      ``hr_watch_rate``.
    * ``'kalman'`` - apply the Kalman filter to ``acc_phone_x`` of the dataset
      stored at ``DATA_PATH / ORIG_DATASET_FNAME`` and plot the result.
    * ``'lowpass'`` - low-pass filter ``acc_phone_x`` and plot a slice of it.
    * ``'PCA'`` - plot the explained variance and apply PCA with 7 components.
    * ``'final'`` - interpolate, low-pass filter, apply PCA and write the
      result to ``DATA_PATH / RESULT_FNAME``.

    Any other mode only loads the data and constructs the helper objects.

    Raises:
        IOError: if an input CSV cannot be read.
    """
    # Import the data from the specified location and parse the date index
    dataset = _load_indexed_csv(DATA_PATH / DATASET_FNAME)

    # Create an instance of our visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance based on the first two rows.
    # total_seconds() covers the whole difference; .microseconds would only give the
    # sub-second component and yields 0 for a step of exactly one second, which in
    # turn makes the sampling-frequency division below fail.
    milliseconds_per_instance = (dataset.index[1] -
                                 dataset.index[0]).total_seconds() * 1000

    # Create objects for value imputation, low pass filter and PCA
    MisVal = ImputationMissingValues()
    LowPass = LowPassFilter()
    PCA = PrincipalComponentAnalysis()

    if FLAGS.mode == 'imputation':
        # Impute the missing values and plot an example.
        # Each method gets its own deep copy so the results stay independent.
        imputed_mean_dataset = MisVal.impute_mean(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_median_dataset = MisVal.impute_median(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_interpolation_dataset = MisVal.impute_interpolate(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        DataViz.plot_imputed_values(
            dataset, ['original', 'mean', 'median', 'interpolation'],
            'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'],
            imputed_median_dataset['hr_watch_rate'],
            imputed_interpolation_dataset['hr_watch_rate'])

    elif FLAGS.mode == 'kalman':
        # Using the result from Chapter 2, try the Kalman filter on the
        # acc_phone_x attribute and study the result
        original_dataset = _load_indexed_csv(DATA_PATH / ORIG_DATASET_FNAME)
        KalFilter = KalmanFilters()
        kalman_dataset = KalFilter.apply_kalman_filter(
            data_table=original_dataset, col='acc_phone_x')
        DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'],
                                    'acc_phone_x',
                                    kalman_dataset['acc_phone_x_kalman'])
        DataViz.plot_dataset(data_table=kalman_dataset,
                             columns=['acc_phone_x', 'acc_phone_x_kalman'],
                             match=['exact', 'exact'],
                             display=['line', 'line'])

    elif FLAGS.mode == 'lowpass':
        # Apply a lowpass filter and reduce the importance of the data above
        # the cutoff frequency given by FLAGS.cutoff

        # Determine the sampling frequency (instances per second)
        fs = 1000.0 / milliseconds_per_instance

        # Study acc_phone_x
        new_dataset = LowPass.low_pass_filter(
            data_table=copy.deepcopy(dataset),
            col='acc_phone_x',
            sampling_frequency=fs,
            cutoff_frequency=FLAGS.cutoff,
            order=10)
        # Only the rows between 40% and 43% of the dataset are plotted.
        n_rows = len(new_dataset.index)
        DataViz.plot_dataset(
            new_dataset.iloc[int(0.4 * n_rows):int(0.43 * n_rows), :],
            ['acc_phone_x', 'acc_phone_x_lowpass'], ['exact', 'exact'],
            ['line', 'line'])

    elif FLAGS.mode == 'PCA':
        # First impute again, as PCA can not deal with missing values
        for col in [c for c in dataset.columns if 'label' not in c]:
            dataset = MisVal.impute_interpolate(dataset, col)

        # Determine the PC's for all but the target columns (the labels and the heart rate)
        selected_predictor_cols = _predictor_columns(dataset)
        pc_values = PCA.determine_pc_explained_variance(
            data_table=dataset, cols=selected_predictor_cols)
        cumulated_variance = np.cumsum(pc_values)

        # Plot the explained variance and cumulated variance
        comp_numbers = np.arange(1, len(pc_values) + 1)
        DataViz.plot_xy(x=[comp_numbers, comp_numbers],
                        y=[pc_values, cumulated_variance],
                        xlabel='principal component number',
                        ylabel='explained variance',
                        ylim=[0, 1],
                        line_styles=['b-', 'r-'],
                        names=['Variance', 'Cumulated variance'])

        # Select 7 as the best number of PC's as this explains most of the variance
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the result of the PC's and the overall final dataset
        DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'],
                             ['line', 'points'])
        _plot_overview(DataViz, dataset)

    elif FLAGS.mode == 'final':
        # Carry out that operation over all columns except for the label
        print('Imputing missing values.')
        for col in tqdm([c for c in dataset.columns if 'label' not in c]):
            dataset = MisVal.impute_interpolate(dataset=dataset, col=col)

        # Include all measurements that have a form of periodicity and filter them
        periodic_measurements = [
            'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
            'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
            'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
            'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
            'mag_watch_y', 'mag_watch_z'
        ]

        print('Applying low pass filter on peridic measurements.')
        # Determine the sampling frequency.
        fs = 1000.0 / milliseconds_per_instance
        for col in tqdm(periodic_measurements):
            dataset = LowPass.low_pass_filter(data_table=dataset,
                                              col=col,
                                              sampling_frequency=fs,
                                              cutoff_frequency=FLAGS.cutoff,
                                              order=10)
            # Overwrite the raw signal with its filtered version and drop the helper column.
            dataset[col] = dataset[col + '_lowpass']
            del dataset[col + '_lowpass']

        # Use the optimal found parameter n_pcs = 7 to apply PCA to the final dataset
        selected_predictor_cols = _predictor_columns(dataset)
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the final overall dataset
        _plot_overview(DataViz, dataset)

        # Store the final outcome
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
"Gyroscope x (rad/s)": 'gyr_x', "Gyroscope y (rad/s)": 'gyr_y', "Gyroscope z (rad/s)": 'gyr_z', } DataViz = VisualizeDataset(__file__, show=False) KalFilter = KalmanFilters() dataset = pd.read_csv(preprocessed_phone_data) dataset.index = pd.to_datetime(dataset[time_col]) for col in attributes_to_impute: print('Applying kalman filter for ', col) dataset = KalFilter.apply_kalman_filter(dataset, col) DataViz.save_path = save_names[col] + '_phone_imputed_values' DataViz.plot_imputed_values(dataset, ['original', 'kalman'], col, dataset[col]) DataViz.save_path = save_names[col] + '_phone_all_data' DataViz.plot_dataset(dataset, [col, col + '_kalman'], ['exact','exact'], ['line', 'line']) print(dataset.columns) dataset.to_csv(outlier_phone_data) dataset = pd.read_csv(preprocessed_watch_data) dataset.index = pd.to_datetime(dataset[time_col]) for col in attributes_to_impute: print('Applying kalman filter for ', col) dataset = KalFilter.apply_kalman_filter(dataset, col) DataViz.save_path = save_names[col] + '_watch_imputed_values' DataViz.plot_imputed_values(dataset, ['original', 'kalman'], col, dataset[col]) DataViz.save_path = save_names[col] + '_watch_all_data'
# NOTE(review): this section had been collapsed onto a single physical line, so
# the live statements below were swallowed by the leading comment and never ran.
# The line structure is restored below.

# MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])

X_incomplete = dataset
# print(list(X_incomplete))

# # X is the complete data matrix
# # X_incomplete has the same values as X except a subset have been replaced with NaN
#
# # Use the 6 nearest rows which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=6).complete(X_incomplete)
# X_filled_knn = knnimpute.(X_incomplete)

# Look up where hr_watch_rate sits instead of assuming it is the first column.
# NOTE(review): presumes `dataset` is a DataFrame and that the completed matrix
# keeps the input's column order - confirm against the KNN implementation in use.
hr_rate_position = dataset.columns.get_loc('hr_watch_rate')
DataViz.plot_imputed_values(dataset, ['original', 'imputed'], 'hr_watch_rate',
                            X_filled_knn[:, hr_rate_position])

# # matrix completion using convex optimization to find low-rank solution
# # that still matches observed values. Slow!
# X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)
#
# # Instead of solving the nuclear norm objective directly, instead
# # induce sparsity using singular value thresholding
# X_filled_softimpute = SoftImpute().complete(X_incomplete_normalized)
#
# # print mean squared error for the three imputation methods above
# nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
# print("Nuclear norm minimization MSE: %f" % nnm_mse)
#
# softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean()
# print("SoftImpute MSE: %f" % softImpute_mse)