# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
dataset = pd.read_csv(dataset_path + 'chapter3_final_outliers.csv', index_col=0)
# FIX: Index.to_datetime() has been removed from pandas; pd.to_datetime() is the
# supported way to parse an index and works on both old and current versions.
dataset.index = pd.to_datetime(dataset.index)

# print(dataset.isnull().sum())
# print(dataset.shape)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# NOTE(review): timedelta.microseconds only holds the sub-second component — this
# assumes consecutive instances are less than one second apart; confirm for this data.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values.
MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_Heart Rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_Heart Rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_Heart Rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_Heart Rate', imputed_mean_dataset['hr_Heart Rate'], imputed_interpolation_dataset['hr_Heart Rate'])

# And we impute for all columns except for the label in the selected way (interpolation).
for col in [c for c in dataset.columns if 'label' not in c]:
    dataset = MisVal.impute_interpolate(dataset, col)

# Let us try the Kalman filter on the light_phone_lux attribute and study the result.
# original_dataset = pd.read_csv(dataset_path + 'chapter3_final_outliers.csv', index_col=0)
# original_dataset.index = original_dataset.index.to_datetime()
# KalFilter = KalmanFilters()
# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
dataset = pd.read_csv(dataset_path + 'chapter3_result_outliers.csv', index_col=0)
# FIX: Index.to_datetime() has been removed from pandas; use pd.to_datetime().
dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# NOTE(review): .microseconds only covers the sub-second part of the delta — assumes
# consecutive rows are less than one second apart.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])

# FIX: `print x` is Python 2 statement syntax and a SyntaxError on Python 3;
# use the function form.
print(imputed_interpolation_dataset['hr_watch_rate'])
DataViz.plot_imputed_values(dataset, ['original', 'interpolation'],
                            'hr_watch_rate',
                            imputed_interpolation_dataset['hr_watch_rate'])

# And we impute for all columns except for the label in the selected way (interpolation)
raise e # We'll create an instance of our visualization class to plot the results. DataViz = VisualizeDataset(__file__) # Compute the number of milliseconds covered by an instance based on the first two rows milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000 # impute_columns = ['acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z', 'gyr_mobile_x','gyr_mobile_y','gyr_mobile_z','mag_mobile_x','mag_mobile_y','mag_mobile_z','prox_mobile_distance', # 'loc_mobile_latitude','loc_mobile_longitude','loc_mobile_height','loc_mobile_velocity','loc_mobile_direction','loc_mobile_horizontalAccuracy','loc_mobile_verticalAccuracy'] # # Let us impute the missing values and plot an example. MisVal = ImputationMissingValues() imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'acc_mobile_x') imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'acc_mobile_x') imputed_interpolation_dataset = MisVal.impute_interpolate( copy.deepcopy(dataset), 'acc_mobile_x') DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'acc_mobile_x', imputed_mean_dataset['acc_mobile_x'], imputed_interpolation_dataset['acc_mobile_x']) # Now, let us carry out that operation over all columns except for the label. for col in [c for c in dataset.columns if not 'label' in c]: dataset = MisVal.impute_interpolate(dataset, col)
# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles-own/'
dataset = pd.read_csv(dataset_path + 'chapter3_result_outliers-own.csv', index_col=0)
# FIX: Index.to_datetime() has been removed from pandas; use pd.to_datetime().
dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'acc_phone_x')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'acc_phone_x')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset), 'acc_phone_x')
#DataViz.plot_imputed_values(dataset, ['original', 'mean', 'median', 'interpolation'], 'acc_phone_x', imputed_mean_dataset['acc_phone_x'], imputed_median_dataset['acc_phone_x'], imputed_interpolation_dataset['acc_phone_x'])

# And we impute for all columns except for the label and the light sensor (interpolation).
# FIX: the original filter read `not 'label' and not 'light_phone_lux' in c`;
# `not 'label'` evaluates to False for a non-empty string, so the whole condition
# was always False and no column was ever imputed. Test substring membership of
# each exclusion against the column name instead.
for col in [c for c in dataset.columns
            if 'label' not in c and 'light_phone_lux' not in c]:
    dataset = MisVal.impute_interpolate(dataset, col)
# Load the outlier-cleaned results for both the CrowdSignals and our own dataset.
dataset_path = './intermediate_datafiles/'
dataset_own = pd.read_csv(dataset_path + 'chapter3_result_outliers_own.csv', index_col=0)
dataset_cs = pd.read_csv(dataset_path + 'chapter3_result_outliers_cs.csv', index_col=0)

# Create datetime indices.
# FIX: Index.to_datetime() has been removed from pandas; use pd.to_datetime().
dataset_own.index = pd.to_datetime(dataset_own.index)
dataset_cs.index = pd.to_datetime(dataset_cs.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# NOTE(review): .microseconds only covers the sub-second part of the delta — assumes
# consecutive rows are less than one second apart.
milliseconds_per_instance = (dataset_own.index[1] - dataset_own.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values for CS (plot only).
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset_cs), 'hr_watch_rate')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset_cs), 'hr_watch_rate')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset_cs), 'hr_watch_rate')
# NOTE(review): DataViz is not created in this fragment — assumed to be defined
# earlier in the file; confirm.
DataViz.plot_imputed_values(dataset_cs, ['original', 'mean', 'interpolation'],
                            'hr_watch_rate',
                            imputed_mean_dataset['hr_watch_rate'],
                            imputed_interpolation_dataset['hr_watch_rate'])

# Make the corresponding plot for our own dataset.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset_own), 'press_phone_pressure')
def main():
    """Run the Chapter-4 transformation pipeline selected by FLAGS.mode.

    Modes: 'imputation' (compare imputation strategies on the heart rate),
    'kalman' (Kalman filter on accelerometer data), 'lowpass' (Butterworth
    low-pass filter demo), 'PCA' (explained-variance study), and 'final'
    (impute + filter + PCA, then persist the result).
    """
    # Import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance based on the first two rows.
    # NOTE(review): .microseconds only holds the sub-second component of the delta —
    # assumes consecutive instances are less than one second apart.
    milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

    # Create objects for value imputation, low pass filter and PCA.
    MisVal = ImputationMissingValues()
    LowPass = LowPassFilter()
    PCA = PrincipalComponentAnalysis()

    if FLAGS.mode == 'imputation':
        # Impute the missing values with three strategies and plot an example.
        imputed_mean_dataset = MisVal.impute_mean(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_median_dataset = MisVal.impute_median(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_interpolation_dataset = MisVal.impute_interpolate(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        DataViz.plot_imputed_values(
            dataset, ['original', 'mean', 'median', 'interpolation'],
            'hr_watch_rate',
            imputed_mean_dataset['hr_watch_rate'],
            imputed_median_dataset['hr_watch_rate'],
            imputed_interpolation_dataset['hr_watch_rate'])

    elif FLAGS.mode == 'kalman':
        # Using the result from Chapter 2, try the Kalman filter on acc_phone_x
        # and study the result.
        try:
            original_dataset = pd.read_csv(DATA_PATH / ORIG_DATASET_FNAME,
                                           index_col=0)
            original_dataset.index = pd.to_datetime(original_dataset.index)
        except IOError as e:
            print('File not found, try to run previous crowdsignals scripts first!')
            raise e
        KalFilter = KalmanFilters()
        kalman_dataset = KalFilter.apply_kalman_filter(
            data_table=original_dataset, col='acc_phone_x')
        DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'],
                                    'acc_phone_x',
                                    kalman_dataset['acc_phone_x_kalman'])
        DataViz.plot_dataset(data_table=kalman_dataset,
                             columns=['acc_phone_x', 'acc_phone_x_kalman'],
                             match=['exact', 'exact'],
                             display=['line', 'line'])

    elif FLAGS.mode == 'lowpass':
        # Apply a lowpass filter and reduce the importance of the data above the cutoff.
        # Determine the sampling frequency (Hz) from the instance granularity.
        fs = float(1000) / milliseconds_per_instance
        # Study acc_phone_x on a small window (40%-43% of the series) for visibility.
        new_dataset = LowPass.low_pass_filter(
            data_table=copy.deepcopy(dataset), col='acc_phone_x',
            sampling_frequency=fs, cutoff_frequency=FLAGS.cutoff, order=10)
        DataViz.plot_dataset(
            new_dataset.iloc[int(0.4 * len(new_dataset.index)):
                             int(0.43 * len(new_dataset.index)), :],
            ['acc_phone_x', 'acc_phone_x_lowpass'],
            ['exact', 'exact'], ['line', 'line'])

    elif FLAGS.mode == 'PCA':
        # First impute again, as PCA can not deal with missing values.
        for col in [c for c in dataset.columns if 'label' not in c]:
            dataset = MisVal.impute_interpolate(dataset, col)

        # Determine the PC's for all but the target columns (the labels and the heart rate).
        selected_predictor_cols = [
            c for c in dataset.columns
            if (not ('label' in c)) and (not (c == 'hr_watch_rate'))
        ]
        pc_values = PCA.determine_pc_explained_variance(
            data_table=dataset, cols=selected_predictor_cols)
        cumulated_variance = np.cumsum(pc_values)

        # Plot the explained variance and cumulated variance.
        comp_numbers = np.arange(1, len(pc_values) + 1)
        DataViz.plot_xy(x=[comp_numbers, comp_numbers],
                        y=[pc_values, cumulated_variance],
                        xlabel='principal component number',
                        ylabel='explained variance',
                        ylim=[0, 1],
                        line_styles=['b-', 'r-'],
                        names=['Variance', 'Cumulated variance'])

        # Select 7 as the best number of PC's as this explains most of the variance.
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the result of the PC's and the overall final dataset.
        DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'],
                             ['line', 'points'])
        # FIX: the match/display lists had 9 entries for 8 column patterns;
        # keep them the same length (7 line series plus the label points).
        DataViz.plot_dataset(
            dataset,
            ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
             'press_phone_', 'pca_', 'label'],
            ['like'] * 8,
            ['line'] * 7 + ['points'])

    elif FLAGS.mode == 'final':
        # Carry out the imputation over all columns except for the label.
        print('Imputing missing values.')
        for col in tqdm([c for c in dataset.columns if 'label' not in c]):
            dataset = MisVal.impute_interpolate(dataset=dataset, col=col)

        # Include all measurements that have a form of periodicity and filter them.
        periodic_measurements = [
            'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
            'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
            'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
            'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
            'mag_watch_y', 'mag_watch_z'
        ]
        print('Applying low pass filter on periodic measurements.')
        # Determine the sampling frequency (Hz) from the instance granularity.
        fs = float(1000) / milliseconds_per_instance
        for col in tqdm(periodic_measurements):
            dataset = LowPass.low_pass_filter(data_table=dataset, col=col,
                                              sampling_frequency=fs,
                                              cutoff_frequency=FLAGS.cutoff,
                                              order=10)
            # Replace the raw column with its filtered version.
            dataset[col] = dataset[col + '_lowpass']
            del dataset[col + '_lowpass']

        # Use the optimal found parameter n_pcs = 7 to apply PCA to the final dataset.
        selected_predictor_cols = [
            c for c in dataset.columns
            if (not ('label' in c)) and (not (c == 'hr_watch_rate'))
        ]
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the final overall dataset.
        # FIX: match/display trimmed to 8 entries to line up with the 8 column patterns.
        DataViz.plot_dataset(
            dataset,
            ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
             'press_phone_', 'pca_', 'label'],
            ['like'] * 8,
            ['line'] * 7 + ['points'])

        # Store the final outcome.
        dataset.to_csv(DATA_PATH / RESULT_FNAME)