Beispiel #1
0
# impute_columns = ['acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z', 'gyr_mobile_x','gyr_mobile_y','gyr_mobile_z','mag_mobile_x','mag_mobile_y','mag_mobile_z','prox_mobile_distance',
#                        'loc_mobile_latitude','loc_mobile_longitude','loc_mobile_height','loc_mobile_velocity','loc_mobile_direction','loc_mobile_horizontalAccuracy','loc_mobile_verticalAccuracy']
#

# Impute the missing values of a single column with each strategy and plot an
# example. Every strategy is handed its own deep copy of `dataset`.

MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset),
                                          'acc_mobile_x')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset),
                                              'acc_mobile_x')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset), 'acc_mobile_x')
# Only the mean and interpolation results are plotted next to the original.
DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'],
                            'acc_mobile_x',
                            imputed_mean_dataset['acc_mobile_x'],
                            imputed_interpolation_dataset['acc_mobile_x'])

# Now carry out the interpolation over all columns except for the labels.

for col in [c for c in dataset.columns if 'label' not in c]:
    dataset = MisVal.impute_interpolate(dataset, col)

# Reload the original dataset (ORIG_DATASET_FNAME) and try the Kalman filter
# on the acc_mobile_x attribute to study the result.

original_dataset = pd.read_csv(DATA_PATH / ORIG_DATASET_FNAME, index_col=0)
original_dataset.index = pd.to_datetime(original_dataset.index)
KalFilter = KalmanFilters()
kalman_dataset = KalFilter.apply_kalman_filter(original_dataset,
                                               'acc_mobile_x')
Beispiel #2
0
                             dataset.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values.
# Every strategy is handed its own deep copy of `dataset`.

MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset),
                                          'hr_watch_rate')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset),
                                              'hr_watch_rate')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])

# Call form of print: required on Python 3 and, with a single argument,
# prints the same text on Python 2.
print(imputed_interpolation_dataset['hr_watch_rate'])
DataViz.plot_imputed_values(dataset, ['original', 'interpolation'],
                            'hr_watch_rate',
                            imputed_interpolation_dataset['hr_watch_rate'])

# And we impute for all columns except for the label in the selected way (interpolation)

for col in [c for c in dataset.columns if 'label' not in c]:
    dataset = MisVal.impute_interpolate(dataset, col)

# Let us try the Kalman filter on the acc_phone_x attribute and study the result.

original_dataset = pd.read_csv(dataset_path + 'chapter2_result.csv',
                               index_col=0)
# Index.to_datetime() is gone from current pandas; pd.to_datetime() is the
# supported way to parse the index.
original_dataset.index = pd.to_datetime(original_dataset.index)
KalFilter = KalmanFilters()
kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_phone_x')
DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'],
Beispiel #3
0
        c for c in dataset.columns
        if not 'label' and not 'light_phone_lux' in c
]:
    dataset = MisVal.impute_interpolate(dataset, col)
# light_phone_lux is imputed with the median rather than by interpolation.
dataset = MisVal.impute_median(dataset, 'light_phone_lux')

# Let us try the Kalman filter on the acc_phone_x attribute and study the result
# (the light_phone_lux variants are kept commented out).

original_dataset = pd.read_csv(dataset_path + 'chapter2_result-own.csv',
                               index_col=0)
# Index.to_datetime() is gone from current pandas; pd.to_datetime() is the
# supported way to parse the index.
original_dataset.index = pd.to_datetime(original_dataset.index)
KalFilter = KalmanFilters()
kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_phone_x')
#kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'light_phone_lux')
DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'],
                            'acc_phone_x',
                            kalman_dataset['acc_phone_x_kalman'])
#DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'], 'light_phone_lux', kalman_dataset['light_phone_lux_kalman'])
DataViz.plot_dataset(kalman_dataset, ['acc_phone_x', 'acc_phone_x_kalman'],
                     ['exact', 'exact'], ['line', 'line'])
#DataViz.plot_dataset(kalman_dataset, ['light_phone_lux', 'light_phone_lux_kalman'], ['exact','exact'], ['line', 'line'])

# We ignore the Kalman filter output for now...

# Let us apply a lowpass filter and reduce the importance of the data above 1.5 Hz

LowPass = LowPassFilter()

# Determine the sampling frequency (instances per second).
fs = float(1000) / milliseconds_per_instance
cutoff = 1.5
Beispiel #4
0
# Index.to_datetime() is gone from current pandas; pd.to_datetime() is the
# supported way to parse the index.
dataset_cs.index = pd.to_datetime(dataset_cs.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# total_seconds() spans the whole interval; the `.microseconds` attribute only
# holds the sub-second component and would be 0 for a step of whole seconds.
milliseconds_per_instance = (dataset_own.index[1] -
                             dataset_own.index[0]).total_seconds() * 1000

# Step 2: Let us impute the missing values for CS (plot only).
# Every strategy is handed its own deep copy, and the results are only plotted.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset_cs),
                                          'hr_watch_rate')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset_cs),
                                              'hr_watch_rate')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset_cs), 'hr_watch_rate')
DataViz.plot_imputed_values(dataset_cs, ['original', 'mean', 'interpolation'],
                            'hr_watch_rate',
                            imputed_mean_dataset['hr_watch_rate'],
                            imputed_interpolation_dataset['hr_watch_rate'])

# Make the same plot for the own dataset, using the pressure column.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset_own),
                                          'press_phone_pressure')
imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset_own),
                                              'press_phone_pressure')
imputed_interpolation_dataset = MisVal.impute_interpolate(
    copy.deepcopy(dataset_own), 'press_phone_pressure')
DataViz.plot_imputed_values(
    dataset_own, ['original', 'mean', 'interpolation'], 'press_phone_pressure',
    imputed_mean_dataset['press_phone_pressure'],
    imputed_interpolation_dataset['press_phone_pressure'])
Beispiel #5
0
def _load_indexed_dataset(path):
    """Read the CSV at *path* and parse its first column as a datetime index.

    Prints a hint and re-raises the IOError when the file cannot be read.
    """
    try:
        dataset = pd.read_csv(path, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        # Bare raise keeps the original traceback intact.
        raise
    return dataset


def _milliseconds_per_instance(index):
    """Return the step between the first two entries of *index* in milliseconds."""
    # total_seconds() spans the whole interval; the `.microseconds` attribute
    # only holds the sub-second component and would be 0 for whole-second steps.
    return (index[1] - index[0]).total_seconds() * 1000


def _plot_overall_dataset(data_viz, dataset):
    """Plot the sensor groups, the PCA components and the labels of *dataset*."""
    columns = [
        'acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
        'press_phone_', 'pca_', 'label'
    ]
    # Exactly one match/display entry per column prefix (the previous lists
    # carried a ninth, unmatched entry); only the labels are drawn as points.
    match = ['like'] * len(columns)
    display = ['line'] * (len(columns) - 1) + ['points']
    data_viz.plot_dataset(dataset, columns, match, display)


def main():
    """Run the processing step selected by ``FLAGS.mode`` on the dataset.

    The dataset is read from ``DATA_PATH / DATASET_FNAME``. Supported modes:

    * ``'imputation'`` - plot mean, median and interpolation imputation of
      ``hr_watch_rate``.
    * ``'kalman'`` - apply the Kalman filter to ``acc_phone_x`` of the dataset
      stored at ``DATA_PATH / ORIG_DATASET_FNAME`` and plot the result.
    * ``'lowpass'`` - low-pass filter ``acc_phone_x`` with ``FLAGS.cutoff`` and
      plot a slice of the result.
    * ``'PCA'`` - interpolate missing values, plot the explained variance and
      plot the dataset extended with 7 principal components.
    * ``'final'`` - interpolate, low-pass filter the periodic measurements,
      apply PCA and store the result at ``DATA_PATH / RESULT_FNAME``.

    Any other mode only loads the dataset and returns.

    Raises:
        IOError: when an input CSV cannot be read (re-raised after a hint).
    """
    # Import the data from the specified location and parse the date index
    dataset = _load_indexed_dataset(DATA_PATH / DATASET_FNAME)

    # Create an instance of our visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance based on the first two rows
    milliseconds_per_instance = _milliseconds_per_instance(dataset.index)

    # Create objects for value imputation, low pass filter and PCA
    MisVal = ImputationMissingValues()
    LowPass = LowPassFilter()
    PCA = PrincipalComponentAnalysis()

    if FLAGS.mode == 'imputation':
        # Impute the missing values (each strategy on its own deep copy) and plot an example
        imputed_mean_dataset = MisVal.impute_mean(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_median_dataset = MisVal.impute_median(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_interpolation_dataset = MisVal.impute_interpolate(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        DataViz.plot_imputed_values(
            dataset, ['original', 'mean', 'median', 'interpolation'],
            'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'],
            imputed_median_dataset['hr_watch_rate'],
            imputed_interpolation_dataset['hr_watch_rate'])

    elif FLAGS.mode == 'kalman':
        # Load the original dataset and try the Kalman filter on the acc_phone_x attribute
        original_dataset = _load_indexed_dataset(DATA_PATH /
                                                 ORIG_DATASET_FNAME)
        KalFilter = KalmanFilters()
        kalman_dataset = KalFilter.apply_kalman_filter(
            data_table=original_dataset, col='acc_phone_x')
        DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'],
                                    'acc_phone_x',
                                    kalman_dataset['acc_phone_x_kalman'])
        DataViz.plot_dataset(data_table=kalman_dataset,
                             columns=['acc_phone_x', 'acc_phone_x_kalman'],
                             match=['exact', 'exact'],
                             display=['line', 'line'])

    elif FLAGS.mode == 'lowpass':
        # Apply a lowpass filter and reduce the importance of the data above the cutoff
        # Determine the sampling frequency
        fs = float(1000) / milliseconds_per_instance

        # Study acc_phone_x
        new_dataset = LowPass.low_pass_filter(
            data_table=copy.deepcopy(dataset),
            col='acc_phone_x',
            sampling_frequency=fs,
            cutoff_frequency=FLAGS.cutoff,
            order=10)
        # Plot only the rows between 40% and 43% of the dataset
        n_rows = len(new_dataset.index)
        DataViz.plot_dataset(
            new_dataset.iloc[int(0.4 * n_rows):int(0.43 * n_rows), :],
            ['acc_phone_x', 'acc_phone_x_lowpass'], ['exact', 'exact'],
            ['line', 'line'])

    elif FLAGS.mode == 'PCA':
        # First impute again, as PCA can not deal with missing values
        for col in [c for c in dataset.columns if 'label' not in c]:
            dataset = MisVal.impute_interpolate(dataset, col)

        # Determine the PC's for all but the target columns (the labels and the heart rate)
        selected_predictor_cols = [
            c for c in dataset.columns
            if 'label' not in c and c != 'hr_watch_rate'
        ]
        pc_values = PCA.determine_pc_explained_variance(
            data_table=dataset, cols=selected_predictor_cols)
        cumulated_variance = np.cumsum(pc_values)

        # Plot the explained variance and cumulated variance
        comp_numbers = np.arange(1, len(pc_values) + 1)
        DataViz.plot_xy(x=[comp_numbers, comp_numbers],
                        y=[pc_values, cumulated_variance],
                        xlabel='principal component number',
                        ylabel='explained variance',
                        ylim=[0, 1],
                        line_styles=['b-', 'r-'],
                        names=['Variance', 'Cumulated variance'])

        # Select 7 as the best number of PC's as this explains most of the variance
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the result of the PC's and the overall final dataset
        DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'],
                             ['line', 'points'])
        _plot_overall_dataset(DataViz, dataset)

    elif FLAGS.mode == 'final':
        # Carry out that operation over all columns except for the label
        print('Imputing missing values.')
        for col in tqdm([c for c in dataset.columns if 'label' not in c]):
            dataset = MisVal.impute_interpolate(dataset=dataset, col=col)

        # Include all measurements that have a form of periodicity and filter them
        periodic_measurements = [
            'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
            'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
            'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
            'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
            'mag_watch_y', 'mag_watch_z'
        ]

        print('Applying low pass filter on peridic measurements.')
        # Determine the sampling frequency.
        fs = float(1000) / milliseconds_per_instance
        for col in tqdm(periodic_measurements):
            dataset = LowPass.low_pass_filter(data_table=dataset,
                                              col=col,
                                              sampling_frequency=fs,
                                              cutoff_frequency=FLAGS.cutoff,
                                              order=10)
            # Overwrite the raw column with its filtered version and drop the helper column
            dataset[col] = dataset[col + '_lowpass']
            del dataset[col + '_lowpass']

        # Use the optimal found parameter n_pcs = 7 to apply PCA to the final dataset
        selected_predictor_cols = [
            c for c in dataset.columns
            if 'label' not in c and c != 'hr_watch_rate'
        ]
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the final overall dataset
        _plot_overall_dataset(DataViz, dataset)

        # Store the final outcome
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
Beispiel #6
0
    "Gyroscope x (rad/s)": 'gyr_x',
    "Gyroscope y (rad/s)": 'gyr_y',
    "Gyroscope z (rad/s)": 'gyr_z',
}

DataViz = VisualizeDataset(__file__, show=False)
KalFilter = KalmanFilters()

# Load the preprocessed phone data and index it by the parsed time column.
dataset = pd.read_csv(preprocessed_phone_data)
dataset.index = pd.to_datetime(dataset[time_col])

for col in attributes_to_impute:
    print('Applying kalman filter for ', col)
    dataset = KalFilter.apply_kalman_filter(dataset, col)
    DataViz.save_path = save_names[col] + '_phone_imputed_values'
    # Compare the raw column with the filter output (`<col>_kalman`); passing
    # dataset[col] here plotted the original against itself.
    DataViz.plot_imputed_values(dataset, ['original', 'kalman'], col,
                                dataset[col + '_kalman'])
    DataViz.save_path = save_names[col] + '_phone_all_data'
    DataViz.plot_dataset(dataset, [col, col + '_kalman'], ['exact','exact'], ['line', 'line'])

print(dataset.columns)
dataset.to_csv(outlier_phone_data)

# Load the preprocessed watch data and index it by the parsed time column.
dataset = pd.read_csv(preprocessed_watch_data)
dataset.index = pd.to_datetime(dataset[time_col])

for col in attributes_to_impute:
    print('Applying kalman filter for ', col)
    dataset = KalFilter.apply_kalman_filter(dataset, col)
    DataViz.save_path = save_names[col] + '_watch_imputed_values'
    # Compare the raw column with the filter output (`<col>_kalman`); passing
    # dataset[col] here plotted the original against itself.
    DataViz.plot_imputed_values(dataset, ['original', 'kalman'], col,
                                dataset[col + '_kalman'])
    # NOTE(review): the phone loop follows this with a plot_dataset call; none
    # is visible here - confirm whether it is missing or just cut off.
    DataViz.save_path = save_names[col] + '_watch_all_data'
# MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])

# The KNN imputer below is given the dataset itself (an alias, not a copy).
X_incomplete = dataset
# print(list(X_incomplete))
# # X is the complete data matrix
# # X_incomplete has the same values as X except a subset have been replaced with NaN
#
# # Use the 6 nearest rows (k=6) which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=6).complete(X_incomplete)
# X_filled_knn = knnimpute.(X_incomplete)

# NOTE(review): column 0 of the filled result is plotted as hr_watch_rate -
# presumably hr_watch_rate is the first column of `dataset`; confirm.
DataViz.plot_imputed_values(dataset, ['original', 'imputed'], 'hr_watch_rate',
                            X_filled_knn[:, 0])

# # matrix completion using convex optimization to find low-rank solution
# # that still matches observed values. Slow!
# X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)
#
# # Instead of solving the nuclear norm objective directly, instead
# # induce sparsity using singular value thresholding
# X_filled_softimpute = SoftImpute().complete(X_incomplete_normalized)
#
# # print mean squared error for the three imputation methods above
# nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
# print("Nuclear norm minimization MSE: %f" % nnm_mse)
#
# softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean()
# print("SoftImpute MSE: %f" % softImpute_mse)