# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 1 second, 5 seconds and 18 seconds (0.3 minutes)
window_sizes = [
    int(float(1000) / milliseconds_per_instance),
    int(float(5000) / milliseconds_per_instance),
    int(float(0.3 * 60000) / milliseconds_per_instance)
]
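# For illustration: assuming 250 ms per instance (an assumed value; the real one
# is computed from the data), these windows are 4, 20 and 72 instances.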

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy,
                                             ['userAcceleration.x'], ws,
                                             'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy,
                                             ['userAcceleration.x'], ws, 'std')

DataViz.plot_dataset(dataset_copy, [
    'userAcceleration.x', 'userAcceleration.x_temp_mean',
    'userAcceleration.x_temp_std', 'label'
], ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

ws = int(float(0.3 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if 'label' not in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws,
                                    'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws,
                                    'std')
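
# For reference, a minimal sketch of the same kind of temporal aggregation done
# directly with pandas rolling windows. The trailing-window behaviour and the
# '<col>_temp_<agg>_ws_<ws>' column naming are assumptions about what
# NumericalAbstraction.abstract_numerical does internally.
def rolling_abstraction_sketch(df, cols, window_size, aggregation):
    df = df.copy()
    for col in cols:
        rolled = df[col].rolling(window=window_size, min_periods=window_size)
        # 'mean', 'std', 'min' and 'max' are all valid aggregations here
        df[f'{col}_temp_{aggregation}_ws_{window_size}'] = getattr(rolled, aggregation)()
    return df

# Hypothetical usage:
# sketch = rolling_abstraction_sketch(dataset_copy, ['userAcceleration.x'], ws, 'mean')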
# Example 2
print('total window sizes', window_sizes)

NumAbs = NumericalAbstraction()

periodic_predictor_cols = [
    'acc_x',
    'acc_y',
    'acc_z',
    "gyr_x",
    "gyr_y",
    "gyr_z",
]

for ws in window_sizes:
    dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws,
                                        'mean')
    dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws,
                                        'std')
    print('window size', ws)

print(dataset.columns)
DataViz.plot_dataset(dataset, ['acc_x', 'acc_y', 'acc_z', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

# ws = int(float(0.5*60000)/milliseconds_per_instance)
# dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'mean')
# dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'std')

print('temporal', dataset.shape)
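# Assuming the generated column names encode the window size, each
# (column, window size, aggregation) combination adds one feature column, so the
# 6 periodic columns x 3 window sizes x 2 aggregations above add 36 columns here.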
# Example 3
# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds/1000
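# Note: .microseconds only holds the sub-second component of the timedelta, so
# this assumes instances are spaced less than one second apart;
# .total_seconds() * 1000 would also handle coarser granularities.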


# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
window_sizes = [int(float(5000)/milliseconds_per_instance), int(float(0.5*60000)/milliseconds_per_instance), int(float(5*60000)/milliseconds_per_instance)]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'std')

DataViz.plot_dataset(dataset_copy, ['acc_phone_x', 'acc_phone_x_temp_mean', 'acc_phone_x_temp_std', 'label'], ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

ws = int(float(0.5*60000)/milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if 'label' not in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')


CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03, int(float(5*60000)/milliseconds_per_instance), 2)
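# Per the keyword-argument form in the last example below, the positional
# arguments are cols, match, min_support, window_size and max_pattern_size:
# patterns of at most 2 label attributes with a minimum support of 3% over
# 5-minute windows.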

# Now we move to the frequency domain, with the same window size.
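# A minimal sketch of that frequency-domain step, modeled on the
# FourierTransformation.abstract_frequency call in the last example below;
# the availability of FourierTransformation here and the choice of columns
# are assumptions for illustration only.
FreqAbs = FourierTransformation()
fs = 1000.0 / milliseconds_per_instance
dataset = FreqAbs.abstract_frequency(data_table=dataset,
                                     cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                                     window_size=ws,
                                     sampling_rate=fs)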
# Example 4
NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
# for ws in window_sizes:
#     dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_X'], ws, 'mean')
#     dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_X'], ws, 'std')
#
# DataViz.plot_dataset(dataset_copy, ['acc_phone_X', 'acc_phone_X_temp_mean', 'acc_phone_X_temp_std', 'label'],
#                      ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

# ws = int(float(0.5 * 60000) / milliseconds_per_instance)  # FIXME: they only use one window size; we use multiple
# ws = int(float(8000) / milliseconds_per_instance)
ws = int(float(4000) / milliseconds_per_instance)
# for ws in window_sizes:  # FIXME: uncomment to use multiple window sizes
selected_predictor_cols = [c for c in dataset.columns if 'label' not in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws,
                                    'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws,
                                    'std')

# DataViz.plot_dataset(dataset,
#                      ['acc_phone_X', 'gyr_phone_X', 'mag_phone_X', 'pca_1', 'label'],
#                      ['like', 'like', 'like', 'like', 'like'],
#                      ['line', 'line', 'line', 'line', 'points'])
#
# # support for labels is useless in our case
# CatAbs = CategoricalAbstraction()
# dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03,
#                                       int(float(8000) / milliseconds_per_instance), 2)
#
# Now we move to the frequency domain, with the same window size.
# Example 5
# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 2 seconds, 4 seconds and 10 seconds
window_sizes = [
    int(float(2000) / milliseconds_per_instance),
    int(float(4000) / milliseconds_per_instance),
    int(float(10000) / milliseconds_per_instance)
]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(
        dataset_copy, ['linacc_Linear Acceleration x (m/s^2)'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(
        dataset_copy, ['linacc_Linear Acceleration x (m/s^2)'], ws, 'std')

DataViz.plot_dataset(dataset_copy, [
    'linacc_Linear Acceleration x (m/s^2)',
    'linacc_Linear Acceleration x (m/s^2)_temp_mean',
    'linacc_Linear Acceleration x (m/s^2)_temp_std', 'label'
], ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

ws = int(float(4000) / milliseconds_per_instance)

selected_predictor_cols = [
    'acc_Acceleration x (m/s^2)', 'acc_Acceleration y (m/s^2)',
    'acc_Acceleration z (m/s^2)', 'press_Pressure (hPa)',
    'gyr_Gyroscope x (rad/s)', 'gyr_Gyroscope y (rad/s)',
# Example 6
print('milliseconds per instance', milliseconds_per_instance)

window_sizes = [
    int(float(5000) / milliseconds_per_instance),
    int(float(0.5 * 60000) / milliseconds_per_instance),
    int(float(5 * 60000) / milliseconds_per_instance)
]

print('window sizes', window_sizes)

NumAbs = NumericalAbstraction()
CatAbs = CategoricalAbstraction()
dataset_copy = copy.deepcopy(dataset)

for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws,
                                             'min')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws,
                                             'max')
    print('window size', ws)

DataViz.plot_dataset(dataset_copy,
                     ['acc_phone_x', 'acc_phone_x_temp_min', 'label'],
                     ['exact', 'like', 'like'], ['line', 'line', 'points'])
DataViz.plot_dataset(dataset_copy,
                     ['acc_phone_x', 'acc_phone_x_temp_max', 'label'],
                     ['exact', 'like', 'like'], ['line', 'line', 'points'])
DataViz.plot_dataset(
    dataset_copy,
    ['acc_phone_x', 'acc_phone_x_temp_max', 'acc_phone_x_temp_min', 'label'],
    ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])
# Example 7
def main():
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance based on the first two rows
    milliseconds_per_instance = (dataset.index[1] -
                                 dataset.index[0]).microseconds / 1000

    # Create objects for feature abstraction
    NumAbs = NumericalAbstraction()
    CatAbs = CategoricalAbstraction()
    FreqAbs = FourierTransformation()

    if FLAGS.mode == 'time':
        # Focus on the time domain first
        # Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
        window_sizes = [
            int(float(5000) / milliseconds_per_instance),
            int(float(0.5 * 60000) / milliseconds_per_instance),
            int(float(5 * 60000) / milliseconds_per_instance)
        ]

        dataset_copy = copy.deepcopy(dataset)
        for ws in window_sizes:
            print(
                f'Abstracting numerical features for window size {ws * milliseconds_per_instance / 1000}s.'
            )
            dataset_copy = NumAbs.abstract_numerical(
                data_table=dataset_copy,
                cols=['acc_phone_x'],
                window_size=ws,
                aggregation_function='mean')
            dataset_copy = NumAbs.abstract_numerical(
                data_table=dataset_copy,
                cols=['acc_phone_x'],
                window_size=ws,
                aggregation_function='std')

        DataViz.plot_dataset(data_table=dataset_copy,
                             columns=[
                                 'acc_phone_x', 'acc_phone_x_temp_mean',
                                 'acc_phone_x_temp_std', 'label'
                             ],
                             match=['exact', 'like', 'like', 'like'],
                             display=['line', 'line', 'line', 'points'])

    elif FLAGS.mode == 'frequency':
        # Move to the frequency domain with the same window size
        fs = 1000.0 / milliseconds_per_instance
        ws = int(10000.0 / milliseconds_per_instance)

        data_table = FreqAbs.abstract_frequency(
            data_table=copy.deepcopy(dataset),
            cols=['acc_phone_x'],
            window_size=ws,
            sampling_rate=fs)
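        # abstract_frequency adds, per input column, the dominant frequency
        # (_max_freq), the frequency-weighted average (_freq_weighted) and the
        # power spectral entropy (_pse) plotted below; any additional per-bin
        # amplitude columns are an assumption about the implementation.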
        # Spectral analysis
        DataViz.plot_dataset(data_table=data_table,
                             columns=[
                                 'acc_phone_x_max_freq',
                                 'acc_phone_x_freq_weighted',
                                 'acc_phone_x_pse', 'label'
                             ],
                             match=['like', 'like', 'like', 'like'],
                             display=['line', 'line', 'line', 'points'])

    elif FLAGS.mode == 'final':
        ws = int(float(0.5 * 60000) / milliseconds_per_instance)
        fs = 1000.0 / milliseconds_per_instance

        # Abstract time domain features and plot the result
        selected_predictor_cols = [
            c for c in dataset.columns if 'label' not in c
        ]
        print('Calculating mean and std for selected predictor cols.')
        dataset = NumAbs.abstract_numerical(data_table=dataset,
                                            cols=selected_predictor_cols,
                                            window_size=ws,
                                            aggregation_function='mean')
        dataset = NumAbs.abstract_numerical(data_table=dataset,
                                            cols=selected_predictor_cols,
                                            window_size=ws,
                                            aggregation_function='std')

        DataViz.plot_dataset(data_table=dataset,
                             columns=[
                                 'acc_phone_x', 'gyr_phone_x', 'hr_watch_rate',
                                 'light_phone_lux', 'mag_phone_x',
                                 'press_phone_', 'pca_1', 'label'
                             ],
                             match=[
                                 'like', 'like', 'like', 'like', 'like',
                                 'like', 'like', 'like'
                             ],
                             display=[
                                 'line', 'line', 'line', 'line', 'line',
                                 'line', 'line', 'points'
                             ])

        # Abstract categorical features
        print('Abstracting categorical features.')
        dataset = CatAbs.abstract_categorical(
            data_table=dataset,
            cols=['label'],
            match=['like'],
            min_support=0.03,
            window_size=int(float(5 * 60000) / milliseconds_per_instance),
            max_pattern_size=2)

        # Abstract frequency domain features
        periodic_predictor_cols = [
            'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
            'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
            'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
            'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
            'mag_watch_y', 'mag_watch_z'
        ]

        print('Abstracting frequency features.')
        dataset = FreqAbs.abstract_frequency(data_table=dataset,
                                             cols=periodic_predictor_cols,
                                             window_size=ws,
                                             sampling_rate=fs)

        # Allow only a certain percentage of overlap between windows; otherwise the training examples will be too similar
        # Set the allowed percentage of overlap
        window_overlap = FLAGS.overlap
        skip_points = int((1 - window_overlap) * ws)
        dataset = dataset.iloc[::skip_points, :]
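        # Worked example (assuming 250 ms per instance, so ws == 120): with
        # overlap == 0.5, skip_points == int(0.5 * 120) == 60, so every 60th row is kept.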

        # Plot the final dataset
        DataViz.plot_dataset(data_table=dataset,
                             columns=[
                                 'acc_phone_x', 'gyr_phone_x', 'hr_watch_rate',
                                 'light_phone_lux', 'mag_phone_x',
                                 'press_phone_', 'pca_1', 'label'
                             ],
                             match=[
                                 'like', 'like', 'like', 'like', 'like',
                                 'like', 'like', 'like'
                             ],
                             display=[
                                 'line', 'line', 'line', 'line', 'line',
                                 'line', 'line', 'points'
                             ])

        # Store the generated dataset
        dataset.to_csv(DATA_PATH / RESULT_FNAME)