# Chapter 4: Identifying aggregate attributes.
print('attributes time domain')

# First we focus on the time domain.

# Translate 5 s, 30 s, 5 min and 10 min into window lengths measured in
# number of instances.
window_sizes = [
    int(float(5000) / milliseconds_per_instance),
    int(float(0.5 * 60000) / milliseconds_per_instance),
    int(float(5 * 60000) / milliseconds_per_instance),
    int(float(10 * 60000) / milliseconds_per_instance)
]
print('total window sizes', window_sizes)

NumAbs = NumericalAbstraction()

# Accelerometer and gyroscope channels to aggregate over time.
periodic_predictor_cols = [
    'acc_x', 'acc_y', 'acc_z',
    "gyr_x", "gyr_y", "gyr_z",
]

# For every window size, append a rolling mean column and then a rolling std
# column for each periodic predictor (same order as the original: mean first,
# std second, per window size).
for ws in window_sizes:
    for agg_func in ('mean', 'std'):
        dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols,
                                            ws, agg_func)
# NOTE(review): this chunk starts mid-script — `raise e` belongs to a truncated
# try/except (re-raising a file-read error) whose opening is outside view.
raise e

# Convert the string index to datetime (old pandas API: Index.to_datetime).
dataset.index = dataset.index.to_datetime()

# Compute the number of milliseconds covered by an instance based on the first two rows
# NOTE(review): .microseconds only yields the sub-second component of the
# timedelta — this breaks (returns 0) for sampling intervals >= 1 s;
# .total_seconds() would be safer. Left unchanged here; confirm sampling rate.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds/1000

# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
window_sizes = [int(float(5000)/milliseconds_per_instance), int(float(0.5*60000)/milliseconds_per_instance), int(float(5*60000)/milliseconds_per_instance)]

NumAbs = NumericalAbstraction()

# Illustrate rolling mean/std on a deep copy so the working dataset stays
# untouched until the real feature computation below.
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'std')
DataViz.plot_dataset(dataset_copy, ['acc_phone_x', 'acc_phone_x_temp_mean', 'acc_phone_x_temp_std', 'label'], ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

# Use the 30-second window for the real feature set over all non-label columns.
ws = int(float(0.5*60000)/milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

# Abstract categorical label patterns: min support 0.03, 5-minute window,
# patterns up to size 2.
CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03, int(float(5*60000)/milliseconds_per_instance), 2)
# Compute the number of milliseconds covered by an instance based on the first two rows.
# Fix: use total_seconds() rather than the .microseconds attribute — the latter
# only returns the sub-second component of the timedelta, so it yields 0 (and
# causes a division by zero below) whenever the sampling interval is >= 1 s.
# For sub-second intervals both forms give the same value.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).total_seconds() * 1000

# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 1 second,
# 5 seconds and 18 seconds (0.3 minutes).
window_sizes = [
    int(float(1000) / milliseconds_per_instance),
    int(float(5000) / milliseconds_per_instance),
    int(float(0.3 * 60000) / milliseconds_per_instance)
]

NumAbs = NumericalAbstraction()

# Demonstrate the rolling mean/std on a deep copy so the working dataset is
# left untouched for the real feature computation.
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['userAcceleration.x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['userAcceleration.x'], ws, 'std')

DataViz.plot_dataset(dataset_copy, [
    'userAcceleration.x', 'userAcceleration.x_temp_mean',
    'userAcceleration.x_temp_std', 'label'
], ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

# Use the 18-second window for the real feature computation over all
# non-label columns.
ws = int(float(0.3 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
# NOTE(review): chunk starts mid-try — the `try:` opening this read is outside view.
    dataset = pd.read_csv(dataset_path + 'imputation_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to outlier and imputation scripts first!')
    raise e

# Convert the string index to datetime (old pandas API: Index.to_datetime).
dataset.index = dataset.index.to_datetime()

# Compute the number of milliseconds covered by an instance based on the first two rows
# NOTE(review): .microseconds only captures the sub-second part of the interval;
# verify the sampling interval is below one second or this value is wrong.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# ------------------------------------------------------------------------------------
# TIME DOMAIN

# NOTE(review): Python 2 print statement — this chunk predates Python 3.
print 'starting time domain computations.'

NumAbs = NumericalAbstraction()

# NOTE(review): the triple-quoted string below comments out a per-column
# plotting loop; it is NOT closed within this chunk — the closing quotes are
# outside view.
"""
# Set the window sizes to the number of instances representing 20 seconds, 30 seconds and 40 seconds
# This part is for generating plots, it plots for all current non-label features
window_sizes = [int(float(4*5000)/milliseconds_per_instance), int(float(0.5*60000)/milliseconds_per_instance),
                int(float(40000)/milliseconds_per_instance)]

cols = [c for c in dataset.columns if not 'label' in c]
for c in cols:
    dataset_copy = copy.deepcopy(dataset)
    for ws in window_sizes:
        dataset_copy = NumAbs.abstract_numerical(dataset_copy, [c], ws, 'mean')
        dataset_copy = NumAbs.abstract_numerical(dataset_copy, [c], ws, 'std')
        dataset_copy = NumAbs.abstract_numerical(dataset_copy, [c], ws, 'min')
        dataset_copy = NumAbs.abstract_numerical(dataset_copy, [c], ws, 'MAD')
        dataset_copy = NumAbs.abstract_numerical(dataset_copy, [c], ws, 'kurtosis')
        dataset_copy = NumAbs.abstract_numerical(dataset_copy, [c], ws, 'slope')
# Compute the number of milliseconds covered by an instane based on the first two rows milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000 # Chapter 4: Identifying aggregate attributes. # First we focus on the time domain. # Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes window_sizes = [ int(float(2000) / milliseconds_per_instance), int(float(4000) / milliseconds_per_instance), int(float(10000) / milliseconds_per_instance) ] NumAbs = NumericalAbstraction() dataset_copy = copy.deepcopy(dataset) for ws in window_sizes: dataset_copy = NumAbs.abstract_numerical( dataset_copy, ['linacc_Linear Acceleration x (m/s^2)'], ws, 'mean') dataset_copy = NumAbs.abstract_numerical( dataset_copy, ['linacc_Linear Acceleration x (m/s^2)'], ws, 'std') DataViz.plot_dataset(dataset_copy, [ 'linacc_Linear Acceleration x (m/s^2)', 'linacc_Linear Acceleration x (m/s^2)_temp_mean', 'linacc_Linear Acceleration x (m/s^2)_temp_std', 'label' ], ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points']) ws = int(float(4000) / milliseconds_per_instance)
# NOTE(review): chunk starts mid-script — `raise e` re-raises a read error from
# a truncated try/except whose opening is outside view.
raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
# NOTE(review): the computation this comment refers to has been removed; the
# window size is hard-coded as ws = 6 further down.

# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 2 minutes and 5 minutes
# NOTE(review): window_sizes itself only exists inside the commented-out demo
# below; the live code uses the fixed ws = 6 instead.
NumAbs = NumericalAbstraction()

#dataset_copy = copy.deepcopy(dataset)
#for ws in window_sizes:
#    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_x'], ws, 'mean')
#    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_x'], ws, 'std')
#    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_x'], ws, 'slope')

#DataViz.plot_dataset(dataset_copy, ['acc_x', 'acc_x_temp_mean', 'acc_x_temp_std', 'label'], ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

# Columns to aggregate: heart rate, accelerometer axes and the first two PCA
# components.
selected_predictor_cols = [
    'heartrate', 'acc_x', 'acc_y', 'acc_z', 'pca_1', 'pca_2'
]
# Hard-coded window of 6 instances (see NOTE above).
ws = 6

print('mean')
# NOTE(review): this call is truncated at the chunk boundary — the remaining
# arguments continue outside view.
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws,
# plt.show() plt.savefig('figures/crowdsignals_ch4/amplitude_table.png') exit() print('milliseconds per instance', milliseconds_per_instance) window_sizes = [ int(float(5000) / milliseconds_per_instance), int(float(0.5 * 60000) / milliseconds_per_instance), int(float(5 * 60000) / milliseconds_per_instance) ] print('window sizes', window_sizes) NumAbs = NumericalAbstraction() CatAbs = CategoricalAbstraction() dataset_copy = copy.deepcopy(dataset) for ws in window_sizes: dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'min') dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'max') print('window size', ws) DataViz.plot_dataset(dataset_copy, ['acc_phone_x', 'acc_phone_x_temp_min', 'label'], ['exact', 'like', 'like'], ['line', 'line', 'points']) DataViz.plot_dataset(dataset_copy, ['acc_phone_x', 'acc_phone_x_temp_max', 'label'],
def main():
    """Build Chapter 4 aggregate features for the crowdsignals dataset.

    Depending on FLAGS.mode this either demonstrates time-domain features
    ('time'), demonstrates frequency-domain features ('frequency'), or
    computes the full feature set and stores it to disk ('final').
    """
    # Read the result from the previous chapter convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance based on the first two rows
    # NOTE(review): .microseconds only captures the sub-second component of the
    # interval — assumes the sampling rate is faster than 1 Hz; confirm upstream.
    milliseconds_per_instance = (dataset.index[1] -
                                 dataset.index[0]).microseconds / 1000

    # Create objects for feature abstraction
    NumAbs = NumericalAbstraction()
    CatAbs = CategoricalAbstraction()
    FreqAbs = FourierTransformation()

    if FLAGS.mode == 'time':
        # Focus on the time domain first
        # Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
        window_sizes = [
            int(float(5000) / milliseconds_per_instance),
            int(float(0.5 * 60000) / milliseconds_per_instance),
            int(float(5 * 60000) / milliseconds_per_instance)
        ]

        # Work on a deep copy so the raw dataset is preserved.
        dataset_copy = copy.deepcopy(dataset)
        for ws in window_sizes:
            print(
                f'Abstracting numerical features for window size {ws * milliseconds_per_instance / 1000}s.'
            )
            dataset_copy = NumAbs.abstract_numerical(
                data_table=dataset_copy,
                cols=['acc_phone_x'],
                window_size=ws,
                aggregation_function='mean')
            dataset_copy = NumAbs.abstract_numerical(
                data_table=dataset_copy,
                cols=['acc_phone_x'],
                window_size=ws,
                aggregation_function='std')

        DataViz.plot_dataset(data_table=dataset_copy,
                             columns=[
                                 'acc_phone_x', 'acc_phone_x_temp_mean',
                                 'acc_phone_x_temp_std', 'label'
                             ],
                             match=['exact', 'like', 'like', 'like'],
                             display=['line', 'line', 'line', 'points'])

    elif FLAGS.mode == 'frequency':
        # Move to the frequency domain with the same window size
        fs = 1000.0 / milliseconds_per_instance
        ws = int(10000.0 / milliseconds_per_instance)
        data_table = FreqAbs.abstract_frequency(
            data_table=copy.deepcopy(dataset),
            cols=['acc_phone_x'],
            window_size=ws,
            sampling_rate=fs)

        # Spectral analysis
        DataViz.plot_dataset(data_table=data_table,
                             columns=[
                                 'acc_phone_x_max_freq',
                                 'acc_phone_x_freq_weighted',
                                 'acc_phone_x_pse', 'label'
                             ],
                             match=['like', 'like', 'like', 'like'],
                             display=['line', 'line', 'line', 'points'])

    elif FLAGS.mode == 'final':
        # 30-second window; sampling rate derived from the instance spacing.
        ws = int(float(0.5 * 60000) / milliseconds_per_instance)
        fs = 1000.0 / milliseconds_per_instance

        # Abstract time domain features and plot the result
        selected_predictor_cols = [
            c for c in dataset.columns if 'label' not in c
        ]

        print('Calculating mean and std for selected predictor cols.')
        dataset = NumAbs.abstract_numerical(data_table=dataset,
                                            cols=selected_predictor_cols,
                                            window_size=ws,
                                            aggregation_function='mean')
        dataset = NumAbs.abstract_numerical(data_table=dataset,
                                            cols=selected_predictor_cols,
                                            window_size=ws,
                                            aggregation_function='std')

        DataViz.plot_dataset(data_table=dataset,
                             columns=[
                                 'acc_phone_x', 'gyr_phone_x', 'hr_watch_rate',
                                 'light_phone_lux', 'mag_phone_x',
                                 'press_phone_', 'pca_1', 'label'
                             ],
                             match=[
                                 'like', 'like', 'like', 'like', 'like',
                                 'like', 'like', 'like'
                             ],
                             display=[
                                 'line', 'line', 'line', 'line', 'line',
                                 'line', 'line', 'points'
                             ])

        # Abstract categorical features
        print('Abstracting categorical features.')
        dataset = CatAbs.abstract_categorical(
            data_table=dataset,
            cols=['label'],
            match=['like'],
            min_support=0.03,
            window_size=int(float(5 * 60000) / milliseconds_per_instance),
            max_pattern_size=2)

        # Abstract frequency domain features
        periodic_predictor_cols = [
            'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
            'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
            'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
            'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
            'mag_watch_y', 'mag_watch_z'
        ]

        print('Abstracting frequency features.')
        dataset = FreqAbs.abstract_frequency(data_table=dataset,
                                             cols=periodic_predictor_cols,
                                             window_size=ws,
                                             sampling_rate=fs)

        # Take a certain percentage of overlap in the windows, otherwise training examples will be too much alike
        # Set the allowed percentage of overlap
        window_overlap = FLAGS.overlap
        skip_points = int((1 - window_overlap) * ws)
        dataset = dataset.iloc[::skip_points, :]

        # Plot the final dataset
        DataViz.plot_dataset(data_table=dataset,
                             columns=[
                                 'acc_phone_x', 'gyr_phone_x', 'hr_watch_rate',
                                 'light_phone_lux', 'mag_phone_x',
                                 'press_phone_', 'pca_1', 'label'
                             ],
                             match=[
                                 'like', 'like', 'like', 'like', 'like',
                                 'like', 'like', 'like'
                             ],
                             display=[
                                 'line', 'line', 'line', 'line', 'line',
                                 'line', 'line', 'points'
                             ])

        # Store the generated dataset
        dataset.to_csv(DATA_PATH / RESULT_FNAME)