def main():
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
    RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_outliers.csv'

    # Next, import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise e

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance using the first two rows.
    milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        # And try out all different approaches. Note that we have done some optimization
        # of the parameter values for each of the approaches by visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        DataViz.plot_binary_outliers(dataset, col, col + '_outlier')

        dataset = OutlierDistr.mixture_model(dataset, col)
        DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                             ['exact', 'exact'], ['line', 'points'])

        # This requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
            DataViz.plot_dataset(dataset, [col, 'lof'],
                                 ['exact', 'exact'], ['line', 'points'])
        except MemoryError:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove all the stuff from the dataset again.
        cols_to_remove = [col + '_outlier', col + '_mixture',
                          'simple_dist_outlier', 'lof']
        for to_remove in cols_to_remove:
            if to_remove in dataset:
                del dataset[to_remove]

    # We take Chauvenet's criterion and apply it to all but the label data...
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
        del dataset[col + '_outlier']

    dataset.to_csv(DATA_PATH / RESULT_FNAME)
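
# For reference, a minimal sketch of what a Chauvenet-style detector could look like.
# The real DistributionBasedOutlierDetection.chauvenet from the course codebase may
# differ; the assumed interface here (add a boolean '<col>_outlier' column, constant
# C with 2 as the classical choice) is an assumption made for illustration only.
from scipy.stats import norm


def chauvenet_sketch(data_table, col, C=2):
    # Fit a single normal distribution to the column.
    mean = data_table[col].mean()
    std = data_table[col].std()
    N = len(data_table.index)
    # Chauvenet's criterion: reject a point whose two-sided tail probability falls
    # below 1 / (C * N); C=2 gives the textbook criterion of an expected count of 0.5.
    criterion = 1.0 / (C * N)
    deviation = abs(data_table[col] - mean) / std
    prob = 2.0 * norm.sf(deviation)  # two-sided tail probability under the fit
    data_table[col + '_outlier'] = prob < criterion
    return data_table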
def main():
    # Import the data from the specified location; the index already holds the timestamps.
    with open('datasets/dataframes/df_concat_with_labels.pkl', 'rb') as f:
        dataset = pickle.load(f)

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__, show=False)

    # Compute the number of milliseconds covered by an instance using the first two rows.
    milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = [
        "Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)",
        "Magnetic field x (µT)", "Magnetic field y (µT)", "Magnetic field z (µT)",
        "Gyroscope x (rad/s)", "Gyroscope y (rad/s)", "Gyroscope z (rad/s)",
        "Linear Acceleration x (m/s^2)", "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)",
    ]
    print(dataset.columns)

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        # And try out all different approaches. Note that we have done some optimization
        # of the parameter values for each of the approaches by visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        print(dataset.shape)
        # DataViz.plot_binary_outliers(dataset, col, col + '_outlier')

        dataset = OutlierDistr.mixture_model(dataset, col)
        print(dataset.shape)
        # DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact', 'exact'], ['line', 'points'])

        # This requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
            # DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
            print(dataset['simple_dist_outlier'].mean())
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
            # DataViz.plot_dataset(dataset, [col, 'lof'], ['exact', 'exact'], ['line', 'points'])
        except MemoryError:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove all the stuff from the dataset again.
        cols_to_remove = [col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof']
        for to_remove in cols_to_remove:
            if to_remove in dataset:
                del dataset[to_remove]

    # We take Chauvenet's criterion and apply it to all but the label data...
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
        del dataset[col + '_outlier']

    with open('concat_outliers.pkl', 'wb') as f:
        pickle.dump(dataset, f)
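
# A minimal sketch of the mixture-model step called above, assuming the semantics are
# "fit a Gaussian mixture to the column and store each point's density under the fitted
# model in '<col>_mixture' (lower density = more outlying)". The real
# DistributionBasedOutlierDetection.mixture_model may use different settings; the
# function name and defaults below are assumptions, and the column is assumed NaN-free.
import numpy as np
from sklearn.mixture import GaussianMixture


def mixture_model_sketch(data_table, col, n_components=3):
    values = data_table[col].to_numpy(dtype=float).reshape(-1, 1)
    gmm = GaussianMixture(n_components=n_components, max_iter=100)
    gmm.fit(values)
    # score_samples returns per-point log-likelihoods; exponentiate to get densities.
    data_table[col + '_mixture'] = np.exp(gmm.score_samples(values))
    return data_table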
def main():
    # Import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FILENAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise e

    # Create an instance of the visualization class to plot the results.
    DataViz = VisualizeDataset(module_path=__file__)

    # Create the outlier classes.
    OutlierDistribution = DistributionBasedOutlierDetection()
    OutlierDistance = DistanceBasedOutlierDetection()

    # Step 1: Depending on the selected mode, see whether there are outliers we would prefer to remove.

    # Set the columns to experiment on.
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    if FLAGS.mode == 'chauvenet':
        # Investigate the outlier columns using Chauvenet's criterion.
        for col in outlier_columns:
            print(f"Applying Chauvenet outlier criteria for column {col}")
            dataset = OutlierDistribution.chauvenet(data_table=dataset, col=col)
            DataViz.plot_binary_outliers(data_table=dataset, col=col, outlier_col=f'{col}_outlier')

    elif FLAGS.mode == 'mixture':
        # Investigate the outlier columns using mixture models.
        for col in outlier_columns:
            print(f"Applying mixture model for column {col}")
            dataset = OutlierDistribution.mixture_model(data_table=dataset, col=col, components=3)
            DataViz.plot_dataset(data_table=dataset, columns=[col, f'{col}_mixture'],
                                 match=['exact', 'exact'], display=['line', 'points'])

    elif FLAGS.mode == 'distance':
        for col in outlier_columns:
            print(f"Applying distance based outlier detection for column {col}")
            # This step requires:
            # n_data_points * n_data_points * point_size = 31839 * 31839 * 32 bits = ~4GB available memory
            try:
                dataset = OutlierDistance.simple_distance_based(
                    data_table=dataset, cols=[col], d_function='euclidean',
                    d_min=FLAGS.dmin, f_min=FLAGS.fmin)
                DataViz.plot_binary_outliers(data_table=dataset, col=col, outlier_col='simple_dist_outlier')
            except MemoryError:
                print('Not enough memory available for simple distance-based outlier detection...')
                print('Skipping.')

    elif FLAGS.mode == 'LOF':
        for col in outlier_columns:
            print(f"Applying local outlier factor for column {col}")
            try:
                dataset = OutlierDistance.local_outlier_factor(
                    data_table=dataset, cols=[col], d_function='euclidean', k=FLAGS.K)
                DataViz.plot_dataset(data_table=dataset, columns=[col, 'lof'],
                                     match=['exact', 'exact'], display=['line', 'points'])
            except MemoryError:
                print('Not enough memory available for local outlier factor...')
                print('Skipping.')

    elif FLAGS.mode == 'final':
        # Take Chauvenet's criterion and apply it to all but the label columns in the main dataset.
        for col in [c for c in dataset.columns if 'label' not in c]:
            print(f'Measurement is now: {col}')
            dataset = OutlierDistribution.chauvenet(data_table=dataset, col=col)
            dataset.loc[dataset[f'{col}_outlier'], col] = np.nan
            del dataset[col + '_outlier']

        dataset.to_csv(DATA_PATH / RESULT_FILENAME)
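
# The FLAGS object and path constants used above are defined elsewhere in the script.
# A plausible module-level setup is sketched below; the argument names mirror the
# FLAGS attributes referenced in main(), but the defaults, file names, and help texts
# are assumptions, not the original values.
import argparse
from pathlib import Path

DATA_PATH = Path('./intermediate_datafiles/')      # assumed location
DATASET_FILENAME = 'chapter2_result.csv'           # assumed input name
RESULT_FILENAME = 'chapter3_result_outliers.csv'   # assumed output name

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='final',
                        choices=['chauvenet', 'mixture', 'distance', 'LOF', 'final'],
                        help='Which outlier detection approach to run')
    parser.add_argument('--dmin', type=float, default=0.10,
                        help='Distance threshold for simple distance-based detection')
    parser.add_argument('--fmin', type=float, default=0.99,
                        help='Fraction threshold for simple distance-based detection')
    parser.add_argument('--K', type=int, default=5,
                        help='Number of neighbours for the local outlier factor')
    FLAGS, unparsed = parser.parse_known_args()
    main()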
class OutlierExperiment:

    def __init__(self, data_path, data_file):
        self.dataset = pd.read_csv(Path(data_path / data_file), index_col=0)
        self.dataset.index = pd.to_datetime(self.dataset.index)
        self.DataViz = VisualizeDataset(__file__, show=False)
        self.outlier_columns = ['acc_phone_x', 'light_phone_lux']
        self.OutlierDistr = DistributionBasedOutlierDetection()
        self.OutlierDist = DistanceBasedOutlierDetection()
        self.original_columns = self.dataset.columns
        self.num_outliers = {'acc_phone_x': 0, 'light_phone_lux': 0}

    def remove_columns(self):
        # Drop any columns added by the detectors so the next experiment starts clean.
        for to_remove in self.dataset.columns:
            if to_remove not in self.original_columns:
                del self.dataset[to_remove]

    def chauvenet(self, C):
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.chauvenet(self.dataset, col, C)
            self.DataViz.plot_binary_outliers(self.dataset, col, col + '_outlier')
            self.num_outliers[col] = self.dataset[
                self.dataset[col + '_outlier'] == 1][col].size / self.dataset[col].size

    def mixture_model(self, n):
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.mixture_model(self.dataset, col, n)
            self.DataViz.plot_dataset(self.dataset, [col, col + '_mixture'],
                                      ['exact', 'exact'], ['line', 'points'])
            self.num_outliers[col] = self.dataset[col + '_mixture'].sum() / self.dataset[col + '_mixture'].size
            print(self.dataset[col + '_mixture'].max())
            if self.num_outliers[col] > 1:
                print(self.dataset[col + '_mixture'])

    def simple_distance_based(self, d_min, f_min):
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.simple_distance_based(
                self.dataset, [col], 'euclidean', d_min, f_min)
            self.DataViz.plot_binary_outliers(self.dataset, col, 'simple_dist_outlier')
            self.num_outliers[col] = self.dataset[
                self.dataset['simple_dist_outlier'] == 1][col].size / self.dataset[col].size
        self.remove_columns()

    def local_outlier_factor(self, k):
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.local_outlier_factor(
                self.dataset, [col], 'euclidean', k)
            self.DataViz.plot_dataset(self.dataset, [col, 'lof'],
                                      ['exact', 'exact'], ['line', 'points'])
            self.num_outliers[col] = self.dataset[
                self.dataset['lof'] == 1][col].size / self.dataset[col].size
        self.remove_columns()
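
# A hypothetical driver for the experiment class above: sweep the Chauvenet constant C
# and compare the resulting outlier fractions per column. The data path, file name, and
# C values are assumptions chosen to match the scripts earlier in this chapter.
if __name__ == '__main__':
    experiment = OutlierExperiment(Path('./intermediate_datafiles/'), 'chapter2_result.csv')
    for C in [1, 2, 10]:
        experiment.chauvenet(C)
        print(f'C={C}, fraction of outliers per column: {experiment.num_outliers}')
        experiment.remove_columns()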
# n_data_points * n_data_points * point_size =
# 31839 * 31839 * 64 bits = ~8GB available memory
try:
    dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
    DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
except MemoryError:
    print('Not enough memory available for simple distance-based outlier detection...')
    print('Skipping.')

try:
    dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
    DataViz.plot_dataset(dataset, [col, 'lof'], ['exact', 'exact'], ['line', 'points'])
except MemoryError:
    print('Not enough memory available for lof...')
    print('Skipping.')

# Remove all the stuff from the dataset again.
cols_to_remove = [col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof']
for to_remove in cols_to_remove:
    if to_remove in dataset:
        del dataset[to_remove]

# We take Chauvenet's criterion and apply it to all but the label data...
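
# The memory estimate above comes from materialising the full pairwise distance matrix.
# Below is a minimal sketch of the simple distance-based criterion being called, under
# the assumed semantics "a point is an outlier if more than a fraction f_min of all
# points lies further than d_min away, after min-max normalising the columns"; the
# course implementation may differ, and the function name is hypothetical.
import numpy as np
from scipy.spatial.distance import pdist, squareform


def simple_distance_based_sketch(data_table, cols, d_min=0.10, f_min=0.99):
    # Min-max normalise so that d_min is independent of the sensor's scale.
    X = data_table[cols].to_numpy(dtype=float)
    X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    # Full n x n Euclidean distance matrix: this is the O(n^2) memory cost.
    distances = squareform(pdist(X, metric='euclidean'))
    far_fraction = (distances > d_min).sum(axis=1) / len(X)
    data_table['simple_dist_outlier'] = far_fraction > f_min
    return data_table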
                     ['exact', 'exact'], ['line', 'points'])

# This requires:
# n_data_points * n_data_points * point_size =
# 31839 * 31839 * 64 bits = ~8GB available memory
try:
    dataset_cs = OutlierDist.simple_distance_based(dataset_cs, [col], 'euclidean', dmin, fmin)
    DataViz.plot_binary_outliers(dataset_cs, col, 'simple_dist_outlier')
except MemoryError:
    print('Not enough memory available for simple distance-based outlier detection...')
    print('Skipping.')

try:
    dataset_cs = OutlierDist.local_outlier_factor(dataset_cs, [col], 'euclidean', k)
    DataViz.plot_dataset(dataset_cs, [col, 'lof'], ['exact', 'exact'], ['line', 'points'])
except MemoryError:
    print('Not enough memory available for lof...')
    print('Skipping.')

# Remove all the stuff from the dataset again.
cols_to_remove = [col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof']
for to_remove in cols_to_remove:
    if to_remove in dataset_cs:
        del dataset_cs[to_remove]

# We take Chauvenet's criterion and apply it to all but the label data...
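
# For the local outlier factor step, a lighter-weight sketch is possible with
# scikit-learn. This substitutes sklearn.neighbors.LocalOutlierFactor for the course's
# own implementation, so the resulting 'lof' scores are comparable in spirit but not
# guaranteed to be identical; the function name and the assumption that the selected
# columns contain no NaNs are illustration-only.
from sklearn.neighbors import LocalOutlierFactor


def local_outlier_factor_sketch(data_table, cols, k=5):
    lof = LocalOutlierFactor(n_neighbors=k, metric='euclidean')
    lof.fit(data_table[cols].to_numpy(dtype=float))
    # negative_outlier_factor_ holds -LOF; flip the sign so larger means more outlying.
    data_table['lof'] = -lof.negative_outlier_factor_
    return data_table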