def _ask_numbers(prompt, casts):
    """Prompt on stdin and parse one number per entry of ``casts``.

    The answer may separate values with commas and/or whitespace, so both
    ``"3, 100"`` and ``"3 100"`` are accepted.

    Args:
        prompt: Text shown to the user.
        casts: Sequence of callables (e.g. ``int``, ``float``), one per
            expected value, applied in order.

    Returns:
        List with the converted values, in the order they were typed.

    Raises:
        ValueError: If the number of values differs from ``len(casts)`` or a
            value cannot be converted.
    """
    raw = input(prompt)
    tokens = raw.replace(',', ' ').split()
    if len(tokens) != len(casts):
        raise ValueError(
            f'expected {len(casts)} value(s), got {len(tokens)}: {raw!r}')
    return [cast(token) for cast, token in zip(casts, tokens)]


def main():
    """Interactively try one outlier detector, then run the default sweep.

    Loads ``./intermediate_datafiles/chapter2_result.csv``, asks which method
    to run (1 = Chauvenet, 2 = mixture model, 3 = simple distance-based,
    4 = local outlier factor) together with its parameters, applies it to the
    columns ``acc_phone_x`` and ``light_phone_lux`` and plots the result.
    Afterwards every column is run through Chauvenet, the mixture model and
    the simple distance-based detector with their default parameters.

    Raises:
        ValueError: If the parameters typed at a prompt cannot be parsed.
    """
    dataset_path = './intermediate_datafiles/'
    dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
    outlier_columns = ['acc_phone_x', 'light_phone_lux']
    DataViz = VisualizeDataset()
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # Index.to_datetime() no longer exists in pandas; pd.to_datetime is the
    # supported way to parse the index.
    dataset.index = pd.to_datetime(dataset.index)

    # input() returns a string on Python 3, so compare against strings.
    start = input("choose method: [1],[2],[3],[4]").strip()
    if start == '1':
        (param,) = _ask_numbers("Chauvenet\ninput parameters: c", [float])
        for col in outlier_columns:
            dataset = OutlierDistr.chauvenet(dataset, col, param)
            DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
    elif start == '2':
        # ``iterations`` instead of ``iter`` to avoid shadowing the builtin.
        components, iterations = _ask_numbers(
            "Mixture model\n input parameters: components, iter", [int, int])
        for col in outlier_columns:
            dataset = OutlierDistr.mixture_model(dataset, col, components, iterations)
            DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                                 ['exact', 'exact'], ['line', 'points'])
    elif start == '3':
        d_min, f_min = _ask_numbers(
            "Simple distance-based\n input parameters: d_min, f_min", [float, float])
        for col in outlier_columns:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', d_min, f_min)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
    elif start == '4':
        # The answer used to be stored in ``param`` while the call below used
        # the undefined name ``k``.
        (k,) = _ask_numbers("Local outlier factor\n input parameters: k", [int])
        for col in outlier_columns:
            # Pass the column as a list, like every other distance-based call.
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', k)
            DataViz.plot_dataset(dataset, [col, 'lof'],
                                 ['exact', 'exact'], ['line', 'points'])
    else:
        print("no method selected")

    for col in outlier_columns:
        # And try out all different approaches. Note that we have done some optimization
        # of the parameter values for each of the approaches by visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        # print 'chauvenet', col
        # DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
        dataset = OutlierDistr.mixture_model(dataset, col)
        # DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])

        # This requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 64 bits = ~8GB available memory
        try:
            print("trying")
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
            print("plot")
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        # Local outlier factor step, currently disabled:
        # try:
        #     print("2nd trying")
        #     dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
        #     print("2nd plot")
        #     DataViz.plot_dataset(dataset, [col, 'lof'], ['exact','exact'], ['line', 'points'])
        # except MemoryError as e:
        #     print('Not enough memory available for lof...')
        #     print('Skipping.')

        # Remove all the stuff from the dataset again.
        # NOTE(review): only the announcement is visible here; confirm that the
        # actual column cleanup follows.
        print("we will remove")
def main():
    """Explore outlier detectors on two columns, then clean the whole dataset.

    The input and output file names default to ``chapter2_result.csv`` and
    ``chapter3_result_outliers.csv`` inside ``./intermediate_datafiles/`` and
    can be overridden by the first and second command-line argument.

    For ``acc_phone_x`` and ``light_phone_lux`` every detector is applied and
    plotted, after which the helper columns are dropped again. Finally
    Chauvenet's criterion is applied to every column whose name does not
    contain ``label``; flagged values are replaced by NaN and the result is
    written to the output file.

    Raises:
        IOError: Re-raised after a hint is printed when loading the input
            file fails.
    """
    # Resolve file names and locations.
    data_dir = Path('./intermediate_datafiles/')
    input_name = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
    output_name = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_outliers.csv'

    # Load the data and turn the index into timestamps.
    try:
        table = pd.read_csv(Path(data_dir / input_name), index_col=0)
        table.index = pd.to_datetime(table.index)
    except IOError as err:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise err

    # Plotting helper for all figures below.
    plotter = VisualizeDataset(__file__)

    # Milliseconds covered by one instance, derived from the first two rows
    # (computed here but not used further in this function).
    first_gap = table.index[1] - table.index[0]
    milliseconds_per_instance = first_gap.microseconds / 1000

    # Step 1: inspect a couple of columns for outliers we may want to remove.
    columns_to_inspect = ['acc_phone_x', 'light_phone_lux']
    distribution_detector = DistributionBasedOutlierDetection()
    distance_detector = DistanceBasedOutlierDetection()

    for col in columns_to_inspect:
        print(f"Applying outlier criteria for column {col}")
        chauvenet_col = col + '_outlier'
        mixture_col = col + '_mixture'

        # Try all approaches; the parameter values were tuned by visual
        # inspection.
        table = distribution_detector.chauvenet(table, col)
        plotter.plot_binary_outliers(table, col, chauvenet_col)

        table = distribution_detector.mixture_model(table, col)
        plotter.plot_dataset(table, [col, mixture_col],
                             ['exact', 'exact'], ['line', 'points'])

        # The distance-based approaches need roughly
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB of available memory.
        try:
            table = distance_detector.simple_distance_based(
                table, [col], 'euclidean', 0.10, 0.99)
            plotter.plot_binary_outliers(table, col, 'simple_dist_outlier')
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        try:
            table = distance_detector.local_outlier_factor(table, [col], 'euclidean', 5)
            plotter.plot_dataset(table, [col, 'lof'],
                                 ['exact', 'exact'], ['line', 'points'])
        except MemoryError:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Drop whatever helper columns the detectors added for this column.
        for helper_col in (chauvenet_col, mixture_col, 'simple_dist_outlier', 'lof'):
            if helper_col in table:
                del table[helper_col]

    # Final pass: Chauvenet's criterion on everything except the label data.
    measurement_cols = [name for name in table.columns if 'label' not in name]
    for col in measurement_cols:
        print(f'Measurement is now: {col}')
        table = distribution_detector.chauvenet(table, col)
        flag_col = f'{col}_outlier'
        table.loc[table[flag_col] == True, col] = np.nan  # noqa: E712
        del table[flag_col]

    table.to_csv(data_dir / output_name)
# Step 1: Let us see whether we have some outliers we would prefer to remove. # Determine the columns we want to experiment on. outlier_columns = ['acc_y', 'lin_acc_x'] # Create the outlier classes. OutlierDistr = DistributionBasedOutlierDetection() OutlierDist = DistanceBasedOutlierDetection() # And investigate the approaches for all relevant attributes. for col in outlier_columns: # And try out all different approaches. Note that we have done some optimization # of the parameter values for each of the approaches by visual inspection. dataset = OutlierDistr.chauvenet(dataset, col) DataViz.plot_binary_outliers(dataset, col, col + '_outlier', 'Chauvenets criterion') dataset = OutlierDistr.mixture_model(dataset, col) DataViz.plot_dataset(dataset, [col, col + '_mixture'], 'Mixture models', ['exact', 'exact'], ['line', 'points']) # This requires: # n_data_points * n_data_points * point_size = # 31839 * 31839 * 64 bits = ~8GB available memory # try: # dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99) # DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier', 'Simple distance-based approach') # except MemoryError as e: # print('Not enough memory available for simple distance-based outlier detection...') # print('Skipping.') # # try: # dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
def _inspect_chauvenet(dataset, columns, detector, viz):
    """Apply Chauvenet's criterion to each column and plot the flagged points."""
    for col in columns:
        print(f"Applying chauvenet outlier criteria for column {col}")
        dataset = detector.chauvenet(data_table=dataset, col=col)
        viz.plot_binary_outliers(data_table=dataset, col=col, outlier_col=f'{col}_outlier')
    return dataset


def _inspect_mixture(dataset, columns, detector, viz):
    """Fit a three-component mixture model per column and plot the result."""
    for col in columns:
        print(f"Applying mixture model for column {col}")
        dataset = detector.mixture_model(data_table=dataset, col=col, components=3)
        viz.plot_dataset(data_table=dataset, columns=[col, f'{col}_mixture'],
                         match=['exact', 'exact'], display=['line', 'points'])
    return dataset


def _inspect_distance(dataset, columns, detector, viz):
    """Run the simple distance-based detector per column, skipping on MemoryError."""
    for col in columns:
        print(f"Applying distance based outlier detection for column {col}")
        # This step requires:
        # n_data_points * n_data_points * point_size = 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            dataset = detector.simple_distance_based(
                data_table=dataset, cols=[col], d_function='euclidean',
                d_min=FLAGS.dmin, f_min=FLAGS.fmin)
            viz.plot_binary_outliers(data_table=dataset, col=col,
                                     outlier_col='simple_dist_outlier')
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')
    return dataset


def _inspect_lof(dataset, columns, detector, viz):
    """Run the local outlier factor per column, skipping on MemoryError."""
    for col in columns:
        print(f"Applying Local outlier factor for column {col}")
        try:
            dataset = detector.local_outlier_factor(
                data_table=dataset, cols=[col], d_function='euclidean', k=FLAGS.K)
            viz.plot_dataset(data_table=dataset, columns=[col, 'lof'],
                             match=['exact', 'exact'], display=['line', 'points'])
        except MemoryError:
            print('Not enough memory available for local outlier factor...')
            print('Skipping.')
    return dataset


def main():
    """Run the outlier step selected by ``FLAGS.mode``.

    The modes ``'chauvenet'``, ``'mixture'``, ``'distance'`` and ``'LOF'``
    apply the corresponding detector to ``acc_phone_x`` and
    ``light_phone_lux`` and plot the outcome. The mode ``'final'`` applies
    Chauvenet's criterion to every column whose name does not contain
    ``label``, replaces flagged values by NaN and writes the table to
    ``DATA_PATH / RESULT_FILENAME``. Any other mode does nothing after
    loading the data.

    Raises:
        IOError: Re-raised after a hint is printed when loading the input
            file fails.
    """
    # Load the data and parse the date index.
    try:
        source_file = Path(DATA_PATH / DATASET_FILENAME)
        dataset = pd.read_csv(source_file, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise e

    # Plotting helper and the two detector families.
    viz = VisualizeDataset(module_path=__file__)
    distribution_detector = DistributionBasedOutlierDetection()
    distance_detector = DistanceBasedOutlierDetection()

    # Step 1 (exploratory modes): columns to experiment on.
    experiment_columns = ['acc_phone_x', 'light_phone_lux']

    mode = FLAGS.mode
    exploratory_modes = {
        'chauvenet': (_inspect_chauvenet, distribution_detector),
        'mixture': (_inspect_mixture, distribution_detector),
        'distance': (_inspect_distance, distance_detector),
        'LOF': (_inspect_lof, distance_detector),
    }
    if mode in exploratory_modes:
        inspect, detector = exploratory_modes[mode]
        inspect(dataset, experiment_columns, detector, viz)
        return

    if mode != 'final':
        return

    # Final mode: Chauvenet's criterion on all but the label columns.
    measurement_cols = [name for name in dataset.columns if 'label' not in name]
    for col in measurement_cols:
        print(f'Measurement is now: {col}')
        dataset = distribution_detector.chauvenet(data_table=dataset, col=col)
        dataset.loc[dataset[f'{col}_outlier'], col] = np.nan
        del dataset[col + '_outlier']

    dataset.to_csv(DATA_PATH / RESULT_FILENAME)
class OutlierExperiment:
    """Run individual outlier detectors on a dataset and record how much they flag.

    Each experiment method applies one detector to every column in
    ``outlier_columns``, plots the outcome (plots are created with
    ``show=False``) and stores one number per column in ``num_outliers``.
    Despite its name, ``num_outliers`` holds fractions, not counts.
    """

    def __init__(self, data_path, data_file):
        """Load ``data_file`` from ``data_path`` and set up the detectors.

        Args:
            data_path: Directory of the dataset; a ``str`` or ``Path``.
            data_file: File name of the CSV file; its first column is used
                as the index and parsed as timestamps.
        """
        # Path(data_path) also accepts plain strings for the directory.
        self.dataset = pd.read_csv(Path(data_path) / data_file, index_col=0)
        self.dataset.index = pd.to_datetime(self.dataset.index)
        self.DataViz = VisualizeDataset(__file__, show=False)
        self.outlier_columns = ['acc_phone_x', 'light_phone_lux']
        self.OutlierDistr = DistributionBasedOutlierDetection()
        self.OutlierDist = DistanceBasedOutlierDetection()
        # Columns present right after loading; remove_columns() restores this set.
        self.original_columns = self.dataset.columns
        self.num_outliers = {'acc_phone_x': 0, 'light_phone_lux': 0}

    def _flagged_fraction(self, flag_col):
        """Return the share of rows whose ``flag_col`` value equals 1.

        Returns 0.0 for an empty dataset instead of dividing by zero.
        """
        flags = self.dataset[flag_col]
        if flags.size == 0:
            return 0.0
        return int((flags == 1).sum()) / flags.size

    def remove_columns(self):
        """Delete every column that was not part of the loaded dataset."""
        added = [name for name in self.dataset.columns
                 if name not in self.original_columns]
        for name in added:
            del self.dataset[name]

    def chauvenet(self, C):
        """Apply Chauvenet's criterion with parameter ``C`` to each column.

        Stores the fraction of rows flagged in ``<col>_outlier`` in
        ``num_outliers[col]``. The ``<col>_outlier`` columns are left in the
        dataset; call ``remove_columns()`` to drop them.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.chauvenet(self.dataset, col, C)
            self.DataViz.plot_binary_outliers(self.dataset, col, col + '_outlier')
            self.num_outliers[col] = self._flagged_fraction(col + '_outlier')

    def mixture_model(self, n):
        """Apply the mixture model detector with parameter ``n`` to each column.

        Stores the mean of the ``<col>_mixture`` column in
        ``num_outliers[col]`` and prints that column's maximum. The
        ``<col>_mixture`` columns are left in the dataset; call
        ``remove_columns()`` to drop them.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.mixture_model(self.dataset, col, n)
            mixture_col = col + '_mixture'
            self.DataViz.plot_dataset(self.dataset, [col, mixture_col],
                                      ['exact', 'exact'], ['line', 'points'])
            scores = self.dataset[mixture_col]
            self.num_outliers[col] = scores.sum() / scores.size
            print(scores.max())
            # Diagnostic output: dump the scores when their mean exceeds 1.
            if self.num_outliers[col] > 1:
                print(scores)

    def simple_distance_based(self, d_min, f_min):
        """Apply the simple distance-based detector (euclidean) to each column.

        Stores the fraction of rows flagged in ``simple_dist_outlier`` in
        ``num_outliers[col]`` and removes all helper columns afterwards.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.simple_distance_based(
                self.dataset, [col], 'euclidean', d_min, f_min)
            self.DataViz.plot_binary_outliers(self.dataset, col, 'simple_dist_outlier')
            self.num_outliers[col] = self._flagged_fraction('simple_dist_outlier')
        self.remove_columns()

    def local_outlier_factor(self, k):
        """Apply the local outlier factor (euclidean, parameter ``k``) to each column.

        Stores the fraction of rows whose ``lof`` value equals 1 in
        ``num_outliers[col]`` and removes all helper columns afterwards.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.local_outlier_factor(
                self.dataset, [col], 'euclidean', k)
            self.DataViz.plot_dataset(self.dataset, [col, 'lof'],
                                      ['exact', 'exact'], ['line', 'points'])
            # NOTE(review): counts exact matches of 'lof' against 1 -- confirm
            # this is the intended outlier definition for LOF scores.
            self.num_outliers[col] = self._flagged_fraction('lof')
        self.remove_columns()
def outliers(data_file, save_file, sub_path):
    """Explore outlier detectors on the accelerometer and gyroscope columns.

    Reads ``data_file``, indexes it by its parsed ``timestamp`` column and,
    for each of the six sensor columns, applies Chauvenet's criterion, the
    mixture model and the simple distance-based detector. Every result is
    plotted (without showing the figure) with a ``save_path`` below
    ``sub_path``. The helper columns are dropped again and the table is
    written to ``save_file``.

    Args:
        data_file: CSV file to read; must contain a ``timestamp`` column.
        save_file: Destination of the resulting CSV file.
        sub_path: Prefix for the plot ``save_path`` arguments; the suffixes
            ``/chauvenet``, ``/mixture`` and ``/simple_dist`` are appended
            with ``+``.
    """
    # Plotting helper; figures are not shown interactively.
    viz = VisualizeDataset(__file__, show=False)

    # Load the data and index it by the parsed timestamp column.
    table = pd.read_csv(data_file)
    table.index = pd.to_datetime(table['timestamp'])

    # Milliseconds covered by one instance, derived from the first two rows
    # (computed here but not used further in this function).
    first_gap = table.index[1] - table.index[0]
    milliseconds_per_instance = first_gap.microseconds/1000

    # Step 1: the columns we want to inspect for outliers.
    sensor_columns = [
        "Acceleration x (m/s^2)",
        "Acceleration y (m/s^2)",
        "Acceleration z (m/s^2)",
        "Gyroscope x (rad/s)",
        "Gyroscope y (rad/s)",
        "Gyroscope z (rad/s)",
    ]
    print(table.columns)

    distribution_detector = DistributionBasedOutlierDetection()
    distance_detector = DistanceBasedOutlierDetection()

    for col in sensor_columns:
        print(f"Applying outlier criteria for column {col}")
        chauvenet_col = col + '_outlier'
        mixture_col = col + '_mixture'

        # Try the different approaches; the parameter values were tuned by
        # visual inspection.
        table = distribution_detector.chauvenet(table, col)
        viz.plot_binary_outliers(table, col, chauvenet_col,
                                 save_path=sub_path+'/chauvenet')

        table = distribution_detector.mixture_model(table, col)
        viz.plot_dataset(table, [col, mixture_col], ['exact','exact'],
                         ['line', 'points'], save_path=sub_path+'/mixture')
        print('data shape: ', table.shape)

        # The distance-based approach needs roughly
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB of available memory.
        try:
            table = distance_detector.simple_distance_based(
                table, [col], 'euclidean', 0.10, 0.99)
            viz.plot_binary_outliers(table, col, 'simple_dist_outlier',
                                     save_path=sub_path+'/simple_dist')
            print(table['simple_dist_outlier'].mean())
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        # Local outlier factor step, currently disabled:
        # try:
        #     dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
        #     DataViz.plot_dataset(dataset, [col, 'lof'], ['exact','exact'], ['line', 'points'], save_path=sub_path+'/lof')
        # except MemoryError as e:
        #     print('Not enough memory available for lof...')
        #     print('Skipping.')

        # Drop the helper columns added for this sensor column.
        for helper_col in (chauvenet_col, mixture_col, 'simple_dist_outlier'):
            if helper_col in table:
                del table[helper_col]

    # Final Chauvenet filtering of all non-label columns, currently disabled:
    # for col in [c for c in dataset.columns if not 'label' in c]:
    #     print(f'Measurement is now: {col}')
    #     dataset = OutlierDistr.chauvenet(dataset, col)
    #     dataset.loc[dataset[f'{col}_mixture'] == True, col] = np.nan
    #     del dataset[col + '_outlier']

    table.to_csv(save_file)
# Mixture models NumDist = 3 # given was 3 # Simple Distance dmin = 0.10 # given was 0.10 fmin = 0.99 # given was 0.99 # Local outlier factor k = 5 # given was 5 ##### Outlier filtering for the CS dataset ##### #And investigate the approaches for all relevant attributes. for col in outlier_columns: # And try out all different approaches. Note that we have done some optimization # of the parameter values for each of the approaches by visual inspection. dataset_cs = OutlierDistr.chauvenet(dataset_cs, col, constant) DataViz.plot_binary_outliers(dataset_cs, col, col + '_outlier') dataset_cs = OutlierDistr.mixture_model(dataset_cs, col, NumDist) DataViz.plot_dataset(dataset_cs, [col, col + '_mixture'], ['exact', 'exact'], ['line', 'points']) # This requires: # n_data_points * n_data_points * point_size = # 31839 * 31839 * 64 bits = ~8GB available memory try: dataset_cs = OutlierDist.simple_distance_based(dataset_cs, [col], 'euclidean', dmin, fmin) DataViz.plot_binary_outliers(dataset_cs, col, 'simple_dist_outlier') except MemoryError as e: print( 'Not enough memory available for simple distance-based outlier detection...' ) print('Skipping.')