# Determine the columns we want to experiment on. #outlier_columns = ['acc_phone_x','acc_phone_y','acc_phone_z','gyr_phone_x','gyr_phone_y','gyr_phone_z','light_phone_lux','mag_phone_x','mag_phone_y','mag_phone_z'] outlier_columns = ['light_phone_lux'] # Create the outlier classes. OutlierDistr = DistributionBasedOutlierDetection() OutlierDist = DistanceBasedOutlierDetection() #And investigate the approaches for all relevant attributes. for col in outlier_columns: # And try out all different approaches. Note that we have done some optimization # of the parameter values for each of the approaches by visual inspection. dataset = OutlierDistr.chauvenet(dataset, col) #print 'chauvenet', col #DataViz.plot_binary_outliers(dataset, col, col + '_outlier') dataset = OutlierDistr.mixture_model(dataset, col) #DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points']) # This requires: # n_data_points * n_data_points * point_size = # 31839 * 31839 * 64 bits = ~8GB available memory try: print("trying") dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99) print("plot") DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier') except MemoryError as e: print('Not enough memory available for simple distance-based outlier detection...') print('Skipping.') '''try: print("2nd trying")
def main():
    """Run the outlier experiments and Chauvenet cleaning for the AS14 users.

    Without command-line arguments, every user id in 1..33 is processed:
    the input is ``./intermediate_datafiles/AS14_<id>/chapter2_result.csv``
    and the cleaned result is written next to it as
    ``AS14_<id>/chapter3_result_outliers.csv``, so users no longer overwrite
    each other's output.

    With ``sys.argv[1]`` (input file) and optionally ``sys.argv[2]`` (result
    file), both relative to ``./intermediate_datafiles/``, only that single
    file is processed, once.

    Raises:
        IOError: re-raised when an input file cannot be read.
    """
    DATA_PATH = Path('./intermediate_datafiles/')
    default_result = 'chapter3_result_outliers.csv'

    # Build the list of (input file, result file) pairs to process.
    if len(sys.argv) > 1:
        # An explicit input is the same file for every user, so run it once.
        result_name = sys.argv[2] if len(sys.argv) > 2 else default_result
        jobs = [(sys.argv[1], result_name)]
    else:
        jobs = []
        for user in range(1, 34):
            user_dir = f'AS14_{user:02d}'
            jobs.append((f'{user_dir}/chapter2_result.csv',
                         f'{user_dir}/{default_result}'))

    # The columns we want to experiment on.
    outlier_columns = [
        'actvalue', 'builtvalue', 'commvalue', 'entvalue', 'accvalue',
        'offvalue', 'othervalue', 'socialvalue', 'travelvalue', 'unkvalue',
        'utilvalue', 'callvalue', 'arovalue', 'valvalue', 'scrvalue',
        'smsvalue',
    ]

    for DATASET_FNAME, RESULT_FNAME in jobs:
        # Import the data from the specified location and parse the date index.
        try:
            dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
            dataset.index = pd.to_datetime(dataset.index)
        except IOError:
            print('File not found, try to run the preceding crowdsignals scripts first!')
            raise

        # Visualization helper; only needed when the plot calls below are
        # re-enabled, but still instantiated as in the original script.
        DataViz = VisualizeDataset(__file__)

        # Create the outlier classes.
        OutlierDistr = DistributionBasedOutlierDetection()
        OutlierDist = DistanceBasedOutlierDetection()

        # Step 1: try the different approaches on all relevant attributes.
        for col in outlier_columns:
            print(f"Applying outlier criteria for column {col}")

            dataset = OutlierDistr.chauvenet(dataset, col)
            # DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
            dataset = OutlierDistr.mixture_model(dataset, col)
            # DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])

            # This requires:
            # n_data_points * n_data_points * point_size =
            # 31839 * 31839 * 32 bits = ~4GB available memory
            try:
                dataset = OutlierDist.simple_distance_based(
                    dataset, [col], 'euclidean', 0.10, 0.99)
                # DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
            except MemoryError:
                print('Not enough memory available for simple distance-based outlier detection...')
                print('Skipping.')

            # Remove the experiment columns again ('lof' is only present when
            # the local outlier factor experiment is enabled).
            for to_remove in (col + '_outlier', col + '_mixture',
                              'simple_dist_outlier', 'lof'):
                if to_remove in dataset:
                    del dataset[to_remove]

        # Step 2: apply Chauvenet's criterion to all but the label columns
        # and blank out the values it flags.
        for col in [c for c in dataset.columns if 'label' not in c]:
            print(f'Measurement is now: {col}')
            dataset = OutlierDistr.chauvenet(dataset, col)
            dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
            del dataset[col + '_outlier']

        dataset.to_csv(DATA_PATH / RESULT_FNAME)
def main():
    """Run the outlier experiments on the pickled, labelled sensor frame.

    Loads ``datasets/dataframes/df_concat_with_labels.pkl``, tries the
    Chauvenet, mixture-model, simple-distance and local-outlier-factor
    detectors on each sensor column (removing their helper columns again),
    then applies Chauvenet's criterion to every non-label column, replaces
    flagged values with NaN and pickles the result to ``concat_outliers.pkl``.
    """
    # Local import so this block does not depend on the file's import header.
    from pathlib import Path

    # Built with pathlib instead of a backslash literal: '\d' is an invalid
    # escape sequence and the backslash form only resolves on Windows.
    source_file = Path('datasets') / 'dataframes' / 'df_concat_with_labels.pkl'

    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # produced by our own pipeline.
    with open(source_file, 'rb') as handle:
        dataset = pickle.load(handle)

    # Visualization helper; only needed when the plot calls below are
    # re-enabled, but still instantiated as in the original script.
    DataViz = VisualizeDataset(__file__, show=False)

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = [
        "Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)",
        "Magnetic field x (µT)", "Magnetic field y (µT)", "Magnetic field z (µT)",
        "Gyroscope x (rad/s)", "Gyroscope y (rad/s)", "Gyroscope z (rad/s)",
        "Linear Acceleration x (m/s^2)", "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)",
    ]
    print(dataset.columns)

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        dataset = OutlierDistr.chauvenet(dataset, col)
        print(dataset.shape)
        # DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
        dataset = OutlierDistr.mixture_model(dataset, col)
        print(dataset.shape)
        # DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])

        # This requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
            # DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
            print(dataset['simple_dist_outlier'].mean())
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
            # DataViz.plot_dataset(dataset, [col, 'lof'], ['exact','exact'], ['line', 'points'])
        except MemoryError:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove all the experiment columns from the dataset again.
        for to_remove in (col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof'):
            if to_remove in dataset:
                del dataset[to_remove]

    # We take Chauvenet's criterion and apply it to all but the label data...
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
        del dataset[col + '_outlier']

    # Context manager guarantees the result is flushed and the handle closed.
    with open('concat_outliers.pkl', 'wb') as handle:
        pickle.dump(dataset, handle)
def main():
    """Run the outlier-detection step selected by ``FLAGS.mode``.

    The CSV at ``DATA_PATH / DATASET_FILENAME`` is loaded with its first
    column as a datetime index. Depending on ``FLAGS.mode`` the function
    then either visualises one detection approach on the experiment columns
    ('chauvenet', 'mixture', 'distance', 'LOF') or, for 'final', applies
    Chauvenet's criterion to every non-label column, replaces flagged values
    with NaN and writes the table to ``DATA_PATH / RESULT_FILENAME``.
    Any other mode does nothing after loading.
    """
    # Load the table and turn its index into timestamps.
    try:
        table = pd.read_csv(Path(DATA_PATH / DATASET_FILENAME), index_col=0)
        table.index = pd.to_datetime(table.index)
    except OSError as err:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise err

    # Plotting helper and the two families of outlier detectors.
    viz = VisualizeDataset(module_path=__file__)
    distribution_detector = DistributionBasedOutlierDetection()
    distance_detector = DistanceBasedOutlierDetection()

    # Columns used for the exploratory (non-final) modes.
    experiment_cols = ['acc_phone_x', 'light_phone_lux']
    mode = FLAGS.mode

    if mode == 'chauvenet':
        # Chauvenet's criterion, one plot per column.
        for name in experiment_cols:
            print(f"Applying chauvenet outlier criteria for column {name}")
            table = distribution_detector.chauvenet(data_table=table, col=name)
            viz.plot_binary_outliers(
                data_table=table, col=name, outlier_col=f'{name}_outlier')

    elif mode == 'mixture':
        # Mixture model with three components, one plot per column.
        for name in experiment_cols:
            print(f"Applying mixture model for column {name}")
            table = distribution_detector.mixture_model(
                data_table=table, col=name, components=3)
            viz.plot_dataset(
                data_table=table,
                columns=[name, f'{name}_mixture'],
                match=['exact', 'exact'],
                display=['line', 'points'],
            )

    elif mode == 'distance':
        for name in experiment_cols:
            print(f"Applying distance based outlier detection for column {name}")
            # This step requires:
            # n_data_points * n_data_points * point_size =
            # 31839 * 31839 * 32 bits = ~4GB available memory
            try:
                table = distance_detector.simple_distance_based(
                    data_table=table,
                    cols=[name],
                    d_function='euclidean',
                    d_min=FLAGS.dmin,
                    f_min=FLAGS.fmin,
                )
                viz.plot_binary_outliers(
                    data_table=table, col=name, outlier_col='simple_dist_outlier')
            except MemoryError:
                print('Not enough memory available for simple distance-based outlier detection...')
                print('Skipping.')

    elif mode == 'LOF':
        for name in experiment_cols:
            print(f"Applying Local outlier factor for column {name}")
            try:
                table = distance_detector.local_outlier_factor(
                    data_table=table, cols=[name], d_function='euclidean', k=FLAGS.K)
                viz.plot_dataset(
                    data_table=table,
                    columns=[name, 'lof'],
                    match=['exact', 'exact'],
                    display=['line', 'points'],
                )
            except MemoryError:
                print('Not enough memory available for local outlier factor...')
                print('Skipping.')

    elif mode == 'final':
        # Chauvenet on everything except the label columns; flagged values
        # are blanked out and the helper column is dropped again.
        measurement_cols = [c for c in table.columns if 'label' not in c]
        for name in measurement_cols:
            print(f'Measurement is now: {name}')
            flag_col = f'{name}_outlier'
            table = distribution_detector.chauvenet(data_table=table, col=name)
            table.loc[table[flag_col], name] = np.nan
            del table[flag_col]

        table.to_csv(DATA_PATH / RESULT_FILENAME)
class OutlierExperiment:
    """Run the individual outlier-detection approaches on a fixed set of columns.

    Each experiment method applies one detector to every column in
    ``outlier_columns``, plots the result through ``DataViz`` and stores a
    per-column score in ``num_outliers``.
    """

    def __init__(self, data_path, data_file):
        """Load ``data_path / data_file`` as CSV with a datetime index.

        Args:
            data_path: directory of the CSV file (``Path`` or ``str``).
            data_file: file name relative to ``data_path``.
        """
        # Path(...) around data_path lets callers pass a plain string as well.
        self.dataset = pd.read_csv(Path(data_path) / data_file, index_col=0)
        self.dataset.index = pd.to_datetime(self.dataset.index)
        self.DataViz = VisualizeDataset(__file__, show=False)
        self.outlier_columns = ['acc_phone_x', 'light_phone_lux']
        self.OutlierDistr = DistributionBasedOutlierDetection()
        self.OutlierDist = DistanceBasedOutlierDetection()
        # Columns present at load time; anything else counts as a helper column.
        self.original_columns = self.dataset.columns
        # One score per experiment column, derived from outlier_columns so
        # the two can never drift apart.
        self.num_outliers = dict.fromkeys(self.outlier_columns, 0)

    def _flagged_fraction(self, flag_col):
        """Return the fraction of rows whose ``flag_col`` value equals 1."""
        total = len(self.dataset)
        if total == 0:
            # Nothing to flag in an empty table; avoids a division by zero.
            return 0.0
        return int((self.dataset[flag_col] == 1).sum()) / total

    def remove_columns(self):
        """Drop every column that was not part of the originally loaded data."""
        # Collect first, then drop: avoids deleting from the frame while
        # walking over its own column index.
        extra = [name for name in self.dataset.columns
                 if name not in self.original_columns]
        if extra:
            self.dataset.drop(columns=extra, inplace=True)

    def chauvenet(self, C):
        """Apply Chauvenet's criterion with parameter ``C`` to each column.

        Stores the fraction of flagged rows per column in ``num_outliers``.
        The ``<col>_outlier`` columns stay in ``dataset`` until
        ``remove_columns`` is called.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.chauvenet(self.dataset, col, C)
            self.DataViz.plot_binary_outliers(self.dataset, col,
                                              col + '_outlier')
            self.num_outliers[col] = self._flagged_fraction(col + '_outlier')

    def mixture_model(self, n):
        """Fit a mixture model with ``n`` components to each column.

        Stores the sum of the ``<col>_mixture`` values divided by the number
        of rows in ``num_outliers``. The ``<col>_mixture`` columns stay in
        ``dataset`` until ``remove_columns`` is called.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.mixture_model(
                self.dataset, col, n)
            self.DataViz.plot_dataset(self.dataset, [col, col + '_mixture'],
                                      ['exact', 'exact'], ['line', 'points'])
            mixture = self.dataset[col + '_mixture']
            self.num_outliers[col] = mixture.sum() / mixture.size
            print(mixture.max())
            # A score above 1 is unexpected; dump the values for inspection.
            if self.num_outliers[col] > 1:
                print(mixture)

    def simple_distance_based(self, d_min, f_min):
        """Run simple distance-based detection (euclidean) on each column.

        Stores the fraction of flagged rows per column in ``num_outliers``
        and removes all helper columns after each column is processed.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.simple_distance_based(
                self.dataset, [col], 'euclidean', d_min, f_min)
            self.DataViz.plot_binary_outliers(self.dataset, col,
                                              'simple_dist_outlier')
            self.num_outliers[col] = self._flagged_fraction(
                'simple_dist_outlier')
            # Clean up per column so the shared 'simple_dist_outlier' name
            # never exists twice in the frame.
            self.remove_columns()

    def local_outlier_factor(self, k):
        """Compute the local outlier factor (euclidean, ``k`` neighbours).

        Stores, per column, the fraction of rows whose ``lof`` value equals
        1 and removes all helper columns after each column is processed.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.local_outlier_factor(
                self.dataset, [col], 'euclidean', k)
            self.DataViz.plot_dataset(self.dataset, [col, 'lof'],
                                      ['exact', 'exact'], ['line', 'points'])
            # NOTE(review): compares the lof score to exactly 1, as the
            # original did — confirm this is the intended threshold.
            self.num_outliers[col] = self._flagged_fraction('lof')
            self.remove_columns()
def main():
    """Explore outlier detectors on the phone sensor data and clean the dataset.

    Reads ``./intermediate_datafiles/<sys.argv[1] or chapter2_result.csv>``,
    plots the results of the Chauvenet, mixture-model, simple-distance and
    local-outlier-factor detectors for each experiment column, then cleans
    every non-label column (simple distance-based detection for columns
    starting with 'mag', Chauvenet otherwise) by replacing flagged values
    with NaN. The result is written to
    ``./intermediate_datafiles/<sys.argv[2] or chapter3_result_outliers.csv>``.

    Raises:
        IOError: re-raised when the input file cannot be read.
    """
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
    RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_outliers.csv'

    # Next, import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = [
        'acc_phone_X', 'acc_phone_Y', 'acc_phone_Z',
        'gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z',
        'mag_phone_X', 'mag_phone_Y', 'mag_phone_Z',
    ]

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        # And try out all different approaches. Note that we have done some optimization
        # of the parameter values for each of the approaches by visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
        dataset = OutlierDistr.mixture_model(dataset, col)
        DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                             ['exact', 'exact'], ['line', 'points'])

        # This requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
            DataViz.plot_dataset(dataset, [col, 'lof'],
                                 ['exact', 'exact'], ['line', 'points'])
            DataViz.plot_dataset_boxplot(dataset, ['lof'])

            # Flag everything above the 99th percentile of the lof scores.
            lof_p99 = dataset['lof'].quantile(0.99)
            dataset['lof_outliers'] = False
            dataset.loc[dataset['lof'] > lof_p99, 'lof_outliers'] = True
            DataViz.plot_binary_outliers(dataset, col, 'lof_outliers')
        except MemoryError:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove all the experiment columns from the dataset again.
        for to_remove in (col + '_outlier', col + '_mixture',
                          'simple_dist_outlier', 'lof', 'lof_outliers'):
            if to_remove in dataset:
                del dataset[to_remove]

    # Step 2: clean all but the label data.
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        outlier_col = f'{col}_outlier'

        if col.startswith('mag'):
            # Distance-based detection for the 'mag' columns; rename its
            # output so both branches produce '<col>_outlier'.
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10, 0.99
            ).rename(columns={'simple_dist_outlier': outlier_col})
        else:
            dataset = OutlierDistr.chauvenet(dataset, col)

        # Plot BEFORE masking: once the flagged values are NaN there is
        # nothing left for the plot to show as outliers.
        DataViz.plot_binary_outliers(dataset, col, outlier_col)

        # '== True' rather than a bare mask so non-boolean or missing flag
        # values are treated as "not an outlier".
        dataset.loc[dataset[outlier_col] == True, col] = np.nan
        del dataset[outlier_col]

    dataset.to_csv(DATA_PATH / RESULT_FNAME)
NumDist = 3 # given was 3 # Simple Distance dmin = 0.10 # given was 0.10 fmin = 0.99 # given was 0.99 # Local outlier factor k = 5 # given was 5 ##### Outlier filtering for the CS dataset ##### #And investigate the approaches for all relevant attributes. for col in outlier_columns: # And try out all different approaches. Note that we have done some optimization # of the parameter values for each of the approaches by visual inspection. dataset_cs = OutlierDistr.chauvenet(dataset_cs, col, constant) DataViz.plot_binary_outliers(dataset_cs, col, col + '_outlier') dataset_cs = OutlierDistr.mixture_model(dataset_cs, col, NumDist) DataViz.plot_dataset(dataset_cs, [col, col + '_mixture'], ['exact', 'exact'], ['line', 'points']) # This requires: # n_data_points * n_data_points * point_size = # 31839 * 31839 * 64 bits = ~8GB available memory try: dataset_cs = OutlierDist.simple_distance_based(dataset_cs, [col], 'euclidean', dmin, fmin) DataViz.plot_binary_outliers(dataset_cs, col, 'simple_dist_outlier') except MemoryError as e: print( 'Not enough memory available for simple distance-based outlier detection...' ) print('Skipping.')