Esempio n. 1
0
def main():
    """Interactively run one outlier detection method and plot its result.

    Loads ``chapter2_result.csv`` from ``./intermediate_datafiles/``, asks on
    stdin which method to use (1-4) together with its parameters, and applies
    that method to every column in ``outlier_columns``.

    Raises:
        ValueError: if a parameter cannot be parsed as a number, or if a
            two-parameter prompt does not receive exactly two values.
    """
    dataset_path = './intermediate_datafiles/'
    dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
    outlier_columns = ['acc_phone_x', 'light_phone_lux']
    DataViz = VisualizeDataset()

    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # pd.to_datetime is the supported way to parse the index; the
    # Index.to_datetime() method is not available in current pandas.
    dataset.index = pd.to_datetime(dataset.index)

    # input() returns a string in Python 3, so the choice is compared against
    # string literals and numeric parameters are converted explicitly.
    start = input("choose method: [1],[2],[3],[4]").strip()
    if start == '1':
        param = float(input("Chauvenet\ninput parameters: c"))
        for col in outlier_columns:
            dataset = OutlierDistr.chauvenet(dataset, col, param)
            DataViz.plot_binary_outliers(dataset, col, col + '_outlier')

    elif start == '2':
        reply = input("Mixture model\n input parameters: components, iter")
        # Accept both "3,5" and "3 5".
        components, n_iter = (int(v) for v in reply.replace(',', ' ').split())
        for col in outlier_columns:
            dataset = OutlierDistr.mixture_model(dataset, col, components, n_iter)
            DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact', 'exact'], ['line', 'points'])

    elif start == '3':
        reply = input("Simple distance-based\n input parameters: d_min, f_min")
        # The prompt shows a comma-separated pair; accept that as well as
        # whitespace-separated values.
        d_min, f_min = (float(v) for v in reply.replace(',', ' ').split())
        for col in outlier_columns:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', d_min, f_min)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')

    elif start == '4':
        # The value read here is the k that is passed to the detector below.
        k = int(input("Local outlier factor\n input parameters: k"))
        for col in outlier_columns:
            # Pass the column as a list, like the other distance-based call.
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', k)
            DataViz.plot_dataset(dataset, [col, 'lof'], ['exact', 'exact'], ['line', 'points'])

    else:
        print("no method selected")
for col in outlier_columns:
    # Try out the different approaches on each column. The parameter values
    # were tuned by visual inspection.
    dataset = OutlierDistr.chauvenet(dataset, col)
    # print 'chauvenet', col
    # DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
    dataset = OutlierDistr.mixture_model(dataset, col)
    # DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])
    # This requires:
    # n_data_points * n_data_points * point_size =
    # 31839 * 31839 * 64 bits = ~8GB available memory
    try:
        print("trying")
        dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
        print("plot")
        DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
    except MemoryError:
        print('Not enough memory available for simple distance-based outlier detection...')
        print('Skipping.')

    # Local outlier factor is currently disabled:
    # try:
    #     print("2nd trying")
    #     dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
    #     print("2nd plot")
    #     DataViz.plot_dataset(dataset, [col, 'lof'], ['exact','exact'], ['line', 'points'])
    # except MemoryError as e:
    #     print('Not enough memory available for lof...')
    #     print('Skipping.')

    # NOTE(review): the helper columns added above are not removed here; only
    # this message is printed.
    print("we will remove")
def main():
    """Explore outlier detection on two columns, then clean the whole dataset.

    The input and output file names are taken from ``sys.argv[1]`` and
    ``sys.argv[2]`` (defaults: ``chapter2_result.csv`` and
    ``chapter3_result_outliers.csv``), both relative to
    ``./intermediate_datafiles/``.

    Step 1 runs each detection approach on ``outlier_columns`` and plots the
    result. Step 2 applies Chauvenet's criterion to every column whose name
    does not contain ``'label'``, replaces the flagged values by NaN and writes
    the resulting dataset to the output file.

    Raises:
        IOError: re-raised when the input file cannot be read.
    """
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
    RESULT_FNAME = (sys.argv[2] if len(sys.argv) > 2
                    else 'chapter3_result_outliers.csv')

    # Import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print(
            'File not found, try to run the preceding crowdsignals scripts first!'
        )
        # Bare raise keeps the original traceback intact.
        raise

    # Instance of the visualization class used to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Step 1: see whether there are outliers we would prefer to remove.

    # Columns we want to experiment on.
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    # Create the outlier detection classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    for col in outlier_columns:

        print(f"Applying outlier criteria for column {col}")

        # Try out all approaches. The parameter values were tuned by visual
        # inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
        dataset = OutlierDistr.mixture_model(dataset, col)
        DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                             ['exact', 'exact'], ['line', 'points'])

        # The distance-based approaches require:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
        except MemoryError:
            print(
                'Not enough memory available for simple distance-based outlier detection...'
            )
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col],
                                                       'euclidean', 5)
            DataViz.plot_dataset(dataset, [col, 'lof'], ['exact', 'exact'],
                                 ['line', 'points'])
        except MemoryError:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove the helper columns again; some may be missing when a step
        # above was skipped, hence errors='ignore'.
        cols_to_remove = [
            col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof'
        ]
        dataset = dataset.drop(columns=cols_to_remove, errors='ignore')

    # Step 2: apply Chauvenet's criterion to all but the label data.
    # The column list is materialised first because columns are added and
    # deleted inside the loop.
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        outlier_col = f'{col}_outlier'
        # Explicit comparison kept so the mask is boolean regardless of the
        # dtype of the outlier column.
        dataset.loc[dataset[outlier_col] == True, col] = np.nan  # noqa: E712
        del dataset[outlier_col]

    dataset.to_csv(DATA_PATH / RESULT_FNAME)
Esempio n. 4
0
# Step 1: check whether the data contains outliers we would rather remove.

# Detector instances: one distribution-based, one distance-based.
OutlierDistr = DistributionBasedOutlierDetection()
OutlierDist = DistanceBasedOutlierDetection()

# Columns on which the approaches are tried out.
outlier_columns = ['acc_y', 'lin_acc_x']

# Run every enabled approach on each of the selected columns. The parameter
# values were tuned by visual inspection.
for col in outlier_columns:
    dataset = OutlierDistr.chauvenet(dataset, col)
    DataViz.plot_binary_outliers(
        dataset, col, f'{col}_outlier', 'Chauvenets criterion')

    dataset = OutlierDistr.mixture_model(dataset, col)
    DataViz.plot_dataset(
        dataset, [col, f'{col}_mixture'], 'Mixture models',
        ['exact', 'exact'], ['line', 'points'])

    # The distance-based approaches below are disabled. They require:
    # n_data_points * n_data_points * point_size =
    # 31839 * 31839 * 64 bits = ~8GB available memory
    # try:
    #     dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
    #     DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier', 'Simple distance-based approach')
    # except MemoryError as e:
    #     print('Not enough memory available for simple distance-based outlier detection...')
    #     print('Skipping.')
    #
    # try:
    #     dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
Esempio n. 5
0
def main():
    """Run the outlier detection step selected by ``FLAGS.mode``.

    The dataset is read from ``DATA_PATH / DATASET_FILENAME`` with its index
    parsed as datetimes. Depending on the mode:

    * ``'chauvenet'``, ``'mixture'``, ``'distance'``, ``'LOF'``: apply that
      approach to the two experiment columns and plot the outcome
      (``FLAGS.dmin`` / ``FLAGS.fmin`` and ``FLAGS.K`` supply the parameters of
      the distance-based approaches).
    * ``'final'``: apply Chauvenet's criterion to every column whose name does
      not contain ``'label'``, set the flagged values to NaN and write the
      result to ``DATA_PATH / RESULT_FILENAME``.

    Any other mode only loads the data.

    Raises:
        IOError: re-raised when the dataset file cannot be read.
    """
    # Load the data and parse the date index.
    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FILENAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as err:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise err

    # Plotting helper and the two detector objects.
    plotter = VisualizeDataset(module_path=__file__)
    distribution_detector = DistributionBasedOutlierDetection()
    distance_detector = DistanceBasedOutlierDetection()

    # Columns used for the exploratory modes.
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    mode = FLAGS.mode

    if mode == 'chauvenet':
        # Chauvenet's criterion on the experiment columns.
        for col in outlier_columns:
            print(f"Applying chauvenet outlier criteria for column {col}")
            dataset = distribution_detector.chauvenet(data_table=dataset, col=col)
            plotter.plot_binary_outliers(
                data_table=dataset, col=col, outlier_col=f'{col}_outlier')
        return

    if mode == 'mixture':
        # Mixture models on the experiment columns.
        for col in outlier_columns:
            print(f"Applying mixture model for column {col}")
            dataset = distribution_detector.mixture_model(
                data_table=dataset, col=col, components=3)
            plotter.plot_dataset(
                data_table=dataset,
                columns=[col, f'{col}_mixture'],
                match=['exact', 'exact'],
                display=['line', 'points'])
        return

    if mode == 'distance':
        for col in outlier_columns:
            print(f"Applying distance based outlier detection for column {col}")
            # Memory needed for this step:
            # n_data_points * n_data_points * point_size = 31839 * 31839 * 32 bits = ~4GB
            try:
                dataset = distance_detector.simple_distance_based(
                    data_table=dataset, cols=[col], d_function='euclidean',
                    d_min=FLAGS.dmin, f_min=FLAGS.fmin)
                plotter.plot_binary_outliers(
                    data_table=dataset, col=col, outlier_col='simple_dist_outlier')
            except MemoryError:
                print('Not enough memory available for simple distance-based outlier detection...')
                print('Skipping.')
        return

    if mode == 'LOF':
        for col in outlier_columns:
            print(f"Applying Local outlier factor for column {col}")
            try:
                dataset = distance_detector.local_outlier_factor(
                    data_table=dataset, cols=[col], d_function='euclidean',
                    k=FLAGS.K)
                plotter.plot_dataset(
                    data_table=dataset,
                    columns=[col, 'lof'],
                    match=['exact', 'exact'],
                    display=['line', 'points'])
            except MemoryError:
                print('Not enough memory available for local outlier factor...')
                print('Skipping.')
        return

    if mode == 'final':
        # Chauvenet's criterion on everything except the label columns. The
        # names are collected up front because columns change inside the loop.
        measurement_cols = [name for name in dataset.columns if 'label' not in name]
        for col in measurement_cols:
            print(f'Measurement is now: {col}')
            dataset = distribution_detector.chauvenet(data_table=dataset, col=col)
            flag_col = f'{col}_outlier'
            dataset.loc[dataset[flag_col], col] = np.nan
            del dataset[flag_col]

        dataset.to_csv(DATA_PATH / RESULT_FILENAME)
Esempio n. 6
0
class OutlierExperiment:
    """Run outlier detection methods on two fixed columns of a dataset.

    Each public method applies one approach to every column in
    ``outlier_columns``, plots the result and stores a per-column ratio in
    ``num_outliers`` (despite the name, the stored values are ratios relative
    to the column size, not counts).
    """

    def __init__(self, data_path, data_file):
        """Load ``data_path / data_file`` and set up plotting and detectors.

        Args:
            data_path: directory containing the data file (str or Path).
            data_file: file name of the CSV to load; its first column is used
                as the index and parsed as datetimes.
        """
        # Path(data_path) lets plain strings work as well as Path objects.
        self.dataset = pd.read_csv(Path(data_path) / data_file, index_col=0)
        self.dataset.index = pd.to_datetime(self.dataset.index)
        self.DataViz = VisualizeDataset(__file__, show=False)
        self.outlier_columns = ['acc_phone_x', 'light_phone_lux']
        self.OutlierDistr = DistributionBasedOutlierDetection()
        self.OutlierDist = DistanceBasedOutlierDetection()
        # Columns present right after loading; remove_columns() restores the
        # dataset to exactly this set.
        self.original_columns = self.dataset.columns
        self.num_outliers = {'acc_phone_x': 0, 'light_phone_lux': 0}

    def remove_columns(self):
        """Delete every column that was not in the dataset when it was loaded."""
        # Snapshot the names first so we never delete while iterating over the
        # live column index.
        extra = [name for name in self.dataset.columns
                 if name not in self.original_columns]
        for name in extra:
            del self.dataset[name]

    def _flagged_fraction(self, col, flag_col):
        """Return the share of rows of ``col`` whose ``flag_col`` equals 1."""
        flagged = self.dataset[self.dataset[flag_col] == 1][col]
        return flagged.size / self.dataset[col].size

    def chauvenet(self, C):
        """Apply Chauvenet's criterion with parameter ``C`` to each column.

        The ``<col>_outlier`` columns added by the detector are left in the
        dataset (this method does not call remove_columns()).
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.chauvenet(self.dataset, col, C)
            self.DataViz.plot_binary_outliers(self.dataset, col,
                                              col + '_outlier')
            self.num_outliers[col] = self._flagged_fraction(
                col, col + '_outlier')

    def mixture_model(self, n):
        """Apply the mixture model, passing ``n`` as its third argument.

        Stores the sum of the ``<col>_mixture`` column divided by its size in
        ``num_outliers``. The ``<col>_mixture`` columns are left in the
        dataset.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.mixture_model(
                self.dataset, col, n)
            self.DataViz.plot_dataset(self.dataset, [col, col + '_mixture'],
                                      ['exact', 'exact'], ['line', 'points'])
            mixture = self.dataset[col + '_mixture']
            self.num_outliers[col] = mixture.sum() / mixture.size
            # Diagnostic output: the largest mixture value, and the full
            # column if the stored ratio exceeds 1.
            print(mixture.max())
            if self.num_outliers[col] > 1:
                print(mixture)

    def simple_distance_based(self, d_min, f_min):
        """Apply the simple distance-based approach (euclidean distance).

        The helper columns are removed again after each column is processed.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.simple_distance_based(
                self.dataset, [col], 'euclidean', d_min, f_min)
            self.DataViz.plot_binary_outliers(self.dataset, col,
                                              'simple_dist_outlier')
            self.num_outliers[col] = self._flagged_fraction(
                col, 'simple_dist_outlier')
            self.remove_columns()

    def local_outlier_factor(self, k):
        """Apply the local outlier factor with ``k`` (euclidean distance).

        The helper columns are removed again after each column is processed.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.local_outlier_factor(
                self.dataset, [col], 'euclidean', k)
            self.DataViz.plot_dataset(self.dataset, [col, 'lof'],
                                      ['exact', 'exact'], ['line', 'points'])
            # NOTE(review): this counts rows where 'lof' is exactly 1; if
            # 'lof' holds a continuous score rather than a 0/1 flag, a
            # threshold is probably what was intended - confirm.
            self.num_outliers[col] = self._flagged_fraction(col, 'lof')
            self.remove_columns()
Esempio n. 7
0
def outliers(data_file, save_file, sub_path):
    """Try several outlier detection approaches on the sensor columns.

    For each acceleration and gyroscope column, Chauvenet's criterion, a
    mixture model and the simple distance-based approach are applied and
    plotted; the helper columns they add are removed again afterwards. The
    dataset is then written to ``save_file``.

    Args:
        data_file: CSV file to read; it must contain a ``timestamp`` column,
            which is parsed and used as the index.
        save_file: destination passed to ``DataFrame.to_csv``.
        sub_path: base location for the plots; ``/chauvenet``, ``/mixture``
            and ``/simple_dist`` are appended to it and passed as
            ``save_path`` to the plotting calls.
    """
    # Instance of the visualization class used to plot the results.
    DataViz = VisualizeDataset(__file__, show=False)

    # Import the data from the specified location and parse the date index.
    dataset = pd.read_csv(data_file)
    dataset.index = pd.to_datetime(dataset['timestamp'])

    # Step 1: see whether there are outliers we would prefer to remove.

    # Columns we want to experiment on.
    outlier_columns = [
        "Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)",
        "Gyroscope x (rad/s)", "Gyroscope y (rad/s)", "Gyroscope z (rad/s)",
    ]
    print(dataset.columns)

    # Create the outlier detection classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    for col in outlier_columns:

        print(f"Applying outlier criteria for column {col}")

        # Try out the different approaches. The parameter values were tuned
        # by visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        DataViz.plot_binary_outliers(dataset, col, col + '_outlier',
                                     save_path=f'{sub_path}/chauvenet')

        dataset = OutlierDistr.mixture_model(dataset, col)
        DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                             ['exact', 'exact'], ['line', 'points'],
                             save_path=f'{sub_path}/mixture')
        print('data shape: ', dataset.shape)

        # The distance-based approach requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier',
                                         save_path=f'{sub_path}/simple_dist')
            print(dataset['simple_dist_outlier'].mean())
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        # Local outlier factor is currently disabled:
        # try:
        #     dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
        #     DataViz.plot_dataset(dataset, [col, 'lof'], ['exact','exact'], ['line', 'points'], save_path=sub_path+'/lof')
        # except MemoryError as e:
        #     print('Not enough memory available for lof...')
        #     print('Skipping.')

        # Remove the helper columns again; 'simple_dist_outlier' may be
        # missing when the distance-based step was skipped.
        cols_to_remove = [col + '_outlier', col + '_mixture', 'simple_dist_outlier']
        for to_remove in cols_to_remove:
            if to_remove in dataset:
                del dataset[to_remove]

    # The final cleaning step (Chauvenet on all non-label columns) is
    # currently disabled, so the dataset is saved without outliers removed:
    # for col in [c for c in dataset.columns if not 'label' in c]:
    #     print(f'Measurement is now: {col}')
    #     dataset = OutlierDistr.chauvenet(dataset, col)
    #     dataset.loc[dataset[f'{col}_mixture'] == True, col] = np.nan
    #     del dataset[col + '_outlier']

    dataset.to_csv(save_file)
Esempio n. 8
0
# Tunable parameters for the outlier detection approaches used below.
# NOTE(review): `constant`, `outlier_columns`, `dataset_cs`, `OutlierDistr`,
# `OutlierDist` and `DataViz` are used below but not defined in this part of
# the file; presumably they are set up earlier - confirm.

# Mixture models: value passed as third argument to mixture_model().
NumDist = 3  # given was 3
# Simple distance-based approach: d_min and f_min.
dmin = 0.10  # given was 0.10
fmin = 0.99  # given was 0.99
# Local outlier factor (not used in the part of the loop visible here).
k = 5  # given was 5

##### Outlier filtering for the CS dataset #####

# Investigate the approaches for all relevant attributes.
for col in outlier_columns:
    # Try out the different approaches, using the parameter values set above.
    # Each call returns the dataset that is plotted next.
    dataset_cs = OutlierDistr.chauvenet(dataset_cs, col, constant)
    DataViz.plot_binary_outliers(dataset_cs, col, col + '_outlier')
    dataset_cs = OutlierDistr.mixture_model(dataset_cs, col, NumDist)
    DataViz.plot_dataset(dataset_cs, [col, col + '_mixture'],
                         ['exact', 'exact'], ['line', 'points'])
    # The distance-based approach requires:
    # n_data_points * n_data_points * point_size =
    # 31839 * 31839 * 64 bits = ~8GB available memory
    # so a MemoryError is caught and the step is skipped instead of aborting.
    try:
        dataset_cs = OutlierDist.simple_distance_based(dataset_cs, [col],
                                                       'euclidean', dmin, fmin)
        DataViz.plot_binary_outliers(dataset_cs, col, 'simple_dist_outlier')
    except MemoryError as e:
        print(
            'Not enough memory available for simple distance-based outlier detection...'
        )
        print('Skipping.')