def __init__(self, data_path, data_file):
    """Load the dataset and create the visualisation and outlier helpers.

    Args:
        data_path: Directory that contains the CSV file (``str`` or ``Path``).
        data_file: Name of the CSV file inside ``data_path``.
    """
    # Wrapping data_path in Path() lets callers pass a plain string as well.
    self.dataset = pd.read_csv(Path(data_path) / data_file, index_col=0)
    # The first CSV column is the index; parse it into datetimes.
    self.dataset.index = pd.to_datetime(self.dataset.index)
    self.DataViz = VisualizeDataset(__file__, show=False)
    self.outlier_columns = ['acc_phone_x', 'light_phone_lux']
    self.OutlierDistr = DistributionBasedOutlierDetection()
    self.OutlierDist = DistanceBasedOutlierDetection()
    # Column labels as they were right after loading.
    self.original_columns = self.dataset.columns
    # One outlier counter per inspected column, all starting at zero.
    self.num_outliers = dict.fromkeys(self.outlier_columns, 0)
Beispiel #2
0
def main():
    """Remove outliers from every AS14 user's dataset and save the result.

    Without command-line arguments, users 1..33 are processed in turn: each
    ``AS14_<nn>/chapter2_result.csv`` below ``./intermediate_datafiles/`` is
    read and its cleaned version is written next to it as
    ``AS14_<nn>/chapter3_result_outliers.csv``.  ``sys.argv[1]`` (input) and
    ``sys.argv[2]`` (output), both relative to the data directory, select a
    single file instead.

    Raises:
        IOError: If an input file cannot be read.
    """
    DATA_PATH = Path('./intermediate_datafiles/')

    if len(sys.argv) > 1:
        # An explicit input names a single file, so it is processed once
        # instead of once per user.
        explicit_result = (sys.argv[2] if len(sys.argv) > 2
                           else 'chapter3_result_outliers.csv')
        jobs = [(sys.argv[1], explicit_result)]
    else:
        # One job per user.  The result goes into the user's own directory:
        # a single shared result file would be overwritten by every user and
        # end up holding only the last user's data.
        jobs = [(f'AS14_{user:02d}/chapter2_result.csv',
                 f'AS14_{user:02d}/chapter3_result_outliers.csv')
                for user in range(1, 34)]

    # Columns used to try out the different outlier-detection approaches.
    outlier_columns = [
        'actvalue', 'builtvalue', 'commvalue', 'entvalue', 'accvalue',
        'offvalue', 'othervalue', 'socialvalue', 'travelvalue', 'unkvalue',
        'utilvalue', 'callvalue', 'arovalue', 'valvalue', 'scrvalue',
        'smsvalue'
    ]

    for DATASET_FNAME, RESULT_FNAME in jobs:
        # Import the data from the specified location and parse the date index.
        try:
            dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
            dataset.index = pd.to_datetime(dataset.index)
        except IOError:
            print(
                'File not found, try to run the preceding crowdsignals scripts first!'
            )
            raise

        # Not used below (the plot calls are disabled); the instance is still
        # created because the constructor's side effects are not visible here.
        DataViz = VisualizeDataset(__file__)

        # Create the outlier classes.
        OutlierDistr = DistributionBasedOutlierDetection()
        OutlierDist = DistanceBasedOutlierDetection()

        # Step 1: try the different approaches on the experiment columns.
        for col in outlier_columns:
            print(f"Applying outlier criteria for column {col}")

            dataset = OutlierDistr.chauvenet(dataset, col)
            dataset = OutlierDistr.mixture_model(dataset, col)

            # The distance-based approach needs roughly
            # n_data_points * n_data_points * point_size of memory
            # (31839 * 31839 * 32 bits = ~4GB), so it may not fit.
            try:
                dataset = OutlierDist.simple_distance_based(
                    dataset, [col], 'euclidean', 0.10, 0.99)
            except MemoryError:
                print(
                    'Not enough memory available for simple distance-based outlier detection...'
                )
                print('Skipping.')

            # Remove the helper columns again; errors='ignore' skips the ones
            # that were never created (e.g. after a MemoryError).
            dataset = dataset.drop(
                columns=[col + '_outlier', col + '_mixture',
                         'simple_dist_outlier', 'lof'],
                errors='ignore')

        # Step 2: apply Chauvenet's criterion to all but the label columns and
        # blank out every value it flags.
        for col in [c for c in dataset.columns if 'label' not in c]:
            print(f'Measurement is now: {col}')
            dataset = OutlierDistr.chauvenet(dataset, col)
            dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
            del dataset[col + '_outlier']

        dataset.to_csv(DATA_PATH / RESULT_FNAME)
Beispiel #3
0
    dataset, ['acc_', 'press_', 'gyr_', 'mag_', 'linacc_', 'hr_', 'label'],
    ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
    ['line', 'line', 'line', 'line', 'line', 'points', 'points'])

# Compute the number of milliseconds covered by an instance based on the first two rows.
# total_seconds() converts the whole interval; the `.microseconds` attribute is
# only the sub-second component and would yield 0 for steps of whole seconds.
milliseconds_per_instance = (dataset.index[1] -
                             dataset.index[0]).total_seconds() * 1000

# Step 1: Let us see whether we have some outliers we would prefer to remove.

# Determine the columns we want to experiment on: every column except the labels.
outlier_columns = [c for c in dataset.columns if 'label' not in c]

# Create the outlier classes.
OutlierDistr = DistributionBasedOutlierDetection()
OutlierDist = DistanceBasedOutlierDetection()

# #And investigate the approaches for all relevant attributes.
# for col in outlier_columns:
#     # And try out all different approaches. Note that we have done some optimization
#     # of the parameter values for each of the approaches by visual inspection.
#     # dataset = OutlierDistr.chauvenet(dataset, col)
#     # print(col, sum(dataset[col+'_outlier']))
#     # plot = DataViz.plot_binary_outliers(dataset, col, col + '_outlier', ax[i,j])
#
#     dataset = OutlierDistr.mixture_model(dataset, col)
#     DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])
#     # plot.plot(ax=ax[i,j])
#     # i += 1
#     # if i == 7:
Beispiel #4
0
def main():
    """Remove outliers from the pickled sensor dataset and pickle the result.

    Reads ``datasets/dataframes/df_concat_with_labels.pkl``, tries the
    different outlier-detection approaches on the sensor columns, then applies
    Chauvenet's criterion to every non-label column (flagged values become
    NaN) and writes the result to ``concat_outliers.pkl``.
    """
    # Local import keeps this function self-contained.
    from pathlib import Path

    # Built from parts so the path works on every platform; the former
    # backslash literal also contained the invalid escape sequence '\d'.
    source_path = Path('datasets') / 'dataframes' / 'df_concat_with_labels.pkl'

    # NOTE(security): unpickling can execute arbitrary code - only load files
    # from a trusted source.
    with open(source_path, 'rb') as source_file:
        dataset = pickle.load(source_file)

    # Not used below (the plot calls are disabled); the instance is still
    # created because the constructor's side effects are not visible here.
    DataViz = VisualizeDataset(__file__, show=False)

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = [
        "Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)",
        "Magnetic field x (µT)", "Magnetic field y (µT)", "Magnetic field z (µT)",
        "Gyroscope x (rad/s)", "Gyroscope y (rad/s)", "Gyroscope z (rad/s)",
        "Linear Acceleration x (m/s^2)", "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)",
    ]
    print(dataset.columns)

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        dataset = OutlierDistr.chauvenet(dataset, col)
        print(dataset.shape)
        dataset = OutlierDistr.mixture_model(dataset, col)
        print(dataset.shape)

        # The distance-based approaches need roughly
        # n_data_points * n_data_points * point_size of memory
        # (31839 * 31839 * 32 bits = ~4GB), so they may not fit.
        try:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
            print(dataset['simple_dist_outlier'].mean())
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
        except MemoryError:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove the helper columns again; errors='ignore' skips the ones
        # that were never created (e.g. after a MemoryError).
        dataset = dataset.drop(
            columns=[col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof'],
            errors='ignore')

    # We take Chauvenet's criterion and apply it to all but the label data,
    # blanking out every value it flags.
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
        del dataset[col + '_outlier']

    with open('concat_outliers.pkl', 'wb') as result_file:
        pickle.dump(dataset, result_file)
Beispiel #5
0
def main():
    """Explore one outlier-detection approach or write the cleaned dataset.

    ``FLAGS.mode`` selects what happens after the dataset is loaded:
    'chauvenet', 'mixture', 'distance' and 'LOF' apply that approach to the
    experiment columns and plot the outcome; 'final' applies Chauvenet's
    criterion to every non-label column, replaces flagged values with NaN and
    saves the result.  Any other mode only loads the data.

    Raises:
        IOError: If the input file cannot be read.
    """
    # Load the dataset and turn its index into timestamps.
    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FILENAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as err:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise err

    # Helpers for plotting and for the two families of outlier detection.
    DataViz = VisualizeDataset(module_path=__file__)
    OutlierDistribution = DistributionBasedOutlierDetection()
    OutlierDistance = DistanceBasedOutlierDetection()

    # Columns used when experimenting with a single approach.
    outlier_columns = ['acc_phone_x', 'light_phone_lux']
    mode = FLAGS.mode

    if mode == 'final':
        # Clean every column except the label columns and store the result.
        measurement_columns = [name for name in dataset.columns if 'label' not in name]
        for col in measurement_columns:
            flag_col = f'{col}_outlier'
            print(f'Measurement is now: {col}')
            dataset = OutlierDistribution.chauvenet(data_table=dataset, col=col)
            dataset.loc[dataset[flag_col], col] = np.nan
            del dataset[flag_col]

        dataset.to_csv(DATA_PATH / RESULT_FILENAME)
        return

    if mode not in ('chauvenet', 'mixture', 'distance', 'LOF'):
        # Unknown mode: nothing to explore.
        return

    for col in outlier_columns:
        if mode == 'chauvenet':
            print(f"Applying chauvenet outlier criteria for column {col}")
            dataset = OutlierDistribution.chauvenet(data_table=dataset, col=col)
            DataViz.plot_binary_outliers(data_table=dataset, col=col, outlier_col=f'{col}_outlier')

        elif mode == 'mixture':
            print(f"Applying mixture model for column {col}")
            dataset = OutlierDistribution.mixture_model(data_table=dataset, col=col, components=3)
            DataViz.plot_dataset(
                data_table=dataset,
                columns=[col, f'{col}_mixture'],
                match=['exact', 'exact'],
                display=['line', 'points'],
            )

        elif mode == 'distance':
            print(f"Applying distance based outlier detection for column {col}")
            # Memory needed for this step:
            # n_data_points * n_data_points * point_size = 31839 * 31839 * 32 bits = ~4GB
            try:
                dataset = OutlierDistance.simple_distance_based(
                    data_table=dataset,
                    cols=[col],
                    d_function='euclidean',
                    d_min=FLAGS.dmin,
                    f_min=FLAGS.fmin,
                )
                DataViz.plot_binary_outliers(data_table=dataset, col=col, outlier_col='simple_dist_outlier')
            except MemoryError:
                print('Not enough memory available for simple distance-based outlier detection...')
                print('Skipping.')

        else:
            # Only 'LOF' is left at this point.
            print(f"Applying Local outlier factor for column {col}")
            try:
                dataset = OutlierDistance.local_outlier_factor(
                    data_table=dataset,
                    cols=[col],
                    d_function='euclidean',
                    k=FLAGS.K,
                )
                DataViz.plot_dataset(
                    data_table=dataset,
                    columns=[col, 'lof'],
                    match=['exact', 'exact'],
                    display=['line', 'points'],
                )
            except MemoryError:
                print('Not enough memory available for local outlier factor...')
                print('Skipping.')
Beispiel #6
0
def main():
    """Plot distance-based outliers per sensor column, then save cleaned data.

    The input (``sys.argv[1]``, default ``phoneSensorsA3_ch2.csv``) is read
    from ``./intermediate_datafiles/``.  For each experiment column the simple
    distance-based outliers are plotted.  Afterwards Chauvenet's criterion is
    applied to every non-label column, flagged values are replaced by NaN and
    the result is written to ``sys.argv[2]`` (default
    ``phoneSensorsA3_outliers_ch3.csv``) in the same directory.

    Raises:
        IOError: If the input file cannot be read.
    """
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(
        sys.argv) > 1 else 'phoneSensorsA3_ch2.csv'
    RESULT_FNAME = sys.argv[2] if len(
        sys.argv) > 2 else 'phoneSensorsA3_outliers_ch3.csv'

    # Next, import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print(
            'File not found, try to run the preceding crowdsignals scripts first!'
        )
        raise

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset()

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = [
        'acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z', 'gyr_mobile_x',
        'gyr_mobile_y', 'gyr_mobile_z', 'mag_mobile_x', 'mag_mobile_y',
        'mag_mobile_z', 'prox_mobile_distance', 'loc_mobile_latitude',
        'loc_mobile_longitude', 'loc_mobile_height', 'loc_mobile_velocity',
        'loc_mobile_direction', 'loc_mobile_horizontalAccuracy',
        'loc_mobile_verticalAccuracy'
    ]

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the distance-based approach for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        # This requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            # Detection runs on a copy, so `dataset` itself never receives the
            # helper column and needs no cleanup afterwards.
            dataset_outliers_sdb = OutlierDist.simple_distance_based(
                dataset.copy(), [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset_outliers_sdb, col,
                                         'simple_dist_outlier')
        except MemoryError:
            print(
                'Not enough memory available for simple distance-based outlier detection...'
            )
            print('Skipping.')

    # We take Chauvenet's criterion and apply it to all but the label data,
    # blanking out every value it flags.
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
        del dataset[col + '_outlier']

    dataset.to_csv(DATA_PATH / RESULT_FNAME)
def main():
    """Explore outlier detection on the phone sensors and save cleaned data.

    The input (``sys.argv[1]``, default ``chapter2_result.csv``) is read from
    ``./intermediate_datafiles/``.  Each experiment column is run through
    Chauvenet's criterion, a mixture model, simple distance-based detection
    and the local outlier factor, with a plot after every step.  Finally every
    non-label column is cleaned - columns whose name starts with 'mag' use the
    simple distance-based approach, all others Chauvenet's criterion - and the
    result is written to ``sys.argv[2]`` (default
    ``chapter3_result_outliers.csv``) in the same directory.

    Raises:
        IOError: If the input file cannot be read.
    """
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
    RESULT_FNAME = sys.argv[2] if len(
        sys.argv) > 2 else 'chapter3_result_outliers.csv'

    # Next, import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print(
            'File not found, try to run the preceding crowdsignals scripts first!'
        )
        raise

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = [
        'acc_phone_X',
        'acc_phone_Y',
        'acc_phone_Z',
        'gyr_phone_X',
        'gyr_phone_Y',
        'gyr_phone_Z',
        'mag_phone_X',
        'mag_phone_Y',
        'mag_phone_Z',
    ]

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        # And try out all different approaches. Note that we have done some optimization
        # of the parameter values for each of the approaches by visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
        dataset = OutlierDistr.mixture_model(dataset, col)
        DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                             ['exact', 'exact'], ['line', 'points'])

        # The distance-based approaches require:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
        except MemoryError:
            print(
                'Not enough memory available for simple distance-based outlier detection...'
            )
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col],
                                                       'euclidean', 5)
            DataViz.plot_dataset(dataset, [col, 'lof'], ['exact', 'exact'],
                                 ['line', 'points'])
            DataViz.plot_dataset_boxplot(dataset, ['lof'])

            # Flag every point whose LOF score lies above the 99th percentile.
            lof_threshold = dataset['lof'].quantile(0.99)
            dataset['lof_outliers'] = dataset['lof'] > lof_threshold

            DataViz.plot_binary_outliers(dataset, col, 'lof_outliers')
        except MemoryError:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove the helper columns again; errors='ignore' skips the ones
        # that were never created (e.g. after a MemoryError).
        dataset = dataset.drop(
            columns=[col + '_outlier', col + '_mixture',
                     'simple_dist_outlier', 'lof', 'lof_outliers'],
            errors='ignore')

    # Step 2: clean all but the label data.
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        outlier_col = f'{col}_outlier'
        if col.startswith('mag'):
            # Distance-based detection for these columns; its flag column is
            # renamed so both branches leave the flags in `outlier_col`.
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10,
                0.99).rename(columns={'simple_dist_outlier': outlier_col})
        else:
            dataset = OutlierDistr.chauvenet(dataset, col)

        # Plot before blanking: once the flagged values are NaN there is
        # nothing left to draw for them.
        DataViz.plot_binary_outliers(dataset, col, outlier_col)
        dataset.loc[dataset[outlier_col] == True, col] = np.nan
        del dataset[outlier_col]

    dataset.to_csv(DATA_PATH / RESULT_FNAME)