Example #1
def main():
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = 'chapter2_result.csv'
    RESULT_FNAME = 'chapter3_heart_rate.csv'

    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run the preceding crowdsignals scripts first!'
        )
        raise e

    DataViz = VisualizeDataset(__file__)

    # Original heart rate values
    # DataViz.plot_imputed_values(dataset, ['original'], 'hr_watch_rate')

    Kalman = KalmanFilters()

    dataset = Kalman.apply_kalman_filter(dataset, 'hr_watch_rate')
    # print(dataset.head())
    # print(dataset.index)

    DataViz.plot_dataset(dataset, ['hr_watch_rate', 'hr_watch_rate_kalman'],
                         ['exact', 'exact'], ['line', 'line'])
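    # RESULT_FNAME is declared above but the excerpt stops before the dataset is
    # stored. A minimal sketch of the presumable final step (an assumption, not
    # shown in this excerpt):
    dataset.to_csv(DATA_PATH / RESULT_FNAME)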
    def __init__(self, data_path, data_file):
        self.dataset = pd.read_csv(Path(data_path / data_file), index_col=0)
        self.dataset.index = pd.to_datetime(self.dataset.index)
        self.DataViz = VisualizeDataset(__file__, show=False)
        self.outlier_columns = ['acc_phone_x', 'light_phone_lux']
        self.OutlierDistr = DistributionBasedOutlierDetection()
        self.OutlierDist = DistanceBasedOutlierDetection()
        self.original_columns = self.dataset.columns
        self.num_outliers = {'acc_phone_x': 0, 'light_phone_lux': 0}
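
    # The attributes above suggest an outlier-detection pass per column. A minimal
    # sketch of such a method (hypothetical name and body; only the chauvenet and
    # plot_binary_outliers calls shown elsewhere on this page are assumed):
    def count_chauvenet_outliers(self):
        for col in self.outlier_columns:
            self.dataset = self.OutlierDistr.chauvenet(self.dataset, col)
            self.num_outliers[col] = int(self.dataset[col + '_outlier'].sum())
            self.DataViz.plot_binary_outliers(self.dataset, col, col + '_outlier')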
Example #3
def main():
    dataset_path = './intermediate_datafiles/'
    dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
    outlier_columns = ['acc_phone_x', 'light_phone_lux']
    DataViz = VisualizeDataset()

    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    dataset.index = pd.to_datetime(dataset.index)
    start = int(input("choose method: [1],[2],[3],[4]"))
    if start == 1:
        param = float(input("Chauvenet\ninput parameters: c"))
        for col in outlier_columns:
            dataset = OutlierDistr.chauvenet(dataset, col, param)
            DataViz.plot_binary_outliers(dataset, col, col + '_outlier')

    elif start == 2:
        # param = input("Mixture model\n input parameters: components, iter")
        components, iter = input("Mixture model\n input parameters: components, iter").split(',')
        components = int(components)
        iter = int(iter)
        for col in outlier_columns:
            dataset = OutlierDistr.mixture_model(dataset, col, components, iter)
            DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])

    elif start == 3:
        d_min, f_min = input("Simple distance-based\n input parameters: d_min, f_min").split()
        d_min = float(d_min)
        f_min = float(f_min)
        for col in outlier_columns:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', d_min, f_min)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')

    elif start == 4:
        param = input("Local outlier factor\n input parameters: k")
        for col in outlier_columns:
            dataset = OutlierDist.local_outlier_factor(dataset, col, 'euclidean', k)
            DataViz.plot_dataset(dataset, [col, 'lof'], ['exact','exact'], ['line', 'points'])

    else:
        print("no method selected")
Example #4
def main():
    DataViz = VisualizeDataset()

    dataset_path = './intermediate_datafiles/'
    try:
        dataset = pd.read_csv(dataset_path + 'chapter3_result_final.csv',
                              index_col=0)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    dataset.index = pd.to_datetime(dataset.index)
    milliseconds_per_instance = (dataset.index[1] -
                                 dataset.index[0]).microseconds / 1000

    # Now we move to the frequency domain, with the same window size.

    FreqAbs = FourierTransformation()
    fs = float(1000) / milliseconds_per_instance

    periodic_predictor_cols = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
        'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
        'mag_watch_y', 'mag_watch_z'
    ]
    data_table = FreqAbs.abstract_frequency(
        copy.deepcopy(dataset), ['acc_phone_x'],
        int(float(10000) / milliseconds_per_instance), fs)

    # Spectral analysis.

    DataViz.plot_dataset(data_table, [
        'acc_phone_x_max_freq', 'acc_phone_x_freq_weighted', 'acc_phone_x_pse',
        'label'
    ], ['like', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

    dataset = FreqAbs.abstract_frequency(
        dataset, periodic_predictor_cols,
        int(float(10000) / milliseconds_per_instance), fs)

    # Now we only allow a certain percentage of overlap between the windows, otherwise our training examples would be too similar.

    # The percentage of overlap we allow
    window_overlap = 0.9
    ws = int(float(10000) / milliseconds_per_instance)  # the same window size used above
    skip_points = int((1 - window_overlap) * ws)
    dataset = dataset.iloc[::skip_points, :]
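    # For example, at the 250 ms granularity used elsewhere on this page the
    # window spans 10000 / 250 = 40 instances, so skip_points = int(0.1 * 40) = 4,
    # i.e. one retained instance per second (an illustrative assumption).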

    DataViz.plot_dataset(
        dataset, [
            'acc_phone_x', 'gyr_phone_x', 'hr_watch_rate', 'light_phone_lux',
            'mag_phone_x', 'press_phone_', 'pca_1', 'label'
        ], ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
        ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'])
from util.VisualizeDataset import VisualizeDataset
from Chapter4.TemporalAbstraction import NumericalAbstraction
from Chapter4.TemporalAbstraction import CategoricalAbstraction
from Chapter4.FrequencyAbstraction import FourierTransformation
from Chapter4.TextAbstraction import TextAbstraction

# Read the result from the previous chapter, and make sure the index is of the type datetime.
import pickle
from Load import *

# As usual, we set our program constants, read the input file and initialize a visualization object.
dataset = pd.read_csv(cluster_watch_data, index_col=time_col)
dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__, show=False)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] -
                             dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.
print('attributes time domain')

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
window_sizes = [
    int(float(5000) / milliseconds_per_instance),
    int(float(0.5 * 60000) / milliseconds_per_instance),
    int(float(5 * 60000) / milliseconds_per_instance),
]
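
# NumericalAbstraction is imported above but the excerpt stops here. A minimal
# sketch of how such window sizes are typically applied, assuming an
# abstract_numerical(dataset, cols, window_size, aggregation) method as in
# Chapter4.TemporalAbstraction (treat the calls as an assumption, not this
# script's original code):
NumAbs = NumericalAbstraction()
selected_predictor_cols = [c for c in dataset.columns if 'label' not in c]
for ws in window_sizes:
    dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
    dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')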
Example #6
#                                                            #
##############################################################

from util.VisualizeDataset import VisualizeDataset
from Chapter3.OutlierDetection import DistributionBasedOutlierDetection
from Chapter3.OutlierDetection import DistanceBasedOutlierDetection
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pylab

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
try:
    dataset = pd.read_csv(dataset_path + 'chapter2_final2hz.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# fig, ax = plt.subplots(5, 3)
# i = 0
# j = 0
#
Example #7
from Chapter7.LearningAlgorithms import ClassificationAlgorithms
from Chapter7.LearningAlgorithms import RegressionAlgorithms
from Chapter7.Evaluation import ClassificationEvaluation
from Chapter7.Evaluation import RegressionEvaluation
from Chapter7.FeatureSelection import FeatureSelectionClassification
from Chapter7.FeatureSelection import FeatureSelectionRegression
import copy
import pandas as pd
from util import util
import matplotlib.pyplot as plot
import numpy as np
from sklearn.model_selection import train_test_split
import os

# Of course we repeat some stuff from Chapter 3, namely to load the dataset
DataViz = VisualizeDataset()
# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles_ass3/'
export_tree_path = 'Example_graphs/ass3/'
try:
    dataset = pd.read_csv(dataset_path + 'clustering_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous scripts first!')
    raise e

if not os.path.exists(export_tree_path):
    os.makedirs(export_tree_path)

dataset.index = pd.to_datetime(dataset.index)

# Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task.
Example #8
from util.VisualizeDataset import VisualizeDataset
from Chapter5.DistanceMetrics import InstanceDistanceMetrics
from Chapter5.DistanceMetrics import PersonDistanceMetricsNoOrdering
from Chapter5.DistanceMetrics import PersonDistanceMetricsOrdering
from Chapter5.Clustering import NonHierarchicalClustering
from Chapter5.Clustering import HierarchicalClustering
import copy
import pandas as pd
import matplotlib.pyplot as plot
import util.util as util


# Of course we repeat some stuff from Chapter 3, namely to load the dataset

DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter4_our_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e
dataset.index = pd.to_datetime(dataset.index)

# First, let us use non-hierarchical clustering.

clusteringNH = NonHierarchicalClustering()
Example #9
from Chapter7.Evaluation import RegressionEvaluation
from Chapter8.LearningAlgorithmsTemporal import TemporalClassificationAlgorithms
from Chapter8.LearningAlgorithmsTemporal import TemporalRegressionAlgorithms
from statsmodels.tsa.stattools import adfuller
from pandas.plotting import autocorrelation_plot

import copy
import pandas as pd
from util import util
import matplotlib.pyplot as plot
import numpy as np
from sklearn.model_selection import train_test_split

# Of course we repeat some stuff from Chapter 3, namely to load the dataset

DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_our_result.csv',
                          index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us consider our second task, namely the prediction of the Azimuth. We consider this as a temporal task.
Example #10
RESULT_FNAME = 'chapter7_classification_result.csv'
EXPORT_TREE_PATH = Path('./figures/crowdsignals_ch7_classification/')

# Next, we declare the parameters we'll use in the algorithms.
N_FORWARD_SELECTION = 15

try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)
dataset = dataset.sample(n=8000)
# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__, show=False)
# datetime object containing current date and time
now1 = datetime.now()
# dd/mm/YY H:M:S
dt_string = now1.strftime("%d/%m/%Y %H:%M:%S")
print("date and time =", dt_string)

diff = now1 - begin
print('difference time', diff)

# Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task.

# We create a single column with the categorical attribute representing our class. Furthermore, we use 70% of our data
# for training and the remaining 30% as an independent test set. We select the sets based on stratified sampling. We remove
# cases where we do not know the label.
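
# A minimal sketch of the split just described, built with plain pandas and the
# train_test_split helper imported in these scripts rather than the book's own
# PrepareDatasetForLearning class (the labelX column names follow those used
# elsewhere on this page; treat this as an illustration, not the original code):
label_cols = [c for c in dataset.columns if c.startswith('label')]
labeled = dataset[dataset[label_cols].sum(axis=1) > 0].copy()
labeled['class'] = labeled[label_cols].idxmax(axis=1)
X = labeled.drop(columns=label_cols + ['class'])
train_X, test_X, train_y, test_y = train_test_split(
    X, labeled['class'], test_size=0.3, stratify=labeled['class'], random_state=0)
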
# Read the result from the previous chapter, and make sure the index is of the type datetime.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter3_result_final.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter4_result.csv'

try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds/1000


# Chapter 4: Identifying aggregate attributes.
print('attributes time domain')

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
window_sizes = [int(float(5000)/milliseconds_per_instance), int(float(0.5*60000)/milliseconds_per_instance), int(float(5*60000)/milliseconds_per_instance)]

print('total window sizes', window_sizes)
Example #12
    DataSet.add_numerical_dataset('magnetometer_phone.csv', 'timestamps',
                                  ['x', 'y', 'z'], 'avg', 'mag_phone_')
    DataSet.add_numerical_dataset('magnetometer_smartwatch.csv', 'timestamps',
                                  ['x', 'y', 'z'], 'avg', 'mag_watch_')

    # We add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
    DataSet.add_numerical_dataset('pressure_phone.csv', 'timestamps',
                                  ['pressure'], 'avg', 'press_phone_')

    # Get the resulting pandas data table

    dataset = DataSet.data_table

    # Plot the data

    DataViz = VisualizeDataset()

    # Boxplot
    DataViz.plot_dataset_boxplot(dataset, [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z'
    ])

    # Plot all data
    DataViz.plot_dataset(
        dataset, [
            'acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
            'press_phone_', 'label'
        ], ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
        ['line', 'line', 'line', 'line', 'line', 'line', 'points', 'points'])
Example #13
# Set up the file names and locations.
DATA_PATH = Path('./intermediate_datafiles/our_data')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter3_result_outliers.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_final.csv'
ORIG_DATASET_FNAME = sys.argv[3] if len(sys.argv) > 3 else 'chapter2_result.csv'

# Next, import the data from the specified location and parse the date index.
try:
    dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
    dataset.index = pd.to_datetime(dataset.index)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# We'll create an instance of our visualization class to plot the results.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds/1000

# Let us impute the missing values and plot an example.

MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])

# Now, let us carry out that operation over all columns except for the label.

for col in [c for c in dataset.columns if not 'label' in c]:
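    # The excerpt stops here; a minimal sketch of the presumable loop body,
    # assuming the interpolation-based imputation referenced in the commented-out
    # lines above:
    dataset = MisVal.impute_interpolate(dataset, col)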
Example #14
def main():
    # Set a granularity (the discrete step size of our time series data) and choose if all resulting datasets should
    # be saved. A coarse-grained granularity of one instance per minute, and a fine-grained one with four instances
    # per second are used.
    GRANULARITIES = [60000, 250]
    SAVE_VERSIONS = False

    # We can call Path.mkdir(exist_ok=True) to make any required directories if they don't already exist.
    [path.mkdir(exist_ok=True, parents=True) for path in [DATASET_PATH, RESULT_PATH]]

    # Create object to visualize the data and save figures
    DataViz = VisualizeDataset(module_path=__file__)

    datasets = []
    for milliseconds_per_instance in GRANULARITIES:
        print(
            f'Creating numerical datasets from files in {DATASET_PATH} using granularity {milliseconds_per_instance}.')

        # Create an initial dataset object with the base directory for our data and a granularity and add selected
        # measurements to it
        data_engineer = CreateDataset(base_dir=DATASET_PATH, granularity=milliseconds_per_instance)

        # Add the accelerometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values
        data_engineer.add_numerical_dataset(file='accelerometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_phone_')
        data_engineer.add_numerical_dataset(file='accelerometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_watch_')

        # Add the gyroscope data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values
        data_engineer.add_numerical_dataset(file='gyroscope_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_phone_')
        data_engineer.add_numerical_dataset(file='gyroscope_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_watch_')

        # Add the heart rate (continuous numerical measurements) and aggregate by averaging the values
        data_engineer.add_numerical_dataset(file='heart_rate_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['rate'], aggregation='avg', prefix='hr_watch_')

        # Add the labels provided by the users as binary attributes (i.e. add a one to the attribute representing the
        # specific value for a label if it occurs within an interval). These are categorical events that might overlap.
        data_engineer.add_event_dataset(file='labels.csv', start_timestamp_col='label_start',
                                        end_timestamp_col='label_end',
                                        value_col='label', aggregation='binary')

        # Add the amount of light sensed by the phone (continuous numerical measurements) and aggregate by averaging
        data_engineer.add_numerical_dataset(file='light_phone.csv', timestamp_col='timestamps', value_cols=['lux'],
                                            aggregation='avg', prefix='light_phone_')

        # Add the magnetometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values
        data_engineer.add_numerical_dataset(file='magnetometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_phone_')
        data_engineer.add_numerical_dataset(file='magnetometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_watch_')

        # Add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
        data_engineer.add_numerical_dataset(file='pressure_phone.csv', timestamp_col='timestamps',
                                            value_cols=['pressure'],
                                            aggregation='avg', prefix='press_phone_')

        # Get the resulting pandas data table
        dataset = data_engineer.data_table

        # Create boxplots
        DataViz.plot_dataset_boxplot(dataset=dataset, cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
                                                            'acc_watch_y', 'acc_watch_z'])

        # Plot all data
        DataViz.plot_dataset(data_table=dataset,
                             columns=['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_',
                                      'label'],
                             match=['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
                             display=['line', 'line', 'line', 'line', 'line', 'line', 'points', 'points'])

        # Print a summary of the dataset
        util.print_statistics(dataset=dataset)
        datasets.append(copy.deepcopy(dataset))

        # Save the various versions of the created datasets with logical filenames if needed
        if SAVE_VERSIONS:
            dataset.to_csv(RESULT_PATH / f'chapter2_result_{milliseconds_per_instance}')

    # Make a table like the one shown in the book, comparing the two datasets produced
    util.print_latex_table_statistics_two_datasets(dataset1=datasets[0], dataset2=datasets[1])

    # Finally, store the last dataset we generated (250 ms)
    dataset.to_csv(RESULT_PATH / RESULT_FNAME)
Example #15
    DataSetOwn.add_numerical_dataset('mag_custom.csv', 'timestamps', ['x','y','z'], 'avg', 'mag_phone_')
    DataSetOwn.add_numerical_dataset('press_custom.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')
#    DataSetCS.add_numerical_dataset('magnetometer_phone.csv', 'timestamps', ['x','y','z'], 'avg', 'mag_phone_')
#    DataSetCS.add_numerical_dataset('magnetometer_smartwatch.csv', 'timestamps', ['x','y','z'], 'avg', 'mag_watch_')

    # We add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
    DataSetOwn.add_numerical_dataset('pedom_custom.csv', 'timestamps', ['steps', 'distance'], 'avg', 'pedom_phone_')
#    DataSetCS.add_numerical_dataset('pressure_phone.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')

    # Get the resulting pandas data table

    dataset_own = DataSetOwn.data_table
#    dataset_cs = DataSetCS.data_table

    # Plot the data
    DataViz = VisualizeDataset()

    # Boxplot
    DataViz.plot_dataset_boxplot(dataset_own, ['acc_phone_x','acc_phone_y','acc_phone_z'])
#    DataViz.plot_dataset_boxplot(dataset_cs, ['acc_phone_x','acc_phone_y','acc_phone_z'])

    # Plot all data
    DataViz.plot_dataset(dataset_own, ['acc_', 'gyr_', 'mag_', 'press_' ,'pedom_phone_', 'label'], ['like', 'like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line','line', 'points', 'points'])
#    DataViz.plot_dataset(dataset_cs, ['acc_phone', 'gyr_phone', 'mag_phone', 'press_phone_', 'label'], ['like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line', 'line', 'points'])

    # And print a summary of the dataset

    util.print_statistics(dataset_own)
    datasets_own.append(copy.deepcopy(dataset_own))
    
#    util.print_statistics(dataset_cs)
#    Machine Learning for the Quantified Self                #
#    Springer                                                #
#    Chapter 4                                               #
#                                                            #
##############################################################

from util.VisualizeDataset import VisualizeDataset
from Chapter4.TemporalAbstraction import NumericalAbstraction
from Chapter4.TemporalAbstraction import CategoricalAbstraction
from Chapter4.FrequencyAbstraction import FourierTransformation
from Chapter4.TextAbstraction import TextAbstraction
import copy
import pandas as pd

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
try:
    dataset = pd.read_csv(dataset_path + 'chapter3_our_result_final.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.
# Read the result from the previous chapter, and make sure the index is of the type datetime.
DATA_PATH = Path('./intermediate_datafiles/our_data')
DATASET_FNAME = sys.argv[1] if len(
    sys.argv) > 1 else 'chapter3_result_final.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter4_result.csv'

try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] -
                             dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 1 second, 5 seconds and 18 seconds (0.3 minutes)
window_sizes = [
    int(float(1000) / milliseconds_per_instance),
    int(float(5000) / milliseconds_per_instance),
    int(float(0.3 * 60000) / milliseconds_per_instance)
]
Example #18
# Set up the file names and locations.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter3_result_outliers.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_final.csv'
ORIG_DATASET_FNAME = sys.argv[3] if len(sys.argv) > 3 else 'chapter2_result.csv'

# Next, import the data from the specified location and parse the date index.
try:
    dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
    dataset.index = pd.to_datetime(dataset.index)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# We'll create an instance of our visualization class to plot the results.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Let us impute the missing values and plot an example.

MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])

# Now, let us carry out that operation over all columns except for the label.

for col in [c for c in dataset.columns if not 'label' in c]:
Example #19
        'horizontalAccuracy', 'verticalAccuracy'
    ], 'avg', 'loc_mobile_')  # unit:()

    # TODO add the other datasets

    # We add the labels provided by the users. These are categorical events that might overlap. We add them
    # as binary attributes (i.e. add a one to the attribute representing the specific value for the label if it
    # occurs within an interval).
    dataset.add_event_dataset('labels_phone.csv', 'label_start', 'label_end',
                              'label', 'binary')

    # Get the resulting pandas data table
    dataset = dataset.data_table

    # Plot the data
    DataViz = VisualizeDataset(__file__)

    # Boxplot
    DataViz.plot_dataset_boxplot(
        dataset, ['acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z'])
    #DataViz.plot_dataset_boxplot(dataset, ['gyr_mobile_x', 'gyr_mobile_y', 'gyr_mobile_z'])

    # Plot all data
    # DataViz.plot_dataset(dataset, ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_', 'label'],
    #                               ['like', 'like', 'like', 'like', 'like', 'like', 'like','like'],
    #                               ['line', 'line', 'line', 'line', 'line', 'line', 'points', 'points'])

    DataViz.plot_dataset(dataset, [
        'acc_mobile_', 'gyr_mobile_', 'mag_mobile_', 'prox_mobile_distance',
        'loc_mobile_', 'label'
    ], ['like', 'like', 'like', 'like', 'like', 'like'],
Example #20
#    Chapter 3                                               #
#                                                            #
##############################################################

from util.VisualizeDataset import VisualizeDataset
from Chapter3.DataTransformation import LowPassFilter
from Chapter3.DataTransformation import PrincipalComponentAnalysis
from Chapter3.ImputationMissingValues import ImputationMissingValues
from Chapter3.KalmanFilters import KalmanFilters
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
dataset = pd.read_csv(dataset_path + 'chapter3_result_outliers.csv',
                      index_col=0)
dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] -
                             dataset.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values.

MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset),
def main():
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the second task, namely the prediction of the heart rate. Therefore create a dataset with the heart
    # rate as target and split using timestamps, because this is considered as a temporal task.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = RegressionAlgorithms()
    evaluation = RegressionEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(dataset, 'hr_watch_rate',
                                                                                       '2016-02-08 18:28:56',
                                                                                       '2016-02-08 19:34:07',
                                                                                       '2016-02-08 20:07:50')
    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('- - - Selecting subsets - - -')
    basic_features = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
                      'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
                      'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking', 'labelStanding',
                      'labelDriving',
                      'labelEating', 'labelRunning', 'light_phone_lux', 'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
                      'mag_watch_x', 'mag_watch_y', 'mag_watch_z', 'press_phone_pressure']
    pca_features = ['pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7']
    time_features = [name for name in dataset.columns if ('temp_' in name and 'hr_watch' not in name)]
    freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3, time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4, cluster_features))

    if FLAGS.mode == 'selection' or FLAGS.mode == 'all':
        # First, consider the Pearson correlations and see whether features can be selected based on them
        fs = FeatureSelectionRegression()
        print('\n- - - Running feature selection - - -')
        features, correlations = fs.pearson_selection(10, train_X[features_after_chapter_5], train_y)
        util.print_pearson_correlations(correlations)

    # Select the 10 features with the highest correlation
    selected_features = ['temp_pattern_labelOnTable', 'labelOnTable', 'temp_pattern_labelOnTable(b)labelOnTable',
                         'pca_2_temp_mean_ws_120', 'pca_1_temp_mean_ws_120', 'acc_watch_y_temp_mean_ws_120', 'pca_2',
                         'acc_phone_z_temp_mean_ws_120', 'gyr_watch_y_pse', 'gyr_watch_x_pse']
    possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4,
                             features_after_chapter_5, selected_features]
    feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']

    if FLAGS.mode == 'overall' or FLAGS.mode == 'all':
        print('\n- - - Running test of all different regression algorithms - - -')
        # First study the importance of the parameter settings. Therefore repeat the experiment a number of times to get
        # a bit more robust data as the initialization of e.g. the NN is random
        REPEATS = FLAGS.repeats
        scores_over_all_algs = []

        for i in range(0, len(possible_feature_sets)):
            selected_train_X = train_X[possible_feature_sets[i]]
            selected_test_X = test_X[possible_feature_sets[i]]

            performance_tr_nn, performance_tr_nn_std = 0, 0
            performance_tr_rf, performance_tr_rf_std = 0, 0
            performance_te_nn, performance_te_nn_std = 0, 0
            performance_te_rf, performance_te_rf_std = 0, 0

            # First run non deterministic classifiers a number of times to average their score
            for repeat in range(0, REPEATS):
                print(f'Training NeuralNetwork run {repeat + 1}/{REPEATS} ... ')
                regr_train_y, regr_test_y = learner.\
                    feedforward_neural_network(selected_train_X, train_y, selected_test_X, gridsearch=True)
                mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y)
                mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y)
                performance_tr_nn += mean_tr
                performance_tr_nn_std += std_tr
                performance_te_nn += mean_te
                performance_te_nn_std += std_te

                print(f'Training RandomForest run {repeat + 1}/{REPEATS} ... ')
                regr_train_y, regr_test_y = learner.random_forest(selected_train_X, train_y, selected_test_X,
                                                                  gridsearch=True)
                mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y)
                mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y)
                performance_tr_rf += mean_tr
                performance_tr_rf_std += std_tr
                performance_te_rf += mean_te
                performance_te_rf_std += std_te

            overall_performance_tr_nn = performance_tr_nn / REPEATS
            overall_performance_tr_nn_std = performance_tr_nn_std / REPEATS
            overall_performance_te_nn = performance_te_nn / REPEATS
            overall_performance_te_nn_std = performance_te_nn_std / REPEATS
            overall_performance_tr_rf = performance_tr_rf / REPEATS
            overall_performance_tr_rf_std = performance_tr_rf_std / REPEATS
            overall_performance_te_rf = performance_te_rf / REPEATS
            overall_performance_te_rf_std = performance_te_rf_std / REPEATS

            # Run deterministic algorithms:
            print("Support Vector Regressor run 1/1 ... ")
            # Convergence of the SVR does not always occur (even adjusting tolerance and iterations does not help)
            regr_train_y, regr_test_y = learner.\
                support_vector_regression_without_kernel(selected_train_X, train_y, selected_test_X, gridsearch=False)
            mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y)
            mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y)
            performance_tr_svm = mean_tr
            performance_tr_svm_std = std_tr
            performance_te_svm = mean_te
            performance_te_svm_std = std_te

            print("Training Nearest Neighbor run 1/1 ... ")
            regr_train_y, regr_test_y = learner.k_nearest_neighbor(selected_train_X, train_y, selected_test_X,
                                                                   gridsearch=True)
            mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y)
            mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y)
            performance_tr_knn = mean_tr
            performance_tr_knn_std = std_tr
            performance_te_knn = mean_te
            performance_te_knn_std = std_te

            print("Training Decision Tree run 1/1 ... ")
            regr_train_y, regr_test_y = learner.\
                decision_tree(selected_train_X, train_y, selected_test_X, gridsearch=True,
                              export_tree_path=EXPORT_TREE_PATH)
            mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y)
            mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y)
            performance_tr_dt = mean_tr
            performance_tr_dt_std = std_tr
            performance_te_dt = mean_te
            performance_te_dt_std = std_te

            scores_with_sd = [(overall_performance_tr_nn, overall_performance_tr_nn_std, overall_performance_te_nn,
                               overall_performance_te_nn_std),
                              (overall_performance_tr_rf, overall_performance_tr_rf_std, overall_performance_te_rf,
                               overall_performance_te_rf_std),
                              (performance_tr_svm, performance_tr_svm_std, performance_te_svm, performance_te_svm_std),
                              (performance_tr_knn, performance_tr_knn_std, performance_te_knn, performance_te_knn_std),
                              (performance_tr_dt, performance_tr_dt_std, performance_te_dt, performance_te_dt_std)]
            util.print_table_row_performances_regression(feature_names[i], scores_with_sd)
            scores_over_all_algs.append(scores_with_sd)

        # Plot the results
        DataViz.plot_performances_regression(['NN', 'RF', 'SVM', 'KNN', 'DT'], feature_names, scores_over_all_algs)

    if FLAGS.mode == 'detail' or FLAGS.mode == 'all':
        print('\n- - - Running visualization of results - - -')
        regr_train_y, regr_test_y = learner.random_forest(train_X[features_after_chapter_5], train_y,
                                                          test_X[features_after_chapter_5], gridsearch=False,
                                                          print_model_details=True)
        DataViz.plot_numerical_prediction_versus_real(train_X.index, train_y, regr_train_y, test_X.index, test_y,
                                                      regr_test_y, 'heart rate')
Example #22
# Read the result from the previous chapter, and make sure the index is of the type datetime.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = sys.argv[1] if len(
    sys.argv) > 1 else 'chapter3_result_final.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter4_result.csv'

try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] -
                             dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# OLD: Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
# NEW: Set the window sizes to the number of instances representing 1 second, 4 seconds and 8 seconds
window_sizes = [
    int(float(1000) / milliseconds_per_instance),
    int(float(4000) / milliseconds_per_instance),
    int(float(8000) / milliseconds_per_instance)
]
Example #23
    # We add the labels provided by the users. These are categorical events that might overlap. We add them
    # as binary attributes (i.e. add a one to the attribute representing the specific value for the label if it
    # occurs within an interval).
    DataSet.add_binary_labels_dataset('A01_parsed_raw_data.csv', 'timestamp',
                                      ['labelWalking', 'labelFalling', 'labelLyingDown', 'labelLying',
                                       'labelSittingDown', 'labelSitting', 'labelStandingFromLying', 'labelOnAllFours',
                                       'labelSittingOnTheGround', 'labelStandingFromSitting',
                                       'labelStandingFromSittingOnTheGround'], 'max', '')

    # Get the resulting pandas data table

    dataset = DataSet.data_table

    # Plot the data

    DataViz = VisualizeDataset()

    # Boxplot
    DataViz.plot_dataset_boxplot(dataset, ['ankle_l_x', 'ankle_l_y', 'ankle_l_z', 'ankle_r_x', 'ankle_r_y', 'ankle_r_z',
                                           'belt_x', 'belt_y', 'belt_z', 'chest_x', 'chest_y', 'chest_z'])

    # Plot all data
    DataViz.plot_dataset(dataset, ['ankle_l_', 'ankle_r_', 'belt_', 'chest_', 'label'], ['like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line', 'line', 'points'])

    # And print a summary of the dataset

    util.print_statistics(dataset)
    datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book
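# The excerpt stops here; presumably the same util helper used in Example #14
# above produces that table (an assumption about this particular script):
util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])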
Example #24
#    Machine Learning for the Quantified Self                #
#    Springer                                                #
#    Chapter 4                                               #
#                                                            #
##############################################################

from util.VisualizeDataset import VisualizeDataset
from Chapter4.TemporalAbstraction import NumericalAbstraction
from Chapter4.TemporalAbstraction import CategoricalAbstraction
from Chapter4.FrequencyAbstraction import FourierTransformation
from Chapter4.TextAbstraction import TextAbstraction
import copy
import pandas as pd

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles-own/'
try:
    dataset = pd.read_csv(dataset_path + 'chapter3_result_final-own.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds/1000

#    Mark Hoogendoorn and Burkhardt Funk (2017)              #
#    Machine Learning for the Quantified Self                #
#    Springer                                                #
#    Chapter 3                                               #
#                                                            #
##############################################################

from util.VisualizeDataset import VisualizeDataset
from Chapter3.OutlierDetection import DistributionBasedOutlierDetection
from Chapter3.OutlierDetection import DistanceBasedOutlierDetection
import copy
import pandas as pd
import numpy as np

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles-own/'
try:
    dataset = pd.read_csv(dataset_path + 'chapter2_result-own.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds/1000

# Step 1: Let us see whether we have some outliers we would prefer to remove.
try:
    dataset = pickle.load(
        open('datasets/dataframes/concat_df_imputed_gyro.pkl', 'rb'))
    if not all_data:
        print('subset data')
        dataset = dataset[:14780]
    dataset.index = pd.to_datetime(dataset.index)
    if all_data:
        print('all data')
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

print(dataset.columns)
DataViz = VisualizeDataset(__file__, show=False)

# We'll start by applying non-hierarchical clustering.
clusteringNH = NonHierarchicalClustering()

# Let us look at k-means first.
k_values = range(2, 10)
silhouette_values = []
attributes_to_cluster = [
    'Acceleration x (m/s^2)', 'Acceleration y (m/s^2)',
    'Acceleration z (m/s^2)', 'Gyroscope x (rad/s)', 'Gyroscope y (rad/s)',
    'Gyroscope z (rad/s)'
]
## Do some initial runs to determine the right number for k

print('===== kmeans clustering =====')
Example #27
def main():
    # D:\Users\Andy\Downloads\Desktop\ml4qs\ML4QS_Group_41\ML4QS\simple_dataset\2\accelerometer.csv
    # Set up file names and locations.
    for user in range(1, 34):

        DATA_PATH = Path('./intermediate_datafiles/')
        DATASET_FNAME = sys.argv[1] if len(
            sys.argv
        ) > 1 else 'AS14_' + "{:02d}".format(user) + '/chapter2_result.csv'
        RESULT_FNAME = sys.argv[2] if len(
            sys.argv) > 2 else 'chapter3_result_outliers.csv'

        # Next, import the data from the specified location and parse the date index.
        try:
            dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
            dataset.index = pd.to_datetime(dataset.index)

        except IOError as e:
            print(
                'File not found, try to run the preceding crowdsignals scripts first!'
            )
            raise e

        # We'll create an instance of our visualization class to plot the results.
        DataViz = VisualizeDataset(__file__)

        # Compute the number of milliseconds covered by an instance using the first two rows.
        milliseconds_per_instance = (dataset.index[1] -
                                     dataset.index[0]).microseconds / 1000

        # Step 1: Let us see whether we have some outliers we would prefer to remove.

        # Determine the columns we want to experiment on.
        outlier_columns = [
            'actvalue', 'builtvalue', 'commvalue', 'entvalue', 'accvalue',
            'offvalue', 'othervalue', 'socialvalue', 'travelvalue', 'unkvalue',
            'utilvalue', 'callvalue', 'arovalue', 'valvalue', 'scrvalue',
            'smsvalue'
        ]

        # Create the outlier classes.
        OutlierDistr = DistributionBasedOutlierDetection()
        OutlierDist = DistanceBasedOutlierDetection()

        # And investigate the approaches for all relevant attributes.
        for col in outlier_columns:
            # if col is None:
            #     continue
            print(f"Applying outlier criteria for column {col}")

            # And try out all different approaches. Note that we have done some optimization
            # of the parameter values for each of the approaches by visual inspection.
            dataset = OutlierDistr.chauvenet(dataset, col)
            # DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
            dataset = OutlierDistr.mixture_model(dataset, col)
            # DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])
            # This requires:
            # n_data_points * n_data_points * point_size =
            # 31839 * 31839 * 32 bits = ~4GB available memory

            try:
                dataset = OutlierDist.simple_distance_based(
                    dataset, [col], 'euclidean', 0.10, 0.99)
                # DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
            except MemoryError as e:
                print(
                    'Not enough memory available for simple distance-based outlier detection...'
                )
                print('Skipping.')

            # try:
            #     dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
            #     # DataViz.plot_dataset(dataset, [col, 'lof'], ['exact','exact'], ['line', 'points'])
            # except MemoryError as e:
            #     print('Not enough memory available for lof...')
            #     print('Skipping.')

            # Remove the columns added by the outlier detection from the dataset again.
            cols_to_remove = [
                col + '_outlier', col + '_mixture', 'simple_dist_outlier',
                'lof'
            ]
            for to_remove in cols_to_remove:
                if to_remove in dataset:
                    del dataset[to_remove]

        # We take Chauvenet's criterion and apply it to all but the label data...

        for col in [c for c in dataset.columns if not 'label' in c]:
            print(f'Measurement is now: {col}')
            dataset = OutlierDistr.chauvenet(dataset, col)
            dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
            del dataset[col + '_outlier']

        dataset.to_csv(DATA_PATH / RESULT_FNAME)
Example #28
def main():
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Create objects for clustering
    clusteringNH = NonHierarchicalClustering()
    clusteringH = HierarchicalClustering()

    if FLAGS.mode == 'kmeans':
        # Do some initial runs to determine the right number for k
        k_values = range(2, 10)
        silhouette_values = []

        print('Running k-means clustering')
        for k in k_values:
            print(f'k = {k}')
            dataset_cluster = clusteringNH.k_means_over_instances(
                dataset=copy.deepcopy(dataset),
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                k=k,
                distance_metric='default',
                max_iters=20,
                n_inits=10)
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)

        DataViz.plot_xy(x=[k_values],
                        y=[silhouette_values],
                        xlabel='k',
                        ylabel='silhouette score',
                        ylim=[0, 1],
                        line_styles=['b-'])

        # Select the k with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Means silhouette score: k = {k}')
        print('Use this value of k to run the --mode=final --k=?')

    if FLAGS.mode == 'kmediods':
        # Do some initial runs to determine the right number for k
        k_values = range(2, 10)
        silhouette_values = []
        print('Running k-medoids clustering')

        for k in k_values:
            print(f'k = {k}')
            dataset_cluster = clusteringNH.k_medoids_over_instances(
                dataset=copy.deepcopy(dataset),
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                k=k,
                distance_metric='default',
                max_iters=20,
                n_inits=10)
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)

        DataViz.plot_xy(x=[k_values],
                        y=[silhouette_values],
                        xlabel='k',
                        ylabel='silhouette score',
                        ylim=[0, 1],
                        line_styles=['b-'])

        # Run k medoids with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Medoids silhouette score: k = {k}')

        dataset_kmed = clusteringNH.k_medoids_over_instances(
            dataset=copy.deepcopy(dataset),
            cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            k=k,
            distance_metric='default',
            max_iters=20,
            n_inits=50)
        DataViz.plot_clusters_3d(
            data_table=dataset_kmed,
            data_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            cluster_col='cluster',
            label_cols=['label'])
        DataViz.plot_silhouette(data_table=dataset_kmed,
                                cluster_col='cluster',
                                silhouette_col='silhouette')
        util.print_latex_statistics_clusters(
            dataset=dataset_kmed,
            cluster_col='cluster',
            input_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            label_col='label')

    # Run hierarchical clustering
    if FLAGS.mode == 'agglomerative':
        k_values = range(2, 10)
        silhouette_values = []

        # Do some initial runs to determine the right number for the maximum number of clusters
        print('Running agglomerative clustering')
        for k in k_values:
            print(f'k = {k}')
            dataset_cluster, link = clusteringH.agglomerative_over_instances(
                dataset=dataset,
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                max_clusters=k,
                distance_metric='euclidean',
                use_prev_linkage=True,
                link_function='ward')
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)
            if k == k_values[0]:
                DataViz.plot_dendrogram(dataset_cluster, link)

        # Plot the clustering results
        DataViz.plot_xy(x=[k_values],
                        y=[silhouette_values],
                        xlabel='k',
                        ylabel='silhouette score',
                        ylim=[0, 1],
                        line_styles=['b-'])

    if FLAGS.mode == 'final':
        # Select the outcome dataset of the k-means clustering
        clusteringNH = NonHierarchicalClustering()
        dataset = clusteringNH.k_means_over_instances(
            dataset=dataset,
            cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            k=FLAGS.k,
            distance_metric='default',
            max_iters=50,
            n_inits=50)
        # Plot the results
        DataViz.plot_clusters_3d(dataset,
                                 ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                                 'cluster', ['label'])
        DataViz.plot_silhouette(dataset, 'cluster', 'silhouette')
        # Print table statistics
        util.print_latex_statistics_clusters(
            dataset, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            'label')
        del dataset['silhouette']

        # Store the final dataset
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
Example #29
from Chapter7.Evaluation import ClassificationEvaluation
from Chapter7.Evaluation import RegressionEvaluation
from Chapter7.FeatureSelection import FeatureSelectionClassification
from Chapter7.FeatureSelection import FeatureSelectionRegression
import copy
import pandas as pd
from util import util
import matplotlib.pyplot as plot
import numpy as np
from sklearn.model_selection import train_test_split
import os


# Of course we repeat some stuff from Chapter 3, namely to load the dataset

DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.

dataset_path = './intermediate_datafiles/'
export_tree_path = 'Example_graphs/Chapter7/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

if not os.path.exists(export_tree_path):
    os.makedirs(export_tree_path)
Example #30
from Chapter8.LearningAlgorithmsTemporal import TemporalClassificationAlgorithms
from Chapter8.LearningAlgorithmsTemporal import TemporalRegressionAlgorithms
from statsmodels.tsa.stattools import adfuller
from pandas.plotting import autocorrelation_plot

import copy
import pandas as pd
from util import util
import matplotlib.pyplot as plot
import numpy as np
from sklearn.model_selection import train_test_split


# Of course we repeat some stuff from Chapter 3, namely to load the dataset

DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us consider our second task, namely the prediction of the heart rate. We consider this as a temporal task.

prepare = PrepareDatasetForLearning()
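
# The adfuller and autocorrelation_plot imports above suggest a stationarity
# check on the target series before the temporal modelling. A minimal sketch,
# assuming the hr_watch_rate column used elsewhere on this page:
hr = dataset['hr_watch_rate'].dropna()
adf_stat, p_value = adfuller(hr)[0:2]
print('ADF statistic:', adf_stat, 'p-value:', p_value)
autocorrelation_plot(hr)
plot.show()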