def main():
    # Set up the file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = 'chapter2_result.csv'
    RESULT_FNAME = 'chapter3_heart_rate.csv'

    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise e

    DataViz = VisualizeDataset(__file__)

    # Original heart rate values
    # DataViz.plot_imputed_values(dataset, ['original'], 'hr_watch_rate')

    Kalman = KalmanFilters()
    dataset = Kalman.apply_kalman_filter(dataset, 'hr_watch_rate')
    # print(dataset.head())
    # print(dataset.index)

    DataViz.plot_dataset(dataset, ['hr_watch_rate', 'hr_watch_rate_kalman'],
                         ['exact', 'exact'], ['line', 'line'])
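# For reference, a minimal sketch of the smoothing a Kalman filter applies to a
# single noisy column. This assumes the pykalman package and illustrates the
# idea only; it is not the actual implementation of the KalmanFilters class:
from pykalman import KalmanFilter
import numpy as np

values = np.ma.masked_invalid(dataset['hr_watch_rate'].values)  # mask NaNs as missing observations
kf = KalmanFilter(transition_matrices=[[1]], observation_matrices=[[1]])
kf = kf.em(values, n_iter=5)     # estimate transition/observation noise from the data
smoothed, _ = kf.smooth(values)  # smoothed state means per time step
dataset['hr_watch_rate_kalman_sketch'] = smoothed[:, 0]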
    def __init__(self, data_path, data_file):
        self.dataset = pd.read_csv(Path(data_path / data_file), index_col=0)
        self.dataset.index = pd.to_datetime(self.dataset.index)
        self.DataViz = VisualizeDataset(__file__, show=False)
        self.outlier_columns = ['acc_phone_x', 'light_phone_lux']
        self.OutlierDistr = DistributionBasedOutlierDetection()
        self.OutlierDist = DistanceBasedOutlierDetection()
        self.original_columns = self.dataset.columns
        self.num_outliers = {'acc_phone_x': 0, 'light_phone_lux': 0}
def main():
    dataset_path = './intermediate_datafiles/'
    dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    DataViz = VisualizeDataset()
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    dataset.index = pd.to_datetime(dataset.index)

    # input() returns a string, so we compare against string literals below.
    start = input("choose method: [1],[2],[3],[4]")

    if start == '1':
        c = float(input("Chauvenet\ninput parameters: c"))
        for col in outlier_columns:
            dataset = OutlierDistr.chauvenet(dataset, col, c)
            DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
    elif start == '2':
        components, iter = input("Mixture model\n input parameters: components, iter").split(',')
        components = int(components)
        iter = int(iter)
        for col in outlier_columns:
            dataset = OutlierDistr.mixture_model(dataset, col, components, iter)
            DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                                 ['exact', 'exact'], ['line', 'points'])
    elif start == '3':
        d_min, f_min = input("Simple distance-based\n input parameters: d_min, f_min").split()
        d_min = float(d_min)
        f_min = float(f_min)
        for col in outlier_columns:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', d_min, f_min)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
    elif start == '4':
        k = int(input("Local outlier factor\n input parameters: k"))
        for col in outlier_columns:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', k)
            DataViz.plot_dataset(dataset, [col, 'lof'],
                                 ['exact', 'exact'], ['line', 'points'])
    else:
        print("no method selected")
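# A minimal sketch of Chauvenet's criterion itself, assuming the column is
# roughly normally distributed. The helper below is hypothetical and only
# illustrates the idea behind chauvenet(); it is not the class's actual code:
import numpy as np
from scipy import stats

def chauvenet_mask(values, c=2):
    """Mark a value as an outlier when the expected number of observations
    at least this extreme is below 1/c (classical Chauvenet uses c=2)."""
    values = np.asarray(values, dtype=float)
    n = len(values)
    mean, std = np.nanmean(values), np.nanstd(values)
    # Two-sided tail probability of a deviation at least this large.
    prob = 2 * (1 - stats.norm.cdf(np.abs(values - mean) / std))
    return prob * n < 1.0 / c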
def main():
    DataViz = VisualizeDataset()
    dataset_path = './intermediate_datafiles/'

    try:
        dataset = pd.read_csv(dataset_path + 'chapter3_result_final.csv', index_col=0)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    dataset.index = pd.to_datetime(dataset.index)
    milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

    # Now we move to the frequency domain, with the same window size.
    FreqAbs = FourierTransformation()
    fs = float(1000) / milliseconds_per_instance
    ws = int(float(10000) / milliseconds_per_instance)

    periodic_predictor_cols = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
        'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
        'mag_watch_y', 'mag_watch_z'
    ]

    data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset), ['acc_phone_x'], ws, fs)

    # Spectral analysis.
    DataViz.plot_dataset(data_table, [
        'acc_phone_x_max_freq', 'acc_phone_x_freq_weighted', 'acc_phone_x_pse', 'label'
    ], ['like', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

    dataset = FreqAbs.abstract_frequency(dataset, periodic_predictor_cols, ws, fs)

    # Now we only take a certain percentage of overlap in the windows, otherwise
    # our training examples will be too much alike.

    # The percentage of overlap we allow:
    window_overlap = 0.9
    skip_points = int((1 - window_overlap) * ws)
    dataset = dataset.iloc[::skip_points, :]

    DataViz.plot_dataset(
        dataset, [
            'acc_phone_x', 'gyr_phone_x', 'hr_watch_rate', 'light_phone_lux',
            'mag_phone_x', 'press_phone_', 'pca_1', 'label'
        ], ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
        ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'])
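# A minimal sketch of the per-window frequency features on a toy signal, using
# a real FFT. This illustrates the idea behind abstract_frequency() (maximum
# frequency, frequency-weighted average, power spectral entropy); it is not the
# class's actual code:
import numpy as np

fs = 10.0                                                 # toy sampling rate in Hz
window = np.sin(2 * np.pi * 2.0 * np.arange(40) / fs)     # toy 2 Hz signal, 40 samples
amplitudes = np.abs(np.fft.rfft(window))
freqs = np.fft.rfftfreq(len(window), d=1.0 / fs)
max_freq = freqs[np.argmax(amplitudes[1:]) + 1]           # skip the DC component
freq_weighted = np.sum(freqs * amplitudes) / np.sum(amplitudes)
psd = amplitudes ** 2 / len(window)
p = psd / np.sum(psd)
pse = -np.sum(p * np.log(p + 1e-12))                      # power spectral entropy
print(max_freq, freq_weighted, pse)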
from util.VisualizeDataset import VisualizeDataset
from Chapter4.TemporalAbstraction import NumericalAbstraction
from Chapter4.TemporalAbstraction import CategoricalAbstraction
from Chapter4.FrequencyAbstraction import FourierTransformation
from Chapter4.TextAbstraction import TextAbstraction

import pickle
from Load import *

# As usual, we set our program constants, read the input file and initialize a visualization object.
# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset = pd.read_csv(cluster_watch_data, index_col=time_col)
dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__, show=False)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.
print('attributes time domain')

# First we focus on the time domain.
# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
window_sizes = [
    int(float(5000) / milliseconds_per_instance),
    int(float(0.5 * 60000) / milliseconds_per_instance),
    int(float(5 * 60000) / milliseconds_per_instance)
]
##############################################################
from util.VisualizeDataset import VisualizeDataset
from Chapter3.OutlierDetection import DistributionBasedOutlierDetection
from Chapter3.OutlierDetection import DistanceBasedOutlierDetection

import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pylab

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter2_final2hz.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# fig, ax = plt.subplots(5, 3)
# i = 0
# j = 0
from Chapter7.LearningAlgorithms import ClassificationAlgorithms
from Chapter7.LearningAlgorithms import RegressionAlgorithms
from Chapter7.Evaluation import ClassificationEvaluation
from Chapter7.Evaluation import RegressionEvaluation
from Chapter7.FeatureSelection import FeatureSelectionClassification
from Chapter7.FeatureSelection import FeatureSelectionRegression

import copy
import pandas as pd
from util import util
import matplotlib.pyplot as plot
import numpy as np
from sklearn.model_selection import train_test_split
import os

# Of course we repeat some stuff from Chapter 3, namely to load the dataset
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles_ass3/'
export_tree_path = 'Example_graphs/ass3/'

try:
    dataset = pd.read_csv(dataset_path + 'clustering_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous scripts first!')
    raise e

if not os.path.exists(export_tree_path):
    os.makedirs(export_tree_path)

dataset.index = pd.to_datetime(dataset.index)

# Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task.
from util.VisualizeDataset import VisualizeDataset
from Chapter5.DistanceMetrics import InstanceDistanceMetrics
from Chapter5.DistanceMetrics import PersonDistanceMetricsNoOrdering
from Chapter5.DistanceMetrics import PersonDistanceMetricsOrdering
from Chapter5.Clustering import NonHierarchicalClustering
from Chapter5.Clustering import HierarchicalClustering

import copy
import pandas as pd
import matplotlib.pyplot as plot
import util.util as util

# Of course we repeat some stuff from Chapter 3, namely to load the dataset
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter4_our_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# First let us use non-hierarchical clustering.
clusteringNH = NonHierarchicalClustering()
from Chapter7.Evaluation import RegressionEvaluation
from Chapter8.LearningAlgorithmsTemporal import TemporalClassificationAlgorithms
from Chapter8.LearningAlgorithmsTemporal import TemporalRegressionAlgorithms
from statsmodels.tsa.stattools import adfuller
from pandas.plotting import autocorrelation_plot

import copy
import pandas as pd
from util import util
import matplotlib.pyplot as plot
import numpy as np
from sklearn.model_selection import train_test_split

# Of course we repeat some stuff from Chapter 3, namely to load the dataset
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_our_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us consider our second task, namely the prediction of the Azimuth. We consider this as a temporal task.
RESULT_FNAME = 'chapter7_classification_result.csv'
EXPORT_TREE_PATH = Path('./figures/crowdsignals_ch7_classification/')

# Next, we declare the parameters we'll use in the algorithms.
N_FORWARD_SELECTION = 15

try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)
dataset = dataset.sample(n=8000)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__, show=False)

# datetime object containing current date and time
now1 = datetime.now()
# dd/mm/YY H:M:S
dt_string = now1.strftime("%d/%m/%Y %H:%M:%S")
print("date and time =", dt_string)
diff = now1 - begin
print('difference time', diff)

# Let us consider our first task, namely the prediction of the label. We consider this as a
# non-temporal task. We create a single column with the categorical attribute representing our
# class. Furthermore, we use 70% of our data for training and the remaining 30% as an independent
# test set. We select the sets based on stratified sampling. We remove cases where we do not know the label.
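# A minimal sketch of such a stratified 70/30 split with scikit-learn. The book
# uses its own PrepareDatasetForLearning class for this; the single 'class'
# column below is a hypothetical name used only for illustration:
from sklearn.model_selection import train_test_split

labeled = dataset.dropna(subset=['class'])  # drop cases without a known label
train_X, test_X, train_y, test_y = train_test_split(
    labeled.drop(columns=['class']), labeled['class'],
    test_size=0.3, stratify=labeled['class'], random_state=0)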
# Read the result from the previous chapter, and make sure the index is of the type datetime.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter3_result_final.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter4_result.csv'

try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.
print('attributes time domain')

# First we focus on the time domain.
# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
window_sizes = [int(float(5000) / milliseconds_per_instance),
                int(float(0.5 * 60000) / milliseconds_per_instance),
                int(float(5 * 60000) / milliseconds_per_instance)]
print('total window sizes', window_sizes)
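# A minimal sketch of the time-domain aggregation these window sizes feed into,
# using pandas' rolling windows. NumericalAbstraction computes similar
# statistics (its columns are named like acc_phone_x_temp_mean_ws_120); this is
# an illustration, not its actual code:
ws = window_sizes[0]
for col in ['acc_phone_x']:  # any numerical column works here
    dataset[f'{col}_temp_mean_ws_{ws}'] = dataset[col].rolling(window=ws).mean()
    dataset[f'{col}_temp_std_ws_{ws}'] = dataset[col].rolling(window=ws).std()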
DataSet.add_numerical_dataset('magnetometer_phone.csv', 'timestamps', ['x', 'y', 'z'], 'avg', 'mag_phone_')
DataSet.add_numerical_dataset('magnetometer_smartwatch.csv', 'timestamps', ['x', 'y', 'z'], 'avg', 'mag_watch_')

# We add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
DataSet.add_numerical_dataset('pressure_phone.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')

# Get the resulting pandas data table
dataset = DataSet.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset, [
    'acc_phone_x', 'acc_phone_y', 'acc_phone_z',
    'acc_watch_x', 'acc_watch_y', 'acc_watch_z'
])

# Plot all data (one match entry and one display style per column)
DataViz.plot_dataset(
    dataset,
    ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_', 'label'],
    ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
    ['line', 'line', 'line', 'line', 'line', 'line', 'points'])
# Set up the file names and locations.
DATA_PATH = Path('./intermediate_datafiles/our_data')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter3_result_outliers.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_final.csv'
ORIG_DATASET_FNAME = sys.argv[3] if len(sys.argv) > 3 else 'chapter2_result.csv'

# Next, import the data from the specified location and parse the date index.
try:
    dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
    dataset.index = pd.to_datetime(dataset.index)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# We'll create an instance of our visualization class to plot the results.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Let us impute the missing values and plot an example.
MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate',
#                             imputed_mean_dataset['hr_watch_rate'],
#                             imputed_interpolation_dataset['hr_watch_rate'])

# Now, let us carry out that operation over all columns except for the label.
for col in [c for c in dataset.columns if not 'label' in c]:
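    # A minimal sketch of interpolation-based imputation in plain pandas
    # (impute_interpolate likely behaves along these lines; this body is an
    # illustration, not the class's actual code):
    dataset[col] = dataset[col].interpolate()    # linear interpolation between known points
    dataset[col] = dataset[col].ffill().bfill()  # fill any leading/trailing gaps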
def main():
    # Set a granularity (the discrete step size of our time series data) and choose if all resulting
    # datasets should be saved. A coarse-grained granularity of one instance per minute, and a
    # fine-grained one with four instances per second are used.
    GRANULARITIES = [60000, 250]
    SAVE_VERSIONS = False

    # We can call Path.mkdir(exist_ok=True) to make any required directories if they don't already exist.
    [path.mkdir(exist_ok=True, parents=True) for path in [DATASET_PATH, RESULT_PATH]]

    # Create object to visualize the data and save figures
    DataViz = VisualizeDataset(module_path=__file__)

    datasets = []
    for milliseconds_per_instance in GRANULARITIES:
        print(f'Creating numerical datasets from files in {DATASET_PATH} using granularity {milliseconds_per_instance}.')

        # Create an initial dataset object with the base directory for our data and a granularity
        # and add selected measurements to it
        data_engineer = CreateDataset(base_dir=DATASET_PATH, granularity=milliseconds_per_instance)

        # Add the accelerometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values
        data_engineer.add_numerical_dataset(file='accelerometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_phone_')
        data_engineer.add_numerical_dataset(file='accelerometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_watch_')

        # Add the gyroscope data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values
        data_engineer.add_numerical_dataset(file='gyroscope_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_phone_')
        data_engineer.add_numerical_dataset(file='gyroscope_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_watch_')

        # Add the heart rate (continuous numerical measurements) and aggregate by averaging the values
        data_engineer.add_numerical_dataset(file='heart_rate_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['rate'], aggregation='avg', prefix='hr_watch_')

        # Add the labels provided by the users as binary attributes (i.e. add a one to the attribute
        # representing the specific value for a label if it occurs within an interval). These are
        # categorical events that might overlap.
        data_engineer.add_event_dataset(file='labels.csv', start_timestamp_col='label_start',
                                        end_timestamp_col='label_end', value_col='label', aggregation='binary')

        # Add the amount of light sensed by the phone (continuous numerical measurements) and aggregate by averaging
        data_engineer.add_numerical_dataset(file='light_phone.csv', timestamp_col='timestamps',
                                            value_cols=['lux'], aggregation='avg', prefix='light_phone_')

        # Add the magnetometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values
        data_engineer.add_numerical_dataset(file='magnetometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_phone_')
        data_engineer.add_numerical_dataset(file='magnetometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_watch_')

        # Add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
        data_engineer.add_numerical_dataset(file='pressure_phone.csv', timestamp_col='timestamps',
                                            value_cols=['pressure'], aggregation='avg', prefix='press_phone_')

        # Get the resulting pandas data table
        dataset = data_engineer.data_table

        # Create boxplots
        DataViz.plot_dataset_boxplot(dataset=dataset, cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z',
                                                            'acc_watch_x', 'acc_watch_y', 'acc_watch_z'])

        # Plot all data (one match entry and one display style per column)
        DataViz.plot_dataset(data_table=dataset,
                             columns=['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
                                      'press_phone_', 'label'],
                             match=['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                             display=['line', 'line', 'line', 'line', 'line', 'line', 'points'])

        # Print a summary of the dataset
        util.print_statistics(dataset=dataset)
        datasets.append(copy.deepcopy(dataset))

        # Save the various versions of the created datasets with logical filenames if needed
        if SAVE_VERSIONS:
            dataset.to_csv(RESULT_PATH / f'chapter2_result_{milliseconds_per_instance}')

    # Make a table like the one shown in the book, comparing the two datasets produced
    util.print_latex_table_statistics_two_datasets(dataset1=datasets[0], dataset2=datasets[1])

    # Finally, store the last dataset we generated (250 ms)
    dataset.to_csv(RESULT_PATH / RESULT_FNAME)
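# A minimal sketch of what this granularity-based aggregation boils down to in
# plain pandas: resample raw sensor rows onto a fixed time grid and average.
# CreateDataset builds its table along these lines; this assumes a 'timestamps'
# column in nanoseconds and is an illustration, not its actual code:
import pandas as pd

raw = pd.read_csv('accelerometer_phone.csv')
raw.index = pd.to_datetime(raw['timestamps'])            # nanosecond epoch timestamps
binned = raw[['x', 'y', 'z']].resample('250ms').mean()   # one instance per 250 ms, averaged
binned.columns = ['acc_phone_' + c for c in binned.columns]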
DataSetOwn.add_numerical_dataset('mag_custom.csv', 'timestamps', ['x', 'y', 'z'], 'avg', 'mag_phone_')
DataSetOwn.add_numerical_dataset('press_custom.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')
# DataSetCS.add_numerical_dataset('magnetometer_phone.csv', 'timestamps', ['x', 'y', 'z'], 'avg', 'mag_phone_')
# DataSetCS.add_numerical_dataset('magnetometer_smartwatch.csv', 'timestamps', ['x', 'y', 'z'], 'avg', 'mag_watch_')

# We add the pedometer data sensed by the phone (continuous numerical measurements) and aggregate by averaging again
DataSetOwn.add_numerical_dataset('pedom_custom.csv', 'timestamps', ['steps', 'distance'], 'avg', 'pedom_phone_')
# DataSetCS.add_numerical_dataset('pressure_phone.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')

# Get the resulting pandas data table
dataset_own = DataSetOwn.data_table
# dataset_cs = DataSetCS.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset_own, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'])
# DataViz.plot_dataset_boxplot(dataset_cs, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'])

# Plot all data
DataViz.plot_dataset(dataset_own,
                     ['acc_', 'gyr_', 'mag_', 'press_', 'pedom_phone_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points', 'points'])
# DataViz.plot_dataset(dataset_cs, ['acc_phone', 'gyr_phone', 'mag_phone', 'press_phone_', 'label'],
#                      ['like', 'like', 'like', 'like', 'like'],
#                      ['line', 'line', 'line', 'line', 'points'])

# And print a summary of the dataset
util.print_statistics(dataset_own)
datasets_own.append(copy.deepcopy(dataset_own))
# util.print_statistics(dataset_cs)
# Machine Learning for the Quantified Self
# Springer
# Chapter 4
##############################################################

from util.VisualizeDataset import VisualizeDataset
from Chapter4.TemporalAbstraction import NumericalAbstraction
from Chapter4.TemporalAbstraction import CategoricalAbstraction
from Chapter4.FrequencyAbstraction import FourierTransformation
from Chapter4.TextAbstraction import TextAbstraction

import copy
import pandas as pd

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter3_our_result_final.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.
# Read the result from the previous chapter, and make sure the index is of the type datetime.
DATA_PATH = Path('./intermediate_datafiles/our_data')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter3_result_final.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter4_result.csv'

try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.
# First we focus on the time domain.

# Set the window sizes to the number of instances representing 1 second, 5 seconds and 18 seconds
window_sizes = [
    int(float(1000) / milliseconds_per_instance),
    int(float(5000) / milliseconds_per_instance),
    int(float(0.3 * 60000) / milliseconds_per_instance)
]
# Set up the file names and locations.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter3_result_outliers.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_final.csv'
ORIG_DATASET_FNAME = sys.argv[3] if len(sys.argv) > 3 else 'chapter2_result.csv'

# Next, import the data from the specified location and parse the date index.
try:
    dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
    dataset.index = pd.to_datetime(dataset.index)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# We'll create an instance of our visualization class to plot the results.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Let us impute the missing values and plot an example.
MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate',
#                             imputed_mean_dataset['hr_watch_rate'],
#                             imputed_interpolation_dataset['hr_watch_rate'])

# Now, let us carry out that operation over all columns except for the label.
for col in [c for c in dataset.columns if not 'label' in c]:
                              'horizontalAccuracy', 'verticalAccuracy'
                          ], 'avg', 'loc_mobile_')  # unit: ()

# TODO add the other datasets

# We add the labels provided by the users. These are categorical events that might overlap. We add them
# as binary attributes (i.e. add a one to the attribute representing the specific value for the label if it
# occurs within an interval).
dataset.add_event_dataset('labels_phone.csv', 'label_start', 'label_end', 'label', 'binary')

# Get the resulting pandas data table
dataset = dataset.data_table

# Plot the data
DataViz = VisualizeDataset(__file__)

# Boxplot
DataViz.plot_dataset_boxplot(dataset, ['acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z'])
# DataViz.plot_dataset_boxplot(dataset, ['gyr_mobile_x', 'gyr_mobile_y', 'gyr_mobile_z'])

# Plot all data
# DataViz.plot_dataset(dataset, ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_', 'label'],
#                      ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
#                      ['line', 'line', 'line', 'line', 'line', 'line', 'points', 'points'])
DataViz.plot_dataset(dataset, [
    'acc_mobile_', 'gyr_mobile_', 'mag_mobile_', 'prox_mobile_distance',
    'loc_mobile_', 'label'
], ['like', 'like', 'like', 'like', 'like', 'like'],
# Chapter 3
##############################################################

from util.VisualizeDataset import VisualizeDataset
from Chapter3.DataTransformation import LowPassFilter
from Chapter3.DataTransformation import PrincipalComponentAnalysis
from Chapter3.ImputationMissingValues import ImputationMissingValues
from Chapter3.KalmanFilters import KalmanFilters

import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
dataset = pd.read_csv(dataset_path + 'chapter3_result_outliers.csv', index_col=0)
dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 2: Let us impute the missing values.
MisVal = ImputationMissingValues()
imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset),
def main():
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of the visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the second task, namely the prediction of the heart rate. Therefore create a dataset
    # with the heart rate as target and split using timestamps, because this is considered a temporal task.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = RegressionAlgorithms()
    evaluation = RegressionEvaluation()

    train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
        dataset, 'hr_watch_rate', '2016-02-08 18:28:56', '2016-02-08 19:34:07', '2016-02-08 20:07:50')

    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('- - - Selecting subsets - - -')
    basic_features = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z',
                      'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
                      'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z',
                      'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
                      'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking',
                      'labelStanding', 'labelDriving', 'labelEating', 'labelRunning',
                      'light_phone_lux', 'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
                      'mag_watch_x', 'mag_watch_y', 'mag_watch_z', 'press_phone_pressure']
    pca_features = ['pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7']
    time_features = [name for name in dataset.columns if ('temp_' in name and 'hr_watch' not in name)]
    freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3, time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4, cluster_features))

    if FLAGS.mode == 'selection' or FLAGS.mode == 'all':
        # First, consider the Pearson correlations and see whether features can be selected based on them
        fs = FeatureSelectionRegression()
        print('\n- - - Running feature selection - - -')
        features, correlations = fs.pearson_selection(10, train_X[features_after_chapter_5], train_y)
        util.print_pearson_correlations(correlations)

    # Select the 10 features with the highest correlation
    selected_features = ['temp_pattern_labelOnTable', 'labelOnTable',
                         'temp_pattern_labelOnTable(b)labelOnTable',
                         'pca_2_temp_mean_ws_120', 'pca_1_temp_mean_ws_120',
                         'acc_watch_y_temp_mean_ws_120', 'pca_2',
                         'acc_phone_z_temp_mean_ws_120', 'gyr_watch_y_pse', 'gyr_watch_x_pse']

    possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4,
                             features_after_chapter_5, selected_features]
    feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']

    if FLAGS.mode == 'overall' or FLAGS.mode == 'all':
        print('\n- - - Running test of all different regression algorithms - - -')
        # First study the importance of the parameter settings.
        # Therefore repeat the experiment a number of times to get a bit more robust data,
        # as the initialization of e.g. the NN is random.
        REPEATS = FLAGS.repeats

        scores_over_all_algs = []

        for i in range(0, len(possible_feature_sets)):
            selected_train_X = train_X[possible_feature_sets[i]]
            selected_test_X = test_X[possible_feature_sets[i]]

            performance_tr_nn, performance_tr_nn_std = 0, 0
            performance_tr_rf, performance_tr_rf_std = 0, 0
            performance_te_nn, performance_te_nn_std = 0, 0
            performance_te_rf, performance_te_rf_std = 0, 0

            # First run non-deterministic classifiers a number of times to average their score
            for repeat in range(0, REPEATS):
                print(f'Training NeuralNetwork run {repeat + 1}/{REPEATS} ... ')
                regr_train_y, regr_test_y = learner.\
                    feedforward_neural_network(selected_train_X, train_y, selected_test_X, gridsearch=True)
                mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y)
                mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y)
                performance_tr_nn += mean_tr
                performance_tr_nn_std += std_tr
                performance_te_nn += mean_te
                performance_te_nn_std += std_te

                print(f'Training RandomForest run {repeat + 1}/{REPEATS} ... ')
                regr_train_y, regr_test_y = learner.random_forest(selected_train_X, train_y,
                                                                  selected_test_X, gridsearch=True)
                mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y)
                mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y)
                performance_tr_rf += mean_tr
                performance_tr_rf_std += std_tr
                performance_te_rf += mean_te
                performance_te_rf_std += std_te

            overall_performance_tr_nn = performance_tr_nn / REPEATS
            overall_performance_tr_nn_std = performance_tr_nn_std / REPEATS
            overall_performance_te_nn = performance_te_nn / REPEATS
            overall_performance_te_nn_std = performance_te_nn_std / REPEATS
            overall_performance_tr_rf = performance_tr_rf / REPEATS
            overall_performance_tr_rf_std = performance_tr_rf_std / REPEATS
            overall_performance_te_rf = performance_te_rf / REPEATS
            overall_performance_te_rf_std = performance_te_rf_std / REPEATS

            # Run deterministic algorithms:
            print("Support Vector Regressor run 1/1 ... ")
            # Convergence of the SVR does not always occur (even adjusting tolerance and iterations does not help)
            regr_train_y, regr_test_y = learner.\
                support_vector_regression_without_kernel(selected_train_X, train_y, selected_test_X,
                                                         gridsearch=False)
            mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y)
            mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y)
            performance_tr_svm = mean_tr
            performance_tr_svm_std = std_tr
            performance_te_svm = mean_te
            performance_te_svm_std = std_te

            print("Training Nearest Neighbor run 1/1 ... ")
            regr_train_y, regr_test_y = learner.k_nearest_neighbor(selected_train_X, train_y,
                                                                   selected_test_X, gridsearch=True)
            mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y)
            mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y)
            performance_tr_knn = mean_tr
            performance_tr_knn_std = std_tr
            performance_te_knn = mean_te
            performance_te_knn_std = std_te

            print("Training Decision Tree run 1/1 ... ")
") regr_train_y, regr_test_y = learner.\ decision_tree(selected_train_X, train_y, selected_test_X, gridsearch=True, export_tree_path=EXPORT_TREE_PATH) mean_tr, std_tr = evaluation.mean_squared_error_with_std(train_y, regr_train_y) mean_te, std_te = evaluation.mean_squared_error_with_std(test_y, regr_test_y) performance_tr_dt = mean_tr performance_tr_dt_std = std_tr performance_te_dt = mean_te performance_te_dt_std = std_te scores_with_sd = [(overall_performance_tr_nn, overall_performance_tr_nn_std, overall_performance_te_nn, overall_performance_te_nn_std), (overall_performance_tr_rf, overall_performance_tr_rf_std, overall_performance_te_rf, overall_performance_te_rf_std), (performance_tr_svm, performance_tr_svm_std, performance_te_svm, performance_te_svm_std), (performance_tr_knn, performance_tr_knn_std, performance_te_knn, performance_te_knn_std), (performance_tr_dt, performance_tr_dt_std, performance_te_dt, performance_te_dt_std)] util.print_table_row_performances_regression(feature_names[i], scores_with_sd) scores_over_all_algs.append(scores_with_sd) # Plot the results DataViz.plot_performances_regression(['NN', 'RF', 'SVM', 'KNN', 'DT'], feature_names, scores_over_all_algs) if FLAGS.mode == 'detail' or FLAGS.mode == 'all': print('\n- - - Running visualization of results - - -') regr_train_y, regr_test_y = learner.random_forest(train_X[features_after_chapter_5], train_y, test_X[features_after_chapter_5], gridsearch=False, print_model_details=True) DataViz.plot_numerical_prediction_versus_real(train_X.index, train_y, regr_train_y, test_X.index, test_y, regr_test_y, 'heart rate')
# Read the result from the previous chapter, and make sure the index is of the type datetime.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter3_result_final.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter4_result.csv'

try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us create our visualization class again.
DataViz = VisualizeDataset(__file__)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.
# First we focus on the time domain.

# OLD: Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
# NEW: Set the window sizes to the number of instances representing 1 second, 4 seconds and 8 seconds
window_sizes = [
    int(float(1000) / milliseconds_per_instance),
    int(float(4000) / milliseconds_per_instance),
    int(float(8000) / milliseconds_per_instance)
]
# We add the labels provided by the users. These are categorical events that might overlap. We add them
# as binary attributes (i.e. add a one to the attribute representing the specific value for the label if it
# occurs within an interval).
DataSet.add_binary_labels_dataset('A01_parsed_raw_data.csv', 'timestamp',
                                  ['labelWalking', 'labelFalling', 'labelLyingDown', 'labelLying',
                                   'labelSittingDown', 'labelSitting', 'labelStandingFromLying',
                                   'labelOnAllFours', 'labelSittingOnTheGround',
                                   'labelStandingFromSitting', 'labelStandingFromSittingOnTheGround'],
                                  'max', '')

# Get the resulting pandas data table
dataset = DataSet.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset, ['ankle_l_x', 'ankle_l_y', 'ankle_l_z',
                                       'ankle_r_x', 'ankle_r_y', 'ankle_r_z',
                                       'belt_x', 'belt_y', 'belt_z',
                                       'chest_x', 'chest_y', 'chest_z'])

# Plot all data
DataViz.plot_dataset(dataset, ['ankle_l_', 'ankle_r_', 'belt_', 'chest_', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])

# And print a summary of the dataset
util.print_statistics(dataset)
datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book
# Machine Learning for the Quantified Self
# Springer
# Chapter 4
##############################################################

from util.VisualizeDataset import VisualizeDataset
from Chapter4.TemporalAbstraction import NumericalAbstraction
from Chapter4.TemporalAbstraction import CategoricalAbstraction
from Chapter4.FrequencyAbstraction import FourierTransformation
from Chapter4.TextAbstraction import TextAbstraction

import copy
import pandas as pd

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles-own/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter3_result_final-own.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000
# Mark Hoogendoorn and Burkhardt Funk (2017)
# Machine Learning for the Quantified Self
# Springer
# Chapter 3
##############################################################

from util.VisualizeDataset import VisualizeDataset
from Chapter3.OutlierDetection import DistributionBasedOutlierDetection
from Chapter3.OutlierDetection import DistanceBasedOutlierDetection

import copy
import pandas as pd
import numpy as np

# Let us create our visualization class again.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles-own/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter2_result-own.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 1: Let us see whether we have some outliers we would prefer to remove.
try:
    dataset = pickle.load(open('datasets/dataframes/concat_df_imputed_gyro.pkl', 'rb'))
    if not all_data:
        print('subset data')
        dataset = dataset[:14780]
    dataset.index = pd.to_datetime(dataset.index)
    if all_data:
        print('all data')
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

print(dataset.columns)

DataViz = VisualizeDataset(__file__, show=False)

# We'll start by applying non-hierarchical clustering.
clusteringNH = NonHierarchicalClustering()

# Let us look at k-means first.
k_values = range(2, 10)
silhouette_values = []
attributes_to_cluster = [
    'Acceleration x (m/s^2)', 'Acceleration y (m/s^2)', 'Acceleration z (m/s^2)',
    'Gyroscope x (rad/s)', 'Gyroscope y (rad/s)', 'Gyroscope z (rad/s)'
]

# Do some initial runs to determine the right number for k
print('===== kmeans clustering =====')
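# A minimal sketch of the silhouette-based choice of k using scikit-learn
# directly (the book's k_means_over_instances wraps the same idea; this is an
# illustration, not its actual code):
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = dataset[attributes_to_cluster].dropna()
scores = {}
for k in k_values:
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    scores[k] = silhouette_score(X, labels)
best_k = max(scores, key=scores.get)  # pick the k with the highest silhouette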
def main():
    # Set up file names and locations.
    for user in range(1, 34):
        DATA_PATH = Path('./intermediate_datafiles/')
        DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'AS14_' + "{:02d}".format(user) + '/chapter2_result.csv'
        RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_outliers.csv'

        # Next, import the data from the specified location and parse the date index.
        try:
            dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
            dataset.index = pd.to_datetime(dataset.index)
        except IOError as e:
            print('File not found, try to run the preceding crowdsignals scripts first!')
            raise e

        # We'll create an instance of our visualization class to plot the results.
        DataViz = VisualizeDataset(__file__)

        # Compute the number of milliseconds covered by an instance using the first two rows.
        milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

        # Step 1: Let us see whether we have some outliers we would prefer to remove.

        # Determine the columns we want to experiment on.
        outlier_columns = [
            'actvalue', 'builtvalue', 'commvalue', 'entvalue', 'accvalue',
            'offvalue', 'othervalue', 'socialvalue', 'travelvalue', 'unkvalue',
            'utilvalue', 'callvalue', 'arovalue', 'valvalue', 'scrvalue', 'smsvalue'
        ]

        # Create the outlier classes.
        OutlierDistr = DistributionBasedOutlierDetection()
        OutlierDist = DistanceBasedOutlierDetection()

        # And investigate the approaches for all relevant attributes.
        for col in outlier_columns:
            print(f"Applying outlier criteria for column {col}")

            # And try out all different approaches. Note that we have done some optimization
            # of the parameter values for each of the approaches by visual inspection.
            dataset = OutlierDistr.chauvenet(dataset, col)
            # DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
            dataset = OutlierDistr.mixture_model(dataset, col)
            # DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact', 'exact'], ['line', 'points'])

            # This requires:
            # n_data_points * n_data_points * point_size =
            # 31839 * 31839 * 32 bits = ~4GB available memory
            try:
                dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
                # DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
            except MemoryError:
                print('Not enough memory available for simple distance-based outlier detection...')
                print('Skipping.')

            # try:
            #     dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
            #     DataViz.plot_dataset(dataset, [col, 'lof'], ['exact', 'exact'], ['line', 'points'])
            # except MemoryError:
            #     print('Not enough memory available for lof...')
            #     print('Skipping.')

            # Remove all the stuff from the dataset again.
            cols_to_remove = [col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof']
            for to_remove in cols_to_remove:
                if to_remove in dataset:
                    del dataset[to_remove]

        # We take Chauvenet's criterion and apply it to all but the label data...
        for col in [c for c in dataset.columns if not 'label' in c]:
            print(f'Measurement is now: {col}')
            dataset = OutlierDistr.chauvenet(dataset, col)
            dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
            del dataset[col + '_outlier']

        dataset.to_csv(DATA_PATH / RESULT_FNAME)
def main():
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of the visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Create objects for clustering
    clusteringNH = NonHierarchicalClustering()
    clusteringH = HierarchicalClustering()

    if FLAGS.mode == 'kmeans':
        # Do some initial runs to determine the right number for k
        k_values = range(2, 10)
        silhouette_values = []

        print('Running k-means clustering')
        for k in k_values:
            print(f'k = {k}')
            dataset_cluster = clusteringNH.k_means_over_instances(
                dataset=copy.deepcopy(dataset),
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                k=k, distance_metric='default', max_iters=20, n_inits=10)
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)

        DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k', ylabel='silhouette score',
                        ylim=[0, 1], line_styles=['b-'])

        # Report the k-means run with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Means silhouette score: k = {k}')
        print('Use this value of k to run the --mode=final --k=?')

    if FLAGS.mode == 'kmediods':
        # Do some initial runs to determine the right number for k
        k_values = range(2, 10)
        silhouette_values = []

        print('Running k-medoids clustering')
        for k in k_values:
            print(f'k = {k}')
            dataset_cluster = clusteringNH.k_medoids_over_instances(
                dataset=copy.deepcopy(dataset),
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                k=k, distance_metric='default', max_iters=20, n_inits=10)
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)

        DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k', ylabel='silhouette score',
                        ylim=[0, 1], line_styles=['b-'])

        # Run k-medoids with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Medoids silhouette score: k = {k}')
        dataset_kmed = clusteringNH.k_medoids_over_instances(
            dataset=copy.deepcopy(dataset),
            cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            k=k, distance_metric='default', max_iters=20, n_inits=50)
        DataViz.plot_clusters_3d(data_table=dataset_kmed,
                                 data_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                                 cluster_col='cluster', label_cols=['label'])
        DataViz.plot_silhouette(data_table=dataset_kmed, cluster_col='cluster',
                                silhouette_col='silhouette')
        util.print_latex_statistics_clusters(
            dataset=dataset_kmed, cluster_col='cluster',
            input_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], label_col='label')

    # Run hierarchical clustering
    if FLAGS.mode == 'agglomerative':
        k_values = range(2, 10)
        silhouette_values = []

        # Do some initial runs to determine the right number for the maximum number of clusters
        print('Running agglomerative clustering')
        for k in k_values:
            print(f'k = {k}')
            dataset_cluster, link = clusteringH.agglomerative_over_instances(
                dataset=dataset,
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                max_clusters=k, distance_metric='euclidean',
                use_prev_linkage=True, link_function='ward')
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)
            if k == k_values[0]:
                DataViz.plot_dendrogram(dataset_cluster, link)

        # Plot the clustering results
        DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k', ylabel='silhouette score',
                        ylim=[0, 1], line_styles=['b-'])

    if FLAGS.mode == 'final':
        # Select the outcome dataset of the k-means clustering
        clusteringNH = NonHierarchicalClustering()
        dataset = clusteringNH.k_means_over_instances(
            dataset=dataset,
            cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            k=FLAGS.k, distance_metric='default', max_iters=50, n_inits=50)

        # Plot the results
        DataViz.plot_clusters_3d(dataset, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                                 'cluster', ['label'])
        DataViz.plot_silhouette(dataset, 'cluster', 'silhouette')

        # Print table statistics
        util.print_latex_statistics_clusters(
            dataset, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'label')
        del dataset['silhouette']

        # Store the final dataset
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
from Chapter7.Evaluation import ClassificationEvaluation
from Chapter7.Evaluation import RegressionEvaluation
from Chapter7.FeatureSelection import FeatureSelectionClassification
from Chapter7.FeatureSelection import FeatureSelectionRegression

import copy
import pandas as pd
from util import util
import matplotlib.pyplot as plot
import numpy as np
from sklearn.model_selection import train_test_split
import os

# Of course we repeat some stuff from Chapter 3, namely to load the dataset
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
export_tree_path = 'Example_graphs/Chapter7/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

if not os.path.exists(export_tree_path):
    os.makedirs(export_tree_path)
from Chapter8.LearningAlgorithmsTemporal import TemporalClassificationAlgorithms
from Chapter8.LearningAlgorithmsTemporal import TemporalRegressionAlgorithms
from statsmodels.tsa.stattools import adfuller
from pandas.plotting import autocorrelation_plot

import copy
import pandas as pd
from util import util
import matplotlib.pyplot as plot
import numpy as np
from sklearn.model_selection import train_test_split

# Of course we repeat some stuff from Chapter 3, namely to load the dataset
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

dataset.index = pd.to_datetime(dataset.index)

# Let us consider our second task, namely the prediction of the heart rate. We consider this as a temporal task.
prepare = PrepareDatasetForLearning()
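# A minimal sketch of checking stationarity before temporal modelling, using
# the adfuller test imported above (shown on the heart rate target purely as
# an example):
series = dataset['hr_watch_rate'].dropna()
adf_stat, p_value = adfuller(series)[:2]
print(f'ADF statistic: {adf_stat:.3f}, p-value: {p_value:.3f}')
# A small p-value (e.g. < 0.05) suggests the series is stationary.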