# Let us first study whether the time series is stationary and what the autocorrelations are. # dftest = adfuller(dataset['gyr_phone_x'], autolag='AIC') # print dftest # # autocorrelation_plot(dataset['gyr_phone_x']) # plot.show() # exit(0) # Now let us focus on the learning part. learner = TemporalRegressionAlgorithms() eval = RegressionEvaluation() # We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random. repeats = 1 # we set a washout time to give the NN's the time to stabilize. We do not compute the error during the washout time. washout_time = 10 scores_over_all_algs = [] # for i in range(0, len(possible_feature_sets)): # # selected_train_X = train_X[possible_feature_sets[i]] # selected_test_X = test_X[possible_feature_sets[i]]
feature_names = [ 'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features' ] # Let us first study whether the time series is stationary and what the autocorrelations are. dftest = adfuller(dataset['hr_watch_rate'], autolag='AIC') print dftest autocorrelation_plot(dataset['hr_watch_rate']) plot.show() # Now let us focus on the learning part. learner = TemporalRegressionAlgorithms() eval = RegressionEvaluation() # We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random. repeats = 5 # we set a washout time to give the NN's the time to stabilize. We do not compute the error during the washout time. washout_time = 10 scores_over_all_algs = [] for i in range(0, len(possible_feature_sets)): selected_train_X = train_X[possible_feature_sets[i]] selected_test_X = test_X[possible_feature_sets[i]]
'pca_1_temp_mean_ws_120', 'acc_watch_y_temp_mean_ws_120', 'pca_2', 'acc_phone_z_temp_mean_ws_120', 'gyr_watch_y_pse', 'gyr_watch_x_pse' ] possible_feature_sets = [ basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features ] feature_names = [ 'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features' ] # Let us first study the importance of the parameter settings. learner = RegressionAlgorithms() eval = RegressionEvaluation() # We repeat the experiment a number of times to get a bit more robust data as the initialization of e.g. the NN is random. repeats = 5 scores_over_all_algs = [] for i in range(0, len(possible_feature_sets)): selected_train_X = train_X[possible_feature_sets[i]] selected_test_X = test_X[possible_feature_sets[i]] # First we run our non deterministic classifiers a number of times to average their score. performance_tr_nn = 0
def gridsearch_reservoir_computing(self, train_X, train_y, test_X, test_y, per_time_step=False, error='mse', gridsearch_training_frac=0.7):
    """Grid-search the reservoir computing hyperparameters on a hold-out split.

    The training data is cut positionally into a leading part (the first
    ``gridsearch_training_frac`` of the rows) used to fit each candidate, and
    a trailing part used to score it. ``test_X`` and ``test_y`` are accepted
    for signature compatibility but are not used by the search.

    Args:
        train_X: Training features (pandas object supporting ``.iloc``).
        train_y: Training targets, row-aligned with ``train_X``.
        test_X: Unused.
        test_y: Unused.
        per_time_step: Forwarded to ``self.reservoir_computing``.
        error: ``'mse'`` (lower is better, scored on the fourth value
            returned by ``self.reservoir_computing``) or ``'accuracy'``
            (higher is better, scored on the second returned value).
        gridsearch_training_frac: Fraction of the training rows used to fit
            each candidate; the remainder is used for scoring.

    Returns:
        Tuple ``(reservoir_size, a, washout_period, sr)`` of the best
        combination found.

    Raises:
        ValueError: If ``error`` is not a supported criterion, or if no
            combination improved on the initial score (e.g. empty grid or
            only NaN scores).
    """
    # Fail fast: previously an unknown criterion ran the whole (expensive)
    # grid and then crashed with an IndexError on the empty result.
    if error not in ('mse', 'accuracy'):
        raise ValueError(
            "error must be 'mse' or 'accuracy', got %r" % (error,))

    tuned_parameters = {
        'a': [0.6, 0.8],
        'reservoir_size': [100, 400, 700, 1000],
        'washout_period': [5, 10, 15, 20],
        'sr': [0.25, 1.25, 2.25, 3.25]
    }
    # tuned_parameters = {'a': [0.4], 'reservoir_size':[250]}

    # Materialise the key order once: dict views have no .index() on
    # Python 3, and the order is needed both in the loop and in the return.
    keys = list(tuned_parameters)
    position = {name: keys.index(name) for name in keys}
    # Hand the helper its own copy so it cannot disturb the order used here.
    combinations = self.generate_parameter_combinations(
        tuned_parameters, list(keys))

    # Positional split (.ix has been removed from pandas).
    n_rows = len(train_X.index)
    split_point = int(gridsearch_training_frac * n_rows)
    train_params_X = train_X.iloc[:split_point]
    test_params_X = train_X.iloc[split_point:n_rows]
    train_params_y = train_y.iloc[:split_point]
    test_params_y = train_y.iloc[split_point:n_rows]

    minimise = (error == 'mse')
    best_error = sys.float_info.max if minimise else 0
    best_combination = None

    for comb in combinations:
        print(comb)
        pred_train_y, pred_test_y, pred_train_y_prob, pred_test_y_prob = self.reservoir_computing(
            train_params_X, train_params_y, test_params_X, test_params_y,
            reservoir_size=comb[position['reservoir_size']],
            a=comb[position['a']],
            washout_period=comb[position['washout_period']],
            sr=comb[position['sr']],
            per_time_step=per_time_step,
            gridsearch=False)

        if minimise:
            evaluator = RegressionEvaluation()
            mse = evaluator.mean_squared_error(test_params_y, pred_test_y_prob)
            if mse < best_error:
                best_error = mse
                best_combination = comb
        else:
            evaluator = ClassificationEvaluation()
            acc = evaluator.accuracy(test_params_y, pred_test_y)
            if acc > best_error:
                best_error = acc
                best_combination = comb

    print('-------')
    print(best_combination)
    print('-------')

    if best_combination is None:
        raise ValueError(
            'grid search did not find any parameter combination that '
            'improved on the initial %s score' % error)

    return (best_combination[position['reservoir_size']],
            best_combination[position['a']],
            best_combination[position['washout_period']],
            best_combination[position['sr']])
def main():
    """Run the heart-rate regression experiments selected by ``FLAGS.mode``.

    Loads the dataset produced by the previous chapter, splits it by time
    with ``hr_watch_rate`` as the target, and then runs the parts enabled by
    ``FLAGS.mode`` (``'all'`` enables every part):

    * ``'correlation'``: ADF test and autocorrelation plot of the heart rate.
    * ``'overall'``: reservoir computing, RNN and time-series models scored
      per feature set, averaged over ``FLAGS.repeats`` runs, ignoring the
      first ``FLAGS.washout`` rows of each prediction.
    * ``'detail'``: prediction-versus-real plots for the three learners.
    * ``'dynamical'``: dynamical systems models (NSGA-II, GA, SA) predicting
      accelerometer data.

    Raises:
        IOError: Re-raised when the dataset file cannot be read.
    """

    def mode_enabled(name):
        # A part runs when selected explicitly or when everything is requested
        return FLAGS.mode in (name, 'all')

    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print('File not found, try to run previous crowdsignals scripts first!')
        # Bare raise keeps the original traceback
        raise

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the second task, namely the prediction of the heart rate. Therefore create a dataset with the heart
    # rate as target and split using timestamps, because this is considered as a temporal task
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
        dataset, 'hr_watch_rate', '2016-02-08 18:29:56',
        '2016-02-08 19:34:07', '2016-02-08 20:07:50')
    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('\n- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
        'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking',
        'labelStanding', 'labelDriving', 'labelEating', 'labelRunning',
        'light_phone_lux', 'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
        'mag_watch_x', 'mag_watch_y', 'mag_watch_z', 'press_phone_pressure'
    ]
    pca_features = [
        'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7'
    ]
    # Temporal features, excluding any derived from the target itself
    time_features = [
        name for name in dataset.columns
        if ('temp_' in name and 'hr_watch' not in name)
    ]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))

    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3, time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4, cluster_features))

    selected_features = [
        'temp_pattern_labelOnTable', 'labelOnTable',
        'temp_pattern_labelOnTable(b)labelOnTable', 'cluster',
        'pca_1_temp_mean_ws_120', 'pca_2_temp_mean_ws_120', 'pca_2',
        'acc_watch_y_temp_mean_ws_120', 'gyr_watch_y_pse', 'gyr_watch_x_pse'
    ]
    possible_feature_sets = [
        basic_features, features_after_chapter_3, features_after_chapter_4,
        features_after_chapter_5, selected_features
    ]
    feature_names = [
        'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5',
        'Selected features'
    ]

    if mode_enabled('correlation'):
        # First study whether the time series is stationary and what the autocorrelations are.
        # The test result used to be computed and thrown away; report it.
        dftest = adfuller(dataset['hr_watch_rate'], autolag='AIC')
        print(dftest)

        # plt.figure() opens a new pyplot-managed figure for the plot;
        # plt.Figure() only built a detached Figure object that was discarded.
        plt.figure()
        autocorrelation_plot(dataset['hr_watch_rate'])
        DataViz.save(plt)
        plt.show()

    # Now focus on the learning part
    learner = TemporalRegressionAlgorithms()
    evaluate = RegressionEvaluation()

    if mode_enabled('overall'):
        # Repeat the experiment a number of times to get a bit more robust data as the initialization of e.g. the NN is
        # random
        repeats = FLAGS.repeats

        # Set a washout time to give the NN's the time to stabilize (so don't compute the error during the washout time)
        washout_time = FLAGS.washout

        def washed_out_scores(pred_train_y, pred_test_y):
            # (train mean, train std, test mean, test std) of the MSE after the washout rows
            mean_tr, std_tr = evaluate.mean_squared_error_with_std(
                train_y.iloc[washout_time:, ],
                pred_train_y.iloc[washout_time:, ])
            mean_te, std_te = evaluate.mean_squared_error_with_std(
                test_y.iloc[washout_time:, ],
                pred_test_y.iloc[washout_time:, ])
            return mean_tr, std_tr, mean_te, std_te

        scores_over_all_algs = []

        for feature_name, feature_set in zip(feature_names, possible_feature_sets):
            print(f'Evaluating for features {feature_set}')
            selected_train_X = train_X[feature_set]
            selected_test_X = test_X[feature_set]

            # First run non deterministic classifiers a number of times to average their score
            res_totals = (0, 0, 0, 0)
            rnn_totals = (0, 0, 0, 0)

            for repeat in range(0, repeats):
                print(f'--- run {repeat} ---')
                regr_train_y, regr_test_y = learner.reservoir_computing(
                    selected_train_X, train_y, selected_test_X, test_y,
                    gridsearch=True, per_time_step=False)
                run_scores = washed_out_scores(regr_train_y, regr_test_y)
                res_totals = tuple(
                    total + score
                    for total, score in zip(res_totals, run_scores))

                regr_train_y, regr_test_y = learner.recurrent_neural_network(
                    selected_train_X, train_y, selected_test_X, test_y,
                    gridsearch=True)
                run_scores = washed_out_scores(regr_train_y, regr_test_y)
                rnn_totals = tuple(
                    total + score
                    for total, score in zip(rnn_totals, run_scores))

            # Only apply the time series in case of the basis features
            if feature_name == 'initial set':
                regr_train_y, regr_test_y = learner.time_series(
                    selected_train_X, train_y, selected_test_X, test_y,
                    gridsearch=True)
                ts_scores = washed_out_scores(regr_train_y, regr_test_y)
            else:
                ts_scores = (0, 0, 0, 0)

            # Average the accumulated scores over the runs
            scores_with_sd = [
                tuple(total / repeats for total in res_totals),
                tuple(total / repeats for total in rnn_totals),
                ts_scores
            ]
            util.print_table_row_performances_regression(
                feature_name, scores_with_sd)
            scores_over_all_algs.append(scores_with_sd)

        DataViz.plot_performances_regression(
            ['Reservoir', 'RNN', 'Time series'], feature_names,
            scores_over_all_algs)

    if mode_enabled('detail'):

        def plot_heart_rate(pred_train_y, pred_test_y):
            # Plot predicted versus measured heart rate for train and test
            DataViz.plot_numerical_prediction_versus_real(
                train_X.index, train_y, pred_train_y['hr_watch_rate'],
                test_X.index, test_y, pred_test_y['hr_watch_rate'],
                'heart rate')

        regr_train_y, regr_test_y = learner.reservoir_computing(
            train_X[features_after_chapter_5], train_y,
            test_X[features_after_chapter_5], test_y, gridsearch=False)
        plot_heart_rate(regr_train_y, regr_test_y)

        regr_train_y, regr_test_y = learner.recurrent_neural_network(
            train_X[basic_features], train_y, test_X[basic_features], test_y,
            gridsearch=True)
        plot_heart_rate(regr_train_y, regr_test_y)

        regr_train_y, regr_test_y = learner.time_series(
            train_X[basic_features], train_y, test_X[basic_features], test_y,
            gridsearch=True)
        plot_heart_rate(regr_train_y, regr_test_y)

    if mode_enabled('dynamical'):
        # And now some example code for using the dynamical systems model with parameter tuning (note: focus on
        # predicting accelerometer data):
        train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression(
            copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y'], 0.9,
            filter_data=False, temporal=True)

        # One definition of the model shared by the three optimisers
        dynsys_columns = (
            'self.acc_phone_x', 'self.acc_phone_y', 'self.acc_phone_z')
        dynsys_equations = (
            'self.a * self.acc_phone_x + self.b * self.acc_phone_y',
            'self.c * self.acc_phone_y + self.d * self.acc_phone_z',
            'self.e * self.acc_phone_x + self.f * self.acc_phone_z')
        dynsys_targets = ('self.acc_phone_x', 'self.acc_phone_y')
        dynsys_parameters = (
            'self.a', 'self.b', 'self.c', 'self.d', 'self.e', 'self.f')

        def dynsys_model_args():
            # Fresh lists per call, exactly as with the original inline literals
            return (list(dynsys_columns), list(dynsys_equations),
                    list(dynsys_targets), list(dynsys_parameters))

        def plot_acc_phone_x(pred_train_y, pred_test_y):
            # Plot predicted versus measured acc_phone_x for train and test
            DataViz.plot_numerical_prediction_versus_real(
                train_X.index, train_y['acc_phone_x'],
                pred_train_y['acc_phone_x'], test_X.index,
                test_y['acc_phone_x'], pred_test_y['acc_phone_x'],
                'acc_phone_x')

        output_sets = learner.dynamical_systems_model_nsga_2(
            train_X, train_y, test_X, test_y, *dynsys_model_args(),
            pop_size=10, max_generations=10, per_time_step=True)
        DataViz.plot_pareto_front(output_sets)

        DataViz.plot_numerical_prediction_versus_real_dynsys_mo(
            train_X.index, train_y, test_X.index, test_y, output_sets, 0,
            'acc_phone_x')

        regr_train_y, regr_test_y = learner.dynamical_systems_model_ga(
            train_X, train_y, test_X, test_y, *dynsys_model_args(),
            pop_size=5, max_generations=10, per_time_step=True)
        plot_acc_phone_x(regr_train_y, regr_test_y)

        regr_train_y, regr_test_y = learner.dynamical_systems_model_sa(
            train_X, train_y, test_X, test_y, *dynsys_model_args(),
            max_generations=10, per_time_step=True)
        plot_acc_phone_x(regr_train_y, regr_test_y)