except IOError as e: print('File not found, try to run previous crowdsignals scripts first!') raise e if not os.path.exists(export_tree_path): os.makedirs(export_tree_path) dataset.index = dataset.index.to_datetime() # Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task. # We create a single column with the categorical attribute representing our class. Furthermore, we use 70% of our data # for training and the remaining 30% as an independent test set. We select the sets based on stratified sampling. We remove # cases where we do not know the label. prepare = PrepareDatasetForLearning() train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(dataset, ['label'], 'like', 0.7, filter=True, temporal=False) #train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(dataset, ['label'], 'like', 0.01, filter=True, temporal=False) print 'Training set length is: ', len(train_X.index) print 'Test set length is: ', len(test_X.index) # Select subsets of the features that we will consider: basic_features = ['acc_phone_x','acc_phone_y','acc_phone_z','acc_watch_x','acc_watch_y','acc_watch_z','gyr_phone_x','gyr_phone_y','gyr_phone_z','gyr_watch_x','gyr_watch_y','gyr_watch_z', 'hr_watch_rate', 'light_phone_lux','mag_phone_x','mag_phone_y','mag_phone_z','mag_watch_x','mag_watch_y','mag_watch_z','press_phone_pressure'] pca_features = ['pca_1','pca_2','pca_3','pca_4','pca_5','pca_6','pca_7'] time_features = [name for name in dataset.columns if '_temp_' in name] freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))] '''
# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_our_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# Index.to_datetime() is not available in current pandas releases;
# pd.to_datetime() performs the same conversion on old and new versions.
dataset.index = pd.to_datetime(dataset.index)

# Second task: prediction of the Azimuth, treated as a temporal task, so the
# data is split by time rather than sampled.
# NOTE(review): the three timestamps are presumably training start, training
# end and test end -- confirm against PrepareDatasetForLearning.
prepare = PrepareDatasetForLearning()
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset, 'Azimuth',
    '2019-06-14 12:06:30', '2019-06-14 12:16:02', '2019-06-14 12:20:56')

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:
print('Training set length is: ', len(train_X.index))
def main():
    """Run the regression experiments with the heart rate as target.

    Reads the dataset at DATA_PATH / DATASET_FILENAME, splits it by time with
    'hr_watch_rate' as target, builds the feature subsets and then runs the
    stages enabled by FLAGS.mode: 'selection', 'overall', 'detail', or 'all'
    to run every stage.

    Raises:
        IOError: re-raised, after printing a hint, when the dataset cannot be read.
    """
    # Load the result of the previous chapter and parse its index into datetimes.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Visualization helper used for the plots further down.
    DataViz = VisualizeDataset(__file__)

    # Temporal task: the split is made on timestamps instead of by sampling.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = RegressionAlgorithms()
    evaluation = RegressionEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
        dataset, 'hr_watch_rate',
        '2016-02-08 18:28:56', '2016-02-08 19:34:07', '2016-02-08 20:07:50')
    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Feature groups
    print('- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z',
        'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
        'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z',
        'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking',
        'labelStanding', 'labelDriving', 'labelEating', 'labelRunning',
        'light_phone_lux',
        'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
        'mag_watch_x', 'mag_watch_y', 'mag_watch_z',
        'press_phone_pressure',
    ]
    pca_features = ['pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7']
    # Temporal features derived from the heart rate itself are excluded,
    # since the heart rate is the target.
    time_features = [
        column for column in dataset.columns
        if 'temp_' in column and 'hr_watch' not in column
    ]
    freq_features = [
        column for column in dataset.columns
        if '_freq' in column or '_pse' in column
    ]
    cluster_features = ['cluster']

    for caption, group in (
            ('#basic features: ', basic_features),
            ('#PCA features: ', pca_features),
            ('#time features: ', time_features),
            ('#frequency features: ', freq_features),
            ('#cluster features: ', cluster_features)):
        print(caption, len(group))

    # Cumulative feature sets per chapter (duplicates removed through a set).
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3, time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4, cluster_features))

    if FLAGS.mode in ('selection', 'all'):
        # Inspect the Pearson correlations to judge which features could be selected.
        fs = FeatureSelectionRegression()
        print('\n- - - Running feature selection - - -')
        features, correlations = fs.pearson_selection(10, train_X[features_after_chapter_5], train_y)
        util.print_pearson_correlations(correlations)

    # Fixed list of 10 features chosen for their high correlation with the target.
    selected_features = [
        'temp_pattern_labelOnTable',
        'labelOnTable',
        'temp_pattern_labelOnTable(b)labelOnTable',
        'pca_2_temp_mean_ws_120',
        'pca_1_temp_mean_ws_120',
        'acc_watch_y_temp_mean_ws_120',
        'pca_2',
        'acc_phone_z_temp_mean_ws_120',
        'gyr_watch_y_pse',
        'gyr_watch_x_pse',
    ]
    possible_feature_sets = [
        basic_features, features_after_chapter_3, features_after_chapter_4,
        features_after_chapter_5, selected_features,
    ]
    feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']

    if FLAGS.mode in ('overall', 'all'):
        print('\n- - - Running test of all different regression algorithms - - -')

        def mse_with_std(pred_train_y, pred_test_y):
            """Return (train MSE, train std, test MSE, test std) for one set of predictions."""
            mse_tr, sd_tr = evaluation.mean_squared_error_with_std(train_y, pred_train_y)
            mse_te, sd_te = evaluation.mean_squared_error_with_std(test_y, pred_test_y)
            return mse_tr, sd_tr, mse_te, sd_te

        # The non-deterministic learners (e.g. the randomly initialised NN) are
        # repeated and averaged to obtain more robust numbers.
        n_runs = FLAGS.repeats
        scores_over_all_algs = []

        for set_name, feature_set in zip(feature_names, possible_feature_sets):
            subset_train_X = train_X[feature_set]
            subset_test_X = test_X[feature_set]

            # Running sums of (train MSE, train std, test MSE, test std).
            nn_sums = (0, 0, 0, 0)
            rf_sums = (0, 0, 0, 0)

            for run in range(n_runs):
                print(f'Training NeuralNetwork run {run + 1}/{n_runs} ... ')
                pred_train_y, pred_test_y = learner.feedforward_neural_network(
                    subset_train_X, train_y, subset_test_X, gridsearch=True)
                nn_sums = tuple(
                    total + value
                    for total, value in zip(nn_sums, mse_with_std(pred_train_y, pred_test_y)))

                print(f'Training RandomForest run {run + 1}/{n_runs} ... ')
                pred_train_y, pred_test_y = learner.random_forest(
                    subset_train_X, train_y, subset_test_X, gridsearch=True)
                rf_sums = tuple(
                    total + value
                    for total, value in zip(rf_sums, mse_with_std(pred_train_y, pred_test_y)))

            nn_scores = tuple(total / n_runs for total in nn_sums)
            rf_scores = tuple(total / n_runs for total in rf_sums)

            # Deterministic algorithms only need a single run.
            print('Support Vector Regressor run 1/1 ... ')
            # Convergence of the SVR does not always occur (even adjusting
            # tolerance and iterations does not help).
            pred_train_y, pred_test_y = learner.support_vector_regression_without_kernel(
                subset_train_X, train_y, subset_test_X, gridsearch=False)
            svm_scores = mse_with_std(pred_train_y, pred_test_y)

            print('Training Nearest Neighbor run 1/1 ... ')
            pred_train_y, pred_test_y = learner.k_nearest_neighbor(
                subset_train_X, train_y, subset_test_X, gridsearch=True)
            knn_scores = mse_with_std(pred_train_y, pred_test_y)

            print('Training Decision Tree run 1/1 ... ')
            pred_train_y, pred_test_y = learner.decision_tree(
                subset_train_X, train_y, subset_test_X, gridsearch=True,
                export_tree_path=EXPORT_TREE_PATH)
            dt_scores = mse_with_std(pred_train_y, pred_test_y)

            # Same order as the algorithm names handed to the plot below.
            scores_with_sd = [nn_scores, rf_scores, svm_scores, knn_scores, dt_scores]
            util.print_table_row_performances_regression(set_name, scores_with_sd)
            scores_over_all_algs.append(scores_with_sd)

        # Plot the results
        DataViz.plot_performances_regression(
            ['NN', 'RF', 'SVM', 'KNN', 'DT'], feature_names, scores_over_all_algs)

    if FLAGS.mode in ('detail', 'all'):
        print('\n- - - Running visualization of results - - -')
        chapter_5_train_X = train_X[features_after_chapter_5]
        chapter_5_test_X = test_X[features_after_chapter_5]
        regr_train_y, regr_test_y = learner.random_forest(
            chapter_5_train_X, train_y, chapter_5_test_X,
            gridsearch=False, print_model_details=True)
        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y, regr_train_y,
            test_X.index, test_y, regr_test_y, 'heart rate')
def main():
    """Run the classification experiments that predict the activity label.

    Reads the dataset at DATA_PATH / DATASET_FILENAME, creates a 70/30
    train/test split for the label, builds the feature subsets and then runs
    the stages enabled by FLAGS.mode: 'selection', 'regularization', 'tree',
    'overall', 'detail', or 'all' to run every stage.

    Raises:
        IOError: re-raised, after printing a hint, when the dataset cannot be read.
    """
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the first task, namely the prediction of the label. Therefore create a single column with the
    # categorical attribute representing the class. Furthermore, use 70% of the data for training and the remaining
    # 30% as an independent test set. Select the sets based on stratified sampling and remove cases where the label
    # is unknown.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = ClassificationAlgorithms()
    evaluation = ClassificationEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'], 'like', 0.7, filter_data=True, temporal=False)

    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z',
        'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
        'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z',
        'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'hr_watch_rate', 'light_phone_lux',
        'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
        'mag_watch_x', 'mag_watch_y', 'mag_watch_z',
        'press_phone_pressure'
    ]
    pca_features = [
        'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7'
    ]
    time_features = [name for name in dataset.columns if '_temp_' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))

    # Cumulative feature sets per chapter (duplicates removed through a set).
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3, time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4, cluster_features))

    if FLAGS.mode == 'selection' or FLAGS.mode == 'all':
        # First, consider the performance over a selection of features
        N_FORWARD_SELECTION = FLAGS.nfeatures
        fs = FeatureSelectionClassification()
        print('\n- - - Running feature selection - - -')
        features, ordered_features, ordered_scores = fs.forward_selection(
            max_features=N_FORWARD_SELECTION,
            X_train=train_X[features_after_chapter_5],
            y_train=train_y)
        DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
                        y=[ordered_scores],
                        xlabel='number of features',
                        ylabel='accuracy')

    # Select the most important features (based on python2 features)
    selected_features = [
        'acc_phone_y_freq_0.0_Hz_ws_40',
        'press_phone_pressure_temp_mean_ws_120',
        'gyr_phone_x_temp_std_ws_120',
        'mag_watch_y_pse',
        'mag_phone_z_max_freq',
        'gyr_watch_y_freq_weighted',
        'gyr_phone_y_freq_1.0_Hz_ws_40',
        'acc_phone_x_freq_1.9_Hz_ws_40',
        'mag_watch_z_freq_0.9_Hz_ws_40',
        'acc_watch_y_freq_0.5_Hz_ws_40'
    ]

    if FLAGS.mode == 'regularization' or FLAGS.mode == 'all':
        print('\n- - - Running regularization and model complexity test - - -')
        # Study the impact of regularization and model complexity: does regularization prevent overfitting?
        # The number of repetitions comes from FLAGS.nnrepeat; increase it for more robust data.
        N_REPEATS_NN = FLAGS.nnrepeat
        reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        performance_training = []
        performance_test = []

        for reg_param in reg_parameters:
            performance_tr = 0
            performance_te = 0
            for i in range(0, N_REPEATS_NN):
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = \
                    learner.feedforward_neural_network(
                        train_X, train_y, test_X,
                        hidden_layer_sizes=(250, ), alpha=reg_param, max_iter=500,
                        gridsearch=False)
                performance_tr += evaluation.accuracy(train_y, class_train_y)
                performance_te += evaluation.accuracy(test_y, class_test_y)
            performance_training.append(performance_tr / N_REPEATS_NN)
            performance_test.append(performance_te / N_REPEATS_NN)

        DataViz.plot_xy(x=[reg_parameters, reg_parameters],
                        y=[performance_training, performance_test],
                        method='semilogx',
                        xlabel='regularization parameter value',
                        ylabel='accuracy',
                        ylim=[0.95, 1.01],
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if FLAGS.mode == 'tree' or FLAGS.mode == 'all':
        print('\n- - - Running leaf size test of decision tree - - -')
        # Consider the influence of certain parameter settings for the tree model. (very related to the
        # regularization) and study the impact on performance.
        leaf_settings = [1, 2, 5, 10]
        performance_training = []
        performance_test = []

        for no_points_leaf in leaf_settings:
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                train_X[selected_features], train_y, test_X[selected_features],
                min_samples_leaf=no_points_leaf,
                gridsearch=False,
                print_model_details=False)
            performance_training.append(
                evaluation.accuracy(train_y, class_train_y))
            performance_test.append(evaluation.accuracy(test_y, class_test_y))

        DataViz.plot_xy(x=[leaf_settings, leaf_settings],
                        y=[performance_training, performance_test],
                        xlabel='Minimum number of points per leaf',
                        ylabel='Accuracy',
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if FLAGS.mode == 'overall' or FLAGS.mode == 'all':
        print(
            '\n- - - Running test of all different classification algorithms - - -'
        )
        # Perform grid searches over the most important parameters and do so by means of cross validation upon the
        # training set
        possible_feature_sets = [
            basic_features, features_after_chapter_3, features_after_chapter_4,
            features_after_chapter_5, selected_features
        ]
        feature_names = [
            'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5',
            'Selected features'
        ]
        N_KCV_REPEATS = FLAGS.kcvrepeat

        scores_over_all_algs = []

        for i in range(0, len(possible_feature_sets)):
            selected_train_X = train_X[possible_feature_sets[i]]
            selected_test_X = test_X[possible_feature_sets[i]]

            # First run non deterministic classifiers a number of times to average their score
            performance_tr_nn, performance_te_nn = 0, 0
            performance_tr_rf, performance_te_rf = 0, 0
            performance_tr_svm, performance_te_svm = 0, 0

            for repeat in range(0, N_KCV_REPEATS):
                print(
                    f'Training NeuralNetwork run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ... '
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = \
                    learner.feedforward_neural_network(
                        selected_train_X, train_y, selected_test_X, gridsearch=True)
                performance_tr_nn += evaluation.accuracy(train_y, class_train_y)
                performance_te_nn += evaluation.accuracy(test_y, class_test_y)

                print(
                    f'Training RandomForest run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ... '
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                    selected_train_X, train_y, selected_test_X, gridsearch=True)
                performance_tr_rf += evaluation.accuracy(train_y, class_train_y)
                performance_te_rf += evaluation.accuracy(test_y, class_test_y)

                print(
                    f'Training SVM run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ...'
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = \
                    learner.support_vector_machine_with_kernel(
                        selected_train_X, train_y, selected_test_X, gridsearch=True)
                performance_tr_svm += evaluation.accuracy(train_y, class_train_y)
                performance_te_svm += evaluation.accuracy(test_y, class_test_y)

            overall_performance_tr_nn = performance_tr_nn / N_KCV_REPEATS
            overall_performance_te_nn = performance_te_nn / N_KCV_REPEATS
            overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
            overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
            overall_performance_tr_svm = performance_tr_svm / N_KCV_REPEATS
            overall_performance_te_svm = performance_te_svm / N_KCV_REPEATS

            # Run deterministic classifiers:
            print("Deterministic Classifiers:")

            print(
                f'Training Nearest Neighbor run 1 / 1, featureset {feature_names[i]}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_knn = evaluation.accuracy(train_y, class_train_y)
            performance_te_knn = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Decision Tree run 1 / 1 featureset {feature_names[i]}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
            performance_te_dt = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Naive Bayes run 1/1 featureset {feature_names[i]}')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
                selected_train_X, train_y, selected_test_X)
            performance_tr_nb = evaluation.accuracy(train_y, class_train_y)
            performance_te_nb = evaluation.accuracy(test_y, class_test_y)

            # One (train, test) pair per algorithm, in the same order as the six
            # names handed to plot_performances_classification below. The KNN
            # pair used to be listed twice, which left seven entries for six names.
            scores_with_sd = util.print_table_row_performances(
                feature_names[i],
                len(selected_train_X.index),
                len(selected_test_X.index),
                [
                    (overall_performance_tr_nn, overall_performance_te_nn),
                    (overall_performance_tr_rf, overall_performance_te_rf),
                    (overall_performance_tr_svm, overall_performance_te_svm),
                    (performance_tr_knn, performance_te_knn),
                    (performance_tr_dt, performance_te_dt),
                    (performance_tr_nb, performance_te_nb),
                ])
            scores_over_all_algs.append(scores_with_sd)

        DataViz.plot_performances_classification(
            ['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'], feature_names,
            scores_over_all_algs)

    if FLAGS.mode == 'detail' or FLAGS.mode == 'all':
        print(
            '\n- - - Running detail test of promising classification algorithms - - -'
        )
        # Study two promising ones in more detail, namely decision tree and random forest algorithm
        learner.decision_tree(train_X[selected_features], train_y,
                              test_X[selected_features],
                              gridsearch=True,
                              print_model_details=True,
                              export_tree_path=EXPORT_TREE_PATH)

        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
            train_X[selected_features], train_y, test_X[selected_features],
            gridsearch=True,
            print_model_details=True)

        # The columns of the probability frame supply the class labels for the matrix.
        test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                              class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# Index.to_datetime() is not available in current pandas releases;
# pd.to_datetime() performs the same conversion on old and new versions.
dataset.index = pd.to_datetime(dataset.index)

# Let us consider our second task, namely the prediction of the heart rate.
# We consider this as a temporal task, so the split is made by time.
# NOTE(review): the three timestamps are presumably training start, training
# end and test end -- confirm against PrepareDatasetForLearning.
prepare = PrepareDatasetForLearning()
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset, 'hr_watch_rate',
    '2016-02-08 18:29:56', '2016-02-08 19:34:07', '2016-02-08 20:07:50')

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:
print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:
DATA_PATH = Path('./intermediate_datafiles/Crowdsignal/')
DATASET_FNAME = 'chapter5_result.csv'

# Visualization helper instance.
DataViz = VisualizeDataset(__file__)

# Load the chapter 5 result; once loaded, parse the index into datetimes.
try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e
else:
    dataset.index = pd.to_datetime(dataset.index)

# Second task: predict the heart rate. This is treated as a temporal task,
# so the data is split on timestamps.
# NOTE(review): the three timestamps are presumably training start, training
# end and test end -- confirm against PrepareDatasetForLearning.
prepare = PrepareDatasetForLearning()
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset,
    'hr_watch_rate',
    '2016-02-08 18:29:56',
    '2016-02-08 19:34:07',
    '2016-02-08 20:07:50',
)

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:
print('Training set length is: ', len(train_X.index))
try: dataset = pd.read_csv(dataset_path + 'chapter5_result-own.csv', index_col=0) except IOError as e: print('File not found, try to run previous crowdsignals scripts first!') raise e dataset = pd.read_csv(dataset_path + 'chapter5_result-own.csv', index_col=0) dataset.index = dataset.index.to_datetime() if not os.path.exists(export_tree_path): os.makedirs(export_tree_path) # Let us consider our second task, namely the prediction of the heart rate. We consider this as a temporal task. prepare = PrepareDatasetForLearning() train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time( dataset, 'light_phone_lux', '2016-02-08 18:28:56', '2016-02-08 19:34:07', '2016-02-08 20:07:50') # '2016-02-08 18:28:58','2016-02-08 18:28:59') print 'Training set length is: ', len(train_X.index) print 'Test set length is: ', len(test_X.index) # Select subsets of the features that we will consider: basic_features = [ 'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z', 'labelOnTable', 'labelSitting', 'labelSmoking', 'labelWalkingStairs', 'labelStandingInElevator', 'mag_phone_x',
from Operations import *

# Set up file names and locations.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_outliers.csv'

# Load the pickled dataset through a context manager so the file handle is
# closed again (the bare open() call used before never closed it).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open('concat_no_skipping.pkl', 'rb') as pickle_file:
    dataset = pickle.load(pickle_file)
dataset = rename(dataset)
# dataset.index = pd.to_datetime(dataset.index)

DataViz = VisualizeDataset(__file__, show=False)

# Temporal regression task with 'gyr_z' as the target.
prepare = PrepareDatasetForLearning()

# Missing values are replaced by 0 before splitting.
dataset = dataset.fillna(0)
print(dataset.index)
print(dataset.loc[dataset.index.values[0]])

# Split boundaries are taken from the index itself: its first value, the value
# at 70% of its length, and its last value.
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset, "gyr_z",
    dataset.index.values[0],
    dataset.index.values[int(len(dataset.index) * 0.7)],
    dataset.index.values[-1])

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:
print('Training set length is: ', len(train_X.index))
except IOError as e: print('File not found, try to run previous crowdsignals scripts first!') raise e if not os.path.exists(export_tree_path): os.makedirs(export_tree_path) dataset.index = dataset.index.to_datetime() dataset = dataset.dropna() # Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task. # We create a single column with the categorical attribute representing our class. Furthermore, we use 70% of our data # for training and the remaining 30% as an independent test set. We select the sets based on stratified sampling. We remove # cases where we do not know the label. prepare = PrepareDatasetForLearning() exact_labels = [ 'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking', 'labelStanding', 'labelDriving', 'labelEating', 'labelRunning' ] sum_values = dataset[exact_labels].sum(axis=1) # Create a new 'class' column and set the value to the default class. dataset['class'] = 'undefined' for i in range(0, len(dataset.index)): # If we have exactly one true class column, we can assign that value, # otherwise we keep the default class. first = True for label in exact_labels: if dataset.ix[i, label] == 1: if first:
try: dataset = pd.read_csv(dataset_path + 'mydata_chapter5_result.csv', index_col=0) except IOError as e: print('File not found, try to run previous crowdsignals scripts first!') raise e dataset = dataset.dropna(axis=0,how = 'any',inplace=False) dataset.index = dataset.index.to_datetime() # print(dataset.isnull().sum()) # exit(0) # Let us consider our second task, namely the prediction of the heart rate. We consider this as a temporal task. prepare = PrepareDatasetForLearning() train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(dataset, 'gyr_phone_x', '2017-06-09 10:20:28', '2017-06-09 11:18:27', '2017-06-09 11:43:23') print 'Training set length is: ', len(train_X.index) print 'Test set length is: ', len(test_X.index) # Select subsets of the features that we will consider: print 'Training set length is: ', len(train_X.index) print 'Test set length is: ', len(test_X.index) # Select subsets of the features that we will consider: basic_features = ['acc_phone_x','acc_phone_y','acc_phone_z','gyr_phone_y','gyr_phone_z',
def experiment(file):
    """Run the random forest / decision tree comparison for one dataset file.

    Reads ``file`` as CSV (indexed by the module-level ``time_col``), creates a
    70/30 classification split on the label columns, and trains a random
    forest and a decision tree on five feature sets. Accuracies are logged via
    ``log`` and confusion matrices and a performance overview are plotted.

    Args:
        file: path of the CSV file to process. It is expected to contain a '/'
            and a '.', because the name of the figures location is derived
            from the path component after the first '/'.
    """
    dataset = pd.read_csv(file, index_col=time_col)
    DataViz = VisualizeDataset(__file__.split('.')[0] + file.split('.')[0].split('/')[1] + '.py', show=True)
    print(DataViz.figures_dir)
    dataset.index = pd.to_datetime(dataset.index)

    prepare = PrepareDatasetForLearning()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'], 'like', 0.7,
        filter=False, temporal=False, drop_na=False, fill_na=True)

    time_features = [name for name in dataset.columns if '_temp' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']

    # Cumulative feature sets; basic_features is defined at module level.
    features_2 = list(set().union(basic_features, time_features))
    features_3 = list(set().union(basic_features, time_features, freq_features))
    features_4 = list(set().union(basic_features, time_features, freq_features, cluster_features))

    # The forward selection itself is disabled; its result is hard-coded below.
    # print('feature selection')
    # fs = FeatureSelectionClassification()
    # features, selected_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
    #                                                                   train_X[features_4], train_y)
    # log([str(ordered_scores), str(selected_features)])
    selected_features = [
        'gyr_y_temp_std_ws_1200', 'acc_z_temp_mean_ws_120',
        'acc_x_temp_mean_ws_120', 'gyr_x_temp_std_ws_2400', 'gyr_z_max_freq',
        'gyr_y_freq_1.9_Hz_ws_40', 'acc_z_freq_0.4_Hz_ws_40',
        'gyr_z_freq_1.2_Hz_ws_40', 'gyr_x_freq_0.2_Hz_ws_40',
        'acc_z_freq_1.0_Hz_ws_40', 'acc_x_freq_0.2_Hz_ws_40',
        'acc_y_freq_1.9_Hz_ws_40', 'gyr_x_temp_mean_ws_1200',
        'acc_z_freq_1.9_Hz_ws_40', 'acc_x_temp_std_ws_120',
        'gyr_z_temp_std_ws_120', 'gyr_y_freq_1.5_Hz_ws_40',
        'gyr_z_temp_mean_ws_120', 'gyr_x_freq_0.0_Hz_ws_40',
        'acc_z_freq_0.6_Hz_ws_40'
    ]

    # NOTE(review): y holds the feature *names*, not accuracy scores, although
    # the axis is labelled 'accuracy'. This looks like a leftover from the
    # disabled forward selection (which produced ordered_scores) -- confirm
    # whether this plot is still wanted.
    DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
                    y=[selected_features],
                    xlabel='number of features',
                    ylabel='accuracy')
    print('feature selection finished for %s' % file)

    learner = ClassificationAlgorithms()
    # Named 'evaluation' rather than 'eval' so the builtin is not shadowed.
    evaluation = ClassificationEvaluation()

    possible_feature_sets = [
        basic_features, features_2, features_3, features_4, selected_features
    ]
    feature_names = [
        'Basic features', 'Features with time', 'Features with frequency',
        'Features with cluster', 'Selected features'
    ]

    # with shelve.open('temp/shelve.out', 'n') as f:
    #     for key in dir():
    #         try:
    #             f[key] = globals()[key]
    #         except:
    #             print('ERROR shelving: {0}'.format(key))

    N_KCV_REPEATS = 1
    scores_over_all_algs = []

    for i in range(0, len(possible_feature_sets)):
        print(datetime.now())
        print('possible feature sets', i)
        log(['Features %d' % i])
        selected_train_X = train_X[possible_feature_sets[i]]
        selected_test_X = test_X[possible_feature_sets[i]]

        # First we run our non deterministic classifiers a number of times to average their score.
        performance_tr_rf = 0
        performance_te_rf = 0

        for repeat in range(0, N_KCV_REPEATS):
            print(datetime.now())
            print('\nRepeat', repeat)
            print('Random Forest')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                selected_train_X, train_y, selected_test_X,
                gridsearch=True, print_model_details=True)
            test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                                  class_train_prob_y.columns)
            DataViz.plot_confusion_matrix(test_cm,
                                          class_train_prob_y.columns,
                                          normalize=False)
            performance_tr_rf += evaluation.accuracy(train_y, class_train_y)
            performance_te_rf += evaluation.accuracy(test_y, class_test_y)
            print(datetime.now())

        overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
        overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
        # Log the averaged train and test accuracy. The train figure previously
        # repeated the test accuracy.
        log([
            'RF train acc: %f test acc: %f'
            % (overall_performance_tr_rf, overall_performance_te_rf)
        ])

        # And we run our deterministic classifiers:
        print('decision tree')
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
            selected_train_X, train_y, selected_test_X,
            gridsearch=True, print_model_details=True)
        performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
        performance_te_dt = evaluation.accuracy(test_y, class_test_y)
        test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                              class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)
        log([
            'DT' + ' train acc: %f' % performance_tr_dt +
            ' test acc: %f' % performance_te_dt
        ])

        # One (train, test) pair per algorithm, in the order ['RF', 'DT'] used by the plot below.
        scores_with_sd = util.print_table_row_performances(
            feature_names[i], len(selected_train_X.index),
            len(selected_test_X.index), [
                (overall_performance_tr_rf, overall_performance_te_rf),
                (performance_tr_dt, performance_te_dt),
            ])
        scores_over_all_algs.append(scores_with_sd)

    DataViz.plot_performances_classification(['RF', 'DT'], feature_names,
                                             scores_over_all_algs)
    print(datetime.now())
# Load the dataset; once loaded, parse the index into datetimes.
try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e
else:
    dataset.index = pd.to_datetime(dataset.index)

random_state = 2020

# Missing values are replaced by 0 before splitting.
dataset = dataset.fillna(0)

# Temporal regression task with 'acc_phone_x' as the target; the data is
# split on timestamps.
# NOTE(review): the three timestamps are presumably training start, training
# end and test end -- confirm against PrepareDatasetForLearning.
prepare = PrepareDatasetForLearning()
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset,
    'acc_phone_x',
    '2020-06-05 13:11:27',
    '2020-06-05 13:43:26',
    '2020-06-05 13:55:40',
)

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:
print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = 'chapter5_result_own.csv'

# Visualization helper instance.
DataViz = VisualizeDataset(__file__)

# Load the dataset; once loaded, parse the index into datetimes.
try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e
else:
    dataset.index = pd.to_datetime(dataset.index)

# Temporal regression task with 'gyr_phone_z' as the target; the data is
# split on timestamps.
# NOTE(review): the three timestamps are presumably training start, training
# end and test end -- confirm against PrepareDatasetForLearning.
prepare = PrepareDatasetForLearning()
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset,
    'gyr_phone_z',
    '2020-06-02 13:11:36',
    '2020-06-02 13:52:51',
    '2020-06-02 14:13:28',
)

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:
print('Training set length is: ', len(train_X.index))
def _washout_scores(evaluate, washout_time, train_y, test_y, regr_train_y, regr_test_y):
    """Return ``(mean_tr, std_tr, mean_te, std_te)`` of the squared error, ignoring the washout period.

    The first ``washout_time`` rows of both the real and the predicted values are skipped, so that the error is not
    computed while the model is still stabilizing.
    """
    mean_tr, std_tr = evaluate.mean_squared_error_with_std(
        train_y.iloc[washout_time:, ], regr_train_y.iloc[washout_time:, ])
    mean_te, std_te = evaluate.mean_squared_error_with_std(
        test_y.iloc[washout_time:, ], regr_test_y.iloc[washout_time:, ])
    return mean_tr, std_tr, mean_te, std_te


def _dynsys_model_spec():
    """Return the four list arguments shared by all dynamical systems model calls.

    Fresh lists are built on every call, so each optimizer receives its own objects exactly as when the literals
    were written out at every call site.
    """
    # NOTE(review): local names describe the presumed role of each positional argument -- confirm against
    # TemporalRegressionAlgorithms.
    columns = ['self.acc_phone_x', 'self.acc_phone_y', 'self.acc_phone_z']
    equations = [
        'self.a * self.acc_phone_x + self.b * self.acc_phone_y',
        'self.c * self.acc_phone_y + self.d * self.acc_phone_z',
        'self.e * self.acc_phone_x + self.f * self.acc_phone_z',
    ]
    targets = ['self.acc_phone_x', 'self.acc_phone_y']
    parameters = ['self.a', 'self.b', 'self.c', 'self.d', 'self.e', 'self.f']
    return columns, equations, targets, parameters


def main():
    """Run the temporal regression experiments for the heart rate selected by ``FLAGS.mode``.

    The dataset of the previous chapter is loaded and split on timestamps with 'hr_watch_rate' as target. Depending
    on ``FLAGS.mode`` the following parts are executed ('all' runs every part):

    * 'correlation': augmented Dickey-Fuller test and autocorrelation plot of the heart rate.
    * 'overall': reservoir computing, recurrent neural network and time series scores per feature set, averaged
      over ``FLAGS.repeats`` runs and ignoring the first ``FLAGS.washout`` predictions.
    * 'detail': prediction-versus-real plots for the three algorithms.
    * 'dynamical': dynamical systems models (NSGA-II, GA, SA) predicting accelerometer data.

    Raises:
        IOError: if the dataset file cannot be read (re-raised after printing a hint).
        ValueError: if ``FLAGS.repeats`` is smaller than 1 in the 'overall' part.
    """
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
    except IOError:
        print('File not found, try to run previous crowdsignals scripts first!')
        # Bare raise re-raises the active exception with its original traceback
        raise
    dataset.index = pd.to_datetime(dataset.index)

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the second task, namely the prediction of the heart rate. Therefore create a dataset with the heart
    # rate as target and split using timestamps, because this is considered as a temporal task
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
        dataset, 'hr_watch_rate',
        '2016-02-08 18:29:56', '2016-02-08 19:34:07', '2016-02-08 20:07:50')
    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('\n- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z',
        'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
        'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z',
        'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking',
        'labelStanding', 'labelDriving', 'labelEating', 'labelRunning',
        'light_phone_lux',
        'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
        'mag_watch_x', 'mag_watch_y', 'mag_watch_z',
        'press_phone_pressure',
    ]
    pca_features = ['pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7']
    # Temporal features derived from the heart rate itself are left out, as the heart rate is the target
    time_features = [name for name in dataset.columns if ('temp_' in name and 'hr_watch' not in name)]
    freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))

    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3, time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4, cluster_features))

    selected_features = [
        'temp_pattern_labelOnTable', 'labelOnTable', 'temp_pattern_labelOnTable(b)labelOnTable', 'cluster',
        'pca_1_temp_mean_ws_120', 'pca_2_temp_mean_ws_120', 'pca_2', 'acc_watch_y_temp_mean_ws_120',
        'gyr_watch_y_pse', 'gyr_watch_x_pse',
    ]
    possible_feature_sets = [
        basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5,
        selected_features,
    ]
    feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']

    mode = FLAGS.mode

    if mode in ('correlation', 'all'):
        # First study whether the time series is stationary and what the autocorrelations are. The test result
        # used to be discarded; report the statistic and p-value so the check is actually visible.
        adf_result = adfuller(dataset['hr_watch_rate'], autolag='AIC')
        print(f'ADF statistic: {adf_result[0]:.4f}, p-value: {adf_result[1]:.4f}')

        # plt.figure() (not the Figure class) opens a new current figure, so the autocorrelation plot does not end
        # up on a figure left over from an earlier plot
        plt.figure()
        autocorrelation_plot(dataset['hr_watch_rate'])
        DataViz.save(plt)
        plt.show()

    # Now focus on the learning part
    learner = TemporalRegressionAlgorithms()
    evaluate = RegressionEvaluation()

    if mode in ('overall', 'all'):
        # Repeat the experiment a number of times to get a bit more robust data as the initialization of e.g. the
        # NN is random
        repeats = FLAGS.repeats
        if repeats < 1:
            # Fail early with a clear message instead of a ZeroDivisionError after the first expensive run
            raise ValueError(f'repeats must be at least 1, got {repeats}')

        # Set a washout time to give the NN's the time to stabilize (so don't compute the error during the washout
        # time)
        washout_time = FLAGS.washout

        scores_over_all_algs = []

        for feature_name, feature_set in zip(feature_names, possible_feature_sets):
            print(f'Evaluating for features {feature_set}')
            selected_train_X = train_X[feature_set]
            selected_test_X = test_X[feature_set]

            # First run non deterministic classifiers a number of times to average their score. The totals hold
            # the summed (train mean, train std, test mean, test std) over all runs.
            res_totals = [0, 0, 0, 0]
            rnn_totals = [0, 0, 0, 0]

            for repeat in range(repeats):
                print(f'--- run {repeat} ---')

                regr_train_y, regr_test_y = learner.reservoir_computing(
                    selected_train_X, train_y, selected_test_X, test_y,
                    gridsearch=True, per_time_step=False)
                run_scores = _washout_scores(evaluate, washout_time, train_y, test_y, regr_train_y, regr_test_y)
                res_totals = [total + score for total, score in zip(res_totals, run_scores)]

                regr_train_y, regr_test_y = learner.recurrent_neural_network(
                    selected_train_X, train_y, selected_test_X, test_y, gridsearch=True)
                run_scores = _washout_scores(evaluate, washout_time, train_y, test_y, regr_train_y, regr_test_y)
                rnn_totals = [total + score for total, score in zip(rnn_totals, run_scores)]

            # Only apply the time series in case of the basis features
            if feature_name == 'initial set':
                regr_train_y, regr_test_y = learner.time_series(
                    selected_train_X, train_y, selected_test_X, test_y, gridsearch=True)
                ts_scores = _washout_scores(evaluate, washout_time, train_y, test_y, regr_train_y, regr_test_y)
            else:
                ts_scores = (0, 0, 0, 0)

            # One (train mean, train std, test mean, test std) tuple per algorithm, in plotting order
            scores_with_sd = [
                tuple(total / repeats for total in res_totals),
                tuple(total / repeats for total in rnn_totals),
                tuple(ts_scores),
            ]
            util.print_table_row_performances_regression(feature_name, scores_with_sd)
            scores_over_all_algs.append(scores_with_sd)

        DataViz.plot_performances_regression(
            ['Reservoir', 'RNN', 'Time series'], feature_names, scores_over_all_algs)

    if mode in ('detail', 'all'):
        regr_train_y, regr_test_y = learner.reservoir_computing(
            train_X[features_after_chapter_5], train_y, test_X[features_after_chapter_5], test_y,
            gridsearch=False)
        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y, regr_train_y['hr_watch_rate'],
            test_X.index, test_y, regr_test_y['hr_watch_rate'], 'heart rate')

        regr_train_y, regr_test_y = learner.recurrent_neural_network(
            train_X[basic_features], train_y, test_X[basic_features], test_y, gridsearch=True)
        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y, regr_train_y['hr_watch_rate'],
            test_X.index, test_y, regr_test_y['hr_watch_rate'], 'heart rate')

        regr_train_y, regr_test_y = learner.time_series(
            train_X[basic_features], train_y, test_X[basic_features], test_y, gridsearch=True)
        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y, regr_train_y['hr_watch_rate'],
            test_X.index, test_y, regr_test_y['hr_watch_rate'], 'heart rate')

    if mode in ('dynamical', 'all'):
        # And now some example code for using the dynamical systems model with parameter tuning (note: focus on
        # predicting accelerometer data). The split is redone on a copy of the dataset with two targets.
        train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression(
            copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y'], 0.9, filter_data=False, temporal=True)

        output_sets = learner.dynamical_systems_model_nsga_2(
            train_X, train_y, test_X, test_y, *_dynsys_model_spec(),
            pop_size=10, max_generations=10, per_time_step=True)
        DataViz.plot_pareto_front(output_sets)
        DataViz.plot_numerical_prediction_versus_real_dynsys_mo(
            train_X.index, train_y, test_X.index, test_y, output_sets, 0, 'acc_phone_x')

        regr_train_y, regr_test_y = learner.dynamical_systems_model_ga(
            train_X, train_y, test_X, test_y, *_dynsys_model_spec(),
            pop_size=5, max_generations=10, per_time_step=True)
        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y['acc_phone_x'], regr_train_y['acc_phone_x'],
            test_X.index, test_y['acc_phone_x'], regr_test_y['acc_phone_x'], 'acc_phone_x')

        regr_train_y, regr_test_y = learner.dynamical_systems_model_sa(
            train_X, train_y, test_X, test_y, *_dynsys_model_spec(),
            max_generations=10, per_time_step=True)
        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y['acc_phone_x'], regr_train_y['acc_phone_x'],
            test_X.index, test_y['acc_phone_x'], regr_test_y['acc_phone_x'], 'acc_phone_x')
flattened_values[columnname +"_max"] = (np.max(values)) #flattened_values = flattened_values.join(np.max(transformation)) #flattened_values = np.append(flattened_values, np.argmax(transformation)) #flattened_values = np.append(flattened_values, np.max(transformation)) df = pd.DataFrame(data=flattened_values) df['class'] = str(label) frames.append(df) result = pd.concat(frames) result.columns = result.columns.astype(str) # result = result.sample(frac=1) prepare = PrepareDatasetForLearning() train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(result, ['class'], 'unlike', 0.8, filter=True, temporal=False) #number_training_samples = len(train_X) #val_split = int(0.7 * number_training_samples) #val_X = train_X[val_split:-1] #val_y = train_y[val_split:-1] #train_X = train_X[0:val_split - 1] #train_y = train_y[0:val_split - 1] learner = ClassificationAlgorithms() eval = ClassificationEvaluation() print(len(train_X))
index_col=0) except IOError as e: print('File not found, try to run previous crowdsignals scripts first!') raise e dataset.index = dataset.index.to_datetime() dataset = dataset.dropna() del dataset['silhouette'] # Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task. # We create a single column with the categorical attribute representing our class. Furthermore, we use 70% of our data # for training and the remaining 30% as an independent test set. We select the sets based on stratified sampling. We remove # cases where we do not know the label. prepare = PrepareDatasetForLearning() train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time( dataset, 'acc_phone_x', '2017-06-13 22:21:02', # '2016-02-08 18:29:58','2016-02-08 18:29:59') '2017-06-13 23:40:47', '2017-06-14 00:22:24') print 'Training set length is: ', len(train_X.index) print 'Test set length is: ', len(test_X.index) # Select subsets of the features that we will consider: print 'Training set length is: ', len(train_X.index)