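# Module preamble (sketch). main() below relies on a number of imports and module-level constants
# (pandas, the Chapter 7 helper classes, util, FLAGS, DATA_PATH, DATASET_FILENAME and EXPORT_TREE_PATH)
# that are not part of this excerpt. The block below is a minimal sketch of that preamble; the module
# paths follow the usual layout of the surrounding crowdsignals scripts and the file locations are
# assumptions, so adjust them to your own setup.
import argparse
from pathlib import Path

import pandas as pd

import util.util as util                                                    # assumed: provides print_table_row_performances
from util.VisualizeDataset import VisualizeDataset                          # assumed module path
from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning    # assumed module path
from Chapter7.LearningAlgorithms import ClassificationAlgorithms            # assumed module path
from Chapter7.Evaluation import ClassificationEvaluation                    # assumed module path
from Chapter7.FeatureSelection import FeatureSelectionClassification        # assumed module path

# Assumed input/output locations for the chapter 5 result and the exported decision tree.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FILENAME = 'chapter5_result.csv'
EXPORT_TREE_PATH = Path('./figures/crowdsignals_ch7_classification/')
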
def main():
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of the visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the first task, namely the prediction of the label. Therefore create a single column with the
    # categorical attribute representing the class. Furthermore, use 70% of the data for training and the
    # remaining 30% as an independent test set. Select the sets based on stratified sampling and remove cases
    # where the label is unknown.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = ClassificationAlgorithms()
    evaluation = ClassificationEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'], 'like', 0.7, filter_data=True, temporal=False)
    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
        'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'hr_watch_rate', 'light_phone_lux', 'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
        'mag_watch_x', 'mag_watch_y', 'mag_watch_z', 'press_phone_pressure'
    ]
    pca_features = ['pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7']
    time_features = [name for name in dataset.columns if '_temp_' in name]
    freq_features = [name for name in dataset.columns if ('_freq' in name) or ('_pse' in name)]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3, time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4, cluster_features))

    if FLAGS.mode == 'selection' or FLAGS.mode == 'all':
        # First, consider the performance over a selection of features
        N_FORWARD_SELECTION = FLAGS.nfeatures
        fs = FeatureSelectionClassification()
        print('\n- - - Running feature selection - - -')
        features, ordered_features, ordered_scores = fs.forward_selection(
            max_features=N_FORWARD_SELECTION,
            X_train=train_X[features_after_chapter_5],
            y_train=train_y)
        DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)], y=[ordered_scores],
                        xlabel='number of features', ylabel='accuracy')

    # The most important features, as determined by an earlier (Python 2) run of the forward selection
    selected_features = [
        'acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120',
        'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted', 'gyr_phone_y_freq_1.0_Hz_ws_40',
        'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40'
    ]

    if FLAGS.mode == 'regularization' or FLAGS.mode == 'all':
        print('\n- - - Running regularization and model complexity test - - -')
        # Study the impact of regularization and model complexity: does regularization prevent overfitting?
        # Due to runtime constraints, run the experiment 3 times; for more robust results increase the repetitions
        N_REPEATS_NN = FLAGS.nnrepeat
        reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        performance_training = []
        performance_test = []
        for reg_param in reg_parameters:
            performance_tr = 0
            performance_te = 0
            for i in range(0, N_REPEATS_NN):
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    train_X, train_y, test_X, hidden_layer_sizes=(250,), alpha=reg_param, max_iter=500,
                    gridsearch=False)
                performance_tr += evaluation.accuracy(train_y, class_train_y)
                performance_te += evaluation.accuracy(test_y, class_test_y)
            performance_training.append(performance_tr / N_REPEATS_NN)
            performance_test.append(performance_te / N_REPEATS_NN)
        DataViz.plot_xy(x=[reg_parameters, reg_parameters], y=[performance_training, performance_test],
                        method='semilogx', xlabel='regularization parameter value', ylabel='accuracy',
                        ylim=[0.95, 1.01], names=['training', 'test'], line_styles=['r-', 'b:'])

    if FLAGS.mode == 'tree' or FLAGS.mode == 'all':
        print('\n- - - Running leaf size test of decision tree - - -')
        # Consider the influence of the minimum leaf size of the tree model (closely related to regularization)
        # and study its impact on performance.
        leaf_settings = [1, 2, 5, 10]
        performance_training = []
        performance_test = []
        for no_points_leaf in leaf_settings:
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                train_X[selected_features], train_y, test_X[selected_features],
                min_samples_leaf=no_points_leaf, gridsearch=False, print_model_details=False)
            performance_training.append(evaluation.accuracy(train_y, class_train_y))
            performance_test.append(evaluation.accuracy(test_y, class_test_y))
        DataViz.plot_xy(x=[leaf_settings, leaf_settings], y=[performance_training, performance_test],
                        xlabel='Minimum number of points per leaf', ylabel='Accuracy',
                        names=['training', 'test'], line_styles=['r-', 'b:'])

    if FLAGS.mode == 'overall' or FLAGS.mode == 'all':
        print('\n- - - Running test of all different classification algorithms - - -')
        # Perform grid searches over the most important parameters, using cross-validation on the training set
        possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4,
                                 features_after_chapter_5, selected_features]
        feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']
        N_KCV_REPEATS = FLAGS.kcvrepeat

        scores_over_all_algs = []
        for i in range(0, len(possible_feature_sets)):
            selected_train_X = train_X[possible_feature_sets[i]]
            selected_test_X = test_X[possible_feature_sets[i]]

            # First run the non-deterministic classifiers a number of times to average their scores
            performance_tr_nn, performance_te_nn = 0, 0
            performance_tr_rf, performance_te_rf = 0, 0
            performance_tr_svm, performance_te_svm = 0, 0
            for repeat in range(0, N_KCV_REPEATS):
                print(f'Training NeuralNetwork run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ... ')
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    selected_train_X, train_y, selected_test_X, gridsearch=True)
                performance_tr_nn += evaluation.accuracy(train_y, class_train_y)
                performance_te_nn += evaluation.accuracy(test_y, class_test_y)

                print(f'Training RandomForest run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ... ')
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                    selected_train_X, train_y, selected_test_X, gridsearch=True)
                performance_tr_rf += evaluation.accuracy(train_y, class_train_y)
                performance_te_rf += evaluation.accuracy(test_y, class_test_y)

                print(f'Training SVM run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ...')
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = \
                    learner.support_vector_machine_with_kernel(selected_train_X, train_y, selected_test_X,
                                                               gridsearch=True)
                performance_tr_svm += evaluation.accuracy(train_y, class_train_y)
                performance_te_svm += evaluation.accuracy(test_y, class_test_y)

            overall_performance_tr_nn = performance_tr_nn / N_KCV_REPEATS
            overall_performance_te_nn = performance_te_nn / N_KCV_REPEATS
            overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
            overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
            overall_performance_tr_svm = performance_tr_svm / N_KCV_REPEATS
            overall_performance_te_svm = performance_te_svm / N_KCV_REPEATS

            # Then run the deterministic classifiers once:
            print('Deterministic Classifiers:')
            print(f'Training Nearest Neighbor run 1 / 1, featureset {feature_names[i]}')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_knn = evaluation.accuracy(train_y, class_train_y)
            performance_te_knn = evaluation.accuracy(test_y, class_test_y)

            print(f'Training Decision Tree run 1 / 1, featureset {feature_names[i]}')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
            performance_te_dt = evaluation.accuracy(test_y, class_test_y)

            print(f'Training Naive Bayes run 1 / 1, featureset {feature_names[i]}')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
                selected_train_X, train_y, selected_test_X)
            performance_tr_nb = evaluation.accuracy(train_y, class_train_y)
            performance_te_nb = evaluation.accuracy(test_y, class_test_y)

            scores_with_sd = util.print_table_row_performances(
                feature_names[i], len(selected_train_X.index), len(selected_test_X.index),
                [(overall_performance_tr_nn, overall_performance_te_nn),
                 (overall_performance_tr_rf, overall_performance_te_rf),
                 (overall_performance_tr_svm, overall_performance_te_svm),
                 (performance_tr_knn, performance_te_knn),
                 (performance_tr_dt, performance_te_dt),
                 (performance_tr_nb, performance_te_nb)])
            scores_over_all_algs.append(scores_with_sd)

        DataViz.plot_performances_classification(['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'],
                                                 feature_names, scores_over_all_algs)

    if FLAGS.mode == 'detail' or FLAGS.mode == 'all':
        print('\n- - - Running detail test of promising classification algorithms - - -')
        # Study two promising algorithms in more detail: the decision tree and the random forest
        learner.decision_tree(train_X[selected_features], train_y, test_X[selected_features],
                              gridsearch=True, print_model_details=True, export_tree_path=EXPORT_TREE_PATH)

        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
            train_X[selected_features], train_y, test_X[selected_features],
            gridsearch=True, print_model_details=True)

        test_cm = evaluation.confusion_matrix(test_y, class_test_y, class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm, class_train_prob_y.columns, normalize=False)
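
# main() reads its settings from a module-level FLAGS object; the flags referenced above are mode,
# nfeatures, nnrepeat and kcvrepeat. Below is a minimal sketch of a matching argparse setup (argparse is
# imported in the preamble sketch above); the default values are assumptions rather than values
# prescribed by the original script.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='all',
                        help="Experiment to run: 'selection', 'regularization', 'tree', 'overall', 'detail' or 'all'")
    parser.add_argument('--nfeatures', type=int, default=10,
                        help='Number of features to consider in the forward selection')
    parser.add_argument('--nnrepeat', type=int, default=3,
                        help='Number of repeats of the neural network regularization experiment')
    parser.add_argument('--kcvrepeat', type=int, default=5,
                        help='Number of repeats of the non-deterministic classifiers')
    FLAGS, unparsed = parser.parse_known_args()

    main()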
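
# Own-data experiment. The experiment() function below was written for a self-collected dataset and
# refers to several module-level names that are not defined in this excerpt: time_col, basic_features,
# N_FORWARD_SELECTION, a log() helper and datetime. The definitions below are illustrative assumptions
# added to make the excerpt self-contained; the column names in particular depend on how the own
# dataset was recorded.
from datetime import datetime

time_col = 'timestamp'          # assumed name of the index column of the own-data CSV files
N_FORWARD_SELECTION = 20        # assumed maximum number of features for the forward selection

# Assumed basic feature columns: accelerometer and gyroscope, three axes each.
basic_features = ['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y', 'gyr_z']


def log(messages):
    # Minimal stand-in for the logging helper used in experiment(): print each message with a timestamp.
    for message in messages:
        print(f'{datetime.now()}: {message}')
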
def experiment(file):
    dataset = pd.read_csv(file, index_col=time_col)
    # Create a separate figures directory per input file
    DataViz = VisualizeDataset(__file__.split('.')[0] + file.split('.')[0].split('/')[1] + '.py', show=True)
    print(DataViz.figures_dir)
    dataset.index = pd.to_datetime(dataset.index)

    prepare = PrepareDatasetForLearning()
    # Note: this call assumes a version of split_single_dataset_classification that supports the
    # filter/drop_na/fill_na keywords (main() above uses filter_data instead).
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'], 'like', 0.7, filter=False, temporal=False, drop_na=False, fill_na=True)

    time_features = [name for name in dataset.columns if '_temp' in name]
    freq_features = [name for name in dataset.columns if ('_freq' in name) or ('_pse' in name)]
    cluster_features = ['cluster']
    features_2 = list(set().union(basic_features, time_features))
    features_3 = list(set().union(basic_features, time_features, freq_features))
    features_4 = list(set().union(basic_features, time_features, freq_features, cluster_features))

    # The forward selection run is commented out; the hard-coded list below holds the features it selected
    # in an earlier run. The plot of the selection scores only applies when the selection is re-run, so it
    # is kept with the commented-out block.
    # print('feature selection')
    # fs = FeatureSelectionClassification()
    # features, selected_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
    #                                                                    train_X[features_4], train_y)
    # log([str(ordered_scores), str(selected_features)])
    # DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)], y=[ordered_scores],
    #                 xlabel='number of features', ylabel='accuracy')
    selected_features = [
        'gyr_y_temp_std_ws_1200', 'acc_z_temp_mean_ws_120', 'acc_x_temp_mean_ws_120', 'gyr_x_temp_std_ws_2400',
        'gyr_z_max_freq', 'gyr_y_freq_1.9_Hz_ws_40', 'acc_z_freq_0.4_Hz_ws_40', 'gyr_z_freq_1.2_Hz_ws_40',
        'gyr_x_freq_0.2_Hz_ws_40', 'acc_z_freq_1.0_Hz_ws_40', 'acc_x_freq_0.2_Hz_ws_40', 'acc_y_freq_1.9_Hz_ws_40',
        'gyr_x_temp_mean_ws_1200', 'acc_z_freq_1.9_Hz_ws_40', 'acc_x_temp_std_ws_120', 'gyr_z_temp_std_ws_120',
        'gyr_y_freq_1.5_Hz_ws_40', 'gyr_z_temp_mean_ws_120', 'gyr_x_freq_0.0_Hz_ws_40', 'acc_z_freq_0.6_Hz_ws_40'
    ]
    print('feature selection finished for %s' % file)

    learner = ClassificationAlgorithms()
    evaluation = ClassificationEvaluation()
    possible_feature_sets = [basic_features, features_2, features_3, features_4, selected_features]
    feature_names = ['Basic features', 'Features with time', 'Features with frequency',
                     'Features with cluster', 'Selected features']

    # with shelve.open('temp/shelve.out', 'n') as f:
    #     for key in dir():
    #         try:
    #             f[key] = globals()[key]
    #         except:
    #             print('ERROR shelving: {0}'.format(key))

    N_KCV_REPEATS = 1
    scores_over_all_algs = []
    for i in range(0, len(possible_feature_sets)):
        print(datetime.now())
        print('possible feature sets', i)
        log(['Features %d' % i])
        selected_train_X = train_X[possible_feature_sets[i]]
        selected_test_X = test_X[possible_feature_sets[i]]

        # First we run our non-deterministic classifiers a number of times to average their score.
        performance_tr_rf = 0
        performance_te_rf = 0
        for repeat in range(0, N_KCV_REPEATS):
            print(datetime.now())
            print('\nRepeat', repeat)
            print('Random Forest')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                selected_train_X, train_y, selected_test_X, gridsearch=True, print_model_details=True)
            test_cm = evaluation.confusion_matrix(test_y, class_test_y, class_train_prob_y.columns)
            DataViz.plot_confusion_matrix(test_cm, class_train_prob_y.columns, normalize=False)
            performance_tr_rf += evaluation.accuracy(train_y, class_train_y)
            performance_te_rf += evaluation.accuracy(test_y, class_test_y)
            print(datetime.now())

        overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
        overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
        log(['RF' + ' train acc: %f' % overall_performance_tr_rf +
             ' test acc: %f' % overall_performance_te_rf])

        # And we run our deterministic classifiers:
        print('decision tree')
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
            selected_train_X, train_y, selected_test_X, gridsearch=True, print_model_details=True)
        performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
        performance_te_dt = evaluation.accuracy(test_y, class_test_y)
        test_cm = evaluation.confusion_matrix(test_y, class_test_y, class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm, class_train_prob_y.columns, normalize=False)
        log(['DT' + ' train acc: %f' % performance_tr_dt +
             ' test acc: %f' % performance_te_dt])

        scores_with_sd = util.print_table_row_performances(
            feature_names[i], len(selected_train_X.index), len(selected_test_X.index),
            [(overall_performance_tr_rf, overall_performance_te_rf),
             (performance_tr_dt, performance_te_dt)])
        scores_over_all_algs.append(scores_with_sd)

    DataViz.plot_performances_classification(['RF', 'DT'], feature_names, scores_over_all_algs)
    print(datetime.now())
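
# Hypothetical driver for experiment(): assuming it lives in its own script, it could be invoked on one or
# more own-data chapter 5 result files. The path below is a placeholder, not a file name taken from the
# original project.
if __name__ == '__main__':
    experiment('intermediate_datafiles/own_data_chapter5_result.csv')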