    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(
        selected_train_X, train_y, selected_test_X, gridsearch=True)
    performance_tr_knn = eval.accuracy(train_y, class_train_y)
    performance_te_knn = eval.accuracy(test_y, class_test_y)

    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
        selected_train_X, train_y, selected_test_X, gridsearch=True)
    performance_tr_dt = eval.accuracy(train_y, class_train_y)
    performance_te_dt = eval.accuracy(test_y, class_test_y)

    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
        selected_train_X, train_y, selected_test_X)
    performance_tr_nb = eval.accuracy(train_y, class_train_y)
    performance_te_nb = eval.accuracy(test_y, class_test_y)

    scores_with_sd = util.print_table_row_performances(
        feature_names[i], len(selected_train_X.index), len(selected_test_X.index),
        [(overall_performance_tr_nn, overall_performance_te_nn),
         (overall_performance_tr_rf, overall_performance_te_rf),
         (overall_performance_tr_svm, overall_performance_te_svm),
         (performance_tr_knn, performance_te_knn),
         (performance_tr_dt, performance_te_dt),
         (performance_tr_nb, performance_te_nb)])
    scores_over_all_algs.append(scores_with_sd)

DataViz.plot_performances_classification(['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'],
                                         feature_names, scores_over_all_algs)

# And we study two promising ones in more detail. First, let us consider the
# decision tree, which works best with the selected features.
# class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
#     train_X[selected_features], train_y, test_X[selected_features],
#     gridsearch=True, print_model_details=True, export_tree_path=export_tree_path)

# Random forest on the selected features (the trailing arguments were truncated
# in the source; they are assumed to mirror the decision tree call above):
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
    train_X[selected_features], train_y, test_X[selected_features],
    gridsearch=True, print_model_details=True)
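# For orientation, a minimal, self-contained sketch of roughly what
# learner.random_forest with gridsearch=True amounts to: a scikit-learn
# RandomForestClassifier wrapped in GridSearchCV. The parameter grid below is an
# assumption chosen for illustration; the actual grid lives inside
# ClassificationAlgorithms. This function is illustrative only and is not called.
def _random_forest_gridsearch_sketch(train_X, train_y, test_X):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    # Search a small grid with 5-fold cross-validation on the training set only.
    tuned_parameters = {'n_estimators': [10, 50, 100],
                        'min_samples_leaf': [2, 10, 50],
                        'criterion': ['gini', 'entropy']}
    rf = GridSearchCV(RandomForestClassifier(), tuned_parameters,
                      cv=5, scoring='accuracy')
    rf.fit(train_X, train_y.values.ravel())

    # Predict on both sets so train and test accuracy can be compared to
    # check for overfitting.
    return rf.predict(train_X), rf.predict(test_X)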
""" overall_performance_tr_svm = performance_tr_svm/repeats overall_performance_te_svm = performance_te_svm/repeats """ # And we run our deterministic classifiers: """ class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(selected_train_X, train_y, selected_test_X, gridsearch=True) performance_tr_knn = eval.accuracy(train_y, class_train_y) performance_te_knn = eval.accuracy(test_y, class_test_y) class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(selected_train_X, train_y, selected_test_X, gridsearch=True) performance_tr_dt = eval.accuracy(train_y, class_train_y) performance_te_dt = eval.accuracy(test_y, class_test_y) class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(selected_train_X, train_y, selected_test_X) performance_tr_nb = eval.accuracy(train_y, class_train_y) performance_te_nb = eval.accuracy(test_y, class_test_y) """ scores_with_sd = util.print_table_row_performances( feature_names[i], len(selected_train_X.index), len(selected_test_X.index), [(overall_performance_tr_rf, overall_performance_te_rf)]) scores_over_all_algs.append(scores_with_sd) print scores_over_all_algs DataViz.plot_performances_classification(['RF'], feature_names, scores_over_all_algs) exit(0)
overall_performance_tr_rf = performance_tr_rf/repeats overall_performance_te_rf = performance_te_rf/repeats """ overall_performance_tr_svm = performance_tr_svm / repeats overall_performance_te_svm = performance_te_svm / repeats # And we run our deterministic classifiers: """ class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(selected_train_X, train_y, selected_test_X, gridsearch=True) performance_tr_knn = eval.accuracy(train_y, class_train_y) performance_te_knn = eval.accuracy(test_y, class_test_y) class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(selected_train_X, train_y, selected_test_X, gridsearch=True) performance_tr_dt = eval.accuracy(train_y, class_train_y) performance_te_dt = eval.accuracy(test_y, class_test_y) class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(selected_train_X, train_y, selected_test_X) performance_tr_nb = eval.accuracy(train_y, class_train_y) performance_te_nb = eval.accuracy(test_y, class_test_y) """ scores_with_sd = util.print_table_row_performances( feature_names[i], len(selected_train_X.index), len(selected_test_X.index), [(performance_tr_svm, performance_te_svm)]) scores_over_all_algs.append(scores_with_sd) print scores_over_all_algs DataViz.plot_performances_classification(['SVM'], feature_names, scores_over_all_algs) exit(0)
def experiment(file):
    dataset = pd.read_csv(file, index_col=time_col)
    DataViz = VisualizeDataset(
        __file__.split('.')[0] + file.split('.')[0].split('/')[1] + '.py',
        show=True)
    print(DataViz.figures_dir)
    dataset.index = pd.to_datetime(dataset.index)

    prepare = PrepareDatasetForLearning()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'], 'like', 0.7,
        filter=False, temporal=False, drop_na=False, fill_na=True)

    time_features = [name for name in dataset.columns if '_temp' in name]
    freq_features = [name for name in dataset.columns
                     if ('_freq' in name) or ('_pse' in name)]
    cluster_features = ['cluster']
    features_2 = list(set().union(basic_features, time_features))
    features_3 = list(set().union(basic_features, time_features, freq_features))
    features_4 = list(set().union(basic_features, time_features, freq_features,
                                  cluster_features))

    # Forward selection is expensive, so we ran it once and hard-coded the result:
    # print('feature selection')
    # fs = FeatureSelectionClassification()
    # features, selected_features, ordered_scores = fs.forward_selection(
    #     N_FORWARD_SELECTION, train_X[features_4], train_y)
    # log([str(ordered_scores), str(selected_features)])
    selected_features = [
        'gyr_y_temp_std_ws_1200', 'acc_z_temp_mean_ws_120', 'acc_x_temp_mean_ws_120',
        'gyr_x_temp_std_ws_2400', 'gyr_z_max_freq', 'gyr_y_freq_1.9_Hz_ws_40',
        'acc_z_freq_0.4_Hz_ws_40', 'gyr_z_freq_1.2_Hz_ws_40', 'gyr_x_freq_0.2_Hz_ws_40',
        'acc_z_freq_1.0_Hz_ws_40', 'acc_x_freq_0.2_Hz_ws_40', 'acc_y_freq_1.9_Hz_ws_40',
        'gyr_x_temp_mean_ws_1200', 'acc_z_freq_1.9_Hz_ws_40', 'acc_x_temp_std_ws_120',
        'gyr_z_temp_std_ws_120', 'gyr_y_freq_1.5_Hz_ws_40', 'gyr_z_temp_mean_ws_120',
        'gyr_x_freq_0.0_Hz_ws_40', 'acc_z_freq_0.6_Hz_ws_40'
    ]

    # Plotting the selection curve needs ordered_scores from the (commented-out)
    # forward selection run above; the feature names themselves cannot be plotted
    # against accuracy:
    # DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)], y=[ordered_scores],
    #                 xlabel='number of features', ylabel='accuracy')
    print('feature selection finished for %s' % file)

    learner = ClassificationAlgorithms()
    eval = ClassificationEvaluation()

    possible_feature_sets = [basic_features, features_2, features_3, features_4,
                             selected_features]
    feature_names = ['Basic features', 'Features with time', 'Features with frequency',
                     'Features with cluster', 'Selected features']

    # with shelve.open('temp/shelve.out', 'n') as f:
    #     for key in dir():
    #         try:
    #             f[key] = globals()[key]
    #         except:
    #             print('ERROR shelving: {0}'.format(key))

    N_KCV_REPEATS = 1

    scores_over_all_algs = []

    for i in range(0, len(possible_feature_sets)):
        print(datetime.now())
        print('possible feature sets', i)
        log(['Features %d' % i])
        selected_train_X = train_X[possible_feature_sets[i]]
        selected_test_X = test_X[possible_feature_sets[i]]

        # First we run our non-deterministic classifiers a number of times to
        # average their score.
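        # A random forest is non-deterministic: each tree is grown on a bootstrap
        # sample and considers a random subset of features at each split, so its
        # accuracy varies between runs. Summing over N_KCV_REPEATS runs and
        # dividing by N_KCV_REPEATS below averages this variation away (with
        # N_KCV_REPEATS = 1 it reduces to a single run).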
        performance_tr_rf = 0
        performance_te_rf = 0

        for repeat in range(0, N_KCV_REPEATS):
            print(datetime.now())
            print('\nRepeat', repeat)
            print('Random Forest')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                selected_train_X, train_y, selected_test_X,
                gridsearch=True, print_model_details=True)
            test_cm = eval.confusion_matrix(test_y, class_test_y,
                                            class_train_prob_y.columns)
            DataViz.plot_confusion_matrix(test_cm, class_train_prob_y.columns,
                                          normalize=False)
            performance_tr_rf += eval.accuracy(train_y, class_train_y)
            performance_te_rf += eval.accuracy(test_y, class_test_y)
            print(datetime.now())

        overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
        overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
        log(['RF' + ' train acc: %f' % overall_performance_tr_rf
             + ' test acc: %f' % overall_performance_te_rf])

        # And we run our deterministic classifiers:
        print('decision tree')
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
            selected_train_X, train_y, selected_test_X,
            gridsearch=True, print_model_details=True)
        performance_tr_dt = eval.accuracy(train_y, class_train_y)
        performance_te_dt = eval.accuracy(test_y, class_test_y)
        test_cm = eval.confusion_matrix(test_y, class_test_y,
                                        class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm, class_train_prob_y.columns,
                                      normalize=False)
        log(['DT' + ' train acc: %f' % performance_tr_dt
             + ' test acc: %f' % performance_te_dt])

        scores_with_sd = util.print_table_row_performances(
            feature_names[i], len(selected_train_X.index), len(selected_test_X.index),
            [(overall_performance_tr_rf, overall_performance_te_rf),
             (performance_tr_dt, performance_te_dt)])
        scores_over_all_algs.append(scores_with_sd)

    DataViz.plot_performances_classification(['RF', 'DT'], feature_names,
                                             scores_over_all_algs)
    print(datetime.now())
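# Minimal usage sketch: run the experiment on a single prepared dataset. The
# path below is hypothetical; point it at a CSV holding the engineered features,
# a 'label' column, and a datetime index named by time_col.
if __name__ == '__main__':
    experiment('intermediate_datafiles/chapter5_result.csv')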