Example 1
    def backward_selection(self, max_features, X_train, y_train):
        # First select all features.
        selected_features = X_train.columns.tolist()
        ca = ClassificationAlgorithms()
        ce = ClassificationEvaluation()
        for i in range(0, (len(X_train.columns) - max_features)):
            best_perf = 0
            worst_feature = ''

            # Select from the features that are still in the selection.
            for f in selected_features:
                temp_selected_features = copy.deepcopy(selected_features)
                temp_selected_features.remove(f)

                # Determine the score without the feature.
                pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(X_train[temp_selected_features], y_train, X_train[temp_selected_features])
                perf = ce.accuracy(y_train, pred_y_train)

                # If we score better without the feature than what we have seen so far
                # this is the worst feature.
                if perf > best_perf:
                    best_perf = perf
                    worst_feature = f

            # Remove the worst feature.
            selected_features.remove(worst_feature)
        return selected_features
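
A minimal usage sketch for this `backward_selection` (not from the original source): the import path and the synthetic DataFrame are assumptions for illustration; in the ML4QS-style code base the method lives on a `FeatureSelectionClassification` class.

import numpy as np
import pandas as pd
from Chapter7.FeatureSelection import FeatureSelectionClassification  # assumed repo path

# Synthetic stand-in data: 100 rows, 5 candidate features, binary label.
rng = np.random.default_rng(0)
X_train = pd.DataFrame(rng.normal(size=(100, 5)),
                       columns=[f'feat_{i}' for i in range(5)])
y_train = pd.Series(rng.integers(0, 2, size=100))

fs = FeatureSelectionClassification()
# Drop features one at a time until only the 3 best (by training accuracy) remain.
selected = fs.backward_selection(3, X_train, y_train)
print(selected)
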
Example 2
    def forward_selection(
            max_features: int, X_train: pd.DataFrame,
            y_train: pd.Series) -> Tuple[List[str], List[str], List[float]]:
        """
        Select the given number of features for classification that show the best accuracy, using forward selection.
        The method uses the given features and labels to train a decision tree and determine the accuracy of the
        prediction. The method returns the selected features as well as the scores.

        :param max_features: Number of features to select.
        :param X_train: Features as DataFrame.
        :param y_train: Labels corresponding to given features.
        :return: Selected features and scores.
        """

        # Start with no features
        ordered_features = []
        ordered_scores = []
        selected_features = []
        ca = ClassificationAlgorithms()
        ce = ClassificationEvaluation()

        # Select the appropriate number of features
        for i in range(0, max_features):
            # Determine the features left to select
            features_left = list(set(X_train.columns) - set(selected_features))
            best_perf = 0
            best_feature = ''

            print(f'Selecting feature {i+1}/{max_features}')
            # Iterate over all features left
            for f in features_left:
                temp_selected_features = copy.deepcopy(selected_features)
                temp_selected_features.append(f)

                # Determine the accuracy of a decision tree learner adding the feature
                pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(
                    X_train[temp_selected_features],
                    y_train,
                    X_train[temp_selected_features],
                    gridsearch=False)
                perf = ce.accuracy(y_train, pred_y_train)

                # If the performance is better than the best so far (aiming for high accuracy), set the current feature
                # to the best feature and the same for the best performance
                if perf > best_perf:
                    best_perf = perf
                    best_feature = f
            # Select the feature with the best performance
            selected_features.append(best_feature)
            ordered_features.append(best_feature)
            ordered_scores.append(best_perf)
        return selected_features, ordered_features, ordered_scores
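
The returned `ordered_scores` are what later snippets plot to pick a cut-off. A short sketch of that pattern, assuming `train_X` and `train_y` already exist and using an illustrative `max_features` of 25 (matching the `range(1, 26)` plot further down):

import matplotlib.pyplot as plot  # later snippets alias pyplot as `plot`

max_features = 25
selected, ordered_features, ordered_scores = forward_selection(
    max_features, train_X, train_y)

# Training accuracy vs. number of features; pick the elbow (e.g. the top 10).
plot.plot(range(1, max_features + 1), ordered_scores)
plot.xlabel('number of features')
plot.ylabel('accuracy')
plot.show()
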
Example 3
    def forward_selection(self, max_features, X_train, y_train):
        # Start with no features.
        ordered_features = []
        ordered_scores = []
        selected_features = []
        ca = ClassificationAlgorithms()
        ce = ClassificationEvaluation()
        prev_best_perf = 0

        # Select the appropriate number of features.
        for i in range(0, max_features):
            print(i)

            #Determine the features left to select.
            features_left = list(set(X_train.columns) - set(selected_features))
            best_perf = 0
            best_feature = ''

            # For all features we can still select...
            for f in features_left:
                temp_selected_features = copy.deepcopy(selected_features)
                temp_selected_features.append(f)

                # Determine the accuracy of a decision tree learner if we were to add
                # the feature.
                pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(
                    X_train[temp_selected_features], y_train,
                    X_train[temp_selected_features])
                perf = ce.accuracy(y_train, pred_y_train)

                # If the performance is better than what we have seen so far (we aim for high accuracy)
                # we set the current feature to the best feature and the same for the best performance.
                if perf > best_perf:
                    best_perf = perf
                    best_feature = f
            # We select the feature with the best performance.
            selected_features.append(best_feature)
            prev_best_perf = best_perf
            ordered_features.append(best_feature)
            ordered_scores.append(best_perf)
        return selected_features, ordered_features, ordered_scores
Example 4
    def backward_selection(max_features: int, X_train: pd.DataFrame,
                           y_train: pd.Series) -> List[str]:
        """
        Select the given number of features for classification that show the best accuracy, using backward selection.
        The method uses the given features and labels to train a decision tree and determine the accuracy of the
        prediction.

        :param max_features: Number of features to select.
        :param X_train: Features as DataFrame.
        :param y_train: Labels corresponding to given features.
        :return: Selected features.
        """

        # First select all features
        selected_features = X_train.columns.tolist()
        ca = ClassificationAlgorithms()
        ce = ClassificationEvaluation()
        for i in range(0, (len(X_train.columns) - max_features)):
            best_perf = 0
            worst_feature = ''

            # Select from the features that are still in the selection
            for f in selected_features:
                temp_selected_features = copy.deepcopy(selected_features)
                temp_selected_features.remove(f)

                # Determine the score without the feature
                pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(
                    X_train[temp_selected_features], y_train,
                    X_train[temp_selected_features])
                perf = ce.accuracy(y_train, pred_y_train)

                # If scoring better without the feature than seen so far, this is the worst feature
                if perf > best_perf:
                    best_perf = perf
                    worst_feature = f

            # Remove the worst feature
            selected_features.remove(worst_feature)
        return selected_features
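
A quick cost comparison between the two wrappers (not from the source; the arithmetic follows directly from the loop bounds above). Each round trains one decision tree per candidate feature, so keeping a small subset of many features is much cheaper forward than backward:

def forward_cost(d, k):
    # Forward selection: round i evaluates the d - i features still outside
    # the selection, for k rounds: d + (d-1) + ... + (d-k+1).
    return sum(d - i for i in range(k))

def backward_cost(d, k):
    # Backward selection: round j evaluates the d - j features still inside
    # the selection, for d - k rounds: d + (d-1) + ... + (k+1).
    return sum(d - j for j in range(d - k))

# E.g. keeping 10 of 100 candidate features:
print(forward_cost(100, 10))   # 955 trees trained
print(backward_cost(100, 10))  # 4995 trees trained
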
Example 5
def main():
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the first task, namely the prediction of the label. Therefore create a single column with the categorical
    # attribute representing the class. Furthermore, use 70% of the data for training and the remaining 30% as an
    # independent test set. Select the sets based on stratified sampling and remove cases where the label is unknown.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = ClassificationAlgorithms()
    evaluation = ClassificationEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'], 'like', 0.7, filter_data=True, temporal=False)

    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
        'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'hr_watch_rate', 'light_phone_lux', 'mag_phone_x', 'mag_phone_y',
        'mag_phone_z', 'mag_watch_x', 'mag_watch_y', 'mag_watch_z',
        'press_phone_pressure'
    ]
    pca_features = [
        'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7'
    ]
    time_features = [name for name in dataset.columns if '_temp_' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3,
                                                time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4,
                                                cluster_features))

    if FLAGS.mode == 'selection' or FLAGS.mode == 'all':
        # First, consider the performance over a selection of features
        N_FORWARD_SELECTION = FLAGS.nfeatures
        fs = FeatureSelectionClassification()
        print('\n- - - Running feature selection - - -')
        features, ordered_features, ordered_scores = fs.forward_selection(
            max_features=N_FORWARD_SELECTION,
            X_train=train_X[features_after_chapter_5],
            y_train=train_y)
        DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
                        y=[ordered_scores],
                        xlabel='number of features',
                        ylabel='accuracy')

    # Select the most important features (based on the Python 2 feature selection run)
    selected_features = [
        'acc_phone_y_freq_0.0_Hz_ws_40',
        'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120',
        'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted',
        'gyr_phone_y_freq_1.0_Hz_ws_40', 'acc_phone_x_freq_1.9_Hz_ws_40',
        'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40'
    ]

    if FLAGS.mode == 'regularization' or FLAGS.mode == 'all':
        print('\n- - - Running regularization and model complexity test - - -')
        # Study the impact of regularization and model complexity: does regularization prevent overfitting?
        # Due to runtime constraints run the experiment 3 times, for even more robust data increase the repetitions
        N_REPEATS_NN = FLAGS.nnrepeat
        reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        performance_training = []
        performance_test = []

        for reg_param in reg_parameters:
            performance_tr = 0
            performance_te = 0
            for i in range(0, N_REPEATS_NN):
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    train_X,
                    train_y,
                    test_X,
                    hidden_layer_sizes=(250, ),
                    alpha=reg_param,
                    max_iter=500,
                    gridsearch=False)
                performance_tr += evaluation.accuracy(train_y, class_train_y)
                performance_te += evaluation.accuracy(test_y, class_test_y)
            performance_training.append(performance_tr / N_REPEATS_NN)
            performance_test.append(performance_te / N_REPEATS_NN)
        DataViz.plot_xy(x=[reg_parameters, reg_parameters],
                        y=[performance_training, performance_test],
                        method='semilogx',
                        xlabel='regularization parameter value',
                        ylabel='accuracy',
                        ylim=[0.95, 1.01],
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if FLAGS.mode == 'tree' or FLAGS.mode == 'all':
        print('\n- - - Running leaf size test of decision tree - - -')
        # Consider the influence of certain parameter settings for the tree model. (very related to the
        # regularization) and study the impact on performance.
        leaf_settings = [1, 2, 5, 10]
        performance_training = []
        performance_test = []

        for no_points_leaf in leaf_settings:
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                train_X[selected_features],
                train_y,
                test_X[selected_features],
                min_samples_leaf=no_points_leaf,
                gridsearch=False,
                print_model_details=False)

            performance_training.append(
                evaluation.accuracy(train_y, class_train_y))
            performance_test.append(evaluation.accuracy(test_y, class_test_y))

        DataViz.plot_xy(x=[leaf_settings, leaf_settings],
                        y=[performance_training, performance_test],
                        xlabel='Minimum number of points per leaf',
                        ylabel='Accuracy',
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if FLAGS.mode == 'overall' or FLAGS.mode == 'all':
        print(
            '\n- - - Running test of all different classification algorithms - - -'
        )
        # Perform grid searches over the most important parameters and do so by means of cross validation upon the
        # training set
        possible_feature_sets = [
            basic_features, features_after_chapter_3, features_after_chapter_4,
            features_after_chapter_5, selected_features
        ]
        feature_names = [
            'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5',
            'Selected features'
        ]
        N_KCV_REPEATS = FLAGS.kcvrepeat

        scores_over_all_algs = []

        for i in range(0, len(possible_feature_sets)):
            selected_train_X = train_X[possible_feature_sets[i]]
            selected_test_X = test_X[possible_feature_sets[i]]

            # First run non deterministic classifiers a number of times to average their score
            performance_tr_nn, performance_te_nn = 0, 0
            performance_tr_rf, performance_te_rf = 0, 0
            performance_tr_svm, performance_te_svm = 0, 0

            for repeat in range(0, N_KCV_REPEATS):
                print(
                    f'Training NeuralNetwork run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ... '
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    gridsearch=True)

                print(
                    f'Training RandomForest run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ... '
                )
                performance_tr_nn += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_nn += evaluation.accuracy(test_y, class_test_y)

                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    gridsearch=True)
                performance_tr_rf += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_rf += evaluation.accuracy(test_y, class_test_y)

                print(
                    f'Training SVM run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ...'
                )

                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner. \
                    support_vector_machine_with_kernel(selected_train_X, train_y, selected_test_X, gridsearch=True)
                performance_tr_svm += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_svm += evaluation.accuracy(test_y, class_test_y)

            overall_performance_tr_nn = performance_tr_nn / N_KCV_REPEATS
            overall_performance_te_nn = performance_te_nn / N_KCV_REPEATS
            overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
            overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
            overall_performance_tr_svm = performance_tr_svm / N_KCV_REPEATS
            overall_performance_te_svm = performance_te_svm / N_KCV_REPEATS

            # Run deterministic classifiers:
            print("Deterministic Classifiers:")

            print(
                f'Training Nearest Neighbor run 1 / 1, featureset {feature_names[i]}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_knn = evaluation.accuracy(train_y, class_train_y)
            performance_te_knn = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Decision Tree run 1 / 1  featureset {feature_names[i]}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
            performance_te_dt = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Naive Bayes run 1/1 featureset {feature_names[i]}')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
                selected_train_X, train_y, selected_test_X)
            performance_tr_nb = evaluation.accuracy(train_y, class_train_y)
            performance_te_nb = evaluation.accuracy(test_y, class_test_y)

            scores_with_sd = util. \
                print_table_row_performances(feature_names[i], len(selected_train_X.index),
                                             len(selected_test_X.index), [
                                                 (overall_performance_tr_nn, overall_performance_te_nn),
                                                 (overall_performance_tr_rf, overall_performance_te_rf),
                                                 (overall_performance_tr_svm, overall_performance_te_svm),
                                                 (performance_tr_knn, performance_te_knn),
                                                 (performance_tr_dt, performance_te_dt),
                                                 (performance_tr_nb, performance_te_nb)])
            scores_over_all_algs.append(scores_with_sd)

        DataViz.plot_performances_classification(
            ['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'], feature_names,
            scores_over_all_algs)

    if FLAGS.mode == 'detail' or FLAGS.mode == 'all':
        print(
            '\n- - - Running detail test of promising classification algorithms - - -'
        )
        # Study two promising ones in more detail, namely decision tree and random forest algorithm
        learner.decision_tree(train_X[selected_features],
                              train_y,
                              test_X[selected_features],
                              gridsearch=True,
                              print_model_details=True,
                              export_tree_path=EXPORT_TREE_PATH)

        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
            train_X[selected_features],
            train_y,
            test_X[selected_features],
            gridsearch=True,
            print_model_details=True)

        test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                              class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)
Example 6
    'mag_y_freq_0.8_Hz_ws_40', 'pca_7_temp_std_ws_120', 'mag_x_max_freq',
    'gyr_z_freq_2.0_Hz_ws_40', 'gyr_y_freq_0.0_Hz_ws_40',
    'mag_z_freq_1.5_Hz_ws_40', 'acc_z_temp_MAD_ws_120',
    'acc_y_temp_kurtosis_ws_120', 'mag_x_freq_1.2_Hz_ws_40',
    'lin_acc_y_freq_1.8_Hz_ws_40'
]

selected_features_with_DT = [
    'acc_z_freq_0.0_Hz_ws_40', 'loc_height_temp_mean_ws_120',
    'pca_4_temp_kurtosis_ws_120', 'lin_acc_y_temp_kurtosis_ws_120',
    'pca_1_temp_kurtosis_ws_120', 'acc_z_temp_MAD_ws_120',
    'mag_x_freq_1.2_Hz_ws_40', 'gyr_z_freq_2.0_Hz_ws_40',
    'acc_y_temp_kurtosis_ws_120', 'lin_acc_y_freq_0.6_Hz_ws_40'
]

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()

possible_feature_sets = [
    basic_features, features_after_outliers_and_imputation,
    features_after_domain_features, features_after_cluster_features,
    selected_features_with_DT, selected_features_with_NB
]
feature_names = [
    'initial set', 'After imputation', 'With Domain features',
    'With cluster features', 'Selected features DT', 'Selected features NB'
]

repeats = 5

scores_over_all_algs = []
Example 7
plot.ylabel('accuracy')
plot.show()

# Based on the plot we select the top 10 features.
'''
selected_features = [
    'acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120',
    'gyr_phone_x_temp_std_ws_120', 'mag_watch_y_pse', 'mag_phone_z_max_freq',
    'gyr_watch_y_freq_weighted', 'gyr_phone_y_freq_1.0_Hz_ws_40',
    'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40',
    'acc_watch_y_freq_0.5_Hz_ws_40'
]

# Let us first study the impact of regularization and model complexity: does regularization prevent overfitting?

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()
'''
reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
performance_training = []
performance_test = []

# We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.

repeats = 20

for reg_param in reg_parameters:
    performance_tr = 0
    performance_te = 0
    for i in range(0, repeats):
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(train_X, train_y,
Example 8
result.columns = result.columns.astype(str)

# result = result.sample(frac=1)
prepare = PrepareDatasetForLearning()

train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(result, ['class'], 'unlike', 0.8,
                                                                               filter=True, temporal=False)

#number_training_samples = len(train_X)
#val_split = int(0.7 * number_training_samples)
#val_X = train_X[val_split:-1]
#val_y = train_y[val_split:-1]
#train_X = train_X[0:val_split - 1]
#train_y = train_y[0:val_split - 1]

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()

print(len(train_X))
print(len(test_X))

class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
    train_X,
    train_y,
    test_X,
    hidden_layer_sizes=(250, 50,),
    # alpha=reg_param,
    max_iter=500,
    gridsearch=False)

performance_tr_nn = eval.accuracy(train_y, class_train_y)
Example 9
# Based on the plot we select the top 10 features.
exit(0)
# selected_features = ['acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120',
#                      'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted', 'gyr_phone_y_freq_1.0_Hz_ws_40',
#                      'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40']
selected_features = [
    'pca_1_temp_mean_ws_120', 'press_phone_Pressure_temp_mean_ws_120',
    'acc_phone_x_freq_0.0_Hz_ws_40', 'mag_phone_y_temp_mean_ws_120',
    'gyr_phone_z_freq_0.7_Hz_ws_40', 'pca_3', 'mag_phone_z_freq_0.9_Hz_ws_40',
    'mag_phone_z_freq_0.7_Hz_ws_40', 'gyr_phone_z_freq_0.5_Hz_ws_40',
    'gyr_phone_y_freq_0.6_Hz_ws_40'
]  # forward feature selection

# # Let us first study the impact of regularization and model complexity: does regularization prevent overfitting?
#
learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()
#
# reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
# performance_training = []
# performance_test = []
#
# # We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.
#
# repeats = 2
#
# for reg_param in reg_parameters:
#     performance_tr = 0
#     performance_te = 0
#     for i in range(0, repeats):
#         class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(train_X, train_y,
Example 10
	freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
	print('#basic features: ', len(basic_features))
	print('#PCA features: ', len(pca_features))
	print('#time features: ', len(time_features))
	print('#frequency features: ', len(freq_features))
	cluster_features = ['cluster']
	print('#cluster features: ', len(cluster_features))
	features_after_chapter_3 = list(set().union(basic_features, pca_features))
	features_after_chapter_4 = list(set().union(basic_features, pca_features, time_features, freq_features))
	features_after_chapter_5 = list(set().union(basic_features, pca_features, time_features, freq_features, cluster_features))


	# First, let us consider the performance over a selection of features:


	learner = ClassificationAlgorithms()
	eval = ClassificationEvaluation()


	# And we study two promising ones in more detail. First let us consider the decision tree which works best with the selected
	# features.
	#
	class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(train_X[features_after_chapter_5], train_y, test_X[features_after_chapter_5],
												   gridsearch=True,
												   print_model_details=True, export_tree_path=export_tree_path)

	#class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(train_X[features_after_chapter_5], train_y, test_X[features_after_chapter_5],
	#                                                                                           gridsearch=True, print_model_details=True)

	test_cm = eval.confusion_matrix(test_y, class_test_y, class_train_prob_y.columns)
Example 11
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
from Chapter7.LearningAlgorithms import ClassificationAlgorithms

data = pd.read_table("../datasets/our_data/plrx.txt")
algs = ClassificationAlgorithms()
data['label'] -= 1
train = data.head(130)
test = data.tail(52)
train_X = train[[str(i) for i in range(1, 13)]]
train_y = train['label']
test_X = test[[str(i) for i in range(1, 13)]]
test_y = test['label']
res = algs.feedforward_neural_network(train_X, train_y, test_X)


def roc_curve_creation(P):
    roc_curve_points = []
    P = sorted(P, key=lambda x: x[0], reverse=True)
    pos = 0
    for pair in P:
        pos += pair[1]
    neg = len(P) - pos
    last = -1
    for i in range(len(P)):
        if P[i][0] == last:
            continue
        last = P[i][0]
        P_sel = P[i:]
        count = 0
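
The snippet is cut off inside `roc_curve_creation`. For reference, here is a self-contained sketch of the standard threshold-sweep ROC construction the function appears to be building (a reconstruction under that assumption, using an incremental sweep rather than the original's slicing approach):

def roc_points(scored_labels):
    # scored_labels: iterable of (score, label) pairs with label in {0, 1};
    # assumes both classes are present. Sort by descending score and emit
    # one (FPR, TPR) point each time the score changes.
    pairs = sorted(scored_labels, key=lambda p: p[0], reverse=True)
    pos = sum(label for _, label in pairs)
    neg = len(pairs) - pos
    points = []
    tp = fp = 0
    last_score = None
    for score, label in pairs:
        if score != last_score:
            points.append((fp / neg, tp / pos))
            last_score = score
        if label == 1:
            tp += 1
        else:
            fp += 1
    points.append((1.0, 1.0))
    return points
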
Example 12
def experiment(file):
    dataset = pd.read_csv(file, index_col=time_col)
    DataViz = VisualizeDataset(__file__.split('.')[0] +
                               file.split('.')[0].split('/')[1] + '.py',
                               show=True)
    print(DataViz.figures_dir)
    dataset.index = pd.to_datetime(dataset.index)
    prepare = PrepareDatasetForLearning()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'],
        'like',
        0.7,
        filter=False,
        temporal=False,
        drop_na=False,
        fill_na=True)

    time_features = [name for name in dataset.columns if '_temp' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']
    features_2 = list(set().union(basic_features, time_features))
    features_3 = list(set().union(basic_features, time_features,
                                  freq_features))
    features_4 = list(set().union(basic_features, time_features, freq_features,
                                  cluster_features))

    # print('feature selection')
    # fs = FeatureSelectionClassification()
    # features, selected_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
    #                                                                   train_X[features_4], train_y)
    # log([str(ordered_scores), str(selected_features)])
    selected_features = [
        'gyr_y_temp_std_ws_1200', 'acc_z_temp_mean_ws_120',
        'acc_x_temp_mean_ws_120', 'gyr_x_temp_std_ws_2400', 'gyr_z_max_freq',
        'gyr_y_freq_1.9_Hz_ws_40', 'acc_z_freq_0.4_Hz_ws_40',
        'gyr_z_freq_1.2_Hz_ws_40', 'gyr_x_freq_0.2_Hz_ws_40',
        'acc_z_freq_1.0_Hz_ws_40', 'acc_x_freq_0.2_Hz_ws_40',
        'acc_y_freq_1.9_Hz_ws_40', 'gyr_x_temp_mean_ws_1200',
        'acc_z_freq_1.9_Hz_ws_40', 'acc_x_temp_std_ws_120',
        'gyr_z_temp_std_ws_120', 'gyr_y_freq_1.5_Hz_ws_40',
        'gyr_z_temp_mean_ws_120', 'gyr_x_freq_0.0_Hz_ws_40',
        'acc_z_freq_0.6_Hz_ws_40'
    ]
    # This plot needs the ordered_scores from the feature selection run above,
    # which is commented out; plotting the hard-coded feature names as y-values
    # would fail, so the plot is disabled as well.
    # DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
    #                 y=[ordered_scores],
    #                 xlabel='number of features',
    #                 ylabel='accuracy')

    print('feature selection finished for %s' % file)
    learner = ClassificationAlgorithms()
    eval = ClassificationEvaluation()

    possible_feature_sets = [
        basic_features, features_2, features_3, features_4, selected_features
    ]
    feature_names = [
        'Basic features', 'Features with time', 'Features with frequency',
        'Features with cluster', 'Selected features'
    ]

    # with shelve.open('temp/shelve.out', 'n') as f:
    #     for key in dir():
    #         try:
    #             f[key] = globals()[key]
    #         except:
    #             print('ERROR shelving: {0}'.format(key))

    N_KCV_REPEATS = 1

    scores_over_all_algs = []

    for i in range(0, len(possible_feature_sets)):
        print(datetime.now())
        print('possible feature sets', i)
        log(['Features %d' % i])
        selected_train_X = train_X[possible_feature_sets[i]]
        selected_test_X = test_X[possible_feature_sets[i]]

        # First we run our non deterministic classifiers a number of times to average their score.

        performance_tr_rf = 0
        performance_te_rf = 0

        for repeat in range(0, N_KCV_REPEATS):
            print(datetime.now())
            print('\nRepeat', repeat)
            print('Random Forest')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                selected_train_X,
                train_y,
                selected_test_X,
                gridsearch=True,
                print_model_details=True)
            test_cm = eval.confusion_matrix(test_y, class_test_y,
                                            class_train_prob_y.columns)

            DataViz.plot_confusion_matrix(test_cm,
                                          class_train_prob_y.columns,
                                          normalize=False)

            performance_tr_rf += eval.accuracy(train_y, class_train_y)
            performance_te_rf += eval.accuracy(test_y, class_test_y)

            print(datetime.now())

        overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
        overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
        log([
            'RF' + ' train acc: %f' % performance_tr_rf +
            ' test acc: %f' % performance_te_rf
        ])

        # And we run our deterministic classifiers:

        print('decision tree')
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
            selected_train_X,
            train_y,
            selected_test_X,
            gridsearch=True,
            print_model_details=True)
        performance_tr_dt = eval.accuracy(train_y, class_train_y)
        performance_te_dt = eval.accuracy(test_y, class_test_y)
        test_cm = eval.confusion_matrix(test_y, class_test_y,
                                        class_train_prob_y.columns)

        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)

        log([
            'DT' + ' train acc: %f' % performance_tr_dt +
            ' test acc: %f' % performance_te_dt
        ])
        scores_with_sd = util.print_table_row_performances(
            feature_names[i], len(selected_train_X.index),
            len(selected_test_X.index), [
                (overall_performance_tr_rf, overall_performance_te_rf),
                (performance_tr_dt, performance_te_dt),
            ])
        scores_over_all_algs.append(scores_with_sd)

    DataViz.plot_performances_classification(['RF', 'DT'], feature_names,
                                             scores_over_all_algs)
    print(datetime.now())
Example 13
#                 xlabel='number of features', ylabel='accuracy')

# Based on the plot we select the top 10 features (note: slightly different
# compared to Python 2; we use those features here).

selected_features = [
    'rotationRate.z_temp_std_ws_180',
    'userAcceleration.z_temp_std_ws_180',
    'gravity.x_freq_0.0_Hz_ws_100',
    'userAcceleration.y_freq_0.0_Hz_ws_100',
    'gravity.x_freq_2.7_Hz_ws_100',
]

# Let us first study the impact of regularization and model complexity: does regularization prevent overfitting?

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()

reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
performance_training = []
performance_test = []

# We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.
# N_REPEATS_NN = 20
#
# for reg_param in reg_parameters:
#     performance_tr = 0
#     performance_te = 0
#     for i in range(0, N_REPEATS_NN):
#         class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
#             train_X, train_y,
Example 14
# Based on the plot we select the top 10 features.
selected_features_with_NB = ['acc_x_temp_MAD_ws_120', 'mag_z_freq_0.0_Hz_ws_40', 'mag_y_freq_0.0_Hz_ws_40',
                             'lin_acc_y_freq_0.0_Hz_ws_40', 'pca_7_temp_mean_ws_120', 'acc_z_temp_std_ws_120', 'acc_y',
                             'pca_9_temp_MAD_ws_120', 'mag_x_freq_0.1_Hz_ws_40', 'mag_z_freq_1.1_Hz_ws_40']

selected_features_with_KNN = ['mag_y_freq_0.8_Hz_ws_40', 'pca_7_temp_std_ws_120', 'mag_x_max_freq',
                              'gyr_z_freq_2.0_Hz_ws_40', 'gyr_y_freq_0.0_Hz_ws_40', 'mag_z_freq_1.5_Hz_ws_40',
                              'acc_z_temp_MAD_ws_120', 'acc_y_temp_kurtosis_ws_120', 'mag_x_freq_1.2_Hz_ws_40',
                              'lin_acc_y_freq_1.8_Hz_ws_40']

selected_features_with_DT = ['acc_z_freq_0.0_Hz_ws_40', 'loc_height_temp_mean_ws_120', 'pca_4_temp_kurtosis_ws_120',
                             'lin_acc_y_temp_kurtosis_ws_120', 'pca_1_temp_kurtosis_ws_120', 'acc_z_temp_MAD_ws_120',
                             'mag_x_freq_1.2_Hz_ws_40', 'gyr_z_freq_2.0_Hz_ws_40', 'acc_y_temp_kurtosis_ws_120',
                             'lin_acc_y_freq_0.6_Hz_ws_40']

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()

# possible_feature_sets = [basic_features, features_after_outliers_and_imputation, features_after_domain_features,
#                          features_after_cluster_features, selected_features_with_DT, selected_features_with_NB]
# feature_names = ['initial set', 'After imputation', 'With Domain features', 'With cluster features', 'Selected features DT', 'Selected features NB']

feature_set_axel = [basic_features, features_after_outliers_and_imputation]
feature_names_axel = ['initial set', 'After imputation']

feature_set_hasine = [features_after_domain_features, features_after_cluster_features]
feature_names_hasine = ['With Domain features', 'With cluster features']

feature_set_kim = [selected_features_with_DT, selected_features_with_NB]
feature_names_kim = ['Selected features DT', 'Selected features NB']
Example 15
    'mag_y_freq_0.8_Hz_ws_40', 'pca_7_temp_std_ws_120', 'mag_x_max_freq',
    'gyr_z_freq_2.0_Hz_ws_40', 'gyr_y_freq_0.0_Hz_ws_40',
    'mag_z_freq_1.5_Hz_ws_40', 'acc_z_temp_MAD_ws_120',
    'acc_y_temp_kurtosis_ws_120', 'mag_x_freq_1.2_Hz_ws_40',
    'lin_acc_y_freq_1.8_Hz_ws_40'
]

selected_features_with_DT = [
    'acc_z_freq_0.0_Hz_ws_40', 'loc_height_temp_mean_ws_120',
    'pca_4_temp_kurtosis_ws_120', 'lin_acc_y_temp_kurtosis_ws_120',
    'pca_1_temp_kurtosis_ws_120', 'acc_z_temp_MAD_ws_120',
    'mag_x_freq_1.2_Hz_ws_40', 'gyr_z_freq_2.0_Hz_ws_40',
    'acc_y_temp_kurtosis_ws_120', 'lin_acc_y_freq_0.6_Hz_ws_40'
]

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()

possible_feature_sets = [
    basic_features, features_after_outliers_and_imputation,
    features_after_domain_features, features_after_cluster_features,
    selected_features_with_DT, selected_features_with_NB
]
feature_names = [
    'initial set', 'After imputation', 'With Domain features',
    'With cluster features', 'Selected features DT', 'Selected features NB'
]

repeats = 3

scores_over_all_algs = []
Example 16
plot.plot(range(1, 26), ordered_scores)
plot.xlabel('number of features')
plot.ylabel('accuracy')
plot.show()

# Based on the plot we select the top 10 features.

selected_features = features_after_chapter_5
# ['acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120',
#                     'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted', 'gyr_phone_y_freq_1.0_Hz_ws_40',
#                     'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40']

# Let us first study the impact of regularization and model complexity: does regularization prevent overfitting?

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()

reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
performance_training = []
performance_test = []

# We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.

repeats = 20

for reg_param in reg_parameters:
    performance_tr = 0
    performance_te = 0
    for i in range(0, repeats):
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(train_X, train_y,
# Based on the plot we select the top 10 features.
selected_features_with_NB = ['acc_x_temp_MAD_ws_120', 'mag_z_freq_0.0_Hz_ws_40', 'mag_y_freq_0.0_Hz_ws_40',
                             'lin_acc_y_freq_0.0_Hz_ws_40', 'pca_7_temp_mean_ws_120', 'acc_z_temp_std_ws_120', 'acc_y',
                             'pca_9_temp_MAD_ws_120', 'mag_x_freq_0.1_Hz_ws_40', 'mag_z_freq_1.1_Hz_ws_40']

selected_features_with_KNN = ['mag_y_freq_0.8_Hz_ws_40', 'pca_7_temp_std_ws_120', 'mag_x_max_freq',
                              'gyr_z_freq_2.0_Hz_ws_40', 'gyr_y_freq_0.0_Hz_ws_40', 'mag_z_freq_1.5_Hz_ws_40',
                              'acc_z_temp_MAD_ws_120', 'acc_y_temp_kurtosis_ws_120', 'mag_x_freq_1.2_Hz_ws_40',
                              'lin_acc_y_freq_1.8_Hz_ws_40']

selected_features_with_DT = ['acc_z_freq_0.0_Hz_ws_40', 'loc_height_temp_mean_ws_120', 'pca_4_temp_kurtosis_ws_120',
                             'lin_acc_y_temp_kurtosis_ws_120', 'pca_1_temp_kurtosis_ws_120', 'acc_z_temp_MAD_ws_120',
                             'mag_x_freq_1.2_Hz_ws_40', 'gyr_z_freq_2.0_Hz_ws_40', 'acc_y_temp_kurtosis_ws_120',
                             'lin_acc_y_freq_0.6_Hz_ws_40']

learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()

possible_feature_sets = [basic_features, features_after_outliers_and_imputation, features_after_domain_features,
                         features_after_cluster_features, selected_features_with_DT, selected_features_with_NB]
feature_names = ['initial set', 'After imputation', 'With Domain features', 'With cluster features', 'Selected features DT', 'Selected features NB']


repeats = 3

scores_over_all_algs = []

for i in range(0, len(possible_feature_sets)):
    # print("working on feature set", feature_names[i])
    selected_train_X = train_X[possible_feature_sets[i]]
    selected_test_X = test_X[possible_feature_sets[i]]