Example #1
0
def main():
    """Cluster the phone-accelerometer data of the crowdsignals dataset.

    FLAGS.mode selects the experiment: a silhouette sweep over k for
    k-means ('kmeans') or k-medoids ('kmediods'), an agglomerative
    clustering sweep ('agglomerative'), or the final k-means run with
    FLAGS.k whose result is stored to disk ('final').
    """
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Create objects for clustering
    clusteringNH = NonHierarchicalClustering()
    clusteringH = HierarchicalClustering()

    if FLAGS.mode == 'kmeans':
        # Do some initial runs to determine the right number for k
        k_values = range(2, 10)
        silhouette_values = []

        print('Running k-means clustering')
        for k in k_values:
            print(f'k = {k}')
            dataset_cluster = clusteringNH.k_means_over_instances(
                dataset=copy.deepcopy(dataset),
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                k=k,
                distance_metric='default',
                max_iters=20,
                n_inits=10)
            # The mean silhouette over all instances summarizes clustering quality
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)

        DataViz.plot_xy(x=[k_values],
                        y=[silhouette_values],
                        xlabel='k',
                        ylabel='silhouette score',
                        ylim=[0, 1],
                        line_styles=['b-'])

        # Report the k with the highest silhouette score (the final k-means run
        # happens in --mode=final; this sweep only selects k)
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Means silhouette score: k = {k}')
        print('Use this value of k to run the --mode=final --k=?')

    # NOTE(review): mode string 'kmediods' is misspelled (k-medoids), but it must
    # match the CLI flag choice defined elsewhere — confirm before renaming
    if FLAGS.mode == 'kmediods':
        # Do some initial runs to determine the right number for k
        k_values = range(2, 10)
        silhouette_values = []
        print('Running k-medoids clustering')

        for k in k_values:
            print(f'k = {k}')
            dataset_cluster = clusteringNH.k_medoids_over_instances(
                dataset=copy.deepcopy(dataset),
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                k=k,
                distance_metric='default',
                max_iters=20,
                n_inits=10)
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)

        DataViz.plot_xy(x=[k_values],
                        y=[silhouette_values],
                        xlabel='k',
                        ylabel='silhouette score',
                        ylim=[0, 1],
                        line_styles=['b-'])

        # Run k-medoids once more with the best k (more n_inits for a stable result)
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Medoids silhouette score: k = {k}')

        dataset_kmed = clusteringNH.k_medoids_over_instances(
            dataset=copy.deepcopy(dataset),
            cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            k=k,
            distance_metric='default',
            max_iters=20,
            n_inits=50)
        DataViz.plot_clusters_3d(
            data_table=dataset_kmed,
            data_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            cluster_col='cluster',
            label_cols=['label'])
        DataViz.plot_silhouette(data_table=dataset_kmed,
                                cluster_col='cluster',
                                silhouette_col='silhouette')
        util.print_latex_statistics_clusters(
            dataset=dataset_kmed,
            cluster_col='cluster',
            input_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            label_col='label')

    # Run hierarchical clustering
    if FLAGS.mode == 'agglomerative':
        k_values = range(2, 10)
        silhouette_values = []

        # Do some initial runs to determine the right number for the maximum number of clusters
        print('Running agglomerative clustering')
        for k in k_values:
            print(f'k = {k}')
            # use_prev_linkage=True reuses the linkage matrix across iterations
            dataset_cluster, link = clusteringH.agglomerative_over_instances(
                dataset=dataset,
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                max_clusters=k,
                distance_metric='euclidean',
                use_prev_linkage=True,
                link_function='ward')
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)
            # The dendrogram is identical for every k, so plot it only once
            if k == k_values[0]:
                DataViz.plot_dendrogram(dataset_cluster, link)

        # Plot the clustering results
        DataViz.plot_xy(x=[k_values],
                        y=[silhouette_values],
                        xlabel='k',
                        ylabel='silhouette score',
                        ylim=[0, 1],
                        line_styles=['b-'])

    if FLAGS.mode == 'final':
        # Run k-means with the chosen k and keep the resulting cluster assignment
        clusteringNH = NonHierarchicalClustering()
        dataset = clusteringNH.k_means_over_instances(
            dataset=dataset,
            cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            k=FLAGS.k,
            distance_metric='default',
            max_iters=50,
            n_inits=50)
        # Plot the results
        DataViz.plot_clusters_3d(dataset,
                                 ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                                 'cluster', ['label'])
        DataViz.plot_silhouette(dataset, 'cluster', 'silhouette')
        # Print table statistics
        util.print_latex_statistics_clusters(
            dataset, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            'label')
        # Drop the silhouette helper column before persisting the dataset
        del dataset['silhouette']

        # Store the final dataset
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
Example #2
0
# Timing checkpoint: report wall-clock time and the delta since the previous one
# (dt_string, now2 and now3 are defined earlier in this script)
print("date and time =", dt_string)

diff = now3 - now2
print('difference time', diff)

# First, let us consider the performance over a selection of features:

fs = FeatureSelectionClassification()

# Greedy forward selection: repeatedly add the single feature that improves
# accuracy the most, up to N_FORWARD_SELECTION features
features, ordered_features, ordered_scores = fs.forward_selection(
    N_FORWARD_SELECTION, train_X[features_after_chapter_5], train_y)
print(ordered_scores)
print(ordered_features)

# Plot accuracy as a function of the number of selected features
DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
                y=[ordered_scores],
                xlabel='number of features',
                ylabel='accuracy')

# datetime object containing current date and time
now4 = datetime.now()
# dd/mm/YY H:M:S
dt_string = now4.strftime("%d/%m/%Y %H:%M:%S")
print("date and time =", dt_string)

diff = now4 - now3
print('difference time', diff)

# Based on the plot we select the top 10 features (note: slightly different compared to Python 2, we use
# those features here).
selected_features = [
Example #3
0
    dataset = LowPass.low_pass_filter(dataset, col, fs, cutoff, order=10)
    DataViz.plot_dataset(dataset.iloc[int(0.5 * len(new_dataset.index)):int(0.8 * len(new_dataset.index))],
                         [col, col + '_lowpass'], ['exact', 'exact'], ['line', 'line'])
    dataset[col] = dataset[col + '_lowpass']
    del dataset[col + '_lowpass']

# Determine the PC's for all but our target columns (the labels and the heart rate)
# We simplify by ignoring both, we could also ignore one first, and apply a PC to the remainder.

PCA = PrincipalComponentAnalysis()
# Every column that is not a label column is treated as a predictor
selected_predictor_cols = [c for c in dataset.columns if (not ('label' in c))]
pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)

# Plot the variance explained.
DataViz.plot_xy(x=[range(1, len(selected_predictor_cols) + 1)], y=[pc_values],
                xlabel='principal component number', ylabel='explained variance',
                ylim=[0, 1], line_styles=['b-'])

# Number of principal components to keep.
# NOTE(review): the original comment claimed 7 components explain most of the
# variance, but the code uses 3 — confirm which value is intended.

n_pcs = 3

dataset = PCA.apply_pca(copy.deepcopy(dataset), selected_predictor_cols, n_pcs)

# And we visualize the result of the PC's

DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'], ['line', 'points'])

# And the overall final dataset:

DataViz.plot_dataset(dataset,
Example #4
0
    'Gyroscope z (rad/s)'
]
## Do some initial runs to determine the right number for k
# (k_values, silhouette_values, clusteringNH and attributes_to_cluster are
# defined earlier in this script)

print('===== kmeans clustering =====')
for k in k_values:
    print(f'k = {k}')
    # Positional arguments: dataset, cols, k, distance_metric, max_iters, n_inits
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset), attributes_to_cluster, k, 'default', 20, 10)
    # Mean silhouette over all instances summarizes clustering quality for this k
    silhouette_score = dataset_cluster['silhouette'].mean()
    print(f'silhouette = {silhouette_score}')
    silhouette_values.append(silhouette_score)

DataViz.plot_xy(x=[k_values],
                y=[silhouette_values],
                xlabel='k',
                ylabel='silhouette score',
                ylim=[0, 1],
                line_styles=['b-'])

# And run the knn with the highest silhouette score

# k = 6 # todo: replaced with np.argmax call over silhouette scores
k = k_values[np.argmax(silhouette_values)]
print(f'Highest K-Means silhouette score: k = {k}')

# Final run with the selected k and more iterations/initializations for stability
dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset),
                                                  attributes_to_cluster, k,
                                                  'default', 50, 50)
DataViz.plot_clusters_3d(dataset_knn, attributes_to_cluster, 'cluster',
                         ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
def main():
    """Train and evaluate classifiers on the crowdsignals dataset.

    FLAGS.mode selects the experiment: forward feature selection
    ('selection'), a regularization sweep for the neural network
    ('regularization'), a decision-tree leaf-size sweep ('tree'), a
    comparison of all classifiers over several feature sets ('overall'),
    a detailed look at decision tree and random forest ('detail'), or
    every experiment in sequence ('all').
    """
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the first task, namely the prediction of the label. Therefore create a single column with the categorical
    # attribute representing the class. Furthermore, use 70% of the data for training and the remaining 30% as an
    # independent test set. Select the sets based on stratified sampling and remove cases where the label is unknown.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = ClassificationAlgorithms()
    evaluation = ClassificationEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'], 'like', 0.7, filter_data=True, temporal=False)

    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
        'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'hr_watch_rate', 'light_phone_lux', 'mag_phone_x', 'mag_phone_y',
        'mag_phone_z', 'mag_watch_x', 'mag_watch_y', 'mag_watch_z',
        'press_phone_pressure'
    ]
    pca_features = [
        'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7'
    ]
    time_features = [name for name in dataset.columns if '_temp_' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    # Cumulative feature sets, one per book chapter
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3,
                                                time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4,
                                                cluster_features))

    if FLAGS.mode in ('selection', 'all'):
        # First, consider the performance over a selection of features
        N_FORWARD_SELECTION = FLAGS.nfeatures
        fs = FeatureSelectionClassification()
        print('\n- - - Running feature selection - - -')
        features, ordered_features, ordered_scores = fs.forward_selection(
            max_features=N_FORWARD_SELECTION,
            X_train=train_X[features_after_chapter_5],
            y_train=train_y)
        DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
                        y=[ordered_scores],
                        xlabel='number of features',
                        ylabel='accuracy')

    # Select the most important features (based on python2 features)
    selected_features = [
        'acc_phone_y_freq_0.0_Hz_ws_40',
        'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120',
        'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted',
        'gyr_phone_y_freq_1.0_Hz_ws_40', 'acc_phone_x_freq_1.9_Hz_ws_40',
        'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40'
    ]

    if FLAGS.mode in ('regularization', 'all'):
        print('\n- - - Running regularization and model complexity test - - -')
        # Study the impact of regularization and model complexity: does regularization prevent overfitting?
        # Due to runtime constraints run the experiment a few times; for even more robust data increase the repetitions
        N_REPEATS_NN = FLAGS.nnrepeat
        reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        performance_training = []
        performance_test = []

        for reg_param in reg_parameters:
            performance_tr = 0
            performance_te = 0
            for i in range(N_REPEATS_NN):
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    train_X,
                    train_y,
                    test_X,
                    hidden_layer_sizes=(250, ),
                    alpha=reg_param,
                    max_iter=500,
                    gridsearch=False)
                performance_tr += evaluation.accuracy(train_y, class_train_y)
                performance_te += evaluation.accuracy(test_y, class_test_y)
            # Average accuracy over the repeated runs
            performance_training.append(performance_tr / N_REPEATS_NN)
            performance_test.append(performance_te / N_REPEATS_NN)
        DataViz.plot_xy(x=[reg_parameters, reg_parameters],
                        y=[performance_training, performance_test],
                        method='semilogx',
                        xlabel='regularization parameter value',
                        ylabel='accuracy',
                        ylim=[0.95, 1.01],
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if FLAGS.mode in ('tree', 'all'):
        print('\n- - - Running leaf size test of decision tree - - -')
        # Consider the influence of certain parameter settings for the tree model. (very related to the
        # regularization) and study the impact on performance.
        leaf_settings = [1, 2, 5, 10]
        performance_training = []
        performance_test = []

        for no_points_leaf in leaf_settings:
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                train_X[selected_features],
                train_y,
                test_X[selected_features],
                min_samples_leaf=no_points_leaf,
                gridsearch=False,
                print_model_details=False)

            performance_training.append(
                evaluation.accuracy(train_y, class_train_y))
            performance_test.append(evaluation.accuracy(test_y, class_test_y))

        DataViz.plot_xy(x=[leaf_settings, leaf_settings],
                        y=[performance_training, performance_test],
                        xlabel='Minimum number of points per leaf',
                        ylabel='Accuracy',
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if FLAGS.mode in ('overall', 'all'):
        print(
            '\n- - - Running test of all different classification algorithms - - -'
        )
        # Perform grid searches over the most important parameters and do so by means of cross validation upon the
        # training set
        possible_feature_sets = [
            basic_features, features_after_chapter_3, features_after_chapter_4,
            features_after_chapter_5, selected_features
        ]
        feature_names = [
            'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5',
            'Selected features'
        ]
        N_KCV_REPEATS = FLAGS.kcvrepeat

        scores_over_all_algs = []

        for i in range(len(possible_feature_sets)):
            selected_train_X = train_X[possible_feature_sets[i]]
            selected_test_X = test_X[possible_feature_sets[i]]

            # First run non deterministic classifiers a number of times to average their score
            performance_tr_nn, performance_te_nn = 0, 0
            performance_tr_rf, performance_te_rf = 0, 0
            performance_tr_svm, performance_te_svm = 0, 0

            for repeat in range(N_KCV_REPEATS):
                print(
                    f'Training NeuralNetwork run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ... '
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    gridsearch=True)
                # Accumulate NN accuracies before moving on to the next classifier
                # (fix: the RandomForest banner used to print before these lines)
                performance_tr_nn += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_nn += evaluation.accuracy(test_y, class_test_y)

                print(
                    f'Training RandomForest run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ... '
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    gridsearch=True)
                performance_tr_rf += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_rf += evaluation.accuracy(test_y, class_test_y)

                print(
                    f'Training SVM run {repeat + 1} / {N_KCV_REPEATS}, featureset is {feature_names[i]} ...'
                )

                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner. \
                    support_vector_machine_with_kernel(selected_train_X, train_y, selected_test_X, gridsearch=True)
                performance_tr_svm += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_svm += evaluation.accuracy(test_y, class_test_y)

            overall_performance_tr_nn = performance_tr_nn / N_KCV_REPEATS
            overall_performance_te_nn = performance_te_nn / N_KCV_REPEATS
            overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
            overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
            overall_performance_tr_svm = performance_tr_svm / N_KCV_REPEATS
            overall_performance_te_svm = performance_te_svm / N_KCV_REPEATS

            # Run deterministic classifiers:
            print("Deterministic Classifiers:")

            print(
                f'Training Nearest Neighbor run 1 / 1, featureset {feature_names[i]}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_knn = evaluation.accuracy(train_y, class_train_y)
            performance_te_knn = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Decision Tree run 1 / 1  featureset {feature_names[i]}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
            performance_te_dt = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Naive Bayes run 1/1 featureset {feature_names[i]}')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
                selected_train_X, train_y, selected_test_X)
            performance_tr_nb = evaluation.accuracy(train_y, class_train_y)
            performance_te_nb = evaluation.accuracy(test_y, class_test_y)

            # One (train, test) tuple per algorithm, in the same order as the
            # ['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'] labels used for plotting
            # (fix: the KNN tuple was accidentally listed twice, producing 7
            # score rows for 6 algorithm names)
            scores_with_sd = util. \
                print_table_row_performances(feature_names[i], len(selected_train_X.index),
                                             len(selected_test_X.index), [
                                                 (overall_performance_tr_nn, overall_performance_te_nn),
                                                 (overall_performance_tr_rf, overall_performance_te_rf),
                                                 (overall_performance_tr_svm, overall_performance_te_svm),
                                                 (performance_tr_knn, performance_te_knn),
                                                 (performance_tr_dt, performance_te_dt),
                                                 (performance_tr_nb, performance_te_nb)])
            scores_over_all_algs.append(scores_with_sd)

        DataViz.plot_performances_classification(
            ['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'], feature_names,
            scores_over_all_algs)

    if FLAGS.mode in ('detail', 'all'):
        print(
            '\n- - - Running detail test of promising classification algorithms - - -'
        )
        # Study two promising ones in more detail, namely decision tree and random forest algorithm
        learner.decision_tree(train_X[selected_features],
                              train_y,
                              test_X[selected_features],
                              gridsearch=True,
                              print_model_details=True,
                              export_tree_path=EXPORT_TREE_PATH)

        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
            train_X[selected_features],
            train_y,
            test_X[selected_features],
            gridsearch=True,
            print_model_details=True)

        test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                              class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)
            train_y,
            test_X,
            hidden_layer_sizes=(250, ),
            alpha=reg_param,
            max_iter=500,
            gridsearch=False)

        performance_tr += eval.accuracy(train_y, class_train_y)
        performance_te += eval.accuracy(test_y, class_test_y)
    performance_training.append(performance_tr / N_REPEATS_NN)
    performance_test.append(performance_te / N_REPEATS_NN)

# Plot train/test accuracy against the regularization parameter (log-scaled x-axis)
# (reg_parameters, performance_training and performance_test are filled in the
# loop above this fragment)
DataViz.plot_xy(x=[reg_parameters, reg_parameters],
                y=[performance_training, performance_test],
                method='semilogx',
                xlabel='regularization parameter value',
                ylabel='accuracy',
                ylim=[0.95, 1.01],
                names=['training', 'test'],
                line_styles=['r-', 'b:'])

# Second, let us consider the influence of certain parameter settings for the tree model. (very related to the
# regularization) and study the impact on performance.

leaf_settings = [1, 2, 5, 10]
performance_training = []
performance_test = []

for no_points_leaf in leaf_settings:

    class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
        train_X[selected_features],
Example #7
0
def experiment(file):
    """Run the classification experiment for one dataset file.

    Loads the CSV at *file*, splits it into stratified train/test sets,
    then trains and evaluates a random forest and a decision tree over
    several feature sets, logging accuracies and plotting confusion
    matrices and a performance comparison.
    """
    dataset = pd.read_csv(file, index_col=time_col)
    # Derive a per-dataset figures directory name from the script and file names
    DataViz = VisualizeDataset(__file__.split('.')[0] +
                               file.split('.')[0].split('/')[1] + '.py',
                               show=True)
    print(DataViz.figures_dir)
    dataset.index = pd.to_datetime(dataset.index)
    prepare = PrepareDatasetForLearning()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'],
        'like',
        0.7,
        filter=False,
        temporal=False,
        drop_na=False,
        fill_na=True)

    # Cumulative feature sets: basic, +time, +frequency, +cluster
    time_features = [name for name in dataset.columns if '_temp' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']
    features_2 = list(set().union(basic_features, time_features))
    features_3 = list(set().union(basic_features, time_features,
                                  freq_features))
    features_4 = list(set().union(basic_features, time_features, freq_features,
                                  cluster_features))

    # print('feature selection')
    # fs = FeatureSelectionClassification()
    # features, selected_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
    #                                                                   train_X[features_4], train_y)
    # log([str(ordered_scores), str(selected_features)])
    # Precomputed result of the (commented-out) forward selection above
    selected_features = [
        'gyr_y_temp_std_ws_1200', 'acc_z_temp_mean_ws_120',
        'acc_x_temp_mean_ws_120', 'gyr_x_temp_std_ws_2400', 'gyr_z_max_freq',
        'gyr_y_freq_1.9_Hz_ws_40', 'acc_z_freq_0.4_Hz_ws_40',
        'gyr_z_freq_1.2_Hz_ws_40', 'gyr_x_freq_0.2_Hz_ws_40',
        'acc_z_freq_1.0_Hz_ws_40', 'acc_x_freq_0.2_Hz_ws_40',
        'acc_y_freq_1.9_Hz_ws_40', 'gyr_x_temp_mean_ws_1200',
        'acc_z_freq_1.9_Hz_ws_40', 'acc_x_temp_std_ws_120',
        'gyr_z_temp_std_ws_120', 'gyr_y_freq_1.5_Hz_ws_40',
        'gyr_z_temp_mean_ws_120', 'gyr_x_freq_0.0_Hz_ws_40',
        'acc_z_freq_0.6_Hz_ws_40'
    ]
    # NOTE(review): this plots feature NAMES on the accuracy axis — it looks
    # like a leftover from the commented-out selection (which produced
    # ordered_scores); confirm whether this call should be removed
    DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
                    y=[selected_features],
                    xlabel='number of features',
                    ylabel='accuracy')

    print('feature selection finished for %s' % file)
    learner = ClassificationAlgorithms()
    # Renamed from 'eval' to avoid shadowing the builtin
    evaluation = ClassificationEvaluation()

    possible_feature_sets = [
        basic_features, features_2, features_3, features_4, selected_features
    ]
    feature_names = [
        'Basic features', 'Features with time', 'Features with frequency',
        'Features with cluster', 'Selected features'
    ]

    # with shelve.open('temp/shelve.out', 'n') as f:
    #     for key in dir():
    #         try:
    #             f[key] = globals()[key]
    #         except:
    #             print('ERROR shelving: {0}'.format(key))

    N_KCV_REPEATS = 1

    scores_over_all_algs = []

    for i in range(len(possible_feature_sets)):
        print(datetime.now())
        print('possible feature sets', i)
        log(['Features %d' % i])
        selected_train_X = train_X[possible_feature_sets[i]]
        selected_test_X = test_X[possible_feature_sets[i]]

        # First we run our non deterministic classifiers a number of times to average their score.

        performance_tr_rf = 0
        performance_te_rf = 0

        for repeat in range(N_KCV_REPEATS):
            print(datetime.now())
            print('\nRepeat', repeat)
            print('Random Forest')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                selected_train_X,
                train_y,
                selected_test_X,
                gridsearch=True,
                print_model_details=True)
            test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                                  class_train_prob_y.columns)

            DataViz.plot_confusion_matrix(test_cm,
                                          class_train_prob_y.columns,
                                          normalize=False)

            performance_tr_rf += evaluation.accuracy(train_y, class_train_y)
            performance_te_rf += evaluation.accuracy(test_y, class_test_y)

            print(datetime.now())

        overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
        overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
        # Fix: the train accuracy used to be logged with the TEST value
        # (performance_te_rf was passed for both); log the averaged train
        # and test accuracies instead
        log([
            'RF' + ' train acc: %f' % overall_performance_tr_rf +
            ' test acc: %f' % overall_performance_te_rf
        ])

        # And we run our deterministic classifiers:

        print('decision tree')
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
            selected_train_X,
            train_y,
            selected_test_X,
            gridsearch=True,
            print_model_details=True)
        performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
        performance_te_dt = evaluation.accuracy(test_y, class_test_y)
        test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                              class_train_prob_y.columns)

        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)

        log([
            'DT' + ' train acc: %f' % performance_tr_dt +
            ' test acc: %f' % performance_te_dt
        ])
        scores_with_sd = util.print_table_row_performances(
            feature_names[i], len(selected_train_X.index),
            len(selected_test_X.index), [
                (overall_performance_tr_rf, overall_performance_te_rf),
                (performance_tr_dt, performance_te_dt),
            ])
        scores_over_all_algs.append(scores_with_sd)

    DataViz.plot_performances_classification(['RF', 'DT'], feature_names,
                                             scores_over_all_algs)
    print(datetime.now())
Example #8
0
def main():
    """Run the noise-handling pipeline on the crowdsignals dataset.

    The action is selected via --mode:
      * 'imputation': compare mean / median / interpolation imputation of
        the heart-rate column and plot the three results.
      * 'kalman':     apply a Kalman filter to acc_phone_x of the original
        (pre-aggregation) dataset and plot the outcome.
      * 'lowpass':    low-pass filter acc_phone_x and plot a small window.
      * 'PCA':        impute, inspect explained variance, then apply PCA.
      * 'final':      impute + low-pass filter + PCA and store the result.

    Raises:
        IOError: when the input CSV from the previous chapter is missing.
    """
    # Import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance based on the
    # first two rows.  Use total_seconds() rather than the .microseconds
    # attribute: .microseconds is only the sub-second component of the delta
    # and silently evaluates to 0 for sampling intervals of 1 second or more.
    # For sub-second intervals both expressions are identical.
    milliseconds_per_instance = (
        dataset.index[1] - dataset.index[0]).total_seconds() * 1000

    # Create objects for value imputation, low pass filter and PCA.
    MisVal = ImputationMissingValues()
    LowPass = LowPassFilter()
    PCA = PrincipalComponentAnalysis()

    if FLAGS.mode == 'imputation':
        # Impute the missing values with three strategies and plot an example.
        imputed_mean_dataset = MisVal.impute_mean(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_median_dataset = MisVal.impute_median(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        imputed_interpolation_dataset = MisVal.impute_interpolate(
            dataset=copy.deepcopy(dataset), col='hr_watch_rate')
        DataViz.plot_imputed_values(
            dataset, ['original', 'mean', 'median', 'interpolation'],
            'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'],
            imputed_median_dataset['hr_watch_rate'],
            imputed_interpolation_dataset['hr_watch_rate'])

    elif FLAGS.mode == 'kalman':
        # Using the result from Chapter 2, try the Kalman filter on the
        # acc_phone_x attribute of the original dataset and study the result.
        try:
            original_dataset = pd.read_csv(DATA_PATH / ORIG_DATASET_FNAME,
                                           index_col=0)
            original_dataset.index = pd.to_datetime(original_dataset.index)
        except IOError as e:
            print(
                'File not found, try to run previous crowdsignals scripts first!'
            )
            raise e
        KalFilter = KalmanFilters()
        kalman_dataset = KalFilter.apply_kalman_filter(
            data_table=original_dataset, col='acc_phone_x')
        DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'],
                                    'acc_phone_x',
                                    kalman_dataset['acc_phone_x_kalman'])
        DataViz.plot_dataset(data_table=kalman_dataset,
                             columns=['acc_phone_x', 'acc_phone_x_kalman'],
                             match=['exact', 'exact'],
                             display=['line', 'line'])

    elif FLAGS.mode == 'lowpass':
        # Apply a lowpass filter and reduce the importance of the data above
        # the cutoff frequency (default 1.5 Hz).
        # Determine the sampling frequency from the instance interval.
        fs = float(1000) / milliseconds_per_instance

        # Study acc_phone_x; plot a small slice (40%-43% of the data) so the
        # effect of the filter is visible.
        new_dataset = LowPass.low_pass_filter(
            data_table=copy.deepcopy(dataset),
            col='acc_phone_x',
            sampling_frequency=fs,
            cutoff_frequency=FLAGS.cutoff,
            order=10)
        DataViz.plot_dataset(
            new_dataset.iloc[int(0.4 * len(new_dataset.index)
                                 ):int(0.43 * len(new_dataset.index)), :],
            ['acc_phone_x', 'acc_phone_x_lowpass'], ['exact', 'exact'],
            ['line', 'line'])

    elif FLAGS.mode == 'PCA':
        # First impute again, as PCA can not deal with missing values.
        for col in [c for c in dataset.columns if 'label' not in c]:
            dataset = MisVal.impute_interpolate(dataset, col)

        # Determine the PC's for all but the target columns (the labels and
        # the heart rate).
        selected_predictor_cols = [
            c for c in dataset.columns
            if (not ('label' in c)) and (not (c == 'hr_watch_rate'))
        ]
        pc_values = PCA.determine_pc_explained_variance(
            data_table=dataset, cols=selected_predictor_cols)
        cumulated_variance = np.cumsum(pc_values)

        # Plot the explained variance and cumulated variance.
        comp_numbers = np.arange(1, len(pc_values) + 1)
        DataViz.plot_xy(x=[comp_numbers, comp_numbers],
                        y=[pc_values, cumulated_variance],
                        xlabel='principal component number',
                        ylabel='explained variance',
                        ylim=[0, 1],
                        line_styles=['b-', 'r-'],
                        names=['Variance', 'Cumulated variance'])

        # Select 7 as the best number of PC's as this explains most of the
        # variance.
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the result of the PC's and the overall final dataset.
        # One match/display entry per column pattern (8 patterns).
        DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'],
                             ['line', 'points'])
        DataViz.plot_dataset(dataset, [
            'acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
            'press_phone_', 'pca_', 'label'
        ], ['like'] * 8, [
            'line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'
        ])

    elif FLAGS.mode == 'final':
        # Carry out the imputation over all columns except for the labels.
        print('Imputing missing values.')
        for col in tqdm([c for c in dataset.columns if 'label' not in c]):
            dataset = MisVal.impute_interpolate(dataset=dataset, col=col)

        # Include all measurements that have a form of periodicity and
        # filter them.
        periodic_measurements = [
            'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
            'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
            'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
            'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
            'mag_watch_y', 'mag_watch_z'
        ]

        print('Applying low pass filter on periodic measurements.')
        # Determine the sampling frequency.
        fs = float(1000) / milliseconds_per_instance
        for col in tqdm(periodic_measurements):
            dataset = LowPass.low_pass_filter(data_table=dataset,
                                              col=col,
                                              sampling_frequency=fs,
                                              cutoff_frequency=FLAGS.cutoff,
                                              order=10)
            # Replace the raw column by its filtered version.
            dataset[col] = dataset[col + '_lowpass']
            del dataset[col + '_lowpass']

        # Use the optimal found parameter n_pcs = 7 to apply PCA to the final
        # dataset (all columns except the labels and the heart rate).
        selected_predictor_cols = [
            c for c in dataset.columns
            if (not ('label' in c)) and (not (c == 'hr_watch_rate'))
        ]
        n_pcs = 7
        dataset = PCA.apply_pca(data_table=copy.deepcopy(dataset),
                                cols=selected_predictor_cols,
                                number_comp=n_pcs)

        # Visualize the final overall dataset.
        # One match/display entry per column pattern (8 patterns).
        DataViz.plot_dataset(dataset, [
            'acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
            'press_phone_', 'pca_', 'label'
        ], ['like'] * 8, [
            'line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'
        ])

        # Store the final outcome.
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
## Sweep the candidate values of k and record the mean silhouette per run.

print('===== kmeans clustering mag =====')
gravity_cols = ['gravity.x', 'gravity.y', 'gravity.z']
for k in k_values:
    print(f'k = {k}')
    dataset_cluster = clusteringNH.k_means_over_instances(
        dataset=copy.deepcopy(dataset),
        cols=gravity_cols,
        k=k,
        distance_metric='default',
        max_iters=20,
        n_inits=10)
    silhouette_score = dataset_cluster['silhouette'].mean()
    print(f'silhouette = {silhouette_score}')
    silhouette_values.append(silhouette_score)

# Visualize the silhouette score as a function of k.
DataViz.plot_xy(x=[k_values],
                y=[silhouette_values],
                xlabel='k',
                ylabel='silhouette score kmeans',
                ylim=[0, 1],
                line_styles=['b-'])

# Re-run k-means at the best-scoring k, with more iterations and
# initialisations for a higher-quality final clustering.
k = k_values[np.argmax(silhouette_values)]
print(f'Highest K-Means silhouette score: k = {k}')

dataset_knn = clusteringNH.k_means_over_instances(
    dataset=copy.deepcopy(dataset),
    cols=gravity_cols,
    k=k,
    distance_metric='default',
    max_iters=50,
    n_inits=50)

# Inspect the clusters in 3D gravity space and the per-point silhouette.
DataViz.plot_clusters_3d(dataset_knn, gravity_cols, 'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')