Python PrepareDatasetForLearning Exemples, Chapter7.PrepareDatasetForLearning.PrepareDatasetForLearning Python Exemples

Exemple #1

0

Afficher le fichier

except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# Make sure the directory used for exported trees exists before anything is written to it.
if not os.path.exists(export_tree_path):
    os.makedirs(export_tree_path)

# Convert the index to datetime. pd.to_datetime() replaces the Index.to_datetime() method,
# which recent pandas releases no longer provide.
# NOTE(review): assumes pandas is imported as `pd` above this excerpt - confirm.
dataset.index = pd.to_datetime(dataset.index)

# Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task.

# We create a single column with the categorical attribute representing our class. Furthermore, we use 70% of our data
# for training and the remaining 30% as an independent test set. We select the sets based on stratified sampling. We remove
# cases where we do not know the label.

prepare = PrepareDatasetForLearning()

train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(dataset, ['label'], 'like', 0.7, filter=True, temporal=False)

# A single pre-formatted argument prints the same text on Python 2 and Python 3
# (the two spaces reproduce the output of the former print statement).
print('Training set length is:  {}'.format(len(train_X.index)))
print('Test set length is:  {}'.format(len(test_X.index)))

# Select subsets of the features that we will consider:

basic_features = ['acc_phone_x','acc_phone_y','acc_phone_z','acc_watch_x','acc_watch_y','acc_watch_z','gyr_phone_x','gyr_phone_y','gyr_phone_z','gyr_watch_x','gyr_watch_y','gyr_watch_z',
                  'hr_watch_rate', 'light_phone_lux','mag_phone_x','mag_phone_y','mag_phone_z','mag_watch_x','mag_watch_y','mag_watch_z','press_phone_pressure']
pca_features = ['pca_1','pca_2','pca_3','pca_4','pca_5','pca_6','pca_7']
# Column names containing '_temp_' are collected as the time-domain features.
time_features = [name for name in dataset.columns if '_temp_' in name]
# Column names containing '_freq' or '_pse' are collected as the frequency-domain features.
freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
'''

Exemple #2

0

Afficher le fichier

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_our_result.csv',
                          index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# pd.to_datetime() replaces the Index.to_datetime() method, which recent pandas
# releases no longer provide.
dataset.index = pd.to_datetime(dataset.index)

# Let us consider our second task, namely the prediction of the Azimuth. We consider this as a temporal task.

prepare = PrepareDatasetForLearning()

# The three timestamps are passed positionally after the target column name.
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset,
    'Azimuth',
    '2019-06-14 12:06:30',
    '2019-06-14 12:16:02',
    '2019-06-14 12:20:56')

# A single pre-formatted argument prints the same text on Python 2 and Python 3
# (the two spaces reproduce the output of the former print statement).
print('Training set length is:  {}'.format(len(train_X.index)))
print('Test set length is:  {}'.format(len(test_X.index)))

# Select subsets of the features that we will consider:

Exemple #3

0

Afficher le fichier

Fichier : crowdsignals_ch7_regression.py Projet : NilsHMeier/ML4QS

def main():
    """Run the regression experiments with 'hr_watch_rate' as the target.

    The dataset is read from DATA_PATH / DATASET_FILENAME and split by time into a
    training and a test part. FLAGS.mode selects which experiments are executed:
    'selection' (Pearson based feature selection), 'overall' (comparison of all
    regression algorithms on every feature set), 'detail' (random forest prediction
    plotted against the real values) or 'all' (every experiment).

    Raises:
        IOError: re-raised after printing a hint when the dataset cannot be read.
    """
    # Load the result of the previous chapter; the index is converted to datetime.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Visualization helper used for all plots below
    viz = VisualizeDataset(__file__)

    # Second task: predict the heart rate. The split is done on timestamps because this is
    # considered a temporal task.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = RegressionAlgorithms()
    evaluation = RegressionEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
        dataset, 'hr_watch_rate', '2016-02-08 18:28:56', '2016-02-08 19:34:07', '2016-02-08 20:07:50')
    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    def mse_scores(predicted_train, predicted_test):
        # Return (train mean, train std, test mean, test std) as reported by
        # evaluation.mean_squared_error_with_std for the given predictions.
        train_mean, train_std = evaluation.mean_squared_error_with_std(train_y, predicted_train)
        test_mean, test_std = evaluation.mean_squared_error_with_std(test_y, predicted_test)
        return train_mean, train_std, test_mean, test_std

    # Build the feature subsets
    print('- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z',
        'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
        'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z',
        'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking',
        'labelStanding', 'labelDriving', 'labelEating', 'labelRunning',
        'light_phone_lux',
        'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
        'mag_watch_x', 'mag_watch_y', 'mag_watch_z',
        'press_phone_pressure',
    ]
    pca_features = ['pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7']
    # Temporal features derived from the target column itself are left out.
    time_features = [name for name in dataset.columns if ('temp_' in name and 'hr_watch' not in name)]
    freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    # Cumulative feature sets, each one extending the previous one.
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3, time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4, cluster_features))

    if FLAGS.mode in ('selection', 'all'):
        # Look at the Pearson correlations to see whether features can be selected based on them
        fs = FeatureSelectionRegression()
        print('\n- - - Running feature selection - - -')
        _, correlations = fs.pearson_selection(10, train_X[features_after_chapter_5], train_y)
        util.print_pearson_correlations(correlations)

    # The 10 features with the highest correlation (hard-coded list)
    selected_features = [
        'temp_pattern_labelOnTable', 'labelOnTable', 'temp_pattern_labelOnTable(b)labelOnTable',
        'pca_2_temp_mean_ws_120', 'pca_1_temp_mean_ws_120', 'acc_watch_y_temp_mean_ws_120', 'pca_2',
        'acc_phone_z_temp_mean_ws_120', 'gyr_watch_y_pse', 'gyr_watch_x_pse',
    ]
    possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4,
                             features_after_chapter_5, selected_features]
    feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']

    if FLAGS.mode in ('overall', 'all'):
        print('\n- - - Running test of all different regression algorithms - - -')
        # The non deterministic learners are repeated a number of times to get more robust
        # numbers, as the initialization of e.g. the NN is random.
        n_repeats = FLAGS.repeats
        scores_over_all_algs = []

        for set_name, feature_set in zip(feature_names, possible_feature_sets):
            selected_train_X = train_X[feature_set]
            selected_test_X = test_X[feature_set]

            # One (train mean, train std, test mean, test std) tuple per repetition
            nn_runs = []
            rf_runs = []
            for run in range(1, n_repeats + 1):
                print(f'Training NeuralNetwork run {run}/{n_repeats} ... ')
                pred_train, pred_test = learner.feedforward_neural_network(
                    selected_train_X, train_y, selected_test_X, gridsearch=True)
                nn_runs.append(mse_scores(pred_train, pred_test))

                print(f'Training RandomForest run {run}/{n_repeats} ... ')
                pred_train, pred_test = learner.random_forest(
                    selected_train_X, train_y, selected_test_X, gridsearch=True)
                rf_runs.append(mse_scores(pred_train, pred_test))

            # Average every score component over the repetitions
            nn_scores = tuple(sum(scores[k] for scores in nn_runs) / n_repeats for k in range(4))
            rf_scores = tuple(sum(scores[k] for scores in rf_runs) / n_repeats for k in range(4))

            # Deterministic algorithms are run only once
            print("Support Vector Regressor run 1/1 ... ")
            # Convergence of the SVR does not always occur (even adjusting tolerance and iterations does not help)
            pred_train, pred_test = learner.support_vector_regression_without_kernel(
                selected_train_X, train_y, selected_test_X, gridsearch=False)
            svm_scores = mse_scores(pred_train, pred_test)

            print("Training Nearest Neighbor run 1/1 ... ")
            pred_train, pred_test = learner.k_nearest_neighbor(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            knn_scores = mse_scores(pred_train, pred_test)

            print("Training Decision Tree run 1/1 ... ")
            pred_train, pred_test = learner.decision_tree(
                selected_train_X, train_y, selected_test_X, gridsearch=True,
                export_tree_path=EXPORT_TREE_PATH)
            dt_scores = mse_scores(pred_train, pred_test)

            # Same algorithm order as the names handed to the plot below
            scores_with_sd = [nn_scores, rf_scores, svm_scores, knn_scores, dt_scores]
            util.print_table_row_performances_regression(set_name, scores_with_sd)
            scores_over_all_algs.append(scores_with_sd)

        # Plot the results
        viz.plot_performances_regression(['NN', 'RF', 'SVM', 'KNN', 'DT'], feature_names, scores_over_all_algs)

    if FLAGS.mode in ('detail', 'all'):
        print('\n- - - Running visualization of results - - -')
        chapter_5_train_X = train_X[features_after_chapter_5]
        chapter_5_test_X = test_X[features_after_chapter_5]
        pred_train, pred_test = learner.random_forest(chapter_5_train_X, train_y, chapter_5_test_X,
                                                      gridsearch=False, print_model_details=True)
        viz.plot_numerical_prediction_versus_real(train_X.index, train_y, pred_train,
                                                  test_X.index, test_y, pred_test, 'heart rate')

Exemple #4

0

Afficher le fichier

Fichier : crowdsignals_ch7_classification.py Projet : NilsHMeier/ML4QS

def main():
    """Run the classification experiments for the label prediction task.

    The dataset is read from DATA_PATH / DATASET_FILENAME and split into a training
    and a test part. FLAGS.mode selects which experiments are executed: 'selection'
    (forward feature selection), 'regularization' (neural network regularization
    study), 'tree' (decision tree leaf size study), 'overall' (comparison of all
    classifiers on every feature set), 'detail' (decision tree and random forest in
    detail) or 'all' (every experiment).

    Raises:
        IOError: re-raised after printing a hint when the dataset cannot be read.
    """
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    def run_mode(name):
        # True when the experiment called `name` was requested, directly or through 'all'.
        return FLAGS.mode in (name, 'all')

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the first task, namely the prediction of the label. Therefore create a single column with the categorical
    # attribute representing the class. Furthermore, use 70% of the data for training and the remaining 30% as an
    # independent test set. Select the sets based on stratified sampling and remove cases where the label is unknown.
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    learner = ClassificationAlgorithms()
    evaluation = ClassificationEvaluation()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'], 'like', 0.7, filter_data=True, temporal=False)

    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
        'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'hr_watch_rate', 'light_phone_lux', 'mag_phone_x', 'mag_phone_y',
        'mag_phone_z', 'mag_watch_x', 'mag_watch_y', 'mag_watch_z',
        'press_phone_pressure'
    ]
    pca_features = [
        'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7'
    ]
    time_features = [name for name in dataset.columns if '_temp_' in name]
    freq_features = [
        name for name in dataset.columns
        if (('_freq' in name) or ('_pse' in name))
    ]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    # Cumulative feature sets, each one extending the previous one.
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3,
                                                time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4,
                                                cluster_features))

    if run_mode('selection'):
        # First, consider the performance over a selection of features
        n_forward_selection = FLAGS.nfeatures
        fs = FeatureSelectionClassification()
        print('\n- - - Running feature selection - - -')
        features, ordered_features, ordered_scores = fs.forward_selection(
            max_features=n_forward_selection,
            X_train=train_X[features_after_chapter_5],
            y_train=train_y)
        DataViz.plot_xy(x=[range(1, n_forward_selection + 1)],
                        y=[ordered_scores],
                        xlabel='number of features',
                        ylabel='accuracy')

    # Select the most important features (based on python2 features)
    selected_features = [
        'acc_phone_y_freq_0.0_Hz_ws_40',
        'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120',
        'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted',
        'gyr_phone_y_freq_1.0_Hz_ws_40', 'acc_phone_x_freq_1.9_Hz_ws_40',
        'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40'
    ]

    if run_mode('regularization'):
        print('\n- - - Running regularization and model complexity test - - -')
        # Study the impact of regularization and model complexity: does regularization prevent overfitting?
        # The number of repetitions per setting comes from FLAGS.nnrepeat; increase it for more robust data
        n_repeats_nn = FLAGS.nnrepeat
        reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
        performance_training = []
        performance_test = []

        for reg_param in reg_parameters:
            performance_tr = 0
            performance_te = 0
            for _ in range(n_repeats_nn):
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    train_X,
                    train_y,
                    test_X,
                    hidden_layer_sizes=(250, ),
                    alpha=reg_param,
                    max_iter=500,
                    gridsearch=False)
                performance_tr += evaluation.accuracy(train_y, class_train_y)
                performance_te += evaluation.accuracy(test_y, class_test_y)
            performance_training.append(performance_tr / n_repeats_nn)
            performance_test.append(performance_te / n_repeats_nn)
        DataViz.plot_xy(x=[reg_parameters, reg_parameters],
                        y=[performance_training, performance_test],
                        method='semilogx',
                        xlabel='regularization parameter value',
                        ylabel='accuracy',
                        ylim=[0.95, 1.01],
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if run_mode('tree'):
        print('\n- - - Running leaf size test of decision tree - - -')
        # Consider the influence of certain parameter settings for the tree model. (very related to the
        # regularization) and study the impact on performance.
        leaf_settings = [1, 2, 5, 10]
        performance_training = []
        performance_test = []

        for no_points_leaf in leaf_settings:
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                train_X[selected_features],
                train_y,
                test_X[selected_features],
                min_samples_leaf=no_points_leaf,
                gridsearch=False,
                print_model_details=False)

            performance_training.append(
                evaluation.accuracy(train_y, class_train_y))
            performance_test.append(evaluation.accuracy(test_y, class_test_y))

        DataViz.plot_xy(x=[leaf_settings, leaf_settings],
                        y=[performance_training, performance_test],
                        xlabel='Minimum number of points per leaf',
                        ylabel='Accuracy',
                        names=['training', 'test'],
                        line_styles=['r-', 'b:'])

    if run_mode('overall'):
        print(
            '\n- - - Running test of all different classification algorithms - - -'
        )
        # Perform grid searches over the most important parameters and do so by means of cross validation upon the
        # training set
        possible_feature_sets = [
            basic_features, features_after_chapter_3, features_after_chapter_4,
            features_after_chapter_5, selected_features
        ]
        feature_names = [
            'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5',
            'Selected features'
        ]
        n_kcv_repeats = FLAGS.kcvrepeat

        scores_over_all_algs = []

        for feature_set_name, feature_set in zip(feature_names, possible_feature_sets):
            selected_train_X = train_X[feature_set]
            selected_test_X = test_X[feature_set]

            # First run non deterministic classifiers a number of times to average their score
            performance_tr_nn, performance_te_nn = 0, 0
            performance_tr_rf, performance_te_rf = 0, 0
            performance_tr_svm, performance_te_svm = 0, 0

            for repeat in range(n_kcv_repeats):
                print(
                    f'Training NeuralNetwork run {repeat + 1} / {n_kcv_repeats}, featureset is {feature_set_name} ... '
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    gridsearch=True)
                performance_tr_nn += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_nn += evaluation.accuracy(test_y, class_test_y)

                # Announce the random forest only once the neural network results are recorded
                print(
                    f'Training RandomForest run {repeat + 1} / {n_kcv_repeats}, featureset is {feature_set_name} ... '
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    gridsearch=True)
                performance_tr_rf += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_rf += evaluation.accuracy(test_y, class_test_y)

                print(
                    f'Training SVM run {repeat + 1} / {n_kcv_repeats}, featureset is {feature_set_name} ...'
                )
                class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner. \
                    support_vector_machine_with_kernel(selected_train_X, train_y, selected_test_X, gridsearch=True)
                performance_tr_svm += evaluation.accuracy(
                    train_y, class_train_y)
                performance_te_svm += evaluation.accuracy(test_y, class_test_y)

            overall_performance_tr_nn = performance_tr_nn / n_kcv_repeats
            overall_performance_te_nn = performance_te_nn / n_kcv_repeats
            overall_performance_tr_rf = performance_tr_rf / n_kcv_repeats
            overall_performance_te_rf = performance_te_rf / n_kcv_repeats
            overall_performance_tr_svm = performance_tr_svm / n_kcv_repeats
            overall_performance_te_svm = performance_te_svm / n_kcv_repeats

            # Run deterministic classifiers:
            print("Deterministic Classifiers:")

            print(
                f'Training Nearest Neighbor run 1 / 1, featureset {feature_set_name}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_knn = evaluation.accuracy(train_y, class_train_y)
            performance_te_knn = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Decision Tree run 1 / 1  featureset {feature_set_name}'
            )
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
                selected_train_X, train_y, selected_test_X, gridsearch=True)
            performance_tr_dt = evaluation.accuracy(train_y, class_train_y)
            performance_te_dt = evaluation.accuracy(test_y, class_test_y)

            print(
                f'Training Naive Bayes run 1/1 featureset {feature_set_name}')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(
                selected_train_X, train_y, selected_test_X)
            performance_tr_nb = evaluation.accuracy(train_y, class_train_y)
            performance_te_nb = evaluation.accuracy(test_y, class_test_y)

            # Exactly one (train, test) pair per algorithm, in the same order as the names passed
            # to the plot below: NN, RF, SVM, KNN, DT, NB. The KNN pair used to be listed twice,
            # which gave seven rows for six names.
            scores_with_sd = util. \
                print_table_row_performances(feature_set_name, len(selected_train_X.index),
                                             len(selected_test_X.index), [
                                                 (overall_performance_tr_nn, overall_performance_te_nn),
                                                 (overall_performance_tr_rf, overall_performance_te_rf),
                                                 (overall_performance_tr_svm, overall_performance_te_svm),
                                                 (performance_tr_knn, performance_te_knn),
                                                 (performance_tr_dt, performance_te_dt),
                                                 (performance_tr_nb, performance_te_nb)])
            scores_over_all_algs.append(scores_with_sd)

        DataViz.plot_performances_classification(
            ['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'], feature_names,
            scores_over_all_algs)

    if run_mode('detail'):
        print(
            '\n- - - Running detail test of promising classification algorithms - - -'
        )
        # Study two promising ones in more detail, namely decision tree and random forest algorithm
        # The decision tree is run for its printed details and exported tree; its predictions are not used.
        learner.decision_tree(train_X[selected_features],
                              train_y,
                              test_X[selected_features],
                              gridsearch=True,
                              print_model_details=True,
                              export_tree_path=EXPORT_TREE_PATH)

        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
            train_X[selected_features],
            train_y,
            test_X[selected_features],
            gridsearch=True,
            print_model_details=True)

        # The columns of the probability frame are used as the class labels of the confusion matrix.
        test_cm = evaluation.confusion_matrix(test_y, class_test_y,
                                              class_train_prob_y.columns)
        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)

Exemple #5

0

Afficher le fichier

DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'

try:
    dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# pd.to_datetime() replaces the Index.to_datetime() method, which recent pandas
# releases no longer provide.
dataset.index = pd.to_datetime(dataset.index)

# Let us consider our second task, namely the prediction of the heart rate. We consider this as a temporal task.

prepare = PrepareDatasetForLearning()

# The three timestamps are passed positionally after the target column name.
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(dataset, 'hr_watch_rate', '2016-02-08 18:29:56',
                                                                                   '2016-02-08 19:34:07', '2016-02-08 20:07:50')

# A single pre-formatted argument prints the same text on Python 2 and Python 3
# (the two spaces reproduce the output of the former print statement).
print('Training set length is:  {}'.format(len(train_X.index)))
print('Test set length is:  {}'.format(len(test_X.index)))

# Select subsets of the features that we will consider:

# Select subsets of the features that we will consider:

Exemple #6

0

Afficher le fichier

Fichier : crowdsignals_ch8_regression.py Projet : Mick-IJzer/ML4QS

# Location and file name of the dataset that is loaded below.
DATA_PATH = Path('./intermediate_datafiles/Crowdsignal/')
DATASET_FNAME = 'chapter5_result.csv'

DataViz = VisualizeDataset(__file__)

# Load the dataset; the index is converted to datetime once the read succeeded.
try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e
else:
    dataset.index = pd.to_datetime(dataset.index)

# Second task: prediction of the heart rate ('hr_watch_rate'), treated as a temporal task.
prepare = PrepareDatasetForLearning()

# The three timestamps are passed positionally after the target column name.
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset, 'hr_watch_rate',
    '2016-02-08 18:29:56', '2016-02-08 19:34:07', '2016-02-08 20:07:50')

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:

# The training set length is reported a second time here, as in the original script.
print('Training set length is: ', len(train_X.index))

Exemple #7

0

Afficher le fichier

# Read the dataset exactly once; a failed read is reported and re-raised.
try:
    dataset = pd.read_csv(dataset_path + 'chapter5_result-own.csv',
                          index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# pd.to_datetime() replaces the Index.to_datetime() method, which recent pandas
# releases no longer provide.
dataset.index = pd.to_datetime(dataset.index)

# Make sure the directory used for exported trees exists.
if not os.path.exists(export_tree_path):
    os.makedirs(export_tree_path)

# Let us consider our second task, namely the prediction of 'light_phone_lux'. We consider this as a temporal task.

prepare = PrepareDatasetForLearning()

# The three timestamps are passed positionally after the target column name.
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset, 'light_phone_lux', '2016-02-08 18:28:56', '2016-02-08 19:34:07',
    '2016-02-08 20:07:50')

# A single pre-formatted argument prints the same text on Python 2 and Python 3
# (the two spaces reproduce the output of the former print statement).
print('Training set length is:  {}'.format(len(train_X.index)))
print('Test set length is:  {}'.format(len(test_X.index)))

# Select subsets of the features that we will consider:

basic_features = [
    'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'gyr_phone_x', 'gyr_phone_y',
    'gyr_phone_z', 'labelOnTable', 'labelSitting', 'labelSmoking',
    'labelWalkingStairs', 'labelStandingInElevator', 'mag_phone_x',

Exemple #8

0

Afficher le fichier

from Operations import *
# Set up file names and locations.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
RESULT_FNAME = sys.argv[2] if len(
    sys.argv) > 2 else 'chapter3_result_outliers.csv'

# Use a context manager so the pickle file is closed once it has been read.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# produced by our own pipeline.
with open('concat_no_skipping.pkl', 'rb') as pickle_file:
    dataset = pickle.load(pickle_file)
dataset = rename(dataset)

DataViz = VisualizeDataset(__file__, show=False)

# Regression task with "gyr_z" as the target. We consider this as a temporal
# task, so the data is split on index values.

prepare = PrepareDatasetForLearning()

dataset = dataset.fillna(0)
print(dataset.index)
print(dataset.loc[dataset.index.values[0]])
# The boundaries are taken from the index itself: first row, the row 70% of
# the way through, and the last row.
train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset, "gyr_z", dataset.index.values[0],
    dataset.index.values[int(len(dataset.index) * 0.7)],
    dataset.index.values[-1])

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:

Exemple #9

0

Afficher le fichier

except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

if not os.path.exists(export_tree_path):
    os.makedirs(export_tree_path)

# Index.to_datetime() no longer exists in pandas; pd.to_datetime() is the
# supported way to turn the index into datetimes.
dataset.index = pd.to_datetime(dataset.index)
# Drop every row that has a missing value.
dataset = dataset.dropna()
# Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task.

# We create a single column with the categorical attribute representing our class. Furthermore, we use 70% of our data
# for training and the remaining 30% as an independent test set. We select the sets based on stratified sampling. We remove
# cases where we do not know the label.

prepare = PrepareDatasetForLearning()

# The binary label columns that are combined into the single 'class' column.
exact_labels = [
    'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking',
    'labelStanding', 'labelDriving', 'labelEating', 'labelRunning'
]
# Per-row sum over the label columns.
sum_values = dataset[exact_labels].sum(axis=1)
# Create a new 'class' column and set the value to the default class.
dataset['class'] = 'undefined'
for i in range(0, len(dataset.index)):
    # If we have exactly one true class column, we can assign that value,
    # otherwise we keep the default class.
    first = True
    for label in exact_labels:
        if dataset.ix[i, label] == 1:
            if first:

Exemple #10

0

Afficher le fichier

# Read the chapter 5 result. A missing file means the earlier scripts have not
# been run yet, so print a hint before re-raising the original error.
try:
    dataset = pd.read_csv(dataset_path + 'mydata_chapter5_result.csv', index_col=0)
except IOError:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise
# Drop every row that contains at least one missing value.
dataset = dataset.dropna(axis=0, how='any', inplace=False)

# Index.to_datetime() no longer exists in pandas; pd.to_datetime() is the
# supported way to turn the index into datetimes.
dataset.index = pd.to_datetime(dataset.index)

# Regression task with 'gyr_phone_x' as the target. We consider this as a
# temporal task, so the data is split on timestamps.

prepare = PrepareDatasetForLearning()

train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset, 'gyr_phone_x', '2017-06-09 10:20:28',
    '2017-06-09 11:18:27', '2017-06-09 11:43:23')

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:

basic_features = ['acc_phone_x','acc_phone_y','acc_phone_z','gyr_phone_y','gyr_phone_z',

Exemple #11

0

Afficher le fichier

def experiment(file):
    """Train and evaluate a random forest and a decision tree on one dataset.

    The CSV at ``file`` is read with the module-level ``time_col`` as index
    and split into train and test sets on the 'label' columns (0.7 is passed
    as the split fraction). For each of five feature sets a random forest
    (averaged over ``N_KCV_REPEATS`` runs) and a decision tree are fitted with
    grid search. Accuracies are logged, a confusion matrix is plotted per
    model, and a comparison plot over all feature sets is drawn at the end.

    Args:
        file: Path of the CSV file to process. The part between the first '/'
            and the first '.' is used to name the figure output, so the path
            must contain a '/'.
    """
    dataset = pd.read_csv(file, index_col=time_col)
    DataViz = VisualizeDataset(__file__.split('.')[0] +
                               file.split('.')[0].split('/')[1] + '.py',
                               show=True)
    print(DataViz.figures_dir)
    dataset.index = pd.to_datetime(dataset.index)
    prepare = PrepareDatasetForLearning()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(
        dataset, ['label'],
        'like',
        0.7,
        filter=False,
        temporal=False,
        drop_na=False,
        fill_na=True)

    # Feature groups are recognised by substrings of the column names.
    time_features = [name for name in dataset.columns if '_temp' in name]
    freq_features = [
        name for name in dataset.columns
        if '_freq' in name or '_pse' in name
    ]
    cluster_features = ['cluster']
    features_2 = list(set().union(basic_features, time_features))
    features_3 = list(set().union(basic_features, time_features,
                                  freq_features))
    features_4 = list(set().union(basic_features, time_features, freq_features,
                                  cluster_features))

    # Hard-coded feature list; presumably the outcome of an earlier forward
    # selection run (the selection call itself is not executed here).
    selected_features = [
        'gyr_y_temp_std_ws_1200', 'acc_z_temp_mean_ws_120',
        'acc_x_temp_mean_ws_120', 'gyr_x_temp_std_ws_2400', 'gyr_z_max_freq',
        'gyr_y_freq_1.9_Hz_ws_40', 'acc_z_freq_0.4_Hz_ws_40',
        'gyr_z_freq_1.2_Hz_ws_40', 'gyr_x_freq_0.2_Hz_ws_40',
        'acc_z_freq_1.0_Hz_ws_40', 'acc_x_freq_0.2_Hz_ws_40',
        'acc_y_freq_1.9_Hz_ws_40', 'gyr_x_temp_mean_ws_1200',
        'acc_z_freq_1.9_Hz_ws_40', 'acc_x_temp_std_ws_120',
        'gyr_z_temp_std_ws_120', 'gyr_y_freq_1.5_Hz_ws_40',
        'gyr_z_temp_mean_ws_120', 'gyr_x_freq_0.0_Hz_ws_40',
        'acc_z_freq_0.6_Hz_ws_40'
    ]
    # NOTE(review): the y values passed here are feature names while the axis
    # is labelled 'accuracy'; this looks like it should plot the selection
    # scores instead -- confirm.
    DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION + 1)],
                    y=[selected_features],
                    xlabel='number of features',
                    ylabel='accuracy')

    print('feature selection finished for %s' % file)
    learner = ClassificationAlgorithms()
    # Named `evaluator` so the builtin eval() is not shadowed.
    evaluator = ClassificationEvaluation()

    possible_feature_sets = [
        basic_features, features_2, features_3, features_4, selected_features
    ]
    feature_names = [
        'Basic features', 'Features with time', 'Features with frequency',
        'Features with cluster', 'Selected features'
    ]

    N_KCV_REPEATS = 1

    scores_over_all_algs = []

    for i, (feature_name, feature_set) in enumerate(
            zip(feature_names, possible_feature_sets)):
        print(datetime.now())
        print('possible feature sets', i)
        log(['Features %d' % i])
        selected_train_X = train_X[feature_set]
        selected_test_X = test_X[feature_set]

        # First we run our non deterministic classifiers a number of times to average their score.

        performance_tr_rf = 0
        performance_te_rf = 0

        for repeat in range(N_KCV_REPEATS):
            print(datetime.now())
            print('\nRepeat', repeat)
            print('Random Forest')
            class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(
                selected_train_X,
                train_y,
                selected_test_X,
                gridsearch=True,
                print_model_details=True)
            test_cm = evaluator.confusion_matrix(test_y, class_test_y,
                                                 class_train_prob_y.columns)

            DataViz.plot_confusion_matrix(test_cm,
                                          class_train_prob_y.columns,
                                          normalize=False)

            performance_tr_rf += evaluator.accuracy(train_y, class_train_y)
            performance_te_rf += evaluator.accuracy(test_y, class_test_y)

            print(datetime.now())

        overall_performance_tr_rf = performance_tr_rf / N_KCV_REPEATS
        overall_performance_te_rf = performance_te_rf / N_KCV_REPEATS
        # Log the averaged train and test accuracies (the train slot used to
        # repeat the test value).
        log([
            'RF' + ' train acc: %f' % overall_performance_tr_rf +
            ' test acc: %f' % overall_performance_te_rf
        ])

        # And we run our deterministic classifiers:

        print('decision tree')
        class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(
            selected_train_X,
            train_y,
            selected_test_X,
            gridsearch=True,
            print_model_details=True)
        performance_tr_dt = evaluator.accuracy(train_y, class_train_y)
        performance_te_dt = evaluator.accuracy(test_y, class_test_y)
        test_cm = evaluator.confusion_matrix(test_y, class_test_y,
                                             class_train_prob_y.columns)

        DataViz.plot_confusion_matrix(test_cm,
                                      class_train_prob_y.columns,
                                      normalize=False)

        log([
            'DT' + ' train acc: %f' % performance_tr_dt +
            ' test acc: %f' % performance_te_dt
        ])
        scores_with_sd = util.print_table_row_performances(
            feature_name, len(selected_train_X.index),
            len(selected_test_X.index), [
                (overall_performance_tr_rf, overall_performance_te_rf),
                (performance_tr_dt, performance_te_dt),
            ])
        scores_over_all_algs.append(scores_with_sd)

    DataViz.plot_performances_classification(['RF', 'DT'], feature_names,
                                             scores_over_all_algs)
    print(datetime.now())

Exemple #12

0

Afficher le fichier

Fichier : crowdsignals_ch8_regression.py Projet : antreashp/ML4QS_Group_41

# Read the dataset. A missing file means the earlier scripts have not been run
# yet, so print a hint before re-raising the original error.
try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise

random_state = 2020

dataset.index = pd.to_datetime(dataset.index)
# Replace missing values with 0 before splitting.
dataset = dataset.fillna(0)
# Regression task with 'acc_phone_x' as the target. We consider this as a
# temporal task, so the data is split on timestamps.

prepare = PrepareDatasetForLearning()

train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset,
    'acc_phone_x',
    '2020-06-05 13:11:27',
    '2020-06-05 13:43:26',
    '2020-06-05 13:55:40')
print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:

Exemple #13

0

Afficher le fichier

Fichier : crowdsignals_ch8_regression.py Projet : liuyuxue97/ML4QS-77

# Location and name of the input produced by the previous chapter's script.
DATA_PATH = Path('./intermediate_datafiles/')
DATASET_FNAME = 'chapter5_result_own.csv'

DataViz = VisualizeDataset(__file__)

# A missing file means the earlier scripts have not been run yet, so print a
# hint before re-raising the original error.
try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
except IOError:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise

dataset.index = pd.to_datetime(dataset.index)

# Regression task with 'gyr_phone_z' as the target. We consider this as a
# temporal task, so the data is split on timestamps.

prepare = PrepareDatasetForLearning()

train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset,
    'gyr_phone_z',
    '2020-06-02 13:11:36',
    '2020-06-02 13:52:51',
    '2020-06-02 14:13:28')

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider:

Exemple #14

0

Afficher le fichier

def main():
    """Run the heart-rate regression experiments selected by ``FLAGS.mode``.

    Loads the dataset at ``DATA_PATH / DATASET_FNAME``, splits it by time with
    'hr_watch_rate' as the target and then, depending on ``FLAGS.mode``:

    * 'correlation': runs the ADF test and plots the autocorrelation of the
      heart rate.
    * 'overall': compares reservoir computing, a recurrent neural network and
      a time series model over several feature sets, averaging the first two
      over ``FLAGS.repeats`` runs.
    * 'detail': plots predicted versus real heart rate for each learner.
    * 'dynamical': fits the dynamical systems models (``nsga_2``, ``ga`` and
      ``sa`` variants) on accelerometer data.
    * 'all': runs every section above.

    Raises:
        IOError: If the dataset file cannot be read (re-raised after a hint
            is printed).
    """
    # Read the result from the previous chapter and convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Consider the second task, namely the prediction of the heart rate. Therefore create a dataset with the heart
    # rate as target and split using timestamps, because this is considered as a temporal task
    print('\n- - - Loading dataset - - -')
    prepare = PrepareDatasetForLearning()
    train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
        dataset, 'hr_watch_rate', '2016-02-08 18:29:56', '2016-02-08 19:34:07',
        '2016-02-08 20:07:50')
    print('Training set length is: ', len(train_X.index))
    print('Test set length is: ', len(test_X.index))

    # Select subsets of the features
    print('\n- - - Selecting subsets - - -')
    basic_features = [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
        'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
        'labelOnTable', 'labelSitting', 'labelWashingHands', 'labelWalking',
        'labelStanding', 'labelDriving', 'labelEating', 'labelRunning',
        'light_phone_lux', 'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
        'mag_watch_x', 'mag_watch_y', 'mag_watch_z', 'press_phone_pressure'
    ]
    pca_features = [
        'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7'
    ]
    # Temporal features derived from the heart rate itself are left out, since
    # the heart rate is the prediction target.
    time_features = [
        name for name in dataset.columns
        if 'temp_' in name and 'hr_watch' not in name
    ]
    freq_features = [
        name for name in dataset.columns
        if '_freq' in name or '_pse' in name
    ]
    cluster_features = ['cluster']
    print('#basic features: ', len(basic_features))
    print('#PCA features: ', len(pca_features))
    print('#time features: ', len(time_features))
    print('#frequency features: ', len(freq_features))
    print('#cluster features: ', len(cluster_features))
    features_after_chapter_3 = list(set().union(basic_features, pca_features))
    features_after_chapter_4 = list(set().union(features_after_chapter_3,
                                                time_features, freq_features))
    features_after_chapter_5 = list(set().union(features_after_chapter_4,
                                                cluster_features))

    selected_features = [
        'temp_pattern_labelOnTable', 'labelOnTable',
        'temp_pattern_labelOnTable(b)labelOnTable', 'cluster',
        'pca_1_temp_mean_ws_120', 'pca_2_temp_mean_ws_120', 'pca_2',
        'acc_watch_y_temp_mean_ws_120', 'gyr_watch_y_pse', 'gyr_watch_x_pse'
    ]
    possible_feature_sets = [
        basic_features, features_after_chapter_3, features_after_chapter_4,
        features_after_chapter_5, selected_features
    ]
    feature_names = [
        'initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5',
        'Selected features'
    ]

    if FLAGS.mode == 'correlation' or FLAGS.mode == 'all':
        # First study whether the time series is stationary and what the autocorrelations are
        # NOTE(review): the test result is neither stored nor printed.
        adfuller(dataset['hr_watch_rate'], autolag='AIC')
        # plt.figure() opens a new current figure for the plot below;
        # plt.Figure() would only construct a detached Figure object.
        plt.figure()
        autocorrelation_plot(dataset['hr_watch_rate'])
        DataViz.save(plt)
        plt.show()

    # Now focus on the learning part
    learner = TemporalRegressionAlgorithms()
    evaluate = RegressionEvaluation()

    if FLAGS.mode == 'overall' or FLAGS.mode == 'all':
        # Repeat the experiment a number of times to get a bit more robust data as the initialization of e.g. the NN is
        # random
        repeats = FLAGS.repeats

        # Set a washout time to give the NN's the time to stabilize (so don't compute the error during the washout time)
        washout_time = FLAGS.washout
        scores_over_all_algs = []

        def mse_after_washout(predicted_train_y, predicted_test_y):
            # Mean squared error (mean, std) on train and test, skipping the
            # first `washout_time` rows of targets and predictions.
            mean_tr, std_tr = evaluate.mean_squared_error_with_std(
                train_y.iloc[washout_time:, ],
                predicted_train_y.iloc[washout_time:, ])
            mean_te, std_te = evaluate.mean_squared_error_with_std(
                test_y.iloc[washout_time:, ],
                predicted_test_y.iloc[washout_time:, ])
            return mean_tr, std_tr, mean_te, std_te

        for feature_name, feature_set in zip(feature_names,
                                             possible_feature_sets):
            print(f'Evaluating for features {feature_set}')
            selected_train_X = train_X[feature_set]
            selected_test_X = test_X[feature_set]

            # First run non deterministic classifiers a number of times to average their score
            performance_tr_res, performance_tr_res_std = 0, 0
            performance_te_res, performance_te_res_std = 0, 0
            performance_tr_rnn, performance_tr_rnn_std = 0, 0
            performance_te_rnn, performance_te_rnn_std = 0, 0

            for repeat in range(repeats):
                print(f'--- run {repeat} ---')
                regr_train_y, regr_test_y = learner.reservoir_computing(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    test_y,
                    gridsearch=True,
                    per_time_step=False)

                mean_tr, std_tr, mean_te, std_te = mse_after_washout(
                    regr_train_y, regr_test_y)
                performance_tr_res += mean_tr
                performance_tr_res_std += std_tr
                performance_te_res += mean_te
                performance_te_res_std += std_te

                regr_train_y, regr_test_y = learner.recurrent_neural_network(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    test_y,
                    gridsearch=True)

                mean_tr, std_tr, mean_te, std_te = mse_after_washout(
                    regr_train_y, regr_test_y)
                performance_tr_rnn += mean_tr
                performance_tr_rnn_std += std_tr
                performance_te_rnn += mean_te
                performance_te_rnn_std += std_te

            # Only apply the time series in case of the basis features
            if feature_name == 'initial set':
                regr_train_y, regr_test_y = learner.time_series(
                    selected_train_X,
                    train_y,
                    selected_test_X,
                    test_y,
                    gridsearch=True)

                (overall_performance_tr_ts, overall_performance_tr_ts_std,
                 overall_performance_te_ts,
                 overall_performance_te_ts_std) = mse_after_washout(
                     regr_train_y, regr_test_y)
            else:
                # Zero scores are reported for the feature sets on which the
                # time series model is not run.
                overall_performance_tr_ts = 0
                overall_performance_tr_ts_std = 0
                overall_performance_te_ts = 0
                overall_performance_te_ts_std = 0

            overall_performance_tr_res = performance_tr_res / repeats
            overall_performance_tr_res_std = performance_tr_res_std / repeats
            overall_performance_te_res = performance_te_res / repeats
            overall_performance_te_res_std = performance_te_res_std / repeats
            overall_performance_tr_rnn = performance_tr_rnn / repeats
            overall_performance_tr_rnn_std = performance_tr_rnn_std / repeats
            overall_performance_te_rnn = performance_te_rnn / repeats
            overall_performance_te_rnn_std = performance_te_rnn_std / repeats

            scores_with_sd = [
                (overall_performance_tr_res, overall_performance_tr_res_std,
                 overall_performance_te_res, overall_performance_te_res_std),
                (overall_performance_tr_rnn, overall_performance_tr_rnn_std,
                 overall_performance_te_rnn, overall_performance_te_rnn_std),
                (overall_performance_tr_ts, overall_performance_tr_ts_std,
                 overall_performance_te_ts, overall_performance_te_ts_std)
            ]
            util.print_table_row_performances_regression(
                feature_name, scores_with_sd)
            scores_over_all_algs.append(scores_with_sd)

        DataViz.plot_performances_regression(
            ['Reservoir', 'RNN', 'Time series'], feature_names,
            scores_over_all_algs)

    if FLAGS.mode == 'detail' or FLAGS.mode == 'all':
        regr_train_y, regr_test_y = learner.reservoir_computing(
            train_X[features_after_chapter_5],
            train_y,
            test_X[features_after_chapter_5],
            test_y,
            gridsearch=False)
        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y, regr_train_y['hr_watch_rate'],
            test_X.index, test_y, regr_test_y['hr_watch_rate'], 'heart rate')

        regr_train_y, regr_test_y = learner.recurrent_neural_network(
            train_X[basic_features],
            train_y,
            test_X[basic_features],
            test_y,
            gridsearch=True)
        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y, regr_train_y['hr_watch_rate'],
            test_X.index, test_y, regr_test_y['hr_watch_rate'], 'heart rate')

        regr_train_y, regr_test_y = learner.time_series(
            train_X[basic_features],
            train_y,
            test_X[basic_features],
            test_y,
            gridsearch=True)
        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y, regr_train_y['hr_watch_rate'],
            test_X.index, test_y, regr_test_y['hr_watch_rate'], 'heart rate')

    if FLAGS.mode == 'dynamical' or FLAGS.mode == 'all':
        # And now some example code for using the dynamical systems model with parameter tuning (note: focus on
        # predicting accelerometer data):
        # The split variables are rebound here; a deep copy of the dataset is
        # passed to the splitter.
        train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression(
            copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y'],
            0.9,
            filter_data=False,
            temporal=True)
        output_sets = learner.dynamical_systems_model_nsga_2(
            train_X, train_y, test_X, test_y,
            ['self.acc_phone_x', 'self.acc_phone_y', 'self.acc_phone_z'],
            ['self.a * self.acc_phone_x + self.b * self.acc_phone_y',
             'self.c * self.acc_phone_y + self.d * self.acc_phone_z',
             'self.e * self.acc_phone_x + self.f * self.acc_phone_z'],
            ['self.acc_phone_x', 'self.acc_phone_y'],
            ['self.a', 'self.b', 'self.c', 'self.d', 'self.e', 'self.f'],
            pop_size=10, max_generations=10, per_time_step=True)
        DataViz.plot_pareto_front(output_sets)

        DataViz.plot_numerical_prediction_versus_real_dynsys_mo(
            train_X.index, train_y, test_X.index, test_y, output_sets, 0,
            'acc_phone_x')

        regr_train_y, regr_test_y = learner.dynamical_systems_model_ga(
            train_X, train_y, test_X, test_y,
            ['self.acc_phone_x', 'self.acc_phone_y', 'self.acc_phone_z'],
            ['self.a * self.acc_phone_x + self.b * self.acc_phone_y',
             'self.c * self.acc_phone_y + self.d * self.acc_phone_z',
             'self.e * self.acc_phone_x + self.f * self.acc_phone_z'],
            ['self.acc_phone_x', 'self.acc_phone_y'],
            ['self.a', 'self.b', 'self.c', 'self.d', 'self.e', 'self.f'],
            pop_size=5, max_generations=10, per_time_step=True)

        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y['acc_phone_x'], regr_train_y['acc_phone_x'],
            test_X.index, test_y['acc_phone_x'], regr_test_y['acc_phone_x'],
            'acc_phone_x')

        regr_train_y, regr_test_y = learner.dynamical_systems_model_sa(
            train_X, train_y, test_X, test_y,
            ['self.acc_phone_x', 'self.acc_phone_y', 'self.acc_phone_z'],
            ['self.a * self.acc_phone_x + self.b * self.acc_phone_y',
             'self.c * self.acc_phone_y + self.d * self.acc_phone_z',
             'self.e * self.acc_phone_x + self.f * self.acc_phone_z'],
            ['self.acc_phone_x', 'self.acc_phone_y'],
            ['self.a', 'self.b', 'self.c', 'self.d', 'self.e', 'self.f'],
            max_generations=10, per_time_step=True)

        DataViz.plot_numerical_prediction_versus_real(
            train_X.index, train_y['acc_phone_x'], regr_train_y['acc_phone_x'],
            test_X.index, test_y['acc_phone_x'], regr_test_y['acc_phone_x'],
            'acc_phone_x')

Exemple #15

0

Afficher le fichier

            flattened_values[columnname +"_max"] = (np.max(values))
            #flattened_values = flattened_values.join(np.max(transformation))
            #flattened_values = np.append(flattened_values, np.argmax(transformation))
            #flattened_values = np.append(flattened_values, np.max(transformation))

        df = pd.DataFrame(data=flattened_values)
        df['class'] = str(label)

        frames.append(df)

# Concatenate the collected frames into a single table.
result = pd.concat(frames)

# Make every column label a string.
result.columns = result.columns.astype(str)

# result = result.sample(frac=1)
prepare = PrepareDatasetForLearning()

# Split on the 'class' column. 0.8 is presumably the training fraction and
# 'unlike' the column-matching mode -- confirm against PrepareDatasetForLearning.
train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(result, ['class'], 'unlike', 0.8,
                                                                               filter=True, temporal=False)

# Disabled: carving a validation set out of the training data.
#number_training_samples = len(train_X)
#val_split = int(0.7 * number_training_samples)
#val_X = train_X[val_split:-1]
#val_y = train_y[val_split:-1]
#train_X = train_X[0:val_split - 1]
#train_y = train_y[0:val_split - 1]

learner = ClassificationAlgorithms()
# NOTE(review): this rebinds the builtin name `eval` at module level.
eval = ClassificationEvaluation()

# Print the number of training rows.
print(len(train_X))

Exemple #16

0

Afficher le fichier

                          index_col=0)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# Index.to_datetime() no longer exists in pandas; pd.to_datetime() is the
# supported way to turn the index into datetimes.
dataset.index = pd.to_datetime(dataset.index)
# Drop every row that has a missing value.
dataset = dataset.dropna()
del dataset['silhouette']

# Regression task with 'acc_phone_x' as the target. We consider this as a
# temporal task, so the data is split on timestamps.

prepare = PrepareDatasetForLearning()

train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(
    dataset,
    'acc_phone_x',
    '2017-06-13 22:21:02',
    '2017-06-13 23:40:47',
    '2017-06-14 00:22:24')

print('Training set length is: ', len(train_X.index))
print('Test set length is: ', len(test_X.index))

# Select subsets of the features that we will consider: