Example no. 1
    def test_from_columns(self):
        tsn = "TEST_TIME_SERIES"

        fset = ComprehensiveFCParameters()
        self.assertRaises(TypeError, from_columns, 42)
        self.assertRaises(TypeError, from_columns, 42)
        self.assertRaises(ValueError, from_columns, ["This is not a column name"])
        self.assertRaises(ValueError, from_columns, ["This__neither"])
        self.assertRaises(ValueError, from_columns, ["This__also__not"])

        # Aggregate functions
        feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"]

        # Aggregate functions with params
        feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70', tsn + '__number_peaks__n_30',
                          tsn + '__value_count__value_inf', tsn + '__value_count__value_-inf',
                          tsn + '__value_count__value_nan']

        # Apply functions
        feature_names += [tsn + '__ar_coefficient__k_20__coeff_4', tsn + '__ar_coefficient__coeff_10__k_-1']

        kind_to_fc_parameters = from_columns(feature_names)

        six.assertCountEqual(self, list(kind_to_fc_parameters[tsn].keys()),
                             ["sum_values", "median", "length", "sample_entropy", "quantile", "number_peaks",
                              "ar_coefficient", "value_count"])

        self.assertEqual(kind_to_fc_parameters[tsn]["sum_values"], None)
        self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"],
                         [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])

        self.assertEqual(kind_to_fc_parameters[tsn]["value_count"],
                         [{"value": np.PINF}, {"value": np.NINF}, {"value": np.NaN}])

        # test that it passes for all functions
        fset = ComprehensiveFCParameters()
        X_org = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 default_fc_parameters=fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)

        inferred_fset = from_columns(X_org)

        X_new = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 kind_to_fc_parameters=inferred_fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)

        assert_frame_equal(X_org.sort_index(), X_new.sort_index())
Example no. 3
    def test_from_column_correct_for_selected_columns(self):
        tsn = "TEST_TIME_SERIES"

        # Aggregate functions
        feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"]

        # Aggregate functions with params
        feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70', tsn + '__number_peaks__n_30',
                          tsn + '__value_count__value_inf', tsn + '__value_count__value_-inf',
                          tsn + '__value_count__value_nan']

        # Apply functions
        feature_names += [tsn + '__ar_coefficient__k_20__coeff_4', tsn + '__ar_coefficient__coeff_10__k_-1']

        kind_to_fc_parameters = from_columns(feature_names)
        self.assertCountEqual(list(kind_to_fc_parameters[tsn].keys()),
                              ["sum_values", "median", "length", "sample_entropy", "quantile", "number_peaks",
                               "ar_coefficient", "value_count"])

        self.assertIsNone(kind_to_fc_parameters[tsn]["sum_values"])
        self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"],
                         [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])

        self.assertEqual(kind_to_fc_parameters[tsn]["value_count"],
                         [{"value": np.PINF}, {"value": np.NINF}, {"value": np.NaN}])
Example no. 4
    def test_from_columns_correct_for_different_kind_datatypes(self):
        """The `settings.from_columns()` function is supposed to save the feature extraction / selection results so it
        can be reused later. It works by parsing the column names of the extracted dataframes. An unfortunate side
        effect of this is that when used with the 'long' format time series input, the typing information about the
        'kind' column is lost. For example, even if the 'kind' values are in int32, in the resulting settings dict, the
        type of the top level keys (representing different kind values) will be str
        """
        df = pd.DataFrame({
            'id': [1, 1, 1, 1],
            'time': [1, 1, 2, 2],
            'kind': [1, 2, 1, 2],
            'value': [1, 2, 3, 4]
        })

        features = extract_features(
            df,
            column_id='id',
            column_sort='time',
            column_kind='kind',
            column_value='value',
            default_fc_parameters=MinimalFCParameters())
        sample_settings = from_columns(features)
        X = extract_features(df,
                             column_id='id',
                             column_sort='time',
                             column_kind='kind',
                             column_value='value',
                             kind_to_fc_parameters=sample_settings)
        assert X.shape == (1, 2 * len(MinimalFCParameters()))
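A standalone sketch (not taken from any of the projects above) of the behaviour described in the docstring: from_columns parses column names of the form kind__feature__param_value, so the top-level keys of the returned dict are always strings, even when the original 'kind' values were integers.

from tsfresh.feature_extraction.settings import from_columns

settings_dict = from_columns(["1__sum_values", "2__median"])
print(settings_dict)                     # {'1': {'sum_values': None}, '2': {'median': None}}
print(type(next(iter(settings_dict))))   # <class 'str'>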
Example no. 5
    def transform(self, X):
        """
        After the fit step, it is known which features are relevant. Only extract those from the time series handed in
        with the function :func:`~set_timeseries_container`.

        If filter_only_tsfresh_features is False, also delete the irrelevant,
        already present features in the data frame.

        :param X: the data sample to add the relevant (and delete the irrelevant) features to.
        :type X: pandas.DataFrame or numpy.array

        :return: a data sample with the same information as X, but with added relevant time series features and
            deleted irrelevant information (only if filter_only_tsfresh_features is False).
        :rtype: pandas.DataFrame
        """

        if self.timeseries_container is None:
            raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

        if self.feature_selector is None:
            raise RuntimeError("You have to call fit before calling transform.")

        if self.feature_selector.relevant_features is None:
            raise RuntimeError("You have to call fit before calling transform.")

        self.feature_extractor.set_timeseries_container(self.timeseries_container)

        relevant_time_series_features = set(self.feature_selector.relevant_features) - set(pd.DataFrame(X).columns)
        relevant_extraction_settings = from_columns(relevant_time_series_features)

        # Set imputing strategy
        impute_function = partial(impute_dataframe_range, col_to_max=self.col_to_max,
                                  col_to_min=self.col_to_min, col_to_median=self.col_to_median)

        relevant_feature_extractor = FeatureAugmenter(kind_to_fc_parameters=relevant_extraction_settings,
                                                      default_fc_parameters={},
                                                      column_id=self.feature_extractor.column_id,
                                                      column_sort=self.feature_extractor.column_sort,
                                                      column_kind=self.feature_extractor.column_kind,
                                                      column_value=self.feature_extractor.column_value,
                                                      chunksize=self.feature_extractor.chunksize,
                                                      n_jobs=self.feature_extractor.n_jobs,
                                                      show_warnings=self.feature_extractor.show_warnings,
                                                      disable_progressbar=self.feature_extractor.disable_progressbar,
                                                      impute_function=impute_function,
                                                      profile=self.feature_extractor.profile,
                                                      profiling_filename=self.feature_extractor.profiling_filename,
                                                      profiling_sorting=self.feature_extractor.profiling_sorting)

        relevant_feature_extractor.set_timeseries_container(self.feature_extractor.timeseries_container)

        X_augmented = relevant_feature_extractor.transform(X)

        if self.filter_only_tsfresh_features:
            return X_augmented.copy().loc[:, self.feature_selector.relevant_features + X.columns.tolist()]
        else:
            return X_augmented.copy().loc[:, self.feature_selector.relevant_features]
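A minimal usage sketch for the transformer above, assuming the surrounding class is tsfresh's RelevantFeatureAugmenter; the toy dataframes are illustrative and so small that few (possibly no) features will survive the relevance test.

import pandas as pd
from tsfresh.transformers import RelevantFeatureAugmenter

df_ts = pd.DataFrame({"id": [1, 1, 2, 2], "time": [1, 2, 1, 2], "value": [1.0, 2.0, 3.0, 4.0]})
X = pd.DataFrame(index=[1, 2])                 # one (feature-less) row per entity
y = pd.Series([0, 1], index=[1, 2])

augmenter = RelevantFeatureAugmenter(column_id="id", column_sort="time", column_value="value")
augmenter.set_timeseries_container(df_ts)
augmenter.fit(X, y)                            # extracts features and keeps only the relevant ones
X_augmented = augmenter.transform(X)           # re-extracts exactly those features and appends them to X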
Example no. 6
    def test_from_columns(self):
        tsn = "TEST_TIME_SERIES"

        fset = ComprehensiveFCParameters()
        self.assertRaises(TypeError, from_columns, 42)
        self.assertRaises(TypeError, from_columns, 42)
        self.assertRaises(ValueError, from_columns,
                          ["This is not a column name"])
        self.assertRaises(ValueError, from_columns, ["This__neither"])
        self.assertRaises(ValueError, from_columns, ["This__also__not"])

        # Aggregate functions
        feature_names = [
            tsn + '__sum_values', tsn + "__median", tsn + "__length",
            tsn + "__sample_entropy"
        ]

        # Aggregate functions with params
        feature_names += [
            tsn + '__quantile__q_10', tsn + '__quantile__q_70',
            tsn + '__number_peaks__n_30', tsn + '__value_count__value_inf',
            tsn + '__value_count__value_-inf', tsn + '__value_count__value_nan'
        ]

        # Apply functions
        feature_names += [
            tsn + '__ar_coefficient__k_20__coeff_4',
            tsn + '__ar_coefficient__coeff_10__k_-1'
        ]

        kind_to_fc_parameters = from_columns(feature_names)

        six.assertCountEqual(self, list(kind_to_fc_parameters[tsn].keys()), [
            "sum_values", "median", "length", "sample_entropy", "quantile",
            "number_peaks", "ar_coefficient", "value_count"
        ])

        self.assertEqual(kind_to_fc_parameters[tsn]["sum_values"], None)
        self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"],
                         [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])

        self.assertEqual(kind_to_fc_parameters[tsn]["value_count"],
                         [{"value": np.PINF}, {"value": np.NINF}, {"value": np.NaN}])
Example no. 7
    def test_from_columns_ignores_columns(self):

        tsn = "TEST_TIME_SERIES"
        feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"]
        feature_names += ["THIS_COL_SHOULD_BE_IGNORED"]

        kind_to_fc_parameters = from_columns(feature_names, columns_to_ignore=["THIS_COL_SHOULD_BE_IGNORED",
                                                                               "THIS_AS_WELL"])

        six.assertCountEqual(self, list(kind_to_fc_parameters[tsn].keys()),
                             ["sum_values", "median", "length", "sample_entropy"])
Example no. 8
    def test_from_columns_ignores_columns(self):

        tsn = "TEST_TIME_SERIES"
        feature_names = [tsn + '__sum_values', tsn + "__median", tsn + "__length", tsn + "__sample_entropy"]
        feature_names += ["THIS_COL_SHOULD_BE_IGNORED"]

        kind_to_fc_parameters = from_columns(feature_names, columns_to_ignore=["THIS_COL_SHOULD_BE_IGNORED",
                                                                               "THIS_AS_WELL"])

        self.assertCountEqual(list(kind_to_fc_parameters[tsn].keys()),
                              ["sum_values", "median", "length", "sample_entropy"])
Example no. 9
def handle_message(msg):

    if msg.key() is None or type(msg.key()) is not dict:
        logger.warning("Key is missing or not a dict. Ignoring message.")
        return
    elif msg.value() is None or type(msg.value()) is not dict:
        logger.warning("Value is missing or not a dict. Ignoring message.")
        return

    try:
        time_begin = time.time()

        timeseries = pd.melt(
            pd.DataFrame.from_dict(msg.value(),
                                   orient='index').transpose()).dropna()
        timeseries['group_id'] = 0

        if timeseries.isnull().sum().sum() > 0:
            logger.warning("at least one field of timeseries is null")
            return

        X = extract_features(
            timeseries,
            column_id='group_id',
            column_kind="variable",
            column_value="value",
            kind_to_fc_parameters=settings.from_columns(fc_parameters))

        if X.isnull().sum().sum() > 0:
            logger.warning("at least one field of extracted features is null")
            return

        kritisch = ml_model.predict(pca_model.transform(X))[0]

        time_end = time.time()

        start_prediction_interval = time.localtime(msg.key()['timestamp_end'] /
                                                   1000)
        end_prediction_interval = time.localtime(msg.key()['timestamp_end'] /
                                                 1000 + 60 * 5)

        print("Prediction for interval",
              time.strftime("%H:%M:%S", start_prediction_interval), "to",
              time.strftime("%H:%M:%S", end_prediction_interval), ":",
              "kritisch" if kritisch else "unkritisch")

        if SHOW_CALCULATION_TIME == 1:
            print("time for calculation", round(time_end - time_begin, 5),
                  "seconds")

    except Exception as e:
        logger.exception(e)
        consumer.stop()
Example no. 10
    def test_from_column_correct_for_comprehensive_fc_parameters(self):
        fset = ComprehensiveFCParameters()
        X_org = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 default_fc_parameters=fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)
        inferred_fset = from_columns(X_org)
        X_new = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                                 kind_to_fc_parameters=inferred_fset,
                                 column_id="id", column_value="value",
                                 n_jobs=0)
        assert_frame_equal(X_org.sort_index(), X_new.sort_index())
Example no. 11
    def transform(self, X):
        """
        After the fit step, it is known which features are relevant. Only extract those from the time series handed in
        with the function :func:`~set_timeseries_container`.

        If filter_only_tsfresh_features is False, also delete the irrelevant, already present features in the data frame.

        :param X: the data sample to add the relevant (and delete the irrelevant) features to.
        :type X: pandas.DataFrame or numpy.array

        :return: a data sample with the same information as X, but with added relevant time series features and
            deleted irrelevant information (only if filter_only_tsfresh_features is False).
        :rtype: pandas.DataFrame
        """
        if self.feature_selector.relevant_features is None:
            raise RuntimeError("You have to call fit before.")

        if self.timeseries_container is None:
            raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

        self.feature_extractor.set_timeseries_container(self.timeseries_container)

        relevant_time_series_features = set(self.feature_selector.relevant_features) - set(pd.DataFrame(X).columns)
        relevant_extraction_settings = from_columns(relevant_time_series_features)

        # Set imputing strategy
        impute_function = partial(impute_dataframe_range, col_to_max=self.col_to_max,
                                  col_to_min=self.col_to_min, col_to_median=self.col_to_median)

        relevant_feature_extractor = FeatureAugmenter(kind_to_fc_parameters=relevant_extraction_settings,
                                                      default_fc_parameters={},
                                                      column_id=self.feature_extractor.column_id,
                                                      column_sort=self.feature_extractor.column_sort,
                                                      column_kind=self.feature_extractor.column_kind,
                                                      column_value=self.feature_extractor.column_value,
                                                      chunksize=self.feature_extractor.chunksize,
                                                      n_jobs=self.feature_extractor.n_jobs,
                                                      show_warnings=self.feature_extractor.show_warnings,
                                                      disable_progressbar=self.feature_extractor.disable_progressbar,
                                                      impute_function=impute_function,
                                                      profile=self.feature_extractor.profile,
                                                      profiling_filename=self.feature_extractor.profiling_filename,
                                                      profiling_sorting=self.feature_extractor.profiling_sorting)

        relevant_feature_extractor.set_timeseries_container(self.feature_extractor.timeseries_container)

        X_augmented = relevant_feature_extractor.transform(X)

        if self.filter_only_tsfresh_features:
            return X_augmented.copy().loc[:, self.feature_selector.relevant_features + X.columns.tolist()]
        else:
            return X_augmented.copy().loc[:, self.feature_selector.relevant_features]
Example no. 12
def get_features(X, y=None, kind_to_fc_parameters=None):
    samples, time_steps, data_dim = X.shape
    X = X.reshape([-1, data_dim])
    time = list(range(time_steps)) * samples
    ids = []
    for i in range(samples):
        ids.extend([i] * time_steps)
    X = pd.DataFrame(X)
    X['id'] = ids
    X['time'] = time

    if y is not None:
        features = extract_relevant_features(X, y, column_id='id', column_sort='time', n_jobs=0)
        kind_to_fc_parameters = from_columns(features)
        return features.values, kind_to_fc_parameters
    elif kind_to_fc_parameters is not None:
        # from_columns() returns a kind -> parameters mapping, so pass it as kind_to_fc_parameters
        features = extract_features(X, column_id='id', column_sort='time', n_jobs=0,
                                    kind_to_fc_parameters=kind_to_fc_parameters)
    else:
        features = extract_features(X, column_id='id', column_sort='time', n_jobs=0)
    return features.values
Example no. 13
def extract_data(data_folder: str,
                 columns: list,
                 overlap=False,
                 all: bool = True,
                 est_events: bool = False,
                 event: str = None,
                 event_type: str = None):
    '''
    This function uses tsFRESH to extract relevant features for multiple machine learning tasks.
    If a csv file of features to use already exists (as features.csv), then those features will
    be used instead of finding relevant features from scratch (speeds up computing time).

    Inputs:

    data_folder: a string containing the location of the directory in which the dataset .pkl file is saved.
                 This dataset is created using data_preperation.py.

    columns: a list of strings containing the columns from the dataset which the user wishes to extract
             features from. This includes: id, time, ax_l, ay_l, az_l, ax_r, ay_r, az_r,
             ax_diff, ay_diff, az_diff, a_res_l, a_res_r, a_res_diff.
             NOTE: if id or time are not included in this list, they will be automatically added as they
             are necessary.

    overlap: a boolean (either True or False). If True, the overlapping dataset (dataset_overlap.pkl) is
             loaded; otherwise dataset_no_overlap.pkl is used.

    all: a boolean (either True or False). If True, feature extraction will be run using all the data; if
         False, feature extraction will be run using the first trial only, and the resulting features will
         be used on all the data.

    est_events: a boolean (either True or False). If True, features will be extracted to estimate whether
                an event occurred or not within a 100 ms time frame. If False, features will be extracted
                to estimate vertical GRF for the entire timeseries.

    event: a string containing either FS or FO. This indicates which event the user wants to predict on.
           NOTE: this is only necessary as an input if est_events is True.

    event_type: a string containing either binary or time. This indicates which type of output the user wants.
                NOTE: this is only necessary as an input if est_events is True.

    Outputs:
    This function does not return anything. However, it does save *.csv files in appropriate folders (based on
    the columns chosen) which can be used to fit either a classification or regression model (depending on what
    task is required) - see model_fitting.py

    Alex Woodall

    Auckland Bioengineering Institute

    08/04/2020

    '''

    from tsfresh import extract_features, extract_relevant_features, select_features
    from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
    from tsfresh.feature_extraction.settings import from_columns
    from tsfresh.utilities.dataframe_functions import impute

    import pickle
    import numpy as np
    import pandas as pd
    import os

    # Load data
    try:
        if overlap:
            dataset = pickle.load(
                open(data_folder + "dataset_overlap.pkl", "rb"))

        else:
            dataset = pickle.load(
                open(data_folder + "dataset_no_overlap.pkl", "rb"))

    except FileNotFoundError:
        dataset = pickle.load(open(data_folder + "dataset_200.pkl", "rb"))

    # Number selected columns which the user chose to use for feature extraction
    columns, columns_num = selected_columns(columns)

    # Create directories for saving
    new_directory = "{}{}\\".format(data_folder,
                                    ("_".join(map(str, columns_num))))
    save_dir = create_directories(new_directory, event, event_type, est_events)

    # Attempt to load features from the save directory.
    try:
        X_features = pd.read_csv(
            "{}features.csv".format(save_dir),
            index_col=0)  # DataFrame containing the features we want
        features_string = X_features.columns
        extraction_settings = from_columns(
            features_string)  # These are the features that we will be using

        pre_extracted = True

    except FileNotFoundError:  # File does not exist
        pre_extracted = False

    # List to append last uid's from each key (used when using all trials to extract features)
    uid_last = []

    # Iterate through all the trials in the dataset
    for key in dataset.keys():

        # Create the timeseries based on the user input columns
        for col in columns_num:
            if col == 0:
                timeseries = (
                    dataset[key]['X'])[:, col]  # Only true accelerations
            else:
                timeseries = np.vstack((timeseries, dataset[key]['X'][:, col]))

        # dataset[key].keys() = ['X', 'force', 'y_FS_binary', 'y_FO_binary', 'y_FS_time_to', 'y_FO_time_to']

        # Create y (real data output)
        if est_events:  # If estimating events

            try:
                if event_type == 'binary':
                    y = dataset[key]['y_{}_binary'.format(event)]

                    # Convert to boolean (will remain boolean if already)
                    y = (y == 1.0)

                elif event_type == 'time':
                    y = dataset[key]['y_{}_time_to_next'.format(event)]

                else:
                    print('Event type must either be binary or time')

                    return

            except KeyError:
                print("Event must equal either 'FS' or 'FO'.")

                return

        else:  # Estimating forces
            # possible force = ['Fx', 'Fy', 'Fz'] Assuming z direction is vertical
            y = dataset[key]['y'][:, 2]

        # Convert to pandas DataFrame/Series
        if type(timeseries) is np.ndarray:
            # Needs to be a pandas dataframe
            timeseries = pd.DataFrame(timeseries.T, columns=columns)

            # Convert ID column into integers
            timeseries = timeseries.astype({'id': int})

            if est_events:
                if event_type == 'binary':
                    y = pd.Series(data=y, dtype=bool, name='events')
                elif event_type == 'time':
                    y = pd.Series(data=y, dtype=float, name='events')
            else:
                # Change ID column to fit for regression method
                ID = (np.arange(0, len(timeseries))).astype(int)

                timeseries['id'] = ID

                y = pd.Series(data=y, dtype=float, name='Fz')

        # Save X full dataset
        timeseries.to_csv("{}{}_timeseries.csv".format(save_dir, key),
                          index=True,
                          header=True)

        # Extract features from the first trial and use those for the rest if all == False
        if not all:
            # Extract features using tsFRESH
            if not pre_extracted:
                print('Finding relevant features using {}'.format(key))
                X_filtered = extract_relevant_features(
                    timeseries,
                    y,
                    column_id="id",
                    column_sort="time",
                    default_fc_parameters=ComprehensiveFCParameters())

                # Save filtered features
                X_filtered.to_csv("{}features.csv".format(save_dir),
                                  header=True)

                features_string = X_filtered.columns
                extraction_settings = from_columns(
                    features_string
                )  # These are the features that we will be using

                pre_extracted = True

            if pre_extracted:
                print('Using pre-extracted features for event = {}'.format(
                    event))
                print(str(key))
                X_filtered = extract_features(
                    timeseries,
                    column_id="id",
                    column_sort="time",
                    kind_to_fc_parameters=extraction_settings)

            # Add start_time and mass column to dataframe
            if est_events:
                start_time = dataset[key]['X_starting_time']
                mass = dataset[key]['X_mass_sample']

                X_filtered.insert(0, "start_time", start_time, True)
                X_filtered.insert(1, "mass", mass, True)

            else:
                mass = dataset[key]['X_mass_all']
                X_filtered.insert(0, "mass", mass, True)

            # Save dataframes
            X_filtered.to_csv("{}{}_X.csv".format(save_dir, key),
                              index=True,
                              header=True)
            y.to_csv("{}{}_y.csv".format(save_dir, key),
                     index=True,
                     header=True)

        else:
            try:
                uid_change = timeseries_temp['id'].iloc[-1]

                uid_last.append(uid_change)

                timeseries['id'] = timeseries['id'] + uid_change + 1
                timeseries_temp = timeseries_temp.append(timeseries)
                y_temp = y_temp.append(y, ignore_index=True)

            except NameError:  # *_temp DataFrames do not exist yet
                timeseries_temp = timeseries
                y_temp = y

    if all:
        print('Using all data to extract relevant features')

        # First remove any NaN values in y, this should only be at the end
        print('Extracting all features')

        if est_events:
            X = extract_features(
                timeseries_temp,
                column_id="id",
                column_sort="time",
                default_fc_parameters=ComprehensiveFCParameters(),
                impute_function=impute)

            y = y_temp

            # Remove NaN index's from X and y
            remove_idx = pd.isnull(y.to_numpy()).nonzero()[0]
            y = y.drop(remove_idx)
            X = X.drop(remove_idx)

            print('Selecting relevant features')
            X_filtered = select_features(X, y)

        else:
            X_filtered = extract_relevant_features(
                timeseries_temp,
                y_temp,
                column_id="id",
                column_sort="time",
                default_fc_parameters=ComprehensiveFCParameters())

        X_filtered.to_csv("{}features.csv".format(save_dir), header=True)

        # Now save individual datasets
        # Reload DataFrame
        X_features = pd.read_csv("{}features.csv".format(save_dir),
                                 index_col=0)

        # Index values
        names = X_features.index.values

        # Saving individual trials
        print('Saving features for each trial')
        start = 0
        i = 0
        for key in dataset.keys():
            try:
                end_temp = uid_last[i]  # Name of the row

            except IndexError:
                # Last key
                end_temp = X_features.iloc[-1].name

            end = end_temp

            # Find the new end index accounting for removed values
            removed = True

            while removed:
                if end in remove_idx:
                    end -= 1
                else:
                    removed = False

            # end = the name of the row (NOT index) which is the last in the trial
            end_idx = np.where(names == end)[0][0]

            X_save = X_features.iloc[start:end_idx + 1]
            X_save = X_save.reset_index(drop=True)

            y_save = y.iloc[start:end_idx + 1]
            y_save = y_save.reset_index(drop=True)

            start = end_idx + 1
            i += 1

            # Add start_time and mass column to dataframe
            if est_events:
                start_time = dataset[key]['X_starting_time']
                mass = dataset[key]['X_mass_sample']

                # Remove those due to NaN's
                start_time_new = start_time[:len(X_save)]
                mass_new = mass[:len(X_save)]

                X_save.insert(0, "start_time", start_time_new, True)
                X_save.insert(1, "mass", mass_new, True)

            else:
                mass = dataset[key]['X_mass_all']

                # Remove those due to NaN's (should be zero for GRF estimation)
                mass_new = mass[:len(X_save)]
                X_save.insert(0, "mass", mass_new, True)

            # Save
            X_save.to_csv("{}{}_X.csv".format(save_dir, key),
                          index=True,
                          header=True)
            y_save.to_csv("{}{}_y.csv".format(save_dir, key),
                          index=True,
                          header=True)

    return
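A hypothetical call of extract_data, purely to illustrate the argument combinations described in the docstring; the folder path and column selection are placeholders.

extract_data(data_folder='C:\\running_data\\',
             columns=['id', 'time', 'a_res_l', 'a_res_r'],
             overlap=False,
             all=True,
             est_events=True,
             event='FS',
             event_type='binary')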
Example no. 14
        tstart.append(tstart_tmp)
    
    # Featurize using tsfresh
    feat702 = []

    for i in range(len(eta_g702)):
        data = {'id': run_id[i], 'time': times[i], 'eta': eta_g702[i]}  # avoid shadowing the builtin `dict`

        feat_temp = extract_features(pd.DataFrame(data), column_id='id', column_sort='time',
                            default_fc_parameters=ComprehensiveFCParameters(), impute_function=impute)

        # drop constant features
        feat702.append(feat_temp.loc[:, feat_temp.apply(pd.Series.nunique) != 1]) 
    
    # Extract feature settings for model prediction
    params_30min = settings.from_columns(feat702[0])
    params_60min = settings.from_columns(feat702[1])
    
    # Create targets for train/test
    g901max = tsr.max_eta(eta,901,runnos)
    g911max = tsr.max_eta(eta,911,runnos)
    
    # Specify the model
    rmodel = RandomForestRegressor(n_estimators=100)

    ## Gauge 901
    pred_901, trp_901, target_901, evs_901, scalers_901, models_901 = tsr.train_test(feat702, g901max,\
                                                                                     index_train, index_test,\
                                                                                    sc,'r', rmodel,True)
    ## Gauge 911
    pred_911, trp_911, target_911, evs_911, scalers_911, models_911 = tsr.train_test(feat702, g911max,\
Example no. 15
    def execute_training(self, config):
        """
        Run training based on config.
        :param config: dict
        """

        # 1. Init pipeline
        print('--------------------INIT PIPELINE--------------------')
        dao, preprocessor, extractor, model_factory = self.init_pipeline(
            config)

        # 2. Load data
        print('--------------------LOAD DATA------------------------')
        labels, data = dao.bulk_read_data(
            file_path=[config['data_set_path'], config['data_labels_path']],
            identifiers=config['data_set_trips'],
            column_names=[
                config['data_set_column_names'],
                config['data_label_column_names']
            ],
            use_columns=[
                config['data_set_columns'], config['data_label_columns']
            ],
            check_distribution=True)

        # 3. Preprocessing
        print('--------------------PRE PROCESSING--------------------')
        data_train, mean_train, std_train, data_test, data_valid = preprocessor.training_split_process(
            data=data, config=config, labels=labels)

        # 4. Feature extraction
        print('--------------------FEATURE EXTRACTION------------------')
        X_train = None
        X_test = None
        y_train = None
        y_test = None

        if config['feature_eng_extractor_type'] == "motif":
            X_train = extractor.extract_select_training_features(
                data_train, [
                    config['feature_eng_mp_extractor_radii'],
                    config['feature_eng_mp_extractor_lengths']
                ])
            X_test = extractor.extract_select_training_features(
                data_test, [
                    config['feature_eng_mp_extractor_radii'],
                    config['feature_eng_mp_extractor_lengths']
                ])

        segment_length = config['feature_eng_baseline_extractor_segement_len']
        if config['feature_eng_extractor_type'] == "ts-fresh":

            # TODO migrate the preparation for extraction to extract_select_training_features, make label column name configurable
            data_train = preprocessor.encode_categorical_features(
                data=data_train,
                mode='custom_function',
                columns=['road_label'],
                encoding_function=lambda x:
                (x > 2.0).astype(int))  # 0 City, 1 Countryside

            data_test = preprocessor.encode_categorical_features(
                data=data_test,
                mode='custom_function',
                columns=['road_label'],
                encoding_function=lambda x:
                (x > 2.0).astype(int))  # 0 City, 1 Countryside

            # Find segments with homogeneous labeling
            split = lambda df, chunk_size: numpy.array_split(
                df, len(df) // chunk_size + 1, axis=0)
            segments_train = split(data_train, segment_length)
            segments_test = split(data_test, segment_length)
            segments_train_homogeneous, segments_test_homogeneous = [], []
            for segment in segments_train:
                if segment.road_label.nunique(
                ) == 1:  # and segment.shape[0] == segment_length: TODO homogeneous-length requirement removed; note that in the paper
                    segments_train_homogeneous.append(segment)
            for segment in segments_test:
                if segment.road_label.nunique(
                ) == 1:  #and segment.shape[0] == segment_length:
                    segments_test_homogeneous.append(segment)

            data_train = pandas.concat(segments_train_homogeneous, axis=0)
            data_test = pandas.concat(segments_test_homogeneous, axis=0)

            #Generate id column
            train_id = [None] * data_train.index.size
            id = 0
            for i in range(0, data_train.index.size, segment_length):
                train_id[i:i + segment_length] = [id] * segment_length
                id += 1
            train_id = train_id[:data_train.index.size]
            data_train['id'] = train_id

            test_id = [None] * data_test.index.size
            id = 0
            for i in range(0, data_test.index.size, segment_length):
                test_id[i:i + segment_length] = [id] * segment_length
                id += 1
            test_id = test_id[:data_test.index.size]
            data_test['id'] = test_id

            y_train = data_train[['road_label', 'id']].reset_index(drop=True)
            y_train = y_train.groupby(y_train.index // segment_length).agg(
                lambda x: x.value_counts().index[0]
            )  #majority label in segment
            X_train = data_train[['acceleration_abs',
                                  'id']].reset_index(drop=True)
            y_test = data_test[['road_label', 'id']].reset_index(drop=True)
            y_test = y_test.groupby(
                y_test.index //
                segment_length).agg(lambda x: x.value_counts().index[0])
            X_test = data_test[['acceleration_abs',
                                'id']].reset_index(drop=True)

            #Extract Training features
            X_train = extractor.extract_select_training_features(
                X_train,
                args=[
                    'id', config['hw_num_processors'], None,
                    y_train['road_label'],
                    config['feature_eng_baseline_extractor_fdr']
                ])

            # Get the feature map from the training features and apply it to the test set
            kind_to_fc_parameters = from_columns(X_train)
            X_test = extractor.extract_select_inference_features(
                X_test,
                args=[
                    'id', config['hw_num_processors'], None,
                    kind_to_fc_parameters
                ])

            X_train = [
                'placeholder',
                [
                    X_train,
                    # Series.rename has no 'columns' argument; rename the label series to 0 directly
                    y_train['road_label'].rename(0), 'N/A', 'N/A',
                    'N/A'
                ]
            ]  # required for further processing. TODO: Unify naming!

            X_test = [
                'placeholder',
                [
                    X_test,
                    y_test['road_label'].rename(0), 'N/A', 'N/A',
                    'N/A'
                ]
            ]

        if X_train is None or X_test is None:
            pass  # TODO Raise Error

        # 5. Find optimal classifier for given training set
        print('--------------------TRAINING PHASE----------------------')
        print(X_test[0][0])
        print(X_test[0][1])
        clf, score, conf, X_train, motif_len, motif_radius, motif_count, run_summary = model_factory.find_optimal_model(
            config[
                'feature_eng_extractor_type'],  # TODO remove bc deprecated. config handed anyways
            config,
            X_train,
            X_test,
        )

        if clf is None or score is None or conf is None:
            pass  # TODO Raise Error

        # 6. Prepare Validation
        print('--------------------PREPARE VALIDATION-------------------')

        X_valid, y_valid, kind_to_fc_parameters = None, None, None
        if config['feature_eng_extractor_type'] == "ts-fresh":

            data_valid = preprocessor.encode_categorical_features(
                data=data_valid,
                mode='custom_function',
                columns=['road_label'],
                encoding_function=lambda x:
                (x > 2.0).astype(int))  # 0 City, 1 Countryside

            # Segment the validation data into pieces with homogeneous length
            # Find segments with homogeneous labeling

            split = lambda df, chunk_size: numpy.array_split(
                df, len(df) // chunk_size + 1, axis=0)
            segments_valid = split(data_valid, segment_length)
            segments_valid_homogeneous = []
            for segment in segments_valid:
                if segment.road_label.nunique(
                ) == 1:  # and segment.shape[0] == segment_length: TODO homogeneous-length requirement removed; note that in the paper
                    segments_valid_homogeneous.append(segment)

            data_valid = pandas.concat(segments_valid_homogeneous, axis=0)

            #Generate id column
            valid_id = [None] * data_valid.index.size
            id = 0
            for i in range(0, data_valid.index.size, segment_length):
                valid_id[i:i + segment_length] = [id] * segment_length
                id += 1
            valid_id = valid_id[:data_valid.index.size]
            data_valid['id'] = valid_id

            y_valid = data_valid[['road_label', 'id']].reset_index(drop=True)
            y_valid = y_valid.groupby(
                y_valid.index //
                segment_length).agg(lambda x: x.value_counts().index[0])
            X_valid = data_valid[['acceleration_abs',
                                  'id']].reset_index(drop=True)

            # Get the feature map from the training set and apply it to the validation set
            kind_to_fc_parameters = from_columns(X_train)
            run_summary['ts_fresh_relevant_features'] = kind_to_fc_parameters
            X_valid = extractor.extract_select_inference_features(
                X_valid,
                args=[
                    'id', config['hw_num_processors'], None,
                    kind_to_fc_parameters
                ])

            # Series.rename has no 'columns' argument and inplace=True would return None;
            # rename the label series to 0 directly
            y_valid = y_valid['road_label'].rename(0)

        if config['feature_eng_extractor_type'] == "motif":
            X_valid, y_valid = extractor.extract_select_inference_features(
                data_valid,
                [motif_radius, motif_len, config['hw_num_processors']], True)

        # 7. Run Validation
        print('--------------------VALIDATION---------------------------')
        print(X_valid)
        print(y_valid)
        X_valid, y_valid = model_factory.pre_clustering(X_valid, y_valid, None)
        if config['feature_eng_extractor_type'] == 'motif':
            print("Validation y label 1: {}".format(
                list(y_valid[0]).count(1.0) /
                len(y_valid)))  # TODO: make configureable
            print("Validation y label 3: {}".format(
                list(y_valid[0]).count(3.0) / len(y_valid)))
            run_summary['valid_lbl_1'] = list(
                y_valid[0]).count(1.0) / len(y_valid)
            run_summary['valid_lbl_3'] = list(
                y_valid[0]).count(3.0) / len(y_valid)
        elif config['feature_eng_extractor_type'] == 'ts-fresh':
            print("Validation y label 1: {}".format(
                list(y_valid).count(1.0) /
                len(y_valid)))  # TODO: make configureable
            print("Validation y label 3: {}".format(
                list(y_valid).count(0.0) / len(y_valid)))
            run_summary['valid_lbl_1'] = list(y_valid).count(0.0) / len(
                y_valid)
            run_summary['valid_lbl_3'] = list(y_valid).count(1.0) / len(
                y_valid)

        score = clf.score(X_valid, y_valid)
        print(score)
        y_pred = clf.predict(X_valid)
        conf = confusion_matrix(y_valid,
                                y_pred,
                                labels=None,
                                sample_weight=None)
        print(conf)
        report = str(classification_report(y_valid, y_pred))
        best_params = clf.best_params_
        print(report)

        # 8. Store Results
        print('--------------------STORE RESULTS------------------------')
        # TODO: delegate to DAO, make storing configureable

        results_tag = "{0}_{1}_{2}_{3}".format(
            config['feature_eng_extractor_type'],
            config['pre_proc_resample_freq'],
            config['feature_eng_dim_reduction_type'],
            config['feature_eng_baseline_extractor_segement_len'])
        pandas.DataFrame(X_train).to_pickle(
            "X_train_{}.pkl".format(results_tag))
        pandas.DataFrame(X_test).to_pickle("X_test_{}.pkl".format(results_tag))
        pandas.DataFrame(X_valid).to_pickle(
            "X_valid_{}.pkl".format(results_tag))
        pandas.DataFrame(y_train).to_pickle(
            "y_train_{}.pkl".format(results_tag))
        pandas.DataFrame(y_test).to_pickle("y_test_{}.pkl".format(results_tag))
        pandas.DataFrame(y_valid).to_pickle(
            "y_valid_{}.pkl".format(results_tag))

        with open("./clf_{}.pkl".format(results_tag), 'wb') as clf_file:
            pickle.dump(clf, clf_file)

        with open("./clf.pkl", 'wb') as clf_file:
            pickle.dump(clf, clf_file)

        meta_data = None
        if config['feature_eng_extractor_type'] == "motif":
            meta_data = {
                'mean_train': mean_train,
                'std_train': std_train,
                'motif_len': motif_len,
                'motif_radius': motif_radius,
                'motif_count': motif_count,
                'clf_score': score,
                'clf_conf': conf,
                'clf_report': report,
                'clf_best_params': best_params
            }

        if config['feature_eng_extractor_type'] == "ts-fresh":
            meta_data = {
                'mean_train': mean_train,
                'std_train': std_train,
                'feature_mapping': kind_to_fc_parameters,
                'clf_score': score,
                'clf_conf': conf,
                'clf_report': report,
                'clf_best_params': best_params
            }

        run_summary['global_best_meta_data'] = meta_data
        run_summary['config'] = config

        with open("./meta_data_{}.pkl".format(results_tag), 'wb') as meta_file:
            pickle.dump(meta_data, meta_file)

        with open("./meta_data.pkl", 'wb') as meta_file:
            pickle.dump(meta_data, meta_file)

        with open("./run_summary_{}.pkl".format(results_tag),
                  'wb') as run_summary_file:
            pickle.dump(run_summary, run_summary_file)

        print('--------------------PRINT SUMMARY------------------------')
        pprint.pprint(run_summary)
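For orientation only, a hypothetical and heavily abbreviated config dict showing the keys this method reads; every name and value below is illustrative, not taken from the original project.

config = {
    'data_set_path': './data/trips/',                      # placeholder paths
    'data_labels_path': './data/labels/',
    'data_set_trips': ['trip_01', 'trip_02'],
    'data_set_column_names': ['time', 'acceleration_abs', 'road_label'],
    'data_label_column_names': ['time', 'road_label'],
    'data_set_columns': ['acceleration_abs'],
    'data_label_columns': ['road_label'],
    'pre_proc_resample_freq': '10ms',
    'feature_eng_extractor_type': 'ts-fresh',              # or 'motif'
    'feature_eng_dim_reduction_type': None,
    'feature_eng_baseline_extractor_segement_len': 128,
    'feature_eng_baseline_extractor_fdr': 0.05,
    'feature_eng_mp_extractor_radii': [2.0],               # only used by the 'motif' extractor
    'feature_eng_mp_extractor_lengths': [64],
    'hw_num_processors': 4,
}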
Example no. 16
                     default_fc_parameters=extraction_settings,
                     impute_function=impute,
                     n_jobs=3)

print('selecting training features')
print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
X_filtered = select_features(X, train_y_train, n_jobs=3)
print('X_filtered shape=')
print(X_filtered.shape)

print('extracting selected features for ALL training data')
X_train_filtered = extract_features(
    X0_train,
    column_id='id',
    column_sort='time',
    kind_to_fc_parameters=settings.from_columns(X_filtered.columns),
    impute_function=impute,
    n_jobs=3)

print('X_train_filtered shape=')
print(X_train_filtered.shape)

print('extracting selected features for test data')
X_test_filtered = extract_features(X0_test,
                                   column_id='id',
                                   column_sort='time',
                                   kind_to_fc_parameters=settings.from_columns(
                                       X_filtered.columns),
                                   impute_function=impute,
                                   n_jobs=3)
Example no. 17
    #####################################################################################
    ###
    ###             process data and labels to reduce dimensions
    ######################################################################################
    # Understand relevant features and update X accordingly
    X_filtered_features = select_features(x_input, y_for_filtering)

    #If we want another features different for conditions prediction
    # y_for_filtering =
    # X_filtered_features_st = select_features(x_input, y_for_filtering)

    # Significant columns to use in the test data
    significant_features = X_filtered_features.columns
    num_significant_features = len(significant_features)

    to_filter_by = from_columns(X_filtered_features)  # dictionary
    pickle.dump(to_filter_by,
                open("saved_features_no_PCA.p",
                     "wb"))  # save it into a file named saved_features.p

    # Load the dictionary back from the pickle file.
    # features_loaded = pickle.load(open("saved_features.p", "rb"))

    ###################################################
    try_pca_n_component = 15
    #create PCA
    pca_n_component = min(try_pca_n_component, num_significant_features)
    pca_n_component
    pca_allData = PCAForPandas(n_components=pca_n_component)

    X_pca = pca_allData.fit_transform(X_filtered_features)  #YONIT