def test_from_columns(self):
    tsn = "TEST_TIME_SERIES"

    self.assertRaises(TypeError, from_columns, 42)
    self.assertRaises(ValueError, from_columns, ["This is not a column name"])
    self.assertRaises(ValueError, from_columns, ["This__neither"])
    self.assertRaises(ValueError, from_columns, ["This__also__not"])

    # Aggregate functions
    feature_names = [tsn + '__sum_values', tsn + "__median",
                     tsn + "__length", tsn + "__sample_entropy"]

    # Aggregate functions with params
    feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70',
                      tsn + '__number_peaks__n_30',
                      tsn + '__value_count__value_inf',
                      tsn + '__value_count__value_-inf',
                      tsn + '__value_count__value_nan']

    # Apply functions
    feature_names += [tsn + '__ar_coefficient__k_20__coeff_4',
                      tsn + '__ar_coefficient__coeff_10__k_-1']

    kind_to_fc_parameters = from_columns(feature_names)

    six.assertCountEqual(self, list(kind_to_fc_parameters[tsn].keys()),
                         ["sum_values", "median", "length", "sample_entropy",
                          "quantile", "number_peaks", "ar_coefficient",
                          "value_count"])

    self.assertIsNone(kind_to_fc_parameters[tsn]["sum_values"])
    self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"],
                     [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])
    self.assertEqual(kind_to_fc_parameters[tsn]["value_count"],
                     [{"value": np.PINF}, {"value": np.NINF}, {"value": np.NaN}])

    # Test that the round trip works for all feature calculators
    fset = ComprehensiveFCParameters()
    X_org = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                             default_fc_parameters=fset,
                             column_id="id", column_value="value",
                             n_jobs=0)
    inferred_fset = from_columns(X_org)
    X_new = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                             kind_to_fc_parameters=inferred_fset,
                             column_id="id", column_value="value",
                             n_jobs=0)
    assert_frame_equal(X_org.sort_index(), X_new.sort_index())

def test_from_column_correct_for_selected_columns(self):
    tsn = "TEST_TIME_SERIES"

    # Aggregate functions
    feature_names = [tsn + '__sum_values', tsn + "__median",
                     tsn + "__length", tsn + "__sample_entropy"]

    # Aggregate functions with params
    feature_names += [tsn + '__quantile__q_10', tsn + '__quantile__q_70',
                      tsn + '__number_peaks__n_30',
                      tsn + '__value_count__value_inf',
                      tsn + '__value_count__value_-inf',
                      tsn + '__value_count__value_nan']

    # Apply functions
    feature_names += [tsn + '__ar_coefficient__k_20__coeff_4',
                      tsn + '__ar_coefficient__coeff_10__k_-1']

    kind_to_fc_parameters = from_columns(feature_names)

    self.assertCountEqual(list(kind_to_fc_parameters[tsn].keys()),
                          ["sum_values", "median", "length", "sample_entropy",
                           "quantile", "number_peaks", "ar_coefficient",
                           "value_count"])

    self.assertIsNone(kind_to_fc_parameters[tsn]["sum_values"])
    self.assertEqual(kind_to_fc_parameters[tsn]["ar_coefficient"],
                     [{"k": 20, "coeff": 4}, {"k": -1, "coeff": 10}])
    self.assertEqual(kind_to_fc_parameters[tsn]["value_count"],
                     [{"value": np.PINF}, {"value": np.NINF}, {"value": np.NaN}])

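# A minimal sketch (not part of the test suite) of the naming convention the
# test above exercises: tsfresh encodes features as
# "<kind>__<feature_name>__<param>_<value>", and from_columns() inverts that
# mapping into a nested settings dict. The column name below is illustrative.
from tsfresh.feature_extraction.settings import from_columns

params = from_columns(["TEST_TIME_SERIES__quantile__q_10"])
# roughly: {'TEST_TIME_SERIES': {'quantile': [{'q': 10}]}}
print(params)
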
def test_from_columns_correct_for_different_kind_datatypes(self):
    """The `settings.from_columns()` function is meant to save the feature
    extraction / selection results so they can be reused later. It works by
    parsing the column names of the extracted dataframes. An unfortunate side
    effect of this is that, when used with the 'long'-format time series
    input, the typing information of the 'kind' column is lost. For example,
    even if the 'kind' values are int32, the top-level keys of the resulting
    settings dict (representing the different kind values) will be str.
    """
    df = pd.DataFrame({
        'id': [1, 1, 1, 1],
        'time': [1, 1, 2, 2],
        'kind': [1, 2, 1, 2],
        'value': [1, 2, 3, 4]
    })
    features = extract_features(
        df, column_id='id', column_sort='time',
        column_kind='kind', column_value='value',
        default_fc_parameters=MinimalFCParameters())
    sample_settings = from_columns(features)
    X = extract_features(
        df, column_id='id', column_sort='time',
        column_kind='kind', column_value='value',
        kind_to_fc_parameters=sample_settings)
    assert X.shape == (1, 2 * len(MinimalFCParameters()))

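# A small follow-up illustration of the dtype loss described in the docstring
# above: the 'kind' values are integers, but the keys of the dict returned by
# from_columns() are parsed from column-name prefixes and are therefore str.
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction.settings import from_columns

df = pd.DataFrame({'id': [1, 1, 1, 1], 'time': [1, 1, 2, 2],
                   'kind': [1, 2, 1, 2], 'value': [1, 2, 3, 4]})
features = extract_features(df, column_id='id', column_sort='time',
                            column_kind='kind', column_value='value',
                            default_fc_parameters=MinimalFCParameters())
print(list(from_columns(features).keys()))  # ['1', '2'] -- str, not int
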
def transform(self, X):
    """
    After the fit step, it is known which features are relevant.
    Only extract those from the time series handed in with the function
    :func:`~set_timeseries_container`.

    If filter_only_tsfresh_features is False, also delete the irrelevant,
    already present features in the data frame.

    :param X: the data sample to add the relevant (and delete the irrelevant) features to.
    :type X: pandas.DataFrame or numpy.array

    :return: a data sample with the same information as X, but with added relevant time series
        features and deleted irrelevant information (only if filter_only_tsfresh_features is False).
    :rtype: pandas.DataFrame
    """
    if self.timeseries_container is None:
        raise RuntimeError("You have to provide a time series using the set_timeseries_container function before.")

    if self.feature_selector is None:
        raise RuntimeError("You have to call fit before calling transform.")

    if self.feature_selector.relevant_features is None:
        raise RuntimeError("You have to call fit before calling transform.")

    self.feature_extractor.set_timeseries_container(self.timeseries_container)

    relevant_time_series_features = set(self.feature_selector.relevant_features) - set(pd.DataFrame(X).columns)
    relevant_extraction_settings = from_columns(relevant_time_series_features)

    # Set imputing strategy
    impute_function = partial(impute_dataframe_range, col_to_max=self.col_to_max,
                              col_to_min=self.col_to_min, col_to_median=self.col_to_median)

    relevant_feature_extractor = FeatureAugmenter(
        kind_to_fc_parameters=relevant_extraction_settings,
        default_fc_parameters={},
        column_id=self.feature_extractor.column_id,
        column_sort=self.feature_extractor.column_sort,
        column_kind=self.feature_extractor.column_kind,
        column_value=self.feature_extractor.column_value,
        chunksize=self.feature_extractor.chunksize,
        n_jobs=self.feature_extractor.n_jobs,
        show_warnings=self.feature_extractor.show_warnings,
        disable_progressbar=self.feature_extractor.disable_progressbar,
        impute_function=impute_function,
        profile=self.feature_extractor.profile,
        profiling_filename=self.feature_extractor.profiling_filename,
        profiling_sorting=self.feature_extractor.profiling_sorting)

    relevant_feature_extractor.set_timeseries_container(self.feature_extractor.timeseries_container)

    X_augmented = relevant_feature_extractor.transform(X)

    if self.filter_only_tsfresh_features:
        return X_augmented.copy().loc[:, self.feature_selector.relevant_features + X.columns.tolist()]
    else:
        return X_augmented.copy().loc[:, self.feature_selector.relevant_features]

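# A standalone sketch of the pattern transform() implements above: take the
# relevant feature names from a fitted selector, keep only those not already
# present in X, and re-extract just these features. The function name and its
# arguments are hypothetical; only from_columns/extract_features are tsfresh API.
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import from_columns

def extract_only_missing_relevant(timeseries, X, relevant_features):
    # Features already present in X do not need to be recomputed
    missing = set(relevant_features) - set(pd.DataFrame(X).columns)
    settings = from_columns(missing)
    return extract_features(timeseries, column_id="id", column_sort="time",
                            kind_to_fc_parameters=settings)
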
def test_from_columns_ignores_columns(self):
    tsn = "TEST_TIME_SERIES"

    feature_names = [tsn + '__sum_values', tsn + "__median",
                     tsn + "__length", tsn + "__sample_entropy"]
    feature_names += ["THIS_COL_SHOULD_BE_IGNORED"]

    kind_to_fc_parameters = from_columns(
        feature_names,
        columns_to_ignore=["THIS_COL_SHOULD_BE_IGNORED", "THIS_AS_WELL"])

    self.assertCountEqual(list(kind_to_fc_parameters[tsn].keys()),
                          ["sum_values", "median", "length", "sample_entropy"])

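# A tiny sketch of the columns_to_ignore behavior exercised above: metadata
# columns (here a hypothetical "label") can be excluded when reconstructing
# settings from a feature matrix.
from tsfresh.feature_extraction.settings import from_columns

settings = from_columns(["TS__sum_values", "TS__median", "label"],
                        columns_to_ignore=["label"])
print(settings)  # {'TS': {'sum_values': None, 'median': None}}
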
def handle_message(msg):
    if not isinstance(msg.key(), dict):
        logger.warning("Key is missing or not a dict. Ignoring message.")
        return
    elif not isinstance(msg.value(), dict):
        logger.warning("Value is missing or not a dict. Ignoring message.")
        return

    try:
        time_begin = time.time()
        timeseries = pd.melt(pd.DataFrame.from_dict(msg.value(), orient='index').transpose()).dropna()
        timeseries['group_id'] = 0
        if timeseries.isnull().sum().sum() > 0:
            logger.warning("At least one field of the timeseries is null.")
            return

        X = extract_features(timeseries,
                             column_id='group_id',
                             column_kind="variable",
                             column_value="value",
                             kind_to_fc_parameters=settings.from_columns(fc_parameters))
        if X.isnull().sum().sum() > 0:
            logger.warning("At least one extracted feature is null.")
            return

        # "kritisch" is German for "critical", "unkritisch" for "non-critical"
        kritisch = ml_model.predict(pca_model.transform(X))[0]
        time_end = time.time()

        start_prediction_interval = time.localtime(msg.key()['timestamp_end'] / 1000)
        end_prediction_interval = time.localtime(msg.key()['timestamp_end'] / 1000 + 60 * 5)
        print("Prediction for interval",
              time.strftime("%H:%M:%S", start_prediction_interval), "to",
              time.strftime("%H:%M:%S", end_prediction_interval), ":",
              "kritisch" if kritisch else "unkritisch")
        if SHOW_CALCULATION_TIME == 1:
            print("time for calculation", round(time_end - time_begin, 5), "seconds")
    except Exception as e:
        logger.exception(e)
        consumer.stop()

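# A minimal sketch of the reshaping step inside handle_message() above: a flat
# {sensor: [values]} payload is melted into the long format that
# extract_features expects (one row per kind/value pair). The payload is made up.
import pandas as pd

payload = {"temp": [20.1, 20.3], "pressure": [1.01, 1.02]}
timeseries = pd.melt(pd.DataFrame.from_dict(payload, orient='index').transpose()).dropna()
timeseries['group_id'] = 0
print(timeseries)
#    variable  value  group_id
# 0      temp  20.10         0
# 1      temp  20.30         0
# 2  pressure   1.01         0
# 3  pressure   1.02         0
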
def test_from_column_correct_for_comprehensive_fc_parameters(self):
    fset = ComprehensiveFCParameters()
    X_org = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                             default_fc_parameters=fset,
                             column_id="id", column_value="value",
                             n_jobs=0)
    inferred_fset = from_columns(X_org)
    X_new = extract_features(pd.DataFrame({"value": [1, 2, 3], "id": [1, 1, 1]}),
                             kind_to_fc_parameters=inferred_fset,
                             column_id="id", column_value="value",
                             n_jobs=0)
    assert_frame_equal(X_org.sort_index(), X_new.sort_index())

def get_features(X, y=None, kind_to_fc_parameters=None):
    # Flatten the (samples, time_steps, data_dim) array into tsfresh's long format
    samples, time_steps, data_dim = X.shape
    X = X.reshape([-1, data_dim])
    time = list(range(time_steps)) * samples
    ids = []
    for i in range(samples):
        ids.extend([i] * time_steps)

    X = pd.DataFrame(X)
    X['id'] = ids
    X['time'] = time

    if y is not None:
        features = extract_relevant_features(X, y, column_id='id', column_sort='time', n_jobs=0)
        kind_to_fc_parameters = from_columns(features)
        return features.values, kind_to_fc_parameters
    elif kind_to_fc_parameters is not None:
        # Reuse the settings inferred during training. Note that from_columns()
        # returns a kind-to-parameters mapping, so it must be passed as
        # kind_to_fc_parameters, not as default_fc_parameters.
        features = extract_features(X, column_id='id', column_sort='time', n_jobs=0,
                                    kind_to_fc_parameters=kind_to_fc_parameters)
    else:
        features = extract_features(X, column_id='id', column_sort='time', n_jobs=0)
    return features.values

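# A usage sketch for get_features() above; array shapes and label values are
# illustrative. The settings inferred on the labeled training set are reused
# at inference time so both feature matrices share the same columns.
import numpy as np
import pandas as pd

X_train = np.random.rand(10, 50, 3)                    # (samples, time_steps, data_dim)
y_train = pd.Series(np.random.randint(0, 2, size=10))  # one label per sample id

train_feats, settings = get_features(X_train, y=y_train)
test_feats = get_features(np.random.rand(5, 50, 3), kind_to_fc_parameters=settings)
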
def extract_data(data_folder: str, columns: list, overlap=False, all: bool = True,
                 est_events: bool = False, event: str = None, event_type: str = None):
    '''
    This function uses tsFRESH to extract relevant features for multiple machine
    learning tasks. If a csv file of features to use already exists (as features.csv),
    those features will be used instead of finding relevant features from scratch
    (which speeds up computing time).

    Inputs:
    data_folder: a string containing the location of the directory in which
                 dataset.pkl is saved. This dataset is created using data_preperation.py.
    columns:     a list of strings containing the columns from the dataset which the user
                 wishes to extract features from. This includes: id, time, ax_l, ay_l, az_l,
                 ax_r, ay_r, az_r, ax_diff, ay_diff, az_diff, a_res_l, a_res_r, a_res_diff.
                 NOTE: if id or time are not included in this list, they will be added
                 automatically as they are necessary.
    overlap:     a boolean (either True or False). If True, the overlapping-windows dataset
                 (dataset_overlap.pkl) is loaded; otherwise the non-overlapping one.
    all:         a boolean (either True or False). If True, feature extraction will be run
                 using all the data; if False, feature extraction will be run using the
                 first trial, and those features will then be used on all the data.
    est_events:  a boolean (either True or False). If True, features will be extracted to
                 estimate whether an event occurred within a 100 ms time frame. If False,
                 features will be extracted to estimate vertical GRF for the entire timeseries.
    event:       a string containing either FS or FO, indicating which event the user wants
                 to predict on. NOTE: only necessary as an input if est_events is True.
    event_type:  a string containing either binary or time, indicating which type of output
                 the user wants. NOTE: only necessary as an input if est_events is True.

    Outputs:
    This function does not return anything. However, it saves *.csv files in appropriate
    folders (based on the columns chosen) which can be used to fit either a classification
    or regression model (depending on what task is required) - see model_fitting.py.

    Alex Woodall, Auckland Bioengineering Institute, 08/04/2020
    '''
    from tsfresh import extract_features, extract_relevant_features, select_features
    from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
    from tsfresh.feature_extraction.settings import from_columns
    from tsfresh.utilities.dataframe_functions import impute

    import pickle
    import numpy as np
    import pandas as pd
    import os

    # Load data
    try:
        if overlap:
            dataset = pickle.load(open(data_folder + "dataset_overlap.pkl", "rb"))
        else:
            dataset = pickle.load(open(data_folder + "dataset_no_overlap.pkl", "rb"))
    except FileNotFoundError:
        dataset = pickle.load(open(data_folder + "dataset_200.pkl", "rb"))

    # Number the columns which the user chose to use for feature extraction
    columns, columns_num = selected_columns(columns)

    # Create directories for saving
    new_directory = "{}{}\\".format(data_folder, "_".join(map(str, columns_num)))
    save_dir = create_directories(new_directory, event, event_type, est_events)

    # Attempt to load features from the save directory.
    try:
        # DataFrame containing the features we want
        X_features = pd.read_csv("{}features.csv".format(save_dir), index_col=0)
        features_string = X_features.columns
        extraction_settings = from_columns(features_string)  # The features that we will be using
        pre_extracted = True
    except FileNotFoundError:
        # File does not exist
        pre_extracted = False

    # List to append the last uid from each key (used when using all trials to extract features)
    uid_last = []

    # Iterate through all the trials in the dataset
    for key in dataset.keys():
        # Create the timeseries based on the user input columns
        for col in columns_num:
            if col == 0:
                timeseries = (dataset[key]['X'])[:, col]  # Only true accelerations
            else:
                timeseries = np.vstack((timeseries, dataset[key]['X'][:, col]))

        # dataset[key].keys() = ['X', 'force', 'y_FS_binary', 'y_FO_binary', 'y_FS_time_to', 'y_FO_time_to']
        # Create y (real data output)
        if est_events:  # If estimating events
            try:
                if event_type == 'binary':
                    y = dataset[key]['y_{}_binary'.format(event)]
                    # Convert to boolean (will remain boolean if already)
                    y = (y == 1.0)
                elif event_type == 'time':
                    y = dataset[key]['y_{}_time_to_next'.format(event)]
                else:
                    print('Event type must either be binary or time')
                    return
            except KeyError:
                print("Event must equal either 'FS' or 'FO'.")
                return
        else:  # Estimating forces
            # possible force = ['Fx', 'Fy', 'Fz']; assuming the z direction is vertical
            y = dataset[key]['y'][:, 2]

        # Convert to pandas DataFrame/Series
        if isinstance(timeseries, np.ndarray):
            # Needs to be a pandas dataframe
            timeseries = pd.DataFrame(timeseries.T, columns=columns)

        # Convert ID column into integers
        timeseries = timeseries.astype({'id': int})

        if est_events:
            if event_type == 'binary':
                y = pd.Series(data=y, dtype=bool, name='events')
            elif event_type == 'time':
                y = pd.Series(data=y, dtype=float, name='events')
        else:
            # Change ID column to fit the regression method
            ID = np.arange(0, len(timeseries)).astype(int)
            timeseries['id'] = ID
            y = pd.Series(data=y, dtype=float, name='Fz')

        # Save X full dataset
        timeseries.to_csv("{}{}_timeseries.csv".format(save_dir, key), index=True, header=True)

        # Extract features from the first trial and use those for the rest if all == True
        if not all:
            # Extract features using tsFRESH
            if not pre_extracted:
                print('Finding relevant features using {}'.format(key))
                X_filtered = extract_relevant_features(
                    timeseries, y, column_id="id", column_sort="time",
                    default_fc_parameters=ComprehensiveFCParameters())

                # Save filtered features
                X_filtered.to_csv("{}features.csv".format(save_dir), header=True)
                features_string = X_filtered.columns
                extraction_settings = from_columns(features_string)  # The features that we will be using
                pre_extracted = True

            if pre_extracted:
                print('Using pre-extracted features for event = {}'.format(event))
                print(str(key))
                X_filtered = extract_features(
                    timeseries, column_id="id", column_sort="time",
                    kind_to_fc_parameters=extraction_settings)

            # Add start_time and mass columns to the dataframe
            if est_events:
                start_time = dataset[key]['X_starting_time']
                mass = dataset[key]['X_mass_sample']
                X_filtered.insert(0, "start_time", start_time, True)
                X_filtered.insert(1, "mass", mass, True)
            else:
                mass = dataset[key]['X_mass_all']
                X_filtered.insert(0, "mass", mass, True)

            # Save dataframes
            X_filtered.to_csv("{}{}_X.csv".format(save_dir, key), index=True, header=True)
            y.to_csv("{}{}_y.csv".format(save_dir, key), index=True, header=True)
        else:
            try:
                uid_change = timeseries_temp['id'].iloc[-1]
                uid_last.append(uid_change)
                timeseries['id'] = timeseries['id'] + uid_change + 1
                timeseries_temp = timeseries_temp.append(timeseries)
                y_temp = y_temp.append(y, ignore_index=True)
            except NameError:
                # *_temp DataFrames do not exist yet
                timeseries_temp = timeseries
                y_temp = y

    if all:
        print('Using all data to extract relevant features')
        # First remove any NaN values in y; these should only be at the end
        print('Extracting all features')
        if est_events:
            X = extract_features(timeseries_temp, column_id="id", column_sort="time",
                                 default_fc_parameters=ComprehensiveFCParameters(),
                                 impute_function=impute)
            y = y_temp

            # Remove NaN indices from X and y
            remove_idx = pd.isnull(y.to_numpy()).nonzero()[0]
            y = y.drop(remove_idx)
            X = X.drop(remove_idx)

            print('Selecting relevant features')
            X_filtered = select_features(X, y)
        else:
            X_filtered = extract_relevant_features(
                timeseries_temp, y_temp, column_id="id", column_sort="time",
                default_fc_parameters=ComprehensiveFCParameters())
            # The per-trial saving below expects y and remove_idx to exist;
            # nothing is dropped in this branch.
            y = y_temp
            remove_idx = np.array([], dtype=int)

        X_filtered.to_csv("{}features.csv".format(save_dir), header=True)

        # Now save individual datasets
        # Reload DataFrame
        X_features = pd.read_csv("{}features.csv".format(save_dir), index_col=0)

        # Index values
        names = X_features.index.values

        # Saving individual trials
        print('Saving features for each trial')
        start = 0
        i = 0
        for key in dataset.keys():
            try:
                end_temp = uid_last[i]  # Name of the row
            except IndexError:  # Last key
                end_temp = X_features.iloc[-1].name
            end = end_temp

            # Find the new end index accounting for removed values
            removed = True
            while removed:
                if end in remove_idx:
                    end -= 1
                else:
                    removed = False

            # end = the name of the row (NOT the index) which is the last in the trial
            end_idx = np.where(names == end)[0][0]
            X_save = X_features.iloc[start:end_idx + 1]
            X_save = X_save.reset_index(drop=True)
            y_save = y.iloc[start:end_idx + 1]
            y_save = y_save.reset_index(drop=True)
            start = end_idx + 1
            i += 1

            # Add start_time and mass columns to the dataframe
            if est_events:
                start_time = dataset[key]['X_starting_time']
                mass = dataset[key]['X_mass_sample']
                # Remove those dropped due to NaN's
                start_time_new = start_time[:len(X_save)]
                mass_new = mass[:len(X_save)]
                X_save.insert(0, "start_time", start_time_new, True)
                X_save.insert(1, "mass", mass_new, True)
            else:
                mass = dataset[key]['X_mass_all']
                # Remove those dropped due to NaN's (should be zero for GRF estimation)
                mass_new = mass[:len(X_save)]
                X_save.insert(0, "mass", mass_new, True)

            # Save
            X_save.to_csv("{}{}_X.csv".format(save_dir, key), index=True, header=True)
            y_save.to_csv("{}{}_y.csv".format(save_dir, key), index=True, header=True)

    return

    tstart.append(tstart_tmp)

# Featurize using tsfresh
feat702 = []
for i in range(len(eta_g702)):
    data = {'id': run_id[i], 'time': times[i], 'eta': eta_g702[i]}
    feat_temp = extract_features(pd.DataFrame(data), column_id='id', column_sort='time',
                                 default_fc_parameters=ComprehensiveFCParameters(),
                                 impute_function=impute)
    # Drop constant features
    feat702.append(feat_temp.loc[:, feat_temp.apply(pd.Series.nunique) != 1])

# Extract feature settings for model prediction
params_30min = settings.from_columns(feat702[0])
params_60min = settings.from_columns(feat702[1])

# Create targets for train/test
g901max = tsr.max_eta(eta, 901, runnos)
g911max = tsr.max_eta(eta, 911, runnos)

# Specify the model
rmodel = RandomForestRegressor(n_estimators=100)

# Gauge 901
pred_901, trp_901, target_901, evs_901, scalers_901, models_901 = tsr.train_test(
    feat702, g901max, index_train, index_test, sc, 'r', rmodel, True)

# Gauge 911
pred_911, trp_911, target_911, evs_911, scalers_911, models_911 = tsr.train_test(
    feat702, g911max, index_train, index_test, sc, 'r', rmodel, True)

def execute_training(self, config):
    """
    Run training based on config.

    :param config: dict
    """
    # 1. Init pipeline
    print('--------------------INIT PIPELINE--------------------')
    dao, preprocessor, extractor, model_factory = self.init_pipeline(config)

    # 2. Load data
    print('--------------------LOAD DATA------------------------')
    labels, data = dao.bulk_read_data(
        file_path=[config['data_set_path'], config['data_labels_path']],
        identifiers=config['data_set_trips'],
        column_names=[config['data_set_column_names'], config['data_label_column_names']],
        use_columns=[config['data_set_columns'], config['data_label_columns']],
        check_distribution=True)

    # 3. Preprocessing
    print('--------------------PRE PROCESSING--------------------')
    data_train, mean_train, std_train, data_test, data_valid = \
        preprocessor.training_split_process(data=data, config=config, labels=labels)

    # 4. Feature extraction
    print('--------------------FEATURE EXTRACTION------------------')
    X_train = None
    X_test = None
    y_train = None
    y_test = None

    if config['feature_eng_extractor_type'] == "motif":
        X_train = extractor.extract_select_training_features(
            data_train,
            [config['feature_eng_mp_extractor_radii'], config['feature_eng_mp_extractor_lengths']])
        X_test = extractor.extract_select_training_features(
            data_test,
            [config['feature_eng_mp_extractor_radii'], config['feature_eng_mp_extractor_lengths']])

    segment_length = config['feature_eng_baseline_extractor_segement_len']

    if config['feature_eng_extractor_type'] == "ts-fresh":
        # TODO: migrate the preparation for extraction to extract_select_training_features,
        # make the label column name configurable
        data_train = preprocessor.encode_categorical_features(
            data=data_train, mode='custom_function', columns=['road_label'],
            encoding_function=lambda x: (x > 2.0).astype(int))  # 0 city, 1 countryside
        data_test = preprocessor.encode_categorical_features(
            data=data_test, mode='custom_function', columns=['road_label'],
            encoding_function=lambda x: (x > 2.0).astype(int))  # 0 city, 1 countryside

        # Find segments with homogeneous labeling
        split = lambda df, chunk_size: numpy.array_split(df, len(df) // chunk_size + 1, axis=0)
        segments_train = split(data_train, segment_length)
        segments_test = split(data_test, segment_length)
        segments_train_homogeneous, segments_test_homogeneous = [], []
        for segment in segments_train:
            # TODO: homogeneous-length check removed; note that in the paper
            if segment.road_label.nunique() == 1:  # and segment.shape[0] == segment_length
                segments_train_homogeneous.append(segment)
        for segment in segments_test:
            if segment.road_label.nunique() == 1:  # and segment.shape[0] == segment_length
                segments_test_homogeneous.append(segment)
        data_train = pandas.concat(segments_train_homogeneous, axis=0)
        data_test = pandas.concat(segments_test_homogeneous, axis=0)

        # Generate id column
        train_id = [None] * data_train.index.size
        id = 0
        for i in range(0, data_train.index.size, segment_length):
            train_id[i:i + segment_length] = [id] * segment_length
            id += 1
        train_id = train_id[:data_train.index.size]
        data_train['id'] = train_id

        test_id = [None] * data_test.index.size
        id = 0
        for i in range(0, data_test.index.size, segment_length):
            test_id[i:i + segment_length] = [id] * segment_length
            id += 1
        test_id = test_id[:data_test.index.size]
        data_test['id'] = test_id

        y_train = data_train[['road_label', 'id']].reset_index(drop=True)
        # Majority label in segment
        y_train = y_train.groupby(y_train.index // segment_length).agg(
            lambda x: x.value_counts().index[0])
        X_train = data_train[['acceleration_abs', 'id']].reset_index(drop=True)
        y_test = data_test[['road_label', 'id']].reset_index(drop=True)
        y_test = y_test.groupby(y_test.index // segment_length).agg(
            lambda x: x.value_counts().index[0])
        X_test = data_test[['acceleration_abs', 'id']].reset_index(drop=True)

        # Extract training features
        X_train = extractor.extract_select_training_features(
            X_train,
            args=['id', config['hw_num_processors'], None, y_train['road_label'],
                  config['feature_eng_baseline_extractor_fdr']])

        # Get feature map for validation and training set
        kind_to_fc_parameters = from_columns(X_train)
        X_test = extractor.extract_select_inference_features(
            X_test,
            args=['id', config['hw_num_processors'], None, kind_to_fc_parameters])

        # Rename the label series to 0, as required for further processing. TODO: unify naming!
        X_train = ['placeholder',
                   [X_train, y_train['road_label'].rename(0), 'N/A', 'N/A', 'N/A']]
        X_test = ['placeholder',
                  [X_test, y_test['road_label'].rename(0), 'N/A', 'N/A', 'N/A']]

    if X_train is None or X_test is None:
        pass  # TODO: raise error

    # 5. Find optimal classifier for given training set
    print('--------------------TRAINING PHASE----------------------')
    print(X_test[0][0])
    print(X_test[0][1])
    clf, score, conf, X_train, motif_len, motif_radius, motif_count, run_summary = \
        model_factory.find_optimal_model(
            config['feature_eng_extractor_type'],  # TODO: remove, deprecated; config is handed over anyway
            config,
            X_train,
            X_test)
    if clf is None or score is None or conf is None:
        pass  # TODO: raise error

    # 6. Prepare validation
    print('--------------------PREPARE VALIDATION-------------------')
    X_valid, y_valid, kind_to_fc_parameters = None, None, None
    if config['feature_eng_extractor_type'] == "ts-fresh":
        data_valid = preprocessor.encode_categorical_features(
            data=data_valid, mode='custom_function', columns=['road_label'],
            encoding_function=lambda x: (x > 2.0).astype(int))  # 0 city, 1 countryside

        # Segment validation data into pieces; find segments with homogeneous labeling
        split = lambda df, chunk_size: numpy.array_split(df, len(df) // chunk_size + 1, axis=0)
        segments_valid = split(data_valid, segment_length)
        segments_valid_homogeneous = []
        for segment in segments_valid:
            # TODO: homogeneous-length check removed; note that in the paper
            if segment.road_label.nunique() == 1:  # and segment.shape[0] == segment_length
                segments_valid_homogeneous.append(segment)
        data_valid = pandas.concat(segments_valid_homogeneous, axis=0)

        # Generate id column
        valid_id = [None] * data_valid.index.size
        id = 0
        for i in range(0, data_valid.index.size, segment_length):
            valid_id[i:i + segment_length] = [id] * segment_length
            id += 1
        valid_id = valid_id[:data_valid.index.size]
        data_valid['id'] = valid_id

        y_valid = data_valid[['road_label', 'id']].reset_index(drop=True)
        y_valid = y_valid.groupby(y_valid.index // segment_length).agg(
            lambda x: x.value_counts().index[0])
        X_valid = data_valid[['acceleration_abs', 'id']].reset_index(drop=True)

        # Get feature map for validation and training set
        kind_to_fc_parameters = from_columns(X_train)
        run_summary['ts_fresh_relevant_features'] = kind_to_fc_parameters
        X_valid = extractor.extract_select_inference_features(
            X_valid,
            args=['id', config['hw_num_processors'], None, kind_to_fc_parameters])
        y_valid = y_valid['road_label'].rename(0)

    if config['feature_eng_extractor_type'] == "motif":
        X_valid, y_valid = extractor.extract_select_inference_features(
            data_valid, [motif_radius, motif_len, config['hw_num_processors']], True)
    # 7. Run validation
    print('--------------------VALIDATION---------------------------')
    print(X_valid)
    print(y_valid)
    X_valid, y_valid = model_factory.pre_clustering(X_valid, y_valid, None)
    if config['feature_eng_extractor_type'] == 'motif':
        # TODO: make configurable
        print("Validation y label 1: {}".format(list(y_valid[0]).count(1.0) / len(y_valid)))
        print("Validation y label 3: {}".format(list(y_valid[0]).count(3.0) / len(y_valid)))
        run_summary['valid_lbl_1'] = list(y_valid[0]).count(1.0) / len(y_valid)
        run_summary['valid_lbl_3'] = list(y_valid[0]).count(3.0) / len(y_valid)
    elif config['feature_eng_extractor_type'] == 'ts-fresh':
        # TODO: make configurable
        print("Validation y label 1: {}".format(list(y_valid).count(1.0) / len(y_valid)))
        print("Validation y label 3: {}".format(list(y_valid).count(0.0) / len(y_valid)))
        run_summary['valid_lbl_1'] = list(y_valid).count(0.0) / len(y_valid)
        run_summary['valid_lbl_3'] = list(y_valid).count(1.0) / len(y_valid)

    score = clf.score(X_valid, y_valid)
    print(score)
    y_pred = clf.predict(X_valid)
    conf = confusion_matrix(y_valid, y_pred, labels=None, sample_weight=None)
    print(conf)
    report = str(classification_report(y_valid, y_pred))
    best_params = clf.best_params_
    print(report)

    # 8. Store results
    print('--------------------STORE RESULTS------------------------')
    # TODO: delegate to DAO, make storing configurable
    results_tag = "{0}_{1}_{2}_{3}".format(
        config['feature_eng_extractor_type'],
        config['pre_proc_resample_freq'],
        config['feature_eng_dim_reduction_type'],
        config['feature_eng_baseline_extractor_segement_len'])
    pandas.DataFrame(X_train).to_pickle("X_train_{}.pkl".format(results_tag))
    pandas.DataFrame(X_test).to_pickle("X_test_{}.pkl".format(results_tag))
    pandas.DataFrame(X_valid).to_pickle("X_valid_{}.pkl".format(results_tag))
    pandas.DataFrame(y_train).to_pickle("y_train_{}.pkl".format(results_tag))
    pandas.DataFrame(y_test).to_pickle("y_test_{}.pkl".format(results_tag))
    pandas.DataFrame(y_valid).to_pickle("y_valid_{}.pkl".format(results_tag))
    with open("./clf_{}.pkl".format(results_tag), 'wb') as clf_file:
        pickle.dump(clf, clf_file)
    with open("./clf.pkl", 'wb') as clf_file:
        pickle.dump(clf, clf_file)

    meta_data = None
    if config['feature_eng_extractor_type'] == "motif":
        meta_data = {
            'mean_train': mean_train,
            'std_train': std_train,
            'motif_len': motif_len,
            'motif_radius': motif_radius,
            'motif_count': motif_count,
            'clf_score': score,
            'clf_conf': conf,
            'clf_report': report,
            'clf_best_params': best_params
        }
    if config['feature_eng_extractor_type'] == "ts-fresh":
        meta_data = {
            'mean_train': mean_train,
            'std_train': std_train,
            'feature_mapping': kind_to_fc_parameters,
            'clf_score': score,
            'clf_conf': conf,
            'clf_report': report,
            'clf_best_params': best_params
        }
    run_summary['global_best_meta_data'] = meta_data
    run_summary['config'] = config
    with open("./meta_data_{}.pkl".format(results_tag), 'wb') as meta_file:
        pickle.dump(meta_data, meta_file)
    with open("./meta_data.pkl", 'wb') as meta_file:
        pickle.dump(meta_data, meta_file)
    with open("./run_summary_{}.pkl".format(results_tag), 'wb') as run_summary_file:
        pickle.dump(run_summary, run_summary_file)

    print('--------------------PRINT SUMMARY------------------------')
    pprint.pprint(run_summary)

    default_fc_parameters=extraction_settings,
    impute_function=impute,
    n_jobs=3)

print('selecting training features')
print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
X_filtered = select_features(X, train_y_train, n_jobs=3)
print('X_filtered shape=')
print(X_filtered.shape)

print('extracting selected features for ALL training data')
X_train_filtered = extract_features(
    X0_train, column_id='id', column_sort='time',
    kind_to_fc_parameters=settings.from_columns(X_filtered.columns),
    impute_function=impute, n_jobs=3)
print('X_train_filtered shape=')
print(X_train_filtered.shape)

print('extracting selected features for test data')
X_test_filtered = extract_features(
    X0_test, column_id='id', column_sort='time',
    kind_to_fc_parameters=settings.from_columns(X_filtered.columns),
    impute_function=impute, n_jobs=3)

#####################################################################################
### Process data and labels to reduce dimensions
#####################################################################################

# Understand relevant features and update X accordingly
X_filtered_features = select_features(x_input, y_for_filtering)

# If we want different features for conditions prediction:
# y_for_filtering = ...
# X_filtered_features_st = select_features(x_input, y_for_filtering)

# Significant columns to use in tested data
significant_features = X_filtered_features.columns
num_significant_features = len(significant_features)
to_filter_by = from_columns(X_filtered_features)  # dictionary

# Save the dictionary into a file named saved_features_no_PCA.p
pickle.dump(to_filter_by, open("saved_features_no_PCA.p", "wb"))

# Load the dictionary back from the pickle file:
# features_loaded = pickle.load(open("saved_features.p", "rb"))

###################################################
try_pca_n_component = 15

# Create PCA
pca_n_component = min(try_pca_n_component, num_significant_features)
pca_allData = PCAForPandas(n_components=pca_n_component)
X_pca = pca_allData.fit_transform(X_filtered_features)  # YONIT

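# A follow-up sketch showing how the pickled settings above could be reused on
# unseen data so that only the previously selected features are computed.
# x_new_long and its columns are hypothetical; the kinds in the new data must
# match the keys of the saved dictionary.
import pickle
import pandas as pd
from tsfresh import extract_features

to_filter_by = pickle.load(open("saved_features_no_PCA.p", "rb"))
x_new_long = pd.DataFrame({"id": [1, 1, 1], "time": [0, 1, 2], "value": [0.1, 0.2, 0.3]})
X_new = extract_features(x_new_long, column_id="id", column_sort="time",
                         kind_to_fc_parameters=to_filter_by)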