def __init__(self, data_df):
    """Wraps an in-memory data frame and precomputes column/index metadata.

    Args:
      data_df: Data frame to wrap. Its index is reset (the old index is
        dropped) and the result is stored on `self.data`.
    """
    super(ElectricityFormatter, self).__init__()

    # Attribute loading the data
    self.data = data_df.reset_index(drop=True)

    # Resolve the single ID / TIME / TARGET column names from the column
    # definition.
    self.id_col = get_single_col_by_input_type(InputTypes.ID,
                                               self._column_definition)
    self.time_col = get_single_col_by_input_type(InputTypes.TIME,
                                                 self._column_definition)
    self.target_col = get_single_col_by_input_type(InputTypes.TARGET,
                                                   self._column_definition)

    # Every defined column except the ID and TIME ones counts as an input.
    self.input_cols = [
        tup[0] for tup in self._column_definition
        if tup[2] not in {InputTypes.ID, InputTypes.TIME}
    ]
    self.col_mappings = {
        'identifier': [self.id_col],
        'time': [self.time_col],
        'outputs': [self.target_col],
        'inputs': self.input_cols
    }

    self.lookback = self.get_time_steps()
    self.num_encoder_steps = self.get_num_encoder_steps()

    self.data_index = self.get_index_filtering()
    # Mean number of rows per identifier.
    self.group_size = self.data.groupby(self.id_col).size().mean()
    # Keep only index entries whose `end_rel` is below the mean group size.
    # NOTE(review): assumes get_index_filtering() returns a data frame with
    # an `end_rel` column - confirm against its definition.
    self.data_index = self.data_index[
        self.data_index.end_rel < self.group_size].reset_index()
def set_scalers(self, df):
    """Fits the real-valued, target and categorical scalers on `df`.

    One StandardScaler pair (inputs, target) is fitted per identifier, but
    only for identifiers with at least `self._time_steps` rows. One
    LabelEncoder is fitted per categorical column over the whole frame.

    Args:
      df: Data to use to calibrate scalers.
    """
    print('Setting scalers with training data...')

    definitions = self.get_column_definition()
    entity_col = get_single_col_by_input_type(InputTypes.ID, definitions)
    label_col = get_single_col_by_input_type(InputTypes.TARGET, definitions)
    # ID and TIME columns are never scaled.
    skipped_types = {InputTypes.ID, InputTypes.TIME}

    # --- Real-valued columns: one scaler pair per entity ---
    numeric_cols = extract_cols_from_data_type(
        DataTypes.REAL_VALUED, definitions, skipped_types)
    print('Real Scalers')

    # The caches are reset on the instance before any fitting happens.
    feature_scalers = self._real_scalers = {}
    label_scalers = self._target_scaler = {}
    kept_ids = []

    for entity, rows in df.groupby(entity_col):
        print('{} - {}'.format(entity, len(rows)))
        if len(rows) < self._time_steps:
            # Too few rows for this entity: no scaler is fitted for it.
            continue

        feature_values = rows[numeric_cols].values
        label_values = rows[[label_col]].values
        feature_scalers[entity] = (
            sklearn.preprocessing.StandardScaler().fit(feature_values))
        label_scalers[entity] = (
            sklearn.preprocessing.StandardScaler().fit(label_values))
        kept_ids.append(entity)

    # --- Categorical columns: one label encoder per column ---
    encoded_cols = extract_cols_from_data_type(
        DataTypes.CATEGORICAL, definitions, skipped_types)
    print('Categorical Scalers')

    encoders = {}
    class_counts = []
    for name in encoded_cols:
        print(name)
        # Values are used as stored; no cast to str is applied here.
        column = df[name]
        encoders[name] = sklearn.preprocessing.LabelEncoder().fit(
            column.values)
        class_counts.append(column.nunique())

    # Publish the categorical results.
    self._cat_scalers = encoders
    self._num_classes_per_cat_input = class_counts

    # Identifiers that actually received scalers, in case required.
    self.identifiers = kept_ids
def __init__(self):
    """Initialises formatter.

    Scaler-related state starts out as None. Window lengths are read from
    `get_fixed_params()`, and the ID / target / real-valued / categorical
    column names are derived from `get_column_definition()`.
    """
    # Nothing has been fitted yet.
    self._real_scalers = self._cat_scalers = self._target_scaler = None
    self.identifiers = self._num_classes_per_cat_input = None

    # Window lengths come from the fixed parameters.
    self._time_steps = self.get_fixed_params()['total_time_steps']
    self._num_encoder_steps = self.get_fixed_params()['num_encoder_steps']

    # Extract relevant columns
    definitions = self.get_column_definition()
    self._column_definitions = definitions

    self._id_col = get_single_col_by_input_type(InputTypes.ID, definitions)
    self._target_column = get_single_col_by_input_type(
        InputTypes.TARGET, definitions)

    # ID and TIME columns are excluded from both feature lists.
    non_feature_types = {InputTypes.ID, InputTypes.TIME}
    self._real_inputs = extract_cols_from_data_type(
        DataTypes.REAL_VALUED, definitions, non_feature_types)
    self._categorical_inputs = extract_cols_from_data_type(
        DataTypes.CATEGORICAL, definitions, non_feature_types)
def transform_inputs(self, df):
    """Performs feature transformations.

    Real-valued inputs are standardised per identifier with the scalers
    fitted earlier, and categorical inputs are label-encoded. Identifiers
    with fewer than `self._time_steps` rows are dropped from the result.

    Args:
      df: Data frame to transform.

    Returns:
      Transformed data frame. Rows are ordered group by group (as yielded
      by `df.groupby`), not necessarily in the order of `df`.

    Raises:
      ValueError: If neither the real nor the categorical scalers have
        been set, or if no identifier has enough rows to be kept.
    """
    print('Transforming the training data...')

    if self._real_scalers is None and self._cat_scalers is None:
        raise ValueError('Scalers have not been set!')

    # Extract relevant columns
    column_definitions = self.get_column_definition()
    id_col = get_single_col_by_input_type(InputTypes.ID, column_definitions)
    real_inputs = extract_cols_from_data_type(
        DataTypes.REAL_VALUED, column_definitions,
        {InputTypes.ID, InputTypes.TIME})
    categorical_inputs = extract_cols_from_data_type(
        DataTypes.CATEGORICAL, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    # Transform real inputs per entity
    df_list = []
    print('Real Features Transform')
    for identifier, sliced in df.groupby(id_col):
        print('{} - {}'.format(identifier, len(sliced)))

        # Filter out any trajectories that are too short
        if len(sliced) >= self._time_steps:
            sliced_copy = sliced.copy()
            sliced_copy[real_inputs] = self._real_scalers[
                identifier].transform(sliced_copy[real_inputs].values)
            df_list.append(sliced_copy)

    if not df_list:
        # pd.concat would raise a less helpful ValueError on an empty list.
        raise ValueError(
            'No identifier has at least {} rows; nothing to transform.'.format(
                self._time_steps))

    output = pd.concat(df_list, axis=0)

    print('Categorical Features Transform')
    # Format categorical inputs
    for col in categorical_inputs:
        print(col)
        # Encode the values already held in `output` rather than `df[col]`.
        # `output` is ordered by group and may be missing short groups, so
        # an array computed from `df[col]` can have the wrong length or
        # line up with the wrong rows when assigned positionally.
        output[col] = self._cat_scalers[col].transform(output[col])

    return output