def set_scalers(self, df):
    """Calibrates real-valued and categorical scalers from training data.

    Args:
      df: Data frame used to fit the scalers.
    """
    print('Setting scalers with training data...')

    column_definitions = self.get_column_definition()
    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
    target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
                                                       column_definitions)

    # Keep the set of entity identifiers in case it is needed downstream.
    self.identifiers = list(df[id_column].unique())

    # Fit one StandardScaler over all real-valued inputs, plus a separate
    # one for the target so predictions can be un-scaled later.
    real_inputs = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, column_definitions,
        {InputTypes.ID, InputTypes.TIME})
    self._real_scalers = sklearn.preprocessing.StandardScaler().fit(
        df[real_inputs].values)
    self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
        df[[target_column]].values)

    # Label-encode categorical columns; cast to str first so mixed
    # integer/string columns encode consistently.
    categorical_inputs = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, column_definitions,
        {InputTypes.ID, InputTypes.TIME})
    encoders = {}
    cardinalities = []
    for col in categorical_inputs:
        as_str = df[col].apply(str)
        encoders[col] = sklearn.preprocessing.LabelEncoder().fit(as_str.values)
        cardinalities.append(as_str.nunique())

    self._cat_scalers = encoders
    self._num_classes_per_cat_input = cardinalities
def transform_inputs(self, df):
    """Performs feature transformations (preprocessing and normalisation).

    Args:
      df: Data frame to transform.

    Returns:
      Transformed data frame.

    Raises:
      ValueError: If scalers have not been fitted via set_scalers.
    """
    output = df.copy()

    if self._real_scalers is None and self._cat_scalers is None:
        raise ValueError('Scalers have not been set!')

    column_definitions = self.get_column_definition()

    real_inputs = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, column_definitions,
        {InputTypes.ID, InputTypes.TIME})
    categorical_inputs = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    # Normalise real-valued columns with the fitted scaler.
    output[real_inputs] = self._real_scalers.transform(df[real_inputs].values)

    # Encode categorical columns, stringified to match how they were fitted.
    for col in categorical_inputs:
        string_df = df[col].apply(str)
        output[col] = self._cat_scalers[col].transform(string_df)

    # Fill any remaining missing values with 0.
    # NOTE: removed leftover debug statement `print(output)` that dumped the
    # entire transformed frame to stdout on every call.
    output = output.fillna(0)

    return output
def transform_inputs(self, df):
    """Applies fitted scalers/encoders to produce model-ready features.

    This includes both feature engineering, preprocessing and normalisation.

    Args:
      df: Data frame to transform.

    Returns:
      Transformed data frame.
    """
    if self._real_scalers is None and self._cat_scalers is None:
        raise ValueError('Scalers have not been set!')

    output = df.copy()
    defs = self.get_column_definition()
    excluded = {InputTypes.ID, InputTypes.TIME}

    real_cols = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, defs, excluded)
    cat_cols = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, defs, excluded)

    # Normalise all real-valued columns in a single transform call.
    output[real_cols] = self._real_scalers.transform(df[real_cols].values)

    # Encode categoricals; values are stringified exactly as during fitting.
    for name in cat_cols:
        output[name] = self._cat_scalers[name].transform(df[name].apply(str))

    return output
def set_scalers(self, df):
    """Calibrates per-entity scalers using the data supplied.

    Args:
      df: Data to use to calibrate scalers.
    """
    print('Setting scalers with training data...')

    defs = self.get_column_definition()
    id_col = utils.get_single_col_by_input_type(InputTypes.ID, defs)
    target_col = utils.get_single_col_by_input_type(InputTypes.TARGET, defs)

    real_cols = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, defs, {InputTypes.ID, InputTypes.TIME})

    # One scaler pair per entity; entities with fewer rows than the model
    # window are skipped entirely.
    self._real_scalers = {}
    self._target_scaler = {}
    kept_ids = []
    for entity, chunk in df.groupby(id_col):
        if len(chunk) < self._time_steps:
            continue  # trajectory too short to use
        self._real_scalers[entity] = \
            sklearn.preprocessing.StandardScaler().fit(chunk[real_cols].values)
        self._target_scaler[entity] = \
            sklearn.preprocessing.StandardScaler().fit(
                chunk[[target_col]].values)
        kept_ids.append(entity)

    # Categorical encoders are shared across all entities.
    cat_cols = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, defs, {InputTypes.ID, InputTypes.TIME})
    encoders = {}
    n_classes = []
    for name in cat_cols:
        # Cast to str so mixed integer/string columns encode consistently.
        as_str = df[name].apply(str)
        encoders[name] = sklearn.preprocessing.LabelEncoder().fit(as_str.values)
        n_classes.append(as_str.nunique())

    self._cat_scalers = encoders
    self._num_classes_per_cat_input = n_classes

    # Record which identifiers were retained, in case required later.
    self.identifiers = kept_ids
def set_scalers(self, df, set_real=True):
    """Calibrates scalers using the data supplied.

    Args:
      df: Data to use to calibrate scalers.
      set_real: If True, fits real-valued scalers; if False, fits
        categorical encoders. Real-valued scalers must be fitted first,
        since the categorical pass filters rows by the identifiers seen
        during the real-valued pass.
    """
    print('Setting scalers with training data...')

    column_definitions = self.get_column_definition()
    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
    target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
                                                       column_definitions)

    if set_real:
        # Extract identifiers in case required later.
        self.identifiers = list(df[id_column].unique())

        real_inputs = utils.extract_cols_from_data_type(
            DataTypes.REAL_VALUED, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        # Fit global scalers over all entities. (Removed commented-out
        # per-column (mean, std) dead code and a redundant dict assignment
        # that was immediately overwritten.)
        data = df[real_inputs].values
        self._real_scalers = sklearn.preprocessing.StandardScaler().fit(data)
        self._target_scaler = sklearn.preprocessing.StandardScaler().fit(
            df[[target_column]].values)
    else:
        if self.identifiers is None:
            raise ValueError('Scale real-valued inputs first!')

        categorical_inputs = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        # Only rows whose 'ID' was present during the real-valued pass
        # contribute to the label encoders.
        id_set = set(self.identifiers)
        valid_idx = df['ID'].apply(lambda x: x in id_set)

        categorical_scalers = {}
        num_classes = []
        for col in categorical_inputs:
            # Set all to str so that we don't have mixed integer/string
            # columns.
            srs = df[col].apply(str).loc[valid_idx]
            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
                srs.values)
            num_classes.append(srs.nunique())

        # Set categorical scaler outputs.
        self._cat_scalers = categorical_scalers
        self._num_classes_per_cat_input = num_classes
def set_scalers(self, df, enable_scaling=False):
    """Calibrates scalers using the data supplied.

    Args:
      df: Data to use to calibrate scalers.
      enable_scaling: If True, real-valued inputs and the target are
        standardised (centred and scaled to unit variance). If False, the
        fitted StandardScalers have with_mean/with_std disabled, so the
        transform leaves values unchanged.
    """
    print('Setting scalers with training data...')

    column_definitions = self.get_column_definition()
    id_column = utils.get_single_col_by_input_type(InputTypes.ID,
                                                   column_definitions)
    target_column = utils.get_single_col_by_input_type(InputTypes.TARGET,
                                                       column_definitions)

    # Extract identifiers in case required.
    self.identifiers = list(df[id_column].unique())

    # Format real scalers.
    real_inputs = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    data = df[real_inputs].values
    print("Scaling Enabled:" + str(enable_scaling))
    self._real_scalers = sklearn.preprocessing.StandardScaler(
        with_mean=enable_scaling, with_std=enable_scaling).fit(data)
    # Target scaler is kept separately so predictions can be un-scaled.
    self._target_scaler = sklearn.preprocessing.StandardScaler(
        with_mean=enable_scaling,
        with_std=enable_scaling).fit(df[[target_column]].values)

    # Format categorical scalers.
    categorical_inputs = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, column_definitions,
        {InputTypes.ID, InputTypes.TIME})

    categorical_scalers = {}
    num_classes = []
    for col in categorical_inputs:
        # Set all to str so that we don't have mixed integer/string columns.
        srs = df[col].apply(str)
        categorical_scalers[col] = sklearn.preprocessing.LabelEncoder().fit(
            srs.values)
        num_classes.append(srs.nunique())

    # Set categorical scaler outputs.
    self._cat_scalers = categorical_scalers
    self._num_classes_per_cat_input = num_classes
def set_scalers(self, df, set_real=True):
    """Fits either real-valued or categorical scalers from the data.

    Args:
      df: Training data to calibrate against.
      set_real: Fit real-valued scalers when True, otherwise categorical
        encoders (real-valued pass must run first).
    """
    print('Setting scalers with training data...')

    defs = self.get_column_definition()
    id_col = utils.get_single_col_by_input_type(InputTypes.ID, defs)
    target_col = utils.get_single_col_by_input_type(InputTypes.TARGET, defs)

    if set_real:
        self.identifiers = list(df[id_col].unique())

        # Per-column (mean, std) pairs; 'oil' was removed from this list
        # (original comment, translated from Korean).
        self._real_scalers = {
            name: (df[name].mean(), df[name].std())
            for name in ['transactions', 'log_sales']
        }
        self._target_scaler = (df[target_col].mean(), df[target_col].std())
    else:
        if self.identifiers is None:
            raise ValueError('Scale real-valued inputs first')

        cat_cols = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, defs, {InputTypes.ID, InputTypes.TIME})

        # Restrict encoder fitting to trajectories seen in the real pass.
        known = set(self.identifiers)
        keep = df['traj_id'].apply(lambda x: x in known)

        encoders = {}
        counts = []
        for name in cat_cols:
            as_str = df[name].apply(str).loc[keep]
            encoders[name] = sklearn.preprocessing.LabelEncoder().fit(
                as_str.values)
            counts.append(as_str.nunique())

        self._cat_scalers = encoders
        self._num_classes_per_cat_input = counts
def transform_inputs(self, df: DataFrame):
    """Performs feature transformations.

    This includes both feature engineering, preprocessing and normalisation.

    Args:
      df: Data frame to transform.

    Returns:
      Transformed data frame.
    """
    if self._real_scalers is None and self._cat_scalers is None:
        raise ValueError('Scalers have not been set!')

    output = df.copy()
    defs = self.get_column_definition()
    cat_cols = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, defs, {InputTypes.ID, InputTypes.TIME})

    # Z-score the hard-coded real-valued columns using stored (mean, std).
    for name in ['log_sales', 'oil', 'transactions']:
        mu, sigma = self._real_scalers[name]
        output[name] = (df[name] - mu) / sigma
        if name == 'log_sales':
            # Zero is the post-scaling mean, i.e. mean imputation.
            output[name] = output[name].fillna(0.)

    # Label-encode categoricals using the encoders fitted in set_scalers.
    for name in cat_cols:
        output[name] = self._cat_scalers[name].transform(df[name].apply(str))

    return output
def transform_inputs(self, df):
    """Performs feature transformations.

    This includes both feature engineering, preprocessing and normalisation.

    Args:
      df: Data frame to transform.

    Returns:
      Transformed data frame.
    """
    if self._real_scalers is None and self._cat_scalers is None:
        raise ValueError('Scalers have not been set!')

    defs = self.get_column_definition()
    id_col = utils.get_single_col_by_input_type(InputTypes.ID, defs)
    excluded = {InputTypes.ID, InputTypes.TIME}
    real_cols = utils.extract_cols_from_data_type(
        DataTypes.REAL_VALUED, defs, excluded)
    cat_cols = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, defs, excluded)

    # Normalise real-valued columns with the per-entity scaler, dropping
    # trajectories shorter than the model window.
    pieces = []
    for entity, chunk in df.groupby(id_col):
        if len(chunk) < self._time_steps:
            continue
        piece = chunk.copy()
        piece[real_cols] = self._real_scalers[entity].transform(
            piece[real_cols].values)
        pieces.append(piece)
    output = pd.concat(pieces, axis=0)

    # NOTE(review): the encoders transform the FULL input frame here, which
    # assumes no rows were dropped by the length filter above — verify.
    for name in cat_cols:
        output[name] = self._cat_scalers[name].transform(df[name].apply(str))

    return output
def set_scalers(self, df, set_real=True):
    """Calibrates scalers using the data supplied.

    Label encoding is applied to the entire dataset (i.e. including test),
    so that unseen labels can be handled at run-time.

    Args:
      df: Data to use to calibrate scalers.
      set_real: Whether to fit set real-valued or categorical scalers
    """
    print('Setting scalers with training data...')

    defs = self.get_column_definition()
    id_col = utils.get_single_col_by_input_type(InputTypes.ID, defs)
    target_col = utils.get_single_col_by_input_type(InputTypes.TARGET, defs)

    if set_real:
        # Keep identifiers so the categorical pass can filter on them.
        self.identifiers = list(df[id_col].unique())

        # Simple per-column (mean, std) normalisation statistics.
        self._real_scalers = {
            name: (df[name].mean(), df[name].std())
            for name in ['oil', 'transactions', 'log_sales']
        }
        self._target_scaler = (df[target_col].mean(), df[target_col].std())
    else:
        if self.identifiers is None:
            raise ValueError('Scale real-valued inputs first!')

        cat_cols = utils.extract_cols_from_data_type(
            DataTypes.CATEGORICAL, defs, {InputTypes.ID, InputTypes.TIME})

        # Restrict encoder fitting to trajectories seen in the real pass.
        known = set(self.identifiers)
        keep = df['traj_id'].apply(lambda x: x in known)

        encoders = {}
        counts = []
        for name in cat_cols:
            # Stringify to avoid mixed integer/string columns.
            as_str = df[name].apply(str).loc[keep]
            encoders[name] = sklearn.preprocessing.LabelEncoder().fit(
                as_str.values)
            counts.append(as_str.nunique())

        self._cat_scalers = encoders
        self._num_classes_per_cat_input = counts
def transform_inputs(self, df):
    """Applies stored normalisation/encoding to produce model inputs.

    Args:
      df: Data frame to transform.

    Returns:
      Transformed data frame.
    """
    if self._real_scalers is None and self._cat_scalers is None:
        raise ValueError('Scalers have not been set')

    output = df.copy()
    defs = self.get_column_definition()
    cat_cols = utils.extract_cols_from_data_type(
        DataTypes.CATEGORICAL, defs, {InputTypes.ID, InputTypes.TIME})

    # Z-score the hard-coded real columns ('oil' excluded from this list).
    for name in ['log_sales', 'transactions']:
        mu, sigma = self._real_scalers[name]
        output[name] = (df[name] - mu) / sigma
        if name == 'log_sales':
            # Zero is the post-scaling mean, i.e. mean imputation.
            output[name] = output[name].fillna(0.)

    # Encode categoricals with the fitted label encoders.
    for name in cat_cols:
        output[name] = self._cat_scalers[name].transform(df[name].apply(str))

    return output