def transform_inputs_outputs(self, ds: DataSet, **kwargs):
    """Transform data into inputs and outputs for a strategy.

    Splits the dataset into an input DataSet (one column per input
    variable, with categorical variables optionally replaced by their
    descriptor columns) and an output DataSet (objective columns only).

    Parameters
    ----------
    ds: `DataSet`
        Dataset with columns corresponding to the inputs and objectives
        of the domain.
    copy: bool, optional
        Copy the dataset internally. Defaults to True.
    transform_descriptors: bool, optional
        Transform the descriptors into continuous variables. Default True.

    Returns
    -------
    inputs, outputs
        Datasets with the input and output datasets

    Raises
    ------
    DomainError
        If a variable type is unsupported, an output variable is missing
        from the dataset, an output is categorical, or no output columns
        are found.
    """
    copy = kwargs.get("copy", True)
    transform_descriptors = kwargs.get("transform_descriptors", True)
    data_columns = ds.data_columns
    new_ds = ds.copy() if copy else ds

    # Determine input and output columns in dataset
    input_columns = []
    output_columns = []
    for variable in self.domain.input_variables:
        if isinstance(variable, CategoricalVariable) and transform_descriptors:
            # Add descriptors to the dataset
            var_descriptor_names = variable.ds.data_columns
            if all(
                    np.isin(var_descriptor_names,
                            new_ds.columns.levels[0].to_list())):
                # Descriptor columns already present: demote them to
                # metadata by rewriting the second-level codes.
                column_list_1 = new_ds.columns.levels[0].to_list()
                ix = [
                    column_list_1.index(d_name)
                    for d_name in var_descriptor_names
                ]
                column_codes_2 = list(new_ds.columns.codes[1])
                ix_code = [
                    np.where(new_ds.columns.codes[0] == tmp_ix)[0][0]
                    for tmp_ix in ix
                ]
                for ixc in ix_code:
                    column_codes_2[ixc] = 0
                # FIX: MultiIndex.set_codes(inplace=True) was removed in
                # pandas 2.0; assign the returned index back instead.
                new_ds.columns = new_ds.columns.set_codes(
                    column_codes_2, level=1)
            else:
                # Look up descriptor rows by the categorical level of
                # each experiment and join them onto the dataset.
                indices = new_ds[variable.name].values
                descriptors = variable.ds.loc[indices]
                descriptors.index = new_ds.index
                new_ds = new_ds.join(descriptors, how="inner")
                # Make the original categorical column a metadata column
                column_list_1 = new_ds.columns.levels[0].to_list()
                ix = column_list_1.index(variable.name)
                column_codes_2 = list(new_ds.columns.codes[1])
                ix_code = np.where(new_ds.columns.codes[0] == ix)[0][0]
                column_codes_2[ix_code] = 1
                new_ds.columns = new_ds.columns.set_codes(
                    column_codes_2, level=1)
            # Add descriptors data columns to inputs
            input_columns.extend(var_descriptor_names)
        elif isinstance(variable, Variable):
            input_columns.append(variable.name)
        else:
            raise DomainError(
                f"Variable {variable.name} is not a continuous or categorical variable."
            )

    for variable in self.domain.output_variables:
        if variable.name in data_columns and variable.is_objective:
            if isinstance(variable, CategoricalVariable):
                raise DomainError(
                    "Output variables cannot be categorical variables currently."
                )
            output_columns.append(variable.name)
            # Ensure continuous variables are floats.
            # FIX: np.float was removed in NumPy 1.24; use builtin float.
            new_ds[variable.name] = new_ds[variable.name].astype(float)
        else:
            raise DomainError(
                f"Variable {variable.name} is not in the dataset.")

    # FIX: output_columns is always a list (never None), so the original
    # `is None` check could never fire; test for emptiness instead.
    if not output_columns:
        raise DomainError(
            "No output columns in the domain. Add at least one output column for optimisation."
        )

    # Return the inputs and outputs as separate datasets
    return new_ds[input_columns].copy(), new_ds[output_columns].copy()
def transform_inputs_outputs(self, ds: DataSet, **kwargs):
    """Transform data into inputs and outputs for a strategy.

    Splits the dataset into an input DataSet (categoricals either
    one-hot encoded or replaced by descriptors, continuous variables
    optionally standardized) and an output DataSet (objective columns,
    optionally standardized). Standardization means/stds are stored on
    ``self.input_means``/``self.input_stds`` and
    ``self.output_means``/``self.output_stds`` keyed by variable name.

    Parameters
    ----------
    ds: `DataSet`
        Dataset with columns corresponding to the inputs and objectives
        of the domain.
    copy: bool, optional
        Copy the dataset internally. Defaults to True.
    standardize_inputs : bool, optional
        Standardize all input continuous variables. Default is False.
    standardize_outputs : bool, optional
        Standardize all output continuous variables. Default is False.
    categorical_method : str, optional
        The method for transforming categorical variables. Either
        "one-hot" or "descriptors". Descriptors must be included in the
        categorical variables for the latter.

    Returns
    -------
    inputs, outputs
        Datasets with the input and output datasets

    Raises
    ------
    DomainError
        If a variable type is unsupported, an output variable is missing
        from the dataset, an output is categorical, or no output columns
        are found.
    """
    copy = kwargs.get("copy", True)
    categorical_method = kwargs.get("categorical_method", "one-hot")
    standardize_inputs = kwargs.get("standardize_inputs", False)
    standardize_outputs = kwargs.get("standardize_outputs", False)
    data_columns = ds.data_columns
    new_ds = ds.copy() if copy else ds

    # Determine input and output columns in dataset
    input_columns = []
    output_columns = []
    self.input_means, self.input_stds = {}, {}
    self.output_means, self.output_stds = {}, {}
    for variable in self.domain.input_variables:
        if (isinstance(variable, CategoricalVariable)
                and categorical_method == "descriptors"):
            # Add descriptors to the dataset
            var_descriptor_names = variable.ds.data_columns
            if all(
                    np.isin(var_descriptor_names,
                            new_ds.columns.levels[0].to_list())):
                # Descriptor columns already present: demote them to
                # metadata by rewriting the second-level codes.
                column_list_1 = new_ds.columns.levels[0].to_list()
                ix = [
                    column_list_1.index(d_name)
                    for d_name in var_descriptor_names
                ]
                column_codes_2 = list(new_ds.columns.codes[1])
                ix_code = [
                    np.where(new_ds.columns.codes[0] == tmp_ix)[0][0]
                    for tmp_ix in ix
                ]
                for ixc in ix_code:
                    column_codes_2[ixc] = 0
                # FIX: MultiIndex.set_codes(inplace=True) was removed in
                # pandas 2.0; assign the returned index back instead.
                new_ds.columns = new_ds.columns.set_codes(
                    column_codes_2, level=1)
            else:
                # Look up descriptor rows by the categorical level of
                # each experiment and join them onto the dataset.
                indices = new_ds[variable.name].values
                descriptors = variable.ds.loc[indices]
                descriptors.index = new_ds.index
                new_ds = new_ds.join(descriptors, how="inner")
                # Make the original categorical column a metadata column
                column_list_1 = new_ds.columns.levels[0].to_list()
                ix = column_list_1.index(variable.name)
                column_codes_2 = list(new_ds.columns.codes[1])
                ix_code = np.where(new_ds.columns.codes[0] == ix)[0][0]
                column_codes_2[ix_code] = 1
                new_ds.columns = new_ds.columns.set_codes(
                    column_codes_2, level=1)
            # Add descriptors data columns to inputs
            input_columns.extend(var_descriptor_names)
        elif (isinstance(variable, CategoricalVariable)
              and categorical_method == "one-hot"):
            # Create one-hot encoding columns & insert to DataSet
            enc = OneHotEncoder(categories=[variable.levels])
            values = np.atleast_2d(new_ds[variable.name].to_numpy()).T
            one_hot_values = enc.fit_transform(values).toarray()
            for loc, l in enumerate(variable.levels):
                column_name = f"{variable.name}_{l}"
                new_ds[column_name, "DATA"] = one_hot_values[:, loc]
                input_columns.append(column_name)
            # Keep the fitted encoder so the transform can be inverted
            variable.enc = enc
            # Drop old categorical column, then write it back as metadata
            new_ds = new_ds.drop(variable.name, axis=1)
            new_ds[variable.name, "METADATA"] = values
        elif isinstance(variable, ContinuousVariable):
            if standardize_inputs:
                # FIX: np.float was removed in NumPy 1.24; use builtin
                # float (here and below).
                values, mean, std = self.standardize_column(
                    new_ds[variable.name].astype(float))
                self.input_means[variable.name] = mean
                self.input_stds[variable.name] = std
                new_ds[variable.name, "DATA"] = values
            input_columns.append(variable.name)
        else:
            raise DomainError(
                f"Variable {variable.name} is not a continuous or categorical variable."
            )

    for variable in self.domain.output_variables:
        if variable.name in data_columns and variable.is_objective:
            if isinstance(variable, CategoricalVariable):
                raise DomainError(
                    "Output variables cannot be categorical variables currently."
                )
            if standardize_outputs:
                values, mean, std = self.standardize_column(
                    new_ds[variable.name].astype(float))
                self.output_means[variable.name] = mean
                self.output_stds[variable.name] = std
                new_ds[variable.name, "DATA"] = values
            output_columns.append(variable.name)
            # Ensure continuous variables are floats
            new_ds[variable.name] = new_ds[variable.name].astype(float)
        else:
            raise DomainError(
                f"Variable {variable.name} is not in the dataset.")

    # FIX: output_columns is always a list (never None), so the original
    # `is None` check could never fire; test for emptiness instead.
    if not output_columns:
        raise DomainError(
            "No output columns in the domain. Add at least one output column for optimisation."
        )

    # Return the inputs and outputs as separate datasets
    return new_ds[input_columns].copy(), new_ds[output_columns].copy()