Code example #1
 def fit(self, X, y=None):
     # Re-create the wrapped scikit-learn model from the stored hyperparameters,
     # then delegate fitting, passing y only when the caller supplied one.
     self._sklearn_model = SKLModel(**self._hyperparams)
     if y is not None:
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
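The snippet assumes 'SKLModel' and 'self._hyperparams' come from the enclosing wrapper class; judging from the hyperparameter names in the examples below, 'SKLModel' is presumably scikit-learn's PowerTransformer imported under an alias, e.g.:

    # Assumed import (not shown in the snippet): alias the wrapped estimator.
    from sklearn.preprocessing import PowerTransformer as SKLModel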
Code example #2
File: power_transformer.py  Project: sreev/lale
 def __init__(self, method='yeo-johnson', standardize=True, copy=True):
     self._hyperparams = {
         'method': method,
         'standardize': standardize,
         'copy': copy
     }
     self._wrapped_model = SKLModel(**self._hyperparams)
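Unlike examples #1 and #3, this variant builds the wrapped scikit-learn model eagerly in '__init__' (as 'self._wrapped_model') instead of re-creating it inside 'fit()'. A hypothetical 'fit()' matching this constructor might simply delegate (a sketch, not code from the project):

    def fit(self, X, y=None):
        # Reuse the model built in __init__ rather than constructing a new one.
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self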
Code example #3
class PowerTransformerImpl:

    def __init__(self, method='yeo-johnson', standardize=True, copy=True):
        self._hyperparams = {
            'method': method,
            'standardize': standardize,
            'copy': copy}

    def fit(self, X, y=None):
        # Build the wrapped scikit-learn model from the stored hyperparameters,
        # then delegate fitting, passing y only when the caller supplied one.
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        # Delegate to the fitted scikit-learn model.
        return self._sklearn_model.transform(X)
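A minimal usage sketch for the class above, assuming 'SKLModel' is scikit-learn's PowerTransformer imported under that alias (consistent with the 'method', 'standardize' and 'copy' hyperparameters):

    import numpy as np
    from sklearn.preprocessing import PowerTransformer as SKLModel  # assumed alias

    X = np.array([[1.0, 2.0], [3.0, 2.0], [4.0, 5.0]])

    op = PowerTransformerImpl(method='yeo-johnson', standardize=True)
    X_t = op.fit(X).transform(X)  # fit() returns self, so the calls chain
    print(X_t.shape)              # (3, 2)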
Code example #4
    def get_transformed_numeric_data(self):
        """Transforms the numeric data from the dask dataframe contained in 'self.dask_dataframe', selected based on the contents of 'self.num_labels'.

        Returns:
            A dask dataframe with the scaled and transformed columns.
        """
        updated_data_bc = {}

        # Iterate through the numeric labels.
        for column in self.num_labels:
            # Shift each value right by 1 so the column is strictly positive, as
            # the Box-Cox transform requires. Calling 'compute()' on the column
            # retrieves its values so 'reshape()' can be applied.
            data_in_column = self.dask_df[column]
            data = data_in_column.compute().values.reshape(-1, 1) + 1

            # Compose the path of the file holding this column's 'PowerTransformer'
            # object with its fitted lambdas, built from the model directory, the day
            # of the last training (the current day when preprocessing for training)
            # and a unique hash.
            pkl_file = f'{self.models_dir}/{self.day_as_str}_{self.unique_hash}_ga_chp_box_cox_{column}.pkl'

            # If predicting, load the column-specific 'PowerTransformer' object and apply the transformation.
            if self.training_or_prediction == 'prediction':
                box_cox = joblib.load(pkl_file)
                data_bc = box_cox.transform(data)
            # If training, fit the 'PowerTransformer', save it to the column's file, then apply the transformation.
            else:
                # Create a 'PowerTransformer' object using the 'box-cox' method.
                box_cox = PowerTransformer(method='box-cox')
                box_cox.fit(data)
                joblib.dump(box_cox, pkl_file)
                data_bc = box_cox.transform(data)

            # Flatten the (n, 1) transformed array back to a flat list for this column.
            updated_data_bc[column] = data_bc.T.tolist()[0]

        # Append all the columns to an array and generate a dask dataframe from it with the data
        # transformed using Box-Cox.
        bc_list = []

        for column in self.num_labels:
            bc_list.append(updated_data_bc[column])

        bc_array = np.array(bc_list).transpose()

        transformed_bc_data = dd.from_array(bc_array,
                                            chunksize=200000,
                                            columns=self.num_labels)

        # Compose a similar .pkl file path for the 'Pipeline' object with its fitted parameters.
        pkl_file = f'{self.models_dir}/{self.day_as_str}_{self.unique_hash}_ga_chp_pipeline.pkl'

        # If predicting, load the pipeline and use it to transform the data.
        if self.training_or_prediction == 'prediction':
            pipeline = joblib.load(pkl_file)
            transformed_numeric = pipeline.transform(transformed_bc_data)
        else:
            # If training, generate a 'Pipeline' using a 'SimpleImputer', 'Normalizer' and 'StandardScaler'.
            pipeline = Pipeline([
                # Treat zeros as missing values and replace them with the column mean.
                ('imputer', SimpleImputer(strategy="mean", missing_values=0)),
                # Scale each sample (row) to unit norm.
                ('normalizer', Normalizer()),
                # Subtract the mean and divide by the standard deviation.
                ('scaler', StandardScaler()),
            ])
            # Fit the pipeline and save it to the specified file then apply the transformation.
            pipeline.fit(transformed_bc_data)
            joblib.dump(pipeline, pkl_file)
            transformed_numeric = pipeline.transform(transformed_bc_data)

        return dd.from_array(transformed_numeric,
                             chunksize=200000,
                             columns=self.num_labels)
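The method above follows a fit-and-persist pattern: at training time each transformer is fitted and dumped with joblib, and at prediction time the same fitted object is reloaded so that identical lambdas and scaling are reused. A self-contained sketch of that round trip (file name and data are illustrative, not taken from the project):

    import joblib
    import numpy as np
    from sklearn.preprocessing import PowerTransformer

    # Mirror the '+ 1' shift: Box-Cox requires strictly positive inputs.
    data = np.array([0.0, 2.0, 5.0, 9.0]).reshape(-1, 1) + 1

    pkl_file = 'box_cox_example.pkl'  # hypothetical path

    # Training: fit the transformer, persist it, then transform.
    box_cox = PowerTransformer(method='box-cox')
    box_cox.fit(data)
    joblib.dump(box_cox, pkl_file)
    data_bc = box_cox.transform(data)

    # Prediction: reload the fitted lambdas and apply them to new data.
    box_cox = joblib.load(pkl_file)
    new_data_bc = box_cox.transform(np.array([[4.0], [7.0]]) + 1)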