def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):
    """Normalize the training inputs before they are used for fitting.

    ``X_train`` becomes a DataFrame with string column labels and a fresh
    ``0..n-1`` index, ``y_train`` becomes a Series named ``"target"``, and
    both are then passed through ``ExcludeRowsMissingTarget.transform``
    (called with ``warn=True``).

    Args:
        X_train: Training features; a DataFrame or anything accepted by
            ``pd.DataFrame(...)``. A DataFrame passed in is not modified.
        y_train: Training target; array-like, Series, or a DataFrame that
            contains a ``"target"`` column.
        X_validation: Returned unchanged.
        y_validation: Returned unchanged.

    Returns:
        The tuple ``(X_train, y_train, X_validation, y_validation)``.

    Raises:
        AutoMLException: If ``y_train`` is a DataFrame without a
            ``"target"`` column.
    """
    if isinstance(X_train, pd.DataFrame):
        # Shallow copy: the relabelling below must not leak into the
        # caller's DataFrame (the underlying data is not duplicated).
        X_train = X_train.copy(deep=False)
    else:
        X_train = pd.DataFrame(X_train)

    # Inspect every label, not only the first one: a frame with mixed label
    # types would otherwise keep its non-string names, and a frame without
    # any column would raise IndexError on ``columns[0]``.
    if not all(isinstance(c, str) for c in X_train.columns):
        X_train.columns = [str(c) for c in X_train.columns]

    X_train.reset_index(drop=True, inplace=True)

    if isinstance(y_train, pd.DataFrame):
        if "target" not in y_train.columns:
            raise AutoMLException(
                "y_train should be Numpy array, Pandas Series or DataFrame with column 'target' "
            )
        y_train = y_train["target"]

    # Rebuilding from the raw values discards any index carried by y_train.
    y_train = pd.Series(np.array(y_train), name="target")

    X_train, y_train = ExcludeRowsMissingTarget.transform(
        X_train, y_train, warn=True
    )
    return X_train, y_train, X_validation, y_validation
def test_transform_with_sample_weight(self):
    """Rows whose target is NaN are removed from X, y and sample_weight."""
    feature_names = ["col1", "col2", "col3", "col4"]
    frame = pd.DataFrame(
        data={
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "sample_weight": [1, 2, 3, 4],
            "y": [np.nan, 1, np.nan, 2],
        }
    )
    X = frame.loc[:, feature_names]
    y = frame.loc[:, "y"]
    sample_weight = frame.loc[:, "sample_weight"]

    # Sanity check on the fixture: four rows before filtering.
    for before in (X, y):
        self.assertEqual(before.shape[0], 4)

    X, y, sw = ExcludeRowsMissingTarget.transform(X, y, sample_weight)

    # Two of the four targets are NaN, so two rows must survive everywhere.
    for after in (X, y, sw):
        self.assertEqual(after.shape[0], 2)

    # Label-based access: passes only if the results are indexed 0..1.
    for label, expected_target in enumerate([1, 2]):
        self.assertEqual(y[label], expected_target)
    for label, expected_weight in enumerate([2, 4]):
        self.assertEqual(sw[label], expected_weight)
def _build_dataframe(self, X, y=None):
    """Convert the inputs to a DataFrame (and a target, when ``y`` is given).

    Args:
        X: Features. A DataFrame is used as is (through a shallow copy, so
            the caller's object is not relabelled); anything else is
            validated with ``check_array``, forced to at least 2-D and
            wrapped in a DataFrame whose columns are named
            ``feature_1 .. feature_k``.
        y: Optional target. An ndarray, or the first column of a DataFrame,
            is validated with ``check_array`` and converted to a Series
            named ``"target"``; any other type is passed on unchanged.

    Returns:
        ``X`` alone when ``y`` is None, otherwise the pair ``(X, y)``
        returned by ``ExcludeRowsMissingTarget.transform(X, y, warn=True)``.
    """
    if isinstance(X, pd.DataFrame):
        # Shallow copy: labels and index are rewritten below, and that must
        # not be visible on the DataFrame owned by the caller.
        X = X.copy(deep=False)
    else:
        # Validate X as array
        X = check_array(X, ensure_2d=False)
        # Force X to be 2D
        X = np.atleast_2d(X)
        # Columns get names with the schema: feature_{index}, starting at 1
        X = pd.DataFrame(
            X, columns=["feature_" + str(i) for i in range(1, X.shape[1] + 1)]
        )

    # Enforce column names to be strings
    X.columns = X.columns.astype(str)
    X.reset_index(drop=True, inplace=True)

    if y is None:
        return X

    if isinstance(y, np.ndarray):
        y = check_array(y, ensure_2d=False)
        y = pd.Series(np.array(y), name="target")
    elif isinstance(y, pd.DataFrame):
        # Only the first column of a DataFrame target is used
        y = np.array(y.iloc[:, 0])
        y = check_array(y, ensure_2d=False)
        y = pd.Series(np.array(y), name="target")

    X, y = ExcludeRowsMissingTarget.transform(X, y, warn=True)
    return X, y
def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):
    """Normalize the training inputs before they are used for fitting.

    ``X_train`` becomes a DataFrame with string column labels and a fresh
    ``0..n-1`` index, ``y_train`` becomes a Series named ``"target"``, and
    both are then passed through ``ExcludeRowsMissingTarget.transform``
    (called with ``warn=True``).

    Args:
        X_train: Training features; a DataFrame or anything accepted by
            ``pd.DataFrame(...)``. A DataFrame passed in is not modified.
        y_train: Training target; 1-D array-like or Series. A DataFrame is
            accepted when it has a ``"target"`` column, which is then used.
        X_validation: Returned unchanged.
        y_validation: Returned unchanged.

    Returns:
        The tuple ``(X_train, y_train, X_validation, y_validation)``.
    """
    if isinstance(X_train, pd.DataFrame):
        # Shallow copy: the relabelling below must not leak into the
        # caller's DataFrame (the underlying data is not duplicated).
        X_train = X_train.copy(deep=False)
    else:
        X_train = pd.DataFrame(X_train)

    # Inspect every label, not only the first one: a frame with mixed label
    # types would otherwise keep its non-string names, and a frame without
    # any column would raise IndexError on ``columns[0]``.
    if not all(isinstance(c, str) for c in X_train.columns):
        X_train.columns = [str(c) for c in X_train.columns]

    X_train.reset_index(drop=True, inplace=True)

    # A DataFrame is 2-D and cannot be turned into a Series directly; pick
    # its "target" column when there is one.
    if isinstance(y_train, pd.DataFrame) and "target" in y_train.columns:
        y_train = y_train["target"]

    # Rebuilding from the raw values discards any index carried by y_train.
    y_train = pd.Series(np.array(y_train), name="target")

    X_train, y_train = ExcludeRowsMissingTarget.transform(
        X_train, y_train, warn=True
    )
    return X_train, y_train, X_validation, y_validation
def test_transform(self):
    """Rows whose target is NaN are removed from both X and y."""
    d_test = {
        "col1": [1, 1, np.nan, 3],
        "col2": ["a", "a", np.nan, "a"],
        "col3": [1, 1, 1, 3],
        "col4": ["a", "a", "b", "c"],
        "y": [np.nan, 1, np.nan, 2],
    }
    df_test = pd.DataFrame(data=d_test)
    X = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
    y = df_test.loc[:, "y"]

    self.assertEqual(X.shape[0], 4)
    self.assertEqual(y.shape[0], 4)

    X, y = ExcludeRowsMissingTarget.transform(X, y)

    # Two of the four targets are NaN, so two rows must survive.
    self.assertEqual(X.shape[0], 2)
    self.assertEqual(y.shape[0], 2)

    # Label-based access: passes only if the result is indexed 0..1.
    self.assertEqual(y[0], 1)
    self.assertEqual(y[1], 2)
def fit_and_transform(self, X_train, y_train):
    """Fit the configured preprocessing on the training data and apply it.

    The steps are read from ``self._params``: ``"target_preprocessing"``
    holds the steps for ``y_train`` and ``"columns_preprocessing"`` maps a
    column name to the steps requested for that column. Every fitted
    transformer is kept on ``self`` (``_categorical_y``, ``_scale_y``,
    ``_remove_columns``, ``_missing_values``, ``_categorical``, ``_scale``).

    Order of operations:
      1. target (only when ``y_train`` is not None): rows are filtered by
         ``ExcludeRowsMissingTarget``, then the target is encoded / scaled;
      2. columns tagged ``"remove_column"`` are dropped, in place;
      3. missing values are filled (``FILL_NA_MEDIAN``);
      4. categorical columns are converted (``CONVERT_INTEGER``);
      5. columns are scaled with the method they are tagged with.

    Args:
        X_train: Training features (DataFrame). Modified in place by the
            column removal.
        y_train: Training target, or None to skip target preprocessing.

    Returns:
        The pair ``(X_train, y_train)`` after preprocessing.
    """
    logger.debug("Preprocessing.fit_and_transform")

    if y_train is not None:
        # Target preprocessing must run first: excluding rows with a
        # missing target changes X_train as well.
        target_preprocessing = self._params.get("target_preprocessing")
        logger.debug(
            "target_preprocessing params: {}".format(target_preprocessing)
        )

        X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

        if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
            logger.debug("Convert target to integer")
            self._categorical_y = LabelEncoder()
            self._categorical_y.fit(y_train)
            y_train = pd.Series(self._categorical_y.transform(y_train))

        if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
            logger.debug("Convert target to one-hot coding")
            self._categorical_y = LabelBinarizer()
            self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
            y_train = self._categorical_y.transform(
                pd.DataFrame({"target": y_train}), "target"
            )

        for method, message in (
            (Scale.SCALE_LOG_AND_NORMAL, "Scale log and normal"),
            (Scale.SCALE_NORMAL, "Scale normal"),
        ):
            if method in target_preprocessing:
                logger.debug(message)
                self._scale_y = Scale(["target"], scale_method=method)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)["target"]

    # columns preprocessing
    columns_preprocessing = self._params.get("columns_preprocessing")

    def columns_tagged(step):
        """Return the configured columns whose step list contains ``step``."""
        return [c for c in columns_preprocessing if step in columns_preprocessing[c]]

    # remove empty or constant columns
    cols_to_remove = columns_tagged("remove_column")
    if X_train is not None:
        X_train.drop(cols_to_remove, axis=1, inplace=True)
    self._remove_columns = cols_to_remove

    for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
        missing = PreprocessingMissingValues(
            columns_tagged(missing_method), missing_method
        )
        missing.fit(X_train)
        X_train = missing.transform(X_train)
        self._missing_values += [missing]

    for convert_method in [PreprocessingCategorical.CONVERT_INTEGER]:
        convert = PreprocessingCategorical(
            columns_tagged(convert_method), convert_method
        )
        convert.fit(X_train)
        X_train = convert.transform(X_train)
        self._categorical += [convert]

    # SCALE
    for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
        cols_to_process = columns_tagged(scale_method)
        if cols_to_process:
            # Forward the method: without it every group of columns was
            # scaled with Scale's default method, whatever its tag.
            scale = Scale(cols_to_process, scale_method=scale_method)
            scale.fit(X_train)
            X_train = scale.transform(X_train)
            self._scale += [scale]

    return X_train, y_train
def transform(self, X_validation, y_validation):
    """Apply the preprocessing stored on ``self`` to new data.

    Either argument may be None; the steps that need it are then skipped.
    The inputs are shallow-copied first. When ``y_validation`` is given,
    both inputs go through ``ExcludeRowsMissingTarget.transform`` and the
    target is encoded / scaled with ``self._categorical_y`` and
    ``self._scale_y``. The features then go through column removal, the
    stored missing-value fillers, a fallback filler for values that are
    still missing (a warning is issued), the categorical converters and
    the scalers.

    Returns:
        The pair ``(X_validation, y_validation)`` after preprocessing.
    """
    logger.debug("Preprocessing.transform")

    # Work on shallow copies so pandas does not emit SettingWithCopyWarning.
    X_validation = None if X_validation is None else X_validation.copy(deep=False)
    y_validation = None if y_validation is None else y_validation.copy(deep=False)

    # Target steps run first because rows may be dropped from both inputs.
    if y_validation is not None:
        target_steps = self._params.get("target_preprocessing")
        logger.debug("target_preprocessing -> {}".format(target_steps))

        X_validation, y_validation = ExcludeRowsMissingTarget.transform(
            X_validation, y_validation
        )

        if (
            PreprocessingCategorical.CONVERT_INTEGER in target_steps
            and y_validation is not None
            and self._categorical_y is not None
        ):
            encoded = self._categorical_y.transform(y_validation)
            y_validation = pd.Series(encoded)

        if (
            PreprocessingCategorical.CONVERT_ONE_HOT in target_steps
            and y_validation is not None
            and self._categorical_y is not None
        ):
            target_frame = pd.DataFrame({"target": y_validation})
            y_validation = self._categorical_y.transform(target_frame, "target")

        scaling_variants = (
            (Scale.SCALE_LOG_AND_NORMAL, "Transform log and normalize"),
            (Scale.SCALE_NORMAL, "Transform normalize"),
        )
        for method, message in scaling_variants:
            if method not in target_steps:
                continue
            if self._scale_y is None or y_validation is None:
                continue
            logger.debug(message)
            target_frame = pd.DataFrame({"target": y_validation})
            y_validation = self._scale_y.transform(target_frame)["target"]

    # Column steps.
    if len(self._remove_columns) and X_validation is not None:
        present = list(
            filter(lambda name: name in self._remove_columns, X_validation.columns)
        )
        X_validation.drop(present, axis=1, inplace=True)

    for filler in self._missing_values:
        if X_validation is None or filler is None:
            continue
        X_validation = filler.transform(X_validation)

    # New data can have gaps in columns that were complete during training;
    # fill whatever is still missing and tell the user about it.
    if X_validation is not None:
        nulls_per_column = np.sum(pd.isnull(X_validation))
        if (
            np.sum(nulls_per_column) > 0
            and len(self._params["columns_preprocessing"]) > 0
        ):
            columns_with_gaps = list(
                X_validation.columns[np.where(nulls_per_column)]
            )
            warnings.warn(
                "There are columns {} with missing values which didnt have missing values in train dataset.".format(
                    columns_with_gaps
                )
            )
            # The fallback filler is fitted on the validation data itself.
            fallback = PreprocessingMissingValues(
                X_validation.columns, PreprocessingMissingValues.FILL_NA_MEDIAN
            )
            fallback.fit(X_validation)
            X_validation = fallback.transform(X_validation)

    for converter in self._categorical:
        if X_validation is None or converter is None:
            continue
        X_validation = converter.transform(X_validation)

    for scaler in self._scale:
        if X_validation is None or scaler is None:
            continue
        X_validation = scaler.transform(X_validation)

    return X_validation, y_validation
def fit_and_transform(self, X_train, y_train, sample_weight=None):
    """Fit the configured preprocessing on the training data and apply it.

    The steps are read from ``self._params``: ``"target_preprocessing"``
    holds the steps for ``y_train``, ``"columns_preprocessing"`` maps a
    column name to the steps requested for that column, and the optional
    ``"golden_features"`` / ``"kmeans_features"`` entries enable the
    corresponding feature generators. Every fitted transformer is kept on
    ``self``.

    Order of operations:
      1. target (only when ``y_train`` is not None): rows are filtered by
         ``ExcludeRowsMissingTarget`` (together with ``sample_weight``),
         then the target is encoded / scaled;
      2. columns tagged ``"remove_column"`` are dropped, in place;
      3. text columns are transformed;
      4. missing values are filled (``FILL_NA_MEDIAN``);
      5. golden and k-means features are fitted on the numeric columns
         that existed before the text transform, then added;
      6. categorical columns are converted;
      7. datetime columns are transformed;
      8. columns are scaled with the method they are tagged with; the
         generated datetime / text / golden / k-means columns join the
         ``SCALE_NORMAL`` group when that group is not empty;
      9. an optional random feature is added and ``self._drop_features``
         are dropped;
     10. numeric columns are clipped to stay within the float32 range.

    Args:
        X_train: Training features (DataFrame). Modified in place by the
            column removals.
        y_train: Training target, or None to skip target preprocessing.
        sample_weight: Optional per-row weights, filtered with the target.

    Returns:
        The tuple ``(X_train, y_train, sample_weight)``.

    Raises:
        AutoMLException: If dropping ``self._drop_features`` would remove
            every column of ``X_train``.
    """
    logger.debug("Preprocessing.fit_and_transform")

    if y_train is not None:
        # Target preprocessing must run first: excluding rows with a
        # missing target changes X_train and sample_weight as well.
        target_preprocessing = self._params.get("target_preprocessing")
        logger.debug(
            "target_preprocessing params: {}".format(target_preprocessing)
        )

        X_train, y_train, sample_weight = ExcludeRowsMissingTarget.transform(
            X_train, y_train, sample_weight
        )

        if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
            logger.debug("Convert target to integer")
            self._categorical_y = LabelEncoder(try_to_fit_numeric=True)
            self._categorical_y.fit(y_train)
            y_train = pd.Series(self._categorical_y.transform(y_train))

        if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
            logger.debug("Convert target to one-hot coding")
            self._categorical_y = LabelBinarizer()
            self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
            y_train = self._categorical_y.transform(
                pd.DataFrame({"target": y_train}), "target"
            )

        for method, message in (
            (Scale.SCALE_LOG_AND_NORMAL, "Scale log and normal"),
            (Scale.SCALE_NORMAL, "Scale normal"),
        ):
            if method in target_preprocessing:
                logger.debug(message)
                self._scale_y = Scale(["target"], scale_method=method)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)["target"]

    # columns preprocessing
    columns_preprocessing = self._params.get("columns_preprocessing")

    def columns_tagged(step):
        """Return the configured columns whose step list contains ``step``."""
        return [c for c in columns_preprocessing if step in columns_preprocessing[c]]

    # remove empty or constant columns
    cols_to_remove = columns_tagged("remove_column")
    if X_train is not None:
        X_train.drop(cols_to_remove, axis=1, inplace=True)
    self._remove_columns = cols_to_remove

    # Numeric columns are captured before the text transform adds its own
    # columns; they are what golden / k-means features are fitted on.
    numeric_cols = []
    if X_train is not None and (
        "golden_features" in self._params or "kmeans_features" in self._params
    ):
        numeric_cols = X_train.select_dtypes(include="number").columns.tolist()

    # Text columns may contain missing values, but they must not go through
    # the fill-missing step: the text transform imputes zeros itself.
    new_text_columns = []
    for col in columns_tagged("text_transform"):
        t = TextTransformer()
        t.fit(X_train, col)
        X_train = t.transform(X_train)
        self._text_transforms += [t]
        new_text_columns += t._new_columns

    for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
        missing = PreprocessingMissingValues(
            columns_tagged(missing_method), missing_method
        )
        missing.fit(X_train)
        X_train = missing.transform(X_train)
        self._missing_values += [missing]

    # golden features
    golden_columns = []
    if "golden_features" in self._params:
        results_path = self._params["golden_features"]["results_path"]
        ml_task = self._params["golden_features"]["ml_task"]
        self._golden_features = GoldenFeaturesTransformer(results_path, ml_task)
        self._golden_features.fit(X_train[numeric_cols], y_train)
        X_train = self._golden_features.transform(X_train)
        golden_columns = self._golden_features._new_columns

    # k-means features
    kmeans_columns = []
    if "kmeans_features" in self._params:
        results_path = self._params["kmeans_features"]["results_path"]
        self._kmeans = KMeansTransformer(
            results_path, self._model_name, self._k_fold
        )
        self._kmeans.fit(X_train[numeric_cols], y_train)
        X_train = self._kmeans.transform(X_train)
        kmeans_columns = self._kmeans._new_features

    for convert_method in [
        PreprocessingCategorical.CONVERT_INTEGER,
        PreprocessingCategorical.CONVERT_ONE_HOT,
        PreprocessingCategorical.CONVERT_LOO,
    ]:
        convert = PreprocessingCategorical(
            columns_tagged(convert_method), convert_method
        )
        convert.fit(X_train, y_train)
        X_train = convert.transform(X_train)
        self._categorical += [convert]

    # datetime transform
    new_datetime_columns = []
    for col in columns_tagged("datetime_transform"):
        t = DateTimeTransformer()
        t.fit(X_train, col)
        X_train = t.transform(X_train)
        self._datetime_transforms += [t]
        new_datetime_columns += t._new_columns

    # SCALE
    for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
        cols_to_process = columns_tagged(scale_method)
        if cols_to_process and scale_method == Scale.SCALE_NORMAL:
            # Generated columns are scaled together with the normal group,
            # but only when some column asked for normal scaling at all.
            for generated in (
                new_datetime_columns,
                new_text_columns,
                golden_columns,
                kmeans_columns,
            ):
                cols_to_process += generated
        if cols_to_process:
            # Forward the method: without it every group of columns was
            # scaled with Scale's default method, whatever its tag.
            scale = Scale(cols_to_process, scale_method=scale_method)
            scale.fit(X_train)
            X_train = scale.transform(X_train)
            self._scale += [scale]

    if self._add_random_feature:
        # uniform in [-1, 1), zero mean
        X_train["random_feature"] = np.random.rand(X_train.shape[0]) * 2.0 - 1.0

    if self._drop_features:
        available_cols = X_train.columns.tolist()
        drop_cols = [c for c in self._drop_features if c in available_cols]
        if len(drop_cols) == X_train.shape[1]:
            raise AutoMLException(
                "All features are droppped! Your data looks like random data."
            )
        if drop_cols:
            X_train.drop(drop_cols, axis=1, inplace=True)
        # Keep only the names that were really dropped.
        self._drop_features = drop_cols

    if X_train is not None:
        # there can be categorical columns (in CatBoost) which can't be
        # clipped, so only numeric columns are touched
        numeric_cols = X_train.select_dtypes(include="number").columns.tolist()
        X_train[numeric_cols] = X_train[numeric_cols].clip(
            lower=np.finfo(np.float32).min + 1000,
            upper=np.finfo(np.float32).max - 1000,
        )

    return X_train, y_train, sample_weight
def transform(self, X_validation, y_validation, sample_weight_validation=None):
    """Apply the preprocessing stored on ``self`` to new data.

    ``X_validation`` and ``y_validation`` may each be None; every step that
    needs the missing input is skipped. The inputs are shallow-copied
    first. When ``y_validation`` is given, all three inputs go through
    ``ExcludeRowsMissingTarget.transform`` and the target is encoded /
    scaled with ``self._categorical_y`` and ``self._scale_y``. The features
    then go through: column removal, text transforms, the stored
    missing-value fillers, a fallback filler for values that are still
    missing, golden and k-means features, categorical converters, datetime
    transforms, scalers, the optional random feature, the removal of
    ``self._drop_features`` and a final clip of the numeric columns.

    Args:
        X_validation: Features (DataFrame) or None.
        y_validation: Target or None.
        sample_weight_validation: Optional per-row weights; filtered with
            the target when ``y_validation`` is given, otherwise returned
            as received.

    Returns:
        The tuple ``(X_validation, y_validation, sample_weight_validation)``.
    """
    logger.debug("Preprocessing.transform")

    # doing copy to avoid SettingWithCopyWarning
    if X_validation is not None:
        X_validation = X_validation.copy(deep=False)
    if y_validation is not None:
        y_validation = y_validation.copy(deep=False)

    # Target preprocessing must run first: rows with a missing target may
    # be dropped from all three inputs.
    if y_validation is not None:
        target_preprocessing = self._params.get("target_preprocessing")
        logger.debug("target_preprocessing -> {}".format(target_preprocessing))

        (
            X_validation,
            y_validation,
            sample_weight_validation,
        ) = ExcludeRowsMissingTarget.transform(
            X_validation, y_validation, sample_weight_validation
        )

        if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
            if y_validation is not None and self._categorical_y is not None:
                y_validation = pd.Series(
                    self._categorical_y.transform(y_validation)
                )

        if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
            if y_validation is not None and self._categorical_y is not None:
                y_validation = self._categorical_y.transform(
                    pd.DataFrame({"target": y_validation}), "target"
                )

        for method, message in (
            (Scale.SCALE_LOG_AND_NORMAL, "Transform log and normalize"),
            (Scale.SCALE_NORMAL, "Transform normalize"),
        ):
            if method in target_preprocessing:
                if self._scale_y is not None and y_validation is not None:
                    logger.debug(message)
                    y_validation = pd.DataFrame({"target": y_validation})
                    y_validation = self._scale_y.transform(y_validation)["target"]

    # columns preprocessing
    if len(self._remove_columns) and X_validation is not None:
        cols_to_remove = [
            col for col in X_validation.columns if col in self._remove_columns
        ]
        X_validation.drop(cols_to_remove, axis=1, inplace=True)

    # text transform
    for tt in self._text_transforms:
        if X_validation is not None and tt is not None:
            X_validation = tt.transform(X_validation)

    for missing in self._missing_values:
        if X_validation is not None and missing is not None:
            X_validation = missing.transform(X_validation)

    # New data can have gaps in columns that were complete during training;
    # make sure everything is filled.
    if (
        X_validation is not None
        and pd.isnull(X_validation).values.any()
        and len(self._params["columns_preprocessing"]) > 0
    ):
        # NOTE: the user is not warned about this fallback; per the
        # original note, warnings should go to a separate file.
        # The fallback filler is fitted on the validation data itself.
        missing = PreprocessingMissingValues(
            X_validation.columns, PreprocessingMissingValues.FILL_NA_MEDIAN
        )
        missing.fit(X_validation)
        X_validation = missing.transform(X_validation)

    # Golden and k-means features; skipped without features, like every
    # other column step in this method.
    if X_validation is not None:
        if self._golden_features is not None:
            X_validation = self._golden_features.transform(X_validation)
        if self._kmeans is not None:
            X_validation = self._kmeans.transform(X_validation)

    for convert in self._categorical:
        if X_validation is not None and convert is not None:
            X_validation = convert.transform(X_validation)

    for dtt in self._datetime_transforms:
        if X_validation is not None and dtt is not None:
            X_validation = dtt.transform(X_validation)

    for scale in self._scale:
        if X_validation is not None and scale is not None:
            X_validation = scale.transform(X_validation)

    # The None guard prevents a TypeError on item assignment when only the
    # target is being transformed.
    if self._add_random_feature and X_validation is not None:
        # uniform in [-1, 1), zero mean
        X_validation["random_feature"] = (
            np.random.rand(X_validation.shape[0]) * 2.0 - 1.0
        )

    if self._drop_features and X_validation is not None:
        X_validation.drop(self._drop_features, axis=1, inplace=True)

    if X_validation is not None:
        # there can be categorical columns (in CatBoost) which can't be
        # clipped, so only numeric columns are touched
        numeric_cols = X_validation.select_dtypes(include="number").columns.tolist()
        X_validation[numeric_cols] = X_validation[numeric_cols].clip(
            lower=np.finfo(np.float32).min + 1000,
            upper=np.finfo(np.float32).max - 1000,
        )

    return X_validation, y_validation, sample_weight_validation
def fit_and_transform(self, X_train, y_train):
    """Fit the configured preprocessing on the training data and apply it.

    The steps are read from ``self._params``: ``"target_preprocessing"``
    holds the steps for ``y_train`` and ``"columns_preprocessing"`` maps a
    column name to the steps requested for that column. Every fitted
    transformer is kept on ``self`` (``_categorical_y``, ``_scale_y``,
    ``_remove_columns``, ``_text_transforms``, ``_missing_values``,
    ``_categorical``, ``_datetime_transforms``, ``_scale``).

    Order of operations:
      1. target (only when ``y_train`` is not None): rows are filtered by
         ``ExcludeRowsMissingTarget``, then the target is encoded / scaled;
      2. columns tagged ``"remove_column"`` are dropped, in place;
      3. text columns are transformed;
      4. missing values are filled (``FILL_NA_MEDIAN``);
      5. categorical columns are converted (integer, then one-hot);
      6. datetime columns are transformed;
      7. columns are scaled with the method they are tagged with; the
         generated datetime / text columns join the ``SCALE_NORMAL`` group
         when that group is not empty.

    Args:
        X_train: Training features (DataFrame). Modified in place by the
            column removal.
        y_train: Training target, or None to skip target preprocessing.

    Returns:
        The pair ``(X_train, y_train)`` after preprocessing.
    """
    logger.debug("Preprocessing.fit_and_transform")

    if y_train is not None:
        # Target preprocessing must run first: excluding rows with a
        # missing target changes X_train as well.
        target_preprocessing = self._params.get("target_preprocessing")
        logger.debug(
            "target_preprocessing params: {}".format(target_preprocessing)
        )

        X_train, y_train = ExcludeRowsMissingTarget.transform(X_train, y_train)

        if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing:
            logger.debug("Convert target to integer")
            self._categorical_y = LabelEncoder()
            self._categorical_y.fit(y_train)
            y_train = pd.Series(self._categorical_y.transform(y_train))

        if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing:
            logger.debug("Convert target to one-hot coding")
            self._categorical_y = LabelBinarizer()
            self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target")
            y_train = self._categorical_y.transform(
                pd.DataFrame({"target": y_train}), "target"
            )

        for method, message in (
            (Scale.SCALE_LOG_AND_NORMAL, "Scale log and normal"),
            (Scale.SCALE_NORMAL, "Scale normal"),
        ):
            if method in target_preprocessing:
                logger.debug(message)
                self._scale_y = Scale(["target"], scale_method=method)
                y_train = pd.DataFrame({"target": y_train})
                self._scale_y.fit(y_train)
                y_train = self._scale_y.transform(y_train)["target"]

    # columns preprocessing
    columns_preprocessing = self._params.get("columns_preprocessing")

    def columns_tagged(step):
        """Return the configured columns whose step list contains ``step``."""
        return [c for c in columns_preprocessing if step in columns_preprocessing[c]]

    # remove empty or constant columns
    cols_to_remove = columns_tagged("remove_column")
    if X_train is not None:
        X_train.drop(cols_to_remove, axis=1, inplace=True)
    self._remove_columns = cols_to_remove

    # Text columns may contain missing values, but they must not go through
    # the fill-missing step: the text transform imputes zeros itself.
    new_text_columns = []
    for col in columns_tagged("text_transform"):
        t = TextTransformer()
        t.fit(X_train, col)
        X_train = t.transform(X_train)
        self._text_transforms += [t]
        new_text_columns += t._new_columns

    for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]:
        missing = PreprocessingMissingValues(
            columns_tagged(missing_method), missing_method
        )
        missing.fit(X_train)
        X_train = missing.transform(X_train)
        self._missing_values += [missing]

    for convert_method in [
        PreprocessingCategorical.CONVERT_INTEGER,
        PreprocessingCategorical.CONVERT_ONE_HOT,
    ]:
        convert = PreprocessingCategorical(
            columns_tagged(convert_method), convert_method
        )
        convert.fit(X_train)
        X_train = convert.transform(X_train)
        self._categorical += [convert]

    # datetime transform
    new_datetime_columns = []
    for col in columns_tagged("datetime_transform"):
        t = DateTimeTransformer()
        t.fit(X_train, col)
        X_train = t.transform(X_train)
        self._datetime_transforms += [t]
        new_datetime_columns += t._new_columns

    # SCALE
    for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
        cols_to_process = columns_tagged(scale_method)
        if cols_to_process and scale_method == Scale.SCALE_NORMAL:
            # Generated columns are scaled together with the normal group,
            # but only when some column asked for normal scaling at all.
            cols_to_process += new_datetime_columns
            cols_to_process += new_text_columns
        if cols_to_process:
            # Forward the method: without it every group of columns was
            # scaled with Scale's default method, whatever its tag.
            scale = Scale(cols_to_process, scale_method=scale_method)
            scale.fit(X_train)
            X_train = scale.transform(X_train)
            self._scale += [scale]

    return X_train, y_train