def transform(self, X, y=None):
    """Impute missing values in X with the numeric and categorical imputers.

    'None' values are converted to np.nan before imputation and are treated
    as the same. Columns listed in ``self._all_null_cols`` are dropped
    before imputing.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = infer_feature_types(X)
    frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    frame.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore')

    # Nothing left to impute once the all-null columns are gone.
    if frame.empty:
        return _retain_custom_types_and_initalize_woodwork(X_ww, frame)

    numeric_cols = self._numeric_cols
    if numeric_cols is not None and len(numeric_cols) > 0:
        numeric_part = frame[numeric_cols]
        numeric_imputed = self._numeric_imputer.transform(numeric_part)
        frame[numeric_part.columns] = numeric_imputed.to_dataframe()

    categorical_cols = self._categorical_cols
    if categorical_cols is not None and len(categorical_cols) > 0:
        categorical_part = frame[categorical_cols]
        categorical_imputed = self._categorical_imputer.transform(
            categorical_part)
        frame[categorical_part.columns] = categorical_imputed.to_dataframe()

    return _retain_custom_types_and_initalize_woodwork(X_ww, frame)
def transform(self, X, y):
    """Impute missing values in the target data.

    'None' and np.nan values are treated as the same.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Features. Ignored.
        y (ww.DataColumn, pd.Series): Target data to impute.

    Returns:
        (ww.DataTable, ww.DataColumn): The original X, transformed y

    Raises:
        RuntimeError: If the component's transform returns no columns.
    """
    X = infer_feature_types(X) if X is not None else X
    if y is None:
        return X, None

    y_ww = infer_feature_types(y)
    target = _convert_woodwork_types_wrapper(y_ww.to_series())
    target_frame = target.to_frame()

    # Return early since bool dtype doesn't support nans and sklearn errors
    # if all cols are bool
    all_bool = (target_frame.dtypes == bool).all()
    if all_bool:
        return X, _retain_custom_types_and_initalize_woodwork(y_ww, target)

    imputed = self._component_obj.transform(target_frame)
    if imputed.shape[1] == 0:
        raise RuntimeError("Transformed data is empty")

    imputed_target = pd.Series(imputed[:, 0], index=target.index)
    return X, _retain_custom_types_and_initalize_woodwork(
        y_ww, imputed_target)
def transform(self, X, y=None):
    """Impute missing values in X with the wrapped component.

    'None' and np.nan values are treated as the same.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    # Return early since bool dtype doesn't support nans and sklearn errors
    # if all cols are bool
    if (X.dtypes == bool).all():
        return infer_feature_types(X)

    # Only the labels and index of this frame are used below; the component
    # itself is still given the full X.
    remaining = X.drop(self._all_null_cols, axis=1, errors='ignore')

    imputed_values = self._component_obj.transform(X)
    imputed = pd.DataFrame(imputed_values, columns=remaining.columns)
    if remaining.empty:
        return infer_feature_types(imputed)

    imputed.index = remaining.index
    return _retain_custom_types_and_initalize_woodwork(X_ww, imputed)
def transform(self, X, y=None):
    """Transforms data X by applying the LSA pipeline.

    Also records, on ``self._provenance``, which output columns were created
    from each text column.

    Arguments:
        X (ww.DataTable, pd.DataFrame): The data to transform.
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X. The original column is removed and
            replaced with two columns of the format
            `LSA(original_column_name)[feature_number]`, where
            `feature_number` is 0 or 1.
    """
    X_ww = infer_feature_types(X)
    if len(self._text_columns) == 0:
        return X_ww

    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    result = X.copy()
    created_by_column = {}
    for text_col in self._text_columns:
        components = self._lsa_pipeline.transform(X[text_col])
        new_names = [
            'LSA({})[0]'.format(text_col),
            'LSA({})[1]'.format(text_col),
        ]
        for position, new_name in enumerate(new_names):
            result[new_name] = pd.Series(components[:, position],
                                         index=X.index)
        created_by_column[text_col] = new_names
    self._provenance = created_by_column

    result = result.drop(columns=self._text_columns)
    return _retain_custom_types_and_initalize_woodwork(X_ww, result)
def transform(self, X, y=None):
    """One-hot encode the input data.

    Columns not listed in ``self.features_to_encode`` are passed through
    untouched and come first in the output, followed by the encoded columns.
    Columns named in ``self._features_to_drop`` are then removed, and the
    resulting column labels are stored on ``self._feature_names``.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Features to one-hot encode.
        y (ww.DataColumn, pd.Series): Ignored.

    Returns:
        ww.DataTable: Transformed data, where each categorical feature has
            been encoded into numerical columns using one-hot encoding.
    """
    X_ww = infer_feature_types(X)
    X_copy = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    X_copy = self._handle_parameter_handle_missing(X_copy)

    # Select the non-categorical columns in a single step. Concatenating them
    # one at a time is quadratic in the number of columns and changes the
    # index type; a plain column selection keeps X_copy's index as-is.
    passthrough_cols = [
        col for col in X_copy.columns if col not in self.features_to_encode
    ]
    X_t = X_copy[passthrough_cols]

    # Call sklearn's transform on the categorical columns
    if len(self.features_to_encode) > 0:
        encoded = self._encoder.transform(X_copy[self.features_to_encode])
        X_cat = pd.DataFrame(encoded.toarray(), index=X_copy.index)
        X_cat.columns = self._get_feature_names()
        X_t = pd.concat([X_t, X_cat], axis=1)

    X_t = X_t.drop(columns=self._features_to_drop)
    self._feature_names = X_t.columns
    return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
def transform(self, X, y=None):
    """Apply the wrapped component's transform to X.

    The result is rebuilt as a DataFrame with X's column labels and index.
    Integer and Categorical logical types are passed as ``ltypes_to_ignore``
    when restoring woodwork types.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform.
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = infer_feature_types(X)
    frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    transformed = pd.DataFrame(
        self._component_obj.transform(frame),
        columns=frame.columns,
        index=frame.index,
    )
    return _retain_custom_types_and_initalize_woodwork(
        X_ww, transformed, ltypes_to_ignore=[Integer, Categorical])
def transform(self, X, y=None):
    """Transforms data X by creating new features using existing text columns

    The text columns are replaced by the computed primitive features (missing
    values filled with 0) and by the output of the LSA component.

    Arguments:
        X (ww.DataTable, pd.DataFrame): The data to transform.
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = infer_feature_types(X)
    no_features = self._features is None or len(self._features) == 0
    if no_features:
        return X_ww

    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    entity_set = self._make_entity_set(X, self._text_columns)
    primitives = ft.calculate_feature_matrix(features=self._features,
                                             entityset=entity_set)
    if primitives.isnull().any().any():
        primitives = primitives.fillna(0)

    lsa_features = self._lsa.transform(X[self._text_columns]).to_dataframe()

    # Align the feature matrix with X's index before concatenating.
    primitives = primitives.set_index(X.index)
    non_text = X.drop(self._text_columns, axis=1)
    combined = pd.concat([non_text, primitives, lsa_features], axis=1)
    return _retain_custom_types_and_initalize_woodwork(X_ww, combined)
def transform(self, X, y=None):
    """Replace DateTime columns in X with features extracted from them.

    For every DateTime column and every requested feature, a new column named
    ``{column}_{feature}`` is added; the DateTime columns are then dropped.
    Any categories returned by an extractor are stored in
    ``self._categories`` under the new column name.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = infer_feature_types(X)
    frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    requested = self.parameters["features_to_extract"]
    if len(requested) == 0:
        return infer_feature_types(frame)

    for date_col in self._date_time_col_names:
        for feature in requested:
            new_col = f"{date_col}_{feature}"
            extractor = self._function_mappings[feature]
            values, categories = extractor(frame[date_col],
                                           self.encode_as_categories)
            frame[new_col] = values
            if categories:
                self._categories[new_col] = categories

    frame = frame.drop(self._date_time_col_names, axis=1)
    return _retain_custom_types_and_initalize_woodwork(X_ww, frame)
def transform(self, X, y=None):
    """Transform all-numeric data X with the wrapped component.

    Output columns are named ``component_0``, ``component_1``, ... and keep
    X's index.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform. Must be all numeric.
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Transformed X

    Raises:
        ValueError: If X is not all numeric.
    """
    X_ww = infer_feature_types(X)
    if not is_all_numeric(X_ww):
        raise ValueError("LDA input must be all numeric")

    frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    projected = self._component_obj.transform(frame)
    component_names = [f"component_{i}" for i in range(projected.shape[1])]
    projected_df = pd.DataFrame(projected,
                                index=frame.index,
                                columns=component_names)
    return _retain_custom_types_and_initalize_woodwork(X_ww, projected_df)
def transform(self, X, y=None):
    """Computes the delayed features for all features in X and y.

    For each feature in X, it will add a column to the output dataframe for
    each delay in the (inclusive) range [1, max_delay]. The values of each
    delayed feature are simply the original feature shifted forward in time
    by the delay amount. For example, a delay of 3 units means that the
    feature value at row n will be taken from the n-3rd row of that feature.

    If y is not None, it will also compute the delayed values for the target
    variable.

    Arguments:
        X (ww.DataTable, pd.DataFrame or None): Data to transform. None is
            expected when only the target variable is being used.
        y (ww.DataColumn, pd.Series, or None): Target.

    Returns:
        ww.DataTable: Transformed X.
    """
    if X is None:
        X = pd.DataFrame()

    # Normalize the data into pandas objects
    X_ww = infer_feature_types(X)
    categorical_columns = self._get_categorical_columns(X_ww)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    if self.delay_features and len(X) > 0:
        encoded = self._encode_X_while_preserving_index(
            X[categorical_columns])
        # Iterating X walks the columns it had before any delays were added.
        for col_name in X:
            if col_name in categorical_columns:
                source = encoded[col_name]
            else:
                source = X[col_name]
            delayed = {}
            for t in range(1, self.max_delay + 1):
                delayed[f"{col_name}_delay_{t}"] = source.shift(t)
            X = X.assign(**delayed)

    # Handle cases where the target was passed in
    if self.delay_target and y is not None:
        target = infer_feature_types(y)
        if target.logical_type == logical_types.Categorical:
            target = self._encode_y_while_preserving_index(target)
        else:
            target = _convert_woodwork_types_wrapper(target.to_series())
        target_delays = {}
        for t in range(self.start_delay_for_target, self.max_delay + 1):
            target_delays[f"target_delay_{t}"] = target.shift(t)
        X = X.assign(**target_delays)

    return _retain_custom_types_and_initalize_woodwork(X_ww, X)
def transform(self, X, y=None):
    """Computes the feature matrix for the input X using featuretools' dfs algorithm.

    Column labels are cast to strings before the entity set is built.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data
            to transform. Has shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, optional): Ignored.

    Returns:
        ww.DataTable: Feature matrix
    """
    X_ww = infer_feature_types(X)
    frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    frame.columns = frame.columns.astype(str)
    entity_set = self._make_entity_set(frame)
    matrix = calculate_feature_matrix(features=self.features,
                                      entityset=entity_set)
    return _retain_custom_types_and_initalize_woodwork(X_ww, matrix)
def transform(self, X, y=None):
    """Transforms input data by imputing missing values column by column.

    Each column that has an entry in ``self.imputers`` is transformed by its
    own imputer. A column whose imputer returns an empty result is dropped
    from the output.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data
            of shape [n_samples, n_features] to transform.
        y (ww.DataColumn, pd.Series, optional): The target training data of
            length [n_samples]. Ignored.

    Returns:
        ww.DataTable: Transformed X
    """
    X_ww = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    result = X.copy()

    emptied_columns = []
    for column, imputer in self.imputers.items():
        imputed = imputer.transform(X[[column]]).to_dataframe()
        if imputed.empty:
            emptied_columns.append(column)
            continue
        result[column] = imputed[column]

    result = result.drop(emptied_columns, axis=1)
    return _retain_custom_types_and_initalize_woodwork(X_ww, result)
def fit_transform(self, X, y=None):
    """Fits on X and transforms X

    Uses the component object's own ``fit_transform`` when available. If that
    attempt raises AttributeError, falls back to calling ``fit`` followed by
    ``transform``.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to fit and transform
        y (ww.DataColumn, pd.Series, optional): Target data

    Returns:
        ww.DataTable: Transformed X

    Raises:
        MethodPropertyNotFoundError: Propagated from the ``fit``/``transform``
            fallback when it raises this error.
    """
    X_ww = infer_feature_types(X)
    X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    # Bind y_pd up front so the default y=None call does not hit an
    # UnboundLocalError when it is passed to the component below.
    y_pd = None
    if y is not None:
        y_ww = infer_feature_types(y)
        y_pd = _convert_woodwork_types_wrapper(y_ww.to_series())

    try:
        X_t = self._component_obj.fit_transform(X_pd, y_pd)
        return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
    except AttributeError:
        # Errors from the fallback propagate to the caller unchanged.
        return self.fit(X, y).transform(X, y)
def transform(self, X, y=None):
    """Transforms data X with the wrapped component object.

    The result is rebuilt as a DataFrame with X's column labels and index.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform.
        y (ww.DataColumn, pd.Series, optional): Target data.

    Returns:
        ww.DataTable: Transformed X

    Raises:
        MethodPropertyNotFoundError: If calling the component object's
            transform raises AttributeError.
    """
    X_ww = infer_feature_types(X)
    features = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

    target = y
    if target is not None:
        target_ww = infer_feature_types(target)
        target = _convert_woodwork_types_wrapper(target_ww.to_series())

    try:
        raw = self._component_obj.transform(features, target)
    except AttributeError:
        raise MethodPropertyNotFoundError(
            "Transformer requires a transform method or a component_obj that implements transform"
        )

    transformed = pd.DataFrame(raw,
                               columns=features.columns,
                               index=features.index)
    return _retain_custom_types_and_initalize_woodwork(X_ww, transformed)
def transform(self, X, y=None):
    """Transforms input data by selecting features.

    The input column names are recorded on ``self.input_feature_names``. The
    selected columns are cast back to the dtypes they had in X.

    Arguments:
        X (ww.DataTable, pd.DataFrame): Data to transform.
        y (ww.DataColumn, pd.Series, optional): Target data. Ignored.

    Returns:
        ww.DataTable: Transformed X

    Raises:
        MethodPropertyNotFoundError: If calling the component object's
            transform raises AttributeError.
    """
    X_ww = infer_feature_types(X)
    frame = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    self.input_feature_names = list(frame.columns.values)

    try:
        selected_values = self._component_obj.transform(frame)
    except AttributeError:
        raise MethodPropertyNotFoundError("Feature selector requires a transform method or a component_obj that implements transform")

    original_dtypes = frame.dtypes.to_dict()
    selected_names = self.get_names()
    selected_dtypes = {}
    for name in selected_names:
        selected_dtypes[name] = original_dtypes[name]

    selected = pd.DataFrame(selected_values,
                            columns=selected_names,
                            index=frame.index)
    selected = selected.astype(selected_dtypes)
    return _retain_custom_types_and_initalize_woodwork(X_ww, selected)