def predict(self, X, y=None, objective=None):
    """Make predictions using selected features.

    Arguments:
        X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, np.ndarray, None): The target values of length [n_samples]
        objective (Object or string): The objective to use to make predictions

    Returns:
        ww.DataColumn: Predicted values.
    """
    if X is None:
        X = pd.DataFrame()
    X = infer_feature_types(X)
    y = infer_feature_types(y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    features = self.compute_estimator_features(X, y)
    features = _convert_woodwork_types_wrapper(features.to_dataframe())
    features_no_nan, y = drop_rows_with_nans(features, y)
    y_arg = None
    if self.estimator.predict_uses_y:
        y_arg = y
    predictions = self.estimator.predict(features_no_nan, y_arg).to_series()
    predictions = predictions.rename(self.input_target_name)
    padded = pad_with_nans(predictions, max(0, features.shape[0] - predictions.shape[0]))
    return infer_feature_types(padded)
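# --- Illustration (not library code) -------------------------------------
# A minimal pandas-only sketch of the padding step in predict() above: rows
# consumed by NaN-dropping cannot be predicted, so the prediction series is
# padded back to the input length. The helper below is hypothetical and
# assumes pad_with_nans prepends the NaNs; treat it as a sketch of the idea,
# not as the library's implementation.
import numpy as np
import pandas as pd

def _pad_with_nans_sketch(predictions, num_to_pad):
    """Hypothetical stand-in for pad_with_nans, for illustration only."""
    padding = pd.Series([np.nan] * num_to_pad)
    return pd.concat([padding, predictions], ignore_index=True)

_preds = pd.Series([0.1, 0.2, 0.3])                       # 3 rows survived NaN-dropping
_padded = _pad_with_nans_sketch(_preds, 5 - len(_preds))  # original input had 5 rows
assert len(_padded) == 5 and _padded.isna().sum() == 2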
def predict(self, X, y=None, objective=None):
    """Make predictions using selected features.

    Arguments:
        X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, np.ndarray, None): The target values of length [n_samples]
        objective (Object or string): The objective to use to make predictions

    Returns:
        ww.DataColumn: Predicted values.
    """
    X, y = self._convert_to_woodwork(X, y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    y = self._encode_targets(y)
    n_features = max(len(y), X.shape[0])
    predictions = self._predict(X, y, objective=objective, pad=False)
    predictions = _convert_woodwork_types_wrapper(predictions.to_series())
    # In case gap is 0 and this is a baseline pipeline, we drop the nans in the
    # predictions before decoding them
    predictions = pd.Series(self._decode_targets(predictions.dropna()),
                            name=self.input_target_name)
    padded = pad_with_nans(predictions, max(0, n_features - predictions.shape[0]))
    return infer_feature_types(padded)
def explain_prediction(pipeline, input_features, top_k=3, training_data=None,
                       include_shap_values=False, output_format="text"):
    """Creates a table summarizing the top_k positive and top_k negative contributing features
    to the prediction of a single datapoint.

    XGBoost models and CatBoost multiclass classifiers are not currently supported.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (ww.DataTable, pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        top_k (int): How many of the highest/lowest features to include in the table.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            This is required for non-tree estimators because we need a sample of training data for the KernelSHAP algorithm.
        include_shap_values (bool): Whether the SHAP values should be included in an extra column in the output.
            Default is False.
        output_format (str): Either "text", "dict", or "dataframe". Default is "text".

    Returns:
        str, dict, or pd.DataFrame: A report explaining the most positive/negative contributing features to the predictions.
    """
    input_features = _convert_to_woodwork_structure(input_features)
    if not (isinstance(input_features, ww.DataTable) and input_features.shape[0] == 1):
        raise ValueError("features must be stored in a dataframe or datatable with exactly one row.")
    input_features = _convert_woodwork_types_wrapper(input_features.to_dataframe())
    if training_data is not None:
        training_data = _convert_to_woodwork_structure(training_data)
        training_data = _convert_woodwork_types_wrapper(training_data.to_dataframe())
    if output_format not in {"text", "dict", "dataframe"}:
        raise ValueError(f"Parameter output_format must be either text, dict, or dataframe. Received {output_format}")
    return _make_single_prediction_shap_table(pipeline, input_features, top_k, training_data,
                                              include_shap_values, output_format=output_format)
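# --- Illustration (not library code) -------------------------------------
# A hedged usage sketch for explain_prediction: build a SHAP report for one
# row. `pipeline` is assumed to be an already-fitted evalml pipeline and `X`
# a feature dataframe; the import path is an assumption about where the
# function is exposed.
from evalml.model_understanding.prediction_explanations import explain_prediction

def explain_first_row(pipeline, X):
    # input_features must hold exactly one row, hence the double brackets
    return explain_prediction(pipeline,
                              input_features=X.iloc[[0]],
                              top_k=3,
                              include_shap_values=True,
                              output_format="text")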
def score(self, X, y, objectives):
    """Evaluate model performance on current and additional objectives.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series): True labels of length [n_samples]
        objectives (list): Non-empty list of objectives to score on

    Returns:
        dict: Ordered dictionary of objective scores
    """
    # Only converting X for the call to _score_all_objectives
    if X is None:
        X = pd.DataFrame()
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = infer_feature_types(y)
    y = _convert_woodwork_types_wrapper(y.to_series())
    y_predicted = self.predict(X, y)
    y_predicted = _convert_woodwork_types_wrapper(y_predicted.to_series())
    y_shifted = y.shift(-self.gap)
    objectives = self.create_objectives(objectives)
    y_shifted, y_predicted = drop_rows_with_nans(y_shifted, y_predicted)
    return self._score_all_objectives(X, y_shifted, y_predicted,
                                      y_pred_proba=None, objectives=objectives)
def score(self, X, y, objectives):
    """Evaluate model performance on current and additional objectives.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series): True labels of length [n_samples]
        objectives (list): Non-empty list of objectives to score on

    Returns:
        dict: Ordered dictionary of objective scores
    """
    X, y = self._convert_to_woodwork(X, y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    objectives = [get_objective(o, return_instance=True) for o in objectives]
    y_encoded = self._encode_targets(y)
    y_shifted = y_encoded.shift(-self.gap)
    y_predicted, y_predicted_proba = self._compute_predictions(X, y, objectives, time_series=True)
    if y_predicted is not None:
        y_predicted = _convert_woodwork_types_wrapper(y_predicted.to_series())
    if y_predicted_proba is not None:
        y_predicted_proba = _convert_woodwork_types_wrapper(y_predicted_proba.to_dataframe())
    y_shifted, y_predicted, y_predicted_proba = drop_rows_with_nans(y_shifted, y_predicted, y_predicted_proba)
    return self._score_all_objectives(X, y_shifted, y_predicted,
                                      y_pred_proba=y_predicted_proba,
                                      objectives=objectives)
def validate(self, X, y):
    """Check if the target or any of the features have no variance (1 unique value).

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
        y (ww.DataColumn, pd.Series, np.ndarray): The target data.

    Returns:
        dict: dict of warnings/errors corresponding to features or target with no variance.
    """
    results = {"warnings": [], "errors": [], "actions": []}
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = infer_feature_types(y)
    y = _convert_woodwork_types_wrapper(y.to_series())
    unique_counts = X.nunique(dropna=self._dropnan).to_dict()
    any_nulls = (X.isnull().any()).to_dict()
    for name in unique_counts:
        message = self._check_for_errors(name, unique_counts[name], any_nulls[name])
        if not message:
            continue
        DataCheck._add_message(message, results)
    y_name = getattr(y, "name")
    if not y_name:
        y_name = "Y"
    target_message = self._check_for_errors(y_name, y.nunique(dropna=self._dropnan), y.isnull().any())
    if target_message:
        DataCheck._add_message(target_message, results)
    return results
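# --- Illustration (not library code) -------------------------------------
# A hedged usage sketch for the no-variance check above. The class name
# NoVarianceDataCheck is an assumption matched to this validate() signature;
# the result dict carries "warnings", "errors", and "actions" lists as built
# in the method.
import pandas as pd
from evalml.data_checks import NoVarianceDataCheck

_X = pd.DataFrame({"constant": [1, 1, 1], "varied": [1, 2, 3]})
_y = pd.Series([0, 1, 0])
_results = NoVarianceDataCheck().validate(_X, _y)
print(_results["errors"])  # expect an error naming the "constant" column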
def score(self, X, y, objectives):
    """Evaluate model performance on objectives.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, or np.ndarray): True labels of length [n_samples]
        objectives (list): List of objectives to score

    Returns:
        dict: Ordered dictionary of objective scores
    """
    y = infer_feature_types(y)
    y = _convert_woodwork_types_wrapper(y.to_series())
    objectives = self.create_objectives(objectives)
    y = self._encode_targets(y)
    y_predicted, y_predicted_proba = self._compute_predictions(X, y, objectives)
    if y_predicted is not None:
        y_predicted = _convert_woodwork_types_wrapper(y_predicted.to_series())
    if y_predicted_proba is not None:
        y_predicted_proba = _convert_woodwork_types_wrapper(y_predicted_proba.to_dataframe())
    return self._score_all_objectives(X, y, y_predicted, y_predicted_proba, objectives)
def fit(self, X, y):
    """Fit a time series regression pipeline.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, np.ndarray): The target training data of length [n_samples]

    Returns:
        self
    """
    if X is None:
        X = pd.DataFrame()
    X = infer_feature_types(X)
    y = infer_feature_types(y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    X_t = self._compute_features_during_fit(X, y)
    X_t = X_t.to_dataframe()
    y_shifted = y.shift(-self.gap)
    X_t, y_shifted = drop_rows_with_nans(X_t, y_shifted)
    self.estimator.fit(X_t, y_shifted)
    self.input_feature_names = self._component_graph.input_feature_names
    return self
def precision_recall_curve(y_true, y_pred_proba):
    """
    Given labels and binary classifier predicted probabilities, compute and return the data
    representing a precision-recall curve.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True binary labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a binary classifier,
            before thresholding has been applied. Note this should be the predicted probability for the "true" label.

    Returns:
        dict: Dictionary containing metrics used to generate a precision-recall plot, with the following keys:

              * `precision`: Precision values.
              * `recall`: Recall values.
              * `thresholds`: Threshold values used to produce the precision and recall.
              * `auc_score`: The area under the precision-recall curve.
    """
    y_true = _convert_to_woodwork_structure(y_true)
    y_pred_proba = _convert_to_woodwork_structure(y_pred_proba)
    y_true = _convert_woodwork_types_wrapper(y_true.to_series())
    y_pred_proba = _convert_woodwork_types_wrapper(y_pred_proba.to_series())
    precision, recall, thresholds = sklearn_precision_recall_curve(y_true, y_pred_proba)
    auc_score = sklearn_auc(recall, precision)
    return {'precision': precision,
            'recall': recall,
            'thresholds': thresholds,
            'auc_score': auc_score}
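# --- Illustration (not library code) -------------------------------------
# A hedged usage sketch for precision_recall_curve on toy data; the import
# path is an assumption. The returned dict carries the keys listed in the
# docstring above.
import pandas as pd
from evalml.model_understanding import precision_recall_curve

_y_true = pd.Series([0, 0, 1, 1])
_y_scores = pd.Series([0.1, 0.4, 0.35, 0.8])  # probability of the "true" label
_curve = precision_recall_curve(_y_true, _y_scores)
print(_curve["precision"], _curve["recall"], _curve["auc_score"])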
def _compute_features(self, component_list, X, y=None, fit=False):
    """Transforms the data by applying the given components.

    Arguments:
        component_list (list): The list of component names to compute.
        X (ww.DataTable, pd.DataFrame): Input data to the pipeline to transform.
        y (ww.DataColumn, pd.Series): The target training data of length [n_samples]
        fit (bool): Whether to fit the components as well as transform the data. Defaults to False.

    Returns:
        dict: Outputs from each component
    """
    X = infer_feature_types(X)
    if len(component_list) == 0:
        return X
    output_cache = {}
    for component_name in component_list:
        component_instance = self.get_component(component_name)
        if not isinstance(component_instance, ComponentBase):
            raise ValueError('All components must be instantiated before fitting or predicting')
        x_inputs = []
        y_input = None
        for parent_input in self.get_parents(component_name):
            if parent_input[-2:] == '.y':
                if y_input is not None:
                    raise ValueError(f'Cannot have multiple `y` parents for a single component {component_name}')
                y_input = output_cache[parent_input]
            else:
                parent_x = output_cache.get(parent_input, output_cache.get(f'{parent_input}.x'))
                if isinstance(parent_x, ww.DataTable):
                    parent_x = _convert_woodwork_types_wrapper(parent_x.to_dataframe())
                elif isinstance(parent_x, ww.DataColumn):
                    parent_x = pd.Series(_convert_woodwork_types_wrapper(parent_x.to_series()), name=parent_input)
                x_inputs.append(parent_x)
        input_x, input_y = self._consolidate_inputs(x_inputs, y_input, X, y)
        self.input_feature_names.update({component_name: list(input_x.columns)})
        if isinstance(component_instance, Transformer):
            if fit:
                output = component_instance.fit_transform(input_x, input_y)
            else:
                output = component_instance.transform(input_x, input_y)
            if isinstance(output, tuple):
                output_x, output_y = output[0], output[1]
            else:
                output_x = output
                output_y = None
            output_cache[f"{component_name}.x"] = output_x
            output_cache[f"{component_name}.y"] = output_y
        else:
            if fit:
                component_instance.fit(input_x, input_y)
            if not (fit and component_name == self.compute_order[-1]):
                # Don't call predict on the final component during fit
                output = component_instance.predict(input_x)
            else:
                output = None
            output_cache[component_name] = output
    return output_cache
def _manage_woodwork(self, X, y=None):
    """Function to convert the input and target data to pandas data structures."""
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    if y is not None:
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())
    return X, y
def transform(self, X, y=None):
    X_ww = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    if y is not None:
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())
    X_t = self._component_obj.transform(X, y)
    X_t_df = pd.DataFrame(X_t, columns=X.columns, index=X.index)
    return _retain_custom_types_and_initalize_woodwork(X_ww, X_t_df, ltypes_to_ignore=[Categorical])
def explain_predictions(pipeline, input_features, training_data=None, top_k_features=3,
                        include_shap_values=False, output_format="text"):
    """Creates a report summarizing the top contributing features for each data point in the input features.

    XGBoost models and CatBoost multiclass classifiers are not currently supported.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (ww.DataTable, pd.DataFrame): Dataframe of input data to evaluate the pipeline on.
        training_data (ww.DataTable, pd.DataFrame): Dataframe of data the pipeline was fit on.
            This can be omitted for pipelines with tree-based estimators.
        top_k_features (int): How many of the highest/lowest contributing features to include in the table for each data point.
        include_shap_values (bool): Whether SHAP values should be included in the table. Default is False.
        output_format (str): Either "text" or "dict". Default is "text".

    Returns:
        str or dict: A report explaining the top contributing features to each prediction for each row of input_features.
            The report will include the feature names, prediction contribution, and SHAP Value (optional).
    """
    input_features = _convert_to_woodwork_structure(input_features)
    input_features = _convert_woodwork_types_wrapper(input_features.to_dataframe())
    if training_data is not None:
        training_data = _convert_to_woodwork_structure(training_data)
        training_data = _convert_woodwork_types_wrapper(training_data.to_dataframe())
    if input_features.empty:
        raise ValueError("Parameter input_features must be a non-empty dataframe.")
    if output_format not in {"text", "dict"}:
        raise ValueError(f"Parameter output_format must be either text or dict. Received {output_format}")
    data = _ReportData(pipeline, input_features, y_true=None, y_pred=None, y_pred_values=None,
                       errors=None, index_list=range(input_features.shape[0]), metric=None)
    report_creator = _report_creator_factory(data, report_type="explain_predictions",
                                             output_format=output_format,
                                             top_k_features=top_k_features,
                                             include_shap_values=include_shap_values)
    return report_creator(data)
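# --- Illustration (not library code) -------------------------------------
# A hedged usage sketch for explain_predictions over several rows at once.
# As with explain_prediction, `pipeline` is assumed to be an already-fitted
# evalml pipeline and the import path is an assumption.
from evalml.model_understanding.prediction_explanations import explain_predictions

def explain_head(pipeline, X, n_rows=5):
    # Unlike explain_prediction, this accepts multiple rows and returns one
    # combined report covering each of them.
    return explain_predictions(pipeline,
                               input_features=X.head(n_rows),
                               top_k_features=3,
                               output_format="dict")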
def fit_transform(self, X, y=None):
    X_ww = infer_feature_types(X)
    if not is_all_numeric(X_ww):
        raise ValueError("LDA input must be all numeric")
    y = infer_feature_types(y)
    X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    X_t = self._component_obj.fit_transform(X, y)
    X_t = pd.DataFrame(X_t, index=X.index,
                       columns=[f"component_{i}" for i in range(X_t.shape[1])])
    return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
def calculate_permutation_importance(pipeline, X, y, objective, n_repeats=5, n_jobs=None, random_state=0):
    """Calculates permutation importance for features.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame): The input data used to score and compute permutation importance
        y (ww.DataColumn, pd.Series): The target data
        objective (str, ObjectiveBase): Objective to score on
        n_repeats (int): Number of times to permute a feature. Defaults to 5.
        n_jobs (int or None): Integer describing level of parallelism used for pipelines.
            None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

    Returns:
        pd.DataFrame: Mean feature importance scores over n_repeats permutations of each feature, sorted in descending order.
    """
    X = _convert_to_woodwork_structure(X)
    y = _convert_to_woodwork_structure(y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    objective = get_objective(objective, return_instance=True)
    if not objective.is_defined_for_problem_type(pipeline.problem_type):
        raise ValueError(f"Given objective '{objective.name}' cannot be used with '{pipeline.name}'")

    def scorer(pipeline, X, y):
        scores = pipeline.score(X, y, objectives=[objective])
        return scores[objective.name] if objective.greater_is_better else -scores[objective.name]

    perm_importance = sk_permutation_importance(pipeline, X, y, n_repeats=n_repeats,
                                                scoring=scorer, n_jobs=n_jobs,
                                                random_state=random_state)
    mean_perm_importance = perm_importance["importances_mean"]
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    feature_names = list(X.columns)
    mean_perm_importance = list(zip(feature_names, mean_perm_importance))
    mean_perm_importance.sort(key=lambda x: x[1], reverse=True)
    return pd.DataFrame(mean_perm_importance, columns=["feature", "importance"])
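# --- Illustration (not library code) -------------------------------------
# A hedged usage sketch for calculate_permutation_importance. `pipeline` is
# assumed fitted on (X, y); the import path and the "Log Loss Binary"
# objective name (a binary-classification objective) are assumptions. The
# result is the sorted dataframe built at the end of the function above.
from evalml.model_understanding import calculate_permutation_importance

def top_features(pipeline, X, y, k=10):
    importance = calculate_permutation_importance(pipeline, X, y,
                                                  objective="Log Loss Binary",
                                                  n_repeats=5)
    return importance.head(k)  # columns: "feature", "importance"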
def roc_curve(y_true, y_pred_proba):
    """
    Given labels and classifier predicted probabilities, compute and return the data
    representing a Receiver Operating Characteristic (ROC) curve. Works with binary
    or multiclass problems.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied.

    Returns:
        list(dict): A list of dictionaries, one for each class. Binary classification problems return a list with one dictionary.
                    Each dictionary contains metrics used to generate an ROC plot with the following keys:

                    * `fpr_rates`: False positive rates.
                    * `tpr_rates`: True positive rates.
                    * `thresholds`: Threshold values used to produce each pair of true/false positive rates.
                    * `auc_score`: The area under the ROC curve.
    """
    y_true = _convert_to_woodwork_structure(y_true)
    y_pred_proba = _convert_to_woodwork_structure(y_pred_proba)
    if isinstance(y_pred_proba, ww.DataTable):
        y_pred_proba = _convert_woodwork_types_wrapper(y_pred_proba.to_dataframe()).to_numpy()
    else:
        y_pred_proba = _convert_woodwork_types_wrapper(y_pred_proba.to_series()).to_numpy()
    y_true = _convert_woodwork_types_wrapper(y_true.to_series()).to_numpy()

    if len(y_pred_proba.shape) == 1:
        y_pred_proba = y_pred_proba.reshape(-1, 1)
    if y_pred_proba.shape[1] == 2:
        y_pred_proba = y_pred_proba[:, 1].reshape(-1, 1)
    nan_indices = np.logical_or(pd.isna(y_true), np.isnan(y_pred_proba).any(axis=1))
    y_true = y_true[~nan_indices]
    y_pred_proba = y_pred_proba[~nan_indices]

    lb = LabelBinarizer()
    lb.fit(np.unique(y_true))
    y_one_hot_true = lb.transform(y_true)
    n_classes = y_one_hot_true.shape[1]

    curve_data = []
    for i in range(n_classes):
        fpr_rates, tpr_rates, thresholds = sklearn_roc_curve(y_one_hot_true[:, i], y_pred_proba[:, i])
        auc_score = sklearn_auc(fpr_rates, tpr_rates)
        curve_data.append({'fpr_rates': fpr_rates,
                           'tpr_rates': tpr_rates,
                           'thresholds': thresholds,
                           'auc_score': auc_score})
    return curve_data
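# --- Illustration (not library code) -------------------------------------
# A hedged usage sketch for roc_curve on a binary problem; the import path is
# an assumption. A binary problem yields a single-element list, per the
# docstring above.
import pandas as pd
from evalml.model_understanding import roc_curve

_y_true = pd.Series([0, 1, 1, 0, 1])
_y_scores = pd.Series([0.2, 0.7, 0.6, 0.3, 0.9])
(_curve,) = roc_curve(_y_true, _y_scores)  # one dict for the positive class
print(_curve["fpr_rates"], _curve["tpr_rates"], _curve["auc_score"])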
def transform(self, X, y=None): """Computes the delayed features for all features in X and y. For each feature in X, it will add a column to the output dataframe for each delay in the (inclusive) range [1, max_delay]. The values of each delayed feature are simply the original feature shifted forward in time by the delay amount. For example, a delay of 3 units means that the feature value at row n will be taken from the n-3rd row of that feature If y is not None, it will also compute the delayed values for the target variable. Arguments: X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used. y (pd.Series, None): Target. Returns: pd.DataFrame: Transformed X. """ if X is None: X = pd.DataFrame() # Normalize the data into pandas objects X = _convert_to_woodwork_structure(X) categorical_columns = self._get_categorical_columns(X) X = _convert_woodwork_types_wrapper(X.to_dataframe()) if self.delay_features and len(X) > 0: X_categorical = self._encode_X_while_preserving_index( X[categorical_columns]) for col_name in X: col = X[col_name] if col_name in categorical_columns: col = X_categorical[col_name] X = X.assign( **{ f"{col_name}_delay_{t}": col.shift(t) for t in range(1, self.max_delay + 1) }) # Handle cases where the target was passed in if self.delay_target and y is not None: y = _convert_to_woodwork_structure(y) if y.logical_type == logical_types.Categorical: y = self._encode_y_while_preserving_index(y) else: y = _convert_woodwork_types_wrapper(y.to_series()) X = X.assign( **{ f"target_delay_{t}": y.shift(t) for t in range(self.start_delay_for_target, self.max_delay + 1) }) return X
def fit(self, X, y):
    X = infer_feature_types(X)
    if not is_all_numeric(X):
        raise ValueError("LDA input must be all numeric")
    y = infer_feature_types(y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())
    n_features = X.shape[1]
    n_classes = y.nunique()
    n_components = self.parameters['n_components']
    if n_components is not None and n_components > min(n_classes, n_features):
        raise ValueError(f"n_components value {n_components} is too large")
    self._component_obj.fit(X, y)
    return self
def transform(self, X, y=None): """Transforms data X by applying the LSA pipeline. Arguments: X (ww.DataTable, pd.DataFrame): The data to transform. y (ww.DataColumn, pd.Series, optional): Ignored. Returns: ww.DataTable: Transformed X. The original column is removed and replaced with two columns of the format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1. """ X_ww = infer_feature_types(X) if len(self._text_columns) == 0: return X_ww X = _convert_woodwork_types_wrapper(X_ww.to_dataframe()) X_t = X.copy() provenance = {} for col in self._text_columns: transformed = self._lsa_pipeline.transform(X[col]) X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0], index=X.index) X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1], index=X.index) provenance[col] = [ 'LSA({})[0]'.format(col), 'LSA({})[1]'.format(col) ] self._provenance = provenance X_t = X_t.drop(columns=self._text_columns) return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
def transform(self, X, y): """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same. Arguments: X (ww.DataTable, pd.DataFrame): Features. Ignored. y (ww.DataColumn, pd.Series): Target data to impute. Returns: (ww.DataTable, ww.DataColumn): The original X, transformed y """ if X is not None: X = infer_feature_types(X) if y is None: return X, None y_ww = infer_feature_types(y) y = _convert_woodwork_types_wrapper(y_ww.to_series()) y_df = y.to_frame() # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool if (y_df.dtypes == bool).all(): return X, _retain_custom_types_and_initalize_woodwork(y_ww, y) transformed = self._component_obj.transform(y_df) if transformed.shape[1] == 0: raise RuntimeError("Transformed data is empty") y_t = pd.Series(transformed[:, 0], index=y.index) return X, _retain_custom_types_and_initalize_woodwork(y_ww, y_t)
def transform(self, X, y=None): """Transforms data X by creating new features using existing text columns Arguments: X (ww.DataTable, pd.DataFrame): The data to transform. y (ww.DataColumn, pd.Series, optional): Ignored. Returns: ww.DataTable: Transformed X """ X_ww = infer_feature_types(X) if self._features is None or len(self._features) == 0: return X_ww X = _convert_woodwork_types_wrapper(X_ww.to_dataframe()) es = self._make_entity_set(X, self._text_columns) X_nlp_primitives = ft.calculate_feature_matrix(features=self._features, entityset=es) if X_nlp_primitives.isnull().any().any(): X_nlp_primitives.fillna(0, inplace=True) X_lsa = self._lsa.transform(X[self._text_columns]).to_dataframe() X_nlp_primitives.set_index(X.index, inplace=True) X_t = pd.concat( [X.drop(self._text_columns, axis=1), X_nlp_primitives, X_lsa], axis=1) return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
def _predict(self, X, y, objective=None, pad=False):
    features = self.compute_estimator_features(X, y)
    features = _convert_woodwork_types_wrapper(features.to_dataframe())
    features_no_nan, y_no_nan = drop_rows_with_nans(features, y)

    if objective is not None:
        objective = get_objective(objective, return_instance=True)
        if not objective.is_defined_for_problem_type(self.problem_type):
            raise ValueError(f"Objective {objective.name} is not defined for time series binary classification.")

    if self.threshold is None:
        predictions = self._estimator_predict(features_no_nan, y_no_nan).to_series()
    else:
        proba = self._estimator_predict_proba(features_no_nan, y_no_nan).to_dataframe()
        proba = proba.iloc[:, 1]
        if objective is None:
            predictions = proba > self.threshold
        else:
            predictions = objective.decision_function(proba, threshold=self.threshold, X=features_no_nan)
    if pad:
        predictions = pad_with_nans(predictions, max(0, features.shape[0] - predictions.shape[0]))
    return infer_feature_types(predictions)
def transform(self, X, y=None): """One-hot encode the input data. Arguments: X (ww.DataTable, pd.DataFrame): Features to one-hot encode. y (ww.DataColumn, pd.Series): Ignored. Returns: ww.DataTable: Transformed data, where each categorical feature has been encoded into numerical columns using one-hot encoding. """ X_ww = infer_feature_types(X) X_copy = _convert_woodwork_types_wrapper(X_ww.to_dataframe()) X_copy = self._handle_parameter_handle_missing(X_copy) X_t = pd.DataFrame() # Add the non-categorical columns, untouched for col in X_copy.columns: if col not in self.features_to_encode: X_t = pd.concat([X_t, X_copy[col]], axis=1) # The call to pd.concat above changes the type of the index so we will manually keep it the same. if not X_t.empty: X_t.index = X_copy.index # Call sklearn's transform on the categorical columns if len(self.features_to_encode) > 0: X_cat = pd.DataFrame(self._encoder.transform( X_copy[self.features_to_encode]).toarray(), index=X_copy.index) X_cat.columns = self._get_feature_names() X_t = pd.concat([X_t, X_cat], axis=1) X_t = X_t.drop(columns=self._features_to_drop) self._feature_names = X_t.columns return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
def transform(self, X, y=None): """Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are treated as the same. Arguments: X (ww.DataTable, pd.DataFrame): Data to transform y (ww.DataColumn, pd.Series, optional): Ignored. Returns: ww.DataTable: Transformed X """ X_ww = infer_feature_types(X) X_null_dropped = _convert_woodwork_types_wrapper(X_ww.to_dataframe()) X_null_dropped.drop(self._all_null_cols, inplace=True, axis=1, errors='ignore') if X_null_dropped.empty: return _retain_custom_types_and_initalize_woodwork( X_ww, X_null_dropped) if self._numeric_cols is not None and len(self._numeric_cols) > 0: X_numeric = X_null_dropped[self._numeric_cols] imputed = self._numeric_imputer.transform(X_numeric).to_dataframe() X_null_dropped[X_numeric.columns] = imputed if self._categorical_cols is not None and len( self._categorical_cols) > 0: X_categorical = X_null_dropped[self._categorical_cols] imputed = self._categorical_imputer.transform( X_categorical).to_dataframe() X_null_dropped[X_categorical.columns] = imputed X_null_dropped = _retain_custom_types_and_initalize_woodwork( X_ww, X_null_dropped) return X_null_dropped
def test_convert_woodwork_types_wrapper_dataframe():
    X = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="Int64"),
                      "Int array": pd.array([1, 2, 3], dtype="Int64"),
                      "Int series with nan": pd.Series([1, 2, None], dtype="Int64"),
                      "Int array with nan": pd.array([1, 2, None], dtype="Int64"),
                      "string series": pd.Series(["a", "b", "a"], dtype="string"),
                      "string array": pd.array(["a", "b", "a"], dtype="string"),
                      "string series with nan": pd.Series(["a", "b", None], dtype="string"),
                      "string array with nan": pd.array(["a", "b", None], dtype="string"),
                      "boolean series": pd.Series([True, False, True], dtype="boolean"),
                      "boolean array": pd.array([True, False, True], dtype="boolean"),
                      "boolean series with nan": pd.Series([True, False, None], dtype="boolean"),
                      "boolean array with nan": pd.array([True, False, None], dtype="boolean")})
    X_expected = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="int64"),
                               "Int array": pd.array([1, 2, 3], dtype="int64"),
                               "Int series with nan": pd.Series([1, 2, np.nan], dtype="float64"),
                               "Int array with nan": pd.array([1, 2, np.nan], dtype="float64"),
                               "string series": pd.Series(["a", "b", "a"], dtype="object"),
                               "string array": pd.array(["a", "b", "a"], dtype="object"),
                               "string series with nan": pd.Series(["a", "b", np.nan], dtype="object"),
                               "string array with nan": pd.array(["a", "b", np.nan], dtype="object"),
                               "boolean series": pd.Series([True, False, True], dtype="bool"),
                               "boolean array": pd.array([True, False, True], dtype="bool"),
                               "boolean series with nan": pd.Series([True, False, np.nan], dtype="object"),
                               "boolean array with nan": pd.array([True, False, np.nan], dtype="object")})
    pd.testing.assert_frame_equal(X_expected, _convert_woodwork_types_wrapper(X))
def fit(self, X, y=None):
    X = infer_feature_types(X)
    if not is_all_numeric(X):
        raise ValueError("PCA input must be all numeric")
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    self._component_obj.fit(X)
    return self
def transform(self, X, y=None):
    self._provenance = {col: [f"{col}_doubled"] for col in X.columns}
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    new_X = X.assign(**{f"{col}_doubled": 2 * X.loc[:, col] for col in X.columns})
    if self.drop_old_columns:
        new_X = new_X.drop(columns=X.columns)
    return _convert_to_woodwork_structure(new_X)
def transform(self, X, y=None): """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same. Arguments: X (ww.DataTable, pd.DataFrame): Data to transform y (ww.DataColumn, pd.Series, optional): Ignored. Returns: ww.DataTable: Transformed X """ X_ww = infer_feature_types(X) X = _convert_woodwork_types_wrapper(X_ww.to_dataframe()) # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool if (X.dtypes == bool).all(): return infer_feature_types(X) X_null_dropped = X.copy() X_null_dropped.drop(self._all_null_cols, axis=1, errors='ignore', inplace=True) X_t = self._component_obj.transform(X) if X_null_dropped.empty: X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns) return infer_feature_types(X_t) X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns) X_t.index = X_null_dropped.index return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
def test_more_top_n_unique_values_large():
    X = pd.DataFrame({"col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
                      "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
                      "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
                      "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1]})
    random_seed = 2

    encoder = OneHotEncoder(top_n=3, random_seed=random_seed)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # Conversion changes the resulting dataframe dtype, resulting in a different
    # random state, so we need to make the conversion here too
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
    col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
    col_1_counts = col_1_counts.sort_values(["col_1"], ascending=False, kind='mergesort')
    col_1_samples = col_1_counts.head(encoder.parameters['top_n']).index.tolist()

    expected_col_names = set(["col_2_a", "col_2_b", "col_2_c",
                              "col_3_a", "col_3_b", "col_3_c", "col_4"])
    for val in col_1_samples:
        expected_col_names.add("col_1_" + val)

    col_names = set(X_t.columns)
    assert col_names == expected_col_names
def fit(self, X, y=None):
    X_encoded = self._encode_categories(X, fit=True)
    if y is not None:
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())
    self._component_obj.fit(X_encoded, y)
    return self