Example #1
    def predict(self, X, y=None, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, np.ndarray, None): The target training data of length [n_samples]
            objective (Object or string): The objective to use to make predictions

        Returns:
            ww.DataColumn: Predicted values.
        """
        if X is None:
            X = pd.DataFrame()
        X = infer_feature_types(X)
        y = infer_feature_types(y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        features = self.compute_estimator_features(X, y)
        features = _convert_woodwork_types_wrapper(features.to_dataframe())
        features_no_nan, y = drop_rows_with_nans(features, y)
        y_arg = None
        if self.estimator.predict_uses_y:
            y_arg = y
        predictions = self.estimator.predict(features_no_nan,
                                             y_arg).to_series()
        predictions = predictions.rename(self.input_target_name)
        padded = pad_with_nans(
            predictions, max(0, features.shape[0] - predictions.shape[0]))
        return infer_feature_types(padded)
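The drop-then-pad pattern above keeps the output aligned with the raw input: rows whose computed features contain NaNs are dropped before prediction, and the missing positions are padded back afterwards so the result has the input's length. A minimal sketch with stand-in helpers (these are not the library's own implementations, just an illustration of the idea):

import numpy as np
import pandas as pd

def drop_rows_with_nans_sketch(features, target):
    # Keep only rows where every feature and the target are non-null.
    mask = features.notnull().all(axis=1) & target.notnull()
    return features[mask], target[mask]

def pad_with_nans_sketch(series, num_to_pad):
    # Prepend num_to_pad NaN entries so the result matches the input length.
    padding = pd.Series([np.nan] * num_to_pad)
    return pd.concat([padding, series], ignore_index=True)

features = pd.DataFrame({"f_delay_1": [np.nan, 1.0, 2.0]})
target = pd.Series([0.0, 1.0, 2.0])
features_no_nan, _ = drop_rows_with_nans_sketch(features, target)
predictions = pd.Series([1.5, 2.5])  # pretend estimator output for the clean rows
padded = pad_with_nans_sketch(predictions, max(0, features.shape[0] - predictions.shape[0]))
assert len(padded) == len(features)  # same length as the original input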
Example #2
    def predict(self, X, y=None, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, np.ndarray, None): The target training data of length [n_samples]
            objective (Object or string): The objective to use to make predictions

        Returns:
            ww.DataColumn: Predicted values.
        """
        X, y = self._convert_to_woodwork(X, y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        y = self._encode_targets(y)
        n_features = max(len(y), X.shape[0])  # number of rows to pad the predictions back to
        predictions = self._predict(X, y, objective=objective, pad=False)
        predictions = _convert_woodwork_types_wrapper(predictions.to_series())
        # In case gap is 0 and this is a baseline pipeline, we drop the nans in the
        # predictions before decoding them
        predictions = pd.Series(self._decode_targets(predictions.dropna()),
                                name=self.input_target_name)
        padded = pad_with_nans(predictions,
                               max(0, n_features - predictions.shape[0]))
        return infer_feature_types(padded)
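The _encode_targets/_decode_targets pair used above is pipeline-internal; a rough stand-in using scikit-learn's LabelEncoder shows the round trip this predict method performs (the flow here is an assumption for illustration, not the library's implementation):

from sklearn.preprocessing import LabelEncoder
import pandas as pd

y = pd.Series(["cat", "dog", "cat"])
encoder = LabelEncoder()
y_encoded = pd.Series(encoder.fit_transform(y))  # 0, 1, 0 for the estimator
raw_predictions = pd.Series([1, 0, 0])           # pretend estimator output
decoded = pd.Series(encoder.inverse_transform(raw_predictions))  # "dog", "cat", "cat"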
Example #3
def explain_prediction(pipeline, input_features, top_k=3, training_data=None, include_shap_values=False,
                       output_format="text"):
    """Creates table summarizing the top_k positive and top_k negative contributing features to the prediction of a single datapoint.

    XGBoost models and CatBoost multiclass classifiers are not currently supported.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (ww.DataTable, pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        top_k (int): How many of the highest/lowest features to include in the table.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            This is required for non-tree estimators because we need a sample of training data for the KernelSHAP algorithm.
        include_shap_values (bool): Whether the SHAP values should be included in an extra column in the output.
            Default is False.
        output_format (str): Either "text", "dict", or "dataframe". Default is "text".

    Returns:
        str or dict: A report explaining the most positive/negative contributing features to the predictions.
    """
    input_features = _convert_to_woodwork_structure(input_features)
    if not (isinstance(input_features, ww.DataTable) and input_features.shape[0] == 1):
        raise ValueError("features must be stored in a dataframe or datatable with exactly one row.")
    input_features = _convert_woodwork_types_wrapper(input_features.to_dataframe())
    if training_data is not None:
        training_data = _convert_to_woodwork_structure(training_data)
        training_data = _convert_woodwork_types_wrapper(training_data.to_dataframe())

    if output_format not in {"text", "dict", "dataframe"}:
        raise ValueError(f"Parameter output_format must be either text, dict, or dataframe. Received {output_format}")
    return _make_single_prediction_shap_table(pipeline, input_features, top_k, training_data, include_shap_values,
                                              output_format=output_format)
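A hedged usage sketch (`pipeline` is assumed to be a fitted EvalML pipeline and `X` a feature DataFrame; neither is defined here). The one-row requirement is the part that trips callers up:

single_row = X.iloc[[0]]  # double brackets keep a one-row DataFrame, not a Series
report = explain_prediction(pipeline, input_features=single_row,
                            top_k=3, include_shap_values=False,
                            output_format="text")
print(report)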
Example #4
    def score(self, X, y, objectives):
        """Evaluate model performance on current and additional objectives.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (pd.Series, ww.DataColumn): True labels of length [n_samples]
            objectives (list): Non-empty list of objectives to score on

        Returns:
            dict: Ordered dictionary of objective scores
        """
        # Only converting X for the call to _score_all_objectives
        if X is None:
            X = pd.DataFrame()
        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        y_predicted = self.predict(X, y)
        y_predicted = _convert_woodwork_types_wrapper(y_predicted.to_series())

        y_shifted = y.shift(-self.gap)
        objectives = self.create_objectives(objectives)
        y_shifted, y_predicted = drop_rows_with_nans(y_shifted, y_predicted)
        return self._score_all_objectives(X,
                                          y_shifted,
                                          y_predicted,
                                          y_pred_proba=None,
                                          objectives=objectives)
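The `y.shift(-self.gap)` step aligns each prediction with the target it is actually forecasting: with a gap of 1, the prediction made from row t's features is scored against the target observed at row t+1. A small illustration of the shift itself:

import pandas as pd

y = pd.Series([10, 11, 12, 13])
gap = 1
print(y.shift(-gap))  # 11.0, 12.0, 13.0, NaN; trailing NaN rows are dropped before scoring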
Example #5
    def score(self, X, y, objectives):
        """Evaluate model performance on current and additional objectives.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series): True labels of length [n_samples]
            objectives (list): Non-empty list of objectives to score on

        Returns:
            dict: Ordered dictionary of objective scores
        """
        X, y = self._convert_to_woodwork(X, y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        objectives = [
            get_objective(o, return_instance=True) for o in objectives
        ]

        y_encoded = self._encode_targets(y)
        y_shifted = y_encoded.shift(-self.gap)
        y_predicted, y_predicted_proba = self._compute_predictions(
            X, y, objectives, time_series=True)
        if y_predicted is not None:
            y_predicted = _convert_woodwork_types_wrapper(
                y_predicted.to_series())
        if y_predicted_proba is not None:
            y_predicted_proba = _convert_woodwork_types_wrapper(
                y_predicted_proba.to_dataframe())
        y_shifted, y_predicted, y_predicted_proba = drop_rows_with_nans(
            y_shifted, y_predicted, y_predicted_proba)
        return self._score_all_objectives(X,
                                          y_shifted,
                                          y_predicted,
                                          y_pred_proba=y_predicted_proba,
                                          objectives=objectives)
Example #6
    def validate(self, X, y):
        """Check if the target or any of the features have no variance (1 unique value).

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
            y (ww.DataColumn, pd.Series, np.ndarray): The target data.

        Returns:
            dict: dict of warnings/errors corresponding to features or target with no variance.
        """
        results = {
            "warnings": [],
            "errors": [],
            "actions": []
        }

        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        unique_counts = X.nunique(dropna=self._dropnan).to_dict()
        any_nulls = (X.isnull().any()).to_dict()
        for name in unique_counts:
            message = self._check_for_errors(name, unique_counts[name], any_nulls[name])
            if not message:
                continue
            DataCheck._add_message(message, results)
        y_name = getattr(y, "name")
        if not y_name:
            y_name = "Y"
        target_message = self._check_for_errors(y_name, y.nunique(dropna=self._dropnan), y.isnull().any())
        if target_message:
            DataCheck._add_message(target_message, results)
        return results
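The heart of the check is `nunique(dropna=self._dropnan)`: a column with a single unique value carries no signal, and `_dropnan` controls whether NaN counts as its own value. For example:

import pandas as pd

col = pd.Series([7, 7, None, 7])
print(col.nunique(dropna=True))   # 1 -> the column would be flagged
print(col.nunique(dropna=False))  # 2 -> NaN counts as a second value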
Example #7
    def score(self, X, y, objectives):
        """Evaluate model performance on objectives.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, or np.ndarray): True labels of length [n_samples]
            objectives (list): List of objectives to score

        Returns:
            dict: Ordered dictionary of objective scores
        """
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())
        objectives = self.create_objectives(objectives)
        y = self._encode_targets(y)
        y_predicted, y_predicted_proba = self._compute_predictions(
            X, y, objectives)
        if y_predicted is not None:
            y_predicted = _convert_woodwork_types_wrapper(
                y_predicted.to_series())
        if y_predicted_proba is not None:
            y_predicted_proba = _convert_woodwork_types_wrapper(
                y_predicted_proba.to_dataframe())
        return self._score_all_objectives(X, y, y_predicted, y_predicted_proba,
                                          objectives)
Example #8
    def fit(self, X, y):
        """Fit a time series regression pipeline.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, np.ndarray): The target training data of length [n_samples]

        Returns:
            self
        """
        if X is None:
            X = pd.DataFrame()

        X = infer_feature_types(X)
        y = infer_feature_types(y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        X_t = self._compute_features_during_fit(X, y)
        X_t = X_t.to_dataframe()

        y_shifted = y.shift(-self.gap)
        X_t, y_shifted = drop_rows_with_nans(X_t, y_shifted)
        self.estimator.fit(X_t, y_shifted)
        self.input_feature_names = self._component_graph.input_feature_names

        return self
Example #9
def precision_recall_curve(y_true, y_pred_proba):
    """
    Given labels and binary classifier predicted probabilities, compute and return the data representing a precision-recall curve.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True binary labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a binary classifier, before thresholding has been applied. Note this should be the predicted probability for the "true" label.

    Returns:
        dict: Dictionary containing metrics used to generate a precision-recall plot, with the following keys:

                  * `precision`: Precision values.
                  * `recall`: Recall values.
                  * `thresholds`: Threshold values used to produce the precision and recall.
                  * `auc_score`: The area under the precision-recall curve.
    """
    y_true = _convert_to_woodwork_structure(y_true)
    y_pred_proba = _convert_to_woodwork_structure(y_pred_proba)
    y_true = _convert_woodwork_types_wrapper(y_true.to_series())
    y_pred_proba = _convert_woodwork_types_wrapper(y_pred_proba.to_series())

    precision, recall, thresholds = sklearn_precision_recall_curve(
        y_true, y_pred_proba)
    auc_score = sklearn_auc(recall, precision)
    return {
        'precision': precision,
        'recall': recall,
        'thresholds': thresholds,
        'auc_score': auc_score
    }
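A hedged usage sketch with toy inputs (assumes `precision_recall_curve` has been imported from the library; the exact module path is not shown in this snippet):

import numpy as np

y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])  # probability of the "true" label
curve = precision_recall_curve(y_true, y_scores)
print(curve["auc_score"])  # area under the precision-recall curve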
Example #10
    def _compute_features(self, component_list, X, y=None, fit=False):
        """Transforms the data by applying the given components.

        Arguments:
            component_list (list): The list of component names to compute.
            X (ww.DataTable, pd.DataFrame): Input data to the pipeline to transform.
            y (ww.DataColumn, pd.Series): The target training data of length [n_samples]
            fit (bool): Whether to fit the components as well as transform the data.
                        Defaults to False.

        Returns:
            dict: Outputs from each component
        """
        X = infer_feature_types(X)
        if len(component_list) == 0:
            return X
        output_cache = {}
        for component_name in component_list:
            component_instance = self.get_component(component_name)
            if not isinstance(component_instance, ComponentBase):
                raise ValueError('All components must be instantiated before fitting or predicting')
            x_inputs = []
            y_input = None
            for parent_input in self.get_parents(component_name):
                if parent_input[-2:] == '.y':
                    if y_input is not None:
                        raise ValueError(f'Cannot have multiple `y` parents for a single component {component_name}')
                    y_input = output_cache[parent_input]
                else:
                    parent_x = output_cache.get(parent_input, output_cache.get(f'{parent_input}.x'))
                    if isinstance(parent_x, ww.DataTable):
                        parent_x = _convert_woodwork_types_wrapper(parent_x.to_dataframe())
                    elif isinstance(parent_x, ww.DataColumn):
                        parent_x = pd.Series(_convert_woodwork_types_wrapper(parent_x.to_series()), name=parent_input)
                    x_inputs.append(parent_x)
            input_x, input_y = self._consolidate_inputs(x_inputs, y_input, X, y)
            self.input_feature_names.update({component_name: list(input_x.columns)})

            if isinstance(component_instance, Transformer):
                if fit:
                    output = component_instance.fit_transform(input_x, input_y)
                else:
                    output = component_instance.transform(input_x, input_y)
                if isinstance(output, tuple):
                    output_x, output_y = output[0], output[1]
                else:
                    output_x = output
                    output_y = None
                output_cache[f"{component_name}.x"] = output_x
                output_cache[f"{component_name}.y"] = output_y
            else:
                if fit:
                    component_instance.fit(input_x, input_y)
                if not (fit and component_name == self.compute_order[-1]):  # Don't call predict on the final component during fit
                    output = component_instance.predict(input_x)
                else:
                    output = None
                output_cache[component_name] = output
        return output_cache
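The output cache keys encode the graph's edge convention: a transformer contributes both "<name>.x" and "<name>.y" entries, an estimator contributes a single "<name>" entry, and a parent reference ending in ".y" marks a target edge. A schematic of what the cache might hold after one transformer and one estimator (component names here are illustrative only):

output_cache = {
    "Imputer.x": "transformed features",        # Transformer output
    "Imputer.y": None,                          # no target transformation happened
    "Random Forest Classifier": "predictions",  # Estimator output
}
parent_input = "Imputer.y"
is_target_edge = parent_input[-2:] == ".y"  # True: routed to y_input, not x_inputs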
Example #11
    def _manage_woodwork(self, X, y=None):
        """Function to convert the input and target data to Pandas data structures."""
        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        if y is not None:
            y = infer_feature_types(y)
            y = _convert_woodwork_types_wrapper(y.to_series())
        return X, y
Example #12
    def transform(self, X, y=None):
        X_ww = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        if y is not None:
            y = infer_feature_types(y)
            y = _convert_woodwork_types_wrapper(y.to_series())
        X_t = self._component_obj.transform(X, y)
        # The wrapped transformer returns a plain ndarray; restore columns/index.
        X_t_df = pd.DataFrame(X_t, columns=X.columns, index=X.index)
        return _retain_custom_types_and_initalize_woodwork(
            X_ww, X_t_df, ltypes_to_ignore=[Categorical])
Example #13
def explain_predictions(pipeline,
                        input_features,
                        training_data=None,
                        top_k_features=3,
                        include_shap_values=False,
                        output_format="text"):
    """Creates a report summarizing the top contributing features for each data point in the input features.

    XGBoost models and CatBoost multiclass classifiers are not currently supported.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (ww.DataTable, pd.DataFrame): Dataframe of input data to evaluate the pipeline on.
        training_data (ww.DataTable, pd.DataFrame): Dataframe of data the pipeline was fit on. This can be omitted for pipelines
            with tree-based estimators.
        top_k_features (int): How many of the highest/lowest contributing feature to include in the table for each
            data point.
        include_shap_values (bool): Whether SHAP values should be included in the table. Default is False.
        output_format (str): Either "text" or "dict". Default is "text".

    Returns:
        str or dict: A report explaining the top contributing features to each prediction for each row of input_features.
            The report will include the feature names, prediction contribution, and SHAP Value (optional).
    """
    input_features = _convert_to_woodwork_structure(input_features)
    input_features = _convert_woodwork_types_wrapper(
        input_features.to_dataframe())
    if training_data is not None:
        training_data = _convert_to_woodwork_structure(training_data)
        training_data = _convert_woodwork_types_wrapper(
            training_data.to_dataframe())

    if input_features.empty:
        raise ValueError(
            "Parameter input_features must be a non-empty dataframe.")
    if output_format not in {"text", "dict"}:
        raise ValueError(
            f"Parameter output_format must be either text or dict. Received {output_format}"
        )
    data = _ReportData(pipeline,
                       input_features,
                       y_true=None,
                       y_pred=None,
                       y_pred_values=None,
                       errors=None,
                       index_list=range(input_features.shape[0]),
                       metric=None)

    report_creator = _report_creator_factory(
        data,
        report_type="explain_predictions",
        output_format=output_format,
        top_k_features=top_k_features,
        include_shap_values=include_shap_values)
    return report_creator(data)
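A hedged usage sketch (`pipeline` and `X` are assumed to exist and be fitted/valid; unlike `explain_prediction` above, this accepts any non-empty number of rows):

report = explain_predictions(pipeline, input_features=X,
                             top_k_features=3, output_format="dict")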
Example #14
    def fit_transform(self, X, y=None):
        X_ww = infer_feature_types(X)
        if not is_all_numeric(X_ww):
            raise ValueError("LDA input must be all numeric")
        y = infer_feature_types(y)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())

        X_t = self._component_obj.fit_transform(X, y)
        X_t = pd.DataFrame(X_t, index=X.index, columns=[f"component_{i}" for i in range(X_t.shape[1])])
        return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
Example #15
def calculate_permutation_importance(pipeline,
                                     X,
                                     y,
                                     objective,
                                     n_repeats=5,
                                     n_jobs=None,
                                     random_state=0):
    """Calculates permutation importance for features.

    Arguments:
        pipeline (PipelineBase or subclass): Fitted pipeline
        X (ww.DataTable, pd.DataFrame): The input data used to score and compute permutation importance
        y (ww.DataColumn, pd.Series): The target data
        objective (str, ObjectiveBase): Objective to score on
        n_repeats (int): Number of times to permute a feature. Defaults to 5.
        n_jobs (int or None): Integer describing the level of parallelism used for pipelines.
            None and 1 are equivalent. If set to -1, all CPUs are used. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

    Returns:
        pd.DataFrame: Mean feature importance scores over `n_repeats` shuffles, one row per feature.
    """
    X = _convert_to_woodwork_structure(X)
    y = _convert_to_woodwork_structure(y)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    y = _convert_woodwork_types_wrapper(y.to_series())

    objective = get_objective(objective, return_instance=True)
    if not objective.is_defined_for_problem_type(pipeline.problem_type):
        raise ValueError(
            f"Given objective '{objective.name}' cannot be used with '{pipeline.name}'"
        )

    def scorer(pipeline, X, y):
        scores = pipeline.score(X, y, objectives=[objective])
        return scores[
            objective.
            name] if objective.greater_is_better else -scores[objective.name]

    perm_importance = sk_permutation_importance(pipeline,
                                                X,
                                                y,
                                                n_repeats=n_repeats,
                                                scoring=scorer,
                                                n_jobs=n_jobs,
                                                random_state=random_state)
    mean_perm_importance = perm_importance["importances_mean"]
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    feature_names = list(X.columns)
    mean_perm_importance = list(zip(feature_names, mean_perm_importance))
    mean_perm_importance.sort(key=lambda x: x[1], reverse=True)
    return pd.DataFrame(mean_perm_importance,
                        columns=["feature", "importance"])
Example #16
def roc_curve(y_true, y_pred_proba):
    """
    Given labels and classifier predicted probabilities, compute and return the data representing a Receiver Operating Characteristic (ROC) curve. Works with binary or multiclass problems.

    Arguments:
        y_true (ww.DataColumn, pd.Series or np.ndarray): True labels.
        y_pred_proba (ww.DataColumn, pd.Series or np.ndarray): Predictions from a classifier, before thresholding has been applied.

    Returns:
        list(dict): A list of dictionaries (with one for each class) is returned. Binary classification problems return a list with one dictionary.
            Each dictionary contains metrics used to generate an ROC plot with the following keys:
                  * `fpr_rates`: False positive rates.
                  * `tpr_rates`: True positive rates.
                  * `thresholds`: Threshold values used to produce each pair of true/false positive rates.
                  * `auc_score`: The area under the ROC curve.
    """
    y_true = _convert_to_woodwork_structure(y_true)
    y_pred_proba = _convert_to_woodwork_structure(y_pred_proba)
    if isinstance(y_pred_proba, ww.DataTable):
        y_pred_proba = _convert_woodwork_types_wrapper(
            y_pred_proba.to_dataframe()).to_numpy()
    else:
        y_pred_proba = _convert_woodwork_types_wrapper(
            y_pred_proba.to_series()).to_numpy()
    y_true = _convert_woodwork_types_wrapper(y_true.to_series()).to_numpy()

    if len(y_pred_proba.shape) == 1:
        y_pred_proba = y_pred_proba.reshape(-1, 1)
    if y_pred_proba.shape[1] == 2:
        y_pred_proba = y_pred_proba[:, 1].reshape(-1, 1)
    nan_indices = np.logical_or(pd.isna(y_true),
                                np.isnan(y_pred_proba).any(axis=1))
    y_true = y_true[~nan_indices]
    y_pred_proba = y_pred_proba[~nan_indices]

    lb = LabelBinarizer()
    lb.fit(np.unique(y_true))
    y_one_hot_true = lb.transform(y_true)
    n_classes = y_one_hot_true.shape[1]

    curve_data = []
    for i in range(n_classes):
        fpr_rates, tpr_rates, thresholds = sklearn_roc_curve(
            y_one_hot_true[:, i], y_pred_proba[:, i])
        auc_score = sklearn_auc(fpr_rates, tpr_rates)
        curve_data.append({
            'fpr_rates': fpr_rates,
            'tpr_rates': tpr_rates,
            'thresholds': thresholds,
            'auc_score': auc_score
        })

    return curve_data
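The reshaping above normalizes every input to a column-per-class matrix: a 1-D score vector becomes a single column, and a two-column binary matrix keeps only the positive class, so the per-class loop handles binary and multiclass uniformly. In isolation:

import numpy as np

proba = np.array([[0.2, 0.8], [0.9, 0.1]])  # binary, two columns
proba = proba[:, 1].reshape(-1, 1)          # -> [[0.8], [0.1]], one curve computed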
Example #17
    def transform(self, X, y=None):
        """Computes the delayed features for all features in X and y.

        For each feature in X, it will add a column to the output dataframe for each
        delay in the (inclusive) range [1, max_delay]. The values of each delayed feature are simply the original
        feature shifted forward in time by the delay amount. For example, a delay of 3 units means that the feature
        value at row n will be taken from the (n-3)th row of that feature.

        If y is not None, it will also compute the delayed values for the target variable.

        Arguments:
            X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
            y (pd.Series, None): Target.

        Returns:
            pd.DataFrame: Transformed X.
        """
        if X is None:
            X = pd.DataFrame()
        # Normalize the data into pandas objects
        X = _convert_to_woodwork_structure(X)

        categorical_columns = self._get_categorical_columns(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        if self.delay_features and len(X) > 0:
            X_categorical = self._encode_X_while_preserving_index(
                X[categorical_columns])
            for col_name in X:
                col = X[col_name]
                if col_name in categorical_columns:
                    col = X_categorical[col_name]
                X = X.assign(
                    **{
                        f"{col_name}_delay_{t}": col.shift(t)
                        for t in range(1, self.max_delay + 1)
                    })

        # Handle cases where the target was passed in
        if self.delay_target and y is not None:
            y = _convert_to_woodwork_structure(y)
            if y.logical_type == logical_types.Categorical:
                y = self._encode_y_while_preserving_index(y)
            else:
                y = _convert_woodwork_types_wrapper(y.to_series())
            X = X.assign(
                **{
                    f"target_delay_{t}": y.shift(t)
                    for t in range(self.start_delay_for_target,
                                   self.max_delay + 1)
                })

        return X
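The `col.shift(t)` call is what produces each delayed column: values move forward t rows, leaving NaNs at the start. For instance:

import pandas as pd

col = pd.Series([1, 2, 3, 4])
print(col.shift(3))  # NaN, NaN, NaN, 1.0 -> row n holds the value from row n-3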
Example #18
    def fit(self, X, y):
        X = infer_feature_types(X)
        if not is_all_numeric(X):
            raise ValueError("LDA input must be all numeric")
        y = infer_feature_types(y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        n_features = X.shape[1]
        n_classes = y.nunique()
        n_components = self.parameters['n_components']
        if n_components is not None and n_components > min(n_classes, n_features):
            raise ValueError(f"n_components value {n_components} is too large")

        self._component_obj.fit(X, y)
        return self
Example #19
    def transform(self, X, y=None):
        """Transforms data X by applying the LSA pipeline.

        Arguments:
            X (ww.DataTable, pd.DataFrame): The data to transform.
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X. The original column is removed and replaced with two columns of the
                          format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1.
        """
        X_ww = infer_feature_types(X)
        if len(self._text_columns) == 0:
            return X_ww

        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        X_t = X.copy()
        provenance = {}
        for col in self._text_columns:
            transformed = self._lsa_pipeline.transform(X[col])
            X_t['LSA({})[0]'.format(col)] = pd.Series(transformed[:, 0],
                                                      index=X.index)
            X_t['LSA({})[1]'.format(col)] = pd.Series(transformed[:, 1],
                                                      index=X.index)
            provenance[col] = [
                'LSA({})[0]'.format(col), 'LSA({})[1]'.format(col)
            ]
        self._provenance = provenance

        X_t = X_t.drop(columns=self._text_columns)
        return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
Example #20
    def transform(self, X, y):
        """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Features. Ignored.
            y (ww.DataColumn, pd.Series): Target data to impute.

        Returns:
            (ww.DataTable, ww.DataColumn): The original X, transformed y
        """

        if X is not None:
            X = infer_feature_types(X)
        if y is None:
            return X, None
        y_ww = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y_ww.to_series())
        y_df = y.to_frame()

        # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
        if (y_df.dtypes == bool).all():
            return X, _retain_custom_types_and_initalize_woodwork(y_ww, y)

        transformed = self._component_obj.transform(y_df)
        if transformed.shape[1] == 0:
            raise RuntimeError("Transformed data is empty")
        y_t = pd.Series(transformed[:, 0], index=y.index)
        return X, _retain_custom_types_and_initalize_woodwork(y_ww, y_t)
Example #21
    def transform(self, X, y=None):
        """Transforms data X by creating new features using existing text columns

        Arguments:
            X (ww.DataTable, pd.DataFrame): The data to transform.
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        if self._features is None or len(self._features) == 0:
            return X_ww
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        es = self._make_entity_set(X, self._text_columns)
        X_nlp_primitives = ft.calculate_feature_matrix(features=self._features,
                                                       entityset=es)
        if X_nlp_primitives.isnull().any().any():
            X_nlp_primitives.fillna(0, inplace=True)

        X_lsa = self._lsa.transform(X[self._text_columns]).to_dataframe()
        X_nlp_primitives.set_index(X.index, inplace=True)
        X_t = pd.concat(
            [X.drop(self._text_columns, axis=1), X_nlp_primitives, X_lsa],
            axis=1)
        return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
Example #22
    def _predict(self, X, y, objective=None, pad=False):
        features = self.compute_estimator_features(X, y)
        features = _convert_woodwork_types_wrapper(features.to_dataframe())
        features_no_nan, y_no_nan = drop_rows_with_nans(features, y)

        if objective is not None:
            objective = get_objective(objective, return_instance=True)
            if not objective.is_defined_for_problem_type(self.problem_type):
                raise ValueError(
                    f"Objective {objective.name} is not defined for time series binary classification."
                )

        if self.threshold is None:
            predictions = self._estimator_predict(features_no_nan,
                                                  y_no_nan).to_series()
        else:
            proba = self._estimator_predict_proba(features_no_nan,
                                                  y_no_nan).to_dataframe()
            proba = proba.iloc[:, 1]
            if objective is None:
                predictions = proba > self.threshold
            else:
                predictions = objective.decision_function(
                    proba, threshold=self.threshold, X=features_no_nan)
        if pad:
            predictions = pad_with_nans(
                predictions, max(0, features.shape[0] - predictions.shape[0]))
        return infer_feature_types(predictions)
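The threshold branch reduces to a simple comparison when no objective supplies a decision function: positive-class probabilities against `self.threshold`. A toy illustration:

import pandas as pd

proba = pd.Series([0.2, 0.7, 0.55])  # stand-in for the positive-class column
threshold = 0.5
predictions = proba > threshold      # False, True, True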
Example #23
    def transform(self, X, y=None):
        """One-hot encode the input data.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Features to one-hot encode.
            y (ww.DataColumn, pd.Series): Ignored.

        Returns:
            ww.DataTable: Transformed data, where each categorical feature has been encoded into numerical columns using one-hot encoding.
        """
        X_ww = infer_feature_types(X)
        X_copy = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        X_copy = self._handle_parameter_handle_missing(X_copy)

        X_t = pd.DataFrame()
        # Add the non-categorical columns, untouched
        for col in X_copy.columns:
            if col not in self.features_to_encode:
                X_t = pd.concat([X_t, X_copy[col]], axis=1)
        # The call to pd.concat above changes the type of the index so we will manually keep it the same.
        if not X_t.empty:
            X_t.index = X_copy.index

        # Call sklearn's transform on the categorical columns
        if len(self.features_to_encode) > 0:
            X_cat = pd.DataFrame(self._encoder.transform(
                X_copy[self.features_to_encode]).toarray(),
                                 index=X_copy.index)
            X_cat.columns = self._get_feature_names()
            X_t = pd.concat([X_t, X_cat], axis=1)
            X_t = X_t.drop(columns=self._features_to_drop)
            self._feature_names = X_t.columns

        return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
Example #24
    def transform(self, X, y=None):
        """Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are
            treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        X_null_dropped = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        X_null_dropped.drop(self._all_null_cols,
                            inplace=True,
                            axis=1,
                            errors='ignore')
        if X_null_dropped.empty:
            return _retain_custom_types_and_initalize_woodwork(
                X_ww, X_null_dropped)

        if self._numeric_cols is not None and len(self._numeric_cols) > 0:
            X_numeric = X_null_dropped[self._numeric_cols]
            imputed = self._numeric_imputer.transform(X_numeric).to_dataframe()
            X_null_dropped[X_numeric.columns] = imputed

        if self._categorical_cols is not None and len(
                self._categorical_cols) > 0:
            X_categorical = X_null_dropped[self._categorical_cols]
            imputed = self._categorical_imputer.transform(
                X_categorical).to_dataframe()
            X_null_dropped[X_categorical.columns] = imputed
        X_null_dropped = _retain_custom_types_and_initalize_woodwork(
            X_ww, X_null_dropped)
        return X_null_dropped
Example #25
def test_convert_woodwork_types_wrapper_dataframe():
    X = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="Int64"),
                      "Int array": pd.array([1, 2, 3], dtype="Int64"),
                      "Int series with nan": pd.Series([1, 2, None], dtype="Int64"),
                      "Int array with nan": pd.array([1, 2, None], dtype="Int64"),
                      "string series": pd.Series(["a", "b", "a"], dtype="string"),
                      "string array": pd.array(["a", "b", "a"], dtype="string"),
                      "string series with nan": pd.Series(["a", "b", None], dtype="string"),
                      "string array with nan": pd.array(["a", "b", None], dtype="string"),
                      "boolean series": pd.Series([True, False, True], dtype="boolean"),
                      "boolean array": pd.array([True, False, True], dtype="boolean"),
                      "boolean series with nan": pd.Series([True, False, None], dtype="boolean"),
                      "boolean array with nan": pd.array([True, False, None], dtype="boolean")
                      })
    X_expected = pd.DataFrame({"Int series": pd.Series([1, 2, 3], dtype="int64"),
                               "Int array": pd.array([1, 2, 3], dtype="int64"),
                               "Int series with nan": pd.Series([1, 2, np.nan], dtype="float64"),
                               "Int array with nan": pd.array([1, 2, np.nan], dtype="float64"),
                               "string series": pd.Series(["a", "b", "a"], dtype="object"),
                               "string array": pd.array(["a", "b", "a"], dtype="object"),
                               "string series with nan": pd.Series(["a", "b", np.nan], dtype="object"),
                               "string array with nan": pd.array(["a", "b", np.nan], dtype="object"),
                               "boolean series": pd.Series([True, False, True], dtype="bool"),
                               "boolean array": pd.array([True, False, True], dtype="bool"),
                               "boolean series with nan": pd.Series([True, False, np.nan], dtype="object"),
                               "boolean array with nan": pd.array([True, False, np.nan], dtype="object")
                               })
    pd.testing.assert_frame_equal(X_expected, _convert_woodwork_types_wrapper(X))
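The expected frame mirrors plain-pandas constraints: an integer column with a missing value has no int64 representation, so it falls back to float64, and a boolean column with a missing value falls back to object. The integer case in isolation:

import pandas as pd

s = pd.Series([1, 2, None], dtype="Int64")  # nullable integer
print(s.astype("float64"))                  # 1.0, 2.0, NaN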
Example #26
    def fit(self, X, y=None):
        X = infer_feature_types(X)
        if not is_all_numeric(X):
            raise ValueError("PCA input must be all numeric")
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        self._component_obj.fit(X)
        return self
Example #27
    def transform(self, X, y=None):
        self._provenance = {col: [f"{col}_doubled"] for col in X.columns}
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        new_X = X.assign(**{f"{col}_doubled": 2 * X.loc[:, col] for col in X.columns})
        if self.drop_old_columns:
            new_X = new_X.drop(columns=X.columns)
        return _convert_to_woodwork_structure(new_X)
Example #28
    def transform(self, X, y=None):
        """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

        # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
        if (X.dtypes == bool).all():
            return infer_feature_types(X)

        X_null_dropped = X.copy()
        X_null_dropped.drop(self._all_null_cols,
                            axis=1,
                            errors='ignore',
                            inplace=True)
        X_t = self._component_obj.transform(X)
        if X_null_dropped.empty:
            X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns)
            return infer_feature_types(X_t)

        X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns)
        X_t.index = X_null_dropped.index
        return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
Example #29
def test_more_top_n_unique_values_large():
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
        "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
        "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1]
    })

    random_seed = 2

    encoder = OneHotEncoder(top_n=3, random_seed=random_seed)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need to make the conversion here too
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
    col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
    col_1_counts = col_1_counts.sort_values(["col_1"],
                                            ascending=False,
                                            kind='mergesort')
    col_1_samples = col_1_counts.head(
        encoder.parameters['top_n']).index.tolist()
    expected_col_names = set([
        "col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c",
        "col_4"
    ])
    for val in col_1_samples:
        expected_col_names.add("col_1_" + val)

    col_names = set(X_t.columns)
    assert (col_names == expected_col_names)
Example #30
    def fit(self, X, y=None):
        X_encoded = self._encode_categories(X, fit=True)
        if y is not None:
            y = infer_feature_types(y)
            y = _convert_woodwork_types_wrapper(y.to_series())
        self._component_obj.fit(X_encoded, y)
        return self