Example #1
    def _fit_transform_features_helper(self, needs_fitting, X, y=None):
        """Helper function that transforms the input data based on the component graph components.

        Arguments:
            needs_fitting (boolean): Determines if components should be fit.
            X (ww.DataTable, pd.DataFrame): Data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series): The target training data of length [n_samples]. Defaults to None.

        Returns:
            ww.DataTable: Transformed values.
        """
        if len(self.compute_order) <= 1:
            return infer_feature_types(X)
        component_outputs = self._compute_features(self.compute_order[:-1],
                                                   X,
                                                   y=y,
                                                   fit=needs_fitting)
        final_component_inputs = []
        for parent in self.get_parents(self.compute_order[-1]):
            parent_output = component_outputs.get(
                parent, component_outputs.get(f'{parent}.x'))
            if isinstance(parent_output, ww.DataColumn):
                parent_output = parent_output.to_series()
                parent_output = pd.DataFrame(parent_output, columns=[parent])
                parent_output = infer_feature_types(parent_output)
            final_component_inputs.append(parent_output)
        concatted = pd.concat([
            component_input.to_dataframe()
            for component_input in final_component_inputs
        ], axis=1)
        if needs_fitting:
            self.input_feature_names.update(
                {self.compute_order[-1]: list(concatted.columns)})
        return infer_feature_types(concatted)
Example #2
    def validate(self, X, y):
        """Check if the target or any of the features have no variance (1 unique value).

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input features.
            y (ww.DataColumn, pd.Series, np.ndarray): The target data.

        Returns:
            dict: dict of warnings/errors corresponding to features or target with no variance.
        """
        results = {
            "warnings": [],
            "errors": [],
            "actions": []
        }

        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        unique_counts = X.nunique(dropna=self._dropnan).to_dict()
        any_nulls = (X.isnull().any()).to_dict()
        for name in unique_counts:
            message = self._check_for_errors(name, unique_counts[name], any_nulls[name])
            if not message:
                continue
            DataCheck._add_message(message, results)
        y_name = getattr(y, "name")
        if not y_name:
            y_name = "Y"
        target_message = self._check_for_errors(y_name, y.nunique(dropna=self._dropnan), y.isnull().any())
        if target_message:
            DataCheck._add_message(target_message, results)
        return results
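A minimal usage sketch for this check, assuming the `validate` above belongs to evalml's NoVarianceDataCheck (the import path and data below are illustrative assumptions, not taken from the example itself):

import pandas as pd
from evalml.data_checks import NoVarianceDataCheck  # assumed import path

X = pd.DataFrame({"no_variance": [1, 1, 1], "ok": [1, 2, 3]})
y = pd.Series([0, 1, 0])
results = NoVarianceDataCheck().validate(X, y)
print(results["errors"])  # expect a message flagging the "no_variance" column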
Example #3
def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwargs):
    """Load features and target from file.

    Arguments:
        path (str): Path to file or an http/ftp/s3 URL
        index (str): Column for index
        target (str): Column for target
        n_rows (int): Number of rows to return
        drop (list): List of columns to drop
        verbose (bool): If True, prints information about features and target

    Returns:
        ww.DataTable, ww.DataColumn: Feature matrix and target
    """

    feature_matrix = pd.read_csv(path, index_col=index, nrows=n_rows, **kwargs)

    targets = [target] + (drop or [])
    y = feature_matrix[target]
    X = feature_matrix.drop(columns=targets)

    if verbose:
        # number of features
        print(number_of_features(X.dtypes), end='\n\n')

        # number of total training examples
        info = 'Number of training examples: {}'
        print(info.format(len(X)), end='\n')

        # target distribution
        print(target_distribution(y))

    X = infer_feature_types(X)
    y = infer_feature_types(y)
    return X, y
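A hedged usage sketch of the function above; the file name and column names are hypothetical:

# "data.csv" with an "id" index column and a "label" target column is hypothetical
X, y = load_data("data.csv", index="id", target="label", n_rows=1000, drop=["notes"])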
Example #4
    def transform(self, X, y):
        """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Features. Ignored.
            y (ww.DataColumn, pd.Series): Target data to impute.

        Returns:
            (ww.DataTable, ww.DataColumn): The original X and the transformed y
        """

        if X is not None:
            X = infer_feature_types(X)
        if y is None:
            return X, None
        y_ww = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y_ww.to_series())
        y_df = y.to_frame()

        # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
        if (y_df.dtypes == bool).all():
            return X, _retain_custom_types_and_initalize_woodwork(y_ww, y)

        transformed = self._component_obj.transform(y_df)
        if transformed.shape[1] == 0:
            raise RuntimeError("Transformed data is empty")
        y_t = pd.Series(transformed[:, 0], index=y.index)
        return X, _retain_custom_types_and_initalize_woodwork(y_ww, y_t)
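A minimal sketch, assuming the transform above belongs to evalml's TargetImputer (the import path and strategy value are assumptions):

import numpy as np
import pandas as pd
from evalml.pipelines.components import TargetImputer  # assumed import path

y = pd.Series([1.0, np.nan, 3.0])
imputer = TargetImputer(impute_strategy="mean")
imputer.fit(None, y)                 # X appears unused by this component (assumption)
_, y_t = imputer.transform(None, y)  # NaN imputed with the mean, 2.0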
Example #5
    def transform(self, X, y=None):
        """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

        # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
        if (X.dtypes == bool).all():
            return infer_feature_types(X)

        X_null_dropped = X.copy()
        X_null_dropped.drop(self._all_null_cols,
                            axis=1,
                            errors='ignore',
                            inplace=True)
        X_t = self._component_obj.transform(X)
        if X_null_dropped.empty:
            X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns)
            return infer_feature_types(X_t)

        X_t = pd.DataFrame(X_t, columns=X_null_dropped.columns)
        X_t.index = X_null_dropped.index
        return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
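A minimal sketch, assuming this is evalml's SimpleImputer component (import path assumed):

import numpy as np
import pandas as pd
from evalml.pipelines.components import SimpleImputer  # assumed import path

X = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
X_t = SimpleImputer(impute_strategy="mean").fit_transform(X)
print(X_t.to_dataframe())  # NaNs replaced column-wise by each column's mean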
Example #6
def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Splits data into train and test sets.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data each split into train and test sets
    """

    X = infer_feature_types(X)
    y = infer_feature_types(y)

    data_splitter = None
    if is_time_series(problem_type):
        data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed)
    elif is_regression(problem_type):
        data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)

    train, test = next(data_splitter.split(X.to_dataframe(), y.to_series()))

    X_train = X.iloc[train]
    X_test = X.iloc[test]
    y_train = y.iloc[train]
    y_test = y.iloc[test]

    return X_train, X_test, y_train, y_test
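A minimal usage sketch of the function above, with illustrative data:

import pandas as pd

X = pd.DataFrame({"a": range(10), "b": range(10)})
y = pd.Series([0, 1] * 5)
X_train, X_test, y_train, y_test = split_data(X, y, problem_type="binary", test_size=0.2)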
Example #7
    def predict(self, X, y=None, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, np.ndarray, None): The target training data of length [n_samples]
            objective (Object or string): The objective to use to make predictions

        Returns:
            ww.DataColumn: Predicted values.
        """
        if X is None:
            X = pd.DataFrame()
        X = infer_feature_types(X)
        y = infer_feature_types(y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        features = self.compute_estimator_features(X, y)
        features = _convert_woodwork_types_wrapper(features.to_dataframe())
        features_no_nan, y = drop_rows_with_nans(features, y)
        y_arg = None
        if self.estimator.predict_uses_y:
            y_arg = y
        predictions = self.estimator.predict(features_no_nan,
                                             y_arg).to_series()
        predictions = predictions.rename(self.input_target_name)
        padded = pad_with_nans(
            predictions, max(0, features.shape[0] - predictions.shape[0]))
        return infer_feature_types(padded)
Example #8
    def transform(self, X, y=None):
        """Transforms data X by creating new features using existing DateTime columns, and then dropping those DateTime columns.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        X_t = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        features_to_extract = self.parameters["features_to_extract"]
        if len(features_to_extract) == 0:
            return infer_feature_types(X_t)
        for col_name in self._date_time_col_names:
            for feature in features_to_extract:
                name = f"{col_name}_{feature}"
                features, categories = self._function_mappings[feature](
                    X_t[col_name], self.encode_as_categories)
                X_t[name] = features
                if categories:
                    self._categories[name] = categories
        X_t = X_t.drop(self._date_time_col_names, axis=1)
        return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
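A minimal sketch, assuming the transform above belongs to evalml's DateTimeFeaturizer (import path and feature names assumed):

import pandas as pd
from evalml.pipelines.components import DateTimeFeaturizer  # assumed import path

X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=5)})
dt = DateTimeFeaturizer(features_to_extract=["year", "month"])
X_t = dt.fit_transform(X)  # adds dates_year / dates_month, drops "dates"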
Example #9
    def score(self, X, y, objectives):
        """Evaluate model performance on current and additional objectives.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (pd.Series, ww.DataColumn): True labels of length [n_samples]
            objectives (list): Non-empty list of objectives to score on

        Returns:
            dict: Ordered dictionary of objective scores
        """
        # Only converting X for the call to _score_all_objectives
        if X is None:
            X = pd.DataFrame()
        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        y_predicted = self.predict(X, y)
        y_predicted = _convert_woodwork_types_wrapper(y_predicted.to_series())

        y_shifted = y.shift(-self.gap)
        objectives = self.create_objectives(objectives)
        y_shifted, y_predicted = drop_rows_with_nans(y_shifted, y_predicted)
        return self._score_all_objectives(X,
                                          y_shifted,
                                          y_predicted,
                                          y_pred_proba=None,
                                          objectives=objectives)
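The gap handling can be seen in isolation; a pure-pandas sketch of the shift-and-drop alignment used above (values illustrative):

import pandas as pd

y = pd.Series([1, 2, 3, 4, 5])
gap = 2
y_shifted = y.shift(-gap)       # row i now holds the target observed gap steps later
y_aligned = y_shifted.dropna()  # trailing rows without a future target are dropped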
Example #10
    def fit(self, X, y):
        """Fit a time series regression pipeline.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, np.ndarray): The target training data of length [n_samples]

        Returns:
            self
        """
        if X is None:
            X = pd.DataFrame()

        X = infer_feature_types(X)
        y = infer_feature_types(y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        X_t = self._compute_features_during_fit(X, y)
        X_t = X_t.to_dataframe()

        y_shifted = y.shift(-self.gap)
        X_t, y_shifted = drop_rows_with_nans(X_t, y_shifted)
        self.estimator.fit(X_t, y_shifted)
        self.input_feature_names = self._component_graph.input_feature_names

        return self
Example #11
def test_infer_feature_types_dataframe():
    X_pd = pd.DataFrame({0: pd.Series([1, 2]), 1: pd.Series([3, 4])})
    pd.testing.assert_frame_equal(X_pd,
                                  infer_feature_types(X_pd).to_dataframe(),
                                  check_dtype=False)

    X_pd = pd.DataFrame({
        0: pd.Series([1, 2], dtype="Int64"),
        1: pd.Series([3, 4], dtype="Int64")
    })
    pd.testing.assert_frame_equal(X_pd,
                                  infer_feature_types(X_pd).to_dataframe())

    X_expected = X_pd.copy()
    X_expected[0] = X_expected[0].astype("category")
    pd.testing.assert_frame_equal(
        X_expected,
        infer_feature_types(X_pd, {
            0: "categorical"
        }).to_dataframe())
    pd.testing.assert_frame_equal(
        X_expected,
        infer_feature_types(X_pd, {
            0: ww.logical_types.Categorical
        }).to_dataframe())
Example #12
    def validate(self, X, y=None):
        """
        Inspects and validates the input data against data checks and returns a list of warnings and errors if applicable.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, np.ndarray): The target data of length [n_samples]

        Returns:
            dict: Dictionary containing DataCheckMessage objects

        """
        messages = {"warnings": [], "errors": [], "actions": []}
        X = infer_feature_types(X)
        X = X.drop(list(X.select('index').columns))
        if y is not None:
            y = infer_feature_types(y)

        for data_check in self.data_checks:
            messages_new = data_check.validate(X, y)
            messages["warnings"].extend(messages_new["warnings"])
            messages["errors"].extend(messages_new["errors"])

            new_actions = messages_new["actions"]
            for new_action in new_actions:
                if new_action not in messages["actions"]:
                    messages["actions"].append(new_action)
        return messages
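A minimal sketch of running a DataChecks collection like the one above (import paths and the chosen check are assumptions):

import pandas as pd
from evalml.data_checks import DataChecks, NoVarianceDataCheck  # assumed import paths

X = pd.DataFrame({"a": [1, 1, 1], "b": [1, 2, 3]})
y = pd.Series([0, 1, 0])
checks = DataChecks(data_checks=[NoVarianceDataCheck])
messages = checks.validate(X, y)
print(messages["warnings"], messages["errors"], messages["actions"])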
Example #13
    def _manage_woodwork(self, X, y=None):
        """Function to convert the input and target data to pandas data structures."""
        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        if y is not None:
            y = infer_feature_types(y)
            y = _convert_woodwork_types_wrapper(y.to_series())
        return X, y
Example #14
    def predict_proba(self, X, y=None):
        if y is None:
            raise ValueError("Cannot predict Time Series Baseline Estimator if y is None")
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())
        preds = self.predict(X, y).to_series().dropna(axis=0, how='any').astype('int')
        # Build a one-hot probability matrix: row i is 1 at column preds[i]
        proba_arr = np.zeros((len(preds), y.max() + 1))
        proba_arr[np.arange(len(preds)), preds] = 1
        padded = pad_with_nans(pd.DataFrame(proba_arr), len(y) - len(preds))
        return infer_feature_types(padded)
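The probability matrix above is built with a one-hot indexing trick; here it is in isolation with plain numpy:

import numpy as np

preds = np.array([0, 2, 1])
proba = np.zeros((len(preds), preds.max() + 1))
proba[np.arange(len(preds)), preds] = 1  # row i is one-hot at class preds[i]
print(proba)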
Example #15
    def predict(self, X, y=None):
        if y is None:
            raise ValueError("Cannot predict Time Series Baseline Estimator if y is None")
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        if self.gap == 0:
            y = y.shift(periods=1)

        return infer_feature_types(y)
Example #16
    def predict(self, X):
        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        predictions = self._component_obj.predict(X)
        if predictions.ndim == 2 and predictions.shape[1] == 1:
            predictions = predictions.flatten()
        if self._label_encoder:
            # Map encoded integer labels back to the original classes
            predictions = self._label_encoder.inverse_transform(
                predictions.astype(np.int64))
        return infer_feature_types(predictions)
Example #17
    def transform(self, X, y=None):
        X_ww = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        if y is not None:
            y = infer_feature_types(y)
            y = _convert_woodwork_types_wrapper(y.to_series())
        X_t = self._component_obj.transform(X, y)
        X_t_df = pd.DataFrame(X_t, columns=X.columns, index=X.index)
        return _retain_custom_types_and_initalize_woodwork(
            X_ww, X_t_df, ltypes_to_ignore=[Categorical])
Example #18
    def fit_transform(self, X, y=None):
        X_ww = infer_feature_types(X)
        if not is_all_numeric(X_ww):
            raise ValueError("LDA input must be all numeric")
        y = infer_feature_types(y)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())

        X_t = self._component_obj.fit_transform(X, y)
        X_t = pd.DataFrame(X_t, index=X.index, columns=[f"component_{i}" for i in range(X_t.shape[1])])
        return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
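A minimal sketch, assuming this is evalml's LinearDiscriminantAnalysis component (import path and data assumed):

import pandas as pd
from evalml.pipelines.components import LinearDiscriminantAnalysis  # assumed import path

X = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0], "f2": [4.0, 3.0, 3.0, 1.0]})
y = pd.Series([0, 1, 0, 1])
X_t = LinearDiscriminantAnalysis(n_components=1).fit_transform(X, y)
print(X_t.to_dataframe())  # a single "component_0" column, per the naming above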
Example #19
    def predict(self, X):
        X = infer_feature_types(X)
        strategy = self.parameters["strategy"]
        if strategy == "mode":
            predictions = pd.Series([self._mode] * len(X))
        elif strategy == "random":
            predictions = get_random_state(self.random_seed).choice(
                self._classes, len(X))
        else:
            predictions = get_random_state(self.random_seed).choice(
                self._classes, len(X), p=self._percentage_freq)
        return infer_feature_types(predictions)
Example #20
    def transform(self, X, y=None):
        """Computes the delayed features for all features in X and y.

        For each feature in X, it will add a column to the output dataframe for each
        delay in the (inclusive) range [1, max_delay]. The values of each delayed feature are simply the original
        feature shifted forward in time by the delay amount. For example, a delay of 3 units means that the feature
        value at row n will be taken from row n-3 of that feature.

        If y is not None, it will also compute the delayed values for the target variable.

        Arguments:
            X (ww.DataTable, pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
            y (ww.DataColumn, pd.Series, or None): Target.

        Returns:
            ww.DataTable: Transformed X.
        """
        if X is None:
            X = pd.DataFrame()
        # Normalize the data into pandas objects
        X_ww = infer_feature_types(X)
        categorical_columns = self._get_categorical_columns(X_ww)
        X = _convert_woodwork_types_wrapper(X_ww.to_dataframe())

        if self.delay_features and len(X) > 0:
            X_categorical = self._encode_X_while_preserving_index(
                X[categorical_columns])
            for col_name in X:
                col = X[col_name]
                if col_name in categorical_columns:
                    col = X_categorical[col_name]
                X = X.assign(
                    **{
                        f"{col_name}_delay_{t}": col.shift(t)
                        for t in range(1, self.max_delay + 1)
                    })

        # Handle cases where the target was passed in
        if self.delay_target and y is not None:
            y = infer_feature_types(y)
            if y.logical_type == logical_types.Categorical:
                y = self._encode_y_while_preserving_index(y)
            else:
                y = _convert_woodwork_types_wrapper(y.to_series())
            X = X.assign(
                **{
                    f"target_delay_{t}": y.shift(t)
                    for t in range(self.start_delay_for_target,
                                   self.max_delay + 1)
                })

        return _retain_custom_types_and_initalize_woodwork(X_ww, X)
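A minimal sketch, assuming the transform above belongs to evalml's DelayedFeatureTransformer (import path and parameters assumed):

import pandas as pd
from evalml.pipelines.components import DelayedFeatureTransformer  # assumed import path

X = pd.DataFrame({"feature": [1.0, 2.0, 3.0, 4.0, 5.0]})
y = pd.Series([10.0, 20.0, 30.0, 40.0, 50.0])
dft = DelayedFeatureTransformer(max_delay=2, gap=1)
X_t = dft.fit_transform(X, y)  # adds feature_delay_1/2 and target_delay_* columns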
Example #21
    def fit(self, X, y=None):
        if y is None:
            raise ValueError("Cannot fit Baseline regressor if y is None")
        X = infer_feature_types(X)
        y = infer_feature_types(y)
        y = _convert_woodwork_types_wrapper(y.to_series())

        if self.parameters["strategy"] == "mean":
            self._prediction_value = y.mean()
        elif self.parameters["strategy"] == "median":
            self._prediction_value = y.median()
        self._num_features = X.shape[1]
        return self
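A minimal sketch, assuming the fit above belongs to evalml's BaselineRegressor (import path assumed):

import pandas as pd
from evalml.pipelines.components import BaselineRegressor  # assumed import path

X = pd.DataFrame({"a": [1, 2, 3]})
y = pd.Series([10.0, 20.0, 60.0])
baseline = BaselineRegressor(strategy="mean").fit(X, y)
print(baseline.predict(X).to_series())  # constant prediction of 30.0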
Example #22
    def predict(self, X):
        """Make predictions using selected features.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data of shape [n_samples, n_features]

        Returns:
            ww.DataColumn: Predicted values.
        """
        if len(self.compute_order) == 0:
            return infer_feature_types(X)
        final_component = self.compute_order[-1]
        outputs = self._compute_features(self.compute_order, X)
        return infer_feature_types(outputs.get(final_component, outputs.get(f'{final_component}.x')))
Example #23
def test_search(mock_automl_search, mock_data_checks_validate, X_y_binary):
    X, y = X_y_binary
    # this doesn't exactly match the data check results schema but it's enough to exercise the data check flow in search()
    data_check_results_expected = {'warnings': ['Warning 1', 'Warning 2']}
    mock_data_checks_validate.return_value = data_check_results_expected
    automl, data_check_results = search(X_train=X,
                                        y_train=y,
                                        problem_type='binary')
    assert isinstance(automl, AutoMLSearch)
    assert data_check_results is data_check_results_expected
    mock_data_checks_validate.assert_called_once()
    mock_data_checks_validate.assert_called_with(infer_feature_types(X),
                                                 y=infer_feature_types(y))
    mock_automl_search.assert_called_once()
Example #24
    def fit(self, X, y):
        X = infer_feature_types(X)
        if not is_all_numeric(X):
            raise ValueError("LDA input must be all numeric")
        y = infer_feature_types(y)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        n_features = X.shape[1]
        n_classes = y.nunique()
        n_components = self.parameters['n_components']
        if n_components is not None and n_components > min(n_classes, n_features):
            raise ValueError(f"n_components value {n_components} is too large")

        self._component_obj.fit(X, y)
        return self
Example #25
    def predict(self, X, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            objective (Object or string): The objective to use to make predictions

        Returns:
            ww.DataColumn: Predicted values.
        """
        X = infer_feature_types(X)
        predictions = self._component_graph.predict(X)
        predictions_series = predictions.to_series()
        predictions_series.name = self.input_target_name
        return infer_feature_types(predictions_series)
Example #26
def test_infer_feature_types_series():
    X_pd = pd.Series([1, 2, 3, 4])
    X_expected = X_pd.astype("Int64")
    pd.testing.assert_series_equal(X_expected, infer_feature_types(X_pd).to_series())

    X_pd = pd.Series([1, 2, 3, 4], dtype="Int64")
    pd.testing.assert_series_equal(X_pd, infer_feature_types(X_pd).to_series())

    X_pd = pd.Series([1, 2, 3, 4], dtype="Int64")
    X_expected = X_pd.astype("category")
    pd.testing.assert_series_equal(X_expected, infer_feature_types(X_pd, "categorical").to_series())

    X_pd = pd.Series([1, 2, 3, 4], dtype="Int64")
    X_expected = X_pd.astype("category")
    pd.testing.assert_series_equal(X_expected, infer_feature_types(X_pd, ww.logical_types.Categorical).to_series())
Example #27
def test_search_data_check_error(mock_automl_search, mock_data_checks_validate,
                                 X_y_binary):
    X, y = X_y_binary
    # this doesn't exactly match the data check results schema but it's enough to trigger the error in search()
    data_check_results_expected = {'errors': ['Error 1', 'Error 2']}
    mock_data_checks_validate.return_value = data_check_results_expected
    automl, data_check_results = search(X_train=X,
                                        y_train=y,
                                        problem_type='binary')
    assert automl is None
    assert data_check_results == data_check_results_expected
    mock_data_checks_validate.assert_called_once()
    mock_data_checks_validate.assert_called_with(infer_feature_types(X),
                                                 y=infer_feature_types(y))
    mock_automl_search.assert_not_called()
Example #28
    def transform(self, X, y=None):
        """Transforms data X by imputing missing values. 'None' values are converted to np.nan before imputation and are
            treated as the same.

        Arguments:
            X (ww.DataTable, pd.DataFrame): Data to transform
            y (ww.DataColumn, pd.Series, optional): Ignored.

        Returns:
            ww.DataTable: Transformed X
        """
        X_ww = infer_feature_types(X)
        X_null_dropped = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
        X_null_dropped.drop(self._all_null_cols,
                            inplace=True,
                            axis=1,
                            errors='ignore')
        if X_null_dropped.empty:
            return _retain_custom_types_and_initalize_woodwork(
                X_ww, X_null_dropped)

        if self._numeric_cols is not None and len(self._numeric_cols) > 0:
            X_numeric = X_null_dropped[self._numeric_cols]
            imputed = self._numeric_imputer.transform(X_numeric).to_dataframe()
            X_null_dropped[X_numeric.columns] = imputed

        if self._categorical_cols is not None and len(
                self._categorical_cols) > 0:
            X_categorical = X_null_dropped[self._categorical_cols]
            imputed = self._categorical_imputer.transform(
                X_categorical).to_dataframe()
            X_null_dropped[X_categorical.columns] = imputed
        X_null_dropped = _retain_custom_types_and_initalize_woodwork(
            X_ww, X_null_dropped)
        return X_null_dropped
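A minimal sketch, assuming this is evalml's Imputer component with separate numeric and categorical strategies (import path assumed):

import numpy as np
import pandas as pd
from evalml.pipelines.components import Imputer  # assumed import path

X = pd.DataFrame({
    "num": [1.0, np.nan, 3.0],
    "cat": pd.Series(["a", "b", None], dtype="category"),
})
imputer = Imputer(numeric_impute_strategy="mean",
                  categorical_impute_strategy="most_frequent")
X_t = imputer.fit_transform(X)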
Example #29
def test_more_top_n_unique_values_large():
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
        "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
        "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1]
    })

    random_seed = 2

    encoder = OneHotEncoder(top_n=3, random_seed=random_seed)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need to make the conversion here too
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    col_1_counts = X["col_1"].value_counts(dropna=False).to_frame()
    col_1_counts = col_1_counts.sample(frac=1, random_state=random_seed)
    col_1_counts = col_1_counts.sort_values(["col_1"],
                                            ascending=False,
                                            kind='mergesort')
    col_1_samples = col_1_counts.head(
        encoder.parameters['top_n']).index.tolist()
    expected_col_names = set([
        "col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c",
        "col_4"
    ])
    for val in col_1_samples:
        expected_col_names.add("col_1_" + val)

    col_names = set(X_t.columns)
    assert col_names == expected_col_names
Example #30
    def _predict(self, X, y, objective=None, pad=False):
        features = self.compute_estimator_features(X, y)
        features = _convert_woodwork_types_wrapper(features.to_dataframe())
        features_no_nan, y_no_nan = drop_rows_with_nans(features, y)

        if objective is not None:
            objective = get_objective(objective, return_instance=True)
            if not objective.is_defined_for_problem_type(self.problem_type):
                raise ValueError(
                    f"Objective {objective.name} is not defined for time series binary classification."
                )

        if self.threshold is None:
            predictions = self._estimator_predict(features_no_nan,
                                                  y_no_nan).to_series()
        else:
            proba = self._estimator_predict_proba(features_no_nan,
                                                  y_no_nan).to_dataframe()
            proba = proba.iloc[:, 1]
            if objective is None:
                predictions = proba > self.threshold
            else:
                predictions = objective.decision_function(
                    proba, threshold=self.threshold, X=features_no_nan)
        if pad:
            predictions = pad_with_nans(
                predictions, max(0, features.shape[0] - predictions.shape[0]))
        return infer_feature_types(predictions)
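The threshold branch above reduces to a simple comparison; a pure-pandas sketch with illustrative values:

import pandas as pd

proba = pd.Series([0.20, 0.70, 0.55])  # positive-class probabilities
threshold = 0.6
predictions = proba > threshold        # [False, True, False]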