Example #1
def test_object_dtype_isnan(dtype, val):
    X = np.array([[val, np.nan], [np.nan, val]], dtype=dtype)

    expected_mask = np.array([[False, True], [True, False]])

    mask = _object_dtype_isnan(X)

    assert_array_equal(mask, expected_mask)
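
For context: scikit-learn's _object_dtype_isnan relies on NaN being the only value that is not equal to itself, so an elementwise self-comparison yields the NaN mask even for object arrays. A minimal sketch of that trick:

import numpy as np

def object_dtype_isnan(X):
    # NaN is the only value for which x != x holds, so comparing the
    # array to itself marks exactly the NaN entries, object dtype included.
    return X != X

X = np.array([['a', np.nan], [np.nan, 'b']], dtype=object)
print(object_dtype_isnan(X))  # [[False  True]
                              #  [ True False]]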
Example #2
def _daal_assert_all_finite(X, allow_nan=False, msg_dtype=None):
    """Like assert_all_finite, but only for ndarray."""
    # validation is also imported in extmath
    from sklearn.utils.extmath import _safe_accumulator_op

    if _get_config()['assume_finite']:
        return

    is_df = is_DataFrame(X)
    num_of_types = get_number_of_types(X)

    # if X is a heterogeneous pandas.DataFrame then
    # convert it to a list of contiguous arrays
    if is_df and num_of_types > 1:
        lst = []
        for idx in X:
            arr = X[idx].to_numpy()
            lst.append(arr if arr.flags['C_CONTIGUOUS'] else np.ascontiguousarray(arr))
    else:
        X = np.asanyarray(X)
        is_df = False

    dt = np.dtype(get_dtype(X))
    is_float = dt.kind in 'fc'

    msg_err = "Input contains {} or a value too large for {!r}."
    type_err = 'infinity' if allow_nan else 'NaN, infinity'
    err = msg_err.format(type_err, msg_dtype if msg_dtype is not None else dt)

    if (X.ndim in [1, 2] and not np.any(np.equal(X.shape, 0))
            and dt in [np.float32, np.float64]):
        if X.ndim == 1:
            X = X.reshape((-1, 1))

        x_for_daal = lst if is_df and num_of_types > 1 else X

        if dt == np.float64:
            if not d4p.daal_assert_all_finite(x_for_daal, allow_nan, 0):
                raise ValueError(err)
        elif dt == np.float32:
            if not d4p.daal_assert_all_finite(x_for_daal, allow_nan, 1):
                raise ValueError(err)
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in sum method. The sum is also calculated
    # safely to reduce dtype induced overflows.
    elif is_float and (np.isfinite(_safe_accumulator_op(np.sum, X))):
        pass
    elif is_float:
        if (allow_nan and np.isinf(X).any() or
                not allow_nan and not np.isfinite(X).all()):
            raise ValueError(err)
    # for object dtype data, we only check for NaNs (GH-13254)
    elif dt == np.dtype('object') and not allow_nan:
        if _object_dtype_isnan(X).any():
            raise ValueError("Input contains NaN")
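
The O(n)/O(1) comment above is worth unpacking: a plain np.sum over float32 data can overflow to inf even when every element is finite, which would make the fast path flag perfectly finite input. That is the false positive _safe_accumulator_op guards against by accumulating in a wider dtype. A small demonstration, with values chosen only to force the overflow:

import numpy as np

x = np.full(10, np.finfo(np.float32).max, dtype=np.float32)
print(np.isfinite(x).all())                      # True: every element is finite
print(np.isfinite(np.sum(x)))                    # False: float32 sum overflows to inf
print(np.isfinite(np.sum(x, dtype=np.float64)))  # True: safe accumulation in float64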
Example #3
def test_object_dtype_isnan(dtype, val):
    X = np.array([[val, np.nan],
                  [np.nan, val]], dtype=dtype)

    expected_mask = np.array([[False, True],
                              [True, False]])

    mask = _object_dtype_isnan(X)

    assert_array_equal(mask, expected_mask)
Example #4
def _get_mask(X, value_to_mask):
    """Compute the boolean mask X == value_to_mask."""
    # BUG: doesn't work properly
    npdtype = typemap(X.dtype())
    if is_scalar_nan(value_to_mask):
        if npdtype.kind == "f":
            return af.isnan(X)
        elif npdtype.kind in ("i", "u"):
            # can't have NaNs in an integer array
            return af.constant(0, X.shape[0], X.shape[1], dtype=af.Dtype.b8)
        else:
            # np.isnan does not work on object dtypes
            return _object_dtype_isnan(X)  # TODO: fix for arrayfire arrays
    else:
        return X == value_to_mask
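
The arrayfire version above is marked as buggy; for reference, a NumPy-only sketch of the same masking logic (an illustration, not the arrayfire fix):

import numpy as np

def get_mask_np(X, value_to_mask):
    """Compute the boolean mask X == value_to_mask for a NumPy array."""
    if isinstance(value_to_mask, float) and np.isnan(value_to_mask):
        if X.dtype.kind == 'f':
            return np.isnan(X)
        elif X.dtype.kind in ('i', 'u'):
            # integer arrays cannot hold NaN, so nothing is masked
            return np.zeros(X.shape, dtype=bool)
        else:
            # np.isnan does not work on object dtype; use the X != X trick
            return X != X
    return X == value_to_mask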
Example #5
    def _handle_missing(self, X):
        """
        Imputes missing values with `` or raises an error
        Note: modifies the array in-place.
        """
        if self.handle_missing not in ['error', 'zero_impute']:
            raise ValueError("handle_missing should be either 'error' or "
                             f"'zero_impute', got {self.handle_missing!r}")

        missing_mask = _object_dtype_isnan(X)

        if missing_mask.any():
            if self.handle_missing == 'error':
                raise ValueError('Input data contains missing values.')
            elif self.handle_missing == 'zero_impute':
                X[missing_mask] = ''

        return X
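
The two modes are easiest to see on a small object array; a standalone sketch of what 'zero_impute' does, outside the class:

import numpy as np

X = np.array([['a', np.nan], ['b', 'c']], dtype=object)
missing_mask = X != X            # object-dtype NaN mask, as in the method

# handle_missing='error'       -> ValueError('Input data contains missing values.')
# handle_missing='zero_impute' -> impute in place with the empty string:
X[missing_mask] = ''
print(X)                         # [['a' ''] ['b' 'c']]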
Example #6
    def _check_X(self, X):
        """
        Perform custom check_array:
        - convert list of strings to object dtype
        - check for missing values for object dtype data (check_array does
          not do that)

        """
        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            X = check_array(X, dtype=object)
        else:
            X = X_temp

        if X.dtype == np.dtype('object'):
            if not _get_config()['assume_finite']:
                if _object_dtype_isnan(X).any():
                    raise ValueError("Input contains NaN")

        return X
Example #7
def _daal_assert_all_finite(X, allow_nan=False, msg_dtype=None):
    """Like assert_all_finite, but only for ndarray."""
    # validation is also imported in extmath
    from sklearn.utils.extmath import _safe_accumulator_op

    if _get_config()['assume_finite']:
        return
    X = np.asanyarray(X)

    dt = X.dtype
    is_float = dt.kind in 'fc'

    msg_err = "Input contains {} or a value too large for {!r}."
    type_err = 'infinity' if allow_nan else 'NaN, infinity'
    err = msg_err.format(type_err,
                         msg_dtype if msg_dtype is not None else X.dtype)

    if (X.ndim in [1, 2] and not np.any(np.equal(X.shape, 0))
            and dt in [np.float32, np.float64]):
        if X.ndim == 1:
            X = X.reshape((-1, 1))
        if dt == np.float64:
            if not d4p.daal_assert_all_finite(X, allow_nan, 0):
                raise ValueError(err)
        elif dt == np.float32:
            if not d4p.daal_assert_all_finite(X, allow_nan, 1):
                raise ValueError(err)
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in sum method. The sum is also calculated
    # safely to reduce dtype induced overflows.
    elif is_float and (np.isfinite(_safe_accumulator_op(np.sum, X))):
        pass
    elif is_float:
        if (allow_nan and np.isinf(X).any()
                or not allow_nan and not np.isfinite(X).all()):
            raise ValueError(err)
    # for object dtype data, we only check for NaNs (GH-13254)
    elif X.dtype == np.dtype('object') and not allow_nan:
        if _object_dtype_isnan(X).any():
            raise ValueError("Input contains NaN")
Example #8
    def transform(self, X, fast=True):
        """
        Transform X using specified encoding scheme.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        fast : bool, default=True
            If True, use the precomputed vocabulary count matrices
            (``_ngram_similarity_fast``) when similarity='ngram'.

        Returns
        -------
        X_new : 2-d array, shape [n_samples, n_features_new]
            Transformed input.
        """

        if hasattr(X, 'iloc') and X.isna().values.any():
            if self.handle_missing == 'error':
                msg = ("Found missing values in input data; set "
                       "handle_missing='' to encode with missing values")
                raise ValueError(msg)
            if self.handle_missing != 'error':
                X = X.fillna(self.handle_missing)
        elif not hasattr(X, 'dtype') and isinstance(X, list):
            X = np.asarray(X, dtype=object)

        if hasattr(X, 'dtype') and X.dtype.kind == 'O':
            mask = _object_dtype_isnan(X)
            if mask.any():
                if self.handle_missing == 'error':
                    msg = ("Found missing values in input data; set "
                           "handle_missing='' to encode with missing values")
                    raise ValueError(msg)
                X[mask] = self.handle_missing

        if LooseVersion(sklearn.__version__) > LooseVersion('0.21'):
            Xlist, n_samples, n_features = self._check_X(X)
        else:
            X = self._check_X(X)
            Xlist = X.T
            n_samples, n_features = X.shape

        for i in range(n_features):
            Xi = Xlist[i]
            valid_mask = np.in1d(Xi, self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(Xi[~valid_mask])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)

        if self.similarity in ('levenshtein-ratio', 'jaro', 'jaro-winkler'):
            out = []
            vect = _VECTORIZED_EDIT_DISTANCES[self.similarity]
            for j, cats in enumerate(self.categories_):
                unqX = np.unique(Xlist[j])
                encoder_dict = {x: vect(x, cats.reshape(1, -1)) for x in unqX}
                encoder = [encoder_dict[x] for x in Xlist[j]]
                encoder = np.vstack(encoder)
                out.append(encoder)
            return np.hstack(out)

        elif self.similarity == 'ngram':
            min_n, max_n = self.ngram_range

            total_length = sum(len(x) for x in self.categories_)
            out = np.empty((len(X), total_length), dtype=self.dtype)
            last = 0
            for j, cats in enumerate(self.categories_):
                if fast:
                    encoded_Xj = self._ngram_similarity_fast(Xlist[j], j)
                else:
                    encoded_Xj = ngram_similarity(Xlist[j],
                                                  cats,
                                                  ngram_range=(min_n, max_n),
                                                  hashing_dim=self.hashing_dim,
                                                  dtype=np.float32)

                out[:, last:last + len(cats)] = encoded_Xj
                last += len(cats)
            return out
        else:
            raise ValueError("Unknown similarity: '%s'" % self.similarity)
Example #9
    def fit(self, X, y=None):
        """
        Fit the SimilarityEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        """

        if self.handle_missing not in ['error', '']:
            template = ("handle_missing should be either 'error' or "
                        "'', got %s")
            raise ValueError(template % self.handle_missing)
        if hasattr(X, 'iloc') and X.isna().values.any():
            if self.handle_missing == 'error':
                msg = ("Found missing values in input data; set "
                       "handle_missing='' to encode with missing values")
                raise ValueError(msg)
            if self.handle_missing != 'error':
                X = X.fillna(self.handle_missing)
        elif not hasattr(X, 'dtype') and isinstance(X, list):
            X = np.asarray(X, dtype=object)

        if hasattr(X, 'dtype') and X.dtype.kind == 'O':
            mask = _object_dtype_isnan(X)
            if mask.any():
                if self.handle_missing == 'error':
                    msg = ("Found missing values in input data; set "
                           "handle_missing='' to encode with missing values")
                    raise ValueError(msg)
                X[mask] = self.handle_missing

        if LooseVersion(sklearn.__version__) > LooseVersion('0.21'):
            Xlist, n_samples, n_features = self._check_X(X)
        else:
            X = self._check_X(X)
            Xlist = X.T
            n_samples, n_features = X.shape

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if ((self.hashing_dim is not None)
                and (not isinstance(self.hashing_dim, int))):
            raise ValueError("value '%r' was specified for hashing_dim, "
                             "which has invalid type, expected None or "
                             "int." % self.hashing_dim)

        if self.categories not in ['auto', 'most_frequent', 'k-means']:
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")

        self.categories_ = list()
        self.random_state_ = check_random_state(self.random_state)

        for i in range(n_features):
            Xi = Xlist[i]
            if self.categories == 'auto':
                self.categories_.append(np.unique(Xi))
            elif self.categories == 'most_frequent':
                self.categories_.append(self.get_most_frequent(Xi))
            elif self.categories == 'k-means':
                uniques, count = np.unique(Xi, return_counts=True)
                self.categories_.append(
                    get_kmeans_prototypes(uniques,
                                          self.n_prototypes,
                                          sample_weight=count,
                                          random_state=self.random_state_))
            else:
                if self.handle_unknown == 'error':
                    valid_mask = np.in1d(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                self.categories_.append(
                    np.array(self.categories[i], dtype=object))

        if self.similarity == 'ngram':
            self.vectorizers_ = []
            self.vocabulary_count_matrices_ = []
            self.vocabulary_ngram_counts_ = []

            for i in range(n_features):
                vectorizer = CountVectorizer(analyzer='char',
                                             ngram_range=self.ngram_range,
                                             dtype=self.dtype,
                                             strip_accents=None)

                # Store the raw categories (and not the preprocessed
                # categories) but use the preprocessed categories to compute
                # the stored count_matrices. This is done to preserve the
                # equivalence between the user input and the categories_
                # attribute of the SimilarityEncoder, while staying compliant
                # with the CountVectorizer preprocessing steps.
                preprocessed_categories = np.array(
                    [preprocess(x) for x in self.categories_[i]],
                    dtype=object)

                vocabulary_count_matrix = vectorizer.fit_transform(
                    preprocessed_categories)

                vocabulary_ngram_count = [
                    get_ngram_count(preprocess(x), self.ngram_range)
                    for x in self.categories_[i]]

                self.vectorizers_.append(vectorizer)
                self.vocabulary_count_matrices_.append(vocabulary_count_matrix)
                self.vocabulary_ngram_counts_.append(vocabulary_ngram_count)

        return self
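
Standalone, the CountVectorizer configuration used above behaves as follows: fitting it on the category strings yields the per-feature vocabulary count matrix that the fast transform path reuses (the category strings here are made up):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 4))
counts = vectorizer.fit_transform(['london', 'londres', 'paris'])
print(counts.shape)  # (3 categories, number of distinct 2- to 4-grams)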
Example #10
    def fit(self, X, y):
        """Fit the TargetEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.
        y : array
            The associated target vector.

        Returns
        -------
        self
        """
        if self.handle_missing not in ['error', '']:
            template = ("handle_missing should be either 'error' or "
                        "'', got %s")
            raise ValueError(template % self.handle_missing)
        if hasattr(X, 'iloc') and X.isna().values.any():
            if self.handle_missing == 'error':
                msg = ("Found missing values in input data; set "
                       "handle_missing='' to encode with missing values")
                raise ValueError(msg)
            if self.handle_missing != 'error':
                X = X.fillna(self.handle_missing)
        elif not hasattr(X, 'dtype') and isinstance(X, list):
            X = np.asarray(X, dtype=object)

        if hasattr(X, 'dtype') and X.dtype.kind == 'O':
            mask = _object_dtype_isnan(X)
            if mask.any():
                if self.handle_missing == 'error':
                    msg = ("Found missing values in input data; set "
                           "handle_missing='' to encode with missing values")
                    raise ValueError(msg)
                X[mask] = self.handle_missing

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.categories != 'auto':
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")

        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            X = check_array(X, dtype=object)
        else:
            X = X_temp

        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for j in range(n_features):
            le = self._label_encoders_[j]
            Xj = X[:, j]
            if self.categories == 'auto':
                le.fit(Xj)
            else:
                if self.handle_unknown == 'error':
                    valid_mask = np.in1d(Xj, self.categories[j])
                    if not np.all(valid_mask):
                        diff = np.unique(Xj[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, j))
                        raise ValueError(msg)
                le.classes_ = np.array(self.categories[j])

        self.categories_ = [le.classes_ for le in self._label_encoders_]
        self.n = len(y)
        if self.clf_type in ['binary-clf', 'regression']:
            self.Eyx_ = [{cat: np.mean(y[X[:, j] == cat])
                          for cat in self.categories_[j]}
                         for j in range(len(self.categories_))]
            self.Ey_ = np.mean(y)
            self.counter_ = {j: collections.Counter(X[:, j])
                             for j in range(n_features)}
        if self.clf_type in ['multiclass-clf']:
            self.classes_ = np.unique(y)

            self.Eyx_ = {c: [{cat: np.mean((y == c)[X[:, j] == cat])
                              for cat in self.categories_[j]}
                             for j in range(len(self.categories_))]
                         for c in self.classes_}
            self.Ey_ = {c: np.mean(y == c) for c in self.classes_}
            self.counter_ = {j: collections.Counter(X[:, j])
                             for j in range(n_features)}
        self.k = {j: len(self.counter_[j]) for j in self.counter_}
        return self
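
The statistics stored in fit feed an empirical-Bayes style shrinkage in transform (Example #11): each category's target mean Eyx is blended with the global mean Ey by a weight lambda_n that grows with the category's count, so rare categories are pulled toward the global mean. A hand-worked sketch, assuming lambda_ yields a weight in [0, 1] (its exact formula is not shown here):

import numpy as np

y = np.array([1, 1, 0, 1, 0, 0])
col = np.array(['a', 'a', 'a', 'b', 'b', 'c'], dtype=object)

Ey = y.mean()                                      # global target mean: 0.5
Eyx = {c: y[col == c].mean() for c in np.unique(col)}
# {'a': 0.667, 'b': 0.5, 'c': 0.0}

lambda_n = 0.75  # hypothetical shrinkage weight for a category seen 3 times
enc_a = lambda_n * Eyx['a'] + (1 - lambda_n) * Ey  # 0.625, shrunk toward Ey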
Example #11
    def transform(self, X):
        """Transform X using specified encoding scheme.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_new : 2-d array
            Transformed input.
        """
        if hasattr(X, 'iloc') and X.isna().values.any():
            if self.handle_missing == 'error':
                msg = ("Found missing values in input data; set "
                       "handle_missing='' to encode with missing values")
                raise ValueError(msg)
            if self.handle_missing != 'error':
                X = X.fillna(self.handle_missing)
        elif not hasattr(X, 'dtype') and isinstance(X, list):
            X = np.asarray(X, dtype=object)

        if hasattr(X, 'dtype') and X.dtype.kind == 'O':
            mask = _object_dtype_isnan(X)
            if mask.any():
                if self.handle_missing == 'error':
                    msg = ("Found missing values in input data; set "
                           "handle_missing='' to encode with missing values")
                    raise ValueError(msg)
                X[mask] = self.handle_missing

        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            X = check_array(X, dtype=object)
        else:
            X = X_temp

        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            Xi = X[:, i]
            valid_mask = np.in1d(Xi, self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    Xi = Xi.copy()
                    Xi[~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(Xi)

        out = []

        for j, cats in enumerate(self.categories_):
            unqX = np.unique(X[:, j])
            encoder = {x: 0 for x in unqX}
            if self.clf_type in ['binary-clf', 'regression']:
                for x in unqX:
                    if x not in cats:
                        Eyx = 0
                    else:
                        Eyx = self.Eyx_[j][x]
                    lambda_n = lambda_(self.counter_[j][x], self.n/self.k[j])
                    encoder[x] = lambda_n*Eyx + (1 - lambda_n)*self.Ey_
                x_out = np.zeros((len(X[:, j]), 1))
                for i, x in enumerate(X[:, j]):
                    x_out[i, 0] = encoder[x]
                out.append(x_out.reshape(-1, 1))
            if self.clf_type == 'multiclass-clf':
                x_out = np.zeros((len(X[:, j]), len(self.classes_)))
                lambda_n = {x: 0 for x in unqX}
                for x in unqX:
                    lambda_n[x] = lambda_(self.counter_[j][x], self.n/self.k[j])
                for k, c in enumerate(np.unique(self.classes_)):
                    for x in unqX:
                        if x not in cats:
                            Eyx = 0
                        else:
                            Eyx = self.Eyx_[c][j][x]
                        encoder[x] = lambda_n[x]*Eyx + \
                            (1 - lambda_n[x])*self.Ey_[c]
                    for i, x in enumerate(X[:, j]):
                        x_out[i, k] = encoder[x]
                out.append(x_out)
        out = np.hstack(out)
        return out