def test_object_dtype_isnan(dtype, val):
    X = np.array([[val, np.nan],
                  [np.nan, val]], dtype=dtype)

    expected_mask = np.array([[False, True],
                              [True, False]])

    mask = _object_dtype_isnan(X)

    assert_array_equal(mask, expected_mask)
def _daal_assert_all_finite(X, allow_nan=False, msg_dtype=None):
    """Like assert_all_finite, but only for ndarray."""
    # validation is also imported in extmath
    from sklearn.utils.extmath import _safe_accumulator_op

    if _get_config()['assume_finite']:
        return

    is_df = is_DataFrame(X)
    num_of_types = get_number_of_types(X)

    # If X is a heterogeneous pandas.DataFrame, convert it
    # to a list of C-contiguous arrays.
    if is_df and num_of_types > 1:
        lst = []
        for idx in X:
            arr = X[idx].to_numpy()
            lst.append(arr if arr.flags['C_CONTIGUOUS']
                       else np.ascontiguousarray(arr))
    else:
        X = np.asanyarray(X)
        is_df = False

    dt = np.dtype(get_dtype(X))
    is_float = dt.kind in 'fc'

    msg_err = "Input contains {} or a value too large for {!r}."
    type_err = 'infinity' if allow_nan else 'NaN, infinity'
    err = msg_err.format(type_err, msg_dtype if msg_dtype is not None else dt)

    if (X.ndim in [1, 2]
            and not np.any(np.equal(X.shape, 0))
            and dt in [np.float32, np.float64]):
        if X.ndim == 1:
            X = X.reshape((-1, 1))

        x_for_daal = lst if is_df and num_of_types > 1 else X

        if dt == np.float64:
            if not d4p.daal_assert_all_finite(x_for_daal, allow_nan, 0):
                raise ValueError(err)
        elif dt == np.float32:
            if not d4p.daal_assert_all_finite(x_for_daal, allow_nan, 1):
                raise ValueError(err)
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in the sum method. The sum is also
    # calculated safely to reduce dtype-induced overflows.
    elif is_float and np.isfinite(_safe_accumulator_op(np.sum, X)):
        pass
    elif is_float:
        if (allow_nan and np.isinf(X).any() or
                not allow_nan and not np.isfinite(X).all()):
            raise ValueError(err)
    # For object dtype data, we only check for NaNs (GH-13254).
    elif dt == np.dtype('object') and not allow_nan:
        if _object_dtype_isnan(X).any():
            raise ValueError("Input contains NaN")
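# A minimal usage sketch of the finiteness check above (illustrative only;
# it assumes daal4py and this module's helpers are importable). The helper
# name `_example_assert_all_finite` is hypothetical.
def _example_assert_all_finite():
    import numpy as np
    X_ok = np.ones((2, 2), dtype=np.float64)
    _daal_assert_all_finite(X_ok)  # finite input: returns silently
    X_bad = np.array([[1.0, np.nan]], dtype=np.float64)
    try:
        _daal_assert_all_finite(X_bad)  # NaN present and allow_nan=False
    except ValueError as e:
        print(e)  # "Input contains NaN, infinity or a value too large ..."
    _daal_assert_all_finite(X_bad, allow_nan=True)  # NaN tolerated, inf is not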
def _get_mask(X, value_to_mask):
    """Compute the boolean mask X == value_to_mask."""
    # In arrayfire, `dtype` is a method; map it to a NumPy dtype once and
    # use that consistently below (the original accessed `X.dtype.kind`
    # directly, which fails on a bound method).
    npdtype = typemap(X.dtype())
    if is_scalar_nan(value_to_mask):
        if npdtype.kind == "f":
            return af.isnan(X)
        elif npdtype.kind in ("i", "u"):
            # An integer array cannot hold NaNs: return an all-False mask.
            return af.constant(0, X.shape[0], X.shape[1], dtype=af.Dtype.b8)
        else:
            # np.isnan does not work on object dtypes.
            return _object_dtype_isnan(X)
    else:
        return X == value_to_mask
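# For intuition, a NumPy-only sketch of the same masking semantics. This
# mirror is illustrative and not part of the module: arrayfire calls are
# replaced by plain NumPy equivalents.
def _get_mask_numpy_sketch(X, value_to_mask):
    import numpy as np
    if isinstance(value_to_mask, float) and np.isnan(value_to_mask):
        if X.dtype.kind == "f":
            return np.isnan(X)                    # NaN mask for floats
        elif X.dtype.kind in ("i", "u"):
            return np.zeros(X.shape, dtype=bool)  # integers cannot be NaN
        else:
            return X != X                         # NaN is the only value != itself
    return X == value_to_mask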
def _handle_missing(self, X):
    """Impute missing values with the empty string '' or raise an error.

    Note: modifies the array in-place.
    """
    if self.handle_missing not in ['error', 'zero_impute']:
        raise ValueError("handle_missing should be either 'error' or "
                         f"'zero_impute', got {self.handle_missing!r}")
    missing_mask = _object_dtype_isnan(X)
    if missing_mask.any():
        if self.handle_missing == 'error':
            raise ValueError('Input data contains missing values.')
        elif self.handle_missing == 'zero_impute':
            X[missing_mask] = ''
    return X
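# A minimal sketch of how 'zero_impute' behaves. The stand-in class below is
# purely illustrative (it only supplies the `handle_missing` attribute the
# method reads); `_object_dtype_isnan` is assumed available in this module.
def _example_handle_missing():
    import numpy as np

    class _Demo:
        handle_missing = 'zero_impute'
        handle = _handle_missing  # bind the function above as a method

    X = np.array([['a', np.nan], [np.nan, 'b']], dtype=object)
    X = _Demo().handle(X)
    # NaN cells are now '' and the array was modified in-place:
    assert (X == np.array([['a', ''], ['', 'b']], dtype=object)).all()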
def _check_X(self, X):
    """Perform custom check_array:

    - convert list of strings to object dtype
    - check for missing values for object dtype data
      (check_array does not do that)
    """
    X_temp = check_array(X, dtype=None)
    if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
        # np.object was removed in NumPy 1.24; use the builtin instead.
        X = check_array(X, dtype=object)
    else:
        X = X_temp

    if X.dtype == np.dtype('object'):
        if not _get_config()['assume_finite']:
            if _object_dtype_isnan(X).any():
                raise ValueError("Input contains NaN")

    return X
def _daal_assert_all_finite(X, allow_nan=False, msg_dtype=None):
    """Like assert_all_finite, but only for ndarray."""
    # validation is also imported in extmath
    from sklearn.utils.extmath import _safe_accumulator_op

    if _get_config()['assume_finite']:
        return

    X = np.asanyarray(X)
    dt = X.dtype
    is_float = dt.kind in 'fc'

    msg_err = "Input contains {} or a value too large for {!r}."
    type_err = 'infinity' if allow_nan else 'NaN, infinity'
    err = msg_err.format(type_err,
                         msg_dtype if msg_dtype is not None else X.dtype)

    if (X.ndim in [1, 2]
            and not np.any(np.equal(X.shape, 0))
            and dt in [np.float32, np.float64]):
        if X.ndim == 1:
            X = X.reshape((-1, 1))
        if dt == np.float64:
            if not d4p.daal_assert_all_finite(X, allow_nan, 0):
                raise ValueError(err)
        elif dt == np.float32:
            if not d4p.daal_assert_all_finite(X, allow_nan, 1):
                raise ValueError(err)
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in the sum method. The sum is also
    # calculated safely to reduce dtype-induced overflows.
    elif is_float and np.isfinite(_safe_accumulator_op(np.sum, X)):
        pass
    elif is_float:
        if (allow_nan and np.isinf(X).any() or
                not allow_nan and not np.isfinite(X).all()):
            raise ValueError(err)
    # For object dtype data, we only check for NaNs (GH-13254).
    elif X.dtype == np.dtype('object') and not allow_nan:
        if _object_dtype_isnan(X).any():
            raise ValueError("Input contains NaN")
def transform(self, X, fast=True):
    """Transform X using the specified encoding scheme.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to encode.
    fast : bool, default=True
        Whether to use the fast n-gram similarity path.

    Returns
    -------
    X_new : 2-d array, shape [n_samples, n_features_new]
        Transformed input.
    """
    if hasattr(X, 'iloc') and X.isna().values.any():
        if self.handle_missing == 'error':
            msg = ("Found missing values in input data; set "
                   "handle_missing='' to encode with missing values")
            raise ValueError(msg)
        if self.handle_missing != 'error':
            X = X.fillna(self.handle_missing)
    elif not hasattr(X, 'dtype') and isinstance(X, list):
        X = np.asarray(X, dtype=object)

    if hasattr(X, 'dtype'):
        mask = _object_dtype_isnan(X)
        if X.dtype.kind == 'O' and mask.any():
            if self.handle_missing == 'error':
                msg = ("Found missing values in input data; set "
                       "handle_missing='' to encode with missing values")
                raise ValueError(msg)
            if self.handle_missing != 'error':
                X[mask] = self.handle_missing

    if LooseVersion(sklearn.__version__) > LooseVersion('0.21'):
        Xlist, n_samples, n_features = self._check_X(X)
    else:
        X = self._check_X(X)
        Xlist = X.T
        n_samples, n_features = X.shape

    for i in range(n_features):
        Xi = Xlist[i]
        valid_mask = np.in1d(Xi, self.categories_[i])
        if not np.all(valid_mask):
            if self.handle_unknown == 'error':
                diff = np.unique(X[~valid_mask, i])
                msg = ("Found unknown categories {0} in column {1}"
                       " during transform".format(diff, i))
                raise ValueError(msg)

    if self.similarity in ('levenshtein-ratio', 'jaro', 'jaro-winkler'):
        out = []
        vect = _VECTORIZED_EDIT_DISTANCES[self.similarity]
        for j, cats in enumerate(self.categories_):
            unqX = np.unique(Xlist[j])
            encoder_dict = {x: vect(x, cats.reshape(1, -1)) for x in unqX}
            encoder = [encoder_dict[x] for x in Xlist[j]]
            encoder = np.vstack(encoder)
            out.append(encoder)
        return np.hstack(out)

    elif self.similarity == 'ngram':
        min_n, max_n = self.ngram_range
        total_length = sum(len(x) for x in self.categories_)
        out = np.empty((len(X), total_length), dtype=self.dtype)
        last = 0
        for j, cats in enumerate(self.categories_):
            if fast:
                encoded_Xj = self._ngram_similarity_fast(Xlist[j], j)
            else:
                encoded_Xj = ngram_similarity(Xlist[j], cats,
                                              ngram_range=(min_n, max_n),
                                              hashing_dim=self.hashing_dim,
                                              dtype=np.float32)
            out[:, last:last + len(cats)] = encoded_Xj
            last += len(cats)
        return out
    else:
        raise ValueError("Unknown similarity: '%s'" % self.similarity)
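# A short usage sketch, assuming the enclosing class is the SimilarityEncoder
# named in fit's docstring and that it can be constructed with a `similarity`
# keyword (both are assumptions about this module's API; illustrative only):
def _example_similarity_encoder():
    import numpy as np
    enc = SimilarityEncoder(similarity='ngram')
    X = np.array([['london'], ['londres'], ['paris']], dtype=object)
    enc.fit(X)
    out = enc.transform(X)
    # One column per learned category; each entry is an n-gram similarity,
    # with 1.0 on exact matches.
    print(out.shape)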
def fit(self, X, y=None):
    """Fit the SimilarityEncoder to X.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to determine the categories of each feature.

    Returns
    -------
    self
    """
    if self.handle_missing not in ['error', '']:
        template = ("handle_missing should be either 'error' or "
                    "'', got %s")
        raise ValueError(template % self.handle_missing)
    if hasattr(X, 'iloc') and X.isna().values.any():
        if self.handle_missing == 'error':
            msg = ("Found missing values in input data; set "
                   "handle_missing='' to encode with missing values")
            raise ValueError(msg)
        if self.handle_missing != 'error':
            X = X.fillna(self.handle_missing)
    elif not hasattr(X, 'dtype') and isinstance(X, list):
        X = np.asarray(X, dtype=object)

    if hasattr(X, 'dtype'):
        mask = _object_dtype_isnan(X)
        if X.dtype.kind == 'O' and mask.any():
            if self.handle_missing == 'error':
                msg = ("Found missing values in input data; set "
                       "handle_missing='' to encode with missing values")
                raise ValueError(msg)
            if self.handle_missing != 'error':
                X[mask] = self.handle_missing

    if LooseVersion(sklearn.__version__) > LooseVersion('0.21'):
        Xlist, n_samples, n_features = self._check_X(X)
    else:
        X = self._check_X(X)
        Xlist = X.T
        n_samples, n_features = X.shape

    if self.handle_unknown not in ['error', 'ignore']:
        template = ("handle_unknown should be either 'error' or "
                    "'ignore', got %s")
        raise ValueError(template % self.handle_unknown)

    if (self.hashing_dim is not None
            and not isinstance(self.hashing_dim, int)):
        raise ValueError("value '%r' was specified for hashing_dim, "
                         "which has invalid type, expected None or "
                         "int." % self.hashing_dim)

    if self.categories not in ['auto', 'most_frequent', 'k-means']:
        for cats in self.categories:
            if not np.all(np.sort(cats) == np.array(cats)):
                raise ValueError("Unsorted categories are not yet "
                                 "supported")

    self.categories_ = list()
    self.random_state_ = check_random_state(self.random_state)

    for i in range(n_features):
        Xi = Xlist[i]
        if self.categories == 'auto':
            self.categories_.append(np.unique(Xi))
        elif self.categories == 'most_frequent':
            self.categories_.append(self.get_most_frequent(Xi))
        elif self.categories == 'k-means':
            uniques, count = np.unique(Xi, return_counts=True)
            self.categories_.append(
                get_kmeans_prototypes(uniques, self.n_prototypes,
                                      sample_weight=count,
                                      random_state=self.random_state_))
        else:
            if self.handle_unknown == 'error':
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    diff = np.unique(Xi[~valid_mask])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during fit".format(diff, i))
                    raise ValueError(msg)
            self.categories_.append(
                np.array(self.categories[i], dtype=object))

    if self.similarity == 'ngram':
        self.vectorizers_ = []
        self.vocabulary_count_matrices_ = []
        self.vocabulary_ngram_counts_ = []

        for i in range(n_features):
            vectorizer = CountVectorizer(analyzer='char',
                                         ngram_range=self.ngram_range,
                                         dtype=self.dtype,
                                         strip_accents=None)

            # Store the raw categories (and not the preprocessed
            # categories), but use the preprocessed categories to compute
            # the stored count matrices. This is done to preserve the
            # equivalence between the user input and the categories_
            # attribute of the SimilarityEncoder, while being compliant
            # with the CountVectorizer preprocessing steps.
            preprocessed_categories = np.array(
                list(map(preprocess, self.categories_[i])), dtype=object)

            vocabulary_count_matrix = vectorizer.fit_transform(
                preprocessed_categories)

            vocabulary_ngram_count = list(map(
                lambda x: get_ngram_count(preprocess(x), self.ngram_range),
                self.categories_[i]))

            self.vectorizers_.append(vectorizer)
            self.vocabulary_count_matrices_.append(vocabulary_count_matrix)
            self.vocabulary_ngram_counts_.append(vocabulary_ngram_count)

    return self
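# An illustration of the per-category structure built above, using the real
# scikit-learn CountVectorizer API on toy data (the category strings are
# made up; the shapes depend on the input):
def _example_vocabulary_count_matrix():
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    cats = np.array(['paris', 'parisian'], dtype=object)
    vec = CountVectorizer(analyzer='char', ngram_range=(2, 4))
    m = vec.fit_transform(cats)
    # One row per category, one column per char n-gram in the vocabulary.
    print(m.shape, len(vec.vocabulary_))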
def fit(self, X, y):
    """Fit the TargetEncoder to X.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to determine the categories of each feature.
    y : array
        The associated target vector.

    Returns
    -------
    self
    """
    if self.handle_missing not in ['error', '']:
        template = ("handle_missing should be either 'error' or "
                    "'', got %s")
        raise ValueError(template % self.handle_missing)
    if hasattr(X, 'iloc') and X.isna().values.any():
        if self.handle_missing == 'error':
            msg = ("Found missing values in input data; set "
                   "handle_missing='' to encode with missing values")
            raise ValueError(msg)
        if self.handle_missing != 'error':
            X = X.fillna(self.handle_missing)
    elif not hasattr(X, 'dtype') and isinstance(X, list):
        X = np.asarray(X, dtype=object)

    if hasattr(X, 'dtype'):
        mask = _object_dtype_isnan(X)
        if X.dtype.kind == 'O' and mask.any():
            if self.handle_missing == 'error':
                msg = ("Found missing values in input data; set "
                       "handle_missing='' to encode with missing values")
                raise ValueError(msg)
            if self.handle_missing != 'error':
                X[mask] = self.handle_missing

    if self.handle_unknown not in ['error', 'ignore']:
        template = ("handle_unknown should be either 'error' or "
                    "'ignore', got %s")
        raise ValueError(template % self.handle_unknown)

    if self.categories != 'auto':
        for cats in self.categories:
            if not np.all(np.sort(cats) == np.array(cats)):
                raise ValueError("Unsorted categories are not yet "
                                 "supported")

    X_temp = check_array(X, dtype=None)
    if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
        # np.object was removed in NumPy 1.24; use the builtin instead.
        X = check_array(X, dtype=object)
    else:
        X = X_temp

    n_samples, n_features = X.shape

    self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

    for j in range(n_features):
        le = self._label_encoders_[j]
        Xj = X[:, j]

        if self.categories == 'auto':
            le.fit(Xj)
        else:
            if self.handle_unknown == 'error':
                valid_mask = np.in1d(Xj, self.categories[j])
                if not np.all(valid_mask):
                    diff = np.unique(Xj[~valid_mask])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during fit".format(diff, j))
                    raise ValueError(msg)
            le.classes_ = np.array(self.categories[j])

    self.categories_ = [le.classes_ for le in self._label_encoders_]
    self.n = len(y)

    if self.clf_type in ['binary-clf', 'regression']:
        self.Eyx_ = [{cat: np.mean(y[X[:, j] == cat])
                      for cat in self.categories_[j]}
                     for j in range(len(self.categories_))]
        self.Ey_ = np.mean(y)
        self.counter_ = {j: collections.Counter(X[:, j])
                         for j in range(n_features)}
    if self.clf_type in ['multiclass-clf']:
        self.classes_ = np.unique(y)
        self.Eyx_ = {c: [{cat: np.mean((y == c)[X[:, j] == cat])
                          for cat in self.categories_[j]}
                         for j in range(len(self.categories_))]
                     for c in self.classes_}
        self.Ey_ = {c: np.mean(y == c) for c in self.classes_}
        self.counter_ = {j: collections.Counter(X[:, j])
                         for j in range(n_features)}
    self.k = {j: len(self.counter_[j]) for j in self.counter_}

    return self
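# A self-contained sketch of the statistics fit stores for a regression
# target: per-category target means (Eyx_), the global mean (Ey_), and
# category counts (counter_). Names and toy values are illustrative.
def _example_target_fit_stats():
    import collections
    import numpy as np
    X = np.array([['a'], ['a'], ['b']], dtype=object)
    y = np.array([1.0, 3.0, 5.0])
    Eyx = {cat: np.mean(y[X[:, 0] == cat]) for cat in np.unique(X[:, 0])}
    Ey = np.mean(y)
    counter = collections.Counter(X[:, 0])
    print(Eyx, Ey, counter)  # {'a': 2.0, 'b': 5.0} 3.0 Counter({'a': 2, 'b': 1})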
def transform(self, X):
    """Transform X using the specified encoding scheme.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data to encode.

    Returns
    -------
    X_new : 2-d array
        Transformed input.
    """
    if hasattr(X, 'iloc') and X.isna().values.any():
        if self.handle_missing == 'error':
            msg = ("Found missing values in input data; set "
                   "handle_missing='' to encode with missing values")
            raise ValueError(msg)
        if self.handle_missing != 'error':
            X = X.fillna(self.handle_missing)
    elif not hasattr(X, 'dtype') and isinstance(X, list):
        X = np.asarray(X, dtype=object)

    if hasattr(X, 'dtype'):
        mask = _object_dtype_isnan(X)
        if X.dtype.kind == 'O' and mask.any():
            if self.handle_missing == 'error':
                msg = ("Found missing values in input data; set "
                       "handle_missing='' to encode with missing values")
                raise ValueError(msg)
            if self.handle_missing != 'error':
                X[mask] = self.handle_missing

    X_temp = check_array(X, dtype=None)
    if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
        # np.object was removed in NumPy 1.24; use the builtin instead.
        X = check_array(X, dtype=object)
    else:
        X = X_temp

    n_samples, n_features = X.shape

    # np.int and np.bool were removed in NumPy 1.24; use the builtins.
    X_int = np.zeros_like(X, dtype=int)
    X_mask = np.ones_like(X, dtype=bool)

    for i in range(n_features):
        Xi = X[:, i]
        valid_mask = np.in1d(Xi, self.categories_[i])

        if not np.all(valid_mask):
            if self.handle_unknown == 'error':
                diff = np.unique(X[~valid_mask, i])
                msg = ("Found unknown categories {0} in column {1}"
                       " during transform".format(diff, i))
                raise ValueError(msg)
            else:
                # Set the problematic rows to an acceptable value and
                # continue. The rows are marked in `X_mask` and will be
                # removed later.
                X_mask[:, i] = valid_mask
                Xi = Xi.copy()
                Xi[~valid_mask] = self.categories_[i][0]
        X_int[:, i] = self._label_encoders_[i].transform(Xi)

    out = []

    for j, cats in enumerate(self.categories_):
        unqX = np.unique(X[:, j])
        encoder = {x: 0 for x in unqX}
        if self.clf_type in ['binary-clf', 'regression']:
            for x in unqX:
                if x not in cats:
                    Eyx = 0
                else:
                    Eyx = self.Eyx_[j][x]
                lambda_n = lambda_(self.counter_[j][x], self.n / self.k[j])
                encoder[x] = lambda_n * Eyx + (1 - lambda_n) * self.Ey_
            x_out = np.zeros((len(X[:, j]), 1))
            for i, x in enumerate(X[:, j]):
                x_out[i, 0] = encoder[x]
            out.append(x_out.reshape(-1, 1))
        if self.clf_type == 'multiclass-clf':
            x_out = np.zeros((len(X[:, j]), len(self.classes_)))
            lambda_n = {x: 0 for x in unqX}
            for x in unqX:
                lambda_n[x] = lambda_(self.counter_[j][x], self.n / self.k[j])
            for k, c in enumerate(np.unique(self.classes_)):
                for x in unqX:
                    if x not in cats:
                        Eyx = 0
                    else:
                        Eyx = self.Eyx_[c][j][x]
                    encoder[x] = lambda_n[x] * Eyx + \
                        (1 - lambda_n[x]) * self.Ey_[c]
                for i, x in enumerate(X[:, j]):
                    x_out[i, k] = encoder[x]
            out.append(x_out)
    out = np.hstack(out)
    return out
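# A tiny worked example of the shrinkage formula used above,
# encoder[x] = lambda * Eyx + (1 - lambda) * Ey. The weighting
# lambda_(count, n/k) = count / (count + n/k) shown here is an assumption
# for illustration; the actual lambda_ helper is defined elsewhere.
def _example_target_shrinkage():
    count, n, k = 3, 12, 4          # category seen 3 times; 12 rows, 4 categories
    Ey = 0.5                        # global target mean
    Eyx = 1.0                       # per-category target mean
    lam = count / (count + n / k)   # = 3 / 6 = 0.5 under the assumed form
    encoded = lam * Eyx + (1 - lam) * Ey
    print(encoded)                  # 0.75: shrunk toward the global mean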