def test_one_hot_encoder_drop_manual(missing_value):
    """Check `OneHotEncoder` with an explicit per-feature list of dropped categories.

    The last feature's dropped category is `missing_value`; when that value
    is NaN the comparisons are done piecewise, because NaN never compares
    equal to itself.
    """
    cats_to_drop = ['def', 12, 3, 56, missing_value]
    X = [
        ['abc', 12, 2, 55, 'a'],
        ['def', 12, 1, 55, 'a'],
        ['def', 12, 3, 56, missing_value],
    ]
    expected = [
        [1, 0, 1, 1, 1],
        [0, 1, 0, 1, 1],
        [0, 0, 0, 0, 0],
    ]

    encoder = OneHotEncoder(drop=cats_to_drop)
    encoded = encoder.fit_transform(X).toarray()
    assert_array_equal(encoded, expected)
    # the estimator must hold on to the very object it was given
    assert encoder.drop is cats_to_drop

    # resolve each feature's drop index back into the category it points at
    dropped_cats = []
    for categories, drop_index in zip(encoder.categories_, encoder.drop_idx_):
        dropped_cats.append(categories[drop_index])

    X_roundtrip = encoder.inverse_transform(encoded)
    X_reference = np.array(X, dtype=object)

    if not is_scalar_nan(cats_to_drop[-1]):
        # no NaN involved: plain element-wise equality is enough
        assert_array_equal(dropped_cats, cats_to_drop)
        assert_array_equal(X_reference, X_roundtrip)
        return

    # last value is np.nan: compare everything but the NaN entry directly
    assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1])
    assert is_scalar_nan(dropped_cats[-1])
    assert is_scalar_nan(cats_to_drop[-1])
    # leave out the last column, which holds the missing value
    assert_array_equal(X_reference[:, :-1], X_roundtrip[:, :-1])
    # last row: every entry but the final one matches, the final one is NaN
    assert_array_equal(X_reference[-1, :-1], X_roundtrip[-1, :-1])
    assert is_scalar_nan(X_reference[-1, -1])
    assert is_scalar_nan(X_roundtrip[-1, -1])
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
    """Check the categories learned with `categories='auto'`.

    The fitted `categories_` must be a list whose entries match `cat_exp`
    and have a dtype compatible with `cat_dtype`, whether the samples are
    presented in their original or in reversed order.
    """
    # order of categories should not depend on order of samples
    for samples in (X, X[::-1]):
        encoder = OneHotEncoder(categories='auto')
        encoder.fit(samples)
        assert isinstance(encoder.categories_, list)

        for learned, expected in zip(encoder.categories_, cat_exp):
            learned_list = learned.tolist()
            if not is_scalar_nan(expected[-1]):
                assert learned.tolist() == expected
            else:
                # NaN != NaN, so check the trailing NaN separately
                assert is_scalar_nan(learned_list[-1])
                assert learned_list[:-1] == expected[:-1]
            assert np.issubdtype(learned.dtype, cat_dtype)
def fit(self, X, y=None):
    """Fit the imputer on X.

    Parameters
    ----------
    X : array-like shape of (n_samples, n_features)
        Input data, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : ignored
        Not used by this method.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If NaN is the missing-value marker and the metric is neither
        callable nor listed in `_NAN_METRICS`, or if `n_neighbors` is not
        strictly positive.
    """
    # Check data integrity and calling arguments
    if is_scalar_nan(self.missing_values):
        force_all_finite = "allow-nan"
        nan_aware_metric = (self.metric in _NAN_METRICS
                            or callable(self.metric))
        if not nan_aware_metric:
            raise ValueError(
                "The selected metric does not support NaN values")
    else:
        force_all_finite = True

    if self.n_neighbors <= 0:
        raise ValueError("Expected n_neighbors > 0. Got {}".format(
            self.n_neighbors))

    X = check_array(
        X,
        accept_sparse=False,
        dtype=FLOAT_DTYPES,
        force_all_finite=force_all_finite,
        copy=self.copy,
    )
    super()._fit_indicator(X)
    _check_weights(self.weights)

    # keep the training data and its missing mask for use in `transform`
    self._fit_X = X
    self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
    return self
def fit(self, X: pd.DataFrame, y=None, **fit_params):
    """
    Learn the binning rule of every selected column and store it in `self.bins`.

    A column listed in `self.set_bins` uses the user-specified value (sorted
    when it is a list). Otherwise the result of `self._fit` is stored sorted,
    or, when `self._fit` returns None, a mapping from each non-NaN unique
    value to an integer code is stored.

    :param X: Pandas DataFrame with shape (n_sample, n_feature)
    :param y: a label column with shape (n_sample, )
    :return: self
    """
    cols = self.cols or X.columns.tolist()
    self.bins = {}

    for col in cols:
        if col in self.set_bins:
            # use the user specified cutoff point
            preset = self.set_bins[col]
            self.bins[col] = sorted(preset) if isinstance(preset, list) else preset
            continue

        cutoff = self._fit(X[col], y)
        if cutoff is None:
            # save a mapping from value to encoding value (starting from 1);
            # positions are counted before NaN is filtered out
            self.bins[col] = {
                value: position + 1
                for position, value in enumerate(X[col].unique())
                if not is_scalar_nan(value)
            }
        else:
            # save the sorted cutoff points
            self.bins[col] = sorted(cutoff)

    return self
def assign_group(x, bins):
    """
    Map every value in x to the cutoff that closes its interval on the right.

    Values at or below the first cutoff are mapped to that first cutoff,
    values above every cutoff are mapped to np.inf, and NaN values are
    passed through unchanged.

    ex. assign_group(range(6), [0, 2, 4]) => [0, 2, 2, 4, 4, np.inf]
    """
    cutoffs = np.array(bins)

    def _group_of(value):
        if is_scalar_nan(value):
            return value
        if value <= cutoffs[0]:
            return cutoffs[0]
        # position of the first cutoff that is larger or equal than the value;
        # argmax yields 0 when no cutoff qualifies
        position = np.argmax(cutoffs >= value)
        return cutoffs[position] if position > 0 else np.inf

    return [_group_of(value) for value in x]
def _generate_items(self, items):
    """Yield the items that are not NaN; tally the NaN ones separately.

    Each NaN item increments `self.nan_count`, which is created on the
    first NaN encountered.
    """
    for candidate in items:
        if is_scalar_nan(candidate):
            self.nan_count = getattr(self, 'nan_count', 0) + 1
        else:
            yield candidate
def decode_column(data_bunch, col_idx):
    """Return column `col_idx` of `data_bunch.data`, decoding categorical codes.

    When the column's feature name is a key of `data_bunch.categories`, each
    entry is used as an index into that category list (NaN becomes None)
    and an object array is returned. Otherwise the raw column is returned.
    """
    name = data_bunch.feature_names[col_idx]
    if name not in data_bunch.categories:
        # non-nominal attribute
        return data_bunch.data[:, col_idx]

    # XXX: This would be faster with np.take, although it does not
    # handle missing values fast (also not with mode='wrap')
    labels = data_bunch.categories[name]
    decoded = []
    for code in data_bunch.data[:, col_idx]:
        decoded.append(None if is_scalar_nan(code) else labels[int(code)])
    return np.array(decoded, dtype='O')
def decode_column(data_bunch, col_idx):
    """Extract one column of `data_bunch.data`, mapping category codes to labels.

    Columns whose feature name appears in `data_bunch.categories` are
    translated entry by entry: NaN becomes None, any other entry is cast
    to int and looked up in the category list. All other columns are
    returned as stored.
    """
    feature = data_bunch.feature_names[col_idx]
    is_nominal = feature in data_bunch.categories
    if not is_nominal:
        # non-nominal attribute
        return data_bunch.data[:, col_idx]

    # XXX: This would be faster with np.take, although it does not
    # handle missing values fast (also not with mode='wrap')
    lookup = data_bunch.categories[feature]

    def _label(raw):
        if is_scalar_nan(raw):
            return None
        return lookup[int(raw)]

    labels = [_label(raw) for raw in data_bunch.data[:, col_idx]]
    return np.array(labels, dtype='O')
def searchsorted(a, v, fill=-1):
    """
    Encode values in v with ascending cutoff points in a.

    Similar to numpy.searchsorted. Intervals are left open, right closed,
    except for the leftmost interval, which is closed at both ends: a value
    equal to the smallest cutoff is encoded as 1.

    :param a: cutoff points, expected in ascending order
    :param v: iterable of values to encode
    :param fill: code emitted for NaN values (default -1)
    :return: list with one code per value in v
    """
    encoded = []
    leftmost = None
    leftmost_known = False

    for value in v:
        if is_scalar_nan(value):
            encoded.append(fill)
            continue

        if not leftmost_known:
            # min(a) does not change between iterations, so compute it once
            # instead of once per value. It is computed lazily so that an
            # empty or all-NaN `v` still never touches `a`.
            leftmost = min(a)
            leftmost_known = True

        if value == leftmost:
            # the leftmost interval close at both ends
            encoded.append(1)
        else:
            encoded.append(_searchsorted(a, value))

    return encoded
def _unique_np(values, return_inverse=False, return_counts=False):
    """Helper function to find unique values for numpy arrays that correctly
    accounts for nans. See `_unique` documentation for details."""
    found = np.unique(values, return_inverse=return_inverse,
                      return_counts=return_counts)

    inverse = counts = None
    if return_inverse or return_counts:
        # np.unique returned a tuple: (uniques[, inverse][, counts])
        pieces = list(found)
        if return_counts:
            counts = pieces.pop()
        if return_inverse:
            inverse = pieces.pop()
        uniques = pieces[0]
    else:
        uniques = found

    # np.unique will have duplicate missing values at the end of `uniques`
    # here we clip the nans and remove it from uniques
    if uniques.size and is_scalar_nan(uniques[-1]):
        nan_idx = np.searchsorted(uniques, np.nan)
        stop = nan_idx + 1
        uniques = uniques[:stop]
        if return_inverse:
            # every duplicate nan now points at the single nan that is kept
            inverse[inverse > nan_idx] = nan_idx
        if return_counts:
            # fold the counts of all nan duplicates into the kept slot
            counts[nan_idx] = np.sum(counts[nan_idx:])
            counts = counts[:stop]

    outputs = [uniques]
    if return_inverse:
        outputs.append(inverse)
    if return_counts:
        outputs.append(counts)

    if len(outputs) == 1:
        return outputs[0]
    return tuple(outputs)
def fit(self, X: pd.DataFrame, y=None, **fit_params):
    """
    Learn the binning rule of every selected column and store it in `self.bins`.

    Columns are visited through `trange` unless the `verbose` fit parameter
    is falsy, in which case a plain `range` is used. A column listed in
    `self.set_bins` uses the user-specified value (sorted when it is a list).
    Otherwise the result of `self._fit` decides: a dict is stored as is, any
    other iterable is stored sorted, and None falls back to a mapping from
    each non-NaN unique value to an integer code.

    :param X: Pandas DataFrame with shape (n_sample, n_feature)
    :param y: a label column with shape (n_sample, )
    :return: self
    :raises ValueError: if `self._fit` returns something that is neither
        None, a dict, nor an iterable
    """
    cols = self.cols or X.columns.tolist()
    self.bins = {}
    counter = trange if fit_params.get('verbose', 1) else range

    for position in counter(len(cols)):
        col = cols[position]

        if col in self.set_bins:
            # use the user specified cutoff point
            preset = self.set_bins[col]
            self.bins[col] = sorted(preset) if isinstance(preset, list) else preset
            continue

        cutoff = self._fit(X[col], y)
        if cutoff is None:
            # save a mapping from value to encoding value (starting from 1);
            # positions are counted before NaN is filtered out
            self.bins[col] = {
                value: rank + 1
                for rank, value in enumerate(X[col].unique())
                if not is_scalar_nan(value)
            }
        elif isinstance(cutoff, dict):
            # save the mapping
            self.bins[col] = cutoff
        elif isinstance(cutoff, Iterable):
            # save the sorted cutoff points
            self.bins[col] = sorted(cutoff)
        else:
            raise ValueError(
                'Only iterable and dictionary is accepted as cutoff, get {} instead.'
                .format(type(cutoff)))

    return self
def _extract_missing(values):
    """Split the missing entries (None and NaN) out of `values`.

    Parameters
    ----------
    values: set
        Set of values to extract missing from.

    Returns
    -------
    output: set
        Set with missing values extracted. This is `values` itself when it
        contains no missing entry.

    missing_values: MissingValues
        Object with missing value information.
    """
    missing = {item for item in values
               if item is None or is_scalar_nan(item)}

    if not missing:
        return values, MissingValues(nan=False, none=False)

    has_none = None in missing
    if has_none:
        # If there is more than one missing value, then the extra one has
        # to be float('nan') or np.nan
        has_nan = len(missing) != 1
    else:
        # a non-empty set without None can only contain nan
        has_nan = True
    info = MissingValues(nan=has_nan, none=has_none)

    # create set without the missing values
    remaining = values - missing
    return remaining, info
def not_scalar_nan(x):
    """Return the logical negation of `is_scalar_nan(x)`."""
    is_nan = is_scalar_nan(x)
    return not is_nan
def _map(x):
    """Return `fill` for a NaN `x`, else `mapping[x]`, or `unseen` if `x` is not a key."""
    return fill if is_scalar_nan(x) else mapping.get(x, unseen)
def transform(self, X):
    """Impute all missing values in X.

    Missing entries are filled column by column from the training data kept
    by `fit`, using `self._calc_impute` on the distances between the rows
    that need imputation and the training rows. Receivers whose distances
    to every potential donor are NaN fall back to a column statistic: the
    mode of the observed training values when `self.ncat` is set and
    `self.ncat[col] > 1`, the mean otherwise.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The input data to complete.

    Returns
    -------
    X : array-like of shape (n_samples, n_output_features)
        The imputed dataset. `n_output_features` is the number of features
        that is not always missing during `fit`.

    Raises
    ------
    ValueError
        If X does not have the same number of columns as the data seen
        during `fit`.
    """
    check_is_fitted(self)
    if is_scalar_nan(self.missing_values):
        force_all_finite = "allow-nan"
    else:
        force_all_finite = True
    X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
                    force_all_finite=force_all_finite, copy=self.copy)
    X_indicator = super()._transform_indicator(X)

    if X.shape[1] != self._fit_X.shape[1]:
        raise ValueError("Incompatible dimension between the fitted "
                         "dataset and the one to be transformed")

    mask = _get_mask(X, self.missing_values)
    mask_fit_X = self._mask_fit_X
    # columns that had at least one observed value during training
    valid_mask = ~np.all(mask_fit_X, axis=0)

    if not np.any(mask):
        # No missing values in X
        # Remove columns where the training data is all nan
        # NOTE(review): unlike the final return, this path does not go
        # through `_concatenate_indicator` -- confirm this is intended.
        return X[:, valid_mask]

    row_missing_idx = np.flatnonzero(mask.any(axis=1))
    non_missing_fix_X = np.logical_not(mask_fit_X)

    # Maps from indices from X to indices in dist matrix.
    # The builtin `int` replaces the `np.int` alias, which was removed in
    # NumPy 1.24; both denote the same dtype.
    dist_idx_map = np.zeros(X.shape[0], dtype=int)
    dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])

    def process_chunk(dist_chunk, start):
        # rows of X covered by this chunk of the distance matrix
        row_missing_chunk = row_missing_idx[start:start + len(dist_chunk)]

        # Find and impute missing by column
        for col in range(X.shape[1]):
            if not valid_mask[col]:
                # column was all missing during training
                continue

            col_mask = mask[row_missing_chunk, col]
            if not np.any(col_mask):
                # column has no missing values
                continue

            potential_donors_idx, = np.nonzero(non_missing_fix_X[:, col])

            # receivers_idx are indices in X
            receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]

            # distances for samples that needed imputation for column
            dist_subset = (dist_chunk[dist_idx_map[receivers_idx] - start]
                           [:, potential_donors_idx])

            # receivers with all nan distances impute with a column
            # statistic (mean, or mode for categorical variables)
            all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
            all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]

            if all_nan_receivers_idx.size:
                if self.ncat is not None and self.ncat[col] > 1:
                    # categorical column: most frequent observed value
                    col_stat = mode(
                        self._fit_X[:, col][~mask_fit_X[:, col]]).mode
                else:
                    col_stat = np.ma.array(self._fit_X[:, col],
                                           mask=mask_fit_X[:, col]).mean()
                X[all_nan_receivers_idx, col] = col_stat

                if len(all_nan_receivers_idx) == len(receivers_idx):
                    # all receivers imputed with the column statistic
                    continue

                # receivers with at least one defined distance
                receivers_idx = receivers_idx[~all_nan_dist_mask]
                dist_subset = (dist_chunk[dist_idx_map[receivers_idx] - start]
                               [:, potential_donors_idx])

            n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
            value = self._calc_impute(
                dist_subset,
                n_neighbors,
                self._fit_X[potential_donors_idx, col],
                mask_fit_X[potential_donors_idx, col],
                col)
            X[receivers_idx, col] = value

    # `ncat` is only forwarded when it is set, matching the two call
    # variants this method used to spell out separately.
    chunk_kwargs = dict(
        metric=self.metric,
        missing_values=self.missing_values,
        force_all_finite=force_all_finite,
        reduce_func=process_chunk,
    )
    if self.ncat is not None:
        chunk_kwargs["ncat"] = self.ncat

    # process in fixed-memory chunks
    gen = pairwise_distances_chunked(
        X[row_missing_idx, :], self._fit_X, **chunk_kwargs)
    for chunk in gen:
        # process_chunk modifies X in place. No return value.
        pass

    return super()._concatenate_indicator(X[:, valid_mask], X_indicator)
def test_is_scalar_nan(value, result):
    """Check that `is_scalar_nan(value)` is `result` and is a Python bool."""
    outcome = is_scalar_nan(value)
    assert outcome is result
    # make sure that we are returning a Python bool
    assert isinstance(outcome, bool)
def test_is_scalar_nan(value, result):
    """Check that `is_scalar_nan(value)` is the very object `result`."""
    outcome = is_scalar_nan(value)
    assert outcome is result
def test_is_scalar_nan(value, result):
    """Identity check of `is_scalar_nan(value)` against the expected `result`."""
    observed = is_scalar_nan(value)
    assert observed is result
def __missing__(self, key):
    """Return `self.nan_count` for a NaN key once that attribute exists.

    Any other key, or a NaN key before `nan_count` has been set, raises
    KeyError.
    """
    tracks_nan = hasattr(self, 'nan_count')
    if not (tracks_nan and is_scalar_nan(key)):
        raise KeyError(key)
    return self.nan_count
def to_nonmissing_set(X):
    """Return the set of elements of X that are not scalar NaN."""
    from sklearn.utils import is_scalar_nan

    return {item for item in X if not is_scalar_nan(item)}
def is_valid(value):
    """Tell whether `value` is a known unique or a missing marker listed in `missing_in_uniques`."""
    if value in uniques_set:
        return True
    if missing_in_uniques.none and value is None:
        return True
    # last resort: nan is acceptable only when the uniques contained nan
    return missing_in_uniques.nan and is_scalar_nan(value)
def __init__(self, mapping):
    """Initialise the parent with `mapping` and record the value of its first NaN key.

    `self.nan_value` is only set when `mapping` has a key that is a scalar
    NaN; otherwise the attribute is left undefined.
    """
    super().__init__(mapping)

    not_found = object()
    nan_entry = next(
        (value for key, value in mapping.items() if is_scalar_nan(key)),
        not_found,
    )
    if nan_entry is not not_found:
        self.nan_value = nan_entry