def fit(self, X, y=None):
    """Fit the imputer on X.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : object, default=None
        Not used by this method.

    Returns
    -------
    self : object
        The fitted imputer. The validated data is kept in `_fit_X` and its
        missing-value mask in `_mask_fit_X`.

    Raises
    ------
    ValueError
        If NaN is the missing-value marker and `metric` is neither listed in
        `_NAN_METRICS` nor a callable, or if `n_neighbors` is not positive.
    """
    # When NaN marks missing entries, validation has to let NaNs through
    # and the distance metric must be able to cope with them.
    nan_is_marker = is_scalar_nan(self.missing_values)
    if nan_is_marker:
        metric_handles_nan = (
            self.metric in _NAN_METRICS or callable(self.metric))
        if not metric_handles_nan:
            raise ValueError(
                "The selected metric does not support NaN values")
    force_all_finite = "allow-nan" if nan_is_marker else True

    if self.n_neighbors <= 0:
        raise ValueError(
            "Expected n_neighbors > 0. Got {}".format(self.n_neighbors))

    X = check_array(
        X,
        accept_sparse=False,
        dtype=FLOAT_DTYPES,
        force_all_finite=force_all_finite,
        copy=self.copy,
    )
    super()._fit_indicator(X)
    _check_weights(self.weights)

    # Keep the training data and its mask; transform() draws donors from it.
    self._fit_X = X
    self._mask_fit_X = _get_mask(X, self.missing_values)
    return self
def fit(self, X, y=None):
    """Fit the indicator from the NaN mask of X.

    Parameters
    ----------
    X : array-like
        Input data; its NaN mask is obtained from `_get_mask`.

    y : object, default=None
        Not used by this method.

    Returns
    -------
    self : object
    """
    # Only the mask is handed to the indicator; X itself is not stored here.
    self._fit_indicator(_get_mask(X, value_to_mask=np.nan))
    return self
def _missing_mean(X, missing_value):
    """Return the mean of X along axis 0, skipping masked entries.

    Entries flagged by ``_get_mask(X, missing_value)`` are excluded from the
    mean. Positions whose mean is itself masked are reported as NaN.
    """
    hidden = _get_mask(X, missing_value)
    col_means = np.ma.array(X, mask=hidden).mean(axis=0)

    # Work on the underlying data buffer and blank out the masked positions.
    result = col_means.data
    result[col_means.mask] = np.nan
    return result
def transform(self, X):
    """Impute all missing values in X.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The input data to complete.

    Returns
    -------
    X : array-like of shape (n_samples, n_output_features)
        The imputed dataset, restricted to the features that were not
        always missing during `fit` and then passed through
        `_concatenate_indicator` together with the indicator output.

    Raises
    ------
    ValueError
        If X does not have the same number of columns as the data seen
        during `fit`.
    """
    check_is_fitted(self)
    force_all_finite = (
        "allow-nan" if is_scalar_nan(self.missing_values) else True)
    X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES,
                    force_all_finite=force_all_finite, copy=self.copy)
    X_indicator = super()._transform_indicator(X)

    if X.shape[1] != self._fit_X.shape[1]:
        raise ValueError("Incompatible dimension between the fitted "
                         "dataset and the one to be transformed")

    mask = _get_mask(X, self.missing_values)
    mask_fit_X = self._mask_fit_X
    # Columns that had at least one observed value in the training data.
    valid_mask = ~np.all(mask_fit_X, axis=0)

    if not np.any(mask):
        # No missing values in X: only the columns that were entirely
        # missing during training are removed. The result still goes through
        # _concatenate_indicator so that the output layout is the same as
        # on the main return path below.
        return super()._concatenate_indicator(X[:, valid_mask], X_indicator)

    row_missing_idx = np.flatnonzero(mask.any(axis=1))
    non_missing_fix_X = np.logical_not(mask_fit_X)

    # Maps from indices from X to indices in dist matrix.
    # The builtin `int` is used because the `np.int` alias no longer exists
    # in recent NumPy releases.
    dist_idx_map = np.zeros(X.shape[0], dtype=int)
    dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])

    def process_chunk(dist_chunk, start):
        """Impute, in place, the rows of X covered by this distance chunk."""
        row_missing_chunk = row_missing_idx[start:start + len(dist_chunk)]

        # Find and impute missing by column
        for col in range(X.shape[1]):
            if not valid_mask[col]:
                # column was all missing during training
                continue

            col_mask = mask[row_missing_chunk, col]
            if not np.any(col_mask):
                # column has no missing values
                continue

            potential_donors_idx, = np.nonzero(non_missing_fix_X[:, col])

            # receivers_idx are indices in X
            receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]

            # distances for samples that needed imputation for column
            dist_subset = (dist_chunk[dist_idx_map[receivers_idx] - start]
                           [:, potential_donors_idx])

            # receivers with all nan distances are imputed with a column
            # statistic instead of neighbours
            all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
            all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]

            if all_nan_receivers_idx.size:
                # Mode for columns whose ncat entry is > 1 (presumably the
                # categorical ones -- confirm against how ncat is built),
                # masked mean otherwise.
                if self.ncat is not None and self.ncat[col] > 1:
                    col_stat = mode(
                        self._fit_X[:, col][~mask_fit_X[:, col]]).mode
                else:
                    col_stat = np.ma.array(
                        self._fit_X[:, col],
                        mask=mask_fit_X[:, col]).mean()
                X[all_nan_receivers_idx, col] = col_stat

                if len(all_nan_receivers_idx) == len(receivers_idx):
                    # every receiver was imputed with the column statistic
                    continue

                # receivers with at least one defined distance
                receivers_idx = receivers_idx[~all_nan_dist_mask]
                dist_subset = (
                    dist_chunk[dist_idx_map[receivers_idx] - start]
                    [:, potential_donors_idx])

            n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
            value = self._calc_impute(
                dist_subset,
                n_neighbors,
                self._fit_X[potential_donors_idx, col],
                mask_fit_X[potential_donors_idx, col],
                col)
            X[receivers_idx, col] = value

    # `ncat` is forwarded to the distance computation only when it is set.
    distance_kwargs = {
        "metric": self.metric,
        "missing_values": self.missing_values,
        "force_all_finite": force_all_finite,
        "reduce_func": process_chunk,
    }
    if self.ncat is not None:
        distance_kwargs["ncat"] = self.ncat

    # process in fixed-memory chunks
    gen = pairwise_distances_chunked(
        X[row_missing_idx, :], self._fit_X, **distance_kwargs)
    for _ in gen:
        # process_chunk modifies X in place. No return value.
        pass

    return super()._concatenate_indicator(X[:, valid_mask], X_indicator)