コード例 #1
0
ファイル: knn_imputer.py プロジェクト: Ruhul964/dsa2
    def fit(self, X, y=None):
        """Validate the hyper-parameters and store the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : ignored
            Not used by this method.

        Returns
        -------
        self : object
            The fitted imputer, holding the validated data in `_fit_X` and
            its missing-value mask in `_mask_fit_X`.

        Raises
        ------
        ValueError
            If NaN is the missing-value marker and the metric is neither
            callable nor listed in `_NAN_METRICS`, or if `n_neighbors` is
            not strictly positive.
        """
        nan_is_missing = is_scalar_nan(self.missing_values)

        # NaN placeholders are only usable with a NaN-aware metric.
        if nan_is_missing:
            metric_ok = (self.metric in _NAN_METRICS
                         or callable(self.metric))
            if not metric_ok:
                raise ValueError(
                    "The selected metric does not support NaN values")

        # Let NaN through input validation only when NaN marks missing data.
        force_all_finite = "allow-nan" if nan_is_missing else True

        if self.n_neighbors <= 0:
            raise ValueError("Expected n_neighbors > 0. Got {}".format(
                self.n_neighbors))

        validated = check_array(X,
                                accept_sparse=False,
                                dtype=FLOAT_DTYPES,
                                force_all_finite=force_all_finite,
                                copy=self.copy)
        super()._fit_indicator(validated)

        _check_weights(self.weights)

        # Keep the training samples and their mask for use at transform time.
        self._fit_X = validated
        self._mask_fit_X = _get_mask(validated, self.missing_values)
        return self
コード例 #2
0
 def fit(self, X, y=None):
     """Fit the indicator on the NaN mask of X.

     The mask is obtained from `_get_mask` with NaN as the value to mask
     and handed to `self._fit_indicator`. `y` is not used.

     Returns
     -------
     self : object
     """
     self._fit_indicator(_get_mask(X, value_to_mask=np.nan))
     return self
コード例 #3
0
def _missing_mean(X, missing_value):
    """Return the mean of X along axis 0, skipping masked entries.

    Entries flagged by `_get_mask(X, missing_value)` are excluded from the
    mean. Positions whose mean comes out masked are set to NaN in the
    returned array.
    """
    missing = _get_mask(X, missing_value)
    column_means = np.ma.array(X, mask=missing).mean(axis=0)

    # Work on the plain data of the masked result and mark the masked
    # positions explicitly with NaN.
    means = column_means.data
    means[column_means.mask] = np.nan
    return means
コード例 #4
0
ファイル: knn_imputer.py プロジェクト: Ruhul964/dsa2
    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        X : array-like of shape (n_samples, n_output_features)
            The imputed dataset. `n_output_features` is the number of features
            that is not always missing during `fit`.

        Raises
        ------
        ValueError
            If X does not have the same number of columns as the data stored
            during `fit`.
        """

        check_is_fitted(self)
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
        X = check_array(X,
                        accept_sparse=False,
                        dtype=FLOAT_DTYPES,
                        force_all_finite=force_all_finite,
                        copy=self.copy)
        X_indicator = super()._transform_indicator(X)

        if X.shape[1] != self._fit_X.shape[1]:
            raise ValueError("Incompatible dimension between the fitted "
                             "dataset and the one to be transformed")

        mask = _get_mask(X, self.missing_values)
        mask_fit_X = self._mask_fit_X
        # Columns that had at least one observed value in the training data.
        valid_mask = ~np.all(mask_fit_X, axis=0)

        if not np.any(mask):
            # No missing values in X
            # Remove columns where the training data is all nan
            return X[:, valid_mask]

        row_missing_idx = np.flatnonzero(mask.any(axis=1))

        non_missing_fit_X = np.logical_not(mask_fit_X)

        # Maps from indices from X to indices in dist matrix.
        # NOTE: the builtin `int` is used because the `np.int` alias was
        # deprecated in NumPy 1.20 and removed in 1.24; it is the same type.
        dist_idx_map = np.zeros(X.shape[0], dtype=int)
        dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])

        def process_chunk(dist_chunk, start):
            """Impute, in place, the rows of X covered by one distance chunk.

            Row `i` of `dist_chunk` belongs to sample
            `row_missing_idx[start + i]` of X; its columns are indexed by
            the rows of the training data.
            """
            row_missing_chunk = row_missing_idx[start:start + len(dist_chunk)]

            # Find and impute missing by column
            for col in range(X.shape[1]):
                if not valid_mask[col]:
                    # column was all missing during training
                    continue

                col_mask = mask[row_missing_chunk, col]
                if not np.any(col_mask):
                    # column has no missing values
                    continue

                potential_donors_idx, = np.nonzero(non_missing_fit_X[:, col])

                # receivers_idx are indices in X
                receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]

                # distances for samples that needed imputation for column
                dist_subset = (dist_chunk[dist_idx_map[receivers_idx] -
                                          start][:, potential_donors_idx])

                # receivers with all nan distances are imputed with a
                # column-level statistic instead of their neighbours
                all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
                all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]

                if all_nan_receivers_idx.size:
                    # Columns with `ncat[col] > 1` fall back to the mode of
                    # the observed training values; every other column (and
                    # every column when `ncat` is None) falls back to the
                    # mean.
                    use_mode = (self.ncat is not None
                                and self.ncat[col] > 1)
                    if use_mode:
                        col_stat = mode(
                            self._fit_X[:, col][~mask_fit_X[:, col]]).mode
                    else:
                        col_stat = np.ma.array(self._fit_X[:, col],
                                               mask=mask_fit_X[:, col]).mean()

                    X[all_nan_receivers_idx, col] = col_stat

                    if len(all_nan_receivers_idx) == len(receivers_idx):
                        # all receivers imputed with the column statistic
                        continue

                    # receivers with at least one defined distance
                    receivers_idx = receivers_idx[~all_nan_dist_mask]
                    dist_subset = (dist_chunk[dist_idx_map[receivers_idx] -
                                              start][:, potential_donors_idx])

                n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
                value = self._calc_impute(
                    dist_subset, n_neighbors, self._fit_X[potential_donors_idx,
                                                          col],
                    mask_fit_X[potential_donors_idx, col], col)
                X[receivers_idx, col] = value

        # `ncat` is forwarded to the distance computation only when it is
        # set, so the call is unchanged when `ncat` is None.
        dist_kwargs = {"metric": self.metric}
        if self.ncat is not None:
            dist_kwargs["ncat"] = self.ncat
        dist_kwargs.update(missing_values=self.missing_values,
                           force_all_finite=force_all_finite,
                           reduce_func=process_chunk)

        # process in fixed-memory chunks
        gen = pairwise_distances_chunked(
            X[row_missing_idx, :],
            self._fit_X,
            **dist_kwargs)
        for _ in gen:
            # process_chunk modifies X in place. No return value.
            pass

        return super()._concatenate_indicator(X[:, valid_mask], X_indicator)