Code example #1
def test_one_hot_encoder_drop_manual(missing_value):
    cats_to_drop = ['def', 12, 3, 56, missing_value]
    enc = OneHotEncoder(drop=cats_to_drop)
    X = [['abc', 12, 2, 55, 'a'], ['def', 12, 1, 55, 'a'],
         ['def', 12, 3, 56, missing_value]]
    trans = enc.fit_transform(X).toarray()
    exp = [[1, 0, 1, 1, 1], [0, 1, 0, 1, 1], [0, 0, 0, 0, 0]]
    assert_array_equal(trans, exp)
    assert enc.drop is cats_to_drop

    dropped_cats = [
        cat[feature] for cat, feature in zip(enc.categories_, enc.drop_idx_)
    ]
    X_inv_trans = enc.inverse_transform(trans)
    X_array = np.array(X, dtype=object)

    # last value is np.nan
    if is_scalar_nan(cats_to_drop[-1]):
        assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1])
        assert is_scalar_nan(dropped_cats[-1])
        assert is_scalar_nan(cats_to_drop[-1])
        # compare all but the last column, which contains the missing values
        assert_array_equal(X_array[:, :-1], X_inv_trans[:, :-1])

        # check last column is the missing value
        assert_array_equal(X_array[-1, :-1], X_inv_trans[-1, :-1])
        assert is_scalar_nan(X_array[-1, -1])
        assert is_scalar_nan(X_inv_trans[-1, -1])
    else:
        assert_array_equal(dropped_cats, cats_to_drop)
        assert_array_equal(X_array, X_inv_trans)
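For context, a minimal sketch of the behavior this test exercises, assuming scikit-learn >= 0.24, where OneHotEncoder treats np.nan as a regular category that can also be dropped:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = [['abc', 2.0], ['def', np.nan]]
enc = OneHotEncoder(drop=['def', np.nan])  # drop one category per feature
print(enc.fit_transform(X).toarray())
# [[1. 1.]
#  [0. 0.]]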
Code example #2
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
    # order of categories should not depend on order of samples
    for Xi in [X, X[::-1]]:
        enc = OneHotEncoder(categories='auto')
        enc.fit(Xi)
        # assert enc.categories == 'auto'
        assert isinstance(enc.categories_, list)
        for res, exp in zip(enc.categories_, cat_exp):
            res_list = res.tolist()
            if is_scalar_nan(exp[-1]):
                assert is_scalar_nan(res_list[-1])
                assert res_list[:-1] == exp[:-1]
            else:
                assert res.tolist() == exp
            assert np.issubdtype(res.dtype, cat_dtype)
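The invariant under test, as a standalone sketch with scikit-learn's OneHotEncoder: fitted categories are sorted, so they do not depend on sample order:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([['b'], ['a'], ['b']], dtype=object)
for Xi in (X, X[::-1]):
    enc = OneHotEncoder(categories='auto').fit(Xi)
    print(enc.categories_)  # [array(['a', 'b'], dtype=object)] both times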
Code example #3
File: knn_imputer.py  Project: Ruhul964/dsa2
    def fit(self, X, y=None):
        """Fit the imputer on X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        Returns
        -------
        self : object
        """
        # Check data integrity and calling arguments
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
            if self.metric not in _NAN_METRICS and not callable(self.metric):
                raise ValueError(
                    "The selected metric does not support NaN values")
        if self.n_neighbors <= 0:
            raise ValueError("Expected n_neighbors > 0. Got {}".format(
                self.n_neighbors))

        X = check_array(X,
                        accept_sparse=False,
                        dtype=FLOAT_DTYPES,
                        force_all_finite=force_all_finite,
                        copy=self.copy)
        super()._fit_indicator(X)

        _check_weights(self.weights)
        self._fit_X = X
        self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
        return self
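This looks like a vendored copy of scikit-learn's KNNImputer; a minimal usage sketch against the upstream class (values follow the scikit-learn documentation example):

import numpy as np
from sklearn.impute import KNNImputer

X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
imputer = KNNImputer(n_neighbors=2)
print(imputer.fit_transform(X))
# [[1.  2.  4. ]
#  [3.  4.  3. ]
#  [5.5 6.  5. ]
#  [8.  8.  7. ]]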
Code example #4
File: base.py  Project: Kelly0531/Financial-Risk
    def fit(self, X: pd.DataFrame, y=None, **fit_params):
        """
        :param X: Pandas DataFrame with shape (n_sample, n_feature)
        :param y: a label column with shape (n_sample, )
        """
        cols = self.cols or X.columns.tolist()
        self.bins = dict()

        for col in cols:
            # use the user specified cutoff point
            if col in self.set_bins:
                if isinstance(self.set_bins[col], list):
                    self.bins[col] = sorted(self.set_bins[col])
                else:
                    self.bins[col] = self.set_bins[col]
                continue

            cutoff = self._fit(X[col], y)
            if cutoff is not None:
                # save the sorted cutoff points
                self.bins[col] = sorted(cutoff)
            else:
                # save a mapping from value to encoding value (starting from 1)
                self.bins[col] = {v: (k + 1) for k, v in enumerate(X[col].unique())
                                  if not is_scalar_nan(v)}
        return self
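The fallback branch builds a value-to-code mapping that skips NaN. A standalone sketch of that comprehension; note that the code comes from the enumerate index, so a skipped NaN leaves a gap:

import numpy as np
import pandas as pd
from sklearn.utils import is_scalar_nan

values = pd.Series(['a', np.nan, 'b']).unique()   # array(['a', nan, 'b'], dtype=object)
codes = {v: k + 1 for k, v in enumerate(values) if not is_scalar_nan(v)}
print(codes)  # {'a': 1, 'b': 3}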
Code example #5
def assign_group(x, bins):
    """ Assign the right cutoff value for each value in x except for the first interval
        which take the left cutoff value
        ex. assign_group(range(6), [0, 2, 4]) => [0, 2, 2, 4, 4, np.inf]
    """
    # values beyond the last cutoff map to np.inf
    bins = np.array(bins)
    groups = list()
    for v in x:
        if is_scalar_nan(v):
            groups.append(v)

        elif v <= bins[0]:
            # the first interval is closed at both ends
            groups.append(bins[0])

        else:
            # find the first cutoff value greater than or equal to the current value
            idx = np.argmax(bins >= v)
            if idx > 0:
                groups.append(bins[idx])
            else:
                # none of the cutoff points is larger than the value
                groups.append(np.inf)
    return groups
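A quick check of the docstring example, using the assign_group function above (numpy is assumed to be imported as np):

print(assign_group(range(6), [0, 2, 4]))
# [0, 2, 2, 4, 4, inf]  (exact scalar reprs vary across NumPy versions)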
Code example #6
File: utils.py  Project: lorentzenchr/sk_encoder_cv
 def _generate_items(self, items):
     """Generate items without nans. Stores the nan counts seperately."""
     for item in items:
         if not is_scalar_nan(item):
             yield item
             continue
         if not hasattr(self, 'nan_count'):
             self.nan_count = 0
         self.nan_count += 1
Code example #7
File: test_openml.py  Project: amueller/scikit-learn
 def decode_column(data_bunch, col_idx):
     col_name = data_bunch.feature_names[col_idx]
     if col_name in data_bunch.categories:
         # XXX: This would be faster with np.take, although it does not
         # handle missing values fast (also not with mode='wrap')
         cat = data_bunch.categories[col_name]
         result = [None if is_scalar_nan(idx) else cat[int(idx)]
                   for idx in data_bunch.data[:, col_idx]]
         return np.array(result, dtype='O')
     else:
         # non-nominal attribute
         return data_bunch.data[:, col_idx]
Code example #8
def searchsorted(a, v, fill=-1):
    """ Encode values in v with ascending cutoff points in a. Similar to numpy.searchsorted
        Left open right close except for the leftmost interval, which is close at both ends.
    """
    encoded = list()
    for value in v:
        if is_scalar_nan(value):
            encoded.append(fill)
        elif value == min(a):
            # the leftmost interval close at both ends
            encoded.append(1)
        else:
            encoded.append(_searchsorted(a, value))
    return encoded
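The _searchsorted helper is not shown in this excerpt; a hypothetical sketch of it with np.searchsorted, using side='left' so the intervals come out left-open and right-closed:

import numpy as np

def _searchsorted(a, value):
    # hypothetical helper: a[i-1] < value <= a[i] is encoded as i
    return int(np.searchsorted(a, value, side='left'))

print(searchsorted([0, 2, 4], [0, 1, 2, 3, np.nan]))
# [1, 1, 1, 2, -1]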
Code example #9
File: utils.py  Project: lorentzenchr/sk_encoder_cv
def _unique_np(values, return_inverse=False, return_counts=False):
    """Helper function to find unique values for numpy arrays that correctly
    accounts for nans. See `_unique` documentation for details."""
    uniques = np.unique(values,
                        return_inverse=return_inverse,
                        return_counts=return_counts)

    inverse, counts = None, None

    if return_counts:
        *uniques, counts = uniques

    if return_inverse:
        *uniques, inverse = uniques

    if return_counts or return_inverse:
        uniques = uniques[0]

    # np.unique may leave duplicate missing values at the end of `uniques`;
    # clip them here so that a single NaN remains
    if uniques.size and is_scalar_nan(uniques[-1]):
        nan_idx = np.searchsorted(uniques, np.nan)
        uniques = uniques[:nan_idx + 1]
        if return_inverse:
            inverse[inverse > nan_idx] = nan_idx

        if return_counts:
            counts[nan_idx] = np.sum(counts[nan_idx:])
            counts = counts[:nan_idx + 1]

    ret = (uniques, )

    if return_inverse:
        ret += (inverse, )

    if return_counts:
        ret += (counts, )

    return ret[0] if len(ret) == 1 else ret
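A usage sketch; the printed result is the same whether np.unique collapses duplicate NaNs itself (NumPy >= 1.24 with equal_nan=True) or the clipping above does it:

import numpy as np

uniques, counts = _unique_np(np.array([np.nan, 1.0, np.nan, 2.0]),
                             return_counts=True)
print(uniques)  # [ 1.  2. nan] -- a single NaN survives
print(counts)   # [1 1 2]       -- the NaN counts are merged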
Code example #10
    def fit(self, X: pd.DataFrame, y=None, **fit_params):
        """
        :param X: Pandas DataFrame with shape (n_sample, n_feature)
        :param y: a label column with shape (n_sample, )
        """
        cols = self.cols or X.columns.tolist()
        self.bins = dict()

        _range = trange if fit_params.get('verbose', 1) else range
        for i in _range(len(cols)):
            col = cols[i]

            # use the user specified cutoff point
            if col in self.set_bins:
                if isinstance(self.set_bins[col], list):
                    self.bins[col] = sorted(self.set_bins[col])
                else:
                    self.bins[col] = self.set_bins[col]
                continue

            cutoff = self._fit(X[col], y)
            if cutoff is not None:
                if isinstance(cutoff, dict):
                    # save the mapping
                    self.bins[col] = cutoff
                elif isinstance(cutoff, Iterable):
                    # save the sorted cutoff points
                    self.bins[col] = sorted(cutoff)
                else:
                    raise ValueError(
                        'Only an iterable or a dict is accepted as cutoff, got {} instead.'
                        .format(type(cutoff)))
            else:
                # save a mapping from value to encoding value (starting from 1)
                self.bins[col] = {v: (k + 1) for k, v in enumerate(X[col].unique())
                                  if not is_scalar_nan(v)}
        return self
Code example #11
File: utils.py  Project: lorentzenchr/sk_encoder_cv
def _extract_missing(values):
    """Extract missing values from `values`.

    Parameters
    ----------
    values : set
        Set of values to extract missing from.

    Returns
    -------
    output : set
        Set with missing values extracted.

    missing_values : MissingValues
        Object with missing value information.
    """
    missing_values_set = {
        value
        for value in values if value is None or is_scalar_nan(value)
    }

    if not missing_values_set:
        return values, MissingValues(nan=False, none=False)

    if None in missing_values_set:
        if len(missing_values_set) == 1:
            output_missing_values = MissingValues(nan=False, none=True)
        else:
            # If there is more than one missing value, then it has to be
            # float('nan') or np.nan
            output_missing_values = MissingValues(nan=True, none=True)
    else:
        output_missing_values = MissingValues(nan=True, none=False)

    # create set without the missing values
    output = values - missing_values_set
    return output, output_missing_values
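A usage sketch, assuming MissingValues is the small named-tuple-style container defined alongside this helper:

import numpy as np

output, missing = _extract_missing({'a', 'b', None, np.nan})
print(output)   # {'a', 'b'} (set order may vary)
print(missing)  # MissingValues(nan=True, none=True)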
Code example #12
def not_scalar_nan(x):
    return not is_scalar_nan(x)
Code example #13
 def _map(x):
     if is_scalar_nan(x):
         return fill
     else:
         return mapping.get(x, unseen)
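Here mapping, fill, and unseen are closure variables; the same pattern as a self-contained sketch (make_mapper and its defaults are illustrative, not from the source):

import numpy as np
from sklearn.utils import is_scalar_nan

def make_mapper(mapping, fill=-1, unseen=0):
    def _map(x):
        if is_scalar_nan(x):
            return fill                     # NaN gets the fill code
        return mapping.get(x, unseen)       # unknown values get `unseen`
    return _map

mapper = make_mapper({'a': 1, 'b': 2})
print([mapper(v) for v in ['a', 'b', 'c', np.nan]])  # [1, 2, 0, -1]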
Code example #14
File: knn_imputer.py  Project: Ruhul964/dsa2
    def transform(self, X):
        """Impute all missing values in X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to complete.
        Returns
        -------
        X : array-like of shape (n_samples, n_output_features)
            The imputed dataset. `n_output_features` is the number of features
            that are not always missing during `fit`.
        """

        check_is_fitted(self)
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
        X = check_array(X,
                        accept_sparse=False,
                        dtype=FLOAT_DTYPES,
                        force_all_finite=force_all_finite,
                        copy=self.copy)
        X_indicator = super()._transform_indicator(X)

        if X.shape[1] != self._fit_X.shape[1]:
            raise ValueError("Incompatible dimension between the fitted "
                             "dataset and the one to be transformed")

        mask = _get_mask(X, self.missing_values)
        mask_fit_X = self._mask_fit_X
        valid_mask = ~np.all(mask_fit_X, axis=0)

        if not np.any(mask):
            # No missing values in X
            # Remove columns where the training data is all nan
            return X[:, valid_mask]

        row_missing_idx = np.flatnonzero(mask.any(axis=1))

        non_missing_fit_X = np.logical_not(mask_fit_X)

        # Maps from indices from X to indices in dist matrix
        dist_idx_map = np.zeros(X.shape[0], dtype=int)
        dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])

        def process_chunk(dist_chunk, start):
            row_missing_chunk = row_missing_idx[start:start + len(dist_chunk)]

            # Find and impute missing by column
            for col in range(X.shape[1]):
                if not valid_mask[col]:
                    # column was all missing during training
                    continue

                col_mask = mask[row_missing_chunk, col]
                if not np.any(col_mask):
                    # column has no missing values
                    continue

                potential_donors_idx, = np.nonzero(non_missing_fit_X[:, col])

                # receivers_idx are indices in X
                receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]

                # distances for samples that needed imputation for column
                dist_subset = (dist_chunk[dist_idx_map[receivers_idx] -
                                          start][:, potential_donors_idx])

                # receivers with all nan distances impute with mean
                all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
                all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]

                # Adapted: compute the mode for categorical columns, the mean otherwise.
                if all_nan_receivers_idx.size:
                    if self.ncat is None:
                        col_stat = np.ma.array(self._fit_X[:, col],
                                               mask=mask_fit_X[:, col]).mean()
                    elif self.ncat[col] > 1:
                        col_stat = mode(
                            self._fit_X[:, col][~mask_fit_X[:, col]]).mode
                    else:
                        col_stat = np.ma.array(self._fit_X[:, col],
                                               mask=mask_fit_X[:, col]).mean()

                    X[all_nan_receivers_idx, col] = col_stat

                    if len(all_nan_receivers_idx) == len(receivers_idx):
                        # all receivers imputed with mean
                        continue

                    # receivers with at least one defined distance
                    receivers_idx = receivers_idx[~all_nan_dist_mask]
                    dist_subset = (dist_chunk[dist_idx_map[receivers_idx] -
                                              start][:, potential_donors_idx])

                n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
                value = self._calc_impute(
                    dist_subset, n_neighbors, self._fit_X[potential_donors_idx,
                                                          col],
                    mask_fit_X[potential_donors_idx, col], col)
                X[receivers_idx, col] = value

        if self.ncat is not None:
            # process in fixed-memory chunks
            gen = pairwise_distances_chunked(
                X[row_missing_idx, :],
                self._fit_X,
                metric=self.metric,
                ncat=self.ncat,
                missing_values=self.missing_values,
                force_all_finite=force_all_finite,
                reduce_func=process_chunk)
        else:
            gen = pairwise_distances_chunked(
                X[row_missing_idx, :],
                self._fit_X,
                metric=self.metric,
                missing_values=self.missing_values,
                force_all_finite=force_all_finite,
                reduce_func=process_chunk)
        for chunk in gen:
            # process_chunk modifies X in place. No return value.
            pass

        return super()._concatenate_indicator(X[:, valid_mask], X_indicator)
Code example #15
def test_is_scalar_nan(value, result):
    assert is_scalar_nan(value) is result
    # make sure that we are returning a Python bool
    assert isinstance(is_scalar_nan(value), bool)
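The (value, result) pairs come from a pytest parametrization that is not shown; a sketch of typical cases, based on is_scalar_nan's documented behavior:

import numpy as np
import pytest
from sklearn.utils import is_scalar_nan

@pytest.mark.parametrize("value, result", [
    (float('nan'), True),
    (np.nan, True),
    (0, False),
    (None, False),       # None is missing, but not a NaN scalar
    ('', False),         # strings never count as NaN
    ([np.nan], False),   # non-scalars are rejected
])
def test_is_scalar_nan(value, result):
    assert is_scalar_nan(value) is result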
Code example #16
File: test_utils.py  Project: allefpablo/scikit-learn
def test_is_scalar_nan(value, result):
    assert is_scalar_nan(value) is result
Code example #17
File: utils.py  Project: lorentzenchr/sk_encoder_cv
 def __missing__(self, key):
     if hasattr(self, 'nan_count') and is_scalar_nan(key):
         return self.nan_count
     raise KeyError(key)
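This __missing__ hook is the read side of the NaN counter built by _generate_items in example #6; a combined sketch with an illustrative class name:

import numpy as np
from sklearn.utils import is_scalar_nan

class NaNCounter(dict):
    # hypothetical subclass tying the two fragments together
    def __missing__(self, key):
        if hasattr(self, 'nan_count') and is_scalar_nan(key):
            return self.nan_count
        raise KeyError(key)

counts = NaNCounter(a=2)
counts.nan_count = 3            # e.g. set while consuming _generate_items
print(counts['a'], counts[np.nan])  # 2 3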
Code example #18
 def to_nonmissing_set(X):
     from sklearn.utils import is_scalar_nan
     return set(filter(lambda x: not is_scalar_nan(x), X))
Code example #19
File: utils.py  Project: lorentzenchr/sk_encoder_cv
 def is_valid(value):
     return (value in uniques_set
             or missing_in_uniques.none and value is None
             or missing_in_uniques.nan and is_scalar_nan(value))
Code example #20
File: utils.py  Project: lorentzenchr/sk_encoder_cv
 def __init__(self, mapping):
     super().__init__(mapping)
     for key, value in mapping.items():
         if is_scalar_nan(key):
             self.nan_value = value
             break