def transform(self, X):
        """
        Parameters
        ----------
        X : {array-like, sparse matrix}
            The data used to scale along the specified axis.
        """
        check_is_fitted(self, ['iqr_', 'max_'])
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        # TODO sparse data
        train_upper_scale = (self.max_ - self.q_upper_) / self.iqr_
        train_lower_scale = (self.q_lower_ - self.min_) / self.iqr_

        test_quantiles = np.nanpercentile(X, (self.q_lower, self.q_upper))
        test_iqr = _handle_zeros_in_scale(
            test_quantiles[1] - test_quantiles[0], copy=False)

        test_upper_bound = test_quantiles[1] + train_upper_scale * test_iqr
        test_lower_bound = test_quantiles[0] - train_lower_scale * test_iqr

        test_min = np.nanmin(X)
        if test_lower_bound < test_min:
            test_lower_bound = test_min

        X[X > test_upper_bound] = test_upper_bound
        X[X < test_lower_bound] = test_lower_bound

        X = (X - test_lower_bound) / (test_upper_bound - test_lower_bound)\
            * (self.max_ - self.min_) + self.min_

        return X
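
Every snippet on this page relies on `_handle_zeros_in_scale` to make division by a scale safe. As a reference, here is a minimal standalone sketch of its documented contract (zero scales are replaced by 1 so constant features are left untouched); this is an illustration, not the scikit-learn source:

import numpy as np

def handle_zeros_in_scale_sketch(scale):
    """Replace zero scale values with 1 so that dividing by them is a no-op."""
    if np.isscalar(scale):
        return 1.0 if scale == 0.0 else scale
    scale = np.asarray(scale, dtype=float).copy()
    scale[scale == 0.0] = 1.0
    return scale

print(handle_zeros_in_scale_sketch(np.array([2.0, 0.0, 5.0])))  # [2. 1. 5.]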
Example #2
    def fit(self, X, y=None):
        """
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
        """
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        if not 0 <= self.q_lower <= self.q_upper <= 100:
            raise ValueError("Invalid quantile parameter values: "
                             "q_lower %s, q_upper: %s"
                             % (str(self.q_lower), str(self.q_upper)))

        # TODO sparse data
        quantiles = np.nanpercentile(X, (self.q_lower, self.q_upper))
        iqr = quantiles[1] - quantiles[0]

        self.q_lower_ = quantiles[0]
        self.q_upper_ = quantiles[1]
        self.iqr_ = _handle_zeros_in_scale(iqr, copy=False)

        self.max_ = np.nanmax(X)
        self.min_ = np.nanmin(X)

        return self
Example #3
    def fit(self, X, y=None):
        """
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
        """
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        if not 0 <= self.q_lower <= self.q_upper <= 100:
            raise ValueError("Invalid quantile parameter values: "
                             "q_lower %s, q_upper: %s"
                             % (str(self.q_lower), str(self.q_upper)))

        # TODO sparse data
        quantiles = np.nanpercentile(X, (self.q_lower, self.q_upper))
        iqr = quantiles[1] - quantiles[0]

        self.q_lower_ = quantiles[0]
        self.q_upper_ = quantiles[1]
        self.iqr_ = _handle_zeros_in_scale(iqr, copy=False)

        self.max_ = np.nanmax(X)
        self.min_ = np.nanmin(X)

        return self
Example #4
    def transform(self, X):
        """
        Parameters
        ----------
        X : {array-like, sparse matrix}
            The data used to scale along the specified axis.
        """
        check_is_fitted(self, ['iqr_', 'max_'])
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        # TODO sparse data
        train_upper_scale = (self.max_ - self.q_upper_) / self.iqr_
        train_lower_scale = (self.q_lower_ - self.min_) / self.iqr_

        test_quantiles = np.nanpercentile(X, (self.q_lower, self.q_upper))
        test_iqr = _handle_zeros_in_scale(
            test_quantiles[1] - test_quantiles[0], copy=False)

        test_upper_bound = test_quantiles[1] + train_upper_scale * test_iqr
        test_lower_bound = test_quantiles[0] - train_lower_scale * test_iqr

        test_min = np.nanmin(X)
        if test_lower_bound < test_min:
            test_lower_bound = test_min

        X[X > test_upper_bound] = test_upper_bound
        X[X < test_lower_bound] = test_lower_bound

        X = (X - test_lower_bound) / (test_upper_bound - test_lower_bound)\
            * (self.max_ - self.min_) + self.min_

        return X
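
Examples #1-#4 together implement a quantile-capped min-max rescaling: fit records the training quantiles, IQR, min and max; transform re-estimates the quantiles on the new data, derives clipping bounds from the training ratios, clips, and maps the result back into [min_, max_]. A numpy-only walkthrough of that logic on toy data (variable names are illustrative, not from the original class):

import numpy as np

q_lower, q_upper = 25, 75
train = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
test = np.array([0., 2., 4., 6., 8., 50.])  # contains an outlier

# "fit": training statistics
tq_lo, tq_hi = np.nanpercentile(train, (q_lower, q_upper))
iqr = tq_hi - tq_lo
t_min, t_max = np.nanmin(train), np.nanmax(train)

# "transform": bounds derived from the test quantiles and the training ratios
sq_lo, sq_hi = np.nanpercentile(test, (q_lower, q_upper))
s_iqr = sq_hi - sq_lo
upper = sq_hi + (t_max - tq_hi) / iqr * s_iqr
lower = max(sq_lo - (tq_lo - t_min) / iqr * s_iqr, np.nanmin(test))

clipped = np.clip(test, lower, upper)
rescaled = (clipped - lower) / (upper - lower) * (t_max - t_min) + t_min
print(rescaled)  # the outlier 50. is capped at the derived upper bound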
Example #5
    def fit(self, X, y=None):
        """Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.

        y : Passthrough for ``Pipeline`` compatibility.
        """

        # Reset internal state before fitting
        self._reset()

        X, w = weighted_data(X)

        weighted_stats = DescrStatsW(X, weights=w, ddof=0)

        self.mean_ = weighted_stats.mean  # weighted mean of data (equivalent to np.average(array, weights=weights))
        self.var_ = weighted_stats.var  # variance with default degrees of freedom correction
        self.n_samples_seen_ = sum(w)

        if self.with_std:
            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
        else:
            self.scale_ = None

        return self
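
The weighted statistics above come from statsmodels' DescrStatsW; a quick check (assuming statsmodels is installed) that its weighted mean matches np.average with the same weights:

import numpy as np
from statsmodels.stats.weightstats import DescrStatsW

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 60.0]])
w = np.array([1.0, 1.0, 2.0])

stats = DescrStatsW(X, weights=w, ddof=0)
print(stats.mean)                        # weighted column means
print(np.average(X, axis=0, weights=w))  # identical result
print(stats.var)                         # weighted variance with ddof=0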
Example #6
def scale(x, data_mean, data_std):
    """Mean/variance scaling.

    Given mean and variances, apply mean-variance normalization to data.

    Args:
        x (array): Input data
        data_mean (array): Means for each feature dimension.
        data_std (array): Standard deviation for each feature dimension.

    Returns:
        array: Scaled data.

    Examples:
        >>> from nnmnkwii.preprocessing import meanstd, scale
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> lengths = [len(y) for y in Y]
        >>> data_mean, data_std = meanstd(Y, lengths)
        >>> scaled_y = scale(Y[0], data_mean, data_std)

    See also:
        :func:`nnmnkwii.preprocessing.inv_scale`
    """
    return (x - data_mean) / _handle_zeros_in_scale(data_std, copy=False)
Example #7
    def fit(self, X, y=None):
        q_min, q_max = self.quantile_range
        if not 0 <= q_min <= q_max <= 100:
            raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))

        if isinstance(X, dd.DataFrame):
            n_columns = len(X.columns)
            partition_lengths = X.map_partitions(len).compute()
            dtype = np.find_common_type(X.dtypes, [])
            blocks = X.to_delayed()
            X = da.vstack(
                [
                    da.from_delayed(
                        block.values, shape=(length, n_columns), dtype=dtype
                    )
                    for block, length in zip(blocks, partition_lengths)
                ]
            )

        quantiles = [da.percentile(col, [q_min, 50., q_max]) for col in X.T]
        quantiles = da.vstack(quantiles).compute()
        self.center_ = quantiles[:, 1]
        self.scale_ = quantiles[:, 2] - quantiles[:, 0]
        self.scale_ = skdata._handle_zeros_in_scale(self.scale_, copy=False)
        return self
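
The core of the dask branch is computing per-column percentiles lazily and materialising them once; a minimal sketch of that step on a toy dask array (assuming dask is installed):

import numpy as np
import dask.array as da

X = da.from_array(np.arange(20.0).reshape(10, 2), chunks=(5, 2))
q_min, q_max = 25.0, 75.0

# da.percentile expects 1-d arrays, hence the per-column loop over X.T
quantiles = da.vstack(
    [da.percentile(col, [q_min, 50.0, q_max]) for col in X.T]).compute()

center = quantiles[:, 1]                   # per-column median
scale = quantiles[:, 2] - quantiles[:, 0]  # per-column IQR
print(center, scale)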
Example #8
    def fit(self, X, y, sample_weight=None):

        if sample_weight is None:
            return super(StandardScalerW, self).fit(X, y)

        if sparse.issparse(X):
            raise ValueError("Sparse matrix not supported")

        self._reset()

        print(sample_weight)
        average = np.average(X, axis=0, weights=sample_weight)

        if self.with_mean:
            self.mean_ = average
        if self.with_std:
            from sklearn.preprocessing.data import _handle_zeros_in_scale

            self.var_ = [np.cov(row, aweights=np.abs(sample_weight))
                         for row in X.T]
            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
        else:
            self.scale_ = None

        return self
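
In the weighted branch, the mean is simply np.average with the sample weights, for instance:

import numpy as np

X = np.array([[1.0, 4.0], [3.0, 8.0]])
sample_weight = np.array([3.0, 1.0])
print(np.average(X, axis=0, weights=sample_weight))  # [1.5 5. ]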
Example #9
    def partial_fit(self, X, y=None):
        """Online computation of mean and std on X for later scaling.
        All of X is processed as a single batch. This is intended for cases
        when `fit` is not feasible due to a very large number of `n_samples`
        or because X is read from a continuous stream.

        The algorithm for incremental mean and std is given in Equation 1.5a,b
        in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
        for computing the sample variance: Analysis and recommendations."
        The American Statistician 37.3 (1983): 242-247:

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.

        y : Passthrough for ``Pipeline`` compatibility.
        """
        X = check_array(X,
                        accept_sparse=('csr', 'csc'),
                        copy=self.copy,
                        ensure_2d=False,
                        warn_on_dtype=True,
                        estimator=self,
                        dtype=FLOAT_DTYPES)

        # Even in the case of `with_mean=False`, we update the mean anyway
        # This is needed for the incremental computation of the var
        # See incr_mean_variance_axis and _incremental_mean_variance_axis

        if not sparse.issparse(X):
            return super(SparseScaler, self).partial_fit(X)

        if self.with_std:
            # First pass
            if not hasattr(self, 'n_samples_seen_'):
                self.mean_, self.var_ = mean_variance_axis(X, axis=0)
                n = X.shape[0]
                self.n_samples_seen_ = n
            # Next passes
            else:
                self.mean_, self.var_, self.n_samples_seen_ = \
                    incr_mean_variance_axis(X, axis=0,
                                            last_mean=self.mean_,
                                            last_var=self.var_,
                                            last_n=self.n_samples_seen_)

        if self.with_std:
            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
        else:
            self.scale_ = None

        return self
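
The incremental update referenced above (Chan, Golub & LeVeque, Eq. 1.5a,b) merges per-batch counts, means and M2 (sums of squared deviations). A small numpy sketch verifying the merge against a single-pass np.var with ddof=0:

import numpy as np

def combine(n_a, mean_a, m2_a, n_b, mean_b, m2_b):
    """Pairwise update of count, mean and M2 for two batches."""
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    m2 = m2_a + m2_b + delta ** 2 * n_a * n_b / n
    return n, mean, m2

rng = np.random.default_rng(0)
a, b = rng.normal(size=(100, 3)), rng.normal(size=(50, 3))
n, mean, m2 = combine(len(a), a.mean(0), a.var(0) * len(a),
                      len(b), b.mean(0), b.var(0) * len(b))
full = np.vstack([a, b])
assert np.allclose(mean, full.mean(0))
assert np.allclose(m2 / n, full.var(0))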
Example #10
    def fit(self, Z):
        """Compute the mean and std to be used for later scaling.
        Parameters
        ----------
        Z : DictRDD containing (X, y) pairs
            X - Training vector.
                {array-like, sparse matrix}, shape [n_samples, n_features]
                The data used to compute the mean and standard deviation
                used for later scaling along the features axis.
            y - Target labels
                Passthrough for ``Pipeline`` compatibility.
        """

        # Reset internal state before fitting
        self._reset()
        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
        check_rdd(X, (np.ndarray, sp.spmatrix))

        def mapper(X):
            """Calculate statistics for every numpy or scipy blocks."""
            X = check_array(X, ('csr', 'csc'), dtype=np.float64)
            if hasattr(X, "toarray"):  # sparse matrix
                mean, var = mean_variance_axis(X, axis=0)
            else:
                mean, var = np.mean(X, axis=0), np.var(X, axis=0)
            return X.shape[0], mean, var

        def reducer(a, b):
            """Calculate the combined statistics."""
            n_a, mean_a, var_a = a
            n_b, mean_b, var_b = b
            n_ab = n_a + n_b
            mean_ab = ((mean_a * n_a) + (mean_b * n_b)) / n_ab
            var_ab = (((n_a * var_a) + (n_b * var_b)) / n_ab) + \
                     ((n_a * n_b) * ((mean_b - mean_a) / n_ab) ** 2)
            return (n_ab, mean_ab, var_ab)

        if check_rdd_dtype(X, (sp.spmatrix)):
            if self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
        self.n_samples_seen_, self.mean_, self.var_ = X.map(mapper).treeReduce(
            reducer)

        if self.with_std:
            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
        else:
            self.scale_ = None

        return self
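
The reducer above merges (count, mean, population variance) triples from two blocks; the same algebra can be checked outside Spark with plain numpy:

import numpy as np

def reduce_stats(a, b):
    n_a, mean_a, var_a = a
    n_b, mean_b, var_b = b
    n_ab = n_a + n_b
    mean_ab = (mean_a * n_a + mean_b * n_b) / n_ab
    var_ab = (n_a * var_a + n_b * var_b) / n_ab \
        + n_a * n_b * ((mean_b - mean_a) / n_ab) ** 2
    return n_ab, mean_ab, var_ab

x = np.random.default_rng(1).normal(size=(200, 4))
stats = [(len(blk), blk.mean(0), blk.var(0)) for blk in np.array_split(x, 4)]
acc = stats[0]
for s in stats[1:]:
    acc = reduce_stats(acc, s)
assert np.allclose(acc[1], x.mean(0)) and np.allclose(acc[2], x.var(0))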
Example #11
    def fit(self, Z):
        """Compute the mean and std to be used for later scaling.
        Parameters
        ----------
        Z : DictRDD containing (X, y) pairs
            X - Training vector.
                {array-like, sparse matrix}, shape [n_samples, n_features]
                The data used to compute the mean and standard deviation
                used for later scaling along the features axis.
            y - Target labels
                Passthrough for ``Pipeline`` compatibility.
        """

        # Reset internal state before fitting
        self._reset()
        X = Z[:, 'X'] if isinstance(Z, DictRDD) else Z
        check_rdd(X, (np.ndarray, sp.spmatrix))

        def mapper(X):
            """Calculate statistics for every numpy or scipy blocks."""
            X = check_array(X, ('csr', 'csc'), dtype=np.float64)
            if hasattr(X, "toarray"):   # sparse matrix
                mean, var = mean_variance_axis(X, axis=0)
            else:
                mean, var = np.mean(X, axis=0), np.var(X, axis=0)
            return X.shape[0], mean, var

        def reducer(a, b):
            """Calculate the combined statistics."""
            n_a, mean_a, var_a = a
            n_b, mean_b, var_b = b
            n_ab = n_a + n_b
            mean_ab = ((mean_a * n_a) + (mean_b * n_b)) / n_ab
            var_ab = (((n_a * var_a) + (n_b * var_b)) / n_ab) + \
                     ((n_a * n_b) * ((mean_b - mean_a) / n_ab) ** 2)
            return (n_ab, mean_ab, var_ab)

        if check_rdd_dtype(X, (sp.spmatrix)):
            if self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives.")
        self.n_samples_seen_, self.mean_, self.var_ = X.map(mapper).treeReduce(reducer)

        if self.with_std:
            self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
        else:
            self.scale_ = None

        return self
Example #12
def meanstd(dataset,
            lengths=None,
            mean_=0.,
            var_=0.,
            last_sample_count=0,
            return_last_sample_count=False):
    """Mean/std-deviation computation given a iterable dataset

    Dataset can have variable-length samples. In that case, you need to
    explicitly specify lengths for all the samples.

    Args:
        dataset (nnmnkwii.datasets.Dataset): Dataset
        lengths (list): Frame lengths for each dataset sample.
        mean\_ (array or scalar): Initial value for mean vector.
        var\_ (array or scalar): Initial value for variance vector.
        last_sample_count (int): Last sample count. Default is 0. If you set
          non-default ``mean_`` and ``var_``, you need to set
          ``last_sample_count`` property. Typically this will be the number of
          time frames ever seen.
        return_last_sample_count (bool): Return ``last_sample_count`` if True.

    Returns:
        tuple: Mean and standard deviation for each dimension. If
          ``return_last_sample_count`` is True, returns ``last_sample_count``
          as well.

    See also:
        :func:`nnmnkwii.preprocessing.meanvar`, :func:`nnmnkwii.preprocessing.scale`

    Examples:
        >>> from nnmnkwii.preprocessing import meanstd
        >>> from nnmnkwii.util import example_file_data_sources_for_acoustic_model
        >>> from nnmnkwii.datasets import FileSourceDataset
        >>> X, Y = example_file_data_sources_for_acoustic_model()
        >>> X, Y = FileSourceDataset(X), FileSourceDataset(Y)
        >>> lengths = [len(y) for y in Y]
        >>> data_mean, data_std = meanstd(Y, lengths)
    """
    ret = meanvar(dataset, lengths, mean_, var_, last_sample_count,
                  return_last_sample_count)
    m, v = ret[0], ret[1]
    v = _handle_zeros_in_scale(np.sqrt(v))
    if return_last_sample_count:
        assert len(ret) == 3
        return m, v, ret[2]
    else:
        return m, v
Example #13
    def partial_fit(self, X, y=None):
        """Online computation of min and max on X for later scaling.
        All of X is processed as a single batch. This is intended for cases
        when `fit` is not feasible due to a very large number of `n_samples`
        or because X is read from a continuous stream.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data used to compute the min and max
            used for later scaling along the features axis.

        y : Passthrough for ``Pipeline`` compatibility.
        """
        feature_range = self.feature_range
        if feature_range[0] >= feature_range[1]:
            raise ValueError("Minimum of desired feature range must be smaller"
                             " than maximum. Got %s." % str(feature_range))

        X = check_array(
            X,
            copy=self.copy,  # warn_on_dtype=True,
            estimator=self,
            dtype=FLOAT_DTYPES,
            ensure_2d=False,
            allow_nd=True)

        data_min = np.min(X)
        data_max = np.max(X)

        # First pass
        if not hasattr(self, 'n_samples_seen_'):
            self.n_samples_seen_ = X.shape[0]
        # Next steps
        else:
            data_min = np.minimum(self.data_min_, data_min)
            data_max = np.maximum(self.data_max_, data_max)
            self.n_samples_seen_ += X.shape[0]

        data_range = data_max - data_min
        self.scale_ = ((feature_range[1] - feature_range[0]) /
                       _handle_zeros_in_scale(data_range))
        self.min_ = feature_range[0] - data_min * self.scale_
        self.data_min_ = data_min
        self.data_max_ = data_max
        self.data_range_ = data_range
        return self
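
The bookkeeping above is a running global min/max folded with np.minimum/np.maximum across batches; a toy two-batch sketch of the same idea (scalar min/max, as in the code):

import numpy as np

feature_range = (0.0, 1.0)
batches = [np.array([2.0, 5.0, 3.0]), np.array([1.0, 9.0])]

data_min, data_max, n_seen = np.inf, -np.inf, 0
for X in batches:
    data_min = np.minimum(data_min, np.min(X))
    data_max = np.maximum(data_max, np.max(X))
    n_seen += X.shape[0]

data_range = data_max - data_min if data_max > data_min else 1.0
scale_ = (feature_range[1] - feature_range[0]) / data_range
min_ = feature_range[0] - data_min * scale_
print(data_min, data_max, scale_, min_)  # 1.0 9.0 0.125 -0.125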
Example #14
    def fit(self, X, y=None):
        if sparse.issparse(X):
            raise TypeError("RobustScaler cannot be fitted on sparse inputs")
        X = self._check_array(X, self.copy)
        if self.with_centering:
            self.center_ = np.nanmedian(X, axis=0)

        if self.with_scaling:
            q_min, q_max = self.quantile_range
            if not 0 <= q_min <= q_max <= 100:
                raise ValueError("Invalid quantile range: %s" %
                                 str(self.quantile_range))

            q = np.nanpercentile(X, self.quantile_range, axis=0)
            self.scale_ = (q[1] - q[0])
            self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
        return self
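
The NaN-aware variant only needs np.nanmedian and np.nanpercentile; a minimal sketch of the fitted statistics on data containing missing values:

import numpy as np

X = np.array([[1.0, 10.0],
              [2.0, np.nan],
              [3.0, 30.0],
              [np.nan, 50.0]])

center = np.nanmedian(X, axis=0)
q = np.nanpercentile(X, (25.0, 75.0), axis=0)
scale = q[1] - q[0]
scale[scale == 0.0] = 1.0  # same guard as _handle_zeros_in_scale
print(center, scale)       # [2. 30.] [1. 20.]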
Example #15
    def fit(self, X):
        """
        Used to fit the Normalizer with data
        :param X: list
        :return: nothing
        """
        if self.norm not in ('l1', 'l2', 'max'):
            raise ValueError("'%s' is not a supported norm" % self.norm)

        if self.axis == 0:
            self.sparse_format = 'csc'
        elif self.axis == 1:
            self.sparse_format = 'csr'
        else:
            raise ValueError("'%d' is not a supported axis" % self.axis)

        X = check_array(X,
                        self.sparse_format,
                        copy=self.copy,
                        estimator='the normalize function',
                        dtype=FLOAT_DTYPES)
        if self.axis == 0:
            X = X.T

        if sparse.issparse(X):
            if self.norm == 'l1':
                inplace_csr_row_normalize_l1(X)
            elif self.norm == 'l2':
                inplace_csr_row_normalize_l2(X)
            elif self.norm == 'max':
                _, self.norms = min_max_axis(X, 1)
        else:
            if self.norm == 'l1':
                self.norms = np.abs(X).sum(axis=1)
            elif self.norm == 'l2':
                self.norms = row_norms(X)
            elif self.norm == 'max':
                self.norms = np.max(X, axis=1)
            self.norms = _handle_zeros_in_scale(self.norms, copy=False)
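
For dense input the fit above just records a per-row norm; the three options reduce to the following numpy one-liners (toy data, mirroring the dense branch):

import numpy as np

X = np.array([[3.0, 4.0],
              [0.0, 0.0],
              [1.0, 2.0]])

l1 = np.abs(X).sum(axis=1)          # [7. 0. 3.]
l2 = np.sqrt((X ** 2).sum(axis=1))  # [5. 0. 2.236...]
mx = np.max(X, axis=1)              # [4. 0. 2.]

l2[l2 == 0.0] = 1.0  # zero norms are replaced by 1 before dividing
print(X / l2[:, None])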
Example #16
    def fit(self, X, y=None):
        """Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape [n_samples, n_features]
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.

        y : Passthrough for ``Pipeline`` compatibility.
        """

        # Reset internal state before fitting
        self._reset()

        X = check_array(X,
                        accept_sparse=('csr', 'csc'),
                        copy=self.copy,
                        force_all_finite=False,
                        warn_on_dtype=True,
                        estimator=self,
                        dtype=FLOAT_DTYPES)

        self.mean_ = .0
        if self.with_std:
            self.var_ = .0
        else:
            self.var_ = None

        if sparse.issparse(X):
            raise NotImplementedError

        if self.with_mean:
            self.mean_ = np.nanmean(X, 0)
        if self.with_std:
            scale_ = np.nanstd(X, 0)
            self.scale_ = _handle_zeros_in_scale(scale_, copy=False)

        return self
Example #17
    def fit(self, X, y=None):
        """Fit the scaling factor for each residue type.

        Parameters
        ----------
        X : np.ndarray, shape=(n_observations, n_residues)
            Array of values to fit scaling upon.
        y : Passthrough for Pipeline compatibility.
        """

        if X.shape[1] != self.top.n_residues:
            raise exception.InvalidData("Given data had shape {s} and top had n_residues {n}".format(s=X.shape, n=self.top.n_residues))

        self.scale_factors_ = {}
        for code, residues in self.code2rindex.items():
            if code is None:
                warnings.warn(exception.SuspiciousDataWarning("ResidueTypeScaler Topology had 'None' values as residue codes. These will be scaled as though they are the same residue type."))

            target_data = X[:, residues]
            scale_factor = _handle_zeros_in_scale(self.scale_func(target_data), copy=False)

            self.scale_factors_[code] = scale_factor

        return self
Example #18
def scale(x, data_mean, data_std):
    return (x - data_mean) / _handle_zeros_in_scale(data_std, copy=False)
Example #19
def __minmax_scale_factor(data_min, data_max, feature_range):
    data_range = data_max - data_min
    scale = (feature_range[1] - feature_range[0]) / \
        _handle_zeros_in_scale(data_range, copy=False)
    return scale
Example #20
def zero_one_scale(serie):
    data_range = serie.max()
    scale = 1 / _handle_zeros_in_scale(data_range)
    serie *= scale
Example #21
def nanscale(X, axis=0, with_mean=True, with_std=True, copy=True):
    """Standardize a dataset along any axis
    Center to the mean and component wise scale to unit variance.
    Read more in the :ref:`User Guide <preprocessing_scaler>`.
    Parameters
    ----------
    X : {array-like, sparse matrix}
        The data to center and scale.
    axis : int (0 by default)
        axis used to compute the means and standard deviations along. If 0,
        independently standardize each feature, otherwise (if 1) standardize
        each sample.
    with_mean : boolean, True by default
        If True, center the data before scaling.
    with_std : boolean, True by default
        If True, scale the data to unit variance (or equivalently,
        unit standard deviation).
    copy : boolean, optional, default True
        set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSC matrix and if axis is 1).
    Notes
    -----
    This implementation will refuse to center scipy.sparse matrices
    since it would make them non-sparse and would potentially crash the
    program with memory exhaustion problems.
    Instead the caller is expected to either set explicitly
    `with_mean=False` (in that case, only variance scaling will be
    performed on the features of the CSC matrix) or to call `X.toarray()`
    if he/she expects the materialized dense array to fit in memory.
    To avoid memory copy the caller should pass a CSC matrix.
    For a comparison of the different scalers, transformers, and normalizers,
    see :ref:`examples/preprocessing/plot_all_scaling.py
    <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
    See also
    --------
    StandardScaler: Performs scaling to unit variance using the ``Transformer`` API
        (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`).
    """  # noqa
    X = check_array(X,
                    accept_sparse='csc',
                    copy=copy,
                    ensure_2d=False,
                    warn_on_dtype=True,
                    estimator='the scale function',
                    force_all_finite=False,
                    dtype=FLOAT_DTYPES)
    if sparse.issparse(X):
        if with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` instead."
                " See docstring for motivation and alternatives.")
        if axis != 0:
            raise ValueError("Can only scale sparse matrix on axis=0, "
                             "got axis=%d" % axis)
        if with_std:
            _, var = mean_variance_axis(X, axis=0)
            var = _handle_zeros_in_scale(var, copy=False)
            inplace_column_scale(X, 1 / np.sqrt(var))
    else:
        X = np.asarray(X)
        if with_mean:
            mean_ = np.nanmean(X, axis)
        if with_std:
            scale_ = np.nanstd(X, axis)
        # Xr is a view on the original array that enables easy use of
        # broadcasting on the axis in which we are interested in
        Xr = np.rollaxis(X, axis)
        if with_mean:
            Xr -= mean_
            mean_1 = Xr.mean(axis=0)
            # Verify that mean_1 is 'close to zero'. If X contains very
            # large values, mean_1 can also be very large, due to a lack of
            # precision of mean_. In this case, a pre-scaling of the
            # concerned feature is efficient, for instance by its mean or
            # maximum.
            if not np.allclose(mean_1[np.isfinite(mean_1)], 0):
                warnings.warn("Numerical issues were encountered "
                              "when centering the data "
                              "and might not be solved. Dataset may "
                              "contain too large values. You may need "
                              "to prescale your features.")
                Xr -= mean_1
        if with_std:
            scale_ = _handle_zeros_in_scale(scale_, copy=False)
            Xr /= scale_
            if with_mean:
                mean_2 = Xr.mean(axis=0)
                # If mean_2 is not 'close to zero', it comes from the fact that
                # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
                # if mean_1 was close to zero. The problem is thus essentially
                # due to the lack of precision of mean_. A solution is then to
                # subtract the mean again:
                if not np.allclose(mean_2[np.isfinite(mean_1)], 0):
                    warnings.warn("Numerical issues were encountered "
                                  "when scaling the data "
                                  "and might not be solved. The standard "
                                  "deviation of the data is probably "
                                  "very close to 0. ")
                    Xr -= mean_2
    return X
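
A quick usage check of nanscale as defined above (it assumes the module-level imports used by the snippet, numpy, scipy.sparse and sklearn's check_array/FLOAT_DTYPES, are in scope): NaNs are ignored when estimating the statistics, and each column of the output has nan-mean ~0 and nan-std ~1.

import numpy as np

X = np.array([[1.0, 2.0],
              [3.0, np.nan],
              [5.0, 6.0]])
Xs = nanscale(X)
print(np.nanmean(Xs, axis=0))  # ~[0. 0.]
print(np.nanstd(Xs, axis=0))   # ~[1. 1.]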
Example #22
def minmax_scale_params(data_min, data_max, feature_range=(0, 1)):
    data_range = data_max - data_min
    scale_ = (feature_range[1] - feature_range[0]) / \
        _handle_zeros_in_scale(data_range, copy=False)
    min_ = feature_range[0] - data_min * scale_
    return min_, scale_
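
Applying the returned parameters is the usual min-max transform, X * scale_ + min_ (this usage sketch assumes minmax_scale_params and _handle_zeros_in_scale are importable from the module above):

import numpy as np

data = np.array([2.0, 4.0, 6.0, 10.0])
min_, scale_ = minmax_scale_params(data.min(), data.max(), feature_range=(0, 1))
print(data * scale_ + min_)  # [0.   0.25 0.5  1.  ]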