def test_check_sample_weight():
    # check array order
    sample_weight = np.ones(10)[::2]
    assert not sample_weight.flags["C_CONTIGUOUS"]
    sample_weight = _check_sample_weight(sample_weight, X=np.ones((5, 1)))
    assert sample_weight.flags["C_CONTIGUOUS"]

    # check None input
    sample_weight = _check_sample_weight(None, X=np.ones((5, 2)))
    assert_allclose(sample_weight, np.ones(5))

    # check numbers input
    sample_weight = _check_sample_weight(2.0, X=np.ones((5, 2)))
    assert_allclose(sample_weight, 2 * np.ones(5))

    # check wrong number of dimensions
    with pytest.raises(ValueError,
                       match="Sample weights must be 1D array or scalar"):
        _check_sample_weight(np.ones((2, 4)), X=np.ones((2, 2)))

    # check incorrect n_samples
    msg = r"sample_weight.shape == \(4,\), expected \(2,\)!"
    with pytest.raises(ValueError, match=msg):
        _check_sample_weight(np.ones(4), X=np.ones((2, 2)))

    # float32 dtype is preserved
    X = np.ones((5, 2))
    sample_weight = np.ones(5, dtype=np.float32)
    sample_weight = _check_sample_weight(sample_weight, X)
    assert sample_weight.dtype == np.float32

    # int dtype will be converted to float64 instead
    X = np.ones((5, 2), dtype=int)
    sample_weight = _check_sample_weight(None, X, dtype=X.dtype)
    assert sample_weight.dtype == np.float64
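Note: for context, a minimal usage sketch of the helper exercised by the test above. `_check_sample_weight` is a private helper in `sklearn.utils.validation`, so its exact behavior can vary across scikit-learn releases; this shows the three input forms the test covers.

import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.zeros((4, 2))
# None expands to uniform weights of length n_samples.
print(_check_sample_weight(None, X))    # [1. 1. 1. 1.]
# A scalar is broadcast to every sample.
print(_check_sample_weight(0.5, X))     # [0.5 0.5 0.5 0.5]
# A 1D sequence is validated against n_samples and converted to float64.
print(_check_sample_weight([1, 2, 3, 4], X).dtype)  # float64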
Example 2
 def fit(self,
         X: np.ndarray,
         y: np.ndarray,
         sample_weight: Optional[np.ndarray] = None) -> Odte:
     # Check parameters are Ok.
     if self.n_estimators < 3:
         raise ValueError(
             f"n_estimators must be greater than 2 but got "
             f"(n_estimators={self.n_estimators})")
     check_classification_targets(y)
     X, y = self._validate_data(X, y)
     # If sample_weight is None, _check_sample_weight returns uniform ones
     sample_weight = _check_sample_weight(sample_weight,
                                          X,
                                          dtype=np.float64)
     # Initialize computed parameters
     #  Build the estimator
     self.max_features_ = self._initialize_max_features()
     # build base_estimator_
     self._validate_estimator()
     self.classes_, y = np.unique(y, return_inverse=True)
     self.n_classes_: int = self.classes_.shape[0]
     self.estimators_: List[BaseEstimator] = []
     self.subspaces_: List[Tuple[int, ...]] = []
     result = self._train(X, y, sample_weight)
     self.estimators_, self.subspaces_ = tuple(zip(*result))  # type: ignore
     return self
Example 3
def plsa_topics(X, k, **kwargs):
    """Perform a boostrap sample from a corpus of documents and fit the sample using
    pLSA to give a set of topic vectors such that the (z,w) entry of the returned
    array is the probability P(w|z) of word w occuring given the zth topic.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The bag of words representation of the corpus of documents.

    k: int
        The number of topics to generate.

    kwargs:
        Further keyword arguments that can be passed on to the ``plsa_fit`` function.
        Possibilities include:
            * ``bootstrap``
            * ``init``
            * ``n_iter``
            * ``n_iter_per_test``
            * ``tolerance``
            * ``e_step_thresh``
            * ``random_state``

    Returns
    -------
    topics: array of shape (k, n_words)
        The topics generated from the bootstrap sample.
    """
    A = X.tocsr()
    if kwargs.get("bootstrap", True):
        rng = check_random_state(kwargs.get("random_state", None))
        bootstrap_sample_indices = rng.randint(0, A.shape[0], size=A.shape[0])
        B = A[bootstrap_sample_indices]
    else:
        B = A
    sample_weight = _check_sample_weight(None, B, dtype=np.float32)
    if numba.cuda.is_available():
        doc_topic, topic_vocab = gpu_plsa_fit(
            B,
            k,
            init=kwargs.get("init", "random"),
            n_iter=kwargs.get("n_iter", 100),
            n_iter_per_test=kwargs.get("n_iter_per_test", 10),
            tolerance=kwargs.get("tolerance", 0.001),
            e_step_thresh=kwargs.get("e_step_thresh", 1e-16),
            random_state=kwargs.get("random_state", None),
        )
    else:
        doc_topic, topic_vocab = plsa_fit(
            B,
            k,
            sample_weight,
            init=kwargs.get("init", "random"),
            n_iter=kwargs.get("n_iter", 100),
            n_iter_per_test=kwargs.get("n_iter_per_test", 10),
            tolerance=kwargs.get("tolerance", 0.001),
            e_step_thresh=kwargs.get("e_step_thresh", 1e-16),
            random_state=kwargs.get("random_state", None),
        )
    return topic_vocab
Example 4
 def _deviance_dispersion_update(self, X, y, sample_weight=None):
     weights = _check_sample_weight(sample_weight, X)
     y_pred = self.predict(X)
     y_mean = np.average(y, weights=weights)
     deviance_ = np.sum(weights * (2 * (xlogy(y, y / y_pred) - y + y_pred)))
     null_deviance_ = np.sum(weights *
                             (2 * (xlogy(y, y / y_mean) - y + y_mean)))
     # pearson residual:  (raw residual)/(variance function)
     # TODO: put correct weibull variance here
     pearson_residuals_ = (y - y_pred) / np.sqrt(y_pred)
     pearson_chi2_ = np.sum(pearson_residuals_**2)
     model_d2_ = 1 - deviance_ / null_deviance_
     # degrees of freedom of the model (all params (including intercept) minus 1)
     df_model_ = X.shape[1]
      # degrees of freedom of the residuals: (n_obs - 1) - (n_params - 1)
     df_residuals_ = X.shape[0] - X.shape[1] - 1
     # total degrees of freedom
     df_total_ = df_residuals_ + df_model_
     # method of moments estimator for dispersion scale
     dispersion_scale_ = pearson_chi2_ / df_residuals_
     dispersion_scale_sqrt_ = np.sqrt(dispersion_scale_)
     results = {
         'deviance_': deviance_,
         'null_deviance_': null_deviance_,
         'pearson_residuals_': pearson_residuals_,
         'pearson_chi2_': pearson_chi2_,
         'model_d2_': model_d2_,
         'df_model_': df_model_,
         'df_residuals_': df_residuals_,
         'df_total_': df_total_,
         'dispersion_scale_': dispersion_scale_,
         'dispersion_scale_sqrt_': dispersion_scale_sqrt_
     }
     return results
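Note: the quantity summed above is the weighted Poisson unit deviance; as a quick sanity sketch (not part of the original code, and the Weibull variance mentioned in the TODO is not modeled here), it agrees with `sklearn.metrics.mean_poisson_deviance` up to the weighted mean.

import numpy as np
from scipy.special import xlogy
from sklearn.metrics import mean_poisson_deviance

y = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.5, 1.5, 2.5])
weights = np.ones_like(y)
deviance = np.sum(weights * (2 * (xlogy(y, y / y_pred) - y + y_pred)))
# mean_poisson_deviance averages the same unit deviances.
assert np.isclose(deviance / weights.sum(), mean_poisson_deviance(y, y_pred))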
Example 5
    def fit(self, X, y, sample_weight=None):
        """
        Fit the model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample.

        Returns
        -------
        self
        """
        check_classification_targets(y)
        if sample_weight is None:
            self.classes_, self.counts_ = np.unique(y, return_counts=True)
        else:
            sample_weight = _check_sample_weight(sample_weight, X)
            sample_weight = sample_weight / sample_weight.mean()
            df = pd.DataFrame({'y': y, 'sample_weight': sample_weight})
            df = df.groupby('y').sum()
            self.classes_ = df.index.values
            self.counts_ = df.sample_weight.values
        self.counts_ = self.counts_ / self.counts_.sum()
        self.dominant_class_ = self.classes_[np.argmax(self.counts_)]
        return self
Example 6
    def fit(self, X, y, *, sample_weight=None, **kwargs):
        """Build the ensemble classifier from the training set (X, y)."""

        # Check random state
        self.random_state = check_random_state(self.random_state)

        # Convert data (X is required to be 2d and indexable)
        X, y = self._validate_data(X, y, **self.check_x_y_args)
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64)
            if np.any(sample_weight < 0):
                raise ValueError("sample_weight cannot contain negative weights")
            sample_weight /= sample_weight.sum()

        # Remap output
        n_samples, self.n_features_ = X.shape
        self.features_ = np.arange(self.n_features_)
        self._n_samples = n_samples
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator(default=DecisionTreeClassifier())
        
        # If the base estimator does not support sample_weight and
        # sample_weight is not None, raise a ValueError
        support_sample_weight = has_fit_parameter(self.base_estimator_,
                                                  "sample_weight")
        if not support_sample_weight and sample_weight is not None:
            raise ValueError("The base estimator doesn't support sample weight")

        self.estimators_, self.estimators_features_ = [], []

        return self._fit(X, y, sample_weight=sample_weight, **kwargs)
Example 7
    def fit(self, X, Y, sample_weight):
        """Fit the dummy estimator on the training data and rankings."""
        (X, Y) = self._validate_data(X, Y, multi_output=True)
        sample_weight = _check_sample_weight(sample_weight, X)

        (_, n_classes) = Y.shape

        if self.strategy not in VALID_STRATEGIES:
            raise ValueError("Unknown strategy type: {0}. Expected one of {1}."
                             .format(self.strategy, list(VALID_STRATEGIES)))

        if self.strategy == "constant":
            if self.constant is None:
                raise ValueError("The constant target ranking has to be "
                                 "specified for the constant strategy.")
            elif self.constant.shape[0] != n_classes:
                raise ValueError("The constant target ranking should have "
                                 "shape {0}.".format(n_classes))
            else:
                self.constant = check_array(
                    self.constant, dtype=np.int64, ensure_2d=False)
                # Re-raise a more informative message when the constant
                # target ranking cannot be managed by the estimator
                try:
                    self._rank_algorithm.check_targets(self.constant[None, :])
                except ValueError:
                    raise ValueError("The constant target ranking is not the "
                                     "target type managed by the estimator.")

        self.ranking_ = self._rank_algorithm.aggregate(Y, sample_weight)

        return self
Example 8
def _daal_dbscan(X, eps=0.5, min_samples=5, sample_weight=None):
    if eps <= 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, dtype=[np.float64, np.float32])
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)
        ww = make2d(sample_weight)
    else:
        ww = None

    XX = make2d(X)

    fpt = getFPType(XX)
    alg = daal4py.dbscan(method='defaultDense',
                         fptype=fpt,
                         epsilon=float(eps),
                         minObservations=int(min_samples),
                         memorySavingMode=False,
                         resultsToCompute="computeCoreIndices")

    daal_res = alg.compute(XX, ww)
    n_clusters = daal_res.nClusters[0, 0]
    assignments = daal_res.assignments.ravel()
    if daal_res.coreIndices is not None:
        core_ind = daal_res.coreIndices.ravel()
    else:
        core_ind = np.array([], dtype=np.intc)

    return (core_ind, assignments)
Example 9
    def _validate_input(self, X, y, sample_weight=None):
        """
        Helper function to validate the inputs
        """
        X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight,
                                                 X,
                                                 dtype=X.dtype)

        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X,
            sample_weight=sample_weight,
            check_input=True)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            outs = _rescale_data(X, y, sample_weight)
            X, y = outs[0], outs[1]

        return X, y, X_offset, y_offset, X_scale
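Note: `_rescale_data` relies on the standard identity that weighted least squares equals ordinary least squares on rows scaled by sqrt(sample_weight). A self-contained numpy sketch of that equivalence (names here are illustrative, not from the original code):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
y = rng.normal(size=50)
w = rng.uniform(0.5, 2.0, size=50)

# Weighted least squares via the normal equations: (X^T W X) b = X^T W y.
coef_wls = np.linalg.solve(X.T @ (w[:, None] * X), X.T @ (w * y))
# The same coefficients from plain least squares on sqrt(w)-rescaled data.
sw = np.sqrt(w)
coef_ols, *_ = np.linalg.lstsq(X * sw[:, None], y * sw, rcond=None)
assert np.allclose(coef_wls, coef_ols)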
Example 10
 def _validate_sample_weight(
     self,
     X: np.ndarray,
     y: np.ndarray,
     sample_weight: Union[None, np.ndarray, Iterable],
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Validate that the passed sample_weight and ensure it is a Numpy array.
     """
     sample_weight = _check_sample_weight(sample_weight,
                                          X,
                                          dtype=np.dtype(
                                              tf.keras.backend.floatx()))
     # Scikit-Learn expects a 0 in sample_weight to mean
     # "ignore the sample", but because of how Keras applies
     # sample_weight to the loss function, this doesn't
     # exactly work out (as in, sklearn estimator checks fail
     # because the predictions differ by a small margin).
     # To get around this, we manually delete these samples here
     zeros = sample_weight == 0
     if zeros.sum() == zeros.size:
          raise ValueError(
              "No training samples had any weight; only zeros were passed in sample_weight."
              " That means there is nothing to train on by definition, so training cannot be completed."
          )
     if np.any(zeros):
         X = X[~zeros]
         y = y[~zeros]
         sample_weight = sample_weight[~zeros]
     return X, y, sample_weight
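Note: a small standalone sketch (made-up data) of the masking step above, showing how zero-weight rows are dropped before training:

import numpy as np

X = np.arange(8, dtype=float).reshape(4, 2)
y = np.array([0, 1, 0, 1])
sample_weight = np.array([1.0, 0.0, 2.0, 0.0])

zeros = sample_weight == 0
X, y, sample_weight = X[~zeros], y[~zeros], sample_weight[~zeros]
print(X.shape, y, sample_weight)  # (2, 2) [0 0] [1. 2.]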
Example 11
    def fit(self, X, y=None, sample_weight=None):
        """Compute kernel k-means clustering.

        Parameters
        ----------
        X : array-like of shape=(n_ts, sz, d)
            Time series dataset.

        y
            Ignored
        sample_weight : array-like of shape=(n_ts, ) or None (default: None)
            Weights to be given to time series in the learning process. By
            default, all time series weights are equal.
        """

        X = check_array(X, allow_nd=True, force_all_finite=False)
        X = check_dims(X)

        sample_weight = _check_sample_weight(sample_weight=sample_weight, X=X)

        max_attempts = max(self.n_init, 10)

        self.labels_ = None
        self.inertia_ = None
        self.sample_weight_ = None
        self._X_fit = None
        # n_iter_ will contain the number of iterations the most
        # successful run required.
        self.n_iter_ = 0

        K = self._get_kernel(X)
        # _check_sample_weight above never returns None, so no fallback is needed.
        self.sample_weight_ = sample_weight
        rs = check_random_state(self.random_state)

        last_correct_labels = None
        min_inertia = numpy.inf
        n_attempts = 0
        n_successful = 0
        while n_successful < self.n_init and n_attempts < max_attempts:
            try:
                if self.verbose and self.n_init > 1:
                    print("Init %d" % (n_successful + 1))
                n_attempts += 1
                self._fit_one_init(K, rs)
                if self.inertia_ < min_inertia:
                    last_correct_labels = self.labels_
                    min_inertia = self.inertia_
                    self.n_iter_ = self._iter
                n_successful += 1
            except EmptyClusterError:
                if self.verbose:
                    print("Resumed because of empty cluster")
        if n_successful > 0:
            self.labels_ = last_correct_labels
            self.inertia_ = min_inertia
            self._X_fit = X
        return self
Example 12
def _check_data_params(obj, X, y, conf_score):
    """Extracted out of BaseHandler for WeightedBag & Costing"""

    # Reproducibility
    rns = check_random_state(obj.random_state)
    for k, v in obj.get_params().items():
        if isinstance(v, BaseEstimator) and 'random_state' in v.get_params():
            v.set_params(random_state=rns.randint(10**8))

    # Parallelization
    if obj.classifier is not None and 'n_jobs' in obj.classifier.get_params():
        obj.classifier.set_params(n_jobs=obj.n_jobs)
    if obj.detector is not None and 'n_jobs' in obj.detector.get_params():
        obj.detector.set_params(n_jobs=obj.n_jobs)

    if conf_score is None and obj.detector is None:
        raise ValueError(
            "Neither conf_score nor a detector was supplied to the Handler")

    if conf_score is None:  # outside Pipeline/ inside Iterative Handler
        conf_score = obj.detector.detect(X, y)

    X, y = obj._validate_data(X, y)
    obj.classes_ = np.unique(y)
    conf_score = _check_sample_weight(conf_score, X)
    return X, y, conf_score
Example 13
    def fit(self, X, y, sample_weight=None):
        X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight,
                                                 X,
                                                 dtype=X.dtype)

        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X,
            sample_weight=sample_weight,
            return_mean=True)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        self.is_fitted_ = True
        coef, alpha = fracridge(X, y, fracs=self.fracs)
        self.alpha_ = alpha
        self.coef_ = coef
        self._set_intercept(X_offset, y_offset, X_scale)
        return self
Example 14
    def fit(self, X, y, sample_weight=None, **kwargs):
        """Constructs a new model with `build_fn` & fit the model to `(X, y)`.

        Arguments:
            X : array-like, shape `(n_samples, n_features)`
                Training samples where `n_samples` is the number of samples
                and `n_features` is the number of features.
            y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
                True labels for `X`.
            sample_weight : array-like of shape (n_samples,), default=None
                Sample weights. The Keras Model must support this.
            **kwargs: dictionary arguments
                Legal arguments are the arguments of the keras model's `fit`
                method.
        Returns:
            self : object
                a reference to the instance that can be chain called
                (ex: instance.fit(X,y).transform(X) )
        Raises:
            ValueError : In case of invalid shape for `y` argument.
            ValueError : In case sample_weight is not None and the Keras
                model's `fit` method does not support that parameter.
        """
        # basic checks
        X, y = check_X_y(
            X,
            y,
            allow_nd=True,  # allow X to have more than 2 dimensions
            multi_output=True,  # allow y to be 2D
        )

        X = check_array(X, allow_nd=True, dtype=["float64", "int"])

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight,
                                                 X,
                                                 dtype=["float64", "int"])

        # pre process X, y
        X, _ = self._pre_process_X(X)
        y, extra_args = self._pre_process_y(y)
        # update self.classes_, self.n_outputs_, self.n_classes_ and
        #  self.cls_type_
        for attr_name, attr_val in extra_args.items():
            setattr(self, attr_name, attr_val)

        # build model
        self.model_ = self._build_keras_model(X,
                                              y,
                                              sample_weight=sample_weight,
                                              **kwargs)

        y = self._check_output_model_compatibility(y)

        # fit model
        return self._fit_keras_model(X,
                                     y,
                                     sample_weight=sample_weight,
                                     **kwargs)
Example 15
    def fit(self, Knm, Kmm, y=None, sample_weight=None):
        """Fit KernelFlexibleCenterer

        Parameters
        ----------
        Knm: ndarray of shape (n_samples, n_active)
            Kernel matrix between the reference data set and the active set

        Kmm: ndarray of shape (n_active, n_active)
            Kernel matrix between the active set and itself

        y : None
            Ignored.

        sample_weight: ndarray of shape (n_samples,), default=None
            Weights for each sample. Sample weighting can be used to center (and
            scale) data using a weighted mean. Weights are internally normalized
            before preprocessing.

        Returns
        -------
        self : object
            Fitted transformer.
        """

        if Knm.shape[1] != Kmm.shape[0]:
            raise ValueError(
                "The reference kernel is not of commensurate shape with the "
                "active kernel.")

        if Kmm.shape[0] != Kmm.shape[1]:
            raise ValueError("The active kernel is not square.")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight,
                                                 Knm,
                                                 dtype=Knm.dtype)
            sample_weight = sample_weight / np.sum(sample_weight)

        self.n_active_ = Kmm.shape[0]

        if self.with_center:
            self.K_fit_rows_ = np.average(Knm, weights=sample_weight, axis=0)
        else:
            self.K_fit_rows_ = np.zeros(Knm.shape[1])

        if self.with_trace:
            Knm_centered = Knm - self.K_fit_rows_

            Khat = Knm_centered @ np.linalg.pinv(Kmm,
                                                 self.rcond) @ Knm_centered.T

            self.scale_ = np.sqrt(np.trace(Khat) / Knm.shape[0])
        else:
            self.scale_ = 1.0

        return self
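Note: a compact sketch of the trace-based scale computed above, using small synthetic kernels (uniform weights and numpy's default `rcond`; all names here are illustrative, not from the original code):

import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(6, 3))   # features for a hypothetical reference set
M = rng.normal(size=(4, 3))   # features for a hypothetical active set
Knm, Kmm = A @ M.T, M @ M.T

K_fit_rows = Knm.mean(axis=0)             # unweighted column means
Knm_centered = Knm - K_fit_rows
Khat = Knm_centered @ np.linalg.pinv(Kmm) @ Knm_centered.T
scale = np.sqrt(np.trace(Khat) / Knm.shape[0])
print(scale)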
Example 16
    def fit(self, X, y=None, sample_weight=None):
        """Perform DBSCAN clustering from features, or distance matrix.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features), or \
            (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        sample_weight : array, shape (n_samples,), optional
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self

        """
        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])

        if self.eps <= 0.0:
            raise ValueError("eps must be positive.")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        _daal_ready = self.algorithm in [
            'auto', 'brute'] and (
            self.metric == 'euclidean' or (
                self.metric == 'minkowski' and self.p == 2)) and isinstance(
                X, np.ndarray)
        if _daal_ready:
            logging.info(
                "sklearn.cluster.DBSCAN."
                "fit: " + get_patch_message("daal"))
            core_ind, assignments = _daal_dbscan(
                X, self.eps,
                self.min_samples,
                sample_weight=sample_weight)
            self.core_sample_indices_ = core_ind
            self.labels_ = assignments
            self.components_ = np.take(X, core_ind, axis=0)
            return self
        logging.info(
            "sklearn.cluster.DBSCAN."
            "fit: " + get_patch_message("sklearn"))
        return super().fit(X, y, sample_weight=sample_weight)
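Note: the weighted-core-sample semantics described in the docstring above also hold for stock scikit-learn; a minimal sketch where a weight equal to `min_samples` makes an isolated point a core sample on its own:

import numpy as np
from sklearn.cluster import DBSCAN

X = np.array([[0.0], [10.0]])
# With unit weights neither isolated point reaches min_samples=3: all noise.
print(DBSCAN(eps=1.0, min_samples=3).fit(X).labels_)  # [-1 -1]
# A weight of 3 makes the first point a core sample by itself.
w = np.array([3.0, 1.0])
print(DBSCAN(eps=1.0, min_samples=3).fit(X, sample_weight=w).labels_)  # [ 0 -1]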
Example 17
def check_sample_weight(sample_weight, X):
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)
        use_sample_weight = True
        if np.array_equal(np.unique(sample_weight), np.array([1.0])):
            sample_weight = None
            use_sample_weight = False
    else:
        use_sample_weight = False

    return sample_weight, use_sample_weight
Example 18
    def fit(self, X, y, sample_weight=None, check_input=True):
        """Fit a shapelet tree regressor from the training set

        Parameters
        ----------
        X : array-like of shape (n_samples, n_timesteps)
            The training time series.

        y : array-like of shape (n_samples,)
            Target values as floating point values

        sample_weight : array-like of shape (n_samples,), default=None
            If `None`, then samples are equally weighted. Splits that would create child
            nodes with net zero or negative weight are ignored while searching for a
            split in each node.

        check_input : bool, optional
            Allow to bypass several input checking. Don't use this parameter unless you
            know what you do.

        Returns
        -------
        self : object
        """
        if check_input:
            X = check_array(X, allow_multivariate=True, dtype=float)
            y = check_array(y, ensure_2d=False, dtype=float)

        n_samples = X.shape[0]
        if isinstance(self.force_dim, int):
            X = np.reshape(X, [n_samples, self.force_dim, -1])

        n_timesteps = X.shape[-1]

        if X.ndim > 2:
            n_dims = X.shape[1]
        else:
            n_dims = 1

        if len(y) != n_samples:
            raise ValueError("Number of labels={} does not match "
                             "number of samples={}".format(len(y), n_samples))

        self.n_timestep_ = n_timesteps
        self.n_dims_ = n_dims
        random_state = check_random_state(self.random_state)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=float)

        self._fit(X, y, sample_weight, random_state)
        return self
Example 19
    def fit(self, X, y=None, sample_weight=None):
        """Compute mean and scaling to be applied for subsequent normalization.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.

        y : None
            Ignored.

        sample_weight : ndarray of shape (n_samples,), default=None
            Weights for each sample. Sample weighting can be used to center
            (and scale) data using a weighted mean. Weights are internally
            normalized before preprocessing.

        Returns
        -------
        self : object
            Fitted scaler.
        """

        self.n_samples_seen_, self.n_features_ = X.shape

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight,
                                                 X,
                                                 dtype=X.dtype)
            sample_weight = sample_weight / np.sum(sample_weight)

        if self.with_mean:
            self.mean_ = np.average(X, weights=sample_weight, axis=0)
        else:
            self.mean_ = np.zeros(self.n_features_)

        self.scale_ = 1.0
        if self.with_std:
            X_mean = np.average(X, weights=sample_weight, axis=0)
            var = np.average((X - X_mean)**2, weights=sample_weight, axis=0)

            if self.column_wise:
                if np.any(var < self.atol + abs(X_mean) * self.rtol):
                    raise ValueError(
                        "Cannot normalize a feature with zero variance")
                self.scale_ = np.sqrt(var)
            else:
                var_sum = var.sum()
                if var_sum < abs(np.mean(X_mean)) * self.rtol + self.atol:
                    raise ValueError(
                        "Cannot normalize a matrix with zero variance")
                self.scale_ = np.sqrt(var_sum)

        return self
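Note: a quick numeric sketch (made-up data) of the weighted statistics used above; with normalized weights, `np.average` reproduces the explicit weighted sums:

import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
w = np.array([1.0, 1.0, 2.0])
w = w / np.sum(w)

mean = np.average(X, weights=w, axis=0)
var = np.average((X - mean) ** 2, weights=w, axis=0)
assert np.allclose(mean, (w[:, None] * X).sum(axis=0))
assert np.allclose(var, (w[:, None] * (X - mean) ** 2).sum(axis=0))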
Example 20
def _fit_classifier(self, X, y, sample_weight=None):
    if sp.issparse(y):
        raise ValueError("sparse multilabel-indicator for y is not supported.")
    _check_parameters(self)
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)

    daal_ready = (self.warm_start is False and self.criterion == "gini"
                  and self.ccp_alpha == 0.0 and not sp.issparse(X))

    if daal_ready:
        _supported_dtypes_ = [np.single, np.double]
        X = check_array(X, dtype=_supported_dtypes_)
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity,
            # unlike y[:, np.newaxis] which does not
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        if self.n_outputs_ != 1:
            daal_ready = False

    if daal_ready:
        logging.info("sklearn.ensemble.RandomForestClassifier.fit: " +
                     method_uses_daal)
        _daal_fit_classifier(self, X, y, sample_weight=sample_weight)

        if not hasattr(self, "estimators_"):
            self.estimators_ = self._estimators_

        # Decapsulate classes_ attributes
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]
        return self

    else:
        logging.info("sklearn.ensemble.RandomForestClassifier.fit: " +
                     method_uses_sklearn)
        return super(RandomForestClassifier,
                     self).fit(X, y, sample_weight=sample_weight)
Example 21
def check_null_weight(
        sample_weight: Optional[ArrayLike], X: ArrayLike,
        y: ArrayLike) -> Tuple[Optional[NDArray], ArrayLike, ArrayLike]:
    """
    Check sample weights and remove samples with null sample weights.

    Parameters
    ----------
    sample_weight : Optional[ArrayLike] of shape (n_samples,)
        Sample weights.
    X : ArrayLike of shape (n_samples, n_features)
        Training samples.
    y : ArrayLike of shape (n_samples,)
        Training labels.

    Returns
    -------
    sample_weight : Optional[NDArray] of shape (n_samples,)
        Non-null sample weights.

    X : ArrayLike of shape (n_samples, n_features)
        Training samples with non-null weights.

    y : ArrayLike of shape (n_samples,)
        Training labels with non-null weights.

    Examples
    --------
    >>> import numpy as np
    >>> from mapie.utils import check_null_weight
    >>> X = np.array([[0], [1], [2], [3], [4], [5]])
    >>> y = np.array([5, 7, 9, 11, 13, 15])
    >>> sample_weight = np.array([0, 1, 1, 1, 1, 1])
    >>> sample_weight, X, y = check_null_weight(sample_weight, X, y)
    >>> print(sample_weight)
    [1. 1. 1. 1. 1.]
    >>> print(X)
    [[1]
     [2]
     [3]
     [4]
     [5]]
    >>> print(y)
    [ 7  9 11 13 15]
    """
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X)
        non_null_weight = sample_weight != 0
        X = _safe_indexing(X, non_null_weight)
        y = _safe_indexing(y, non_null_weight)
        sample_weight = _safe_indexing(sample_weight, non_null_weight)
    sample_weight = cast(Optional[NDArray], sample_weight)
    return sample_weight, X, y
Example 22
 def _check_sample_weight(self, sample_weight, X):
     if sample_weight is not None:
         sample_weight = _check_sample_weight(sample_weight, X)
         use_sample_weight = True
         # ranger does additional rng on samples if weights are passed.
         # If the weights are all ones, we don't want that extra rng.
         if np.array_equal(np.unique(sample_weight), np.array([1.0])):
             sample_weight = []
             use_sample_weight = False
     else:
         sample_weight = []
         use_sample_weight = False
     return sample_weight, use_sample_weight
Example 23
def _check_normalize_sample_weight(sample_weight, X):
    """Set sample_weight if None, and check for correct dtype"""

    sample_weight_was_none = sample_weight is None

    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
    if not sample_weight_was_none:
        # normalize the weights to sum up to n_samples;
        # an array of ones (i.e. sample_weight was None) is already normalized
        n_samples = len(sample_weight)
        scale = n_samples / sample_weight.sum()
        sample_weight *= scale
    return sample_weight
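Note: a tiny illustration of the rescaling invariant above; after scaling, the weights sum to n_samples, so an all-ones (unweighted) vector is left unchanged:

import numpy as np

w = np.array([0.2, 0.3, 0.5, 1.0])
w_scaled = w * (len(w) / w.sum())
print(w_scaled.sum())  # 4.0 == n_samples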
Example 24
    def _prepare_inputs(self, X, sample_weight, y):
        X, y = check_X_y(X, y)
        sample_weight = _check_sample_weight(sample_weight, X)
        self.n_features_in_ = X.shape[1]

        n = X.shape[0]
        if self.copy_X:
            X_ = X.copy()
        else:
            X_ = X
        if self.fit_intercept:
            X_ = np.hstack([X_, np.ones(shape=(n, 1))])

        loss, grad_loss = self._get_objective(X_, y, sample_weight)

        return X_, grad_loss, loss
Example 25
def _daal4py_check_weight(self, X, y, sample_weight):
    ww = None
    if sample_weight.shape[0] > 0:
        sample_weight = _check_sample_weight(sample_weight, X)
        if np.all(sample_weight <= 0):
            raise ValueError('Invalid input - all samples have zero or negative weights.')
        elif np.any(sample_weight <= 0):
            if len(np.unique(y[sample_weight > 0])) != len(self.classes_):
                raise ValueError('Invalid input - all samples with positive weights have the same label.')
        ww = sample_weight
    elif self.class_weight is not None:
        ww = np.ones(X.shape[0], dtype=np.float64)
    if self.class_weight is not None:
        for i, v in enumerate(self.class_weight_):
            ww[y == i] *= v
    if ww is not None:
        ww = make2d(ww)
    return ww
Example 26
    def fit(self, X, y, sample_weight=None):
        """
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        
        y : array-like of shape (n_samples, n_targets)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample.

        Returns
        -------
        self
        """
        sample_weight = _check_sample_weight(sample_weight, X)
        self.y_mean_ = (y * sample_weight).mean() / sample_weight.mean()
        return self
Example 27
    def fit(self, X, y=None, sample_weight=None):
        """Fit Kernel Ridge regression model
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data. If kernel == "precomputed" this is instead
            a precomputed kernel matrix, of shape (n_samples, n_samples).
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values
        sample_weight : float or array-like of shape [n_samples]
            Individual weights for each sample, ignored if None is passed.
        Returns
        -------
        self : returns an instance of self.
        """
        # Validate sample weights (full data validation is skipped here)
        if sample_weight is not None and not isinstance(sample_weight, float):
            sample_weight = _check_sample_weight(sample_weight, X)

        K = self._get_kernel(X)
        alpha = np.atleast_1d(self.alpha)

        ravel = False
        if len(y.shape) == 1:
            y = y.reshape(-1, 1)
            ravel = True

        copy = self.kernel == "precomputed"
        self.dual_coef_ = _solve_cholesky_kernel(K, y, alpha,
                                                 sample_weight,
                                                 copy)
        if ravel:
            self.dual_coef_ = self.dual_coef_.ravel()

        self.X_fit_ = X
        return self
Example 28
    def transform(self, X, y=None, sample_weight=None):
        """Transform the data X into the topic space of the fitted pLSA model.

        Parameters
        ----------
        X: array or sparse matrix of shape (n_docs, n_words)
            Corpus to be embedded into topic space

        y: Ignored

        sample_weight: array of shape (n_docs,), default=None
            Per-document weights; if None, uniform weights are used.

        Returns
        -------
        embedding: array of shape (n_docs, n_topics)
            An embedding of the documents X into the topic space.
        """
        X = check_array(X, accept_sparse="csr")
        sample_weight = _check_sample_weight(sample_weight,
                                             X,
                                             dtype=np.float32)
        random_state = check_random_state(self.transform_random_seed)

        if not issparse(X):
            X = coo_matrix(X)
        else:
            X = X.tocoo()

        result = plsa_refit(
            X,
            self.components_,
            sample_weight,
            block_size=self.block_size,
            n_iter=50,
            n_iter_per_test=5,
            tolerance=0.001,
            random_state=random_state,
        )

        return result
Example 29
    def score(self, X, y, sample_weight=None, **kwargs):
        """Returns the mean accuracy on the given test data and labels.

        Arguments:
            X: array-like, shape `(n_samples, n_features)`
                Test samples where `n_samples` is the number of samples
                and `n_features` is the number of features.
            y: array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
                True labels for `X`.
            sample_weight : array-like of shape (n_samples,), default=None
                Sample weights. The Keras Model must support this.
            **kwargs: dictionary arguments
                Legal arguments are those of self.model_.evaluate.

        Returns:
            score: float
                Mean accuracy of predictions on `X` wrt. `y`.

        Raises:
            ValueError: If the underlying model isn't configured to
                compute accuracy. You should pass `metrics=["accuracy"]` to
                the `.compile()` method of the model.
        """
        # validate sample weights
        if sample_weight is not None:
            sample_weight = _check_sample_weight(
                sample_weight, X, dtype=["float64", "int"]
            )

        # pre process X, y
        _, extra_args = self._pre_process_y(y)

        # compute Keras model score
        y_pred = self.predict(X, **kwargs)

        return self._scorer(y, y_pred, sample_weight=sample_weight)
Example 30
    def score(self, X, y, sample_weight=None):
        """Returns the mean accuracy on the given test data and labels.

        Arguments:
            X: array-like, shape `(n_samples, n_features)`
                Test samples where `n_samples` is the number of samples
                and `n_features` is the number of features.
            y: array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
                True labels for `X`.
            sample_weight : array-like of shape (n_samples,), default=None
                Sample weights. The Keras Model must support this.

        Returns:
            score: float
                Mean accuracy of predictions on `X` wrt. `y`.
        """
        # validate sample weights
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        # validate y
        y = check_array(y, ensure_2d=False)

        # compute Keras model score
        y_pred = self.predict(X)

        # filter kwargs and get attributes for score
        params = self.get_params()
        score_args = route_params(params,
                                  destination="score",
                                  pass_filter=set())

        return self.scorer(y,
                           y_pred,
                           sample_weight=sample_weight,
                           **score_args)