Example #1
    def _partial_fit(self,
                     X,
                     y,
                     alpha,
                     C,
                     loss,
                     learning_rate,
                     max_iter,
                     classes,
                     sample_weight=None,
                     coef_init=None,
                     intercept_init=None):
        X, y = check_X_y(X,
                         y,
                         'csr',
                         dtype=np.float64,
                         order="C",
                         accept_large_sparse=False)

        n_samples, n_features = X.shape

        _check_partial_fit_first_call(self, classes)

        n_classes = 2

        # Allocate data structures from input arguments
        self._expanded_class_weight = compute_class_weight(
            self.class_weight, self.classes_, y)
        if sample_weight is None:
            sample_weight = np.ones(n_samples, dtype=np.float64)

        if getattr(self, "coef_", None) is None or coef_init is not None:
            self._allocate_parameter_mem(n_classes, n_features, coef_init,
                                         intercept_init)
        elif n_features != self.coef_.shape[-1]:
            raise ValueError("Number of features %d does not match previous "
                             "data %d." % (n_features, self.coef_.shape[-1]))

        self.loss_function_ = self._get_loss_function(loss)
        if not hasattr(self, "t_"):
            self.t_ = 1.0

        self._fit_binary(X,
                         y,
                         alpha=alpha,
                         C=C,
                         learning_rate=learning_rate,
                         sample_weight=sample_weight,
                         max_iter=max_iter)
        return self
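Every example on this page leans on the same scikit-learn helper, so a paraphrased sketch of its contract is useful up front. This is based on sklearn.utils.multiclass._check_partial_fit_first_call; details may vary between versions:

import numpy as np
from sklearn.utils.multiclass import unique_labels

def _check_partial_fit_first_call(clf, classes=None):
    """Return True only on the first call to partial_fit (paraphrased sketch).

    Sets clf.classes_ on the first call; on later calls it verifies that
    `classes` is consistent with what was seen before.
    """
    if getattr(clf, "classes_", None) is None and classes is None:
        raise ValueError("classes must be passed on the first call "
                         "to partial_fit.")
    if classes is not None:
        if getattr(clf, "classes_", None) is not None:
            if not np.array_equal(clf.classes_, unique_labels(classes)):
                raise ValueError("`classes` is not the same as on last call "
                                 "to partial_fit.")
        else:
            # First call: remember the full set of classes
            clf.classes_ = unique_labels(classes)
            return True
    # Not the first call
    return False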
Example #2
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes
            self.ensemble_ = []

        self.X_, self.y_ = X, y

        train_X, train_y = self.remove_outliers(X, y)

        # Testing all models
        scores = np.array(
            [measure(y, clf.predict(X)) for clf in self.ensemble_])

        # Pruning
        if len(self.ensemble_) > 1:
            alpha_good = scores > (0.5 + self.alpha)
            self.ensemble_ = [
                self.ensemble_[i] for i in np.where(alpha_good)[0]
            ]

        if len(self.ensemble_) > self.ensemble_size - 1:
            worst = np.argmin(scores)
            del self.ensemble_[worst]

        # Preparing and training new candidate
        self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))
Example #3
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes
            self.ensemble_ = []
            self.weights_ = []
            self.tresholds_ = []

        self.X_, self.y_ = X, y

        # Testing all models
        scores = np.array([self.metric(y, clf.predict(X)) for clf in self.ensemble_])

        # Pruning
        self.prune(scores)

        # Preparing and training new candidate
        candidate_clf = base.clone(self._base_clf).fit(self.X_, self.y_)

        # Checking thresholds
        if self.t_strategy == "auto":
            probas = candidate_clf.predict_proba(self.X_)[:, 0]
            treshold = self.opt_quants[
                np.argmax([self.metric(self.y_, probas < t) for t in self.opt_quants])
            ]
        else:
            treshold = self.t_strategy

        self.ensemble_.append(candidate_clf)
        self.tresholds_.append(treshold)
Example #4
    def partial_fit(self, X, y, classes=None):
        """Partially fit underlying estimators

        Should be used when there is not enough memory to train on all the
        data at once. Chunks of data can be passed over several iterations.

        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.

        y : (sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
            Multi-class targets. An indicator matrix turns on multilabel
            classification.

        classes : array, shape (n_classes, )
            Classes across all calls to partial_fit.
            Can be obtained via `np.unique(y_all)`, where y_all is the
            target vector of the entire dataset.
            This argument is only required in the first call of partial_fit
            and can be omitted in the subsequent calls.

        Returns
        -------
        self
        """
        if _check_partial_fit_first_call(self, classes):
            if not hasattr(self.estimator, "partial_fit"):
                raise ValueError(("Base estimator {0}, doesn't have "
                                  "partial_fit method").format(self.estimator))
            self.estimators_ = [
                clone(self.estimator) for _ in range(self.n_classes_)
            ]

            # A sparse LabelBinarizer, with sparse_output=True, has been
            # shown to outperform or match a dense label binarizer in all
            # cases and has also resulted in less or equal memory consumption
            # in the fit_ovr function overall.
            self.label_binarizer_ = LabelBinarizer(sparse_output=True)
            self.label_binarizer_.fit(self.classes_)

        if len(np.setdiff1d(y, self.classes_)):
            raise ValueError(
                ("Mini-batch contains {0} while classes " +
                 "must be subset of {1}").format(np.unique(y), self.classes_))

        Y = self.label_binarizer_.transform(y)
        Y = Y.tocsc()
        columns = (col.toarray().ravel() for col in Y.T)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit_binary)(estimator, X, column)
            for estimator, column in izip(self.estimators_, columns))

        return self
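A hedged usage sketch of the chunked calling convention described in the docstring above; the estimator choice and the random data are illustrative, not part of the example's project:

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

rng = np.random.RandomState(0)
X_all = rng.rand(300, 5)
y_all = rng.randint(0, 3, 300)
classes = np.unique(y_all)          # classes across the entire dataset

ovr = OneVsRestClassifier(SGDClassifier())
for start in range(0, len(X_all), 100):
    chunk = slice(start, start + 100)
    # `classes` is required on the first call and optional afterwards
    ovr.partial_fit(X_all[chunk], y_all[chunk], classes=classes)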
Example #5
    def _partial_fit(self, X, y, classes=None):
        if _check_partial_fit_first_call(self, classes):
            self._label_binarizer = LabelBinarizer()
            if type_of_target(y).startswith('multilabel'):
                self._label_binarizer.fit(y)
            else:
                self._label_binarizer.fit(classes)

        super(MLPClassifier_Custom, self)._partial_fit(X, y)

        return self
Example #6
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes

            self.ensemble_ = []
            self.weights_ = np.array([])
        """Partial fitting"""

        # Preparing and training new candidate
        if classes is not None:
            self.classes_ = classes
        elif self.classes_ is None:
            raise Exception('Classes not specified')

        candidate_clf = clone(self.base_estimator)
        candidate_weight = self._get_weigth_for_candidate(candidate_clf)
        if self._sampling == 'over':
            ros = RandomOverSampler(random_state=0)
            X, y = ros.fit_resample(X, y)
        elif self._sampling == 'under':
            rus = RandomUnderSampler(random_state=0)
            X, y = rus.fit_resample(X, y)
        if not self._update:
            candidate_clf.fit(X, y)
        else:
            candidate_clf.partial_fit(X, y)

        self._set_weights()

        if self._update:
            random_cl_weight = self._weight_of_random_classifier()
            for i in range(len(self.ensemble_)):
                if self.weights_[i] > random_cl_weight:
                    self.ensemble_[i].partial_fit(X, y)

        self.ensemble_.append(candidate_clf)

        self.weights_ = np.append(self.weights_, np.array([candidate_weight]))

        # Post-pruning
        if len(self.ensemble_) > self.n_estimators:
            self._prune()

        # Weights normalization
        self.weights_ = self.weights_ / np.sum(self.weights_)
Example #7
    def partial_fit(self, X, y, classes=None):
        X, y = check_X_y(X, y)

        # Check consistency
        if hasattr(self, 'X_') and X.shape[1] != self.X_.shape[1]:
            raise ValueError('number of features does not match')

        self.X_ = X
        self.y_ = y

        if not hasattr(self, 'n_features_'):
            self.n_features_ = X.shape[1]

        # Get subspace
        if not hasattr(self, 'subspace_'):
            if self.given_subspace is None:
                self.subspace_ = self._assumed_subspace()
            else:
                self.subspace_ = np.array(self.given_subspace)

        # Acquire subspaced X
        subspaced_X = X[:, self.subspace_].astype('float64')

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes

            # Scaler
            self.scaler_ = MinMaxScaler()
            self.scaler_.fit(subspaced_X)

        # Store the classes seen during fit
        # TODO It's definitely not optimal
        y = [list(self.classes_).index(a) for a in y]

        # Expose
        if hasattr(self, 'model_'):
            self.model_ += self.expose(subspaced_X, y)
        else:
            self.model_ = self.expose(subspaced_X, y)

        # HSV
        self._hue = np.argmax(self.model_, axis=2) / float(len(self.classes_))
        self._saturation = np.max(self.model_, axis=2) - \
            np.min(self.model_, axis=2)
        self._value = np.max(self.model_, axis=2)
        self._hsv = np.dstack((self._hue, self._saturation, self._value))

        # Calculate measures
        self._calculate_measures()
Example #8
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes

            self.ensemble_ = []
            self.weights_ = np.array([1])
            self.age_ = 0
            self.iterations_ = np.array([])
        """Partial fitting"""
        if self.age_ > 0:
            self.overall_accuracy = self.score(self.previous_X,
                                               self.previous_y)

        # Pre-pruning
        if len(self.ensemble_) > self.ensemble_size and not self.post_pruning:
            self._prune()

        # Preparing and training new candidate
        self.classes_ = classes
        candidate_clf = base.clone(self._base_clf)
        candidate_clf.fit(X, y)
        self.ensemble_.append(candidate_clf)
        self.iterations_ = np.append(self.iterations_, [1])

        self._set_weights()
        self._rejuvenate()
        self._aging()
        self._extinct()

        # Post-pruning
        if len(self.ensemble_) > self.ensemble_size and self.post_pruning:
            self._prune()

        # Weights normalization
        self.weights_ = self.weights_ / np.sum(self.weights_)

        # Ending procedure
        self.previous_X, self.previous_y = (X, y)
        self.age_ += 1
        self.iterations_ += 1
Example #9
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes
            self.ensemble_ = []

        self.X_, self.y_ = X, y

        # Preparing and training new candidate
        self.ensemble_.append(base.clone(self._base_clf).fit(self.X_, self.y_))

        if len(self.ensemble_) > self.ensemble_size:
            del self.ensemble_[0]
Example #10
    def partial_fit(self, X, y, classes=None):
        """Partially fit underlying estimators
        Should be used when there is not enough memory to train on all the
        data at once. Chunks of data can be passed over several iterations,
        where the first call should receive an array of all target classes.
        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.
        y : array-like, shape = [n_samples]
            Multi-class targets.
        classes : array, shape (n_classes, )
            Classes across all calls to partial_fit.
            Can be obtained via `np.unique(y_all)`, where y_all is the
            target vector of the entire dataset.
            This argument is only required in the first call of partial_fit
            and can be omitted in the subsequent calls.
        Returns
        -------
        self
        """
        if _check_partial_fit_first_call(self, classes):
            self.estimators_ = [
                clone(self.estimator)
                for i in range(self.n_classes_ * (self.n_classes_ - 1) // 2)
            ]

        if len(np.setdiff1d(y, self.classes_)):
            raise ValueError("Mini-batch contains {0} while it "
                             "must be subset of {1}".format(
                                 np.unique(y), self.classes_))

        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        check_classification_targets(y)
        combinations = itertools.combinations(range(self.n_classes_), 2)
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit_ovo_binary)(estimator, X, y, self.classes_[i],
                                             self.classes_[j])
            for estimator, (i, j) in izip(self.estimators_, (combinations)))

        self.pairwise_indices_ = None

        return self
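As a quick check of the sizing logic above: for n_classes classes, the one-vs-one scheme allocates n_classes * (n_classes - 1) // 2 estimators, one per unordered class pair:

import itertools

n_classes = 4
n_estimators = n_classes * (n_classes - 1) // 2            # 6
pairs = list(itertools.combinations(range(n_classes), 2))
# [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
assert len(pairs) == n_estimators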
Example #11
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes
            self.ensemble_ = []

        if not self.oversampled:
            self.X_, self.y_ = X, y
        else:
            ros = RandomOverSampler(random_state=42)
            self.X_, self.y_ = ros.fit_resample(X, y)

        # Preparing and training new candidate
        self.ensemble_.append(base.clone(self._base_clf).fit(self.X_, self.y_))

        if len(self.ensemble_) > self.ensemble_size:
            del self.ensemble_[0]
Example #12
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes
            self.ensemble_ = []
            self.previous_X = self.X_
            self.previous_y = self.y_

        # To think over
        # if len(self.ensemble_) > 1:
        #     test = self.region_of_competence_predict(X, n_neighbors=5)

        # Copy the old chunk
        self.previous_X = self.X_
        self.previous_y = self.y_

        # Preparing and training new candidate
        self.classes_ = classes
        candidate_clf = base.clone(self._base_clf)

        # Remove outliers
        # X_wo_outliers, y_wo_outliers = self.remove_outliers(X, y)

        candidate_clf.fit(X, y)
        self.ensemble_.append(candidate_clf)

        # Score base models
        base_scores = self.f1_score_base_classifiers(X, y)

        # Prune all classifiers below f1 threshold
        base_scores = self.prune_threshold(base_scores, threshold=0.94)

        # Prune the worst classifier if ensemble size exceeded
        _ = self.prune_worst_classifier(base_scores)
Example #13
    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Incremental fit on a batch of samples.
        This method is expected to be called several times consecutively
        on different chunks of a dataset so as to implement out-of-core
        or online learning.
        This is especially useful when the whole dataset is too big to fit in
        memory at once.
        This method has some performance overhead hence it is better to call
        partial_fit on chunks of data that are as large as possible
        (as long as fitting in the memory budget) to hide the overhead.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
        classes : array-like, shape = [n_classes] (default=None)
            List of all the classes that can possibly appear in the y vector.
            Must be provided at the first call to partial_fit, can be omitted
            in subsequent calls.
        sample_weight : array-like, shape = [n_samples] (default=None)
            Weights applied to individual samples (1. for unweighted).
        Returns
        -------
        self : object
            Returns self.
        """
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        _, n_features = X.shape

        if _check_partial_fit_first_call(self, classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_effective_classes = len(classes) if len(classes) > 1 else 2
            self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
            self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                           dtype=np.float64)
            self.complement_class_count_ = np.zeros(n_effective_classes,
                                                    dtype=np.float64)
            self.complement_feature_count_ = np.zeros(
                (n_effective_classes, n_features), dtype=np.float64)
        elif n_features != self.coef_.shape[1]:
            msg = "Number of features %d does not match previous data %d."
            raise ValueError(msg % (n_features, self.coef_.shape[-1]))

        Y = label_binarize(y, classes=self.classes_)
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        n_samples, n_classes = Y.shape

        if X.shape[0] != Y.shape[0]:
            msg = "X.shape[0]=%d and y.shape[0]=%d are incompatible."
            raise ValueError(msg % (X.shape[0], y.shape[0]))

        # label_binarize() returns arrays with dtype=np.int64.
        # We convert it to np.float64 to support sample_weight consistently
        Y = Y.astype(np.float64)
        if sample_weight is not None:
            sample_weight = np.atleast_2d(sample_weight)
            Y *= check_array(sample_weight).T

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        self._count(X, Y)

        # XXX: OPTIM: we could introduce a public finalization method to
        # be called by the user explicitly just once after several consecutive
        # calls to partial_fit and prior to any call to predict[_[log_]proba]
        # to avoid computing the smooth log probas at each call to partial fit
        alpha = self._check_alpha()
        self._update_feature_log_prob(alpha)
        self._update_class_log_prior(class_prior=class_prior)
        return self
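A small illustration of the binary special case the snippet guards against: with only two classes, label_binarize returns a single column, which the code widens to one column per class:

import numpy as np
from sklearn.preprocessing import label_binarize

Y = label_binarize([0, 1, 1, 0], classes=[0, 1])
print(Y.shape)                          # (4, 1): one column in the binary case
Y = np.concatenate((1 - Y, Y), axis=1)
print(Y.shape)                          # (4, 2): one column per class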
Example #14
    def fit(self, X, y, sample_weight=None):

        X, y = check_X_y(X, y, 'csr')
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64.
        # We convert it to np.float64 to support sample_weight consistently;
        # this means we also don't have to cast X to floating point
        Y = Y.astype(np.float64)
        if sample_weight is not None:
            sample_weight = np.atleast_2d(sample_weight)
            Y *= check_array(sample_weight).T

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        n_effective_classes = Y.shape[1]
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.feature_count_ = np.zeros((n_effective_classes, n_features), dtype=np.float64)
        self._count(X, Y)
        self._update_feature_log_prob()
        self._update_class_log_prior(class_prior=class_prior)
        return self

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        _, n_features = X.shape

        self.coef_ = self._get_coef()
        #self.intercept_ = self._get_intercept()

        if _check_partial_fit_first_call(self, classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_effective_classes = len(classes) if len(classes) > 1 else 2
            self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
            self.feature_count_ = np.zeros((n_effective_classes, n_features), dtype=np.float64)
        elif n_features != self.coef_.shape[1]:
            msg = "Number of features %d does not match previous data %d."
            raise ValueError(msg % (n_features, self.coef_.shape[-1]))

        Y = label_binarize(y, classes=self.classes_)
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        n_samples, n_classes = Y.shape

        if X.shape[0] != Y.shape[0]:
            msg = "X.shape[0]=%d and y.shape[0]=%d are incompatible."
            raise ValueError(msg % (X.shape[0], y.shape[0]))

        # label_binarize() returns arrays with dtype=np.int64.
        # We convert it to np.float64 to support sample_weight consistently
        Y = Y.astype(np.float64)
        if sample_weight is not None:
            sample_weight = np.atleast_2d(sample_weight)
            Y *= check_array(sample_weight).T

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        self._count(X, Y)

        # XXX: OPTIM: we could introduce a public finalization method to
        # be called by the user explicitly just once after several consecutive
        # calls to partial_fit and prior any call to predict[_[log_]proba]
        # to avoid computing the smooth log probas at each call to partial fit
        self._update_feature_log_prob()
        self._update_class_log_prior(class_prior=class_prior)
        return self
Example #15
    def partial_fit(self, X: Union[np.array, pd.DataFrame], y: Union[np.array, pd.Series],
                    classes: Union[list, np.ndarray] = None):
        """
        Fit a single DTC using the given subset of X and y.

        Passes the subset to fit, rather than using the same data each time. Wrap with Dask Incremental to handle
        subset feeding.

        The first call needs to be supplied with the expected classes (similar to existing models with .partial_fit())
        in case not all classes are present in the first subset.
        TODO: This is currently expected on every call, but an alternative could be checked with the existing sklearn
        mechanism.

        Additionally, the case where not all classes are present in the first or subsequent subsets needs to be
        handled. For the RandomForestClassifier, tree predictions are averaged in the
        sklearn.ensemble.forest.accumulate_prediction function. This sums the output matrix with dimensions
        n rows x n classes and fails if the class dimension differs.
        The class dimension is defined at the individual estimator level during the .fit() call, which sets the
        following attributes:
            - self.n_outputs_ = y.shape[1], which is then used by _validate_y_class_weight(), always called in .fit()
              to set:
                - self.classes_
                - self.n_classes_

        This object sets classes_ and n_classes_ depending on the supplied classes. The individual trees set theirs
        depending on the data available in the subset. The predict_proba method is modified to standardise its shape
        to the dimensions defined in this object.

        :param X:
        :param y:
        :return:
        """

        # Set classes for forest (this only needs to be done once).
        # Not for each individual tree, these will be set by .fit() using the classes available in the subset.
        # Check classes_ is set, or provided
        # Returns false if nothing to do
        classes_need_setting = _check_partial_fit_first_call(self, classes)

        # If classes not set, set
        # Above will error if not set and classes = None
        if classes_need_setting:
            self.classes_ = np.array(classes)
            self.n_classes_ = len(classes)

        # Fit the next estimator, if not done
        if self._fit_estimators < self.max_n_estimators:
            t0 = time.time()
            self.fit(X, y)
            t1 = time.time()

            if self.verbose > 0:
                print(f"Fit estimators {self._fit_estimators} - {self._fit_estimators + self.n_estimators_per_chunk} "
                      f"/ {self.max_n_estimators}")
                print(f"Fit time: {round(t1 - t0, 2)}")
                print(len(self.estimators_))
            self._fit_estimators += self.n_estimators_per_chunk

            # If still not done, prep to fit next
            if self._fit_estimators < self.max_n_estimators:
                self.n_estimators += self.n_estimators_per_chunk

        else:
            if self.verbose > 0:
                print('Done')
            return self
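The docstring above describes standardising predict_proba output to the forest-level class dimension. A hypothetical helper sketching that idea (pad_probas is not part of the project):

import numpy as np

def pad_probas(probas, tree_classes, all_classes):
    """Zero-fill probability columns for classes a single tree never saw,
    so every tree reports probabilities in the forest-level class grid."""
    full = np.zeros((probas.shape[0], len(all_classes)))
    cols = np.searchsorted(all_classes, tree_classes)  # assumes sorted classes
    full[:, cols] = probas
    return full

# e.g. a tree trained only on classes [0, 2] out of [0, 1, 2]:
p = pad_probas(np.array([[0.7, 0.3]]), np.array([0, 2]), np.array([0, 1, 2]))
# array([[0.7, 0. , 0.3]])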
Example #16
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        X, y = check_X_y(X, y)

        # If the ratio of data variance between dimensions is too small, it
        # will cause numerical errors. To address this, we artificially
        # boost the variance by epsilon, a small fraction of the standard
        # deviation of the largest dimension.
        epsilon = 1e-9 * np.var(X, axis=0).max()

        if _refit:
            self.classes_ = None

        if _check_partial_fit_first_call(self, classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = np.zeros((n_classes, n_features))
            self.sigma_ = np.zeros((n_classes, n_features))
            self.class_prior_ = np.zeros(n_classes)
            self.class_count_ = np.zeros(n_classes)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                msg = "Number of features %d does not match previous data %d."
                raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))
            # Put epsilon back in each time
            self.sigma_[:, :] -= epsilon

        classes = self.classes_

        unique_y = np.unique(y)
        unique_y_in_classes = in1d(unique_y, classes)

        if not np.all(unique_y_in_classes):
            raise ValueError("The target label(s) %s in y do not exist in the "
                             "initial classes %s" %
                             (y[~unique_y_in_classes], classes))

        for y_i in unique_y:
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]

            if sample_weight is not None:
                sw_i = sample_weight[y == y_i]
                N_i = sw_i.sum()
            else:
                sw_i = None
                N_i = X_i.shape[0]

            new_theta, new_sigma = self._update_mean_variance(
                self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
                X_i, sw_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += N_i

        self.sigma_[:, :] += epsilon
        self.class_prior_[:] = self.class_count_ / np.sum(self.class_count_)
        return self
Example #17
    def _partial_fit(self, X, y, classes=None):
        _check_partial_fit_first_call(self, classes)

        super(MLPClassifier, self)._partial_fit(X, y)

        return self
Example #18
File: EE.py Project: w4k2/exposing
    def partial_fit(self, X, y, classes=None):
        if _check_partial_fit_first_call(self, classes):
            self.fit(X, y)
        else:
            for e in self.ensemble_:
                e.partial_fit(X, y)
Example #19
    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Incremental fit on a batch of samples.
        This method is expected to be called several times consecutively
        on different chunks of a dataset so as to implement out-of-core
        or online learning.
        This is especially useful when the whole dataset is too big to fit in
        memory at once.
        This method has some performance overhead hence it is better to call
        partial_fit on chunks of data that are as large as possible
        (as long as fitting in the memory budget) to hide the overhead.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
        classes : array-like, shape = [n_classes] (default=None)
            List of all the classes that can possibly appear in the y vector.
            Must be provided at the first call to partial_fit, can be omitted
            in subsequent calls.
        sample_weight : array-like, shape = [n_samples] (default=None)
            Weights applied to individual samples (1. for unweighted).
        Returns
        -------
        self : object
            Returns self.
        """
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        _, n_features = X.shape

        if _check_partial_fit_first_call(self, classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_effective_classes = len(classes) if len(classes) > 1 else 2
            self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
            self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                           dtype=np.float64)
            self.complement_class_count_ = np.zeros(n_effective_classes,
                                                    dtype=np.float64)
            self.complement_feature_count_ = np.zeros(
                (n_effective_classes, n_features), dtype=np.float64)
        elif n_features != self.coef_.shape[1]:
            msg = "Number of features %d does not match previous data %d."
            raise ValueError(msg % (n_features, self.coef_.shape[-1]))

        Y = label_binarize(y, classes=self.classes_)
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        n_samples, n_classes = Y.shape

        if X.shape[0] != Y.shape[0]:
            msg = "X.shape[0]=%d and y.shape[0]=%d are incompatible."
            raise ValueError(msg % (X.shape[0], y.shape[0]))

        # label_binarize() returns arrays with dtype=np.int64.
        # We convert it to np.float64 to support sample_weight consistently
        Y = Y.astype(np.float64)
        if sample_weight is not None:
            sample_weight = np.atleast_2d(sample_weight)
            Y *= check_array(sample_weight).T

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        self._count(X, Y)

        # XXX: OPTIM: we could introduce a public finalization method to
        # be called by the user explicitly just once after several consecutive
        # calls to partial_fit and prior to any call to predict[_[log_]proba]
        # to avoid computing the smooth log probas at each call to partial fit
        alpha = self._check_alpha()
        self._update_feature_log_prob(alpha)
        self._update_class_log_prior(class_prior=class_prior)
        return self
Example #20
    def _partial_fit(self, X, y, classes=None):
        _check_partial_fit_first_call(self, classes)

        super(MLPClassifier, self)._partial_fit(X, y)

        return self
Example #21
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes
            self.ensemble_ = []

        self.X_, self.y_ = X, y

        train_X, train_y = X, y

        unique, counts = np.unique(train_y, return_counts=True)

        k_neighbors = 5
        if counts[0] - 1 < 5:
            k_neighbors = counts[0] - 1

        if self.oversampler == "SMOTE" and k_neighbors > 0:
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            train_X, train_y = smote.fit_resample(train_X, train_y)
        elif self.oversampler == "svmSMOTE" and k_neighbors > 0:
            try:
                svmSmote = SVMSMOTE(random_state=42, k_neighbors=k_neighbors)
                train_X, train_y = svmSmote.fit_resample(train_X, train_y)
            except ValueError:
                pass
        elif self.oversampler == "borderline1" and k_neighbors > 0:
            borderlineSmote1 = BorderlineSMOTE(random_state=42,
                                               k_neighbors=k_neighbors,
                                               kind='borderline-1')
            train_X, train_y = borderlineSmote1.fit_resample(train_X, train_y)
        elif self.oversampler == "borderline2" and k_neighbors > 0:
            borderlineSmote2 = BorderlineSMOTE(random_state=42,
                                               k_neighbors=k_neighbors,
                                               kind='borderline-2')
            train_X, train_y = borderlineSmote2.fit_resample(train_X, train_y)
        elif self.oversampler == "ADASYN" and k_neighbors > 0:
            try:
                adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
                train_X, train_y = adasyn.fit_resample(train_X, train_y)
            except RuntimeError:
                pass
        elif self.oversampler == "SLS" and k_neighbors > 0:
            sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
            train_X, train_y = sls.sample(train_X, train_y)

        # Testing all models
        scores = np.array([ba(y, clf.predict(X)) for clf in self.ensemble_])

        # Pruning
        if len(self.ensemble_) > 1:
            alpha_good = scores > (0.5 + self.alpha)
            self.ensemble_ = [
                self.ensemble_[i] for i in np.where(alpha_good)[0]
            ]

        if len(self.ensemble_) > self.ensemble_size - 1:
            worst = np.argmin(scores)
            del self.ensemble_[worst]

        # Preparing and training new candidate
        self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))
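A quick numeric illustration of the pruning rule used above: with alpha = 0.05, ensemble members whose balanced accuracy does not beat chance (0.5) by more than alpha are dropped. The score values are illustrative:

import numpy as np

scores = np.array([0.48, 0.55, 0.61, 0.72])
alpha = 0.05
alpha_good = scores > (0.5 + alpha)       # [False, False, True, True]
kept = np.where(alpha_good)[0]            # array([2, 3])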
Example #22
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        """Actual implementation of Gaussian NB fitting.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target values.

        classes : array-like, shape (n_classes,), optional (default=None)
            List of all the classes that can possibly appear in the y vector.

            Must be provided at the first call to partial_fit, can be omitted
            in subsequent calls.

        _refit : bool, optional (default=False)
            If true, act as though this were the first time we called
            _partial_fit (i.e., throw away any past fitting and start over).

        sample_weight : array-like, shape (n_samples,), optional (default=None)
            Weights applied to individual samples (1. for unweighted).

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_X_y(X, y)
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            check_consistent_length(y, sample_weight)

        # If the ratio of data variance between dimensions is too small, it
        # will cause numerical errors. To address this, we artificially
        # boost the variance by epsilon, a small fraction of the standard
        # deviation of the largest dimension.
        epsilon = 1e-9 * np.var(X, axis=0).max()

        if _refit:
            self.classes_ = None

        if _check_partial_fit_first_call(self, classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = np.zeros((n_classes, n_features))
            self.sigma_ = np.zeros((n_classes, n_features))

            self.class_count_ = np.zeros(n_classes, dtype=np.float64)

            # Initialise the class prior
            n_classes = len(self.classes_)
            # Take into account the priors
            if self.priors is not None:
                priors = np.asarray(self.priors)
                # Check that the provided priors match the number of classes
                if len(priors) != n_classes:
                    raise ValueError('Number of priors must match number of'
                                     ' classes.')
                # Check that the sum is 1
                if not np.isclose(priors.sum(), 1.0):
                    raise ValueError('The sum of the priors should be 1.')
                # Check that the priors are non-negative
                if (priors < 0).any():
                    raise ValueError('Priors must be non-negative.')
                self.class_prior_ = priors
            else:
                # Initialize the priors to zeros for each class
                self.class_prior_ = np.zeros(len(self.classes_),
                                             dtype=np.float64)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                msg = "Number of features %d does not match previous data %d."
                raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))
            # Put epsilon back in each time
            self.sigma_[:, :] -= epsilon

        classes = self.classes_

        unique_y = np.unique(y)
        unique_y_in_classes = np.in1d(unique_y, classes)

        if not np.all(unique_y_in_classes):
            raise ValueError("The target label(s) %s in y do not exist in the "
                             "initial classes %s" %
                             (unique_y[~unique_y_in_classes], classes))

        for y_i in unique_y:
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]

            if sample_weight is not None:
                sw_i = sample_weight[y == y_i]
                N_i = sw_i.sum()
            else:
                sw_i = None
                N_i = X_i.shape[0]

            new_theta, new_sigma = self._update_mean_variance(
                self.class_count_[i], self.theta_[i, :], self.sigma_[i, :],
                X_i, sw_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += N_i

        self.sigma_[:, :] += epsilon

        # Only update if no priors are provided
        if self.priors is None:
            # Empirical prior, with sample_weight taken into account
            self.class_prior_ = self.class_count_ / self.class_count_.sum()

        return self
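A short illustration of the epsilon variance boost explained in the comment above: tying the floor to the largest per-feature variance keeps near-constant features from producing zero variances in the Gaussian likelihood. The data values are illustrative:

import numpy as np

X = np.array([[1.0, 100.0],
              [1.0, 200.0],
              [1.0, 300.0]])
epsilon = 1e-9 * np.var(X, axis=0).max()  # tiny fraction of the largest variance
var = np.var(X, axis=0) + epsilon         # first feature no longer exactly 0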
Example #23
    def _partial_fit(self,
                     X,
                     y,
                     classes=None,
                     _refit=False,
                     sample_weight=None):
        self.accountant.check(self.epsilon, 0)

        if sample_weight is not None:
            warn_unused_args("sample_weight")

        X, y = check_X_y(X, y)

        if self.bounds is None:
            warnings.warn(
                "Bounds have not been specified and will be calculated on the data provided. This will "
                "result in additional privacy leakage. To ensure differential privacy and no additional "
                "privacy leakage, specify bounds for each dimension.",
                PrivacyLeakWarning)
            self.bounds = (np.min(X, axis=0), np.max(X, axis=0))

        self.bounds = check_bounds(self.bounds, shape=X.shape[1])
        X = clip_to_bounds(X, self.bounds)

        self.epsilon_ = self.var_smoothing

        if _refit:
            self.classes_ = None

        if _check_partial_fit_first_call(self, classes):
            n_features = X.shape[1]
            n_classes = len(self.classes_)
            self.theta_ = np.zeros((n_classes, n_features))
            self.sigma_ = np.zeros((n_classes, n_features))

            self.class_count_ = np.zeros(n_classes, dtype=np.float64)

            if self.priors is not None:
                priors = np.asarray(self.priors)

                if len(priors) != n_classes:
                    raise ValueError(
                        "Number of priors must match number of classes.")
                if not np.isclose(priors.sum(), 1.0):
                    raise ValueError("The sum of the priors should be 1.")
                if (priors < 0).any():
                    raise ValueError("Priors must be non-negative.")
                self.class_prior_ = priors
            else:
                # Initialize the priors to zeros for each class
                self.class_prior_ = np.zeros(len(self.classes_),
                                             dtype=np.float64)
        else:
            if X.shape[1] != self.theta_.shape[1]:
                raise ValueError(
                    "Number of features %d does not match previous data %d." %
                    (X.shape[1], self.theta_.shape[1]))
            # Put epsilon back in each time
            self.sigma_[:, :] -= self.epsilon_

        classes = self.classes_

        unique_y = np.unique(y)
        unique_y_in_classes = np.in1d(unique_y, classes)

        if not np.all(unique_y_in_classes):
            raise ValueError(
                "The target label(s) %s in y do not exist in the initial classes %s"
                % (unique_y[~unique_y_in_classes], classes))

        noisy_class_counts = self._noisy_class_counts(y)

        for _i, y_i in enumerate(unique_y):
            i = classes.searchsorted(y_i)
            X_i = X[y == y_i, :]

            n_i = noisy_class_counts[_i]

            new_theta, new_sigma = self._update_mean_variance(
                self.class_count_[i],
                self.theta_[i, :],
                self.sigma_[i, :],
                X_i,
                n_noisy=n_i)

            self.theta_[i, :] = new_theta
            self.sigma_[i, :] = new_sigma
            self.class_count_[i] += n_i

        self.sigma_[:, :] += self.epsilon_

        # Only update if no priors are provided
        if self.priors is None:
            # Empirical prior (sample_weight is not supported in this variant)
            self.class_prior_ = self.class_count_ / self.class_count_.sum()

        self.accountant.spend(self.epsilon, 0)

        return self
Example #24
    def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter,
                     classes, sample_weight, coef_init, intercept_init,
                     per_feature_alpha, per_feature_beta, modal_vector):
        X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C")

        n_samples, n_features = X.shape

        _check_partial_fit_first_call(self, classes)

        n_classes = self.classes_.shape[0]

        # Allocate data structures from input arguments
        self._expanded_class_weight = compute_class_weight(
            self.class_weight, self.classes_, y)
        sample_weight = self._validate_sample_weight(sample_weight, n_samples)

        fitted = getattr(self, "coef_", None) is not None or \
                 coef_init is not None

        if getattr(self, "coef_", None) is None or coef_init is not None:
            self._allocate_parameter_mem(n_classes, n_features, coef_init,
                                         intercept_init)
        elif n_features != self.coef_.shape[-1]:
            raise ValueError("Number of features %d does not match previous "
                             "data %d." % (n_features, self.coef_.shape[-1]))

        # NOTE: put initialization of the 3 additional vectors here,
        # might make more sense to put in self._allocate_parameter_mem
        if modal_vector is None:
            modal_vector = np.zeros((n_classes, n_features))
        elif modal_vector.shape[-1] != n_features:
            raise ValueError('Shape of modal_vector must be the same as '
                             'the coefficient vectors')
        elif len(modal_vector.shape) == 1:
            # passing single vector in
            modal_vector = np.stack([modal_vector for _ in range(n_classes)])

        l1_ratio = self.l1_ratio
        if self.penalty == 'l2':
            l1_ratio = 0.0
        elif self.penalty == 'l1':
            l1_ratio = 1.0

        if per_feature_alpha is None and per_feature_beta is None:
            if per_feature_alpha is None:
                per_feature_alpha = np.ones(
                    self.coef_.shape) * alpha * (1.0 - l1_ratio)
            if per_feature_beta is None:
                per_feature_beta = np.ones(self.coef_.shape) * alpha * l1_ratio
        else:
            if per_feature_alpha is None:
                per_feature_alpha = np.zeros(self.coef_.shape)
            elif self.penalty == 'l1':
                raise ValueError('Penalty set to l1 but per_feature_alpha '
                                 'is still provided')
            if per_feature_beta is None:
                per_feature_beta = np.zeros(self.coef_.shape)
            elif self.penalty == 'l2':
                raise ValueError('Penalty set to l2 but per_feature_beta '
                                 'is still provided')

        if per_feature_alpha.shape[-1] != n_features:
            raise ValueError('Shape of per_feature_alpha must be the same as '
                             'the coefficient vectors')
        if per_feature_beta.shape[-1] != n_features:
            raise ValueError('Shape of per_feature_beta must be the same as '
                             'the coefficient vectors')

        self.loss_function_ = self._get_loss_function(loss)
        if not hasattr(self, "t_"):
            self.t_ = 1.0

        # delegate to concrete training procedure
        if n_classes > 2:
            if not fitted:
                self.coef_ = modal_vector.copy()
            self._fit_multiclass(X,
                                 y,
                                 alpha=alpha,
                                 C=C,
                                 per_feature_alpha=per_feature_alpha,
                                 per_feature_beta=per_feature_beta,
                                 modal_vector=modal_vector,
                                 learning_rate=learning_rate,
                                 sample_weight=sample_weight,
                                 max_iter=max_iter)
        elif n_classes == 2:
            if not fitted:
                # print("overwrite initial weight vector with modal vector")
                self.coef_ = modal_vector[0].copy()
            self._fit_binary(X,
                             y,
                             alpha=alpha,
                             C=C,
                             per_feature_alpha=per_feature_alpha,
                             per_feature_beta=per_feature_beta,
                             modal_vector=modal_vector[0],
                             learning_rate=learning_rate,
                             sample_weight=sample_weight,
                             max_iter=max_iter)
        else:
            raise ValueError(
                "The number of classes has to be greater than one;"
                " got %d class" % n_classes)

        return self
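A minimal restatement of the penalty-to-l1_ratio mapping used above; split_alpha is a hypothetical helper for illustration, not part of the example's project:

def split_alpha(alpha, penalty, l1_ratio):
    """Split alpha into its L2 and L1 components as the snippet above does."""
    if penalty == 'l2':
        l1_ratio = 0.0
    elif penalty == 'l1':
        l1_ratio = 1.0
    return alpha * (1.0 - l1_ratio), alpha * l1_ratio

# e.g. elasticnet with l1_ratio=0.15 splits alpha as
# (0.85 * alpha for the L2 term, 0.15 * alpha for the L1 term)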