Beispiel #1
0
    def approximate(self, X):

        # todo: X may be optional
        # todo: allow to use a list of scores for approximation, instead of
        # todo: decision_scores

        self.approx_flags, _ = build_codes(self.n_estimators,
                                           self.base_estimators,
                                           self.approx_clf_list,
                                           self.approx_ng_clf_list,
                                           self.approx_flag_global)

        n_estimators_list, starts, n_jobs = _partition_estimators(
            self.n_estimators, n_jobs=self.n_jobs)

        all_approx_results = Parallel(n_jobs=n_jobs, verbose=True)(
            delayed(_parallel_approx_estimators)(
                n_estimators_list[i],
                self.base_estimators[starts[i]:starts[i + 1]],
                X,  # if it is a PyOD model, we do not need this
                self.n_estimators,
                self.approx_flags[starts[i]:starts[i + 1]],
                self.approx_clf,
                self.jl_transformers_[starts[i]:starts[i + 1]],
                verbose=True)
            for i in range(n_jobs))

        # print('Balanced Scheduling Total Test Time:', time.time() - start)

        self.approximators = _unfold_parallel(all_approx_results, n_jobs)
        return self
Beispiel #2
0
    def approximate(self, X):
        """Use the supervised regressor (random forest by default) to
        approximate unsupervised fitted outlier detectors.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. The same feature space of the unsupervised
            outlier detector will be used.

        Returns
        -------
        self : object
            The estimator after with approximation.
        """

        # todo: X may be optional
        # todo: allow to use a list of scores for approximation, instead of
        # todo: decision_scores

        self.approx_flags, _ = build_codes(self.base_estimators,
                                           self.approx_clf_list,
                                           self.approx_ng_clf_list,
                                           self.approx_flag_global)

        n_estimators_list, starts, n_jobs = _partition_estimators(
            self.n_estimators, n_jobs=self.n_jobs)

        all_approx_results = Parallel(n_jobs=n_jobs, verbose=True)(
            delayed(_parallel_approx_estimators)(
                n_estimators_list[i],
                self.base_estimators[starts[i]:starts[i + 1]],
                X,  # if it is a PyOD model, we do not need this
                self.n_estimators,
                self.approx_flags[starts[i]:starts[i + 1]],
                self.approx_clf,
                self.jl_transformers_[starts[i]:starts[i + 1]],
                verbose=True) for i in range(n_jobs))

        # print('Balanced Scheduling Total Test Time:', time.time() - start)

        self.approximators = _unfold_parallel(all_approx_results, n_jobs)
        return self
Beispiel #3
0
    start = time.time()
    predicted_labels = model.predict(X_test)  # predict labels
    print('Predict time:', time.time() - start)
    print()

    start = time.time()
    predicted_scores = model.decision_function(X_test)  # predict scores
    print('Decision Function time:', time.time() - start)
    print()

    ##########################################################################
    # compare with no projection, no bps, and no approximation
    print("******************************************************************")
    start = time.time()
    n_estimators = len(base_estimators)
    n_estimators_list, starts, n_jobs = _partition_estimators(
        n_estimators, n_jobs)

    rp_flags = np.zeros([n_estimators, 1])
    approx_flags = np.zeros([n_estimators, 1])
    objective_dim = None
    rp_method = None

    all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
        delayed(_parallel_fit)(n_estimators_list[i],
                               base_estimators[starts[i]:starts[i + 1]],
                               X_train,
                               n_estimators,
                               rp_flags[starts[i]:starts[i + 1]],
                               objective_dim,
                               rp_method=rp_method,
                               verbose=True) for i in range(n_jobs))
Beispiel #4
0
    def decision_function(self, X):
        """Predict raw anomaly scores of X using the fitted detectors.

        The anomaly score of an input sample is computed based on the fitted
        detector. For consistency, outliers are assigned with
        higher anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]

        # decide whether bps is needed
        # it is turned off
        if self.bps_flag:
            # load the pre-trained cost predictor to forecast the train cost
            cost_predictor = joblib.load(self.cost_forecast_loc_pred_)

            time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                                self.base_estimator_names)

            n_estimators_list, starts, n_jobs = balanced_scheduling(
                time_cost_pred, self.n_estimators, self.n_jobs)
        else:
            # use simple equal split by sklearn
            n_estimators_list, starts, n_jobs = _partition_estimators(
                self.n_estimators, self.n_jobs)

        # fit the base models
        if self.verbose:
            print('Parallel score prediction...')
            start = time.time()

        # TODO: code cleanup. There is an existing bug for joblib on Windows:
        # https://github.com/joblib/joblib/issues/806
        # max_nbytes can be dropped on other OS
        all_results_scores = Parallel(n_jobs=n_jobs, max_nbytes=None,
                                      verbose=True)(
            delayed(_parallel_decision_function)(
                n_estimators_list[i],
                self.base_estimators[starts[i]:starts[i + 1]],
                self.approximators[starts[i]:starts[i + 1]],
                X,
                self.n_estimators,
                # self.rp_flags[starts[i]:starts[i + 1]],
                self.jl_transformers_[starts[i]:starts[i + 1]],
                self.approx_flags[starts[i]:starts[i + 1]],
                verbose=True)
            for i in range(n_jobs))

        # fit the base models
        if self.verbose:
            print('Parallel Score Prediction without Approximators '
                  'Total Time:', time.time() - start)

        # unfold and generate the label matrix
        predicted_scores = np.zeros([n_samples, self.n_estimators])
        for i in range(n_jobs):
            predicted_scores[:, starts[i]:starts[i + 1]] = np.asarray(
                all_results_scores[i]).T

        return predicted_scores
Beispiel #5
0
    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        labels : numpy array of shape (n_samples,)
            Class labels for each data sample.
        """
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]

        # decide whether bps is needed
        # it is turned off
        if self.bps_flag:
            # load the pre-trained cost predictor to forecast the train cost
            cost_predictor = joblib.load(self.cost_forecast_loc_pred_)

            time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                                self.base_estimator_names)

            n_estimators_list, starts, n_jobs = balanced_scheduling(
                time_cost_pred, self.n_estimators, self.n_jobs)
        else:
            # use simple equal split by sklearn
            n_estimators_list, starts, n_jobs = _partition_estimators(
                self.n_estimators, self.n_jobs)

        # fit the base models
        print('Parallel label prediction...')
        start = time.time()

        # TODO: code cleanup. There is an existing bug for joblib on Windows:
        # https://github.com/joblib/joblib/issues/806
        # max_nbytes can be dropped on other OS
        all_results_pred = Parallel(n_jobs=n_jobs, max_nbytes=None,
                                    verbose=True)(
            delayed(_parallel_predict)(
                n_estimators_list[i],
                self.base_estimators[starts[i]:starts[i + 1]],
                self.approximators[starts[i]:starts[i + 1]],
                X,
                self.n_estimators,
                # self.rp_flags[starts[i]:starts[i + 1]],
                self.jl_transformers_[starts[i]:starts[i + 1]],
                self.approx_flags[starts[i]:starts[i + 1]],
                self.contamination,
                verbose=True)
            for i in range(n_jobs))

        print('Parallel Label Predicting without Approximators Total Time:',
              time.time() - start)

        # unfold and generate the label matrix
        predicted_labels = np.zeros([n_samples, self.n_estimators])
        for i in range(n_jobs):
            predicted_labels[:, starts[i]:starts[i + 1]] = np.asarray(
                all_results_pred[i]).T

        return predicted_labels
Beispiel #6
0
    def fit(self, X, y=None):
        """Fit estimator. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).

        Returns
        -------
        self
        """
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]

        # Validate max_features for random projection
        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            self.max_features_ = self.max_features
        else:  # float
            self.max_features_ = int(self.max_features * n_features)

        # build flags for random projection
        self.rp_flags_, _ = build_codes(
            self.n_estimators, self.base_estimators, self.rp_clf_list,
            self.rp_ng_clf_list, self.rp_flag_global)

        # decide whether bps is needed
        # it is turned off
        if self.bps_flag:
            # load the pre-trained cost predictor to forecast the train cost
            cost_predictor = joblib.load(self.cost_forecast_loc_fit_)

            time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                                self.base_estimator_names)

            # use BPS
            n_estimators_list, starts, n_jobs = balanced_scheduling(
                time_cost_pred, self.n_estimators, self.n_jobs)
        else:
            # use the default sklearn equal split
            n_estimators_list, starts, n_jobs = _partition_estimators(
                self.n_estimators, self.n_jobs)

        # fit the base models
        print('Parallel Training...')
        start = time.time()

        # TODO: code cleanup. There is an existing bug for joblib on Windows:
        # https://github.com/joblib/joblib/issues/806
        # max_nbytes can be dropped on other OS
        all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
            delayed(_parallel_fit)(
                n_estimators_list[i],
                self.base_estimators[starts[i]:starts[i + 1]],
                X,
                self.n_estimators,
                self.rp_flags[starts[i]:starts[i + 1]],
                self.max_features_,
                self.rp_method,
                verbose=self.verbose)
            for i in range(n_jobs))

        print('Balanced Scheduling Total Train Time:', time.time() - start)

        # reformat and unfold the lists. Save the trained estimators and transformers
        all_results = list(map(list, zip(*all_results)))

        # overwrite estimators
        self.base_estimators = _unfold_parallel(all_results[0], n_jobs)
        self.jl_transformers_ = _unfold_parallel(all_results[1], n_jobs)

        return self
Beispiel #7
0
    def predict_proba(self, X):
        """Predict the probability of a sample being outlier. Two approaches
        are possible:

        1. simply use Min-max conversion to linearly transform the outlier
           scores into the range of [0,1]. The model must be
           fitted first.
        2. use unifying scores, see :cite:`kriegel2011interpreting`.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        method : str, optional (default='linear')
            probability conversion method. It must be one of
            'linear' or 'unify'.

        Returns
        -------
        outlier_probability : numpy array of shape (n_samples,)
            For each observation, tells whether or not
            it should be considered as an outlier according to the
            fitted model. Return the outlier probability, ranging
            in [0,1].
        """
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]

        # decide whether bps is needed
        # it is turned off
        if self.bps_flag:
            # load the pre-trained cost predictor to forecast the train cost
            cost_predictor = joblib.load(self.cost_forecast_loc_pred_)

            time_cost_pred = cost_forecast_meta(cost_predictor, X,
                                                self.base_estimator_names)

            n_estimators_list, starts, n_jobs = balanced_scheduling(
                time_cost_pred, self.n_estimators, self.n_jobs)
        else:
            # use simple equal split by sklearn
            n_estimators_list, starts, n_jobs = _partition_estimators(
                self.n_estimators, self.n_jobs)

        # fit the base models
        if self.verbose:
            print('Parallel score prediction...')
            start = time.time()

        # TODO: code cleanup. There is an existing bug for joblib on Windows:
        # https://github.com/joblib/joblib/issues/806
        # max_nbytes can be dropped on other OS
        all_results_scores = Parallel(
            n_jobs=n_jobs, max_nbytes=None, verbose=True)(
                delayed(_parallel_predict_proba)(
                    n_estimators_list[i],
                    self.base_estimators[starts[i]:starts[i + 1]],
                    self.approximators[starts[i]:starts[i + 1]],
                    X,
                    self.n_estimators,
                    # self.rp_flags[starts[i]:starts[i + 1]],
                    self.jl_transformers_[starts[i]:starts[i + 1]],
                    self.approx_flags[starts[i]:starts[i + 1]],
                    verbose=True) for i in range(n_jobs))

        # fit the base models
        if self.verbose:
            print(
                'Parallel Score Prediction without Approximators '
                'Total Time:',
                time.time() - start)

        # unfold and generate the label matrix
        predicted_scores = np.zeros([n_samples, self.n_estimators])
        for i in range(n_jobs):
            predicted_scores[:, starts[i]:starts[i + 1]] = np.asarray(
                all_results_scores[i]).T

        return predicted_scores