def approximate(self, X):
    """Build approximators for the base estimators in parallel.

    The approximation flags are produced by ``build_codes`` and stored in
    ``self.approx_flags``. The estimators are then split into one chunk per
    job, every chunk is handed to ``_parallel_approx_estimators``, and the
    unfolded results are stored in ``self.approximators``.

    Parameters
    ----------
    X : object
        The input samples; passed unchanged to every worker.

    Returns
    -------
    self : object
        The estimator itself, to allow call chaining.
    """
    # todo: X may be optional
    # todo: allow to use a list of scores for approximation, instead of
    # todo: decision_scores
    flags, _ = build_codes(
        self.n_estimators,
        self.base_estimators,
        self.approx_clf_list,
        self.approx_ng_clf_list,
        self.approx_flag_global,
    )
    self.approx_flags = flags

    chunk_sizes, starts, n_jobs = _partition_estimators(
        self.n_estimators, n_jobs=self.n_jobs)

    # One delayed call per job; chunk ``idx`` covers the estimators in
    # the half-open index range [starts[idx], starts[idx + 1]).
    jobs = []
    for idx in range(n_jobs):
        lo, hi = starts[idx], starts[idx + 1]
        jobs.append(delayed(_parallel_approx_estimators)(
            chunk_sizes[idx],
            self.base_estimators[lo:hi],
            X,  # if it is a PyOD model, we do not need this
            self.n_estimators,
            self.approx_flags[lo:hi],
            self.approx_clf,
            self.jl_transformers_[lo:hi],
            verbose=True))

    chunk_results = Parallel(n_jobs=n_jobs, verbose=True)(jobs)

    self.approximators = _unfold_parallel(chunk_results, n_jobs)
    return self
def approximate(self, X):
    """Approximate the fitted unsupervised detectors with supervised models.

    A supervised regressor (random forest by default) is used to
    approximate the unsupervised fitted outlier detectors. The work is
    split into one chunk of estimators per job and run through
    ``_parallel_approx_estimators``.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples. The same feature space of the unsupervised
        outlier detector will be used.

    Returns
    -------
    self : object
        The estimator, with ``approx_flags`` and ``approximators`` set.
    """
    # todo: X may be optional
    # todo: allow to use a list of scores for approximation, instead of
    # todo: decision_scores

    # NOTE(review): other call sites in this file pass self.n_estimators
    # as the first argument to build_codes -- confirm which signature is
    # current.
    flags, _ = build_codes(
        self.base_estimators,
        self.approx_clf_list,
        self.approx_ng_clf_list,
        self.approx_flag_global,
    )
    self.approx_flags = flags

    chunk_sizes, starts, n_jobs = _partition_estimators(
        self.n_estimators, n_jobs=self.n_jobs)

    # One delayed call per job; chunk ``idx`` covers the estimators in
    # the half-open index range [starts[idx], starts[idx + 1]).
    jobs = []
    for idx in range(n_jobs):
        lo, hi = starts[idx], starts[idx + 1]
        jobs.append(delayed(_parallel_approx_estimators)(
            chunk_sizes[idx],
            self.base_estimators[lo:hi],
            X,  # if it is a PyOD model, we do not need this
            self.n_estimators,
            self.approx_flags[lo:hi],
            self.approx_clf,
            self.jl_transformers_[lo:hi],
            verbose=True))

    chunk_results = Parallel(n_jobs=n_jobs, verbose=True)(jobs)

    self.approximators = _unfold_parallel(chunk_results, n_jobs)
    return self
# Time the label prediction of the already-built model.
start = time.time()
predicted_labels = model.predict(X_test)
print('Predict time:', time.time() - start)
print()

# Time the raw score prediction of the same model.
start = time.time()
predicted_scores = model.decision_function(X_test)
print('Decision Function time:', time.time() - start)
print()

##########################################################################
# compare with no projection, no bps, and no approximation
print("******************************************************************")
start = time.time()

n_estimators = len(base_estimators)
n_estimators_list, starts, n_jobs = _partition_estimators(
    n_estimators, n_jobs)

# All-zero flag columns, one row per estimator.
rp_flags = np.zeros((n_estimators, 1))
approx_flags = np.zeros((n_estimators, 1))

objective_dim = None
rp_method = None

# One delayed fit per job; job i receives the estimators (and matching
# flag rows) in the index range [starts[i], starts[i + 1]).
fit_jobs = [
    delayed(_parallel_fit)(
        n_estimators_list[i],
        base_estimators[starts[i]:starts[i + 1]],
        X_train,
        n_estimators,
        rp_flags[starts[i]:starts[i + 1]],
        objective_dim,
        rp_method=rp_method,
        verbose=True)
    for i in range(n_jobs)
]
all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
    fit_jobs)
def decision_function(self, X):
    """Compute raw anomaly scores of X with every base estimator.

    The estimators are split into chunks (balanced scheduling when
    ``self.bps_flag`` is set, otherwise an equal split), each chunk is
    scored through ``_parallel_decision_function``, and the per-chunk
    results are written into the matching columns of one score matrix.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples. It is validated with ``check_array`` first.

    Returns
    -------
    predicted_scores : numpy array of shape (n_samples, n_estimators)
        One column of scores per base estimator.
    """
    X = check_array(X)
    n_samples = X.shape[0]

    # Choose how the estimators are distributed over the jobs.
    if self.bps_flag:
        # load the pre-trained cost predictor to forecast the cost
        cost_predictor = joblib.load(self.cost_forecast_loc_pred_)
        time_cost_pred = cost_forecast_meta(
            cost_predictor, X, self.base_estimator_names)
        partition = balanced_scheduling(
            time_cost_pred, self.n_estimators, self.n_jobs)
    else:
        # simple equal split
        partition = _partition_estimators(self.n_estimators, self.n_jobs)
    chunk_sizes, starts, n_jobs = partition

    if self.verbose:
        print('Parallel score prediction...')
        start = time.time()

    # One delayed call per job; chunk ``idx`` covers the estimators in
    # the half-open index range [starts[idx], starts[idx + 1]).
    jobs = []
    for idx in range(n_jobs):
        lo, hi = starts[idx], starts[idx + 1]
        jobs.append(delayed(_parallel_decision_function)(
            chunk_sizes[idx],
            self.base_estimators[lo:hi],
            self.approximators[lo:hi],
            X,
            self.n_estimators,
            self.jl_transformers_[lo:hi],
            self.approx_flags[lo:hi],
            verbose=True))

    # TODO: code cleanup. There is an existing bug for joblib on Windows:
    # https://github.com/joblib/joblib/issues/806
    # max_nbytes can be dropped on other OS
    chunk_scores = Parallel(
        n_jobs=n_jobs, max_nbytes=None, verbose=True)(jobs)

    if self.verbose:
        print('Parallel Score Prediction without Approximators '
              'Total Time:', time.time() - start)

    # Each chunk result is transposed before being stored, so that it
    # fills the columns belonging to that chunk's estimators.
    predicted_scores = np.zeros([n_samples, self.n_estimators])
    for idx, chunk in enumerate(chunk_scores):
        predicted_scores[:, starts[idx]:starts[idx + 1]] = \
            np.asarray(chunk).T
    return predicted_scores
def predict(self, X):
    """Predict labels for X with every base estimator.

    The estimators are split into chunks (balanced scheduling when
    ``self.bps_flag`` is set, otherwise an equal split), each chunk is
    run through ``_parallel_predict``, and the per-chunk results are
    written into the matching columns of one label matrix.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples. It is validated with ``check_array`` first.

    Returns
    -------
    predicted_labels : numpy array of shape (n_samples, n_estimators)
        One column of labels per base estimator.
    """
    X = check_array(X)
    n_samples = X.shape[0]

    # Choose how the estimators are distributed over the jobs.
    if self.bps_flag:
        # load the pre-trained cost predictor to forecast the cost
        cost_predictor = joblib.load(self.cost_forecast_loc_pred_)
        time_cost_pred = cost_forecast_meta(
            cost_predictor, X, self.base_estimator_names)
        partition = balanced_scheduling(
            time_cost_pred, self.n_estimators, self.n_jobs)
    else:
        # simple equal split
        partition = _partition_estimators(self.n_estimators, self.n_jobs)
    chunk_sizes, starts, n_jobs = partition

    print('Parallel label prediction...')
    start = time.time()

    # One delayed call per job; chunk ``idx`` covers the estimators in
    # the half-open index range [starts[idx], starts[idx + 1]).
    jobs = []
    for idx in range(n_jobs):
        lo, hi = starts[idx], starts[idx + 1]
        jobs.append(delayed(_parallel_predict)(
            chunk_sizes[idx],
            self.base_estimators[lo:hi],
            self.approximators[lo:hi],
            X,
            self.n_estimators,
            self.jl_transformers_[lo:hi],
            self.approx_flags[lo:hi],
            self.contamination,
            verbose=True))

    # TODO: code cleanup. There is an existing bug for joblib on Windows:
    # https://github.com/joblib/joblib/issues/806
    # max_nbytes can be dropped on other OS
    chunk_labels = Parallel(
        n_jobs=n_jobs, max_nbytes=None, verbose=True)(jobs)

    print('Parallel Label Predicting without Approximators Total Time:',
          time.time() - start)

    # Each chunk result is transposed before being stored, so that it
    # fills the columns belonging to that chunk's estimators.
    predicted_labels = np.zeros([n_samples, self.n_estimators])
    for idx, chunk in enumerate(chunk_labels):
        predicted_labels[:, starts[idx]:starts[idx + 1]] = \
            np.asarray(chunk).T
    return predicted_labels
def fit(self, X, y=None):
    """Fit all base estimators in parallel.

    The method validates ``X``, resolves ``max_features`` into
    ``self.max_features_``, builds the random-projection flags into
    ``self.rp_flags_``, splits the estimators into chunks (balanced
    scheduling when ``self.bps_flag`` is set, otherwise an equal split)
    and fits every chunk through ``_parallel_fit``. The fitted estimators
    overwrite ``self.base_estimators`` and the fitted transformers are
    stored in ``self.jl_transformers_``.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,), optional (default=None)
        The ground truth of the input samples (labels). It is not used
        by this method.

    Returns
    -------
    self : object
        The fitted estimator.
    """
    X = check_array(X)
    n_features = X.shape[1]

    # Validate max_features for random projection: an integer is taken
    # as-is, anything else is treated as a fraction of n_features.
    if isinstance(self.max_features, (numbers.Integral, np.integer)):
        self.max_features_ = self.max_features
    else:  # float
        self.max_features_ = int(self.max_features * n_features)

    # build flags for random projection
    self.rp_flags_, _ = build_codes(
        self.n_estimators, self.base_estimators, self.rp_clf_list,
        self.rp_ng_clf_list, self.rp_flag_global)

    # decide whether bps is needed
    if self.bps_flag:
        # load the pre-trained cost predictor to forecast the train cost
        cost_predictor = joblib.load(self.cost_forecast_loc_fit_)
        time_cost_pred = cost_forecast_meta(
            cost_predictor, X, self.base_estimator_names)

        # use BPS
        n_estimators_list, starts, n_jobs = balanced_scheduling(
            time_cost_pred, self.n_estimators, self.n_jobs)
    else:
        # use the default equal split
        n_estimators_list, starts, n_jobs = _partition_estimators(
            self.n_estimators, self.n_jobs)

    # fit the base models
    print('Parallel Training...')
    start = time.time()

    # TODO: code cleanup. There is an existing bug for joblib on Windows:
    # https://github.com/joblib/joblib/issues/806
    # max_nbytes can be dropped on other OS
    all_results = Parallel(n_jobs=n_jobs, max_nbytes=None, verbose=True)(
        delayed(_parallel_fit)(
            n_estimators_list[i],
            self.base_estimators[starts[i]:starts[i + 1]],
            X,
            self.n_estimators,
            # Read the flags computed above (rp_flags_); the previous
            # code read self.rp_flags, which this method never assigns.
            self.rp_flags_[starts[i]:starts[i + 1]],
            self.max_features_,
            self.rp_method,
            verbose=self.verbose)
        for i in range(n_jobs))

    print('Balanced Scheduling Total Train Time:', time.time() - start)

    # reformat and unfold the lists: after the transpose, index 0 holds
    # the per-job estimators and index 1 the per-job transformers.
    all_results = list(map(list, zip(*all_results)))

    # overwrite estimators with their fitted versions
    self.base_estimators = _unfold_parallel(all_results[0], n_jobs)
    self.jl_transformers_ = _unfold_parallel(all_results[1], n_jobs)

    return self
def predict_proba(self, X):
    """Predict the outlier probability of X with every base estimator.

    The estimators are split into chunks (balanced scheduling when
    ``self.bps_flag`` is set, otherwise an equal split), each chunk is
    run through ``_parallel_predict_proba``, and the per-chunk results
    are written into the matching columns of one matrix.

    The earlier documentation of this method lists two possible score
    conversions: a min-max linear transform into [0, 1], and unifying
    scores (see :cite:`kriegel2011interpreting`). Which conversion is
    applied is decided inside the worker function, not in this method.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples. It is validated with ``check_array`` first.

    Returns
    -------
    outlier_probability : numpy array of shape (n_samples, n_estimators)
        One column of outlier probabilities per base estimator.
    """
    X = check_array(X)
    n_samples = X.shape[0]

    # Choose how the estimators are distributed over the jobs.
    if self.bps_flag:
        # load the pre-trained cost predictor to forecast the cost
        cost_predictor = joblib.load(self.cost_forecast_loc_pred_)
        time_cost_pred = cost_forecast_meta(
            cost_predictor, X, self.base_estimator_names)
        partition = balanced_scheduling(
            time_cost_pred, self.n_estimators, self.n_jobs)
    else:
        # simple equal split
        partition = _partition_estimators(self.n_estimators, self.n_jobs)
    chunk_sizes, starts, n_jobs = partition

    if self.verbose:
        print('Parallel score prediction...')
        start = time.time()

    # One delayed call per job; chunk ``idx`` covers the estimators in
    # the half-open index range [starts[idx], starts[idx + 1]).
    jobs = []
    for idx in range(n_jobs):
        lo, hi = starts[idx], starts[idx + 1]
        jobs.append(delayed(_parallel_predict_proba)(
            chunk_sizes[idx],
            self.base_estimators[lo:hi],
            self.approximators[lo:hi],
            X,
            self.n_estimators,
            self.jl_transformers_[lo:hi],
            self.approx_flags[lo:hi],
            verbose=True))

    # TODO: code cleanup. There is an existing bug for joblib on Windows:
    # https://github.com/joblib/joblib/issues/806
    # max_nbytes can be dropped on other OS
    chunk_scores = Parallel(
        n_jobs=n_jobs, max_nbytes=None, verbose=True)(jobs)

    if self.verbose:
        print(
            'Parallel Score Prediction without Approximators '
            'Total Time:', time.time() - start)

    # Each chunk result is transposed before being stored, so that it
    # fills the columns belonging to that chunk's estimators.
    predicted_scores = np.zeros([n_samples, self.n_estimators])
    for idx, chunk in enumerate(chunk_scores):
        predicted_scores[:, starts[idx]:starts[idx + 1]] = \
            np.asarray(chunk).T
    return predicted_scores