def score_to_label(pred_scores, outliers_fraction=0.1):
    """Turn raw outlier scores into binary labels (0 or 1).

    Parameters
    ----------
    pred_scores : list or numpy array of shape (n_samples,)
        Raw outlier scores. Outliers are assumed to have larger values.

    outliers_fraction : float in (0, 1)
        Percentage of outliers.

    Returns
    -------
    outlier_labels : numpy array of shape (n_samples,)
        For each observation, tells whether it should be considered
        an outlier (1) or not (0) according to the fitted model.
    """
    # check input values
    pred_scores = column_or_1d(pred_scores)
    check_parameter(outliers_fraction, 0, 1)

    # flag the top outliers_fraction of scores as outliers
    threshold = percentile(pred_scores, 100 * (1 - outliers_fraction))
    pred_labels = (pred_scores > threshold).astype('int')
    return pred_labels
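# A minimal usage sketch for score_to_label, assuming numpy is imported
# as np and the helpers used above (column_or_1d, check_parameter,
# percentile) are available in the current module.
import numpy as np

scores = np.array([0.1, 0.3, 0.2, 0.9, 0.5])

# With outliers_fraction=0.2, only scores above the 80th percentile
# (here, 0.9) are labeled as outliers.
labels = score_to_label(scores, outliers_fraction=0.2)
print(labels)  # [0 0 0 1 0]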
def __init__(self, base_estimators, meta_clf=None, n_folds=2,
             keep_original=True, use_proba=False, shuffle_data=False,
             random_state=None, threshold=None, pre_fitted=None):

    super(Stacking, self).__init__(
        base_estimators=base_estimators, pre_fitted=pre_fitted)

    # validate input parameters
    if not isinstance(n_folds, int):
        raise ValueError('n_folds must be an integer variable')
    check_parameter(n_folds, low=2, include_left=True,
                    param_name='n_folds')
    self.n_folds = n_folds

    if meta_clf is not None:
        self.meta_clf = meta_clf
    else:
        self.meta_clf = LogisticRegression()

    # set flags
    self.keep_original = keep_original
    self.use_proba = use_proba
    self.shuffle_data = shuffle_data
    self.random_state = random_state

    if threshold is not None:
        warnings.warn(
            "Stacking does not support threshold setting option. "
            "Please set the threshold in classifiers directly.")

    if pre_fitted is not None:
        warnings.warn("Stacking does not support pre_fitted option.")
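# A minimal usage sketch for the Stacking ensemble. The import path
# below is an assumption based on the combo library layout; adjust it
# to wherever Stacking lives in your installation.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# from combo.models.classifier_stacking import Stacking  # assumed path

base_estimators = [DecisionTreeClassifier(), LogisticRegression()]

# n_folds controls the internal split used to build meta-features;
# with meta_clf=None, the default LogisticRegression meta-classifier
# is used.
clf = Stacking(base_estimators=base_estimators, n_folds=3)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)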
def __init__(self, hidden_neurons=None,
             hidden_activation='relu', output_activation='sigmoid',
             loss=mean_squared_error, optimizer='adam',
             epochs=100, batch_size=32, dropout_rate=0.2,
             l2_regularizer=0.1, validation_size=0.1, preprocessing=True,
             verbose=1, random_state=None, contamination=0.1):
    super(AutoEncoder, self).__init__(contamination=contamination)
    self.hidden_neurons = hidden_neurons
    self.hidden_activation = hidden_activation
    self.output_activation = output_activation
    self.loss = loss
    self.optimizer = optimizer
    self.epochs = epochs
    self.batch_size = batch_size
    self.dropout_rate = dropout_rate
    self.l2_regularizer = l2_regularizer
    self.validation_size = validation_size
    self.preprocessing = preprocessing
    self.verbose = verbose
    self.random_state = random_state

    # default values
    if self.hidden_neurons is None:
        self.hidden_neurons = [64, 32, 32, 64]

    # Verify the network design is valid
    if not self.hidden_neurons == self.hidden_neurons[::-1]:
        print(self.hidden_neurons)
        raise ValueError("Hidden units should be symmetric")

    self.hidden_neurons_ = self.hidden_neurons

    check_parameter(dropout_rate, 0, 1, param_name='dropout_rate',
                    include_left=True)
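# A short sketch of the symmetry requirement enforced above, assuming
# the pyod-style import path for the Keras-based AutoEncoder; adjust
# to your installation.
# from pyod.models.auto_encoder import AutoEncoder  # assumed path

# Valid: the layer sizes read the same forward and backward, so the
# decoder mirrors the encoder.
clf = AutoEncoder(hidden_neurons=[64, 32, 32, 64], contamination=0.1)

# Invalid: [64, 32, 16] != [16, 32, 64], so __init__ raises
# ValueError("Hidden units should be symmetric").
# clf = AutoEncoder(hidden_neurons=[64, 32, 16])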
def __init__(self, base_estimators, method='average', threshold=0.5,
             weights=None, pre_fitted=False):

    super(SimpleClassifierAggregator, self).__init__(
        base_estimators=base_estimators, pre_fitted=pre_fitted)

    # validate input parameters
    if method not in ['average', 'maximization', 'majority_vote',
                      'median']:
        raise ValueError(
            "{method} is not a valid parameter.".format(method=method))

    self.method = method
    check_parameter(threshold, 0, 1, include_left=False,
                    include_right=False, param_name='threshold')
    self.threshold = threshold

    # set estimator weights
    self._set_weights(weights)
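# A minimal usage sketch for SimpleClassifierAggregator. The import
# path below is an assumption based on the combo library layout;
# adjust it to your installation.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# from combo.models.classifier_comb import SimpleClassifierAggregator  # assumed

base_estimators = [DecisionTreeClassifier(), LogisticRegression()]

# 'average' combines base predictions by averaging; 'maximization',
# 'majority_vote', and 'median' are the other supported methods.
clf = SimpleClassifierAggregator(base_estimators=base_estimators,
                                 method='average', threshold=0.5)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)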
def split_datasets(X, y, n_folds=3, shuffle_data=False, random_state=None):
    """Utility function to split the data for stacking. The data is
    split into n_folds folds of roughly equal size.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,)
        The ground truth of the input samples (labels).

    n_folds : int, optional (default=3)
        The number of splits of the training sample.

    shuffle_data : bool, optional (default=False)
        If True, shuffle the input data.

    random_state : RandomState, optional (default=None)
        A random number generator instance to define the state of the
        random permutations generator.

    Returns
    -------
    X : numpy array of shape (n_samples, n_features)
        The input samples. If shuffle_data, return the shuffled data.

    y : numpy array of shape (n_samples,)
        The ground truth of the input samples (labels). If shuffle_data,
        return the shuffled data.

    index_lists : list of list
        The list of indexes of each fold regarding the returned X and y.
        For instance, index_lists[0] contains the indexes of fold 0.
    """
    if not isinstance(n_folds, int):
        raise ValueError('n_folds must be an integer variable')
    check_parameter(n_folds, low=2, include_left=True,
                    param_name='n_folds')

    random_state = check_random_state(random_state)

    if shuffle_data:
        X, y = shuffle(X, y, random_state=random_state)

    idx_length = len(y)
    idx_list = list(range(idx_length))

    # split the indexes into n_folds chunks; the last fold absorbs
    # any remainder
    avg_length = int(idx_length / n_folds)

    index_lists = []
    for i in range(n_folds - 1):
        index_lists.append(idx_list[i * avg_length:(i + 1) * avg_length])

    index_lists.append(idx_list[(n_folds - 1) * avg_length:])

    return X, y, index_lists
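# A quick sketch of split_datasets on a toy set, assuming numpy is
# imported as np and the helpers used above are available.
import numpy as np

X = np.arange(14).reshape(7, 2)
y = np.array([0, 1, 0, 1, 0, 1, 0])

X_out, y_out, index_lists = split_datasets(X, y, n_folds=3)

# With 7 samples and 3 folds, avg_length = int(7 / 3) = 2, so the
# last fold absorbs the remainder:
print(index_lists)  # [[0, 1], [2, 3], [4, 5, 6]]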
def __init__(self, encoder_neurons=None, decoder_neurons=None,
             latent_dim=2, hidden_activation='relu',
             output_activation='sigmoid', loss=mse, optimizer='adam',
             epochs=100, batch_size=32, dropout_rate=0.2,
             l2_regularizer=0.1, validation_size=0.1, preprocessing=True,
             verbose=1, random_state=None, contamination=0.1,
             gamma=1.0, capacity=0.0):
    super(VAE_EDE, self).__init__(contamination=contamination)
    self.encoder_neurons = encoder_neurons
    self.decoder_neurons = decoder_neurons
    self.hidden_activation = hidden_activation
    self.output_activation = output_activation
    self.loss = loss
    self.optimizer = optimizer
    self.epochs = epochs
    self.batch_size = batch_size
    self.dropout_rate = dropout_rate
    self.l2_regularizer = l2_regularizer
    self.validation_size = validation_size
    self.preprocessing = preprocessing
    self.verbose = verbose
    self.random_state = random_state
    self.latent_dim = latent_dim
    self.gamma = gamma
    self.capacity = capacity

    # default values
    if self.encoder_neurons is None:
        self.encoder_neurons = [128, 64, 32]

    if self.decoder_neurons is None:
        self.decoder_neurons = [32, 64, 128]

    self.encoder_neurons_ = self.encoder_neurons
    self.decoder_neurons_ = self.decoder_neurons

    check_parameter(dropout_rate, 0, 1, param_name='dropout_rate',
                    include_left=True)
def __init__(self, base_estimators, n_clusters, linkage_method='single',
             weights=None, pre_fitted=False):
    super(EAC, self).__init__(
        base_estimators=base_estimators, pre_fitted=pre_fitted)

    check_parameter(n_clusters, low=2, param_name='n_clusters')
    self.n_clusters = n_clusters

    # set estimator weights
    self._set_weights(weights)

    self.linkage_method = linkage_method
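# A minimal usage sketch for EAC (Evidence Accumulation Clustering).
# The import path is an assumption based on the combo library layout;
# adjust it to your installation.
from sklearn.cluster import KMeans

# from combo.models.cluster_eac import EAC  # assumed path

base_estimators = [KMeans(n_clusters=3), KMeans(n_clusters=5)]

# Evidence from the base clusterings is merged with hierarchical
# clustering using the chosen linkage into n_clusters final groups.
clf = EAC(base_estimators=base_estimators, n_clusters=3,
          linkage_method='single')
# clf.fit(X)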
def argmaxn(value_list, n, order='desc'):
    """Return the indices of the top n elements in the list if order is
    set to 'desc', otherwise return the indices of the n smallest ones.

    Parameters
    ----------
    value_list : list, array, numpy array of shape (n_samples,)
        A list containing all values.

    n : int
        The number of elements to select.

    order : str, optional (default='desc')
        The order to sort {'desc', 'asc'}:

        - 'desc': descending
        - 'asc': ascending

    Returns
    -------
    index_list : numpy array of shape (n,)
        The indices of the top n elements.
    """
    value_list = column_or_1d(value_list)
    length = len(value_list)

    # validate the choice of n
    check_parameter(n, 1, length, include_left=True, include_right=True,
                    param_name='n')

    # for the smallest n, flip the partition point
    if order != 'desc':
        n = length - n

    value_sorted = np.partition(value_list, length - n)
    threshold = value_sorted[int(length - n)]

    if order == 'desc':
        return np.where(np.greater_equal(value_list, threshold))[0]
    else:  # return the index of n smallest elements
        return np.where(np.less(value_list, threshold))[0]
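# A quick sketch of argmaxn on a toy list, assuming numpy is imported
# as np and the helpers used above are available.
values = [1, 3, 2, 5, 4]

# Indices of the 2 largest values (5 and 4):
print(argmaxn(values, n=2))               # [3 4]

# Indices of the 2 smallest values (1 and 2):
print(argmaxn(values, n=2, order='asc'))  # [0 2]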
def __init__(self, base_estimators, local_region_size=30, threshold=None,
             pre_fitted=None):
    super(DCS_LA, self).__init__(
        base_estimators=base_estimators, pre_fitted=pre_fitted)

    # validate input parameters
    if not isinstance(local_region_size, int):
        raise ValueError('local_region_size must be an integer variable')
    check_parameter(local_region_size, low=2, include_left=True,
                    param_name='local_region_size')
    self.local_region_size = local_region_size

    if threshold is not None:
        warnings.warn(
            "DCS does not support threshold setting option. "
            "Please set the threshold in classifiers directly.")

    if pre_fitted is not None:
        warnings.warn("DCS does not support pre_fitted option.")
def __init__(self, base_estimators, local_region_size=30,
             n_selected_clfs=None, use_weights=False, threshold=None,
             pre_fitted=None):
    super(DES_LA, self).__init__(
        base_estimators=base_estimators, pre_fitted=pre_fitted)

    # validate input parameters
    if not isinstance(local_region_size, int):
        raise ValueError('local_region_size must be an integer variable')
    check_parameter(local_region_size, low=2, include_left=True,
                    param_name='local_region_size')
    self.local_region_size = local_region_size

    # by default, select half of the base estimators
    if n_selected_clfs is None:
        self.n_selected_clfs = int(self.n_base_estimators_ * 0.5)
    else:
        if not isinstance(n_selected_clfs, int):
            raise ValueError('n_selected_clfs must be an integer variable')
        check_parameter(n_selected_clfs, low=1,
                        high=self.n_base_estimators_, include_left=True,
                        include_right=True, param_name='n_selected_clfs')
        self.n_selected_clfs = n_selected_clfs

    self.use_weights = use_weights

    if threshold is not None:
        warnings.warn("DES does not support threshold setting option. "
                      "Please set the threshold in classifiers directly.")

    if pre_fitted is not None:
        warnings.warn("DES does not support pre_fitted option.")
def __init__(self, hidden_neurons=None, hidden_activation='leakyrelu',
             output_activation='leakyrelu', loss=None, optimizer='adam',
             lr=1e-3, epochs=20, batch_size=32, dropout_rate=0.2,
             l2_regularizer=0.1, validation_size=0.1, preprocessing=False,
             verbose=1, random_state=None, contamination=0.1):
    super(AE, self).__init__(contamination=contamination)
    self.hidden_neurons = hidden_neurons
    self.hidden_activation = hidden_activation
    self.output_activation = output_activation
    self.loss = loss
    self.optimizer = optimizer
    self.epochs = epochs
    self.batch_size = batch_size
    self.dropout_rate = dropout_rate
    self.l2_regularizer = l2_regularizer
    self.validation_size = validation_size
    self.preprocessing = preprocessing
    self.verbose = verbose
    self.random_state = random_state
    self.lr = lr

    self.hidden_neurons_ = self.hidden_neurons

    check_parameter(dropout_rate, 0, 1, param_name='dropout_rate',
                    include_left=True)
def __init__(self, base_estimators, n_clusters, weights=None,
             reference_idx=0, pre_fitted=False):
    super(ClustererEnsemble, self).__init__(
        base_estimators=base_estimators, pre_fitted=pre_fitted)

    check_parameter(n_clusters, low=2, param_name='n_clusters')
    self.n_clusters = n_clusters

    check_parameter(reference_idx, low=0,
                    high=self.n_base_estimators_ - 1,
                    include_left=True, include_right=True)
    self.reference_idx = reference_idx

    # set estimator weights
    self._set_weights(weights)
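# A minimal usage sketch for ClustererEnsemble. The import path is an
# assumption based on the combo library layout; adjust it to your
# installation.
from sklearn.cluster import KMeans, MiniBatchKMeans

# from combo.models.cluster_comb import ClustererEnsemble  # assumed path

base_estimators = [KMeans(n_clusters=3), MiniBatchKMeans(n_clusters=3)]

# reference_idx must lie in [0, n_base_estimators_ - 1]; it picks the
# base clustering that the other label assignments are aligned to.
clf = ClustererEnsemble(base_estimators=base_estimators, n_clusters=3,
                        reference_idx=0)
# clf.fit(X)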
def test_check_parameter_range(self):
    # verify parameter type checking
    with assert_raises(TypeError):
        check_parameter('f', 0, 100)
    with assert_raises(TypeError):
        check_parameter(1, 'f', 100)
    with assert_raises(TypeError):
        check_parameter(1, 0, 'f')
    with assert_raises(TypeError):
        check_parameter(argmaxn(value_list=[1, 2, 3], n=1), 0, 100)

    # if low and high are both unset
    with assert_raises(ValueError):
        check_parameter(50)

    # if low >= high
    with assert_raises(ValueError):
        check_parameter(50, 100, 99)
    with assert_raises(ValueError):
        check_parameter(50, 100, 100)

    # check one side
    with assert_raises(ValueError):
        check_parameter(50, low=100)
    with assert_raises(ValueError):
        check_parameter(50, high=0)

    assert_equal(True, check_parameter(50, low=10))
    assert_equal(True, check_parameter(50, high=100))

    # if check fails
    with assert_raises(ValueError):
        check_parameter(-1, 0, 100)
    with assert_raises(ValueError):
        check_parameter(101, 0, 100)
    with assert_raises(ValueError):
        check_parameter(0.5, 0.2, 0.3)

    # if check passes
    assert_equal(True, check_parameter(50, 0, 100))
    assert_equal(True, check_parameter(0.5, 0.1, 0.8))

    # if includes left or right bounds
    with assert_raises(ValueError):
        check_parameter(100, 0, 100, include_left=False,
                        include_right=False)
    assert_equal(True, check_parameter(0, 0, 100, include_left=True,
                                       include_right=False))
    assert_equal(True, check_parameter(0, 0, 100, include_left=True,
                                       include_right=True))
    assert_equal(True, check_parameter(100, 0, 100, include_left=False,
                                       include_right=True))
    assert_equal(True, check_parameter(100, 0, 100, include_left=True,
                                       include_right=True))
def _aom_moa_helper(mode, scores, n_buckets, method, bootstrap_estimators,
                    random_state):
    """Internal helper function for Average of Maximum (AOM) and
    Maximum of Average (MOA). See :cite:`aggarwal2015theoretical` for
    details.

    The estimators are first divided into subgroups, and the maximum
    (AOM) or average (MOA) score within each subgroup is taken as the
    subgroup score. The final score is the average (AOM) or maximum
    (MOA) of all subgroup scores.

    Parameters
    ----------
    mode : str
        Define the operation mode, either "AOM" or "MOA".

    scores : numpy array of shape (n_samples, n_estimators)
        The score matrix output by various estimators.

    n_buckets : int, optional (default=5)
        The number of subgroups to build.

    method : str, optional (default='static')
        {'static', 'dynamic'}, if 'dynamic', build subgroups randomly
        with dynamic bucket size.

    bootstrap_estimators : bool, optional (default=False)
        Whether estimators are drawn with replacement.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`.

    Returns
    -------
    combined_scores : numpy array of shape (n_samples,)
        The combined scores.
    """
    if mode not in ('AOM', 'MOA'):
        raise NotImplementedError(
            '{mode} is not implemented'.format(mode=mode))

    scores = check_array(scores)
    # TODO: add one more parameter for max number of estimators
    #   use random_state instead
    #   for now it is fixed at n_estimators/2
    n_estimators = scores.shape[1]
    check_parameter(n_buckets, 2, n_estimators, include_left=True,
                    include_right=True, param_name='n_buckets')

    scores_buckets = np.zeros([scores.shape[0], n_buckets])

    if method == 'static':

        n_estimators_per_bucket = int(n_estimators / n_buckets)
        if n_estimators % n_buckets != 0:
            raise ValueError('n_estimators / n_buckets has a remainder. Not '
                             'allowed in static mode.')

        if not bootstrap_estimators:
            # shuffle the estimator order
            shuffled_list = shuffle(list(range(0, n_estimators, 1)),
                                    random_state=random_state)

            head = 0
            for i in range(0, n_estimators, n_estimators_per_bucket):
                tail = i + n_estimators_per_bucket
                batch_ind = int(i / n_estimators_per_bucket)

                if mode == 'AOM':
                    scores_buckets[:, batch_ind] = np.max(
                        scores[:, shuffled_list[head:tail]], axis=1)
                else:
                    scores_buckets[:, batch_ind] = np.mean(
                        scores[:, shuffled_list[head:tail]], axis=1)

                # increment index
                head = head + n_estimators_per_bucket
        # noinspection PyUnusedLocal
        else:
            for i in range(n_buckets):
                ind = sample_without_replacement(n_estimators,
                                                 n_estimators_per_bucket,
                                                 random_state=random_state)
                if mode == 'AOM':
                    scores_buckets[:, i] = np.max(scores[:, ind], axis=1)
                else:
                    scores_buckets[:, i] = np.mean(scores[:, ind], axis=1)

    elif method == 'dynamic':  # random bucket size
        for i in range(n_buckets):
            # the number of estimators in a bucket should be 2 to n/2
            max_estimator_per_bucket = RandomState(seed=random_state).randint(
                2, int(n_estimators / 2))
            ind = sample_without_replacement(n_estimators,
                                             max_estimator_per_bucket,
                                             random_state=random_state)
            if mode == 'AOM':
                scores_buckets[:, i] = np.max(scores[:, ind], axis=1)
            else:
                scores_buckets[:, i] = np.mean(scores[:, ind], axis=1)

    else:
        raise NotImplementedError(
            '{method} is not implemented'.format(method=method))

    if mode == 'AOM':
        return np.mean(scores_buckets, axis=1)
    else:
        return np.max(scores_buckets, axis=1)
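# A minimal sketch of combining detector scores, assuming the public
# aom/moa wrappers in pyod.models.combination delegate to this helper.
import numpy as np
from pyod.models.combination import aom, moa

# 4 samples scored by 6 detectors.
scores = np.random.rand(4, 6)

# AOM: split the 6 detectors into 3 buckets of 2, take the max score
# within each bucket, then average the bucket maxima per sample.
combined_aom = aom(scores, n_buckets=3)

# MOA: average within each bucket, then take the per-sample maximum.
combined_moa = moa(scores, n_buckets=3)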
def _parameter_validation(self, contamination, n_jobs, rp_clf_list,
                          rp_ng_clf_list, approx_clf_list,
                          approx_ng_clf_list, approx_clf,
                          cost_forecast_loc_fit, cost_forecast_loc_pred):
    """Internal function to validate the initial parameters

    Returns
    -------
    self : object
        Post-check estimator.
    """
    if not (0. < contamination <= 0.5):
        raise ValueError("contamination must be in (0, 0.5], "
                         "got: %f" % contamination)

    self.contamination = contamination

    if approx_clf is not None:
        self.approx_clf = approx_clf
    else:
        self.approx_clf = RandomForestRegressor(n_estimators=50)

    if n_jobs is None:
        self.n_jobs = 1
    elif n_jobs == -1:
        self.n_jobs = effective_n_jobs()
    else:
        self.n_jobs = n_jobs

    # validate random projection list
    if rp_clf_list is None:
        # the algorithms that should be using random projection
        self.rp_clf_list = ['LOF', 'KNN', 'ABOD', 'COF']
    else:
        self.rp_clf_list = rp_clf_list

    if rp_ng_clf_list is None:
        # the algorithms that should not be using random projection
        self.rp_ng_clf_list = ['IForest', 'PCA', 'HBOS', 'MCD', 'LMDD']
    else:
        self.rp_ng_clf_list = rp_ng_clf_list

    # Validate target_dim_frac
    check_parameter(self.target_dim_frac, low=0, high=1,
                    include_left=False, include_right=True,
                    param_name='target_dim_frac')

    # validate model approximation list
    if approx_clf_list is None:
        # the algorithms that should be using approximation
        self.approx_clf_list = ['LOF', 'KNN', 'CBLOF', 'OCSVM']
    else:
        self.approx_clf_list = approx_clf_list

    if approx_ng_clf_list is None:
        # the algorithms that should not be using approximation
        self.approx_ng_clf_list = ['PCA', 'HBOS', 'ABOD', 'MCD',
                                   'LMDD', 'LSCP', 'IForest']
    else:
        self.approx_ng_clf_list = approx_ng_clf_list

    this_directory = os.path.abspath(os.path.dirname(__file__))

    # validate the trained cost forecast models
    if cost_forecast_loc_fit is None:
        self.cost_forecast_loc_fit_ = os.path.join(
            this_directory, 'saved_models', 'bps_train.joblib')
    else:
        self.cost_forecast_loc_fit_ = cost_forecast_loc_fit

    if cost_forecast_loc_pred is None:
        self.cost_forecast_loc_pred_ = os.path.join(
            this_directory, 'saved_models', 'bps_prediction.joblib')
    else:
        self.cost_forecast_loc_pred_ = cost_forecast_loc_pred

    return self
def _parameter_validation(self, contamination, n_jobs, rp_clf_list,
                          rp_ng_clf_list, approx_clf_list,
                          approx_ng_clf_list, approx_clf,
                          cost_forecast_loc_fit, cost_forecast_loc_pred):
    if not (0. < contamination <= 0.5):
        raise ValueError("contamination must be in (0, 0.5], "
                         "got: %f" % contamination)

    self.contamination = contamination

    if approx_clf is not None:
        self.approx_clf = approx_clf
    else:
        self.approx_clf = RandomForestRegressor(n_estimators=50)

    if n_jobs is None:
        self.n_jobs = 1
    else:
        self.n_jobs = n_jobs

    # validate random projection list
    if rp_clf_list is None:
        # the algorithms that should be using random projection
        self.rp_clf_list = ['LOF', 'KNN', 'ABOD']
    else:
        self.rp_clf_list = rp_clf_list

    if rp_ng_clf_list is None:
        # the algorithms that should not be using random projection
        self.rp_ng_clf_list = ['IForest', 'PCA', 'HBOS', 'MCD', 'LMDD']
    else:
        self.rp_ng_clf_list = rp_ng_clf_list

    # Validate max_features
    check_parameter(self.max_features, low=0, high=1,
                    include_left=False, include_right=True,
                    param_name='max_features')

    # validate model approximation list
    if approx_clf_list is None:
        # the algorithms that should be using approximation
        self.approx_clf_list = ['LOF', 'KNN', 'CBLOF', 'OCSVM', 'IForest']
    else:
        self.approx_clf_list = approx_clf_list

    if approx_ng_clf_list is None:
        # the algorithms that should not be using approximation
        self.approx_ng_clf_list = ['PCA', 'HBOS', 'ABOD', 'MCD',
                                   'LMDD', 'LSCP']
    else:
        self.approx_ng_clf_list = approx_ng_clf_list

    this_directory = os.path.abspath(os.path.dirname(__file__))

    if cost_forecast_loc_fit is None:
        self.cost_forecast_loc_fit_ = os.path.join(
            this_directory, 'saved_models', 'bps_train.joblib')
    else:
        self.cost_forecast_loc_fit_ = cost_forecast_loc_fit

    if cost_forecast_loc_pred is None:
        self.cost_forecast_loc_pred_ = os.path.join(
            this_directory, 'saved_models', 'bps_prediction.joblib')
    else:
        self.cost_forecast_loc_pred_ = cost_forecast_loc_pred

    return self
def __init__(self, epochs=100, batch_size=32, lr=1e-3, loss='mse',
             dropout_rate=0.2, l2_regularizer=0.1, validation_size=0.1,
             verbose=1, random_state=42, contamination=0.1,
             hid_dim=16, lat_dim=8):
    """AutoEncoder

    Parameters
    ----------
    epochs : int (default=100)
        The number of iterations to train the model.

    batch_size : int (default=32)
        The number of instances used to train the model per step.

    lr : float (default=1e-3)
        The learning rate.

    loss : str (default='mse')
        The loss function.

    dropout_rate : float (default=0.2)
        In range (0, 1). (Not implemented.)

    l2_regularizer : float (default=0.1)
        The hyperparameter used to balance the loss and the weights.

    validation_size : float (default=0.1)
        In range (0, 1); the fraction used to evaluate the training
        result. (Not implemented.)

    contamination : float (default=0.1)
        In range (0, 1); a threshold used to decide the normal score.
        (Not used.)

    hid_dim : int (default=16)
        The number of neurons in the hidden layer.

    lat_dim : int (default=8)
        The number of neurons in the latent layer.

    verbose : int (default=1)
        Controls how much information is printed; the higher the value,
        the more information is printed.

    random_state : int (default=42)
        The random seed.
    """
    self.epochs = epochs
    self.batch_size = batch_size
    self.loss = loss
    self.dropout_rate = dropout_rate
    self.l2_regularizer = l2_regularizer
    self.validation_size = validation_size
    self.verbose = verbose
    self.random_state = random_state
    self.lr = lr
    self.contamination = contamination
    self.hid_dim = hid_dim
    self.lat_dim = lat_dim

    check_parameter(dropout_rate, 0, 1, param_name='dropout_rate',
                    include_left=True)

    # default to mean squared error reconstruction loss
    if self.loss == 'mse' or (not self.loss):
        self.criterion = nn.MSELoss()
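# A minimal sketch, assuming this __init__ belongs to a PyTorch-based
# AutoEncoder class (the class name here is an assumption).
import torch.nn as nn

model = AutoEncoder(epochs=50, lr=1e-3, loss='mse',
                    hid_dim=16, lat_dim=8)

# With loss='mse' (or loss left falsy), the reconstruction criterion
# defaults to nn.MSELoss, as set at the end of __init__.
assert isinstance(model.criterion, nn.MSELoss)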