def test_aom_static_n_buckets(self):
    with assert_raises(ValueError):
        aom(self.scores, 5, method='static', bootstrap_estimators=False,
            random_state=42)
def test_aom_dynamic_repeat(self):
    score = aom(self.scores, 3, method='dynamic',
                bootstrap_estimators=True, random_state=42)
    assert_equal(score.shape, (4,))
def test_aom_static_norepeat(self):
    score = aom(self.scores, 3, method='static',
                bootstrap_estimators=False, random_state=42)
    assert_equal(score.shape, (4,))

    # Rebuild the expected result by hand: shuffle the 6 estimator columns,
    # split them into 3 equal buckets, take the per-bucket max, then average.
    shuffled_list = shuffle(list(range(0, 6, 1)), random_state=42)
    manual_scores = np.zeros([4, 3])
    manual_scores[:, 0] = np.max(self.scores[:, shuffled_list[0:2]], axis=1)
    manual_scores[:, 1] = np.max(self.scores[:, shuffled_list[2:4]], axis=1)
    manual_scores[:, 2] = np.max(self.scores[:, shuffled_list[4:6]], axis=1)
    manual_score = np.mean(manual_scores, axis=1)
    assert_array_equal(score, manual_score)
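# Hedged standalone sketch of the call pattern the tests above exercise:
# pyod's aom() combiner with static, non-repeating buckets. The toy array
# below is illustrative, not the test fixture used by the tests.
import numpy as np
from pyod.models.combination import aom

toy_scores = np.random.RandomState(42).rand(4, 6)  # 4 samples x 6 detectors
combined = aom(toy_scores, n_buckets=3, method='static',
               bootstrap_estimators=False, random_state=42)
print(combined.shape)  # -> (4,): one averaged-maximum score per sample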
            X_test,
            n_estimators,
            # rp_flags[starts[i]:starts[i + 1]],
            jl_transformers,
            approx_flags[starts[i]:starts[i + 1]],
            verbose=True)
        for i in range(n_jobs))

    print('Orig decision_function time:', time.time() - start)
    print()

    # unfold the per-worker results and generate the score matrix
    predicted_scores_orig = np.zeros([X_test.shape[0], n_estimators])
    for i in range(n_jobs):
        predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray(
            all_results_scores[i]).T

    ##########################################################################
    predicted_scores = standardizer(predicted_scores)
    predicted_scores_orig = standardizer(predicted_scores_orig)

    evaluate_print('orig', y_test, average(predicted_scores_orig))
    evaluate_print('new', y_test, average(predicted_scores))

    evaluate_print('orig max', y_test, maximization(predicted_scores_orig))
    evaluate_print('new max', y_test, maximization(predicted_scores))

    evaluate_print('orig aom', y_test, aom(predicted_scores_orig))
    evaluate_print('new aom', y_test, aom(predicted_scores))

    evaluate_print('orig moa', y_test, moa(predicted_scores_orig))
    evaluate_print('new moa', y_test, moa(predicted_scores))
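# Hedged sketch of the fan-out/fan-in partitioning assumed above: `starts`
# holds cumulative column offsets so worker i owns estimator columns
# starts[i]:starts[i + 1]. The helper name below is hypothetical; sklearn
# and pyod keep an equivalent private utility for this split.
import numpy as np

def make_starts(n_estimators, n_jobs):
    # Evenly split n_estimators across n_jobs, giving the remainder to the
    # first workers, then convert bucket sizes into cumulative offsets.
    sizes = np.full(n_jobs, n_estimators // n_jobs, dtype=int)
    sizes[:n_estimators % n_jobs] += 1
    return np.concatenate(([0], np.cumsum(sizes)))

starts = make_starts(10, 3)
print(starts)  # [ 0  4  7 10] -- worker i fills columns starts[i]:starts[i+1]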
def fit(self, X, shrink_cols=True,
        data_scaler=preprocessing.MaxAbsScaler(),
        quick_methods=True, slow_methods=False, nn_methods=False,
        contamination=0.05, use_score_rank=False, random_state=None,
        verbose=0):
    # Flatten a 3-D input into 2-D; reject anything higher-dimensional.
    # (The original checked `> 2` before `> 3`, so the ValueError branch
    # was unreachable.)
    if len(X.shape) == 3:
        X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
    elif len(X.shape) > 3:
        raise ValueError("Expected number of dimensions: 2 or 3")

    if shrink_cols:
        # Drop all-zero columns before fitting.
        X = X[:, ~np.all(X == 0, axis=0)]
        log.info('zero columns shrunk')

    if data_scaler:
        X = data_scaler.fit_transform(X)
        log.info(f'used {data_scaler} data scaler')
        # log.info(X[0:1, :])

    n_rows = X.shape[0]
    n_features = X.shape[1]
    log.info(f'n_rows = {n_rows}, n_features = {n_features}')

    quick_scores = np.zeros([n_rows, 0])
    slow_scores = np.zeros([n_rows, 0])
    nn_scores = np.zeros([n_rows, 0])

    if quick_methods:
        # Define anomaly detection tools to be compared
        quick_classifiers = {
            'PCA_randomized':
                PCA(contamination=contamination, random_state=random_state,
                    standardization=False, svd_solver='randomized'),
            'PCA_full':
                PCA(contamination=contamination, random_state=random_state,
                    standardization=False, svd_solver='full'),
            'COPOD': COPOD(contamination=contamination),
            'HBOS': HBOS(contamination=contamination),
            'HBOS_200': HBOS(contamination=contamination, n_bins=200),
            'HBOS_300': HBOS(contamination=contamination, n_bins=300),
            'LODA': LODA(contamination=contamination),
            'LODA_200': LODA(contamination=contamination, n_random_cuts=200),
            'LODA_300': LODA(contamination=contamination, n_random_cuts=300),
            'IForest_100':
                IForest(contamination=contamination, random_state=random_state,
                        n_estimators=100, bootstrap=False, n_jobs=-1),
            'IForest_200':
                IForest(contamination=contamination, random_state=random_state,
                        n_estimators=200, bootstrap=False, n_jobs=-1),
            'IForest_bootstrap':
                IForest(contamination=contamination, random_state=random_state,
                        n_estimators=150, bootstrap=True, n_jobs=-1),
            # 'MCD':
            #     MCD(contamination=contamination, random_state=random_state,
            #         assume_centered=False),
            # 'MCD_centered':
            #     MCD(contamination=contamination, random_state=random_state,
            #         assume_centered=True),
            'CBLOF_16':
                CBLOF(contamination=contamination, random_state=random_state,
                      n_clusters=16),
            'CBLOF_24':
                CBLOF(contamination=contamination, random_state=random_state,
                      n_clusters=24),
            'CBLOF_32':
                CBLOF(contamination=contamination, random_state=random_state,
                      n_clusters=32),
        }

        quick_scores = np.zeros([n_rows, len(quick_classifiers)])

        for i, (clf_name, clf) in enumerate(quick_classifiers.items()):
            log.info(f'{i + 1} - fitting {clf_name}')
            try:
                clf.fit(X)
                quick_scores[:, i] = clf.decision_scores_
            except Exception:
                # traceback.print_exc() returns None, so the original logged
                # nothing useful; log the formatted traceback instead.
                log.info(traceback.format_exc())
            else:
                log.info(f'Base detector {i + 1}/{len(quick_classifiers)} '
                         'is fitted for prediction')
        quick_scores = np.nan_to_num(quick_scores)

    if slow_methods:
        # initialize a set of detectors for LSCP
        detector_list = [LOF(n_neighbors=10), LOF(n_neighbors=15),
                         LOF(n_neighbors=20)]
        slow_classifiers = {
            # 'Angle-based Outlier Detector (ABOD)':  # too slow, nan results
            #     ABOD(contamination=contamination),
            # 'One-class SVM (OCSVM)':
            #     OCSVM(contamination=contamination, cache_size=2000,
            #           shrinking=False, tol=1e-2),
            # 'LSCP':  # slow and no parallel
            #     LSCP(detector_list, contamination=contamination,
            #          random_state=random_state, local_region_size=30),
            # 'Feature Bagging':  # ensemble, no real parallelism
            #     FeatureBagging(LOF(n_neighbors=20),
            #                    contamination=contamination,
            #                    random_state=random_state, n_jobs=-1),
            # 'SOS':  # too memory inefficient
            #     SOS(contamination=contamination),
            # 'COF':  # memory inefficient
            #     COF(contamination=contamination),
            # 'SOD':
            #     SOD(contamination=contamination),
            # 'KNN':
            #     KNN(contamination=contamination, n_jobs=-1),
            # 'KNN_50':
            #     KNN(contamination=contamination, leaf_size=50, n_jobs=-1),
            # 'KNN_70':
            #     KNN(contamination=contamination, leaf_size=70, n_jobs=-1),
            'LOF_4': LOF(n_neighbors=4, contamination=contamination, n_jobs=-1),
            'LOF_5': LOF(n_neighbors=5, contamination=contamination, n_jobs=-1),
            'LOF_6': LOF(n_neighbors=6, contamination=contamination, n_jobs=-1),
            'LOF_7': LOF(n_neighbors=7, contamination=contamination, n_jobs=-1),
            'LOF_8': LOF(n_neighbors=8, contamination=contamination, n_jobs=-1),
            'LOF_9': LOF(n_neighbors=9, contamination=contamination, n_jobs=-1),
            'LOF_10': LOF(n_neighbors=10, contamination=contamination, n_jobs=-1),
            'LOF_12': LOF(n_neighbors=12, contamination=contamination, n_jobs=-1),
            'LOF_14': LOF(n_neighbors=14, contamination=contamination, n_jobs=-1),
            'LOF_16': LOF(n_neighbors=16, contamination=contamination, n_jobs=-1),
            'LOF_18': LOF(n_neighbors=18, contamination=contamination, n_jobs=-1),
            'LOF_20': LOF(n_neighbors=20, contamination=contamination, n_jobs=-1),
            'LOF_22': LOF(n_neighbors=22, contamination=contamination, n_jobs=-1),
        }

        slow_scores = np.zeros([n_rows, len(slow_classifiers)])

        for i, (clf_name, clf) in enumerate(slow_classifiers.items()):
            log.info(f'{i + 1} - fitting {clf_name}')
            try:
                clf.fit(X)
                slow_scores[:, i] = clf.decision_scores_
            except Exception:
                log.info(traceback.format_exc())
            else:
                log.info(f'Base detector {i + 1}/{len(slow_classifiers)} '
                         'is fitted for prediction')
        slow_scores = np.nan_to_num(slow_scores)

    if nn_methods:
        nn_classifiers = {}
        n_list = [1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
        # Index of the first layer width strictly below the feature count.
        n_idx = next(x[0] for x in enumerate(n_list) if x[1] < n_features)

        # Build symmetric encoder/decoder stacks of increasing depth.
        for i in range(3, 6):
            n_enc = n_list[n_idx:n_idx + i - 1]
            n_dec = n_enc[::-1]
            n_enc_dec = n_enc + n_dec

            nn_classifiers[f'FULL_AE_{len(n_enc_dec)}'] = {
                'clf': self.full_autoencoder,
                'hidden_layers': n_enc_dec,
            }
            nn_classifiers[f'VAE_{len(n_enc_dec)}'] = {
                'clf': VAE(contamination=contamination,
                           random_state=random_state,
                           encoder_neurons=n_enc, decoder_neurons=n_dec,
                           preprocessing=False, epochs=32,
                           verbosity=verbose),
                'hidden_layers': n_enc_dec,
            }

        nn_scores = np.zeros([n_rows, len(nn_classifiers)])

        for i, (clf_name, clf) in enumerate(nn_classifiers.items()):
            log.info(f"{i + 1} - fitting {clf_name} "
                     f"with layers {clf['hidden_layers']}")
            try:
                if clf['clf'] == self.full_autoencoder:
                    nn_scores[:, i] = clf['clf'](
                        X, neurons_list=clf['hidden_layers'], verbose=verbose)
                else:
                    clf['clf'].fit(X)
                    nn_scores[:, i] = clf['clf'].decision_scores_
            except Exception:
                log.info(traceback.format_exc())
            else:
                log.info(f'Base detector {i + 1}/{len(nn_classifiers)} '
                         'is fitted for prediction')
        nn_scores = np.nan_to_num(nn_scores)

    # Stack all score columns and drop detectors that produced only zeros.
    all_scores = np.concatenate((quick_scores, slow_scores, nn_scores),
                                axis=1)
    all_scores = all_scores[:, ~np.all(all_scores == 0, axis=0)]
    log.info(f'total scores = {all_scores.shape[1]}')

    all_scores_norm = np.copy(all_scores)
    if use_score_rank:
        all_scores_norm = np.apply_along_axis(rank_fun, 0, all_scores_norm)
        log.info('score rank applied')
    all_scores_norm = preprocessing.MinMaxScaler().fit_transform(
        all_scores_norm)

    if all_scores_norm.shape[1] >= 12:
        score_by_aom = aom(all_scores_norm, method='dynamic',
                           n_buckets=round(all_scores_norm.shape[1] / 4))
        score_by_moa = moa(all_scores_norm, method='dynamic',
                           n_buckets=round(all_scores_norm.shape[1] / 4))
        score_by_avg = np.mean(all_scores_norm, axis=1)
        score_by_max = np.max(all_scores_norm, axis=1)
    else:
        # With fewer than 12 detectors, bucketing is not meaningful;
        # fall back to plain average and maximum.
        score_by_avg = np.mean(all_scores_norm, axis=1)
        score_by_max = np.max(all_scores_norm, axis=1)
        score_by_aom = score_by_avg
        score_by_moa = score_by_max

    return (score_by_aom, score_by_moa, score_by_max, score_by_avg,
            all_scores, all_scores_norm)
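# Hedged usage sketch for the ensemble fit() above. The owner class name
# `AnomalyEnsemble` and the toy data are assumptions for illustration; only
# the return signature matches the method as written.
import numpy as np

X = np.random.rand(500, 20)  # toy feature matrix: 500 rows, 20 features

ensemble = AnomalyEnsemble()  # hypothetical class that defines fit()
(score_by_aom, score_by_moa, score_by_max, score_by_avg,
 all_scores, all_scores_norm) = ensemble.fit(
    X, quick_methods=True, slow_methods=False, nn_methods=False,
    contamination=0.05, random_state=42)

# Flag the top 5% (the contamination rate) by the AOM-combined score.
threshold = np.quantile(score_by_aom, 0.95)
outliers = score_by_aom > threshold
print(f'{outliers.sum()} suspected outliers out of {len(X)}')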