Example 1
def test_moa_static_n_buckets(self):
    # 5 buckets do not evenly divide the 6 detector columns,
    # so static moa is expected to raise
    with assert_raises(ValueError):
        moa(self.scores,
            5,
            method='static',
            bootstrap_estimators=False,
            random_state=42)
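These `moa` tests assume a shared fixture from PyOD's combination test suite. Below is a minimal sketch of what they rely on; the imports match the calls used in the tests, while the 4x6 score matrix is illustrative rather than the original values:

# sketch of the assumed fixture: 4 samples scored by 6 base detectors
# (values are illustrative, not the original test data)
import numpy as np
from numpy.testing import assert_equal, assert_array_equal, assert_raises
from sklearn.utils import shuffle
from pyod.models.combination import moa


class TestMOA:
    def setup_method(self):
        self.scores = np.asarray([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                                  [0.6, 0.5, 0.4, 0.3, 0.2, 0.1],
                                  [0.2, 0.2, 0.2, 0.8, 0.8, 0.8],
                                  [0.9, 0.1, 0.9, 0.1, 0.9, 0.1]])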
Example 2
def test_moa_dynamic_repeat(self):
    score = moa(self.scores,
                3,
                method='dynamic',
                bootstrap_estimators=True,
                random_state=42)
    assert_equal(score.shape, (4, ))
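Per the PyOD docs, `method='dynamic'` assigns a random number of detectors to each bucket, and `bootstrap_estimators=True` draws them with replacement, so a detector can land in more than one bucket; either way the output stays one combined score per sample, which is what the `(4, )` shape assertion checks.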
Example 3
def test_moa_static_norepeat(self):
    score = moa(self.scores,
                3,
                method='static',
                bootstrap_estimators=False,
                random_state=42)

    assert_equal(score.shape, (4, ))

    # rebuild the expected result by hand: shuffle the 6 detector
    # indices with the same seed, average within 3 equal buckets,
    # then take the per-sample maximum of the bucket means
    shuffled_list = shuffle(list(range(0, 6, 1)), random_state=42)
    manual_scores = np.zeros([4, 3])
    manual_scores[:, 0] = np.mean(self.scores[:, shuffled_list[0:2]],
                                  axis=1)
    manual_scores[:, 1] = np.mean(self.scores[:, shuffled_list[2:4]],
                                  axis=1)
    manual_scores[:, 2] = np.mean(self.scores[:, shuffled_list[4:6]],
                                  axis=1)

    manual_score = np.max(manual_scores, axis=1)
    assert_array_equal(score, manual_score)
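The manual block mirrors what static `moa` (maximization of average) does internally; `aom` is the mirror image (average of maximum): take the per-bucket maximum first, then the mean across buckets.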
Example 4
    # (excerpt) tail of what appears to be a joblib.Parallel call that
    # scores X_test in n_jobs chunks and gathers the per-chunk results
                X_test,
                n_estimators,
                # rp_flags[starts[i]:starts[i + 1]],
                jl_transformers,
                approx_flags[starts[i]:starts[i + 1]],
                verbose=True) for i in range(n_jobs))

    print('Orig decision_function time:', time.time() - start)
    print()

    # unfold the per-chunk results into an [n_samples, n_estimators]
    # score matrix
    predicted_scores_orig = np.zeros([X_test.shape[0], n_estimators])
    for i in range(n_jobs):
        predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray(
            all_results_scores[i]).T
    ##########################################################################
    predicted_scores = standardizer(predicted_scores)
    predicted_scores_orig = standardizer(predicted_scores_orig)

    evaluate_print('orig', y_test, average(predicted_scores_orig))
    evaluate_print('new', y_test, average(predicted_scores))

    evaluate_print('orig max', y_test, maximization(predicted_scores_orig))
    evaluate_print('new max', y_test, maximization(predicted_scores))

    evaluate_print('orig aom', y_test, aom(predicted_scores_orig))
    evaluate_print('new aom', y_test, aom(predicted_scores))

    evaluate_print('orig moa', y_test, moa(predicted_scores_orig))
    evaluate_print('new moa', y_test, moa(predicted_scores))
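The benchmark excerpt presupposes definitions from the elided top of the script. The PyOD imports below are the ones its calls require; `Parallel`/`delayed` and the chunking variables (`starts`, `n_jobs`, `jl_transformers`, `approx_flags`) are assumptions about that elided part:

# imports the excerpt's calls require (the PyOD helpers are real APIs);
# the joblib pieces are an assumption about the elided setup code
import time
import numpy as np
from joblib import Parallel, delayed
from pyod.utils.utility import standardizer
from pyod.utils.data import evaluate_print
from pyod.models.combination import average, maximization, aom, moa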
Example 5
    def fit(self, X, shrink_cols = True, data_scaler = preprocessing.MaxAbsScaler(),
            quick_methods = True, slow_methods = False, nn_methods = False,
            contamination = 0.05, use_score_rank = False, random_state = None, verbose = 0):

        # flatten 3-d input to 2-d; reject anything higher-dimensional
        if len(X.shape) == 3:
            X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
        elif len(X.shape) > 3:
            raise ValueError("Expected number of dimensions: 2 or 3")
        
        if shrink_cols:
            # drop columns that are entirely zero
            X = X[:, ~np.all(X == 0, axis=0)]
            log.info('zero columns removed')
        if data_scaler:
            X = data_scaler.fit_transform(X)
            log.info(f'used {data_scaler} data scaler')
        
        n_rows = X.shape[0]
        n_features = X.shape[1]
        log.info(f'n_rows = {n_rows}, n_features = {n_features}')
        
        quick_scores = np.zeros([n_rows, 0])
        slow_scores = np.zeros([n_rows, 0])
        nn_scores = np.zeros([n_rows, 0])
        
        if quick_methods:
            # Define anomaly detection tools to be compared
            quick_classifiers = {
                'PCA_randomized':
                    PCA(contamination=contamination, random_state=random_state, 
                        standardization = False, svd_solver = 'randomized'),
                'PCA_full':
                    PCA(contamination=contamination, random_state=random_state, 
                        standardization = False, svd_solver = 'full'),                               
                'COPOD':
                   COPOD(contamination=contamination),  
                'HBOS':
                    HBOS(contamination=contamination),
                'HBOS_200':
                    HBOS(contamination=contamination, n_bins = 200),
                'HBOS_300':
                    HBOS(contamination=contamination, n_bins = 300),
                'LODA':
                    LODA(contamination=contamination),
                'LODA_200':
                    LODA(contamination=contamination, n_random_cuts = 200),
                'LODA_300':
                    LODA(contamination=contamination, n_random_cuts = 300),
                'IForest_100':
                    IForest(contamination=contamination, random_state=random_state, 
                            n_estimators = 100, bootstrap = False, n_jobs = -1),
                'IForest_200':
                    IForest(contamination=contamination, random_state=random_state, 
                            n_estimators = 200, bootstrap = False, n_jobs = -1),                
                'IForest_bootstrap':
                    IForest(contamination = contamination, random_state=random_state, 
                            n_estimators = 150, bootstrap = True, n_jobs = -1), 
                #'MCD': 
                #    MCD(contamination=contamination, random_state=random_state, assume_centered = False),
                #'MCD_centered': 
                #    MCD(contamination=contamination, random_state=random_state, assume_centered = True),    
                'CBLOF_16':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 16),
                'CBLOF_24':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 24),
                'CBLOF_32':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 32)
            }
            
            quick_scores = np.zeros([n_rows, len(quick_classifiers)])

            for i, (clf_name, clf) in enumerate(quick_classifiers.items()):
                log.info(f'{i+1} - fitting {clf_name}')
                try:
                    clf.fit(X)
                    quick_scores[:, i] = clf.decision_scores_
                except Exception:
                    log.info(traceback.format_exc())
                else:
                    log.info(f'Base detector {i+1}/{len(quick_classifiers)} is fitted for prediction')

            quick_scores = np.nan_to_num(quick_scores)
            
        if slow_methods:
            # initialize a set of detectors for LSCP
            detector_list = [LOF(n_neighbors=10), LOF(n_neighbors=15), LOF(n_neighbors=20)]
            slow_classifiers = {               
                #'Angle-based Outlier Detector (ABOD)': #too slow and nan results
                #   ABOD(contamination=contamination),
                #'One-class SVM (OCSVM)':
                #   OCSVM(contamination=contamination, cache_size = 2000, shrinking = False, tol = 1e-2),   
                #'LSCP': #slow and no parallel
                #   LSCP(detector_list, contamination=contamination, random_state=random_state, local_region_size = 30),
                #'Feature Bagging': #ensemble #no real par
                #   FeatureBagging(LOF(n_neighbors=20), contamination=contamination, 
                #                  random_state=random_state, n_jobs = -1),                
                #'SOS' : # too memory inefficient  
                #    SOS(contamination=contamination),
                #'COF': # memory inefficient
                #   COF(contamination=contamination),                  
                #'SOD':
                #    SOD(contamination = contamination),
                #'KNN': 
                #   KNN(contamination=contamination, n_jobs = -1),
                #'KNN_50': 
                #   KNN(contamination=contamination, leaf_size = 50, n_jobs = -1),
                #'KNN_70': 
                #   KNN(contamination=contamination, leaf_size = 70, n_jobs = -1),

                'LOF_4':
                   LOF(n_neighbors=4, contamination=contamination, n_jobs = -1),
                'LOF_5':
                   LOF(n_neighbors=5, contamination=contamination, n_jobs = -1),                
                'LOF_6':
                   LOF(n_neighbors=6, contamination=contamination, n_jobs = -1),
                'LOF_7':
                   LOF(n_neighbors=7, contamination=contamination, n_jobs = -1),                
                'LOF_8':
                   LOF(n_neighbors=8, contamination=contamination, n_jobs = -1),
                'LOF_9':
                   LOF(n_neighbors=9, contamination=contamination, n_jobs = -1),                
                'LOF_10':
                   LOF(n_neighbors=10, contamination=contamination, n_jobs = -1),
                'LOF_12':
                   LOF(n_neighbors=12, contamination=contamination, n_jobs = -1),  
                'LOF_14':
                   LOF(n_neighbors=14, contamination=contamination, n_jobs = -1),
                'LOF_16':
                   LOF(n_neighbors=16, contamination=contamination, n_jobs = -1),
                'LOF_18':
                   LOF(n_neighbors=18, contamination=contamination, n_jobs = -1),
                'LOF_20':
                   LOF(n_neighbors=20, contamination=contamination, n_jobs = -1), 
                'LOF_22':
                   LOF(n_neighbors=22, contamination=contamination, n_jobs = -1)            
            }
            
            slow_scores = np.zeros([n_rows, len(slow_classifiers)])

            for i, (clf_name, clf) in enumerate(slow_classifiers.items()):
                log.info(f'{i+1} - fitting {clf_name}')
                try:
                    clf.fit(X)
                    slow_scores[:, i] = clf.decision_scores_
                except Exception:
                    log.info(traceback.format_exc())
                else:
                    log.info(f'Base detector {i+1}/{len(slow_classifiers)} is fitted for prediction')
            
            slow_scores = np.nan_to_num(slow_scores)
        
        if nn_methods:
            
            nn_classifiers = {}
            # candidate layer widths, largest first; start from the widest
            # layer that is still smaller than the number of features
            n_list = [1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
            n_idx = next(x[0] for x in enumerate(n_list) if x[1] < n_features)
            # build symmetric encoder/decoder stacks of 2, 3 and 4 layers
            for i in range(3, 6):
                n_enc = n_list[n_idx:n_idx + i - 1]
                n_dec = n_enc[::-1]
                n_enc_dec = n_enc + n_dec
                nn_classifiers[f'FULL_AE_{len(n_enc_dec)}'] = {'clf': self.full_autoencoder,
                                                               'hidden_layers': n_enc_dec}
                nn_classifiers[f'VAE_{len(n_enc_dec)}'] = {'clf': VAE(contamination = contamination, random_state = random_state,
                                                                      encoder_neurons = n_enc, decoder_neurons = n_dec,
                                                                      preprocessing = False, epochs = 32, verbosity = verbose),
                                                           'hidden_layers': n_enc_dec}
                
            
            nn_scores = np.zeros([n_rows, len(nn_classifiers)])
            
            for i, (clf_name, clf) in enumerate(nn_classifiers.items()):
                log.info(f'''{i+1} - fitting {clf_name} with layers {clf['hidden_layers']}''')
                try:
                    if clf['clf'] == self.full_autoencoder:
                        nn_scores[:, i] = clf['clf'](X, neurons_list = clf['hidden_layers'], verbose = verbose)
                    else:
                        clf['clf'].fit(X)
                        nn_scores[:, i] = clf['clf'].decision_scores_                        
                except Exception:
                    log.info(traceback.format_exc())
                else:
                    log.info(f'Base detector {i+1}/{len(nn_classifiers)} is fitted for prediction')

            nn_scores = np.nan_to_num(nn_scores)

            
        all_scores = np.concatenate((quick_scores, slow_scores, nn_scores), axis=1)
        # drop score columns that stayed all-zero (e.g. detectors that failed to fit)
        all_scores = all_scores[:, ~np.all(all_scores == 0, axis=0)]
        log.info(f'total scores = {all_scores.shape[1]}')
        
        all_scores_norm = np.copy(all_scores)
        if use_score_rank:
            all_scores_norm = np.apply_along_axis(rank_fun, 0, all_scores_norm)
            log.info('score rank applied')
        all_scores_norm = preprocessing.MinMaxScaler().fit_transform(all_scores_norm)
        
        # with at least 12 detectors, combine via dynamic-bucket AOM/MOA
        # (roughly a quarter as many buckets as detectors); otherwise fall
        # back to plain average and maximum
        if all_scores_norm.shape[1] >= 12:
            score_by_aom = aom(all_scores_norm, method = 'dynamic', n_buckets = round(all_scores_norm.shape[1]/4))
            score_by_moa = moa(all_scores_norm, method = 'dynamic', n_buckets = round(all_scores_norm.shape[1]/4))
            score_by_avg = np.mean(all_scores_norm, axis = 1)
            score_by_max = np.max(all_scores_norm, axis = 1)
        else:
            score_by_avg = np.mean(all_scores_norm, axis = 1)
            score_by_max = np.max(all_scores_norm, axis = 1)
            score_by_aom = score_by_avg
            score_by_moa = score_by_max
        return score_by_aom, score_by_moa, score_by_max, score_by_avg, all_scores, all_scores_norm
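A usage sketch for this `fit`; the class name `OutlierEnsemble` is a hypothetical stand-in, since the excerpt shows only the method:

# hypothetical driver: OutlierEnsemble stands in for the class that
# owns this fit method
import numpy as np

X = np.random.rand(500, 20)          # 500 samples, 20 features
ens = OutlierEnsemble()
score_aom, score_moa, score_max, score_avg, raw, norm = ens.fit(
    X, quick_methods=True, slow_methods=False, nn_methods=False,
    contamination=0.05, random_state=42)
top10 = np.argsort(score_aom)[-10:]  # indices of the 10 most anomalous rows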