Example #1
def main():

    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    Parallel(n_jobs=CPUS)(
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=sklearn_models,
                             CPUS=CPUS_Models,
                             save_name=name)
        for scaler in scalers)
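For reference, the joblib call above dispatches one runByScaler job per scaler. A minimal, self-contained sketch of the same Parallel/delayed dispatch pattern, with a toy worker standing in for runByScaler (the worker is a placeholder, not part of the original project):

from joblib import Parallel, delayed

def worker(scaler):
    # toy stand-in for runByScaler: just tags the scaler name
    return 'done-' + scaler

results = Parallel(n_jobs=3)(delayed(worker)(s) for s in ['no', 'std', 'minmax'])
print(results)  # ['done-no', 'done-std', 'done-minmax']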
Example #2
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LODA(contamination=self.contamination)
        self.clf.fit(self.X_train)
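The fixture above follows the pyod test-suite layout. A hedged sketch of a follow-up assertion that could live in the same TestCase (the method name and check are assumptions, not part of the original listing):

    def test_train_scores(self):
        from sklearn.metrics import roc_auc_score
        # decision_scores_ holds the raw outlier scores for X_train after fit()
        assert roc_auc_score(self.y_train, self.clf.decision_scores_) >= self.roc_floor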
Example #3
 def model_init(self, model):
     """Model initialisation of a single model.
     """
     if self.model == 'pca':
         self.models[model] = PCA(contamination=self.contamination)
     elif self.model == 'loda':
         self.models[model] = LODA(contamination=self.contamination)
     elif self.model == 'iforest':
         self.models[model] = IForest(n_estimators=50,
                                      bootstrap=True,
                                      behaviour='new',
                                      contamination=self.contamination)
     elif self.model == 'cblof':
         self.models[model] = CBLOF(n_clusters=3,
                                    contamination=self.contamination)
     elif self.model == 'feature_bagging':
         self.models[model] = FeatureBagging(
             base_estimator=PCA(contamination=self.contamination),
             contamination=self.contamination)
     elif self.model == 'copod':
         self.models[model] = COPOD(contamination=self.contamination)
     elif self.model == 'hbos':
         self.models[model] = HBOS(contamination=self.contamination)
     else:
         self.models[model] = HBOS(contamination=self.contamination)
     self.custom_model_scalers[model] = MinMaxScaler()
Example #4
 def models_init(self):
     """Models initialisation.
     """
     self.model = self.configuration.get('model', 'pca')
     if self.model == 'pca':
         self.models = {
             model: PCA(contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'loda':
         self.models = {
             model: LODA(contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'iforest':
         self.models = {
             model: IForest(n_estimators=50,
                            bootstrap=True,
                            behaviour='new',
                            contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'cblof':
         self.models = {
             model: CBLOF(n_clusters=3, contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'feature_bagging':
         self.models = {
             model: FeatureBagging(
                 base_estimator=PCA(contamination=self.contamination),
                 contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'copod':
         self.models = {
             model: COPOD(contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'hbos':
         self.models = {
             model: HBOS(contamination=self.contamination)
             for model in self.models_in_scope
         }
     else:
         self.models = {
             model: HBOS(contamination=self.contamination)
             for model in self.models_in_scope
         }
     self.custom_model_scalers = {
         model: MinMaxScaler()
         for model in self.models_in_scope
     }
Example #5
    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = LODA(contamination=hyperparams['contamination'],
                         n_bins=hyperparams['n_bins'],
                         n_random_cuts=hyperparams['n_random_cuts'])

        return
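Outside the D3M primitive wrapper, the same detector can be built directly from pyod. A minimal sketch with illustrative values (n_bins=10 and n_random_cuts=100 are pyod's documented defaults; the contamination value and toy data are arbitrary):

import numpy as np
from pyod.models.loda import LODA

X_train = np.random.RandomState(0).rand(200, 5)
clf = LODA(contamination=0.1, n_bins=10, n_random_cuts=100)
clf.fit(X_train)
scores = clf.decision_scores_   # raw outlier scores on the training data
labels = clf.labels_            # 0 = inlier, 1 = outlier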
Example #6
def train(doc_list, dataset_name, clf_name):
    model_roc = []
    model_prc = []
    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    for i in range(10):
        data = pd.read_csv(doc_list[i], header=0, index_col=0)
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])
        clf.fit(train_x)
        predict = clf.decision_scores_
        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)
        if ((i + 1) % 200 == 0):
            print("第" + str(i + 1) + "个文件结果:")
            evaluate_print(clf_name, train_y, predict)
        model_roc.append(roc)
        model_prc.append(prc)
    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("模型" + clf_name + "在数据集" + dataset_name + "的平均roc_auc为" +
          str(round(model_roc_avg, 4)) + ",平均prc为" +
          str(round(model_prc_avg, 4)) + "。")

    return model_roc_avg, model_prc_avg
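The evaluation helpers used above (evaluate_print, precision_n_scores) ship with pyod. A self-contained sketch of the same scoring flow on synthetic data (synthetic data only, not the CSV files the function expects):

from pyod.models.loda import LODA
from pyod.utils.data import generate_data, evaluate_print

X, y = generate_data(n_train=200, train_only=True, contamination=0.1, random_state=42)
clf = LODA()
clf.fit(X)
evaluate_print('LODA', y, clf.decision_scores_)   # prints ROC and precision @ rank n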
Example #7
    def initialise_pyod_classifiers(self, outlier_fraction):
        #Testing every query to every class and then predicting only if it belongs to the same class
        classifiers = {}
        #Proximity based
        classifiers['K Nearest Neighbors (KNN)'] = []
        classifiers['Average K Nearest Neighbors (AvgKNN)'] = []
        classifiers['Median K Nearest Neighbors (MedKNN)'] = []
        classifiers['Local Outlier Factor (LOF)'] = []
        classifiers['Connectivity-Based Outlier Factor (COF)'] = []
        #classifiers['Clustering-Based Local Outlier Factor (CBLOF)'] = []
        classifiers['LOCI'] = []
        #classifiers['Histogram-based Outlier Score (HBOS)'] = []
        classifiers['Subspace Outlier Detection (SOD)'] = []
        #Linear models
        classifiers['Principal Component Analysis (PCA)'] = []
        #classifiers['Minimum Covariance Determinant (MCD)'] = []           # Too slow
        classifiers['One-Class Support Vector Machines (OCSVM)'] = []
        classifiers['Deviation-based Outlier Detection (LMDD)'] = []
        #Probabilistic
        classifiers['Angle-Based Outlier Detection (ABOD)'] = []
        classifiers['Stochastic Outlier Selection (SOS)'] = []
        #Outlier Ensembles
        classifiers['Isolation Forest (IForest)'] = []
        classifiers['Feature Bagging'] = []
        classifiers['Lightweight On-line Detector of Anomalies (LODA)'] = []

        for i in range(self.k_way):
            for j in range(self.k_way):
                classifiers['K Nearest Neighbors (KNN)'].append(
                    KNN(method='largest',
                        n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Average K Nearest Neighbors (AvgKNN)'].append(
                    KNN(method='mean',
                        n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Median K Nearest Neighbors (MedKNN)'].append(
                    KNN(method='median',
                        n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Local Outlier Factor (LOF)'].append(
                    LOF(n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Connectivity-Based Outlier Factor (COF)'].append(
                    COF(n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['LOCI'].append(
                    LOCI(contamination=outlier_fraction))
                classifiers['Subspace Outlier Detection (SOD)'].append(
                    SOD(n_neighbors=int(self.n_shot / 3) + 2,
                        contamination=outlier_fraction,
                        ref_set=max(2, int((int(self.n_shot / 3) + 2) / 3))))
                classifiers['Principal Component Analysis (PCA)'].append(
                    PCA(contamination=outlier_fraction))
                classifiers[
                    'One-Class Support Vector Machines (OCSVM)'].append(
                        OCSVM(contamination=outlier_fraction))
                classifiers['Deviation-based Outlier Detection (LMDD)'].append(
                    LMDD(contamination=outlier_fraction))
                classifiers['Angle-Based Outlier Detection (ABOD)'].append(
                    ABOD(contamination=outlier_fraction))
                classifiers['Stochastic Outlier Selection (SOS)'].append(
                    SOS(contamination=outlier_fraction))
                classifiers['Isolation Forest (IForest)'].append(
                    IForest(contamination=outlier_fraction))
                classifiers['Feature Bagging'].append(
                    FeatureBagging(contamination=outlier_fraction))
                classifiers[
                    'Lightweight On-line Detector of Anomalies (LODA)'].append(
                        LODA(contamination=outlier_fraction))
        self.num_different_models = len(classifiers)
        return classifiers
Example #8
     'Angle-based Outlier Detector (ABOD)':
     ABOD(contamination=outliers_fraction),
     'Cluster-based Local Outlier Factor':
     CBLOF(n_clusters=10,
           contamination=outliers_fraction,
           check_estimator=False,
           random_state=random_state),
     'Histogram-base Outlier Detection (HBOS)':
     HBOS(contamination=outliers_fraction),
     'Isolation Forest':
     IForest(contamination=outliers_fraction,
             random_state=random_state),
     'K Nearest Neighbors (KNN)':
     KNN(contamination=outliers_fraction),
     'Lightweight on-line detector of anomalies (LODA)':
     LODA(contamination=outliers_fraction),
     'Local Outlier Factor (LOF)':
     LOF(contamination=outliers_fraction),
     'One-class SVM (OCSVM)':
     OCSVM(contamination=outliers_fraction),
     'Principal Component Analysis (PCA)':
     PCA(contamination=outliers_fraction, random_state=random_state),
     'COD':
     COD(contamination=outliers_fraction)
 }
 classifiers_indices = {
     'Angle-based Outlier Detector (ABOD)': 0,
     'Cluster-based Local Outlier Factor': 1,
     'Histogram-base Outlier Detection (HBOS)': 2,
     'Isolation Forest': 3,
     'K Nearest Neighbors (KNN)': 4,
Example #9
                      n_features=3,
                      contamination=contamination,
                      random_state=42)
    # load pretrained models
    prepare_trained_model()

    # recommended models
    selected_models = select_model(X_train, n_selection=100)

    print("Showing the top recommended models...")
    for i, model in enumerate(selected_models):
        print(i, model)

    print()

    model_1 = LODA(n_bins=5, n_random_cuts=100)
    print(
        "1st model Average Precision",
        average_precision_score(y_train,
                                model_1.fit(X_train).decision_scores_))

    model_10 = LODA(n_bins=5, n_random_cuts=20)
    print(
        "10th model Average Precision",
        average_precision_score(y_train,
                                model_10.fit(X_train).decision_scores_))

    model_50 = OCSVM(kernel='sigmoid', nu=0.6)
    print(
        "50th model Average Precision",
        average_precision_score(y_train,
Example #10
    def execute(self):
        evaluation_results = []

        print("Loading training data...")
        data = pd.DataFrame()

        for i, chunk in enumerate(
                pd.read_csv(self.input_file,
                            header=None,
                            chunksize=self.chunk_size)):
            print("Reading chunk: %d" % (i + 1))
            #print(chunk)
            data = data.append(chunk)

        input_dimensionality = len(data.columns) - 1
        print("Input Dimensionality: %d" % (input_dimensionality))

        positive_data = data[data[len(data.columns) -
                                  1] == 1].iloc[:, :len(data.columns) - 1]
        negative_data = data[data[len(data.columns) -
                                  1] == -1].iloc[:, :len(data.columns) - 1]

        training_data = positive_data.sample(frac=0.70)
        positive_validation_data = positive_data.drop(training_data.index)

        if self.neg_cont and self.neg_cont > 0:
            print("Negative Contamination: %0.4f" % (self.neg_cont))
            num_negative = math.floor(
                self.neg_cont *
                (len(negative_data) + len(positive_validation_data)))
            negative_data = data.sample(frac=1, random_state=200)[
                data[len(data.columns) -
                     1] == -1].iloc[:num_negative, :len(data.columns) - 1]

        negative_validation_data = negative_data.copy()

        temp_positive = positive_validation_data.copy()
        temp_positive[input_dimensionality] = 1

        temp_negative = negative_data.copy()
        temp_negative[input_dimensionality] = -1

        validation_data_with_labels = pd.concat([temp_positive, temp_negative],
                                                ignore_index=True)
        validation_data = validation_data_with_labels.iloc[:, :len(data.columns) - 1]
        validation_labels = validation_data_with_labels.iloc[:, -1:].values

        # Convert to tensor
        positive_data = torch.tensor(positive_data.values).float().to(
            self.device)
        negative_data = torch.tensor(negative_data.values).float().to(
            self.device)
        training_data = torch.tensor(training_data.values).float()
        validation_data = torch.tensor(validation_data.values).float()

        print("Validation Data:")
        print(validation_data)

        ## AE-D TRAINING ##
        print("Initializing autoencoder...")
        net = Autoencoder(layers=self.layers,
                          device=self.device,
                          add_syn=self.add_syn)
        net.to(self.device)

        print(net)

        print("Training Stochastic Autoencoder...")
        net.fit(training_data,
                epochs=self.epochs,
                lr=self.lr,
                batch_size=self.batch_size)

        predictions = net.predict(validation_data)

        tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc = performance_metrics(
            validation_labels, predictions)

        r = ["AE-D", tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc]

        evaluation_results.append(r)

        print("AE-D Results:")
        print(
            tabulate([r], [
                "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV",
                "TS", "PT", "ACC", "F1", "MCC"
            ],
                     tablefmt="grid"))

        # Convert back to CPU before other methods
        validation_data = validation_data.cpu()

        # Train only linear classifiers
        if self.eval_cat == "linear":
            print("Initiating training for linear detectors...")

            ## MCD ##
            print("Training MCD...")
            result = train_and_evaluate_classifier("MCD", MCD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ROBUST COVARIANCE ##
            print("Training Robust Covariance...")
            result = train_and_evaluate_classifier("ROB-COV",
                                                   EllipticEnvelope(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ONE CLASS SVM TRAINING ##
            print("Training OneClassSVM...")
            result = train_and_evaluate_classifier(
                "OC-SVM", svm.OneClassSVM(gamma="auto"), validation_data,
                validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "prob":
            ## ABOD ##
            #print("Training ABOD...")
            #result = train_and_evaluate_classifier("ABOD", ABOD(), validation_data, validation_labels)
            #evaluation_results.append(result)

            ## SOS ##
            #print("Training SOS...")
            #result = train_and_evaluate_classifier("SOS", SOS(), validation_data, validation_labels)
            #evaluation_results.append(result)

            ## COPOD ##
            print("Training COPOD...")
            result = train_and_evaluate_classifier("COPOD", COPOD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "ensemble":
            ## ISOLATION FOREST TRAINING ##
            print("Training Isolation Forest...")
            result = train_and_evaluate_classifier(
                "ISO-F", IsolationForest(random_state=0), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## LODA ##
            print("Training LODA...")
            result = train_and_evaluate_classifier("LODA", LODA(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## LSCP ##
            # print("Training LSCP...")
            # result = train_and_evaluate_classifier("LSCP", LSCP([LOF(), LOF()]), validation_data, validation_labels)
            # evaluation_results.append(result)

        elif self.eval_cat == "proximity":
            ## LOCAL OUTLIER FACTOR ##
            print("Training Local Outlier Factor...")
            result = train_and_evaluate_classifier(
                "LOC-OF", LocalOutlierFactor(novelty=True), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## CBLOF ##
            print("Training CBLOF...")
            result = train_and_evaluate_classifier("CBLOF", CBLOF(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## HBOS ##
            print("Training HBOS...")
            result = train_and_evaluate_classifier("HBOS", HBOS(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "nn":
            ## VAE ##
            print("Training VAE...")
            result = train_and_evaluate_classifier(
                "VAE",
                VAE(encoder_neurons=self.layers,
                    # list.reverse() reverses in place and returns None; pass a reversed copy
                    decoder_neurons=self.layers[::-1]), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## SO_GAAL ##
            print("Training SO_GAAL...")
            result = train_and_evaluate_classifier(
                "SO_GAAL", SO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

            ## MO_GAAL ##
            print("Training MO_GAAL...")
            result = train_and_evaluate_classifier(
                "MO_GAAL", MO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

        ## EVALUATE RESULTS ##
        if self.eval_cat != "none":
            print("Aggregated Results:")
            print(
                tabulate(evaluation_results, [
                    "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV",
                    "TS", "PT", "ACC", "F1", "MCC"
                ],
                         tablefmt="grid"))

        ## DATASET METRICS ##
        len_training_data_points = len(training_data)
        len_positive_validations = len(positive_validation_data)
        len_negative_validations = len(negative_validation_data)
        len_validations = len_positive_validations + len_negative_validations

        metrics_results = [
            ["Training Data Points", len_training_data_points],
            ["# Normal Points", len_positive_validations],
            ["# Anomalies", len_negative_validations],
            [
                "Contamination Percentage",
                math.floor((len_negative_validations / len_validations) * 100)
            ]
        ]

        ## EVALUATE RESULTS ##
        print(tabulate(metrics_results, ["Metric", "Value"], tablefmt="grid"))

        if self.printout:
            print("Saving results to %s" % (self.printout))
            df = pd.DataFrame(evaluation_results)
            df.to_csv(self.printout, header=None, index=False)
Example #11
def generate_meta_features(X):
    """Get the meta-features of a datasets X

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        Input array

    Returns
    -------
    meta_features : numpy array of shape (1, 200)
        Meta-feature in dimension of 200

    """
    # outliers_fraction = np.count_nonzero(y) / len(y)
    # outliers_percentage = round(outliers_fraction * 100, ndigits=4)
    X = check_array(X)

    meta_vec = []
    meta_vec_names = []

    # on the sample level
    n_samples, n_features = X.shape[0], X.shape[1]

    meta_vec.append(n_samples)
    meta_vec.append(n_features)

    meta_vec_names.append('n_samples')
    meta_vec_names.append('n_features')

    sample_mean = np.mean(X)
    sample_median = np.median(X)
    sample_var = np.var(X)
    sample_min = np.min(X)
    sample_max = np.max(X)
    sample_std = np.std(X)

    # np.percentile expects values on a 0-100 scale
    q1, q25, q75, q99 = np.percentile(X, [1, 25, 75, 99])
    iqr = q75 - q25

    normalized_mean = sample_mean / sample_max
    normalized_median = sample_median / sample_max
    sample_range = sample_max - sample_min
    sample_gini = gini(X)
    med_abs_dev = np.median(np.absolute(X - sample_median))
    avg_abs_dev = np.mean(np.absolute(X - sample_mean))
    quant_coeff_disp = (q75 - q25) / (q75 + q25)
    coeff_var = sample_var / sample_mean

    outliers_15iqr = np.logical_or(X < (q25 - 1.5 * iqr), X >
                                   (q75 + 1.5 * iqr))
    outliers_3iqr = np.logical_or(X < (q25 - 3 * iqr), X > (q75 + 3 * iqr))
    outliers_1_99 = np.logical_or(X < q1, X > q99)
    outliers_3std = np.logical_or(X < (sample_mean - 3 * sample_std), X >
                                  (sample_mean + 3 * sample_std))

    percent_outliers_15iqr = np.sum(outliers_15iqr) / len(X)
    percent_outliers_3iqr = np.sum(outliers_3iqr) / len(X)
    percent_outliers_1_99 = np.sum(outliers_1_99) / len(X)
    percent_outliers_3std = np.sum(outliers_3std) / len(X)

    has_outliers_15iqr = np.any(outliers_15iqr).astype(int)
    has_outliers_3iqr = np.any(outliers_3iqr).astype(int)
    has_outliers_1_99 = np.any(outliers_1_99).astype(int)
    has_outliers_3std = np.any(outliers_3std).astype(int)

    meta_vec.extend([
        sample_mean,
        sample_median,
        sample_var,
        sample_min,
        sample_max,
        sample_std,
        q1,
        q25,
        q75,
        q99,
        iqr,
        normalized_mean,
        normalized_median,
        sample_range,
        sample_gini,
        med_abs_dev,
        avg_abs_dev,
        quant_coeff_disp,
        coeff_var,
        # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10,
        percent_outliers_15iqr,
        percent_outliers_3iqr,
        percent_outliers_1_99,
        percent_outliers_3std,
        has_outliers_15iqr,
        has_outliers_3iqr,
        has_outliers_1_99,
        has_outliers_3std
    ])

    meta_vec_names.extend([
        'sample_mean',
        'sample_median',
        'sample_var',
        'sample_min',
        'sample_max',
        'sample_std',
        'q1',
        'q25',
        'q75',
        'q99',
        'iqr',
        'normalized_mean',
        'normalized_median',
        'sample_range',
        'sample_gini',
        'med_abs_dev',
        'avg_abs_dev',
        'quant_coeff_disp',
        'coeff_var',
        # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10,
        'percent_outliers_15iqr',
        'percent_outliers_3iqr',
        'percent_outliers_1_99',
        'percent_outliers_3std',
        'has_outliers_15iqr',
        'has_outliers_3iqr',
        'has_outliers_1_99',
        'has_outliers_3std'
    ])

    ###########################################################################

    normality_k2, normality_p = normaltest(X)
    is_normal_5 = (normality_p < 0.05).astype(int)
    is_normal_1 = (normality_p < 0.01).astype(int)

    meta_vec.extend(list_process(normality_p))
    meta_vec.extend(list_process(is_normal_5))
    meta_vec.extend(list_process(is_normal_1))

    meta_vec_names.extend(list_process_name('normality_p'))
    meta_vec_names.extend(list_process_name('is_normal_5'))
    meta_vec_names.extend(list_process_name('is_normal_1'))

    moment_5 = moment(X, moment=5)
    moment_6 = moment(X, moment=6)
    moment_7 = moment(X, moment=7)
    moment_8 = moment(X, moment=8)
    moment_9 = moment(X, moment=9)
    moment_10 = moment(X, moment=10)
    meta_vec.extend(list_process(moment_5))
    meta_vec.extend(list_process(moment_6))
    meta_vec.extend(list_process(moment_7))
    meta_vec.extend(list_process(moment_8))
    meta_vec.extend(list_process(moment_9))
    meta_vec.extend(list_process(moment_10))
    meta_vec_names.extend(list_process_name('moment_5'))
    meta_vec_names.extend(list_process_name('moment_6'))
    meta_vec_names.extend(list_process_name('moment_7'))
    meta_vec_names.extend(list_process_name('moment_8'))
    meta_vec_names.extend(list_process_name('moment_9'))
    meta_vec_names.extend(list_process_name('moment_10'))

    # note: this is for each dimension == the number of dimensions
    skewness_list = skew(X).reshape(-1, 1)
    skew_values = list_process(skewness_list)
    meta_vec.extend(skew_values)
    meta_vec_names.extend(list_process_name('skewness'))

    # note: this is for each dimension == the number of dimensions
    kurtosis_list = kurtosis(X)
    kurtosis_values = list_process(kurtosis_list)
    meta_vec.extend(kurtosis_values)
    meta_vec_names.extend(list_process_name('kurtosis'))

    correlation = np.nan_to_num(pd.DataFrame(X).corr(), nan=0)
    correlation_list = flatten_diagonally(correlation)[0:int(
        (n_features * n_features - n_features) / 2)]
    correlation_values = list_process(correlation_list)
    meta_vec.extend(correlation_values)
    meta_vec_names.extend(list_process_name('correlation'))

    covariance = np.cov(X.T)
    covariance_list = flatten_diagonally(covariance)[0:int(
        (n_features * n_features - n_features) / 2)]
    covariance_values = list_process(covariance_list)
    meta_vec.extend(covariance_values)
    meta_vec_names.extend(list_process_name('covariance'))

    # sparsity
    rep_counts = []
    for i in range(n_features):
        rep_counts.append(len(np.unique(X[:, i])))
    sparsity_list = np.asarray(rep_counts) / (n_samples)
    sparsity = list_process(sparsity_list)
    meta_vec.extend(sparsity)
    meta_vec_names.extend(list_process_name('sparsity'))

    # ANOVA p value
    p_values_list = []
    all_perm = list(itertools.combinations(list(range(n_features)), 2))
    for j in all_perm:
        p_values_list.append(f_oneway(X[:, j[0]], X[:, j[1]])[1])
    anova_p_value = list_process(np.asarray(p_values_list))
    # anova_p_value = np.mean(p_values_list)
    # anova_p_value_exceed_thresh = np.mean((np.asarray(p_values_list)<0.05).astype(int))
    meta_vec.extend(anova_p_value)
    meta_vec_names.extend(list_process_name('anova_p_value'))

    # pca
    pca_transformer = sklearn_PCA(n_components=3)
    X_transform = pca_transformer.fit_transform(X)

    # first pc: statistics of all samples projected onto the first principal component
    pca_fpc = list_process(X_transform[:, 0],
                           r_min=False,
                           r_max=False,
                           r_mean=False,
                           r_std=True,
                           r_skew=True,
                           r_kurtosis=True)
    meta_vec.extend(pca_fpc)
    meta_vec_names.extend(
        ['first_pca_std', 'first_pca_skewness', 'first_pca_kurtosis'])

    # entropy
    entropy_list = []
    for i in range(n_features):
        counts = pd.Series(X[:, i]).value_counts()
        entropy_list.append(entropy(counts) / n_samples)
    entropy_values = list_process(entropy_list)
    meta_vec.extend(entropy_values)
    meta_vec_names.extend(list_process_name('entropy'))

    ##############################Landmarkers######################################
    # HBOS
    clf = HBOS(n_bins=10)
    clf.fit(X)
    HBOS_hists = clf.hist_
    HBOS_mean = np.mean(HBOS_hists, axis=0)
    HBOS_max = np.max(HBOS_hists, axis=0)
    HBOS_min = np.min(HBOS_hists, axis=0)
    meta_vec.extend(list_process(HBOS_mean))
    meta_vec.extend(list_process(HBOS_max))
    meta_vec.extend(list_process(HBOS_min))
    meta_vec_names.extend(list_process_name('HBOS_mean'))
    meta_vec_names.extend(list_process_name('HBOS_max'))
    meta_vec_names.extend(list_process_name('HBOS_min'))

    # IForest
    n_estimators = 100
    clf = IForest(n_estimators=n_estimators)
    clf.fit(X)

    n_leaves = []
    n_depth = []
    fi_mean = []
    fi_max = []

    # doing this for each sub-trees
    for i in range(n_estimators):
        n_leaves.append(clf.estimators_[i].get_n_leaves())
        n_depth.append(clf.estimators_[i].get_depth())
        fi_mean.append(clf.estimators_[i].feature_importances_.mean())
        fi_max.append(clf.estimators_[i].feature_importances_.max())
        # print(clf.estimators_[i].tree_)

    meta_vec.extend(list_process(n_leaves))
    meta_vec.extend(list_process(n_depth))
    meta_vec.extend(list_process(fi_mean))
    meta_vec.extend(list_process(fi_max))

    meta_vec_names.extend(list_process_name('IForest_n_leaves'))
    meta_vec_names.extend(list_process_name('IForest_n_depth'))
    meta_vec_names.extend(list_process_name('IForest_fi_mean'))
    meta_vec_names.extend(list_process_name('IForest_fi_max'))

    # PCA
    clf = PCA(n_components=3)
    clf.fit(X)
    meta_vec.extend(clf.explained_variance_ratio_)
    meta_vec.extend(clf.singular_values_)
    meta_vec_names.extend(
        ['pca_expl_ratio_1', 'pca_expl_ratio_2', 'pca_expl_ratio_3'])
    meta_vec_names.extend(['pca_sv_1', 'pca_sv_2', 'pca_sv_3'])

    # LODA
    n_bins = 10
    n_random_cuts = 100

    n_hists_mean = []
    n_hists_max = []

    n_cuts_mean = []
    n_cuts_max = []

    clf = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts)
    clf.fit(X)

    for i in range(n_bins):
        n_hists_mean.append(clf.histograms_[:, i].mean())
        n_hists_max.append(clf.histograms_[:, i].max())
    for i in range(n_random_cuts):
        n_cuts_mean.append(clf.histograms_[i, :].mean())
        n_cuts_max.append(clf.histograms_[i, :].max())

    meta_vec.extend(list_process(n_hists_mean))
    meta_vec.extend(list_process(n_hists_max))
    meta_vec.extend(list_process(n_cuts_mean))
    meta_vec.extend(list_process(n_cuts_max))

    meta_vec_names.extend(list_process_name('LODA_n_hists_mean'))
    meta_vec_names.extend(list_process_name('LODA_n_hists_max'))
    meta_vec_names.extend(list_process_name('LODA_n_cuts_mean'))
    meta_vec_names.extend(list_process_name('LODA_n_cuts_max'))

    return meta_vec, meta_vec_names
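A brief usage sketch of generate_meta_features, assuming the module's own helpers (gini, list_process, list_process_name, flatten_diagonally) are importable alongside it; the input is random toy data:

import numpy as np

X_demo = np.random.RandomState(0).rand(500, 6)
meta_vec, meta_vec_names = generate_meta_features(X_demo)
print(len(meta_vec), len(meta_vec_names))   # one name per meta-feature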
Example #12
    'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal', 'VAE',
    'AutoEncoder'
]

models = {
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
    'EE': EllipticEnvelope(),
    'AvgKNN': KNN(method='mean'),
    'LargestKNN': KNN(method='largest'),
    'MedKNN': KNN(method='median'),
    'PCA': PCA(),
    'COF': COF(),
    'LODA': LODA(),
    'LOF': LOF(),
    'HBOS': HBOS(),
    'MCD': MCD(),
    'AvgBagging': FeatureBagging(combination='average'),
    'MaxBagging': FeatureBagging(combination='max'),
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
    'IqrLMDD': LMDD(dis_measure='iqr'),
Example #13
def main():

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can manually add a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/minmax/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with standard scaling'
        ])
Example #14
    y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_scores_  # raw outlier scores
    evaluation(y, y_scores, clf_name)
    all_scores['LOF'] = y_scores

    clf_name = 'PCA'
    clf = PCA(contamination=contam)
    x_train = standardizer(x_train)
    clf.fit(x_train)
    y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_scores_  # raw outlier scores
    evaluation(y, y_scores, clf_name)
    all_scores['PCA'] = y_scores

    clf_name = 'LODA'
    clf = LODA(contamination=contam)
    x_train = standardizer(x_train)
    clf.fit(x_train)
    y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_scores_  # raw outlier scores
    evaluation(y, y_scores, clf_name)
    all_scores['LODA'] = y_scores

    pca = PCA(n_components=2)
    kpca = KernelPCA(n_components=2, kernel="poly")
    x_train_pca = kpca.fit_transform(x_train)
    clf = KNN(n_neighbors=5, contamination=contam)
    x_train_pca = standardizer(x_train_pca)
    clf.fit(x_train_pca)
    y_pred_pca = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_scores_  # raw outlier scores
Example #15
begin = "2020-02-13"
end = "2020-02-15"

test_date = "2020-02-16"

KNN_clf = KNN(contamination=0.05)
PCA_clf = PCA(contamination=0.05)
VAE_clf = VAE(contamination=0.05, epochs=30, encoder_neurons=[9, 4], decoder_neurons=[4, 9])
LOF_clf = LOF(contamination=0.05)
IForest_clf = IForest(contamination=0.05)
AutoEncoder_clf = AutoEncoder(contamination=0.05, epochs=30, hidden_neurons=[9, 4, 4, 9])
FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False)
ABOD_clf = ABOD(contamination=0.05)
HBOS_clf = HBOS(contamination=0.05)
CBLOF_clf = CBLOF(contamination=0.05)
LODA_clf = LODA(contamination=0.05)
MCD_clf = MCD(contamination=0.05)
MO_GAAL_clf = MO_GAAL(k=3, stop_epochs=2, contamination=0.05)
SO_GAAL_clf = SO_GAAL(contamination=0.05)
KNN_MAH_clf = None

S_models = ["KNN", "LOF", "PCA", "IForest", "HBOS", "LODA", "MCD", "CBLOF", "FeatureBagging", "ABOD", "KNN_MAH"]
K_models = ["AutoEncoder", "SO_GAAL", "VAE"]

def get_train_data():
    """
    获取训练样本
    :return:    x_train 9特征训练样本
                df 原训练数据
    """
    acc_date = pd.date_range(begin, end, freq='1D')
Example #16
    else:
        # grogger
        df['arr86'] = (df['narr86'] >= 1).astype(int)
        Y = df['arr86']
        X = df[[
            'pcnv', 'avgsen', 'tottime', 'ptime86', 'inc86', 'black', 'hispan',
            'born60'
        ]]

    print(i, X.shape, Y.shape)

    if OD_Flag:

        # clf = HBOS(contamination=0.05)
        # clf = IForest(contamination=0.05)
        clf = LODA(contamination=0.05)
        clf.fit(X)

        # remove outliers
        X = X.loc[np.where(clf.labels_ == 0)]
        Y = Y.loc[np.where(clf.labels_ == 0)]

    X = sm.add_constant(X)

    # general OLS
    # https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.html
    # model=sm.OLS(Y, X.astype(float))

    # robust regression
    # https://www.statsmodels.org/stable/generated/statsmodels.robust.robust_linear_model.RLM.html
    # model=sm.RLM(Y, X.astype(float))
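The listing stops before the regression is actually fitted. A hedged sketch of the fit the comments point to (standard statsmodels calls, not the original continuation):

import statsmodels.api as sm

model = sm.OLS(Y, X.astype(float))      # or sm.RLM(...) for the robust variant noted above
results = model.fit()
print(results.summary())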
Example #17
        # standardizing data for processing
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
            contamination=outliers_fraction),
            'Cluster-based Local Outlier Factor': CBLOF(
                n_clusters=10,
                contamination=outliers_fraction,
                check_estimator=False,
                random_state=random_state),
            'Histogram-base Outlier Detection (HBOS)': HBOS(
                contamination=outliers_fraction),
            'Isolation Forest': IForest(contamination=outliers_fraction,
                                        random_state=random_state),
            'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
            'Lightweight on-line detector of anomalies (LODA)': LODA(contamination=outliers_fraction),
            'Local Outlier Factor (LOF)': LOF(
                contamination=outliers_fraction),
            'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
            'Principal Component Analysis (PCA)': PCA(
                contamination=outliers_fraction, random_state=random_state),
            'COD': COD(contamination=outliers_fraction)
        }
        classifiers_indices = {
            'Angle-based Outlier Detector (ABOD)': 0,
            'Cluster-based Local Outlier Factor': 1,
            'Histogram-base Outlier Detection (HBOS)': 2,
            'Isolation Forest': 3,
            'K Nearest Neighbors (KNN)': 4,
            'Lightweight on-line detector of anomalies (LODA)': 5,
            'Local Outlier Factor (LOF)': 6,
Example #18
def compare(inputdata, labels, n_clusters, dset_name):
    """
    Compute the AUC, Fgap, Frank score on all conventional outlier detectors for the given dataset
    Args:
        inputdata: input data
        labels: ground truth outlier labels
        n_clusters: number of clusters, for some cluster-based detectors
        dset_name: dataset

    Returns: AUC, Fgap, Frank

    """
    print(
        "Competing with conventional unsupervised outlier detection algorithms..."
    )
    random_state = np.random.RandomState(1)
    if inputdata.shape[1] < 64:
        AEneurons = [16, 8, 8, 16]
        VAEneurons = [16, 8, 4], [4, 8, 16]
    else:
        AEneurons = [64, 32, 32, 64]
        VAEneurons = [128, 64, 32], [32, 64, 128]

    classifiers = {
        'PCA':
        PCA(random_state=random_state),
        'AutoEncoder':
        AutoEncoder(batch_size=100,
                    hidden_neurons=AEneurons,
                    random_state=random_state),
        'VAE':
        VAE(batch_size=100,
            encoder_neurons=VAEneurons[0],
            decoder_neurons=VAEneurons[1],
            random_state=random_state),
        'COPOD':
        COPOD(),
        'Iforest':
        IForest(random_state=random_state),
        # renamed to avoid silently overwriting the tuned 'AutoEncoder'/'VAE' entries above
        'AutoEncoder_default':
        AutoEncoder(batch_size=100, random_state=random_state),
        'VAE_default':
        VAE(batch_size=100, random_state=random_state),
        'LODA':
        LODA(),
        'OCSVM':
        OCSVM(),
        'ABOD':
        ABOD(n_neighbors=20),
        'Fb':
        FeatureBagging(random_state=random_state),
        'CBLOF':
        CBLOF(n_clusters=n_clusters,
              check_estimator=False,
              random_state=random_state),
        'LOF':
        LOF(),
        'COF':
        COF()
    }

    for clf_name, clf in classifiers.items():
        print(f"Using {clf_name} method")
        starttime = time.time()
        clf.fit(inputdata)
        time_taken = time.time() - starttime
        test_scores = clf.decision_scores_

        # -----fix some broken scores----- #
        for i in range(len(test_scores)):
            cur = test_scores[i]
            if np.isnan(cur) or not np.isfinite(cur):
                test_scores[i] = 0

        np.save(f'{dset_name}/{clf_name}_raw.npy', test_scores)
        auc = roc_auc_score(labels, test_scores)
        print('AUC:', auc)
        fetch(normalize(test_scores), f'../datasets/{dset_name.upper()}_Y.npy',
              f'{dset_name}/attribute.npy')
        print('time_taken:', time_taken)
Example #19
def pyod_init(model, n_features=None):
    # initial model set up
    if model == 'abod':
        from pyod.models.abod import ABOD
        clf = ABOD()
    elif model == 'auto_encoder' and n_features:
        #import os
        #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        from pyod.models.auto_encoder import AutoEncoder
        clf = AutoEncoder(hidden_neurons=[
            n_features, n_features * 5, n_features * 5, n_features
        ],
                          epochs=5,
                          batch_size=64,
                          preprocessing=False)
    elif model == 'cblof':
        from pyod.models.cblof import CBLOF
        clf = CBLOF(n_clusters=4)
    elif model == 'hbos':
        from pyod.models.hbos import HBOS
        clf = HBOS()
    elif model == 'iforest':
        from pyod.models.iforest import IForest
        clf = IForest()
    elif model == 'knn':
        from pyod.models.knn import KNN
        clf = KNN()
    elif model == 'lmdd':
        from pyod.models.lmdd import LMDD
        clf = LMDD()
    elif model == 'loci':
        from pyod.models.loci import LOCI
        clf = LOCI()
    elif model == 'loda':
        from pyod.models.loda import LODA
        clf = LODA()
    elif model == 'lof':
        from pyod.models.lof import LOF
        clf = LOF()
    elif model == 'mcd':
        from pyod.models.mcd import MCD
        clf = MCD()
    elif model == 'ocsvm':
        from pyod.models.ocsvm import OCSVM
        clf = OCSVM()
    elif model == 'pca':
        from pyod.models.pca import PCA
        clf = PCA()
    elif model == 'sod':
        from pyod.models.sod import SOD
        clf = SOD()
    elif model == 'vae':
        from pyod.models.vae import VAE
        clf = VAE()
    elif model == 'xgbod':
        from pyod.models.xgbod import XGBOD
        clf = XGBOD()
    else:
        #raise ValueError(f"unknown model {model}")
        clf = PyODDefaultModel()
    return clf
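A minimal usage sketch of the factory above; the 'loda' branch only needs pyod installed, and the data is random toy input:

import numpy as np

X = np.random.RandomState(42).rand(300, 4)
clf = pyod_init('loda')
clf.fit(X)
print(clf.labels_[:10])            # 0 = inlier, 1 = outlier
print(clf.decision_scores_[:10])   # raw outlier scores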
Example #20
iterate_threshold = True

if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s P%(process)d %(levelname)s %(message)s",
    )

    # load dataset
    data_dict = load_dataset(
        dataset,
        subdataset,
        "all",
    )

    x_train = data_dict["train"]
    x_test = data_dict["test"]
    x_test_labels = data_dict["test_labels"]

    # fit the LODA baseline on the training data
    od = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts)
    od.fit(x_train)

    # get outlier scores
    anomaly_score = od.decision_function(x_test)

    anomaly_label = x_test_labels

    # Make evaluation
    evaluate_all(anomaly_score, anomaly_label)
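For reference, a self-contained version of the same fit-then-score flow on pyod's synthetic data (load_dataset and evaluate_all are project-specific and not reproduced here; the n_bins/n_random_cuts values are placeholders):

from pyod.models.loda import LODA
from pyod.utils.data import generate_data

# synthetic stand-ins for data_dict["train"] / data_dict["test"]
x_train, _ = generate_data(n_train=400, train_only=True, contamination=0.05, random_state=0)
x_test, x_test_labels = generate_data(n_train=200, train_only=True, contamination=0.05, random_state=1)

od = LODA(n_bins=10, n_random_cuts=100)
od.fit(x_train)
anomaly_score = od.decision_function(x_test)   # higher score = more anomalous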
Example #21
    def fit(self, X, shrink_cols = True, data_scaler = preprocessing.MaxAbsScaler(), 
            quick_methods = True, slow_methods = False, nn_methods = False, 
            contamination = 0.05, use_score_rank = False, random_state = None, verbose = 0):

        if len(X.shape) > 3:
            raise ValueError("Expected number of dimensions: 2 or 3")
        elif len(X.shape) == 3:
            X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
        
        if shrink_cols:
            X = X[:,~np.all(X == 0, axis=0)]
            log.info('all-zero columns dropped')
        if data_scaler:
            X = data_scaler.fit_transform(X)
            log.info(f'used {data_scaler} data scaler')
            #log.info(X[0:1,:])
        
        n_rows = X.shape[0]
        n_features = X.shape[1]
        log.info (f'n_rows = {n_rows}, n_features = {n_features}')
        
        quick_scores = np.zeros([n_rows, 0])
        slow_scores = np.zeros([n_rows, 0])
        nn_scores = np.zeros([n_rows, 0])
        
        if quick_methods:
            # Define anomaly detection tools to be compared
            quick_classifiers = {
                'PCA_randomized':
                    PCA(contamination=contamination, random_state=random_state, 
                        standardization = False, svd_solver = 'randomized'),
                'PCA_full':
                    PCA(contamination=contamination, random_state=random_state, 
                        standardization = False, svd_solver = 'full'),                               
                'COPOD':
                   COPOD(contamination=contamination),  
                'HBOS':
                    HBOS(contamination=contamination),
                'HBOS_200':
                    HBOS(contamination=contamination, n_bins = 200),
                'HBOS_300':
                    HBOS(contamination=contamination, n_bins = 300),
                'LODA':
                    LODA(contamination=contamination),
                'LODA_200':
                    LODA(contamination=contamination, n_random_cuts  = 200),
                'LODA_300':
                    LODA(contamination=contamination, n_random_cuts  = 300),                
                'IForest_100':
                    IForest(contamination=contamination, random_state=random_state, 
                            n_estimators = 100, bootstrap = False, n_jobs = -1),
                'IForest_200':
                    IForest(contamination=contamination, random_state=random_state, 
                            n_estimators = 200, bootstrap = False, n_jobs = -1),                
                'IForest_bootstrap':
                    IForest(contamination = contamination, random_state=random_state, 
                            n_estimators = 150, bootstrap = True, n_jobs = -1), 
                #'MCD': 
                #    MCD(contamination=contamination, random_state=random_state, assume_centered = False),
                #'MCD_centered': 
                #    MCD(contamination=contamination, random_state=random_state, assume_centered = True),    
                'CBLOF_16':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 16),
                'CBLOF_24':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 24),
                'CBLOF_32':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 32)
            }
            
            quick_scores = np.zeros([n_rows, len(quick_classifiers)])

            for i, (clf_name, clf) in enumerate(quick_classifiers.items()):
                log.info(f'{i+1} - fitting {clf_name}')
                try:
                    clf.fit(X)
                    quick_scores[:, i] = clf.decision_scores_
                except:
                    log.info(traceback.format_exc())
                else:    
                    log.info(f'Base detector {i+1}/{len(quick_classifiers)} is fitted for prediction') 

            quick_scores = np.nan_to_num(quick_scores)
            
        if slow_methods:
            # initialize a set of detectors for LSCP
            detector_list = [LOF(n_neighbors=10), LOF(n_neighbors=15), LOF(n_neighbors=20)]
            slow_classifiers = {               
                #'Angle-based Outlier Detector (ABOD)': #too slow and nan results
                #   ABOD(contamination=contamination),
                #'One-class SVM (OCSVM)':
                #   OCSVM(contamination=contamination, cache_size = 2000, shrinking = False, tol = 1e-2),   
                #'LSCP': #slow and no parallel
                #   LSCP(detector_list, contamination=contamination, random_state=random_state, local_region_size = 30),
                #'Feature Bagging': #ensemble #no real par
                #   FeatureBagging(LOF(n_neighbors=20), contamination=contamination, 
                #                  random_state=random_state, n_jobs = -1),                
                #'SOS' : # too memory inefficient  
                #    SOS(contamination=contamination),
                #'COF': # memory inefficient
                #   COF(contamination=contamination),                  
                #'SOD':
                #    SOD(contamination = contamination),
                #'KNN': 
                #   KNN(contamination=contamination, n_jobs = -1),
                #'KNN_50': 
                #   KNN(contamination=contamination, leaf_size = 50, n_jobs = -1),
                #'KNN_70': 
                #   KNN(contamination=contamination, leaf_size = 70, n_jobs = -1),

                'LOF_4':
                   LOF(n_neighbors=4, contamination=contamination, n_jobs = -1),
                'LOF_5':
                   LOF(n_neighbors=5, contamination=contamination, n_jobs = -1),                
                'LOF_6':
                   LOF(n_neighbors=6, contamination=contamination, n_jobs = -1),
                'LOF_7':
                   LOF(n_neighbors=7, contamination=contamination, n_jobs = -1),                
                'LOF_8':
                   LOF(n_neighbors=8, contamination=contamination, n_jobs = -1),
                'LOF_9':
                   LOF(n_neighbors=9, contamination=contamination, n_jobs = -1),                
                'LOF_10':
                   LOF(n_neighbors=10, contamination=contamination, n_jobs = -1),
                'LOF_12':
                   LOF(n_neighbors=12, contamination=contamination, n_jobs = -1),  
                'LOF_14':
                   LOF(n_neighbors=14, contamination=contamination, n_jobs = -1),
                'LOF_16':
                   LOF(n_neighbors=16, contamination=contamination, n_jobs = -1),
                'LOF_18':
                   LOF(n_neighbors=18, contamination=contamination, n_jobs = -1),
                'LOF_20':
                   LOF(n_neighbors=20, contamination=contamination, n_jobs = -1), 
                'LOF_22':
                   LOF(n_neighbors=22, contamination=contamination, n_jobs = -1)            
            }
            
            slow_scores = np.zeros([n_rows, len(slow_classifiers)])

            for i, (clf_name, clf) in enumerate(slow_classifiers.items()):
                log.info(f'{i+1} - fitting {clf_name}')
                try:
                    clf.fit(X)
                    slow_scores[:, i] = clf.decision_scores_
                except:
                    log.info(traceback.format_exc())
                else:    
                    log.info(f'Base detector {i+1}/{len(slow_classifiers)} is fitted for prediction') 
            
            slow_scores = np.nan_to_num(slow_scores)
        
        if nn_methods:
            
            nn_classifiers = {}
            n_list = [1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
            n_idx = next(x[0] for x in enumerate(n_list) if x[1] < n_features)
            for i in range(3,6):
                n_enc = n_list[n_idx:n_idx+i-1] 
                n_dec = n_enc[::-1]
                n_enc_dec = n_enc + n_dec
                nn_classifiers[f'FULL_AE_{len(n_enc_dec)}'] = {'clf': self.full_autoencoder,
                                                               'hidden_layers': n_enc_dec}
                nn_classifiers[f'VAE_{len(n_enc_dec)}'] = {'clf': VAE(contamination=contamination, random_state=random_state,
                                                                      encoder_neurons=n_enc, decoder_neurons=n_dec,
                                                                      preprocessing=False, epochs=32, verbosity=verbose),
                                                           'hidden_layers': n_enc_dec}
                
            
            nn_scores = np.zeros([n_rows, len(nn_classifiers)])
            
            for i, (clf_name, clf) in enumerate(nn_classifiers.items()):
                log.info(f'''{i+1} - fitting {clf_name} with layers {clf['hidden_layers']}''')
                try:
                    if clf['clf'] == self.full_autoencoder:
                        nn_scores[:, i] = clf['clf'](X, neurons_list = clf['hidden_layers'], verbose = verbose)
                    else:
                        clf['clf'].fit(X)
                        nn_scores[:, i] = clf['clf'].decision_scores_                        
                except Exception:
                    log.info(traceback.format_exc())
                else:    
                    log.info(f'Base detector {i+1}/{len(nn_classifiers)} is fitted for prediction')             

            nn_scores = np.nan_to_num(nn_scores)

            
        all_scores = np.concatenate((quick_scores, slow_scores, nn_scores), axis=1)
        all_scores = all_scores[:,~np.all(all_scores == 0, axis=0)]
        log.info(f'total scores = {all_scores.shape[1]}')
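        # optionally rank-transform, then min-max scale every detector's scores
        # so they lie on a comparable [0, 1] range before combination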
        
        all_scores_norm = np.copy(all_scores)
        if use_score_rank:
            all_scores_norm = np.apply_along_axis(rank_fun, 0, all_scores_norm)
            log.info('score rank applied')
        all_scores_norm = preprocessing.MinMaxScaler().fit_transform(all_scores_norm)
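        # AOM/MOA need enough score columns to form buckets; with fewer than 12
        # detectors, fall back to plain average/max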
        
        if all_scores_norm.shape[1] >= 12:
            score_by_aom = aom(all_scores_norm, method = 'dynamic', n_buckets = round(all_scores_norm.shape[1]/4))
            score_by_moa = moa(all_scores_norm, method = 'dynamic', n_buckets = round(all_scores_norm.shape[1]/4))
            score_by_avg = np.mean(all_scores_norm, axis = 1) 
            score_by_max = np.max(all_scores_norm, axis = 1)
        else:
            score_by_avg = np.mean(all_scores_norm, axis = 1)
            score_by_max = np.max(all_scores_norm, axis = 1)
            score_by_aom = score_by_avg
            score_by_moa = score_by_max
        return score_by_aom, score_by_moa, score_by_max, score_by_avg, all_scores, all_scores_norm
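
# --- Hedged sketch (not part of the original example): one way the combined
# scores returned above could be turned into binary outlier labels via a
# contamination quantile. `score_by_avg` and `contamination` are assumed to be
# available as in the code above; `labels_from_scores` is illustrative only.
import numpy as np

def labels_from_scores(scores, contamination=0.1):
    """Flag the top `contamination` fraction of samples as outliers (1)."""
    threshold = np.quantile(scores, 1.0 - contamination)
    return (scores >= threshold).astype(int)

# e.g. labels = labels_from_scores(score_by_avg, contamination)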
Ejemplo n.º 22
clf.fit(X_train)
sklearn_score_anomalies = clf.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_svm_ws = evaluate.AUC(original_paper_score, y_test)

# --- LOF --- #
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)
sklearn_score_anomalies = lof.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_lof_ws = evaluate.AUC(original_paper_score, y_test)

# --- LODA --- #
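# NOTE: inside the loop LODA is refit on each single test sample before scoring
# it (a streaming-style evaluation); scores are flipped (1 - score) before the
# AUC, consistent with the sign convention used for the other detectors above.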
aucs_loda_ws = np.zeros(num_of_experiments)
for r in tqdm(range(num_of_experiments)):
    loda = LODA()
    loda.fit(X_train)
    y_pred_proba_loda = np.zeros(X_test.shape[0])
    for i in tqdm(range(X_test.shape[0])):
        loda.fit(X_test[i, :].reshape(1, -1))
        y_pred_proba_loda[i] = loda.decision_function(X_test[i, :].reshape(
            1, -1))
    aucs_loda_ws[r] = evaluate.AUC(1 - y_pred_proba_loda, y_test)
auc_loda_ws = np.mean(aucs_loda_ws)

# --- HalfSpaceTrees --- #
aucs_hst_ws = np.zeros(num_of_experiments)
for r in tqdm(range(num_of_experiments)):
    hst = HalfSpaceTrees(n_features=X_train_hst.shape[1], n_estimators=100)
    hst.fit(X_train_hst, np.zeros(X_train_hst.shape[0]))
    y_pred_proba_hst = np.zeros(X_test_hst.shape[0])
Ejemplo n.º 23
class TestLODA(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = LODA(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'projections_')
                and self.clf.projections_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
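
# Hedged note (not from the original source): the suite above can be run with
# unittest's standard entry point, e.g.
#
# if __name__ == '__main__':
#     unittest.main()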
Ejemplo n.º 24
def get_detectors():
    # randomness_flags = []
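    # grid of PyOD base detectors (LODA, ABOD, IForest, KNN, LOF, HBOS, OCSVM,
    # COF) swept over their main hyper-parameters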
    BASE_ESTIMATORS = [
        LODA(n_bins=5, n_random_cuts=10),
        LODA(n_bins=5, n_random_cuts=20),
        LODA(n_bins=5, n_random_cuts=30),
        LODA(n_bins=5, n_random_cuts=40),
        LODA(n_bins=5, n_random_cuts=50),
        LODA(n_bins=5, n_random_cuts=75),
        LODA(n_bins=5, n_random_cuts=100),
        LODA(n_bins=5, n_random_cuts=150),
        LODA(n_bins=5, n_random_cuts=200),
        LODA(n_bins=10, n_random_cuts=10),
        LODA(n_bins=10, n_random_cuts=20),
        LODA(n_bins=10, n_random_cuts=30),
        LODA(n_bins=10, n_random_cuts=40),
        LODA(n_bins=10, n_random_cuts=50),
        LODA(n_bins=10, n_random_cuts=75),
        LODA(n_bins=10, n_random_cuts=100),
        LODA(n_bins=10, n_random_cuts=150),
        LODA(n_bins=10, n_random_cuts=200),
        LODA(n_bins=15, n_random_cuts=10),
        LODA(n_bins=15, n_random_cuts=20),
        LODA(n_bins=15, n_random_cuts=30),
        LODA(n_bins=15, n_random_cuts=40),
        LODA(n_bins=15, n_random_cuts=50),
        LODA(n_bins=15, n_random_cuts=75),
        LODA(n_bins=15, n_random_cuts=100),
        LODA(n_bins=15, n_random_cuts=150),
        LODA(n_bins=15, n_random_cuts=200),
        LODA(n_bins=20, n_random_cuts=10),
        LODA(n_bins=20, n_random_cuts=20),
        LODA(n_bins=20, n_random_cuts=30),
        LODA(n_bins=20, n_random_cuts=40),
        LODA(n_bins=20, n_random_cuts=50),
        LODA(n_bins=20, n_random_cuts=75),
        LODA(n_bins=20, n_random_cuts=100),
        LODA(n_bins=20, n_random_cuts=150),
        LODA(n_bins=20, n_random_cuts=200),
        LODA(n_bins=25, n_random_cuts=10),
        LODA(n_bins=25, n_random_cuts=20),
        LODA(n_bins=25, n_random_cuts=30),
        LODA(n_bins=25, n_random_cuts=40),
        LODA(n_bins=25, n_random_cuts=50),
        LODA(n_bins=25, n_random_cuts=75),
        LODA(n_bins=25, n_random_cuts=100),
        LODA(n_bins=25, n_random_cuts=150),
        LODA(n_bins=25, n_random_cuts=200),
        LODA(n_bins=30, n_random_cuts=10),
        LODA(n_bins=30, n_random_cuts=20),
        LODA(n_bins=30, n_random_cuts=30),
        LODA(n_bins=30, n_random_cuts=40),
        LODA(n_bins=30, n_random_cuts=50),
        LODA(n_bins=30, n_random_cuts=75),
        LODA(n_bins=30, n_random_cuts=100),
        LODA(n_bins=30, n_random_cuts=150),
        LODA(n_bins=30, n_random_cuts=200),
        ABOD(n_neighbors=3),
        ABOD(n_neighbors=5),
        ABOD(n_neighbors=10),
        ABOD(n_neighbors=15),
        ABOD(n_neighbors=20),
        ABOD(n_neighbors=25),
        ABOD(n_neighbors=50),
        ABOD(n_neighbors=60),
        ABOD(n_neighbors=75),
        ABOD(n_neighbors=80),
        ABOD(n_neighbors=90),
        ABOD(n_neighbors=100),
        IForest(n_estimators=10, max_features=0.1),
        IForest(n_estimators=10, max_features=0.2),
        IForest(n_estimators=10, max_features=0.3),
        IForest(n_estimators=10, max_features=0.4),
        IForest(n_estimators=10, max_features=0.5),
        IForest(n_estimators=10, max_features=0.6),
        IForest(n_estimators=10, max_features=0.7),
        IForest(n_estimators=10, max_features=0.8),
        IForest(n_estimators=10, max_features=0.9),
        IForest(n_estimators=20, max_features=0.1),
        IForest(n_estimators=20, max_features=0.2),
        IForest(n_estimators=20, max_features=0.3),
        IForest(n_estimators=20, max_features=0.4),
        IForest(n_estimators=20, max_features=0.5),
        IForest(n_estimators=20, max_features=0.6),
        IForest(n_estimators=20, max_features=0.7),
        IForest(n_estimators=20, max_features=0.8),
        IForest(n_estimators=20, max_features=0.9),
        IForest(n_estimators=30, max_features=0.1),
        IForest(n_estimators=30, max_features=0.2),
        IForest(n_estimators=30, max_features=0.3),
        IForest(n_estimators=30, max_features=0.4),
        IForest(n_estimators=30, max_features=0.5),
        IForest(n_estimators=30, max_features=0.6),
        IForest(n_estimators=30, max_features=0.7),
        IForest(n_estimators=30, max_features=0.8),
        IForest(n_estimators=30, max_features=0.9),
        IForest(n_estimators=40, max_features=0.1),
        IForest(n_estimators=40, max_features=0.2),
        IForest(n_estimators=40, max_features=0.3),
        IForest(n_estimators=40, max_features=0.4),
        IForest(n_estimators=40, max_features=0.5),
        IForest(n_estimators=40, max_features=0.6),
        IForest(n_estimators=40, max_features=0.7),
        IForest(n_estimators=40, max_features=0.8),
        IForest(n_estimators=40, max_features=0.9),
        IForest(n_estimators=50, max_features=0.1),
        IForest(n_estimators=50, max_features=0.2),
        IForest(n_estimators=50, max_features=0.3),
        IForest(n_estimators=50, max_features=0.4),
        IForest(n_estimators=50, max_features=0.5),
        IForest(n_estimators=50, max_features=0.6),
        IForest(n_estimators=50, max_features=0.7),
        IForest(n_estimators=50, max_features=0.8),
        IForest(n_estimators=50, max_features=0.9),
        IForest(n_estimators=75, max_features=0.1),
        IForest(n_estimators=75, max_features=0.2),
        IForest(n_estimators=75, max_features=0.3),
        IForest(n_estimators=75, max_features=0.4),
        IForest(n_estimators=75, max_features=0.5),
        IForest(n_estimators=75, max_features=0.6),
        IForest(n_estimators=75, max_features=0.7),
        IForest(n_estimators=75, max_features=0.8),
        IForest(n_estimators=75, max_features=0.9),
        IForest(n_estimators=100, max_features=0.1),
        IForest(n_estimators=100, max_features=0.2),
        IForest(n_estimators=100, max_features=0.3),
        IForest(n_estimators=100, max_features=0.4),
        IForest(n_estimators=100, max_features=0.5),
        IForest(n_estimators=100, max_features=0.6),
        IForest(n_estimators=100, max_features=0.7),
        IForest(n_estimators=100, max_features=0.8),
        IForest(n_estimators=100, max_features=0.9),
        IForest(n_estimators=150, max_features=0.1),
        IForest(n_estimators=150, max_features=0.2),
        IForest(n_estimators=150, max_features=0.3),
        IForest(n_estimators=150, max_features=0.4),
        IForest(n_estimators=150, max_features=0.5),
        IForest(n_estimators=150, max_features=0.6),
        IForest(n_estimators=150, max_features=0.7),
        IForest(n_estimators=150, max_features=0.8),
        IForest(n_estimators=150, max_features=0.9),
        IForest(n_estimators=200, max_features=0.1),
        IForest(n_estimators=200, max_features=0.2),
        IForest(n_estimators=200, max_features=0.3),
        IForest(n_estimators=200, max_features=0.4),
        IForest(n_estimators=200, max_features=0.5),
        IForest(n_estimators=200, max_features=0.6),
        IForest(n_estimators=200, max_features=0.7),
        IForest(n_estimators=200, max_features=0.8),
        IForest(n_estimators=200, max_features=0.9),
        KNN(n_neighbors=1, method='largest'),
        KNN(n_neighbors=5, method='largest'),
        KNN(n_neighbors=10, method='largest'),
        KNN(n_neighbors=15, method='largest'),
        KNN(n_neighbors=20, method='largest'),
        KNN(n_neighbors=25, method='largest'),
        KNN(n_neighbors=50, method='largest'),
        KNN(n_neighbors=60, method='largest'),
        KNN(n_neighbors=70, method='largest'),
        KNN(n_neighbors=80, method='largest'),
        KNN(n_neighbors=90, method='largest'),
        KNN(n_neighbors=100, method='largest'),
        KNN(n_neighbors=1, method='mean'),
        KNN(n_neighbors=5, method='mean'),
        KNN(n_neighbors=10, method='mean'),
        KNN(n_neighbors=15, method='mean'),
        KNN(n_neighbors=20, method='mean'),
        KNN(n_neighbors=25, method='mean'),
        KNN(n_neighbors=50, method='mean'),
        KNN(n_neighbors=60, method='mean'),
        KNN(n_neighbors=70, method='mean'),
        KNN(n_neighbors=80, method='mean'),
        KNN(n_neighbors=90, method='mean'),
        KNN(n_neighbors=100, method='mean'),
        KNN(n_neighbors=1, method='median'),
        KNN(n_neighbors=5, method='median'),
        KNN(n_neighbors=10, method='median'),
        KNN(n_neighbors=15, method='median'),
        KNN(n_neighbors=20, method='median'),
        KNN(n_neighbors=25, method='median'),
        KNN(n_neighbors=50, method='median'),
        KNN(n_neighbors=60, method='median'),
        KNN(n_neighbors=70, method='median'),
        KNN(n_neighbors=80, method='median'),
        KNN(n_neighbors=90, method='median'),
        KNN(n_neighbors=100, method='median'),
        LOF(n_neighbors=1, metric='manhattan'),
        LOF(n_neighbors=5, metric='manhattan'),
        LOF(n_neighbors=10, metric='manhattan'),
        LOF(n_neighbors=15, metric='manhattan'),
        LOF(n_neighbors=20, metric='manhattan'),
        LOF(n_neighbors=25, metric='manhattan'),
        LOF(n_neighbors=50, metric='manhattan'),
        LOF(n_neighbors=60, metric='manhattan'),
        LOF(n_neighbors=70, metric='manhattan'),
        LOF(n_neighbors=80, metric='manhattan'),
        LOF(n_neighbors=90, metric='manhattan'),
        LOF(n_neighbors=100, metric='manhattan'),
        LOF(n_neighbors=1, metric='euclidean'),
        LOF(n_neighbors=5, metric='euclidean'),
        LOF(n_neighbors=10, metric='euclidean'),
        LOF(n_neighbors=15, metric='euclidean'),
        LOF(n_neighbors=20, metric='euclidean'),
        LOF(n_neighbors=25, metric='euclidean'),
        LOF(n_neighbors=50, metric='euclidean'),
        LOF(n_neighbors=60, metric='euclidean'),
        LOF(n_neighbors=70, metric='euclidean'),
        LOF(n_neighbors=80, metric='euclidean'),
        LOF(n_neighbors=90, metric='euclidean'),
        LOF(n_neighbors=100, metric='euclidean'),
        LOF(n_neighbors=1, metric='minkowski'),
        LOF(n_neighbors=5, metric='minkowski'),
        LOF(n_neighbors=10, metric='minkowski'),
        LOF(n_neighbors=15, metric='minkowski'),
        LOF(n_neighbors=20, metric='minkowski'),
        LOF(n_neighbors=25, metric='minkowski'),
        LOF(n_neighbors=50, metric='minkowski'),
        LOF(n_neighbors=60, metric='minkowski'),
        LOF(n_neighbors=70, metric='minkowski'),
        LOF(n_neighbors=80, metric='minkowski'),
        LOF(n_neighbors=90, metric='minkowski'),
        LOF(n_neighbors=100, metric='minkowski'),
        HBOS(n_bins=5, alpha=0.1),
        HBOS(n_bins=5, alpha=0.2),
        HBOS(n_bins=5, alpha=0.3),
        HBOS(n_bins=5, alpha=0.4),
        HBOS(n_bins=5, alpha=0.5),
        HBOS(n_bins=10, alpha=0.1),
        HBOS(n_bins=10, alpha=0.2),
        HBOS(n_bins=10, alpha=0.3),
        HBOS(n_bins=10, alpha=0.4),
        HBOS(n_bins=10, alpha=0.5),
        HBOS(n_bins=20, alpha=0.1),
        HBOS(n_bins=20, alpha=0.2),
        HBOS(n_bins=20, alpha=0.3),
        HBOS(n_bins=20, alpha=0.4),
        HBOS(n_bins=20, alpha=0.5),
        HBOS(n_bins=30, alpha=0.1),
        HBOS(n_bins=30, alpha=0.2),
        HBOS(n_bins=30, alpha=0.3),
        HBOS(n_bins=30, alpha=0.4),
        HBOS(n_bins=30, alpha=0.5),
        HBOS(n_bins=40, alpha=0.1),
        HBOS(n_bins=40, alpha=0.2),
        HBOS(n_bins=40, alpha=0.3),
        HBOS(n_bins=40, alpha=0.4),
        HBOS(n_bins=40, alpha=0.5),
        HBOS(n_bins=50, alpha=0.1),
        HBOS(n_bins=50, alpha=0.2),
        HBOS(n_bins=50, alpha=0.3),
        HBOS(n_bins=50, alpha=0.4),
        HBOS(n_bins=50, alpha=0.5),
        HBOS(n_bins=75, alpha=0.1),
        HBOS(n_bins=75, alpha=0.2),
        HBOS(n_bins=75, alpha=0.3),
        HBOS(n_bins=75, alpha=0.4),
        HBOS(n_bins=75, alpha=0.5),
        HBOS(n_bins=100, alpha=0.1),
        HBOS(n_bins=100, alpha=0.2),
        HBOS(n_bins=100, alpha=0.3),
        HBOS(n_bins=100, alpha=0.4),
        HBOS(n_bins=100, alpha=0.5),
        OCSVM(nu=0.1, kernel="linear"),
        OCSVM(nu=0.2, kernel="linear"),
        OCSVM(nu=0.3, kernel="linear"),
        OCSVM(nu=0.4, kernel="linear"),
        OCSVM(nu=0.5, kernel="linear"),
        OCSVM(nu=0.6, kernel="linear"),
        OCSVM(nu=0.7, kernel="linear"),
        OCSVM(nu=0.8, kernel="linear"),
        OCSVM(nu=0.9, kernel="linear"),
        OCSVM(nu=0.1, kernel="poly"),
        OCSVM(nu=0.2, kernel="poly"),
        OCSVM(nu=0.3, kernel="poly"),
        OCSVM(nu=0.4, kernel="poly"),
        OCSVM(nu=0.5, kernel="poly"),
        OCSVM(nu=0.6, kernel="poly"),
        OCSVM(nu=0.7, kernel="poly"),
        OCSVM(nu=0.8, kernel="poly"),
        OCSVM(nu=0.9, kernel="poly"),
        OCSVM(nu=0.1, kernel="rbf"),
        OCSVM(nu=0.2, kernel="rbf"),
        OCSVM(nu=0.3, kernel="rbf"),
        OCSVM(nu=0.4, kernel="rbf"),
        OCSVM(nu=0.5, kernel="rbf"),
        OCSVM(nu=0.6, kernel="rbf"),
        OCSVM(nu=0.7, kernel="rbf"),
        OCSVM(nu=0.8, kernel="rbf"),
        OCSVM(nu=0.9, kernel="rbf"),
        OCSVM(nu=0.1, kernel="sigmoid"),
        OCSVM(nu=0.2, kernel="sigmoid"),
        OCSVM(nu=0.3, kernel="sigmoid"),
        OCSVM(nu=0.4, kernel="sigmoid"),
        OCSVM(nu=0.5, kernel="sigmoid"),
        OCSVM(nu=0.6, kernel="sigmoid"),
        OCSVM(nu=0.7, kernel="sigmoid"),
        OCSVM(nu=0.8, kernel="sigmoid"),
        OCSVM(nu=0.9, kernel="sigmoid"),
        COF(n_neighbors=3),
        COF(n_neighbors=5),
        COF(n_neighbors=10),
        COF(n_neighbors=15),
        COF(n_neighbors=20),
        COF(n_neighbors=25),
        COF(n_neighbors=50),
    ]

    # randomness_flags.extend([True] * 54)  # LODA
    # randomness_flags.extend([False] * 12)  # ABOD
    # randomness_flags.extend([True] * 81)  # IForest
    # randomness_flags.extend([False] * 36)  # KNN
    # randomness_flags.extend([False] * 36)  # LOF
    # randomness_flags.extend([False] * 40)  # HBOS
    # randomness_flags.extend([False] * 36)  # OCSVM
    # randomness_flags.extend([False] * 7)  # COF
    # return BASE_ESTIMATORS, randomness_flags
    return BASE_ESTIMATORS
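
# --- Hedged usage sketch (not part of the original example): fitting the
# detector grid returned by get_detectors() and averaging min-max normalized
# scores. `X` is an assumed (n_samples, n_features) array; every PyOD detector
# exposes `decision_scores_` after fit().
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def ensemble_scores(X):
    detectors = get_detectors()
    scores = np.zeros((X.shape[0], len(detectors)))
    for i, det in enumerate(detectors):
        det.fit(X)
        scores[:, i] = det.decision_scores_
    return MinMaxScaler().fit_transform(scores).mean(axis=1)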
Ejemplo n.º 25
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train LODA detector
    clf_name = 'LODA'
    clf = LODA()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Ejemplo n.º 26
if __name__ == '__main__':
    # Specify the root directory
    rootDir = 'G:/My Drive/Github/ml-group-col/One-Class-models/Anomaly_Datasets_csv/'
    # specify the random state
    rs = 10
    # Models to run, given as (estimator, name) pairs
    models = [
        (IsolationForest(random_state=rs), 'ISOF'),
        (EllipticEnvelope(random_state=rs), 'EE'),
        (LMDD(dis_measure='aad', random_state=rs), 'AAD_LMDD'),
        (COPOD(), 'COPOD'),
        (FeatureBagging(combination='average',
                        random_state=rs), 'AVE_Bagging'),  # n_jobs
        (LMDD(dis_measure='iqr', random_state=rs), 'IQR_LMDD'),
        (KNN(method='largest'), 'Largest_KNN'),  # n_jobs
        (LODA(), 'LODA'),
        (FeatureBagging(combination='max', n_jobs=-1,
                        random_state=rs), 'MAX_Bagging'),
        (MCD(random_state=rs), 'MCD'),
        (XGBOD(random_state=rs), 'XGBOD'),  # n_jobs
        (GaussianMixture(random_state=rs), 'GMM'),
        (LocalOutlierFactor(novelty=True), 'LOF'),
        (KNN(method='median'), 'Median_KNN'),  # n_jobs
        (KNN(method='mean'), 'Avg_KNN'),  # n_jobs
        (CBLOF(n_clusters=10, random_state=rs), 'CBLOF'),
        (HBOS(), 'HBOS'),
        (SOD(), 'SOD'),
        (PCA(random_state=rs), 'PCA'),
        (VAE(encoder_neurons=[3, 4, 3],
             decoder_neurons=[3, 4, 3],
             random_state=rs), 'VAE'),