def main():
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'IForest': IForest(),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"
    Parallel(n_jobs=CPUS)(
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=sklearn_models,
                             CPUS=CPUS_Models, save_name=name)
        for scaler in scalers)
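The `runByScaler` helper invoked above is not part of this snippet. Below is a minimal, hypothetical sketch of what such a driver could look like, assuming it loads the CSV datasets under `root`, applies the named scaler, fits every detector on each dataset, and saves the per-row scores; the file layout, column handling, and the omission of the inner `CPUS` parallelism are assumptions, not the original implementation.

import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler


def runByScaler(root, scaler, models, start, counts,
                other_models=None, CPUS=4, save_name='results'):
    # Hypothetical driver: fit every detector on each CSV found under `root`
    # after applying the requested scaling ('no', 'std' or 'minmax').
    for fname in sorted(os.listdir(root))[start:start + counts]:
        X = pd.read_csv(os.path.join(root, fname)).values  # label column assumed dropped
        if scaler == 'std':
            X = StandardScaler().fit_transform(X)
        elif scaler == 'minmax':
            X = MinMaxScaler().fit_transform(X)
        for name, clf in models.items():
            clf.fit(X)
            # PyOD detectors expose decision_scores_; sklearn-style ones use score_samples
            if hasattr(clf, 'decision_scores_'):
                scores = clf.decision_scores_
            else:
                scores = -clf.score_samples(X)
            np.save(f'{save_name}_{scaler}_{name}_{fname}.npy', scores)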
def setUp(self):
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = LODA(contamination=self.contamination)
    self.clf.fit(self.X_train)
def model_init(self, model):
    """Model initialisation of a single model."""
    if self.model == 'pca':
        self.models[model] = PCA(contamination=self.contamination)
    elif self.model == 'loda':
        self.models[model] = LODA(contamination=self.contamination)
    elif self.model == 'iforest':
        self.models[model] = IForest(
            n_estimators=50, bootstrap=True, behaviour='new',
            contamination=self.contamination)
    elif self.model == 'cblof':
        self.models[model] = CBLOF(n_clusters=3, contamination=self.contamination)
    elif self.model == 'feature_bagging':
        self.models[model] = FeatureBagging(
            base_estimator=PCA(contamination=self.contamination),
            contamination=self.contamination)
    elif self.model == 'copod':
        self.models[model] = COPOD(contamination=self.contamination)
    elif self.model == 'hbos':
        self.models[model] = HBOS(contamination=self.contamination)
    else:
        self.models[model] = HBOS(contamination=self.contamination)
    self.custom_model_scalers[model] = MinMaxScaler()
def models_init(self):
    """Models initialisation."""
    self.model = self.configuration.get('model', 'pca')
    if self.model == 'pca':
        self.models = {model: PCA(contamination=self.contamination)
                       for model in self.models_in_scope}
    elif self.model == 'loda':
        self.models = {model: LODA(contamination=self.contamination)
                       for model in self.models_in_scope}
    elif self.model == 'iforest':
        self.models = {model: IForest(n_estimators=50, bootstrap=True, behaviour='new',
                                      contamination=self.contamination)
                       for model in self.models_in_scope}
    elif self.model == 'cblof':
        self.models = {model: CBLOF(n_clusters=3, contamination=self.contamination)
                       for model in self.models_in_scope}
    elif self.model == 'feature_bagging':
        self.models = {model: FeatureBagging(
                           base_estimator=PCA(contamination=self.contamination),
                           contamination=self.contamination)
                       for model in self.models_in_scope}
    elif self.model == 'copod':
        self.models = {model: COPOD(contamination=self.contamination)
                       for model in self.models_in_scope}
    elif self.model == 'hbos':
        self.models = {model: HBOS(contamination=self.contamination)
                       for model in self.models_in_scope}
    else:
        self.models = {model: HBOS(contamination=self.contamination)
                       for model in self.models_in_scope}
    self.custom_model_scalers = {model: MinMaxScaler()
                                 for model in self.models_in_scope}
def __init__(self, *,
             hyperparams: Hyperparams,
             random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                     docker_containers=docker_containers)

    self._clf = LODA(contamination=hyperparams['contamination'],
                     n_bins=hyperparams['n_bins'],
                     n_random_cuts=hyperparams['n_random_cuts'],
                     )
    return
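For context, a wrapper like this typically just forwards its fit and produce calls to the underlying PyOD estimator. The sketch below is hypothetical (the real primitive operates on framework containers rather than raw arrays) and only illustrates the LODA calls involved.

import numpy as np
from pyod.models.loda import LODA

# Hypothetical stand-in for the primitive's fit/produce flow on plain arrays.
clf = LODA(contamination=0.1, n_bins=10, n_random_cuts=100)
X_train = np.random.rand(200, 5)
X_new = np.random.rand(20, 5)

clf.fit(X_train)                       # equivalent of the primitive's fit()
scores = clf.decision_function(X_new)  # equivalent of produce(): outlier scores
labels = clf.predict(X_new)            # 0 = inlier, 1 = outlier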
def train(doc_list, dataset_name, clf_name):
    model_roc = []
    model_prc = []
    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    else:
        raise ValueError("Unknown classifier name: " + clf_name)
    for i in range(10):
        data = pd.read_csv(doc_list[i], header=0, index_col=0)
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])
        clf.fit(train_x)
        predict = clf.decision_scores_
        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)
        if (i + 1) % 200 == 0:
            print("Results for file " + str(i + 1) + ":")
            evaluate_print(clf_name, train_y, predict)
        model_roc.append(roc)
        model_prc.append(prc)
    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("Model " + clf_name + " on dataset " + dataset_name +
          ": average ROC AUC = " + str(round(model_roc_avg, 4)) +
          ", average precision@n = " + str(round(model_prc_avg, 4)) + ".")
    return model_roc_avg, model_prc_avg
def initialise_pyod_classifiers(self, outlier_fraction):
    # Test every query against every class and predict only if it belongs to the same class
    classifiers = {}
    # Proximity based
    classifiers['K Nearest Neighbors (KNN)'] = []
    classifiers['Average K Nearest Neighbors (AvgKNN)'] = []
    classifiers['Median K Nearest Neighbors (MedKNN)'] = []
    classifiers['Local Outlier Factor (LOF)'] = []
    classifiers['Connectivity-Based Outlier Factor (COF)'] = []
    #classifiers['Clustering-Based Local Outlier Factor (CBLOF)'] = []
    classifiers['LOCI'] = []
    #classifiers['Histogram-based Outlier Score (HBOS)'] = []
    classifiers['Subspace Outlier Detection (SOD)'] = []
    # Linear models
    classifiers['Principal Component Analysis (PCA)'] = []
    #classifiers['Minimum Covariance Determinant (MCD)'] = []  # too slow
    classifiers['One-Class Support Vector Machines (OCSVM)'] = []
    classifiers['Deviation-based Outlier Detection (LMDD)'] = []
    # Probabilistic
    classifiers['Angle-Based Outlier Detection (ABOD)'] = []
    classifiers['Stochastic Outlier Selection (SOS)'] = []
    # Outlier ensembles
    classifiers['Isolation Forest (IForest)'] = []
    classifiers['Feature Bagging'] = []
    classifiers['Lightweight On-line Detector of Anomalies (LODA)'] = []
    # one detector of each kind per class
    for i in range(self.k_way):
        classifiers['K Nearest Neighbors (KNN)'].append(
            KNN(method='largest',
                n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Average K Nearest Neighbors (AvgKNN)'].append(
            KNN(method='mean',
                n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Median K Nearest Neighbors (MedKNN)'].append(
            KNN(method='median',
                n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Local Outlier Factor (LOF)'].append(
            LOF(n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['Connectivity-Based Outlier Factor (COF)'].append(
            COF(n_neighbors=int(self.n_shot / 3) + 1,
                contamination=outlier_fraction))
        classifiers['LOCI'].append(LOCI(contamination=outlier_fraction))
        classifiers['Subspace Outlier Detection (SOD)'].append(
            SOD(n_neighbors=int(self.n_shot / 3) + 2,
                contamination=outlier_fraction,
                ref_set=max(2, int((int(self.n_shot / 3) + 2) / 3))))
        classifiers['Principal Component Analysis (PCA)'].append(
            PCA(contamination=outlier_fraction))
        classifiers['One-Class Support Vector Machines (OCSVM)'].append(
            OCSVM(contamination=outlier_fraction))
        classifiers['Deviation-based Outlier Detection (LMDD)'].append(
            LMDD(contamination=outlier_fraction))
        classifiers['Angle-Based Outlier Detection (ABOD)'].append(
            ABOD(contamination=outlier_fraction))
        classifiers['Stochastic Outlier Selection (SOS)'].append(
            SOS(contamination=outlier_fraction))
        classifiers['Isolation Forest (IForest)'].append(
            IForest(contamination=outlier_fraction))
        classifiers['Feature Bagging'].append(
            FeatureBagging(contamination=outlier_fraction))
        classifiers['Lightweight On-line Detector of Anomalies (LODA)'].append(
            LODA(contamination=outlier_fraction))
    self.num_different_models = len(classifiers)
    return classifiers
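In a few-shot setting these per-class detector lists are typically fitted on each class's support examples and then used to score a query. The sketch below is a hypothetical illustration of that pattern only; the `support_sets` and `query` names are not from the original code.

import numpy as np
from pyod.models.knn import KNN

# Hypothetical: one KNN-based detector per class, fitted on that class's support set.
k_way, n_shot, n_features = 3, 9, 16
support_sets = [np.random.rand(n_shot, n_features) for _ in range(k_way)]
query = np.random.rand(1, n_features)

detectors = [KNN(method='mean', n_neighbors=int(n_shot / 3) + 1, contamination=0.1)
             for _ in range(k_way)]
for det, support in zip(detectors, support_sets):
    det.fit(support)

# The class whose detector gives the lowest outlier score is the best match.
scores = [det.decision_function(query)[0] for det in detectors]
predicted_class = int(np.argmin(scores))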
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor':
        CBLOF(n_clusters=10, contamination=outliers_fraction,
              check_estimator=False, random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
    'Lightweight on-line detector of anomalies (LODA)':
        LODA(contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
    'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
    'COD':
        COD(contamination=outliers_fraction)
}

classifiers_indices = {
    'Angle-based Outlier Detector (ABOD)': 0,
    'Cluster-based Local Outlier Factor': 1,
    'Histogram-base Outlier Detection (HBOS)': 2,
    'Isolation Forest': 3,
    'K Nearest Neighbors (KNN)': 4,
    n_features=3, contamination=contamination, random_state=42)

# load pretrained models
prepare_trained_model()

# recommended models
selected_models = select_model(X_train, n_selection=100)
print("Showing the top recommended models...")
for i, model in enumerate(selected_models):
    print(i, model)
print()

model_1 = LODA(n_bins=5, n_random_cuts=100)
print("1st model Average Precision",
      average_precision_score(y_train, model_1.fit(X_train).decision_scores_))

model_10 = LODA(n_bins=5, n_random_cuts=20)
print("10th model Average Precision",
      average_precision_score(y_train, model_10.fit(X_train).decision_scores_))

model_50 = OCSVM(kernel='sigmoid', nu=0.6)
print("50th model Average Precision",
      average_precision_score(y_train,
def execute(self): evaluation_results = [] print("Loading training data...") data = pd.DataFrame() for i, chunk in enumerate( pd.read_csv(self.input_file, header=None, chunksize=self.chunk_size)): print("Reading chunk: %d" % (i + 1)) #print(chunk) data = data.append(chunk) input_dimensionality = len(data.columns) - 1 print("Input Dimensionality: %d" % (input_dimensionality)) positive_data = data[data[len(data.columns) - 1] == 1].iloc[:, :len(data.columns) - 1] negative_data = data[data[len(data.columns) - 1] == -1].iloc[:, :len(data.columns) - 1] training_data = positive_data.sample(frac=0.70) positive_validation_data = positive_data.drop(training_data.index) if self.neg_cont and self.neg_cont > 0: print("Negative Contamination: %0.4f" % (self.neg_cont)) num_negative = math.floor( self.neg_cont * (len(negative_data) + len(positive_validation_data))) negative_data = data.sample(frac=1, random_state=200)[ data[len(data.columns) - 1] == -1].iloc[:num_negative, :len(data.columns) - 1] negative_validation_data = negative_data.copy() temp_positive = positive_validation_data.copy() temp_positive[input_dimensionality] = 1 temp_negative = negative_data.copy() temp_negative[input_dimensionality] = -1 validation_data_with_labels = pd.concat([temp_positive, temp_negative], ignore_index=True) validation_data = validation_data_with_labels.iloc[:, :len(data.columns ) - 1] validation_labels = validation_data_with_labels.iloc[:, -1:].values # Convert to tensor positive_data = torch.tensor(positive_data.values).float().to( self.device) negative_data = torch.tensor(negative_data.values).float().to( self.device) training_data = torch.tensor(training_data.values).float() validation_data = torch.tensor(validation_data.values).float() print("Validation Data:") print(validation_data) ## AE-D TRAINING ## print("Initializing autoencoder...") net = Autoencoder(layers=self.layers, device=self.device, add_syn=self.add_syn) net.to(self.device) print(net) print("Training Stochastic Autoencoder...") net.fit(training_data, epochs=self.epochs, lr=self.lr, batch_size=self.batch_size) predictions = net.predict(validation_data) tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc = performance_metrics( validation_labels, predictions) r = ["AE-D", tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc] evaluation_results.append(r) print("AE-D Results:") print( tabulate([r], [ "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV", "TS", "PT", "ACC", "F1", "MCC" ], tablefmt="grid")) # Convert back to CPU before other methods validation_data = validation_data.cpu() # Train only linear classifiers if self.eval_cat == "linear": print("Initiating training for linear detectors...") ## MCD ## print("Training MCD...") result = train_and_evaluate_classifier("MCD", MCD(), validation_data, validation_labels) evaluation_results.append(result) ## ROBUST COVARIANCE ## print("Training Robust Covariance...") result = train_and_evaluate_classifier("ROB-COV", EllipticEnvelope(), validation_data, validation_labels) evaluation_results.append(result) ## ONE CLASS SVM TRAINING ## print("Training OneClassSVM...") result = train_and_evaluate_classifier( "OC-SVM", svm.OneClassSVM(gamma="auto"), validation_data, validation_labels) evaluation_results.append(result) elif self.eval_cat == "prob": ## ABOD ## #print("Training ABOD...") #result = train_and_evaluate_classifier("ABOD", ABOD(), validation_data, validation_labels) #evaluation_results.append(result) ## SOS ## #print("Training SOS...") #result = train_and_evaluate_classifier("SOS", 
SOS(), validation_data, validation_labels) #evaluation_results.append(result) ## COPOD ## print("Training COPOD...") result = train_and_evaluate_classifier("COPOD", COPOD(), validation_data, validation_labels) evaluation_results.append(result) elif self.eval_cat == "ensemble": ## ISOLATION FOREST TRAINING ## print("Training Isolation Forest...") result = train_and_evaluate_classifier( "ISO-F", IsolationForest(random_state=0), validation_data, validation_labels) evaluation_results.append(result) ## LODA ## print("Training LODA...") result = train_and_evaluate_classifier("LODA", LODA(), validation_data, validation_labels) evaluation_results.append(result) ## LSCP ## # print("Training LSCP...") # result = train_and_evaluate_classifier("LSCP", LSCP([LOF(), LOF()]), validation_data, validation_labels) # evaluation_results.append(result) elif self.eval_cat == "proximity": ## LOCAL OUTLIER FACTOR ## print("Training Local Outlier Factor...") result = train_and_evaluate_classifier( "LOC-OF", LocalOutlierFactor(novelty=True), validation_data, validation_labels) evaluation_results.append(result) ## CBLOF ## print("Training CBLOF...") result = train_and_evaluate_classifier("CBLOF", CBLOF(), validation_data, validation_labels) evaluation_results.append(result) ## HBOS ## print("Training HBOS...") result = train_and_evaluate_classifier("HBOS", HBOS(), validation_data, validation_labels) evaluation_results.append(result) elif self.eval_cat == "nn": ## VAE ## print("Training VAE...") result = train_and_evaluate_classifier( "VAE", VAE(encoder_neurons=self.layers, decoder_neurons=self.layers.reverse()), validation_data, validation_labels) evaluation_results.append(result) ## SO_GAAL ## print("Training SO_GAAL...") result = train_and_evaluate_classifier( "SO_GAAL", SO_GAAL(lr_d=self.lr, stop_epochs=self.epochs), validation_data, validation_labels) evaluation_results.append(result) ## MO_GAAL ## print("Training MO_GAAL...") result = train_and_evaluate_classifier( "MO_GAAL", MO_GAAL(lr_d=self.lr, stop_epochs=self.epochs), validation_data, validation_labels) evaluation_results.append(result) ## EVALUATE RESULTS ## if self.eval_cat != "none": print("Aggregated Results:") print( tabulate(evaluation_results, [ "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV", "TS", "PT", "ACC", "F1", "MCC" ], tablefmt="grid")) ## DATASET METRICS ## len_training_data_points = len(training_data) len_positive_validations = len(positive_validation_data) len_negative_validations = len(negative_validation_data) len_validations = len_positive_validations + len_negative_validations metrics_results = [ ["Training Data Points", len_training_data_points], ["# Normal Points", len_positive_validations], ["# Anomalies", len_negative_validations], [ "Contamination Percentage", math.floor((len_negative_validations / len_validations) * 100) ] ] ## EVALUATE RESULTS ## print(tabulate(metrics_results, ["Metric", "Value"], tablefmt="grid")) if self.printout: print("Saving results to %s" % (self.printout)) df = pd.DataFrame(evaluation_results) df.to_csv(self.printout, header=None, index=False)
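`train_and_evaluate_classifier` and `performance_metrics` are not shown in this snippet. As a rough, assumption-laden sketch of what such a helper could do (fit the detector, map its predictions to the script's +1/-1 label convention, and compute confusion counts), consider the following; the exact metrics returned by the real helper are unknown.

import numpy as np

def train_and_evaluate_classifier(name, clf, data, labels):
    # Hypothetical helper: `labels` uses +1 for normal and -1 for anomalous rows,
    # matching the script above.
    X = np.asarray(data)
    y = np.asarray(labels).ravel()

    clf.fit(X)
    raw = clf.predict(X)
    # PyOD returns 0/1 (1 = outlier); sklearn one-class models return +1/-1 (-1 = outlier).
    if set(np.unique(raw)) <= {0, 1}:
        pred = np.where(raw == 1, -1, 1)
    else:
        pred = raw

    tp = int(np.sum((pred == 1) & (y == 1)))    # normal predicted normal
    tn = int(np.sum((pred == -1) & (y == -1)))  # anomaly predicted anomaly
    fp = int(np.sum((pred == 1) & (y == -1)))
    fn = int(np.sum((pred == -1) & (y == 1)))
    acc = (tp + tn) / max(tp + tn + fp + fn, 1)
    return [name, tp, tn, fp, fn, acc]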
def generate_meta_features(X): """Get the meta-features of a datasets X Parameters ---------- X : numpy array of shape (n_samples, n_features) Input array Returns ------- meta_features : numpy array of shape (1, 200) Meta-feature in dimension of 200 """ # outliers_fraction = np.count_nonzero(y) / len(y) # outliers_percentage = round(outliers_fraction * 100, ndigits=4) X = check_array(X) meta_vec = [] meta_vec_names = [] # on the sample level n_samples, n_features = X.shape[0], X.shape[1] meta_vec.append(n_samples) meta_vec.append(n_features) meta_vec_names.append('n_samples') meta_vec_names.append('n_features') sample_mean = np.mean(X) sample_median = np.median(X) sample_var = np.var(X) sample_min = np.min(X) sample_max = np.max(X) sample_std = np.std(X) q1, q25, q75, q99 = np.percentile(X, [0.01, 0.25, 0.75, 0.99]) iqr = q75 - q25 normalized_mean = sample_mean / sample_max normalized_median = sample_median / sample_max sample_range = sample_max - sample_min sample_gini = gini(X) med_abs_dev = np.median(np.absolute(X - sample_median)) avg_abs_dev = np.mean(np.absolute(X - sample_mean)) quant_coeff_disp = (q75 - q25) / (q75 + q25) coeff_var = sample_var / sample_mean outliers_15iqr = np.logical_or(X < (q25 - 1.5 * iqr), X > (q75 + 1.5 * iqr)) outliers_3iqr = np.logical_or(X < (q25 - 3 * iqr), X > (q75 + 3 * iqr)) outliers_1_99 = np.logical_or(X < q1, X > q99) outliers_3std = np.logical_or(X < (sample_mean - 3 * sample_std), X > (sample_mean + 3 * sample_std)) percent_outliers_15iqr = np.sum(outliers_15iqr) / len(X) percent_outliers_3iqr = np.sum(outliers_3iqr) / len(X) percent_outliers_1_99 = np.sum(outliers_1_99) / len(X) percent_outliers_3std = np.sum(outliers_3std) / len(X) has_outliers_15iqr = np.any(outliers_15iqr).astype(int) has_outliers_3iqr = np.any(outliers_3iqr).astype(int) has_outliers_1_99 = np.any(outliers_1_99).astype(int) has_outliers_3std = np.any(outliers_3std).astype(int) meta_vec.extend([ sample_mean, sample_median, sample_var, sample_min, sample_max, sample_std, q1, q25, q75, q99, iqr, normalized_mean, normalized_median, sample_range, sample_gini, med_abs_dev, avg_abs_dev, quant_coeff_disp, coeff_var, # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10, percent_outliers_15iqr, percent_outliers_3iqr, percent_outliers_1_99, percent_outliers_3std, has_outliers_15iqr, has_outliers_3iqr, has_outliers_1_99, has_outliers_3std ]) meta_vec_names.extend([ 'sample_mean', 'sample_median', 'sample_var', 'sample_min', 'sample_max', 'sample_std', 'q1', 'q25', 'q75', 'q99', 'iqr', 'normalized_mean', 'normalized_median', 'sample_range', 'sample_gini', 'med_abs_dev', 'avg_abs_dev', 'quant_coeff_disp', 'coeff_var', # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10, 'percent_outliers_15iqr', 'percent_outliers_3iqr', 'percent_outliers_1_99', 'percent_outliers_3std', 'has_outliers_15iqr', 'has_outliers_3iqr', 'has_outliers_1_99', 'has_outliers_3std' ]) ########################################################################### normality_k2, normality_p = normaltest(X) is_normal_5 = (normality_p < 0.05).astype(int) is_normal_1 = (normality_p < 0.01).astype(int) meta_vec.extend(list_process(normality_p)) meta_vec.extend(list_process(is_normal_5)) meta_vec.extend(list_process(is_normal_1)) meta_vec_names.extend(list_process_name('normality_p')) meta_vec_names.extend(list_process_name('is_normal_5')) meta_vec_names.extend(list_process_name('is_normal_1')) moment_5 = moment(X, moment=5) moment_6 = moment(X, moment=6) moment_7 = moment(X, moment=7) moment_8 = moment(X, 
moment=8) moment_9 = moment(X, moment=9) moment_10 = moment(X, moment=10) meta_vec.extend(list_process(moment_5)) meta_vec.extend(list_process(moment_6)) meta_vec.extend(list_process(moment_7)) meta_vec.extend(list_process(moment_8)) meta_vec.extend(list_process(moment_9)) meta_vec.extend(list_process(moment_10)) meta_vec_names.extend(list_process_name('moment_5')) meta_vec_names.extend(list_process_name('moment_6')) meta_vec_names.extend(list_process_name('moment_7')) meta_vec_names.extend(list_process_name('moment_8')) meta_vec_names.extend(list_process_name('moment_9')) meta_vec_names.extend(list_process_name('moment_10')) # note: this is for each dimension == the number of dimensions skewness_list = skew(X).reshape(-1, 1) skew_values = list_process(skewness_list) meta_vec.extend(skew_values) meta_vec_names.extend(list_process_name('skewness')) # note: this is for each dimension == the number of dimensions kurtosis_list = kurtosis(X) kurtosis_values = list_process(kurtosis_list) meta_vec.extend(kurtosis_values) meta_vec_names.extend(list_process_name('kurtosis')) correlation = np.nan_to_num(pd.DataFrame(X).corr(), nan=0) correlation_list = flatten_diagonally(correlation)[0:int( (n_features * n_features - n_features) / 2)] correlation_values = list_process(correlation_list) meta_vec.extend(correlation_values) meta_vec_names.extend(list_process_name('correlation')) covariance = np.cov(X.T) covariance_list = flatten_diagonally(covariance)[0:int( (n_features * n_features - n_features) / 2)] covariance_values = list_process(covariance_list) meta_vec.extend(covariance_values) meta_vec_names.extend(list_process_name('covariance')) # sparsity rep_counts = [] for i in range(n_features): rep_counts.append(len(np.unique(X[:, i]))) sparsity_list = np.asarray(rep_counts) / (n_samples) sparsity = list_process(sparsity_list) meta_vec.extend(sparsity) meta_vec_names.extend(list_process_name('sparsity')) # ANOVA p value p_values_list = [] all_perm = list(itertools.combinations(list(range(n_features)), 2)) for j in all_perm: p_values_list.append(f_oneway(X[:, j[0]], X[:, j[1]])[1]) anova_p_value = list_process(np.asarray(p_values_list)) # anova_p_value = np.mean(p_values_list) # anova_p_value_exceed_thresh = np.mean((np.asarray(p_values_list)<0.05).astype(int)) meta_vec.extend(anova_p_value) meta_vec_names.extend(list_process_name('anova_p_value')) # pca pca_transformer = sklearn_PCA(n_components=3) X_transform = pca_transformer.fit_transform(X) # first pc pca_fpc = list_process(X_transform[0, :], r_min=False, r_max=False, r_mean=False, r_std=True, r_skew=True, r_kurtosis=True) meta_vec.extend(pca_fpc) meta_vec_names.extend( ['first_pca_std', 'first_pca_skewness', 'first_pca_kurtosis']) # entropy entropy_list = [] for i in range(n_features): counts = pd.Series(X[:, i]).value_counts() entropy_list.append(entropy(counts) / n_samples) entropy_values = list_process(entropy_list) meta_vec.extend(entropy_values) meta_vec_names.extend(list_process_name('entropy')) ##############################Landmarkers###################################### # HBOS clf = HBOS(n_bins=10) clf.fit(X) HBOS_hists = clf.hist_ HBOS_mean = np.mean(HBOS_hists, axis=0) HBOS_max = np.max(HBOS_hists, axis=0) HBOS_min = np.min(HBOS_hists, axis=0) meta_vec.extend(list_process(HBOS_mean)) meta_vec.extend(list_process(HBOS_max)) meta_vec.extend(list_process(HBOS_min)) meta_vec_names.extend(list_process_name('HBOS_mean')) meta_vec_names.extend(list_process_name('HBOS_max')) meta_vec_names.extend(list_process_name('HBOS_min')) # IForest 
n_estimators = 100 clf = IForest(n_estimators=n_estimators) clf.fit(X) n_leaves = [] n_depth = [] fi_mean = [] fi_max = [] # doing this for each sub-trees for i in range(n_estimators): n_leaves.append(clf.estimators_[i].get_n_leaves()) n_depth.append(clf.estimators_[i].get_depth()) fi_mean.append(clf.estimators_[i].feature_importances_.mean()) fi_max.append(clf.estimators_[i].feature_importances_.max()) # print(clf.estimators_[i].tree_) meta_vec.extend(list_process(n_leaves)) meta_vec.extend(list_process(n_depth)) meta_vec.extend(list_process(fi_mean)) meta_vec.extend(list_process(fi_max)) meta_vec_names.extend(list_process_name('IForest_n_leaves')) meta_vec_names.extend(list_process_name('IForest_n_depth')) meta_vec_names.extend(list_process_name('IForest_fi_mean')) meta_vec_names.extend(list_process_name('IForest_fi_max')) # PCA clf = PCA(n_components=3) clf.fit(X) meta_vec.extend(clf.explained_variance_ratio_) meta_vec.extend(clf.singular_values_) meta_vec_names.extend( ['pca_expl_ratio_1', 'pca_expl_ratio_2', 'pca_expl_ratio_3']) meta_vec_names.extend(['pca_sv_1', 'pca_sv_2', 'pca_sv_3']) # LODA n_bins = 10 n_random_cuts = 100 n_hists_mean = [] n_hists_max = [] n_cuts_mean = [] n_cuts_max = [] clf = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts) clf.fit(X) for i in range(n_bins): n_hists_mean.append(clf.histograms_[:, i].mean()) n_hists_max.append(clf.histograms_[:, i].max()) for i in range(n_random_cuts): n_cuts_mean.append(clf.histograms_[i, :].mean()) n_cuts_max.append(clf.histograms_[i, :].max()) meta_vec.extend(list_process(n_hists_mean)) meta_vec.extend(list_process(n_hists_max)) meta_vec.extend(list_process(n_cuts_mean)) meta_vec.extend(list_process(n_cuts_max)) meta_vec_names.extend(list_process_name('LODA_n_hists_mean')) meta_vec_names.extend(list_process_name('LODA_n_hists_max')) meta_vec_names.extend(list_process_name('LODA_n_cuts_mean')) meta_vec_names.extend(list_process_name('LODA_n_cuts_max')) return meta_vec, meta_vec_names
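A quick sanity check of `generate_meta_features` on synthetic data (illustrative only; the exact length of the returned vector depends on the implementation above):

import numpy as np

# Illustrative call on random data with a handful of features.
X = np.random.rand(500, 6)
meta_vec, meta_vec_names = generate_meta_features(X)

assert len(meta_vec) == len(meta_vec_names)
print(f"{len(meta_vec)} meta-features, e.g. {meta_vec_names[:5]} ...")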
    'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
    'VAE', 'AutoEncoder'
]
models = {
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
    'EE': EllipticEnvelope(),
    'AvgKNN': KNN(method='mean'),
    'LargestKNN': KNN(method='largest'),
    'MedKNN': KNN(method='median'),
    'PCA': PCA(),
    'COF': COF(),
    'LODA': LODA(),
    'LOF': LOF(),
    'HBOS': HBOS(),
    'MCD': MCD(),
    'AvgBagging': FeatureBagging(combination='average'),
    'MaxBagging': FeatureBagging(combination='max'),
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
    'IqrLMDD': LMDD(dis_measure='iqr'),
def main():
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: the classifier is from sklearn or works like sklearn
    # PY means: the classifier is from pyod or works like pyod
    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add a new metric manually by modifying 'metrics.py'
    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(
        paths=['results/scores/auc/no/results.csv',
               'results/scores/auc/minmax/results.csv',
               'results/scores/auc/std/results.csv',
               'results/scores/ave/no/results.csv',
               'results/scores/ave/minmax/results.csv',
               'results/scores/ave/std/results.csv'],
        scalers=['Without scaler', 'Min max scaler', 'Standard scaler',
                 'Without scaler', 'Min max scaler', 'Standard scaler'])

    plot.make_cd_plot(
        paths=['results/scores/auc/minmax/results.csv',
               'results/scores/ave/minmax/results.csv',
               'results/scores/auc/no/results.csv',
               'results/scores/ave/no/results.csv',
               'results/scores/auc/std/results.csv',
               'results/scores/ave/std/results.csv'],
        names=['CD auc minmax scale', 'CD ave minmax scale',
               'CD auc no scale', 'CD ave no scale',
               'CD auc std scale', 'CD ave std scale'],
        titles=['CD diagram - AUC with min max scaling',
                'CD diagram - Average precision with min max scaling',
                'CD diagram - AUC without scaling',
                'CD diagram - Average precision without scaling',
                'CD diagram - AUC with standard scaling',
                'CD diagram - Average precision with standard scaling'])
y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_scores = clf.decision_scores_  # raw outlier scores
evaluation(y, y_scores, clf_name)
all_scores['LOF'] = y_scores

clf_name = 'PCA'
clf = PCA(contamination=contam)
x_train = standardizer(x_train)
clf.fit(x_train)
y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_scores = clf.decision_scores_  # raw outlier scores
evaluation(y, y_scores, clf_name)
all_scores['PCA'] = y_scores

clf_name = 'LODA'
clf = LODA(contamination=contam)
x_train = standardizer(x_train)
clf.fit(x_train)
y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_scores = clf.decision_scores_  # raw outlier scores
evaluation(y, y_scores, clf_name)
all_scores['LODA'] = y_scores

pca = PCA(n_components=2)
kpca = KernelPCA(n_components=2, kernel="poly")
x_train_pca = kpca.fit_transform(x_train)
clf = KNN(n_neighbors=5, contamination=contam)
x_train_pca = standardizer(x_train_pca)
clf.fit(x_train_pca)
y_pred_pca = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_scores = clf.decision_scores_  # raw outlier scores
begin = "2020-02-13" end = "2020-02-15" test_date = "2020-02-16" KNN_clf = KNN(contamination=0.05) PCA_clf = PCA(contamination=0.05) VAE_clf = VAE(contamination=0.05, epochs=30, encoder_neurons=[9, 4], decoder_neurons=[4, 9]) LOF_clf = LOF(contamination=0.05) IForest_clf = IForest(contamination=0.05) AutoEncoder_clf = AutoEncoder(contamination=0.05, epochs=30, hidden_neurons=[9, 4, 4, 9]) FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False) ABOD_clf = ABOD(contamination=0.05) HBOS_clf = HBOS(contamination=0.05) CBLOF_clf = CBLOF(contamination=0.05) LODA_clf = LODA(contamination=0.05) MCD_clf = MCD(contamination=0.05) MO_GAAL_clf = MO_GAAL(k=3, stop_epochs=2, contamination=0.05) SO_GAAL_clf = SO_GAAL(contamination=0.05) KNN_MAH_clf = None S_models = ["KNN", "LOF", "PCA", "IForest", "HBOS", "LODA", "MCD", "CBLOF", "FeatureBagging", "ABOD", "KNN_MAH"] K_models = ["AutoEncoder", "SO_GAAL", "VAE"] def get_train_data(): """ 获取训练样本 :return: x_train 9特征训练样本 df 原训练数据 """ acc_date = pd.date_range(begin, end, freq='1D')
    else:  # grogger
        df['arr86'] = (df['narr86'] >= 1).astype(int)
        Y = df['arr86']
        X = df[['pcnv', 'avgsen', 'tottime', 'ptime86', 'inc86',
                'black', 'hispan', 'born60']]

    print(i, X.shape, Y.shape)

    if OD_Flag:
        # clf = HBOS(contamination=0.05)
        # clf = IForest(contamination=0.05)
        clf = LODA(contamination=0.05)
        clf.fit(X)
        # remove outliers
        X = X.loc[np.where(clf.labels_ == 0)]
        Y = Y.loc[np.where(clf.labels_ == 0)]

    X = sm.add_constant(X)

    # general OLS
    # https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.html
    # model = sm.OLS(Y, X.astype(float))

    # robust regression
    # https://www.statsmodels.org/stable/generated/statsmodels.robust.robust_linear_model.RLM.html
    # model = sm.RLM(Y, X.astype(float))
# standardizing data for processing
X_train_norm, X_test_norm = standardizer(X_train, X_test)

classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor':
        CBLOF(n_clusters=10, contamination=outliers_fraction,
              check_estimator=False, random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
    'Lightweight on-line detector of anomalies (LODA)':
        LODA(contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
    'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
    'COD':
        COD(contamination=outliers_fraction)
}

classifiers_indices = {
    'Angle-based Outlier Detector (ABOD)': 0,
    'Cluster-based Local Outlier Factor': 1,
    'Histogram-base Outlier Detection (HBOS)': 2,
    'Isolation Forest': 3,
    'K Nearest Neighbors (KNN)': 4,
    'Lightweight on-line detector of anomalies (LODA)': 5,
    'Local Outlier Factor (LOF)': 6,
def compare(inputdata, labels, n_clusters, dset_name):
    """
    Compute the AUC, Fgap, Frank score on all conventional outlier detectors
    for the given dataset.

    Args:
        inputdata: input data
        labels: ground truth outlier labels
        n_clusters: number of clusters, for some cluster-based detectors
        dset_name: dataset

    Returns:
        AUC, Fgap, Frank
    """
    print("Competing with conventional unsupervised outlier detection algorithms...")
    random_state = np.random.RandomState(1)
    if inputdata.shape[1] < 64:
        AEneurons = [16, 8, 8, 16]
        VAEneurons = [16, 8, 4], [4, 8, 16]
    else:
        AEneurons = [64, 32, 32, 64]
        VAEneurons = [128, 64, 32], [32, 64, 128]

    classifiers = {
        'PCA': PCA(random_state=random_state),
        'AutoEncoder': AutoEncoder(batch_size=100, hidden_neurons=AEneurons,
                                   random_state=random_state),
        'VAE': VAE(batch_size=100, encoder_neurons=VAEneurons[0],
                   decoder_neurons=VAEneurons[1], random_state=random_state),
        'COPOD': COPOD(),
        'Iforest': IForest(random_state=random_state),
        'LODA': LODA(),
        'OCSVM': OCSVM(),
        'ABOD': ABOD(n_neighbors=20),
        'Fb': FeatureBagging(random_state=random_state),
        'CBLOF': CBLOF(n_clusters=n_clusters, check_estimator=False,
                       random_state=random_state),
        'LOF': LOF(),
        'COF': COF()
    }

    for clf_name, clf in classifiers.items():
        print(f"Using {clf_name} method")
        starttime = time.time()
        clf.fit(inputdata)
        time_taken = time.time() - starttime
        test_scores = clf.decision_scores_

        # fix broken scores: replace NaN / infinite values with 0
        for i in range(len(test_scores)):
            cur = test_scores[i]
            if np.isnan(cur) or not np.isfinite(cur):
                test_scores[i] = 0

        np.save(f'{dset_name}/{clf_name}_raw.npy', test_scores)
        auc = roc_auc_score(labels, test_scores)
        print('AUC:', auc)
        fetch(normalize(test_scores), f'../datasets/{dset_name.upper()}_Y.npy',
              f'{dset_name}/attribute.npy')
        print('time_taken:', time_taken)
def pyod_init(model, n_features=None):
    # initial model set up
    if model == 'abod':
        from pyod.models.abod import ABOD
        clf = ABOD()
    elif model == 'auto_encoder' and n_features:
        #import os
        #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        from pyod.models.auto_encoder import AutoEncoder
        clf = AutoEncoder(
            hidden_neurons=[n_features, n_features * 5, n_features * 5, n_features],
            epochs=5, batch_size=64, preprocessing=False)
    elif model == 'cblof':
        from pyod.models.cblof import CBLOF
        clf = CBLOF(n_clusters=4)
    elif model == 'hbos':
        from pyod.models.hbos import HBOS
        clf = HBOS()
    elif model == 'iforest':
        from pyod.models.iforest import IForest
        clf = IForest()
    elif model == 'knn':
        from pyod.models.knn import KNN
        clf = KNN()
    elif model == 'lmdd':
        from pyod.models.lmdd import LMDD
        clf = LMDD()
    elif model == 'loci':
        from pyod.models.loci import LOCI
        clf = LOCI()
    elif model == 'loda':
        from pyod.models.loda import LODA
        clf = LODA()
    elif model == 'lof':
        from pyod.models.lof import LOF
        clf = LOF()
    elif model == 'mcd':
        from pyod.models.mcd import MCD
        clf = MCD()
    elif model == 'ocsvm':
        from pyod.models.ocsvm import OCSVM
        clf = OCSVM()
    elif model == 'pca':
        from pyod.models.pca import PCA
        clf = PCA()
    elif model == 'sod':
        from pyod.models.sod import SOD
        clf = SOD()
    elif model == 'vae':
        from pyod.models.vae import VAE
        clf = VAE()
    elif model == 'xgbod':
        from pyod.models.xgbod import XGBOD
        clf = XGBOD()
    else:
        #raise ValueError(f"unknown model {model}")
        clf = PyODDefaultModel()
    return clf
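A quick usage sketch for `pyod_init`: pick a detector by name, fit it on a feature matrix, and read back per-row anomaly scores. The training data here is synthetic and only for illustration.

import numpy as np

# Hypothetical usage of pyod_init: initialise LODA by name and score some data.
X = np.random.rand(500, 8)

clf = pyod_init('loda')
clf.fit(X)

scores = clf.decision_scores_   # raw outlier score per training row
labels = clf.labels_            # 0 = inlier, 1 = outlier (per the fitted contamination)
print(labels.sum(), "rows flagged as anomalous")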
iterate_threshold = True

if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s P%(process)d %(levelname)s %(message)s",
    )

    # load dataset
    data_dict = load_dataset(
        dataset,
        subdataset,
        "all",
    )
    x_train = data_dict["train"]
    x_test = data_dict["test"]
    x_test_labels = data_dict["test_labels"]

    # fit LODA on the training data
    od = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts)
    od.fit(x_train)

    # get outlier scores
    anomaly_score = od.decision_function(x_test)
    anomaly_label = x_test_labels

    # Make evaluation
    evaluate_all(anomaly_score, anomaly_label)
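The `evaluate_all` helper is not shown here. As an assumption about one evaluation it might include, the sketch below scans candidate thresholds over the anomaly scores and reports the best point-wise F1; the `best_f1_from_scores` name is hypothetical.

import numpy as np
from sklearn.metrics import precision_recall_curve

def best_f1_from_scores(scores, labels):
    # Hypothetical helper: pick the threshold on the raw scores that maximises F1.
    precision, recall, thresholds = precision_recall_curve(labels, scores)
    f1 = 2 * precision * recall / np.maximum(precision + recall, 1e-12)
    best = int(np.argmax(f1))
    return f1[best], thresholds[min(best, len(thresholds) - 1)]

# Example with random data standing in for anomaly_score / anomaly_label.
scores = np.random.rand(1000)
labels = (np.random.rand(1000) < 0.05).astype(int)
f1, thr = best_f1_from_scores(scores, labels)
print(f"best F1 = {f1:.3f} at threshold {thr:.3f}")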
def fit(self, X, shrink_cols = True, data_scaler = preprocessing.MaxAbsScaler(), quick_methods = True, slow_methods = False, nn_methods = False, contamination = 0.05, use_score_rank = False, random_state = None, verbose = 0): if len(X.shape) > 2: X = X.reshape(X.shape[0], X.shape[1]*X.shape[2]) elif len(X.shape) > 3: raise ValueError("Expected number of dimensions: 2 or 3") if shrink_cols: X = X[:,~np.all(X == 0, axis=0)] log.info('zero columns shrinked') if data_scaler: X = data_scaler.fit_transform(X) log.info(f'used {data_scaler} data scaler') #log.info(X[0:1,:]) n_rows = X.shape[0] n_features = X.shape[1] log.info (f'n_rows = {n_rows}, n_features = {n_features}') quick_scores = np.zeros([n_rows, 0]) slow_scores = np.zeros([n_rows, 0]) nn_scores = np.zeros([n_rows, 0]) if quick_methods: # Define anomaly detection tools to be compared quick_classifiers = { 'PCA_randomized': PCA(contamination=contamination, random_state=random_state, standardization = False, svd_solver = 'randomized'), 'PCA_full': PCA(contamination=contamination, random_state=random_state, standardization = False, svd_solver = 'full'), 'COPOD': COPOD(contamination=contamination), f'HBOS': HBOS(contamination=contamination), f'HBOS_{200}': HBOS(contamination=contamination, n_bins = 200), f'HBOS_{300}': HBOS(contamination=contamination, n_bins = 300), 'LODA': LODA(contamination=contamination), 'LODA_200': LODA(contamination=contamination, n_random_cuts = 200), 'LODA_300': LODA(contamination=contamination, n_random_cuts = 300), 'IForest_100': IForest(contamination=contamination, random_state=random_state, n_estimators = 100, bootstrap = False, n_jobs = -1), 'IForest_200': IForest(contamination=contamination, random_state=random_state, n_estimators = 200, bootstrap = False, n_jobs = -1), 'IForest_bootstrap': IForest(contamination = contamination, random_state=random_state, n_estimators = 150, bootstrap = True, n_jobs = -1), #'MCD': # MCD(contamination=contamination, random_state=random_state, assume_centered = False), #'MCD_centered': # MCD(contamination=contamination, random_state=random_state, assume_centered = True), f'CBLOF_16': CBLOF(contamination=contamination, random_state=random_state, n_clusters = 16), f'CBLOF_24': CBLOF(contamination=contamination, random_state=random_state, n_clusters = 24), f'CBLOF_32': CBLOF(contamination=contamination, random_state=random_state, n_clusters = 32) } quick_scores = np.zeros([n_rows, len(quick_classifiers)]) for i, (clf_name, clf) in enumerate(quick_classifiers.items()): log.info(f'{i+1} - fitting {clf_name}') try: clf.fit(X) quick_scores[:, i] = clf.decision_scores_ except: log.info(traceback.print_exc()) else: log.info(f'Base detector {i+1}/{len(quick_classifiers)} is fitted for prediction') quick_scores = np.nan_to_num(quick_scores) if slow_methods: # initialize a set of detectors for LSCP detector_list = [LOF(n_neighbors=10), LOF(n_neighbors=15), LOF(n_neighbors=20)] slow_classifiers = { #'Angle-based Outlier Detector (ABOD)': #too slow and nan results # ABOD(contamination=contamination), #'One-class SVM (OCSVM)': # OCSVM(contamination=contamination, cache_size = 2000, shrinking = False, tol = 1e-2), #'LSCP': #slow and no parallel # LSCP(detector_list, contamination=contamination, random_state=random_state, local_region_size = 30), #'Feature Bagging': #ensemble #no real par # FeatureBagging(LOF(n_neighbors=20), contamination=contamination, # random_state=random_state, n_jobs = -1), #'SOS' : # too memory inefficient # SOS(contamination=contamination), #'COF': # memory inefficient 
# COF(contamination=contamination), #'SOD': # SOD(contamination = contamination), #'KNN': # KNN(contamination=contamination, n_jobs = -1), #'KNN_50': # KNN(contamination=contamination, leaf_size = 50, n_jobs = -1), #'KNN_70': # KNN(contamination=contamination, leaf_size = 70, n_jobs = -1), 'LOF_4': LOF(n_neighbors=4, contamination=contamination, n_jobs = -1), 'LOF_5': LOF(n_neighbors=5, contamination=contamination, n_jobs = -1), 'LOF_6': LOF(n_neighbors=6, contamination=contamination, n_jobs = -1), 'LOF_7': LOF(n_neighbors=7, contamination=contamination, n_jobs = -1), 'LOF_8': LOF(n_neighbors=8, contamination=contamination, n_jobs = -1), 'LOF_9': LOF(n_neighbors=9, contamination=contamination, n_jobs = -1), 'LOF_10': LOF(n_neighbors=10, contamination=contamination, n_jobs = -1), 'LOF_12': LOF(n_neighbors=12, contamination=contamination, n_jobs = -1), 'LOF_14': LOF(n_neighbors=14, contamination=contamination, n_jobs = -1), 'LOF_16': LOF(n_neighbors=16, contamination=contamination, n_jobs = -1), 'LOF_18': LOF(n_neighbors=18, contamination=contamination, n_jobs = -1), 'LOF_20': LOF(n_neighbors=20, contamination=contamination, n_jobs = -1), 'LOF_22': LOF(n_neighbors=22, contamination=contamination, n_jobs = -1) } slow_scores = np.zeros([n_rows, len(slow_classifiers)]) for i, (clf_name, clf) in enumerate(slow_classifiers.items()): log.info(f'{i+1} - fitting {clf_name}') try: clf.fit(X) slow_scores[:, i] = clf.decision_scores_ except: log.info(traceback.print_exc()) else: log.info(f'Base detector {i+1}/{len(slow_classifiers)} is fitted for prediction') slow_scores = np.nan_to_num(slow_scores) if nn_methods: nn_classifiers = {} n_list = [1024, 512, 256, 128, 64, 32, 16, 8, 4, 2] n_idx = next(x[0] for x in enumerate(n_list) if x[1] < n_features) for i in range(3,6): n_enc = n_list[n_idx:n_idx+i-1] n_dec = n_enc[::-1] n_enc_dec = n_enc + n_dec nn_classifiers[f'FULL_AE_{len(n_enc + n_dec)}'] = {'clf': self.full_autoencoder, 'hidden_layers' : n_enc_dec } nn_classifiers[f'VAE_{len(n_enc_dec)}'] = {'clf': VAE(contamination = contamination, random_state = random_state, encoder_neurons = n_enc, decoder_neurons = n_dec, preprocessing = False, epochs = 32, verbosity = verbose), 'hidden_layers' : n_enc + n_dec } nn_scores = np.zeros([n_rows, len(nn_classifiers)]) for i, (clf_name, clf) in enumerate(nn_classifiers.items()): log.info(f'''{i+1} - fitting {clf_name} with layers {clf['hidden_layers']}''') try: if clf['clf'] == self.full_autoencoder: nn_scores[:, i] = clf['clf'](X, neurons_list = clf['hidden_layers'], verbose = verbose) else: clf['clf'].fit(X) nn_scores[:, i] = clf['clf'].decision_scores_ except: log.info(traceback.print_exc()) else: log.info(f'Base detector {i+1}/{len(nn_classifiers)} is fitted for prediction') nn_scores = np.nan_to_num(nn_scores) all_scores = np.concatenate((quick_scores, slow_scores, nn_scores), axis=1) all_scores = all_scores[:,~np.all(all_scores == 0, axis=0)] log.info(f'total scores = {all_scores.shape[1]}') all_scores_norm = np.copy(all_scores) if use_score_rank: all_scores_norm = np.apply_along_axis(rank_fun, 0, all_scores_norm) log.info(f'score rank applied') all_scores_norm = preprocessing.MinMaxScaler().fit_transform(all_scores_norm) if all_scores_norm.shape[1] >= 12: score_by_aom = aom(all_scores_norm, method = 'dynamic', n_buckets = round(all_scores_norm.shape[1]/4)) score_by_moa = moa(all_scores_norm, method = 'dynamic', n_buckets = round(all_scores_norm.shape[1]/4)) score_by_avg = np.mean(all_scores_norm, axis = 1) score_by_max = np.max(all_scores_norm, axis = 1) 
        else:
            score_by_avg = np.mean(all_scores_norm, axis=1)
            score_by_max = np.max(all_scores_norm, axis=1)
            score_by_aom = score_by_avg
            score_by_moa = score_by_max

        return score_by_aom, score_by_moa, score_by_max, score_by_avg, all_scores, all_scores_norm
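For reference, the same average / maximization / AOM / MOA combinations used above are available directly in PyOD. The standalone sketch below combines normalized scores from two detectors this way; the data and detector choices are illustrative only.

import numpy as np
from pyod.models.loda import LODA
from pyod.models.hbos import HBOS
from pyod.utils.utility import standardizer
from pyod.models.combination import average, maximization, aom, moa

X = np.random.rand(300, 6)

detectors = [LODA(n_random_cuts=100), HBOS(n_bins=10)]
scores = np.zeros((X.shape[0], len(detectors)))
for i, det in enumerate(detectors):
    scores[:, i] = det.fit(X).decision_scores_

scores_norm = standardizer(scores)             # z-score each detector's output
combined_avg = average(scores_norm)            # mean of the per-detector scores
combined_max = maximization(scores_norm)       # max of the per-detector scores
combined_aom = aom(scores_norm, n_buckets=2)   # average of maximums
combined_moa = moa(scores_norm, n_buckets=2)   # maximum of averages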
clf.fit(X_train)
sklearn_score_anomalies = clf.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_svm_ws = evaluate.AUC(original_paper_score, y_test)

# --- LOF --- #
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)
sklearn_score_anomalies = lof.decision_function(X_test)
original_paper_score = [-1 * s + 0.5 for s in sklearn_score_anomalies]
auc_lof_ws = evaluate.AUC(original_paper_score, y_test)

# --- LODA --- #
aucs_loda_ws = np.zeros(num_of_experiments)
for r in tqdm(range(num_of_experiments)):
    loda = LODA()
    loda.fit(X_train)
    y_pred_proba_loda = np.zeros(X_test.shape[0])
    for i in tqdm(range(X_test.shape[0])):
        loda.fit(X_test[i, :].reshape(1, -1))
        y_pred_proba_loda[i] = loda.decision_function(X_test[i, :].reshape(1, -1))
    aucs_loda_ws[r] = evaluate.AUC(1 - y_pred_proba_loda, y_test)
auc_loda_ws = np.mean(aucs_loda_ws)

# --- HalfSpaceTrees --- #
aucs_hst_ws = np.zeros(num_of_experiments)
for r in tqdm(range(num_of_experiments)):
    hst = HalfSpaceTrees(n_features=X_train_hst.shape[1], n_estimators=100)
    hst.fit(X_train_hst, np.zeros(X_train_hst.shape[0]))
    y_pred_proba_hst = np.zeros(X_test_hst.shape[0])
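Note that the LODA loop above refits the detector on each single test row, which discards whatever was learned from X_train. If a plain train/test evaluation is intended, the more usual pattern is to fit once and score the whole test set, as in the sketch below (synthetic data stands in for the script's arrays).

import numpy as np
from sklearn.metrics import roc_auc_score
from pyod.models.loda import LODA

# Illustrative data; in the script above these would be X_train, X_test, y_test.
X_train = np.random.rand(400, 10)
X_test = np.random.rand(100, 10)
y_test = (np.random.rand(100) < 0.1).astype(int)

loda = LODA()
loda.fit(X_train)                          # fit once on training data
scores = loda.decision_function(X_test)    # higher score = more anomalous
print("AUC:", roc_auc_score(y_test, scores))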
class TestLODA(unittest.TestCase): def setUp(self): self.n_train = 200 self.n_test = 100 self.contamination = 0.1 self.roc_floor = 0.8 self.X_train, self.y_train, self.X_test, self.y_test = generate_data( n_train=self.n_train, n_test=self.n_test, contamination=self.contamination, random_state=42) self.clf = LODA(contamination=self.contamination) self.clf.fit(self.X_train) def test_parameters(self): assert (hasattr(self.clf, 'decision_scores_') and self.clf.decision_scores_ is not None) assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None) assert (hasattr(self.clf, 'threshold_') and self.clf.threshold_ is not None) assert (hasattr(self.clf, '_mu') and self.clf._mu is not None) assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None) assert (hasattr(self.clf, 'projections_') and self.clf.projections_ is not None) def test_train_scores(self): assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0]) def test_prediction_scores(self): pred_scores = self.clf.decision_function(self.X_test) # check score shapes assert_equal(pred_scores.shape[0], self.X_test.shape[0]) # check performance assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor) def test_prediction_labels(self): pred_labels = self.clf.predict(self.X_test) assert_equal(pred_labels.shape, self.y_test.shape) def test_prediction_proba(self): pred_proba = self.clf.predict_proba(self.X_test) assert (pred_proba.min() >= 0) assert (pred_proba.max() <= 1) def test_prediction_proba_linear(self): pred_proba = self.clf.predict_proba(self.X_test, method='linear') assert (pred_proba.min() >= 0) assert (pred_proba.max() <= 1) def test_prediction_proba_unify(self): pred_proba = self.clf.predict_proba(self.X_test, method='unify') assert (pred_proba.min() >= 0) assert (pred_proba.max() <= 1) def test_prediction_proba_parameter(self): with assert_raises(ValueError): self.clf.predict_proba(self.X_test, method='something') def test_fit_predict(self): pred_labels = self.clf.fit_predict(self.X_train) assert_equal(pred_labels.shape, self.y_train.shape) def test_fit_predict_score(self): self.clf.fit_predict_score(self.X_test, self.y_test) self.clf.fit_predict_score(self.X_test, self.y_test, scoring='roc_auc_score') self.clf.fit_predict_score(self.X_test, self.y_test, scoring='prc_n_score') with assert_raises(NotImplementedError): self.clf.fit_predict_score(self.X_test, self.y_test, scoring='something') def test_model_clone(self): clone_clf = clone(self.clf) def tearDown(self): pass
def get_detectors(): # randomness_flags = [] BASE_ESTIMATORS = [ LODA(n_bins=5, n_random_cuts=10), LODA(n_bins=5, n_random_cuts=20), LODA(n_bins=5, n_random_cuts=30), LODA(n_bins=5, n_random_cuts=40), LODA(n_bins=5, n_random_cuts=50), LODA(n_bins=5, n_random_cuts=75), LODA(n_bins=5, n_random_cuts=100), LODA(n_bins=5, n_random_cuts=150), LODA(n_bins=5, n_random_cuts=200), LODA(n_bins=10, n_random_cuts=10), LODA(n_bins=10, n_random_cuts=20), LODA(n_bins=10, n_random_cuts=30), LODA(n_bins=10, n_random_cuts=40), LODA(n_bins=10, n_random_cuts=50), LODA(n_bins=10, n_random_cuts=75), LODA(n_bins=10, n_random_cuts=100), LODA(n_bins=10, n_random_cuts=150), LODA(n_bins=10, n_random_cuts=200), LODA(n_bins=15, n_random_cuts=10), LODA(n_bins=15, n_random_cuts=20), LODA(n_bins=15, n_random_cuts=30), LODA(n_bins=15, n_random_cuts=40), LODA(n_bins=15, n_random_cuts=50), LODA(n_bins=15, n_random_cuts=75), LODA(n_bins=15, n_random_cuts=100), LODA(n_bins=15, n_random_cuts=150), LODA(n_bins=15, n_random_cuts=200), LODA(n_bins=20, n_random_cuts=10), LODA(n_bins=20, n_random_cuts=20), LODA(n_bins=20, n_random_cuts=30), LODA(n_bins=20, n_random_cuts=40), LODA(n_bins=20, n_random_cuts=50), LODA(n_bins=20, n_random_cuts=75), LODA(n_bins=20, n_random_cuts=100), LODA(n_bins=20, n_random_cuts=150), LODA(n_bins=20, n_random_cuts=200), LODA(n_bins=25, n_random_cuts=10), LODA(n_bins=25, n_random_cuts=20), LODA(n_bins=25, n_random_cuts=30), LODA(n_bins=25, n_random_cuts=40), LODA(n_bins=25, n_random_cuts=50), LODA(n_bins=25, n_random_cuts=75), LODA(n_bins=25, n_random_cuts=100), LODA(n_bins=25, n_random_cuts=150), LODA(n_bins=25, n_random_cuts=200), LODA(n_bins=30, n_random_cuts=10), LODA(n_bins=30, n_random_cuts=20), LODA(n_bins=30, n_random_cuts=30), LODA(n_bins=30, n_random_cuts=40), LODA(n_bins=30, n_random_cuts=50), LODA(n_bins=30, n_random_cuts=75), LODA(n_bins=30, n_random_cuts=100), LODA(n_bins=30, n_random_cuts=150), LODA(n_bins=30, n_random_cuts=200), ABOD(n_neighbors=3), ABOD(n_neighbors=5), ABOD(n_neighbors=10), ABOD(n_neighbors=15), ABOD(n_neighbors=20), ABOD(n_neighbors=25), ABOD(n_neighbors=50), ABOD(n_neighbors=60), ABOD(n_neighbors=75), ABOD(n_neighbors=80), ABOD(n_neighbors=90), ABOD(n_neighbors=100), IForest(n_estimators=10, max_features=0.1), IForest(n_estimators=10, max_features=0.2), IForest(n_estimators=10, max_features=0.3), IForest(n_estimators=10, max_features=0.4), IForest(n_estimators=10, max_features=0.5), IForest(n_estimators=10, max_features=0.6), IForest(n_estimators=10, max_features=0.7), IForest(n_estimators=10, max_features=0.8), IForest(n_estimators=10, max_features=0.9), IForest(n_estimators=20, max_features=0.1), IForest(n_estimators=20, max_features=0.2), IForest(n_estimators=20, max_features=0.3), IForest(n_estimators=20, max_features=0.4), IForest(n_estimators=20, max_features=0.5), IForest(n_estimators=20, max_features=0.6), IForest(n_estimators=20, max_features=0.7), IForest(n_estimators=20, max_features=0.8), IForest(n_estimators=20, max_features=0.9), IForest(n_estimators=30, max_features=0.1), IForest(n_estimators=30, max_features=0.2), IForest(n_estimators=30, max_features=0.3), IForest(n_estimators=30, max_features=0.4), IForest(n_estimators=30, max_features=0.5), IForest(n_estimators=30, max_features=0.6), IForest(n_estimators=30, max_features=0.7), IForest(n_estimators=30, max_features=0.8), IForest(n_estimators=30, max_features=0.9), IForest(n_estimators=40, max_features=0.1), IForest(n_estimators=40, max_features=0.2), IForest(n_estimators=40, max_features=0.3), 
IForest(n_estimators=40, max_features=0.4), IForest(n_estimators=40, max_features=0.5), IForest(n_estimators=40, max_features=0.6), IForest(n_estimators=40, max_features=0.7), IForest(n_estimators=40, max_features=0.8), IForest(n_estimators=40, max_features=0.9), IForest(n_estimators=50, max_features=0.1), IForest(n_estimators=50, max_features=0.2), IForest(n_estimators=50, max_features=0.3), IForest(n_estimators=50, max_features=0.4), IForest(n_estimators=50, max_features=0.5), IForest(n_estimators=50, max_features=0.6), IForest(n_estimators=50, max_features=0.7), IForest(n_estimators=50, max_features=0.8), IForest(n_estimators=50, max_features=0.9), IForest(n_estimators=75, max_features=0.1), IForest(n_estimators=75, max_features=0.2), IForest(n_estimators=75, max_features=0.3), IForest(n_estimators=75, max_features=0.4), IForest(n_estimators=75, max_features=0.5), IForest(n_estimators=75, max_features=0.6), IForest(n_estimators=75, max_features=0.7), IForest(n_estimators=75, max_features=0.8), IForest(n_estimators=75, max_features=0.9), IForest(n_estimators=100, max_features=0.1), IForest(n_estimators=100, max_features=0.2), IForest(n_estimators=100, max_features=0.3), IForest(n_estimators=100, max_features=0.4), IForest(n_estimators=100, max_features=0.5), IForest(n_estimators=100, max_features=0.6), IForest(n_estimators=100, max_features=0.7), IForest(n_estimators=100, max_features=0.8), IForest(n_estimators=100, max_features=0.9), IForest(n_estimators=150, max_features=0.1), IForest(n_estimators=150, max_features=0.2), IForest(n_estimators=150, max_features=0.3), IForest(n_estimators=150, max_features=0.4), IForest(n_estimators=150, max_features=0.5), IForest(n_estimators=150, max_features=0.6), IForest(n_estimators=150, max_features=0.7), IForest(n_estimators=150, max_features=0.8), IForest(n_estimators=150, max_features=0.9), IForest(n_estimators=200, max_features=0.1), IForest(n_estimators=200, max_features=0.2), IForest(n_estimators=200, max_features=0.3), IForest(n_estimators=200, max_features=0.4), IForest(n_estimators=200, max_features=0.5), IForest(n_estimators=200, max_features=0.6), IForest(n_estimators=200, max_features=0.7), IForest(n_estimators=200, max_features=0.8), IForest(n_estimators=200, max_features=0.9), KNN(n_neighbors=1, method='largest'), KNN(n_neighbors=5, method='largest'), KNN(n_neighbors=10, method='largest'), KNN(n_neighbors=15, method='largest'), KNN(n_neighbors=20, method='largest'), KNN(n_neighbors=25, method='largest'), KNN(n_neighbors=50, method='largest'), KNN(n_neighbors=60, method='largest'), KNN(n_neighbors=70, method='largest'), KNN(n_neighbors=80, method='largest'), KNN(n_neighbors=90, method='largest'), KNN(n_neighbors=100, method='largest'), KNN(n_neighbors=1, method='mean'), KNN(n_neighbors=5, method='mean'), KNN(n_neighbors=10, method='mean'), KNN(n_neighbors=15, method='mean'), KNN(n_neighbors=20, method='mean'), KNN(n_neighbors=25, method='mean'), KNN(n_neighbors=50, method='mean'), KNN(n_neighbors=60, method='mean'), KNN(n_neighbors=70, method='mean'), KNN(n_neighbors=80, method='mean'), KNN(n_neighbors=90, method='mean'), KNN(n_neighbors=100, method='mean'), KNN(n_neighbors=1, method='median'), KNN(n_neighbors=5, method='median'), KNN(n_neighbors=10, method='median'), KNN(n_neighbors=15, method='median'), KNN(n_neighbors=20, method='median'), KNN(n_neighbors=25, method='median'), KNN(n_neighbors=50, method='median'), KNN(n_neighbors=60, method='median'), KNN(n_neighbors=70, method='median'), KNN(n_neighbors=80, method='median'), 
KNN(n_neighbors=90, method='median'), KNN(n_neighbors=100, method='median'), LOF(n_neighbors=1, metric='manhattan'), LOF(n_neighbors=5, metric='manhattan'), LOF(n_neighbors=10, metric='manhattan'), LOF(n_neighbors=15, metric='manhattan'), LOF(n_neighbors=20, metric='manhattan'), LOF(n_neighbors=25, metric='manhattan'), LOF(n_neighbors=50, metric='manhattan'), LOF(n_neighbors=60, metric='manhattan'), LOF(n_neighbors=70, metric='manhattan'), LOF(n_neighbors=80, metric='manhattan'), LOF(n_neighbors=90, metric='manhattan'), LOF(n_neighbors=100, metric='manhattan'), LOF(n_neighbors=1, metric='euclidean'), LOF(n_neighbors=5, metric='euclidean'), LOF(n_neighbors=10, metric='euclidean'), LOF(n_neighbors=15, metric='euclidean'), LOF(n_neighbors=20, metric='euclidean'), LOF(n_neighbors=25, metric='euclidean'), LOF(n_neighbors=50, metric='euclidean'), LOF(n_neighbors=60, metric='euclidean'), LOF(n_neighbors=70, metric='euclidean'), LOF(n_neighbors=80, metric='euclidean'), LOF(n_neighbors=90, metric='euclidean'), LOF(n_neighbors=100, metric='euclidean'), LOF(n_neighbors=1, metric='minkowski'), LOF(n_neighbors=5, metric='minkowski'), LOF(n_neighbors=10, metric='minkowski'), LOF(n_neighbors=15, metric='minkowski'), LOF(n_neighbors=20, metric='minkowski'), LOF(n_neighbors=25, metric='minkowski'), LOF(n_neighbors=50, metric='minkowski'), LOF(n_neighbors=60, metric='minkowski'), LOF(n_neighbors=70, metric='minkowski'), LOF(n_neighbors=80, metric='minkowski'), LOF(n_neighbors=90, metric='minkowski'), LOF(n_neighbors=100, metric='minkowski'), HBOS(n_bins=5, alpha=0.1), HBOS(n_bins=5, alpha=0.2), HBOS(n_bins=5, alpha=0.3), HBOS(n_bins=5, alpha=0.4), HBOS(n_bins=5, alpha=0.5), HBOS(n_bins=10, alpha=0.1), HBOS(n_bins=10, alpha=0.2), HBOS(n_bins=10, alpha=0.3), HBOS(n_bins=10, alpha=0.4), HBOS(n_bins=10, alpha=0.5), HBOS(n_bins=20, alpha=0.1), HBOS(n_bins=20, alpha=0.2), HBOS(n_bins=20, alpha=0.3), HBOS(n_bins=20, alpha=0.4), HBOS(n_bins=20, alpha=0.5), HBOS(n_bins=30, alpha=0.1), HBOS(n_bins=30, alpha=0.2), HBOS(n_bins=30, alpha=0.3), HBOS(n_bins=30, alpha=0.4), HBOS(n_bins=30, alpha=0.5), HBOS(n_bins=40, alpha=0.1), HBOS(n_bins=40, alpha=0.2), HBOS(n_bins=40, alpha=0.3), HBOS(n_bins=40, alpha=0.4), HBOS(n_bins=40, alpha=0.5), HBOS(n_bins=50, alpha=0.1), HBOS(n_bins=50, alpha=0.2), HBOS(n_bins=50, alpha=0.3), HBOS(n_bins=50, alpha=0.4), HBOS(n_bins=50, alpha=0.5), HBOS(n_bins=75, alpha=0.1), HBOS(n_bins=75, alpha=0.2), HBOS(n_bins=75, alpha=0.3), HBOS(n_bins=75, alpha=0.4), HBOS(n_bins=75, alpha=0.5), HBOS(n_bins=100, alpha=0.1), HBOS(n_bins=100, alpha=0.2), HBOS(n_bins=100, alpha=0.3), HBOS(n_bins=100, alpha=0.4), HBOS(n_bins=100, alpha=0.5), OCSVM(nu=0.1, kernel="linear"), OCSVM(nu=0.2, kernel="linear"), OCSVM(nu=0.3, kernel="linear"), OCSVM(nu=0.4, kernel="linear"), OCSVM(nu=0.5, kernel="linear"), OCSVM(nu=0.6, kernel="linear"), OCSVM(nu=0.7, kernel="linear"), OCSVM(nu=0.8, kernel="linear"), OCSVM(nu=0.9, kernel="linear"), OCSVM(nu=0.1, kernel="poly"), OCSVM(nu=0.2, kernel="poly"), OCSVM(nu=0.3, kernel="poly"), OCSVM(nu=0.4, kernel="poly"), OCSVM(nu=0.5, kernel="poly"), OCSVM(nu=0.6, kernel="poly"), OCSVM(nu=0.7, kernel="poly"), OCSVM(nu=0.8, kernel="poly"), OCSVM(nu=0.9, kernel="poly"), OCSVM(nu=0.1, kernel="rbf"), OCSVM(nu=0.2, kernel="rbf"), OCSVM(nu=0.3, kernel="rbf"), OCSVM(nu=0.4, kernel="rbf"), OCSVM(nu=0.5, kernel="rbf"), OCSVM(nu=0.6, kernel="rbf"), OCSVM(nu=0.7, kernel="rbf"), OCSVM(nu=0.8, kernel="rbf"), OCSVM(nu=0.9, kernel="rbf"), OCSVM(nu=0.1, kernel="sigmoid"), OCSVM(nu=0.2, kernel="sigmoid"), 
OCSVM(nu=0.3, kernel="sigmoid"), OCSVM(nu=0.4, kernel="sigmoid"), OCSVM(nu=0.5, kernel="sigmoid"), OCSVM(nu=0.6, kernel="sigmoid"), OCSVM(nu=0.7, kernel="sigmoid"), OCSVM(nu=0.8, kernel="sigmoid"), OCSVM(nu=0.9, kernel="sigmoid"), COF(n_neighbors=3), COF(n_neighbors=5), COF(n_neighbors=10), COF(n_neighbors=15), COF(n_neighbors=20), COF(n_neighbors=25), COF(n_neighbors=50), ] # randomness_flags.extend([True] * 54) # LODA # randomness_flags.extend([False] * 7) # ABOD # randomness_flags.extend([True] * 81) # IForest # randomness_flags.extend([False] * 36) # KNN # randomness_flags.extend([False] * 36) # LOF # randomness_flags.extend([False] * 40) # HBOS # randomness_flags.extend([False] * 36) # OCSVM # randomness_flags.extend([False] * 7) # COF # return BASE_ESTIMATORS, randomness_flags return BASE_ESTIMATORS
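One possible way to use the pool returned by `get_detectors` is to fit every base estimator on the same data and stack their outlier scores into a matrix for later combination or meta-learning. A small sketch follows (synthetic data; the try/except simply skips detectors that fail to fit).

import numpy as np

X = np.random.rand(300, 8)

detectors = get_detectors()
scores = np.zeros((X.shape[0], len(detectors)))
for j, det in enumerate(detectors):
    try:
        det.fit(X)
        scores[:, j] = det.decision_scores_
    except Exception as exc:
        print(f"detector {j} ({det.__class__.__name__}) failed: {exc}")

print("score matrix shape:", scores.shape)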
if __name__ == "__main__": contamination = 0.1 # percentage of outliers n_train = 200 # number of training points n_test = 100 # number of testing points # Generate sample data X_train, X_test, y_train, y_test = \ generate_data(n_train=n_train, n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train LOCI detector clf_name = 'LODA' clf = LODA() clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores)
if __name__ == '__main__':
    # Specify the root directory
    rootDir = 'G:/My Drive/Github/ml-group-col/One-Class-models/Anomaly_Datasets_csv/'
    # specify the random state
    rs = 10
    # Save how to run the models
    models = [
        (IsolationForest(random_state=rs), 'ISOF'),
        (EllipticEnvelope(random_state=rs), 'EE'),
        (LMDD(dis_measure='aad', random_state=rs), 'AAD_LMDD'),
        (COPOD(), 'COPOD'),
        (FeatureBagging(combination='average', random_state=rs), 'AVE_Bagging'),  # n_jobs
        (LMDD(dis_measure='iqr', random_state=rs), 'IQR_LMDD'),
        (KNN(method='largest'), 'Largest_KNN'),  # n_jobs
        (LODA(), 'LODA'),
        (FeatureBagging(combination='max', n_jobs=-1, random_state=rs), 'MAX_Bagging'),
        (MCD(random_state=rs), 'MCD'),
        (XGBOD(random_state=rs), 'XGBOD'),  # n_jobs
        (GaussianMixture(random_state=rs), 'GMM'),
        (LocalOutlierFactor(novelty=True), 'LOF'),
        (KNN(method='median'), 'Median_KNN'),  # n_jobs
        (KNN(method='mean'), 'Avg_KNN'),  # n_jobs
        (CBLOF(n_clusters=10, random_state=rs), 'CBLOF'),
        (HBOS(), 'HBOS'),
        (SOD(), 'SOD'),
        (PCA(random_state=rs), 'PCA'),
        (VAE(encoder_neurons=[3, 4, 3], decoder_neurons=[3, 4, 3],
             random_state=rs), 'VAE'),