def setUp(self):
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = CBLOF(contamination=self.contamination, random_state=42)
    self.clf.fit(self.X_train)
def getOutlierCBLOF(dataset):
    '''
    @brief Function that executes the CBLOF algorithm on the dataset and
    obtains the labels of the dataset, indicating which instances are
    inliers (0) or outliers (1)
    @param dataset Dataset on which to run the algorithm
    @return It returns a list of labels: 0 means inlier, 1 means outlier
    '''
    # Initialize the model
    cblof = CBLOF()
    # Fit the data and obtain the labels
    cblof.fit(dataset)
    # Return the labels
    return cblof.labels_
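A minimal usage sketch for getOutlierCBLOF, assuming numpy and pyod are installed and the function above is in scope; the blob-plus-outliers data is purely illustrative.

import numpy as np

rng = np.random.RandomState(42)
# 95 illustrative inliers around the origin plus 5 far-away points
toy = np.vstack([rng.normal(0, 1, size=(95, 2)),
                 rng.uniform(6, 8, size=(5, 2))])

labels = getOutlierCBLOF(toy)
print("flagged as outliers:", int(labels.sum()), "of", len(labels))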
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector clf on x_train

    # labels and outlier scores on the training data
    y_train_pred = clf.labels_  # binary labels on the training data (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # outlier scores on the training data (higher means more anomalous)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # use the trained clf to predict outliers in unseen data
    y_test_pred = clf.predict(x_test)  # binary labels on the test data (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # outlier scores on the test data (higher means more anomalous)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
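A hedged usage sketch for calculate on synthetic labeled data. It assumes the helpers imported by the snippet (evaluate_print, precision_n_scores, column_or_1d, check_consistent_length, roc_auc_score, np) are in scope; the toy dataset is illustrative only.

import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(180, 2),                 # inliers
               rng.uniform(4, 6, size=(20, 2))])  # outliers
y = np.r_[np.zeros(180), np.ones(20)]

x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.3,
                                          random_state=0, stratify=y)
total_roc, total_prn = [], []
calculate('CBLOF', total_roc, total_prn, x_tr, x_te, y_tr, y_te)
print(total_roc, total_prn)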
def __load_classifiers(self):
    outliers_fraction = 0.05
    random_state = np.random.RandomState(0)

    classifiers = {
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Average KNN':
            KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
    }
    return classifiers
def model_init(self, model):
    """Model initialisation of a single model.
    """
    if self.model == 'pca':
        self.models[model] = PCA(contamination=self.contamination)
    elif self.model == 'loda':
        self.models[model] = LODA(contamination=self.contamination)
    elif self.model == 'iforest':
        self.models[model] = IForest(n_estimators=50, bootstrap=True,
                                     behaviour='new',
                                     contamination=self.contamination)
    elif self.model == 'cblof':
        self.models[model] = CBLOF(n_clusters=3,
                                   contamination=self.contamination)
    elif self.model == 'feature_bagging':
        self.models[model] = FeatureBagging(
            base_estimator=PCA(contamination=self.contamination),
            contamination=self.contamination)
    elif self.model == 'copod':
        self.models[model] = COPOD(contamination=self.contamination)
    elif self.model == 'hbos':
        self.models[model] = HBOS(contamination=self.contamination)
    else:
        self.models[model] = HBOS(contamination=self.contamination)
    self.custom_model_scalers[model] = MinMaxScaler()
def load_classifiers(outliers_fraction):
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)
    # Define eleven outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state, behaviour="new"),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Average KNN':
            KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
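A hedged usage sketch for load_classifiers: fit every returned detector on small illustrative data and count flagged points. It assumes the pyod detector imports used above are in scope; the random matrix is not real data.

import numpy as np

X = np.random.RandomState(7).randn(300, 3)
for name, clf in load_classifiers(0.05).items():
    clf.fit(X)
    print(f"{name}: {int(clf.labels_.sum())} flagged as outliers")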
def define_classifiers(random_state, outliers_fraction):
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
def main():
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF',
        'HBOS', 'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF',
        'COPOD', 'SOD', 'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD',
        'SoGaal', 'MoGaal', 'VAE', 'AutoEncoder'
    ]
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"
    Parallel(n_jobs=CPUS)(
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=sklearn_models,
                             CPUS=CPUS_Models, save_name=name)
        for scaler in scalers)
def outlier_detection(x_raw, y_raw):
    """
    Filter out all outlier points
    :param x_raw: features in ndarray
    :param y_raw: labels in ndarray
    :return x_clean, y_clean: cleaned features and labels in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))

    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidates are listed as follows
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
            XGBOD(contamination=outliers_fraction),
    }

    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outlier and 0 means inlier
    # for sklearn, -1 means outlier and 1 means inlier
    idx_y_pred = [i for i in range(x_raw.shape[0]) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert x_clean.shape[0] == y_clean.shape[0]
    return x_clean, y_clean
def setUp(self):
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.6
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = CBLOF(contamination=self.contamination, random_state=42)
    self.clf.fit(self.X_train)
def OD_detect(df, id_col=None, contamination=0.05, trans_cols=None):
    """ use the pyod lib to find the 5% of outliers in a dataset """
    df = df.copy()
    OD_clfs = {
        "HBOS": HBOS(contamination=contamination),
        "IForest": IForest(contamination=contamination),
        "CBLOF": CBLOF(contamination=contamination, n_clusters=5),
        # "OCSVM": OCSVM(contamination=contamination),
        "PCA": PCA(contamination=contamination)
    }
    results_list = []
    od_cols = ["id", "name", "result", "label"]
    if id_col is None:
        s_id = df.index
        X_cols = df.columns
    else:
        s_id = df[id_col]
        X_cols = df.columns.drop(id_col)

    if trans_cols is not None:
        for col in trans_cols:
            df[col] = PowerTransformer().fit_transform(
                df[col].values.reshape(-1, 1))

    for clf_name, clf in OD_clfs.items():
        od_result = pd.DataFrame(columns=od_cols)  # create an empty dataframe
        od_result["id"] = s_id
        od_result['name'] = clf_name
        print(f"{clf_name}, {clf}")

        clf.fit(df[X_cols])
        od_result['result'] = clf.decision_scores_
        od_result['label'] = clf.labels_
        results_list.append(od_result)

    od_results_df = pd.concat(results_list, axis=0, ignore_index=True)
    job_name = f'{pd.Timestamp.now():%H%M}'
    od_results_df['job_name'] = job_name
    od_results_df.to_sql('t_ml', engine, if_exists='append', schema='wh_v1',
                         method=psql_insert_copy)
    print(f"OD results {od_results_df.shape} exported to database {engine}, "
          f"job_name={job_name}")
    return od_results_df
def models_init(self):
    """Models initialisation.
    """
    self.model = self.configuration.get('model', 'pca')
    if self.model == 'pca':
        self.models = {
            model: PCA(contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'loda':
        self.models = {
            model: LODA(contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'iforest':
        self.models = {
            model: IForest(n_estimators=50, bootstrap=True, behaviour='new',
                           contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'cblof':
        self.models = {
            model: CBLOF(n_clusters=3, contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'feature_bagging':
        self.models = {
            model: FeatureBagging(
                base_estimator=PCA(contamination=self.contamination),
                contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'copod':
        self.models = {
            model: COPOD(contamination=self.contamination)
            for model in self.models_in_scope
        }
    elif self.model == 'hbos':
        self.models = {
            model: HBOS(contamination=self.contamination)
            for model in self.models_in_scope
        }
    else:
        self.models = {
            model: HBOS(contamination=self.contamination)
            for model in self.models_in_scope
        }
    self.custom_model_scalers = {
        model: MinMaxScaler()
        for model in self.models_in_scope
    }
def ranger(parameter, classifier):
    __ = parameter
    classi__ = {
        'CBLOF': CBLOF(contamination=outliers_fraction,
                       check_estimator=False,
                       random_state=random_state,
                       n_clusters=__),
        'HBOS': HBOS(contamination=outliers_fraction, n_bins=__),
        'KNN': KNN(contamination=outliers_fraction, n_neighbors=__),
        'LOF': LOF(n_neighbors=__, contamination=outliers_fraction)
    }
    return classi__[classifier]
def outliers_detect(self, columns, outliers_fraction=0.05):
    X = pd.get_dummies(self.data[columns])
    clf = CBLOF(contamination=outliers_fraction, check_estimator=False,
                random_state=0)
    clf.fit(X)
    scores_pred = clf.decision_function(X) * -1
    y_pred = clf.predict(X)
    self.data['outlier'] = y_pred.tolist()
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)
def fit_transform(self, df_train, df_corrupted):
    pyod_model = CBLOF(contamination=0.25)  # n_clusters=8 by default

    df_outliers_num = self.num_out_detect(df_train, df_corrupted, pyod_model)
    df_outliers_cat = self.cat_out_detect(df_train, df_corrupted)

    df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')

    for col in df_corrupted.columns:
        for i in df_outliers.index:
            if df_outliers.loc[i, col + "_outlier"] == 1:
                df_outliers.loc[i, col] = np.nan

    return df_outliers, self.predictors
def identify_fit(algo_type, outliers_fraction):
    random_state = np.random.RandomState(42)
    if algo_type == 'hbos':
        clf = HBOS(contamination=outliers_fraction)
    elif algo_type == 'cblof':
        clf = CBLOF(contamination=outliers_fraction, check_estimator=False,
                    random_state=random_state)
    elif algo_type == 'iforest':
        clf = IForest(contamination=outliers_fraction,
                      random_state=random_state)
    elif algo_type == 'knn':
        clf = KNN(contamination=outliers_fraction)
    elif algo_type == 'aknn':
        clf = KNN(method='mean', contamination=outliers_fraction)
    else:
        raise ValueError(f"unknown algo_type: {algo_type}")
    return clf
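A short, hedged usage sketch for identify_fit: build one detector by key and fit it on illustrative random data, assuming the pyod imports used above are in scope.

import numpy as np

clf = identify_fit('cblof', outliers_fraction=0.05)
X = np.random.RandomState(3).randn(200, 2)
clf.fit(X)
print("outliers found:", int(clf.predict(X).sum()))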
def run(data_train, data_test, clf_name):
    classifiers = {
        "CBLOF": CBLOF(random_state=0),
    }
    X_train, y_train = train_data_process(data_train)
    X_test, y_true = test_data_process(data_test)
    clf = classifiers[clf_name]
    try:
        clf.fit(X_train)
        y_pred = clf.predict(X_test)
        TP = 0
        FN = 0
        FP = 0
        TN = 0
        for i, label in enumerate(y_true):
            if label:
                if y_pred[i]:
                    TP += 1
                else:
                    FN += 1
            else:
                if y_pred[i]:
                    FP += 1
                else:
                    TN += 1
        if (FP + TN) == 0:
            pf = "no negative samples."
        else:
            pf = FP / (FP + TN)
        try:
            auc = roc_auc_score(y_true, y_pred)
        except ValueError as e:
            auc = str(e)
        return {
            'train samples': str(X_train.shape[0]),
            'defective train samples': str(np.sum(y_train)),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'pf': pf,
            'F-measure': f1_score(y_true, y_pred),
            'accuracy': accuracy_score(y_true, y_pred),
            'AUC': auc
        }
    except ValueError as e:
        return str(e)
def detect_anomaly(df):
    x_values = df.index.values.reshape(df.index.values.shape[0], 1)
    y_values = df.change.values.reshape(df.change.values.shape[0], 1)

    clf = KNN()
    clf.fit(y_values)
    df["label_knn"] = clf.predict(y_values)
    df["score_knn"] = clf.decision_function(y_values).round(4)

    clf = IForest()
    clf.fit(y_values)
    df["label_iforest"] = clf.predict(y_values)
    df["score_iforest"] = clf.decision_function(y_values).round(4)

    clf = CBLOF()
    clf.fit(y_values)
    df["label_cblof"] = clf.predict(y_values)
    df["score_cblof"] = clf.decision_function(y_values).round(2)

    return df
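A hedged usage sketch for detect_anomaly, assuming pyod and pandas are available. The toy 'change' series is illustrative; note that CBLOF's large/small cluster separation can occasionally fail on very small one-dimensional samples.

import numpy as np
import pandas as pd

toy = pd.DataFrame({'change': np.random.RandomState(2).randn(120)})
toy = detect_anomaly(toy)
print(toy[['label_knn', 'label_iforest', 'label_cblof']].sum())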
def out_lier_score(df, target, num_var):
    scaler = MinMaxScaler(feature_range=(0, 1))
    df = scaler.fit_transform(df.loc[:, num_var], df[target])  # .to_numpy()
    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05
    X = df
    df_out_score = []
    # Define seven outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           check_estimator=False,
                           random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
        'Average KNN':
            KNN(method='mean', contamination=outliers_fraction)
    }

    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        # predict raw anomaly scores
        scores_pred = clf.decision_function(X) * -1
        # predict whether a datapoint is an outlier or an inlier
        y_pred = clf.predict(X)
        df_out_score.append(y_pred.tolist())

    df_out_score = pd.DataFrame(df_out_score).T
    df_out_score.columns = list(classifiers.keys())
    return df_out_score
def ede_cblof(contamination,
              clustering_estimator=None,
              alpha=0.9,
              beta=5,
              use_weights=False,
              n_clusters=8,
              check_estimator=False,
              random_state=42,
              n_jobs=1):
    clf = CBLOF(contamination=contamination,
                clustering_estimator=clustering_estimator,
                alpha=alpha,
                beta=beta,
                use_weights=use_weights,
                n_clusters=n_clusters,
                check_estimator=check_estimator,
                random_state=random_state,
                n_jobs=n_jobs)
    return clf
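A brief, hedged usage sketch for the ede_cblof factory; the synthetic matrix is illustrative and assumes numpy and pyod are installed.

import numpy as np

clf = ede_cblof(contamination=0.1)
X = np.random.RandomState(1).randn(150, 2)
clf.fit(X)
print(clf.decision_scores_[:5])  # raw outlier scores for the first rows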
def get_model_cblof(percentage_of_outliers=0.002, num_clusters=2):
    """Create a CBLOF model.

    Args:
        percentage_of_outliers: percentage of fraud in the data
        num_clusters: number of clusters to form, as well as the number
            of centroids to generate

    Returns:
        model: CBLOF model
    """
    utils.save_log('{0} :: {1}'.format(get_model_cblof.__module__,
                                       get_model_cblof.__name__))

    model = CBLOF(contamination=percentage_of_outliers,
                  n_clusters=num_clusters,
                  random_state=config.random_seed,
                  n_jobs=config.num_jobs)

    return model
def __init__(self,
             *,
             hyperparams: Hyperparams,
             random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                     docker_containers=docker_containers)

    self._clf = CBLOF(contamination=hyperparams['contamination'],
                      n_clusters=hyperparams['n_clusters'],
                      alpha=hyperparams['alpha'],
                      beta=hyperparams['beta'],
                      use_weights=hyperparams['use_weights'],
                      check_estimator=hyperparams['check_estimator'],
                      random_state=hyperparams['random_state'],
                      )
    return
def choose_model(model, nnet):
    """ Choose among the detectors implemented in PyOD """
    clfs = {
        'AE': AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE': VAE(encoder_neurons=nnet[:5], decoder_neurons=nnet[4:],
                   contamination=0.1, epochs=13),
        'ABOD': ABOD(),
        'FeatureBagging': FeatureBagging(),
        'HBOS': HBOS(),
        'IForest': IForest(),
        'KNN': KNN(),
        'LOF': LOF(),
        'OCSVM': OCSVM(),
        'PCA': PCA(),
        'SOS': SOS(),
        'COF': COF(),
        'CBLOF': CBLOF(),
        'SOD': SOD(),
        'LOCI': LOCI(),
        'MCD': MCD()
    }
    return clfs[model]
def runMethod(self):
    '''
    @brief This function is the actual implementation of HICS
    '''
    if self.verbose:
        print("Calculating the subspaces\n")
    # First we obtain the high-contrast subspaces
    subspaces = self.hicsFramework()

    if self.verbose:
        print("Now calculating the scoring\n")
    # We initialize the score of each instance to 0
    scores = np.zeros(len(self.dataset))
    # For each subspace
    for sub in subspaces:
        # We choose the scorer according to the outlier_rank parameter
        scorer = None
        if self.outlier_rank == "lof":
            scorer = LOF()
        elif self.outlier_rank == "cof":
            scorer = COF()
        elif self.outlier_rank == "cblof":
            scorer = CBLOF()
        elif self.outlier_rank == "loci":
            scorer = LOCI()
        elif self.outlier_rank == "hbos":
            scorer = HBOS()
        elif self.outlier_rank == "sod":
            scorer = SOD()
        # Fit the scorer on the dataset projection
        scorer.fit(self.dataset[:, sub])
        # Add the scores obtained to the global ones
        scores = scores + scorer.decision_scores_
    # Compute the average
    self.outlier_score = scores / len(subspaces)
    # Mark the calculations as done
    self.calculations_done = True
def cblof(self, X_train, contamination=None, random_state=None):
    """
    Train a CBLOF model from PyOD

    Parameters
    __________
    X_train: scaled training data
    contamination: percentage of anomalies in the data
    random_state: random number seed

    Returns
    ________
    Anomaly scores and binary labels
    """
    model = CBLOF(contamination=contamination, random_state=random_state)
    model.fit(X_train)

    labels = model.predict(X_train)  # outlier labels (0 or 1)
    # predict raw anomaly scores
    cblof_anomaly_scores = model.decision_function(X_train)  # outlier scores
    cblof_anomaly_scores = self.min_max_scaler(cblof_anomaly_scores)
    return cblof_anomaly_scores, labels
def get_CBOLF_scores(dataframe, cols, outliers_fraction=0.01):
    '''Takes a df, a list of selected column names, and an
    outliers_fraction (0.01 by default)
    Returns: df with CBLOF scores added
    '''
    # standardize the selected variables
    minmax = MinMaxScaler(feature_range=(0, 1))
    dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # Convert the dataframe to a numpy array in order to apply the algorithm
    arrays = []
    for row in cols:
        row = dataframe[row].values.reshape(-1, 1)
        arrays.append(row)
    X = np.concatenate(arrays, axis=1)

    # fit
    clf = CBLOF(contamination=outliers_fraction, check_estimator=False,
                random_state=0)
    clf.fit(X)
    # predict raw anomaly scores
    scores_pred = clf.decision_function(X) * -1
    # predict whether a datapoint is an outlier or an inlier
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df1 = dataframe
    CheckOutliers.df1['outlier'] = y_pred.tolist()
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with CBLOF')
df[['R', 'G']] = scaler.fit_transform(df[['R', 'G']])
df[['R', 'G']].head()

X1 = df['R'].values.reshape(-1, 1)
X2 = df['G'].values.reshape(-1, 1)
X = np.concatenate((X1, X2), axis=1)

random_state = np.random.RandomState(42)
outliers_fraction = 0.05
# Define seven outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction, check_estimator=False,
              random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction,
                       check_estimator=False, random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
    'Average KNN':
        KNN(method='mean', contamination=outliers_fraction)
}
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train the CBLOF detector
    clf_name = 'CBLOF'
    clf = CBLOF()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train the CBLOF detector
    clf_name = 'CBLOF'
    clf = CBLOF(random_state=42)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
# Show the statistics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outliers are 1 and inliers are 0.\n'.format(
    shape=ground_truth.shape))
print(ground_truth)

random_state = np.random.RandomState(42)
# Define nine outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
        ABOD(n_neighbors=10, contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction,
                       random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
    'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
    'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
    'Median KNN':
        KNN(method='median', contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
}
def main():
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: the classifier is from sklearn or works like sklearn
    # PY means: the classifier is from pyod or works like pyod
    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add a new metric manually by modifying 'metrics.py'
    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=['results/scores/auc/no/results.csv',
                                'results/scores/auc/minmax/results.csv',
                                'results/scores/auc/std/results.csv',
                                'results/scores/ave/no/results.csv',
                                'results/scores/ave/minmax/results.csv',
                                'results/scores/ave/std/results.csv'],
                         scalers=['Without scaler', 'Min max scaler',
                                  'Standard scaler', 'Without scaler',
                                  'Min max scaler', 'Standard scaler'])
    plot.make_cd_plot(
        paths=['results/scores/auc/minmax/results.csv',
               'results/scores/ave/no/results.csv',
               'results/scores/auc/no/results.csv',
               'results/scores/ave/no/results.csv',
               'results/scores/auc/std/results.csv',
               'results/scores/ave/std/results.csv'],
        names=['CD auc minmax scale', 'CD ave minmax scale',
               'CD auc no scale', 'CD ave no scale',
               'CD auc std scale', 'CD ave std scale'],
        titles=['CD diagram - AUC with min max scaling',
                'CD diagram - Average precision with min max scaling',
                'CD diagram - AUC without scaling',
                'CD diagram - Average precision without scaling',
                'CD diagram - AUC with standard scaling',
                'CD diagram - Average precision with standard scaling'])
X_valid3.drop(['Duration_max', 'Duration_median', 'BiDirection_Bytes_median',
               'SrcToDst_Bytes_median', 'Label_<lambda>'],
              inplace=True, axis=1)

# Extracting the y-labels for the validation data and dropping them from the
# X data. The y-labels will, of course, be the same for all feature sets.
Y_valid1 = X_valid1['Label_<lambda>']
X_valid1.drop(['Label_<lambda>'], inplace=True, axis=1)

# Reading the original test data to extract the malicious flow data after
# prediction
orig_test_data = pd.read_csv("test_data.csv", header=None)
orig_test_data.columns = ['Date_Flow_Start', 'Duration', 'Protocol',
                          'Src_IP', 'Src_Port', 'Direction', 'Dst_IP',
                          'Dst_Port', 'State', 'Source_Service',
                          'Dest_Service', 'Total_Packets',
                          'BiDirection_Bytes', 'SrcToDst_Bytes']

"""
Training on Feature Set 1
CBLOF on Default Parameters
"""
clf1 = CBLOF(random_state=42)  # Default contamination 0.1
clf1.fit(X_train1)

# Setting the threshold using the contamination parameter
dec_scores = clf1.decision_scores_
dec_scores_sorted = sorted(dec_scores, reverse=True)
a = round(len(X_train1) * clf1.contamination)
print(a)
anomalies = dec_scores_sorted[:a]
threshold = anomalies[-1]
print(threshold)

# The validation data is scored
y_valid_scores = clf1.decision_function(X_valid1)
y_valid_scores = pd.Series(y_valid_scores)
class TestCBLOF(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = CBLOF(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        # TODO: sklearn examples are too small to form valid clusters
        # check_estimator(self.clf)
        pass

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'clustering_estimator_') and
                    self.clf.clustering_estimator_ is not None)
        assert_true(hasattr(self.clf, 'cluster_labels_') and
                    self.clf.cluster_labels_ is not None)
        assert_true(hasattr(self.clf, 'cluster_sizes_') and
                    self.clf.cluster_sizes_ is not None)
        assert_true(hasattr(self.clf, 'cluster_centers_') and
                    self.clf.cluster_centers_ is not None)
        assert_true(hasattr(self.clf, '_clustering_threshold') and
                    self.clf._clustering_threshold is not None)
        assert_true(hasattr(self.clf, 'small_cluster_labels_') and
                    self.clf.small_cluster_labels_ is not None)
        assert_true(hasattr(self.clf, 'large_cluster_labels_') and
                    self.clf.large_cluster_labels_ is not None)
        assert_true(hasattr(self.clf, '_large_cluster_centers') and
                    self.clf._large_cluster_centers is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass