def getOulierMOGAAL(dataset):
    '''
    @brief Function that executes the MO_GAAL algorithm on the dataset and
    obtains the labels of the dataset, indicating which instance is an
    inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return A list of labels: 0 means inlier, 1 means outlier
    '''
    # Initializing the model
    mg = MO_GAAL()
    # Fits the data and obtains labels
    mg.fit(dataset)
    # Return labels
    return mg.labels_
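A minimal usage sketch for the helper above (not part of the original snippet): synthetic data comes from pyod's generate_data utility, and the sizes are illustrative.

# Usage sketch; the sizes and random_state are illustrative assumptions.
from pyod.models.mo_gaal import MO_GAAL
from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=500, n_test=100, n_features=2,
    contamination=0.1, random_state=42)
labels = getOulierMOGAAL(X_train)  # 0 = inlier, 1 = outlier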
def _mo_gaal_experiment(dataset_load_fn, dataset_name, single_class_ind):
    (x_train, y_train), (x_test, y_test) = dataset_load_fn()
    x_train = x_train.reshape((len(x_train), -1))
    x_test = x_test.reshape((len(x_test), -1))

    x_train_task = x_train[y_train.flatten() == single_class_ind]

    best_mo_gaal = MO_GAAL().fit(x_train_task)
    scores = best_mo_gaal.decision_function(x_test)
    labels = y_test.flatten() == single_class_ind

    res_file_name = '{}_mo-gaal_{}_{}.npz'.format(
        dataset_name,
        get_class_name_from_index(single_class_ind, dataset_name),
        datetime.datetime.now().strftime('%Y-%m-%d-%H%M'))
    res_file_path = os.path.join(RESULTS_DIR, dataset_name, res_file_name)
    save_roc_pr_curve_data(scores, labels, res_file_path)
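A hypothetical invocation of this experiment helper: it assumes dataset_load_fn follows the Keras ((x_train, y_train), (x_test, y_test)) convention, and that RESULTS_DIR, get_class_name_from_index and save_roc_pr_curve_data are provided by the surrounding project.

# Hypothetical call: the Keras MNIST loader matches the expected
# ((x_train, y_train), (x_test, y_test)) return convention.
from tensorflow.keras.datasets import mnist

_mo_gaal_experiment(mnist.load_data, 'mnist', single_class_ind=3)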
def main():
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF',
        'HBOS', 'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF',
        'COPOD', 'SOD', 'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD',
        'SoGaal', 'MoGaal', 'VAE', 'AutoEncoder'
    ]
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"
    Parallel(n_jobs=CPUS)(
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=sklearn_models,
                             CPUS=CPUS_Models, save_name=name)
        for scaler in scalers)
def __init__(self,
             *,
             hyperparams: Hyperparams,
             random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    super().__init__(hyperparams=hyperparams,
                     random_seed=random_seed,
                     docker_containers=docker_containers)

    self._clf = MO_GAAL(stop_epochs=hyperparams['stop_epochs'],
                        k=hyperparams['k'],
                        lr_d=hyperparams['lr_d'],
                        lr_g=hyperparams['lr_g'],
                        decay=hyperparams['decay'],
                        momentum=hyperparams['momentum'],
                        contamination=hyperparams['contamination'],
                        )
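Stripped of the primitive wrapper, the underlying pyod estimator takes the same keyword arguments; a minimal sketch with illustrative values:

# Sketch of the wrapped pyod estimator on its own; values are illustrative.
from pyod.models.mo_gaal import MO_GAAL

clf = MO_GAAL(stop_epochs=20, k=10, lr_d=0.01, lr_g=0.0001,
              decay=1e-06, momentum=0.9, contamination=0.1)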
def main():
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod
    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add a new metric manually by modifying 'metrics.py'
    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=['results/scores/auc/no/results.csv',
                                'results/scores/auc/minmax/results.csv',
                                'results/scores/auc/std/results.csv',
                                'results/scores/ave/no/results.csv',
                                'results/scores/ave/minmax/results.csv',
                                'results/scores/ave/std/results.csv'],
                         scalers=['Without scaler', 'Min max scaler',
                                  'Standard scaler', 'Without scaler',
                                  'Min max scaler', 'Standard scaler'])

    plot.make_cd_plot(
        paths=['results/scores/auc/minmax/results.csv',
               'results/scores/ave/minmax/results.csv',
               'results/scores/auc/no/results.csv',
               'results/scores/ave/no/results.csv',
               'results/scores/auc/std/results.csv',
               'results/scores/ave/std/results.csv'],
        names=['CD auc minmax scale', 'CD ave minmax scale',
               'CD auc no scale', 'CD ave no scale',
               'CD auc std scale', 'CD ave std scale'],
        titles=['CD diagram - AUC with min max scaling',
                'CD diagram - Average precision with min max scaling',
                'CD diagram - AUC without scaling',
                'CD diagram - Average precision without scaling',
                'CD diagram - AUC with standard scaling',
                'CD diagram - Average precision with standard scaling'])
class TestMO_GAAL(unittest.TestCase):
    """
    Notes: GAN may yield unstable results, so the test is designed to run
    the models only, without any performance check.
    """

    def setUp(self):
        self.n_train = 1000
        self.n_test = 200
        self.n_features = 2
        self.contamination = 0.1
        # GAN may yield unstable results; turning performance check off
        # self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        self.clf = MO_GAAL(k=1, stop_epochs=2,
                           contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'discriminator') and
                self.clf.discriminator is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        # assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def tearDown(self):
        pass
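To run this suite directly, the standard unittest entry point suffices; the module file name here is an assumption.

# Run with `python test_mo_gaal.py` (file name assumed).
if __name__ == '__main__':
    unittest.main()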
detector_list = [LOF(), LOF()]

models = [
    # BRM github
    (brminer.BRM(), 'BRM'),
    # ocSVM sklearn
    (OneClassSVM(gamma='auto'), 'ocSVM'),
    # COF pyod
    (COF(contamination=0.1, n_neighbors=20), 'COF'),
    # ABOD pyod
    (ABOD(contamination=0.1, n_neighbors=5, method='fast'), 'ABOD'),
    # MO_GAAL pyod
    (MO_GAAL(k=10, stop_epochs=20, lr_d=0.01, lr_g=0.0001, decay=1e-06,
             momentum=0.9, contamination=0.1), 'MO_GAAL'),
    # SO_GAAL pyod
    (SO_GAAL(stop_epochs=20, lr_d=0.01, lr_g=0.0001, decay=1e-06,
             momentum=0.9, contamination=0.1), 'SO_GAAL'),
    # OCKRA github
    (m_ockra.m_OCKRA(), 'OCKRA'),
    # VAR LMDD pyOD
    (LMDD(dis_measure='var', random_state=rs), 'VAR_LMDD'),
    # LOCI pyod
contamination = 0.1  # percentage of outliers
n_train = 20000  # number of training points
n_test = 2000  # number of testing points
n_features = 300  # number of features

# Generate sample data
X_train, y_train, X_test, y_test = \
    generate_data(n_train=n_train,
                  n_test=n_test,
                  n_features=n_features,
                  contamination=contamination,
                  random_state=42)

# train MO_GAAL detector
clf_name = 'MO_GAAL'
clf = MO_GAAL(k=3, stop_epochs=2, contamination=contamination)
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
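Since decision_function returns raw scores, they can also be fed straight to sklearn metrics; a short follow-up sketch, assuming scikit-learn is available:

# ROC AUC from the raw test scores (scikit-learn assumed available).
from sklearn.metrics import roc_auc_score

print('MO_GAAL test ROC AUC: %.4f' % roc_auc_score(y_test, y_test_scores))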
def execute(self):
    evaluation_results = []

    print("Loading training data...")
    data = pd.DataFrame()
    for i, chunk in enumerate(
            pd.read_csv(self.input_file, header=None,
                        chunksize=self.chunk_size)):
        print("Reading chunk: %d" % (i + 1))
        # print(chunk)
        data = data.append(chunk)

    input_dimensionality = len(data.columns) - 1
    print("Input Dimensionality: %d" % input_dimensionality)

    positive_data = data[data[len(data.columns) - 1] == 1].iloc[:, :len(data.columns) - 1]
    negative_data = data[data[len(data.columns) - 1] == -1].iloc[:, :len(data.columns) - 1]

    training_data = positive_data.sample(frac=0.70)
    positive_validation_data = positive_data.drop(training_data.index)

    if self.neg_cont and self.neg_cont > 0:
        print("Negative Contamination: %0.4f" % self.neg_cont)
        num_negative = math.floor(
            self.neg_cont * (len(negative_data) + len(positive_validation_data)))
        negative_data = data.sample(frac=1, random_state=200)[
            data[len(data.columns) - 1] == -1].iloc[:num_negative, :len(data.columns) - 1]

    negative_validation_data = negative_data.copy()

    temp_positive = positive_validation_data.copy()
    temp_positive[input_dimensionality] = 1
    temp_negative = negative_data.copy()
    temp_negative[input_dimensionality] = -1
    validation_data_with_labels = pd.concat([temp_positive, temp_negative],
                                            ignore_index=True)
    validation_data = validation_data_with_labels.iloc[:, :len(data.columns) - 1]
    validation_labels = validation_data_with_labels.iloc[:, -1:].values

    # Convert to tensor
    positive_data = torch.tensor(positive_data.values).float().to(self.device)
    negative_data = torch.tensor(negative_data.values).float().to(self.device)
    training_data = torch.tensor(training_data.values).float()
    validation_data = torch.tensor(validation_data.values).float()

    print("Validation Data:")
    print(validation_data)

    ## AE-D TRAINING ##
    print("Initializing autoencoder...")
    net = Autoencoder(layers=self.layers, device=self.device,
                      add_syn=self.add_syn)
    net.to(self.device)
    print(net)

    print("Training Stochastic Autoencoder...")
    net.fit(training_data, epochs=self.epochs, lr=self.lr,
            batch_size=self.batch_size)

    predictions = net.predict(validation_data)
    tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc = \
        performance_metrics(validation_labels, predictions)
    r = ["AE-D", tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc]
    evaluation_results.append(r)

    print("AE-D Results:")
    print(tabulate([r],
                   ["ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV",
                    "NPV", "TS", "PT", "ACC", "F1", "MCC"],
                   tablefmt="grid"))

    # Convert back to CPU before other methods
    validation_data = validation_data.cpu()

    # Train only linear classifiers
    if self.eval_cat == "linear":
        print("Initiating training for linear detectors...")

        ## MCD ##
        print("Training MCD...")
        result = train_and_evaluate_classifier("MCD", MCD(),
                                               validation_data,
                                               validation_labels)
        evaluation_results.append(result)

        ## ROBUST COVARIANCE ##
        print("Training Robust Covariance...")
        result = train_and_evaluate_classifier("ROB-COV",
                                               EllipticEnvelope(),
                                               validation_data,
                                               validation_labels)
        evaluation_results.append(result)

        ## ONE CLASS SVM TRAINING ##
        print("Training OneClassSVM...")
        result = train_and_evaluate_classifier(
            "OC-SVM", svm.OneClassSVM(gamma="auto"),
            validation_data, validation_labels)
        evaluation_results.append(result)

    elif self.eval_cat == "prob":
        ## ABOD ##
        # print("Training ABOD...")
        # result = train_and_evaluate_classifier("ABOD", ABOD(), validation_data, validation_labels)
        # evaluation_results.append(result)

        ## SOS ##
        # print("Training SOS...")
        # result = train_and_evaluate_classifier("SOS", SOS(), validation_data, validation_labels)
        # evaluation_results.append(result)

        ## COPOD ##
        print("Training COPOD...")
        result = train_and_evaluate_classifier("COPOD", COPOD(),
                                               validation_data,
                                               validation_labels)
        evaluation_results.append(result)

    elif self.eval_cat == "ensemble":
        ## ISOLATION FOREST TRAINING ##
        print("Training Isolation Forest...")
        result = train_and_evaluate_classifier(
            "ISO-F", IsolationForest(random_state=0),
            validation_data, validation_labels)
        evaluation_results.append(result)

        ## LODA ##
        print("Training LODA...")
        result = train_and_evaluate_classifier("LODA", LODA(),
                                               validation_data,
                                               validation_labels)
        evaluation_results.append(result)

        ## LSCP ##
        # print("Training LSCP...")
        # result = train_and_evaluate_classifier("LSCP", LSCP([LOF(), LOF()]), validation_data, validation_labels)
        # evaluation_results.append(result)

    elif self.eval_cat == "proximity":
        ## LOCAL OUTLIER FACTOR ##
        print("Training Local Outlier Factor...")
        result = train_and_evaluate_classifier(
            "LOC-OF", LocalOutlierFactor(novelty=True),
            validation_data, validation_labels)
        evaluation_results.append(result)

        ## CBLOF ##
        print("Training CBLOF...")
        result = train_and_evaluate_classifier("CBLOF", CBLOF(),
                                               validation_data,
                                               validation_labels)
        evaluation_results.append(result)

        ## HBOS ##
        print("Training HBOS...")
        result = train_and_evaluate_classifier("HBOS", HBOS(),
                                               validation_data,
                                               validation_labels)
        evaluation_results.append(result)

    elif self.eval_cat == "nn":
        ## VAE ##
        print("Training VAE...")
        # Note: list.reverse() reverses in place and returns None, so the
        # original decoder_neurons=self.layers.reverse() passed None;
        # a reversed copy is what was intended.
        result = train_and_evaluate_classifier(
            "VAE", VAE(encoder_neurons=self.layers,
                       decoder_neurons=self.layers[::-1]),
            validation_data, validation_labels)
        evaluation_results.append(result)

        ## SO_GAAL ##
        print("Training SO_GAAL...")
        result = train_and_evaluate_classifier(
            "SO_GAAL", SO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
            validation_data, validation_labels)
        evaluation_results.append(result)

        ## MO_GAAL ##
        print("Training MO_GAAL...")
        result = train_and_evaluate_classifier(
            "MO_GAAL", MO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
            validation_data, validation_labels)
        evaluation_results.append(result)

    ## EVALUATE RESULTS ##
    if self.eval_cat != "none":
        print("Aggregated Results:")
        print(tabulate(evaluation_results,
                       ["ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR",
                        "PPV", "NPV", "TS", "PT", "ACC", "F1", "MCC"],
                       tablefmt="grid"))

    ## DATASET METRICS ##
    len_training_data_points = len(training_data)
    len_positive_validations = len(positive_validation_data)
    len_negative_validations = len(negative_validation_data)
    len_validations = len_positive_validations + len_negative_validations
    metrics_results = [
        ["Training Data Points", len_training_data_points],
        ["# Normal Points", len_positive_validations],
        ["# Anomalies", len_negative_validations],
        ["Contamination Percentage",
         math.floor((len_negative_validations / len_validations) * 100)]
    ]

    ## EVALUATE RESULTS ##
    print(tabulate(metrics_results, ["Metric", "Value"], tablefmt="grid"))

    if self.printout:
        print("Saving results to %s" % self.printout)
        df = pd.DataFrame(evaluation_results)
        df.to_csv(self.printout, header=None, index=False)
class TestMO_GAAL(unittest.TestCase):
    def setUp(self):
        self.n_train = 3000
        self.n_test = 1000
        self.n_features = 10
        self.contamination = 0.1
        # TODO: GAN may yield unstable results; turning performance check off
        # self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        self.clf = MO_GAAL(k=1, stop_epochs=2,
                           contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'discriminator') and
                    self.clf.discriminator is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        # assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def tearDown(self):
        pass
test_date = "2020-02-16" KNN_clf = KNN(contamination=0.05) PCA_clf = PCA(contamination=0.05) VAE_clf = VAE(contamination=0.05, epochs=30, encoder_neurons=[9, 4], decoder_neurons=[4, 9]) LOF_clf = LOF(contamination=0.05) IForest_clf = IForest(contamination=0.05) AutoEncoder_clf = AutoEncoder(contamination=0.05, epochs=30, hidden_neurons=[9, 4, 4, 9]) FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False) ABOD_clf = ABOD(contamination=0.05) HBOS_clf = HBOS(contamination=0.05) CBLOF_clf = CBLOF(contamination=0.05) LODA_clf = LODA(contamination=0.05) MCD_clf = MCD(contamination=0.05) MO_GAAL_clf = MO_GAAL(k=3, stop_epochs=2, contamination=0.05) SO_GAAL_clf = SO_GAAL(contamination=0.05) KNN_MAH_clf = None S_models = ["KNN", "LOF", "PCA", "IForest", "HBOS", "LODA", "MCD", "CBLOF", "FeatureBagging", "ABOD", "KNN_MAH"] K_models = ["AutoEncoder", "SO_GAAL", "VAE"] def get_train_data(): """ 获取训练样本 :return: x_train 9特征训练样本 df 原训练数据 """ acc_date = pd.date_range(begin, end, freq='1D') for day in acc_date: date = str(day.date())
X = df.to_numpy()
scaler = StandardScaler()

# Representation 1 & 4
# X_scaled = scaler.fit_transform(X)

# Representation 2
# times & TFIDF: Scale times only, separately
# X_times_scaled = scaler.fit_transform(X[:, :len(TIME_FIELDS)])
# X_scaled = np.hstack([X_times_scaled, X[:, len(TIME_FIELDS):]])

# Representation 3
# times & TFIDF: Scale all
# X_scaled = scaler.fit_transform(X)

# Representation 5
# times & TFIDF: Scale times and TFIDF features separately.
# TF-IDF vectors are normalized to unit L2 norm.
X_times_scaled = scaler.fit_transform(X[:, :len(TIME_FIELDS) - 1])
scaler_tfidf = Normalizer()
X_tfidf = scaler_tfidf.fit_transform(X[:, len(TIME_FIELDS) - 1:])
X_scaled = np.hstack([X_times_scaled, X_tfidf])

clf = MO_GAAL(contamination=0.05)
clf.fit(X_scaled)

df_all['scores'] = clf.decision_scores_
df_all['labels'] = clf.labels_

df_out = df_all.where(df_all['labels'] == 1).dropna()
df_out = df_out.loc[:, (df_out != 0).any(axis=0)]
df_out.to_excel(os.path.join(OUT_PATH, OUT_FILE))
def run_all_models(all_array, labels, pca, data_set_name):
    picture_name = all_array.get("# img", 1)
    all_array = all_array.drop("# img", axis=1)

    # standardizing data for processing
    all_array = standardizer(all_array)
    y = labels.get("in").to_numpy()
    x_train, x_test, y_train, y_test, picture_train, picture_test = \
        train_test_split(all_array, y, picture_name, test_size=0.4)

    if pca:
        transformer = IncrementalPCA()
        all_array = transformer.fit_transform(all_array)

    # The same fit/score/record procedure is applied to every detector.
    detectors = [("OCSVM", OCSVM()),
                 ("Auto-encoder", AutoEncoder(epochs=30)),
                 ("HBOS", HBOS()),
                 ("SO_GAAL", SO_GAAL()),
                 ("MO_GAAL", MO_GAAL()),
                 ("MCD", MCD()),
                 ("SOS", SOS()),
                 ("IForest", IForest()),
                 ("KNN", KNN()),
                 ("PCA", PCA())]

    for name, clf in detectors:
        print(name)
        now = time()
        clf.fit(x_train)
        test_scores = clf.decision_function(x_test)
        temp = print_score(picture_test, test_scores, y_test)
        train_scores = clf.decision_function(x_train)
        scores_train = print_score(picture_train, train_scores, y_train)
        output_table.append((name, all_array.shape, temp, data_set_name,
                             time() - now, scores_train))
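One way to persist the collected rows; the column names below are inferred from the tuple layout above, and pandas is assumed to be available.

# Persist output_table (column names inferred, not from the original).
import pandas as pd

pd.DataFrame(output_table,
             columns=['model', 'data_shape', 'test_score', 'dataset',
                      'runtime_s', 'train_score']).to_csv('results.csv',
                                                          index=False)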