def obj_func_LOF(params):
    ## objective function used in Bayesian optimization
    outlier_fraction = params[0]
    n_neighbors = params[1]
    algorithm = params[2]
    leaf_size = params[3]
    # load data set into the function workspace
    Y_train = np.load('Y_train.npy')
    X_train = np.load('X_train.npy')
    # create model
    clf = LOF(n_neighbors=n_neighbors, algorithm=algorithm,
              leaf_size=leaf_size, contamination=outlier_fraction)
    # fit the dataset to the model
    clf.fit(X_train)
    # predict raw anomaly score
    scores_pred = clf.decision_function(X_train) * -1
    Rprecision = Rprecision_f(Y_train, scores_pred)
    if glb_verbose:
        print('R Precision : ', Rprecision)
    # predict each data point's category: outlier or inlier
    y_pred = clf.predict(X_train)
    objVal = objVal_f(Rprecision, y_pred, Y_train)
    return objVal
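# A minimal usage sketch (assumption): driving obj_func_LOF with scikit-optimize's
# gp_minimize. Rprecision_f, objVal_f, glb_verbose and the .npy files are assumed to
# exist as above, the search bounds are arbitrary, and the objective is assumed to be
# minimized; none of this is prescribed by the snippet itself.
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical

search_space = [
    Real(0.01, 0.3, name='outlier_fraction'),
    Integer(5, 50, name='n_neighbors'),
    Categorical(['auto', 'ball_tree', 'kd_tree', 'brute'], name='algorithm'),
    Integer(10, 60, name='leaf_size'),
]

# gp_minimize passes each sampled point as a list, matching params[0..3] above
result = gp_minimize(obj_func_LOF, search_space, n_calls=30, random_state=42)
print('best objective value:', result.fun)
print('best parameters:', result.x)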
def lof_pyod_once(X_nor, X_test, y_test, n_neighbors, contamination=0.05):
    lof = LOF(n_neighbors=n_neighbors, contamination=contamination)
    X_train = X_nor.astype(float).values.copy()
    lof.fit(X_train)  ## now the threshold is determined
    y_pred = lof.predict(X_test)
    scoreTable = lof.decision_function(X_test)
    #print(scoreTable)
    scoreTable = np.nan_to_num(scoreTable, copy=True)
    ## confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    tpr = tp / (tp + fn)
    fpr = fp / (tn + fp)
    #tprW[trail] = tpr
    #fprW[trail] = fpr
    tprW = tpr
    fprW = fpr
    # AUC score
    auc = roc_auc_score(y_test, scoreTable)
    #print(tpr, fpr)
    #print(auc)
    return tprW, fprW, auc, scoreTable
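# A minimal usage sketch (assumption): lof_pyod_once expects X_nor as a pandas
# DataFrame of normal-only training rows (it calls .values) plus a labelled test set;
# numpy, pandas, LOF, confusion_matrix and roc_auc_score are imported as above.
# The demo data below is synthetic and purely illustrative.
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
X_nor_demo = pd.DataFrame(rng.randn(200, 3))                      # normal training data
X_test_demo = np.vstack([rng.randn(45, 3), rng.randn(5, 3) + 6])  # 5 injected outliers
y_test_demo = np.array([0] * 45 + [1] * 5)

tpr, fpr, auc, scores = lof_pyod_once(X_nor_demo, X_test_demo, y_test_demo,
                                      n_neighbors=20, contamination=0.05)
print('TPR={:.2f}  FPR={:.2f}  AUC={:.3f}'.format(tpr, fpr, auc))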
def calculate_LOF(given_DT, given_neighbors):
    X_1 = pd.DataFrame(given_DT)
    X = X_1.values
    clf = LOF(n_neighbors=given_neighbors)
    clf.fit(X)
    X_scores = clf.decision_scores_  # clf.decision_function(XX_1)
    return X_scores
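# Hypothetical call (assumption): score a random numeric table with 10 neighbours;
# pandas (pd), numpy (np) and pyod's LOF are assumed imported as in the snippet above.
import numpy as np

demo_data = np.random.RandomState(0).randn(150, 4)
demo_scores = calculate_LOF(demo_data, given_neighbors=10)
print('highest LOF scores:', sorted(demo_scores)[-3:])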
def anomaly_detection(data, label):
    X = data[data.select_dtypes('number').columns.tolist()]
    y = data[label]
    y = y.values
    X = X.drop([label], axis=1)
    sc = StandardScaler()
    X = pd.DataFrame(data=sc.fit_transform(X), columns=X.columns)

    ifo = IForest(contamination=0.01,
                  behaviour='new',
                  n_estimators=1000,
                  max_samples=1024,
                  n_jobs=-1,
                  verbose=1)
    ifo.fit(X)
    ifo_pred = ifo.labels_
    print('ROC score for Isolation forest: ', roc_auc_score(y, ifo_pred))
    utilities.plot_outlier_scores(
        y, ifo.decision_scores_, bw=0.1,
        title='Fraud, Isolation forest. (n_estimators={})'.format(
            ifo.n_estimators))

    ae = AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                     hidden_activation='relu',
                     output_activation='sigmoid',
                     optimizer='adam',
                     epochs=20,
                     batch_size=128,
                     dropout_rate=0.2,
                     l2_regularizer=0.0,
                     validation_size=0.1,
                     preprocessing=False,
                     verbose=1,
                     random_state=1,
                     contamination=0.01)
    ae.fit(X)
    ae_pred = ae.labels_
    print('ROC score for Autoencoder: ', roc_auc_score(y, ae_pred))
    utilities.plot_outlier_scores(
        y, ae.decision_scores_, bw=0.1,
        title='Fraud, Autoencoder. (epochs={})'.format(ae.epochs))

    # Too long to train, under-sample needed
    lof = LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1)
    lof.fit(X)
    lof_pred = lof.labels_
    print('ROC score for LOF: ', roc_auc_score(y, lof_pred))
    utilities.plot_outlier_scores(
        y, lof.decision_scores_, bw=0.1,
        title='Fraud, Local outliers factor. (n_neighbors={})'.format(
            lof.n_neighbors))

    return y, ifo_pred, ae_pred, lof_pred
def __call__(self):
    clf = LOF(contamination=0.1)
    buggy_enter_csv = self.get_file(join(self.data_buggy_dir, '*_ENTER.csv'))
    buggy_exit_csv = self.get_file(join(self.data_buggy_dir, '*_EXIT.csv'))
    data = self.get_data(buggy_enter_csv, buggy_exit_csv)

    # extend data with self.data_orig_dir
    for cur_dir, dirs, files in os.walk(self.data_orig_dir):
        for f_dir in dirs:
            enter_csv = self.get_file(join(cur_dir, f_dir, '*_ENTER.csv'))
            exit_csv = self.get_file(join(cur_dir, f_dir, '*_EXIT.csv'))
            ext_data = self.get_data(enter_csv, exit_csv)
            logger.debug('shape of data: {}'.format(data.shape))
            logger.debug('shape of ext_data: {}'.format(ext_data.shape))
            data = np.concatenate((data, ext_data), axis=0)
            logger.debug('shape of data: {}'.format(data.shape))

    clf.fit(data)
    train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    unique, counts = np.unique(train_pred, return_counts=True)
    logger.debug('unique (train): {}'.format(unique))
    logger.debug('counts (train): {}'.format(counts))
    if 0 not in unique:
        raise ModelError('Model contains no inlier')
    inliers_size = counts[0]
    outliers_size = counts[1] if len(counts) > 1 else 0
    logger.debug('num of inliers: {}'.format(inliers_size))
    logger.debug('num of outliers: {}'.format(outliers_size))
    return self.predict(clf)
def run_LOF_base_detector(data, k, metric='euclidean', p=2):
    """
    Function to fit and predict the LOF base detector on `data`.

    Input:
        - data: pd.DataFrame, to run LOF on
        - k: integer, number of neighbours to include in the relative
          density determination
        - metric: string, distance metric to use, default `euclidean`
        - p: int, default 2 since metric = `euclidean`; otherwise set it
          according to the chosen distance metric
    Output:
        - clf of class pyod.models.lof.LOF with all its properties
    """
    # Split data into values and targets: some datasets have an ID column, others don't
    try:
        X = data.drop(['outlier', 'id'], axis=1)
    except KeyError:
        X = data.drop('outlier', axis=1)

    # Construct and fit the classifier (pass the requested metric through
    # instead of hard-coding 'euclidean')
    clf = LOF(n_neighbors=k, metric=metric, p=p)
    clf.fit(X)  # Fit only on features

    # Add ground truth labels for evaluation of the classifier
    clf.true_labels_ = data['outlier']

    # Return the classifier for further processing
    return clf
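# A minimal usage sketch (assumption): a toy frame with an 'outlier' ground-truth
# column, scored with k=15 and evaluated against clf.true_labels_; pandas, numpy and
# pyod's LOF are assumed imported as elsewhere in this file. The data is synthetic.
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(1)
frame = pd.DataFrame(rng.randn(300, 5), columns=list('abcde'))
frame['outlier'] = 0
frame.loc[:9, list('abcde')] += 8   # shift the first 10 rows far from the bulk
frame.loc[:9, 'outlier'] = 1

clf_demo = run_LOF_base_detector(frame, k=15)
print('ROC-AUC:', roc_auc_score(clf_demo.true_labels_, clf_demo.decision_scores_))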
def getOutlierLOF(dataset):
    '''
    @brief Function that executes the LOF algorithm on the dataset and obtains
    the labels of the dataset indicating which instance is an inlier (0) or
    an outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels: 0 means inlier, 1 means outlier
    '''
    # Initializing the model
    lof = LOF()
    # Fits the data and obtains labels
    lof.fit(dataset)
    # Return labels
    return lof.labels_
def runMethod(self):
    '''
    @brief This function is the actual implementation of HICS
    '''
    if self.verbose:
        print("Calculating the subspaces\n")
    # First we obtain the high contrast subspaces
    subspaces = self.hicsFramework()

    if self.verbose:
        print("Now calculating the scoring\n")
    # We initialize the scores for each instance as 0
    scores = np.zeros(len(self.dataset))
    # For each subspace
    for sub in subspaces:
        # We place the corresponding scorer according to parameter
        scorer = None
        if self.outlier_rank == "lof":
            scorer = LOF()
        elif self.outlier_rank == "cof":
            scorer = COF()
        elif self.outlier_rank == "cblof":
            scorer = CBLOF()
        elif self.outlier_rank == "loci":
            scorer = LOCI()
        elif self.outlier_rank == "hbos":
            scorer = HBOS()
        elif self.outlier_rank == "sod":
            scorer = SOD()
        # Fits the scorer with the dataset projection
        scorer.fit(self.dataset[:, sub])
        # Adds the scores obtained to the global ones
        scores = scores + scorer.decision_scores_
    # Compute the average
    self.outlier_score = scores / len(subspaces)
    # Marks the calculations as done
    self.calculations_done = True
# load dataset
data_dict = load_dataset(
    dataset,
    subdataset,
    "all",
)

x_train = data_dict["train"]
x_test = data_dict["test"]
x_test_labels = data_dict["test_labels"]

# data preprocessing for MSCRED
start = time.time()
od = LOF(n_neighbors=n_neighbors, leaf_size=leaf_size, p=p)
od.fit(x_train)

# get outlier scores
anomaly_score = od.decision_function(x_test)
anomaly_label = x_test_labels

end = time.time()
elapsed_time = end - start  # renamed so the `time` module is not shadowed

# Make evaluation
evaluate_all(anomaly_score, anomaly_label)
salience = compute_salience(anomaly_score, anomaly_label)
print('time')
print('   ', elapsed_time)
class SolverAECIFAR():
    def __init__(self, data_name, hidden_dim=256, seed=0, learning_rate=3e-4,
                 normal_class=0, anomaly_ratio=0.1, batch_size=128,
                 concentrated=0, training_ratio=0.8, SN=1, Trim=1, L=1.5,
                 max_epochs=100):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.L = L
        if concentrated == 1.0:
            full_data_name = 'CIFAR10_Concentrated'
        elif concentrated == 0.0:
            full_data_name = 'CIFAR10'
        self.result_path = "./results/{}_{}_{}/0.0/LOF/{}/".format(
            full_data_name, normal_class, anomaly_ratio, seed
        )
        data_path = "./data/" + data_name + ".npy"
        self.learning_rate = learning_rate
        self.SN = SN
        self.Trim = Trim
        # self.dataset = RealGraphDataset(data_path, missing_ratio=0, radius=2)
        self.dataset = CIFARVGGDataset(data_path,
                                       normal_class=normal_class,
                                       anomaly_ratio=anomaly_ratio,
                                       concentrated=concentrated)
        self.seed = seed
        self.hidden_dim = hidden_dim
        self.max_epochs = max_epochs
        self.data_path = data_path
        self.data_anomaly_ratio = self.dataset.__anomalyratio__()
        self.batch_size = batch_size
        self.input_dim = self.dataset.__dim__()
        self.data_normaly_ratio = 1 - self.data_anomaly_ratio
        n_sample = self.dataset.__len__()
        self.n_train = int(n_sample * training_ratio)
        self.n_test = n_sample - self.n_train
        print('|data dimension: {}|data noise ratio:{}'.format(
            self.dataset.__dim__(), self.data_anomaly_ratio))

        self.training_data, self.testing_data = data.random_split(
            dataset=self.dataset, lengths=[self.n_train, self.n_test])
        self.ae = None
        self.discriminator = None
        self.model = None

    def train(self):
        self.model = LOF()
        self.model.fit(self.training_data.dataset.x)

    def test(self):
        y_test_scores = self.model.decision_function(self.testing_data.dataset.x)
        auc = roc_auc_score(self.testing_data.dataset.y, y_test_scores)
        from sklearn.metrics import precision_recall_fscore_support as prf, accuracy_score
        print("AUC:{:0.4f}".format(auc))

        os.makedirs(self.result_path, exist_ok=True)
        np.save(
            self.result_path + "result.npy",
            {
                "accuracy": auc,
                "precision": auc,
                "recall": auc,
                "f1": auc,
                "auc": auc,
            },
        )  # for consistency
        print("result save to {}".format(self.result_path))
# In[19]:

pca_vectors, tweets_dict = extract_2pca()

# In[22]:

pca_vectors.shape

# ### 5.5.2 Implement LOF model

# In[13]:

# Implement LOF model, extract decision scores
lof = LOF(metric='cosine')  # cosine distance works well for text-derived vectors
lof_model = lof.fit(pca_vectors)
scores = lof_model.decision_scores_

# In[32]:

max(scores)

# ### 5.5.3 Implement function to extract top 5 outliers

# In[35]:

top_n = 5
tweet_index_decision_scores = []
decision_scores_tweet_index = []
for index, score in enumerate(scores):
#    cla.append(classifier)
##    cla.append((_*(144//splits), (_+1)*144//(splits)))
#clf = LOF(n_neighbors=10, contamination=0.1)
#clf.fit(X_train)

#%%
#predictions = {}
#for _ in range(0, splits):
#    predictions['score_cla{}'.format(_)] = _
#predictions

#%%
cla = []
splits = 3
for _ in range(0, splits):
    classifier = LOF(n_neighbors=10, contamination=0.1)
    #classifier
    classifier.fit(X_train[:, _ * (144 // splits):(_ + 1) * 144 // (splits)])
    cla.append(classifier)
    del classifier
    # cla.append((_*(144//splits),(_+1)*144//(splits)))

clf = LOF(n_neighbors=10, contamination=0.1)
#classifier
clf.fit(X_train)

#colors = ['red', 'green', 'blue']
i = 0
for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    if (data.loc[dt.strftime("%Y-%m-%d")]['value']).values.size != 0:
        data.loc[dt.strftime("%Y-%m-%d"), 'score'] = clf.predict(
            preprocessing.normalize(
                data.loc[dt.strftime("%Y-%m-%d")].value.values.reshape(1, -1)))
        data.loc[dt.strftime("%Y-%m-%d"), 'probab'] = (clf.predict_proba(
            (preprocessing.normalize(
                data.loc[dt.strftime("%Y-%m-%d")].value.values.reshape(
    KNN(n_neighbors=25, contamination=contamination),
    KNN(n_neighbors=35, contamination=contamination),
    KNN(n_neighbors=45, contamination=contamination),
    IForest(n_estimators=50, contamination=contamination),
    IForest(n_estimators=100, contamination=contamination),
    LSCP(detector_list=[LOF(contamination=contamination),
                        LOF(contamination=contamination)])
]

model = SUOD(base_estimators=base_estimators, n_jobs=6, bps_flag=True,
             contamination=contamination, approx_flag_global=False)

model.fit(X)  # fit all models with X
model.approximate(X)  # conduct model approximation if it is enabled
predicted_labels = model.predict(X)  # predict labels on X; for demo purpose only
predicted_scores = model.decision_function(X)  # predict scores on X; for demo purpose only

# %%
evaluate_print('majority vote', y, majority_vote(predicted_labels))
evaluate_print('average', y, average(predicted_scores))
evaluate_print('maximization', y, maximization(predicted_scores))

clf = LOF()
clf.fit(X)
evaluate_print('LOF', y, clf.decision_scores_)

clf = IForest()
clf.fit(X)
evaluate_print('IForest', y, clf.decision_scores_)
print "x" #Number of states St = 5 Label = [] if args.m == 'lof' or args.m == 'LOF' or args.m == 'Lof': print "lof" #print "D = ", df[:30] numK = int(args.th) print "creating LOF" clf = LOF(n_neighbors=numK) print "fitting LOF" #E = np.array(df[:30]) E = np.array(df) E = np.asfarray(E, float) print "E = ", E[0:30] clf.fit(E) #clf.fit(df[:50]) print "fitting done" #print "l = ", clf.decision_scores_ mn = min(clf.decision_scores_) mx = max(clf.decision_scores_) R = mx - mn print "R = ", R, mn, mx for lab in clf.decision_scores_: v = int(St * ((lab - mn) / R)) Label.append(v) else: if args.m == 'IF' or args.m == 'if' or args.m == 'If': print "lof" #print "D = ", df[:30] numE = int(args.th)
@author: zixing.mei
"""
from pyod.models.lof import LOF

# Train the anomaly detection model, then output anomaly scores for the training samples
clf = LOF(n_neighbors=20, algorithm='auto', leaf_size=30,
          metric='minkowski', p=2, metric_params=None,
          contamination=0.1, n_jobs=1)
clf.fit(x)

# anomaly scores
out_pred = clf.predict_proba(x, method='linear')[:, 1]
train['out_pred'] = out_pred

# keep only the samples whose anomaly score is below the 0.9 quantile (drop the rest)
key = train['out_pred'].quantile(0.9)
x = train[train.out_pred < key][feature_lst]
y = train[train.out_pred < key]['bad_ind']

val_x = val[feature_lst]
val_y = val['bad_ind']

# retrain the model
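# A minimal sketch of the "retrain the model" step (assumption: the downstream model
# is a plain logistic regression; the original code may use a different estimator).
# feature_lst, x, y, val_x and val_y come from the cleaned data above.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

lr_model = LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000)
lr_model.fit(x, y)
val_pred = lr_model.predict_proba(val_x)[:, 1]
print('validation AUC after outlier removal:', roc_auc_score(val_y, val_pred))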
contamination = 0.1  # percentage of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Generate sample data
X_train, y_train, X_test, y_test = \
    generate_data(n_train=n_train,
                  n_test=n_test,
                  n_features=2,
                  contamination=contamination,
                  random_state=42)

# train LOF detector
clf_name = 'LOF'
clf = LOF()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
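# Optional follow-up (assumption): since the toy data is 2-D, the fitted detector can
# also be plotted with pyod's built-in helper, as the stock example scripts do.
from pyod.utils.example import visualize

visualize(clf_name, X_train, y_train, X_test, y_test,
          y_train_pred, y_test_pred, show_figure=True, save_figure=False)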
class TestLOF(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'n_neighbors_') and
                    self.clf.n_neighbors_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
def outlier_detection(df):
    testing_df = df[(df['Chassis_Number'] == 'WBA1C11080J829552')]
    # testing_df = df[(df['Chassis_Number'] == 'VF3LCYHZPJS332137')]

    clf = LOF(n_neighbors=10, contamination=0.1)
    data_reshaped = np.array(testing_df['Kms'].values).reshape(-1, 1)
    data_reshaped = np.round(data_reshaped, 0)
    clf.fit(data_reshaped)
    y_pred = clf.predict(np.array(data_reshaped).reshape(-1, 1))
    # y_pred[y_pred < 0] = 0.0
    testing_df['outlier_score_lof'] = y_pred

    clf = LMDD(n_iter=100, contamination=0.1)
    data_reshaped = np.array(testing_df['Kms'].values).reshape(-1, 1)
    data_reshaped = np.round(data_reshaped, 0)
    clf.fit(data_reshaped)
    y_pred = clf.predict(np.array(data_reshaped).reshape(-1, 1))
    # y_pred[y_pred < 0] = 0.0
    testing_df['outlier_score_lmdd'] = y_pred

    clf = IsolationForest(n_estimators=100, contamination=0.1)
    data_reshaped = np.array(testing_df['Kms'].values).reshape(-1, 1)
    data_reshaped = np.round(data_reshaped, 0)
    clf.fit(data_reshaped)
    y_pred = clf.predict(np.array(data_reshaped).reshape(-1, 1))
    # y_pred[y_pred < 0] = 0.0
    testing_df['outlier_score_isolation_forest'] = y_pred

    clf = KNN(method='mean', n_neighbors=3, contamination=0.1)
    data_reshaped = np.array(testing_df['Kms'].values).reshape(-1, 1)
    data_reshaped = np.round(data_reshaped, 0)
    clf.fit(data_reshaped)
    y_pred = clf.predict(np.array(data_reshaped).reshape(-1, 1))
    # y_pred[y_pred < 0] = 0.0
    testing_df['outlier_score_knn_mean'] = y_pred

    clf = KNN(method='median', n_neighbors=3, contamination=0.1)
    data_reshaped = np.array(testing_df['Kms'].values).reshape(-1, 1)
    data_reshaped = np.round(data_reshaped, 0)
    clf.fit(data_reshaped)
    y_pred = clf.predict(np.array(data_reshaped).reshape(-1, 1))
    # y_pred[y_pred < 0] = 0.0
    testing_df['outlier_score_knn_median'] = y_pred

    print(testing_df[['Movement_Date', 'Kms', 'Kms_diff',
                      'outlier_score_lof', 'outlier_score_lmdd',
                      'outlier_score_isolation_forest',
                      'outlier_score_knn_mean', 'outlier_score_knn_median']])
    return
class TestLOF(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        if not hasattr(self.clf,
                       'decision_scores_') or self.clf.decision_scores_ is None:
            self.assertRaises(AttributeError, 'decision_scores_ is not set')
        if not hasattr(self.clf, 'labels_') or self.clf.labels_ is None:
            self.assertRaises(AttributeError, 'labels_ is not set')
        if not hasattr(self.clf, 'threshold_') or self.clf.threshold_ is None:
            self.assertRaises(AttributeError, 'threshold_ is not set')
        if not hasattr(self.clf, 'negative_outlier_factor_') or \
                self.clf.negative_outlier_factor_ is None:
            self.assertRaises(AttributeError,
                              'negative_outlier_factor_ is not set')
        if not hasattr(self.clf, 'n_neighbors') or self.clf.n_neighbors_ is None:
            self.assertRaises(AttributeError, 'n_neighbors is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
toeplitz_roc = []
toeplitz_prn = []
toeplitz_time = []

pca_roc = []
pca_prn = []
pca_time = []

rp_roc = []
rp_prn = []
rp_time = []

for j in range(n_iter):
    start = time.time()
    clf = LOF()  # change this to other detection algorithms
    clf.fit(X)
    y_train_scores = clf.decision_scores_
    original_time.append(time.time() - start)
    original_roc.append(roc_auc_score(y, y_train_scores))
    original_prn.append(precision_n_scores(y, y_train_scores))

    X_transformed, _ = jl_fit_transform(X, dim_new, "basic")
    start = time.time()
    clf.fit(X_transformed)
    y_train_scores = clf.decision_scores_
    basic_time.append(time.time() - start)
    basic_roc.append(roc_auc_score(y, y_train_scores))
    basic_prn.append(precision_n_scores(y, y_train_scores))

    X_transformed, _ = jl_fit_transform(X, dim_new, "discrete")
    start = time.time()
from pyod.models.mcd import MCD
from pyod.models.lscp import LSCP
# from pyod.models.auto_encoder import AutoEncoder

clf_knn = KNN()
clf_pca = PCA()
clf_mcd = MCD()
clf_lof = LOF()
clf_cblof = CBLOF()
# clf_lscp = LSCP([clf_knn, clf_pca, clf_mcd])
# clf_ae = AutoEncoder(epochs=50)

clf_mcd.fit(encodings_train)
clf_pca.fit(encodings_train)
clf_knn.fit(encodings_train)
clf_lof.fit(encodings_train)
clf_cblof.fit(encodings_train)
# clf_lscp.fit(encodings_train)
# clf_ae.fit(encodings_train)

anomaly_scores_mcd = clf_mcd.decision_function(encodings_train)
anomaly_scores_pca = clf_pca.decision_function(encodings_train)
anomaly_scores_knn = clf_knn.decision_function(encodings_train)
anomaly_scores_lof = clf_lof.decision_function(encodings_train)
anomaly_scores_cblof = clf_cblof.decision_function(encodings_train)
# anomaly_scores_lscp = clf_lscp.decision_function(encodings_train)
# anomaly_scores_ae = clf_ae.predict_proba(encodings_train)

# y_test_scores = []
# for x, _ in test_loader:
#     encodings_test = encoder(torch.Tensor(x).to(device))
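# A possible follow-up (assumption, not part of the original snippet): combine the
# five per-detector score vectors by averaging after z-normalisation, using pyod's
# built-in combination helpers.
import numpy as np
from pyod.models.combination import average
from pyod.utils.utility import standardizer

scores_mat = np.vstack([anomaly_scores_mcd, anomaly_scores_pca,
                        anomaly_scores_knn, anomaly_scores_lof,
                        anomaly_scores_cblof]).T   # shape: (n_samples, n_detectors)
scores_norm = standardizer(scores_mat)             # zero mean, unit variance per detector
combined_scores = average(scores_norm)             # ensemble score per sample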
toeplitz_roc = []
toeplitz_prn = []
toeplitz_time = []

pca_roc = []
pca_prn = []
pca_time = []

rp_roc = []
rp_prn = []
rp_time = []

for j in range(n_iter):
    start = time.time()
    clf = LOF()  # change this to other detection algorithms
    clf.fit(X)
    y_train_scores = clf.decision_scores_
    original_time.append(time.time() - start)
    original_roc.append(roc_auc_score(y, y_train_scores))
    original_prn.append(precision_n_scores(y, y_train_scores))

    X_transformer, _ = jl_fit_transform(X, dim_new, "basic")
    start = time.time()
    clf.fit(X_transformer)
    y_train_scores = clf.decision_scores_
    basic_time.append(time.time() - start)
    basic_roc.append(roc_auc_score(y, y_train_scores))
    basic_prn.append(precision_n_scores(y, y_train_scores))

    X_transformer, _ = jl_fit_transform(X, dim_new, "discrete")
    start = time.time()
def train(self):
    model = LOF(contamination=self.data_anomaly_ratio)
    model.fit(self.X_train)
    self.best_model = model
class TestLOF(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'n_neighbors_') and
                self.clf.n_neighbors_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        pred_proba, confidence = self.clf.predict_proba(self.X_test,
                                                        method='linear',
                                                        return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
#                        detector_list, contamination=outliers_fraction,
#                        random_state=random_state)
#}

#%%
file_no = 1
data = preprocess(data_1)
anomalies = anomalies_1
start_date = data.head(1).index.date[0]
end_date = data.tail(1).index.date[0]
datatotrain, datatotest, datatotrain_normalized, datatotest_normalized, train_data, test_data = createtraintest(
    data)
X_train, X_test = datatotrain_normalized, datatotest_normalized

#%%
clf = LOF(n_neighbors=10, contamination=0.1)
clf.fit(X_train)
clf1 = LOF(n_neighbors=10, contamination=0.1)
clf1.fit(X_train[:, 0:48])
clf2 = LOF(n_neighbors=10, contamination=0.1)
clf2.fit(X_train[:, 48:96])
clf3 = LOF(n_neighbors=10, contamination=0.1)
clf3.fit(X_train[:, 96:144])

#%%
datax = data['value'].values.reshape(-1, 144)
data_n = preprocessing.normalize(datax, norm='l2')
#y_pred = clf.predict(data_n)
i = 0
for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    if (data.loc[dt.strftime("%Y-%m-%d")]['value']).values.size != 0:
        data.loc[dt.strftime("%Y-%m-%d"), 'ocsvm_score'] = clf.predict(
class TestLOF(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        if not hasattr(
                self.clf,
                'decision_scores_') or self.clf.decision_scores_ is None:
            self.assertRaises(AttributeError, 'decision_scores_ is not set')
        if not hasattr(self.clf, 'labels_') or self.clf.labels_ is None:
            self.assertRaises(AttributeError, 'labels_ is not set')
        if not hasattr(self.clf, 'threshold_') or self.clf.threshold_ is None:
            self.assertRaises(AttributeError, 'threshold_ is not set')
        if not hasattr(self.clf, 'negative_outlier_factor_'
                       ) or self.clf.negative_outlier_factor_ is None:
            self.assertRaises(AttributeError,
                              'negative_outlier_factor_ is not set')
        if not hasattr(self.clf,
                       'n_neighbors') or self.clf.n_neighbors_ is None:
            self.assertRaises(AttributeError, 'n_neighbors is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass