def test_normalization(self):
    """Check that ``standardizer`` output has mean ~0 and std ~1."""
    tolerance = 0.05

    # Two-argument form; both arguments are the training matrix here.
    scaled_pair = standardizer(self.X_train, self.X_train)
    norm_X_train, norm_X_test = scaled_pair
    for scaled in (norm_X_train, norm_X_test):
        assert_allclose(scaled.mean(), 0, atol=tolerance)
        assert_allclose(scaled.std(), 1, atol=tolerance)

    # Single-argument form: no second matrix is supplied.
    norm_X_train = standardizer(self.X_train)
    assert_allclose(norm_X_train.mean(), 0, atol=tolerance)
    assert_allclose(norm_X_train.std(), 1, atol=tolerance)
def filter(self):
    """Load the pickled records and return standardized features.

    Reads ``self.filename``, keeps the rows whose ``details_agent`` equals
    ``self.agent_ref`` and whose ``details_status`` is ``'TKTT'``, derives
    the whole-hour gap between validation and departure (``B_dept``), drops
    rows with a negative gap or a ``details_price`` below 400, and
    standardizes the columns at positions 2 and 5.

    Returns
    -------
    X : array
        Output of ``standardizer`` applied to the two selected columns.
    """
    # NOTE(security): unpickling can execute arbitrary code; only point
    # ``self.filename`` at trusted files.
    bd = pd.read_pickle(self.filename)
    df = pd.DataFrame(bd)

    # ``.copy()`` gives an independent frame, so the column assignments
    # below do not write into a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['details_agent'] == self.agent_ref].copy()

    # Time between validation and departure, expressed in hours.
    df['B_dept'] = df.details_flights_departure - df.details_validation_at
    df['B_dept'] = df['B_dept'] / np.timedelta64(1, 'h')

    df['d'] = df.details_validation_at.dt.date
    df['t'] = df.details_validation_at.dt.time

    df = df[df.details_status == 'TKTT'].copy()
    df['B_dept'] = round(df.B_dept, 0)

    # Label-based drops are kept as-is so rows with NaN in the compared
    # column are retained exactly as before.
    df = df.drop(df[df.B_dept < 0].index)
    df = df.drop(df[df.details_price < 400].index)

    # NOTE(review): positions 2 and 5 depend on the column order of the
    # pickled frame -- confirm which features these are.
    X = df.iloc[:, [2, 5]].values
    return standardizer(X)
def _create_scores(self, X): """Internal function to generate and combine scores. Parameters ---------- X : numpy array of shape (n_samples, n_features) The input samples. Returns ------- agg_score: numpy array of shape (n_samples,) Aggregated scores. """ all_scores = np.zeros([X.shape[0], self.n_base_estimators_]) for i, clf in enumerate(self.base_estimators): if hasattr(clf, 'decision_function'): all_scores[:, i] = clf.decision_function(X) else: raise ValueError( "{clf} does not have decision_function.".format(clf=clf)) if self.standardization: all_scores = standardizer(all_scores) if self.method == 'average': agg_score = average(all_scores, estimator_weights=self.weights) if self.method == 'maximization': agg_score = maximization(all_scores) if self.method == 'median': agg_score = median(all_scores) return agg_score
def fit(self, X, contamination=0.01):
    """Fit the detector(s) on ``X`` and store scores, threshold and labels.

    Args:
        X: pd.DataFrame; its index is reused for ``self.decision_scores``.
        contamination: forwarded to the detector constructor.
    """
    self.detectors = {
        "auto_encoder": AutoEncoder(
            epochs=256,
            validation_size=0,
            preprocessing=False,
            verbose=0,
            contamination=contamination,
        ),
    }

    # Pre-processing, step 1: standardize and keep the fitted scaler.
    X_train_norm, self.data_norm_scalar = standardizer(X, keep_scalar=True)
    # Pre-processing, step 2: min-max scale the standardized data.
    X_train_unif, self.data_unif_scalar = minmaxizer(X_train_norm,
                                                     keep_scalar=True)

    n_samples = X.shape[0]
    n_detectors = len(self.detectors)
    train_scores = np.zeros([n_samples, n_detectors])
    thresholds = np.zeros([1, n_detectors])

    # Training: one column of scores / one threshold per detector.
    for column, detector in enumerate(self.detectors.values()):
        detector.fit(X_train_unif)
        train_scores[:, column] = detector.decision_scores_
        thresholds[:, column] = detector.threshold_

    # Put training scores and thresholds on the same standardized scale.
    train_scores_norm, self.score_scalar = standardizer(train_scores,
                                                        keep_scalar=True)
    thresholds_norm = self.score_scalar.transform(thresholds)

    combined_scores = average(train_scores_norm)
    self.decision_scores = pd.DataFrame(combined_scores, index=X.index)
    self.decision_scores.columns = ["score"]
    self.threshold = average(thresholds_norm)[0]
    self.label = self.get_label(self.decision_scores)
def setUp(self):
    """Build a standardized synthetic dataset and fit an LSCP model on it."""
    self.n_train, self.n_test = 200, 100
    self.contamination = 0.1
    self.roc_floor = 0.8

    generated = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination,
        random_state=42,
    )
    self.X_train, self.y_train, self.X_test, self.y_test = generated

    scaled = standardizer(self.X_train, self.X_test)
    self.X_train, self.X_test = scaled

    # Two LOF instances serve as the base detectors.
    self.detector_list = [LOF(), LOF()]
    self.clf = LSCP(self.detector_list, contamination=self.contamination)
    self.clf.fit(self.X_train)
def __train_classifiers(self):
    """Fit every loaded classifier on ``self.df`` and combine their scores.

    The data is min-max scaled to [0, 1]. A classifier that fails to fit is
    reported on stdout and its score column is left at zero. The score
    matrix is standardized and combined with ``maximization``.
    """
    min_max = MinMaxScaler(feature_range=(0, 1))
    X = min_max.fit_transform(self.df.copy())

    classifiers = self.__load_classifiers()
    scores = np.zeros([X.shape[0], len(classifiers)])

    for column, clf_name in enumerate(classifiers):
        clf = classifiers[clf_name]
        try:
            clf.fit(X)
            scores[:, column] = clf.decision_scores_
        except Exception as e:
            # Best effort: report the failure and move on.
            print("Failed for ", clf_name)
            print("because of ", e)

    return maximization(standardizer(scores))
def stratified_cv(X, y, num_folds):
    """Split ``X``/``y`` into stratified folds with standardized features.

    Returns a list of ``(X_train, y_train, X_test, y_test)`` tuples, one per
    fold, where both feature matrices come from
    ``standardizer(X_train, X_test)``.
    """
    splitter = StratifiedKFold(n_splits=num_folds)

    def build_fold(train_index, test_index):
        raw_train, raw_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        scaled_train, scaled_test = standardizer(raw_train, raw_test)
        return (scaled_train, y_train, scaled_test, y_test)

    return [build_fold(train_index, test_index)
            for train_index, test_index in splitter.split(X, y)]
def _get_decision_scores(self, X):
    """Compute ensemble outlier scores for ``X`` from locally competent detectors.

    Returns a 1-D array with one score per row of ``X``.
    """
    # Clamp the local region size into [local_region_min, local_region_max].
    self.local_region_size = min(
        max(self.local_region_size, self.local_region_min),
        self.local_region_max)

    # The test data is used as-is; find each instance's local region.
    X_test_norm = X
    ind_arr = self._get_local_region(X_test_norm)
    n_test = X_test_norm.shape[0]

    # One column of test scores per base estimator.
    test_scores = np.zeros([n_test, self.n_clf])
    for column, estimator in enumerate(self.estimator_list):
        test_scores[:, column] = estimator.decision_function(X_test_norm)

    # Standardize train and test scores together.
    train_scores_norm, test_scores_norm = standardizer(self.train_scores_,
                                                       test_scores)

    # Pseudo target used for weighting: row-wise max of the train scores.
    self.training_pseudo_label_ = np.max(train_scores_norm,
                                         axis=1).reshape(-1, 1)

    pred_scores_ens = np.zeros([n_test, ])

    # ind_arr entries line up with the rows of the test data.
    for row, region in enumerate(ind_arr):
        local_pseudo_ground_truth = self.training_pseudo_label_[
            region,].ravel()
        local_train_scores = train_scores_norm[region, :]

        # Pearson correlation of each detector with the pseudo target
        # inside this local region.
        pearson_corr_scores = np.array(
            [pearsonr(local_pseudo_ground_truth,
                      local_train_scores[:, d])[0]
             for d in range(self.n_clf)],
            dtype=float)

        # Average the test scores of the detectors judged competent.
        competent = self._get_competent_detectors(pearson_corr_scores)
        pred_scores_ens[row,] = np.mean(test_scores_norm[row, competent])

    return pred_scores_ens
def test_normalization(self):
    """Exercise each calling convention of ``standardizer``."""

    def check_standardized(matrix):
        # Overall mean ~0 and std ~1 within an absolute tolerance of 0.05.
        assert_allclose(matrix.mean(), 0, atol=0.05)
        assert_allclose(matrix.std(), 1, atol=0.05)

    def check_scaler_like(scalar):
        # The returned object must expose both ``fit`` and ``transform``.
        if not (hasattr(scalar, 'fit') and hasattr(scalar, 'transform')):
            raise AttributeError("%s is not a detector instance." % (scalar))

    # X_t given, scaler not returned
    norm_X_train, norm_X_test = standardizer(self.X_train, self.X_test)
    check_standardized(norm_X_train)
    check_standardized(norm_X_test)

    # X_t omitted, scaler not returned
    norm_X_train = standardizer(self.X_train)
    check_standardized(norm_X_train)

    # X_t given, scaler returned
    norm_X_train, norm_X_test, scalar = standardizer(
        self.X_train, self.X_test, keep_scalar=True)
    check_standardized(norm_X_train)
    check_standardized(norm_X_test)
    check_scaler_like(scalar)

    # X_t omitted, scaler returned
    norm_X_train, scalar = standardizer(self.X_train, keep_scalar=True)
    check_standardized(norm_X_train)
    check_scaler_like(scalar)

    # Mismatched shapes must be rejected.
    with assert_raises(ValueError):
        standardizer(self.X_train, self.X_test_diff)
except TypeError: print('{data_file} does not exist. Use generated data'.format( data_file=mat_file)) X, y = generate_data(train_only=True) # load data except IOError: print('{data_file} does not exist. Use generated data'.format( data_file=mat_file)) X, y = generate_data(train_only=True) # load data else: X = mat['X'] y = mat['y'].ravel() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) # standardizing data for processing X_train_norm, X_test_norm = standardizer(X_train, X_test) n_clf = 20 # number of base detectors # Initialize 20 base detectors for combination k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200] train_scores = np.zeros([X_train.shape[0], n_clf]) test_scores = np.zeros([X_test.shape[0], n_clf]) print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf)) for i in range(n_clf): k = k_list[i]
time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage] roc_mat = np.zeros([n_ite, n_classifiers]) prn_mat = np.zeros([n_ite, n_classifiers]) time_mat = np.zeros([n_ite, n_classifiers]) for i in range(n_ite): print("\n... Processing", mat_file, '...', 'Iteration', i + 1) random_state = np.random.RandomState(i) # 60% data for training and 40% for testing X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=0.4, random_state=random_state) # standardizing data for processing X_train_norm, X_test_norm = standardizer(X_train, X_test) classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD( contamination=outliers_fraction), 'Cluster-based Local Outlier Factor': CBLOF( contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS( contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF(
rp_flags[starts[i]:starts[i + 1]], None, approx_flags[starts[i]:starts[i + 1]], verbose=True) for i in range(n_jobs)) print('Orig decision_function time:', time.time() - start) print() # unfold and generate the label matrix predicted_scores_orig = np.zeros([X.shape[0], n_estimators]) for i in range(n_jobs): predicted_scores_orig[:, starts[i]:starts[i + 1]] = np.asarray( all_results_scores[i]).T ########################################################################## predicted_scores = standardizer(predicted_scores) predicted_scores_orig = standardizer(predicted_scores_orig) evaluate_print('orig', y_test, np.mean(predicted_scores_orig, axis=1)) evaluate_print('new', y_test, np.mean(predicted_scores, axis=1)) #%% ########################################################################## start = time.time() for i in range(n_estimators): print(i) trained_estimators[i].predict(X) print('Orig decision_function time:', time.time() - start) print()
def _get_decision_scores(self, X):
    """Helper for getting outlier scores on test data X.

    The model must already be fit.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        Test data

    Returns
    -------
    pred_scores_ens : numpy array, shape (n_samples,)
        Outlier scores for test samples
    """
    size = self.local_region_size
    lower, upper = self.local_region_min, self.local_region_max

    # Warn (do not clamp) when the region size leaves the suggested range.
    if size < lower or size > upper:
        message = ("Local region size of {} is outside "
                   "recommended range [{}, {}]")
        warnings.warn(message.format(self.local_region_size,
                                     self.local_region_min,
                                     self.local_region_max))

    # The test data is used as-is; find each instance's local region.
    X_test_norm = X
    test_local_regions = self._get_local_region(X_test_norm)
    n_samples = X_test_norm.shape[0]

    # One column of test scores per base detector.
    test_scores = np.zeros([n_samples, self.n_clf])
    for column, detector in enumerate(self.detector_list):
        test_scores[:, column] = detector.decision_function(X_test_norm)

    # Standardize train and test scores together.
    train_scores_norm, test_scores_norm = standardizer(self.train_scores_,
                                                       test_scores)

    # Pseudo target for weighting: row-wise max of the train scores.
    self.training_pseudo_label_ = np.max(train_scores_norm,
                                         axis=1).reshape(-1, 1)

    pred_scores_ens = np.zeros([n_samples, ])

    # test_local_regions entries line up with the rows of the test data.
    for row, region in enumerate(test_local_regions):
        local_pseudo_ground_truth = self.training_pseudo_label_[
            region,].ravel()
        local_train_scores = train_scores_norm[region, :]

        # Pearson correlation between the local pseudo target and each
        # detector's local train scores.
        pearson_corr_scores = np.array(
            [pearsonr(local_pseudo_ground_truth,
                      local_train_scores[:, d])[0]
             for d in range(self.n_clf)],
            dtype=float)

        # Average the test scores of the detectors judged competent.
        chosen = self._get_competent_detectors(pearson_corr_scores)
        pred_scores_ens[row,] = np.mean(test_scores_norm[row, chosen])

    return pred_scores_ens
] mat_file = mat_file_list[0] mat_file_name = mat_file.replace('.mat', '') print("\n... Processing", mat_file_name, '...') mat = sp.io.loadmat(os.path.join('', 'datasets', mat_file)) X = mat['X'] y = mat['y'] # split dataset into train and test X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=0.4, random_state=42) # standardize data to be digestible for most algorithms X_train, X_test = standardizer(X_train, X_test) contamination = y.sum() / len(y) # get estimators for training and prediction base_estimators = get_estimators(contamination=contamination) ########################################################################## model = SUOD(base_estimators=base_estimators, rp_flag_global=True, approx_clf=approx_clf, n_jobs=n_jobs, bps_flag=True, contamination=contamination, approx_flag_global=True)
def run_all_models(all_array, labels, pca, data_set_name):
    """Fit a fixed set of outlier detectors and record their results.

    The ``"# img"`` column is split off as the picture identifier, the
    remaining data is standardized (and optionally passed through
    ``IncrementalPCA``), then split 60/40 into train and test. For every
    detector a tuple ``(label, data shape, test result, data_set_name,
    elapsed seconds, train result)`` is appended to the module-level
    ``output_table``.

    Parameters
    ----------
    all_array : DataFrame containing a ``"# img"`` column plus features.
    labels : object whose ``get("in")`` yields the ground-truth labels.
    pca : bool
        When true, the standardized data is transformed with
        ``IncrementalPCA`` before the train/test split.
    data_set_name : value stored verbatim in each result row.
    """
    picture_name = all_array.get("# img", 1)
    # Keyword form: the positional ``axis`` argument of ``drop`` was
    # removed in pandas 2.0.
    all_array = all_array.drop(columns="# img")

    # standardizing data for processing
    all_array = standardizer(all_array)
    y = labels.get("in").to_numpy()

    # The projection has to happen BEFORE the split. It used to run
    # afterwards, so the detectors never saw the transformed data and the
    # ``pca`` flag had no effect on the results.
    if pca:
        all_array = IncrementalPCA().fit_transform(all_array)

    x_train, x_test, y_train, y_test, picture_train, picture_test = \
        train_test_split(all_array, y, picture_name, test_size=0.4)

    # (printed banner, label stored in output_table, class, constructor kwargs)
    # NOTE(review): the stored label "IFrorest" is misspelled; it is kept
    # unchanged in case downstream consumers match on it.
    detectors = (
        ("OCSVM", "OCSVM", OCSVM, {}),
        ("Auto-encoder", "Auto-encoder", AutoEncoder, {"epochs": 30}),
        ("HBOS", "HBOS", HBOS, {}),
        ("SO_GAAL", "SO_GAAL", SO_GAAL, {}),
        ("MO_GAAL", "MO_GAAL", MO_GAAL, {}),
        ("MCD", "MCD", MCD, {}),
        ("SOS", "SOS", SOS, {}),
        ("IForest", "IFrorest", IForest, {}),
        ("KNN", "KNN", KNN, {}),
        ("PCA", "PCA", PCA, {}),
    )

    for banner, table_label, detector_cls, kwargs in detectors:
        print(banner)
        now = time()  # timing covers construction, fit and both scorings
        clf = detector_cls(**kwargs)
        clf.fit(x_train)

        test_scores = clf.decision_function(x_test)
        temp = print_score(picture_test, test_scores, y_test)

        train_scores = clf.decision_function(x_train)
        scores_train = print_score(picture_train, train_scores, y_train)

        output_table.append((table_label, all_array.shape, temp,
                             data_set_name, time() - now, scores_train))
outliers_percentage = round(outliers_fraction * 100, ndigits=4) print ('Dataset Shape:', X.shape) print ('Outliers Percentage', outliers_percentage) # construct containers for saving results of each dataset roc_list = [] prn_list = [] time_list = [] # 60% data for training and 40% for testing X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state) # standardizing data for processing X_train_norm, X_test_norm = standardizer(X_train, X_test) # define classifiers classifiers = define_classifiers(random_state, outliers_fraction) # create df for results train_results = pd.DataFrame(columns=classifiers.keys()) test_results = pd.DataFrame(columns=classifiers.keys()) print ('\n', 'Outliers Detection', '\n') for clf_name, clf in classifiers.items(): # keep name of convetional models (once on the first itteration) if num_mat == 0: method_names.append(clf_name)
def normalize_data(data):
    """Return ``data`` after passing it through ``standardizer``."""
    standardized = standardizer(data)
    return standardized
print('processing file '+ file[-8:-4]) print('----------') df = pd.read_csv(file) x = df.drop(['ground.truth','point.id','motherset','origin','original.label'],axis = 1).values y = df['ground.truth'].values y = [0 if i == 'nominal' else 1 for i in y] outliers_fraction = min(np.count_nonzero(y) / len(y),0.5) outliers_percentage = round(outliers_fraction * 100, ndigits=4) roc_list = [file[-8:-4], x.shape[0], x.shape[1], outliers_percentage] prn_list = [file[-8:-4], x.shape[0], x.shape[1], outliers_percentage] time_list = [file[-8:-4], x.shape[0], x.shape[1], outliers_percentage] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4,random_state=random_state) x_train_norm, x_test_norm = standardizer(x_train, x_test) classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD( contamination=outliers_fraction), 'Cluster-based Local Outlier Factor': CBLOF( contamination=outliers_fraction, check_estimator=False, random_state=random_state), 'Feature Bagging': FeatureBagging(contamination=outliers_fraction, random_state=random_state), 'Histogram-base Outlier Detection (HBOS)': HBOS( contamination=outliers_fraction), 'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state), 'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction), 'Local Outlier Factor (LOF)': LOF( contamination=outliers_fraction),
from vae import VAE from pyod.utils.data import generate_data, evaluate_print from pyod.utils.utility import standardizer if __name__ == "__main__": # contamination = 0.1 # percentage of outliers # n_train = 20000 # number of training points # n_test = 2000 # number of testing points X_image = np.load('train_image_embedding.npy') X_text = np.load('word2vec.npy') X = np.concatenate([X_image, X_text], axis=1) n_features = X.shape[1] # number of features X_transformed = standardizer(X) # # train VAE detector (Beta-VAE) clf_name = 'VAE' clf = VAE(epochs=50, latent_dim=128, gamma=1, capacity=0) clf.fit(X_transformed) # # get the prediction labels and outlier scores of the training data # y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) # y_train_scores = clf.decision_scores_ # raw outlier scores # # get the prediction on the test data # y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) # y_test_scores = clf.decision_function(X_test) # outlier scores # # evaluate and print the results # print("\nOn Training Data:")
IForest(random_state=42), LOF(), OCSVM(), PCA(), KNN(), HBOS(), COPOD(), AutoEncoder(verbose=0), VAE(latent_dim=32, verbosity=0) ] for embedding, modality in zip(unimodal_embeddings, unimodality): print() print(modality) print() embedding_scaled = standardizer(embedding) for clf in clfs: # print(clf) clf.fit(embedding_scaled) evaluate_print(clf.__class__.__name__, anomaly_label, clf.decision_scores_) #%% image_text_embedding = [ np.load(os.path.join("unimodality", "image", "train_image_embedding.npy")), np.load(os.path.join("unimodality", "language", "word2vec.npy")), ] print("score averaging")
if i[1][5] == 'anomaly': y.append(1) contam += 1 else: y.append(0) x_train.append(list(i[1][6:17])) x_train = np.array(x_train) y = np.array(y) contam /= len(y) algorithms = ['KNN', 'LOF', 'PCA', 'LODA'] all_scores = {} clf_name = 'KNN' clf = KNN(n_neighbors=5, contamination=contam) x_train = standardizer(x_train) clf.fit(x_train) knn_y_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) knn_y_scores = clf.decision_scores_ # raw outlier scores evaluation(y, knn_y_scores, clf_name) all_scores['KNN'] = knn_y_scores clf_name = 'LOF' clf = LOF(contamination=contam) x_train = standardizer(x_train) clf.fit(x_train) y_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_scores = clf.decision_scores_ # raw outlier scores evaluation(y, y_scores, clf_name) all_scores['LOF'] = y_scores
def _plot_truth_vs_predicted(points, y_true, y_pred, title, path):
    """Save a two-panel scatter plot: ground-truth vs. predicted labels.

    ``points`` holds one 2-D coordinate per test sample (only the first two
    columns are drawn); ``y_true`` and ``y_pred`` are the matching 0/1
    label arrays.
    """
    fig = plt.figure(figsize=(10, 4))
    panels = ((y_true, "Ground truth"), (y_pred, "Predicted"))
    for position, (panel_labels, panel_title) in enumerate(panels, start=1):
        fig.add_subplot(1, 2, position)
        X_in = points[panel_labels == 0]
        X_out = points[panel_labels == 1]
        plt.scatter(X_in[:, 0], X_in[:, 1], color="blue", marker="^",
                    alpha=0.4)
        plt.scatter(X_out[:, 0], X_out[:, 1], color="orange", marker="h",
                    alpha=0.5)
        plt.title(panel_title)

    sptl = plt.suptitle(title, y=1.08, fontsize=14)
    lgd = plt.legend(
        labels=["Нормальные данные", "Аномальные данные"],
        title="Обозначения",
        shadow=True,
        ncol=1,
        fontsize=12,
        loc="center left",
        bbox_to_anchor=(1, 0.5),
    )
    # The legend and suptitle sit outside the axes, so they must be passed
    # explicitly for the tight bounding box to include them.
    plt.savefig(path, dpi=100, bbox_extra_artists=(lgd, sptl),
                bbox_inches="tight")
    plt.close()


def analyze_selected_algorithm(file_id, dataset_title, selected_algortihm):
    """Fit the selected detector on one dataset and save a comparison plot.

    The classifier key is the last whitespace-separated token of
    ``selected_algortihm`` with surrounding parentheses stripped. The data
    is split 75/25, standardized, and the detector is fitted on the
    training part. A PNG comparing ground-truth and predicted labels of the
    test part is written under ``./images/``; for data with more than two
    features the points are drawn in a 2-D t-SNE embedding, otherwise in
    the raw feature space.

    Returns
    -------
    filename : str
        Name (not path) of the saved image.
    """
    clf_name = selected_algortihm.split()[-1].strip("()")
    mat = db_queries.get_dataframe(file_id)

    # The current number of files in ./images/ is part of the file name.
    filename = "analyze_{}_{}_{}.png".format(
        file_id, clf_name, len(os.listdir("./images/"))
    )
    path = "./images/{}".format(filename)

    mat = mat.drop(["Unnamed: 0", "Index", "id", "Id"], axis=1,
                   errors="ignore")
    y = mat["outlier"].values
    X = mat.drop("outlier", axis=1).values
    outliers_fraction = np.count_nonzero(y) / len(y)

    # Split the row positions together with the data so test samples can be
    # mapped back to rows of X (replaces the former trick of appending an
    # id column to X and stripping it again).
    row_ids = np.arange(X.shape[0])
    X_train, X_test, y_train, y_test, _, test_ids = train_test_split(
        X, y, row_ids, test_size=0.25
    )

    # standardizing data for processing, mean=0, var=1
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Both branches of the former PCA/IFOREST special case were identical.
    clf = algo_mapping[clf_name](contamination=outliers_fraction)
    clf.fit(X_train_norm)

    test_scores = clf.decision_function(X_test_norm)
    roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
    y_test_predicted = clf.predict(X_test_norm)

    print(X_train_norm.shape)
    if X_train_norm.shape[1] > 2:
        # The embedding is only needed on this path, so it is computed
        # here instead of unconditionally.
        X_embedded = TSNE(n_components=2).fit_transform(X)
        points = X_embedded[test_ids]
    else:
        # Raw (unstandardized) features of the test rows.
        points = X[test_ids]

    title = "Датасет: {}, ROC: {}\nАлгоритм: {}".format(
        dataset_title[:-4], roc, clf_name)
    _plot_truth_vs_predicted(points, y_test, y_test_predicted, title, path)
    return filename
# Collect the outlier scores of the three fitted detectors, one column each.
_fitted = (('clf1', clf1), ('clf2', clf2), ('clf3', clf3))
train_scores = pd.DataFrame(
    {_name: _clf.decision_scores_ for _name, _clf in _fitted})
test_scores = pd.DataFrame(
    {_name: _clf.decision_function(X_test) for _name, _clf in _fitted})

# The earlier standardization was applied to the input variables;
# this one is applied to the decision scores.
from pyod.utils.utility import standardizer
train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                   test_scores)

# Combination by average
y_by_average = average(test_scores_norm)

import matplotlib.pyplot as plt
plt.hist(y_by_average, bins='auto')  # arguments are passed to np.histogram
plt.title("Combination by average")
plt.show()

df_test = pd.DataFrame(X_test)
df_test['y_by_average_score'] = y_by_average
# Cluster 0 when the averaged score is below zero, otherwise cluster 1.
_below_zero = df_test['y_by_average_score'] < 0
df_test['y_by_average_cluster'] = np.where(_below_zero, 0, 1)
df_test['y_by_average_cluster'].value_counts()
with open(fileName) as data: lines = data.readlines() for line in lines: lineData = line.strip().split(' ') lineData = list(map(lambda x: float(x), lineData)) dataMat.append(lineData) return (np.array(dataMat)) data = data_loadDataSet() X_train, y_train, X_test, y_test = generate_data(n_train=50, n_test=50, contamination=0.1, random_state=42) X_train, X_test = standardizer(X_train, X_test) detector_list = [LOF(n_neighbors=10), LOF(n_neighbors=15)] clf = LSCP(detector_list) clf.fit(X_train) clf.fit(data) y_train_scores = clf.decision_scores_ sort_factor = argsort(y_train_scores, kind='quicksort') print(sort_factor) sort_factors = sort_factor[::-1] print(sort_factors) np.savetxt(r'C:\Users\zz\Desktop\res\lscp\D1_2.txt', sort_factors, fmt='%f', delimiter=' ')
roc_mat = np.zeros([n_ite, n_classifiers]) prn_mat = np.zeros([n_ite, n_classifiers]) ap_mat = np.zeros([n_ite, n_classifiers]) time_mat = np.zeros([n_ite, n_classifiers]) for i in range(n_ite): print("\n... Processing", mat_file, '...', 'Iteration', i + 1) random_state = np.random.RandomState(i) # 60% data for training and 40% for testing X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=0.4, random_state=random_state) # standardizing data for processing X_train_norm, X_test_norm = standardizer(X_train, X_test) classifiers = {'COD_L': COD(contamination=outliers_fraction, tail='left'), 'COD_R': COD(contamination=outliers_fraction, tail='right'), 'COD_B': COD(contamination=outliers_fraction, tail='both'), 'COD_S': COD(contamination=outliers_fraction, tail='skew'), 'COD_M': COD(contamination=outliers_fraction, tail='max'), 'COD': COD(contamination=outliers_fraction) } classifiers_indices = { 'COD_L': 0, 'COD_R': 1, 'COD_B': 2, 'COD_S': 3, 'COD_M': 4, 'COD': 5
def analysis():
    """Evaluate six outlier detectors on every file and write ``roc.csv``.

    For each path in the module-level ``fileList`` the CSV is read, the
    ``R``/``G``/``B`` columns are used as features and ``original.label``
    as the target. The data is split 60/40 and standardized, each detector
    is fitted on the training part, and its ROC AUC on the test part is
    recorded (left at 0 when ``roc_auc_score`` raises ``ValueError``).
    One row per file is accumulated and saved to ``roc.csv``.
    """
    roc_df = pd.DataFrame(columns=df_columns)

    # NOTE(review): ``count`` used to be read before any assignment inside
    # this function (UnboundLocalError). It is now the running file index,
    # starting at 0 -- confirm the intended starting value.
    for count, doc in enumerate(fileList):
        print(doc)
        df = pd.read_csv(doc, encoding='utf-8')
        # A list (not a tuple) selects several columns unambiguously.
        x = df.loc[:, ['R', 'G', 'B']]
        y = df.loc[:, 'original.label']

        random_state = np.random.RandomState(42)
        # Assumed share of outliers handed to every detector (2%).
        outliers_fraction = 0.02

        # The six outlier detection models; insertion order must match the
        # column labels assigned to ``temp_df`` below.
        classifiers = {
            "Feature Bagging": FeatureBagging(
                LOF(n_neighbors=35), contamination=outliers_fraction,
                check_estimator=False, random_state=random_state),
            "Isolation Forest": IForest(contamination=outliers_fraction,
                                        random_state=random_state),
            "KNN": KNN(contamination=outliers_fraction),
            'Local Outlier Factor': LOF(contamination=outliers_fraction),
            'One-class SVM': OCSVM(contamination=outliers_fraction),
            'Principal Component Analysis': PCA(
                contamination=outliers_fraction,
                random_state=random_state),
        }
        roc_mat = np.zeros(len(classifiers))

        # 60% data for training and 40% for testing
        X_train, X_test, y_train, y_test = \
            train_test_split(x, y, test_size=0.4,
                             random_state=random_state)

        # standardizing data for processing
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        # Results are stored by position. The former name-based lookup
        # table had no entry for "KNN" (it listed "Average KNN"), which
        # raised a KeyError that ``except ValueError`` did not catch.
        for i, clf in enumerate(classifiers.values()):
            clf.fit(X_train_norm, y_train)
            # outlier scores predicted for the test part
            scores_pred = clf.decision_function(X_test_norm)
            try:
                roc_mat[i] = round(roc_auc_score(y_test, scores_pred),
                                   ndigits=4)
            except ValueError:
                continue

        roc_list = [count, doc] + roc_mat.tolist()
        temp_df = pd.DataFrame(roc_list).transpose()
        temp_df.columns = [
            'Data', 'dir', 'FB', 'IForest', 'Average KNN', 'LOF', 'OCSVM',
            'PCA'
        ]
        roc_df = pd.concat([roc_df, temp_df], axis=0)

    roc_df.to_csv("roc.csv", index=False, float_format="%.3f")
roc_mean = [] roc_max = [] roc_aom = [] roc_moa = [] prn_mean = [] prn_max = [] prn_aom = [] prn_moa = [] for t in range(ite): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) # standardizing data for processing X_train_norm, X_test_norm = standardizer(X_train, X_test) # initialize 20 base detectors for combination k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200] train_scores = np.zeros([X_train.shape[0], n_clf]) test_scores = np.zeros([X_test.shape[0], n_clf]) for i in range(n_clf): k = k_list[i] clf = Knn(n_neighbors=k, method='largest') clf.fit(X_train_norm) train_scores[:, i] = clf.decision_scores.ravel()
return if __name__ == "__main__": contamination = 0.1 # percentage of outliers n_train = 200 # number of training points n_test = 100 # number of testing points # Generate sample data X_train, y_train, X_test, y_test = \ generate_data(n_train=n_train, n_test=n_test, contamination=contamination, random_state=42) X_train, X_test = standardizer(X_train, X_test) # train lscp clf_name = 'LSCP' detector_list = [LOF(), LOF()] clf = LSCP(detector_list, random_state=42) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores