def remove_outliers_knn(
        x: pd.DataFrame,
        y: np.ndarray,
        contamination: float = 0.1) -> Tuple[pd.DataFrame, np.ndarray]:
    """Remove outliers from the training/test set using PyOD's KNN detector.

    Args:
        x: DataFrame containing the X's
        y: target array
        contamination: the proportion of outliers in the data set

    Returns:
        x and y with outliers removed
    """
    clf = KNN(contamination=contamination, n_jobs=-1)
    clf.fit(x)
    labels = clf.labels_  # binary labels (0: inliers, 1: outliers)
    print("{0:.2%} among {1:,} sample points are identified and removed "
          "as outliers".format(sum(labels) / x.shape[0], x.shape[0]))
    x = x.iloc[labels == 0]
    y = y[labels == 0]
    return x, y
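# Usage sketch for the helper above (not from the original source): the
# frame, column names, and contamination value are illustrative assumptions.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
x_demo = pd.DataFrame(rng.randn(500, 4), columns=list('abcd'))
y_demo = rng.randint(0, 2, size=500)

# With contamination=0.05, roughly 5% of the rows are flagged and dropped.
x_clean, y_clean = remove_outliers_knn(x_demo, y_demo, contamination=0.05)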
def train():
    dataset = get_data(1000, 10, 100)
    contamination = 0.01

    with mlflow.start_run():
        base_estimators = [
            LOF(n_neighbors=5, contamination=contamination),
            LOF(n_neighbors=15, contamination=contamination),
            LOF(n_neighbors=25, contamination=contamination),
            PCA(contamination=contamination),
            KNN(n_neighbors=5, contamination=contamination),
            KNN(n_neighbors=15, contamination=contamination),
            KNN(n_neighbors=25, contamination=contamination)]
        model = SUOD(base_estimators=base_estimators, n_jobs=6,
                     rp_flag_global=True, bps_flag=True,
                     approx_flag_global=False,
                     contamination=contamination)

        model.fit(dataset)
        model.approximate(dataset)
        predicted_labels = model.predict(dataset)
        voted_labels = vote(predicted_labels)
        true_labels = [0] * 1000 + [1] * 10
        # roc_auc_score expects the ground truth first, then the predictions
        auc_score = roc_auc_score(true_labels, voted_labels)
        print("The resulting area under the ROC curve is {}".format(auc_score))
        mlflow.log_metric("auc_score", auc_score)
        mlflow.sklearn.log_model(model, "anomaly_model",
                                 conda_env="conda.yaml")
def get_all_readings_from_person(self, person_tag, remove_outliers=0,
                                 additional_where=""):
    # Debug.print_debug(self.file_path)
    print(self.file_path)
    dataset = sqlite3.connect(self.file_path)
    if len(additional_where) > 0:
        # quote person_tag here as well, consistent with the branch below
        to_return = self.get_data_sql_query(
            "select {} from {} where {} like '{}' {}".format(
                ', '.join(self.features), self.table_name,
                self.person_column, person_tag, additional_where), dataset)
    else:
        to_return = self.get_data_sql_query(
            "select {} from {} where {} like '{}'".format(
                ', '.join(self.features), self.table_name,
                self.person_column, person_tag), dataset)
    self.data = to_return
    if remove_outliers > 0:
        knn = KNN(contamination=remove_outliers)
        to_return_aux = to_return.copy()
        to_return_aux = to_return_aux.drop(self.label_tag, axis=1)
        knn.fit(to_return_aux)
        pred = knn.predict(to_return_aux)
        to_return = to_return.iloc[np.where(pred == 0)[0], :]
    return to_return
def run_KNN_base_detector(data, k, metric='euclidean', p=2, method='mean'):
    """
    Function to fit and predict the KNN base detector on `data`.

    Input:
    - data: pd.DataFrame, to run KNN on
    - k: integer, the number of neighbours to include in the relative
      density determination
    - metric: string, distance metric to use, default `euclidean`
    - p: int, default 2 since metric = `euclidean`; otherwise set according
      to the distance metric

    Output:
    - clf of class pyod.models.knn.KNN with all its properties
    """
    # Split data into values and targets: some datasets have an ID column,
    # others don't
    try:
        X = data.drop(['outlier', 'id'], axis=1)
    except KeyError:
        X = data.drop('outlier', axis=1)

    # Construct and fit the classifier; pass the metric argument through
    # instead of hard-coding 'euclidean'
    clf = KNN(n_neighbors=k, metric=metric, p=p, method=method)
    clf.fit(X)  # fit only on features

    # Attach the ground-truth labels for evaluation of the classifier
    clf.true_labels_ = data['outlier']

    # Return the classifier for further processing
    return clf
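# Usage sketch for run_KNN_base_detector (not from the original source): a
# hypothetical toy frame with two features plus the ground-truth 'outlier'
# column the function expects.
import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
frame = pd.DataFrame({'f1': rng.randn(100),
                      'f2': rng.randn(100),
                      'outlier': [0] * 95 + [1] * 5})

clf = run_KNN_base_detector(frame, k=10, method='mean')
print((clf.labels_ == clf.true_labels_).mean())  # raw agreement, illustrative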
def detectarOutlierKNN(self, idmodelo, Xtodos, corteOutlier):
    # Outlier detection via KNN -------------------------------------------
    clf = KNN()
    clf.fit(Xtodos)

    # get outlier scores
    y_train_scores = clf.decision_scores_  # raw outlier scores
    y_test_scores = clf.decision_function(Xtodos)  # outlier scores

    YCodigoTodosComOutilier = self.selectMatrizY(idmodelo, "ID", "TODOS")
    cont = 0
    amostrasRemovidas = 0
    for itemOutilier in y_train_scores:
        if itemOutilier > corteOutlier:
            contTodos = 0
            for item in YCodigoTodosComOutilier:
                amostra = str(item)
                amostra = amostra.replace("[", "")
                amostra = amostra.replace("]", "")
                if contTodos == cont:
                    # mark the matching sample as an outlier in the database
                    db.execute(
                        " update amostra set tpamostra = 'OUTLIER'"
                        " where idamostra = " + str(amostra) +
                        " and idmodelo = " + str(idmodelo) + "")
                    print(itemOutilier)
                    amostrasRemovidas = amostrasRemovidas + 1
                    break
                contTodos = contTodos + 1
        cont = cont + 1
    session.commit()
    print("Number of samples removed: " + str(amostrasRemovidas))
    return cont
def training(data, img_shape, re_sample_type, text_len, permission_names,
             extract_f):
    # load training data
    print('preparing training data')
    inputs, permissions = prepare_training_data(data, img_shape,
                                                re_sample_type, text_len,
                                                permission_names)
    # get features
    print('generating training features')
    features = extract_f.predict(inputs)

    # train one auto-encoder model and one knn model per permission
    print('training outlier model + knn model')
    detectors = []
    knn_trees = []
    # features in each permission, [permission_id, feature_id]
    features_in_permissions = []
    for p in permission_names:
        print('training', p, '...')
        features_current = []
        for i in range(len(permissions)):
            if p in permissions[i]:
                features_current.append(features[i])
        features_in_permissions.append(features_current)

        detector = AutoEncoder(epochs=200, verbose=0)
        detector.fit(features_current)
        detectors.append(detector)

        knn = KNN()
        knn.fit(features_current)
        knn_trees.append(knn)
    return detectors, knn_trees, features_in_permissions
def construct_raw_base_estimators():
    from pyod.models.knn import KNN
    from pyod.models.lof import LOF
    from pyod.models.cblof import CBLOF
    from pyod.models.hbos import HBOS
    from pyod.models.iforest import IForest
    from pyod.models.abod import ABOD
    from pyod.models.ocsvm import OCSVM

    estimator_list = []

    # predefined range of n_neighbors for KNN, AvgKNN, and LOF
    k_range = [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    for k in k_range:
        estimator_list.append(
            KNN(n_neighbors=k, method="largest", contamination=0.05))
        estimator_list.append(
            KNN(n_neighbors=k, method="mean", contamination=0.05))
        estimator_list.append(LOF(n_neighbors=k, contamination=0.05))

    # predefined range of nu for one-class svm
    nu_range = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
    for nu in nu_range:
        estimator_list.append(OCSVM(nu=nu, contamination=0.05))

    # predefined range for the number of estimators in isolation forests
    n_range = [10, 20, 50, 70, 100, 150, 200, 250]
    for n in n_range:
        estimator_list.append(
            IForest(n_estimators=n, random_state=42, contamination=0.05))

    return estimator_list
class IForestSupervisedKNN(BaseDetector):

    def __init__(self, get_top=0.8, if_params=None, knn_params=None):
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        self.is_fitted = False
        # use None defaults to avoid mutable default arguments
        self.iforest = IForest(**(if_params or {}))
        self.knn = KNN(**(knn_params or {}))

    def fit(self, X, y=None):
        X = check_array(X)
        self._set_n_classes(y)
        self.iforest.fit(X)
        scores = self.iforest.predict_proba(X)[:, 1]
        # keep the `get_top` fraction of points the forest scores as most normal
        normal_instances = X[np.argsort(scores)[:int(len(X) * self.get_top)]]
        self.knn.fit(normal_instances)

        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()
        self.is_fitted = True
        return self

    def decision_function(self, X):
        check_is_fitted(self, ['is_fitted'])
        return self.knn.decision_function(X)
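# Smoke-test sketch for IForestSupervisedKNN (assumed, not from the original
# source); it relies on pyod's generate_data and on the imports the class
# already uses. Parameter values are illustrative.
from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

det = IForestSupervisedKNN(get_top=0.8,
                           if_params={'random_state': 42},
                           knn_params={'n_neighbors': 5})
det.fit(X_train)
print(det.decision_function(X_test)[:5])  # raw outlier scores on the test set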
def load_classifiers(outliers_fraction):
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)
    # Define eleven outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
            ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state, behaviour="new"),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
            PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
def __load_classifiers(self):
    outliers_fraction = 0.05
    random_state = np.random.RandomState(0)

    classifiers = {
        'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction, check_estimator=False,
                  random_state=random_state),
        'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
        'Histogram-based Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
        'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
        'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
        'Average KNN': KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    }
    return classifiers
def __init__(self, window_size, step_size=1, contamination=0.1,
             n_neighbors=5, method='largest', radius=1.0, algorithm='auto',
             leaf_size=30, metric='minkowski', p=2, metric_params=None,
             n_jobs=1, **kwargs):
    super(KDiscord, self).__init__(contamination=contamination)
    self.window_size = window_size
    self.step_size = step_size

    # parameters for kNN
    self.n_neighbors = n_neighbors
    self.method = method
    self.radius = radius
    self.algorithm = algorithm
    self.leaf_size = leaf_size
    self.metric = metric
    self.p = p
    self.metric_params = metric_params
    self.n_jobs = n_jobs

    # initialize a kNN model
    self.model_ = KNN(contamination=self.contamination,
                      n_neighbors=self.n_neighbors,
                      radius=self.radius,
                      algorithm=self.algorithm,
                      leaf_size=self.leaf_size,
                      metric=self.metric,
                      p=self.p,
                      metric_params=self.metric_params,
                      n_jobs=self.n_jobs,
                      **kwargs)
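# Context sketch (assumed, not part of KDiscord itself): a sliding-window
# detector like the one above turns a time series into overlapping
# subsequences before handing them to kNN. The self-contained stand-in below
# shows that windowing step end to end; the series and parameters are
# illustrative.
import numpy as np
from pyod.models.knn import KNN


def to_windows(series, window_size, step_size=1):
    # Stack overlapping subsequences row-wise: each row becomes one "point"
    # in window_size-dimensional space.
    return np.array([series[i:i + window_size]
                     for i in range(0, len(series) - window_size + 1,
                                    step_size)])


series = np.sin(np.linspace(0, 20, 500))
series[300:310] += 3.0  # inject a discord

windows = to_windows(series, window_size=10)
clf = KNN(n_neighbors=5, contamination=0.05)
clf.fit(windows)
print(np.where(clf.labels_ == 1)[0][:10])  # indices of flagged windows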
import sys

import numpy as np
from numpy import genfromtxt
from scipy.sparse import load_npz
from pyod.models.knn import KNN


def some_random_test():
    np.set_printoptions(threshold=sys.maxsize)
    X = load_npz("X.npz").toarray()
    Y = genfromtxt('Y.csv', delimiter=',')

    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()

    # find outliers per class
    # print(Y.shape)
    # print(X[Y == 1.].shape)
    # print(X[Y == 0.].shape)
    # print(X[Y == 7.].shape)

    # collect the outliers in a per-class manner
    classList = [1.0, 0.0, 7.0]
    y_train_pred_total = []
    for clas in classList:
        clf.fit(X[Y == clas])
        y_train_pred_total.append(clf.labels_)

    # -------------------------RESULT---------------------
    # 0: inlier, 1: outlier
    np.array(y_train_pred_total).tofile('outliers.csv', sep=',',
                                        format='%10.5f')
def distanceBased(self):
    '''
    @brief Function that implements the distance-based component
    @param self
    @return It returns the vector with the scores of the instances
    '''
    # Initialize the scores
    scores = np.zeros(len(self.dataset), dtype=float)
    for i in range(self.num_iter):
        knn = KNN(n_neighbors=5, contamination=self.contamination)
        # Subsample size drawn from the interval [50, 1000]
        subsample_size = np.random.randint(50, 1001)
        if subsample_size >= len(self.dataset):
            sample = list(range(len(self.dataset)))
        else:
            # Take the sample
            sample = np.random.choice(len(self.dataset),
                                      size=subsample_size, replace=False)
        # Train the model on the subsample
        knn.fit(self.dataset[sample])
        # Accumulate the scores so we can average them afterwards
        scores[sample] += knn.decision_scores_
    # Return the scaled mean
    scores = scores / self.num_iter
    scores = scale(scores)
    return scores
def pred_KNN(self, k=5, comp_with="openaq"):
    ## hyperparameters for KNN are tuned here
    # if self.bool_o_dict == True:
    self.comp_with = comp_with
    if comp_with == "openaq":
        if len(self.X_o) == 0:
            pred = []
        elif self.X_o.shape[0] > k:
            self.clf = KNN(n_neighbors=k)
            self.clf.fit(self.X_o)
            pred = self.clf.labels_
        elif self.X_o.shape[0] > 2:
            # shrink k so it stays below the number of available samples
            # print(f"The value of k is changed from {k} to {self.X_o.shape[0]-1}")
            k = self.X_o.shape[0] - 1
            self.clf = KNN(n_neighbors=k)
            self.clf.fit(self.X_o)
            pred = self.clf.labels_
        else:
            pred = []
    elif comp_with == "cams":
        pred = []
        for each_X in self.X_c:
            # if each_X exists then it will have a shape of (10, 8)
            self.clf = KNN(n_neighbors=k)
            self.clf.fit(each_X)
            pred.append(self.clf.labels_[-1])
    # compute the locations once, for either branch
    A_location, B_location, C_location = self.pred_location(pred)
    return A_location, B_location, C_location
def stop_train(filename):
    """
    Stops training and saves the model as filename.sav; also saves the
    threshold, mean and standard deviation in a json file of the same name,
    along with the fitted PCA model.
    """
    pca = PCA(n_components=3)
    pca.fit(np.array(train.arr))
    with open(filename + 'pca.sav', 'wb') as savpca:
        pickle.dump(pca, savpca)
    z = find_theta_score(np.array(train.arr), pca)

    # a 1-NN detector, saved as 'knn.sav'
    knn = KNN(n_neighbors=1)
    knn.fit(z)
    scores = knn.decision_scores_
    with open(filename + 'knn.sav', 'wb') as savknn:
        pickle.dump(knn, savknn)

    mean = scores.mean()
    stdev = scores.std()
    thres = mean + 18 * stdev
    params = {}
    params['mean'] = mean
    params['std'] = stdev
    params['threshold'] = thres
    with open(filename + '.json', 'w') as jsonf:
        json.dump(params, jsonf)
    print()
    print("Training Completed")
def removeOutliers(df_flights_list, contamination=0.001, n_neighbors=1000,
                   method='mean'):
    '''Remove outliers'''
    lf_array = []
    for flights in df_flights_list:
        lf_array.append(flights.lf.values)
    lf_array = np.array(lf_array)

    # Train the kNN detector
    outlier_model = KNN(contamination=contamination,
                        n_neighbors=n_neighbors, method=method)
    outlier_model.fit(lf_array)

    # Get the prediction labels
    outliers_labels = outlier_model.labels_  # binary labels (0: inliers, 1: outliers)

    # Keep only the flights labelled as inliers
    df_flights_list = [
        df_flight for index, df_flight in enumerate(df_flights_list)
        if outliers_labels[index] == 0
    ]
    return df_flights_list
def fit(self, df):
    logging.info("Initializing Pipeline")
    isTraining = True
    self.isTraining = isTraining
    self.adf = df
    self.df = df.copy()
    self.numeric_cols = getNumericColumns(df)
    self.cat_cols = getCategorialColumns(df)
    self.DEPENDENT_VARIABLE = getDependentVariable()
    self.cat_cols_useless = ["encounter_id", "hospital_id", "patient_id",
                             "icu_id"]
    self.cat_cols_minus = [c for c in self.cat_cols
                           if c not in ["clusterId", "hospital_death",
                                        "encounter_id", "hospital_id",
                                        "patient_id"]]
    self.cat_cols_minus_useless = [c for c in self.cat_cols
                                   if c not in ["clusterId", "encounter_id",
                                                "hospital_id", "patient_id",
                                                "icu_id"]]
    self.cols_to_dummy = [c for c in self.cat_cols_minus_useless
                          if c != "hospital_death"]
    self.num_mean = SimpleImputer(strategy="median")
    self.cat_freq = SimpleImputer(strategy="most_frequent")
    self.rs = RobustScaler()
    self.pt = PowerTransformer()
    self.ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    self.outlierKNN = KNN()
    self.num_means = [MatrixFactorization() for i in range(4)]
    self.cat_freqs = [SimpleImputer(strategy="most_frequent")
                      for i in range(4)]
    # self.label_encoders = defaultdict(LabelEncoder)
    self.label_encoders = WOEEncoder()
    self.later_num_transform = PowerTransformer()
    self.X = self.df.drop([self.DEPENDENT_VARIABLE], axis=1)
    self.y = self.df[self.DEPENDENT_VARIABLE]
    return self.GetTransformedData(isTraining)
def train_knn_anomaly_detector(input_df: pandas.DataFrame, domain: str,
                               train_fields=(), n_neighbors=10,
                               contamination=0.1):
    """
    :param input_df: The input dataframe
    :param domain: The domain (model name)
    :param train_fields: The features (numeric only)
    :param n_neighbors: Number of neighbors to use by default for k neighbors queries.
    :param contamination: The amount of contamination of the data set, i.e. the proportion of outliers in the
           data set. Used when fitting to define the threshold on the decision function
    :return: None; the fitted model is persisted under the model directory
    """
    # list.sort() sorts in place and returns None, which would make every
    # feature group hash to the same id; use sorted() instead
    feature_group_id = hashlib.md5(
        str(sorted(train_fields)).encode()).hexdigest()
    drop_fields = [field for field in input_df.columns
                   if field not in train_fields]
    train_df = input_df.drop(drop_fields, axis=1)
    for column in train_df.columns:
        train_df[column] = train_df[column].fillna(0)
    model_directory = os.path.join(const.DYNAMITE_CONF_ROOT, 'models',
                                   'knn_anomaly_detector', feature_group_id)
    model_pkl_file = os.path.join(model_directory, domain + '.pkl')
    makedirs(model_directory)
    model = KNN(contamination=contamination, n_neighbors=n_neighbors,
                metric='manhattan')
    joblib.dump(model.fit(train_df), model_pkl_file)
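# Follow-up sketch (assumed, not from the original source): reload the
# persisted detector and score fresh rows. The conf root, feature names,
# domain, and new_df below are all placeholders; the real root comes from
# const.DYNAMITE_CONF_ROOT in the surrounding project.
import hashlib
import os

import joblib
import pandas as pd

train_fields = ['bytes_in', 'bytes_out']  # illustrative feature group
feature_group_id = hashlib.md5(str(sorted(train_fields)).encode()).hexdigest()
model_pkl_file = os.path.join('/opt/dynamite/conf', 'models',
                              'knn_anomaly_detector', feature_group_id,
                              'my_domain.pkl')

model = joblib.load(model_pkl_file)
new_df = pd.DataFrame({'bytes_in': [120, 4000000],
                       'bytes_out': [80, 9500000]})
print(model.predict(new_df.fillna(0)))  # 0: inlier, 1: outlier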
def api_alert(influxdb_ip, influxdb_port, influxdb_user, influxdb_pwd,
              influxdb_database, influxdb_table, apiid):
    timelimit = 'time > now()-1d'
    # connect to InfluxDB
    client = InfluxDBClient(influxdb_ip, influxdb_port, influxdb_user,
                            influxdb_pwd, influxdb_database)
    # fetch this API's data from the past day
    result = client.query('select Average, CallCount, ErrorRate from ' +
                          influxdb_table + ' where ApiId = \'' + apiid +
                          '\' and ' + timelimit + ';')
    # convert the ResultSet into a list
    apis_table = list(result.get_points(measurement='apis'))
    # store the data to process as a DataFrame
    df = pd.DataFrame(data=apis_table)
    # drop the columns that do not take part in the computation to get the
    # training set x
    x = df
    x = x.drop("time", axis=1)
    # normalize the data, mapping each feature to [0, 1]
    x['CallCount'] = (x['CallCount'] - x['CallCount'].min()) / \
        (x['CallCount'].max() - x['CallCount'].min())
    x['Average'] = (x['Average'] - x['Average'].min()) / \
        (x['Average'].max() - x['Average'].min())
    x['ErrorRate'] = x['ErrorRate'] / 100
    # keep the last ten-second data point as the test point
    x_last = x.tail(1)
    # df_last = df.tail(1)
    x = x.drop(x.index[-1])
    df = df.drop(df.index[-1])
    # convert to numpy for the computation
    x = x.values
    # train a kNN detector
    clf_name = 'kNN'
    clf = KNN()  # initialize the detector clf
    clf.fit(x)  # fit the detector clf on x
    # add a column to df with the anomaly score
    df['score'] = clf.decision_scores_
    # sort by score
    df = df.sort_values("score", ascending=False)
    # print(df.head(20))
    # score the new data point
    test_data = x_last
    test_scores = clf.decision_function(test_data)
    if test_scores > 0.8:
        print('Anomaly level 4: must alert')
    elif test_scores > 0.5:
        print('Anomaly level 3: should alert')
    elif test_scores > 0.1:
        print('Anomaly level 2: alert recommended')
    elif test_scores > 0.05:
        print('Anomaly level 1: alert optional')
    # These levels come from inspecting the plots in KNN.py: scores above
    # 0.05 are clearly anomalous, above 0.1 the point is already drifting
    # away from the cluster, and above 0.5 it is far from the data.
    # The thresholds depend on the training window; 0.05 suits one day of data.
    return test_scores
def median_knn(X_train, X_test, Y_train, Y_test):
    from pyod.models.knn import KNN
    model = KNN(method='median')
    model.fit(X_train)
    pred = model.predict(X_test)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return acc * 100
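# Comparison sketch (not from the original source): PyOD's KNN supports
# three ways to aggregate neighbour distances: 'largest' (the k-th
# neighbour distance, the default), 'mean', and 'median' as used above.
# The synthetic data below is illustrative.
from pyod.models.knn import KNN
from pyod.utils.data import generate_data
from sklearn.metrics import roc_auc_score

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

for method in ('largest', 'mean', 'median'):
    clf = KNN(n_neighbors=5, method=method)
    clf.fit(X_train)
    scores = clf.decision_function(X_test)
    print(method, round(roc_auc_score(y_test, scores), 3))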
def knnAD(self):
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(self.X)

    # get the prediction labels and outlier scores of the training data
    y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_scores_  # raw outlier scores

    generateAnomalis(self.data, self.label, y_pred)
def train_monitoring_model(data):
    logger.info("Training a monitoring model")
    X_train, X_test = train_test_split(np.array(data, dtype='float'),
                                       test_size=0.2)
    monitoring_model = KNN(contamination=0.05, n_neighbors=15, p=5)
    monitoring_model.fit(X_train)
    return monitoring_model
def main():
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF',
        'HBOS', 'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF',
        'COPOD', 'SOD', 'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD',
        'SoGaal', 'MoGaal', 'VAE', 'AutoEncoder'
    ]
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }
    name = "30_Models"
    Parallel(n_jobs=CPUS)(
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=sklearn_models,
                             CPUS=CPUS_Models, save_name=name)
        for scaler in scalers)
def outliers(base):
    detector = KNN()
    detector.fit(base)
    previsoes = detector.labels_  # binary labels (0: inliers, 1: outliers)
    outliers = []
    for i in range(len(previsoes)):
        if previsoes[i] == 1:
            outliers.append(i)
    # drop the rows flagged as outliers by their positional index
    base = base.drop(base.index[outliers])
    return base
def setUp(self):
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.75
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = KNN(contamination=self.contamination, method='median')
def S2(self):
    self.S1()
    water_data = self.water_data
    result = self.result
    # data preprocessing and model training
    clean_data = water_data[water_data['S1'] == 0]
    Y = pd.DataFrame(index=clean_data.index, columns=['S2'])
    X_train = np.array(clean_data.iloc[:, 1:12])
    name = list(clean_data.iloc[:, 1:12].columns.values)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
    clf2 = KNN(contamination=0.05, n_neighbors=100)
    clf3 = HBOS(contamination=0.05, n_bins=10)
    clf4 = PCA(contamination=0.05)
    clf1.fit(X_train)
    clf2.fit(X_train)
    clf3.fit(X_train)
    clf4.fit(X_train)
    # a point counts as anomalous only if all four detectors flag it
    Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
    water_data = pd.concat([water_data, Y], axis=1)
    # water_data.loc[water_data['S2'].isna(), ['S2']] = 0
    # (would mark the rows that were anomalous in S1 as 0 in S2)
    result['统计异常'] = water_data['S2'].values  # "statistical anomaly" flag

    # find the anomalous dimension
    from sklearn.neighbors import KernelDensity
    clean_data = water_data[water_data['S1'] == 0]
    dens = pd.DataFrame(index=clean_data.index,
                        columns=['temperature', 'pH', 'EC', 'ORP', 'DO',
                                 'turbidity', 'transparency', 'COD', 'P',
                                 'NH3N', 'flux'])
    for i in dens.columns:
        kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
            clean_data[i].values.reshape(-1, 1))
        dens[i] = np.exp(
            kde.score_samples(clean_data[i].values.reshape(-1, 1)))
    # the feature with the lowest density rank is the anomalous dimension
    dens = dens.iloc[:, 0:11].rank()
    dens['S2_names'] = dens.idxmin(axis=1)
    water_data = pd.concat([water_data, dens['S2_names']], axis=1)
    self.water_data = water_data
    result['统计异常维度'] = water_data['S2_names'].values  # anomaly dimension

    # save the models
    joblib.dump(scaler, "./water_model/S2_scaler")
    joblib.dump(clf1, "./water_model/S2_Iforest")
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # fit the detector clf on x_train

    # labels and scores on the training data
    y_train_pred = clf.labels_  # binary labels (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # raw scores (larger = more anomalous)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # use the fitted clf to score the unseen data
    y_test_pred = clf.predict(x_test)  # binary labels (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # raw outlier scores
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    # no trailing comma here, or roc would become a one-element tuple
    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
def setUp(self):
    self.n_train = 200
    self.n_test = 100
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination, random_state=42)

    self.clf = KNN(contamination=self.contamination)
    self.clf.fit(self.X_train)
def setUp(self):
    self.n_train = 100
    self.n_test = 50
    self.contamination = 0.1
    self.roc_floor = 0.75
    self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        contamination=self.contamination)

    self.clf = KNN(contamination=self.contamination)
    self.clf.fit(self.X_train)
class TestKnnMedian(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')

    def test_fit(self):
        self.clf.fit(self.X_train)

    def test_decision_function(self):
        self.clf.fit(self.X_train)
        self.clf.decision_function(self.X_train)
        self.clf.decision_function(self.X_test)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def tearDown(self):
        pass
from pyod.models.knn import KNN
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
import numpy as np
from pyod.models.knn import KNN
from pyod.models.combination import average, maximization
from pyod.utils.data import evaluate_print
from pyod.utils.utility import standardizer

n_clf = 20  # number of base detectors

# Initialize 20 base detectors for combination
k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
          150, 160, 170, 180, 190, 200]

train_scores = np.zeros([X_train.shape[0], n_clf])
test_scores = np.zeros([X_test.shape[0], n_clf])

print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

# X_train_norm and X_test_norm are the standardized splits produced earlier,
# e.g. X_train_norm, X_test_norm = standardizer(X_train, X_test)
for i in range(n_clf):
    k = k_list[i]

    clf = KNN(n_neighbors=k, method='largest')
    clf.fit(X_train_norm)

    train_scores[:, i] = clf.decision_scores_
    test_scores[:, i] = clf.decision_function(X_test_norm)

# Decision scores have to be normalized before combination
train_scores_norm, test_scores_norm = standardizer(train_scores, test_scores)

# Combination by average
y_by_average = average(test_scores_norm)
evaluate_print('Combination by Average', y_test, y_by_average)

# Combination by max
y_by_maximization = maximization(test_scores_norm)
evaluate_print('Combination by Maximization', y_test, y_by_maximization)
class TestKnn(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass