# Build a Series holding only the dependent variable
y = boston["medv"]

# =============================================================================
# 3. Check for outliers
# =============================================================================
## 1) Fit a regression first, then check for outliers within that module. -> statsmodels
## 2) Check for outliers from the raw data alone, without a regression. -> sklearn.neighbors nearest-neighbor method (KNN)
from sklearn.neighbors import LocalOutlierFactor

# LocalOutlierFactor(n_neighbors=<number of neighbors>, algorithm='auto', leaf_size=30,
#                    metric='minkowski', p=2, metric_params=None,
#                    contamination="legacy", novelty=False, n_jobs=None)
# Too many neighbors gives misleading scores, so keep the count reasonably low.
lof1 = LocalOutlierFactor(n_neighbors=5)

# fit() computes the actual factor values.
# fit() accepts numeric data only; categorical features are not supported.
lof1.fit(xx)
# Do not bind fit() to a new name: it stores its results on the estimator itself.

# Inspect the outlier scores.
lof1.negative_outlier_factor_
# Scores above -2 are treated as inliers here; below -2, as outliers.

# This is the total number of samples, 506 (one score per row).
len(lof1.negative_outlier_factor_)

# Keep the inlier rows, with all columns.
xx1 = xx.loc[lof1.negative_outlier_factor_ > -2, :]
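# A minimal sketch of approach 1) above: regression-based outlier checks via
# statsmodels. It assumes the same Boston xx / y frames as the code above; the
# +/-3 cutoff on studentized residuals is a common rule of thumb, not a fixed rule.
import statsmodels.api as sm

ols_res = sm.OLS(y, sm.add_constant(xx)).fit()
influence = ols_res.get_influence()
student_resid = influence.resid_studentized_external
# rows whose externally studentized residual exceeds +/-3 are flagged
print(xx.loc[abs(student_resid) > 3].index)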
X = root2array('../no_truecc_cut_stride2_offset0.root',
               branches=['calehad', 'cvnpi0', 'cvnchargedpion', 'cvnneutron', 'cvnproton'],
               selection='mustopz<1275&&remidtrkismuon==1&&isnumucc==1',
               step=scaledown)
X = X.view(np.float32).reshape(X.shape + (-1,))

recoemu_official = root2array('../no_truecc_cut_stride2_offset0.root',
                              branches='recoemu',
                              selection='mustopz<1275&&remidtrkismuon==1&&isnumucc==1',
                              step=scaledown)
trueenu = root2array('../no_truecc_cut_stride2_offset0.root',
                     branches='trueenu',
                     selection='mustopz<1275&&remidtrkismuon==1&&isnumucc==1',
                     step=scaledown)
y = trueenu - recoemu_official
Xy = np.insert(X, 5, y, axis=1)

# fit the model
clf = LocalOutlierFactor(n_neighbors=nneighbors)
y_pred = clf.fit_predict(Xy)

#~ # plot the level sets of the decision function
#~ xx, yy = np.meshgrid(np.linspace(0, 15, 150), np.linspace(0, 15, 150))
#~ Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
#~ Z = Z.reshape(xx.shape)

#~ # level curve plot with original distribution
#~ plt.figure(1)
#~ plt.subplot(1, 2, 1)
#~ plt.title("Local Outlier Factor (LOF)")
#~ plt.contourf(xx, yy, -Z, locator=ticker.LogLocator(), cmap=plt.cm.Blues_r)
#~ a = plt.scatter(X, y, c='white',
#~                 edgecolor='k', s=20)
#~ plt.axis('tight')
# ## Set up for training

# ### Set up 5-fold cross validation

# In[6]:

X = data.drop(columns=['Class'])
y = data['Class']
cv = KFold(shuffle=True)

# ### Set classifiers

# In[7]:

classifiers = {
    "LOF": LocalOutlierFactor(n_neighbors=20, novelty=True),
    "SVM-rbf": SVC(),
    "SVM-poly": SVC(kernel="poly")
}

# ### Set score names

# In[8]:

score_names = ["time", "accuracy", "precision", "recall", "f1"]

# ### Set a function to get the scores

# In[9]:
def local_outlier_factory(dataset, neighbours):
    lof = LocalOutlierFactor(n_neighbors=neighbours, contamination=0.1, novelty=True).fit(dataset)
    return lof
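# A hedged usage sketch for the factory above, on synthetic data (the arrays
# here are illustrative, not from the original project). Because the model is
# fitted with novelty=True, it can score unseen samples via predict().
import numpy as np

rng = np.random.RandomState(0)
train_demo = rng.normal(size=(200, 2))                          # assumed inlier data
test_demo = np.vstack([rng.normal(size=(5, 2)), [[6.0, 6.0]]])  # last row is far out

lof_demo = local_outlier_factory(train_demo, neighbours=20)
print(lof_demo.predict(test_demo))  # +1 = inlier, -1 = outlier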
print(Y.shape)

## Define the outlier detection methods
classifiers = {
    "Isolation Forest": IsolationForest(n_estimators=100, max_samples=len(X),
                                        contamination=outlier_fraction,
                                        random_state=state, verbose=0),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, algorithm='auto',
                                               leaf_size=30, metric='minkowski',
                                               p=2, metric_params=None,
                                               contamination=outlier_fraction)
}
type(classifiers)

n_outliers = len(Fraud)

for i, (clf_name, clf) in enumerate(classifiers.items()):
    # Fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
def train(loader, epoch, model_list, method='ocsvm'):
    # Scores above the threshold indicate normal samples.
    # model_list matters for models that need multiple training rounds;
    # it carries the models from the previous round (e.g. ocnn).
    datas, labels = get_features(loader)
    threshold_list = []
    update_models = []
    update_optimizer = []
    clf_list, optimizers = model_list
    for label in range(args.class_num):
        # Fit a one-class model per class.
        condition_index = np.where(labels == label)[0]
        fit_data = datas[condition_index]  # training data for this label
        optimizer = optimizers[label]
        if method == 'ocsvm':
            clf = OneClassSVM()
        elif method == 'isofore':
            clf = IsolationForest()
        elif method == 'gmm':
            clf = BayesianGaussianMixture()
        elif method == 'svdd':
            clf = SVDD(parameters)
        elif method == 'lof':
            clf = LocalOutlierFactor(novelty=True, n_neighbors=int(fit_data.size * 0.1))
        elif method == 'cnn':
            clf = ''
        elif method != 'sp':
            clf = clf_list[label]

        # Train the anomaly detection model.
        if method == 'ocnn':
            clf, optimizer = fit(clf, fit_data, optimizer, epoch)
            scores_temp = score_samples(clf, fit_data, epoch)
        elif method == 'lof':
            clf.fit(fit_data)
            scores_temp = clf.decision_function(fit_data)
        elif method == 'sp':
            pass
        elif method == 'cnn':
            pass
        else:
            clf.fit(fit_data)
            scores_temp = clf.score_samples(fit_data)

        # Compute the detection threshold for the model.
        if method != 'sp' and method != 'gmm' and method != 'cnn':
            threshold = np.mean(scores_temp) - \
                args.threshold_std_times * np.std(scores_temp)
            update_optimizer.append(optimizer)
            update_models.append(clf)
            threshold_list.append(threshold)
        elif method == 'gmm':
            threshold = np.mean(scores_temp)
            update_optimizer.append(optimizer)
            update_models.append(clf)
            threshold_list.append(threshold)
        elif method == 'sp':
            from cnn import get_c_v
            threshold_list = get_c_v(p_s=datas, labels=labels)
        elif method == 'cnn':
            threshold_list = ''
    model_list = (update_models, optimizers)
    return model_list, threshold_list
print(__doc__)

np.random.seed(42)

xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
# Generate normal (not abnormal) training observations
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate new normal (not abnormal) observations
X = 0.3 * np.random.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# fit the model for novelty detection (novelty=True)
clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
clf.fit(X_train)
# DO NOT use predict, decision_function and score_samples on X_train as this
# would give wrong results but only on new unseen data (not used in X_train),
# e.g. X_test, X_outliers or the meshgrid
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# plot the learned frontier, the points, and the nearest vectors to the plane
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Novelty Detection with LOF")
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
# In[29]:

st.subheader("Accuracy score For Isolation forest")
ISF = IsolationForest(random_state=42)
ISF.fit(ins)
falsepositive_isf = ISF.predict(ins)
falsenegative_isf = ISF.predict(outs)
in_accuracy_isf = falsepositive_accuracy(falsepositive_isf)
out_accuracy_isf = falsenegative_accuracy(falsenegative_isf)
st.write("Accuracy in Detecting falsepositive Alarm:", in_accuracy_isf)
st.write("Accuracy in Detecting falsenegative Alarm:", out_accuracy_isf)

# In[30]:

st.subheader("Accuracy score For Local Outlier Factor")
LOF = LocalOutlierFactor(novelty=True)
LOF.fit(ins)
falsepositive_lof = LOF.predict(ins)
falsenegative_lof = LOF.predict(outs)
in_accuracy_lof = falsepositive_accuracy(falsepositive_lof)
out_accuracy_lof = falsenegative_accuracy(falsenegative_lof)
st.write("Accuracy in Detecting falsepositive Alarm:", in_accuracy_lof)
st.write("Accuracy in Detecting falsenegative Alarm:", out_accuracy_lof)

# In[31]:

if st.sidebar.checkbox("Alarm Report", False):
    st.subheader("classification of Alarm")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[16, 3])
    ax1.set_title("Accuracy of Isolation Forest", fontsize=20)
    st.write(
ii.fit(dataset[features])  # Error occurs here.
dataset['outlier'] = ii.predict(dataset[features])
del ii
print(dataset[dataset['outlier'] == -1])

# IsolationForest
from sklearn.ensemble import IsolationForest
ii = IsolationForest(max_samples=62,
                     contamination=0.25,
                     random_state=np.random.RandomState(42))
print("Fit data")
ii.fit(dataset[features])  # Error occurs here.
dataset['outlier'] = ii.predict(dataset[features])
del ii
print(dataset[dataset['outlier'] == -1])

# LocalOutlierFactor
from sklearn.neighbors import LocalOutlierFactor
ii = LocalOutlierFactor(n_neighbors=35, contamination=0.25)
dataset['outlier'] = ii.fit_predict(dataset[features])
del ii
print(dataset[dataset['outlier'] == -1])
def _LocalOutlierFactor(X):
    n = int(round(X.shape[0] * 0.2))
    clf = LocalOutlierFactor(n_neighbors=n)
    return clf.fit_predict(X)
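# A quick sketch of how the 20%-of-samples heuristic above plays out; the
# synthetic array is illustrative only. With 50 rows, n_neighbors becomes
# round(50 * 0.2) = 10.
import numpy as np

X_demo = np.random.RandomState(1).normal(size=(50, 2))
labels_demo = _LocalOutlierFactor(X_demo)
print((labels_demo == -1).sum(), "points flagged as outliers")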
for dat in datasets:
    plt.clf()
    plt.figure(figsize=(25, 13))

    # loading and vectorization
    # X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
    #                            n_clusters_per_class=1, weights=[0.999],
    #                            flip_y=0, random_state=4)
    X = df.values
    X = X.astype(float)  # astype() returns a copy, so the result must be reassigned
    n_features = 2
    X_train = X  # .reshape(-1, 1)

    # define models:
    iforest = IsolationForest(n_estimators=50, max_samples='auto',
                              contamination=float(0.01), max_features=2).fit(X)
    lof = LocalOutlierFactor(n_neighbors=5, novelty=True)
    # ocsvm = OneClassSVM()
    # ocsvm = OneClassSVM(kernel='linear', degree=2, gamma='auto', nu=0.5)
    ocsvm = OneClassSVM(gamma='auto', nu=0.01)

    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
# In[20]:

# The max power used by the appliance in the initial active areas, filtered by this percentile,
# is assumed to be roughly the max power used by the appliance
rough_max_power_percentile = 95

# Any initial active area where the max power used is less than
# this ratio of the rough max power of the appliance is ignored
too_low_power_ratio = 0.02

# In[21]:

len_max_coords = np.column_stack(
    ([len(a) for a in active_area_data], [a.max() for a in active_area_data]))
lof_labels = LocalOutlierFactor().fit_predict(len_max_coords)

# In[22]:

colors = np.array(['r.', 'g.', 'b.'])
plt.scatter(len_max_coords[:, 0], len_max_coords[:, 1], c=lof_labels)

# In[23]:

plt.hist([len_max_coords[lof_labels == l][:, 1] for l in [1, -1]], stacked=True)

# In[24]:

rough_max_power = np.percentile(len_max_coords[:, 1], rough_max_power_percentile)
too_low_power = rough_max_power * too_low_power_ratio
print(y_value.head())
y_value = y_value.values.reshape(-1)
print(y_value.shape)

x_value = sampled_data.drop(labels="Class", axis=1)
print(x_value.columns)
print(x_value.shape)

# Print shapes
print(x_value.shape)
print(y_value.shape)

# Algorithms used: Isolation Forest and Local Outlier Factor are common anomaly detection methods
random_isolation = IsolationForest(max_samples=len(x_value),
                                   contamination=outlier_value,
                                   random_state=3)
local_outlier = LocalOutlierFactor(n_neighbors=12, contamination=outlier_value)

n_outlier = len(fraudal_count)

# fit and predict
random_isolation.fit(x_value)
score_prediction = random_isolation.decision_function(x_value)
y_predict_isf = random_isolation.predict(x_value)

y_predict_lof = local_outlier.fit_predict(x_value)
score_prediction = local_outlier.negative_outlier_factor_

# Change the value to 0 for valid and 1 for fraudulent cases.
y_predict_isf[y_predict_isf == 1] = 0
y_predict_isf[y_predict_isf == -1] = 1
y_predict_lof[y_predict_lof == 1] = 0
y_predict_lof[y_predict_lof == -1] = 1
df2 = df2[df2["Job Title"].isin(emp_counts[emp_counts > 3000].index)] df2['Salary Paid'] = df2['Salary Paid'].apply(lambda x:x.split('.')[0].strip()).replace({'\$':'', ',':''}, regex=True) FirAtt_lst = df2['Job Title'].unique() SecAtt_lst = df2['Employer'].unique() ThrAtt_lst = df2['Calendar Year'].unique() ################################### Forming a context ####################################### Orgn_Ctx = df2.loc[df2['Job Title'].isin([FirAtt_lst[0],FirAtt_lst[1],FirAtt_lst[2],FirAtt_lst[3], FirAtt_lst[4]]) & \ df2['Employer'].isin([SecAtt_lst[0],SecAtt_lst[1], SecAtt_lst[2],SecAtt_lst[3], SecAtt_lst[4], SecAtt_lst[5]]) & \ df2['Calendar Year'].isin([ThrAtt_lst[0],ThrAtt_lst[1],ThrAtt_lst[2],ThrAtt_lst[3],ThrAtt_lst[4]])] ####################### Finding an outlier in the selected context ####################### clf = LocalOutlierFactor(n_neighbors=20) Sal_outliers = clf.fit_predict(Orgn_Ctx['Salary Paid'].values.reshape(-1,1)) Queried_ID =Orgn_Ctx.iloc[Sal_outliers.argmin()][1] print '\n\n Outlier\'s ID in the selected context is: ', Queried_ID ################# Exploring Contexts larger than the original to find the maximal ################# FirAtt_Sprset = sum(map(lambda r: list(combinations(FirAtt_lst[5:], r)), range(1, len(FirAtt_lst[5:])+1)), []) SecAtt_Sprset = sum(map(lambda r: list(combinations(SecAtt_lst[6:], r)), range(1, len(SecAtt_lst[6:])+1)), []) ThrAtt_Sprset = sum(map(lambda r: list(combinations(ThrAtt_lst[5:], r)), range(1, len(ThrAtt_lst[5:])+1)), []) Sub_pop = [] Sub_pop_count = 0 Epsilon = 0.1 ### Privacy Parameter output = [] context = []
result_angriff_excel = []

x_train = signal[:, 0:35000]
x_train = np.transpose(x_train)
x_test = signal[:, 35000:len(signal[0])]
x_test = np.transpose(x_test)
# x_outliers = np.concatenate((angriff_sis_attack_1, angriff_sis_attack_2), axis=1)
# x_outliers = np.transpose(x_outliers)
x_outliers = np.transpose(angriff_sis_attack_2)
ground_truth_angriff = pred_sis_attack

neighbours = 3000
print('Neighbours: ', neighbours)
lof = LocalOutlierFactor(n_neighbors=neighbours, novelty=True)
lof.fit(x_train)

test_pred = lof.predict(x_test)
n_outliers = 0
ground_truth = np.ones(len(x_test), dtype=int)
n_errors = (test_pred != ground_truth).sum()
result_test = (len(x_test) - n_errors) / (len(x_test))
result_test_excel += [result_test]

lof.fit(x_train)
outlier_pred = lof.predict(x_outliers)
n_outliers = len(x_outliers)
def test_import_from_sklearn_pipeline_no_wrapper(self):
    from sklearn.neighbors import LocalOutlierFactor
    from sklearn.pipeline import make_pipeline

    sklearn_pipeline = make_pipeline(PCA(), LocalOutlierFactor())
    _ = import_from_sklearn_pipeline(sklearn_pipeline, fitted=False)
        pass
    out_df = out_df.append(working, ignore_index=True)

out_df = out_df[list(blank_dict.keys())]
out_df.to_csv(out_csv, index=False)

if visualize:
    print('Visualizing')

    brain_vol_df = pd.read_csv(brain_vol_csv)

    collated_csv = os.path.join(out_folder, 'collated.csv')
    clean_table = pd.read_csv(collated_csv, index_col='mr_id')
    clean_table = clean_table[clean_table['exclude'] != 1]

    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.06)
    y_pred = clf.fit_predict(clean_table)
    # y_pred_unsort = y_pred.copy()
    x_scores = clf.negative_outlier_factor_
    # x_scores_unsort = x_scores.copy()

    clean_table['outlier'] = y_pred
    clean_table['normal_control'] = [
        all([i, not j]) for i, j in zip(clean_table['control'], clean_table['sci'])
    ]
    clean_table['sci_control'] = [
        all([i, j]) for i, j in zip(clean_table['control'], clean_table['sci'])
    ]
print('\n******Iso-Forest*******\n')
start = time.time()
clf = IsolationForest(contamination=0.1, behaviour='new')
clf.fit(X)
end = time.time()
time_all[j, 0] = end - start
iso_scores = clf.score_samples(X)

if run_lof_svm == 0:
    lof_scores = iso_scores
    osvm_scores = iso_scores
elif j == 0:
    print('\n******LOF*******\n')
    start = time.time()
    lof = LocalOutlierFactor()
    lof.fit(X)
    end = time.time()
    time_all[j, 1] = end - start
    lof_scores = lof.negative_outlier_factor_

    print('\n******1-class SVM*******\n')
    start = time.time()
    osvm = OneClassSVM(kernel='rbf')
    osvm.fit(X)
    end = time.time()
    time_all[j, 2] = end - start
    osvm_scores = osvm.score_samples(X)

print('\n******Our Algo*******\n')
start = time.time()
def main():
    X = read_data()
    Y = read_labels()

    isf = IsolationForest()
    lof = LocalOutlierFactor(novelty=True)
    svm = OneClassSVM(kernel="rbf")
    cov = EllipticEnvelope()
    kmn = KMeans(n_clusters=1)

    k_fold = StratifiedKFold(n_splits=3, shuffle=True)

    params_isf = []
    params_lof = []
    params_svm = []
    params_cov = []
    params_kmn = []

    for user in range(0, num_of_labeled_users):
        X_all = X[user]
        Y_all = Y[user].astype(int)
        X_genuine = X[user][0:num_of_genuine_segments]
        Y_genuine = Y[user][0:num_of_genuine_segments].astype(int)
        X_unlabeled = X[user][num_of_genuine_segments:]
        Y_unlabeled = Y[user][num_of_genuine_segments:].astype(int)
        '''
        count_vect = CountVectorizer()
        tfidf_transformer = TfidfTransformer(use_idf=False)
        X_all_counts = count_vect.fit_transform(X_all)
        X_all_tfidf = tfidf_transformer.fit_transform(X_all_counts)

        isf_random = RandomizedSearchCV(estimator=isf, param_distributions=ISF_HYPER_PARAMS,
                                        n_iter=random_search_iter, cv=k_fold, verbose=2,
                                        random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))
        svm_random = RandomizedSearchCV(estimator=svm, param_distributions=SVM_HYPER_PARAMS,
                                        n_iter=random_search_iter, cv=k_fold, verbose=2,
                                        random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))
        lof_random = RandomizedSearchCV(estimator=lof, param_distributions=LOF_HYPER_PARAMS,
                                        n_iter=random_search_iter, cv=k_fold, verbose=2,
                                        random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))
        kmn_random = RandomizedSearchCV(estimator=kmn, param_distributions=KMN_HYPER_PARAMS,
                                        n_iter=random_search_iter, cv=k_fold, verbose=2,
                                        random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))
        cov_random = RandomizedSearchCV(estimator=cov, param_distributions=COV_HYPER_PARAMS,
                                        n_iter=random_search_iter, cv=k_fold, verbose=2,
                                        random_state=42, n_jobs=-1, scoring=make_scorer(custom_acc))

        isf_random.fit(X_all_tfidf, Y_all)
        svm_random.fit(X_all_tfidf, Y_all)
        lof_random.fit(X_all_tfidf, Y_all)
        kmn_random.fit(X_all_tfidf, Y_all)
        cov_random.fit(X_all_tfidf.toarray(), Y_all)

        p_isf = dict(isf_random.best_params_)
        p_svm = dict(svm_random.best_params_)
        p_lof = dict(lof_random.best_params_)
        p_kmn = dict(kmn_random.best_params_)
        p_cov = dict(cov_random.best_params_)

        p_isf["score"] = isf_random.best_score_
        p_svm["score"] = svm_random.best_score_
        p_lof["score"] = lof_random.best_score_
        p_kmn["score"] = kmn_random.best_score_
        p_cov["score"] = cov_random.best_score_

        params_isf.append(p_isf)
        params_svm.append(p_svm)
        params_lof.append(p_lof)
        params_kmn.append(p_kmn)
        params_cov.append(p_cov)
        '''
        params_isf.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, isf, ISF_HYPER_PARAMS, k_fold))
        params_svm.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, svm, SVM_HYPER_PARAMS, k_fold))
        params_lof.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, lof, LOF_HYPER_PARAMS, k_fold))
        params_kmn.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, kmn, KMN_HYPER_PARAMS, k_fold))
        params_cov.append(
            calc_best_detector_for_algoritm(X_unlabeled, Y_unlabeled, cov, COV_HYPER_PARAMS, k_fold))

    write_output(params_isf, 'IsolationForest')
    write_output(params_svm, 'OneClassSVM')
    write_output(params_lof, 'LocalOutlierFactor')
    write_output(params_kmn, 'KMeans')
    write_output(params_cov, 'EllipticEnvelope')
def lof(df, training_df):
    # renamed from `lof` to `clf` so the estimator does not shadow the function name
    clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
    y_pred = clf.fit_predict(training_df)
    outliers = np.where(y_pred == -1)
    print('Removing ' + str(len(outliers[0])) + ' records')
    return df.drop(outliers[0])
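# A hedged usage sketch for lof() above, on a synthetic frame. It assumes df
# and training_df share a default RangeIndex: np.where() returns positions,
# while DataFrame.drop() matches labels, so the two only line up when the
# labels are 0..n-1.
import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.RandomState(2).normal(size=(100, 3)))
cleaned = lof(demo, demo)
print(len(demo), "->", len(cleaned))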
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA

data = loadmat('ex8data2.mat')
X = data['X']

e1 = EllipticEnvelope()
labels1 = e1.fit_predict(X)

e2 = LocalOutlierFactor()
labels2 = e2.fit_predict(X)

n_components = 3
pca1 = PCA(n_components=n_components)
Xproj = pca1.fit_transform(X)

plt.figure()
plt.clf()
ax = plt.axes(projection='3d')
# ax.scatter(image_array[:, 0], image_array[:, 1], image_array[:, 2], c=labels, cmap='coolwarm', marker=',')
ax.scatter(Xproj[:, 0], Xproj[:, 1], Xproj[:, 2], marker='o', c=labels1)
def __init__(self, name="Local Outlier Factor"):
    self._model = LocalOutlierFactor()
    self.name = name
               orient='horizontal',
               flierprops=flierprops,
               whiskerprops=whiskerprops,
               capprops=capprops)
# plt.savefig('Distribution.png', dpi=400, bbox_inches='tight')

scaledData = np.log(data)
ax = plt.figure(figsize=(8, 5)).gca(title='Log Sales Distribution',
                                    xlabel='Product',
                                    ylabel='Log Sales')
sns.violinplot(data=scaledData)
# plt.savefig('Violin.png', dpi=400, bbox_inches='tight')

# remove outliers
outliers = LocalOutlierFactor(n_neighbors=20, contamination=.05)
scaledData['inlier'] = outliers.fit_predict(scaledData)
cleanData = scaledData.loc[scaledData.inlier == 1, products]
# sns.pairplot(cleanData, plot_kws={'s': 5})
# plt.tight_layout()

sns.clustermap(cleanData.corr(),
               annot=True,
               fmt='.1%',
               center=0.0,
               vmin=-1,
               vmax=1,
               cmap=sns.diverging_palette(250, 10, n=20))
# plt.savefig('Heatmap.png', dpi=400, bbox_inches='tight')

# run PCA
def BFS_Alg(Org_Vec, Queue, Data_to_write, Epsilon, max_ctx):
    Visited = []
    BFS_Vec = np.zeros(len(Org_Vec))
    for i in range(len(Org_Vec)):
        BFS_Vec[i] = Org_Vec[i]
    BFS_Flp = np.zeros(len(Org_Vec))
    termination_threshold = 500
    Terminator = 0
    # The Queue is used for visited nodes; sub_q holds each sample's children,
    # which are appended without resetting it first.
    sub_q = [[0,
              mp.exp(Epsilon * (Orgn_Ctx.shape[0])),
              Orgn_Ctx.shape[0],
              Org_Vec]]
    contexts = [Org_Vec]
    while len(Visited) < 100:
        Terminator += 1
        if (Terminator > termination_threshold):
            break
        # print('sub_q before: ', sub_q)
        for i in range(len(sub_q)):
            sub_q[i][0] = i
        Sub_elements = [elem for elem in range(len(sub_q))]
        Sub_probabilities = []
        for prob in sub_q:
            Sub_probabilities.append(prob[1] / (sum([prob[1] for prob in sub_q])))
        SubRes = np.random.choice(Sub_elements, 1, p=Sub_probabilities)
        Queue.append([len(Queue),
                      sub_q[SubRes[0]][1],
                      sub_q[SubRes[0]][2],
                      sub_q[SubRes[0]][3][:]])
        # print('Queue is:', Queue)
        Visited.append(sub_q[SubRes[0]][3][:])
        # print('Visited is:', Visited)
        sub_q.remove(sub_q[SubRes[0]])
        # print('Visited is:', Visited)
        for Flp_bit in range(0, (len(BFS_Vec))):
            for i in range(len(BFS_Flp)):
                BFS_Flp[i] = Queue[len(Queue) - 1][3][i]
            Sub_Sal_list = []
            Sub_ID_list = []
            BFS_Flp[Flp_bit] = 1 - BFS_Flp[Flp_bit]
            BFS_Ctx = df2.loc[df2['Weapon'].isin(FirAtt_lst[np.where(BFS_Flp[0:len(FirAtt_lst)] == 1)].tolist()) & \
                              df2['State'].isin(SecAtt_lst[np.where(BFS_Flp[len(FirAtt_lst):len(FirAtt_lst) + len(SecAtt_lst)] == 1)].tolist()) & \
                              df2['AgencyType'].isin(ThrAtt_lst[np.where(BFS_Flp[len(FirAtt_lst) + len(SecAtt_lst):len(FirAtt_lst) + len(SecAtt_lst) + len(ThrAtt_lst)] == 1)].tolist())]
            if ((not any(np.array_equal(BFS_Flp[:], x[:]) for x in Visited)) and
                    (not any(np.array_equal(BFS_Flp[:], x[:]) for x in contexts)) and
                    (BFS_Ctx.shape[0] > 20)):
                for row in range(BFS_Ctx.shape[0]):
                    # VictimAge is column 4 and the ID is in column 0
                    Sub_Sal_list.append(BFS_Ctx.iloc[row, 4])
                    Sub_ID_list.append(BFS_Ctx.iloc[row, 0])
                Sub_Sal_arr = np.array(Sub_Sal_list)
                clf = LocalOutlierFactor(n_neighbors=20)
                Sub_Sal_outliers = clf.fit_predict(Sub_Sal_arr.reshape(-1, 1))
                for outlier_finder in range(0, len(Sub_ID_list)):
                    if ((Sub_Sal_outliers[outlier_finder] == -1) and
                            (Sub_ID_list[outlier_finder] == Queried_ID)):
                        Sub_Score = mp.exp(Epsilon * (BFS_Ctx.shape[0]))
                        sub_q.append([Flp_bit,
                                      Sub_Score,
                                      BFS_Ctx.shape[0],
                                      np.zeros(len(Org_Vec))])
                        for i in range(len(sub_q[len(sub_q) - 1][3])):
                            sub_q[len(sub_q) - 1][3][i] = BFS_Flp[i]
                        contexts.append(np.zeros(len(Org_Vec)))
                        for i in range(len(Org_Vec)):
                            contexts[len(contexts) - 1][i] = BFS_Flp[i]
    # Exponential mechanism on the visited nodes
    for i in range(len(Queue)):
        Queue[i][0] = i
    elements = [elem for elem in range(len(Queue))]
    probabilities = []
    for prob in Queue:
        probabilities.append(prob[1] / (sum([prob[1] for prob in Queue])))
    Res = np.random.choice(elements, 1, p=probabilities)
    Data_to_write.append(Queue[Res[0]][2] / max_ctx)
    return
target = 'Class'

X = data[columns]
Y = data[target]

print(X.shape)
print(Y.shape)

from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

state = 1

classifiers = {
    'Isolation Forest': IsolationForest(max_samples=len(X),
                                        contamination=outlier_fraction,
                                        random_state=state),
    'Local Outlier Factor': LocalOutlierFactor(n_neighbors=20,
                                               contamination=outlier_fraction)
}

n_outliers = len(fraud)

for i, (clf_name, clf) in enumerate(classifiers.items()):
    if clf_name == 'Local Outlier Factor':
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)

    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()

    print('{}: {}'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_pred))
def main(camera_FPS, camera_width, camera_height, inference_scale, threshold, num_threads):
    interpreter = None
    input_details = None
    output_details = None

    path = "pictures/"
    if not os.path.exists(path):
        os.mkdir(path)

    model_path = "OneClassAnomalyDetection-RaspberryPi3/DOC/model/"
    if os.path.exists(model_path):
        # LOF
        print("LOF model building...")
        x_train = np.loadtxt(model_path + "train.csv", delimiter=",")
        ms = MinMaxScaler()
        x_train = ms.fit_transform(x_train)
        # fit the LOF model
        clf = LocalOutlierFactor(n_neighbors=5)
        clf.fit(x_train)

        # DOC
        print("DOC Model loading...")
        interpreter = interpreter_wrapper.Interpreter(model_path="models/tensorflow/weights.tflite")
        interpreter.allocate_tensors()
        interpreter.set_num_threads(num_threads)
        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()
        print("loading finish")
    else:
        print("Nothing model folder")
        sys.exit(0)

    base_range = min(camera_width, camera_height)
    stretch_ratio = inference_scale / base_range
    resize_image_width = int(camera_width * stretch_ratio)
    resize_image_height = int(camera_height * stretch_ratio)
    if base_range == camera_height:
        crop_start_x = (resize_image_width - inference_scale) // 2
        crop_start_y = 0
    else:
        crop_start_x = 0
        crop_start_y = (resize_image_height - inference_scale) // 2
    crop_end_x = crop_start_x + inference_scale
    crop_end_y = crop_start_y + inference_scale

    fps = ""
    message = "Push [p] to take a picture"
    result = "Push [s] to start anomaly detection"
    flag_score = False
    picture_num = 1
    elapsedTime = 0
    score = 0
    score_mean = np.zeros(10)
    mean_NO = 0

    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FPS, camera_FPS)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, camera_width)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, camera_height)

    time.sleep(1)

    while cap.isOpened():
        t1 = time.time()

        ret, image = cap.read()
        if not ret:
            break

        image_copy = image.copy()

        # prediction
        if flag_score == True:
            prepimg = cv2.resize(image, (resize_image_width, resize_image_height))
            prepimg = prepimg[crop_start_y:crop_end_y, crop_start_x:crop_end_x]
            prepimg = np.array(prepimg).reshape((1, inference_scale, inference_scale, 3))
            prepimg = prepimg / 255

            interpreter.set_tensor(input_details[0]['index'], np.array(prepimg, dtype=np.float32))
            interpreter.invoke()
            outputs = interpreter.get_tensor(output_details[0]['index'])
            outputs = outputs.reshape((len(outputs), -1))
            outputs = ms.transform(outputs)
            # _decision_function was the private LOF scorer in older scikit-learn;
            # with novelty=True, newer releases expose decision_function/score_samples
            score = -clf._decision_function(outputs)

        # output score
        if flag_score == False:
            cv2.putText(image, result, (camera_width - 350, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
        else:
            score_mean[mean_NO] = score[0]
            mean_NO += 1
            if mean_NO == len(score_mean):
                mean_NO = 0
            if np.mean(score_mean) > threshold:
                # red if the score is high
                cv2.putText(image, "{:.1f} Score".format(np.mean(score_mean)),
                            (camera_width - 230, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 1, cv2.LINE_AA)
            else:
                # green if the score is low
                cv2.putText(image, "{:.1f} Score".format(np.mean(score_mean)),
                            (camera_width - 230, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA)

        # message
        cv2.putText(image, message, (camera_width - 285, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, cv2.LINE_AA)
        cv2.putText(image, fps, (camera_width - 164, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 1, cv2.LINE_AA)

        cv2.imshow("Result", image)

        # FPS
        elapsedTime = time.time() - t1
        fps = "{:.0f} FPS".format(1 / elapsedTime)

        # quit or calculate score or take a picture
        key = cv2.waitKey(1) & 0xFF
        if key == ord("q"):
            break
        if key == ord("p"):
            cv2.imwrite(path + str(picture_num) + ".jpg", image_copy)
            picture_num += 1
        if key == ord("s"):
            flag_score = True

    cv2.destroyAllWindows()
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction))
]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5], **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3], **blobs_params)[0],
    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] - np.array([0.5, 0.25])),
    14. * (np.random.RandomState(42).rand(n_samples, 2) - 0.5)
]
CONTAMINATION = 0.1

# try reading a csv
y_pred = []
filename = 'Outlier_multy_n={}_c={}.csv'.format(N_NEIGHBORS, CONTAMINATION)
try:
    out_frame = pd.read_csv(filename)
    y_pred = out_frame.Out
except FileNotFoundError:
    # file was not found, create and train new model, then print results to csv
    print('file ', filename, ' was not found :(')
    print('new file will be generated')
    print()
    print('create new classifier')
    outlier_clf = LocalOutlierFactor(n_neighbors=N_NEIGHBORS,
                                     contamination=CONTAMINATION)
    print("training model for contamination: ", CONTAMINATION, ', neighbors: ', N_NEIGHBORS)
    y_pred = outlier_clf.fit_predict(features)
    print("outliers detected, creating csv")
    # create new frame and print it to csv
    f = pd.DataFrame({'Out': y_pred})
    f.to_csv(filename)

# read data
train_og = pd.read_hdf("train.h5", "train")
all_data = pd.read_hdf("train.h5", "train").drop(['y'], axis=1)

# insert outlier-column
train_og.insert(0, column='outlier', value=y_pred)
if dataset_name == "SA": lb = LabelBinarizer() x1 = lb.fit_transform(X[:, 1].astype(str)) x2 = lb.fit_transform(X[:, 2].astype(str)) x3 = lb.fit_transform(X[:, 3].astype(str)) X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] y = (y != b"normal.").astype(int) if dataset_name == "http" or dataset_name == "smtp": y = (y != b"normal.").astype(int) X = X.astype(float) print("LocalOutlierFactor processing...") model = LocalOutlierFactor(n_neighbors=20) tstart = time() model.fit(X) fit_time = time() - tstart scoring = -model.negative_outlier_factor_ # the lower, the more normal fpr, tpr, thresholds = roc_curve(y, scoring) AUC = auc(fpr, tpr) plt.plot( fpr, tpr, lw=1, label="ROC for %s (area = %0.3f, train-time: %0.2fs)" % (dataset_name, AUC, fit_time), ) plt.xlim([-0.05, 1.05])
def anomaly_detection(testdata_name, rank_method_index, test_EVs_ts, test_MVs_ts):
    # Local Outlier Factor
    from sklearn.neighbors import LocalOutlierFactor
    from myFunctions import gen_dist_mat

    experimentName = '{}_LOF'.format(testdata_name)

    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]

    test_weather_ts = test_EVs_ts[0]  # test weather data
    # MV_index = 0  # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # The 20 closest weather data
            weather_group = rank_group(weather_ts, test_weather_ts[n])['Day'][:20]
            print('{} - group length:{}'.format(n, len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index, n].reshape((1, MVs_ts[MV_index, weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index, weather_group], test_data_point), axis=0)

            LOF = LocalOutlierFactor(n_neighbors=3, metric='precomputed')
            D = gen_dist_mat(NT_data)  # distance matrix
            # if the distance matrix is all zeros (all TS are identical), then skip this
            if len(D[D == 0]) == D.shape[0] * D.shape[1]:
                predictions.append('D=0')
                continue
            pred = LOF.fit_predict(D)
            predictions.append(str(pred[-1]))  # change to string to avoid comparison error in numpy later

            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index, c], color='steelblue', alpha=0.5, linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index, n], color='gold')
                # --------------------------------
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index, c], color=color_list[index], alpha=0.5, linestyle='dotted')
                #     plt.plot(test_MVs_ts[index, n], color='gold')
                # plt.show()
                # -------------------------------
                dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name, experimentName, MVs[MV_index])
                # check if directory exists
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
        MV_predictions.append(np.array(predictions))

    p_fault = np.empty(MV_predictions[0].shape, dtype=bool)   # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool)  # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool)    # lack of data
    p_fault[:] = False
    p_normal[:] = True  # False
    p_lack[:] = True    # False
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions == '-1')
        normal_with_identical = np.logical_or(predictions == '1', predictions == 'D=0')
        p_normal = np.logical_and(p_normal, normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions == 'len<')

    # the indices of ts samples which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]

    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index) / test_weather_ts.shape[0] * 100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index) / test_weather_ts.shape[0] * 100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index) / test_weather_ts.shape[0] * 100)
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)

    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name, experimentName)
    with open(dir_loc + '\\results.txt', 'w') as f:
        f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)

    # Isolation Forest
    from sklearn.ensemble import IsolationForest
    from myFunctions import gen_dist_mat

    experimentName = '{}_IsolationForest'.format(testdata_name)

    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]

    # test_weather_ts = test_EVs_ts[0]  # test weather data
    # MV_index = 0  # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # The 20 closest weather data
            weather_group = rank_group(weather_ts, test_weather_ts[n])['Day'][:20]
            print('{} - group length:{}'.format(n, len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index, n].reshape((1, MVs_ts[MV_index, weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index, weather_group], test_data_point), axis=0)
            D = gen_dist_mat(NT_data)  # distance matrix
            # if the distance matrix is all zeros (all TS are identical), then skip this
            if len(D[D == 0]) == D.shape[0] * D.shape[1]:
                predictions.append('D=0')
                continue
            IsoForest = IsolationForest()
            IsoForest.fit(NT_data)
            pred = IsoForest.predict(NT_data)
            predictions.append(str(pred[-1]))  # change to string to avoid comparison error in numpy later

            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index, c], color='steelblue', alpha=0.5, linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index, n], color='gold')
                # --------------------------------
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index, c], color=color_list[index], alpha=0.5, linestyle='dotted')
                #     plt.plot(test_MVs_ts[index, n], color='gold')
                # plt.show()
                # -------------------------------
                dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name, experimentName, MVs[MV_index])
                # check if directory exists
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
        MV_predictions.append(np.array(predictions))

    p_fault = np.empty(MV_predictions[0].shape, dtype=bool)   # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool)  # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool)    # lack of data
    p_fault[:] = False
    p_normal[:] = True  # False
    p_lack[:] = True    # False
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions == '-1')
        normal_with_identical = np.logical_or(predictions == '1', predictions == 'D=0')
        p_normal = np.logical_and(p_normal, normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions == 'len<')

    # the indices of ts samples which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]

    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index) / test_weather_ts.shape[0] * 100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index) / test_weather_ts.shape[0] * 100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index) / test_weather_ts.shape[0] * 100)
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)

    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name, experimentName)
    with open(dir_loc + '\\results.txt', 'w') as f:
        f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)