def detect_anomalies(kills):
    num_neighbors = min(KILL_NUM_NEIGHBORS, len(kills) - 1)
    contam = min(float(KILL_MAX_ANOM) / len(kills), 0.2)
    lof = LocalOutlierFactor(n_neighbors=num_neighbors, metric="manhattan",
                             contamination=contam)
    kill_vals = np.array([[k.value / 1e6] for k in kills])
    res = lof.fit_predict(kill_vals)
    return [kills[i] for i in np.nditer(np.where(res == -1))]
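# A minimal usage sketch (not part of the original source). It assumes the
# module-level names detect_anomalies() relies on: numpy/LocalOutlierFactor
# imports, the constants KILL_NUM_NEIGHBORS / KILL_MAX_ANOM (values below are
# illustrative), and a record type with a numeric `value` attribute.
from collections import namedtuple

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

KILL_NUM_NEIGHBORS = 20  # assumed default; the real value is defined elsewhere
KILL_MAX_ANOM = 5        # assumed default; the real value is defined elsewhere

Kill = namedtuple("Kill", "value")

kills = [Kill(value=v) for v in (1.2e6, 0.9e6, 1.1e6, 1.0e6, 250e6)]
print(detect_anomalies(kills))  # the 250M kill should be the flagged anomaly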
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

from storage import DataStorage

np.random.seed(42)

storage = DataStorage()
records = pd.read_csv(storage.augmented_dataset_file_name)
X = records[['average_cpu', 'average_memory']]
y = records['is_normal']

clf = LocalOutlierFactor(n_neighbors=5, contamination=0.1)
y_pred = clf.fit_predict(X)
X_scores = clf.negative_outlier_factor_

plt.title("Local Outlier Factor (LOF)")
plt.scatter(X.iloc[:, 0].values, X.iloc[:, 1].values, color='k', s=3.,
            label='Data points')
# larger circles mark points with stronger (more negative) outlier scores
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(X.iloc[:, 0].values, X.iloc[:, 1].values, s=1000 * radius,
            edgecolors='r', facecolors='none', label='Outlier scores')
plt.ylabel('Average memory usage')
plt.xlabel('Average CPU usage')
legend = plt.legend(loc='upper left')
legend.legendHandles[0]._sizes = [10]
# TerminalSN_le = preprocessing.LabelEncoder()
# preprocessed_features["TerminalSN"] = TerminalSN_le.fit_transform(preprocessed_features["TerminalSN"])
# EventID_le = preprocessing.LabelEncoder()
# preprocessed_features["EventID"] = EventID_le.fit_transform(preprocessed_features["EventID"])

# Split data set: not needed, as the approach is unsupervised
# train_features, test_features = train_test_split(preprocessed_features, test_size=0.2)

# Begin training
neigh = LocalOutlierFactor(n_neighbors=300, leaf_size=100, novelty=False,
                           algorithm="auto", contamination=0.01)
train_outliers = neigh.fit_predict(preprocessed_features)  # on training data

# Compile into a DataFrame for printing
outlier_result_df = pd.DataFrame()
# outlier_result_df["UserID"] = UserID_le.inverse_transform(preprocessed_features["UserID"])
# outlier_result_df["TerminalSN"] = TerminalSN_le.inverse_transform(preprocessed_features["TerminalSN"])
# outlier_result_df["Timestamps"] = raw_df["TIMESTAMPS"]
outlier_result_df["Time_Of_Day"] = preprocessed_features["Time_Of_Day"]
outlier_result_df["Outlier"] = train_outliers
print(outlier_result_df)

# Get percentage of outliers
outlier_percentage = len(outlier_result_df.loc[outlier_result_df["Outlier"] == -1]) / len(outlier_result_df)
print(outlier_percentage)
# Plot after corrections
locate = []
position = []
l = []
num = 0
sum = 0
line = write.get_line(dodification_path)
for i in range(1, len(lat) - 1):
    if abs(lat[i] - lat[i + 1]) + abs(lng[i] - lng[i + 1]) < 0.00037 and \
            abs(lat[i] - lat[i - 1]) + abs(lng[i] - lng[i - 1]) < 0.00037:
        locate.append([lat[i], lng[i], int(time.mktime(m_t[i])), i])
cls = LocalOutlierFactor(n_neighbors=190, contamination=c)
k = cls.fit_predict(locate)
for i in range(len(k)):
    if k[i] == 1:
        position.append(locate[i])
length = int(input("Enter the split length between anomalous points: "))
# length = 300 for AA00002
# length = 400 for AB00006
# length = 500 for AD00003
# length = 300 for AD00013
# length = 700 for AD00053
# length = 300 for AD00083
# length = 300 for AD00419
# length = 300 for AF00098
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
import data_generation as DG

data, data_error = DG.generate_random_data(100, 100, 10)
raw_data = data

# Add X to real data
# data = np.reshape(raw_data, (-1, 1))
data = (data - min(data)) / (max(data) - min(data))

# fit the model (LOF expects a 2-D array, hence the reshape)
clf = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
y_pred = clf.fit_predict(np.reshape(data, (-1, 1)))
raw_y_pred = clf.negative_outlier_factor_
# y_pred_outliers = y_pred[200:]

plt.plot(data)
plt.scatter(np.arange(len(raw_y_pred)), raw_y_pred, c='red')
# plt.scatter(np.arange(len(y_pred)), y_pred, c='red')

detected_outliers = sorted(range(len(raw_y_pred)), key=lambda i: raw_y_pred[i])[:len(data_error)]
error_percentage = np.mean(data_error != raw_data[sorted(detected_outliers)]) * 100
print("The error percentage: ", error_percentage, "%")
plt.show()
'''
local outlier factor
'''
labels = []
removalPairs = []  # [inliers, outliers]
cont = ['auto', 'auto', 'auto']  # contamination for each k, in order
outlierCount = []
ks = [28, 56, 112]

# run detection for each k; results are plotted later
for j in range(0, len(ks)):
    inliers = []
    k = ks[j]
    data = nitrateMg.copy()
    time, inliers = remove_missing_values(time, data)
    localFactorDetection = LocalOutlierFactor(n_neighbors=k, contamination=cont[j])
    pred = localFactorDetection.fit_predict(inliers.reshape(-1, 1))
    # 1 is an inlier, -1 is an outlier
    count = 0
    outliers = np.zeros(len(inliers)) + float('nan')
    # using pred as a mask, separate outliers from inliers
    for i in range(0, len(pred)):
        if pred[i] == -1:
            outliers[i] = inliers[i]
            inliers[i] = float('nan')
            count += 1
    outlierCount.append(count)
    removalPairs.append([inliers, outliers, time])
    labels.append('%d neighbours. Outliers: %d (%0.1f%%)' % (k, count, 100 * float(count) / len(data)))
            flierprops=flierprops, whiskerprops=whiskerprops, capprops=capprops)
# plt.savefig('Distribution.png', dpi=400, bbox_inches='tight')

scaledData = np.log(data)
ax = plt.figure(figsize=(8, 5)).gca(title='Log Sales Distribution',
                                    xlabel='Product', ylabel='Log Sales')
sns.violinplot(data=scaledData)
# plt.savefig('Violin.png', dpi=400, bbox_inches='tight')

# remove outliers
outliers = LocalOutlierFactor(n_neighbors=20, contamination=.05)
scaledData['inlier'] = outliers.fit_predict(scaledData)
cleanData = scaledData.loc[scaledData.inlier == 1, products]
# sns.pairplot(cleanData, plot_kws={'s': 5})
# plt.tight_layout()

sns.clustermap(cleanData.corr(), annot=True, fmt='.1%', center=0.0,
               vmin=-1, vmax=1, cmap=sns.diverging_palette(250, 10, n=20))
# plt.savefig('Heatmap.png', dpi=400, bbox_inches='tight')

# run PCA
    out_df = out_df.append(working, ignore_index=True)

out_df = out_df[blank_dict.keys()]
out_df.to_csv(out_csv, index=False)

if visualize:
    print('Visualizing')
    brain_vol_df = pd.read_csv(brain_vol_csv)
    collated_csv = os.path.join(out_folder, 'collated.csv')
    clean_table = pd.read_csv(collated_csv, index_col='mr_id')
    clean_table = clean_table[clean_table['exclude'] != 1]

    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.06)
    y_pred = clf.fit_predict(clean_table)
    # y_pred_unsort = y_pred.copy()
    x_scores = clf.negative_outlier_factor_
    # x_scores_unsort = x_scores.copy()

    clean_table['outlier'] = y_pred
    clean_table['normal_control'] = [
        all([i, not j])
        for i, j in zip(clean_table['control'], clean_table['sci'])
    ]
    clean_table['sci_control'] = [
        all([i, j])
        for i, j in zip(clean_table['control'], clean_table['sci'])
    ]
    clean_table['normal_scd'] = [
        all([i, not j])
        for i, j in zip(clean_table['scd'], clean_table['sci'])
fig2 = pht.plot_components(forecast)
fig.savefig("{}/TimeSeries_fbProphet.png".format(foldername),
            bbox_inches='tight', dpi=100)
fig2.savefig("{}/TimeSeries_fbProphet_components.png".format(foldername),
             bbox_inches='tight', dpi=100)

################################################################################
# Anomaly Detection
################################################################################
# Perform Local Outlier detection
plt.clf()
localOutlier = LocalOutlierFactor()
local_pred = localOutlier.fit_predict(
    daily_transits["Transits"].values.reshape(-1, 1))
x_range = range(len(daily_transits["Transits"]))
plt.scatter(x_range, daily_transits["Transits"], c=local_pred)
plt.xlabel("Day")
plt.ylabel("Relative Transit Uses")
plt.gcf().set_size_inches((16.0, 8.0), forward=False)
plt.savefig("{}/AnomalyDetection_LocalOutlier.png".format(foldername),
            bbox_inches='tight', dpi=100)

# perform K nearest Neighbor clustering
knn = 20
temp = daily_transits
temp = temp.drop(columns=["Date"])
try:
    nbrs = NearestNeighbors(
def outlier_lof(df):
    lof = LocalOutlierFactor(n_jobs=-1)
    lof_res = lof.fit_predict(df)
    outliers_lof = [i for i in range(len(lof_res)) if lof_res[i] == -1]
    return outliers_lof
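# A minimal usage sketch (not in the original source): outlier_lof() returns
# positional indices of the rows LOF flags, so removing them needs index-based
# selection. The toy DataFrame below is purely illustrative.
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.normal(size=(200, 2)), columns=['x', 'y'])
demo.iloc[0] = [10.0, 10.0]  # inject an obvious outlier

bad_rows = outlier_lof(demo)
cleaned = demo.drop(demo.index[bad_rows])
print(len(bad_rows), 'rows flagged')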
def pre_select_data(selection, norm): Trainset = np.loadtxt('Trainset.csv', delimiter=',') (train_num, b) = Trainset.shape feature = b - 1 Test = np.loadtxt('Test.csv', delimiter=',') test_num = Test.shape[0] Train_label = Trainset[:, feature] Train_info = Trainset[:, 0:feature] Test_info = Test[:, 0:feature] if selection == 1: fs = mutual_info_classif(X=Train_info, y=Train_label) count3 = 0 for i in range(0, feature): if fs[i] == 0: print(i) count3 = count3 + 1 data_new = np.zeros((train_num, b - count3)) test_new = np.zeros((test_num, b - count3)) count4 = 0 for i in range(0, feature): if fs[i] != 0: data_new[:, count4] = Trainset[:, i] test_new[:, count4] = Test[:, i] count4 = count4 + 1 feature = count4 data_new[:, feature] = Train_label test_new[:, feature] = Test[:, b - 1] print('feature = ', feature) if selection == 2: clf = ExtraTreesClassifier() clf = clf.fit(Train_info, Train_label) model = SelectFromModel(clf, prefit=True) Train_info = model.transform(Train_info) Test_info = model.transform(Test_info) feature = Train_info.shape[1] data_new = np.zeros((train_num, feature + 1)) test_new = np.zeros((test_num, feature + 1)) data_new[:, 0:feature] = Train_info data_new[:, feature] = Train_label test_new[:, 0:feature] = Test_info test_new[:, feature] = Test[:, b - 1] print('feature = ', feature) if selection == 3: (us, fs) = f_classif(X=Train_info, y=Train_label) count3 = 0 for i in range(0, feature): if fs[i] >= 0.05: print(i) count3 = count3 + 1 data_new = np.zeros((train_num, b - count3)) test_new = np.zeros((test_num, b - count3)) count4 = 0 for i in range(0, feature): if fs[i] < 0.05: data_new[:, count4] = Trainset[:, i] test_new[:, count4] = Test[:, i] count4 = count4 + 1 feature = count4 data_new[:, feature] = Train_label test_new[:, feature] = Test[:, b - 1] np.savetxt('dd.csv', data_new, delimiter=',') print('feature = ', feature) if selection == 0: feature = b - 1 data_new = Trainset test_new = Test Train_data = data_new[:, 0:feature] Train_label = data_new[:, feature] Test_data = test_new[:, 0:feature] Test_label = test_new[:, feature] np.savetxt('Bayes_label.csv', Train_label, delimiter=',') np.savetxt('Bayes.csv', Train_data, delimiter=',') np.savetxt('BayesTest.csv', Test_data, delimiter=',') np.savetxt('Bayes_TL.csv', Test_label, delimiter=',') if norm == 1: scaler = StandardScaler() scaler.fit(Train_data) Train_data = scaler.transform(Train_data) Test_data = scaler.transform(Test_data) if norm == 2: scaler = MinMaxScaler() scaler.fit(Train_data) Train_data = scaler.transform(Train_data) Test_data = scaler.transform(Test_data) data_new[:, 0:feature] = Train_data data_new[:, feature] = Train_label test_new[:, 0:feature] = Test_data test_new[:, feature] = Test_label np.savetxt('datanewtrain.csv', data_new, delimiter=',') np.savetxt('datanewtest.csv', test_new, delimiter=',') #balance the data label1 = 0 label2 = 0 train_num = data_new.shape[0] for j in range(1, train_num): if data_new[j, feature] == 0: label1 = label1 + 1 else: label2 = label2 + 1 ratio = int(np.ceil(label1 / label2)) count2 = 0 B_Trainset = np.zeros(((ratio - 1) * label2, feature)) for i in range(0, train_num): if data_new[i, feature] == 1: for c in range(0, ratio - 1): B_Trainset[count2 + c, :] = data_new[i, 0:feature] count2 = count2 + ratio - 2 B_Trainset = B_Trainset[[i for i, x in enumerate(B_Trainset) if x.any()]] cut = B_Trainset.shape[0] dev = [] for e in range(0, feature): dev.append(np.std(B_Trainset[:, e])) noisy = np.zeros((cut, feature)) for b in range(0, feature): for c in range(0, cut): 
noisy[c, b] = np.random.uniform(-0.1 * dev[b], 0.1 * dev[b]) B_Trainset = B_Trainset + noisy B_data = np.zeros((cut, feature + 1)) B_data[:, 0:feature] = B_Trainset B_data[:, feature] = 1 datab = np.vstack((data_new, B_data)) # shuffle the data datab = shuffle(datab) TL = datab[:, feature] TD = datab[:, 0:feature] # Outlier Detection train_num = TD.shape[0] LOF = LocalOutlierFactor(n_neighbors=80) Outlier = LOF.fit_predict(TD, TL) Train = np.zeros((train_num, feature)) Tlabel = np.zeros(train_num) count3 = 0 for c in range(0, train_num): if Outlier[c] == 1: Train[count3, :] = TD[c, :] Tlabel[count3] = TL[c] count3 = count3 + 1 Train = Train[[i for i, x in enumerate(Trainset) if x.any()]] Tlabel = Tlabel[[i for i, x in enumerate(Trainset) if x.any()]] np.savetxt('B_Trainset_data.csv', Train, delimiter=',') np.savetxt('B_Trainset_label.csv', Tlabel, delimiter=',') np.savetxt('Test_data.csv', Test_data, delimiter=',') np.savetxt('Test_label.csv', Test_label, delimiter=',')
# C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
# C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
# C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
# C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
# C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
# coords = np.vstack((C1, C2, C3, C4, C5, C6))
##

from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

LOF = LocalOutlierFactor(n_neighbors=20)
iForest = IsolationForest()

LOF.fit(coords)
lof_labels = LOF.fit_predict(coords)
iForest.fit(coords)
iforest_labels = iForest.predict(coords)

lof_scores = LOF.negative_outlier_factor_
LOF.threshold_
if_scores = iForest.decision_function(coords)
iForest.threshold_

# plot normalized scores
plt.figure()
plt.plot((lof_scores - np.mean(lof_scores)) / np.std(lof_scores))
plt.plot((if_scores - np.mean(if_scores)) / np.std(if_scores))
plt.hlines((LOF.threshold_ - np.mean(lof_scores)) / np.std(lof_scores), xmin=0,
def LOF(data_file_path, k_list=[5, 20, 50]): """ Use Local Outlier Facter algorithm to find outliers on the specific file :param data_file_path: The specific file path for input data :param k_list: (Optional) The list of neighbor numbers for LOF algorithm, default is k=5,20,50 :return: None """ # Make csv data to dataframe df = pd.read_csv(data_file_path, encoding='latin-1') # Print sample data in dataframe print("========== Sample data ==========") print(df.iloc[:10]) print() train_df = df # ===== Preprocessing ===== # # - Zillow file # # Remove zpid column (Zillow ID) if 'zpid' in list(train_df): train_df = train_df.drop(['zpid'], axis=1) # Remove latitude and longitude columns # if 'latitude' in list(train_df): # train_df = train_df.drop(['latitude'], axis=1) # if 'longitude' in list(train_df): # train_df = train_df.drop(['longitude'], axis=1) # Remove countryid column because all are same if 'countryid' in list(train_df) and len(set(train_df['countryid'])) == 1: train_df = train_df.drop(['countryid'], axis=1) # Convert some columns to be categorical such as cityid, countryid, zipcpde if 'cityid' in list(train_df): train_df['cityid'] = train_df['cityid'].apply( lambda x: str(x) + "_categorized") if 'zipcpde' in list(train_df): train_df['zipcpde'] = train_df['zipcpde'].apply( lambda x: str(x) + "_categorized") # - Other files # # Get rid of area_type, area_id: These features should not be used for using LOF because they are unique if 'area_id' in list(train_df) and 'area_type' in list(train_df): train_df = train_df.drop(['area_id', 'area_type'], axis=1) # Combine City and State to be one column # Or remove them because they are unique (or almost unique in some files) if 'City' in list(train_df) and 'State' in list(train_df): # train_df["CityState"] = train_df[['City', 'State']].apply(lambda x: ''.join(x), axis=1) train_df = train_df.drop(['City', 'State'], axis=1) # - All files # # Get rid of area_type: it is same for all rows train_df = train_df.loc[:, ~train_df.columns.str.contains( '^Unnamed')] # Remove Unnamed columns if there is # Do one hot to convert categorical data to numeric data train_df = pd.get_dummies(train_df) print("========== Sample train data ==========") print(train_df.iloc[:10]) print() # Normalization to make it easier to illustrate min_max_scaler = preprocessing.MinMaxScaler() train_data = min_max_scaler.fit_transform(train_df) # Dataframe after get rid of some columns and numerize any category columns print("========== Normalized train data ==========") print(train_data[:10]) print() # ===== Local Outlier Factors ===== # # # Set a file for outlier summary SCRIPT_DIR = os.path.abspath(os.path.dirname(sys.argv[0])) data_file_name = data_file_path.split('/')[len(data_file_path.split('/')) - 1] summary_file_name = SCRIPT_DIR + '/' + data_file_name + '_outlier_summary.csv' f = open(summary_file_name, 'w') # Print header header = "k-neighbors,total, outliers, non-outliers, % outliers" f.write(header + '\n') # Try LOF with different k (default = 5, 20, 50) result_list = {} for k in k_list: # Fit the model clf = LocalOutlierFactor(n_neighbors=k) y_pred = clf.fit_predict(train_data) result_list["k=" + str(k)] = y_pred # Count number of outliers outlier_number = 0 for y in y_pred: if y == -1: outlier_number += 1 # Print sample prediction results print("========== Prediction results with k=%i ==========" % k) print("Total:", len(y_pred)) print("Number of outliers:", outlier_number) print("Number of non-outliers:", (len(y_pred) - outlier_number)) print( "Percentage of 
outliers:", "{0:.2f}%".format( outlier_number / (len(y_pred) - outlier_number) * 100)) print("Oulier result:", y_pred) print() # Write summary to file line = ",".join([ str(k), str(len(y_pred)), str(outlier_number), str(len(y_pred) - outlier_number), "{0:.2f}%".format( outlier_number / (len(y_pred) - outlier_number) * 100) ]) f.write(line + '\n') f.close() # Set another file for outlier results result_file_name = SCRIPT_DIR + '/' + data_file_name + '_outlier_results.csv' pd.DataFrame(result_list).to_csv(result_file_name, index=False)
# Standardize each attribute (column): subtract its mean and divide by its
# standard deviation, so every column is centered around 0 with unit variance.
Y = np.array(X)
Y_scaled = preprocessing.scale(Y)
print(Y_scaled)

# DBSCAN clustering for anomaly detection
# defaults: eps=0.5, min_samples=5
# clf = DBSCAN(eps=0.8, metric='euclidean', algorithm='auto')

# LOF defaults: n_neighbors=20, contamination=0.1
clf = LocalOutlierFactor(n_neighbors=10, contamination=0.08)

# Isolation Forest, defaults n_estimators=100, contamination=0.1
# (this call currently raises an error; there is a bug left to fix)
# clf = IsolationForest(n_estimators=100, contamination=0.04)

y_pred = clf.fit_predict(Y_scaled)
# y_pred = clf.predict(Y_scaled)
print(clf)
print(y_pred)

x = [n[0] for n in X]
y = [n[1] for n in X]

# Visualization
plt.scatter(x, y, c=y_pred, marker='o')
plt.title("LOF-Babymother Data")
plt.xlabel("score")
plt.ylabel("reduced weight")
plt.legend(["user"])
plt.show()

data['聚类标签'] = y_pred  # cluster-label column
# data.to_excel('/Users/martin_yan/Desktop/clustering5.22-6.11(3).xlsx', index=False, encoding="utf_8_sig")
    return model


sen = label_sentences(cb_healthcare)
model = train_doc2vec_model(sen)

vector_list = []
for i in range(len(cb_healthcare)):
    vector_list.append(model.docvecs[i])

# X, y = vector_list[0:1200], vector_list[1201:1226]
df_x = pd.DataFrame(X)

clf_lof = LocalOutlierFactor(n_neighbors=20, metric='euclidean')
y_pred = clf_lof.fit_predict(df_x)
X_scores = -(clf_lof.negative_outlier_factor_)

#%%
clf = LocalOutlierFactor(n_neighbors=80, metric='euclidean')
y_pred = clf.fit_predict(df_x)
y_pred_score = clf._decision_function(df_x)
scores_doc = -(clf.negative_outlier_factor_)

lofScores_10 = pd.DataFrame(scores_doc, columns=['LOF_scores'])
# lofScores_10['N/M'] = patent['기업이름']
sort = lofScores_10.sort_values(["LOF_scores"], ascending=[False])
sorts2 = np.array(sort)
#%%
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

np.random.seed(42)

# Generate train data
X = 0.3 * np.random.randn(100, 2)
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X + 2, X - 2, X_outliers]

# fit the model
clf = LocalOutlierFactor(n_neighbors=20)
y_pred = clf.fit_predict(X)
y_pred_outliers = y_pred[200:]

# plot the level sets of the decision function
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Local Outlier Factor (LOF)")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
a = plt.scatter(X[:200, 0], X[:200, 1], c='white')
b = plt.scatter(X[200:, 0], X[200:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
y_pred = []
filename = 'Outlier_multy_n={}_c={}.csv'.format(N_NEIGHBORS, CONTAMINATION)
try:
    out_frame = pd.read_csv(filename)
    y_pred = out_frame.Out
except FileNotFoundError:
    # file was not found: create and train a new model, then write the results to csv
    print('file ', filename, ' was not found :(')
    print('new file will be generated')
    print()
    print('create new classifier')
    outlier_clf = LocalOutlierFactor(n_neighbors=N_NEIGHBORS,
                                     contamination=CONTAMINATION)
    print("training model for contamination: ", CONTAMINATION, ', neighbors: ', N_NEIGHBORS)
    y_pred = outlier_clf.fit_predict(features)
    print("outliers detected, creating csv")

    # create a new frame and write it to csv
    f = pd.DataFrame({'Out': y_pred})
    f.to_csv(filename)

# read data
train_og = pd.read_hdf("train.h5", "train")
all_data = pd.read_hdf("train.h5", "train").drop(['y'], axis=1)

# insert outlier column
train_og.insert(0, column='outlier', value=y_pred)

# NOTE: split here
train_og, X_test = train_test_split(train_og, test_size=0.33)
def detect_conceptually_irrelevant_name(self): identifiers_with_vector = [identifier for identifier in self.identifiers if identifier.vector is not None] identifiers_with_vector_original = deepcopy(identifiers_with_vector) vectors = [identifier.vector for identifier in self.identifiers if identifier.vector is not None] vectors_original = deepcopy(vectors) clf = LocalOutlierFactor(n_neighbors=int(len(vectors_original) / 3.), ) y_pred = clf.fit_predict(vectors_original) lof_scores = clf.negative_outlier_factor_ # Normalization lof_scores_normalized_original = (lof_scores.max() - lof_scores) / (lof_scores.max() - lof_scores.min()) print('lof_scores_normalized_original', lof_scores_normalized_original) print('average naming debt: "{0}"'.format(np.mean(lof_scores_normalized_original))) Visualized.draw_names_plot(identifiers=identifiers_with_vector_original, lof_scores_normalized=lof_scores_normalized_original) if len(vectors_original) < 10: return vectors_avg = np.mean(vectors) iteration = 0 id_out_list = list() id_in_list = list() flag = True while flag and iteration < 100: print('-' * 75) print('iteration "{}" ...'.format(iteration)) clf = LocalOutlierFactor(n_neighbors=int(len(vectors) / 3.), ) y_pred = clf.fit_predict(vectors) lof_scores = clf.negative_outlier_factor_ # Normalization lof_scores_normalized = (lof_scores.max() - lof_scores) / (lof_scores.max() - lof_scores.min()) for i, identifier in enumerate(identifiers_with_vector): identifier.local_outlier_factor = lof_scores_normalized[i] identifiers_with_vector_sorted = sorted(identifiers_with_vector, key=lambda k: k.local_outlier_factor, reverse=True) print('Average naming debt is "{0}"'.format(np.mean(lof_scores_normalized))) id_out_list.append(deepcopy(identifiers_with_vector_sorted[0])) print('The identifier "{0}" should be renamed'.format(identifiers_with_vector_sorted[0])) # Avg version remained_identifiers_vectors = [identifier.vector for identifier in identifiers_with_vector_sorted[1:]] # remained_identifiers_vectors_avg = np.mean(remained_identifiers_vectors, axis=0) # print('remained_identifiers_vectors_avg', remained_identifiers_vectors_avg) # recommended_names = self.model.wv.similar_by_vector(remained_identifiers_vectors_avg, topn=10) # distance_to_neighbor = [distance.cosine(id_out_list[-1].vector, vector) # for vector in remained_identifiers_vectors] # for i, dist in enumerate(distance_to_neighbor): # if dist == 0: # distance_to_neighbor[i] = max(distance_to_neighbor) # nearest_neighbor_index = distance_to_neighbor.index(min(distance_to_neighbor)) nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(remained_identifiers_vectors) distances, indices = nbrs.kneighbors([id_out_list[-1].vector], n_neighbors=5) print('I ', indices[0]) random.shuffle(indices[0]) nearest_neighbor_indice = indices[0][0] recommended_names = self.model.wv.similar_by_vector(remained_identifiers_vectors[nearest_neighbor_indice], topn=len(self.identifiers) + 1) print('similar by vector,', recommended_names) # Check post conditions # Check if the name is verb for function or noun for attribute recommended_name = recommended_names[0][0] rank = 1 while recommended_name in [identifier.id_name for identifier in identifiers_with_vector]\ or len(recommended_name) < 4\ or recommended_name in ['char', 'int', 'float', 'double', 'string', 'class']: recommended_name = recommended_names[rank][0] rank += 1 for identifier in identifiers_with_vector: if identifier.unique_number == id_out_list[-1].unique_number: identifier.id_name = recommended_name 
identifier.parts = identifier.get_identifier_parts() identifier.vector = identifier.get_single_vector_for_identifier(model=self.model) id_in_list.append(deepcopy(identifier)) print('##### id changed', identifier.unique_number) break vectors = [identifier.vector for identifier in identifiers_with_vector] vectors_avg_new = np.mean(vectors, axis=0) d = distance.cosine(vectors_avg, vectors_avg_new) print('distance', d) print('improvement', ) if d <= 0.05: flag = False vectors_avg = vectors_avg_new iteration += 1 print('Number of iterations: "{0}"'.format(iteration)) print('To be renamed ids: "{0}"'.format([identifier.id_name for identifier in id_out_list])) print('Recommended names ids: "{0}"'.format([identifier.id_name for identifier in id_in_list])) Visualized.draw_names_plot(identifiers=identifiers_with_vector, lof_scores_normalized=lof_scores_normalized) print('Final IDs', [identifier.id_name for identifier in identifiers_with_vector])
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA

data = loadmat('ex8data2.mat')
X = data['X']

e1 = EllipticEnvelope()
labels1 = e1.fit_predict(X)
e2 = LocalOutlierFactor()
labels2 = e2.fit_predict(X)

n_components = 3
pca1 = PCA(n_components=n_components)
Xproj = pca1.fit_transform(X)

plt.figure()
plt.clf()
ax = plt.axes(projection='3d')
# ax.scatter(image_array[:, 0], image_array[:, 1], image_array[:, 2], c=labels, cmap='coolwarm', marker=',')
ax.scatter(Xproj[:, 0], Xproj[:, 1], Xproj[:, 2], marker='o', c=labels1)
plt.show()
def anomaly_detection(testdata_name,rank_method_index,test_EVs_ts,test_MVs_ts): # Local Outlier Factor from sklearn.neighbors import LocalOutlierFactor from myFunctions import gen_dist_mat # experimentName = '{}_LOF'.format(testdata_name) # Choose ranking method # rank_group = rank_high_low rank_group = rank_methods[rank_method_index] rank_method_name = rank_methods_names[rank_method_index] test_weather_ts = test_EVs_ts[0] # test weather data # MV_index = 0 # MV we are examining MV_predictions = [] for MV_index in range(len(MVs)): predictions = [] for n in range(test_weather_ts.shape[0]): # The 20th closest weather data weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20] print('{} - group length:{}'.format(n,len(weather_group))) if len(weather_group) < 10: predictions.append('len<') continue # reshape to row array to concatenate test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1])) # concatenated matrix of training data and the test data sample NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0) LOF = LocalOutlierFactor(n_neighbors = 3,metric='precomputed') D = gen_dist_mat(NT_data) # distance matrix # if distance matrix are all zeros(all TS are identical), then skip this if len(D[D == 0]) == D.shape[0]*D.shape[1]: predictions.append('D=0') continue pred = LOF.fit_predict(D) predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later # if detected as outlier, save plot of MVs if pred[-1] == -1: plt.figure() # # draw only the current MV----- for c in weather_group: plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted') plt.plot(test_MVs_ts[MV_index,n],color='gold') #-------------------------------- # # draw for all MVs------------- # for index in range(MVs_ts.shape[0]): # for c in combination: # plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted') # plt.plot(test_MVs_ts[index,n],color='gold') # plt.show() # ------------------------------- dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index]) # check directory if exists if not os.path.exists(dir_loc): os.makedirs(dir_loc) # save faulty plot plt.savefig(dir_loc + '\\n{}.png'.format(n)) plt.close() MV_predictions.append(np.array(predictions)) p_fault = np.empty(MV_predictions[0].shape,dtype = np.bool) # faulty p_normal = np.empty(MV_predictions[0].shape,dtype = np.bool) # normal p_lack = np.empty(MV_predictions[0].shape,dtype = np.bool) # lack of data p_fault[:] = False p_normal[:] = True # False p_lack[:] = True # False for predictions in MV_predictions: p_fault = np.logical_or(p_fault, predictions=='-1') normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0') p_normal = np.logical_and(p_normal,normal_with_identical) p_lack = np.logical_and(p_lack, predictions=='len<') # the indices of ts sample which are considered faulty fault_index = np.arange(len(p_fault))[p_fault] normal_index = np.arange(len(p_normal))[p_normal] lack_index = np.arange(len(p_lack))[p_lack] # print results: fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100) nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100) ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100) print(fd_rate) print(nd_rate) print(ld_rate) # Save results: dir_loc = 
r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName) with open(dir_loc+'\\results.txt','w') as f: f.write(fd_rate + '\n' + nd_rate+ '\n' + ld_rate) # Isolation Forest from sklearn.ensemble import IsolationForest from myFunctions import gen_dist_mat # experimentName = '{}_IsolationForest'.format(testdata_name) # Choose ranking method # rank_group = rank_high_low rank_group = rank_methods[rank_method_index] rank_method_name = rank_methods_names[rank_method_index] # test_weather_ts = test_EVs_ts[0] # test weather data # MV_index = 0 # MV we are examining MV_predictions = [] for MV_index in range(len(MVs)): predictions = [] for n in range(test_weather_ts.shape[0]): # The 20th closest weather data weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20] print('{} - group length:{}'.format(n,len(weather_group))) if len(weather_group) < 10: predictions.append('len<') continue # reshape to row array to concatenate test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1])) # concatenated matrix of training data and the test data sample NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0) D = gen_dist_mat(NT_data) # distance matrix # if distance matrix are all zeros(all TS are identical), then skip this if len(D[D == 0]) == D.shape[0]*D.shape[1]: predictions.append('D=0') continue IsoForest = IsolationForest() IsoForest.fit(NT_data) pred = IsoForest.predict(NT_data) predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later # if detected as outlier, save plot of MVs if pred[-1] == -1: plt.figure() # # draw only the current MV----- for c in weather_group: plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted') plt.plot(test_MVs_ts[MV_index,n],color='gold') #-------------------------------- # # draw for all MVs------------- # for index in range(MVs_ts.shape[0]): # for c in combination: # plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted') # plt.plot(test_MVs_ts[index,n],color='gold') # plt.show() # ------------------------------- dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index]) # check directory if exists if not os.path.exists(dir_loc): os.makedirs(dir_loc) # save faulty plot plt.savefig(dir_loc + '\\n{}.png'.format(n)) plt.close() MV_predictions.append(np.array(predictions)) p_fault = np.empty(MV_predictions[0].shape,dtype = np.bool) # faulty p_normal = np.empty(MV_predictions[0].shape,dtype = np.bool) # normal p_lack = np.empty(MV_predictions[0].shape,dtype = np.bool) # lack of data p_fault[:] = False p_normal[:] = True # False p_lack[:] = True # False for predictions in MV_predictions: p_fault = np.logical_or(p_fault, predictions=='-1') normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0') p_normal = np.logical_and(p_normal,normal_with_identical) p_lack = np.logical_and(p_lack, predictions=='len<') # the indices of ts sample which are considered faulty fault_index = np.arange(len(p_fault))[p_fault] normal_index = np.arange(len(p_normal))[p_normal] lack_index = np.arange(len(p_lack))[p_lack] # print results: fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100) nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100) ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100) print(fd_rate) 
print(nd_rate)
print(ld_rate)

# Save results:
dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name, experimentName)
with open(dir_loc + '\\results.txt', 'w') as f:
    f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate)
class LOFStep():
    def __init__(self, include_y=True, kwargs={'contamination': 'auto'}):
        """
        Uses the local outlier factor to detect and remove outliers.
        Uses sklearn's LocalOutlierFactor class.

        Parameters
        ----------
        include_y (bool, default=True) : whether or not to include the y data
            when fitting the outlier detector
        kwargs (dict, default={'contamination': 'auto'}) : arguments to pass to
            sklearn's LocalOutlierFactor class initialization
        """
        self.description = "Local Outlier Factor"
        self.include_y = include_y
        self.kwargs = kwargs
        self.fitted = None
        self.changes_num_samples = True

    def fit(self, X, y=None):
        """
        Fits the outlier detection on the given data

        Parameters
        ----------
        X (DataFrame) : training data
        y (DataFrame, default=None) : target values (if needed)

        Returns
        -------
        (DataFrame, DataFrame) : A tuple of the transformed DataFrames, the
        first being the X data and the second being the y data
        """
        self.fitted = LocalOutlierFactor(**self.kwargs)
        return self.transform(X, y=y)

    def transform(self, X, y=None):
        """
        Transforms the given data using the previously fitted outlier
        detection method

        Parameters
        ----------
        X (DataFrame) : training data
        y (DataFrame, default=None) : target values (if needed)

        Returns
        -------
        (DataFrame, DataFrame) : A tuple of the transformed DataFrames, the
        first being the X data and the second being the y data
        """
        if self.fitted is None:
            raise TransformError

        outlier_labels = self.fitted.fit_predict(X, y)

        # Remove outliers from data
        for i in range(outlier_labels.shape[0]):
            if outlier_labels[i] == -1:
                X = X.drop(index=i)
                if y is not None:
                    y = y.drop(index=i)

        if y is None:
            return X.reset_index(drop=True)

        y = y.reset_index(drop=True)
        return X.reset_index(drop=True), y
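# A minimal usage sketch (not from the original source): LOFStep.fit() both
# fits LOF and immediately drops the rows it flags, returning the reduced X
# (and y, when given). The demo frames below are purely illustrative.
import pandas as pd

X_demo = pd.DataFrame({'a': [1.0, 1.1, 0.9, 1.05, 50.0],
                       'b': [2.0, 2.1, 1.9, 2.05, -40.0]})
y_demo = pd.DataFrame({'target': [0, 0, 1, 1, 0]})

step = LOFStep(kwargs={'n_neighbors': 2, 'contamination': 'auto'})
X_clean, y_clean = step.fit(X_demo, y_demo)
print(len(X_demo) - len(X_clean), 'rows removed')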
    'n_neighbors': 10,
    'n_clusters': 3
}

# connectivity matrix for structured Ward
# connectivity = kneighbors_graph(
#     array, n_neighbors=params['n_neighbors'], include_self=False)
# make connectivity symmetric
# connectivity = 0.5 * (connectivity + connectivity.T)

# algorithms
# two_means = cluster.MiniBatchKMeans(n_clusters=5)
clf = LocalOutlierFactor(n_neighbors=10)
y_pred = clf.fit_predict(array)
outliers = y_pred[200:]
vals = clf.negative_outlier_factor_
# print(y_pred)
# print(vals)

dist = list()
for each in array:
    dist.append(np.power(each[0], 2) + np.power(each[1], 2))

npList = np.column_stack(
    (df.getData().index.values, df.getData().iloc[:, 0:], dist))
print(npList)
def _LocalOutlierFactor(X):
    # use roughly 20% of the sample count as the neighborhood size
    n = int(round(X.shape[0] * 0.2))
    clf = LocalOutlierFactor(n_neighbors=n)
    return clf.fit_predict(X)
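# A minimal usage sketch (not from the original source): the helper sizes the
# neighborhood at ~20% of the samples, so it needs enough rows for
# n_neighbors to be at least 1. The demo array is purely illustrative.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

X_demo = np.vstack([np.random.RandomState(0).normal(size=(50, 2)),
                    [[8.0, 8.0]]])       # 50 inliers plus one far-away point
labels = _LocalOutlierFactor(X_demo)     # array of +1 (inlier) / -1 (outlier)
print((labels == -1).sum(), 'points flagged')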
df2['Salary Paid'] = df2['Salary Paid'].apply(lambda x: x.split('.')[0].strip()).replace({'\$': '', ',': ''}, regex=True)

FirAtt_lst = df2['Job Title'].unique()
SecAtt_lst = df2['Employer'].unique()
ThrAtt_lst = df2['Calendar Year'].unique()

################################### Forming a context #######################################
Orgn_Ctx = df2.loc[df2['Job Title'].isin([FirAtt_lst[0], FirAtt_lst[1], FirAtt_lst[2], FirAtt_lst[3], FirAtt_lst[4]]) & \
                   df2['Employer'].isin([SecAtt_lst[0], SecAtt_lst[1], SecAtt_lst[2], SecAtt_lst[3], SecAtt_lst[4], SecAtt_lst[5]]) & \
                   df2['Calendar Year'].isin([ThrAtt_lst[0], ThrAtt_lst[1], ThrAtt_lst[2], ThrAtt_lst[3], ThrAtt_lst[4]])]

####################### Finding an outlier in the selected context #######################
clf = LocalOutlierFactor(n_neighbors=20)
Sal_outliers = clf.fit_predict(Orgn_Ctx['Salary Paid'].values.reshape(-1, 1))
Queried_ID = Orgn_Ctx.iloc[Sal_outliers.argmin()][1]
print '\n\n Outlier\'s ID in the selected context is: ', Queried_ID

################# Exploring contexts larger than the original to find the maximal #################
FirAtt_Sprset = sum(map(lambda r: list(combinations(FirAtt_lst[5:], r)), range(1, len(FirAtt_lst[5:]) + 1)), [])
SecAtt_Sprset = sum(map(lambda r: list(combinations(SecAtt_lst[6:], r)), range(1, len(SecAtt_lst[6:]) + 1)), [])
ThrAtt_Sprset = sum(map(lambda r: list(combinations(ThrAtt_lst[5:], r)), range(1, len(ThrAtt_lst[5:]) + 1)), [])

Sub_pop = []
Sub_pop_count = 0
Epsilon = 0.1  ### Privacy parameter
output = []
context = []
outliers_fraction = 0.2
lof = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)

np.random.seed(42)

# Data generation
mean1 = [0, 0]
mean2 = [3.5, 4]
cov1 = [[1.5, -0.3], [-0.2, .5]]
cov2 = [[0.75, 0.4], [0.3, 0.5]]
X = np.r_[np.random.multivariate_normal(mean1, cov1, 100),
          np.random.multivariate_normal(mean2, cov2, 100)]

# Add outliers
y_pred = lof.fit_predict(X)
scores_pred = lof.negative_outlier_factor_

plt.figure(figsize=(18, 9))
subplot = plt.subplot(1, 2, 1)
b = subplot.scatter(X[:, 0], X[:, 1],
                    c=['k' if y == 1 else 'r' for y in y_pred], s=20)
subplot = plt.subplot(1, 2, 2)
b = subplot.scatter(X[:, 0], X[:, 1], c=-np.log(-scores_pred), s=20,
                    cmap=plt.get_cmap('Reds'))
print(x_value.shape)
print(y_value.shape)

# Algorithms used: Isolation Forest and Local Outlier Factor are common anomaly detection methods
random_isolation = IsolationForest(max_samples=len(x_value),
                                   contamination=outlier_value,
                                   random_state=3)
local_outlier = LocalOutlierFactor(n_neighbors=12, contamination=outlier_value)
n_outlier = len(fraudal_count)

# fit and predict
random_isolation.fit(x_value)
score_prediction = random_isolation.decision_function(x_value)
y_predict_isf = random_isolation.predict(x_value)

y_predict_lof = local_outlier.fit_predict(x_value)
score_prediction = local_outlier.negative_outlier_factor_

# Change the value to 0 for valid and 1 for fraudulent cases.
y_predict_isf[y_predict_isf == 1] = 0
y_predict_isf[y_predict_isf == -1] = 1
y_predict_lof[y_predict_lof == 1] = 0
y_predict_lof[y_predict_lof == -1] = 1

n_error_isf = (y_predict_isf != y_value).sum()
n_error_lof = (y_predict_lof != y_value).sum()
print("Error value for Isolation forest ", n_error_isf)
print("Error value for local outlier factor ", n_error_lof)
print(accuracy_score(y_value, y_predict_isf))
print(accuracy_score(y_value, y_predict_lof))
for row in outdsreader:
    if ((row.values()[0] in FirAtt_Sprset[i]) & (row.values()[1] in SecAtt_Sprset[j])):
        pop_size += 1
        Sal_list.append(row['Salary(K)'])
        ID_list.append(row['ID'])

##################### Outlier detection in subpopulations #############################
if (pop_size != 0):
    # Score = np.exp(Epsilon * np.log(pop_size))  ### Score calculation
    # Score = np.exp(Epsilon * (pop_size))
    Score = np.exp(Epsilon * (pop_size**(1. / 3)))
    Sal_arr = np.array(Sal_list)
    clf = LocalOutlierFactor(n_neighbors=4)
    Sal_outliers = clf.fit_predict(Sal_arr.reshape(-1, 1))
    for outlier_finder in range(0, len(ID_list)):
        if ((ID_list[outlier_finder] == Queried_ID) &
                (Sal_outliers[outlier_finder] == -1)):
            Sub_pop.append([i, j, pop_size, Score, Sub_pop_count])
            Sub_pop_count += 1

Sub_pop_sorted = sorted(Sub_pop, key=lambda Sub_pop: Sub_pop[2])
print '\n\nSubpopulations are[Att1_index, Att2_index, Population_size, Score, ID]\n\n', Sub_pop
print '\n\nSubpopulations sorted based on the score are[Att1_index, Att2_index, Population_size, Score, ID]\n\n', \
    Sub_pop_sorted

############ Max subpopulation with least number of attribute values for outlier ###########
outlier_index = len(Sub_pop) - 1
while (Sub_pop_sorted[outlier_index -
def lof(df, training_df):
    lof = LocalOutlierFactor(n_neighbors=20, contamination='auto')
    y_pred = lof.fit_predict(training_df)
    outliers = np.where(y_pred == -1)
    print('Removing ' + str(len(outliers[0])) + ' records')
    return df.drop(outliers[0])
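# A minimal usage sketch (not from the original source): df.drop(outliers[0])
# drops by index label, so this assumes df and training_df are row-aligned and
# share a default RangeIndex. The demo frames are purely illustrative.
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(1)
training_demo = pd.DataFrame(rng.normal(size=(100, 3)), columns=['f1', 'f2', 'f3'])
training_demo.iloc[0] = [15.0, 15.0, 15.0]   # obvious outlier row
full_demo = training_demo.assign(label=0)    # same rows plus a label column

cleaned = lof(full_demo, training_demo)
print(len(full_demo), '->', len(cleaned))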
def remove_outliers_lof(data, k=10):
    k = min((len(data), k))
    lof = LocalOutlierFactor(n_neighbors=k)
    stays = lof.fit_predict(data)
    return np.array(data)[stays == 1]
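# A minimal usage sketch (not from the original source): the function returns
# only the rows LOF labels as inliers (+1), as a NumPy array. The demo points
# are purely illustrative.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(2)
pts = np.vstack([rng.normal(size=(60, 2)), [[9.0, -9.0]]])  # one far-away point
kept = remove_outliers_lof(pts, k=10)
print(pts.shape[0] - kept.shape[0], 'points removed')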
class LocalOutlierFactorFilter: """ 训练与预测一体,没有单独的train和test接口 关键参数:n_neighbors : int, optional (default=20):参与预测的点的数量,无明显规律 contamination": 可以反映过滤强度, 越大过滤强度越大 """ def __init__(self, name="局部异常因子"): self._model = LocalOutlierFactor() self.name = name def get_params(self, deep=True): """ 获得模型参数 """ return self._model.get_params(deep=deep) def _get_valid_params(self): """ 获取有效参数 :return: List """ param = self.get_params() return [i for i in param.keys()] def set_params(self, **new_params): """ 设置模型参数 :param new_params: 模型参数键值 只将模型参数包含的超参赋值给模型 :return: """ for k in new_params.keys(): if k not in self._get_valid_params(): raise ValueError("传入参数含有模型中不包含的参数") break feed_dict = { k: v for k, v in new_params.items() if k in self._get_valid_params() } if len(feed_dict) == 0: warnings.warn("模型参数未被修改") self._model.set_params(**feed_dict) def fit_predict(self, x): pass """ :param x: 训练数据 :param y: 训练数据标签 :return: 训练数据准确率 """ return self._model.fit_predict(x) def _connect_SQL(self, **json_file): """ 连接到SQL :param json_file: 入参 :return:None """ json_dict = json_file self._SQL = SQLServer(host=json_dict['dbinfo']['ip'], port=json_dict['dbinfo']['port'], user=json_dict['dbinfo']['username'], pwd=json_dict['dbinfo']['password'], db=json_dict['dbinfo']['databasename']) def get_data_label(self, **json_file): """ 从数据库调取数据集的标签 :param json_file: :return: 仅含有标签的数据集 pd.dataFrame """ json_dict = json_file data_label = self._SQL.df_read_sqlserver( table=json_dict['dbinfo']['inputtable'], cols=json_dict['label_columns']) if data_label.shape[1] != 1: raise ValueError("错误:标签列数不为1") return data_label def get_data_features(self, **json_file): """ 从数据库调取数据集 :param json_file:入参, json :return: 仅含有特征变量的数据集 pd.dataFrame """ json_dict = json_file data_features = self._SQL.df_read_sqlserver( table=json_dict['dbinfo']['inputtable'], cols=json_dict['data_columns']) return data_features def train_predict_from_sql(self, **json_file): """ 训练模型并将模型保存 :param json_file: 入参,json :return:是否成功 """ try: self._connect_SQL(**json_file) self.set_params(**json_file["model_params"]) features = self.get_data_features(**json_file) pre = self.fit_predict(features) self._model.columns = features.columns.values.tolist() self.save_model(json_file["model_path"]) # 暂时保存 pre.columns = ["label"] pre.to_csv(json_file["save_path"], index=False) write = self.SQL.df_write_sqlserver( table=json_file['dbinfo']['outputtable'], df=pre, cols=json_file['data_columns']) return {"info": write} return "success" except Exception as e: print(e) return 'failed,{e}'.format(e=e) def train_predict_from_csv(self, **json): try: features = pd.read_csv(json["path"], usecols=json['data_columns']) self.set_params(**json["model_params"]) pre = pd.DataFrame(self.fit_predict(features)) self._model.columns = json['data_columns'] self.save_model(json["model_path"]) # 暂时保存 pre.columns = ["label"] pre.to_csv(json["save_path"], index=False) return {"info": "success"} except Exception as e: print(e) return 'failed,{e}'.format(e=e) def train_predict_from_xls(self, **json): try: features = pd.read_excel(json["path"], usecols=json['data_columns']) self.set_params(**json["model_params"]) pre = self.fit_predict(features) self._model.columns = json['data_columns'] self.save_model(json["model_path"]) # 暂时保存 pre.columns = ["label"] pre.to_csv(json["save_path"], index=False) return {"info": "success"} except Exception as e: print(e) return 'failed,{e}'.format(e=e) def save_model(self, model_path): """ 保存模型 :param model_path: 模型保存路径 :return:是否成功 """ try: joblib.dump(self._model, model_path) except 
Exception as e: print(e) return 'failed,{e}'.format(e=e) def get_model(self): """ 调用模型 :return:模型 """ try: return self._model except Exception as e: print(e) return 'failed,{e}'.format(e=e) def load_model(self, **json): model_path = json['model_path'] self._model = joblib.load(model_path)
def anomaly_detection(testdata_name, rank_method_index, test_EVs_ts, test_MVs_ts, fig_loc, result_loc, contam, savefig_=True): ''' Runs LOF and Isolation Forest for fault detection. Starts with using given rank function to group test_EVs_ts data to weather_ts data, then compare MVs data with test_MVs_ts data using LOF and Isolation Forest ----------------------------------------------------------------------------- global inputs: weather_ts: Divided TS weather data, numpy array in NT format MVs_ts: Corresponding divided TS MVs data, numpy array in NT format n_seg: number of segments for PAA conversion ----------------------------------------------------------------------------- inputs: testdata_name: folder name of testing dataset, used to print out progress rank_method_index: index to identify rank method used test_EVs_ts: Divided TS EVs data, numpy array in NT format test_MVs_ts: Corresponding divided TS MVs data, numpy array in NT format fig_loc: folder path for saved faulty figure plots result_loc: folder path for fault detection rate result text files contam: contamination parameter used for scikit-learn anomaly detection algorithms savefig_: save figure if set to True, default is True outputs: Faulty TS is saved as a plot The fault detection rate of a dataset is saved in a text file ''' # Local Outlier Factor from sklearn.neighbors import LocalOutlierFactor from myFunctions import gen_dist_mat # experimentName = '{}_LOF'.format(testdata_name) # Choose ranking method # rank_group = rank_high_low rank_group = rank_methods[rank_method_index] rank_method_name = rank_methods_names[rank_method_index] test_weather_ts = test_EVs_ts[0] # test weather data # MV_index = 0 # MV we are examining MV_predictions = [] for MV_index in range(len(MVs)): predictions = [] for n in range(test_weather_ts.shape[0]): # The 20th closest weather data weather_group = rank_group(weather_ts, test_weather_ts[n])['Day'][:30] print('{} - group length:{}'.format(n, len(weather_group))) if len(weather_group) < 10: predictions.append('len<') continue # reshape to row array to concatenate test_data_point = test_MVs_ts[MV_index, n].reshape( (1, MVs_ts[MV_index, weather_group].shape[1])) # concatenated matrix of training data and the test data sample NT_data = np.concatenate( (MVs_ts[MV_index, weather_group], test_data_point), axis=0) LOF = LocalOutlierFactor(n_neighbors=10, metric='precomputed', contamination=contam) D = gen_dist_mat(NT_data) # distance matrix # if distance matrix are all zeros(all TS are identical), then skip this if len(D[D == 0]) == D.shape[0] * D.shape[1]: predictions.append('D=0') continue pred = LOF.fit_predict(D) predictions.append( str(pred[-1]) ) # change to string to avoid comparison error in numpy later # if detected as outlier, save plot of MVs if pred[-1] == -1 and savefig_: plt.figure() # # draw only the current MV----- for c in weather_group: plt.plot(MVs_ts[MV_index, c], color='steelblue', alpha=0.5, linestyle='dotted') plt.plot(test_MVs_ts[MV_index, n], color='gold') #-------------------------------- # # draw for all MVs------------- # for index in range(MVs_ts.shape[0]): # for c in combination: # plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted') # plt.plot(test_MVs_ts[index,n],color='gold') # plt.show() # ------------------------------- # dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index]) dir_loc = fig_loc + r'\{}\{}\{}'.format( rank_method_name, experimentName, MVs[MV_index]) # check directory 
if exists if not os.path.exists(dir_loc): os.makedirs(dir_loc) # save faulty plot plt.savefig(dir_loc + '\\n{}.png'.format(n)) plt.close() MV_predictions.append(np.array(predictions)) p_fault = np.empty(MV_predictions[0].shape, dtype=np.bool) # faulty p_normal = np.empty(MV_predictions[0].shape, dtype=np.bool) # normal p_lack = np.empty(MV_predictions[0].shape, dtype=np.bool) # lack of data p_fault[:] = False p_normal[:] = True # False p_lack[:] = True # False for predictions in MV_predictions: p_fault = np.logical_or(p_fault, predictions == '-1') normal_with_identical = np.logical_or(predictions == '1', predictions == 'D=0') p_normal = np.logical_and(p_normal, normal_with_identical) p_lack = np.logical_and(p_lack, predictions == 'len<') # the indices of ts sample which are considered faulty fault_index = np.arange(len(p_fault))[p_fault] normal_index = np.arange(len(p_normal))[p_normal] lack_index = np.arange(len(p_lack))[p_lack] # print results: fd_rate = 'Fault detection rate:\t {}%'.format( len(fault_index) / test_weather_ts.shape[0] * 100) nd_rate = 'Normal operation rate:\t {}%'.format( len(normal_index) / test_weather_ts.shape[0] * 100) ld_rate = 'Lack of data rate:\t {}%'.format( len(lack_index) / test_weather_ts.shape[0] * 100) print(fd_rate) print(nd_rate) print(ld_rate) # Save results: # dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName) dir_loc = result_loc + r'\{}\{}'.format(rank_method_name, experimentName) # check directory if exists if not os.path.exists(dir_loc): os.makedirs(dir_loc) with open(dir_loc + '\\results.txt', 'w') as f: f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate) # save prediction results predArr_lof = np.array( MV_predictions).T # NF format(row:day/sample, col:MV) header = np.array(MVs).reshape(1, len(MVs)) # add header predArr_lof = np.concatenate((header, predArr_lof), axis=0) np.savetxt(dir_loc + '\\MV_predictions.csv', predArr_lof, fmt='%s', delimiter=',') # Isolation Forest from sklearn.ensemble import IsolationForest from myFunctions import gen_dist_mat # experimentName = '{}_IsolationForest'.format(testdata_name) # Choose ranking method # rank_group = rank_high_low rank_group = rank_methods[rank_method_index] rank_method_name = rank_methods_names[rank_method_index] # test_weather_ts = test_EVs_ts[0] # test weather data # MV_index = 0 # MV we are examining MV_predictions = [] for MV_index in range(len(MVs)): predictions = [] for n in range(test_weather_ts.shape[0]): # The 20th closest weather data weather_group = rank_group(weather_ts, test_weather_ts[n])['Day'][:30] print('{} - group length:{}'.format(n, len(weather_group))) if len(weather_group) < 10: predictions.append('len<') continue # reshape to row array to concatenate test_data_point = test_MVs_ts[MV_index, n].reshape( (1, MVs_ts[MV_index, weather_group].shape[1])) # concatenated matrix of training data and the test data sample NT_data = np.concatenate( (MVs_ts[MV_index, weather_group], test_data_point), axis=0) D = gen_dist_mat(NT_data) # distance matrix # if distance matrix are all zeros(all TS are identical), then skip this if len(D[D == 0]) == D.shape[0] * D.shape[1]: predictions.append('D=0') continue IsoForest = IsolationForest(contamination=contam) IsoForest.fit(NT_data) pred = IsoForest.predict(NT_data) predictions.append( str(pred[-1]) ) # change to string to avoid comparison error in numpy later # if detected as outlier, save plot of MVs if pred[-1] == -1 and savefig_: plt.figure() # # draw only the current MV----- for c in 
weather_group: plt.plot(MVs_ts[MV_index, c], color='steelblue', alpha=0.5, linestyle='dotted') plt.plot(test_MVs_ts[MV_index, n], color='gold') #-------------------------------- # # draw for all MVs------------- # for index in range(MVs_ts.shape[0]): # for c in combination: # plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted') # plt.plot(test_MVs_ts[index,n],color='gold') # plt.show() # ------------------------------- # dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index]) dir_loc = fig_loc + r'\{}\{}\{}'.format( rank_method_name, experimentName, MVs[MV_index]) # check directory if exists if not os.path.exists(dir_loc): os.makedirs(dir_loc) # save faulty plot plt.savefig(dir_loc + '\\n{}.png'.format(n)) plt.close() MV_predictions.append(np.array(predictions)) p_fault = np.empty(MV_predictions[0].shape, dtype=np.bool) # faulty p_normal = np.empty(MV_predictions[0].shape, dtype=np.bool) # normal p_lack = np.empty(MV_predictions[0].shape, dtype=np.bool) # lack of data p_fault[:] = False p_normal[:] = True # False p_lack[:] = True # False for predictions in MV_predictions: p_fault = np.logical_or(p_fault, predictions == '-1') normal_with_identical = np.logical_or(predictions == '1', predictions == 'D=0') p_normal = np.logical_and(p_normal, normal_with_identical) p_lack = np.logical_and(p_lack, predictions == 'len<') # the indices of ts sample which are considered faulty fault_index = np.arange(len(p_fault))[p_fault] normal_index = np.arange(len(p_normal))[p_normal] lack_index = np.arange(len(p_lack))[p_lack] # print results: fd_rate = 'Fault detection rate:\t {}%'.format( len(fault_index) / test_weather_ts.shape[0] * 100) nd_rate = 'Normal operation rate:\t {}%'.format( len(normal_index) / test_weather_ts.shape[0] * 100) ld_rate = 'Lack of data rate:\t {}%'.format( len(lack_index) / test_weather_ts.shape[0] * 100) print(fd_rate) print(nd_rate) print(ld_rate) # Save results: # dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName) dir_loc = result_loc + r'\{}\{}'.format(rank_method_name, experimentName) # check directory if exists if not os.path.exists(dir_loc): os.makedirs(dir_loc) with open(dir_loc + '\\results.txt', 'w') as f: f.write(fd_rate + '\n' + nd_rate + '\n' + ld_rate) # save prediction results predArr_iForest = np.array( MV_predictions).T # NF format(row:day/sample, col:MV) header = np.array(MVs).reshape(1, len(MVs)) # add header predArr_iForest = np.concatenate((header, predArr_iForest), axis=0) np.savetxt(dir_loc + '\\MV_predictions.csv', predArr_iForest, fmt='%s', delimiter=',') # return prediction results return (predArr_lof, predArr_iForest)
def BFS_Alg(Org_Vec, Queue, Data_to_write, Epsilon, max_ctx): Visited = [] BFS_Vec = np.zeros(len(Org_Vec)) for i in range(len(Org_Vec)): BFS_Vec[i] = Org_Vec[i] BFS_Flp = np.zeros(len(Org_Vec)) termination_threshold = 500 Terminator = 0 # I use the Queue it for visited nodes. # and just use sub_q here, for each sample I add the children to this sub_q without resetting it first sub_q = [[ 0, mp.exp(Epsilon * (Orgn_Ctx.shape[0])), Orgn_Ctx.shape[0], Org_Vec ]] contexts = [Org_Vec] while len(Visited) < 100: Terminator += 1 if (Terminator > termination_threshold): break #print 'sub_q before: ', sub_q for i in range(len(sub_q)): sub_q[i][0] = i Sub_elements = [elem for elem in range(len(sub_q))] Sub_probabilities = [] for prob in sub_q: Sub_probabilities.append(prob[1] / (sum([prob[1] for prob in sub_q]))) SubRes = np.random.choice(Sub_elements, 1, p=Sub_probabilities) Queue.append([ len(Queue), sub_q[SubRes[0]][1], sub_q[SubRes[0]][2], sub_q[SubRes[0]][3][:] ]) #print 'Queue is:', Queue Visited.append(sub_q[SubRes[0]][3][:]) #print 'Visited is:', Visited sub_q.remove(sub_q[SubRes[0]]) #print 'Visited is:', Visited for Flp_bit in range(0, (len(BFS_Vec))): for i in range(len(BFS_Flp)): BFS_Flp[i] = Queue[len(Queue) - 1][3][i] Sub_Sal_list = [] Sub_ID_list = [] BFS_Flp[Flp_bit] = 1 - BFS_Flp[Flp_bit] BFS_Ctx = df2.loc[df2['Weapon'].isin(FirAtt_lst[np.where(BFS_Flp[0:len(FirAtt_lst)] == 1)].tolist()) &\ df2['State'].isin(SecAtt_lst[np.where(BFS_Flp[len(FirAtt_lst):len(FirAtt_lst)+len(SecAtt_lst)] == 1)].tolist()) &\ df2['AgencyType'].isin(ThrAtt_lst[np.where(BFS_Flp[len(FirAtt_lst)+len(SecAtt_lst):len(FirAtt_lst)+len(SecAtt_lst)+len(ThrAtt_lst)] == 1)].tolist())] if ((not any(np.array_equal(BFS_Flp[:], x[:]) for x in Visited)) and (not any(np.array_equal(BFS_Flp[:], x[:]) for x in contexts)) and (BFS_Ctx.shape[0] > 20)): for row in range(BFS_Ctx.shape[0]): #VictimAge is column 4 and the ID is on column 0 Sub_Sal_list.append(BFS_Ctx.iloc[row, 4]) Sub_ID_list.append(BFS_Ctx.iloc[row, 0]) Sub_Sal_arr = np.array(Sub_Sal_list) clf = LocalOutlierFactor(n_neighbors=20) Sub_Sal_outliers = clf.fit_predict(Sub_Sal_arr.reshape(-1, 1)) for outlier_finder in range(0, len(Sub_ID_list)): if ((Sub_Sal_outliers[outlier_finder] == -1) and (Sub_ID_list[outlier_finder] == Queried_ID)): Sub_Score = mp.exp(Epsilon * (BFS_Ctx.shape[0])) sub_q.append([ Flp_bit, Sub_Score, BFS_Ctx.shape[0], np.zeros(len(Org_Vec)) ]) for i in range(len(sub_q[len(sub_q) - 1][3])): sub_q[len(sub_q) - 1][3][i] = BFS_Flp[i] contexts.append(np.zeros(len(Org_Vec))) for i in range(len(Org_Vec)): contexts[len(contexts) - 1][i] = BFS_Flp[i] # Exp mechanism on the visited nodes for i in range(len(Queue)): Queue[i][0] = i elements = [elem for elem in range(len(Queue))] probabilities = [] for prob in Queue: probabilities.append(prob[1] / (sum([prob[1] for prob in Queue]))) Res = np.random.choice(elements, 1, p=probabilities) Data_to_write.append(Queue[Res[0]][2] / max_ctx) return