def choose_models():
    """Return the candidate outlier detectors with their parameter grids.

    Each entry is a dict with three keys: ``'name'`` (display label),
    ``'class'`` (an estimator instance) and ``'parameters'`` (mapping of a
    parameter name to the candidate values for it).
    """

    def candidate(label, estimator, grid):
        # Every candidate shares the same three-key layout.
        return {'name': label, 'class': estimator, 'parameters': grid}

    # A One Class SVM candidate is currently disabled:
    # candidate('One Class SVM', svm.OneClassSVM(), {
    #     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #     'nu': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
    # })
    return [
        candidate('Isolation Forest',
                  ensemble.IsolationForest(),
                  {'n_estimators': [5, 10, 20, 50, 100, 150, 200]}),
        candidate('Local Outlier Factor',
                  neighbors.LocalOutlierFactor(novelty=True),
                  {'n_neighbors': range(5, 50, 5)}),
        candidate('Elliptic Envelope',
                  covariance.EllipticEnvelope(),
                  {'contamination': np.linspace(0.05, 0.45, 9)}),
    ]
def removeOutliers(X, y, outliersFraction):
    """Drop the samples an elliptic envelope scores as most outlying.

    ``X`` and ``y`` are stacked into a two-column matrix, standardised, and
    scored with ``EllipticEnvelope.decision_function``. Samples whose score
    is strictly above the ``100 * outliersFraction`` percentile are kept.

    :param X: first coordinate of each sample; assumed to be a 1-D array of
        the same length as ``y`` (implied by ``np.vstack((X, y)).T``) --
        TODO confirm with callers
    :param y: second coordinate of each sample
    :param outliersFraction: fraction of samples to treat as outliers; used
        both as the detector's ``contamination`` and as the percentile cut
    :return: ``(X[mask], y[mask], mask)`` where ``mask`` is a boolean array
        that is True for the samples that are KEPT (despite its name it
        marks inliers, not outliers)
    """
    clf = covariance.EllipticEnvelope(contamination=outliersFraction)
    fitData = np.vstack((X, y)).T
    fitData = preprocessing.StandardScaler().fit_transform(fitData)
    try:
        clf.fit(fitData)
        scores = clf.decision_function(fitData)
        threshold = stats.scoreatpercentile(scores, 100 * outliersFraction)
        outMask = (scores > threshold).flatten()
    except Exception:
        # Best-effort fallback: if fitting or scoring fails, keep every
        # sample. Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit still propagate instead of being silently swallowed.
        outMask = np.ones(fitData.shape[0], dtype=bool)
    return X[outMask], y[outMask], outMask
def elipticEnvelope(data):
    """Fit an elliptic envelope on ``data`` and scatter-plot the labels.

    Entry ``20`` of a private copy of ``data`` is scaled by 0.95 before
    fitting (row 20 for an ndarray, the column labelled 20 for a DataFrame;
    presumably to inject a synthetic deviation -- confirm). The detector is
    fitted with a contamination of 0.2, its decision scores and predicted
    labels are stored in a DataFrame, three counts are printed (number of
    labels, number of time steps, number of rows labelled 1) and the labels
    are plotted against the sample index.

    :param data: 2-D samples; must provide ``.copy()`` (ndarray or
        DataFrame). The caller's object is left unmodified.
    :return: None; the function only prints and shows a plot
    """
    # Work on a copy: a plain assignment would alias the argument and the
    # scaling below would silently modify the caller's data.
    data_1 = data.copy()
    data_1[20] = data_1[20] * 0.95

    outliers_fraction = 0.2
    envelope = covariance.EllipticEnvelope(contamination=outliers_fraction,
                                           random_state=0)
    envelope.fit(data_1)

    df_class0 = pd.DataFrame(data_1)
    df_class0['deviation'] = envelope.decision_function(data_1)
    df_class0['anomaly'] = envelope.predict(data_1)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(len(df_class0['anomaly']))

    # Sample index 0..n-1 as floats, used as the x axis of the plot.
    time = np.arange(len(df_class0['anomaly']), dtype=float)
    print(len(time))

    fig, ax = plt.subplots()
    # scikit-learn labels inliers 1 and outliers -1, so this counts inliers.
    a = df_class0.loc[df_class0['anomaly'] == 1]
    print(len(a))
    ax.scatter(time, df_class0['anomaly'], color='red')
    plt.show()
def elliptic_envelope(X):
    """Fit a default-configured ``EllipticEnvelope`` on ``X``.

    :param X: training samples handed straight to ``fit``
    :return: whatever ``fit`` returns (the fitted estimator, by
        scikit-learn's convention)
    """
    return covariance.EllipticEnvelope().fit(X)
fit_model(clf, x, y) # Split into anomaly and normal examples clf = svm.OneClassSVM(nu=.1, kernel='rbf', gamma=.1) fit_novelty_model(clf, x_class, y_class) clf = svm.OneClassSVM(nu=.1, kernel='rbf', gamma=.1) fit_novelty_model(clf, x, y) clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1) fit_model_loc(clf, x_class, y_class) clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1) fit_model_loc(clf, x, y) clf = covariance.EllipticEnvelope(assume_centered=False, contamination=.1, random_state=0) fit_model(clf, x_class, y_class) fit_model(clf, x, y) #lets group - standarization x_scaled = preprocessing.scale(x_class) distorsions = [] for k in range(2, x_scaled.shape[0]): kmeans = KMeans(n_clusters=k) kmeans.fit(x_scaled) distorsions.append(kmeans.inertia_) fig = plt.figure(figsize=(15, 5)) plt.plot(range(2, x_scaled.shape[0]), distorsions)
dfconsCRM = dfconsCRM[np.isfinite(dfconsCRM['finishedSqFt'])] #outlier based on influence graph #dfconsCRM.drop(300) #outlier based on cooks distance/ddfits #prepare data for PCA namesPCA = [ 'logMeanPrice', 'Year', 'Year_Built', 'YearH', 'May', 'SQFT', 'Oct', 'Baseline', 'Amount', 'zipIncome', 'Tier1', 'Bed', 'Bath', 'logIncome' ] dummies = pd.get_dummies(dfconsCRM['Utility Company']) dfcopy = dfconsCRM[namesPCA].copy() dfcopy = pd.concat([dfcopy, dummies], axis=1) #get rid of outliers with Isolation Forest outlierDet = covariance.EllipticEnvelope(contamination=.05, random_state=4059) trained_outlier = outlierDet.fit(dfcopy) y_vals = trained_outlier.predict(dfcopy) isoutlier = pd.DataFrame(y_vals == -1, index=dfcopy.index) dfcopy = dfcopy[y_vals == 1] dfconsCRM = dfconsCRM[y_vals == 1] replace_map = { 'Utility Company': { 'APU': 1, 'LADWP': 2, 'PGE': 3, 'RPU': 4, 'SCE': 5, 'SMUD': 6, 'sdge': 7 }
def _get_best_detector(self, train):
    """Return an ``EllipticEnvelope`` with default settings fitted on ``train``.

    :param train: training samples passed to ``fit``
    :return: the fitted detector
    """
    # fit() returns the estimator itself, so the calls can be chained.
    return covariance.EllipticEnvelope().fit(train)
# Configuration of the synthetic data set handed to the generator below.
n_samples = 400
outliers_fraction = 0.25
cluster_separation = 2
X, y = generate_synthetic_data_outliers(n_samples, outliers_fraction, cluster_separation)
# Show the raw data on a fixed [-7, 7] x [-7, 7] window; the same window is
# reused for every plot below.
plot_data_2d_outliers(X, xlim=[-7, 7], ylim=[-7, 7])

# Isolation Forest: sweep the contamination parameter.
iso_forest_estimator = ensemble.IsolationForest()
iso_forest_grid = {'contamination': [0.1, 0.2, 0.25, 0.3]}
grid_search_plot_models_outliers(iso_forest_estimator, iso_forest_grid, X, xlim=[-7, 7], ylim=[-7, 7])

# Elliptic envelope: same contamination sweep as above.
cov_estimator = covariance.EllipticEnvelope()
cov_grid = {'contamination': [0.1, 0.2, 0.25, 0.3]}
grid_search_plot_models_outliers(cov_estimator, cov_grid, X, xlim=[-7, 7], ylim=[-7, 7])

# One-class SVM: the nu candidates are 0.95 * outliers_fraction plus a small
# offset (0.03, 0.05, 0.06, 0.07).
svm_estimator = svm.OneClassSVM(kernel="rbf", gamma=0.1)
tmp = 0.95 * outliers_fraction
svm_grid = {'nu': [tmp + 0.03, tmp + 0.05, tmp + 0.06, tmp + 0.07]}
grid_search_plot_models_outliers(svm_estimator, svm_grid, X, xlim=[-7, 7], ylim=[-7, 7])
def cleanup(var,cbool,outlier_thresh=0.05,plot_tgl=True):
    '''
    Find tracking outliers and discard bad contact segments.

    :param var: The variable used as the signal for bad tracking. It is
                indexed as a 2-D array with one row per frame
    :param cbool: The contact boolean (covers all contacts)
    :param outlier_thresh: Contamination passed to the EllipticEnvelope
                outlier detector
    :param plot_tgl: When true, plot the raw signal and the cleaned,
                filtered signal
    :return use_flags: A boolean mask inherited from cbool which is only 1
                during good contacts. Frames outside it should be set to zero
    :return outliers: A boolean mask of frames that are bad tracking
                outliers. These should be NaN'd and interpolated over

    Corner case: when no row is both flagged and finite, a message is
    printed and the tuple ``(1, 1)`` is returned instead of the two masks.
    '''
    # Impute short NaN gaps; contacts with more than 10 consecutive NaNs are
    # removed, which allows every unflagged frame to be zeroed.
    filled, use_flags = fill_nan_gaps(var, cbool, thresh=10)
    filled[np.invert(use_flags), :] = 0

    # A short median filter hides single-point outliers before contacts with
    # many bad points are removed. The return value of remove_bad_contacts
    # is unused, so its effect is presumably applied to use_flags in place
    # -- confirm against the helper.
    smoothed = signal.medfilt(filled, kernel_size=[3, 1])
    remove_bad_contacts(smoothed, use_flags, thresh=10)

    # Connected components of the contact segments that are kept.
    cc = cbool_to_cc(use_flags)

    # Feature matrix: the per-contact scaled signal next to the per-contact
    # scaled output of get_d (presumably a derivative -- confirm).
    signal_scaled = scale_by_contact(filled, cc, True)
    delta_scaled = scale_by_contact(get_d(filled), cc)
    X = np.concatenate([signal_scaled, delta_scaled], axis=1)

    detector = covariance.EllipticEnvelope(contamination=outlier_thresh)

    # Only rows that are flagged as usable and entirely finite are scored.
    flagged = np.squeeze(use_flags == 1)
    finite_rows = np.all(np.isfinite(X), axis=1)
    idx = np.logical_and(flagged, finite_rows)
    X_good = X[idx, :]

    # Catch the corner case where there is no good data at all.
    if X_good.shape[0] == 0:
        print('All NAN dataset found')
        return 1, 1

    detector.fit(X_good)

    # predict() yields -1 for outliers and 1 for inliers; rows that were
    # never scored keep the initial 0 and therefore count as non-outliers.
    labels = np.zeros(X.shape[0], dtype='int')
    labels[idx] = detector.predict(X_good)
    outliers = labels == -1

    # The cleaned signal mostly serves to judge the quality of the outlier
    # detection; eventually use_flags and outliers should be applied to all
    # mechanics data uniformly.
    cleaned = var.copy()
    cleaned[use_flags == 0] = np.nan
    cleaned[outliers] = np.nan

    # Impute over the NaN'd frames, then zero whatever is still unflagged.
    cleaned, use_flags = fill_nan_gaps(cleaned, use_flags, thresh=5)
    cleaned[use_flags == 0] = 0

    # Both filters always run, even when nothing is plotted.
    cleaned_filt = signal.medfilt(cleaned, kernel_size=[3, 1])
    cleaned_filt = signal.savgol_filter(cleaned_filt, 7, 3, axis=0)

    if plot_tgl:
        plt.plot(var)
        plt.plot(cleaned_filt)
        plt.show()

    return use_flags, outliers
# Extract the features from the raw files and save them to SaveFile.
SavePath = r'E:\大数据\线路故障诊断\解压后数据\广东中调'
SaveFile = '整次谐波特征_零序_gd.csv'
BigSamplenames,BigFeaturenames,BigFeatures,BigLabels = \
    ExtractingAllFile(FilePath,SavePath,SaveFile,SelectCol=[7])

# Load the feature set from the saved CSV file.
if 1:
    print('******************** 装载特征集 ********************\n')  # banner text: "loading feature set"
    SavePath = r'E:\大数据\线路故障诊断\解压后数据\广东中调'
    SaveFile = '整次谐波特征_零序_gd.csv'
    BigSamplenames,BigFeaturenames,BigFeatures,BigLabels = DataSet(os.path.join(SavePath,SaveFile))

# Anomaly detection under an elliptical-distribution assumption
# (disabled by the constant-false condition).
if 0:
    from sklearn import covariance
    contamination = 0.05  # fraction of anomalies; has to be set
    clf = covariance.EllipticEnvelope(assume_centered=False, support_fraction=None, \
        contamination=contamination, random_state=42)
    clf.fit(BigFeatures)
    y_detection=clf.predict(BigFeatures)
    # Print the names of the samples labelled -1 (anomalies).
    print(BigSamplenames[y_detection==-1])

# Isolation Forest anomaly detection, suited to multi-dimensional data sets.
if 1:
    print('******************** 剔除异常样本 ********************\n')  # banner text: "removing abnormal samples"
    from sklearn import ensemble
    contamination = 0.05  # fraction of anomalies; has to be set
    clf = ensemble.IsolationForest(max_samples='auto', contamination=contamination, \
        max_features=1.0, bootstrap=False, random_state=42)
    clf.fit(BigFeatures)
    y_detection=clf.predict(BigFeatures)
    # Label text: "abnormal sample categories"; prints the samples labelled -1.
    print('异常样本类别:\n',BigSamplenames[y_detection==-1])
    Samplenames,Labels,Features = \
def elliptic_envelope(self, label, result_list):
    """Run the shared decision-function evaluation with an elliptic envelope.

    The detector is built with ``support_fraction=1`` and handed to
    ``execute_decision_function`` together with this object's
    ``train_test_split`` and ``image_creator``, in unsupervised mode.

    :param label: forwarded unchanged to ``execute_decision_function``
    :param result_list: forwarded unchanged to ``execute_decision_function``
    :return: whatever ``execute_decision_function`` returns
    """
    detector = covariance.EllipticEnvelope(support_fraction=1)
    return execute_decision_function(
        detector,
        self.train_test_split,
        label,
        result_list,
        self.image_creator,
        unsupervised=True,
    )
# Preview the first rows of the data frame.
dane_3.head()
# One horizontal, notched box plot per column.
for column in dane_3.columns:
    wykres = px.box(x=column, data_frame=dane_3, title=column, orientation='h', notched=True, width=800, height=400)
    wykres.show()
from sklearn import covariance
# Create an outlier detector based on an ellipse fitted around the data.
detektor = covariance.EllipticEnvelope(contamination=0.1, support_fraction=1)
# Fit the detector.
detektor.fit(dane_3)
# Detect the outliers (predict labels outliers -1 and inliers 1).
flaga_odstajace = detektor.predict(dane_3)
flaga_odstajace
# Display the outliers -- a loop keeps it simple :) -> less effort.
# One scatter plot per column, coloured by the outlier flag.
for i in dane_3.columns:
    wykres = px.scatter(dane_3, y=i, color=flaga_odstajace, width=700, height=350)
    wykres.show()