Exemple #1
0
def choose_models():
    """Return the candidate outlier detectors and their search grids.

    Each entry of the returned list is a dict with three keys:

    * ``'name'``       -- human-readable label of the detector,
    * ``'class'``      -- a freshly constructed estimator instance,
    * ``'parameters'`` -- mapping of one hyper-parameter name to the values
      to try for it.

    A One Class SVM candidate used to be listed here as well but is
    currently disabled.
    """
    # (label, estimator instance, hyper-parameter grid), in the order returned.
    candidates = [
        (
            'Isolation Forest',
            ensemble.IsolationForest(),
            {'n_estimators': [5, 10, 20, 50, 100, 150, 200]},
        ),
        (
            'Local Outlier Factor',
            neighbors.LocalOutlierFactor(novelty=True),
            {'n_neighbors': range(5, 50, 5)},
        ),
        (
            'Elliptic Envelope',
            covariance.EllipticEnvelope(),
            {'contamination': np.linspace(0.05, 0.45, 9)},
        ),
    ]

    return [
        {'name': label, 'class': estimator, 'parameters': grid}
        for label, estimator, grid in candidates
    ]
def removeOutliers(X, y, outliersFraction):
    """Drop the least typical ``(X, y)`` pairs using an elliptic envelope.

    ``X`` and ``y`` are stacked into a two-column sample matrix,
    standardised, and scored with ``covariance.EllipticEnvelope``.  Samples
    whose decision score is not above the ``outliersFraction`` percentile of
    all scores are discarded.

    :param X: array of x values (stacked row-wise with ``y``, so it must be
              stackable with ``y`` by ``np.vstack``)
    :param y: array of y values, same length as ``X``
    :param outliersFraction: expected fraction of outliers; used both as the
              envelope's ``contamination`` and as the score percentile cut-off

    :return: ``(X[outMask], y[outMask], outMask)`` where ``outMask`` is a
             boolean array that is True for the samples that were kept

    If fitting or scoring fails, a warning is emitted and nothing is removed
    (``outMask`` is all True).
    """
    import warnings

    clf = covariance.EllipticEnvelope(contamination=outliersFraction)

    fitData = np.vstack((X, y)).T
    fitData = preprocessing.StandardScaler().fit_transform(fitData)
    try:
        clf.fit(fitData)
        scores = clf.decision_function(fitData)
        threshold = stats.scoreatpercentile(scores, 100 * outliersFraction)
        outMask = (scores > threshold).flatten()
    except Exception as err:
        # Best-effort fallback: keep every sample.  ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt / SystemExit through.
        warnings.warn(
            'removeOutliers: outlier detection failed (%s); '
            'keeping all samples' % (err,))
        outMask = np.ones(fitData.shape[0], dtype=bool)
    return X[outMask], y[outMask], outMask
Exemple #3
0
def elipticEnvelope(data):
    """Fit an elliptic envelope on ``data`` and scatter-plot the labels.

    Entry ``20`` of a copy of ``data`` is scaled by 0.95 before fitting.  An
    ``EllipticEnvelope`` with ``contamination=0.2`` is then fitted, and its
    decision score and predicted label (1 = inlier, -1 = outlier) are stored
    in the ``'deviation'`` and ``'anomaly'`` columns of a DataFrame.  The
    labels are plotted against the sample index and the figure is shown.

    Prints, in order: the number of samples, the length of the index axis,
    and the number of samples labelled as inliers.

    :param data: sample matrix accepted by both ``EllipticEnvelope.fit`` and
                 ``pd.DataFrame``; it must provide ``.copy()`` (NumPy array
                 or DataFrame).  It is not modified.
    :return: None
    """
    # Work on a copy: ``data_1 = data`` would only alias the caller's object,
    # so the scaling below would silently change the caller's data.
    data_1 = data.copy()
    data_1[20] = data_1[20] * 0.95
    outliers_fraction = 0.2
    envelope = covariance.EllipticEnvelope(contamination=outliers_fraction,
                                           random_state=0)
    envelope.fit(data_1)
    df_class0 = pd.DataFrame(data_1)
    df_class0['deviation'] = envelope.decision_function(data_1)
    df_class0['anomaly'] = envelope.predict(data_1)

    n_samples = len(df_class0['anomaly'])
    # print() with a single argument behaves the same on Python 2 and 3.
    print(n_samples)
    # Sample index used as the x axis (float, like the original zeros array).
    time = np.arange(n_samples, dtype=float)
    print(len(time))

    fig, ax = plt.subplots()
    a = df_class0.loc[df_class0['anomaly'] == 1]
    print(len(a))
    ax.scatter(time, df_class0['anomaly'], color='red')
    plt.show()
Exemple #4
0
def elliptic_envelope(X):
    """Return a default-configured ``EllipticEnvelope`` fitted on ``X``."""
    # ``fit`` returns the estimator itself, so the call can be chained.
    return covariance.EllipticEnvelope().fit(X)
Exemple #5
0
fit_model(clf, x, y)

# One-class SVM: a freshly built estimator for the class subset and another
# one, with the same settings, for the full data.
svm_settings = dict(nu=0.1, kernel='rbf', gamma=0.1)
clf = svm.OneClassSVM(**svm_settings)
fit_novelty_model(clf, x_class, y_class)

clf = svm.OneClassSVM(**svm_settings)
fit_novelty_model(clf, x, y)

# Local outlier factor, again one new estimator per data set.
lof_settings = dict(n_neighbors=20, contamination=0.1)
clf = LocalOutlierFactor(**lof_settings)
fit_model_loc(clf, x_class, y_class)
clf = LocalOutlierFactor(**lof_settings)
fit_model_loc(clf, x, y)

# Elliptic envelope: the same estimator instance is passed to both calls.
clf = covariance.EllipticEnvelope(contamination=0.1,
                                  random_state=0,
                                  assume_centered=False)
fit_model(clf, x_class, y_class)
fit_model(clf, x, y)

# Clustering on standardised features: record the k-means inertia for every
# cluster count from 2 up to (number of samples - 1) and plot the curve.
x_scaled = preprocessing.scale(x_class)
cluster_counts = range(2, x_scaled.shape[0])
distorsions = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(x_scaled)
    distorsions.append(kmeans.inertia_)

fig = plt.figure(figsize=(15, 5))
plt.plot(cluster_counts, distorsions)
Exemple #6
0
# Keep only the rows whose finished square footage is a finite number.
finite_sqft = np.isfinite(dfconsCRM['finishedSqFt'])
dfconsCRM = dfconsCRM[finite_sqft]
# Dropping row 300 (flagged by the influence plot / Cook's distance / DFFITS)
# was considered at this point but is left disabled.

# Numeric columns that feed the PCA.
namesPCA = [
    'logMeanPrice', 'Year', 'Year_Built', 'YearH', 'May', 'SQFT', 'Oct',
    'Baseline', 'Amount', 'zipIncome', 'Tier1', 'Bed', 'Bath', 'logIncome'
]

# PCA input: the numeric columns plus one indicator column per utility company.
dummies = pd.get_dummies(dfconsCRM['Utility Company'])
dfcopy = pd.concat([dfconsCRM[namesPCA].copy(), dummies], axis=1)

# Outlier removal with an elliptic envelope (5 % contamination):
# predict() labels inliers 1 and outliers -1.
outlierDet = covariance.EllipticEnvelope(contamination=.05, random_state=4059)
trained_outlier = outlierDet.fit(dfcopy)
y_vals = trained_outlier.predict(dfcopy)
isoutlier = pd.DataFrame(y_vals == -1, index=dfcopy.index)

# Keep the inlier rows in both the PCA frame and the source frame.
keep_rows = y_vals == 1
dfcopy = dfcopy[keep_rows]
dfconsCRM = dfconsCRM[keep_rows]
replace_map = {
    'Utility Company': {
        'APU': 1,
        'LADWP': 2,
        'PGE': 3,
        'RPU': 4,
        'SCE': 5,
        'SMUD': 6,
        'sdge': 7
    }
 def _get_best_detector(self, train):
     """Return a default ``EllipticEnvelope`` fitted on ``train``.

     No model selection takes place here: a single estimator with default
     settings is built and fitted.
     """
     # ``fit`` returns the estimator itself, so the call can be chained.
     return covariance.EllipticEnvelope().fit(train)
Exemple #8
0
# Synthetic 2-D data set containing a known share of outliers.
n_samples = 400
outliers_fraction = 0.25
cluster_separation = 2
X, y = generate_synthetic_data_outliers(n_samples, outliers_fraction,
                                        cluster_separation)

# Every plot uses the same axis range; a new list is built for each argument.
AXIS_RANGE = (-7, 7)
plot_data_2d_outliers(X, xlim=list(AXIS_RANGE), ylim=list(AXIS_RANGE))

# Contamination levels tried for the isolation forest and the envelope.
iso_forest_estimator = ensemble.IsolationForest()
iso_forest_grid = {'contamination': [0.1, 0.2, 0.25, 0.3]}
grid_search_plot_models_outliers(iso_forest_estimator, iso_forest_grid, X,
                                 xlim=list(AXIS_RANGE),
                                 ylim=list(AXIS_RANGE))

cov_estimator = covariance.EllipticEnvelope()
cov_grid = {'contamination': [0.1, 0.2, 0.25, 0.3]}
grid_search_plot_models_outliers(cov_estimator, cov_grid, X,
                                 xlim=list(AXIS_RANGE),
                                 ylim=list(AXIS_RANGE))

# One-class SVM: nu values are small offsets above 95 % of the outlier share.
svm_estimator = svm.OneClassSVM(kernel="rbf", gamma=0.1)
tmp = 0.95 * outliers_fraction
svm_grid = {'nu': [tmp + offset for offset in (0.03, 0.05, 0.06, 0.07)]}
grid_search_plot_models_outliers(svm_estimator, svm_grid, X,
                                 xlim=list(AXIS_RANGE),
                                 ylim=list(AXIS_RANGE))
def cleanup(var,cbool,outlier_thresh=0.05,plot_tgl=True):
    '''
    Runs the primary algorithm to find outliers and remove bad contact segments.

    The signal is NaN-filled per contact, contacts with many bad points are
    removed, and the remaining frames are scored with an EllipticEnvelope
    fitted on the scaled signal together with its scaled derivative.

    :param var:             The variable to use as a signal for when the tracking is bad.
                            Indexed as a 2-D array below (rows appear to be frames -- TODO confirm).
    :param cbool:           The contact boolean
    :param outlier_thresh:  Passed to EllipticEnvelope as ``contamination``
                            (expected fraction of outlier frames). Default 0.05.
    :param plot_tgl:        If True, plot the raw signal and the cleaned, smoothed
                            signal and show the figure. Default True.

    :return use_flags:  A boolean mask inherited from cbool which is only 1 during good contacts (cbool includes all contacts)
                            These should be set to zero. The returned value is the one
                            produced by the second fill_nan_gaps call (thresh=5).
    :return outliers:   A boolean mask indicating which frames are bad tracking outliers.
                            These should be NaN'd and interpolated over

    NOTE(review): when no usable frame remains, the function prints a message
    and returns the integers ``(1, 1)`` instead of two masks -- callers
    presumably have to check for that; confirm against the call sites.
    '''

    # either impute small nan gaps or remove contacts with more than 10 consecutive NaNs.
    # Allows us to make all non-flagged var=0
    var_imputed, use_flags = fill_nan_gaps(var, cbool, thresh=10)
    # Zero every row that is not part of a kept contact.
    var_imputed[np.invert(use_flags),:] = 0

    # Perform a small medfilt to filter over single point outliers
    # (3-sample window along the first axis, none along the second).
    var_filt = signal.medfilt(var_imputed, kernel_size=[3, 1])


    # remove contacts where there are many bad points
    # The return value is discarded, so this presumably updates use_flags
    # in place -- TODO confirm against remove_bad_contacts.
    remove_bad_contacts(var_filt, use_flags,thresh=10)

    # Get CC for all the contact segments to be kept
    cc = cbool_to_cc(use_flags)

    # =========================== #
    # =========================== #
    # Find point outliers once the bad contact segments have been deleted
    # Note: the features are built from var_imputed, not from the
    # median-filtered var_filt (which is only used by remove_bad_contacts).
    var_scaled = scale_by_contact(var_imputed, cc, True)
    var_d = get_d(var_imputed)
    var_d_scaled = scale_by_contact(var_d, cc)

    # Feature matrix: scaled signal columns followed by scaled derivative columns.
    X = np.concatenate([var_scaled, var_d_scaled], axis=1)
    # One label per frame; frames that are never scored stay 0 (not an outlier).
    y = np.zeros(X.shape[0], dtype='int')

    # Fit outlier detection
    clf = covariance.EllipticEnvelope(contamination=outlier_thresh)
    # Only frames that are flagged for use and have all-finite features are scored.
    idx = np.logical_and(np.squeeze(use_flags == 1),np.all(np.isfinite(X),axis=1))


    # catch corner case where there is no good data?
    if X[idx,:].shape[0]==0:
        print('All NAN dataset found')
        # Sentinel return: two ints rather than the usual pair of masks.
        return 1,1

    clf.fit(X[idx, :])

    # Find outliers
    # predict() labels inliers 1 and outliers -1; remap to 0 = keep, 1 = outlier.
    y[idx] = clf.predict(X[idx, :])
    y[y == 1] = 0
    y[y == -1] = 1

    # =========================== #
    # =========================== #

    # set outputs [use_flags, outliers]
    outliers = y == 1

    # var_out is used mostly to evaluate the quality of the outlier detection.
    # We should eventually use 'use_flags' and 'outliers' to alter all mechanics data uniformly.

    var_out = var.copy()
    # Blank out unused frames and detected outliers so they can be re-imputed.
    var_out[use_flags == 0] = np.nan
    var_out[outliers] = np.nan

    # impute over the variable.
    # for start, stop in cc:
    #     if stop - start > 10:
    #
    #         var_out[start:stop] = impute_snippet(var_out[start:stop])
    # Second gap fill with a tighter threshold; this also replaces use_flags,
    # and the replaced mask is what gets returned.
    var_out,use_flags = fill_nan_gaps(var_out,use_flags,thresh=5)
    var_out[use_flags == 0] = 0
    var_out_filt = signal.medfilt(var_out, kernel_size=[3, 1])

    # Savitzky-Golay smoothing (window 7, polynomial order 3) along axis 0.
    # The smoothed signal is only used for the diagnostic plot below.
    var_out_filt = signal.savgol_filter(var_out_filt,7,3,axis=0)

    if plot_tgl:
        plt.plot(var)
        plt.plot(var_out_filt)
        plt.show()
    return(use_flags,outliers)
        SavePath = r'E:\大数据\线路故障诊断\解压后数据\广东中调'
        SaveFile = '整次谐波特征_零序_gd.csv'
        BigSamplenames,BigFeaturenames,BigFeatures,BigLabels = \
        ExtractingAllFile(FilePath,SavePath,SaveFile,SelectCol=[7])
    
    if 1:
        print('******************** 装载特征集 ********************\n')
        SavePath = r'E:\大数据\线路故障诊断\解压后数据\广东中调'
        SaveFile = '整次谐波特征_零序_gd.csv'
        BigSamplenames,BigFeaturenames,BigFeatures,BigLabels = DataSet(os.path.join(SavePath,SaveFile))

    # 椭圆分布假设的异常检测
    if 0:
        from sklearn import covariance
        contamination = 0.05 # 需设置异常比例
        clf = covariance.EllipticEnvelope(assume_centered=False, support_fraction=None, \
                                          contamination=contamination, random_state=42)
        clf.fit(BigFeatures)
        y_detection=clf.predict(BigFeatures)
        print(BigSamplenames[y_detection==-1])
        
    # 隔离森林异常检测,适于多维数据集
    if 1:
        print('******************** 剔除异常样本 ********************\n')
        from sklearn import ensemble
        contamination = 0.05 # 需设置异常比例
        clf = ensemble.IsolationForest(max_samples='auto', contamination=contamination, \
                                       max_features=1.0, bootstrap=False, random_state=42)
        clf.fit(BigFeatures)
        y_detection=clf.predict(BigFeatures)
        print('异常样本类别:\n',BigSamplenames[y_detection==-1])
        Samplenames,Labels,Features = \
Exemple #11
0
    def elliptic_envelope(self, label, result_list):
        """Run the shared decision-function evaluation with an elliptic envelope.

        Builds an ``EllipticEnvelope`` with ``support_fraction=1`` and hands
        it to ``execute_decision_function`` together with this object's
        ``train_test_split`` and ``image_creator``, in unsupervised mode.

        :param label: forwarded unchanged to ``execute_decision_function``
        :param result_list: forwarded unchanged to ``execute_decision_function``
        :return: whatever ``execute_decision_function`` returns
        """
        detector = covariance.EllipticEnvelope(support_fraction=1)
        outcome = execute_decision_function(
            detector,
            self.train_test_split,
            label,
            result_list,
            self.image_creator,
            unsupervised=True)
        return outcome
Exemple #12
0
dane_3.head()

# One horizontal, notched box plot per column of the data set.
box_style = dict(orientation='h', notched=True, width=800, height=400)
for column in dane_3.columns:
    wykres = px.box(x=column, data_frame=dane_3, title=column, **box_style)
    wykres.show()

from sklearn import covariance
# Build an outlier detector based on an ellipse fitted around the data.
detektor = covariance.EllipticEnvelope(contamination=0.1, support_fraction=1)
# Fit the detector on the data.
detektor.fit(dane_3)
# Label every row: 1 for inliers, -1 for outliers.
flaga_odstajace = detektor.predict(dane_3)
flaga_odstajace

# Show the outliers: one scatter plot per column, coloured by the label
# (a loop keeps this short).
scatter_size = dict(width=700, height=350)
for i in dane_3.columns:
    wykres = px.scatter(dane_3, y=i, color=flaga_odstajace, **scatter_size)
    wykres.show()