Example #1
import pickle

import torch
import torch.nn.functional as F
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

# LetterCNN, preprocessing and train_data are assumed to be defined elsewhere
model = LetterCNN()
model.load_state_dict(torch.load('checkpoint.pt'))
# evaluate
model.eval()
image_tensor = preprocessing(train_data)
output = model(image_tensor)
# prob8=F.softmax(output,dim=1).data
prob8 = F.softmax(output, dim=1).data
#print(np.array(prob8).shape)

clf = IsolationForest(n_estimators=140,
                      behaviour='new',
                      max_samples='auto',
                      contamination=0.001,
                      max_features=5)
clf.fit(prob8)

clf1 = OneClassSVM(kernel='rbf', tol=0.01, nu=0.001)
clf1.fit(prob8)

# y_pred_train = clf.predict(percetage_train)

with open('iso_train.pickle', 'wb') as f:
    pickle.dump(clf, f)
# print(y_pred_train)

with open('svm_train.pickle', 'wb') as f:
    pickle.dump(clf1, f)
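
# A hedged follow-up (not in the original snippet): reload the pickled detectors
# and flag anomalous probability vectors, e.g. at inference time.
with open('iso_train.pickle', 'rb') as f:
    iso_loaded = pickle.load(f)
with open('svm_train.pickle', 'rb') as f:
    svm_loaded = pickle.load(f)

iso_flags = iso_loaded.predict(prob8)   # 1 = inlier, -1 = outlier
svm_flags = svm_loaded.predict(prob8)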
Example #2
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix

# data1, df, data, value, day and percent are assumed to be defined earlier;
# filename suffixes and the matching contamination fractions:
cont = [
    "0", "001", "002", "003", "004", "005", "006", "007", "008", "009", "01",
    "02", "03", "04", "05"
]
cont_c = [
    0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3,
    0.4, 0.5
]

for w, y in zip(cont, cont_c):
    print(w)
    print(y)

    outliers_fraction = y  # [0, 0.5]

    model = IsolationForest(contamination=outliers_fraction)
    model.fit(data1)
    df['anomaly2'] = pd.Series(model.predict(data1))
    data['cluster'] = df['anomaly2']
    a = data['value_temp'].tolist()
    b = data[value].tolist()
    comparison = [-1 if i != j else 1 for i, j in zip(a, b)]
    comp = pd.Series(comparison)
    comp_list = comp.tolist()
    clus_list = data['cluster'].tolist()
    cm = confusion_matrix(comp_list, clus_list, labels=[-1, 1])
    print(cm)

    data.to_csv(
        r"/media/Moon/Thesis/DataCollection/Failure_Detection/Tests/Per_Day/Result_IF/"
        + str(day) + "-O" + str(percent) + "-c" + str(w) + ".csv",
Example #3
# Filter the columns to remove data we do not want
columns = [c for c in columns if c not in ["Class"]]
# Store the variable we are predicting
target = "Class"
# Define a random state
state = np.random.RandomState(42)
X = data1[columns]
Y = data1[target]
X_outliers = state.uniform(low=0, high=1, size=(X.shape[0], X.shape[1]))

classifiers = {
    "Isolation Forest":
    IsolationForest(
        n_estimators=100,
        max_samples=len(X),
        contamination=outlier_fraction,
        random_state=state,
        verbose=0,
    ),
    "Local Outlier Factor":
    LocalOutlierFactor(
        n_neighbors=20,
        algorithm="auto",
        leaf_size=30,
        metric="minkowski",
        p=2,
        metric_params=None,
        contamination=outlier_fraction,
    ),
}
# keras, sns, data_split, data_split_y and enc_df_3d are assumed to be imported/defined earlier
encoder = keras.models.load_model('2D_encoder.h5')
enc_data = encoder.predict(data_split)
enc_df = data_split_y
enc_df["X"] = enc_data[:, 0]
enc_df["y"] = enc_data[:, 1]

sns.lmplot(x='X', y='y', hue='koi_disposition', data=enc_df,
           scatter_kws={'alpha': 0.3}, height=12)  # `height` replaces the deprecated `size` kwarg


print("----------------- ISOLATION  FOREST -----------------------")
# IsolationForest on 'Confirmed', run on whole dataset
data_iso_con = data_split_merged 
test_iso_con = data_iso_con
from sklearn.ensemble import IsolationForest
iso_con = IsolationForest(random_state=0, n_estimators=300,verbose=1, n_jobs=16, contamination=0.5, max_features=1)
iso_con.fit(data_iso_con[data_iso_con.columns[2:]])
iso_result= iso_con.predict(test_iso_con[test_iso_con.columns[2:]])

result_series = pd.Series(iso_result)
test_iso_3d = enc_df_3d
test_iso_3d.reset_index(inplace=True, drop=True)
test_iso_3d["Isolation"] = result_series
classes = [('b', -1, 0.3), ('r', 1, 0.8)]
create_3d_plot(test_iso_3d,'Isolation',(16,12),classes)
# IsolationsForest on 'False Positive', compare on 'FP_CONFIRMED'



print("------------------ SUPPORT VECTOR MACHINE ------------------")
from sklearn.model_selection import train_test_split
def test_max_samples_consistency():
    # Make sure validated max_samples in iforest and BaseBagging are identical
    X = iris.data
    clf = IsolationForest().fit(X)
    assert clf.max_samples_ == clf._max_samples
def test_estimator_checks(test_fn):
    gmm_remover = OutlierRemover(outlier_detector=GMMOutlierDetector(), refit=True)
    test_fn(OutlierRemover.__name__, gmm_remover)

    isolation_forest_remover = OutlierRemover(outlier_detector=IsolationForest(), refit=True)
    test_fn(OutlierRemover.__name__, isolation_forest_remover)
                # Use the BroThon DataframeToMatrix class
                features = [
                    'Z', 'rejected', 'proto', 'query', 'qclass_name',
                    'qtype_name', 'rcode_name', 'query_length', 'id.resp_p'
                ]
                to_matrix = dataframe_to_matrix.DataFrameToMatrix()
                bro_matrix = to_matrix.fit_transform(bro_df[features])
                print(bro_matrix.shape)

                # Print out the range of the daterange and some stats
                print('DataFrame TimeRange: {:s} --> {:s}'.format(
                    str(bro_df['ts'].min()), str(bro_df['ts'].max())))

                # Train/fit and Predict anomalous instances using the Isolation Forest model
                odd_clf = IsolationForest(
                    contamination=0.01)  # Marking 1% as odd
                odd_clf.fit(bro_matrix)

                # Now we create a new dataframe using the prediction from our classifier
                # (.copy() avoids a SettingWithCopyWarning when the 'cluster' column is added below)
                odd_df = bro_df[odd_clf.predict(bro_matrix) == -1].copy()

                # Now we're going to explore our odd observations with help from KMeans
                num_clusters = min(
                    len(odd_df),
                    10)  # 10 clusters unless we have less than 10 observations
                odd_matrix = to_matrix.fit_transform(odd_df[features])
                odd_df['cluster'] = KMeans(
                    n_clusters=num_clusters).fit_predict(odd_matrix)
                print(odd_matrix.shape)

                # Now group the dataframe by cluster
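
                # The snippet is truncated here; a plausible grouping step
                # (an assumption, following the usual zat/BroThon notebook pattern):
                cluster_groups = odd_df[features + ['cluster']].groupby('cluster')
                for key, group in cluster_groups:
                    print('Cluster {}: {} observations'.format(key, len(group)))
                    print(group.head())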
Example #8
def test_iforest_deprecation():
    iforest = IsolationForest(behaviour='new')
    warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24"
    with pytest.warns(DeprecationWarning, match=warn_msg):
        iforest.fit(iris.data)
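
# Note (not part of the original test): `behaviour` only warns in scikit-learn
# 0.22/0.23 and was removed in 0.24, so current code simply omits it.
from sklearn.datasets import load_iris

iforest_modern = IsolationForest()
iforest_modern.fit(load_iris().data)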
Example #9
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest


def train(train_data_res,
          max_limit=0.005,
          nestimators=100,
          contamination=0.01,
          max_output=30):
    outlier_dict = dict()  # empty collection of anomalous users
    heavy_point = dict()  # collection of centroids
    for i in range(len(train_data_res)):
        train_data_res[i] = train_data_res[i][[
            'ip', 'qps', 'impcount', 'notexsitudidcount', 'imppercent',
            'uidcount_udidcount', 'allcountpercent'
        ]]
    dfd = train_data_res
    # count how many time slots need to be examined
    amount = len(train_data_res)
    max_col = len(train_data_res[0].columns)
    for a in range(amount):
        # split the data set using Chebyshev's inequality
        # drop non-numeric identifier columns such as ip
        DF_G = dfd[a].iloc[:, 1:max_col]
        # keep a copy of the original variables
        DF_G_B = DF_G
        DF_G_B_T = np.array(DF_G_B).T
        DF_G_B = np.array(DF_G_B)
        DF_G_B_T_new = DF_G_B_T
        DF_G_B_new = DF_G_B
        # compute the covariance matrix and its inverse
        S = np.cov(DF_G_B_T_new)
        S_f = pd.DataFrame(S)
        # if any column of the covariance matrix is all zeros, drop that feature and recompute the covariance
        if len(S_f[(S_f == 0).all()].index) > 0:
            while len(S_f[(S_f == 0).all()].index) > 0:
                badindex = S_f[(S_f == 0).all()].index
                indexnew = [
                    i for i in range(S_f.shape[0]) if i not in badindex
                ]
                # if fewer than two independent variables remain, the test cannot
                # run; skip anomaly detection and return no anomalous users
                if len(indexnew) <= 1:
                    print(
                        'error: the input data does not satisfy the limitation!')
                S_f = pd.DataFrame(
                    np.cov(pd.DataFrame(DF_G_B_new[:, indexnew].T))).copy()
            # if two columns of the covariance matrix are identical, drop one of them
            badindex1 = []
            for i in range(S_f.shape[0]):
                for j in range(S_f.shape[0]):
                    if j > i:
                        if sum(S_f[i] - S_f[j]) == 0:
                            badindex1.append(j)
                            # drop the duplicated (correlated) vector
            if len(badindex1) > 0:
                indexnew1 = [
                    i for i in range(S_f.shape[0]) if i not in badindex1
                ]
                S_f = pd.DataFrame(
                    np.cov(
                        pd.DataFrame(
                            DF_G_B_new[:, indexnew][:, indexnew1].T))).copy()
                DF_G_B_new = DF_G_B_new[:, indexnew][:, indexnew1]
                DF_G_B_T_new = np.array(DF_G_B_new).T
                t_indexnew = indexnew
                t_indexnew.insert(0, -1)
                t_indexnew = list(np.array(t_indexnew) + 1)
                t_indexnew1 = indexnew1
                t_indexnew1.insert(0, -1)
                t_indexnew1 = list(np.array(t_indexnew1) + 1)
                train_data_res[a] = train_data_res[
                    a].iloc[:, t_indexnew].iloc[:, t_indexnew1]
            else:
                DF_G_B_new = DF_G_B_new[:, indexnew]
                DF_G_B_T_new = np.array(DF_G_B_new).T
                t_indexnew = indexnew
                t_indexnew.insert(0, -1)
                t_indexnew = list(np.array(t_indexnew) + 1)
                train_data_res[a] = train_data_res[a].iloc[:, t_indexnew]
            S = S_f
            # compute the inverse matrix
        SI = np.linalg.inv(S)
        # centroid of the full data set
        DF_G_B_T_heavypoint = DF_G_B_T_new.mean(axis=1)
        d1 = []
        n = DF_G_B_new.shape[0]
        # compute the Mahalanobis distance used in Chebyshev's inequality
        for i in range(n):
            delta = DF_G_B_new[i] - DF_G_B_T_heavypoint
            d = np.sqrt(np.dot(np.dot(delta, SI), delta.T))
            d1.append(d)
        # anomalous-user set, initial screening at 0.5%
        d2 = pd.Series(d1)
        N = DF_G_B_new.shape[1]
        pr = max_limit
        limit = np.sqrt(N / pr)
        outlier = d2[d2 > limit]
        # guard against an empty anomaly set: require at least 20 anomalous users and collect them in outlier_data
        while len(outlier) < max(int(0.01 * n), 20):
            pr = pr + 0.005
            limit = np.sqrt(N / pr)
            outlier = d2[d2 > limit]
        index = outlier.index
        outlier_data = DF_G_B_new[index]
        outlier_data = np.array(
            [I for I in outlier_data if ((I - DF_G_B_T_heavypoint) > 0).any()])
        # guard against an empty unknown set: split unknown from normal users, require at least 50 unknown users, and collect them in outlier_data1
        N1 = N
        pr1 = pr + 0.005
        limit1 = np.sqrt(N1 / pr1)
        outlier1 = d2[d2 > limit1]
        while len(outlier1) < max(int(0.15 * n), 50):
            pr1 = pr1 + 0.005
            limit1 = np.sqrt(N1 / pr1)
            outlier1 = d2[d2 > limit1]
        index1 = outlier1.index
        index1 = [I for I in index1 if I not in index]
        # outlier_data1 holds the unknown users
        outlier_data1 = DF_G_B_new[index1]
        outlier_data1 = np.array([
            I for I in outlier_data1 if ((I - DF_G_B_T_heavypoint) > 0).any()
        ])
        if len(outlier_data1) == 0:
            print('no unknown users')
        # normal users = all users minus anomalous and unknown users
        index2 = [i for i in range(n) if i not in index and i not in index1]
        common_data = DF_G_B_new[index2]
        heavy_point[a] = common_data.mean(axis=0)
        # if there are no unknown users, use the anomalous users alone in place of unknown + anomalous
        if len(outlier_data1) == 0:
            train = common_data
        else:
            train = np.r_[common_data, outlier_data]  # merge the training data
        # contamination: estimated fraction of anomalous users; nestimators: number of trees
        clf = IsolationForest(n_estimators=nestimators,
                              contamination=contamination)
        clf.fit(train)
        # if there are no unknown users, score only the anomalous users
        if len(outlier_data1) == 0:
            pre_format_train_data = outlier_data
        else:
            pre_format_train_data = np.r_[outlier_data, outlier_data1]
        # compute anomaly scores and sort by them
        Score = clf.decision_function(pre_format_train_data)
        K = np.c_[Score, pre_format_train_data]
        k_rank = np.array(sorted(K, key=lambda x: x[0]))
        # keep the top-ranked points; contamination is initially set to 1%
        assume_rate = np.ceil(pre_format_train_data.shape[0] * contamination *
                              3)
        # cap the output at max_output flagged users
        if assume_rate >= max_output:
            assume_rate = max_output
        outlier_data2 = k_rank[:int(assume_rate)]
        outlier_dict[a] = outlier_data2
        print("the no. %s time is ready" % a)
        print('the amount of this time is %s' % len(outlier_dict[a]))
    return outlier_dict, heavy_point, train_data_res
Example #10
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler

url ="C:/Users/Βασίλης/IdeaProjects/MyThesisApp/Data sets/Crude_Oil_Prices_Brent.csv"
dataset = pd.read_csv(url)
data = dataset[['Value']]
outliers_fraction = 0.05
scaler = StandardScaler()
np_scaled = scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)
# train isolation forest
model = IsolationForest(behaviour='new', max_samples=100, random_state=1,
                        contamination=outliers_fraction)
model.fit(data)
dataset['anomaly2'] = pd.Series(model.predict(data))
# visualization
fig2, ax2 = plt.subplots(figsize=(10,6))

a = dataset.loc[dataset['anomaly2'] == -1, ['Date', 'Value']] #anomaly
a.to_csv('isolationanomaliesoil.csv')
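
# fig2/ax2 above are created but never drawn on in this snippet; a minimal
# sketch of the usual visualization step (an assumption, not the original code):
ax2.plot(dataset.index, dataset['Value'], color='blue', label='value')
ax2.scatter(a.index, a['Value'], color='red', label='anomaly')
ax2.legend()
plt.show()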
Example #11
                ax=ax, orient='h', dodge=False)
    ax.set_xscale("log")
    fig.subplots_adjust(left=0.2)
    plt.show()


#show_important_features(perm_result, X)

# Outliers
# --------------------------------
from sklearn.ensemble import IsolationForest

#rng = np.random.RandomState(42)
corrs = train.corrwith(train['SalePrice'],
                       method='spearman').sort_values(ascending=False)
clf = IsolationForest(max_samples=200, random_state=42, contamination=0.01)
# Xtrain = ct.fit_transform(X)
Xtrain = train[corrs.index]

# clrs = ["blue" if y == 1 else "red" for y in pred]
markers = {1: ".", -1: "X", -2: "X"}
# palette = sns.color_palette()
# plt.scatter(train['GrLivArea'], train['SalePrice'], s=6, color=clrs, marker=".")

imp = SimpleImputer(strategy="median")
Xtrain = imp.fit_transform(Xtrain)
clf.fit(Xtrain)
pred = clf.predict(Xtrain)
y = clf.decision_function(Xtrain)

fig, axs = plt.subplots(3, 3, num="outliers", figsize=(10, 10), sharey=True)
Example #12
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Generate training data
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]

# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]

# Generate some abnormal observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# Fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# Plot the line, samples and nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Isolation Forest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green', s=20, edgecolor='k')
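
# The upstream scikit-learn example continues roughly like this (reconstructed,
# so treat it as an assumption rather than part of the original snippet):
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red', s=20, edgecolor='k')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([b1, b2, c],
           ["training observations", "new regular observations",
            "new abnormal observations"],
           loc="upper left")
plt.show()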
Example #13
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_boston
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # not sklearn.preprocessing.data, which is a private module
from sklearn.ensemble import IsolationForest

bostondata = load_boston()  # load the Boston dataset
boston_X = bostondata.data
boston_y = bostondata.target
boston_fulldata = np.c_[boston_X, boston_y]
boston_scaler = IsolationForest(contamination='auto',
                                behaviour='new',
                                random_state=66)
boston_scaler.fit(boston_fulldata)
datacondition = boston_scaler.predict(boston_fulldata)
number = sum(datacondition == 1)
boston_X_clean = np.zeros((number, boston_X.shape[1]))
boston_y_clean = np.zeros((number, 1))
j = 0
for i in range(0, datacondition.shape[0], 1):
    if datacondition[i] == 1:
        boston_X_clean[j] = boston_X[i]
        boston_y_clean[j] = boston_y[i]
        j += 1
boston_X = boston_X_clean
boston_y = boston_y_clean

scale_boston = StandardScaler()  # standardization
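
# The snippet ends here; a hedged sketch (an assumption, not the original code)
# of how the imports above are typically used next: scale the cleaned data,
# split it, and fit the LinearRegression.
boston_X_scaled = scale_boston.fit_transform(boston_X)
X_train, X_test, y_train, y_test = train_test_split(
    boston_X_scaled, boston_y, test_size=0.2, random_state=66)
lr = LinearRegression()
lr.fit(X_train, y_train)
print("R^2 on the held-out split:", lr.score(X_test, y_test))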
Example #14
Data
++++

A simple example.
"""
from onnx.defs import onnx_opset_version
from skl2onnx import to_onnx
import numpy
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=100, n_features=2)

model = IsolationForest(n_estimators=3)
model.fit(X)
labels = model.predict(X)

fig, ax = plt.subplots(1, 1)
for k in (-1, 1):
    ax.plot(X[labels == k, 0], X[labels == k, 1], 'o', label="cl%d" % k)
ax.set_title("Sample")

#######################################
# ONNX
# ++++

onx = to_onnx(model,
              X[:1].astype(numpy.float32),
              target_opset={
https://towardsdatascience.com/anomaly-detection-with-isolation-forest-visualization-23cd75c281e2
"""

import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

x_all = pd.read_csv("X_all_train_wo_OS.csv")

y_all = pd.read_csv("y_train_wo_OS.csv")

#outlier detection using IsolationForest
to_model_columns = x_all.columns[3:46]

clf=IsolationForest(n_estimators=100, max_samples='auto', contamination=float(.1), \
                        max_features = 43, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
clf.fit(x_all[to_model_columns])
pred = clf.predict(x_all[to_model_columns])
x_all['anomaly'] = pred
outliers = x_all.loc[x_all['anomaly'] == -1]
outlier_index = list(outliers.index)

#Find the number of anomalies and normal points; points classified as -1 are anomalous
print(x_all['anomaly'].value_counts())

# matching anomalies to y-dataset
y_all['anomaly'] = 1

for index in outlier_index:
    y_all.loc[index, 'anomaly'] = -1  # .loc writes back; chained iloc[...][...] assignment would not
Example #16
datasets3D = [X_lin, X_hex, X_sph, X_gau, X_misaligned]


# define to data label: y_true
y_true = np.concatenate([np.ones(n_inliers), -np.ones(n_outliers)], axis=0)
# label 1 as inliers, -1 as outliers

# define algorithm to be compared
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
    (
        "Isolation Forest (IF)",
        IsolationForest(
            n_estimators=500,
            behaviour="new",
            contamination=outliers_fraction,
            random_state=42,
        ),
    ),
    (
        "Local Outlier Factor",
        LocalOutlierFactor(
            n_neighbors=35, contamination=outliers_fraction, novelty=True
        ),
    ),
]

# plt.figure(figsize=(13.5, 15))
plt.figure(figsize=((len(anomaly_algorithms) + 1) * 2.5 + 1, len(datasets3D) * 2.5 + 1))
plt.subplots_adjust(
    left=0.02, right=0.98, bottom=0.001, top=0.98, wspace=0.05, hspace=0.01
Example #17
for num in n_list:
    temp_data = anomaly_data.sample(n=num, random_state=42)

    print("\nData Construction")
    train_data = temp_data.drop(['Label'], axis=1)
    print("Data Shape : %s" % str(train_data.shape))

    # split the data
    train, test = train_test_split(train_data,
                                   test_size=0.25,
                                   random_state=42,
                                   shuffle=True)

    print("\nIsolation Forest")
    iforset = IsolationForest(max_samples=100,
                              contamination=0.1,
                              random_state=42)
    iforset.fit(train)
    iforset_pred_test = iforset.predict(test)
    iforse_pred = iforset.predict(unknown_data)

    for num in range(len(iforse_pred)):
        if iforse_pred[num] == unknown_label['Label'][num]:
            iforse_pred[num] = 1
        else:
            iforse_pred[num] = 0

    print("테스트 정확도:",
          list(iforset_pred_test).count(1) / iforset_pred_test.shape[0])
    print("예측 정확도:", list(iforse_pred).count(1) / iforse_pred.shape[0])
# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [("Robust covariance",
                       EllipticEnvelope(contamination=outliers_fraction)),
                      ("One-Class SVM",
                       svm.OneClassSVM(nu=outliers_fraction,
                                       kernel="rbf",
                                       gamma=0.1)),
                      ("Isolation Forest",
                       IsolationForest(behaviour='new',
                                       contamination=outliers_fraction,
                                       random_state=42)),
                      ("Local Outlier Factor",
                       LocalOutlierFactor(n_neighbors=35,
                                          contamination=outliers_fraction))]

# Define datasets
# blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
# datasets = [
#     make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,
#                **blobs_params)[0],
#     make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5],
#                **blobs_params)[0],
#     make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
#                **blobs_params)[0],
#     4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
# X, data, target, frac and fraud are assumed to be defined earlier in the script
Y = data[target]

print(X.shape)
print(Y.shape)

from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

#define the random state
state = 1

#define the outlier detection methods
classifiers = {
    "Isolation Forest":
    IsolationForest(max_samples=len(X), contamination=frac,
                    random_state=state),
    "Local Outlier Factor":
    LocalOutlierFactor(n_neighbors=20, contamination=frac)
}

#fit the model
n_outliers = len(fraud)

for i, (clf_name, clf) in enumerate(classifiers.items()):

    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
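
        # A hedged continuation (an assumption): the snippet is cut off here;
        # the usual credit-card-fraud tutorial pattern finishes the loop like this.
        y_pred = clf.predict(X)

    # remap predictions so that 0 = valid and 1 = fraud, then report errors
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    print('{}: {} errors'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))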
Example #20
import os
import time
from datetime import timedelta

import joblib
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler


# haralick_features() and hsv_features() are assumed to be defined elsewhere
def train(src=None):
    """Trains the classifier and saves it on a file.

    This method elaborates the training dataset to extract haralick's and color features.

    The dataset is first normalized using Robust Scaler, then processed
    with PCA and finally used to train the classifier.

    The scaler, the PCA object and the classifier are saved on a file.

    Parameters
    ----------
    src : str, optional (default = "/training")
        The path to the directory which contains the training dataset
    """
    # Definition of the path
    if src is None:
        training_path = os.getcwd() + "/training"
    else:
        training_path = src

    # Create the list of the training files
    training_filenames = os.listdir(training_path)

    # Extract haralick features
    print("Extracting haralick features...")
    start_time = time.monotonic()
    haralick_training = haralick_features(training_path, training_filenames)
    end_time = time.monotonic()
    ex_time = timedelta(seconds=end_time - start_time)
    print("Completed in " + str(ex_time) + " seconds.")

    # Extract color features
    print("Extracting color features...")
    start_time = time.monotonic()
    hsv_training = hsv_features(training_path, training_filenames)
    end_time = time.monotonic()
    ex_time = timedelta(seconds=end_time - start_time)
    print("Completed in " + str(ex_time) + " seconds.")

    # Concatenate the lists to create a single feature vector per file
    training_set = dict()
    for file in training_filenames:
        training_set[file] = np.concatenate(
            (haralick_training[file], hsv_training[file]))

    # Normalize the dataset with Robust Scaler and save the scaler on file
    print("Scaling the training set...")
    scaler = RobustScaler()
    scaler.fit(list(training_set.values()))
    joblib.dump(scaler, 'scaler.pkl')
    scaled_training = scaler.transform(list(training_set.values()))
    for file, scaled in zip(training_filenames, scaled_training):
        training_set[file] = scaled
    print("Scaling completed. Scaler saved on " +
          os.path.join(os.getcwd(), "scaler.pkl"))

    # Transform the dataset using PCA
    print("Started PCA processing...")
    pca = PCA()
    pca.fit(list(training_set.values()))
    print(os.path.join(os.getcwd(), "pca.pkl"))
    joblib.dump(pca, 'pca.pkl')
    pca_training = pca.transform(list(training_set.values()))
    for file, pca_value in zip(training_filenames, pca_training):
        training_set[file] = pca_value
    print("PCA transform complete. Fit PCA saved on " +
          os.path.join(os.getcwd(), "pca.pkl"))

    # Train the classifier
    print("Training the classifier...")
    if_clf = IsolationForest()
    if_clf.fit(list(training_set.values()))
    print("Classifier is ready!")

    # Save the classifier to file
    joblib.dump(if_clf, 'clf.pkl')
    print("Classifier saved on " + os.path.join(os.getcwd(), "clf.pkl"))
Example #21
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor, IsolationForest

pd.set_option('display.max_columns', 60)
np.set_printoptions(threshold=np.inf)  # np.nan is no longer accepted as a threshold
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
isf = IsolationForest(max_samples=0.25,
                      random_state=11,
                      contamination=0.15,
                      n_estimators=1000,
                      n_jobs=-1)

dataTrain = pd.read_excel('data/Train111.xlsx')
dataTrain = dataTrain.drop('ID', axis=1)
dataTest = pd.read_excel('data/Test.xlsx')
dataTest = dataTest.drop('ID', axis=1)
# print(dataTrain.corr())
# MinMaxScaler().fit_transform(dataTrain)
first_quartile = dataTrain['TARGET'].describe()['25%']
second_quartile = dataTrain['TARGET'].describe()['50%']
third_quartile = dataTrain['TARGET'].describe()['75%']
#
# # Interquartile range
# iqr = third_quartile - first_quartile
#
# # Remove outliers

dataTrain = dataTrain[(dataTrain['TARGET'] < second_quartile)]
Example #22
Y = dataset[target]
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.25)
# print the shapes of X and Y
#print(X_train)
#print(Y_train)
print(X.shape)
print(Y.shape)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
# define a random state
state = 1
# define the outlier detection methods
classifiers = {
    "Isolation Forest": IsolationForest (max_samples=len (X),
                                         contamination=outlier_fraction,
                                         random_state=state),
    "Local Outlier Factor": LocalOutlierFactor (
        n_neighbors=20,#nearest neighbours discovering
        contamination=outlier_fraction)
}
# fit the model using the length of the fraud transactions.
n_outliers = len (fraud)
for i, (clf_name, clf) in enumerate (classifiers.items ()):

    # fit the data and tagging  outlier functions
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict (X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit (X)
def test_recalculate_max_depth():
    """Check max_depth recalculation when max_samples is reset to n_samples"""
    X = iris.data
    clf = IsolationForest().fit(X)
    for est in clf.estimators_:
        assert est.max_depth == int(np.ceil(np.log2(X.shape[0])))
        model = GaussianNBFit(train, 2, 0.01)
        start = time.time()
        model.fit(beta=1.)
        end = time.time()
        fit_time = end - start
    elif parsed_args.model == 'iforest':
        #impute features with mean
        train, test = handle_missing_values(train, test)
        from sklearn.ensemble import IsolationForest
        model_features = list(train.columns)
        model_features.remove('is_anomaly')
        num_features = len(model_features)

        model = IsolationForest(behaviour='new',
                                n_estimators=100,
                                contamination=0.1,
                                n_jobs=-1,
                                max_features=0.1,
                                max_samples=0.05)
        start = time.time()
        X_train = train[train['is_anomaly'] == 0][model_features].values
        model.fit(X_train)
        end = time.time()
        fit_time = end - start
    elif parsed_args.model == 'autoencoder':
        from pyod.models.auto_encoder import AutoEncoder
        # one hot encoded features
        le = LabelEncoder()
        le.fit(train['super_department_id'].values)
        enc = OneHotEncoder()
        one_hot_train = enc.fit_transform(
            le.transform(train['super_department_id'].values).reshape(-1, 1))
for filetype in df_test.files.unique():
    X_test.append(scaler.fit_transform(np.asarray(df_test[['accTotal','gyrTotal']][df_test.files == filetype])))
    Y_test.append(np.asarray([significance[filetype -1] for i in range(X_test[filetype-1].shape[0])]))
    
X_train = scaler.fit_transform(np.asarray(df[['accTotal','gyrTotal']]))


# find the best model
""" this process takes a long time; to skip it, comment out this line and the iForest line
below it, and uncomment the "speeding process up" block """
parameters = findBestModel(X_train,X_test,Y_test)


# check visually what the best model looks like

iForest = IsolationForest(n_estimators = 100, max_features = parameters[0],\
                          contamination = parameters[1], random_state = 0).fit(X_train)

# for speeding process up
#parameters = [1, 0.058, 5.8]
#iForest = IsolationForest(n_estimators = 100, max_features = parameters[0],\
#                         contamination = parameters[1], random_state = 0).fit(X_train)


output = iForest.predict(X_train)
true_false = []
                
for item in output:
    if item == 1:
        true_false.append(False)
    else:
        true_false.append(True)
Example #26
def apply_isolation_forsest(X):
    clf = IsolationForest(random_state=rng, contamination=0.1)
    clf.fit(X)
    res = clf.predict(X)
    return res
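
# A small usage sketch (assumed, not from the original). `rng` is not defined in
# the snippet above, so we create one here before calling the function.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
X_demo = rng.randn(200, 4)
print(apply_isolation_forsest(X_demo))  # array of 1 (inlier) / -1 (outlier)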
Example #27
# load_dataset() is assumed to be defined earlier in the script
import argparse
import pickle

from sklearn.ensemble import IsolationForest

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d",
                "--dataset",
                required=True,
                help="path to dataset of images")
ap.add_argument("-m",
                "--model",
                required=True,
                help="path to output anomaly detection model")
args = vars(ap.parse_args())

# load and quantify our image dataset
print("[INFO] preparing dataset...")
data = load_dataset(args["dataset"], bins=(3, 3, 3))

# train the anomaly detection model
print("[INFO] fitting anomaly detection model...")
model = IsolationForest(
    behaviour='new',
    n_estimators=100,
    contamination=0.01,
    random_state=42,
)
model.fit(data)

# serialize the anomaly detection model to disk
f = open(args["model"], "wb")
f.write(pickle.dumps(model))
f.close()
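
# A hedged test-time sketch (an assumption, not part of the original script):
# reload the serialized model and score one image. quantify_image() below is a
# minimal stand-in for whatever per-image feature extractor load_dataset() uses
# (a flattened, normalized 3D HSV color histogram).
import cv2

def quantify_image(image, bins=(3, 3, 3)):
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,
                        [0, 180, 0, 256, 0, 256])
    return cv2.normalize(hist, hist).flatten()

loaded_model = pickle.loads(open(args["model"], "rb").read())
query = cv2.imread("examples/query.jpg")  # hypothetical test image path
features = quantify_image(query)
pred = loaded_model.predict([features])[0]
print("[INFO] anomaly" if pred == -1 else "[INFO] normal")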
Example #28
    pipeline = Pipeline([("mapper", mapper), ("estimator", iforest)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    customize(iforest, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    outlier = DataFrame(pipeline.predict(housing_X) == -1,
                        columns=["outlier"
                                 ]).replace(True,
                                            "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1),
              name + ".csv")


build_iforest_housing(IsolationForest(random_state=13),
                      "IsolationForestHousing")


def build_ocsvm_housing(svm, name):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("scaler", MaxAbsScaler()),
                         ("estimator", svm)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    store_pkl(pipeline, name + ".pkl")
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    outlier = DataFrame(pipeline.predict(housing_X) <= 0,
                        columns=["outlier"
                                 ]).replace(True,
########################feature selection############################
dataset = SelectKBest(chi2, k=int(0.6*len(dataset[0]))).fit_transform(dataset, label_dataset)
print(np.shape(dataset))
#model = ExtraTreesClassifier()
#model.fit(dataset, label_dataset)
#imp=list(model.feature_importances_)
#for i in range(0,len(dataset)):
#    dataset[i] = [x for _,x in sorted(zip(imp,dataset[i]),reverse=True)]
#    dataset[i]=dataset[i][:1000]

np.shape(test_dataset)

#######################ISOLATION FOREST outliers detection############################
rng = np.random.RandomState(42)
clf = IsolationForest(behaviour='new', max_samples=200,random_state=rng, contamination='auto')
clf.fit(dataset)
outliers = clf.predict(dataset)

##########################LOCAL OUTLIER FACTOR#######################################
# (note: despite the banner above, this block actually refits an IsolationForest via fit_predict)
clf = IsolationForest()
outliers = clf.fit_predict(dataset)

final_data=[]
final_label=[]
for i in range(0,len(list(outliers))):
    if outliers[i]==1:
        final_data.append(dataset[i])
        final_label.append(label_dataset[i])
#dataset=copy.deepcopy(final_data)
#label_dataset=copy.deepcopy(final_label)
Example #30
    test = []
    test, train = get_random_test_and_train_sample(data_dict, key, train_size,
                                                   test_size)
    train_and_test_data[key]["train"] = train
    train_and_test_data[key]["test"] = test

end = time.time()
duration = end - start
print(duration)
"""    
params = {'max_features': 0.1, 'max_samples': 5,
            'n_estimators': 300, 'n_jobs': -1, 'random_state': 11,'behaviour':'new'}
            """
# note: recent scikit-learn versions require contamination to be 'auto' or a value in (0, 0.5]
classifier = IsolationForest(contamination=0,
                             max_features=1,
                             max_samples=500,
                             n_estimators=3000,
                             n_jobs=-1,
                             random_state=11)

from sklearn import metrics
import matplotlib.pyplot as plt


def get_eer(tpr, fpr):
    eer = 100
    for i in range(len(tpr)):
        if 1 - tpr[i] < fpr[i]:
            eer = ((1 - tpr[i]) + (1 - tpr[i - 1]) + fpr[i] + fpr[i - 1]) / 4
            break
    #print(eer)
    return eer
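
# A small usage sketch (assumed, not part of the original): feed ROC points from
# sklearn.metrics.roc_curve into get_eer(); the labels/scores here are synthetic.
import numpy as np

y_true_demo = np.array([1, 1, 1, 1, 0, 0, 0, 0])
scores_demo = np.array([0.9, 0.8, 0.7, 0.35, 0.6, 0.2, 0.15, 0.1])
fpr_demo, tpr_demo, _ = metrics.roc_curve(y_true_demo, scores_demo)
print(get_eer(tpr_demo, fpr_demo))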