Example no. 1
def detect_anomaly(df):
	x_values = df.index.values.reshape(-1, 1)  # currently unused
	y_values = df.change.values.reshape(-1, 1)
	clf = KNN()
	clf.fit(y_values)
	df["label_knn"] = clf.predict(y_values)  # outlier labels (0: inlier, 1: outlier)
	df["score_knn"] = clf.decision_function(y_values).round(4)  # outlier scores
	return df
Example no. 2
def detect_anomaly(df):
	clf = KNN()
	x_values = df.change.values.reshape(-1, 1)  # currently unused
	y_values = df.change.values.reshape(-1, 1)
	clf.fit(y_values)
	df["out_label"] = clf.predict(y_values)  # outlier labels (0 or 1)
	df["out_score"] = clf.decision_function(y_values)  # raw outlier scores
	return df
Example no. 3
def knn(X_train, y_train=None, X_test=None, y_test=None):
    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores
    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    #
    # # evaluate and print the results
    # print("\nOn Training Data:")
    # evaluate_print(clf_name, y_train, y_train_scores)
    # print("\nOn Test Data:")
    # evaluate_print(clf_name, y_test, y_test_scores)
    #
    # visualize the results
    visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
              y_test_pred,
              show_figure=True,
              save_figure=False)

    return y_train_pred, y_train_scores
Example no. 4
def obj_func_kNN(params):
    ## objective function used in Bayesian optimization
    outlier_fraction = params[0]
    n_neighbors = params[1]
    method = params[2]
    radius = params[3]

    # load data set to function work space
    Y_train = np.load('Y_train.npy')
    X_train = np.load('X_train.npy')

    # create model
    clf = KNN(contamination=outlier_fraction,
              n_neighbors=n_neighbors,
              method=method,
              radius=radius)
    # fit the dataset to the model
    clf.fit(X_train)

    scores_pred = clf.decision_function(
        X_train) * -1  # predict raw anomaly score
    Rprecision = Rprecision_f(Y_train, scores_pred)
    if glb_verbose:
        print('R Precision : ', Rprecision)

    y_pred = clf.predict(
        X_train)  # prediction of a datapoint category outlier or inlier
    objVal = objVal_f(Rprecision, y_pred, Y_train)

    return objVal
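
The comment above notes that obj_func_kNN is meant as a Bayesian-optimization objective; since it takes its hyper-parameters as a positional list, it can be handed directly to scikit-optimize. A minimal, hypothetical sketch (the search space below is illustrative, not taken from the original project, and it assumes X_train.npy/Y_train.npy plus Rprecision_f, objVal_f and glb_verbose are defined as above):

from skopt import gp_minimize
from skopt.space import Categorical, Integer, Real

# illustrative search space matching params[0..3] of obj_func_kNN
search_space = [
    Real(0.01, 0.3, name='outlier_fraction'),
    Integer(3, 50, name='n_neighbors'),
    Categorical(['largest', 'mean', 'median'], name='method'),
    Real(0.5, 2.0, name='radius'),
]

# gp_minimize passes each candidate as a list, which is exactly how obj_func_kNN unpacks it
result = gp_minimize(obj_func_kNN, search_space, n_calls=30, random_state=0)
print('best params:', result.x, 'best objective value:', result.fun)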
Example no. 5
def pyodtry():
    dfwhole = df_en_all
    df = dff2
    X1 = reduce(dfwhole)
    X2 = reduce(df)
    ddf = pd.read_pickle('LogFileDfs/original')

    random_state = np.random.RandomState(42)
    outliers_fraction = 0.005
    clf = KNN(method='mean', contamination=outliers_fraction)
    xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))

    clf.fit(X1)
    scores_pred = clf.decision_function(X2) * -1
    y_pred = clf.predict(X2)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)
    print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers)
    #dfx = pdf
    #dfx['outlier'] = y_pred.tolist()
    df['authenticated?'] = y_pred.tolist()
    ddf['authenticated?'] = df['authenticated?']
    output = ddf[ddf['authenticated?'] == 1]
    # create sqlalchemy engine
    #engine = create_engine("mysql+pymysql://{user}:{pw}@172.17.0.3/{db}".format(user="******",pw="richul123",db="emss"))
    # Insert whole DataFrame into  MySQL
    #output.to_sql('output', con = engine, if_exists = 'replace', chunksize = 1000)
    with pd.ExcelWriter(
            '/home/richul/Documents/EnhancingMailServerSecurity/Output/output.xlsx'
    ) as writer:
        output.to_excel(writer, sheet_name='output')
Example no. 6
    def get_all_readings_from_person(self,
                                     person_tag,
                                     remove_outliers=0,
                                     additional_where=""):
        #Debug.print_debug(self.file_path)
        print(self.file_path)
        dataset = sqlite3.connect(self.file_path)
        if len(additional_where) > 0:
            to_return = self.get_data_sql_query(
                "select {} from {} where {} like '{}' {}".format(
                    ', '.join(self.features), self.table_name,
                    self.person_column, person_tag, additional_where), dataset)
        else:
            to_return = self.get_data_sql_query(
                "select {} from {} where {} like '{}'".format(
                    ', '.join(self.features), self.table_name,
                    self.person_column, person_tag), dataset)
        self.data = to_return
        if (remove_outliers > 0):
            knn = KNN(contamination=remove_outliers)
            to_return_aux = to_return.copy()
            to_return_aux = to_return_aux.drop(self.label_tag, axis=1)
            knn.fit(to_return_aux)
            pred = knn.predict(to_return_aux)
            to_return = to_return.iloc[np.where(pred == 0)[0], :]

        return to_return
Example no. 7
def calculate(method, total_roc, total_prn, x_train, x_test, y_train, y_test):
    if method == 'KNN':
        clf = KNN()
    elif method == 'CBLOF':
        clf = CBLOF()
    elif method == 'PCA':
        clf = PCA()
    else:
        clf = IForest()
    clf.fit(x_train)  # train the detector clf on x_train

    # outlier labels and scores on the training data x_train
    y_train_pred = clf.labels_  # labels on the training data (0: inlier, 1: outlier)
    y_train_scores = clf.decision_scores_  # outlier scores on the training data (higher = more anomalous)
    print("On train Data:")
    evaluate_print(method, y_train, y_train_scores)

    # use the trained clf to predict outliers in unseen data
    y_test_pred = clf.predict(x_test)  # labels on the test data (0: inlier, 1: outlier)
    y_test_scores = clf.decision_function(x_test)  # outlier scores on the test data (higher = more anomalous)
    print("On Test Data:")
    evaluate_print(method, y_test, y_test_scores)

    y_true = column_or_1d(y_test)
    y_pred = column_or_1d(y_test_scores)
    check_consistent_length(y_true, y_pred)

    roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
    prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)

    total_roc.append(roc)
    total_prn.append(prn)
Example no. 8
def median_knn(X_train, X_test, Y_train, Y_test):
    from pyod.models.knn import KNN
    model = KNN(method='median')
    model.fit(X_train)
    pred = model.predict(X_test)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return (acc * 100)
Example no. 9
def model_test(model_type, y_train, y_test, X_train, X_test, model_file,
               save_flag):
    if model_type == 'KNN':
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)
    if model_type == 'XGBOD':
        clf_name = 'XGBOD'
        # set scale_pos_weight to sum(negative instances) / sum(positive instances)
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        clf.fit(X_train, y_train)
    if model_type == 'SOD':
        # train SOD detector
        # Note that SOD is meant to work in high dimensions d > 2.
        # But here we are using 2D for visualization purpose
        # thus, higher precision is expected in higher dimensions
        clf_name = 'SOD'
        clf = SOD()
        clf.fit(X_train)
    if model_type == 'VAE':
        # train VAE detector (Beta-VAE)
        clf_name = 'VAE'
        contamination = 0.01
        clf = VAE(epochs=30,
                  contamination=contamination,
                  gamma=0.8,
                  capacity=0.2)
        clf.fit(X_train)

    #save model if specified
    if save_flag == '1':
        pickle.dump(clf, open(model_file, "wb"))

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # visualize the results
    #todo: Input data has to be 2-d for visualization.
    #visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
    #         y_test_pred, show_figure=True, save_figure=False)

    return model_file
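
The scale_pos_weight comment above can be made concrete: instead of hard-coding 50, the ratio can be derived from the training labels. A small sketch with toy labels standing in for the y_train that model_test receives:

import numpy as np
from pyod.models.xgbod import XGBOD

# toy binary labels standing in for y_train (0: inlier, 1: outlier)
y_train_demo = np.array([0] * 95 + [1] * 5)

# scale_pos_weight = sum(negative instances) / sum(positive instances)
n_neg = int(np.sum(y_train_demo == 0))
n_pos = int(np.sum(y_train_demo == 1))
clf = XGBOD(random_state=42, scale_pos_weight=n_neg / max(n_pos, 1))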
def abnormal_KNN(train_npy, test_npy):
    clf_name = 'kNN'
    clf = KNN()
    train_npy = np.array(train_npy).reshape(-1, 1)
    clf.fit(train_npy)

    test_npy = np.array(test_npy).reshape(-1, 1)
    y_test_pred = clf.predict(test_npy)
    y_test_scores = clf.decision_function(test_npy)
    return y_test_pred
Example no. 11
def main(args):
    data = loadmat(args.filename)
    trainx, testx, trainy, testy = train_test_split(data['X'],
                                                    data['y'],
                                                    test_size=args.train_split,
                                                    random_state=2)
    valx, evalx, valy, evaly = train_test_split(testx, testy, test_size=0.5)
    data_size = len(trainx[0])
    encoder_neurons = [data_size, data_size // 2, data_size // 4]  # layer sizes must be integers
    clf = KNN()
    clf.fit(trainx)
    print("Results Validation KNN")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation KNN")
    print_metrics(evaly, clf.predict(evalx))

    clf = PCA(n_components=args.components)
    clf.fit(trainx)
    print("Results Validation PCA")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation PCA")
    print_metrics(evaly, clf.predict(evalx))

    clf = VAE(encoder_neurons=encoder_neurons,
              decoder_neurons=encoder_neurons[::-1],
              epochs=args.epochs,
              contamination=args.contamination,
              gamma=args.gamma,
              capacity=args.capacity)
    clf.fit(trainx)
    print("Results Validation VAE")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation VAE")
    print_metrics(evaly, clf.predict(evalx))
Example no. 12
class RemoveOutliers():

    def __init__(self):
        self.estimator = KNN()

    def _remove(self, X):
        preds = self.estimator.predict(X)  # 0: inlier, 1: outlier
        return X[preds == 0, :]  # keep only the rows predicted as inliers

    def fit(self,X,y=None):
        self.estimator.fit(X)
        return self

    def transform(self,X,y=None):
        return self._remove(X)
Example no. 13
def outlier_detection(dataset,
                      features=["distance", "average_speed", "average_acceleration",
                                "direction", "stopped"],
                      contamination=0.01, n_neighbors=5, method="mean",
                      metric="minkowski"):
    """
    Detect outliers based on pyod KNN.

    Note: the user may choose the contamination threshold, number of neighbors, method and metric.
    Three kNN scoring methods are supported:
        - largest: use the distance to the kth neighbor as the outlier score
        - mean (default): use the average distance to the k neighbors as the outlier score
        - median: use the median distance to the k neighbors as the outlier score

    :param dataset: DataFrame containing the features to analyse.
    :param features: list of feature columns to detect outliers upon.
    :param contamination: float in (0., 0.5), (default=0.01) the proportion of outliers in the data set.
    :param n_neighbors: int, (default=5) number of neighbors to use for k-neighbors queries.
    :param method: str, (default='mean') one of {'largest', 'mean', 'median'}.
    :param metric: string or callable, (default='minkowski') metric used for distance computation; any metric
        from scikit-learn or scipy.spatial.distance can be used.
    :return: the input DataFrame with an added/updated "outlier" column (1 = outlier, 0 = inlier).
    """
    clf = KNN(contamination=contamination,
              n_neighbors=n_neighbors,
              method=method,
              metric=metric)
    inp_data = dataset.loc[:, features]

    clf.fit(inp_data)
    scores_pred = clf.predict(inp_data)

    # note: the input DataFrame is modified in place below

    # Inserting column, with 1 if outlier, else 0
    if "outlier" in dataset:
        dataset["outlier"] = scores_pred
    else:
        dataset.insert(2, "outlier", scores_pred)
    return dataset
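
The docstring lists three kNN scoring methods (largest, mean, median). A minimal usage sketch on a small synthetic DataFrame, assuming the outlier_detection function above (and pyod's KNN) is in scope; the column names and data are purely illustrative:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy = pd.DataFrame(rng.randn(200, 2), columns=['distance', 'average_speed'])
toy.iloc[:3] += 8  # inject a few obvious outliers

for m in ('largest', 'mean', 'median'):
    labelled = outlier_detection(toy.copy(), features=['distance', 'average_speed'],
                                 contamination=0.02, n_neighbors=5, method=m)
    print(m, 'flagged', int(labelled['outlier'].sum()), 'points')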
Example no. 14
    def get_KNN_scores(dataframe,
                       cols,
                       outliers_fraction=0.01,
                       standardize=True):
        '''Takes a df, a list of selected column names, and outliers_fraction (default 0.01).

        Adds an 'outlier' label column (1 = outlier) to the df and stores it on CheckOutliers.df4.
        '''
        if standardize:
            #standardize selected variables
            minmax = MinMaxScaler(feature_range=(0, 1))
            dataframe[cols] = minmax.fit_transform(dataframe[cols])

        #Convert the selected columns to a numpy array in order to incorporate our algorithm
        arrays = []
        for row in cols:
            row = dataframe[row].values.reshape(-1, 1)
            arrays.append(row)
        X = np.concatenate((arrays), axis=1)

        #fit
        clf = KNN(contamination=outliers_fraction)
        clf.fit(X)

        # predict raw anomaly score
        scores_pred = clf.decision_function(X) * -1

        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        CheckOutliers.df4 = dataframe
        CheckOutliers.df4['outlier'] = y_pred.tolist()

        print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers, 'found with KNN')
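
A hedged usage sketch for the method above, assuming it sits on the CheckOutliers class referenced in its body and that pyod's KNN and sklearn's MinMaxScaler are imported at module level; the DataFrame and column names are made up for illustration:

import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
demo = pd.DataFrame({'amount': rng.exponential(100, 500),
                     'duration': rng.normal(30, 5, 500)})

# adds an 'outlier' column (1 = outlier) in place and stores the frame on CheckOutliers.df4
CheckOutliers.get_KNN_scores(demo, cols=['amount', 'duration'], outliers_fraction=0.02)
print(int(demo['outlier'].sum()), 'rows flagged')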
Example no. 15
def knn_stat_tropo(df,time_param):
  knn=pd.DataFrame()
  name=[x for x in globals() if globals()[x] is df][0]
  print('dataframe: {}'.format(name))
  param=name.split('_')[-1]
  knn['date']=df[time_param]
  knn['data_val']=df[("%s"%param+'_'+file['marker_name'][0])]
  print(knn)
  knn=knn.dropna()
  x_knn = knn['data_val'].values.reshape(-1,1)
  # Train kNN detector
  clf = KNN(contamination=0.01, n_neighbors=21, method='median')
  # shrink k when there are too few samples
  if len(x_knn) <= clf.n_neighbors:
    clf.n_neighbors=math.floor(len(x_knn)/2)
  clf.fit(x_knn)
  #predict raw anomaly score
  #scores_pred = clf.decision_function(x_knn)*-1
  #prediction of a datapoint category: outlier or inlier
  start=time.time()
  an=clf.predict(x_knn) # to be optimized
  end=time.time()
  #knn['anomaly'] = pd.Series(clf.predict(x_knn))
  print('predict comp time {}'.format(end-start))
  knn['anomaly'] = pd.Series(an)
  #fig, ax = plt.subplots(figsize=(10,6))
  a = knn.loc[knn['anomaly'] == 1, ['date', 'data_val']] #anomaly
  # ax.scatter(knn['date'], knn['data_val'], color='blue', label = 'Normal')
  # ax.scatter(a['date'],a['data_val'], color='red', label = 'Anomaly')
  # plt.legend()
  # plt.title('KNN tropo {} {} {}'.format("%s"%param,"%s"%file['marker_name'][0], 21))
  # plt.xlabel('Date')
  # plt.show()
  # fig.savefig('KNN_tropo_{}_{}_{}.png'.format("%s"%param,"%s"%file['marker_name'][0], 21))
  # y_train_scores = clf.decision_scores_
  return(a)
          leaf_size=30,
          metric='minkowski',
          p=2,
          metric_params=None,
          n_jobs=1)

clf.fit(trainData)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
#print(y_train_pred)
y_train_scores = clf.decision_scores_  # raw outlier scores
#print(y_train_scores)

# get the prediction on the test data
y_test_pred = clf.predict(testData)  #X_test)  # outlier labels (0 or 1)
print(y_test_pred)

from sklearn.metrics import accuracy_score

accuracy_percentage = accuracy_score(testTarget, y_test_pred) * 100
print("The prediction accuracy is:", end=" ")
print(accuracy_percentage)

y_test_scores = clf.decision_function(testData)  #X_test)  # outlier scores
#print(y_test_scores)

# evaluate and print the results
#print("\nOn Training Data:")
#evaluate_print(clf_name, y_train, y_train_scores)
#print("\nOn Test Data:")
import random
from matplotlib.colors import cnames
corr = df.corr()['deposit'].abs().sort_values(ascending=False)
h_corr_cols = corr[corr < 1].index.tolist()
colors = list(cnames.keys())
sns.set_style('darkgrid')
fig , ax = plt.subplots(4,3,figsize = (16,12))
ax = ax.ravel()
for i,col in enumerate(h_corr_cols):
    sns.boxplot(df[col], ax = ax[i],color = random.choice(colors))

x = df[h_corr_cols].values
model = KNN(contamination=.1)
model.fit(x)
predicted = model.predict(x)

outliers = df.loc[(predicted == 1),:]
inliers = df.loc[(predicted == 0),:]

df = df.drop(index = df.loc[(predicted == 1),:].index )

"""###### Treating imbalance data"""

df.education.value_counts().to_frame()

df['education'].replace({'basic.9y': 'basic','basic.4y': 'basic','basic.6y':'basic'},inplace=True)

df['education'].value_counts().to_frame()

df.job.value_counts().to_frame()

        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

        clf_name = 'KNN'
        clf = KNN()  # initialize the detector clf
        clf.fit(X_train)  # train the detector clf on X_train

        # outlier labels and scores on the training data X_train
        y_train_pred = clf.labels_  # labels on the training data (0: inlier, 1: outlier)
        y_train_scores = clf.decision_scores_  # outlier scores on the training data (higher = more anomalous)
        print("On train Data:")
        evaluate_print(clf_name, y_train, y_train_scores)

        # use the trained clf to predict outliers in unseen data
        y_test_pred = clf.predict(X_test)  # labels on the test data (0: inlier, 1: outlier)
        y_test_scores = clf.decision_function(X_test)  # outlier scores on the test data (higher = more anomalous)
        print("On Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        y_true = column_or_1d(y_test)
        y_pred = column_or_1d(y_test_scores)
        check_consistent_length(y_true, y_pred)

        roc = np.round(roc_auc_score(y_true, y_pred), decimals=4)
        prn = np.round(precision_n_scores(y_true, y_pred), decimals=4)
        knn_roc.append(roc)
        knn_prn.append(prn)


        clf_name = 'LOF'
Example no. 19
class TestKnn(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example no. 20
Hence, we use a library called **pyod** which hosts a number of outlier detection algorithms.
"""

!pip install pyod

"""### KNN Classifier  (Proximity-Based)"""

from pyod.models.knn import KNN   # kNN detector

# train kNN detector
clf_name = 'KNN'
clf = KNN()
clf.fit(X)

y_pred = clf.predict(X)  # outlier labels (0 or 1)
y_scores = clf.decision_function(X)  # outlier scores

y_pred

"""0 means normal value while 1 means anomalous value."""

colors = np.array(['#377eb8', '#ff7f00'])  # blue = inlier (0), orange = outlier (1)
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

"""Finding the ROC Accuracy score for the prediction label."""

clf.fit_predict_score(X[:, 0].reshape(-1,1), y_pred, scoring='roc_auc_score')
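
Note that fit_predict_score here re-fits the detector on a single feature and scores it against the first model's own labels, so the ROC value mainly reflects self-consistency. When ground-truth labels are available, a more direct check is to score the decision function with scikit-learn; a minimal sketch, where y_true is a hypothetical 0/1 label vector aligned with X:

from sklearn.metrics import roc_auc_score

# y_true: hypothetical ground-truth outlier labels (0 = normal, 1 = anomaly)
auc = roc_auc_score(y_true, clf.decision_function(X))
print('ROC AUC against ground truth:', round(auc, 4))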

"""### Angle-based Outlier Detector (Probabilistic Based Model)"""
Example no. 21
class occ():
    """
    One-class classifier for outlier detection.
    
    Attributes:
        data
        model
        X
        Y
        score
        X_proj
    
    Methods:
    
    """
    def __init__(self):
        self.data = None
        self.model = None
        self.X = None
        self.Y = None
        self.score = None
        #self.X_train = None
        #self.Y_train = None
        #self.X_test = None
        #self.Y_test = None

    def load_data_mat(self, file_name):
        """
        Load data from .mat file.
        Note that data from ODDS contains a lot of .mat data including known anomalies(Y)
        """
        # type: (str) -> None
        self.data = scipy.io.loadmat(file_name)
        self.X = self.data['X']
        self.Y = self.data['y']
        self.X_proj = occ.manipulation(self.X)
        #self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(self.X, self.Y, test_size=0.15)

    def load_data_csv(self, file_name, Y=False, **kwargs):
        """
        The CSV file should be formatted as:
         X0, X1, ..., Xn, Y
        
        """
        data = pd.read_csv(file_name, header=None, **kwargs)
        self.X = data[range(data.shape[1] - 1)].values
        if Y:
            self.Y = data[data.shape[1] - 1].to_numpy().reshape(
                [data.shape[0], 1])
        else:
            pass

    def load_data_npz(self, file_name, Y=False):
        self.X = np.load(file_name)
        if Y:
            # Y is expected to be the path of the file holding the labels
            self.Y = np.load(Y)
        else:
            pass

    def train(self, model='ocsvm', data=None, sampling=False, **kwargs):
        # type: (str, Optional[Any], bool, **Any) -> None
        """
        
        :param sampling: (float) Proportion of sampling. If the raw data size is too large,
            we can use the sampled datasets.
        
        """
        if type(data) != np.ndarray:
            if data == None:
                data = self.X
        if sampling != False:
            data = occ.sampling_X(self.X, rate=sampling)
        kernel_set = 'poly'
        gamma_set = 'scale'
        epochs = 10
        batch_size = 50
        nu = 0.1
        hidden_neurons = None
        known_normal = False
        kernel_epochs = 50
        radius_epochs = 100
        neighbors = 10

        for k, v in kwargs.items():
            if 'kernel' == k:
                kernel_set = v
            elif 'gamma' == k:
                gamma_set = v
            elif 'epochs' == k:
                epochs = v
            elif 'nu' == k:
                nu = v
            elif 'batch_size' == k:
                batch_size = v
            elif 'hidden_neurons' == k:
                hidden_neurons = v
            elif 'known_normal' == k:
                known_normal = v
            elif 'kernel_epochs' == k:
                kernel_epochs = v
            elif 'radius_epochs' == k:
                radius_epochs = v
            elif 'neighbors' == k:
                neighbors = v

        if model == 'ocsvm':
            self.model = sklearn.svm.OneClassSVM(gamma=gamma_set,
                                                 kernel=kernel_set,
                                                 nu=nu)
        elif model == 'ocnn':
            self.model = ocnn(len(data[0]),
                              epochs=epochs,
                              nu=nu,
                              batch_size=batch_size)
        elif model == 'ensemble':
            self.model = ensemble(nu=nu)
        elif model == 'isoForest':
            self.model = sklearn.ensemble.IsolationForest(contamination=nu)
        elif model == 'autoEncoder':
            self.model = AutoEncoderODD(nu=nu,
                                        hidden_neurons=hidden_neurons,
                                        epochs=epochs,
                                        batch_size=batch_size)
        elif model == 'vae':
            self.model = VAE_ODD(nu=nu,
                                 hidden_neurons=hidden_neurons,
                                 epochs=epochs,
                                 batch_size=batch_size)
        elif model == 'deepsvdd':
            self.model = deep_SVDD(nu=nu,
                                   known_normal=known_normal,
                                   hidden_neurons=hidden_neurons,
                                   kernel_epochs=kernel_epochs,
                                   radius_epochs=radius_epochs,
                                   batch_size=batch_size)
        elif model == 'knn':
            self.model = KNN(contamination=nu, n_neighbors=neighbors)
        elif model == 'twolineAE':
            self.model = twolineAE(nu=nu,
                                   hidden_neurons=hidden_neurons,
                                   epochs=epochs,
                                   batch_size=batch_size)
        else:
            print("There is no such model type {}".format(model))

        data = self.select_data(data, **kwargs)

        self.model.fit(data)

    def predict(self, data=None, **kwargs):
        # type: (Optional[Any], **Any) -> ndarray
        if type(data) != np.ndarray:
            if data == None:
                data = self.X
        data = self.select_data(data, **kwargs)
        return self.model.predict(data).reshape(len(data), 1)

    def get_score(self, data=None, **kwargs):
        if type(data) != np.ndarray:
            if data == None:
                data = self.X
        data = self.select_data(data, **kwargs)
        return self.model.score_samples(data).reshape(len(data), 1)

    def export_csv(self, file_name, score):
        if type(self.Y) != np.ndarray:
            if self.Y == None:
                array = np.concatenate((self.X, score), axis=1)
        else:
            array = np.concatenate((self.X, self.Y, score), axis=1)
        pd.DataFrame(array).to_csv(file_name, header=None, index=False)

    def export_outliers(self, file_name, predictions):
        def export(array):
            pd.DataFrame(array[np.where(predictions == -1)[0]]).to_csv(
                file_name, header=None, index=False)

        if type(self.Y) != np.ndarray:
            if self.Y == None:
                export(self.X)
        else:
            X = self.X
            Y = self.Y
            export(np.concatenate((X, Y), axis=1))

    @staticmethod
    def select_data(data, **kwargs):
        # type: (ndarray, **Any) -> ndarray
        norm = False
        manipulate = False
        for k, v in kwargs.items():
            if 'norm' == k:
                norm = v
            if 'manipulate' == k:
                manipulate = v
        if norm:
            data = occ.norm(data)
        if manipulate:
            data = occ.manipulation(data)
        return data

    @staticmethod
    def manipulation(data, **kwargs):
        # type: (ndarray, **Any) -> ndarray
        method = 'pca'
        dim = 3
        for k, v in kwargs.items():
            if 'method' == k:
                method = v
            if 'dim' == k:
                dim = v
        if method == 'pca':
            projection = PCA(n_components=dim)
        projection.fit(data)
        return projection.transform(data)

    @staticmethod
    def show_projection(data, label=None, **kwargs):
        # type: (ndarray, ndarray, **Any) -> None
        size = 25
        cmap = 'viridis'
        norm = False
        title = None
        save_file = None

        for k, val in kwargs.items():
            if 'title' == k:
                title = val
            elif 'markersize' == k:
                size = val
            elif 'cmap' == k:
                cmap = val
            elif 'norm' == k:
                norm = val
            elif 'save_file' == k:
                save_file = val

        data = occ.select_data(data, **kwargs)
        data_proj = occ.manipulation(data, method='pca', dim=2)
        data_proj_t = data_proj.transpose()
        fig, ax = plt.subplots(figsize=(10, 10))
        ax = plt.scatter(data_proj_t[0],
                         data_proj_t[1],
                         c=label,
                         s=size,
                         marker='.')
        ax = plt.colorbar()
        ax = plt.set_cmap(cmap)
        if title != None:
            plt.title(title)
        if save_file != None:
            plt.savefig(save_file, dpi=300)
        plt.show()

    @staticmethod
    def norm(data):
        # type: (ndarray) -> ndarray
        norm = sklearn.preprocessing.Normalizer(norm='l2', copy=True).fit(data)
        return norm.transform(data)

    @staticmethod
    def proportion(data):
        return np.where(data < 0)

    @staticmethod
    def sampling_X(X, rate=0.1):
        idx = list(range(len(X)))
        random.shuffle(idx)
        return X[idx[:int(len(X) * rate)]]
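
A hedged end-to-end sketch of the occ wrapper above with its 'knn' backend, assuming a CSV laid out as X0, ..., Xn, Y as the load_data_csv docstring describes (the file name is illustrative). Note that get_score and export_outliers follow sklearn's score_samples / -1-label conventions, so they pair naturally with the ocsvm or isoForest backends rather than the pyod KNN one:

detector = occ()
detector.load_data_csv('my_dataset.csv', Y=True)     # hypothetical file, columns X0,...,Xn,Y
detector.train(model='knn', nu=0.05, neighbors=10)   # maps to KNN(contamination=0.05, n_neighbors=10)
predictions = detector.predict()                     # pyod convention: 0 = inlier, 1 = outlier
print('flagged outliers:', int(predictions.sum()))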
Example no. 22

# Train kNN detector
clf = KNN(contamination=0.02, n_neighbors=5)
clf.fit(X)



# Get the prediction labels of the training data
y_train_pred = clf.labels_     
# Outlier scores
y_train_scores = clf.decision_scores_


# Import the utility function for model evaluation
from pyod.utils import evaluate_print
# Evaluate on the training data
evaluate_print('KNN', y, y_train_scores)


# A total of $1256
X_test_abnormal = np.array([[1256.]])
# Predict
clf.predict(X_test_abnormal)


# A total of $51896
X_test_abnormal = np.array([[51896.]])
# Predict
clf.predict(X_test_abnormal)
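
pyod detectors also expose predict_proba, which attaches a probability-like confidence to such single-point checks; a brief continuation using the same fitted clf and the $51896 example:

# columns are [P(inlier), P(outlier)] under the 'linear' scaling of the outlier scores
proba = clf.predict_proba(np.array([[51896.]]), method='linear')
print(proba)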
Example no. 23
class TestKnnMahalanobis(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        # calculate covariance for mahalanobis distance
        X_train_cov = np.cov(self.X_train, rowvar=False)

        self.clf = KNN(algorithm='auto',
                       metric='mahalanobis',
                       metric_params={'V': X_train_cov})
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
# # kNN

# In[3]:

#kNN
clf_name = 'kNN'
clf = KNN(method='median')

# In[4]:

#train on the training set
clf.fit(X_train)
y_train_pred = clf.labels_
y_train_scores = clf.decision_scores_
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)
#evaluate performance
roc_train = round(roc_auc_score(y_train, y_train_scores), 4)
prn_train = round(precision_n_scores(y_train, y_train_scores), ndigits=4)
roc_test = round(roc_auc_score(y_test, y_test_scores), 4)
prn_test = round(precision_n_scores(y_test, y_test_scores), ndigits=4)

# In[5]:

#print the computed roc_auc and precision @ rank n
print("\nOn Train Data:")
print(clf_name, 'roc:', roc_train, 'precision @ rank n:', prn_train)
print("\nOn Test Data:")
print(clf_name, 'roc:', roc_test, 'precision @ rank n:', prn_test)
def KNNAlgo(TrainX, TestX, TrainY, TestY):
    ## Copy labels for accuracy analysis
    CopyTrainY = TrainY.copy()
    CopyTestY = TestY.copy()
    ## Applying the KNN algorithm
    clf = KNN(n_neighbors=20)
    clf.fit(TrainX)
    ##Predicting Label for training Dataset
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    ##Outlier Scores for training Data Points
    y_train_scores = clf.decision_scores_  # raw outlier scores
    ##Predicting Label for Test Dataset
    y_test_pred = clf.predict(TestX)  # outlier labels (0 or 1)
    ##Outlier scores for test dataset
    y_test_scores = clf.decision_function(TestX)  # outlier scores
    ##Plot Outlier scoring Points for training Dataset
    sns.distplot(y_train_scores)
    plt.title('Distance from the Kth Nearest Neighbour')
    plt.savefig('KthDistanceTrainingSet.png')
    ##Plot Outlier scoring Points for test Dataset
    sns.distplot(y_test_scores)
    plt.title('Distance from the Kth Nearest Neighbour')
    plt.savefig('KthDistanceTestSet.png')
    ##Creating A dataframe for Train Dataset consisting of the (Outlier score + Label + Xpoints)
    TrainScores = list(y_train_scores)  ##Outlier scores of training Dataset
    XPoints = np.arange(1, len(TrainScores) + 1)  ##X axis
    CombineTrainFile = TrainY  #Labels for training Dataset
    CombineTrainFile[
        'XPoints'] = XPoints  ## 0, 1, 2, ... length of training set
    CombineTrainFile['TrainScores'] = TrainScores  ## Outlier scores
    ##Plot Scatter Plot for the Local Outlier Scores
    colors = ['green', 'red']
    fig = plt.figure(figsize=(8, 8))
    plt.scatter(CombineTrainFile['XPoints'],
                CombineTrainFile['TrainScores'],
                c=CombineTrainFile['Flag'],
                cmap=matplotlib.colors.ListedColormap(colors))
    plt.title('Outliers in Training Set for Normal and Shell Company')
    plt.legend()
    ##Creating a dataframe for the test Dataset consisting of the (Outlier score + Label + Xpoints)
    TestScores = list(y_test_scores)  ##Outlier scores of test Dataset
    XPoints = np.arange(1, len(TestScores) + 1)  ##X axis
    CombineTestFile = TestY  #Labels for test Dataset
    CombineTestFile[
        'XPoints'] = XPoints  ## 0, 1, 2, ... length of test set
    CombineTestFile['TestScores'] = TestScores  ##Outlier scores
    ##Plot Scatter Plot for the Local Outlier Scores
    colors = ['green', 'red']
    fig = plt.figure(figsize=(8, 8))
    plt.scatter(CombineTestFile['XPoints'],
                CombineTestFile['TestScores'],
                c=CombineTestFile['Flag'],
                cmap=matplotlib.colors.ListedColormap(colors))
    plt.title('Outliers in Test Set for Normal and Shell Company')
    plt.legend()
    ###Check the Accuracy of the KNN model
    print(
        "---------------------------------Accuracy for training Data-------------------------------------------------"
    )
    print("Final accuracy score on the testing data: {:.4f}".format(
        accuracy_score(CopyTrainY, y_train_pred)))
    print("Final F-score on the testing data: {:.4f}".format(
        fbeta_score(CopyTrainY, y_train_pred, beta=1.2)))
    print('precision_score', precision_score(CopyTrainY, y_train_pred))
    print('recall_score', recall_score(CopyTrainY, y_train_pred))
    print(
        "--------------------------------Accuracy for Testing Data----------------------------------------------------"
    )
    print("Final accuracy score on the testing data: {:.4f}".format(
        accuracy_score(CopyTestY, y_test_pred)))
    print("Final F-score on the testing data: {:.4f}".format(
        fbeta_score(CopyTestY, y_test_pred, beta=1.2)))
    print('precision_score', precision_score(CopyTestY, y_test_pred))
    print('recall_score', recall_score(CopyTestY, y_test_pred))
    cm = confusion_matrix(CopyTestY, y_test_pred, labels=[1, 0])
    print("Confusion matrix for test dataset")
    print(cm)
    fpr, tpr, thresholds = roc_curve(CopyTestY, y_test_pred)
    fig, ax = plt.subplots(1, figsize=(12, 6))
    plt.plot(fpr, tpr, color='darkorange', label='Model Performance')
    plt.plot([0, 1], [0, 1], color='gray', label='Random Performance')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Honest/Shell Analysis ROC Curve for KNN Outlier Detection Algorithm for test Dataset'
    )
    plt.legend(loc="lower right")
    print('Auc Score is : ', roc_auc_score(CopyTestY, y_test_pred))
Example no. 26
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
Example no. 27
    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train kNN detector
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=True)
Example no. 28

# In[2]:


# train kNN detector
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores


# In[3]:


# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)


# In[4]:
class Remove_Outliers(BaseEstimator, TransformerMixin):
    def __init__(self,
                 target,
                 contamination=.20,
                 random_state=42,
                 methods=['knn', 'iso', 'mcd']):

        self.target = target
        self.contamination = contamination
        self.random_state = random_state
        self.methods = methods

    def fit(self, data, y=None):
        return self

    def transform(self, data, y=None):
        return data

    def fit_transform(self, dataset, y=None):
        data = dataset.copy()

        if 'iso' in self.methods:
            self.iso_forest = IForest(contamination=self.contamination,
                                      random_state=self.random_state,
                                      behaviour='new')
            self.iso_forest.fit(data.drop(self.target, axis=1))
            iso_predict = self.iso_forest.predict(
                data.drop(self.target, axis=1))
            data['iso'] = iso_predict

        if 'knn' in self.methods:
            self.knn_out = KNN(contamination=self.contamination)
            self.knn_out.fit(data.drop(self.target, axis=1))
            knn_predict = self.knn_out.predict(data.drop(self.target, axis=1))
            data['knn'] = knn_predict

        if 'pca' in self.methods:
            self.out_pca = PCA_RO(contamination=self.contamination,
                                  random_state=self.random_state)
            self.out_pca.fit(data.drop(self.target, axis=1))
            pca_predict = self.out_pca.predict(data.drop(self.target, axis=1))
            data['pca'] = pca_predict

        # use for features which are Gaussian-distributed
        if 'mcd' in self.methods:
            self.mcd = EllipticEnvelope(contamination=0.01)
            self.mcd.fit(data.drop(self.target, axis=1))
            mcd_predict = self.mcd.predict(data.drop(self.target, axis=1))
            # EllipticEnvelope labels outliers as -1; convert to 0/1 to match the other detectors
            data['mcd'] = (mcd_predict == -1).astype(int)

        data['vote_outlier'] = 0

        for i in self.methods:
            data['vote_outlier'] = data['vote_outlier'] + data[i]

        self.outliers = data[data['vote_outlier'] == len(self.methods)]

        # keep only the rows that were not unanimously voted as outliers
        return dataset[~dataset.index.isin(self.outliers.index)]
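A hedged usage sketch for the voting transformer above: a row is dropped only when every selected detector flags it. The DataFrame and target name are illustrative:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
frame = pd.DataFrame(rng.randn(300, 3), columns=['f1', 'f2', 'target'])
frame.iloc[:5, :2] += 10  # obvious outliers in the feature space

cleaner = Remove_Outliers(target='target', contamination=0.05, methods=['knn', 'iso'])
cleaned = cleaner.fit_transform(frame)
print(len(frame) - len(cleaned), 'rows removed by unanimous vote')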
# Label the dimension-reduced data with the obtained labels and show them in a plot.

# In[169]:


# train a kNN detector
clf_name = 'kNN'
clf = KNN() # initialize the detector
clf.fit(new_origin_all[:pos]) # train the detector clf on the training portion

# outlier labels and scores on the training data
y_train_pred = clf.labels_ # labels on the training data (0: inlier, 1: outlier)
y_train_scores = clf.decision_scores_ # outlier scores on the training data (higher = more anomalous)

# use the trained clf to predict outliers in unseen data
y_test_pred = clf.predict(new_origin_all[pos:]) # labels on the unseen data (0: inlier, 1: outlier)
y_test_scores = clf.decision_function(new_origin_all[pos:]) # outlier scores on the unseen data

show_scatter(clf_name, df, y_train_pred, pos)


# In[170]:


clf_name = 'COF'
clf = COF(n_neighbors=30)
clf.fit(new_origin_all[:pos])

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores
Example no. 31
class TestKnn(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = KNN(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_prediction_labels_confidence(self):
        pred_labels, confidence = self.clf.predict(self.X_test,
                                                   return_confidence=True)
        assert_equal(pred_labels.shape, self.y_test.shape)
        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_prediction_proba_linear_confidence(self):
        pred_proba, confidence = self.clf.predict_proba(self.X_test,
                                                        method='linear',
                                                        return_confidence=True)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

        assert_equal(confidence.shape, self.y_test.shape)
        assert (confidence.min() >= 0)
        assert (confidence.max() <= 1)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
Example no. 32
    plt.show()

    '''
        KNN -> K-Nearest Neighbors Detector
        For an observation, its distance to its kth nearest neighbor can be viewed as its outlying score
        Methods: -Largest -Average -Median
    '''
    clf_name = 'KNN'
    clf = KNN()
    clf.fit(X_train)

    X_train_pred = clf.labels_
    X_train_score = clf.decision_scores_

    score_pred = clf.decision_function(X_train)*-1
    y_pred = clf.predict(X_train)
    n_errors = (y_pred != y_train).sum()
    print('No of Errors:', clf_name, n_errors)

    # visualization
    xx, yy = np.meshgrid(np.linspace(-10, 10, 300), np.linspace(-10, 10, 300))
    threshold = stats.scoreatpercentile(score_pred, 100*outlier_fraction)
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
    Z = Z.reshape(xx.shape)
    # fill blue colormap from minimum anomaly score to threshold value
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 10), cmap=plt.cm.Blues_r)
    a = plt.contour(xx, yy, Z, levels=[threshold],linewidths=2, colors='red')
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()],colors='orange')
    b = plt.scatter(X_train[:-n_outliers, 0], X_train[:-n_outliers, 1], c='white',s=20, edgecolor='k')
    c = plt.scatter(X_train[-n_outliers:, 0], X_train[-n_outliers:, 1], c='black',s=20, edgecolor='k')
    plt.axis('tight')
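
The comment block above describes the distance to the kth nearest neighbour as the outlying score. A small sketch on synthetic 2-D data that computes that distance directly with scikit-learn and compares it with pyod's KNN(method='largest') training scores, which should rank the points the same way:

import numpy as np
from sklearn.neighbors import NearestNeighbors
from pyod.models.knn import KNN

rng = np.random.RandomState(42)
X = np.r_[rng.randn(200, 2), rng.uniform(-6, 6, size=(10, 2))]

k = 5
nn = NearestNeighbors(n_neighbors=k).fit(X)
dist, _ = nn.kneighbors()   # neighbours within X, each point excluded from its own list
kth_dist = dist[:, -1]      # distance to the kth nearest neighbour

clf = KNN(n_neighbors=k, method='largest')
clf.fit(X)
print('correlation with pyod scores:', np.corrcoef(kth_dist, clf.decision_scores_)[0, 1])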