Beispiel #1
0
def pca_outlier_detection(X_train, X_test, **kwargs):
    """Score ``X_test`` with a PCA detector fitted on ``X_train``.

    Parameters
    ----------
    X_train : array-like
        Data the detector is fitted on.
    X_test : array-like or pandas.DataFrame
        Data to score.
    **kwargs
        Forwarded unchanged to the ``PCA`` constructor.

    Returns
    -------
    pandas.Series
        Series named ``'outlier'`` holding the last column of
        ``predict_proba(X_test)``. When ``X_test`` is a DataFrame its index
        is reused; otherwise the default index is used.
    """
    model = PCA(**kwargs)
    model.fit(X_train)
    outlier_proba = model.predict_proba(X_test)[:, -1]

    # index=None makes pandas fall back to its default index.
    row_index = X_test.index if isinstance(X_test, pd.DataFrame) else None
    return pd.Series(outlier_proba, name='outlier', index=row_index)
Beispiel #2
0
def pca(X_train, X_test, Y_train, Y_test):
    """Fit a PyOD PCA detector and report label agreement on the test set.

    The detector is fitted on ``X_train`` without labels; ``Y_train`` is
    accepted but not used.

    Returns
    -------
    The share of ``X_test`` rows whose predicted label equals ``Y_test``,
    multiplied by 100. The unscaled share is also printed.
    """
    from pyod.models.pca import PCA

    detector = PCA()
    detector.fit(X_train)
    predicted = detector.predict(X_test)

    n_matching = np.sum(predicted == Y_test)
    acc = n_matching / X_test.shape[0]
    print(acc)
    return acc * 100
Beispiel #3
0
    def S2(self):
        """Run the S2 (statistical) anomaly stage and record its results.

        Steps, in order:
        1. Run ``self.S1()`` and keep only rows with ``S1 == 0``
           (presumably the rows S1 did not flag -- confirm against S1).
        2. Standardise columns 1..11 of those rows and fit four detectors
           (IForest, KNN, HBOS, PCA) on them.
        3. Store the product of the four ``labels_`` arrays in column
           ``'S2'`` and attach it to ``water_data``.
        4. For each of the 11 named features, estimate a 1-D Gaussian KDE,
           rank the densities per feature, and record for every row the
           feature with the lowest rank in ``'S2_names'``.
        5. Write both results into ``self.result``, store the extended
           frame back on ``self.water_data``, and dump the scaler and the
           IForest model with joblib.
        """

        self.S1()
        water_data = self.water_data
        result = self.result

        # Data preprocessing and model training
        clean_data = water_data[water_data['S1'] == 0]
        Y = pd.DataFrame(index=clean_data.index, columns=['S2'])

        X_train = np.array(clean_data.iloc[:, 1:12])
        # NOTE(review): `name` is not used anywhere else in this method.
        name = list(clean_data.iloc[:, 1:12].columns.values)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
        clf2 = KNN(contamination=0.05, n_neighbors=100)
        clf3 = HBOS(contamination=0.05, n_bins=10)
        clf4 = PCA(contamination=0.05)

        clf1.fit(X_train)
        clf2.fit(X_train)
        clf3.fit(X_train)
        clf4.fit(X_train)

        # Product of the four label arrays; assuming labels_ are 0/1 this is
        # 1 only when all four detectors flag the row -- TODO confirm.
        Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
        # Y is indexed by the clean rows only, so after this column-wise
        # concat the remaining rows hold NaN in 'S2'.
        water_data = pd.concat([water_data, Y], axis=1)
        # water_data.loc[water_data['S2'].isna(),['S2']]=0
        # (disabled: would mark rows that are anomalous in S1 as 0 in S2)

        result['统计异常'] = water_data['S2'].values

        # Find the anomalous dimension
        from sklearn.neighbors import KernelDensity
        clean_data = water_data[water_data['S1'] == 0]
        dens = pd.DataFrame(index=clean_data.index,
                            columns=[
                                'temperature', 'pH', 'EC', 'ORP', 'DO',
                                'turbidity', 'transparency', 'COD', 'P',
                                'NH3N', 'flux'
                            ])

        # One univariate KDE per feature, evaluated on the same samples it
        # was fitted on; np.exp turns the log-density into a density.
        for i in dens.columns:
            kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
                clean_data[i].values.reshape(-1, 1))
            dens[i] = np.exp(
                kde.score_samples(clean_data[i].values.reshape(-1, 1)))
        # Rank densities within each feature column, then pick per row the
        # feature whose rank is smallest.
        dens = dens.iloc[:, 0:11].rank()
        dens['S2_names'] = dens.idxmin(axis=1)
        water_data = pd.concat([water_data, dens['S2_names']], axis=1)
        self.water_data = water_data
        result['统计异常维度'] = water_data['S2_names'].values

        # Persist the models
        # NOTE(review): only the scaler and the IForest model are saved;
        # clf2, clf3 and clf4 are not persisted here.
        joblib.dump(scaler, "./water_model/S2_scaler")
        joblib.dump(clf1, "./water_model/S2_Iforest")
def getOutlierPCA(dataset):
    """Fit a default ``PCA`` detector on ``dataset`` and return its labels.

    :param dataset: data the detector is fitted on
    :return: the fitted detector's ``labels_`` attribute; per the original
        description 0 marks an inlier and 1 an outlier
    """
    detector = PCA()
    # labels_ only exists once the detector has been fitted.
    detector.fit(dataset)
    fitted_labels = detector.labels_
    return fitted_labels
Beispiel #5
0
def print_accuracy(train_arr,test_arr,trader_id):
    """Fit a PCA detector per entry of ``train_arr`` and print two percentages.

    Nothing is returned. The function returns immediately when either input
    is empty and skips index ``i`` when ``train_arr[i]`` or ``test_arr[i]``
    is empty.

    :param train_arr: indexable collection of training sequences
    :param test_arr: indexable collection of test sequences, read at the
        same indices as ``train_arr``
    :param trader_id: only used in the printed training message
    """
    if len(train_arr)==0 or len(test_arr)==0:
        return
    for i in range(len(train_arr)):
        l1=len(train_arr[i])
        l2=len(test_arr[i])
        if l1==0 or l2==0:
            continue
        # Wrap the i-th sequence in a list and transpose; for a 1-D sequence
        # this presumably yields a single-column array -- TODO confirm.
        train_data=np.array([train_arr[i]]).T
        test_data=np.array([test_arr[i]]).T
        # clf=OCSVM(kernel ='rbf',gamma = 0.5)
        print(len(train_arr))
        clf = PCA(n_components =15)
        # NOTE(review): the detector is fitted and scored on the whole
        # `train_arr`, not on `train_data`, which is otherwise unused.
        # Confirm which one is intended.
        clf.fit(train_arr)
        y_pred=clf.predict(train_arr)
        print("TRAINING ACCURACY for TRADER",trader_id,":",100 - (sum(y_pred)*100/l1))
        y_pred=clf.predict(test_data)
        # NOTE(review): unlike the training line this prints the raw
        # percentage of positive predictions (no `100 -`) -- confirm.
        print("TESTING ACCURACY: ",sum(y_pred)*100/l2)
Beispiel #6
0
    def pca(self, X_train, n_components=None, contamination=None):
        """
        Fit the PyOD PCA detector and score the training data.

        Parameters
        ----------
        X_train: scaled training data
        n_components: number of components, forwarded to the detector
        contamination: percentage of anomalies in the data, forwarded to
            the detector

        Returns
        -------
        tuple
            ``(scores, labels)``: the detector's decision-function values
            for ``X_train`` after ``self.min_max_scaler``, and the predicted
            outlier labels (0 or 1) for ``X_train``.
        """
        detector = PCAOD(n_components=n_components,
                         contamination=contamination)
        detector.fit(X_train)

        # Labels first, then the raw scores, which are rescaled on return.
        labels = detector.predict(X_train)
        raw_scores = detector.decision_function(X_train)
        return self.min_max_scaler(raw_scores), labels
Beispiel #7
0
def train(doc_list, dataset_name, clf_name):
    """Fit one detector on the first ten CSV files and average its metrics.

    For each file the detector is re-fitted on the feature columns and its
    training ``decision_scores_`` are compared with the ground truth using
    ROC AUC and ``precision_n_scores``. A summary line is printed at the end.

    Relies on module-level names that are not visible in this block:
    ``drop``, ``ground_truth``, ``transfor`` and ``_flatten`` (presumably
    column lists, a label mapping and a list flattener -- verify).

    Parameters
    ----------
    doc_list : sequence of CSV paths; indices 0..9 are read.
    dataset_name : name used only in the printed summary.
    clf_name : one of "PCA", "MCD", "LOF", "KNN", "LODA".

    Returns
    -------
    tuple
        ``(mean ROC AUC, mean precision@n)`` over the processed files.

    Raises
    ------
    ValueError
        If ``clf_name`` is not a supported detector name. Previously this
        left ``clf`` unbound and failed later inside the loop.
    """
    # An if/elif chain (rather than a dict of classes) keeps the lookup lazy:
    # only the selected detector class has to be importable.
    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    else:
        raise ValueError(
            "Unsupported clf_name %r; expected one of "
            "'PCA', 'MCD', 'LOF', 'KNN', 'LODA'" % (clf_name,))

    model_roc = []
    model_prc = []
    # NOTE(review): the file count is hard-coded to 10, so doc_list must hold
    # at least ten paths and the `% 200` progress branch below never fires.
    for i in range(10):
        data = pd.read_csv(doc_list[i], header=0, index_col=0)
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])
        clf.fit(train_x)
        predict = clf.decision_scores_
        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)
        if ((i + 1) % 200 == 0):
            print("第" + str(i + 1) + "个文件结果:")
            evaluate_print(clf_name, train_y, predict)
        model_roc.append(roc)
        model_prc.append(prc)
    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("模型" + clf_name + "在数据集" + dataset_name + "的平均roc_auc为" +
          str(round(model_roc_avg, 4)) + ",平均prc为" +
          str(round(model_prc_avg, 4)) + "。")

    return model_roc_avg, model_prc_avg
Beispiel #8
0
         "abalone", 9, dataset2_result_prc[0], dataset2_result_prc[1],
         dataset2_result_prc[2], dataset2_result_prc[3], dataset2_result_prc[4]
     ]],
    columns=["Dataset", "Dimensions", "PCA", "MCD", "LOF", "KNN", "LODA"])
# Bare expressions: in a notebook these display the two result tables.
result_roc
result_prc

# Train on the full-dataset CSV file and visualize the results
clf = PCA()
clf_name = "PCA"
# NOTE(review): hard-coded absolute path on the author's machine.
read = r"D:\研一下学期\数据挖掘\作业4\pageb\meta_data\pageb.preproc.csv"
data = pd.read_csv(read, header=0, index_col=0)
train_x = data.drop(drop + ground_truth + ["original.label"], axis=1).values
train_y = np.array(
    [transfor[x] for x in list(_flatten(data[ground_truth].values.tolist()))])
clf.fit(train_x)
label = clf.labels_
predict = clf.decision_scores_
evaluate_print(clf_name, train_y, predict)
# Project the features onto two principal components for plotting only;
# the detector above was fitted on the unprojected train_x.
pca = decomposition.PCA(n_components=2)
X = pca.fit_transform(train_x)
# The same X / train_y are passed for both the "train" and "test" slots.
visualize(clf_name,
          X,
          train_y,
          X,
          train_y,
          label,
          train_y,
          show_figure=True,
          save_figure=True)
# In[14]:

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)

# Embed unique4 into two dimensions with t-SNE and scatter-plot it,
# coloured by the labels of the already-fitted knn4 detector.
X4 = tsne.fit_transform(unique4)
plt.figure(figsize=(20, 20))
plt.scatter(X4[:, 0], X4[:, 1], c=knn4.labels_)
plt.show()

# In[14]:

# Fit a default PyOD PCA detector on unique2.
from pyod.models.pca import PCA
pca2 = PCA()
pca2.fit(unique2)

# In[15]:

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)

# Embed unique2 into two dimensions with t-SNE and scatter-plot it,
# coloured by the labels pca2 assigned during fitting.
X12 = tsne.fit_transform(unique2)
plt.figure(figsize=(20, 20))
plt.scatter(X12[:, 0], X12[:, 1], c=pca2.labels_)
plt.show()

# In[16]:

from pyod.models.pca import PCA
Beispiel #10
0
    'kernel': ['rbf'],
    'gamma': [1, 1.2, 0.8, 2, 0.1, 0.2, 0.3, 0.5, 2, 3, 4],
    'nu': [0.4, 0.03, 0.2, 0.1, 0.05, 0.06, 0.07, 0.08]
}]
# tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1],'nu':  [0.4]}]
# Metrics to tune for; each one is turned into the '<metric>_macro' scorer.
scores = ['recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 2-fold grid search over tuned_parameters for the oc_svm() estimator.
    clf = GridSearchCV(oc_svm(),
                       tuned_parameters,
                       cv=2,
                       scoring='%s_macro' % score)
    clf.fit(train_normal_arr_stnd, train_labels)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # One line per parameter combination: mean test score, +/- two standard
    # deviations, and the parameters themselves.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
class Remove_Outliers(BaseEstimator, TransformerMixin):
    """Drop training rows that every selected detector flags as an outlier.

    Rows are only removed in ``fit_transform``; ``transform`` is a
    pass-through, so data seen after fitting is never filtered.

    Parameters
    ----------
    target : label of the target column; it is excluded from the features
        handed to the detectors.
    contamination : expected outlier share, passed to the 'iso', 'knn' and
        'pca' detectors (the 'mcd' detector uses a fixed 0.01).
    random_state : seed passed to the 'iso' and 'pca' detectors.
    methods : iterable of detector keys to combine; recognised keys are
        'knn', 'iso', 'pca' and 'mcd'.

    Attributes set by ``fit_transform``
    -----------------------------------
    iso_forest, knn_out, out_pca, mcd : the fitted detectors (only those
        selected in ``methods``).
    outliers : DataFrame of the rows flagged by all selected detectors,
        including the per-method vote columns and ``'vote_outlier'``.
    """

    def __init__(self,
                 target,
                 contamination=.20,
                 random_state=42,
                 methods=('knn', 'iso', 'mcd')):
        # The default is a tuple so no mutable object is shared across
        # instances; it is only iterated, measured and membership-tested.
        self.target = target
        self.contamination = contamination
        self.random_state = random_state
        self.methods = methods

    def fit(self, data, y=None):
        """Do nothing and return ``self`` (scikit-learn convention)."""
        return self

    def transform(self, data, y=None):
        """Return ``data`` unchanged; rows are only removed while fitting."""
        return data

    def fit_transform(self, dataset, y=None):
        """Fit the selected detectors on ``dataset`` and drop unanimous outliers.

        Returns the rows of ``dataset`` whose index label is not among the
        rows flagged by every detector in ``self.methods``.
        """
        data = dataset.copy()
        # Compute the feature frame once. Previously each detector was given
        # `data.drop(target)` after earlier vote columns had been added, so
        # those 0/1 votes leaked into the features of later detectors.
        features = dataset.drop(self.target, axis=1)

        if 'iso' in self.methods:
            self.iso_forest = IForest(contamination=self.contamination,
                                      random_state=self.random_state,
                                      behaviour='new')
            self.iso_forest.fit(features)
            data['iso'] = self.iso_forest.predict(features)

        if 'knn' in self.methods:
            self.knn_out = KNN(contamination=self.contamination)
            self.knn_out.fit(features)
            data['knn'] = self.knn_out.predict(features)

        if 'pca' in self.methods:
            self.out_pca = PCA_RO(contamination=self.contamination,
                                  random_state=self.random_state)
            self.out_pca.fit(features)
            data['pca'] = self.out_pca.predict(features)

        # use for those features which are gaussian distributed
        if 'mcd' in self.methods:
            self.mcd = EllipticEnvelope(contamination=0.01)
            self.mcd.fit(features)
            # Assuming scikit-learn's EllipticEnvelope: predict() yields -1
            # for outliers and +1 for inliers. Convert to the 1 = outlier /
            # 0 = inlier convention of the other votes so the sum below is
            # meaningful.
            data['mcd'] = (self.mcd.predict(features) == -1).astype(int)

        data['vote_outlier'] = 0
        for method in self.methods:
            data['vote_outlier'] = data['vote_outlier'] + data[method]

        # A row is an outlier only when every selected method voted for it.
        # With no methods selected nothing is flagged (previously 0 == 0
        # flagged, and removed, every row).
        n_methods = len(self.methods)
        unanimous = (data['vote_outlier'] == n_methods) & (n_methods > 0)
        self.outliers = data[unanimous]

        return dataset[~dataset.index.isin(self.outliers.index)]
Beispiel #12
0
    # load dataset
    data_dict = load_dataset(
        dataset,
        subdataset,
        "all",
    )

    x_train = data_dict["train"]
    x_test = data_dict["test"]
    x_test_labels = data_dict["test_labels"]

    start = time.time()
    # Fit a default PCA detector on the training split.
    od = PCA()
    od.fit(x_train)

    # get outlier scores
    anomaly_score = od.decision_function(x_test)

    anomaly_label = x_test_labels

    end = time.time()

    # NOTE(review): this rebinds the name `time`. If this body sits inside a
    # function, `time` becomes a local for the whole function and the earlier
    # `time.time()` call would raise UnboundLocalError; at module level it
    # replaces the `time` module for all later code. Consider another name,
    # e.g. `elapsed` -- confirm against the enclosing scope.
    time = end - start
    # Make evaluation
    evaluate_all(anomaly_score, anomaly_label)
    salience = compute_salience(anomaly_score, anomaly_label)
    print('time')
    print('   ', time)
    print('salience')
Beispiel #13
0
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train PCA detector
    clf_name = 'PCA'
    clf = PCA()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Beispiel #14
0
    values = [track[feature] for feature in characteristics]
    return values


# playlist_id = "https://open.spotify.com/playlist/3z91HHZMlFJsUZquZBbQnX"  # Harshal playlist
# playlist_id = "https://open.spotify.com/playlist/4yyfdbRQpx44MQwNPgBOek"  # No words playlist
playlist_id = "https://open.spotify.com/playlist/4DVTXRD4BzbjPSfbw1n74E"  # Questionable playlist
uris = data.get_playlist(sp, username, playlist_id)

# uri = 'spotify:track:2cGxRwrMyEAp8dEbuZaVv6'
# info = data.get_audio_features(sp, uri)
# values = track_characteristics_array(sp, uri)

# One feature row per track in the playlist.
X = np.array(
    [track_characteristics_array(sp, track_uri) for track_uri in uris])

clf = PCA()
clf.fit(X)

# Walk the tracks from highest to lowest training score and stop at the first
# one below the fitted threshold, so only tracks scoring at or above
# clf.threshold_ are printed.
ranks = sorted(zip(clf.decision_scores_, uris), reverse=True)
for score, track_uri in ranks:
    if score < clf.threshold_:
        break
    track = data.get_track_info(sp, track_uri)
    # NOTE(review): the track's features are fetched again here although the
    # same call already produced the corresponding row of X.
    track_info = track_characteristics_array(sp, track_uri)
    prob = clf.predict_proba(np.array([track_info]), method="unify")[:, 1][0]
    print("{: 12.3f} {: 6.3f} {:>32s} {:s}".format(score, prob,
                                                   track["artists"][0]["name"],
                                                   track["name"]))
Beispiel #15
0
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train PCA detector
    clf_name = 'PCA'
    clf = PCA()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Beispiel #16
0
# NOTE(review): this decomposition.PCA instance is not used in the lines
# visible here; the detector fitted below is the separate `PCA` name.
pca = decomposition.PCA()

# Four samples with four features each.
x = np.array([
    [0.387, 4878, 5.42, 1.21],
    [0.723, 12104, 5.25, 2.22],
    [1, 12756, 5.52, 3.45],
    [1.524, 6787, 3.94, 2.1],
])

# Standardise the features before fitting.
x_norm = StandardScaler().fit_transform(x)

# train PCA detector
clf_name = 'PCA'
clf = PCA()
clf.fit(x_norm)

###################################################################
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score
# from pyod.utils.utility import get_label_n
# import numpy as np
# print('\nThe number of outliers is', np.count_nonzero(y_train))
#
# for n in range(15, 25):
#     y_pred = get_label_n(y_train, y_train_scores, n=n)
#     print('n:', n,
#           'precision:', precision_score(y_train, y_pred),
#           'recall:', recall_score(y_train, y_pred))
###################################################################
Beispiel #17
0
def detect(file, amountanom, realtime, dumptocsv):
    """
    Apply a very simple anomaly detector to a conn.log file and print the
    highest-scoring flows.

    file: Path of the tab-separated conn.log to read
    amountanom: The top number of anomalies we want to print
    realtime: If we want to read the conn.log file in real time
        (not working; the argument is currently ignored)
    dumptocsv: Path to save the parsed dataframe to as CSV, or the string
        "None" to skip saving
    """

    # Create a Pandas dataframe from the conn.log
    bro_df = pd.read_csv(
        file,
        sep="\t",
        comment='#',
        names=[
            'ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
            'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
            'conn_state', 'local_orig', 'local_resp', 'missed_bytes',
            'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts',
            'resp_ip_bytes', 'tunnel_parents'
        ])

    # In case you need a label, due to some models being able to work in a
    # semisupervised mode, then put it here. For now everything is
    # 'normal', but we are not using this for detection
    bro_df['label'] = 'normal'

    # Replace the fields without data ('-') with 0 and fill missing values
    # with 0, then give each column a numeric type. Even though this may add
    # a bias in the algorithms, it is better than not using the lines.
    # The result is assigned back to the column instead of calling
    # replace(..., inplace=True) on a column selection, which is not
    # guaranteed to modify the dataframe under pandas Copy-on-Write.
    numeric_columns = {
        'orig_bytes': 'int32',
        'resp_bytes': 'int32',
        'resp_pkts': 'int32',
        'orig_ip_bytes': 'int32',
        'resp_ip_bytes': 'int32',
        'duration': 'float64',
    }
    for column, dtype in numeric_columns.items():
        bro_df[column] = (bro_df[column].replace('-', '0').fillna(0)
                          .astype(dtype))

    # Save dataframe to disk as CSV
    if dumptocsv != "None":
        bro_df.to_csv(dumptocsv)

    # Add the columns from the log file that we know are numbers. This is
    # only for conn.log files.
    X_train = bro_df[[
        'duration', 'orig_bytes', 'id.resp_p', 'resp_bytes', 'orig_ip_bytes',
        'resp_pkts', 'resp_ip_bytes'
    ]]

    # The X_test is where we are going to search for anomalies. In our case
    # it is the same data as X_train. Take an explicit copy so the score and
    # prediction columns added below never touch X_train or a slice of
    # bro_df.
    X_test = X_train.copy()

    # Model selection. PCA is used because it is good and fast. Alternatives
    # that were tried here: ABOD, LOF, CBLOF, LOCI, LSCP, MCD, OCSVM, SOD,
    # SO_GAAL, SOS, XGBOD and KNN (good results but slow).
    clf = PCA()

    # Fit the model to the train data
    clf.fit(X_train)

    # get the prediction on the test data
    y_test_pred = clf.predict(X_train)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_train)  # outlier scores

    # Attach scores and predictions to the test frame
    X_test['score'] = y_test_scores
    X_test['pred'] = y_test_pred

    # Add the score to the bro_df also. So we can show it at the end
    bro_df['score'] = X_test['score']

    # Keep the positive predictions only. That is, keep only what we predict
    # is an anomaly.
    X_test_predicted = X_test[X_test.pred == 1]

    # Keep the top X amount of anomalies
    top_anomalies = X_test_predicted.sort_values(
        by='score', ascending=False).iloc[:amountanom]

    # Find the predicted anomalies in the original bro dataframe, where the
    # rest of the data is. The index holds labels, so select with .loc
    # (label-based) rather than .iloc (position-based).
    df_to_print = bro_df.loc[top_anomalies.index]
    print('\nFlows of the top anomalies')

    # Only print some columns, not all, so its easier to read.
    df_to_print = df_to_print.drop([
        'conn_state', 'history', 'local_orig', 'local_resp', 'missed_bytes',
        'ts', 'tunnel_parents', 'uid', 'label'
    ],
                                   axis=1)
    print(df_to_print)
Beispiel #18
0
    def AD_algo(self):
        """Run the detector selected by ``self.option`` and log its metrics.

        Supported options are "PCA", "OCSVM", "KNN", "ABOD", "FB"
        (FeatureBagging) and "AE" (AutoEncoder with ``epochs=30`` and
        ``contamination=0.1``). Any other option does nothing.

        The detector is fitted on ``self.data``; its training
        ``decision_scores_`` are passed with ``self.label`` to
        ``evaluation_print``. The five returned values are printed and
        appended to ``./experiments/plots/AD_pyod.txt``.

        NOTE(review): the first returned value is bound to ``roc`` but is
        reported under the heading "Accuracy" -- confirm which it is.
        """
        # option -> (name used in the progress message, detector factory).
        # Factories are lambdas so that only the selected detector class is
        # looked up, exactly as with the former if/elif chain.
        detectors = {
            "PCA": ("PCA", lambda: PCA()),
            "OCSVM": ("OCSVM", lambda: OCSVM()),
            "KNN": ("KNN", lambda: KNN()),
            "ABOD": ("ABOD", lambda: ABOD()),
            "FB": ("FeatureBagging", lambda: FeatureBagging()),
            "AE": ("AutoEncoder",
                   lambda: AutoEncoder(epochs=30, contamination=0.1)),
        }

        selected = detectors.get(self.option)
        if selected is None:
            # Unknown option: keep the previous silent no-op behaviour.
            return
        display_name, build_detector = selected

        print('testing with %s...' % display_name)
        clf_name = self.option
        clf = build_detector()
        clf.fit(self.data)

        # raw outlier scores of the training data
        scores = clf.decision_scores_

        # evaluate and print the results
        roc, prn, rec, f1, fp = evaluation_print(clf_name, self.label,
                                                 scores)
        header = '%s: Results for Algorithm %s are:' % (self.getName(),
                                                        clf_name)
        metrics = (
            'Accuracy={}, precision={}, recall={}, f_score={}, false_positive={}'
            .format(roc, prn, rec, f1, fp))
        print(header)
        print(metrics)

        # `with` guarantees the log file is closed even if a write fails.
        with open("./experiments/plots/AD_pyod.txt", "a") as f:
            f.write('--------------------------------------------\n')
            f.write(header + '\n')
            f.write(metrics + '\n')
Beispiel #19
0
class TestPCA(unittest.TestCase):
    """Placeholder test case for the PCA detector.

    Only ``test_fit`` is active; it checks that fitting runs without
    raising. The remaining tests are commented-out drafts.
    """

    def setUp(self):
        """Generate train/test data and build an unfitted PCA detector."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = PCA(contamination=self.contamination)

    # TODO: placeholder, do not use
    def test_fit(self):
        """Fit the detector on the training data; passes if nothing raises."""
        self.clf.fit(self.X_train)

    # Draft tests below are disabled.
    # NOTE(review): the parameter checks mention negative_outlier_factor_ and
    # n_neighbors, which look like they belong to a neighbour-based detector
    # rather than PCA -- confirm before enabling.
    #
    # def test_sklearn_estimator(self):
    #     check_estimator(self.clf)
    #
    # def test_parameters(self):
    #     if not hasattr(self.clf,
    #                    'decision_scores_') or self.clf.decision_scores_ is None:
    #         self.assertRaises(AttributeError, 'decision_scores_ is not set')
    #     if not hasattr(self.clf, 'labels_') or self.clf.labels_ is None:
    #         self.assertRaises(AttributeError, 'labels_ is not set')
    #     if not hasattr(self.clf, 'threshold_') or self.clf.threshold_ is None:
    #         self.assertRaises(AttributeError, 'threshold_ is not set')
    #     if not hasattr(self.clf,
    #                    'negative_outlier_factor_') or self.clf.negative_outlier_factor_ is None:
    #         self.assertRaises(AttributeError,
    #                           'negative_outlier_factor_ is not set')
    #
    #     if not hasattr(self.clf,
    #                    'n_neighbors') or self.clf.n_neighbors_ is None:
    #         self.assertRaises(AttributeError, 'n_neighbors is not set')
    #
    # def test_train_scores(self):
    #     assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])
    #
    # def test_prediction_scores(self):
    #     pred_scores = self.clf.decision_function(self.X_test)
    #
    #     # check score shapes
    #     assert_equal(pred_scores.shape[0], self.X_test.shape[0])
    #
    #     # check performance
    #     assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)
    #
    # def test_prediction_labels(self):
    #     pred_labels = self.clf.predict(self.X_test)
    #     assert_equal(pred_labels.shape, self.y_test.shape)
    #
    # def test_prediction_proba(self):
    #     pred_proba = self.clf.predict_proba(self.X_test)
    #     assert_greater_equal(pred_proba.min(), 0)
    #     assert_less_equal(pred_proba.max(), 1)
    #
    # def test_prediction_proba_linear(self):
    #     pred_proba = self.clf.predict_proba(self.X_test, method='linear')
    #     assert_greater_equal(pred_proba.min(), 0)
    #     assert_less_equal(pred_proba.max(), 1)
    #
    # def test_prediction_proba_unify(self):
    #     pred_proba = self.clf.predict_proba(self.X_test, method='unify')
    #     assert_greater_equal(pred_proba.min(), 0)
    #     assert_less_equal(pred_proba.max(), 1)
    #
    # def test_prediction_proba_parameter(self):
    #     with assert_raises(ValueError):
    #         self.clf.predict_proba(self.X_test, method='something')
    #
    # def test_fit_predict(self):
    #     pred_labels = self.clf.fit_predict(self.X_train)
    #     assert_equal(pred_labels.shape, self.y_train.shape)
    #
    # def test_evaluate(self):
    #     self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        """Nothing to clean up."""
        pass
Beispiel #20
0
class TestPCA(unittest.TestCase):
    """Smoke tests for the PCA outlier detector fitted on generated data."""

    def setUp(self):
        """Build train/test data with ``generate_data`` and fit the detector."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.5
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)

        self.clf = PCA(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        """Run ``check_estimator`` on the fitted detector."""
        check_estimator(self.clf)

    def test_parameters(self):
        """Every fitted attribute must exist and be non-None after ``fit``."""
        # Use real assertions here: ``assertRaises(AttributeError, '<msg>')``
        # would try to *call* the message string and blow up with TypeError
        # instead of reporting which attribute is missing.
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            'selected_components_',
            'selected_w_components_',
        )
        for attr in fitted_attributes:
            self.assertIsNotNone(getattr(self.clf, attr, None),
                                 '{} is not set'.format(attr))

    def test_train_scores(self):
        """One training score is stored per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """``decision_function`` returns one score per test sample."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # TODO: turn off performance check before a better data generation
        # method is available.
        # check performance
        # assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        """An unknown probability method is rejected with ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """``fit_predict`` returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        """``fit_predict_evaluate`` runs end to end on the test split."""
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        """No per-test cleanup is needed."""
        pass
Beispiel #21
0
# Scale each training column by (std + 1); the +1 keeps the divisor >= 1 so a
# constant column cannot cause a division by zero.
train_normal_arr_stnd = train_normal_arr_stnd / (
    train_normal_arr_stnd.std(axis=0) + 1)

# Centre the test matrix on its own column means, then scale it the same way.
test_all_data_stnd = test_all_data - test_all_data.mean(axis=0)
test_all_data_stnd = test_all_data_stnd / (test_all_data_stnd.std(axis=0) + 1)

## Generate labels
# Keep the labels from the 80% mark of the normal samples onwards.
# NOTE(review): assumes these line up one-to-one with the rows of
# test_all_data -- confirm against the code that builds test_all_data.
test_labels = all_labels[int(0.8 * len(normal_complete_data_arr)):len(all_labels)]
print("test", test_labels.count(0))

clf1 = PCA(n_components=15, n_selected_components=1, standardization=True)
clf1.fit(train_normal_arr_stnd)
predicted = clf1.predict(test_all_data_stnd)

# ``accuracy`` counts every correct prediction; ``recall`` counts the correct
# predictions whose true label is 1. The predictions belong to the test rows,
# so they are compared with ``test_labels`` (not the head of ``all_labels``)
# and accuracy is normalised by the number of predictions (not the size of the
# training set).
accuracy = 0
recall = 0
for predicted_label, true_label in zip(predicted, test_labels):
    if predicted_label == true_label:
        accuracy += 1
        if true_label == 1:
            recall += 1
print("PCA Accuracy", accuracy / len(predicted))
print("PCA Recall", recall / len(malicious_complete_data_arr))
print(clf1.singular_values_)

## OCSVM
clf1 = OCSVM(kernel='rbf', gamma=1, nu=0.4)
clf1.fit(train_normal_arr_stnd)
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.mcd import MCD
from pyod.models.lscp import LSCP
# from pyod.models.auto_encoder import AutoEncoder

# One detector per algorithm, all with library defaults.
# (LSCP over [knn, pca, mcd] and AutoEncoder(epochs=50) are currently disabled.)
clf_knn, clf_pca, clf_mcd, clf_lof, clf_cblof = (
    KNN(), PCA(), MCD(), LOF(), CBLOF())

# Fit every detector on the same encodings.
for _detector in (clf_mcd, clf_pca, clf_knn, clf_lof, clf_cblof):
    _detector.fit(encodings_train)

# Score the training encodings with each fitted detector.
(anomaly_scores_mcd,
 anomaly_scores_pca,
 anomaly_scores_knn,
 anomaly_scores_lof,
 anomaly_scores_cblof) = (
    clf_mcd.decision_function(encodings_train),
    clf_pca.decision_function(encodings_train),
    clf_knn.decision_function(encodings_train),
    clf_lof.decision_function(encodings_train),
    clf_cblof.decision_function(encodings_train),
)

# y_test_scores = []
Beispiel #23
0
class TestPCA(unittest.TestCase):
    """Behavioural checks for the PCA detector on seeded generated data."""

    def setUp(self):
        """Generate a seeded 10-feature data set and fit a seeded detector."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        data = generate_data(n_train=self.n_train,
                             n_test=self.n_test,
                             n_features=10,
                             contamination=self.contamination,
                             random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = data

        self.clf = PCA(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        """All fitted attributes are present and not None."""
        for name in ('decision_scores_', 'labels_', 'threshold_', '_mu',
                     '_sigma', 'selected_components_',
                     'selected_w_components_'):
            assert (hasattr(self.clf, name)
                    and getattr(self.clf, name) is not None)

    def test_train_scores(self):
        """There is one training score per training row."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and reach the ROC floor."""
        scores = self.clf.decision_function(self.X_test)

        # one score per test row
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # ranking quality
        auc = roc_auc_score(self.y_test, scores)
        assert auc >= self.roc_floor

    def test_prediction_labels(self):
        """Predicted labels are shaped like the true labels."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities stay within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test)
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_linear(self):
        """'linear' probabilities stay within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_unify(self):
        """'unify' probabilities stay within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_parameter(self):
        """An unsupported probability method raises ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict yields one label per training row."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """The default and named scorers run; unknown scorers are rejected."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scorer in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring=scorer)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the score ordering and stay inside the valid range."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering of ranks must match the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score ordering and stay near [0, 1]."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of ranks must match the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def test_model_clone(self):
        """The fitted detector can be passed through ``clone`` without error."""
        clone(self.clf)

    def tearDown(self):
        """Nothing to clean up."""
        pass
Beispiel #24
0
"""### Angle-based Outlier Detector (Probabilistic Based Model)"""

from pyod.models import abod

# Fit an ABOD detector on X with 10% contamination, 5 neighbours and the
# 'fast' method.
clf_abod = abod.ABOD(contamination=0.1, n_neighbors=5, method='fast')
clf_abod.fit(X)

y_pred = clf_abod.predict(X)  # outlier labels (0 or 1)
y_scores = clf_abod.decision_function(X)  # outlier scores

# Plot the first two columns of X, coloured by predicted label.
# With labels in {0, 1}, (y_pred - 1) // 2 evaluates to -1 for label 0 (the
# last colour, '#ff7f00') and to 0 for label 1 (the first colour, '#377eb8').
colors = np.array(['#377eb8', '#ff7f00'])
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred - 1) // 2])

# Passes only the first column of X (reshaped to a single-feature matrix) and
# the detector's own predictions as the labels.
# NOTE(review): scoring against the model's own predictions measures
# self-consistency rather than accuracy -- confirm this is intended.
clf_abod.fit_predict_score(X[:, 0].reshape(-1,1), y_pred, scoring='roc_auc_score')

"""### Linear Model PCA"""

from pyod.models.pca import PCA

# Same workflow with a PCA detector using default settings.
clf_pca = PCA()
clf_pca.fit(X)

y_pred = clf_pca.predict(X)  # outlier labels (0 or 1)
y_scores = clf_pca.decision_function(X)  # outlier scores

# Bare expression: has no effect in a script (presumably left over from a
# notebook cell that displayed the labels).
y_pred

# Same colour mapping as in the ABOD plot: label 0 -> '#ff7f00',
# label 1 -> '#377eb8'.
colors = np.array(['#377eb8', '#ff7f00'])
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred - 1) // 2])

# As above: first column of X only, scored against the PCA predictions.
clf_pca.fit_predict_score(X[:, 0].reshape(-1,1), y_pred, scoring='roc_auc_score')
Beispiel #25
0
class Model(object):
    """Label "center" stations as outliers or inliers with PyOD detectors.

    Every record in ``o_dict`` / ``c_dict`` is an indexable sequence whose
    first eight entries form the feature vector. For ``o_dict`` records the
    code additionally reads ``record[9]["lat_lon_index"]``,
    ``record[10]["center_index"]`` and ``record[-3]["coordinates"]``.
    Dictionary insertion order is relied upon throughout (guaranteed by the
    language from Python 3.7, an implementation detail of CPython 3.6).

    Parameters
    ----------
    c_dict : dict
        Reference records used when ``comp_with == "cams"``.
    o_dict : dict
        Station records; the ones used when ``comp_with == "openaq"``.

    Attributes
    ----------
    center_o : list of bool
        One flag per ``o_dict`` record: True when the record is a center
        record (see ``_in_center``).
    X_o : numpy array or empty list
        Stacked feature vectors of all ``o_dict`` records.
    X_c : list of numpy arrays
        One matrix per center record: every ``c_dict`` feature vector followed
        by that center record's own vector as the last row. Empty when
        ``c_dict`` is empty.
    clf : detector
        The most recently fitted detector (set by the ``pred_*`` methods).
    comp_with : str
        The comparison mode of the most recent ``pred_*`` call.
    """

    # Number of leading entries of a record that make up its feature vector.
    _N_FEATURES = 8
    _COMP_MODES = ("openaq", "cams")

    def __init__(self, c_dict, o_dict):
        self.c_dict = c_dict
        self.o_dict = o_dict

        n_feat = self._N_FEATURES
        records = list(self.o_dict.values())

        self.center_o = [bool(self._in_center(rec)) for rec in records]
        self.X_o = self._stack([rec[:n_feat] for rec in records])

        self.X_c = []
        if self.c_dict:
            reference_rows = [rec[:n_feat] for rec in self.c_dict.values()]
            for rec, is_center in zip(records, self.center_o):
                if is_center:
                    # The center record goes last so that ``labels_[-1]`` of a
                    # detector fitted on this matrix is the center's label.
                    self.X_c.append(
                        self._stack(reference_rows + [rec[:n_feat]]))

    def _in_center(self, val):
        """Return whether the record's lat/lon index equals its center index."""
        return val[9]["lat_lon_index"] == val[10]["center_index"]

    def _stack(self, x):
        """Stack a list of rows into an array; return ``[]`` for no rows."""
        if len(x) > 0:
            return np.stack(x)
        else:
            return []

    def _set_comp_with(self, comp_with):
        """Validate and remember the comparison mode."""
        if comp_with not in self._COMP_MODES:
            raise ValueError(
                "comp_with must be 'openaq' or 'cams', got {!r}".format(
                    comp_with))
        self.comp_with = comp_with

    def _fit_size(self, requested):
        """Clamp a size hyperparameter to what the rows of ``X_o`` allow.

        Returns ``requested`` when there are more rows than that, ``rows - 1``
        when there are at least three rows, and None when there are too few
        rows to fit anything.
        """
        # len() works for both the empty-list and the ndarray form of X_o;
        # comparing an ndarray with ``[]`` does not give a usable boolean.
        n_samples = len(self.X_o)
        if n_samples == 0:
            return None
        if n_samples > requested:
            return requested
        if n_samples > 2:
            return n_samples - 1
        return None

    def _fit_labels(self, detector, X):
        """Fit ``detector`` on ``X``, keep it as ``self.clf``, return labels."""
        self.clf = detector
        self.clf.fit(X)
        return self.clf.labels_

    def pred_KNN(self, k=5, comp_with="openaq"):
        """Classify center stations with a KNN detector.

        Parameters
        ----------
        k : int, optional (default=5)
            Number of neighbours. In "openaq" mode it is reduced to
            ``rows - 1`` when ``X_o`` has too few rows.
        comp_with : {"openaq", "cams"}
            "openaq" fits once on ``X_o``; "cams" fits once per matrix in
            ``X_c`` and keeps the label of its last row.

        Returns
        -------
        tuple of (A_location, B_location, C_location)
            See ``pred_location``.

        Raises
        ------
        ValueError
            If ``comp_with`` is not a supported mode.
        """
        self._set_comp_with(comp_with)

        if comp_with == "openaq":
            n_neighbors = self._fit_size(k)
            if n_neighbors is None:
                pred = []
            else:
                pred = self._fit_labels(KNN(n_neighbors=n_neighbors),
                                        self.X_o)
        else:  # "cams"
            pred = [
                self._fit_labels(KNN(n_neighbors=k), each_X)[-1]
                for each_X in self.X_c
            ]

        return self.pred_location(pred)

    def pred_COPOD(self, comp_with="openaq"):
        """Classify center stations with a COPOD detector.

        Parameters, return value and errors are as for ``pred_KNN``, except
        that COPOD has no size hyperparameter, so any non-empty ``X_o`` is
        fitted.
        """
        self._set_comp_with(comp_with)

        if comp_with == "openaq":
            if len(self.X_o) == 0:
                pred = []
            else:
                pred = self._fit_labels(COPOD(), self.X_o)
        else:  # "cams"
            pred = [
                self._fit_labels(COPOD(), each_X)[-1] for each_X in self.X_c
            ]

        return self.pred_location(pred)

    # NOTE: an LSCP-based variant (LOF base detectors, ``local_region_size=k``)
    # used to live here in commented-out form; it was not in use.

    def pred_PCA(self, n_comp=3, comp_with='openaq'):
        """Classify center stations with a PCA detector.

        Parameters
        ----------
        n_comp : int, optional (default=3)
            Number of principal components. In "openaq" mode it is reduced to
            ``rows - 1`` when ``X_o`` has too few rows, because the sample
            count must exceed the number of components.
        comp_with : {"openaq", "cams"}
            As for ``pred_KNN``.

        Returns
        -------
        tuple of (A_location, B_location, C_location)
            See ``pred_location``.

        Raises
        ------
        ValueError
            If ``comp_with`` is not a supported mode.
        """
        self._set_comp_with(comp_with)

        if comp_with == "openaq":
            n_components = self._fit_size(n_comp)
            if n_components is None:
                pred = []
            else:
                pred = self._fit_labels(PCA(n_components=n_components),
                                        self.X_o)
        else:  # "cams"
            pred = [
                self._fit_labels(PCA(n_components=n_comp), each_X)[-1]
                for each_X in self.X_c
            ]

        return self.pred_location(pred)

    def pred_location(self, pred):
        """Sort the center stations into three lists according to ``pred``.

        Parameters
        ----------
        pred : sequence of labels (list or numpy array)
            Empty when no prediction could be made. In "cams" mode it holds
            one label per center station, in ``o_dict`` order; in "openaq"
            mode one label per ``o_dict`` record.

        Returns
        -------
        A_location : list
            Center stations whose label equals 1.
        B_location : list
            Center stations with any other label.
        C_location : list
            All center stations when ``pred`` is empty.

        Every entry is ``[key, coordinates]``. The mode is read from
        ``self.comp_with``.
        """
        A_location = []
        B_location = []
        C_location = []

        # (position in o_dict, [key, coordinates]) for every center station.
        centers = [
            (position, [key, record[-3]["coordinates"]])
            for position, (key, record) in enumerate(self.o_dict.items())
            if self.center_o[position]
        ]

        # len() rather than ``pred == []``: pred may be a numpy array.
        if len(pred) == 0:
            # Walk *all* stations; advancing only on center entries would
            # stall on the first non-center station and drop later centers.
            C_location = [location for _, location in centers]

        elif self.comp_with == "cams":
            # pred is indexed by the rank of the station among the centers.
            for rank, (_, location) in enumerate(centers):
                if pred[rank] == 1:
                    A_location.append(location)
                else:
                    B_location.append(location)

        elif self.comp_with == "openaq":
            # pred is indexed by the station's position in o_dict.
            for position, location in centers:
                if pred[position] == 1:
                    A_location.append(location)
                else:
                    B_location.append(location)

        return A_location, B_location, C_location
    test_set.append(negatives2[indices_negatives[i]])
    pred_test_set.append(1)
# print(len(train_set),len(test_set))
import numpy as np
# Convert the collected train/test samples to numpy arrays for the detectors.
train_set=np.array(train_set)
test_set=np.array(test_set)

from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
# from pyod.models.mcd import MCD

# Two detectors that both use contamination=0.2: a PCA detector with
# standardization enabled and a degree-2 polynomial-kernel OCSVM.
clf1=PCA(standardization = True,contamination=0.2)
# clf1 = MCD(assume_centered = True)
clf2=OCSVM(kernel = 'poly',nu = 0.25,degree =2,contamination =0.2)
# clf2 = OCSVM(kernel = 'linear',nu =0.02)
# Both detectors are fitted on the training set only.
clf1.fit(train_set)
clf2.fit(train_set)

# Predicted labels of each detector for the train and the test samples.
y_pred_train_pca=clf1.predict(train_set)
y_pred_test_pca=clf1.predict(test_set)

y_pred_train_ocsvm=clf2.predict(train_set)
y_pred_test_ocsvm=clf2.predict(test_set)
print(clf1.explained_variance_)
# print(y_pred_test_pca,y_pred_test_ocsvm)
# Counters for the per-sample comparison loop that follows.
train_pca_correct=0
train_ocsvm_correct=0
print("TRAIN SET")
for i in range(len(pred_train_set)):
    # print("Actual:",pred_train_set[i],"PCA",y_pred_train_pca[i],"OCSVM",y_pred_train_ocsvm[i])
    if pred_train_set[i]==y_pred_train_pca[i] and pred_train_set[i]==1:
Beispiel #27
0
class TestPCA(unittest.TestCase):
    """Tests for the PCA outlier detector fitted on seeded generated data."""

    def setUp(self):
        """Build seeded train/test data and fit the detector on the train part."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.5
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = PCA(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        """Run ``check_estimator`` on the fitted detector."""
        check_estimator(self.clf)

    def test_parameters(self):
        """Every fitted attribute must exist and be non-None after ``fit``."""
        # Use real assertions here: ``assertRaises(AttributeError, '<msg>')``
        # would try to *call* the message string and blow up with TypeError
        # instead of reporting which attribute is missing.
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            'selected_components_',
            'selected_w_components_',
        )
        for attr in fitted_attributes:
            self.assertIsNotNone(getattr(self.clf, attr, None),
                                 '{} is not set'.format(attr))

    def test_train_scores(self):
        """One training score is stored per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and beat the ROC floor."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the ground truth."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie in [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        """An unknown probability method is rejected with ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """``fit_predict`` returns one label per training sample."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Supported scorers run; an unknown scorer raises NotImplementedError."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the score ordering and stay inside the valid range."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score ordering and stay near [0, 1]."""
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        """No per-test cleanup is needed."""
        pass
Beispiel #28
0
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    except IOError:
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    for t in range(ite):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.4)

        # standardizing data for processing
        X_train_norm, X_test_norm = standardizer(X_train, X_test)

        # initialize 20 base detectors for combination

        clf = PCA()
        clf.fit(X_train_norm)

        train_scores = clf.decision_scores_
        test_scores = clf.decision_function(X_test_norm)

        print()
        evaluate_print('PCA Train', y_train, train_scores)
        evaluate_print('PCA Test', y_test, test_scores)
Beispiel #29
0
class PCA(CollectiveBaseDetector):
    """PCA-based outlier detection with both univariate and multivariate
    time series data. TS data will be first transformed to tabular format.
    For univariate data, it will be in shape of [valid_length, window_size].
    For multivariate data with d sequences, it will be in the shape of
    [valid_length, window_size * d].

    Parameters
    ----------
    window_size : int
        The moving window size.

    step_size : int, optional (default=1)
        The displacement for moving window.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    n_components : int, float, None or string
        Number of components to keep. It should be smaller than the window_size.
        if n_components is not set all components are kept::

            n_components == min(n_samples, n_features)

        if n_components == 'mle' and svd_solver == 'full', Minka\'s MLE is used
        to guess the dimension
        if ``0 < n_components < 1`` and svd_solver == 'full', select the number
        of components such that the amount of variance that needs to be
        explained is greater than the percentage specified by n_components
        n_components cannot be equal to n_features for svd_solver == 'arpack'.

    n_selected_components : int, optional (default=None)
        Number of selected principal components
        for calculating the outlier scores. It is not necessarily equal to
        the total number of the principal components. If not set, use
        all principal components.

    copy : bool (default True)
        If False, data passed to fit are overwritten and running
        fit(X).transform(X) will not yield the expected results,
        use fit_transform(X) instead.

    whiten : bool, optional (default False)
        When True (False by default) the `components_` vectors are multiplied
        by the square root of n_samples and then divided by the singular values
        to ensure uncorrelated outputs with unit component-wise variances.

        Whitening will remove some information from the transformed signal
        (the relative variance scales of the components) but can sometimes
        improve the predictive accuracy of the downstream estimators by
        making their data respect some hard-wired assumptions.

    svd_solver : string {'auto', 'full', 'arpack', 'randomized'}
        auto :
            the solver is selected by a default policy based on `X.shape` and
            `n_components`: if the input data is larger than 500x500 and the
            number of components to extract is lower than 80% of the smallest
            dimension of the data, then the more efficient 'randomized'
            method is enabled. Otherwise the exact full SVD is computed and
            optionally truncated afterwards.
        full :
            run exact full SVD calling the standard LAPACK solver via
            `scipy.linalg.svd` and select the components by postprocessing
        arpack :
            run SVD truncated to n_components calling ARPACK solver via
            `scipy.sparse.linalg.svds`. It requires strictly
            0 < n_components < X.shape[1]
        randomized :
            run randomized SVD by the method of Halko et al.

    tol : float >= 0, optional (default .0)
        Tolerance for singular values computed by svd_solver == 'arpack'.

    iterated_power : int >= 0, or 'auto', (default 'auto')
        Number of iterations for the power method computed by
        svd_solver == 'randomized'.

    random_state : int, RandomState instance or None, optional (default None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'.

    weighted : bool, optional (default=True)
        If True, the eigenvalues are used in score computation.
        The eigenvectors with small eigenvalues come with more importance
        in outlier score calculation.

    standardization : bool, optional (default=True)
        If True, perform standardization first to convert
        data to zero mean and unit variance.
        See http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    left_inds_, right_inds_ : numpy arrays
        The window index arrays returned by ``get_sub_matrices`` for the
        training data. Available once the detector is fitted.
    """
    def __init__(self,
                 window_size,
                 step_size=1,
                 contamination=0.1,
                 n_components=None,
                 n_selected_components=None,
                 copy=True,
                 whiten=False,
                 svd_solver='auto',
                 tol=0.0,
                 iterated_power='auto',
                 random_state=None,
                 weighted=True,
                 standardization=True):
        super(PCA, self).__init__(contamination=contamination)
        self.window_size = window_size
        self.step_size = step_size

        # parameters for PCA
        self.n_components = n_components
        self.n_selected_components = n_selected_components
        self.copy = copy
        self.whiten = whiten
        self.svd_solver = svd_solver
        self.tol = tol
        self.iterated_power = iterated_power
        self.random_state = random_state
        self.weighted = weighted
        self.standardization = standardization

        # initialize the underlying tabular PCA detector
        self.model_ = PCA_PYOD(
            n_components=self.n_components,
            n_selected_components=self.n_selected_components,
            contamination=self.contamination,
            copy=self.copy,
            whiten=self.whiten,
            svd_solver=self.svd_solver,
            tol=self.tol,
            iterated_power=self.iterated_power,
            random_state=self.random_state,
            weighted=self.weighted,
            standardization=self.standardization)

    def fit(self, X: np.array) -> object:
        """Fit detector on the sliding-window representation of X.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # ``np.float`` was removed from NumPy; ``np.float64`` is the dtype it
        # used to resolve to.
        X = check_array(X).astype(np.float64)

        # first convert it into submatrices, and flatten it
        sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices(
            X,
            self.window_size,
            self.step_size,
            return_numpy=True,
            flatten=True,
            flatten_order='F')

        # if self.n_components > sub_matrices.shape[1]:
        #     raise ValueError('n_components exceeds window_size times the number of sequences.')

        # fit the PCA model
        self.model_.fit(sub_matrices)
        self.decision_scores_ = self.model_.decision_scores_
        self._process_decision_scores()
        return self

    def decision_function(self, X: np.array):
        """Predict raw anomaly scores of X using the fitted detector.

        The anomaly score of an input sample is computed based on the fitted
        detector. For consistency, outliers are assigned with
        higher anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array
            The anomaly scores of the windows extracted from X, as returned
            by the underlying PCA detector.

        X_left_inds : numpy array
            Flattened left index array returned by ``get_sub_matrices``.

        X_right_inds : numpy array
            Flattened right index array returned by ``get_sub_matrices``.
        """
        check_is_fitted(self, ['model_'])
        # see ``fit`` for why np.float64 is used instead of np.float
        X = check_array(X).astype(np.float64)
        # first convert it into submatrices, and flatten it
        sub_matrices, X_left_inds, X_right_inds = get_sub_matrices(
            X,
            self.window_size,
            self.step_size,
            return_numpy=True,
            flatten=True,
            flatten_order='F')

        # return the prediction result by PCA
        return self.model_.decision_function(
            sub_matrices), X_left_inds.ravel(), X_right_inds.ravel()
def detect(file, amountanom, realtime):
    """
    Function to apply a very simple anomaly detector to a zeek conn.log
    file and print the flows with the highest anomaly scores.

    file: The log file handed to the BroLogReader
    amountanom: The top number of anomalies we want to print
    realtime: If we want to read the conn.log file in real time (not working).
              It is passed to the reader as its ``tail`` argument.

    Returns nothing; the selected flows are printed to stdout.
    """

    # Create a zeek reader on a given log file. Thanks brothon
    reader = bro_log_reader.BroLogReader(file, tail=realtime)
    # Create a Pandas dataframe from reader
    bro_df = pd.DataFrame(reader.readrows())

    # In case you need a label, due to some models being able to work in a
    # semi-supervised mode, then put it here. For now everything is 'normal',
    # and the label is not used for detection.
    bro_df['label'] = 'normal'
    # Change the datetime delta value to seconds. Scikit does not know how to
    # work with timedeltas.
    bro_df['durationsec'] = bro_df.duration.apply(lambda x: x.total_seconds())
    # Replace the rows without data (with '-') with -1. Even though this may
    # add a bias in the algorithms, it is better than not using the lines.
    for column in ('orig_bytes', 'resp_bytes', 'resp_pkts', 'orig_ip_bytes',
                   'resp_ip_bytes'):
        bro_df[column] = bro_df[column].replace(to_replace='-', value=-1)

    # Add the columns from the log file that we know are numbers. This is
    # only for conn.log files.
    feature_columns = [
        'durationsec', 'orig_bytes', 'id.resp_p', 'resp_bytes',
        'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes'
    ]
    X_train = bro_df[feature_columns]

    # The X_test is where we are going to search for anomalies. In our case
    # it holds the same data as X_train. It is an explicit copy because the
    # 'score' and 'pred' columns are added to it below; without the copy
    # those columns would also land in X_train and pandas would warn about
    # writing to a slice of bro_df.
    X_test = X_train.copy()

    #################
    # Select a model. Alternatives that can be swapped in here (they have to
    # be imported by the file):
    #   ABOD()   - Angle-base Outlier Detection. For an observation, the
    #              variance of its weighted cosine scores to all neighbors
    #              could be viewed as the outlying score.
    #   LOF(), CBLOF(), LOCI(), LSCP(), MCD(), OCSVM(), SOD(), SO_GAAL(),
    #   SOS(), XGBOD()
    #   KNN() or KNN(n_neighbors=10) - good results but slow
    #
    # PCA. Good and fast!
    clf = PCA()
    #################

    # Fit the model to the train data
    clf.fit(X_train)

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Add the scores and the predictions as new columns of the X test
    X_test['score'] = y_test_scores
    X_test['pred'] = y_test_pred

    # Add the score to the bro_df also. So we can show it at the end
    bro_df['score'] = X_test['score']

    # Keep the positive predictions only. That is, keep only what we predict
    # is an anomaly.
    X_test_predicted = X_test[X_test.pred == 1]

    # Keep the top X amount of anomalies
    top_anomalies = X_test_predicted.sort_values(
        by='score', ascending=False).iloc[:amountanom]

    ## Print the results
    # Find the predicted anomalies in the original bro dataframe, where the
    # rest of the data is. top_anomalies.index holds index *labels*, so the
    # lookup has to be label based (.loc) and not positional (.iloc).
    df_to_print = bro_df.loc[top_anomalies.index]
    print('\nFlows of the top anomalies')
    # Only print some columns, not all, so its easier to read.
    df_to_print = df_to_print.drop([
        'conn_state', 'history', 'local_orig', 'local_resp', 'missed_bytes',
        'ts', 'tunnel_parents', 'uid', 'label'
    ],
                                   axis=1)
    print(df_to_print)
Beispiel #31
0
class TestPCA(unittest.TestCase):
    """Test case exercising a PCA detector fitted on generated data."""

    def setUp(self):
        """Generate a train/test split and fit the detector on the train part."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.5

        (self.X_train, self.y_train,
         self.X_test, self.y_test) = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = PCA(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        """Run the estimator check on the fitted detector."""
        check_estimator(self.clf)

    def test_parameters(self):
        """Every fitted attribute must exist and must not be None."""
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'selected_components_',
            'selected_w_components_',
        )
        for attribute in fitted_attributes:
            assert_true(hasattr(self.clf, attribute) and
                        getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        """There is one training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and beat the ROC floor."""
        scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])

        roc = roc_auc_score(self.y_test, scores)
        assert_greater(roc, self.roc_floor)

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the true labels."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_linear(self):
        """Probabilities of the 'linear' method lie within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_unify(self):
        """Probabilities of the 'unify' method lie within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_parameter(self):
        """An unknown probability method is rejected with ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict yields one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Default and named scorers run; an unknown scorer is rejected."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scoring)

        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the order of the scores and stay within bounds."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        upper_bound = self.X_train.shape[0] + 1
        assert_array_less(ranks, upper_bound)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score order and stay below 1.01."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        """Nothing to clean up."""
        pass