Example #1
    def test_hbos(self):
        clf = HBOS(contamination=0.05)
        clf.fit(self.X_train)
        assert_equal(len(clf.decision_scores_), self.X_train.shape[0])

        pred_scores = clf.decision_function(self.X_test)
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        assert_equal(clf.predict(self.X_test).shape[0],
                     self.X_test.shape[0])
        assert_greater(roc_auc_score(self.y_test, pred_scores), 0.5)
Example #2
    def S2(self):

        self.S1()
        water_data = self.water_data
        result = self.result

        # Data preprocessing and model training
        clean_data = water_data[water_data['S1'] == 0]
        Y = pd.DataFrame(index=clean_data.index, columns=['S2'])

        X_train = np.array(clean_data.iloc[:, 1:12])
        name = list(clean_data.iloc[:, 1:12].columns.values)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        clf1 = IForest(contamination=0.05, max_features=11, bootstrap=True)
        clf2 = KNN(contamination=0.05, n_neighbors=100)
        clf3 = HBOS(contamination=0.05, n_bins=10)
        clf4 = PCA(contamination=0.05)

        clf1.fit(X_train)
        clf2.fit(X_train)
        clf3.fit(X_train)
        clf4.fit(X_train)

        Y['S2'] = clf1.labels_ * clf2.labels_ * clf3.labels_ * clf4.labels_
        water_data = pd.concat([water_data, Y], axis=1)
        # water_data.loc[water_data['S2'].isna(), ['S2']] = 0  # mark rows anomalous in S1 as 0 in S2

        result['统计异常'] = water_data['S2'].values

        # Find the anomalous dimension
        from sklearn.neighbors import KernelDensity
        clean_data = water_data[water_data['S1'] == 0]
        dens = pd.DataFrame(index=clean_data.index,
                            columns=[
                                'temperature', 'pH', 'EC', 'ORP', 'DO',
                                'turbidity', 'transparency', 'COD', 'P',
                                'NH3N', 'flux'
                            ])

        for i in dens.columns:
            kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(
                clean_data[i].values.reshape(-1, 1))
            dens[i] = np.exp(
                kde.score_samples(clean_data[i].values.reshape(-1, 1)))
        dens = dens.iloc[:, 0:11].rank()
        dens['S2_names'] = dens.idxmin(axis=1)
        water_data = pd.concat([water_data, dens['S2_names']], axis=1)
        self.water_data = water_data
        result['统计异常维度'] = water_data['S2_names'].values

        # Save the fitted scaler and models
        joblib.dump(scaler, "./water_model/S2_scaler")
        joblib.dump(clf1, "./water_model/S2_Iforest")
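Note: multiplying the four labels_ vectors above implements a unanimous vote, since pyod labels are 0 (inlier) and 1 (outlier); a row keeps a 1 only when every detector flags it. A minimal standalone sketch of that combination rule on toy data (assuming only pyod and NumPy; the water-quality columns are not used here):

import numpy as np
from pyod.models.hbos import HBOS
from pyod.models.knn import KNN

rng = np.random.RandomState(1)
# 200 inliers around 0 plus 10 obvious outliers around 10
X_toy = np.vstack([rng.normal(0, 1, size=(200, 3)),
                   rng.normal(10, 1, size=(10, 3))])

labels = [HBOS(contamination=0.05).fit(X_toy).labels_,
          KNN(contamination=0.05).fit(X_toy).labels_]
unanimous = np.prod(labels, axis=0)  # 1 only where every detector agrees
print(int(unanimous.sum()), "points flagged by all detectors")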
Example #3
def detect_anomaly(df):
	df = df.fillna(0)
	clf = HBOS()
	x_values = df.index.values.reshape(df.index.values.shape[0],1)
	y_values = df.total_traded_quote_asset_volume.values.reshape(df.total_traded_quote_asset_volume.values.shape[0],1)
	clf.fit(y_values)
	df["label_qav"] = clf.predict(y_values)
	df["score_qav"] = clf.decision_function(y_values)#.round(6)
	df['change_qav'] = df.total_traded_quote_asset_volume.pct_change(periods=1)*100
	df['change_price'] = df.last_price.pct_change(periods=1)*100
	return df
Example #4
def getOutlierHBOS(dataset):
    '''
    @brief Function that executes HBOS algorithm on the dataset and obtains the
    labels of the dataset indicating which instance is an inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels 0 means inlier, 1 means outlier
    '''
    # Initializating the model
    hbos = HBOS()
    # Fits the data and obtains labels
    hbos.fit(dataset)
    # Return labels
    return hbos.labels_
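A small usage sketch for the helper above, assuming pyod and NumPy are available and getOutlierHBOS is defined as shown (the synthetic data is only illustrative):

import numpy as np

rng = np.random.RandomState(0)
# one dense cluster plus a handful of distant points
dataset = np.vstack([rng.normal(0, 1, size=(95, 2)),
                     rng.normal(8, 1, size=(5, 2))])

labels = getOutlierHBOS(dataset)  # 0 = inlier, 1 = outlier
print("outliers found:", int(np.sum(labels)))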
Example #5
def train():
    """ Train the predictor on the data collected """
    start_time = time.time()
    device_uuid, date_time = request.args.get('deviceUUID'), request.args.get(
        'datetime')
    data_filename = get_data_filename(device_uuid, date_time)

    with open(data_filename, 'r') as f:
        rows = f.readlines()
    with open(config.awake_filename, 'rb') as f:
        awake_features = pickle.load(f)
    if len(rows) < config.min_train_data_size:
        return jsonify({
            "status": 1,
            "message": "Not enough training data! %d" % len(rows)
        })
    raw = np.zeros((len(rows), 3))
    for i in range(len(rows)):
        raw[i] = [int(val) for val in rows[i].strip().split(',')]
    norm = features.normalize(raw)
    temp_features = features.extract_multi_features(norm,
                                                    step=config.step_size,
                                                    x_len=config.sample_size)
    baseline_features = features.get_baseline_features(temp_features)
    norm_features = features.get_calibrated_features(temp_features,
                                                     baseline_features)
    X = np.concatenate((awake_features, norm_features), axis=0)
    X[:, 1] = np.abs(np.random.normal(0, 0.01, len(X)))
    app.logger.info(
        'Training classifier using %d feature sets, each containing %d features'
        % (X.shape[0], X.shape[1]))
    clf = HBOS(contamination=0.05)
    clf.fit(X)

    model_filename = get_model_filename(device_uuid, date_time)
    with open(model_filename, 'wb') as f:
        pickle.dump(clf, f)

    pred = clf.decision_function(X)
    baseline = {'features': baseline_features, 'hboss_base': np.min(pred)}

    baseline_filename = get_baseline_filename(device_uuid, date_time)
    with open(baseline_filename, 'wb') as f:
        pickle.dump(baseline, f)

    return jsonify({"status": 0, "time": (time.time() - start_time)})
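The serving side of this endpoint is not shown. A hedged sketch of how the pickled HBOS model and the stored baseline might be loaded and used to score a new feature row; the file paths and the anomaly_margin helper are hypothetical, not part of the original source:

import pickle
import numpy as np

# hypothetical paths; the real ones come from get_model_filename()/get_baseline_filename()
with open("model.pkl", "rb") as f:
    clf = pickle.load(f)
with open("baseline.pkl", "rb") as f:
    baseline = pickle.load(f)

def anomaly_margin(feature_row: np.ndarray) -> float:
    """Outlier score of one feature row relative to the stored minimum (illustrative)."""
    score = clf.decision_function(feature_row.reshape(1, -1))[0]
    return float(score - baseline["hboss_base"])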
Example #6
def extract_is_outlier(df: pd.DataFrame,
                       col: str,
                       pbar=None,
                       verbose: bool = True,
                       model=None,
                       outliers_fraction: float = 0.05,
                       replace_with=None) -> pd.DataFrame:
    """
    Create an is_outlier column
    :param df: the data
    :param col: the column name
    :param conf: the config dir
    :param pbar: tqdm progress bar
    :return:
    """
    df = df.copy(deep=True)

    msg = "Trying to find outliers in " + str(col)
    if pbar is None:
        print_c(verbose, msg)
    else:
        pbar.set_description(msg)

    if model is None:
        model = HBOS(contamination=outliers_fraction)
    X = df[col].astype(np.float32)
    mask = ~(np.isnan(X) | np.isinf(X) | np.isneginf(X))
    model.fit(X[mask].to_frame())
    preds = model.predict(X[mask].to_frame())
    df[col + '_' + 'isoutlier'] = 0
    df.loc[mask, col + '_' + 'isoutlier'] = preds

    if replace_with is not None:
        msg = "Replacing outliers in " + str(col) + " with " + str(
            replace_with)
        if pbar is None:
            print_c(verbose, msg)
        else:
            pbar.set_description(msg)
        df.loc[df[col + '_' + 'isoutlier'] == 1, col] = replace_with

    return df
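print_c is referenced above but not defined in the snippet; it is presumably a conditional print helper. A hypothetical stand-in plus a short usage example (the column values are made up):

import numpy as np
import pandas as pd

def print_c(verbose: bool, msg: str) -> None:
    """Print msg only when verbose is True (hypothetical stand-in)."""
    if verbose:
        print(msg)

values = list(np.random.RandomState(0).normal(10, 1, 30)) + [1000.0, np.nan]
frame = pd.DataFrame({"x": values})
frame = extract_is_outlier(frame, "x", outliers_fraction=0.05)
print(frame["x_isoutlier"].sum())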
Example #7
    def get_HBOS_scores(dataframe,
                        cols,
                        outliers_fraction=0.01,
                        standardize=True):
        '''Takes df, a list of selected column names, outliers_fraction = 0.01 default

        Returns:
            df with HBOS outlier labels added
        '''
        if standardize:
            #standardize selected variables
            minmax = MinMaxScaler(feature_range=(0, 1))
            dataframe[cols] = minmax.fit_transform(dataframe[cols])

        # Convert dataframe to a numpy array in order to incorporate our algorithm
        arrays = []
        for row in cols:
            row = dataframe[row].values.reshape(-1, 1)
            arrays.append(row)
        X = np.concatenate((arrays), axis=1)

        #fit
        clf = HBOS(contamination=outliers_fraction)
        #clf = CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=0)
        clf.fit(X)

        # predict raw anomaly score
        scores_pred = clf.decision_function(X) * -1

        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)

        CheckOutliers.df2 = dataframe
        CheckOutliers.df2['outlier'] = y_pred.tolist()

        print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
              'found with HBOS')
Example #8
def _get_outlier_labels(eigs: ndarray, tol: float) -> List[str]:
    """Identify the outliers of eigs with HBOS."""
    hb = HBOS(tol=tol)
    steps = np.arange(0, len(eigs))
    X = np.vstack([eigs, steps]).T  # data array
    is_outlier = np.array(hb.fit(X).labels_, dtype=bool)  # outliers get "1"

    # Because eigs are sorted, HBOS will *usually* flag outliers at one of the
    # two ends of the eigenvalues, which is what we want. When values away from
    # the ends are flagged instead, those flags are cleared below.
    if is_outlier[0]:
        start = find_first(is_outlier, False)
        for i in range(start, len(is_outlier)):
            is_outlier[i] = False
    if is_outlier[-1]:
        stop = find_last(is_outlier, False)
        for i in range(stop):
            is_outlier[i] = False
    if not is_outlier[0] and not is_outlier[-1]:  # force a break later
        is_outlier = np.zeros(is_outlier.shape, dtype=bool)

    return ["outlier" if label else "inlier" for label in is_outlier]
Example #9
class TestHBOS(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'hist_') and
                    self.clf.hist_ is not None)
        assert_true(hasattr(self.clf, 'bin_edges_') and
                    self.clf.bin_edges_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    # def test_score(self):
    #     self.clf.score(self.X_test, self.y_test)
    #     self.clf.score(self.X_test, self.y_test, scoring='roc_auc_score')
    #     self.clf.score(self.X_test, self.y_test, scoring='prc_n_score')
    #     with assert_raises(NotImplementedError):
    #         self.clf.score(self.X_test, self.y_test, scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example #10
class TestHBOS(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        if not hasattr(
                self.clf,
                'decision_scores_') or self.clf.decision_scores_ is None:
            self.fail('decision_scores_ is not set')
        if not hasattr(self.clf, 'labels_') or self.clf.labels_ is None:
            self.fail('labels_ is not set')
        if not hasattr(self.clf, 'threshold_') or self.clf.threshold_ is None:
            self.fail('threshold_ is not set')
        if not hasattr(self.clf, '_mu') or self.clf._mu is None:
            self.fail('_mu is not set')
        if not hasattr(self.clf, '_sigma') or self.clf._sigma is None:
            self.fail('_sigma is not set')
        if not hasattr(self.clf, 'hist_') or self.clf.hist_ is None:
            self.fail('hist_ is not set')
        if not hasattr(self.clf, 'bin_edges_') or self.clf.bin_edges_ is None:
            self.fail('bin_edges_ is not set')

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
Example #11
def detect_anomaly(df, model_type):
    clf = HBOS()
    if model_type == "forest":
        clf = IForest()

    x_values = df.index.values.reshape(df.index.values.shape[0], 1)
    y_values = df.close.values.reshape(df.close.values.shape[0], 1)
    clf.fit(y_values)
    df["label_close"] = clf.predict(y_values)
    df["score_close"] = clf.decision_function(y_values)  #.round(6)

    y_values = df.volume.values.reshape(df.volume.values.shape[0], 1)
    clf.fit(y_values)
    df["label_volume"] = clf.predict(y_values)
    df["score_volume"] = clf.decision_function(y_values)  #.round(4)

    # x_values = df.index.values.reshape(df.index.values.shape[0],1)
    # y_values = df.close.values.reshape(df.close.values.shape[0],1)
    # clf = KNN()
    # clf.fit(y_values)
    # clf.predict(y_values)
    # df["label_close_knn"] = clf.predict(y_values)
    # df["score_close_knn"] = clf.decision_function(y_values)#.round(6)

    # y_values = df.volume.values.reshape(df.volume.values.shape[0],1)
    # clf = KNN()
    # clf.fit(y_values)
    # clf.predict(y_values)
    # df["label_volume_knn"] = clf.predict(y_values)
    # df["score_volume_knn"] = clf.decision_function(y_values)#.round(4)

    # x_values = df.index.values.reshape(df.index.values.shape[0],1)
    # y_values = df.close.values.reshape(df.close.values.shape[0],1)
    # clf = PCA()
    # clf.fit(y_values)
    # clf.predict(y_values)
    # df["label_close_pca"] = clf.predict(y_values)
    # df["score_close_pca"] = clf.decision_function(y_values)#.round(6)

    # y_values = df.volume.values.reshape(df.volume.values.shape[0],1)
    # clf = PCA()
    # clf.fit(y_values)
    # clf.predict(y_values)
    # df["label_volume_pca"] = clf.predict(y_values)
    # df["score_volume_pca"] = clf.decision_function(y_values)#.round(4)

    # x_values = df.index.values.reshape(df.index.values.shape[0],1)
    # y_values = df.close.values.reshape(df.close.values.shape[0],1)
    # clf = IForest()
    # clf.fit(y_values)
    # clf.predict(y_values)
    # df["label_close_iforest"] = clf.predict(y_values)
    # df["score_close_iforest"] = clf.decision_function(y_values)#.round(6)

    # y_values = df.volume.values.reshape(df.volume.values.shape[0],1)
    # clf = IForest()
    # clf.fit(y_values)
    # clf.predict(y_values)
    # df["label_volume_iforest"] = clf.predict(y_values)
    # df["score_volume_iforest"] = clf.decision_function(y_values)#.round(4)

    return df
Example #12
def do_pyod(model, colnames, arr_baseline, arr_highlight):

    # init some counters
    n_charts, n_dims, n_bad_data, fit_success, fit_default, fit_fail = init_counters(
        colnames)

    # dict to collect results into
    results = {}

    n_lags = model.get('n_lags', 0)
    model_level = model.get('model_level', 'dim')
    model = model.get('type', 'hbos')

    # model init
    clf = pyod_init(model)

    # get map of cols to loop over
    col_map = get_col_map(colnames, model_level)

    # build each model
    for colname in col_map:

        chart = colname.split('|')[0]
        dimension = colname.split('|')[1] if '|' in colname else '*'
        arr_baseline_dim = arr_baseline[:, col_map[colname]]
        arr_highlight_dim = arr_highlight[:, col_map[colname]]

        # check for bad data
        bad_data = False

        # skip if bad data
        if bad_data:

            n_bad_data += 1
            log.info(f'... skipping {colname} due to bad data')

        else:

            if n_lags > 0:
                arr_baseline_dim = add_lags(arr_baseline_dim, n_lags=n_lags)
                arr_highlight_dim = add_lags(arr_highlight_dim, n_lags=n_lags)

            # remove any nan rows
            arr_baseline_dim = arr_baseline_dim[~np.isnan(arr_baseline_dim).
                                                any(axis=1)]
            arr_highlight_dim = arr_highlight_dim[~np.isnan(arr_highlight_dim).
                                                  any(axis=1)]

            log.debug(f'... chart = {chart}')
            log.debug(f'... dimension = {dimension}')
            log.debug(f'... arr_baseline_dim.shape = {arr_baseline_dim.shape}')
            log.debug(
                f'... arr_highlight_dim.shape = {arr_highlight_dim.shape}')
            log.debug(f'... arr_baseline_dim = {arr_baseline_dim}')
            log.debug(f'... arr_highlight_dim = {arr_highlight_dim}')

            if model == 'auto_encoder':
                clf = pyod_init(model, n_features=arr_baseline_dim.shape[1])

            clf, result = try_fit(clf, colname, arr_baseline_dim,
                                  PyODDefaultModel)
            fit_success += 1 if result == 'success' else 0
            fit_default += 1 if result == 'default' else 0

            # try predictions and if they fail use default model
            try:
                preds = clf.predict(arr_highlight_dim)
                probs = clf.predict_proba(arr_highlight_dim)[:, 1]
            except:
                fit_success -= 1
                fit_default += 1
                clf = PyODDefaultModel()
                clf.fit(arr_baseline_dim)
                preds = clf.predict(arr_highlight_dim)
                probs = clf.predict_proba(arr_highlight_dim)[:, 1]

            log.debug(f'... preds.shape = {preds.shape}')
            log.debug(f'... preds = {preds}')
            log.debug(f'... probs.shape = {probs.shape}')
            log.debug(f'... probs = {probs}')

            # save results
            score = (np.mean(probs) + np.mean(preds)) / 2
            if chart in results:
                results[chart].append({dimension: {'score': score}})
            else:
                results[chart] = [{dimension: {'score': score}}]

    # log some summary stats
    log.info(
        summary_info(n_charts, n_dims, n_bad_data, fit_success, fit_fail,
                     fit_default, model_level))

    return results
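add_lags is called above but not included in the snippet. A minimal sketch of one way such a helper could work (hypothetical: it appends n_lags shifted copies of every column; the NaN rows it introduces are dropped by the code above):

import numpy as np

def add_lags(arr: np.ndarray, n_lags: int = 1) -> np.ndarray:
    """Append n_lags lagged copies of each column (hypothetical sketch)."""
    out = arr.astype(float).copy()
    for lag in range(1, n_lags + 1):
        lagged = np.roll(arr.astype(float), lag, axis=0)
        lagged[:lag, :] = np.nan  # no history for the first `lag` rows
        out = np.concatenate([out, lagged], axis=1)
    return out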
Example #13
  data = pd.read_csv(path, index_col=0)
  data['plate'] = f
  data = data[data['Metadata_broad_sample'].isin(drugs)]
  data = data[data.columns.intersection(selected_cols)]


  b = data['Metadata_broad_sample']
  w = data['Metadata_Well']
  p =  data['plate']
  del data['Metadata_broad_sample']
  del data['Metadata_Well']
  del data['plate']
  
  outliers_fraction = 0.01
  clf = HBOS(contamination=outliers_fraction)
  clf.fit(data)
  y_pred = clf.predict(data)
  X = pd.DataFrame()
  X['outlier'] = y_pred.tolist()
  X['Metadata_broad_sample'] = b
  X['Metadata_Well'] = w
  X['plate'] = p
  X.to_csv('outlier_without_regress/'+f)


  #target = y_pred.tolist()
  #tsne = TSNE(n_components= 2, verbose=1, perplexity=40, n_iter=2000)
  #tsne_results = tsne.fit_transform(data)
  #fig = plt.figure()
  #ax = fig.add_subplot(111, projection='3d')
  #ax.scatter(tsne_results[:,0], tsne_results[:,1],tsne_results[:,2], cmap = "coolwarm", edgecolor = "None" , c = target)
Example #14
def fit_hbos_transformer(input_data: pd.DataFrame):
    hbos = HBOS()
    hbos.fit(input_data)
    return hbos
Example #15
 def hbos_transformer(self):
     hbos = HBOS()
     hbos.fit(self.train_transformed_data)
     return hbos
Example #16
 df = pd.read_csv(file)
 df.loc[df['ground.truth'] == 'anomaly', 'ground.truth'] = 1
 df.loc[df['ground.truth'] == 'nominal', 'ground.truth'] = 0
 y = df['ground.truth'].values.reshape(-1)
 df[['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7']] = scaler.fit_transform(
     df[['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7']])
 x1 = df['V1'].values.reshape(-1, 1)
 x2 = df['V2'].values.reshape(-1, 1)
 x3 = df['V3'].values.reshape(-1, 1)
 x4 = df['V4'].values.reshape(-1, 1)
 x5 = df['V5'].values.reshape(-1, 1)
 x6 = df['V6'].values.reshape(-1, 1)
 x7 = df['V7'].values.reshape(-1, 1)
 x = np.concatenate((x1, x2, x3, x4, x5, x6, x7), axis=1)
 hbos = HBOS(contamination=outliers_fraction)
 hbos.fit(x)
 y_pred = hbos.predict(x)
 fpr, tpr, threshold = roc_curve(y, y_pred)  # compute false and true positive rates
 roc_auc = auc(fpr, tpr)  # compute the AUC value
 lw = 2
 ax = fig.add_subplot(3, 3, i)
 plt.plot(fpr,
          tpr,
          color='darkorange',
          lw=lw,
          label='ROC curve (area = %0.3f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
 plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
 plt.xlim([0.0, 1.0])
 plt.ylim([0.0, 1.05])
 plt.xlabel('False Positive Rate')
 plt.ylabel('True Positive Rate')
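Note that the ROC curve above is computed from the hard 0/1 labels returned by hbos.predict, which yields only a single operating point; using the continuous outlier scores usually gives a more informative curve. A hedged alternative, assuming the x and y already defined in the snippet:

from sklearn.metrics import roc_curve, auc

scores = hbos.decision_function(x)  # continuous outlier scores
fpr, tpr, threshold = roc_curve(y, scores)
roc_auc = auc(fpr, tpr)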
Example #17
from pyod.utils.data import generate_data
from pyod.utils.utility import precision_n_scores
from pyod.models.hbos import HBOS
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 1000
    n_test = 500

    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train, n_test=n_test, contamination=contamination)

    # train an HBOS detector (default version)
    clf = HBOS()
    clf.fit(X_train)

    # get the prediction on the training data
    y_train_pred = clf.labels_
    y_train_score = clf.decision_scores_

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)
    y_test_score = clf.decision_function(X_test)

    print('Train ROC:{roc}, precision@n:{prn}'.format(
        roc=roc_auc_score(y_train, y_train_score),
        prn=precision_n_scores(y_train, y_train_score)))

    print('Test ROC:{roc}, precision@n:{prn}'.format(
        roc=roc_auc_score(y_test, y_test_score),
        prn=precision_n_scores(y_test, y_test_score)))
Example #18
def generate_meta_features(X):
    """Get the meta-features of a datasets X

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        Input array

    Returns
    -------
    meta_features : numpy array of shape (1, 200)
        Meta-feature in dimension of 200

    """
    # outliers_fraction = np.count_nonzero(y) / len(y)
    # outliers_percentage = round(outliers_fraction * 100, ndigits=4)
    X = check_array(X)

    meta_vec = []
    meta_vec_names = []

    # on the sample level
    n_samples, n_features = X.shape[0], X.shape[1]

    meta_vec.append(n_samples)
    meta_vec.append(n_features)

    meta_vec_names.append('n_samples')
    meta_vec_names.append('n_features')

    sample_mean = np.mean(X)
    sample_median = np.median(X)
    sample_var = np.var(X)
    sample_min = np.min(X)
    sample_max = np.max(X)
    sample_std = np.std(X)

    q1, q25, q75, q99 = np.percentile(X, [1, 25, 75, 99])
    iqr = q75 - q25

    normalized_mean = sample_mean / sample_max
    normalized_median = sample_median / sample_max
    sample_range = sample_max - sample_min
    sample_gini = gini(X)
    med_abs_dev = np.median(np.absolute(X - sample_median))
    avg_abs_dev = np.mean(np.absolute(X - sample_mean))
    quant_coeff_disp = (q75 - q25) / (q75 + q25)
    coeff_var = sample_var / sample_mean

    outliers_15iqr = np.logical_or(X < (q25 - 1.5 * iqr), X >
                                   (q75 + 1.5 * iqr))
    outliers_3iqr = np.logical_or(X < (q25 - 3 * iqr), X > (q75 + 3 * iqr))
    outliers_1_99 = np.logical_or(X < q1, X > q99)
    outliers_3std = np.logical_or(X < (sample_mean - 3 * sample_std), X >
                                  (sample_mean + 3 * sample_std))

    percent_outliers_15iqr = np.sum(outliers_15iqr) / len(X)
    percent_outliers_3iqr = np.sum(outliers_3iqr) / len(X)
    percent_outliers_1_99 = np.sum(outliers_1_99) / len(X)
    percent_outliers_3std = np.sum(outliers_3std) / len(X)

    has_outliers_15iqr = np.any(outliers_15iqr).astype(int)
    has_outliers_3iqr = np.any(outliers_3iqr).astype(int)
    has_outliers_1_99 = np.any(outliers_1_99).astype(int)
    has_outliers_3std = np.any(outliers_3std).astype(int)

    meta_vec.extend([
        sample_mean,
        sample_median,
        sample_var,
        sample_min,
        sample_max,
        sample_std,
        q1,
        q25,
        q75,
        q99,
        iqr,
        normalized_mean,
        normalized_median,
        sample_range,
        sample_gini,
        med_abs_dev,
        avg_abs_dev,
        quant_coeff_disp,
        coeff_var,
        # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10,
        percent_outliers_15iqr,
        percent_outliers_3iqr,
        percent_outliers_1_99,
        percent_outliers_3std,
        has_outliers_15iqr,
        has_outliers_3iqr,
        has_outliers_1_99,
        has_outliers_3std
    ])

    meta_vec_names.extend([
        'sample_mean',
        'sample_median',
        'sample_var',
        'sample_min',
        'sample_max',
        'sample_std',
        'q1',
        'q25',
        'q75',
        'q99',
        'iqr',
        'normalized_mean',
        'normalized_median',
        'sample_range',
        'sample_gini',
        'med_abs_dev',
        'avg_abs_dev',
        'quant_coeff_disp',
        'coeff_var',
        # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10,
        'percent_outliers_15iqr',
        'percent_outliers_3iqr',
        'percent_outliers_1_99',
        'percent_outliers_3std',
        'has_outliers_15iqr',
        'has_outliers_3iqr',
        'has_outliers_1_99',
        'has_outliers_3std'
    ])

    ###########################################################################

    normality_k2, normality_p = normaltest(X)
    is_normal_5 = (normality_p < 0.05).astype(int)
    is_normal_1 = (normality_p < 0.01).astype(int)

    meta_vec.extend(list_process(normality_p))
    meta_vec.extend(list_process(is_normal_5))
    meta_vec.extend(list_process(is_normal_1))

    meta_vec_names.extend(list_process_name('normality_p'))
    meta_vec_names.extend(list_process_name('is_normal_5'))
    meta_vec_names.extend(list_process_name('is_normal_1'))

    moment_5 = moment(X, moment=5)
    moment_6 = moment(X, moment=6)
    moment_7 = moment(X, moment=7)
    moment_8 = moment(X, moment=8)
    moment_9 = moment(X, moment=9)
    moment_10 = moment(X, moment=10)
    meta_vec.extend(list_process(moment_5))
    meta_vec.extend(list_process(moment_6))
    meta_vec.extend(list_process(moment_7))
    meta_vec.extend(list_process(moment_8))
    meta_vec.extend(list_process(moment_9))
    meta_vec.extend(list_process(moment_10))
    meta_vec_names.extend(list_process_name('moment_5'))
    meta_vec_names.extend(list_process_name('moment_6'))
    meta_vec_names.extend(list_process_name('moment_7'))
    meta_vec_names.extend(list_process_name('moment_8'))
    meta_vec_names.extend(list_process_name('moment_9'))
    meta_vec_names.extend(list_process_name('moment_10'))

    # note: this is for each dimension == the number of dimensions
    skewness_list = skew(X).reshape(-1, 1)
    skew_values = list_process(skewness_list)
    meta_vec.extend(skew_values)
    meta_vec_names.extend(list_process_name('skewness'))

    # note: this is for each dimension == the number of dimensions
    kurtosis_list = kurtosis(X)
    kurtosis_values = list_process(kurtosis_list)
    meta_vec.extend(kurtosis_values)
    meta_vec_names.extend(list_process_name('kurtosis'))

    correlation = np.nan_to_num(pd.DataFrame(X).corr(), nan=0)
    correlation_list = flatten_diagonally(correlation)[0:int(
        (n_features * n_features - n_features) / 2)]
    correlation_values = list_process(correlation_list)
    meta_vec.extend(correlation_values)
    meta_vec_names.extend(list_process_name('correlation'))

    covariance = np.cov(X.T)
    covariance_list = flatten_diagonally(covariance)[0:int(
        (n_features * n_features - n_features) / 2)]
    covariance_values = list_process(covariance_list)
    meta_vec.extend(covariance_values)
    meta_vec_names.extend(list_process_name('covariance'))

    # sparsity
    rep_counts = []
    for i in range(n_features):
        rep_counts.append(len(np.unique(X[:, i])))
    sparsity_list = np.asarray(rep_counts) / (n_samples)
    sparsity = list_process(sparsity_list)
    meta_vec.extend(sparsity)
    meta_vec_names.extend(list_process_name('sparsity'))

    # ANOVA p value
    p_values_list = []
    all_perm = list(itertools.combinations(list(range(n_features)), 2))
    for j in all_perm:
        p_values_list.append(f_oneway(X[:, j[0]], X[:, j[1]])[1])
    anova_p_value = list_process(np.asarray(p_values_list))
    # anova_p_value = np.mean(p_values_list)
    # anova_p_value_exceed_thresh = np.mean((np.asarray(p_values_list)<0.05).astype(int))
    meta_vec.extend(anova_p_value)
    meta_vec_names.extend(list_process_name('anova_p_value'))

    # pca
    pca_transformer = sklearn_PCA(n_components=3)
    X_transform = pca_transformer.fit_transform(X)

    # first pc
    pca_fpc = list_process(X_transform[:, 0],
                           r_min=False,
                           r_max=False,
                           r_mean=False,
                           r_std=True,
                           r_skew=True,
                           r_kurtosis=True)
    meta_vec.extend(pca_fpc)
    meta_vec_names.extend(
        ['first_pca_std', 'first_pca_skewness', 'first_pca_kurtosis'])

    # entropy
    entropy_list = []
    for i in range(n_features):
        counts = pd.Series(X[:, i]).value_counts()
        entropy_list.append(entropy(counts) / n_samples)
    entropy_values = list_process(entropy_list)
    meta_vec.extend(entropy_values)
    meta_vec_names.extend(list_process_name('entropy'))

    ##############################Landmarkers######################################
    # HBOS
    clf = HBOS(n_bins=10)
    clf.fit(X)
    HBOS_hists = clf.hist_
    HBOS_mean = np.mean(HBOS_hists, axis=0)
    HBOS_max = np.max(HBOS_hists, axis=0)
    HBOS_min = np.min(HBOS_hists, axis=0)
    meta_vec.extend(list_process(HBOS_mean))
    meta_vec.extend(list_process(HBOS_max))
    meta_vec.extend(list_process(HBOS_min))
    meta_vec_names.extend(list_process_name('HBOS_mean'))
    meta_vec_names.extend(list_process_name('HBOS_max'))
    meta_vec_names.extend(list_process_name('HBOS_min'))

    # IForest
    n_estimators = 100
    clf = IForest(n_estimators=n_estimators)
    clf.fit(X)

    n_leaves = []
    n_depth = []
    fi_mean = []
    fi_max = []

    # doing this for each sub-trees
    for i in range(n_estimators):
        n_leaves.append(clf.estimators_[i].get_n_leaves())
        n_depth.append(clf.estimators_[i].get_depth())
        fi_mean.append(clf.estimators_[i].feature_importances_.mean())
        fi_max.append(clf.estimators_[i].feature_importances_.max())
        # print(clf.estimators_[i].tree_)

    meta_vec.extend(list_process(n_leaves))
    meta_vec.extend(list_process(n_depth))
    meta_vec.extend(list_process(fi_mean))
    meta_vec.extend(list_process(fi_max))

    meta_vec_names.extend(list_process_name('IForest_n_leaves'))
    meta_vec_names.extend(list_process_name('IForest_n_depth'))
    meta_vec_names.extend(list_process_name('IForest_fi_mean'))
    meta_vec_names.extend(list_process_name('IForest_fi_max'))

    # PCA
    clf = PCA(n_components=3)
    clf.fit(X)
    meta_vec.extend(clf.explained_variance_ratio_)
    meta_vec.extend(clf.singular_values_)
    meta_vec_names.extend(
        ['pca_expl_ratio_1', 'pca_expl_ratio_2', 'pca_expl_ratio_3'])
    meta_vec_names.extend(['pca_sv_1', 'pca_sv_2', 'pca_sv_3'])

    # LODA
    n_bins = 10
    n_random_cuts = 100

    n_hists_mean = []
    n_hists_max = []

    n_cuts_mean = []
    n_cuts_max = []

    clf = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts)
    clf.fit(X)

    for i in range(n_bins):
        n_hists_mean.append(clf.histograms_[:, i].mean())
        n_hists_max.append(clf.histograms_[:, i].max())
    for i in range(n_random_cuts):
        n_cuts_mean.append(clf.histograms_[i, :].mean())
        n_cuts_max.append(clf.histograms_[i, :].max())

    meta_vec.extend(list_process(n_hists_mean))
    meta_vec.extend(list_process(n_hists_max))
    meta_vec.extend(list_process(n_cuts_mean))
    meta_vec.extend(list_process(n_cuts_max))

    meta_vec_names.extend(list_process_name('LODA_n_hists_mean'))
    meta_vec_names.extend(list_process_name('LODA_n_hists_max'))
    meta_vec_names.extend(list_process_name('LODA_n_cuts_mean'))
    meta_vec_names.extend(list_process_name('LODA_n_cuts_max'))

    return meta_vec, meta_vec_names
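Assuming the helper functions referenced above (gini, list_process, list_process_name, flatten_diagonally) and the imported detectors are available from the same module, calling the extractor is straightforward; a minimal usage sketch on random data:

import numpy as np

X_demo = np.random.RandomState(0).rand(500, 6)
meta_vec, meta_vec_names = generate_meta_features(X_demo)
print(len(meta_vec), len(meta_vec_names))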
Example #19
class TestHBOS(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'hist_') and self.clf.hist_ is not None)
        assert (hasattr(self.clf, 'bin_edges_')
                and self.clf.bin_edges_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    # def test_score(self):
    #     self.clf.score(self.X_test, self.y_test)
    #     self.clf.score(self.X_test, self.y_test, scoring='roc_auc_score')
    #     self.clf.score(self.X_test, self.y_test, scoring='prc_n_score')
    #     with assert_raises(NotImplementedError):
    #         self.clf.score(self.X_test, self.y_test, scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def test_model_clone(self):
        clone_clf = clone(self.clf)

    def tearDown(self):
        pass
Example #20
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train HBOS detector
    clf_name = 'HBOS'
    clf = HBOS()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)