Code Example #1
def test_iforest_subsampled_features():
    # It tests non-regression for #5732 which failed at predict.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], boston.target[:50], random_state=rng)
    clf = IsolationForest(max_features=0.8)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
Code Example #2
def outlier_rejection(X, y):
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]
Code Example #3
File: pyador.py Project: xiangnanyue/Pyod
    def _predict_self(self):

        clf = IsolationForest(contamination=self.frac)

        clf.fit(self.num_X)

        return clf.predict(self.num_X)
Code Example #4
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
Code Example #5
def outlier_rejection(X, y):
    """This will be our function used to resample our dataset."""
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]
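The docstring above says this function is meant to resample a dataset; in imbalanced-learn such a function is typically plugged into a FunctionSampler. A minimal sketch under that assumption (note the snippet also assumes an `rng` random state is already defined in scope):

import numpy as np
from sklearn.datasets import make_moons
from imblearn import FunctionSampler

rng = np.random.RandomState(42)          # assumed: outlier_rejection references a global rng
X, y = make_moons(n_samples=200, noise=0.1, random_state=42)

reject_sampler = FunctionSampler(func=outlier_rejection)
X_clean, y_clean = reject_sampler.fit_resample(X, y)  # rows flagged -1 by the forest are dropped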
Code Example #6
File: ITPA.py Project: Ayo616/KDD-workshop-second
def IsolationForest_calulate(train_data_one,test_data):
    # use the anomaly detection method
    clf = IsolationForest()
    # train the anomaly detection model
    clf.fit(train_data_one)
    # predict with the model
    Pre_result = clf.predict(test_data)
    # compute the fraction of samples predicted as normal
    prob = len([x for x in Pre_result if x == 1])/len(Pre_result)
    return prob
Code Example #7
def test_iforest_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng)
    clf.fit(X)
    pred = clf.predict(X)

    # assert detect outliers:
    assert_greater(np.min(pred[-2:]), np.max(pred[:-2]))
Code Example #8
def test_iforest_works(contamination):
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng, contamination=contamination)
    clf.fit(X)
    decision_func = -clf.decision_function(X)
    pred = clf.predict(X)
    # assert detect outliers:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_array_equal(pred, 6 * [1] + 2 * [-1])
Code Example #9
File: iforest.py Project: flaviassantos/pyod
    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                         max_samples=self.max_samples,
                                         contamination=self.contamination,
                                         max_features=self.max_features,
                                         bootstrap=self.bootstrap,
                                         n_jobs=self.n_jobs,
                                         random_state=self.random_state,
                                         verbose=self.verbose)
        self.detector_.fit(X=X,
                           y=None,
                           sample_weight=None)

        # invert decision_scores_. Outliers come with higher outlier scores.
        self.decision_scores_ = invert_order(
            self.detector_.decision_function(X))
        self._process_decision_scores()
        return self
Code Example #10
    def isolationForest(self, settings, mname, data):
        '''
        :param settings: -> settings dictionary
        :param mname: -> name of serialized cluster
        :return: -> isolation forest instance
        :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                        max_features:1.0, n_jobs:1, random_state:None, verbose:0}
        '''
        # rng = np.random.RandomState(42)
        if settings['random_state'] == 'None':
            settings['random_state'] = None

        if isinstance(settings['bootstrap'], str):
            settings['bootstrap'] = str2Bool(settings['bootstrap'])

        if isinstance(settings['verbose'], str):
            settings['verbose'] = str2Bool(settings['verbose'])

        if settings['max_samples'] != 'auto':
            settings['max_samples'] = int(settings['max_samples'])
        # print type(settings['max_samples'])
        for k, v in settings.iteritems():
            logger.info('[%s] : [INFO] IsolationForest %s set to %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
            print "IsolationForest %s set to %s" % (k, v)
        try:
            clf = IsolationForest(n_estimators=int(settings['n_estimators']), max_samples=settings['max_samples'], contamination=float(settings['contamination']), bootstrap=settings['bootstrap'],
                        max_features=float(settings['max_features']), n_jobs=int(settings['n_jobs']), random_state=settings['random_state'], verbose=settings['verbose'])
        except Exception as inst:
            logger.error('[%s] : [ERROR] Cannot instantiate isolation forest with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print "Error while instantiating isolation forest with %s and %s" % (type(inst), inst.args)
            sys.exit(1)
        # clf = IsolationForest(max_samples=100, random_state=rng)
        # print "*&*&*&& %s" % type(data)
        try:
            clf.fit(data)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Cannot fit isolation forest model with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(1)
        predict = clf.predict(data)
        print "Anomaly Array:"
        print predict
        self.__serializemodel(clf, 'isoforest', mname)
        return clf
Code Example #11
def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = IsolationForest(contamination=0.1).fit(X_train)
    clf2 = IsolationForest().fit(X_train)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf1.decision_function([[2., 2.]]) + clf1.offset_)
    assert_array_equal(clf2.score_samples([[2., 2.]]),
                       clf2.decision_function([[2., 2.]]) + clf2.offset_)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf2.score_samples([[2., 2.]]))
Code Example #12
    def predict(self, X, window=DEFAULT_WINDOW):
        """
        Predict if a particular sample is an outlier or not.

        :param X: the time series to run detection on
        :type X: pandas.Series
        :param window: the length of the window
        :type window: int
        :return: 1 denotes normal, 0 denotes abnormal.
        """
        x_train = list(range(0, 2 * window + 1)) + list(range(0, 2 * window + 1)) + list(range(0, window + 1))
        sample_features = list(zip(x_train, X))  # materialize so it can be reused for fit and predict
        clf = IsolationForest(self.n_estimators, self.max_samples, self.contamination, self.max_feature, self.bootstrap, self.n_jobs, self.random_state, self.verbose)
        clf.fit(sample_features)
        predict_res = clf.predict(sample_features)
        if predict_res[-1] == -1:
            return 0
        return 1
Code Example #13
def test_iforest_parallel_regression():
    """Check parallel regression."""
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng)

    ensemble = IsolationForest(n_jobs=3, random_state=0).fit(X_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = IsolationForest(n_jobs=1, random_state=0).fit(X_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
Code Example #14
def test_iforest_performance():
    """Test Isolation Forest performs well"""

    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = np.r_[X + 2, X - 2]
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = - clf.decision_function(X_test)

    # check that the anomaly scores rank outliers above inliers (ROC AUC > 0.98)
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
Code Example #15
def outlier_removal(df, col, method, params):
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        raise ValueError('Unknown outlier removal method: %s' % method)
    do_outlier_removal.fit(np.array(df[col]))
    if method == 'Isolation Forest':
        outlier_scores = do_outlier_removal.decision_function(np.array(df[col]))
        df[('meta', 'Outlier Scores - ' + method + str(params))] = outlier_scores
        is_outlier = do_outlier_removal.predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
    if method == 'Local Outlier Factor':
        is_outlier = do_outlier_removal.fit_predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
        df[('meta', 'Outlier Factor - ' + method + str(params))] = do_outlier_removal.negative_outlier_factor_
    return df, do_outlier_removal
Code Example #16
def test_iforest_warm_start():
    """Test iterative addition of iTrees to an iForest """

    rng = check_random_state(0)
    X = rng.randn(20, 2)

    # fit first 10 trees
    clf = IsolationForest(n_estimators=10, max_samples=20,
                          random_state=rng, warm_start=True)
    clf.fit(X)
    # remember the 1st tree
    tree_1 = clf.estimators_[0]
    # fit another 10 trees
    clf.set_params(n_estimators=20)
    clf.fit(X)
    # expecting 20 fitted trees and no overwritten trees
    assert len(clf.estimators_) == 20
    assert clf.estimators_[0] is tree_1
Code Example #17
File: qc.py Project: bilgelm/APPIAN
def _IsolationForest(X):
    rng = np.random.RandomState(42)
    clf = IsolationForest(max_samples=X.shape[0], random_state=rng)
    return clf.fit(X).predict(X)
Code Example #18
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
winedata = load_wine()
feature_names = winedata.feature_names
data_x = winedata.data    # feature matrix (needed below)
data_y = winedata.target  # class labels (needed below)

# %%
# derive class 1 wine data
inx = np.where(data_y == 1)[0]
class_1_y = data_y[inx]
class_1_x = data_x[inx, ]

# %%
from sklearn.ensemble import IsolationForest

clf = IsolationForest(contamination='auto')
clf.fit(class_1_x)
IFprediction = clf.predict(class_1_x)
anom_ind = np.where(IFprediction < 0)

anom_ind

# %%

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
plt.figure(figsize=(10, 8))

scatter = plt.scatter(class_1_x[:, 0],
                      class_1_x[:, 1],
                      c='slateblue',
Code Example #19
X_train, X_test, y_train, y_test = prepare_data_mean()

rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
#X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
#X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
# = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
#y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
#xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
#Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = clf.decision_function(X_train)
# Z = Z.reshape(xx.shape)
#
# plt.title("IsolationForest")
# plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolor='k')
Code Example #20
# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [("Robust covariance",
                       EllipticEnvelope(contamination=outliers_fraction)),
                      ("One-Class SVM",
                       svm.OneClassSVM(nu=outliers_fraction,
                                       kernel="rbf",
                                       gamma=0.1)),
                      ("Isolation Forest",
                       IsolationForest(contamination=outliers_fraction,
                                       random_state=42)),
                      ("Local Outlier Factor",
                       LocalOutlierFactor(n_neighbors=35,
                                          contamination=outliers_fraction))]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5, **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]],
               cluster_std=[0.5, 0.5],
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]],
               cluster_std=[1.5, .3],
               **blobs_params)[0],
    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
Code Example #21
                idx_test.append(gid)
            else:
                print "ERROR"
                sys.exit(-1)

    Xtrain.append(column_train)
    Xtest.append(column_test)

Xtrain = np.transpose(np.array(Xtrain))
Xtest = np.transpose(np.array(Xtest))
idx_train = idx_train[:Xtrain.shape[0]]
idx_test = idx_test[:Xtest.shape[0]]

# fit an iforest
iforest =  IsolationForest(n_estimators=ntrees,
                           max_samples=sample_frac, max_features=feat_frac,
                           n_jobs=-1, random_state=rng, verbose=1)
iforest.fit(Xtrain)

# anomaly scores
y_pred_train = iforest.predict(Xtrain)
y_pred_test = iforest.predict(Xtest)
train_feature_values = [(gid, val)
                        for gid, val in zip(idx_train, list(y_pred_train))]
test_feature_values = [(gid, val)
                        for gid, val in zip(idx_test, list(y_pred_test))]
for i, scenario in enumerate(MALICIOUS_SCENARIOS):
    all_feature_values = train_feature_values + \
                         [(gid, feat_value)
                          for gid, feat_value in test_feature_values
                          if gid/100 in BENIGN_SCENARIOS or
Code Example #22
File: housing.py Project: Sourge/udacity

# ## Improving the Prediction Model ##
# This part is about finding a better metric for predicting the prices of future house sales.
# 
# First, I will detect outliers and delete them from the dataset if needed.

# ### Detecting Outliers ###
# The first step to improve our learning behaviour is to find outliers and then remove them from the data set if needed.
# To detect outliers I will use the Isolation Forest algorithm, which works well for high-dimensional data sets such as the one we have here.

# In[ ]:

from sklearn.ensemble import IsolationForest

clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(df)
y = clf.predict(df)
print y
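# A minimal sketch of the deletion step described above (assuming the rows of
# df line up with the predictions in y); the name df_clean is illustrative:
df_clean = df[y == 1]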


# ### Location based prices ###
# House prices don't only depend on the size of the house or the number of rooms, but also depend strongly on the location of the house. To get an idea of how location might impact my data, I analyse the relationship between location and price in my dataset.

# In[ ]:

import gmaps
gmaps.configure(api_key="AIzaSyDPWAl8lcrK9q-tOkrl64sGkxDnbWz47Ko")

locations = df[["lat", "long"]]
prices = df["price"]
Code Example #23
        speed.drop(['vehicle_id', 'ride_id', 'type'], axis = 1, inplace = True)

        # merge
        vehicle = pd.merge(rpm, speed, how = 'outer', on = 'timestamp')

        # drop null values and zero speeds --> neutral gear
        # speed < 200 to remove outliers

        vh = vehicle.dropna(axis = 0)
        vh = vh[(vh['rpm'] > 0) & ((vh['speed'] > 0) & (vh['speed'] < 200))]

        # detect outliers using IsolationForest
        # assume contamination at 0.01 level

        distances = pairwise_distances(vh[['rpm','speed']],vh[['rpm','speed']], metric = 'cosine')
        clf = IsolationForest(max_samples = 100, contamination = 0.01, verbose = 1)
        clf.fit(distances)
        labels = clf.predict(distances)
        vh['outlier'] = labels

        # remove outliers found by IsolationForest
        vh = vh[['rpm','speed']][vh['outlier'] == 1]

        #recompute distances after outlier removal
        distances = pairwise_distances(vh[['rpm','speed']],vh[['rpm','speed']], metric = 'cosine')

        # initialize variable to keep best model, its silhouette score and predicted labels
        best_model = (None, -1, None)

        # iterate over possible number of gears
        # since we want to pick model with best silhouette score, can't start with single cluster (k=1)
Code Example #24
# Read and load files

activity = pd.read_csv('./evaluate/novin_feature.csv',  delimiter = ',')
#activity1 = pd.read_csv('./evaluate/thirtydays_feature.csv',  delimiter = ',')
activity1 = pd.read_csv('./evaluate/twentydays_feature.csv', delimiter=',')
X = np.array(activity.iloc[0:])
# X = np.array(activity.iloc[:,0:])

X_train = np.array(activity.iloc[:,[2,1]])

X_test = np.array(activity.iloc[:,[1,3]])


# fit the model
clf = IsolationForest(max_samples=99, random_state=rng)
clf.fit(X_train)

# Predict new test-set
pred_new = clf.predict(X_test)
y = pd.DataFrame(data=pred_new)
y_pred = np.array(y)
print(pred_new)

# count how many test samples are predicted as normal (label 1)
test_error = pred_new[pred_new == 1].size

#print(test_error)
#print(y_pred_test)
# we use pickle to save our classifier so next time we don't have to re-train
with open('isolation.pickle', 'wb') as f:
Code Example #25
class IForest(BaseDetector):
    """Wrapper of scikit-learn Isolation Forest with more functionalities.

    The IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum and
    minimum values of the selected feature.
    See :cite:`liu2008isolation,liu2012isolation` for details.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produce shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default="auto")
        The number of samples to draw from X to train each base estimator.

            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
            - If "auto", then `max_samples=min(256, n_samples)`.

        If max_samples is larger than the number of samples provided,
        all samples will be used for all trees (no sampling).

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the threshold
        on the decision function.

    max_features : int or float, optional (default=1.0)
        The number of features to draw from X to train each base estimator.

            - If int, then draw `max_features` features.
            - If float, then draw `max_features * X.shape[1]` features.

    bootstrap : boolean, optional (default=False)
        If True, individual trees are fit on random subsets of the training
        data sampled with replacement. If False, sampling without replacement
        is performed.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    behaviour : str, default='old'
        Behaviour of the ``decision_function`` which can be either 'old' or
        'new'. Passing ``behaviour='new'`` makes the ``decision_function``
        change to match other anomaly detection algorithm API which will be
        the default behaviour in the future. As explained in details in the
        ``offset_`` attribute documentation, the ``decision_function`` becomes
        dependent on the contamination parameter, in such a way that 0 becomes
        its natural threshold to detect outliers.

        .. versionadded:: 0.7.0
           ``behaviour`` is added in 0.7.0 for back-compatibility purpose.

        .. deprecated:: 0.20
           ``behaviour='old'`` is deprecated in sklearn 0.20 and will not be
           possible in 0.22.

        .. deprecated:: 0.22
           ``behaviour`` parameter will be deprecated in sklearn 0.22 and
           removed in 0.24.

        .. warning::
            Only applicable for scikit-learn 0.20 and above.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    Attributes
    ----------
    estimators_ : list of ExtraTreeRegressor
        The collection of fitted sub-estimators.

    estimators_samples_ : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator.

    max_samples_ : integer
        The actual number of samples

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 behaviour='old',
                 random_state=None,
                 verbose=0):
        super(IForest, self).__init__(contamination=contamination)
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.behaviour = behaviour
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        # In sklearn 0.20+ new behaviour is added (arg behaviour={'new','old'})
        # to IsolationForest that shifts the location of the anomaly scores
        # noinspection PyProtectedMember
        if _sklearn_version_20():
            self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                             max_samples=self.max_samples,
                                             contamination=self.contamination,
                                             max_features=self.max_features,
                                             bootstrap=self.bootstrap,
                                             n_jobs=self.n_jobs,
                                             behaviour=self.behaviour,
                                             random_state=self.random_state,
                                             verbose=self.verbose)

        # Do not pass behaviour argument when sklearn version is < 0.20
        else:  # pragma: no cover
            self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                             max_samples=self.max_samples,
                                             contamination=self.contamination,
                                             max_features=self.max_features,
                                             bootstrap=self.bootstrap,
                                             n_jobs=self.n_jobs,
                                             random_state=self.random_state,
                                             verbose=self.verbose)

        self.detector_.fit(X=X,
                           y=None,
                           sample_weight=None)

        # invert decision_scores_. Outliers come with higher outlier scores.
        self.decision_scores_ = invert_order(
            self.detector_.decision_function(X))
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # invert outlier scores. Outliers come with higher outlier scores
        return invert_order(self.detector_.decision_function(X))

    @property
    def estimators_(self):
        """The collection of fitted sub-estimators.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_

    @property
    def estimators_samples_(self):
        """The subset of drawn samples (i.e., the in-bag samples) for
        each base estimator.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_samples_

    @property
    def max_samples_(self):
        """The actual number of samples.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.max_samples_
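The docstring above explains the path-length intuition behind the score; for context, a minimal usage sketch of this wrapper (the import path is assumed from the pyod project layout) where outliers receive the higher decision_scores_ and label 1:

import numpy as np
from pyod.models.iforest import IForest  # import path assumed from the pyod project

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(95, 2), rng.uniform(low=6, high=8, size=(5, 2))])  # 5 planted outliers

clf = IForest(n_estimators=100, contamination=0.05, random_state=0)
clf.fit(X)

scores = clf.decision_scores_  # higher score = more abnormal (sklearn scores are inverted)
labels = clf.labels_           # 0 = inlier, 1 = outlier, derived from threshold_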
Code Example #26
                # Use the bat DataframeToMatrix class
                features = [
                    'Z', 'rejected', 'proto', 'query', 'qclass_name',
                    'qtype_name', 'rcode_name', 'query_length', 'id.resp_p'
                ]
                to_matrix = dataframe_to_matrix.DataFrameToMatrix()
                bro_matrix = to_matrix.fit_transform(bro_df[features])
                print(bro_matrix.shape)

                # Print out the range of the daterange and some stats
                print('DataFrame TimeRange: {:s} --> {:s}'.format(
                    str(bro_df['ts'].min()), str(bro_df['ts'].max())))

                # Train/fit and Predict anomalous instances using the Isolation Forest model
                odd_clf = IsolationForest(
                    contamination=0.2)  # Marking 20% as odd
                odd_clf.fit(bro_matrix)
                bro_df['anomalous'] = [
                    predict == -1 for predict in odd_clf.predict(bro_matrix)
                ]

                # Now we create a new dataframe using the prediction from our classifier
                odd_df = bro_df[bro_df['anomalous']]

                # Now we're going to explore our odd observations with help from KMeans
                odd_matrix = to_matrix.transform(odd_df[features])
                #clusters = KMeans(n_clusters=num_clusters).fit_predict(odd_matrix).tolist()
                batch_kmeans.partial_fit(odd_matrix)
                clusters = batch_kmeans.predict(odd_matrix).tolist()

                # Set the cluster number for all the entries in the original dataframe
Code Example #27
for i in range(24):
    X_pca_i = X_pca_list[i]
    plt.scatter(X_pca_i[:, 0], X_pca_i[:, 1], s=0.8)
    plt.title('hour ' + str(i))
    plt.show()

users = df.user.unique()

outliers_list = []
for i in range(24):
    X_pca_i = X_pca_list[i]
    #Xi = X[df.hours == i]
    #pca = decomposition.PCA(n_components=2)
    #pca.fit(Xi)
    #X_pca_i = pca.transform(Xi)
    model = IsolationForest(contamination=0.005)
    model.fit(X_pca_i)
    pred = model.predict(X_pca_i)
    outliers = X_pca_i[pred == -1, :]
    for outlier in outliers:
        outliers_list += [outlier]
    plt.scatter(X_pca_i[:, 0], X_pca_i[:, 1], s=.8, color='blue')
    plt.scatter(outliers[:, 0], outliers[:, 1], s=6., color='red')
    plt.show()
outliers_list = np.array(outliers_list)  #.reshape(len(outliers_list),7)

idx = []
for i, row in enumerate(X_pca[:]):
    for outlier in outliers_list[:]:
        if np.array_equal(row, outlier):
            idx += [i]
Code Example #28
class Detector:
    def __init__(self,
                 wait=61 * 2,
                 sensitive=3,
                 ignore_continuous=10,
                 max_window=61 * 5):
        super().__init__()
        self.data = []  # store data in [n_samples, n_features]
        self.inputs = []
        self.wait = int(wait)  # cold start waiting
        self.max_window = int(max_window)  # max data for training
        self.retrain = 16  # retrain delay time-step
        self.sensitive = sensitive  # every N anomaly should be retrained
        self.Anomaly = 0  # anomaly count
        self.sigRetrain = True  # signal of retrain

        self.ignore_continuous = ignore_continuous  # anomaly alert every N ticks
        self.continuous = 0  # counter for counting alert delay
        self.cont = False  # Anomaly continue state
        self.anomaly_cont_acc = 0  # Anomaly continue counter

        self.ma = MA(list(range(3, 32, 2)) + [61, 121])
        self.madiff = MADIFF(self.ma)
        self.ewma = EWMA([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
        self.ewmadiff = EWMADIFF(self.ewma)
        self.dif = DIF()

    def fit_predict(self, ptr):
        self.continuous = (self.continuous
                           if self.continuous == 0 else 0 if self.continuous +
                           1 > self.ignore_continuous else self.continuous + 1)
        self.inputs.append(float(ptr))

        ptr = self.preprocess(float(ptr))
        if self.data and len(self.data) >= self.wait:
            ans = self.vote(ptr)

            try:
                if ans == 1 and self.continuous == 0:
                    self.continuous += 1
                    self.anomaly_cont_acc += 1
                    return ans

                elif ans == 1 and self.continuous > 0:
                    self.cont = True
                    self.anomaly_cont_acc += 1
                    return 0

                else:
                    if self.cont:
                        self.cont = False
                        self.ignore_continuous = math.ceil(
                            self.ignore_continuous * 0.6 +
                            self.anomaly_cont_acc * 0.4)
                        self.sensitive = math.ceil(self.sensitive * 0.8 +
                                                   self.anomaly_cont_acc * 0.2)
                        self.anomaly_cont_acc = 0
                        self.continuous = 0

                    return ans
            except:
                pass
            finally:
                if len(self.data) == self.max_window:
                    _ = self.data.pop(0)

                self.Anomaly += ans
                if self.Anomaly >= self.sensitive:
                    self.sigRetrain = True

                if self.sigRetrain:
                    self.train_model()

        else:
            self.data.append(ptr)
            return 0

    def train_model(self):
        # reset signal and counter
        self.sigRetrain, self.Anomaly = False, 0

        self.iforest = IsolationForest(
            n_estimators=math.ceil(np.mean(self.ma.periods)) *
            len(self.data[-1]) // 10 + 120,
            # n_jobs=os.cpu_count() - 1,
        )
        self.ocsvm = OneClassSVM(kernel="rbf")

        # num = len(self.data) - 1 if len(self.data) < 31 else 30
        self.lof = LocalOutlierFactor(
            n_neighbors=math.ceil(np.mean(self.ma.periods)),
            novelty=True,
            # n_jobs=os.cpu_count() - 1,
        )

        self.ee = EllipticEnvelope(support_fraction=1.0, contamination=0.25)

        # self.sscalar = StandardScaler().fit(np.array(self.data))
        # tmp = self.sscalar.transform(np.array(self.data))

        tmp = np.array(self.data)

        self.ee.fit(tmp)
        self.ocsvm.fit(tmp)
        self.lof.fit(tmp)
        self.iforest.fit(tmp)

    def vote(self, val):
        if self.sigRetrain:
            self.train_model()

        # tmp = self.sscalar.transform([val])
        tmp = [val]
        ans = (  # -1 is anomaly and 1 is normal
            self.ee.predict(tmp) + self.ocsvm.predict(tmp) +
            self.lof.predict(tmp) + self.iforest.predict(tmp))

        for i in range(
                len(self.ma.data.keys()) + len(self.ewma.data.keys()) + 1):
            ans += self.Boxplot_Anatomy(val, idx=i)

        self.data.append(val)
        if len(self.data) % self.retrain == 0:
            self.sigRetrain = True
            self.retrain = int(len(self.data)**0.5) - 1

        return 1 if ans[0] < 0 else 0

    def Boxplot_Anatomy(self, vals, idx=0):
        upper_bound = np.quantile(np.array(
            self.data).T[idx], 0.75) + 1.5 * iqr(np.array(self.data).T[idx])
        lower_bound = np.quantile(np.array(
            self.data).T[idx], 0.25) - 1.5 * iqr(np.array(self.data).T[idx])
        return -1 if vals[idx] > upper_bound or vals[idx] < lower_bound else 1

    def preprocess(self, val):
        ma = self.ma.get(val)
        ewma = self.ewma.get(val)
        return ([val] + ma + ewma + self.dif.get(val) + self.madiff.get(ma) +
                self.ewmadiff.get(ewma))
Code Example #29
 {
     "model":
     detector.OutlierDetector,
     "params": {
         "model": LocalOutlierFactor(n_neighbors=1, contamination=0.1)
     },
     "df": [
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0],
     ],
     "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0],
 },
 {
     "model": detector.OutlierDetector,
     "params": {
         "model": IsolationForest(n_estimators=100, contamination=0.1)
     },
     "df": [[0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0]],
     "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0],
 },
 {
     "model":
     detector.RegressionAD,
     "params": {
         "target": 2,
         "regressor": LinearRegression()
     },
     "df": [
         [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9],
         [0, 2, 4, 6, 8, 10, 12, 14, 14, 16, 18],
         [0, 3, 6, 10, 12, 14, 18, 21, nan, 24, 27],
Code Example #30
def initialize_isoForest(seed, n_estimators, max_samples, contamination, **kwargs):

    isoForest = IsolationForest(n_estimators=n_estimators, max_samples=max_samples,
        contamination=contamination, n_jobs=-1, random_state=seed, behaviour='new', **kwargs)
    return isoForest
Code Example #31
def Eval(clargs):    
    __version__ = '1.0'
    usage = """train_flows [options] normaldatafile"""
    parser = OptionParser(usage=usage, version=__version__)

    parser.add_option("-x", "--vectorizerfile", action="store", type="string", \
                      default='/tmp/vectorizers.pkl', help="")
    parser.add_option("-v", "--verbose", action="store_true", default=False, \
                      help="enable verbose output")
    parser.add_option("-o", "--maliciousdatafile", action="store", type="string", \
                      default=None, help="An optional file of malicious http logs")
    parser.add_option("-m", "--maxfeaturesperbag", action="store", type="int", \
                      default=100, help="maximum number of features per bag")
    parser.add_option("-g", "--ngramsize", action="store", type="int", \
                      default=7, help="ngram size")

    parser.add_option("-f", "--features", action="store", type="string", \
                      default="01000100111111111111", help="An optional file for choosing which features to be extracted")
    parser.add_option("-t", "--maxtrainingfeatures", action="store", type="int", \
                      default=50000, help="maximum number of rows to train with per class")
    parser.add_option("-n", "--numtrees", action="store", type="int", \
                      default=200, help="number of trees in isolation forest")
    parser.add_option("-s", "--numsamples", action="store", type="int", \
                      default=8192, help="number of samples in each tree")


    Start=time.time()
    (opts, args) = parser.parse_args(clargs)

    if len(args) != 2:
        parser.error('Incorrect number of arguments')

    ftu=[]
    features = opts.features

    for i, j in enumerate(features):
      if opts.verbose: print(j, all_fields[i])
      if j == 1 or j=='1':
        ftu.append(all_fields[i])

    if opts.verbose: print(ftu)
    #ftu = ['method', 'user_agent', 'status_code']


    # load the http data in to a data frame
    print('Loading HTTP data')
    df = load_brofile(args[0], fields_to_use)
    trainDf = load_brofile(args[1], fields_to_use)


    total_rows = len(df.index)
    if opts.verbose: print('Total number of rows: %d' % total_rows)
    if opts.maliciousdatafile != None:
      print('Reading malicious training data')
      df1 = load_brofile(opts.maliciousdatafile, fields_to_use)
      if opts.verbose: print('Read malicious data with %s rows ' % len(df1.index))
      #if (len(df1.index) > opts.maxtrainingfeatures):
      #  if opts.verbose: print('Too many malicious samples for training, downsampling to %d' % opts.maxtrainingfeatures)
      #  df1 = df1.sample(n=opts.maxtrainingfeatures)

      #set the classes of the dataframes and then stitch them together in to one big dataframe
      df['class'] = 0
      df1['class'] = 1
      classedDf = pd.concat([df,df1], ignore_index=True)
    else:
      #we weren't passed a file containing class-1 data, so we should generate some of our own.
      noiseDf = create_noise_contrast(df, numSamples)
      if opts.verbose: print('Added %s rows of generated malicious data'%numSamples)
      df['class'] = 0
      noiseDf['class'] = 1
      classedDf = pd.concat([df,noiseDf], ignore_index=True)

    #that doesn't matter
    trainDf['class']=0;


    # splitting into training and evaluation sets
    classedDf['is_train']=False
    trainDf['is_train']=True

    enhancedDf = enhance_flow(pd.concat([trainDf,classedDf], ignore_index=True), ftu)
    # construct some vectorizers based on the data in the DF. We need to vectorize future log files the exact same way so we
    # will be saving these vectorizers to a file.

    vectorizers = build_vectorizers(enhancedDf, ftu, max_features=opts.maxfeaturesperbag, ngram_size=opts.ngramsize, verbose=opts.verbose)

    #use the vectorizers to featureize our DF into a numeric feature dataframe
    featureMatrix = featureize(enhancedDf, ftu, vectorizers, verbose=opts.verbose)

    #add the class column back in (it wasn't featurized by itself)
    featureMatrix['class'] = enhancedDf['class']
    featureMatrix['is_train'] = enhancedDf['is_train']


    #split out the train and test df's into separate objects
    train, test = featureMatrix[featureMatrix['is_train']==True], featureMatrix[featureMatrix['is_train']==False]

    #drop the is_train column, we don't need it anymore
    train = train.drop('is_train', axis=1)
    test = test.drop('is_train', axis=1)


    #print('Calculating features')


    Trees=opts.numtrees
    Samples=opts.numsamples
    clf = IsolationForest(n_estimators=Trees, max_samples=Samples)

    
    clf.fit(train.drop('class', axis=1))

    testnoclass = test.drop('class', axis=1)

    print('Predicting')

    test.is_copy = False

    test['prediction'] = clf.decision_function(testnoclass) + 0.5

    print('Analyzing')
    #get the class-1 (outlier/anomaly) rows from the feature matrix, and drop the prediction so we can investigate them

    ##From Here
    Left=0.001 
    Right=0.01
    
    fpr, tpr, thresholds = roc_curve(test['class'], test['prediction'], pos_label=0)
    
    F=interpolate.interp1d(fpr, tpr, assume_sorted=True)
    x=np.logspace(np.log10(Left), np.log10(Right))
    y=F(x)
    roc_auc=auc(x, y)

    plt.figure()
    plt.xscale('log')

    plt.plot(fpr, tpr, color='b')
    plt.plot(x,y, color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')


    plt.plot(plt.xlim(), plt.ylim(), ls="--", c=".3")
    plt.savefig("fig3.png")
    plt.clf()
    plt.close('all')


    print('Area Under the Curve = %.6f' %(roc_auc))



    Min, Sec= divmod( int(time.time() - Start), 60 )
    #print Min, Sec

    target= open('Results.txt', 'a')
    target.write(str(Trees)+' ')
    target.write(str(Samples)+' ')
    target.write(str(Min)+' ')
    target.write(str(Sec)+' ')
    target.write(str(roc_auc))
    target.write("\n")
    target.write(str(features))
    target.write("\n")
    target.write("\n")
    target.close()

    
    print("Minutes: %d, Seconds: %d" % (int(Min), int(Sec)) )
    return roc_auc 
Code Example #32
                color='red',
                alpha=1.0)

    plt.xlabel("Accomodates")
    plt.ylabel("Price")
    plt.title(outputTitle)
    plt.legend()
    plt.show()


X = pd.read_csv("E:/outlier.csv")
X = X.iloc[:, 1:3]
print(X.info())
sns.jointplot(x="accommodates", y="price", data=X)

iso_forest_model = IsolationForest(n_estimators=100, contamination=0.01)
iso_forest_model.fit(X)
iso_forest_model.estimators_
outlier_scores = iso_forest_model.decision_function(X)
decisions_iso = iso_forest_model.predict(X)

displayResults(
    inliers=X[decisions_iso == 1],
    outliers=X[decisions_iso == -1],
    classifier=iso_forest_model,
    outputTitle="detecting potential outliers using isolation forest",
    outputName="outliers_isolation_forest")

lof_model = neighbors.LocalOutlierFactor(n_neighbors=20, contamination=0.01)
decisions_lof = lof_model.fit_predict(X)
print(lof_model.negative_outlier_factor_)
Code Example #33
File: utils.py Project: viviwang1008/ETH-ML
def removeOutliers(train,
                   labels=None,
                   opt='isolation',
                   cont='auto',
                   rerun=100,
                   outlier_importance=20,
                   max_features=0.2,
                   max_samples=0.2,
                   random_state=0,
                   **kwargs):
    # Set seed and data size
    n1, m = train.shape
    np.random.seed(random_state)

    # Merge into one dataset with labels
    if labels is None: data = train
    else: data = pd.concat([train, labels], axis=1)

    # Define functions for iteration of estimators
    def IterateResults(estimator, data, rerun):
        score = np.zeros(n1)
        print("Outlier detection: Iterating", opt, "estimator", rerun,
              "times.")
        print("Cummulative outliers found")

        def resample_score(seed):
            np.random.seed(seed)
            return estimator.fit(data).decision_function(data)

        mapping = map(resample_score, range(random_state,
                                            random_state + rerun))

        for i in mapping:
            # Give more weights to outliers found
            i[i < 0] = i[i < 0] * outlier_importance
            score += i
            print((score < 0).sum(), end="->")
        print("Done!")
        return score / rerun

    def MahalanobisDist(data):
        def is_pos_def(A):
            if np.allclose(A, A.T):
                try:
                    np.linalg.cholesky(A)
                    return True
                except np.linalg.LinAlgError:
                    return False
            else:
                return False

        covar = np.cov(data, rowvar=False)
        if is_pos_def(covar):
            covar_inv = np.linalg.inv(covar)
            if is_pos_def(covar_inv):
                mean = np.mean(data, axis=0)
                diff = data - mean
                md = np.sqrt(diff.dot(covar_inv).dot(diff.T).diagonal())
                return md
            else:
                print(
                    "Error: Inverse of Covariance Matrix is not positive definite!"
                )
        else:
            print("Error: Covariance Matrix is not positive definite!")

    # Choose method
    if opt == 'isolation':
        from sklearn.ensemble import IsolationForest
        estim = IsolationForest(contamination=cont,
                                behaviour='new',
                                max_samples=max_samples,
                                max_features=max_features,
                                n_estimators=50,
                                n_jobs=-1,
                                **kwargs)
        decision = estim.fit(data).predict(data)
        if (rerun > 0):
            decision = IterateResults(estim, data, rerun)

    if opt == 'lof':
        from sklearn.neighbors import LocalOutlierFactor
        estim = LocalOutlierFactor(contamination=cont,
                                   n_neighbors=55,
                                   n_jobs=-1)
        decision = estim.fit_predict(data)

    if opt == 'svm':
        from sklearn.svm import OneClassSVM
        if cont == 'auto':
            cont = 0.01
        estim = OneClassSVM(nu=cont, gamma='scale', tol=1e-3)
        decision = estim.fit(data).predict(data)

    if opt == 'covariance':
        if cont == 'auto': cont = 4
        MD = MahalanobisDist(data.values)
        std = np.std(MD)
        mean = np.mean(MD)
        k = 3. * std if True else 2. * std
        high, low = mean + k, mean - k
        decision = (MD >= high) * (-2) + (MD <= low) * (-2) + 1

    # Print summary information
    index = decision < 0
    print("Outlier values: ", round(index.sum() * 100 / n1, 3), "%  (",
          index.sum(), "/", n1, ")")
    print("Outlier values", opt, "method indecies:")
    for i in data[index].index:
        print(i, end=' ')
    print()
    if index.sum() / n1 > 0.1:
        print("Warning! More than 10% of training observations deleted!")
    # Discard outliers
    out = data[np.invert(index)]
    if labels is None:
        return out
    else:
        train = out.iloc[:, 0:m]
        labels = pd.DataFrame(out.iloc[:, -1])
        return (train, labels)
Code Example #34
def clean(ar, args, arch):
    """Cleans the archive and returns the cleaned copy.
    """
    ar_name = ar.get_filename().split()[-1]
    # Create copy of archive that is used to grab the profiles
    if args.bandpass:
        patient = calibrate_bandpass(ar)
    else:
        patient = ar.clone()
        patient.pscrunch()
        patient.remove_baseline()
    # Grab the profiles after dedispersing them
    patient.dedisperse()
    data = patient.get_data()[:, 0, :, :]
    if np.count_nonzero(data) == 0:
        print("Archive is empty.")
        return ar

    profile_number = data[:, :, 0].size
    pca_components = min(args.components, data.shape[2])

    if not args.quiet:
        print("Number of Profiles: %s" % profile_number)
        if not args.disable_pca:
            print("PCA parameters: n_components: %s" % pca_components)
        print(
            "IsolationForest parameters: n_estimators: %s max_samples: %s max_features: %s"
            % (args.estimators, args.samples, args.max_features))

    orig_shape = np.shape(data)
    # Reshape the profiles for pca computation

    data = np.reshape(data, (-1, orig_shape[2]))

    # Delete precleaned profiles
    if args.weight:
        orig_weights = ar.get_weights().flatten()
        known_rfi = np.where(orig_weights == 0)
        known_non_rfi = np.where(orig_weights != 0)
        data = np.delete(data, known_rfi, axis=0)

    # Compute additional features if wanted
    if args.features or args.disable_pca:
        array_feat = compute_metrics(data)

    if args.order:
        data = np.concatenate((data, array_feat), axis=1)

    # Compute the pca
    if not args.disable_pca:
        pca = PCA(n_components=pca_components, svd_solver="full")
        data_pca = pca.fit_transform(data)
        data_features = data_pca
        if args.features and not args.order:
            data_features = np.concatenate((data_features, array_feat), axis=1)
    else:
        data_features = array_feat

    print("All features: %s" % (data_features.shape[1]))

    # Compute the anomaly scores of the isolation forest algorithm
    # The random_state creates a reproducible result but this may not be the best solution in the future
    clf = IsolationForest(n_estimators=args.estimators,
                          max_samples=args.samples,
                          max_features=args.max_features,
                          n_jobs=2,
                          random_state=1)

    clf.fit(data_features)

    anomaly_factors = clf.decision_function(data_features)

    # Introduce known weights
    if args.weight:
        dummy_anomaly = np.zeros(orig_weights.shape)
        dummy_anomaly[known_non_rfi] = anomaly_factors
        dummy_anomaly[known_rfi] = np.inf
        anomaly_factors_reshape = np.reshape(dummy_anomaly, orig_shape[0:2])
    else:
        anomaly_factors_reshape = np.reshape(anomaly_factors, orig_shape[0:2])

    snrs = []
    split_values = []
    rfi_fracs = []
    # Cycle through different rfi fractions and find the best snr

    min_frac = 0
    max_frac = 50
    num_frac = 130

    for rfi_frac in np.linspace(min_frac, max_frac, num=num_frac):
        split_value = np.percentile(anomaly_factors, rfi_frac)
        test_profile = np.sum(
            data[anomaly_factors >= split_value, :orig_shape[2]], axis=0)
        profile_object = psrchive.Profile(orig_shape[2])
        profile_object.get_amps()[:] = test_profile
        test_snr = profile_object.snr()
        snrs.append(test_snr)
        split_values.append(split_value)
        rfi_fracs.append(rfi_frac)
        # print test_snr

    best_index = int(np.argmax(snrs) + args.additional * num_frac / max_frac)
    best_snr = snrs[best_index]
    best_frac = rfi_fracs[best_index]
    best_split_value = split_values[best_index]

    if not args.quiet:
        print("SNR: %.1f RFI fraction: %.4f" % (best_snr, best_frac * 0.01))

    # Set the weights in the archive
    set_weights_archive(ar, anomaly_factors_reshape, best_split_value)

    # Test if whole channel or subints should be removed
    if args.bad_chan != 1 or args.bad_subint != 1:
        ar = find_bad_parts(ar, args)

    # Create plot that shows zapped( red) and unzapped( blue) profiles if needed
    if args.print_zap:
        plt.imshow(anomaly_factors_reshape.T,
                   vmin=best_split_value - 0.0001,
                   vmax=best_split_value,
                   aspect='auto',
                   interpolation='nearest',
                   cmap=cm.coolwarm)
        plt.gca().invert_yaxis()
        plt.savefig("%s_%s_%s_%s.png" %
                    (ar_name, args.components, args.estimators, args.samples),
                    bbox_inches='tight')

    # Create log that contains the used parameters
    with open("clean.log", "a") as myfile:
        myfile.write("\n %s: Cleaned %s with %s" %
                     (datetime.datetime.now(), ar_name, args))
    return ar, (anomaly_factors, snrs, rfi_fracs)
Code Example #35
def isolationForest(dataset, rng):
    isolationforest = IsolationForest(behaviour='new',
                                      max_samples=100,
                                      random_state=rng,
                                      contamination='auto').fit(dataset)
    return isolationforest
Code Example #36
def spindles_detect(data,
                    sf,
                    hypno=None,
                    include=(1, 2, 3),
                    freq_sp=(12, 15),
                    duration=(0.5, 2),
                    freq_broad=(1, 30),
                    min_distance=500,
                    downsample=True,
                    thresh={
                        'rel_pow': 0.2,
                        'corr': 0.65,
                        'rms': 1.5
                    },
                    remove_outliers=False):
    """Spindles detection.

    Parameters
    ----------
    data : array_like
        Single-channel continuous EEG data. Unit must be uV.
    sf : float
        Sampling frequency of the data in Hz.
    hypno : array_like
        Sleep stage vector (hypnogram). If the hypnogram is loaded, the
        detection will only be applied to the value defined in
        ``include`` (default = N1 + N2 + N3 sleep). ``hypno`` MUST be a 1D
        array of integers with the same size as data and where -1 = Artefact,
        0 = Wake, 1 = N1, 2 = N2, 3 = N3, 4 = REM. If you need help loading
        your hypnogram vector, please read the Visbrain documentation at
        http://visbrain.org/sleep.
    include : tuple, list or int
        Values in ``hypno`` that will be included in the mask. The default is
        (1, 2, 3), meaning that the detection is applied on N1, N2 and N3
        sleep. This has no effect if ``hypno`` is None.
    freq_sp : tuple or list
        Spindles frequency range. Default is 12 to 15 Hz. Please note that YASA
        uses a FIR filter (implemented in MNE) with a 1.5Hz transition band,
        which means that for `freq_sp = (12, 15 Hz)`, the -6 dB points are
        located at 11.25 and 15.75 Hz.
    duration : tuple or list
        The minimum and maximum duration of the spindles.
        Default is 0.5 to 2 seconds.
    freq_broad : tuple or list
        Broad band frequency of interest.
        Default is 1 to 30 Hz.
    min_distance : int
        If two spindles are closer than `min_distance` (in ms), they are
        merged into a single spindle. Default is 500 ms.
    downsample : boolean
        If True, the data will be downsampled to 100 Hz or 128 Hz (depending
        on whether the original sampling frequency is a multiple of 100 or 128,
        respectively).
    thresh : dict
        Detection thresholds::

            'rel_pow' : Relative power (= power ratio freq_sp / freq_broad).
            'corr' : Pearson correlation coefficient.
            'rms' : Mean(RMS) + 1.5 * STD(RMS).
    remove_outliers : boolean
        If True, YASA will automatically detect and remove outlier spindles
        using an Isolation Forest (implemented in the scikit-learn package).
        The outlier detection is performed on all the spindle
        parameters with the exception of the 'Start' and 'End' columns.
        YASA uses a random seed (42) to ensure reproducible results.
        Note that this step will only be applied if there are at least 50
        detected spindles in the first place. Defaults to False.

    Returns
    -------
    sp_params : pd.DataFrame
        Pandas DataFrame:

            'Start' : Start time of each detected spindle (in seconds)
            'End' : End time (in seconds)
            'Duration' : Duration (in seconds)
            'Amplitude' : Amplitude (in uV)
            'RMS' : Root-mean-square (in uV)
            'AbsPower' : Median absolute power (in log10 uV^2)
            'RelPower' : Median relative power (ranging from 0 to 1, in % uV^2)
            'Frequency' : Median frequency (in Hz)
            'Oscillations' : Number of oscillations (peaks)
            'Symmetry' : Symmetry index, ranging from 0 to 1
            'Stage' : Sleep stage (only if hypno was provided)

    Notes
    -----
    For better results, apply this detection only on artefact-free NREM sleep.
    """
    # Safety check
    data = np.asarray(data, dtype=np.float64)
    if data.ndim == 2:
        data = np.squeeze(data)
    assert data.ndim == 1, 'Wrong data dimension. Please pass 1D data.'
    assert freq_sp[0] < freq_sp[1]
    assert freq_broad[0] < freq_broad[1]
    assert isinstance(downsample, bool), 'Downsample must be True or False.'

    # Hypno processing
    if hypno is not None:
        hypno = np.asarray(hypno, dtype=int)
        assert hypno.ndim == 1, 'Hypno must be one dimensional.'
        assert hypno.size == data.size, 'Hypno must have same size as data.'
        unique_hypno = np.unique(hypno)
        logger.info('Number of unique values in hypno = %i', unique_hypno.size)
        if isinstance(include, int):
            include = [include]
        else:
            assert isinstance(include, (tuple, list, np.ndarray))
        assert len(include) >= 1, 'include must have at least one element.'
        if not any(np.in1d(unique_hypno, include)):
            logger.error('The values in include are not present in hypno. '
                         'Switching to hypno = None.')
            hypno = None

    # Check data amplitude
    data_trimstd = trimbothstd(data, cut=0.10)
    data_ptp = np.ptp(data)
    logger.info('Number of samples in data = %i', data.size)
    logger.info('Sampling frequency = %.2f Hz', sf)
    logger.info('Data duration = %.2f seconds', data.size / sf)
    logger.info('Trimmed standard deviation of data = %.4f uV', data_trimstd)
    logger.info('Peak-to-peak amplitude of data = %.4f uV', data_ptp)
    if not (1 < data_trimstd < 1e3 or 1 < data_ptp < 1e6):
        logger.error('Wrong data amplitude. Unit must be uV. Returning None.')
        return None

    if 'rel_pow' not in thresh.keys():
        thresh['rel_pow'] = 0.20
    if 'corr' not in thresh.keys():
        thresh['corr'] = 0.65
    if 'rms' not in thresh.keys():
        thresh['rms'] = 1.5

    # Check if we can downsample to 100 or 128 Hz
    if downsample is True and sf > 128:
        if sf % 100 == 0 or sf % 128 == 0:
            new_sf = 100 if sf % 100 == 0 else 128
            fac = int(sf / new_sf)
            sf = new_sf
            data = data[::fac]
            logger.info('Downsampled data by a factor of %i', fac)
            if hypno is not None:
                hypno = hypno[::fac]
                assert hypno.size == data.size
        else:
            logger.warning("Cannot downsample if sf is not a multiple of 100 "
                           "or 128. Skipping downsampling.")

    # Create sleep stage vector mask
    if hypno is not None:
        mask = np.in1d(hypno, include)
    else:
        mask = np.ones(data.size, dtype=bool)

    # Bandpass filter
    data = filter_data(data,
                       sf,
                       freq_broad[0],
                       freq_broad[1],
                       method='fir',
                       verbose=0)

    # The width of the transition band is set to 1.5 Hz on each side,
    # meaning that for freq_sp = (12, 15 Hz), the -6 dB points are located at
    # 11.25 and 15.75 Hz.
    data_sigma = filter_data(data,
                             sf,
                             freq_sp[0],
                             freq_sp[1],
                             l_trans_bandwidth=1.5,
                             h_trans_bandwidth=1.5,
                             method='fir',
                             verbose=0)

    # Compute the pointwise relative power using interpolated STFT
    # Here we use a step of 200 ms to speed up the computation.
    f, t, Sxx = stft_power(data,
                           sf,
                           window=2,
                           step=.2,
                           band=freq_broad,
                           interp=False,
                           norm=True)
    idx_sigma = np.logical_and(f >= freq_sp[0], f <= freq_sp[1])
    rel_pow = Sxx[idx_sigma].sum(0)

    # Let's interpolate `rel_pow` to get one value per sample
    # Note that we could also have use the `interp=True` in the `stft_power`
    # function, however 2D interpolation is much slower than
    # 1D interpolation.
    func = interp1d(t, rel_pow, kind='cubic', bounds_error=False, fill_value=0)
    t = np.arange(data.size) / sf
    rel_pow = func(t)

    # Now we apply moving RMS and correlation on the sigma-filtered signal
    _, mcorr = moving_transform(data_sigma,
                                data,
                                sf,
                                window=.3,
                                step=.1,
                                method='corr',
                                interp=True)
    _, mrms = moving_transform(data_sigma,
                               data,
                               sf,
                               window=.3,
                               step=.1,
                               method='rms',
                               interp=True)

    # Hilbert power (to define the instantaneous frequency / power)
    n = data_sigma.size
    nfast = next_fast_len(n)
    analytic = signal.hilbert(data_sigma, N=nfast)[:n]
    inst_phase = np.angle(analytic)
    inst_pow = np.square(np.abs(analytic))
    # inst_freq = sf / 2pi * 1st-derivative of the phase of the analytic signal
    inst_freq = (sf / (2 * np.pi) * np.ediff1d(inst_phase))

    # Let's define the thresholds
    if hypno is None:
        thresh_rms = mrms.mean() + thresh['rms'] * trimbothstd(mrms, cut=0.10)
    else:
        thresh_rms = mrms[mask].mean() + thresh['rms'] * \
            trimbothstd(mrms[mask], cut=0.10)

    # Avoid too high threshold caused by Artefacts / Motion during Wake.
    thresh_rms = min(thresh_rms, 10)
    idx_rel_pow = (rel_pow >= thresh['rel_pow']).astype(int)
    idx_mcorr = (mcorr >= thresh['corr']).astype(int)
    idx_mrms = (mrms >= thresh_rms).astype(int)
    idx_sum = (idx_rel_pow + idx_mcorr + idx_mrms).astype(int)

    # Make sure that we do not detect spindles in REM or Wake if hypno != None
    if hypno is not None:
        idx_sum[~mask] = 0

    # For debugging
    logger.info('Moving RMS threshold = %.3f', thresh_rms)
    logger.info('Number of supra-threshold samples for relative power = %i',
                idx_rel_pow.sum())
    logger.info('Number of supra-threshold samples for moving correlation = %i',
                idx_mcorr.sum())
    logger.info('Number of supra-threshold samples for moving RMS = %i',
                idx_mrms.sum())

    # The detection using the three thresholds tends to underestimate the
    # real duration of the spindle. To overcome this, we compute a soft
    # threshold by smoothing the idx_sum vector with a 100 ms window.
    w = int(0.1 * sf)
    idx_sum = np.convolve(idx_sum, np.ones(w) / w, mode='same')
    # And we then find indices that are strictly greater than 2, i.e. we find
    # the 'true' beginning and 'true' end of the events by finding where at
    # least two out of the three thresholds were crossed.
    where_sp = np.where(idx_sum > 2)[0]

    # If no events are found, return None
    if not len(where_sp):
        logger.warning('No spindles were found in data. Returning None.')
        return None

    # Merge events that are too close
    if min_distance is not None and min_distance > 0:
        where_sp = _merge_close(where_sp, min_distance, sf)

    # Extract start, end, and duration of each spindle
    sp = np.split(where_sp, np.where(np.diff(where_sp) != 1)[0] + 1)
    idx_start_end = np.array([[k[0], k[-1]] for k in sp]) / sf
    sp_start, sp_end = idx_start_end.T
    sp_dur = sp_end - sp_start

    # Find events with bad duration
    good_dur = np.logical_and(sp_dur > duration[0], sp_dur < duration[1])

    # If no events of good duration are found, return None
    if all(~good_dur):
        logger.warning('No spindles were found in data. Returning None.')
        return None

    # Initialize empty variables
    n_sp = len(sp)
    sp_amp = np.zeros(n_sp)
    sp_freq = np.zeros(n_sp)
    sp_rms = np.zeros(n_sp)
    sp_osc = np.zeros(n_sp)
    sp_sym = np.zeros(n_sp)
    sp_abs = np.zeros(n_sp)
    sp_rel = np.zeros(n_sp)
    sp_sta = np.zeros(n_sp)

    # Number of oscillations (= number of peaks separated by at least 60 ms)
    # --> 60 ms because 1000 ms / 16 Hz = 62.5 ms, in other words, at 16 Hz,
    # peaks are separated by 62.5 ms. At 11 Hz, peaks are separated by 90 ms.
    distance = 60 * sf / 1000

    for i in np.arange(len(sp))[good_dur]:
        # Important: detrend the signal to avoid wrong peak-to-peak amplitude
        sp_x = np.arange(data[sp[i]].size, dtype=np.float64)
        sp_det = _detrend(sp_x, data[sp[i]])
        # sp_det = signal.detrend(data[sp[i]], type='linear')
        sp_amp[i] = np.ptp(sp_det)  # Peak-to-peak amplitude
        sp_rms[i] = _rms(sp_det)  # Root mean square
        sp_rel[i] = np.median(rel_pow[sp[i]])  # Median relative power

        # Hilbert-based instantaneous properties
        sp_inst_freq = inst_freq[sp[i]]
        sp_inst_pow = inst_pow[sp[i]]
        sp_abs[i] = np.median(np.log10(sp_inst_pow[sp_inst_pow > 0]))
        sp_freq[i] = np.median(sp_inst_freq[sp_inst_freq > 0])

        # Number of oscillations
        peaks, peaks_params = signal.find_peaks(sp_det,
                                                distance=distance,
                                                prominence=(None, None))
        sp_osc[i] = len(peaks)

        # For frequency and amplitude, we can also optionally use these
        # faster alternatives. If we use them, we do not need to compute the
        # Hilbert transform of the filtered signal.
        # sp_freq[i] = sf / np.mean(np.diff(peaks))
        # sp_amp[i] = peaks_params['prominences'].max()

        # Symmetry index
        sp_sym[i] = peaks[peaks_params['prominences'].argmax()] / sp_det.size

        # Sleep stage
        if hypno is not None:
            sp_sta[i] = hypno[sp[i]][0]

    # Create a dictionary
    sp_params = {
        'Start': sp_start,
        'End': sp_end,
        'Duration': sp_dur,
        'Amplitude': sp_amp,
        'RMS': sp_rms,
        'AbsPower': sp_abs,
        'RelPower': sp_rel,
        'Frequency': sp_freq,
        'Oscillations': sp_osc,
        'Symmetry': sp_sym,
        'Stage': sp_sta
    }

    df_sp = pd.DataFrame.from_dict(sp_params)[good_dur].reset_index(drop=True)

    if hypno is None:
        df_sp = df_sp.drop(columns=['Stage'])
    else:
        df_sp['Stage'] = df_sp['Stage'].astype(int).astype('category')

    # We need at least 50 detected spindles to apply the Isolation Forest.
    if remove_outliers and df_sp.shape[0] >= 50:
        from sklearn.ensemble import IsolationForest
        df_sp_dummies = pd.get_dummies(df_sp)
        col_keep = df_sp_dummies.columns.difference(['Start', 'End'])
        ilf = IsolationForest(behaviour='new',
                              contamination='auto',
                              max_samples='auto',
                              verbose=0,
                              random_state=42)

        good = ilf.fit_predict(df_sp_dummies[col_keep])
        good[good == -1] = 0
        logger.info('%i outliers were removed.', (good == 0).sum())
        # Remove outliers from DataFrame
        df_sp = df_sp[good.astype(bool)].reset_index(drop=True)

    logger.info('%i spindles were found in data.', df_sp.shape[0])
    return df_sp
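A minimal usage sketch for the detector above; the simulated signal, duration and sampling rate are assumptions for illustration only (real EEG data in uV would normally be passed):

import numpy as np

sf = 200.  # assumed sampling frequency (Hz)
rng = np.random.RandomState(42)
# One minute of noise scaled to a plausible EEG amplitude (uV) -- illustrative only.
data = rng.normal(loc=0, scale=30, size=int(60 * sf))

df_spindles = spindles_detect(data, sf, freq_sp=(12, 15), remove_outliers=False)
if df_spindles is not None:
    print(df_spindles[['Start', 'End', 'Duration', 'Frequency']].head())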
コード例 #37
0
ファイル: RF_Iter_Missing.py プロジェクト: PL97/Data-cleaning
    def training_oulier_testdata(self, data, outlier_features):
        ilf = IsolationForest(n_estimators=min(100, len(data)),
                              n_jobs=-1,
                              verbose=2)
        ilf.fit(data[outlier_features])
        return ilf
コード例 #38
0
    if dat == 'http' or dat == 'smtp':
        y = (y != 'normal.').astype(int)

    n_samples, n_features = np.shape(X)
    n_samples_train = n_samples // 2
    n_samples_test = n_samples - n_samples_train

    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    print('IsolationForest processing...')
    model = IsolationForest(bootstrap=True, n_jobs=-1)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

    scoring = -model.decision_function(X_test)  # the lower, the more normal
    predict_time = time() - tstart
    fpr, tpr, thresholds = roc_curve(y_test, scoring)
    AUC = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC for %s (area = %0.3f, train-time: %0.2fs, test-time: %0.2fs)' % (dat, AUC, fit_time, predict_time))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
コード例 #39
0
# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM":
    svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                    kernel="rbf",
                    gamma=0.1),
    "Robust covariance":
    EllipticEnvelope(contamination=outliers_fraction),
    "Isolation Forest":
    IsolationForest(max_samples=n_samples,
                    contamination=outliers_fraction,
                    random_state=rng),
    "Local Outlier Factor":
    LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = -1

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
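The example above is truncated right after the loop header. As a rough, hypothetical sketch (not the original continuation) of how such a comparison loop typically proceeds, reusing the classifiers and ground_truth defined above:

for offset in clusters_separation:
    np.random.seed(42)
    # Two Gaussian inlier blobs separated by `offset`, followed by uniform
    # outliers, in the same order as ground_truth (inliers first, outliers last).
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    for name, clf in classifiers.items():
        if name == "Local Outlier Factor":
            y_pred = clf.fit_predict(X)    # LOF has no separate predict()
        else:
            y_pred = clf.fit(X).predict(X)
        n_errors = (y_pred != ground_truth).sum()
        print("%s (separation %d): %d errors" % (name, offset, n_errors))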
コード例 #40
0
def sw_detect(data,
              sf,
              hypno=None,
              include=(2, 3),
              freq_sw=(0.3, 3.5),
              dur_neg=(0.3, 1.5),
              dur_pos=(0.1, 1),
              amp_neg=(40, 300),
              amp_pos=(10, 200),
              amp_ptp=(75, 500),
              downsample=True,
              remove_outliers=False):
    """Slow-waves detection.

    Parameters
    ----------
    data : array_like
        Single-channel continuous EEG data. Unit must be uV.
    sf : float
        Sampling frequency of the data in Hz.
    hypno : array_like
        Sleep stage vector (hypnogram). If the hypnogram is loaded, the
        detection will only be applied to the value defined in
        ``include`` (default = N2 + N3 sleep). ``hypno`` MUST be a 1D array of
        integers with the same size as data and where -1 = Artefact, 0 = Wake,
        1 = N1, 2 = N2, 3 = N3, 4 = REM. If you need help loading your
        hypnogram vector, please read the Visbrain documentation at
        http://visbrain.org/sleep.
    include : tuple, list or int
        Values in ``hypno`` that will be included in the mask. The default is
        (2, 3), meaning that the detection is applied only on N2 and N3 sleep.
        This has no effect if ``hypno`` is None.
    freq_sw : tuple or list
        Slow wave frequency range. Default is 0.3 to 3.5 Hz. Please note that
        YASA uses a FIR filter (implemented in MNE) with a 0.2Hz transition
        band, which means that for `freq_sw = (.3, 3.5 Hz)`, the -6 dB points
        are located at 0.2 and 3.6 Hz.
    dur_neg : tuple or list
        The minimum and maximum duration of the negative deflection of the
        slow wave. Default is 0.3 to 1.5 second.
    dur_pos : tuple or list
        The minimum and maximum duration of the positive deflection of the
        slow wave. Default is 0.1 to 1 second.
    amp_neg : tuple or list
        Absolute minimum and maximum negative trough amplitude of the
        slow-wave. Default is 40 uV to 300 uV.
    amp_pos : tuple or list
        Absolute minimum and maximum positive peak amplitude of the
        slow-wave. Default is 10 uV to 200 uV.
    amp_ptp : tuple or list
        Minimum and maximum peak-to-peak amplitude of the slow-wave.
        Default is 75 uV to 500 uV.
    downsample : boolean
        If True, the data will be downsampled to 100 Hz or 128 Hz (depending
        on whether the original sampling frequency is a multiple of 100 or 128,
        respectively).
    remove_outliers : boolean
        If True, YASA will automatically detect and remove outlier slow-waves
        using an Isolation Forest (implemented in the scikit-learn package).
        The outlier detection is performed on the frequency, amplitude and
        duration parameters of the detected slow-waves. YASA uses a random seed
        (42) to ensure reproducible results. Note that this step will only be
        applied if there are at least 100 detected slow-waves in the first
        place. Defaults to False.

    Returns
    -------
    sw_params : pd.DataFrame
        Pandas DataFrame:

            'Start' : Start of each detected slow-wave (in seconds of data)
            'NegPeak' : Location of the negative peak (in seconds of data)
            'MidCrossing' : Location of the negative-to-positive zero-crossing
            'PosPeak' : Location of the positive peak (in seconds of data)
            'End' : End time (in seconds)
            'Duration' : Duration (in seconds)
            'ValNegPeak' : Amplitude of the negative peak (in uV - filtered)
            'ValPosPeak' : Amplitude of the positive peak (in uV - filtered)
            'PTP' : Peak to peak amplitude (ValPosPeak - ValNegPeak)
            'Slope' : Slope between ``NegPeak`` and ``MidCrossing`` (in uV/sec)
            'Frequency' : Frequency of the slow-wave (1 / ``Duration``)
            'Stage' : Sleep stage (only if hypno was provided)

    Notes
    -----
    For better results, apply this detection only on artefact-free NREM sleep.

    Note that the ``PTP``, ``Slope``, ``ValNegPeak`` and ``ValPosPeak`` are
    computed on the filtered signal.
    """
    # Safety check
    data = np.asarray(data, dtype=np.float64)
    if data.ndim == 2:
        data = np.squeeze(data)
    assert data.ndim == 1, 'Wrong data dimension. Please pass 1D data.'
    assert freq_sw[0] < freq_sw[1]
    assert amp_ptp[0] < amp_ptp[1]
    assert isinstance(downsample, bool), 'Downsample must be True or False.'

    # Hypno processing
    if hypno is not None:
        hypno = np.asarray(hypno, dtype=int)
        assert hypno.ndim == 1, 'Hypno must be one dimensional.'
        assert hypno.size == data.size, 'Hypno must have same size as data.'
        unique_hypno = np.unique(hypno)
        logger.info('Number of unique values in hypno = %i', unique_hypno.size)
        if isinstance(include, int):
            include = [include]
        else:
            assert isinstance(include, (tuple, list, np.ndarray))
        assert len(include) >= 1, 'include must have at least one element.'
        if not any(np.in1d(unique_hypno, include)):
            logger.error('The values in include are not present in hypno. '
                         'Switching to hypno = None.')
            hypno = None

    # Check data amplitude
    data_trimstd = trimbothstd(data, cut=0.10)
    data_ptp = np.ptp(data)
    logger.info('Number of samples in data = %i', data.size)
    logger.info('Sampling frequency = %.2f Hz', sf)
    logger.info('Data duration = %.2f seconds', data.size / sf)
    logger.info('Trimmed standard deviation of data = %.4f uV', data_trimstd)
    logger.info('Peak-to-peak amplitude of data = %.4f uV', data_ptp)
    if not (1 < data_trimstd < 1e3 or 1 < data_ptp < 1e6):
        logger.error('Wrong data amplitude. Unit must be uV. Returning None.')
        return None

    # Check if we can downsample to 100 or 128 Hz
    if downsample is True and sf > 128:
        if sf % 100 == 0 or sf % 128 == 0:
            new_sf = 100 if sf % 100 == 0 else 128
            fac = int(sf / new_sf)
            sf = new_sf
            data = data[::fac]
            logger.info('Downsampled data by a factor of %i', fac)
            if hypno is not None:
                hypno = hypno[::fac]
                assert hypno.size == data.size
        else:
            logger.warning("Cannot downsample if sf is not a multiple of 100 "
                           "or 128. Skipping downsampling.")

    # Define time vector
    times = np.arange(data.size) / sf

    # Bandpass filter
    data_filt = filter_data(data,
                            sf,
                            freq_sw[0],
                            freq_sw[1],
                            method='fir',
                            verbose=0,
                            l_trans_bandwidth=0.2,
                            h_trans_bandwidth=0.2)

    # Find peaks in data
    # Negative peaks with amplitude within the amp_neg range (default 40-300 uV)
    idx_neg_peaks, _ = signal.find_peaks(-1 * data_filt, height=amp_neg)

    # Positive peaks with amplitude within the amp_pos range (default 10-200 uV)
    idx_pos_peaks, _ = signal.find_peaks(data_filt, height=amp_pos)

    # Intersect with sleep stage vector
    if hypno is not None:
        mask = np.in1d(hypno, include)
        idx_mask = np.where(mask)[0]
        idx_neg_peaks = np.intersect1d(idx_neg_peaks,
                                       idx_mask,
                                       assume_unique=True)
        idx_pos_peaks = np.intersect1d(idx_pos_peaks,
                                       idx_mask,
                                       assume_unique=True)

    # If no peaks are detected, return None
    if len(idx_neg_peaks) == 0 or len(idx_pos_peaks) == 0:
        logger.warning('No peaks were found in data. Returning None.')
        return None

    # Make sure that the last detected peak is a positive one
    if idx_pos_peaks[-1] < idx_neg_peaks[-1]:
        # If not, append a fake positive peak one sample after the last neg
        idx_pos_peaks = np.append(idx_pos_peaks, idx_neg_peaks[-1] + 1)

    # For each negative peak, we find the closest following positive peak
    pk_sorted = np.searchsorted(idx_pos_peaks, idx_neg_peaks)
    closest_pos_peaks = idx_pos_peaks[pk_sorted] - idx_neg_peaks
    closest_pos_peaks = closest_pos_peaks[np.nonzero(closest_pos_peaks)]
    idx_pos_peaks = idx_neg_peaks + closest_pos_peaks

    # Now we compute the PTP amplitude and keep only the good peaks
    sw_ptp = np.abs(data_filt[idx_neg_peaks]) + data_filt[idx_pos_peaks]
    good_ptp = np.logical_and(sw_ptp > amp_ptp[0], sw_ptp < amp_ptp[1])

    # If good_ptp is all False
    if all(~good_ptp):
        logger.warning('No slow-wave with good amplitude. Returning None.')
        return None

    sw_ptp = sw_ptp[good_ptp]
    idx_neg_peaks = idx_neg_peaks[good_ptp]
    idx_pos_peaks = idx_pos_peaks[good_ptp]

    # Now we need to check the negative and positive phase duration
    # For that we need to compute the zero crossings of the filtered signal
    zero_crossings = _zerocrossings(data_filt)
    # Make sure that there is a zero-crossing after the last detected peak
    if zero_crossings[-1] < max(idx_pos_peaks[-1], idx_neg_peaks[-1]):
        # If not, append the index of the last peak
        zero_crossings = np.append(zero_crossings,
                                   max(idx_pos_peaks[-1], idx_neg_peaks[-1]))

    # Find distance to previous and following zc
    neg_sorted = np.searchsorted(zero_crossings, idx_neg_peaks)
    previous_neg_zc = zero_crossings[neg_sorted - 1] - idx_neg_peaks
    following_neg_zc = zero_crossings[neg_sorted] - idx_neg_peaks
    neg_phase_dur = (np.abs(previous_neg_zc) + following_neg_zc) / sf

    # Distance (in samples) between the positive peaks and the previous and
    # following zero-crossings
    pos_sorted = np.searchsorted(zero_crossings, idx_pos_peaks)
    previous_pos_zc = zero_crossings[pos_sorted - 1] - idx_pos_peaks
    following_pos_zc = zero_crossings[pos_sorted] - idx_pos_peaks
    pos_phase_dur = (np.abs(previous_pos_zc) + following_pos_zc) / sf

    # We now compute a set of metrics
    sw_start = times[idx_neg_peaks + previous_neg_zc]  # Start in time vector
    sw_end = times[idx_pos_peaks + following_pos_zc]  # End in time vector
    sw_dur = sw_end - sw_start  # Same as pos_phase_dur + neg_phase_dur
    sw_midcrossing = times[idx_neg_peaks + following_neg_zc]  # Neg-to-pos zc
    sw_idx_neg = times[idx_neg_peaks]  # Location of negative peak
    sw_idx_pos = times[idx_pos_peaks]  # Location of positive peak
    # Slope between peak trough and midcrossing
    sw_slope = sw_ptp / (sw_midcrossing - sw_idx_neg)
    # Hypnogram
    if hypno is not None:
        sw_sta = hypno[idx_neg_peaks + previous_neg_zc]
    else:
        sw_sta = np.zeros(sw_dur.shape)

    # And we apply a set of thresholds to remove bad slow waves
    good_sw = np.logical_and.reduce((
        # Data edges
        previous_neg_zc != 0,
        following_neg_zc != 0,
        previous_pos_zc != 0,
        following_pos_zc != 0,
        # Duration criteria
        neg_phase_dur > dur_neg[0],
        neg_phase_dur < dur_neg[1],
        pos_phase_dur > dur_pos[0],
        pos_phase_dur < dur_pos[1],
        # Sanity checks
        sw_midcrossing > sw_start,
        sw_midcrossing < sw_end,
        sw_slope > 0,
    ))

    if all(~good_sw):
        logger.warning('No slow-wave satisfying all criteria. Returning None.')
        return None

    # Create a dictionary and then a dataframe (much faster)
    sw_params = {
        'Start': sw_start,
        'NegPeak': sw_idx_neg,
        'MidCrossing': sw_midcrossing,
        'PosPeak': sw_idx_pos,
        'End': sw_end,
        'Duration': sw_dur,
        'ValNegPeak': data_filt[idx_neg_peaks],
        'ValPosPeak': data_filt[idx_pos_peaks],
        'PTP': sw_ptp,
        'Slope': sw_slope,
        'Frequency': 1 / sw_dur,
        'Stage': sw_sta,
    }

    df_sw = pd.DataFrame.from_dict(sw_params)[good_sw]

    # Remove all duplicates
    df_sw = df_sw.drop_duplicates(subset=['Start'], keep=False)
    df_sw = df_sw.drop_duplicates(subset=['End'], keep=False)

    if hypno is None:
        df_sw = df_sw.drop(columns=['Stage'])
    else:
        df_sw['Stage'] = df_sw['Stage'].astype(int).astype('category')

    # We need at least 100 detected slow waves to apply the Isolation Forest.
    if remove_outliers and df_sw.shape[0] >= 100:
        from sklearn.ensemble import IsolationForest
        col_keep = [
            'Duration', 'ValNegPeak', 'ValPosPeak', 'PTP', 'Slope', 'Frequency'
        ]
        ilf = IsolationForest(behaviour='new',
                              contamination='auto',
                              max_samples='auto',
                              verbose=0,
                              random_state=42)

        good = ilf.fit_predict(df_sw[col_keep])
        good[good == -1] = 0
        logger.info('%i outliers were removed.', (good == 0).sum())
        # Remove outliers from DataFrame
        df_sw = df_sw[good.astype(bool)]

    logger.info('%i slow-waves were found in data.', df_sw.shape[0])
    return df_sw.reset_index(drop=True)
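A minimal usage sketch for the slow-wave detector above; the synthetic signal below (a 1 Hz, ~150 uV peak-to-peak oscillation plus noise) is an assumption chosen so that it falls inside the default duration and amplitude criteria:

import numpy as np

sf = 100.  # assumed sampling frequency (Hz)
rng = np.random.RandomState(42)
t = np.arange(int(120 * sf)) / sf
data = 75 * np.sin(2 * np.pi * 1.0 * t) + 10 * rng.randn(t.size)

df_sw = sw_detect(data, sf, remove_outliers=False)
if df_sw is not None:
    print(df_sw[['Start', 'Duration', 'PTP', 'Slope']].head())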
コード例 #41
0
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
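As a quick, hedged sanity check on the model fitted above, one can look at how many of the generated points end up labelled as inliers (+1) or outliers (-1) by predict():

print("train points kept as inliers: %.2f" % (y_pred_train == 1).mean())
print("test points kept as inliers:  %.2f" % (y_pred_test == 1).mean())
print("outliers flagged as outliers: %.2f" % (y_pred_outliers == -1).mean())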
コード例 #42
0
def test_iforest_deprecation():
    iforest = IsolationForest(behaviour='new')
    warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24"
    with pytest.warns(DeprecationWarning, match=warn_msg):
        iforest.fit(iris.data)
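In scikit-learn releases where behaviour has actually been removed (0.24 and later, as the warning message above states), the equivalent construction simply omits the argument; a small sketch:

from sklearn.ensemble import IsolationForest

# No 'behaviour' keyword in recent scikit-learn; contamination='auto' keeps the
# scoring convention that behaviour='new' used to select.
iforest = IsolationForest(contamination='auto', random_state=0)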
コード例 #43
0
def test_behaviour_param():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = IsolationForest(behaviour='old').fit(X_train)
    clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train)
    assert_array_equal(clf1.decision_function([[2., 2.]]),
                       clf2.decision_function([[2., 2.]]))
コード例 #44
0
def main():
    samplers = [
        None,
        InstanceHardnessThreshold(sampling_strategy='majority',
                                  random_state=123,
                                  n_jobs=-1),
        NearMiss(version=1,
                 sampling_strategy='majority',
                 random_state=123,
                 n_jobs=-1),
        NearMiss(version=3,
                 sampling_strategy='majority',
                 random_state=123,
                 n_jobs=-1),
        RandomUnderSampler(sampling_strategy='majority', random_state=123)
    ]

    outliers = [
        None,
        IsolationForest(random_state=123, behaviour='new', contamination=0.1),
        LocalOutlierFactor(n_neighbors=27, contamination=0.1)
    ]

    for sampler in samplers:
        for out in outliers:

            global sampler_str, out_str, perm_str
            sampler_str = sampler.__class__.__name__
            out_str = out.__class__.__name__

            print(f"\nsampler={sampler_str}, outlier={out_str}")

            X, y, X_valid, y_valid = Dataset.read_all()
            X, y, X_valid, y_valid = Modification.apply_standartization(
                X, y, X_valid, y_valid)

            print(X.shape)

            if out is not None:
                X, y = Modification.apply_outliers(X, y, out)
                print(X.shape)

            if sampler is None:
                weights, weight_valid = Modification.make_weights_column(
                    X, y, X_valid, y_valid)
            else:
                weights, weight_valid = None, None
                X, y = Modification.apply_samplers(X, y, sampler)
                if "Instance" in sampler_str:
                    X, y = Modification.apply_samplers(
                        X, y,
                        RandomUnderSampler(sampling_strategy='majority',
                                           random_state=123))

            print("0th perm:")
            perm_str = "0th"
            est = Model.train(X, y, X_valid, y_valid, weights, weight_valid)

            print("1st perm:")
            perm_str = "1st"

            X, y, X_valid, y_valid = Modification.apply_permutation(
                X, y, X_valid, y_valid, est, sampler.__class__.__name__,
                weight_valid)
            est = Model.train(X, y, X_valid, y_valid, weights, weight_valid)

            print("2nd perm:")
            perm_str = "2nd"
            X, y, X_valid, y_valid = Modification.apply_permutation(
                X, y, X_valid, y_valid, est, sampler.__class__.__name__,
                weight_valid)
            Model.train(X, y, X_valid, y_valid, weights, weight_valid)

    print(results)
    analyze_results()
コード例 #45
0
#split X_train into normal and outlier samples

X_train_normal = X_train[X_train['label_filled'] == 0].drop("label_filled",
                                                            axis=1,
                                                            inplace=False)
#X_train_outliers = X_train[X_train['label_filled'] == 1].drop("label_filled",axis=1, inplace=False)
X_test = X_test.drop("label_filled", axis=1, inplace=False)
X_train = X_train.drop("label_filled", axis=1, inplace=False)

#print X_test_normal.size

print("Load data done.")

# fit the model
clf = IsolationForest(n_estimators=1000,
                      contamination=0.05,
                      n_jobs=-1,
                      bootstrap=True)
clf.fit(X_train)
#clf.fit(X_train_normal)

#predict
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

#change predicted labels from (1, -1) to (0, 1)
y_pred_train = np.where(y_pred_train > 0, 0, 1)
y_pred_test = np.where(y_pred_test > 0, 0, 1)

#print result
print("train data classification report: ")
print(classification_report(y_train, y_pred_train))
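A natural follow-up, sketched here under the assumption that a matching y_test label vector exists in the original script (it is not shown in this excerpt), would be the same report on the test split:

print("test data classification report: ")
print(classification_report(y_test, y_pred_test))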
コード例 #46
0
ファイル: iforest.py プロジェクト: flaviassantos/pyod
class IForest(BaseDetector):
    """Wrapper of scikit-learn Isolation Forest with more functionalities.

    The IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum and
    minimum values of the selected feature.
    See :cite:`liu2008isolation,liu2012isolation` for details.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produce shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default="auto")
        The number of samples to draw from X to train each base estimator.

            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
            - If "auto", then `max_samples=min(256, n_samples)`.

        If max_samples is larger than the number of samples provided,
        all samples will be used for all trees (no sampling).

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the threshold
        on the decision function.

    max_features : int or float, optional (default=1.0)
        The number of features to draw from X to train each base estimator.

            - If int, then draw `max_features` features.
            - If float, then draw `max_features * X.shape[1]` features.

    bootstrap : boolean, optional (default=False)
        If True, individual trees are fit on random subsets of the training
        data sampled with replacement. If False, sampling without replacement
        is performed.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    Attributes
    ----------
    estimators_ : list of DecisionTreeClassifier
        The collection of fitted sub-estimators.

    estimators_samples_ : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator.

    max_samples_ : integer
        The actual number of samples

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(IForest, self).__init__(contamination=contamination)
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                         max_samples=self.max_samples,
                                         contamination=self.contamination,
                                         max_features=self.max_features,
                                         bootstrap=self.bootstrap,
                                         n_jobs=self.n_jobs,
                                         random_state=self.random_state,
                                         verbose=self.verbose)
        self.detector_.fit(X=X,
                           y=None,
                           sample_weight=None)

        # invert decision_scores_. Outliers come with higher outlier scores.
        self.decision_scores_ = invert_order(
            self.detector_.decision_function(X))
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # invert outlier scores. Outliers come with higher outlier scores
        return invert_order(self.detector_.decision_function(X))

    @property
    def estimators_(self):
        """The collection of fitted sub-estimators.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_

    @property
    def estimators_samples_(self):
        """The subset of drawn samples (i.e., the in-bag samples) for
        each base estimator.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_samples_

    @property
    def max_samples_(self):
        """The actual number of samples.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.max_samples_
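Given the attributes documented in the class docstring above, a minimal usage sketch of this wrapper could look like the following (the toy data is an assumption):

import numpy as np

rng = np.random.RandomState(42)
X_train = np.r_[0.3 * rng.randn(100, 2) + 2, 0.3 * rng.randn(100, 2) - 2]
X_new = rng.uniform(low=-4, high=4, size=(20, 2))

detector = IForest(n_estimators=100, contamination=0.1, random_state=42)
detector.fit(X_train)

print(detector.labels_[:10])              # 0 = inlier, 1 = outlier (training data)
print(detector.decision_scores_[:10])     # higher = more abnormal
print(detector.decision_function(X_new))  # raw anomaly scores for new samples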
コード例 #47
0
    ############## feature selection
    lsvc = LinearSVC(C=0.01, penalty="l1",
                     dual=False).fit(x_train_norm, y_train)
    model = SelectFromModel(lsvc, prefit=True)
    x_train_selected = model.transform(x_train_norm)
    x_test_selected = model.transform(x_test_norm)

    ############   remove outliers
    LOF = LocalOutlierFactor(n_neighbors=40, contamination=0.08)
    LOF.fit(x_train_selected)
    y_pred_local = LOF.fit_predict(x_train_selected)
    locations = np.where(y_pred_local == -1)

    rng = np.random.RandomState(42)
    IsoTree = IsolationForest(max_samples=100,
                              random_state=rng,
                              contamination=0.08)
    IsoTree.fit(x_train_selected)
    y_pred_iso = IsoTree.predict(x_train_selected)
    locations = np.where(y_pred_iso == -1)

    x_clean = x_train_selected
    y_clean = y_train

    for i in range(len(y_pred_local) - 1, -1, -1):
        if ((y_pred_iso[i] == -1) and (y_pred_local[i] == -1)):
            x_clean = np.delete(x_clean, i, axis=0)
            y_clean = np.delete(y_clean, i, axis=0)
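The reverse-index deletion loop above removes, one sample at a time, every point that both detectors flagged as an outlier; a hedged equivalent using a boolean mask avoids the repeated np.delete calls:

    # Keep a sample unless BOTH detectors marked it as an outlier (-1).
    both_outlier = (y_pred_iso == -1) & (y_pred_local == -1)
    x_clean = x_train_selected[~both_outlier]
    y_clean = np.asarray(y_train)[~both_outlier]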

    ############## CV for parameter tuning
    # x_ktrain, x_ktest, y_ktrain, y_ktest = train_test_split(x_clean, y_clean, test_size=0.4, random_state=0)
コード例 #48
0
import random

import numpy as np
from sklearn.ensemble import IsolationForest

from lib import proj_dir

if __name__ == '__main__':
    # %% Load the data.
    from lib.data_process.tmp import data_denoised as data
    cols = [
        'pm10', 'pm25', 'o3', 'so2', 'co', 'no2', 'aqi', 'clock_num',
        'weekday', 'month', 'sd', 'weather', 'temp', 'wd', 'ws'
    ]

    # %% Pairplot figure.
    # sns.set(font_scale = 0.5)
    # pg = sns.pairplot(data[cols], height = 1.0, aspect = 0.8, plot_kws = dict(linewidth = 1e-3, edgecolor = 'b', s = 0.3),
    #                   diag_kind = "hist", diag_kws = dict(bins = 20))
    # plt.tight_layout()
    # plt.savefig(os.path.join(proj_dir, 'graph/pollutants_weather_pair_plot.png'), dpi = 450)

    # %% Outlier detection.
    isoforest = IsolationForest(n_estimators=100, max_samples=0.9)
    X_train = np.array(data[cols])

    idxs = list(range(X_train.shape[0]))
    random.shuffle(idxs)
    X_train = X_train[idxs[:5000], :]

    isoforest.fit(X_train)
    y_pred_train = isoforest.predict(X_train)

    scores = isoforest.decision_function(X_train)
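As a short follow-up sketch, the decision-function scores computed above can be used to list the most anomalous of the sampled rows (in scikit-learn's convention, lower scores are more abnormal):

    # Indices of the 10 most anomalous training samples (lowest scores).
    worst = np.argsort(scores)[:10]
    print(worst)
    print(y_pred_train[worst])  # these should mostly be labelled -1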
コード例 #49
0
ファイル: iForest.py プロジェクト: yli96/cmuxstream-baselines
def run_IForest(X, labels, params):
    clf = IsolationForest(n_estimators = params['n_estimators'])
    clf.fit(X, labels)
    scores = clf.decision_function(X)
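    # scikit-learn's decision_function is larger for inliers, so the scores are
    # negated below to make higher values correspond to anomalies before
    # computing AUC and average precision.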
    auc, ap = compute_statistics(-scores, labels)
    return auc, ap, scores
コード例 #50
0
ファイル: em_bench.py プロジェクト: ngoix/EMMV_benchmarks
    n_samples_train = n_samples // 2
    n_samples_test = n_samples - n_samples_train

    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    # training and testing only on normal data:
    X_train = X_train[y_train == 0]
    y_train = y_train[y_train == 0]
    X_test = X_test[y_test == 0]
    y_test = y_test[y_test == 0]

    # define models:
    iforest = IsolationForest()
    lof = LocalOutlierFactor(n_neighbors=20)
    ocsvm = OneClassSVM()

    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
コード例 #51
0
# Going to try some of the other approaches SKL has for outlier detection
# rather than re-do that.

pipeline = Pipeline([('scale', StandardScaler()),
                     ('ocsvm', OneClassSVM(nu=contamination))])

pipeline.fit(X)
#  Visualize the fit
visualize_fit(X, pipeline)
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')
plt.title("OneClassSVM: nu: {}".format(contamination))

outliers = X[pipeline.predict(X) == -1]
plt.plot(outliers[:, 0], outliers[:, 1], 'ro', linewidth=2, markersize=10)
plt.show()

pipeline = Pipeline([('scale', StandardScaler()),
                     ('isof', IsolationForest(contamination=contamination))])

pipeline.fit(X)
#  Visualize the fit
visualize_fit(X, pipeline)
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')
plt.title("IsolationForest: contamination: {}".format(contamination))

outliers = X[pipeline.predict(X) == -1]
plt.plot(outliers[:, 0], outliers[:, 1], 'ro', linewidth=2, markersize=10)
plt.show()
コード例 #52
0
ファイル: bench_isolation_forest.py プロジェクト: ngoix/OCRF
            # indices = np.arange(X.shape[0])
            # np.random.shuffle(indices)  # shuffle the dataset
            # X = X[indices]
            # y = y[indices]

            X_train = X[:n_samples_train, :]
            X_test = X[n_samples_train:, :]
            y_train = y[:n_samples_train]
            y_test = y[n_samples_train:]

            # # training only on normal data:
            # X_train = X_train[y_train == 0]
            # y_train = y_train[y_train == 0]

            print('IsolationForest processing...')
            model = IsolationForest()
            tstart = time()
            model.fit(X_train)
            fit_time += time() - tstart
            tstart = time()

            scoring = -model.decision_function(X_test)  # the lower, the more normal
            predict_time += time() - tstart
            fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

            if predict_time + fit_time > max_time:
                raise TimeoutError

            f = interp1d(fpr_, tpr_)
            tpr += f(x_axis)
            tpr[0] = 0.
コード例 #53
0
iListIV = np.array(iListIV)
iListII, iListIII, iListIV = iListII[0], iListIII[0], iListIV[0]

# outlierIV
# pretreatment
succ_corr_normal = succ_corr
succ_corr_normal[iListII] = 0
resp_normal = resp
resp_normal[iListIII] = 0
# isolationforest
succ_resp = np.vstack((succ_corr_normal, resp_normal))
X_train = succ_resp[:, :1440 * train_day]
X_test = succ_resp[:, train_day * 1440:]
X_train = X_train.transpose()
X_test = X_test.transpose()
clf = IsolationForest(n_estimators=100, max_samples=256, contamination=0.001)
clf.fit(X_train)
# plot the train set
succ_resp_index = -clf.score_samples(X_train)
xx, yy = np.meshgrid(
    np.linspace(
        np.min(succ_corr_normal) * 1.1,
        np.max(succ_corr_normal) * 1.1, 500),
    np.linspace(np.min(resp_normal) - 100,
                np.max(resp_normal) * 1.1, 500))
Z = clf.score_samples(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
plt.scatter(X_train[:, 0], X_train[:, 1], marker='x', s=10, c=succ_resp_index)
# plt.colorbar()
plt.xlabel('succ_corr')
コード例 #54
0
ファイル: CEP_Exp_Two.py プロジェクト: igabriel85/dmon-adp
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec

df, t, v = ohEncoding(df, col, replace=True)

print("Shape after encoding")
print(df.shape)

df_unlabeled = df.drop("Anomaly", axis=1)
print("Shape of the dataframe without anomaly column: ")
print(df_unlabeled.shape)

clf = IsolationForest(max_samples=6444, verbose=1, n_jobs=-1,
                      contamination=0.255555, bootstrap=True, max_features=9)
clf.fit(df_unlabeled)
pred = clf.predict(df_unlabeled)
# print type(pred)
# print data.shape
# print len(pred)
# print pred
anomalies = np.argwhere(pred == -1)
normal = np.argwhere(pred == 1)
# print anomalies
# print type(anomalies)

df['ISO1'] = pred

# iterate over rows
nLabAno = 0
コード例 #55
0
        raise ValueError("invalid embed type %s" % embed_type)

    x_tr = embed.fit_transform(x)
    logger.debug(x_tr)

    if args.plot:
        plot_sample(x_tr,
                    y,
                    pdfpath="temp/spectral_%s%s.pdf" %
                    (sample_type, embed_type))

    ad_type = 'ifor'

    outliers_fraction = 0.1
    ad = IsolationForest(max_samples=256,
                         contamination=outliers_fraction,
                         random_state=None)
    ad.fit(x_tr)
    scores = -ad.decision_function(x_tr)

    top_anoms = np.argsort(-scores)[np.arange(10)]

    if args.plot:

        # to plot probability contours
        xx, yy = np.meshgrid(
            np.linspace(np.min(x_tr[:, 0]), np.max(x_tr[:, 0]), 50),
            np.linspace(np.min(x_tr[:, 1]), np.max(x_tr[:, 1]), 50))
        x_grid = np.c_[xx.ravel(), yy.ravel()]

        Z = -ad.decision_function(x_grid)
コード例 #56
0
from sklearn.ensemble import IsolationForest as IF
import pandas as pd

full_df = pd.read_csv("HTRU_2.csv")

outlier_df = full_df.loc[full_df['Class'] == 1]
inlier_df = full_df.loc[full_df['Class'] == 0].reset_index().drop(['index'],
                                                                  axis=1)

classes = full_df['Class']
full_df.drop(columns=['Class'], inplace=True)
#inlier_df.drop(columns=['Class'], inplace=True)
outlier_df.drop(columns=['Class'], inplace=True)

classifier = IF()  # Isolation Forest instance used to train and score outliers
classifier.fit(full_df)
scores = classifier.decision_function(outlier_df).tolist()
outlier_df['scores'] = scores
outlier_df = outlier_df.sort_values(by=['scores']).reset_index().drop(
    ['index', 'scores'], axis=1)
outlier_df['Class'] = [1 for i in range(outlier_df.shape[0])]

inlier_df = inlier_df.append(outlier_df.head(32)).reset_index().drop(['index'],
                                                                     axis=1)
inlier_df.to_csv('HTRU_2_filtered.csv', index=False)
コード例 #57
0
X_all_1.info()
X_all_0.info()

#set training data. Isolation Forest is used here in a semi-supervised way: all
#training samples are normal, and 4/5 of the normal data set is used for training
X0_train = X_all_0.loc[0:109196]
print("X0_train############################################")
print(X0_train)
#set test data: the rest of the normal data is used as the test set
X0_test = X_all_0.loc[109196:]
print("X0_test############################################")
print(X0_test)

#create a classifier of Isolation Forest
clf = IsolationForest(contamination=0.22)
clf.fit(X0_train)

#use this classifier to predict on the normal test data and on the outliers
y_pred_test = clf.predict(X0_test)
y_pred_outliers = clf.predict(X_all_1)

# print counts for a confusion matrix and report.
print("amount of target is  0 and prediction is also 0:")
a00 = list(y_pred_test).count(1)
print(a00)

print("amount of target is  0 and prediction is 1:")
a01 = list(y_pred_test).count(-1)
print(a01)
コード例 #58
0
# In[17]:

plt.figure(figsize=(30, 16))
sns.heatmap(df.corr())
plt.show()

# # Use anomaly detection with Isolation Forest

# In[59]:

from sklearn.ensemble import IsolationForest

# In[60]:

model = IsolationForest(n_estimators=100,
                        max_samples='auto',
                        contamination=float(0.1),
                        max_features=1.0)

# In[62]:

replacestruct = {
    "Gender": {
        "Male": 0,
        "Female": 1
    },
    "Customer Type": {
        'Loyal Customer': 0,
        'disloyal Customer': 1
    },
    "Class": {
        'Eco Plus': 0,
コード例 #59
0
    #add the class column back in (it wasn't featurized by itself)
    featureMatrix['class'] = enhancedDf['class']

    #randomly assign 3/4 of the feature df to training and 1/4 to test
    featureMatrix['is_train'] = np.random.uniform(0, 1, len(featureMatrix)) <= .75

    #split out the train and test df's into separate objects
    train, test = featureMatrix[featureMatrix['is_train']==True], featureMatrix[featureMatrix['is_train']==False]

    #drop the is_train column, we don't need it anymore
    train = train.drop('is_train', axis=1)
    test = test.drop('is_train', axis=1)

    #create the isolation forest class and factorize the class column
    clf = IsolationForest(n_estimators=opts.numtrees)


    #train the isolation forest on the training set, dropping the class column (since the trainer takes that as a separate argument)
    print('\nTraining')
    clf.fit(train.drop('class', axis=1))

    #remove the 'answers' from the test set
    testnoclass = test.drop('class', axis=1)

    print('\nPredicting (class 1 is normal, class -1 is malicious)')

    #evaluate our results on the test set.
    test.is_copy = False
    test['prediction'] = clf.predict(testnoclass)
    print