def test_iforest_subsampled_features():
    # It tests non-regression for #5732 which failed at predict.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    clf = IsolationForest(max_features=0.8)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
def outlier_rejection(X, y):
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]
def _predict_self(self):
    clf = IsolationForest(contamination=self.frac)
    clf.fit(self.num_X)
    return clf.predict(self.num_X)
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
def outlier_rejection(X, y):
    """This will be our function used to resample our dataset."""
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]
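# A minimal sketch (not part of the original snippet) of how a resampling
# function like outlier_rejection above is typically plugged into an
# imbalanced-learn FunctionSampler; the dataset and seed here are
# illustrative assumptions.
import numpy as np
from imblearn import FunctionSampler
from sklearn.datasets import make_classification

rng = np.random.RandomState(42)
X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)

reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X, y)
print(X.shape, X_inliers.shape)  # rows flagged as outliers have been dropped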
def IsolationForest_calulate(train_data_one, test_data):
    # Use an anomaly detection method
    clf = IsolationForest()
    # Train the anomaly detection model
    clf.fit(train_data_one)
    # Predict with the model
    Pre_result = clf.predict(test_data)
    # Compute the fraction of samples predicted as normal (label == 1)
    prob = len([x for x in Pre_result if x == 1]) / len(Pre_result)
    return prob
def test_iforest_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng)
    clf.fit(X)
    pred = clf.predict(X)

    # assert detect outliers:
    assert_greater(np.min(pred[-2:]), np.max(pred[:-2]))
def test_iforest_works(contamination):
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng, contamination=contamination)
    clf.fit(X)
    decision_func = -clf.decision_function(X)
    pred = clf.predict(X)

    # assert detect outliers:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_array_equal(pred, 6 * [1] + 2 * [-1])
def fit(self, X, y=None):
    """Fit detector. y is optional for unsupervised methods.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,), optional (default=None)
        The ground truth of the input samples (labels).
    """
    # validate inputs X and y (optional)
    X = check_array(X)
    self._set_n_classes(y)

    self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                     max_samples=self.max_samples,
                                     contamination=self.contamination,
                                     max_features=self.max_features,
                                     bootstrap=self.bootstrap,
                                     n_jobs=self.n_jobs,
                                     random_state=self.random_state,
                                     verbose=self.verbose)
    self.detector_.fit(X=X, y=None, sample_weight=None)

    # invert decision_scores_. Outliers come with higher outlier scores.
    self.decision_scores_ = invert_order(
        self.detector_.decision_function(X))
    self._process_decision_scores()
    return self
def isolationForest(self, settings, mname, data):
    '''
    :param settings: -> settings dictionary
    :param mname: -> name of serialized cluster
    :return: -> isolation forest instance
    :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1,
                           bootstrap:False, max_features:1.0, n_jobs:1,
                           random_state:None, verbose:0}
    '''
    # rng = np.random.RandomState(42)
    if settings['random_state'] == 'None':
        settings['random_state'] = None
    if isinstance(settings['bootstrap'], str):
        settings['bootstrap'] = str2Bool(settings['bootstrap'])
    if isinstance(settings['verbose'], str):
        settings['verbose'] = str2Bool(settings['verbose'])
    if settings['max_samples'] != 'auto':
        settings['max_samples'] = int(settings['max_samples'])
    # print(type(settings['max_samples']))
    for k, v in settings.items():
        logger.info('[%s] : [INFO] IsolationForest %s set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
        print("IsolationForest %s set to %s" % (k, v))
    try:
        clf = IsolationForest(n_estimators=int(settings['n_estimators']),
                              max_samples=settings['max_samples'],
                              contamination=float(settings['contamination']),
                              bootstrap=settings['bootstrap'],
                              max_features=float(settings['max_features']),
                              n_jobs=int(settings['n_jobs']),
                              random_state=settings['random_state'],
                              verbose=settings['verbose'])
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot instantiate isolation forest with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     type(inst), inst.args)
        print("Error while instantiating isolation forest with %s and %s" % (type(inst), inst.args))
        sys.exit(1)
    # clf = IsolationForest(max_samples=100, random_state=rng)
    try:
        clf.fit(data)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot fit isolation forest model with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     type(inst), inst.args)
        sys.exit(1)
    predict = clf.predict(data)
    print("Anomaly Array:")
    print(predict)
    self.__serializemodel(clf, 'isoforest', mname)
    return clf
def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = IsolationForest(contamination=0.1).fit(X_train)
    clf2 = IsolationForest().fit(X_train)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf1.decision_function([[2., 2.]]) + clf1.offset_)
    assert_array_equal(clf2.score_samples([[2., 2.]]),
                       clf2.decision_function([[2., 2.]]) + clf2.offset_)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf2.score_samples([[2., 2.]]))
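# A hedged illustration (not part of the original tests) of the same API
# relationship on recent scikit-learn versions: predict() labels a sample -1
# exactly when decision_function() is negative, and score_samples() is
# decision_function() shifted by offset_. Data here is illustrative.
import numpy as np
from sklearn.ensemble import IsolationForest

X_train = [[1, 1], [1, 2], [2, 1]]
clf = IsolationForest(random_state=0).fit(X_train)
X_new = [[2., 2.], [10., 10.]]

scores = clf.decision_function(X_new)
assert np.array_equal(clf.predict(X_new), np.where(scores < 0, -1, 1))
assert np.allclose(clf.score_samples(X_new), scores + clf.offset_)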
def predict(self, X, window=DEFAULT_WINDOW):
    """
    Predict if a particular sample is an outlier or not.

    :param X: the time series to detect of
    :param type X: pandas.Series
    :param window: the length of window
    :param type window: int
    :return: 1 denotes normal, 0 denotes abnormal.
    """
    x_train = list(range(0, 2 * window + 1)) + list(range(0, 2 * window + 1)) + list(range(0, window + 1))
    # materialize the zip so it can be used for both fit and predict
    sample_features = list(zip(x_train, X))
    clf = IsolationForest(self.n_estimators, self.max_samples,
                          self.contamination, self.max_feature,
                          self.bootstrap, self.n_jobs,
                          self.random_state, self.verbose)
    clf.fit(sample_features)
    predict_res = clf.predict(sample_features)
    if predict_res[-1] == -1:
        return 0
    return 1
def test_iforest_parallel_regression():
    """Check parallel regression."""
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    ensemble = IsolationForest(n_jobs=3, random_state=0).fit(X_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = IsolationForest(n_jobs=1, random_state=0).fit(X_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
def test_iforest_performance():
    """Test Isolation Forest performs well"""

    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = np.r_[X + 2, X - 2]
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = -clf.decision_function(X_test)

    # check that there is at most 6 errors (false positive or false negative)
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
def outlier_removal(df, col, method, params):
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        method = None
    do_outlier_removal.fit(np.array(df[col]))
    if method == 'Isolation Forest':
        outlier_scores = do_outlier_removal.decision_function(np.array(df[col]))
        df[('meta', 'Outlier Scores - ' + method + str(params))] = outlier_scores
        is_outlier = do_outlier_removal.predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
    if method == 'Local Outlier Factor':
        is_outlier = do_outlier_removal.fit_predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
        df[('meta', 'Outlier Factor - ' + method + str(params))] = do_outlier_removal.negative_outlier_factor_
    return df, do_outlier_removal
def test_iforest_warm_start():
    """Test iterative addition of iTrees to an iForest"""

    rng = check_random_state(0)
    X = rng.randn(20, 2)

    # fit first 10 trees
    clf = IsolationForest(n_estimators=10, max_samples=20,
                          random_state=rng, warm_start=True)
    clf.fit(X)
    # remember the 1st tree
    tree_1 = clf.estimators_[0]
    # fit another 10 trees
    clf.set_params(n_estimators=20)
    clf.fit(X)
    # expecting 20 fitted trees and no overwritten trees
    assert len(clf.estimators_) == 20
    assert clf.estimators_[0] is tree_1
def _IsolationForest(X):
    rng = np.random.RandomState(42)
    clf = IsolationForest(max_samples=X.shape[0], random_state=rng)
    return clf.fit(X).predict(X)
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

winedata = load_wine()
feature_names = winedata.feature_names
data_x, data_y = winedata.data, winedata.target

# %%
# derive class 1 wine data
inx = np.where(data_y == 1)[0]
class_1_y = data_y[inx]
class_1_x = data_x[inx, ]

# %%
from sklearn.ensemble import IsolationForest

clf = IsolationForest(contamination='auto')
clf.fit(class_1_x)
IFprediction = clf.predict(class_1_x)
anom_ind = np.where(IFprediction < 0)
anom_ind

# %%
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

plt.figure(figsize=(10, 8))
scatter = plt.scatter(class_1_x[:, 0], class_1_x[:, 1], c='slateblue',
X_train, X_test, y_train, y_test = prepare_data_mean()

rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
#X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
#X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
# = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
#y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
#xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
#Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = clf.decision_function(X_train)
# Z = Z.reshape(xx.shape)
#
# plt.title("IsolationForest")
# plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white',
                 s=20, edgecolor='k')
# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
                                      gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction,
                                         random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(
        n_neighbors=35, contamination=outliers_fraction))]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5],
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
               **blobs_params)[0],
    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
            idx_test.append(gid)
        else:
            print "ERROR"
            sys.exit(-1)
    Xtrain.append(column_train)
    Xtest.append(column_test)

Xtrain = np.transpose(np.array(Xtrain))
Xtest = np.transpose(np.array(Xtest))
idx_train = idx_train[:Xtrain.shape[0]]
idx_test = idx_test[:Xtest.shape[0]]

# fit an iforest
iforest = IsolationForest(n_estimators=ntrees, max_samples=sample_frac,
                          max_features=feat_frac, n_jobs=-1,
                          random_state=rng, verbose=1)
iforest.fit(Xtrain)

# anomaly scores
y_pred_train = iforest.predict(Xtrain)
y_pred_test = iforest.predict(Xtest)

train_feature_values = [(gid, val) for gid, val in zip(idx_train, list(y_pred_train))]
test_feature_values = [(gid, val) for gid, val in zip(idx_test, list(y_pred_test))]

for i, scenario in enumerate(MALICIOUS_SCENARIOS):
    all_feature_values = train_feature_values + \
        [(gid, feat_value) for gid, feat_value in test_feature_values
         if gid/100 in BENIGN_SCENARIOS or
# ## Improving the Prediction model ##
# This part is about finding a better metric for predicting future house sales regarding their price.
#
# First, I will detect outliers and delete them from the dataset if needed.

# ### Detecting Outliers ###
# The first step to improve our learning behaviour is to find outliers and then remove them from the data set if needed.
# To detect outliers I will use the Isolation Forest algorithm, which works well for high-dimensional data sets such as the one present here.

# In[ ]:

from sklearn.ensemble import IsolationForest

clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(df)
y = clf.predict(df)
print(y)

# ### Location based prices ###
# House prices don't only depend on the size of the house or the number of rooms; they are also strongly dependent on the location of said house. To get an idea how the position might impact my data I analyse the relationship between location and price in my dataset.

# In[ ]:

import gmaps
gmaps.configure(api_key="AIzaSyDPWAl8lcrK9q-tOkrl64sGkxDnbWz47Ko")
locations = df[["lat", "long"]]
prices = df["price"]
speed.drop(['vehicle_id', 'ride_id', 'type'], axis=1, inplace=True)

# merge
vehicle = pd.merge(rpm, speed, how='outer', on='timestamp')

# drop null values and zero speeds --> neutral gear
# speed < 200 to remove outliers
vh = vehicle.dropna(axis=0)
vh = vh[(vh['rpm'] > 0) & ((vh['speed'] > 0) & (vh['speed'] < 200))]

# detect outliers using IsolationForest
# assume contamination at 0.01 level
distances = pairwise_distances(vh[['rpm', 'speed']], vh[['rpm', 'speed']], metric='cosine')
clf = IsolationForest(max_samples=100, contamination=0.01, verbose=1)
clf.fit(distances)
labels = clf.predict(distances)
vh['outlier'] = labels

# remove outliers found by IsolationForest
vh = vh[['rpm', 'speed']][vh['outlier'] == 1]

# recompute distances after outlier removal
distances = pairwise_distances(vh[['rpm', 'speed']], vh[['rpm', 'speed']], metric='cosine')

# initialize variable to keep best model, its silhouette score and predicted labels
best_model = (None, -1, None)

# iterate over possible number of gears
# since we want to pick model with best silhouette score, can't start with single cluster (k=1)
# Read and load files
activity = pd.read_csv('./evaluate/novin_feature.csv', delimiter=',')
#activity1 = pd.read_csv('./evaluate/thirtydays_feature.csv', delimiter = ',')
activity1 = pd.read_csv('./evaluate/twentydays_feature.csv', delimiter=',')

X = np.array(activity.iloc[0:])
# X = np.array(activity.iloc[:,0:])
X_train = np.array(activity.iloc[:, [2, 1]])
X_test = np.array(activity.iloc[:, [1, 3]])

# fit the model
clf = IsolationForest(max_samples=99, random_state=rng)
clf.fit(X_train)

# Predict new test-set
pred_new = clf.predict(X_test)
y = pd.DataFrame(data=pred_new)
y_pred = np.array(y)
print(pred_new)

# estimate error rate in predicting
test_error = pred_new[pred_new == 1].size
#print(test_error)
#print(y_pred_test)

# we use pickle to save our classifier so next time we don't have to re-train
with open('isolation.pickle', 'wb') as f:
class IForest(BaseDetector): """Wrapper of scikit-learn Isolation Forest with more functionalities. The IsolationForest 'isolates' observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature. See :cite:`liu2008isolation,liu2012isolation` for details. Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node. This path length, averaged over a forest of such random trees, is a measure of normality and our decision function. Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies. Parameters ---------- n_estimators : int, optional (default=100) The number of base estimators in the ensemble. max_samples : int or float, optional (default="auto") The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If "auto", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling). contamination : float in (0., 0.5), optional (default=0.1) The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. max_features : int or float, optional (default=1.0) The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. bootstrap : boolean, optional (default=False) If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed. n_jobs : integer, optional (default=1) The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores. behaviour : str, default='old' Behaviour of the ``decision_function`` which can be either 'old' or 'new'. Passing ``behaviour='new'`` makes the ``decision_function`` change to match other anomaly detection algorithm API which will be the default behaviour in the future. As explained in details in the ``offset_`` attribute documentation, the ``decision_function`` becomes dependent on the contamination parameter, in such a way that 0 becomes its natural threshold to detect outliers. .. versionadded:: 0.7.0 ``behaviour`` is added in 0.7.0 for back-compatibility purpose. .. deprecated:: 0.20 ``behaviour='old'`` is deprecated in sklearn 0.20 and will not be possible in 0.22. .. deprecated:: 0.22 ``behaviour`` parameter will be deprecated in sklearn 0.22 and removed in 0.24. .. warning:: Only applicable for sklearn 0.20 above. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. verbose : int, optional (default=0) Controls the verbosity of the tree building process. Attributes ---------- estimators_ : list of DecisionTreeClassifier The collection of fitted sub-estimators. 
estimators_samples_ : list of arrays The subset of drawn samples (i.e., the in-bag samples) for each base estimator. max_samples_ : integer The actual number of samples decision_scores_ : numpy array of shape (n_samples,) The outlier scores of the training data. The higher, the more abnormal. Outliers tend to have higher scores. This value is available once the detector is fitted. threshold_ : float The threshold is based on ``contamination``. It is the ``n_samples * contamination`` most abnormal samples in ``decision_scores_``. The threshold is calculated for generating binary outlier labels. labels_ : int, either 0 or 1 The binary labels of the training data. 0 stands for inliers and 1 for outliers/anomalies. It is generated by applying ``threshold_`` on ``decision_scores_``. """ def __init__(self, n_estimators=100, max_samples="auto", contamination=0.1, max_features=1., bootstrap=False, n_jobs=1, behaviour='old', random_state=None, verbose=0): super(IForest, self).__init__(contamination=contamination) self.n_estimators = n_estimators self.max_samples = max_samples self.max_features = max_features self.bootstrap = bootstrap self.n_jobs = n_jobs self.behaviour = behaviour self.random_state = random_state self.verbose = verbose def fit(self, X, y=None): """Fit detector. y is optional for unsupervised methods. Parameters ---------- X : numpy array of shape (n_samples, n_features) The input samples. y : numpy array of shape (n_samples,), optional (default=None) The ground truth of the input samples (labels). """ # validate inputs X and y (optional) X = check_array(X) self._set_n_classes(y) # In sklearn 0.20+ new behaviour is added (arg behaviour={'new','old'}) # to IsolationForest that shifts the location of the anomaly scores # noinspection PyProtectedMember if _sklearn_version_20(): self.detector_ = IsolationForest(n_estimators=self.n_estimators, max_samples=self.max_samples, contamination=self.contamination, max_features=self.max_features, bootstrap=self.bootstrap, n_jobs=self.n_jobs, behaviour=self.behaviour, random_state=self.random_state, verbose=self.verbose) # Do not pass behaviour argument when sklearn version is < 0.20 else: # pragma: no cover self.detector_ = IsolationForest(n_estimators=self.n_estimators, max_samples=self.max_samples, contamination=self.contamination, max_features=self.max_features, bootstrap=self.bootstrap, n_jobs=self.n_jobs, random_state=self.random_state, verbose=self.verbose) self.detector_.fit(X=X, y=None, sample_weight=None) # invert decision_scores_. Outliers comes with higher outlier scores. self.decision_scores_ = invert_order( self.detector_.decision_function(X)) self._process_decision_scores() return self def decision_function(self, X): """Predict raw anomaly score of X using the fitted detector. The anomaly score of an input sample is computed based on different detector algorithms. For consistency, outliers are assigned with larger anomaly scores. Parameters ---------- X : numpy array of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. Returns ------- anomaly_scores : numpy array of shape (n_samples,) The anomaly score of the input samples. """ check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) # invert outlier scores. Outliers comes with higher outlier scores return invert_order(self.detector_.decision_function(X)) @property def estimators_(self): """The collection of fitted sub-estimators. 
Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.estimators_ @property def estimators_samples_(self): """The subset of drawn samples (i.e., the in-bag samples) for each base estimator. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.estimators_samples_ @property def max_samples_(self): """The actual number of samples. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.max_samples_
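# A minimal usage sketch for the IForest wrapper above, based only on the
# API documented in its docstring (fit() populates decision_scores_ and
# labels_, and decision_function() returns scores where higher means more
# abnormal). The data and parameter values here are illustrative assumptions.
import numpy as np

rng = np.random.RandomState(42)
X_train = 0.3 * rng.randn(200, 2)
X_test = np.r_[0.3 * rng.randn(20, 2),
               rng.uniform(low=-4, high=4, size=(20, 2))]

detector = IForest(n_estimators=100, contamination=0.1, random_state=42)
detector.fit(X_train)
print(detector.labels_[:10])                    # 0 = inlier, 1 = outlier (training data)
print(detector.decision_function(X_test)[:5])   # higher = more abnormal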
# Use the bat DataframeToMatrix class
features = ['Z', 'rejected', 'proto', 'query', 'qclass_name',
            'qtype_name', 'rcode_name', 'query_length', 'id.resp_p']
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
bro_matrix = to_matrix.fit_transform(bro_df[features])
print(bro_matrix.shape)

# Print out the range of the daterange and some stats
print('DataFrame TimeRange: {:s} --> {:s}'.format(
    str(bro_df['ts'].min()), str(bro_df['ts'].max())))

# Train/fit and Predict anomalous instances using the Isolation Forest model
odd_clf = IsolationForest(contamination=0.2)  # Marking 20% as odd
odd_clf.fit(bro_matrix)
bro_df['anomalous'] = [predict == -1 for predict in odd_clf.predict(bro_matrix)]

# Now we create a new dataframe using the prediction from our classifier
odd_df = bro_df[bro_df['anomalous']]

# Now we're going to explore our odd observations with help from KMeans
odd_matrix = to_matrix.transform(odd_df[features])
#clusters = KMeans(n_clusters=num_clusters).fit_predict(odd_matrix).tolist()
batch_kmeans.partial_fit(odd_matrix)
clusters = batch_kmeans.predict(odd_matrix).tolist()

# Set the cluster number for all the entries in the original dataframe
for i in range(24):
    X_pca_i = X_pca_list[i]
    plt.scatter(X_pca_i[:, 0], X_pca_i[:, 1], s=0.8)
    plt.title('hour ' + str(i))
    plt.show()

users = df.user.unique()

outliers_list = []
for i in range(24):
    X_pca_i = X_pca_list[i]
    #Xi = X[df.hours == i]
    #pca = decomposition.PCA(n_components=2)
    #pca.fit(Xi)
    #X_pca_i = pca.transform(Xi)
    model = IsolationForest(contamination=0.005)
    model.fit(X_pca_i)
    pred = model.predict(X_pca_i)
    outliers = X_pca_i[pred == -1, :]
    for outlier in outliers:
        outliers_list += [outlier]
    plt.scatter(X_pca_i[:, 0], X_pca_i[:, 1], s=.8, color='blue')
    plt.scatter(outliers[:, 0], outliers[:, 1], s=6., color='red')
    plt.show()

outliers_list = np.array(outliers_list)  # .reshape(len(outliers_list),7)

idx = []
for i, row in enumerate(X_pca[:]):
    for outlier in outliers_list[:]:
        if np.array_equal(row, outlier):
            idx += [i]
class Detector: def __init__(self, wait=61 * 2, sensitive=3, ignore_continuous=10, max_window=61 * 5): super().__init__() self.data = [] # store data in [n_samples, n_features] self.inputs = [] self.wait = int(wait) # cold start waiting self.max_window = int(max_window) # max data for training self.retrain = 16 # retrain delay time-step self.sensitive = sensitive # every N anomaly should be retrained self.Anomaly = 0 # anomaly count self.sigRetrain = True # signal of retrain self.ignore_continuous = ignore_continuous # anomaly alert every N ticks self.continuous = 0 # counter for counting alert delay self.cont = False # Anomaly continue state self.anomaly_cont_acc = 0 # Anomaly continue counter self.ma = MA(list(range(3, 32, 2)) + [61, 121]) self.madiff = MADIFF(self.ma) self.ewma = EWMA([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]) self.ewmadiff = EWMADIFF(self.ewma) self.dif = DIF() def fit_predict(self, ptr): self.continuous = (self.continuous if self.continuous == 0 else 0 if self.continuous + 1 > self.ignore_continuous else self.continuous + 1) self.inputs.append(float(ptr)) ptr = self.preprocess(float(ptr)) if self.data and len(self.data) >= self.wait: ans = self.vote(ptr) try: if ans == 1 and self.continuous == 0: self.continuous += 1 self.anomaly_cont_acc += 1 return ans elif ans == 1 and self.continuous > 0: self.cont = True self.anomaly_cont_acc += 1 return 0 else: if self.cont: self.cont = False self.ignore_continuous = math.ceil( self.ignore_continuous * 0.6 + self.anomaly_cont_acc * 0.4) self.sensitive = math.ceil(self.sensitive * 0.8 + self.anomaly_cont_acc * 0.2) self.anomaly_cont_acc = 0 self.continuous = 0 return ans except: pass finally: if len(self.data) == self.max_window: _ = self.data.pop(0) self.Anomaly += ans if self.Anomaly >= self.sensitive: self.sigRetrain = True if self.sigRetrain: self.train_model() else: self.data.append(ptr) return 0 def train_model(self): # reset signal and counter self.sigRetrain, self.Anomaly = False, 0 self.iforest = IsolationForest( n_estimators=math.ceil(np.mean(self.ma.periods)) * len(self.data[-1]) // 10 + 120, # n_jobs=os.cpu_count() - 1, ) self.ocsvm = OneClassSVM(kernel="rbf") # num = len(self.data) - 1 if len(self.data) < 31 else 30 self.lof = LocalOutlierFactor( n_neighbors=math.ceil(np.mean(self.ma.periods)), novelty=True, # n_jobs=os.cpu_count() - 1, ) self.ee = EllipticEnvelope(support_fraction=1.0, contamination=0.25) # self.sscalar = StandardScaler().fit(np.array(self.data)) # tmp = self.sscalar.transform(np.array(self.data)) tmp = np.array(self.data) self.ee.fit(tmp) self.ocsvm.fit(tmp) self.lof.fit(tmp) self.iforest.fit(tmp) def vote(self, val): if self.sigRetrain: self.train_model() # tmp = self.sscalar.transform([val]) tmp = [val] ans = ( # -1 is anomaly and 1 is normal self.ee.predict(tmp) + self.ocsvm.predict(tmp) + self.lof.predict(tmp) + self.iforest.predict(tmp)) for i in range( len(self.ma.data.keys()) + len(self.ewma.data.keys()) + 1): ans += self.Boxplot_Anatomy(val, idx=i) self.data.append(val) if len(self.data) % self.retrain == 0: self.sigRetrain = True self.retrain = int(len(self.data)**0.5) - 1 return 1 if ans[0] < 0 else 0 def Boxplot_Anatomy(self, vals, idx=0): upper_bound = np.quantile(np.array( self.data).T[idx], 0.75) + 1.5 * iqr(np.array(self.data).T[idx]) lower_bound = np.quantile(np.array( self.data).T[idx], 0.25) - 1.5 * iqr(np.array(self.data).T[idx]) return -1 if vals[idx] > upper_bound or vals[idx] < lower_bound else 1 def preprocess(self, val): ma = self.ma.get(val) ewma = self.ewma.get(val) 
return ([val] + ma + ewma + self.dif.get(val) + self.madiff.get(ma) + self.ewmadiff.get(ewma))
    {
        "model": detector.OutlierDetector,
        "params": {
            "model": LocalOutlierFactor(n_neighbors=1, contamination=0.1)
        },
        "df": [
            [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0],
        ],
        "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0],
    },
    {
        "model": detector.OutlierDetector,
        "params": {
            "model": IsolationForest(n_estimators=100, contamination=0.1)
        },
        "df": [[0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0]],
        "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0],
    },
    {
        "model": detector.RegressionAD,
        "params": {
            "target": 2,
            "regressor": LinearRegression()
        },
        "df": [
            [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9],
            [0, 2, 4, 6, 8, 10, 12, 14, 14, 16, 18],
            [0, 3, 6, 10, 12, 14, 18, 21, nan, 24, 27],
def initialize_isoForest(seed, n_estimators, max_samples, contamination, **kwargs):
    isoForest = IsolationForest(n_estimators=n_estimators,
                                max_samples=max_samples,
                                contamination=contamination,
                                n_jobs=-1, random_state=seed,
                                behaviour='new', **kwargs)
    return isoForest
def Eval(clargs): __version__ = '1.0' usage = """train_flows [options] normaldatafile""" parser = OptionParser(usage=usage, version=__version__) parser.add_option("-x", "--vectorizerfile", action="store", type="string", \ default='/tmp/vectorizers.pkl', help="") parser.add_option("-v", "--verbose", action="store_true", default=False, \ help="enable verbose output") parser.add_option("-o", "--maliciousdatafile", action="store", type="string", \ default=None, help="An optional file of malicious http logs") parser.add_option("-m", "--maxfeaturesperbag", action="store", type="int", \ default=100, help="maximum number of features per bag") parser.add_option("-g", "--ngramsize", action="store", type="int", \ default=7, help="ngram size") parser.add_option("-f", "--features", action="store", type="string", \ default="01000100111111111111", help="An optional file for choosing which features to be extracted") parser.add_option("-t", "--maxtrainingfeatures", action="store", type="int", \ default=50000, help="maximum number of rows to train with per class") parser.add_option("-n", "--numtrees", action="store", type="int", \ default=200, help="number of trees in isolation forest") parser.add_option("-s", "--numsamples", action="store", type="int", \ default=8192, help="number of samples in each tree") Start=time.time() (opts, args) = parser.parse_args(clargs) if len(args) != 2: parser.error('Incorrect number of arguments') ftu=[] features = opts.features for i, j in enumerate(features): if opts.verbose: print(j, all_fields[i]) if j == 1 or j=='1': ftu.append(all_fields[i]) if opts.verbose: print ftu #ftu = ['method', 'user_agent', 'status_code'] # load the http data in to a data frame print('Loading HTTP data') df = load_brofile(args[0], fields_to_use) trainDf = load_brofile(args[1], fields_to_use) total_rows = len(df.index) if opts.verbose: print('Total number of rows: %d' % total_rows) if opts.maliciousdatafile != None: print('Reading malicious training data') df1 = load_brofile(opts.maliciousdatafile, fields_to_use) if opts.verbose: print('Read malicious data with %s rows ' % len(df1.index)) #if (len(df1.index) > opts.maxtrainingfeatures): # if opts.verbose: print('Too many malicious samples for training, downsampling to %d' % opts.maxtrainingfeatures) # df1 = df1.sample(n=opts.maxtrainingfeatures) #set the classes of the dataframes and then stitch them together in to one big dataframe df['class'] = 0 df1['class'] = 1 classedDf = pd.concat([df,df1], ignore_index=True) else: #we weren't passed a file containing class-1 data, so we should generate some of our own. noiseDf = create_noise_contrast(df, numSamples) if opts.verbose: print('Added %s rows of generated malicious data'%numSamples) df['class'] = 0 noiseDf['class'] = 1 classedDf = pd.concat([df,noiseDf], ignore_index=True) #that doesn't matter trainDf['class']=0; #spliting into training and evaluation sets classedDf['is_train']=False trainDf['is_train']=True enhancedDf = enhance_flow(pd.concat([trainDf,classedDf], ignore_index=True), ftu) # construct some vectorizers based on the data in the DF. We need to vectorize future log files the exact same way so we # will be saving these vectorizers to a file. 
vectorizers = build_vectorizers(enhancedDf, ftu, max_features=opts.maxfeaturesperbag, ngram_size=opts.ngramsize, verbose=opts.verbose) #use the vectorizers to featureize our DF into a numeric feature dataframe featureMatrix = featureize(enhancedDf, ftu, vectorizers, verbose=opts.verbose) #add the class column back in (it wasn't featurized by itself) featureMatrix['class'] = enhancedDf['class'] featureMatrix['is_train'] = enhancedDf['is_train'] #split out the train and test df's into separate objects train, test = featureMatrix[featureMatrix['is_train']==True], featureMatrix[featureMatrix['is_train']==False] #drop the is_train column, we don't need it anymore train = train.drop('is_train', axis=1) test = test.drop('is_train', axis=1) #print('Calculating features') Trees=opts.numtrees Samples=opts.numsamples clf = IsolationForest(n_estimators=Trees, max_samples=Samples) clf.fit(train.drop('class', axis=1)) testnoclass = test.drop('class', axis=1) print('Predicting') test.is_copy = False test['prediction'] = clf.decision_function(testnoclass) + 0.5 print('Analyzing') #get the class-1 (outlier/anomaly) rows from the feature matrix, and drop the prediction so we can investigate them ##From Here Left=0.001 Right=0.01 fpr, tpr, thresholds = roc_curve(test['class'], test['prediction'], pos_label=0) F=interpolate.interp1d(fpr, tpr, assume_sorted=True) x=np.logspace(np.log10(Left), np.log10(Right)) y=F(x) roc_auc=auc(x, y) plt.figure() plt.xscale('log') plt.plot(fpr, tpr, color='b') plt.plot(x,y, color='r') plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic') plt.plot(plt.xlim(), plt.ylim(), ls="--", c=".3") plt.savefig("fig3.png") plt.clf() plt.close('all') print('Area Under the Curve = %.6f' %(roc_auc)) Min, Sec= divmod( int(time.time() - Start), 60 ) #print Min, Sec target= open('Results.txt', 'a') target.write(str(Trees)+' ') target.write(str(Samples)+' ') target.write(str(Min)+' ') target.write(str(Sec)+' ') target.write(str(roc_auc)) target.write("\n") target.write(str(features)) target.write("\n") target.write("\n") target.close() print("Minutes: %d, Seconds: %d" % (int(Min), int(Sec)) ) return roc_auc
                color='red', alpha=1.0)
    plt.xlabel("Accomodates")
    plt.ylabel("Price")
    plt.title(outputTitle)
    plt.legend()
    plt.show()


X = pd.read_csv("E:/outlier.csv")
X = X.iloc[:, 1:3]
print(X.info())
sns.jointplot(x="accommodates", y="price", data=X)

iso_forest_model = IsolationForest(n_estimators=100, contamination=0.01)
iso_forest_model.fit(X)
iso_forest_model.estimators_
outlier_scores = iso_forest_model.decision_function(X)
decisions_iso = iso_forest_model.predict(X)

displayResults(inliers=X[decisions_iso == 1],
               outliers=X[decisions_iso == -1],
               classifier=iso_forest_model,
               outputTitle="detecting potential outliers using isolation forest",
               outputName="outliers_isolation_forest")

lof_model = neighbors.LocalOutlierFactor(n_neighbors=20, contamination=0.01)
decisions_lof = lof_model.fit_predict(X)
print(lof_model.negative_outlier_factor_)
def removeOutliers(train, labels=None, opt='isolation', cont='auto', rerun=100, outlier_importance=20, max_features=0.2, max_samples=0.2, random_state=0, **kwargs): # Set seed and data size n1, m = train.shape np.random.seed(random_state) # Merge into one dataset with labels if labels is None: data = train else: data = pd.concat([train, labels], axis=1) # Define functions for interation of estimators def IterateResults(estimator, data, rerun): score = np.zeros(n1) print("Outlier detection: Iterating", opt, "estimator", rerun, "times.") print("Cummulative outliers found") def resample_score(seed): np.random.seed(seed) return estimator.fit(data).decision_function(data) mapping = map(resample_score, range(random_state, random_state + rerun)) for i in mapping: # Give more weights to outliers found i[i < 0] = i[i < 0] * outlier_importance score += i print((score < 0).sum(), end="->") print("Done!") return score / rerun def MahalanobisDist(data): def is_pos_def(A): if np.allclose(A, A.T): try: np.linalg.cholesky(A) return True except np.linalg.LinAlgError: return False else: return False covar = np.cov(data, rowvar=False) if is_pos_def(covar): covar_inv = np.linalg.inv(covar) if is_pos_def(covar_inv): mean = np.mean(data, axis=0) diff = data - mean md = np.sqrt(diff.dot(covar_inv).dot(diff.T).diagonal()) return md else: print( "Error: Inverse of Covariance Matrix is not positive definite!" ) else: print("Error: Covariance Matrix is not positive definite!") # Choose method if opt == 'isolation': from sklearn.ensemble import IsolationForest estim = IsolationForest(contamination=cont, behaviour='new', max_samples=max_samples, max_features=max_features, n_estimators=50, n_jobs=-1, **kwargs) decision = estim.fit(data).predict(data) if (rerun > 0): decision = IterateResults(estim, data, rerun) if opt == 'lof': from sklearn.neighbors import LocalOutlierFactor estim = LocalOutlierFactor(contamination=cont, n_neighbors=55, n_jobs=-1) decision = estim.fit_predict(data) if opt == 'svm': from sklearn.svm import OneClassSVM if cont == 'auto': cont = 0.01 estim = OneClassSVM(nu=cont, gamma='scale', tol=1e-3) decision = estim.fit(data).predict(data) if opt == 'covariance': if cont == 'auto': cont = 4 MD = MahalanobisDist(data.values) std = np.std(MD) mean = np.mean(MD) k = 3. * std if True else 2. * std high, low = mean + k, mean - k decision = (MD >= high) * (-2) + (MD <= low) * (-2) + 1 # Print summary information index = decision < 0 print("Outlier values: ", round(index.sum() * 100 / n1, 3), "% (", index.sum(), "/", n1, ")") print("Outlier values", opt, "method indecies:") for i in data[index].index: print(i, end=' ') print() if index.sum() / n1 > 0.1: print("Warning! More than 10% of training observations deleted!") # Discard outliers out = data[np.invert(index)] if labels is None: return out else: train = out.iloc[:, 0:m] labels = pd.DataFrame(out.iloc[:, -1]) return (train, labels)
def clean(ar, args, arch): """Cleans the archive and returns the cleaned copy. """ ar_name = ar.get_filename().split()[-1] # Create copy of archive that is used to grab the profiles if args.bandpass: patient = calibrate_bandpass(ar) else: patient = ar.clone() patient.pscrunch() patient.remove_baseline() # Grab the profiles after dedispersing them patient.dedisperse() data = patient.get_data()[:, 0, :, :] if np.count_nonzero(data) == 0: print("Archive is empty.") return ar profile_number = data[:, :, 0].size pca_components = min(args.components, data.shape[2]) if not args.quiet: print("Number of Profiles: %s" % profile_number) if not args.disable_pca: print("PCA parameters: n_components: %s" % pca_components) print( "IsolationForest parameters: n_estimators: %s max_samples: %s max_features: %s" % (args.estimators, args.samples, args.max_features)) orig_shape = np.shape(data) # Reshape the profiles for pca computation data = np.reshape(data, (-1, orig_shape[2])) # Delete precleaned profiles if args.weight: orig_weights = ar.get_weights().flatten() known_rfi = np.where(orig_weights == 0) known_non_rfi = np.where(orig_weights != 0) data = np.delete(data, known_rfi, axis=0) # Compute additional features if wanted if args.features or args.disable_pca: array_feat = compute_metrics(data) if args.order: data = np.concatenate((data, array_feat), axis=1) # Compute the pca if not args.disable_pca: pca = PCA(n_components=pca_components, svd_solver="full") data_pca = pca.fit_transform(data) data_features = data_pca if args.features and not args.order: data_features = np.concatenate((data_features, array_feat), axis=1) else: data_features = array_feat print("All features: %s" % (data_features.shape[1])) # Compute the anomaly scores of the isolation forest algorithm # The random_state creates a reproducible result but this may not be the best solution in the future clf = IsolationForest(n_estimators=args.estimators, max_samples=args.samples, max_features=args.max_features, n_jobs=2, random_state=1) clf.fit(data_features) anomaly_factors = clf.decision_function(data_features) # Introduce known weights if args.weight: dummy_anomaly = np.zeros(orig_weights.shape) dummy_anomaly[known_non_rfi] = anomaly_factors dummy_anomaly[known_rfi] = np.inf anomaly_factors_reshape = np.reshape(dummy_anomaly, orig_shape[0:2]) else: anomaly_factors_reshape = np.reshape(anomaly_factors, orig_shape[0:2]) snrs = [] split_values = [] rfi_fracs = [] # Cycle through different rfi fractions and find the best snr min_frac = 0 max_frac = 50 num_frac = 130 for rfi_frac in np.linspace(min_frac, max_frac, num=num_frac): split_value = np.percentile(anomaly_factors, rfi_frac) test_profile = np.sum( data[anomaly_factors >= split_value, :orig_shape[2]], axis=0) profile_object = psrchive.Profile(orig_shape[2]) profile_object.get_amps()[:] = test_profile test_snr = profile_object.snr() snrs.append(test_snr) split_values.append(split_value) rfi_fracs.append(rfi_frac) # print test_snr best_index = int(np.argmax(snrs) + args.additional * num_frac / max_frac) best_snr = snrs[best_index] best_frac = rfi_fracs[best_index] best_split_value = split_values[best_index] if not args.quiet: print("SNR: %.1f RFI fraction: %.4f" % (best_snr, best_frac * 0.01)) # Set the weights in the archive set_weights_archive(ar, anomaly_factors_reshape, best_split_value) # Test if whole channel or subints should be removed if args.bad_chan != 1 or args.bad_subint != 1: ar = find_bad_parts(ar, args) # Create plot that shows zapped( red) and unzapped( blue) profiles if 
needed if args.print_zap: plt.imshow(anomaly_factors_reshape.T, vmin=best_split_value - 0.0001, vmax=best_split_value, aspect='auto', interpolation='nearest', cmap=cm.coolwarm) plt.gca().invert_yaxis() plt.savefig("%s_%s_%s_%s.png" % (ar_name, args.components, args.estimators, args.samples), bbox_inches='tight') # Create log that contains the used parameters with open("clean.log", "a") as myfile: myfile.write("\n %s: Cleaned %s with %s" % (datetime.datetime.now(), ar_name, args)) return ar, (anomaly_factors, snrs, rfi_fracs)
def isolationForest(dataset, rng):
    isolationforest = IsolationForest(behaviour='new',
                                      max_samples=100,
                                      random_state=rng,
                                      contamination='auto').fit(dataset)
    return isolationforest
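# Illustrative call of the helper above (the shape of `dataset` is assumed, and
# a scikit-learn version that still accepts the behaviour argument is assumed;
# that argument was removed in later releases). predict() follows
# scikit-learn's convention: 1 for inliers, -1 for outliers.
import numpy as np

rng = np.random.RandomState(42)
dataset = 0.3 * rng.randn(500, 2)
forest = isolationForest(dataset, rng)
print(forest.predict(dataset[:5]))
print(forest.decision_function(dataset[:5]))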
def spindles_detect(data, sf, hypno=None, include=(1, 2, 3), freq_sp=(12, 15), duration=(0.5, 2), freq_broad=(1, 30), min_distance=500, downsample=True, thresh={ 'rel_pow': 0.2, 'corr': 0.65, 'rms': 1.5 }, remove_outliers=False): """Spindles detection. Parameters ---------- data : array_like Single-channel continuous EEG data. Unit must be uV. sf : float Sampling frequency of the data in Hz. hypno : array_like Sleep stage vector (hypnogram). If the hypnogram is loaded, the detection will only be applied to the value defined in ``include`` (default = N1 + N2 + N3 sleep). ``hypno`` MUST be a 1D array of integers with the same size as data and where -1 = Artefact, 0 = Wake, 1 = N1, 2 = N2, 3 = N3, 4 = REM. If you need help loading your hypnogram vector, please read the Visbrain documentation at http://visbrain.org/sleep. include : tuple, list or int Values in ``hypno`` that will be included in the mask. The default is (1, 2, 3), meaning that the detection is applied on N1, N2 and N3 sleep. This has no effect is ``hypno`` is None. freq_sp : tuple or list Spindles frequency range. Default is 12 to 15 Hz. Please note that YASA uses a FIR filter (implemented in MNE) with a 1.5Hz transition band, which means that for `freq_sp = (12, 15 Hz)`, the -6 dB points are located at 11.25 and 15.75 Hz. duration : tuple or list The minimum and maximum duration of the spindles. Default is 0.5 to 2 seconds. freq_broad : tuple or list Broad band frequency of interest. Default is 1 to 30 Hz. min_distance : int If two spindles are closer than `min_distance` (in ms), they are merged into a single spindles. Default is 500 ms. downsample : boolean If True, the data will be downsampled to 100 Hz or 128 Hz (depending on whether the original sampling frequency is a multiple of 100 or 128, respectively). thresh : dict Detection thresholds:: 'rel_pow' : Relative power (= power ratio freq_sp / freq_broad). 'corr' : Pearson correlation coefficient. 'rms' : Mean(RMS) + 1.5 * STD(RMS). remove_outliers : boolean If True, YASA will automatically detect and remove outliers spindles using an Isolation Forest (implemented in the scikit-learn package). The outliers detection is performed on all the spindles parameters with the exception of the 'Start' and 'End' columns. YASA uses a random seed (42) to ensure reproducible results. Note that this step will only be applied if there are more than 50 detected spindles in the first place. Default to False. Returns ------- sp_params : pd.DataFrame Pandas DataFrame: 'Start' : Start time of each detected spindles (in seconds) 'End' : End time (in seconds) 'Duration' : Duration (in seconds) 'Amplitude' : Amplitude (in uV) 'RMS' : Root-mean-square (in uV) 'AbsPower' : Median absolute power (in log10 uV^2) 'RelPower' : Median relative power (ranging from 0 to 1, in % uV^2) 'Frequency' : Median frequency (in Hz) 'Oscillations' : Number of oscillations (peaks) 'Symmetry' : Symmetry index, ranging from 0 to 1 'Stage' : Sleep stage (only if hypno was provided) Notes ----- For better results, apply this detection only on artefact-free NREM sleep. """ # Safety check data = np.asarray(data, dtype=np.float64) if data.ndim == 2: data = np.squeeze(data) assert data.ndim == 1, 'Wrong data dimension. Please pass 1D data.' assert freq_sp[0] < freq_sp[1] assert freq_broad[0] < freq_broad[1] assert isinstance(downsample, bool), 'Downsample must be True or False.' # Hypno processing if hypno is not None: hypno = np.asarray(hypno, dtype=int) assert hypno.ndim == 1, 'Hypno must be one dimensional.' 
assert hypno.size == data.size, 'Hypno must have same size as data.' unique_hypno = np.unique(hypno) logger.info('Number of unique values in hypno = %i', unique_hypno.size) if isinstance(include, int): include = [include] else: assert isinstance(include, (tuple, list, np.ndarray)) assert len(include) >= 1, 'include must have at least one element.' if not any(np.in1d(unique_hypno, include)): logger.error('The values in include are not present in hypno. ' 'Switching to hypno = None.') hypno = None # Check data amplitude data_trimstd = trimbothstd(data, cut=0.10) data_ptp = np.ptp(data) logger.info('Number of samples in data = %i', data.size) logger.info('Sampling frequency = %.2f Hz', sf) logger.info('Data duration = %.2f seconds', data.size / sf) logger.info('Trimmed standard deviation of data = %.4f uV', data_trimstd) logger.info('Peak-to-peak amplitude of data = %.4f uV', data_ptp) if not (1 < data_trimstd < 1e3 or 1 < data_ptp < 1e6): logger.error('Wrong data amplitude. Unit must be uV. Returning None.') return None if 'rel_pow' not in thresh.keys(): thresh['rel_pow'] = 0.20 if 'corr' not in thresh.keys(): thresh['corr'] = 0.65 if 'rms' not in thresh.keys(): thresh['rms'] = 1.5 # Check if we can downsample to 100 or 128 Hz if downsample is True and sf > 128: if sf % 100 == 0 or sf % 128 == 0: new_sf = 100 if sf % 100 == 0 else 128 fac = int(sf / new_sf) sf = new_sf data = data[::fac] logger.info('Downsampled data by a factor of %i', fac) if hypno is not None: hypno = hypno[::fac] assert hypno.size == data.size else: logger.warning("Cannot downsample if sf is not a mutiple of 100 " "or 128. Skipping downsampling.") # Create sleep stage vector mask if hypno is not None: mask = np.in1d(hypno, include) else: mask = np.ones(data.size, dtype=bool) # Bandpass filter data = filter_data(data, sf, freq_broad[0], freq_broad[1], method='fir', verbose=0) # The width of the transition band is set to 1.5 Hz on each side, # meaning that for freq_sp = (12, 15 Hz), the -6 dB points are located at # 11.25 and 15.75 Hz. data_sigma = filter_data(data, sf, freq_sp[0], freq_sp[1], l_trans_bandwidth=1.5, h_trans_bandwidth=1.5, method='fir', verbose=0) # Compute the pointwise relative power using interpolated STFT # Here we use a step of 200 ms to speed up the computation. f, t, Sxx = stft_power(data, sf, window=2, step=.2, band=freq_broad, interp=False, norm=True) idx_sigma = np.logical_and(f >= freq_sp[0], f <= freq_sp[1]) rel_pow = Sxx[idx_sigma].sum(0) # Let's interpolate `rel_pow` to get one value per sample # Note that we could also have use the `interp=True` in the `stft_power` # function, however 2D interpolation is much slower than # 1D interpolation. 
func = interp1d(t, rel_pow, kind='cubic', bounds_error=False, fill_value=0) t = np.arange(data.size) / sf rel_pow = func(t) # Now we apply moving RMS and correlation on the sigma-filtered signal _, mcorr = moving_transform(data_sigma, data, sf, window=.3, step=.1, method='corr', interp=True) _, mrms = moving_transform(data_sigma, data, sf, window=.3, step=.1, method='rms', interp=True) # Hilbert power (to define the instantaneous frequency / power) n = data_sigma.size nfast = next_fast_len(n) analytic = signal.hilbert(data_sigma, N=nfast)[:n] inst_phase = np.angle(analytic) inst_pow = np.square(np.abs(analytic)) # inst_freq = sf / 2pi * 1st-derivative of the phase of the analytic signal inst_freq = (sf / (2 * np.pi) * np.ediff1d(inst_phase)) # Let's define the thresholds if hypno is None: thresh_rms = mrms.mean() + thresh['rms'] * trimbothstd(mrms, cut=0.10) else: thresh_rms = mrms[mask].mean() + thresh['rms'] * \ trimbothstd(mrms[mask], cut=0.10) # Avoid too high threshold caused by Artefacts / Motion during Wake. thresh_rms = min(thresh_rms, 10) idx_rel_pow = (rel_pow >= thresh['rel_pow']).astype(int) idx_mcorr = (mcorr >= thresh['corr']).astype(int) idx_mrms = (mrms >= thresh_rms).astype(int) idx_sum = (idx_rel_pow + idx_mcorr + idx_mrms).astype(int) # Make sure that we do not detect spindles in REM or Wake if hypno != None if hypno is not None: idx_sum[~mask] = 0 # For debugging logger.info('Moving RMS threshold = %.3f', thresh_rms) logger.info('Number of supra-theshold samples for relative power = %i', idx_rel_pow.sum()) logger.info('Number of supra-theshold samples for moving correlation = %i', idx_mcorr.sum()) logger.info('Number of supra-theshold samples for moving RMS = %i', idx_mrms.sum()) # The detection using the three thresholds tends to underestimate the # real duration of the spindle. To overcome this, we compute a soft # threshold by smoothing the idx_sum vector with a 100 ms window. w = int(0.1 * sf) idx_sum = np.convolve(idx_sum, np.ones(w) / w, mode='same') # And we then find indices that are strictly greater than 2, i.e. we find # the 'true' beginning and 'true' end of the events by finding where at # least two out of the three treshold were crossed. where_sp = np.where(idx_sum > 2)[0] # If no events are found, return an empty dataframe if not len(where_sp): logger.warning('No spindles were found in data. Returning None.') return None # Merge events that are too close if min_distance is not None and min_distance > 0: where_sp = _merge_close(where_sp, min_distance, sf) # Extract start, end, and duration of each spindle sp = np.split(where_sp, np.where(np.diff(where_sp) != 1)[0] + 1) idx_start_end = np.array([[k[0], k[-1]] for k in sp]) / sf sp_start, sp_end = idx_start_end.T sp_dur = sp_end - sp_start # Find events with bad duration good_dur = np.logical_and(sp_dur > duration[0], sp_dur < duration[1]) # If no events of good duration are found, return an empty dataframe if all(~good_dur): logger.warning('No spindles were found in data. Returning None.') return None # Initialize empty variables n_sp = len(sp) sp_amp = np.zeros(n_sp) sp_freq = np.zeros(n_sp) sp_rms = np.zeros(n_sp) sp_osc = np.zeros(n_sp) sp_sym = np.zeros(n_sp) sp_abs = np.zeros(n_sp) sp_rel = np.zeros(n_sp) sp_sta = np.zeros(n_sp) # Number of oscillations (= number of peaks separated by at least 60 ms) # --> 60 ms because 1000 ms / 16 Hz = 62.5 ms, in other words, at 16 Hz, # peaks are separated by 62.5 ms. At 11 Hz, peaks are separated by 90 ms. 
distance = 60 * sf / 1000 for i in np.arange(len(sp))[good_dur]: # Important: detrend the signal to avoid wrong peak-to-peak amplitude sp_x = np.arange(data[sp[i]].size, dtype=np.float64) sp_det = _detrend(sp_x, data[sp[i]]) # sp_det = signal.detrend(data[sp[i]], type='linear') sp_amp[i] = np.ptp(sp_det) # Peak-to-peak amplitude sp_rms[i] = _rms(sp_det) # Root mean square sp_rel[i] = np.median(rel_pow[sp[i]]) # Median relative power # Hilbert-based instantaneous properties sp_inst_freq = inst_freq[sp[i]] sp_inst_pow = inst_pow[sp[i]] sp_abs[i] = np.median(np.log10(sp_inst_pow[sp_inst_pow > 0])) sp_freq[i] = np.median(sp_inst_freq[sp_inst_freq > 0]) # Number of oscillations peaks, peaks_params = signal.find_peaks(sp_det, distance=distance, prominence=(None, None)) sp_osc[i] = len(peaks) # For frequency and amplitude, we can also optionally use these # faster alternatives. If we use them, we do not need to compute the # Hilbert transform of the filtered signal. # sp_freq[i] = sf / np.mean(np.diff(peaks)) # sp_amp[i] = peaks_params['prominences'].max() # Symmetry index sp_sym[i] = peaks[peaks_params['prominences'].argmax()] / sp_det.size # Sleep stage if hypno is not None: sp_sta[i] = hypno[sp[i]][0] # Create a dictionnary sp_params = { 'Start': sp_start, 'End': sp_end, 'Duration': sp_dur, 'Amplitude': sp_amp, 'RMS': sp_rms, 'AbsPower': sp_abs, 'RelPower': sp_rel, 'Frequency': sp_freq, 'Oscillations': sp_osc, 'Symmetry': sp_sym, 'Stage': sp_sta } df_sp = pd.DataFrame.from_dict(sp_params)[good_dur].reset_index(drop=True) if hypno is None: df_sp = df_sp.drop(columns=['Stage']) else: df_sp['Stage'] = df_sp['Stage'].astype(int).astype('category') # We need at least 50 detected spindles to apply the Isolation Forest. if remove_outliers and df_sp.shape[0] >= 50: from sklearn.ensemble import IsolationForest df_sp_dummies = pd.get_dummies(df_sp) col_keep = df_sp_dummies.columns.difference(['Start', 'End']) ilf = IsolationForest(behaviour='new', contamination='auto', max_samples='auto', verbose=0, random_state=42) good = ilf.fit_predict(df_sp_dummies[col_keep]) good[good == -1] = 0 logger.info('%i outliers were removed.', (good == 0).sum()) # Remove outliers from DataFrame df_sp = df_sp[good.astype(bool)].reset_index(drop=True) logger.info('%i spindles were found in data.', df_sp.shape[0]) return df_sp
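# Hedged usage sketch for spindles_detect() above, based only on its docstring:
# pass single-channel EEG in uV plus the sampling rate, optionally enable the
# Isolation Forest outlier pass, and inspect the returned DataFrame. The file
# name and variable values below are placeholders, not from the original code.
import numpy as np

data = np.loadtxt('eeg_single_channel_uv.txt')  # hypothetical 1D EEG trace in uV
sf = 200.                                       # sampling frequency in Hz
sp = spindles_detect(data, sf, remove_outliers=True)
if sp is not None:
    print(sp[['Start', 'End', 'Duration', 'Frequency']].head())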
def training_oulier_testdata(self, data, outlier_features):
    ilf = IsolationForest(n_estimators=min(100, len(data)),
                          n_jobs=-1, verbose=2)
    ilf.fit(data[outlier_features])
    return ilf
if dat == 'http' or dat == 'smtp':
    y = (y != 'normal.').astype(int)

n_samples, n_features = np.shape(X)
n_samples_train = n_samples // 2
n_samples_test = n_samples - n_samples_train

X = X.astype(float)
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

print('IsolationForest processing...')
model = IsolationForest(bootstrap=True, n_jobs=-1)
tstart = time()
model.fit(X_train)
fit_time = time() - tstart
tstart = time()

scoring = model.predict(X_test)  # the lower, the more normal
predict_time = time() - tstart
fpr, tpr, thresholds = roc_curve(y_test, scoring)
AUC = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1,
         label='ROC for %s (area = %0.3f, train-time: %0.2fs, test-time: %0.2fs)'
               % (dat, AUC, fit_time, predict_time))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define the outlier detection tools to be compared
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
    "Isolation Forest": IsolationForest(max_samples=n_samples,
                                        contamination=outliers_fraction,
                                        random_state=rng),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=35,
                                               contamination=outliers_fraction)
}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = -1

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
def sw_detect(data, sf, hypno=None, include=(2, 3), freq_sw=(0.3, 3.5),
              dur_neg=(0.3, 1.5), dur_pos=(0.1, 1), amp_neg=(40, 300),
              amp_pos=(10, 200), amp_ptp=(75, 500), downsample=True,
              remove_outliers=False):
    """Slow-waves detection.

    Parameters
    ----------
    data : array_like
        Single-channel continuous EEG data. Unit must be uV.
    sf : float
        Sampling frequency of the data in Hz.
    hypno : array_like
        Sleep stage vector (hypnogram). If the hypnogram is loaded, the
        detection will only be applied to the values defined in ``include``
        (default = N2 + N3 sleep). ``hypno`` MUST be a 1D array of integers
        with the same size as data and where -1 = Artefact, 0 = Wake,
        1 = N1, 2 = N2, 3 = N3, 4 = REM. If you need help loading your
        hypnogram vector, please read the Visbrain documentation at
        http://visbrain.org/sleep.
    include : tuple, list or int
        Values in ``hypno`` that will be included in the mask. The default is
        (2, 3), meaning that the detection is applied only on N2 and N3
        sleep. This has no effect if ``hypno`` is None.
    freq_sw : tuple or list
        Slow wave frequency range. Default is 0.3 to 3.5 Hz. Please note that
        YASA uses a FIR filter (implemented in MNE) with a 0.2 Hz transition
        band, which means that for ``freq_sw = (0.3, 3.5)`` Hz, the -6 dB
        points are located at 0.2 and 3.6 Hz.
    dur_neg : tuple or list
        The minimum and maximum duration of the negative deflection of the
        slow wave. Default is 0.3 to 1.5 second.
    dur_pos : tuple or list
        The minimum and maximum duration of the positive deflection of the
        slow wave. Default is 0.1 to 1 second.
    amp_neg : tuple or list
        Absolute minimum and maximum negative trough amplitude of the
        slow-wave. Default is 40 uV to 300 uV.
    amp_pos : tuple or list
        Absolute minimum and maximum positive peak amplitude of the
        slow-wave. Default is 10 uV to 200 uV.
    amp_ptp : tuple or list
        Minimum and maximum peak-to-peak amplitude of the slow-wave.
        Default is 75 uV to 500 uV.
    downsample : boolean
        If True, the data will be downsampled to 100 Hz or 128 Hz (depending
        on whether the original sampling frequency is a multiple of 100 or
        128, respectively).
    remove_outliers : boolean
        If True, YASA will automatically detect and remove outlier slow-waves
        using an Isolation Forest (implemented in the scikit-learn package).
        The outlier detection is performed on the frequency, amplitude and
        duration parameters of the detected slow-waves. YASA uses a fixed
        random seed (42) to ensure reproducible results. Note that this step
        will only be applied if there are more than 100 detected slow-waves
        in the first place. Defaults to False.

    Returns
    -------
    sw_params : pd.DataFrame
        Pandas DataFrame:

            'Start' : Start of each detected slow-wave (in seconds of data)
            'NegPeak' : Location of the negative peak (in seconds of data)
            'MidCrossing' : Location of the negative-to-positive zero-crossing
            'PosPeak' : Location of the positive peak
            'End' : End time (in seconds)
            'Duration' : Duration (in seconds)
            'ValNegPeak' : Amplitude of the negative peak (in uV - filtered)
            'ValPosPeak' : Amplitude of the positive peak (in uV - filtered)
            'PTP' : Peak to peak amplitude (ValPosPeak - ValNegPeak)
            'Slope' : Slope between ``NegPeak`` and ``MidCrossing`` (in uV/sec)
            'Frequency' : Frequency of the slow-wave (1 / ``Duration``)
            'Stage' : Sleep stage (only if hypno was provided)

    Notes
    -----
    For better results, apply this detection only on artefact-free NREM
    sleep. Note that the ``PTP``, ``Slope``, ``ValNegPeak`` and
    ``ValPosPeak`` are computed on the filtered signal.
""" # Safety check data = np.asarray(data, dtype=np.float64) if data.ndim == 2: data = np.squeeze(data) assert data.ndim == 1, 'Wrong data dimension. Please pass 1D data.' assert freq_sw[0] < freq_sw[1] assert amp_ptp[0] < amp_ptp[1] assert isinstance(downsample, bool), 'Downsample must be True or False.' # Hypno processing if hypno is not None: hypno = np.asarray(hypno, dtype=int) assert hypno.ndim == 1, 'Hypno must be one dimensional.' assert hypno.size == data.size, 'Hypno must have same size as data.' unique_hypno = np.unique(hypno) logger.info('Number of unique values in hypno = %i', unique_hypno.size) if isinstance(include, int): include = [include] else: assert isinstance(include, (tuple, list, np.ndarray)) assert len(include) >= 1, 'include must have at least one element.' if not any(np.in1d(unique_hypno, include)): logger.error('The values in include are not present in hypno. ' 'Switching to hypno = None.') hypno = None # Check data amplitude data_trimstd = trimbothstd(data, cut=0.10) data_ptp = np.ptp(data) logger.info('Number of samples in data = %i', data.size) logger.info('Sampling frequency = %.2f Hz', sf) logger.info('Data duration = %.2f seconds', data.size / sf) logger.info('Trimmed standard deviation of data = %.4f uV', data_trimstd) logger.info('Peak-to-peak amplitude of data = %.4f uV', data_ptp) if not (1 < data_trimstd < 1e3 or 1 < data_ptp < 1e6): logger.error('Wrong data amplitude. Unit must be uV. Returning None.') return None # Check if we can downsample to 100 or 128 Hz if downsample is True and sf > 128: if sf % 100 == 0 or sf % 128 == 0: new_sf = 100 if sf % 100 == 0 else 128 fac = int(sf / new_sf) sf = new_sf data = data[::fac] logger.info('Downsampled data by a factor of %i', fac) if hypno is not None: hypno = hypno[::fac] assert hypno.size == data.size else: logger.warning("Cannot downsample if sf is not a mutiple of 100 " "or 128. Skipping downsampling.") # Define time vector times = np.arange(data.size) / sf # Bandpass filter data_filt = filter_data(data, sf, freq_sw[0], freq_sw[1], method='fir', verbose=0, l_trans_bandwidth=0.2, h_trans_bandwidth=0.2) # Find peaks in data # Negative peaks with value comprised between -40 to -300 uV idx_neg_peaks, _ = signal.find_peaks(-1 * data_filt, height=amp_neg) # Positive peaks with values comprised between 10 to 150 uV idx_pos_peaks, _ = signal.find_peaks(data_filt, height=amp_pos) # Intersect with sleep stage vector if hypno is not None: mask = np.in1d(hypno, include) idx_mask = np.where(mask)[0] idx_neg_peaks = np.intersect1d(idx_neg_peaks, idx_mask, assume_unique=True) idx_pos_peaks = np.intersect1d(idx_pos_peaks, idx_mask, assume_unique=True) # If no peaks are detected, return None if len(idx_neg_peaks) == 0 or len(idx_pos_peaks) == 0: logger.warning('No peaks were found in data. 
Returning None.') return None # Make sure that the last detected peak is a positive one if idx_pos_peaks[-1] < idx_neg_peaks[-1]: # If not, append a fake positive peak one sample after the last neg idx_pos_peaks = np.append(idx_pos_peaks, idx_neg_peaks[-1] + 1) # For each negative peak, we find the closest following positive peak pk_sorted = np.searchsorted(idx_pos_peaks, idx_neg_peaks) closest_pos_peaks = idx_pos_peaks[pk_sorted] - idx_neg_peaks closest_pos_peaks = closest_pos_peaks[np.nonzero(closest_pos_peaks)] idx_pos_peaks = idx_neg_peaks + closest_pos_peaks # Now we compute the PTP amplitude and keep only the good peaks sw_ptp = np.abs(data_filt[idx_neg_peaks]) + data_filt[idx_pos_peaks] good_ptp = np.logical_and(sw_ptp > amp_ptp[0], sw_ptp < amp_ptp[1]) # If good_ptp is all False if all(~good_ptp): logger.warning('No slow-wave with good amplitude. Returning None.') return None sw_ptp = sw_ptp[good_ptp] idx_neg_peaks = idx_neg_peaks[good_ptp] idx_pos_peaks = idx_pos_peaks[good_ptp] # Now we need to check the negative and positive phase duration # For that we need to compute the zero crossings of the filtered signal zero_crossings = _zerocrossings(data_filt) # Make sure that there is a zero-crossing after the last detected peak if zero_crossings[-1] < max(idx_pos_peaks[-1], idx_neg_peaks[-1]): # If not, append the index of the last peak zero_crossings = np.append(zero_crossings, max(idx_pos_peaks[-1], idx_neg_peaks[-1])) # Find distance to previous and following zc neg_sorted = np.searchsorted(zero_crossings, idx_neg_peaks) previous_neg_zc = zero_crossings[neg_sorted - 1] - idx_neg_peaks following_neg_zc = zero_crossings[neg_sorted] - idx_neg_peaks neg_phase_dur = (np.abs(previous_neg_zc) + following_neg_zc) / sf # Distance (in samples) between the positive peaks and the previous and # following zero-crossings pos_sorted = np.searchsorted(zero_crossings, idx_pos_peaks) previous_pos_zc = zero_crossings[pos_sorted - 1] - idx_pos_peaks following_pos_zc = zero_crossings[pos_sorted] - idx_pos_peaks pos_phase_dur = (np.abs(previous_pos_zc) + following_pos_zc) / sf # We now compute a set of metrics sw_start = times[idx_neg_peaks + previous_neg_zc] # Start in time vector sw_end = times[idx_pos_peaks + following_pos_zc] # End in time vector sw_dur = sw_end - sw_start # Same as pos_phase_dur + neg_phase_dur sw_midcrossing = times[idx_neg_peaks + following_neg_zc] # Neg-to-pos zc sw_idx_neg = times[idx_neg_peaks] # Location of negative peak sw_idx_pos = times[idx_pos_peaks] # Location of positive peak # Slope between peak trough and midcrossing sw_slope = sw_ptp / (sw_midcrossing - sw_idx_neg) # Hypnogram if hypno is not None: sw_sta = hypno[idx_neg_peaks + previous_neg_zc] else: sw_sta = np.zeros(sw_dur.shape) # And we apply a set of thresholds to remove bad slow waves good_sw = np.logical_and.reduce(( # Data edges previous_neg_zc != 0, following_neg_zc != 0, previous_pos_zc != 0, following_pos_zc != 0, # Duration criteria neg_phase_dur > dur_neg[0], neg_phase_dur < dur_neg[1], pos_phase_dur > dur_pos[0], pos_phase_dur < dur_pos[1], # Sanity checks sw_midcrossing > sw_start, sw_midcrossing < sw_end, sw_slope > 0, )) if all(~good_sw): logger.warning('No slow-wave satisfying all criteria. 
Returning None.') return None # Create a dictionnary and then a dataframe (much faster) sw_params = { 'Start': sw_start, 'NegPeak': sw_idx_neg, 'MidCrossing': sw_midcrossing, 'PosPeak': sw_idx_pos, 'End': sw_end, 'Duration': sw_dur, 'ValNegPeak': data_filt[idx_neg_peaks], 'ValPosPeak': data_filt[idx_pos_peaks], 'PTP': sw_ptp, 'Slope': sw_slope, 'Frequency': 1 / sw_dur, 'Stage': sw_sta, } df_sw = pd.DataFrame.from_dict(sw_params)[good_sw] # Remove all duplicates df_sw = df_sw.drop_duplicates(subset=['Start'], keep=False) df_sw = df_sw.drop_duplicates(subset=['End'], keep=False) if hypno is None: df_sw = df_sw.drop(columns=['Stage']) else: df_sw['Stage'] = df_sw['Stage'].astype(int).astype('category') # We need at least 100 detected slow waves to apply the Isolation Forest. if remove_outliers and df_sw.shape[0] >= 100: from sklearn.ensemble import IsolationForest col_keep = [ 'Duration', 'ValNegPeak', 'ValPosPeak', 'PTP', 'Slope', 'Frequency' ] ilf = IsolationForest(behaviour='new', contamination='auto', max_samples='auto', verbose=0, random_state=42) good = ilf.fit_predict(df_sw[col_keep]) good[good == -1] = 0 logger.info('%i outliers were removed.', (good == 0).sum()) # Remove outliers from DataFrame df_sw = df_sw[good.astype(bool)] logger.info('%i slow-waves were found in data.', df_sw.shape[0]) return df_sw.reset_index(drop=True)
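A small sketch of working with the returned DataFrame, assuming it carries the columns documented above; the values are random and purely illustrative:

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
n = 120
df_sw = pd.DataFrame({
    'Duration': rng.uniform(0.8, 2.0, n),
    'ValNegPeak': -rng.uniform(40, 150, n),
    'ValPosPeak': rng.uniform(10, 80, n),
    'PTP': rng.uniform(75, 220, n),
    'Slope': rng.uniform(100, 600, n),
    'Frequency': rng.uniform(0.5, 1.2, n),
    'Stage': rng.choice([2, 3], n),
})

# Average slow-wave morphology per sleep stage.
summary = df_sw.groupby('Stage')[['Duration', 'PTP', 'Slope',
                                  'Frequency']].mean()
print(summary.round(2))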
import matplotlib.pyplot as plt from sklearn.ensemble import IsolationForest rng = np.random.RandomState(42) # Generate train data X = 0.3 * rng.randn(100, 2) X_train = np.r_[X + 2, X - 2] # Generate some regular novel observations X = 0.3 * rng.randn(20, 2) X_test = np.r_[X + 2, X - 2] # Generate some abnormal novel observations X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) # fit the model clf = IsolationForest(max_samples=100, random_state=rng) clf.fit(X_train) y_pred_train = clf.predict(X_train) y_pred_test = clf.predict(X_test) y_pred_outliers = clf.predict(X_outliers) # plot the line, the samples, and the nearest vectors to the plane xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50)) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.title("IsolationForest") plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white') b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
def test_iforest_deprecation(): iforest = IsolationForest(behaviour='new') warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24" with pytest.warns(DeprecationWarning, match=warn_msg): iforest.fit(iris.data)
def test_behaviour_param(): X_train = [[1, 1], [1, 2], [2, 1]] clf1 = IsolationForest(behaviour='old').fit(X_train) clf2 = IsolationForest(behaviour='new', contamination='auto').fit(X_train) assert_array_equal(clf1.decision_function([[2., 2.]]), clf2.decision_function([[2., 2.]]))
def main(): samplers = [ None, InstanceHardnessThreshold(sampling_strategy='majority', random_state=123, n_jobs=-1), NearMiss(version=1, sampling_strategy='majority', random_state=123, n_jobs=-1), NearMiss(version=3, sampling_strategy='majority', random_state=123, n_jobs=-1), RandomUnderSampler(sampling_strategy='majority', random_state=123) ] outliers = [ None, IsolationForest(random_state=123, behaviour='new', contamination=0.1), LocalOutlierFactor(n_neighbors=27, contamination=0.1) ] for sampler in samplers: for out in outliers: global sampler_str, out_str, perm_str sampler_str = sampler.__class__.__name__ out_str = out.__class__.__name__ print(f"\nsampler={sampler_str}, outlier={out_str}") X, y, X_valid, y_valid = Dataset.read_all() X, y, X_valid, y_valid = Modification.apply_standartization( X, y, X_valid, y_valid) print(X.shape) if out is not None: X, y = Modification.apply_outliers(X, y, out) print(X.shape) if sampler is None: weights, weight_valid = Modification.make_weights_column( X, y, X_valid, y_valid) else: weights, weight_valid = None, None X, y = Modification.apply_samplers(X, y, sampler) if "Instance" in sampler_str: X, y = Modification.apply_samplers( X, y, RandomUnderSampler(sampling_strategy='majority', random_state=123)) print("0st perm:") perm_str = "0st" est = Model.train(X, y, X_valid, y_valid, weights, weight_valid) print("1st perm:") perm_str = "1st" X, y, X_valid, y_valid = Modification.apply_permutation( X, y, X_valid, y_valid, est, sampler.__class__.__name__, weight_valid) est = Model.train(X, y, X_valid, y_valid, weights, weight_valid) print("2nd perm:") perm_str = "2nd" X, y, X_valid, y_valid = Modification.apply_permutation( X, y, X_valid, y_valid, est, sampler.__class__.__name__, weight_valid) Model.train(X, y, X_valid, y_valid, weights, weight_valid) print(results) analyze_results()
# Split X_train into normal and outlier rows
X_train_normal = X_train[X_train['label_filled'] == 0].drop("label_filled",
                                                            axis=1,
                                                            inplace=False)
# X_train_outliers = X_train[X_train['label_filled'] == 1].drop("label_filled", axis=1, inplace=False)
X_test = X_test.drop("label_filled", axis=1, inplace=False)
X_train = X_train.drop("label_filled", axis=1, inplace=False)
# print(X_test_normal.size)
print("Load data done.")

# Fit the model
clf = IsolationForest(n_estimators=1000, contamination=0.05, n_jobs=-1,
                      bootstrap=True)
clf.fit(X_train)
# clf.fit(X_train_normal)

# Predict
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Map the predicted labels from (1, -1) to (0, 1): 0 = normal, 1 = outlier
y_pred_train = np.where(y_pred_train > 0, 0, 1)
y_pred_test = np.where(y_pred_test > 0, 0, 1)

# Print the result
print("train data classification report: ")
print(classification_report(y_train, y_pred_train))
class IForest(BaseDetector): """Wrapper of scikit-learn Isolation Forest with more functionalities. The IsolationForest 'isolates' observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature. See :cite:`liu2008isolation,liu2012isolation` for details. Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node. This path length, averaged over a forest of such random trees, is a measure of normality and our decision function. Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies. Parameters ---------- n_estimators : int, optional (default=100) The number of base estimators in the ensemble. max_samples : int or float, optional (default="auto") The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If "auto", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling). contamination : float in (0., 0.5), optional (default=0.1) The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. max_features : int or float, optional (default=1.0) The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. bootstrap : boolean, optional (default=False) If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed. n_jobs : integer, optional (default=1) The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. verbose : int, optional (default=0) Controls the verbosity of the tree building process. Attributes ---------- estimators_ : list of DecisionTreeClassifier The collection of fitted sub-estimators. estimators_samples_ : list of arrays The subset of drawn samples (i.e., the in-bag samples) for each base estimator. max_samples_ : integer The actual number of samples decision_scores_ : numpy array of shape (n_samples,) The outlier scores of the training data. The higher, the more abnormal. Outliers tend to have higher scores. This value is available once the detector is fitted. threshold_ : float The threshold is based on ``contamination``. It is the ``n_samples * contamination`` most abnormal samples in ``decision_scores_``. The threshold is calculated for generating binary outlier labels. labels_ : int, either 0 or 1 The binary labels of the training data. 0 stands for inliers and 1 for outliers/anomalies. It is generated by applying ``threshold_`` on ``decision_scores_``. 
""" def __init__(self, n_estimators=100, max_samples="auto", contamination=0.1, max_features=1., bootstrap=False, n_jobs=1, random_state=None, verbose=0): super(IForest, self).__init__(contamination=contamination) self.n_estimators = n_estimators self.max_samples = max_samples self.max_features = max_features self.bootstrap = bootstrap self.n_jobs = n_jobs self.random_state = random_state self.verbose = verbose def fit(self, X, y=None): """Fit detector. y is optional for unsupervised methods. Parameters ---------- X : numpy array of shape (n_samples, n_features) The input samples. y : numpy array of shape (n_samples,), optional (default=None) The ground truth of the input samples (labels). """ # validate inputs X and y (optional) X = check_array(X) self._set_n_classes(y) self.detector_ = IsolationForest(n_estimators=self.n_estimators, max_samples=self.max_samples, contamination=self.contamination, max_features=self.max_features, bootstrap=self.bootstrap, n_jobs=self.n_jobs, random_state=self.random_state, verbose=self.verbose) self.detector_.fit(X=X, y=None, sample_weight=None) # invert decision_scores_. Outliers comes with higher outlier scores. self.decision_scores_ = invert_order( self.detector_.decision_function(X)) self._process_decision_scores() return self def decision_function(self, X): """Predict raw anomaly score of X using the fitted detector. The anomaly score of an input sample is computed based on different detector algorithms. For consistency, outliers are assigned with larger anomaly scores. Parameters ---------- X : numpy array of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. Returns ------- anomaly_scores : numpy array of shape (n_samples,) The anomaly score of the input samples. """ check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) # invert outlier scores. Outliers comes with higher outlier scores return invert_order(self.detector_.decision_function(X)) @property def estimators_(self): """The collection of fitted sub-estimators. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.estimators_ @property def estimators_samples_(self): """The subset of drawn samples (i.e., the in-bag samples) for each base estimator. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.estimators_samples_ @property def max_samples_(self): """The actual number of samples. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.max_samples_
############## feature selection
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(x_train_norm, y_train)
model = SelectFromModel(lsvc, prefit=True)
x_train_selected = model.transform(x_train_norm)
x_test_selected = model.transform(x_test_norm)

############ remove outliers
LOF = LocalOutlierFactor(n_neighbors=40, contamination=0.08)
y_pred_local = LOF.fit_predict(x_train_selected)
locations_lof = np.where(y_pred_local == -1)

rng = np.random.RandomState(42)
IsoTree = IsolationForest(max_samples=100, random_state=rng,
                          contamination=0.08)
IsoTree.fit(x_train_selected)
y_pred_iso = IsoTree.predict(x_train_selected)
locations_iso = np.where(y_pred_iso == -1)

# Only drop samples that both detectors flag as outliers
x_clean = x_train_selected
y_clean = y_train
for i in range(len(y_pred_local) - 1, -1, -1):
    if ((y_pred_iso[i] == -1) and (y_pred_local[i] == -1)):
        x_clean = np.delete(x_clean, i, axis=0)
        y_clean = np.delete(y_clean, i, axis=0)

############## CV for parameter tuning
# x_ktrain, x_ktest, y_ktrain, y_ktest = train_test_split(x_clean, y_clean, test_size=0.4, random_state=0)
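An equivalent, vectorized way to drop only the rows that both detectors flag, sketched on synthetic stand-ins for x_train_selected and y_train:

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.RandomState(42)
x_train_selected = rng.normal(size=(300, 5))
y_train = rng.randint(0, 2, size=300)

y_pred_local = LocalOutlierFactor(
    n_neighbors=40, contamination=0.08).fit_predict(x_train_selected)
y_pred_iso = IsolationForest(
    max_samples=100, contamination=0.08,
    random_state=42).fit_predict(x_train_selected)

# Keep a row unless both detectors flag it as an outlier (-1).
keep = ~((y_pred_local == -1) & (y_pred_iso == -1))
x_clean, y_clean = x_train_selected[keep], y_train[keep]
print(x_clean.shape, y_clean.shape)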
from lib import proj_dir

if __name__ == '__main__':
    # %% Load the data.
    from lib.data_process.tmp import data_denoised as data

    cols = [
        'pm10', 'pm25', 'o3', 'so2', 'co', 'no2', 'aqi', 'clock_num',
        'weekday', 'month', 'sd', 'weather', 'temp', 'wd', 'ws'
    ]

    # %% Pairplot figure.
    # sns.set(font_scale = 0.5)
    # pg = sns.pairplot(data[cols], height = 1.0, aspect = 0.8, plot_kws = dict(linewidth = 1e-3, edgecolor = 'b', s = 0.3),
    #                   diag_kind = "hist", diag_kws = dict(bins = 20))
    # plt.tight_layout()
    # plt.savefig(os.path.join(proj_dir, 'graph/pollutants_weather_pair_plot.png'), dpi = 450)

    # %% Outlier detection.
    isoforest = IsolationForest(n_estimators=100, max_samples=0.9)
    X_train = np.array(data[cols])
    idxs = list(range(X_train.shape[0]))
    random.shuffle(idxs)
    X_train = X_train[idxs[:5000], :]

    isoforest.fit(X_train)
    y_pred_train = isoforest.predict(X_train)
    scores = isoforest.decision_function(X_train)
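A self-contained sketch of the same detection step on a synthetic pollutant-style table, since the lib.* imports above are project-specific and not reproduced here:

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
data = pd.DataFrame({
    'pm10': rng.gamma(2.0, 30.0, 5000),
    'pm25': rng.gamma(2.0, 20.0, 5000),
    'temp': rng.normal(15, 10, 5000),
    'ws': rng.rayleigh(2.0, 5000),
})

isoforest = IsolationForest(n_estimators=100, max_samples=0.9, random_state=0)
isoforest.fit(data)
scores = isoforest.decision_function(data)   # lower = more anomalous
data_clean = data[isoforest.predict(data) == 1]
print(len(data) - len(data_clean), 'rows flagged as outliers')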
def run_IForest(X, labels, params):
    clf = IsolationForest(n_estimators=params['n_estimators'])
    # Isolation Forest is unsupervised: the labels are only used for scoring.
    clf.fit(X)
    scores = clf.decision_function(X)
    auc, ap = compute_statistics(-scores, labels)
    return auc, ap, scores
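compute_statistics is not shown in this snippet; a plausible sketch of such a helper (an assumption, not the original implementation) would derive ROC-AUC and average precision from scores where higher means more anomalous:

from sklearn.metrics import average_precision_score, roc_auc_score

def compute_statistics(scores, labels):
    """Hypothetical helper: higher scores should indicate anomalies (label 1)."""
    auc = roc_auc_score(labels, scores)
    ap = average_precision_score(labels, scores)
    return auc, ap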
n_samples_train = n_samples // 2 n_samples_test = n_samples - n_samples_train X_train = X[:n_samples_train, :] X_test = X[n_samples_train:, :] y_train = y[:n_samples_train] y_test = y[n_samples_train:] # training and testing only on normal data: X_train = X_train[y_train == 0] y_train = y_train[y_train == 0] X_test = X_test[y_test == 0] y_test = y_test[y_test == 0] # define models: iforest = IsolationForest() lof = LocalOutlierFactor(n_neighbors=20) ocsvm = OneClassSVM() lim_inf = X.min(axis=0) lim_sup = X.max(axis=0) volume_support = (lim_sup - lim_inf).prod() t = np.arange(0, 100 / volume_support, 0.01 / volume_support) axis_alpha = np.arange(alpha_min, alpha_max, 0.0001) unif = np.random.uniform(lim_inf, lim_sup, size=(n_generated, n_features)) # fit: print('IsolationForest processing...') iforest = IsolationForest() iforest.fit(X_train)
# Going to try some of the other approaches SKL has for outlier detection # rather than re-do that. pipeline = Pipeline([('scale', StandardScaler()), ('ocsvm', OneClassSVM(nu=contamination))]) pipeline.fit(X) # Visualize the fit visualize_fit(X, pipeline) plt.xlabel('Latency (ms)') plt.ylabel('Throughput (mb/s)') plt.title("OneClassSVM: nu: {}".format(contamination)) outliers = X[pipeline.predict(X) == -1] plt.plot(outliers[:, 0], outliers[:, 1], 'ro', linewidth=2, markersize=10) plt.show() pipeline = Pipeline([('scale', StandardScaler()), ('isof', IsolationForest(contamination=contamination))]) pipeline.fit(X) # Visualize the fit visualize_fit(X, pipeline) plt.xlabel('Latency (ms)') plt.ylabel('Throughput (mb/s)') plt.title("IsolationForest: contamination: {}".format(contamination)) outliers = X[pipeline.predict(X) == -1] plt.plot(outliers[:, 0], outliers[:, 1], 'ro', linewidth=2, markersize=10) plt.show()
# indices = np.arange(X.shape[0]) # np.random.shuffle(indices) # shuffle the dataset # X = X[indices] # y = y[indices] X_train = X[:n_samples_train, :] X_test = X[n_samples_train:, :] y_train = y[:n_samples_train] y_test = y[n_samples_train:] # # training only on normal data: # X_train = X_train[y_train == 0] # y_train = y_train[y_train == 0] print('IsolationForest processing...') model = IsolationForest() tstart = time() model.fit(X_train) fit_time += time() - tstart tstart = time() scoring = -model.decision_function(X_test) # the lower,the more normal predict_time += time() - tstart fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring) if predict_time + fit_time > max_time: raise TimeoutError f = interp1d(fpr_, tpr_) tpr += f(x_axis) tpr[0] = 0.
iListIV = np.array(iListIV) iListII, iListIII, iListIV = iListII[0], iListIII[0], iListIV[0] # outlierIV # pretreatment succ_corr_normal = succ_corr succ_corr_normal[iListII] = 0 resp_normal = resp resp_normal[iListIII] = 0 # isolationforest succ_resp = np.vstack((succ_corr_normal, resp_normal)) X_train = succ_resp[:, :1440 * train_day] X_test = succ_resp[:, train_day * 1440:] X_train = X_train.transpose() X_test = X_test.transpose() clf = IsolationForest(n_estimators=100, max_samples=256, contamination=0.001) clf.fit(X_train) # plot the train set succ_resp_index = -clf.score_samples(X_train) xx, yy = np.meshgrid( np.linspace( np.min(succ_corr_normal) * 1.1, np.max(succ_corr_normal) * 1.1, 500), np.linspace(np.min(resp_normal) - 100, np.max(resp_normal) * 1.1, 500)) Z = clf.score_samples(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) plt.scatter(X_train[:, 0], X_train[:, 1], marker='x', s=10, c=succ_resp_index) # plt.colorbar() plt.xlabel('succ_corr')
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec


df, t, v = ohEncoding(df, col, replace=True)
print("Shape after encoding")
print(df.shape)
df_unlabeled = df.drop("Anomaly", axis=1)
print("Shape of the dataframe without anomaly column: ")
print(df_unlabeled.shape)

clf = IsolationForest(max_samples=6444, verbose=1, n_jobs=-1,
                      contamination=0.255555, bootstrap=True, max_features=9)
clf.fit(df_unlabeled)
pred = clf.predict(df_unlabeled)
# print(type(pred))
# print(data.shape)
# print(len(pred))
# print(pred)
anomalies = np.argwhere(pred == -1)
normal = np.argwhere(pred == 1)
# print(anomalies)
# print(type(anomalies))
df['ISO1'] = pred

# iterate over rows
nLabAno = 0
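The row iteration is cut off above. A hedged sketch of what a comparison between the forest's ISO1 predictions and the Anomaly labels might look like; the loop body and the tiny stand-in frame are assumptions:

import pandas as pd

# Stand-in frame with the two columns used below (illustrative only).
df = pd.DataFrame({'Anomaly': [0, 1, 1, 0, 1],
                   'ISO1':    [1, -1, 1, -1, -1]})

nLabAno = 0
for _, row in df.iterrows():
    # Count rows that the Isolation Forest flags (-1) and that are labeled anomalies (1).
    if row['ISO1'] == -1 and row['Anomaly'] == 1:
        nLabAno += 1
print(nLabAno)

# Equivalent vectorised count:
print(((df['ISO1'] == -1) & (df['Anomaly'] == 1)).sum())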
raise ValueError("invalid embed type %s" % embed_type) x_tr = embed.fit_transform(x) logger.debug(x_tr) if args.plot: plot_sample(x_tr, y, pdfpath="temp/spectral_%s%s.pdf" % (sample_type, embed_type)) ad_type = 'ifor' outliers_fraction = 0.1 ad = IsolationForest(max_samples=256, contamination=outliers_fraction, random_state=None) ad.fit(x_tr) scores = -ad.decision_function(x_tr) top_anoms = np.argsort(-scores)[np.arange(10)] if args.plot: # to plot probability contours xx, yy = np.meshgrid( np.linspace(np.min(x_tr[:, 0]), np.max(x_tr[:, 0]), 50), np.linspace(np.min(x_tr[:, 1]), np.max(x_tr[:, 1]), 50)) x_grid = np.c_[xx.ravel(), yy.ravel()] Z = -ad.decision_function(x_grid)
from sklearn.ensemble import IsolationForest as IF import pandas as pd full_df = pd.read_csv("HTRU_2.csv") outlier_df = full_df.loc[full_df['Class'] == 1] inlier_df = full_df.loc[full_df['Class'] == 0].reset_index().drop(['index'], axis=1) classes = full_df['Class'] full_df.drop(columns=['Class'], inplace=True) #inlier_df.drop(columns=['Class'], inplace=True) outlier_df.drop(columns=['Class'], inplace=True) classifier = IF() # Isolation Forest instance used to train and score outliers classifier.fit(full_df) scores = classifier.decision_function(outlier_df).tolist() outlier_df['scores'] = scores outlier_df = outlier_df.sort_values(by=['scores']).reset_index().drop( ['index', 'scores'], axis=1) outlier_df['Class'] = [1 for i in range(outlier_df.shape[0])] inlier_df = inlier_df.append(outlier_df.head(32)).reset_index().drop(['index'], axis=1) inlier_df.to_csv('HTRU_2_filtered.csv', index=False)
X_all_1.info()
X_all_0.info()

# Set the training data. Isolation Forest is an unsupervised algorithm; here
# it is used in a semi-supervised (novelty detection) fashion: all training
# data is normal, and 4/5 of the normal set is used for training.
# Note: .loc slicing is inclusive on both ends, so the training slice stops
# at 109195 to avoid overlapping with the test slice below.
X0_train = X_all_0.loc[0:109195]
print("X0_train############################################")
print(X0_train)

# Set the test data: the rest of the normal data is used as the test set.
X0_test = X_all_0.loc[109196:]
print("X0_test############################################")
print(X0_test)

# Create an Isolation Forest classifier
clf = IsolationForest(contamination=0.22)
clf.fit(X0_train)

# Use this classifier to predict the test data and the known outliers
y_pred_test = clf.predict(X0_test)
y_pred_outliers = clf.predict(X_all_1)

# Print counts for a confusion matrix and report.
print("number of samples with target 0 and prediction 0:")
a00 = list(y_pred_test).count(1)
print(a00)
print("number of samples with target 0 and prediction 1:")
a01 = list(y_pred_test).count(-1)
print(a01)
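The remaining two cells of the confusion matrix are presumably tallied from y_pred_outliers in the same way; a sketch with stand-in predictions (the names a10/a11 mirror a00/a01 and are assumptions):

import numpy as np

# Stand-in predictions on the labeled-anomaly set (illustrative only).
y_pred_outliers = np.array([-1, -1, 1, -1, 1, -1])

print("number of samples with target 1 and prediction 0:")
a10 = list(y_pred_outliers).count(1)
print(a10)
print("number of samples with target 1 and prediction 1:")
a11 = list(y_pred_outliers).count(-1)
print(a11)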
# In[17]: plt.figure(figsize=(30, 16)) sns.heatmap(df.corr()) plt.show() # # use anomaly detection with isolation forest # In[59]: from sklearn.ensemble import IsolationForest # In[60]: model = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.1), max_features=1.0) # In[62]: replacestruct = { "Gender": { "Male": 0, "Female": 1 }, "Customer Type": { 'Loyal Customer': 0, 'disloyal Customer': 1 }, "Class": { 'Eco Plus': 0,
#add the class column back in (it wasn't featurized by itself) featureMatrix['class'] = enhancedDf['class'] #randomly assign 3/4 of the feature df to training and 1/4 to test featureMatrix['is_train'] = np.random.uniform(0, 1, len(featureMatrix)) <= .75 #split out the train and test df's into separate objects train, test = featureMatrix[featureMatrix['is_train']==True], featureMatrix[featureMatrix['is_train']==False] #drop the is_train column, we don't need it anymore train = train.drop('is_train', axis=1) test = test.drop('is_train', axis=1) #create the isolation forest class and factorize the class column clf = IsolationForest(n_estimators=opts.numtrees) #train the isolation forest on the training set, dropping the class column (since the trainer takes that as a separate argument) print('\nTraining') clf.fit(train.drop('class', axis=1)) #remove the 'answers' from the test set testnoclass = test.drop('class', axis=1) print('\nPredicting (class 1 is normal, class -1 is malicious)') #evaluate our results on the test set. test.is_copy = False test['prediction'] = clf.predict(testnoclass) print
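A sketch of how the held-out predictions might be compared against the original class labels; pd.crosstab and the tiny stand-in frame are illustrative additions, not part of the snippet:

import pandas as pd

# Illustrative stand-in for the test frame produced above.
test = pd.DataFrame({'class': ['normal', 'malicious', 'normal', 'malicious'],
                     'prediction': [1, -1, 1, 1]})

# Rows: original label; columns: Isolation Forest prediction (1 normal, -1 malicious).
print(pd.crosstab(test['class'], test['prediction']))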