def dorc(preprocessedData, random_state, outliers_fraction=0.1):
    """Score samples with an Isolation Forest and flag the highest-scoring ones.

    The detector is fitted on ``preprocessedData`` and every row is scored
    with ``decision_function``.  A row is flagged when its score is at least
    ``Q3 + 1.5 * IQR`` of the score distribution.

    Parameters
    ----------
    preprocessedData : array-like with a ``shape`` attribute
        Samples used both for fitting and for scoring.
    random_state : forwarded unchanged to ``IForest``.
    outliers_fraction : float, optional
        Passed to ``IForest`` as ``contamination``.

    Returns
    -------
    tuple
        ``(predictions, scores, duration)``: a 0/1 vector with one entry per
        row, the raw scores, and the elapsed wall-clock seconds rounded to
        four digits.
    """
    started = time.time()

    detector = IForest(
        contamination=outliers_fraction,
        random_state=random_state,
        n_jobs=-1,
    )
    detector.fit(preprocessedData)
    scores = detector.decision_function(preprocessedData)

    # Upper IQR fence of the score distribution.
    upper_fence = np.percentile(scores, 75) + (1.5 * stats.iqr(scores))

    # Positions of the rows at or above the fence.
    selected = np.where(scores >= upper_fence)[0]
    print('shape of selected cells : {}'.format(selected.shape))

    # Binary prediction vector: 1 for selected rows, 0 otherwise.
    predictions = np.zeros(preprocessedData.shape[0])
    predictions[selected] = 1

    duration = round(time.time() - started, ndigits=4)
    print("Total running DoRC time is :" + str(duration) + " s")
    return predictions, scores, duration
def anomaly_detection(data, label):
    """Fit three unsupervised detectors on the numeric columns of ``data``.

    The numeric columns (without ``label``) are standardised, then an
    Isolation Forest, an AutoEncoder and a Local Outlier Factor model are
    fitted in that order.  For each model the ROC AUC of its training labels
    against ``data[label]`` is printed and its scores are plotted through
    ``utilities.plot_outlier_scores``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; ``label`` must be one of its numeric columns because it
        is dropped from the numeric selection.
    label : str
        Name of the ground-truth column.

    Returns
    -------
    tuple
        ``(y, ifo_pred, ae_pred, lof_pred)``: the ground-truth values followed
        by the training labels of the three detectors.
    """
    numeric_columns = data.select_dtypes('number').columns.tolist()
    y = data[label].values
    features = data[numeric_columns].drop([label], axis=1)
    scaled = StandardScaler().fit_transform(features)
    X = pd.DataFrame(data=scaled, columns=features.columns)

    def run_detector(detector, roc_message, title_template, title_attr):
        # Fit, report ROC AUC of the training labels, plot the raw scores.
        detector.fit(X)
        predicted = detector.labels_
        print(roc_message, roc_auc_score(y, predicted))
        utilities.plot_outlier_scores(
            y,
            detector.decision_scores_,
            bw=0.1,
            title=title_template.format(getattr(detector, title_attr)))
        return predicted

    ifo_pred = run_detector(
        IForest(contamination=0.01,
                behaviour='new',
                n_estimators=1000,
                max_samples=1024,
                n_jobs=-1,
                verbose=1),
        'ROC score for Isolation forest: ',
        'Fraud, Isolation forest. (n_estimators={})',
        'n_estimators')

    ae_pred = run_detector(
        AutoEncoder(hidden_neurons=[25, 20, 15, 20, 25],
                    hidden_activation='relu',
                    output_activation='sigmoid',
                    optimizer='adam',
                    epochs=20,
                    batch_size=128,
                    dropout_rate=0.2,
                    l2_regularizer=0.0,
                    validation_size=0.1,
                    preprocessing=False,
                    verbose=1,
                    random_state=1,
                    contamination=0.01),
        'ROC score for Autoencoder: ',
        'Fraud, Autoencoder. (epochs={})',
        'epochs')

    # Too long to train, under-sample needed
    lof_pred = run_detector(
        LOF(n_neighbors=int(y.sum() * 1.3), contamination=0.01, n_jobs=-1),
        'ROC score for LOF: ',
        'Fraud, Local outliers factor. (n_neighbors={})',
        'n_neighbors')

    return y, ifo_pred, ae_pred, lof_pred
class IForestSupervisedKNN(BaseDetector):
    """Two-stage detector: IForest selects "normal" rows, KNN scores against them.

    ``fit`` trains an ``IForest`` on all rows, keeps the ``get_top`` fraction
    of rows with the lowest IForest outlier probability, and fits a ``KNN``
    detector on those rows only.  ``decision_function`` returns the KNN
    scores.

    Parameters
    ----------
    get_top : float, optional
        Fraction of rows (lowest IForest outlier probability first) used to
        fit the KNN detector.
    if_params : dict or None, optional
        Keyword arguments for ``IForest``; ``None`` means no arguments.
    knn_params : dict or None, optional
        Keyword arguments for ``KNN``; ``None`` means no arguments.
    """

    def __init__(self, get_top=0.8, if_params=None, knn_params=None):
        super(IForestSupervisedKNN, self).__init__()
        self.get_top = get_top
        # Keep the constructor arguments on the instance so they stay
        # inspectable; ``None`` replaces the former shared ``{}`` defaults.
        self.if_params = if_params
        self.knn_params = knn_params
        self.is_fitted = False
        self.iforest = IForest(**(if_params or {}))
        self.knn = KNN(**(knn_params or {}))

    def fit(self, X, y=None):
        """Fit both stages on ``X`` and return ``self``; ``y`` is only passed
        to ``_set_n_classes``."""
        X = check_array(X)
        self._set_n_classes(y)

        self.iforest.fit(X)
        scores = self.iforest.predict_proba(X)[:, 1]

        # Rows sorted by ascending outlier probability; keep the first
        # ``get_top`` share as the reference set for KNN.
        n_kept = int(len(X) * self.get_top)
        normal_instances = X[np.argsort(scores)[:n_kept]]
        self.knn.fit(normal_instances)

        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()
        self.is_fitted = True
        return self

    def decision_function(self, X):
        """Return the KNN detector's scores for ``X``."""
        # NOTE(review): ``is_fitted`` exists from construction onwards, so
        # this check presumably never fails -- confirm whether a stricter
        # fitted-check is wanted.
        check_is_fitted(self, ['is_fitted'])
        return self.knn.decision_function(X)
def train():
    """Train an Isolation Forest from the configured paths and pickle it.

    Reads the training CSV from ``training_path`` and the hyper-parameters
    (JSON) from ``param_path``, fits an ``IForest`` using the configured
    ``contamination`` and writes the pickled model to
    ``<model_path>/great_model.pkl``.

    On any exception the message and traceback are written to
    ``<output_path>/failure`` and the process exits with status 255.
    """
    try:
        # 1. Load the data and the stored configuration.
        data = pd.read_csv(training_path, index_col=0)
        with open(param_path, 'r') as params_file:
            hyper_parameters = json.load(params_file)

        # 2. Build and fit the Isolation Forest.
        model = IForest(contamination=hyper_parameters['contamination'],
                        behaviour='new')
        model.fit(data)

        # 3. Persist the fitted model.
        model_name = 'great_model'
        model_file = os.path.join(model_path, '{}.pkl'.format(model_name))
        with open(model_file, 'wb') as out:
            pickle.dump(model, out, protocol=0)
    except Exception as e:
        # Training failed: record why, then signal failure to the caller
        # through the exit status.
        trc = traceback.format_exc()
        failure_file = os.path.join(output_path, 'failure')
        with open(failure_file, 'w') as s:
            s.write('Exception during train: ' + str(e) + '\n' + trc)
        sys.exit(255)
def densityBased(self):
    '''
    @brief Function that implements the density based component
    @param self
    @return It returns the vector with the (scaled) mean scores of the
    instances

    For ``self.num_iter`` iterations an IForest is fitted on a random
    subsample of ``self.dataset`` (size drawn from [50, 1000], or the whole
    dataset when it is smaller) and its training scores are accumulated for
    the sampled instances.  Each instance's accumulated score is divided by
    the number of iterations in which it was actually scored, then the
    vector is passed through ``scale``.
    '''
    n_instances = len(self.dataset)
    # Running sum of scores and number of times each instance was scored.
    score_sums = np.zeros(n_instances, dtype=float)
    times_scored = np.zeros(n_instances, dtype=float)

    for _ in range(self.num_iter):
        iforest = IForest(contamination=self.contamination, behaviour="new")
        # Number in the interval [50, 1000]
        subsample_size = np.random.randint(50, 1001)
        if subsample_size >= n_instances:
            sample = list(range(n_instances))
        else:
            sample = np.random.choice(n_instances,
                                      size=subsample_size,
                                      replace=False)
        # Train on the subsample and accumulate its training scores.
        iforest.fit(self.dataset[sample])
        score_sums[sample] += iforest.decision_scores_
        times_scored[sample] += 1

    # Mean over the iterations that actually scored each instance.  Dividing
    # by ``num_iter`` would shrink the score of rarely sampled instances
    # towards zero.  Instances never sampled keep a score of 0.
    scores = score_sums / np.maximum(times_scored, 1)
    scores = scale(scores)
    return scores
def iforest(X_train, X_test, Y_train, Y_test):
    """Fit a pyod ``IForest`` on ``X_train`` and report accuracy on the test set.

    ``Y_train`` is accepted but not used.  The fraction of test predictions
    equal to ``Y_test`` is printed, and the same value multiplied by 100 is
    returned.
    """
    from pyod.models.iforest import IForest

    detector = IForest(random_state=0)
    detector.fit(X_train)
    predicted = detector.predict(X_test)

    n_matching = np.sum(predicted == Y_test)
    acc = n_matching / X_test.shape[0]
    print(acc)
    return acc * 100
def outlier_iforest(data, **kwargs):
    """Append an ``is_outlier`` column produced by a pyod ``IForest``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to fit the detector on.
    **kwargs
        Must contain ``contamination`` (anything ``float()`` accepts); a
        missing key raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        ``data`` with one extra column ``is_outlier`` holding the detector's
        training labels.
    """
    import pandas as pd
    from pyod.models.iforest import IForest

    contamination = float(kwargs.pop('contamination'))
    clf = IForest(contamination=contamination)
    clf.fit(data)

    # Give the label frame the same index as ``data``.  With a default
    # RangeIndex, ``concat(axis=1)`` would align on index labels and produce
    # NaN-padded rows whenever ``data`` is indexed differently.
    flags = pd.DataFrame({'is_outlier': clf.labels_}, index=data.index)
    return pd.concat([data, flags], axis=1)
def remove_outliars(dft, target_col):
    """Return ``dft`` without the rows an ``IForest`` labels as outliers.

    The detector is fitted on every column except ``target_col``; rows whose
    training label equals 1 are removed.  The input frame itself is left
    untouched (no temporary column is written into it).

    Parameters
    ----------
    dft : pandas.DataFrame
    target_col : label or list of labels excluded from the detector input.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dft`` whose label is not 1.
    """
    ol_model = IForest()  # contamination etc. can be used as a hyperparameter
    ol_model.fit(dft.drop(columns=target_col))

    # Positional boolean mask instead of a helper column on the caller's
    # DataFrame.
    keep = ol_model.labels_ != 1
    dft = dft[keep]

    print("Completed Outliar Detection - ", datetime.datetime.now())
    return dft
def S2(self):
    """Second screening stage: multi-detector vote plus anomalous dimension.

    Runs ``self.S1()`` first, then works on the rows of ``self.water_data``
    whose ``S1`` flag is 0:

    * columns 1..11 are standardised and fed to IForest, KNN, HBOS and PCA
      detectors; the product of their training labels is stored as ``S2``;
    * for each of the eleven named features a Gaussian kernel density is
      estimated, the densities are ranked, and the feature with the lowest
      rank per row is stored as ``S2_names``.

    Both columns are joined onto ``water_data`` (written back to
    ``self.water_data``) and copied into ``self.result``.  The scaler and
    the IForest model are dumped under ``./water_model/``.
    """
    self.S1()
    frame = self.water_data
    result = self.result

    # Data preprocessing and model training.
    passed_s1 = frame[frame['S1'] == 0]
    votes = pd.DataFrame(index=passed_s1.index, columns=['S2'])
    X_train = np.array(passed_s1.iloc[:, 1:12])
    name = list(passed_s1.iloc[:, 1:12].columns.values)  # not used further
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    detectors = [
        IForest(contamination=0.05, max_features=11, bootstrap=True),
        KNN(contamination=0.05, n_neighbors=100),
        HBOS(contamination=0.05, n_bins=10),
        PCA(contamination=0.05),
    ]
    for detector in detectors:
        detector.fit(X_train)

    # A row keeps the value 1 only when every detector labelled it 1.
    combined = detectors[0].labels_
    for detector in detectors[1:]:
        combined = combined * detector.labels_
    votes['S2'] = combined

    frame = pd.concat([frame, votes], axis=1)
    # water_data.loc[water_data['S2'].isna(),['S2']]=0  (would mark rows that
    # were already abnormal in S1 as 0 in S2)
    result['统计异常'] = frame['S2'].values

    # Find the anomalous dimension.
    from sklearn.neighbors import KernelDensity
    passed_s1 = frame[frame['S1'] == 0]
    feature_names = [
        'temperature', 'pH', 'EC', 'ORP', 'DO', 'turbidity', 'transparency',
        'COD', 'P', 'NH3N', 'flux'
    ]
    dens = pd.DataFrame(index=passed_s1.index, columns=feature_names)
    for feature in feature_names:
        column = passed_s1[feature].values.reshape(-1, 1)
        kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(column)
        dens[feature] = np.exp(kde.score_samples(column))
    dens = dens.iloc[:, 0:11].rank()
    dens['S2_names'] = dens.idxmin(axis=1)

    frame = pd.concat([frame, dens['S2_names']], axis=1)
    self.water_data = frame
    result['统计异常维度'] = frame['S2_names'].values

    # Store the models.
    joblib.dump(scaler, "./water_model/S2_scaler")
    joblib.dump(detectors[0], "./water_model/S2_Iforest")
def outlier_detection(x_raw, y_raw):
    """
    Filter all outlier points

    An ``IForest`` (contamination 0.04, seeded ``RandomState(42)``) is fitted
    on ``x_raw``; every row it predicts as outlier (label 1) is removed from
    both ``x_raw`` and ``y_raw`` through ``del_rowsorcolumns``.

    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))

    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)

    # Only the Isolation Forest is used.  The former dictionary that
    # instantiated ten further candidate detectors without using them has
    # been dropped.
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # Alternative:
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)

    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn, -1 means outliers and 1 means inliers
    # Iterate over the actual predictions instead of a hard-coded sample
    # count (previously ``range(0, 1212)``).
    idx_y_pred = [i for i, flag in enumerate(y_pred) if flag == 1]

    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
def getOutlierIForest(dataset):
    '''
    @brief Runs the IForest algorithm on the dataset and returns the labels
    it assigned to the training instances
    @param dataset Dataset on which to fit the algorithm
    @return The fitted model's labels_ (0 means inlier, 1 means outlier)
    '''
    # Quiet model (verbose switched off).
    detector = IForest(verbose=0)
    detector.fit(dataset)
    training_labels = detector.labels_
    return training_labels
class IForestPyOD(BaseAlgorithm):
    """Adapter exposing pyod's ``IForest`` through the ``BaseAlgorithm`` API.

    ``t`` is the number of trees and ``psi`` the per-tree sample size; both
    are forwarded to ``IForest`` (as ``n_estimators`` and ``max_samples``).
    """

    name = "iForest_pyod"

    def __init__(self, t=100, psi=256):
        forest_options = {
            "n_estimators": t,
            "max_samples": psi,
            "contamination": 0.1,
            "behaviour": "new",
        }
        self.iforest = IForest(**forest_options)

    def fit(self, X):
        """Fit the wrapped detector on ``X``; nothing is returned."""
        self.iforest.fit(X)

    def predict(self, X):
        """Return the wrapped detector's ``decision_function`` values for ``X``."""
        scores = self.iforest.decision_function(X)
        return scores
def detect(self, X, y=None):
    """
    Fit an Isolation Forest on ``X`` and score the same rows.

    :param X: Dataframe
    :param y: np.array (not used)
    :return: outlier scores
    """
    seeded_rng = np.random.RandomState(42)
    forest = IForest(
        n_estimators=200,    # number of trees in the forest
        contamination=0.5,   # assumed share of anomalous samples
        random_state=seeded_rng,
        max_samples='auto',
    )
    forest.fit(X)
    return forest.decision_function(X)
def detect_outliers(stocks: list, all_stocks_cip: pd.DataFrame, rules=None):
    """
    Return the list of stocks (index labels) that an Isolation Forest flags
    as outliers, based on the points each stock collects from ``rules``.

    For every stock and every date column of ``all_stocks_cip`` each rule is
    called with a state dict (market average, sector average, the stock's own
    move, ...) and its result is added to that rule's point total.  The
    per-stock totals form the feature table for the detector.

    Stocks that cannot be located (``KeyError``) are reported through
    ``warning`` and skipped.  An empty list is returned when no stock could
    be processed.
    """
    if rules is None:
        rules = default_point_score_rules()
    str_rules = {str(r): r for r in rules}
    rows = []
    stocks_by_sector_df = stocks_by_sector()  # NB: ETFs in watchlist will have no sector
    stocks_by_sector_df.index = stocks_by_sector_df['asx_code']

    # The market average of a date does not depend on the stock: compute it
    # once per date instead of once per (stock, date).
    market_avg_by_date = {
        date: all_stocks_cip[date].mean() for date in all_stocks_cip.columns
    }

    for stock in stocks:
        try:
            sector = stocks_by_sector_df.at[stock, 'sector_name']
            sector_companies = list(stocks_by_sector_df.loc[
                stocks_by_sector_df['sector_name'] == sector].asx_code)
            # day_low_high() may raise KeyError when data is currently being fetched, so it appears here...
            day_low_high_df = day_low_high(stock, all_stocks_cip.columns)
        except KeyError:
            warning(None, "Unable to locate watchlist entry: {} - continuing without it".format(stock))
            continue

        state = {
            'day_low_high_df': day_low_high_df,  # never changes each day, so we init it here
            'all_stocks_change_in_percent_df': all_stocks_cip,
            'stock': stock,
            'daily_range_threshold': 0.20,  # 20% at either end of the daily range gets a point
        }
        points_by_rule = defaultdict(int)
        for date in all_stocks_cip.columns:
            sector_avg = all_stocks_cip[date].filter(items=sector_companies).mean()
            stock_move = all_stocks_cip.at[stock, date]
            state.update({
                'market_avg': market_avg_by_date[date],
                'sector_avg': sector_avg,
                'stock_move': stock_move,
                'date': date,
            })
            for rule_name, rule in str_rules.items():
                try:
                    points_by_rule[rule_name] += rule(state)
                except TypeError:  # handle nan's in dataset safely
                    pass
        d = {'stock': stock}
        d.update(points_by_rule)
        rows.append(d)

    if not rows:
        # Nothing could be processed; set_index('stock') would raise on the
        # empty frame.
        print("Found {} outlier stocks".format(0))
        return []

    df = pd.DataFrame.from_records(rows)
    df = df.set_index('stock')
    print(df)

    from pyod.models.iforest import IForest
    clf = IForest()
    clf.fit(df)
    scores = clf.predict(df)
    results = [stock for stock, value in zip(df.index, scores) if value > 0]
    print("Found {} outlier stocks".format(len(results)))
    return results
def add_other_class(num, size, pad):
    """Augment ``data/train.txt`` with ``num`` random "其他" (other) samples.

    The labelled cells listed in ``data/train.txt`` are loaded through
    ``get_cell`` and used to fit an ``IForest``.  Random positions inside the
    raster (at least ``pad`` away from the border) are then drawn until
    ``num`` of them are predicted as outliers by the detector; each accepted
    position is appended with the label "其他".

    Two files are written: ``data/train_enhance.txt`` with the combined
    rows, and ``data/train_enhance_view.txt`` with the third column negated.

    Parameters
    ----------
    num : int
        Number of extra samples to collect.
    size : passed to ``get_cell``.
    pad : int
        Margin kept from the raster border when drawing positions.
    """
    res = pd.read_csv("data/train.txt", header=None).values

    tif_data = []
    for row in tqdm(res):
        img = get_cell(row[1], row[2], size)
        if img is None:
            print("img NOT Exist.", row)
            continue
        tif_data.append([labels_key[row[0]]] + img.reshape(-1).tolist())
    tif_data = np.array(tif_data)
    print(tif_data.shape)
    np.random.shuffle(tif_data)

    clf = IForest()
    clf.fit(tif_data[:, 1:])

    added = 0
    false_num = 0
    pos = []
    # ``added < num`` (rather than testing equality after each draw) also
    # terminates when ``num`` is zero or negative.
    while added < num:
        ix = np.random.randint(pad, dataset.RasterXSize - pad)
        iy = np.random.randint(pad, dataset.RasterYSize - pad)
        cell = get_cell(ix, iy, size)
        if cell is None:
            continue
        y_test_pred = clf.predict(cell.reshape(1, -1))[0]  # outlier labels (0 or 1)
        if y_test_pred == 1:
            added += 1
            pos.append(["其他"] + [ix, iy])
            print("{}/{} added.".format(added, num))
        else:
            false_num += 1
            print("{}/{} is not include {}.{}. false_num: {}".format(
                added, num, ix, iy, false_num))

    # An empty ``pos`` cannot be concatenated with the 2-D ``res``.
    if pos:
        pos = np.concatenate((res, np.array(pos)), axis=0)
    else:
        pos = res
    print(Counter(pos[:, 0]))
    pd.DataFrame(pos).to_csv("data/train_enhance.txt", index=None, header=None)

    # ``np.int`` no longer exists in current NumPy; the builtin ``int`` is
    # what it aliased.
    pos[:, 2] = -1 * (pos[:, 2].astype(int))
    pd.DataFrame(pos).to_csv("data/train_enhance_view.txt",
                             index=None,
                             header=None)
def transform(self, df2: pd.DataFrame) -> pd.DataFrame:
    """Filter, de-outlier and label-encode the listings frame.

    Steps:

    1. Build ``mm`` = ``make + ' ' + model`` and keep only the make/model
       combinations whose ``make`` count exceeds 100.
    2. Within each remaining ``mm`` group, fit an ``IForest``
       (contamination 0.01) on kms_run / owners / age / quoted_price /
       dep_percentage and drop the rows it predicts as outliers (label 1).
    3. Select the feature columns, label-encode every object-typed one and
       remember each mapping in ``self.dic[column]``.

    Note: the ``mm`` column is written into the frame passed in by the
    caller.

    Returns
    -------
    pandas.DataFrame
        The encoded feature columns followed by ``dep_percentage``.
    """
    le = LabelEncoder()

    df2['mm'] = df2['make'] + ' ' + df2['model']
    g_mm_count = df2.groupby(['mm']).count().reset_index()
    mm_more_than_100 = g_mm_count[g_mm_count['make'] > 100]['mm']
    df2 = df2[df2['mm'].isin(mm_more_than_100)]

    # Per-group outlier detection.  Only the index labels of the flagged
    # rows are needed, so collect them directly instead of growing a
    # DataFrame with pd.concat on every iteration.
    clf1 = IForest(contamination=0.01)
    outlier_columns = [
        'kms_run', 'owners', 'age', 'quoted_price', 'dep_percentage'
    ]
    outlier_labels = set()
    for _, group in df2.groupby('mm'):
        group_features = group[outlier_columns]
        clf1.fit(group_features)
        y_pred = clf1.predict(group_features)
        outlier_labels.update(group.index[y_pred == 1])
    df2 = df2.drop(outlier_labels)

    df = df2.copy()
    # Explicit copy: the columns are overwritten below.
    X = df[[
        'make', 'model', 'city', 'variant', 'owners', 'kms_run', 'age',
        'Popularity Index', 'ex_showroom_price', 'fuel_type', 'transmission',
        'color'
    ]].copy()

    categorical_cols = X.columns[X.dtypes == object].tolist()
    self.dic = {}
    for col in categorical_cols:
        X[col] = le.fit_transform(X[col])
        # class value -> encoded integer for this column
        self.dic[col] = dict(zip(le.classes_, le.transform(le.classes_)))

    y = df[['dep_percentage']]
    return pd.concat([X, y], axis=1)
class IForestWrapper:
    """Thin wrapper giving ``IForest`` a ``fit(X, T)`` signature."""

    def __init__(self, **kwargs):
        # All keyword arguments go straight to the underlying detector.
        self._model = IForest(**kwargs)

    def fit(self, X, T):
        """Fit on ``X`` and return ``self``.

        ``T`` (targets) is ignored: the learning is unsupervised.
        """
        self._model.fit(X)
        return self

    def predict(self, X):
        """Delegate to the underlying model's ``predict``."""
        return self._model.predict(X)

    def predict_proba(self, X):
        """Delegate to the underlying model's ``predict_proba``."""
        return self._model.predict_proba(X)
def main():
    """Train and evaluate an ``IForest`` three times on a 70/30 split.

    The dataset from ``pre_data()`` is mean-imputed and split with a fixed
    seed.  In each of the three runs a fresh ``IForest`` is fitted on the
    training part; accuracy, precision and recall of the predicted labels
    and the ``evaluate_print`` summary of the raw scores are printed for the
    training data and then for the test data.
    """
    from numpy import nan as NA
    from sklearn.impute import SimpleImputer
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score

    dataset, label = pre_data()
    imputer = SimpleImputer(missing_values=NA, strategy="mean")
    dataset = imputer.fit_transform(dataset)
    x_train, x_test, y_train, y_label = train_test_split(dataset,
                                                        label,
                                                        test_size=0.3,
                                                        random_state=44)

    for _ in range(3):
        clf_name = 'IForest'
        clf = IForest()
        clf.fit(x_train)

        # prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores
        print(accuracy_score(y_train, y_train_pred))
        print(precision_score(y_train, y_train_pred))
        print(recall_score(y_train, y_train_pred))

        # prediction on the test data
        y_test_pred = clf.predict(x_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(x_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)

        # Test-set metrics: all three compare the test labels with the test
        # predictions (precision/recall previously reused the training
        # pair).
        print(accuracy_score(y_label, y_test_pred))
        print(precision_score(y_label, y_test_pred))
        print(recall_score(y_label, y_test_pred))
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_label, y_test_scores)
def test_pyod_isolation_forest():
    """Check that SHAP values of a pyod ``IForest`` reproduce its sample scores.

    For ``max_features`` of 1.0 and 0.75 the score rebuilt from the summed
    SHAP values, the explainer's expected value and the average path length
    must match ``iso.detector_.score_samples(X)`` within 1e-7.
    """
    import shap
    import numpy as np
    from pyod.models.iforest import IForest
    # The helper lives in a private module that was renamed; try the
    # current location first and fall back to the old one.
    try:
        from sklearn.ensemble._iforest import _average_path_length
    except ImportError:
        from sklearn.ensemble.iforest import _average_path_length

    X, _ = shap.datasets.boston()
    for max_features in [1.0, 0.75]:
        iso = IForest(max_features=max_features)
        iso.fit(X)

        explainer = shap.TreeExplainer(iso)
        shap_values = explainer.shap_values(X)

        path_length = np.sum(shap_values, axis=1) + explainer.expected_value
        normaliser = _average_path_length(np.array([iso.max_samples_]))[0]
        score_from_shap = -2**(-path_length / normaliser)
        assert np.allclose(iso.detector_.score_samples(X),
                           score_from_shap,
                           atol=1e-7)
def get_IF_scores(dataframe, cols, outliers_fraction=0.01, standardize=True):
    '''Label the rows of ``dataframe`` as inlier/outlier with an Isolation Forest.

    Parameters:
        dataframe: table holding the selected columns; it is modified in
            place (optional rescaling of ``cols``, new ``outlier`` column).
        cols: list of selected column names used as detector features.
        outliers_fraction: contamination passed to ``IForest`` (default 0.01).
        standardize: when true, ``cols`` are rescaled to [0, 1] with
            ``MinMaxScaler`` before fitting.

    Returns:
        None.  The frame, with an ``outlier`` column holding the predicted
        0/1 labels, is stored as ``CheckOutliers.df3``; the outlier and
        inlier counts are printed.
    '''
    if standardize:
        # rescale the selected variables to [0, 1]
        minmax = MinMaxScaler(feature_range=(0, 1))
        dataframe[cols] = minmax.fit_transform(dataframe[cols])

    # One column vector per selected variable, stacked side by side.
    X = np.concatenate(
        [dataframe[col].values.reshape(-1, 1) for col in cols], axis=1)

    clf = IForest(contamination=outliers_fraction, random_state=0)
    clf.fit(X)

    # category of each datapoint: outlier (1) or inlier (0)
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    CheckOutliers.df3 = dataframe
    CheckOutliers.df3['outlier'] = y_pred.tolist()
    # The detector used here is IForest (the message used to say HBOS).
    print('OUTLIERS:', n_outliers, 'INLIERS:', n_inliers,
          'found with IForest')
Created on Tue Dec 24 15:54:36 2019 @author: zixing.mei """ from pyod.models.iforest import IForest clf = IForest(behaviour='new', bootstrap=False, contamination=0.1, max_features=1.0, max_samples='auto', n_estimators=500, n_jobs=-1, random_state=None, verbose=0) clf.fit(x) out_pred = clf.predict_proba(x, method='linear')[:, 1] train['out_pred'] = out_pred train['for_pred'] = np.where(train.out_pred > 0.7, '负样本占比', '正样本占比') dic = dict(train.groupby(train.for_pred).bad_ind.agg(np.sum)/ \ train.bad_ind.groupby(train.for_pred).count()) pd.DataFrame(dic, index=[0]) clf = IForest(behaviour='new', bootstrap=False, contamination=0.1, max_features=1.0, max_samples='auto', n_estimators=500, n_jobs=-1, random_state=None,
class TestIForest(unittest.TestCase):
    """Unit tests for ``IForest`` on generated data (100 train / 50 test rows)."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        generated = generate_data(n_train=self.n_train,
                                  n_test=self.n_test,
                                  contamination=self.contamination,
                                  random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # Every fitted attribute must exist and be non-None.
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'estimators_',
            'estimators_samples_',
            'max_samples_',
        )
        for attribute in fitted_attributes:
            assert_true(hasattr(self.clf, attribute) and
                        getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # one score per test row
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # performance must exceed the configured ROC floor
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        probabilities = self.clf.predict_proba(self.X_test)
        assert_greater_equal(probabilities.min(), 0)
        assert_less_equal(probabilities.max(), 1)

    def test_prediction_proba_linear(self):
        probabilities = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(probabilities.min(), 0)
        assert_less_equal(probabilities.max(), 1)

    def test_prediction_proba_unify(self):
        probabilities = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(probabilities.min(), 0)
        assert_less_equal(probabilities.max(), 1)

    def test_prediction_proba_parameter(self):
        # an unknown method name is rejected
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring=scoring)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering of ranks follows the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of ranks follows the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        pass
# Split the validation labels off the feature frame.  The labels are the
# same for every feature set.
Y_valid1 = X_valid1['Label_<lambda>']
X_valid1.drop(columns=['Label_<lambda>'], inplace=True)

# Original test data, read so the malicious flows can be extracted after
# prediction.
orig_test_data = pd.read_csv("test_data.csv", header=None)
orig_test_data.columns = [
    'Date_Flow_Start',
    'Duration',
    'Protocol',
    'Src_IP',
    'Src_Port',
    'Direction',
    'Dst_IP',
    'Dst_Port',
    'State',
    'Source_Service',
    'Dest_Service',
    'Total_Packets',
    'BiDirection_Bytes',
    'SrcToDst_Bytes',
]

"""TRAINING on Feature Set 1 IFOREST on Default Parameters """

# Train with the detector's default contamination.
clf1 = IForest(random_state=42)
clf1.fit(X_train1)

# Threshold derived from the contamination parameter: the score of the
# a-th highest training sample, where a = round(n_train * contamination).
dec_scores = clf1.decision_scores_
dec_scores_sorted = sorted(dec_scores, reverse=True)
a = round(len(X_train1) * clf1.contamination)
print(a)
anomalies = dec_scores_sorted[:a]
threshold = anomalies[-1]

# Score the validation data.
y_valid_scores = pd.Series(clf1.decision_function(X_valid1))
valid_SrcIP = np.load('preprocessing1_valid_srcIP.npy', allow_pickle=True)
class IF(IForest):
    """Isolation Forest (IF) wrapper that builds its detector inside ``fit``."""

    def __init__(self, n_estimators=100, max_samples='auto',
                 contamination=0.1, random_state=42, verbose=1):
        """Store the hyper-parameters; no model is built here.

        Parameters
        ----------
        n_estimators : int, optional (default=100)
            Number of base estimators in the ensemble.

        max_samples : int or float, optional (default="auto")
            Number of samples drawn from X to train each base estimator.

        contamination : float in (0., 0.5), optional (default=0.1)
            Proportion of outliers in the data set; used when fitting to
            define the threshold on the decision function.

        random_state : int (default is 42)

        verbose : int (default is 1)
            Print level; the higher the value, the more is printed.
        """
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.contamination = contamination
        self.verbose = verbose
        self.random_state = random_state

    def fit(self, X_train, y_train=None):
        """Build an ``IForest`` from the stored settings and fit it.

        Parameters
        ----------
        X_train : numpy array of shape (n_samples, n_features)
            The input samples.

        y_train : Ignored
            Present for API consistency only.

        Returns
        -------
        self : object
            The fitted estimator; the detector is kept in ``self.model_``.
        """
        detector_settings = dict(
            n_estimators=self.n_estimators,
            max_samples=self.max_samples,
            contamination=self.contamination,
            max_features=1.,
            bootstrap=False,
            n_jobs=-1,
            behaviour='deprecated',  # not used any more in sklearn 0.24
            random_state=self.random_state,
            verbose=self.verbose,
        )
        self.model_ = IForest(**detector_settings)
        self.model_.fit(X=X_train)
        return self

    def decision_function(self, X):
        """Return the fitted detector's raw anomaly scores for ``X``.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            Whatever ``self.model_.decision_function(X)`` returns.
        """
        return self.model_.decision_function(X)

    def predict_proba(self, X):
        """Not available for this wrapper; always raises NotImplementedError."""
        raise NotImplementedError
def evaluation_od_train(x, y, data_name, model_name="iforest", chosen_subspace=None):
    """
    Use an anomaly detector to score every anomaly in each feature subspace
    and write two CSV files: the normalised score of each anomaly per
    subspace, and, for each anomaly, the subspace with the highest score.

    :param x: data matrix
    :param y: class information (entries equal to 1 mark anomalies)
    :param data_name: the data set name, used for naming the output files
    :param model_name: anomaly detector name, one of "iforest", "copod",
        "hbos"; default: iforest
    :param chosen_subspace: use this to only evaluate a subset of the power
        set of full feature space
    :return: (anomaly_score_df, g_truth_df): the per-subspace score table
        and the ground-truth table mapping each anomaly index to the
        subspace in which it obtains the highest anomaly score
    :raises ValueError: if ``model_name`` is not supported
    """
    # Pick the detector class once, so a bad name fails before any work.
    if model_name == "iforest":
        detector_cls = IForest
    elif model_name == "copod":
        detector_cls = COPOD
    elif model_name == "hbos":
        detector_cls = HBOS
    else:
        raise ValueError("unsupported od model")

    dim = x.shape[1]
    ano_idx = np.where(y == 1)[0]
    n_ano = len(ano_idx)

    # get all the possible feature subset or just use given subset list
    f_subsets = utils.get_subset_candidate(dim, chosen_subspace)

    # score anomalies in each subspace, generate the score matrix
    n_subsets = len(f_subsets)
    score_matrix = np.zeros([n_ano, n_subsets])
    for i in tqdm(range(n_subsets)):
        clf = detector_cls()
        clf.fit(x[:, f_subsets[i]])
        od_score = utils.min_max_norm(clf.decision_scores_)
        score_matrix[:, i] = od_score[ano_idx]

    out_dir = eva_root + "data_od_evaluation/"
    os.makedirs(out_dir, exist_ok=True)

    # score matrix to df, with the anomaly index as first column
    anomaly_score_df = pd.DataFrame(data=score_matrix,
                                    columns=[str(s) for s in f_subsets])
    anomaly_score_df.insert(0, "ano_idx", ano_idx)
    path1 = out_dir + data_name + "_score_" + model_name + ".csv"
    anomaly_score_df.to_csv(path1, index=False)

    # ground truth: for each anomaly, the subspace with its highest score
    exp_subspaces = [
        str(f_subsets[int(np.argmax(ano_score))])
        for ano_score in score_matrix
    ]
    g_truth_df = pd.DataFrame({
        "ano_idx": ano_idx,
        "exp_subspace": exp_subspaces,
    })
    # ``astype`` returns a new frame; the result has to be kept.
    g_truth_df = g_truth_df.astype({"exp_subspace": "object"})
    path2 = out_dir + data_name + "_gt_" + model_name + ".csv"
    g_truth_df.to_csv(path2, index=False)
    return anomaly_score_df, g_truth_df
# Experiment settings.
contamination = 0.1  # percentage of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Synthetic two-feature sample data.
X_train, y_train, X_test, y_test = generate_data(
    n_train=n_train,
    n_test=n_test,
    n_features=2,
    contamination=contamination,
    random_state=42,
)

# Fit the IForest detector on the training split.
clf_name = 'IForest'
clf = IForest()
clf.fit(X_train)

# Training-data outputs kept by the fitted detector.
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# Predictions for the held-out test split.
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# Report the evaluation for both splits.
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
def detect_outliers(stocks: list, all_stocks_cip: pd.DataFrame, rules=None):
    """
    Return the list of stocks (index labels) that an Isolation Forest flags
    as outliers, based on the points each stock collects from ``rules``.
    All_stocks_cip is the "change in percent" for at least the stocks present
    in the specified list.

    For every stock and every date column each rule is called with a state
    dict (market average, sector average, the stock's own move, ...) and its
    result is added to that rule's point total; a rule raising ``TypeError``
    contributes nothing for that date.  The per-stock totals form the
    feature table for the detector.

    Stocks that cannot be located (``KeyError``) are reported through
    ``warning`` and skipped.  An empty list is returned when no stock could
    be processed.
    """
    if rules is None:
        rules = default_point_score_rules()
    str_rules = {str(r): r for r in rules}
    rows = []
    stocks_by_sector_df = stocks_by_sector()  # NB: ETFs in watchlist will have no sector
    stocks_by_sector_df.index = stocks_by_sector_df["asx_code"]

    # The market average of a date does not depend on the stock: compute it
    # once per date instead of once per (stock, date).
    market_avg_by_date = {
        date: all_stocks_cip[date].mean() for date in all_stocks_cip.columns
    }

    for stock in stocks:
        try:
            sector = stocks_by_sector_df.at[stock, "sector_name"]
            sector_companies = list(stocks_by_sector_df.loc[
                stocks_by_sector_df["sector_name"] == sector].asx_code)
            # day_low_high() may raise KeyError when data is currently being fetched, so it appears here...
            day_low_high_df = day_low_high(stock, all_stocks_cip.columns)
        except KeyError:
            warning(
                None,
                "Unable to locate watchlist entry: {} - continuing without it".
                format(stock),
            )
            continue

        state = {
            "day_low_high_df": day_low_high_df,  # never changes each day, so we init it here
            "all_stocks_change_in_percent_df": all_stocks_cip,
            "stock": stock,
            "daily_range_threshold": 0.20,  # 20% at either end of the daily range gets a point
        }
        points_by_rule = defaultdict(int)
        for date in all_stocks_cip.columns:
            sector_avg = all_stocks_cip[date].filter(
                items=sector_companies).mean()
            stock_move = all_stocks_cip.at[stock, date]
            state.update({
                "market_avg": market_avg_by_date[date],
                "sector_avg": sector_avg,
                "stock_move": stock_move,
                "date": date,
            })
            for rule_name, rule in str_rules.items():
                try:
                    points_by_rule[rule_name] += rule(state)
                except TypeError:  # handle nan's in dataset safely
                    pass
        d = {"stock": stock}
        d.update(points_by_rule)
        rows.append(d)

    if not rows:
        # Nothing could be processed; set_index("stock") would raise on the
        # empty frame.
        print("Found {} outlier stocks".format(0))
        return []

    df = pd.DataFrame.from_records(rows)
    df = df.set_index("stock")

    clf = IForest()
    clf.fit(df)
    scores = clf.predict(df)
    results = [stock for stock, value in zip(df.index, scores) if value > 0]
    print("Found {} outlier stocks".format(len(results)))
    return results
def anomaly_rate(model, validation_df, freq, plot=False):
    """
    Return the fraction of validation points an isolation forest marks as anomalous.

    The model's predictions (``yhat``) for the validation timestamps are
    paired with the observed values (``y``), both columns are min-max scaled
    to [0, 1], and an ``IForest`` is fitted on the resulting two-column frame.

    Args:
        model: fitted forecasting model exposing ``start``, ``t_scale`` and
            ``predict(frame)`` returning at least ``ds`` and ``yhat`` columns
            (presumably a Prophet model - confirm against callers).
        validation_df: frame with ``ds`` (timestamps) and ``y`` (observations)
            columns. Only rows after the end of the training history are used
            when the frame overlaps it.
        freq: frequency string made of an integer followed by ``'S'`` or
            ``'D'`` (e.g. ``'1D'``), used to round the end of the history.
        plot: when true, show the outlier scores and the actual/predicted
            series with the detected anomalies highlighted.

    Returns:
        The share of examined points labelled as outliers (between 0 and 1).

    Raises:
        ValueError: if ``freq`` is not a multiple of ``'D'`` or ``'S'``, or if
            the validation data does not extend past the training history.
    """
    if not (freq[:-1].isnumeric() and freq[-1] in ('S', 'D')):
        raise ValueError(
            "Unsupported frequency format. "
            "Provide any valid frequency for pd.date_range, as multiple of 'D' or 'S'."
        )
    last_history = (model.start + model.t_scale).round(freq)

    first_validation = validation_df['ds'].iloc[0]
    last_validation = validation_df['ds'].iloc[-1]

    if not last_validation > last_history:
        # One single message: passing the fragments as separate arguments
        # would make the exception carry a tuple instead of readable text.
        raise ValueError(
            "Validation dataset has no data point after last member of time-series of historical data that "
            "the model was trained on. Please use validation dataset with last member of the time series "
            "after %s." % last_history)

    if first_validation <= last_history:
        # Keep only the part of the validation data the model has not seen.
        validation_df = validation_df.loc[
            validation_df['ds'] > last_history].dropna()[['ds', 'y']]

    start_timer = time.time()
    future = validation_df['ds'].to_frame(name='ds')
    prediction_data = model.predict(future)[['ds', 'yhat']]  # TOO SLOW!
    print("--- Prediction: %s seconds ---" % (time.time() - start_timer))

    df = pd.DataFrame({
        'y': validation_df['y'].values,
        'yhat': prediction_data['yhat'].values
    })
    scaler = MinMaxScaler(feature_range=(0, 1))
    df[['y', 'yhat']] = scaler.fit_transform(df[['y', 'yhat']])

    clf = IForest()
    clf.fit(df)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    if plot:
        is_outlier = y_train_pred == 1
        prediction_times = prediction_data['ds'].dt.to_pydatetime()
        outlier_times = prediction_data['ds'][is_outlier].dt.to_pydatetime()

        # Figure 1: outlier score per timestamp, anomalies marked in red.
        fig1 = plt.figure(facecolor='w', figsize=(10, 6))
        ax = fig1.add_subplot(111)
        ax.plot(prediction_times, y_train_scores)
        ax.plot(outlier_times, y_train_scores[is_outlier], 'r.')
        fig1.show()

        # Figure 2: observed vs predicted series with a red line per anomaly.
        fig2 = plt.figure(facecolor='w', figsize=(10, 6))
        ax = fig2.add_subplot(111)
        ax.plot(validation_df['ds'].dt.to_pydatetime(),
                validation_df['y'].values)
        ax.plot(prediction_times, prediction_data['yhat'].values)
        ax.vlines(outlier_times,
                  min(validation_df['y'].values),
                  max(validation_df['y'].values), 'r')
        fig2.show()

    return sum(y_train_pred) / len(y_train_pred)
# In[19]:

# Project `unique4` to two dimensions with t-SNE and colour each point by the
# label assigned by the previously fitted `pca4` detector.
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2)  # Reduce the redundant data
X14 = tsne.fit_transform(unique4)
plt.figure(figsize=(20, 20))
plt.scatter(x=X14[:, 0], y=X14[:, 1], c=pca4.labels_)
plt.show()

# In[17]:

# Fit an isolation forest on `unique2`.
from pyod.models.iforest import IForest

iforest2 = IForest()
iforest2.fit(unique2)

# In[15]:

# Same 2-D t-SNE view for `unique2`, coloured by the isolation-forest labels.
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2)  # Reduce the redundant data
X22 = tsne.fit_transform(unique2)
plt.figure(figsize=(20, 20))
plt.scatter(x=X22[:, 0], y=X22[:, 1], c=iforest2.labels_)
plt.show()

# In[22]:

iforest3 = IForest()
class TestIForest(unittest.TestCase):
    """Sanity checks for the IForest detector on generated data."""

    def setUp(self):
        """Generate a train/test split and fit a seeded detector on it."""
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        generated = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = generated

        self.clf = IForest(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_unit_interval(self, values):
        """Assert that every value lies within [0, 1]."""
        assert (values.min() >= 0)
        assert (values.max() <= 1)

    def test_parameters(self):
        # Attributes that must exist and be populated once fit() has run.
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'estimators_',
            'estimators_samples_',
            'max_samples_',
        )
        for attribute in fitted_attributes:
            assert (hasattr(self.clf, attribute) and
                    getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        test_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, test_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        test_labels = self.clf.predict(self.X_test)
        assert_equal(test_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # An unknown conversion method must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        train_labels = self.clf.fit_predict(self.X_train)
        assert_equal(train_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then each supported metric by name.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)

        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=3)
        assert_array_less(test_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, test_ranks)

    def test_predict_rank_normalized(self):
        test_scores = self.clf.decision_function(self.X_test)
        test_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=3)
        assert_array_less(test_ranks, 1.01)
        assert_array_less(-0.1, test_ranks)

    def test_model_clone(self):
        # Cloning only has to succeed; the copy itself is not inspected.
        clone(self.clf)

    def tearDown(self):
        pass
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.data import visualize

if __name__ == "__main__":
    # Settings for the synthetic data set.
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        contamination=contamination,
    )

    # Fit the IForest detector on the training split.
    clf_name = 'IForest'
    clf = IForest()
    clf.fit(X_train)

    # Results on the training data are stored on the fitted detector.
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # Results on the test data are computed on demand.
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # Report the evaluation for each split under its heading.
    reports = (
        ("\nOn Training Data:", y_train, y_train_scores),
        ("\nOn Test Data:", y_test, y_test_scores),
    )
    for heading, truth, scores in reports:
        print(heading)
        evaluate_print(clf_name, truth, scores)
class Remove_Outliers(BaseEstimator, TransformerMixin):
    """
    Drop the rows that every selected detector agrees are outliers.

    Each detector named in ``methods`` is fitted on the feature columns (all
    columns except ``target``) and casts a 0/1 vote per row, where 1 means
    "outlier". A row is removed only when the vote is unanimous.

    Args:
        target: name of the target column, excluded from the detector input.
        contamination: expected outlier fraction passed to the 'iso', 'knn'
            and 'pca' detectors.
        random_state: seed passed to the 'iso' and 'pca' detectors.
        methods: detector names to combine; any of 'iso' (isolation forest),
            'knn', 'pca' and 'mcd' (elliptic envelope).
    """

    def __init__(self, target, contamination=.20, random_state=42,
                 methods=('knn', 'iso', 'mcd')):
        self.target = target
        self.contamination = contamination
        self.random_state = random_state
        # Immutable default: a list default would be shared by all instances.
        self.methods = methods

    def fit(self, data, y=None):
        """Do nothing; the detectors are fitted in ``fit_transform``.

        Returns ``self`` so the call can be chained like other estimators.
        """
        return self

    def transform(self, data, y=None):
        """Return ``data`` unchanged: rows are only removed while fitting."""
        return data

    def fit_transform(self, dataset, y=None):
        """Fit the selected detectors and return ``dataset`` without outliers.

        The rows voted out are stored in ``self.outliers`` together with the
        per-detector vote columns and the ``vote_outlier`` total.

        Raises:
            KeyError: if ``methods`` contains a name that is not supported.
        """
        data = dataset.copy()

        if not self.methods:
            # With no detectors the "unanimous" test (0 votes == 0 methods)
            # would be true for every row; nothing can be called an outlier.
            self.outliers = data.iloc[0:0]
            return dataset.copy()

        # Every detector sees the same feature matrix; computing it once
        # keeps the vote columns added below out of the detector input.
        features = dataset.drop(self.target, axis=1)

        if 'iso' in self.methods:
            self.iso_forest = IForest(contamination=self.contamination,
                                      random_state=self.random_state,
                                      behaviour='new')
            self.iso_forest.fit(features)
            data['iso'] = self.iso_forest.predict(features)

        if 'knn' in self.methods:
            self.knn_out = KNN(contamination=self.contamination)
            self.knn_out.fit(features)
            data['knn'] = self.knn_out.predict(features)

        if 'pca' in self.methods:
            self.out_pca = PCA_RO(contamination=self.contamination,
                                  random_state=self.random_state)
            self.out_pca.fit(features)
            data['pca'] = self.out_pca.predict(features)

        # use for those features which are gaussian distributed
        if 'mcd' in self.methods:
            self.mcd = EllipticEnvelope(contamination=0.01)
            self.mcd.fit(features)
            # EllipticEnvelope.predict labels outliers -1 and inliers +1;
            # convert to the 1 = outlier / 0 = inlier votes used above.
            data['mcd'] = (self.mcd.predict(features) == -1).astype(int)

        data['vote_outlier'] = 0
        for method in self.methods:
            data['vote_outlier'] = data['vote_outlier'] + data[method]

        self.outliers = data[data['vote_outlier'] == len(self.methods)]

        return dataset[~dataset.index.isin(self.outliers.index)]