def test_iforest_subsampled_features():
    """Non-regression test for #5732, which failed at predict.

    Fits a forest that subsamples features (``max_features=0.8``) and only
    checks that ``predict`` runs without raising.
    """
    random_state = check_random_state(0)
    features, targets = boston.data[:50], boston.target[:50]
    X_train, X_test, y_train, _ = train_test_split(
        features, targets, random_state=random_state)

    forest = IsolationForest(max_features=0.8)
    forest.fit(X_train, y_train)
    forest.predict(X_test)
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input.

    For every sparse format and parameter combination, a forest trained on
    the sparse matrix must predict exactly what a forest trained on the
    equivalent dense array predicts (both use the same ``random_state``).
    """
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            # One comparison is enough; the original ran the identical
            # assertion twice in a row.
            assert_array_equal(sparse_results, dense_results)
def outlier_rejection(X, y):
    """Return the rows of ``X`` and ``y`` that an IsolationForest labels 1.

    The forest is fitted on ``X`` itself; rows whose prediction is not 1 are
    dropped from both outputs. ``rng`` is read from the enclosing scope.
    """
    detector = IsolationForest(max_samples=100,
                               contamination=0.4,
                               random_state=rng)
    detector.fit(X)
    keep = detector.predict(X) == 1
    return X[keep], y[keep]
def _predict_self(self):
    """Fit an IsolationForest on ``self.num_X`` and predict the same data.

    ``self.frac`` is passed as the forest's ``contamination``.
    """
    detector = IsolationForest(contamination=self.frac)
    detector.fit(self.num_X)
    labels = detector.predict(self.num_X)
    return labels
def outlier_rejection(X, y):
    """Resample the dataset by keeping only the predicted inliers.

    An IsolationForest is fitted on ``X``; the samples it predicts as 1 are
    returned together with their matching entries of ``y``. ``rng`` is read
    from the enclosing scope.
    """
    forest = IsolationForest(max_samples=100, contamination=0.4,
                             random_state=rng)
    forest.fit(X)
    predicted = forest.predict(X)
    inlier_mask = predicted == 1
    X_kept = X[inlier_mask]
    y_kept = y[inlier_mask]
    return X_kept, y_kept
def test_iforest_parallel_regression():
    """Check parallel regression.

    Predictions must not depend on ``n_jobs``, neither at predict time nor
    at fit time.
    """
    rng = check_random_state(0)
    X_train, X_test, _, _ = train_test_split(boston.data,
                                             boston.target,
                                             random_state=rng)

    # Fitted with 3 jobs, then predicted with 1 and with 2 jobs.
    parallel_forest = IsolationForest(n_jobs=3, random_state=0).fit(X_train)
    parallel_forest.set_params(n_jobs=1)
    pred_one_job = parallel_forest.predict(X_test)
    parallel_forest.set_params(n_jobs=2)
    pred_two_jobs = parallel_forest.predict(X_test)
    assert_array_almost_equal(pred_one_job, pred_two_jobs)

    # Fitted and predicted with a single job.
    serial_forest = IsolationForest(n_jobs=1, random_state=0).fit(X_train)
    pred_serial = serial_forest.predict(X_test)
    assert_array_almost_equal(pred_one_job, pred_serial)
def IsolationForest_calulate(train_data_one, test_data):
    """Return the share of ``test_data`` rows predicted as 1.

    A default IsolationForest is trained on ``train_data_one`` and used to
    predict ``test_data``; the result is (number of predictions equal to 1)
    divided by (number of predictions).
    """
    # Anomaly detection model
    detector = IsolationForest()
    # Train it
    detector.fit(train_data_one)
    # Predict the test rows
    labels = detector.predict(test_data)
    # Fraction of rows labelled 1
    inlier_count = sum(1 for label in labels if label == 1)
    return inlier_count / len(labels)
def test_iforest_works():
    """Check that IsolationForest ranks the two toy outliers as most abnormal.

    The ranking is checked on the negated ``decision_function`` (higher
    means more abnormal), as the parametrized variant of this test does.
    Comparing ``predict`` output directly would compare inlier/outlier
    labels rather than scores, so the assertion could not hold when the
    outliers are labelled -1.
    """
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng)
    clf.fit(X)
    anomaly_score = -clf.decision_function(X)
    pred = clf.predict(X)

    # predict must return one value per sample
    assert len(pred) == len(X)
    # assert detect outliers: both outliers score above every inlier
    assert_greater(np.min(anomaly_score[-2:]), np.max(anomaly_score[:-2]))
def test_iforest_works(contamination):
    """Check scores and labels of IsolationForest on a toy sample.

    The last two samples are outliers: their negated decision function must
    exceed that of every other sample, and ``predict`` must label them -1
    and all other samples 1.
    """
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    forest = IsolationForest(random_state=rng, contamination=contamination)
    forest.fit(X)
    anomaly_scores = -forest.decision_function(X)
    labels = forest.predict(X)

    # assert detect outliers:
    lowest_outlier_score = np.min(anomaly_scores[-2:])
    highest_inlier_score = np.max(anomaly_scores[:-2])
    assert_greater(lowest_outlier_score, highest_inlier_score)
    assert_array_equal(labels, 6 * [1] + 2 * [-1])
def isolationForest(self, settings, mname, data): ''' :param settings: -> settings dictionary :param mname: -> name of serialized cluster :return: -> isolation forest instance :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False, max_features:1.0, n_jobs:1, random_state:None, verbose:0} ''' # rng = np.random.RandomState(42) if settings['random_state'] == 'None': settings['random_state'] = None if isinstance(settings['bootstrap'], str): settings['bootstrap'] = str2Bool(settings['bootstrap']) if isinstance(settings['verbose'], str): settings['verbose'] = str2Bool(settings['verbose']) if settings['max_samples'] != 'auto': settings['max_samples'] = int(settings['max_samples']) # print type(settings['max_samples']) for k, v in settings.iteritems(): logger.info('[%s] : [INFO] IsolationForest %s set to %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v) print "IsolationForest %s set to %s" % (k, v) try: clf = IsolationForest(n_estimators=int(settings['n_estimators']), max_samples=settings['max_samples'], contamination=float(settings['contamination']), bootstrap=settings['bootstrap'], max_features=float(settings['max_features']), n_jobs=int(settings['n_jobs']), random_state=settings['random_state'], verbose=settings['verbose']) except Exception as inst: logger.error('[%s] : [ERROR] Cannot instanciate isolation forest with %s and %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args) print "Error while instanciating isolation forest with %s and %s" % (type(inst), inst.args) sys.exit(1) # clf = IsolationForest(max_samples=100, random_state=rng) # print "*&*&*&& %s" % type(data) try: clf.fit(data) except Exception as inst: logger.error('[%s] : [ERROR] Cannot fit isolation forest model with %s and %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args) sys.exit(1) predict = clf.predict(data) print "Anomaly Array:" print predict 
self.__serializemodel(clf, 'isoforest', mname) return clf
def check(self, timeseries_data, check_value):
    """Classify ``check_value`` against a series with an IsolationForest.

    The forest is trained on the series values (one single-feature row per
    value) and asked to predict ``check_value``.

    :return: ``("no alarm", 0)`` when the prediction is not negative;
        otherwise ``("uprush", prediction)`` if ``check_value`` is greater
        than the last series value, else ``("anticlimax", prediction)``.
    """
    history = timeseries_data.values.tolist()
    query = [[check_value]]

    # Isolation-forest outlier check; the training data is a list of
    # one-element rows.
    x_train = [[value] for value in history]

    # NOTE(review): `behaviour` may not be accepted by every scikit-learn
    # release - confirm against the pinned version.
    clf = IsolationForest(behaviour='new', max_samples=100)
    clf.fit(x_train)
    scores_pred = clf.predict(query)

    label = scores_pred[-1]
    if not label < 0:
        return "no alarm", 0
    if check_value > history[-1]:
        return "uprush", label
    return "anticlimax", label
def predict(self, X, window=DEFAULT_WINDOW):
    """
    Predict if a particular sample is an outlier or not.

    Every value of ``X`` is paired with a positional feature, an
    IsolationForest is fitted on those pairs, and the verdict for the last
    pair is returned.

    :param X: the time series to detect of
    :param type X: pandas.Series
    :param window: the length of window
    :param type window: int
    :return: 1 denotes normal, 0 denotes abnormal.
    """
    x_train = (list(range(0, 2 * window + 1))
               + list(range(0, 2 * window + 1))
               + list(range(0, window + 1)))
    # Materialise the pairs: on Python 3 ``zip`` is a one-shot iterator,
    # but the samples are needed twice (fit and predict).
    sample_features = list(zip(x_train, X))
    # Keyword arguments keep each setting bound to the intended parameter
    # regardless of the constructor's positional order.
    clf = IsolationForest(n_estimators=self.n_estimators,
                          max_samples=self.max_samples,
                          contamination=self.contamination,
                          max_features=self.max_feature,
                          bootstrap=self.bootstrap,
                          n_jobs=self.n_jobs,
                          random_state=self.random_state,
                          verbose=self.verbose)
    clf.fit(sample_features)
    predict_res = clf.predict(sample_features)
    if predict_res[-1] == -1:
        return 0
    return 1
def outlier_removal(df, col, method, params):
    """Run an outlier detector on ``df[col]`` and store its output in ``df``.

    :param df: DataFrame that is read from and extended in place with
        ``('meta', ...)`` result columns
    :param col: column selection handed to the detector
    :param method: ``'Isolation Forest'`` or ``'Local Outlier Factor'``
    :param params: keyword arguments for the detector's constructor
    :return: ``(df, detector)``
    :raises ValueError: if ``method`` is not one of the two supported names
        (previously the detector variable was simply left unbound)
    """
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        raise ValueError('Unsupported outlier removal method: %r' % (method,))

    values = np.array(df[col])
    suffix = method + str(params)

    if method == 'Isolation Forest':
        do_outlier_removal.fit(values)
        outlier_scores = do_outlier_removal.decision_function(values)
        df[('meta', 'Outlier Scores - ' + suffix)] = outlier_scores
        is_outlier = do_outlier_removal.predict(values)
        df[('meta', 'Outliers - ' + suffix)] = is_outlier
    else:
        # fit_predict fits the model itself, so no separate fit() is needed.
        is_outlier = do_outlier_removal.fit_predict(values)
        df[('meta', 'Outliers - ' + suffix)] = is_outlier
        df[('meta', 'Outlier Factor - ' + suffix)] = do_outlier_removal.negative_outlier_factor_
    return df, do_outlier_removal
def single_user_analytics(user_df):
    """
    analyses the data of a user using the IsolationForest algorithm and returns outliers.

    The "Date" column is dropped and the remaining columns are one-hot
    encoded before fitting. ``user_df`` itself is left unmodified.

    :param user_df: dataframe with the columns: Date, Method, Path, Id, Resource, Status, Src
    :type user_df: dataframe
    :return: index values of the rows predicted as outliers (-1)
    :rtype: numpy array
    """
    print('analyzing')
    # ``axis`` is passed by keyword; the positional form is not accepted by
    # every pandas release.
    converted_user_df = user_df.drop("Date", axis=1)
    converted_user_df = pd.get_dummies(converted_user_df,
                                       columns=["Method", "Path", "Id", "Resource", "Status", "Src"])
    # NOTE(review): contamination=0 may be rejected by scikit-learn releases
    # that validate the range - confirm against the pinned version.
    clf = IsolationForest(contamination=0,
                          max_features=len(converted_user_df.columns.values))
    clf.fit(converted_user_df)
    pred = clf.predict(converted_user_df)
    # Select straight from the index instead of writing a helper column
    # into the caller's frame.
    return user_df.index[pred == -1].values
def apply_ISOForest(df, contamination=0.1, columns_to_use=None):
    '''
    Aggregate packet rows per (integer) second and flag anomalous seconds.

    input  -- pandas data frame with columns [No. Time Source Destination Protocol Length Info]
              contamination: expected share of anomalies, forwarded to the forest
              columns_to_use: a single protocol name (str), a list of protocol
              names, or None / empty to use every column except Time
    output -- numpy array with {0, 1} where 1 is anomaly (one entry per Time group)
    '''
    if columns_to_use is None:
        columns_to_use = []

    iso_frame = df.copy()
    iso_frame.drop(columns=['No.', 'Source', 'Destination', 'Info'], inplace=True)
    iso_frame = pd.get_dummies(iso_frame, columns=['Protocol'])
    iso_frame['Time'] = list(map(int, iso_frame['Time'].values))

    if isinstance(columns_to_use, str):
        names = ['Length', 'Protocol_' + columns_to_use]
    elif len(columns_to_use) == 0:
        names = list(iso_frame.columns)
        names.remove('Time')
    else:
        # Keep the one-hot columns of the requested protocols. The original
        # iterated an empty list and referenced an undefined name here, so
        # only 'Length' ever got selected.
        prefix = 'Protocol_'
        wanted = set(columns_to_use)
        names = [name for name in iso_frame.columns
                 if name.startswith(prefix) and name[len(prefix):] in wanted]
        names.append('Length')

    # Sum the selected features per Time group.
    iso_frame2 = iso_frame.groupby('Time')[names].sum()

    # Forward the caller's contamination; it used to be silently ignored.
    iso = IsolationForest(n_jobs=-1, n_estimators=20, contamination=contamination)
    iso.fit(iso_frame2)
    pred_out = iso.predict(iso_frame2)
    # Re-encode: inlier 1 -> 0, outlier -1 -> 1.
    pred_out[pred_out == 1] = 0
    pred_out[pred_out == -1] = 1
    return pred_out
def main():
    """Label the rows of all CSV files in ./csv_files and save the results.

    Every file is read and concatenated into one frame, the nominal columns
    are label-encoded, an IsolationForest labels each row (-1: anomaly,
    1: normal), the labelled frame is written under ./processed_csv and the
    fitted model is pickled to model.sav.
    """
    # Read all the csv files
    csvPath = "./csv_files"
    csvFiles = [f for f in listdir(csvPath) if isfile(join(csvPath, f))]
    dfs = []
    for cv in csvFiles:
        print("CSV Processing: " + cv)
        dfs.append(pd.read_csv(csvPath + '/' + cv, index_col=False))
    df = pd.concat(dfs, ignore_index=True)

    # Turn every nominal column to numeric
    nom_cols = ['ip_flags', 'tcp_udp_flags']
    for c in nom_cols:
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c])

    # Use the isolation forest to find the anomalies -1: anomaly 1:normal
    clf = IsolationForest(n_estimators=10,
                          max_samples=int(0.2 * len(df['time_diff'])) + 1,
                          contamination='auto',
                          behaviour='new')
    clf.fit(df)
    df['label'] = clf.predict(df)
    totalNormal = len(df[df['label'] == 1])
    totalAnomalies = len(df[df['label'] == -1])
    print("Normal: " + str(totalNormal))
    print("Anomaly: " + str(totalAnomalies))

    # The combined frame is written under the name of the last CSV read;
    # this used to rely on the loop variable outliving the loop.
    last_csv = csvFiles[-1]
    df.to_csv('./processed_csv/' + 'processed_' + last_csv, index=False)

    # Save the model; the context manager closes the handle.
    filename = 'model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(clf, model_file)
def visualize():
    """Render a scatter plot of normal and anomalous connections.

    Reads data/cluster.csv, normalizes every column except 'connections',
    reduces it with PCA, labels the reduced points with an IsolationForest
    and passes two plotly scatter traces (first two components) to the
    'visualize.html' template as JSON.
    """
    cluster_dataframe = pd.read_csv("data/cluster.csv")
    features = normalize(cluster_dataframe.drop(['connections'], axis=1))
    clusters = 4
    reduced_data = PCA(n_components=clusters).fit_transform(features)

    # Outlier Test
    model = IsolationForest(contamination=0.05)
    model.fit(reduced_data)
    outliers = model.predict(reduced_data)

    outlier_frame = pd.DataFrame()
    outlier_frame['connections'] = cluster_dataframe['connections']
    outlier_frame['X'] = reduced_data[:, 0]
    outlier_frame['Y'] = reduced_data[:, 1]
    outlier_frame['isOutlier'] = outliers

    def _trace(label, trace_name, color):
        # One scatter trace for the rows carrying the given forest label.
        rows = outlier_frame.loc[outlier_frame.isOutlier == label]
        return go.Scatter(
            x=rows['X'],
            y=rows['Y'],
            text=rows['connections'],
            hoverinfo='text',
            name=trace_name,
            mode='markers',
            marker=dict(color=color),
        )

    data = [
        _trace(1, "Normal Connections", 'rgb(34,140,217)'),
        _trace(-1, "Anomalous Connections", 'rgb(235,82,82)'),
    ]
    graphJSON = json.dumps(data, cls=plotly.utils.PlotlyJSONEncoder)
    return render_template('visualize.html', graphJSON=graphJSON)
def _score_predictions(test_label, predictions):
    """Pair the true 'Label' values with predictions and pass them to scoring()."""
    frame = pd.DataFrame(data=(test_label['Label']).to_numpy(), columns=['actual'])
    frame['predict'] = predictions
    return scoring(frame)


def anomaly_test(train_data, train_label, test_data, test_label):
    """Train three detectors on normal training rows and score them on test data.

    Only training rows whose 'Label' equals 1 are kept, and a 20% sample of
    them (fixed seed) is used for fitting. One-class SVM, Isolation Forest
    and Local Outlier Factor are each fitted on that sample, asked to
    predict ``test_data``, and their ``scoring`` result is printed.

    :param train_data: training features
    :param train_label: training labels with a 'Label' column
    :param test_data: test features
    :param test_label: test labels, passed through ``labeling`` before use
    """
    train_total = pd.concat([train_data, train_label], axis=1)
    train_total = train_total[train_total['Label'] == 1]
    test_label = labeling(test_label)
    train_total = train_total.drop(['Label'], axis=1)
    train_total = train_total.sample(frac=0.2, random_state=42)

    print("\nOne-class SVM")
    ocs = OneClassSVM(kernel="linear", gamma='auto')
    ocs.fit(train_total)
    print(_score_predictions(test_label, ocs.predict(test_data)))

    print("\nIsolation Forest")
    iforset = IsolationForest(max_samples=100, contamination=0.1, random_state=42)
    iforset.fit(train_total)
    print(_score_predictions(test_label, iforset.predict(test_data)))

    print("\nLocal Outlier Factor")
    # novelty=True lets the model fitted on the training sample predict the
    # test data. The original called fit_predict(test_data), which refits
    # on the test set and discards the training fit.
    lof = LocalOutlierFactor(n_neighbors=15, novelty=True)
    lof.fit(train_total)
    print(_score_predictions(test_label, lof.predict(test_data)))
def Preprocessing_train():
    """Fit the outlier model and min-max scalers on the global train/test data.

    Works entirely through module globals: fits ``clf`` on ``train``,
    resets the index of ``train``, records its column lists, fits the
    scalers and replaces ``train_dataset`` / ``test`` with scaled frames.
    The forest's predictions are computed but not used to filter rows.

    :return: the string "Preprocessing"
    """
    global col_train
    global col_train_bis
    global train
    global clf
    global prepro_y
    global prepro
    global mat_new
    global test
    global train_dataset

    clf = IsolationForest(random_state=42)
    clf.fit(train)
    inlier_flags = pd.DataFrame(clf.predict(train), columns=['Top'])
    # Row positions of the inliers; evaluated but not used for filtering.
    inlier_flags[inlier_flags['Top'] == 1].index.values
    train.reset_index(drop=True, inplace=True)

    col_train = list(train.columns)
    col_train_bis = list(train.columns)
    col_train_bis.remove('DEM')

    train_matrix = np.matrix(train)
    test_matrix = np.matrix(test)
    mat_new = np.matrix(train.drop('DEM', axis=1))
    target_column = np.array(train.DEM).reshape((len(train.DEM), 1))

    # preprocessing: one scaler for the target, one for all training
    # columns, one for the training columns without 'DEM'
    prepro_y = MinMaxScaler()
    prepro_y.fit(target_column)
    prepro = MinMaxScaler()
    prepro.fit(train_matrix)
    feature_scaler = MinMaxScaler()
    feature_scaler.fit(mat_new)

    train_dataset = pd.DataFrame(prepro.transform(train_matrix), columns=col_train)
    test = pd.DataFrame(feature_scaler.transform(test_matrix), columns=col_train_bis)
    print(train_dataset)
    return "Preprocessing"
def outlier_prediction(x_train, y_train):
    """Drop the training samples that an IsolationForest flags as outliers.

    The forest is fitted on all features of ``x_train``. Its prediction is
    1 for an inlier and -1 for an outlier; only the inlier rows of
    ``x_train`` and ``y_train`` are returned.

    :return: ``(x_train_modified, y_train_modified)``
    """
    rng = np.random.RandomState(42)
    clf_all_features = IsolationForest(max_samples=100, random_state=rng)
    clf_all_features.fit(x_train)

    # Predict if a particular sample is an outlier using all features for
    # higher dimensional data set.
    y_pred_train = clf_all_features.predict(x_train)

    # Exclude suggested outlier samples. A vectorised comparison yields a
    # real boolean mask; np.array(map(...)) wraps an iterator on Python 3
    # and cannot be used for indexing.
    outlier_map_out_train = np.asarray(y_pred_train) == 1
    x_train_modified = x_train[outlier_map_out_train, ]
    y_train_modified = y_train[outlier_map_out_train, ]
    return x_train_modified, y_train_modified
def method_isolation_forest():
    """Train an IsolationForest on the "Buy" samples and score it on a hold-out set.

    Sequences are padded to the length of the shortest sequence in the data
    set, the forest is fitted on training rows labelled ``labels["Buy"]``,
    and accuracy plus F1 (positive class -1) are printed for the test split.
    """
    X, Y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1)
    max_review_length = min(map(len, X))
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length, dtype=float)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length, dtype=float)
    X_train = X_train[y_train == labels["Buy"]]

    model = IsolationForest(contamination=0.5)
    model.fit(X_train)
    yhat = model.predict(X_test)

    # Map the ground truth onto the forest's -1/1 encoding BEFORE computing
    # any metric; accuracy used to be computed on the unmapped labels. Both
    # masks are taken first so the two assignments cannot interfere.
    not_buy_mask = y_test == labels["NotBuy"]
    buy_mask = y_test == labels["Buy"]
    y_test[not_buy_mask] = -1
    y_test[buy_mask] = 1

    print("accuracy: ", metrics.accuracy_score(y_test, yhat))
    score = f1_score(y_test, yhat, pos_label=-1)
    print('F1 Score: %.3f' % score)
def test_tvm_iforest_remainder_batch(self):
    """Check the batch-converted TVM model against the original forest.

    105 samples with a batch size of 10 leave a remainder of 5;
    ``predict``, ``decision_function`` and ``score_samples`` of the
    converted model must all match the original within 1e-06.
    """
    warnings.filterwarnings("ignore")
    n_samples, n_features, num_classes = 105, 200, 2
    model = IsolationForest(n_estimators=10, max_samples=2)

    np.random.seed(0)
    X = np.array(np.random.rand(n_samples, n_features), dtype=np.float32)
    y = np.random.randint(num_classes, size=n_samples)
    model.fit(X, y)

    batch_size = 10
    remainder_size = X.shape[0] % batch_size
    hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :], remainder_size)
    self.assertIsNotNone(hb_model)

    tolerance = dict(rtol=1e-06, atol=1e-06)
    np.testing.assert_allclose(model.predict(X), hb_model.predict(X), **tolerance)
    np.testing.assert_allclose(model.decision_function(X), hb_model.decision_function(X), **tolerance)
    np.testing.assert_allclose(model.score_samples(X), hb_model.score_samples(X), **tolerance)
def datacleaning_if(input_file, output_file):
    """Remove IsolationForest outliers from a CSV file and write the rest.

    Rows with missing values are dropped first. The forest is fitted on
    every column except the last one; rows predicted -1 are removed and the
    remaining rows are written to ``output_file`` under the original header.

    :param input_file: path of the CSV file to clean
    :param output_file: path of the cleaned CSV file
    """
    # Read the header row exactly as it appears in the file.
    with open(input_file, 'r') as f:
        data_attribute = list(next(csv.reader(f)))

    # Read the data and drop incomplete rows.
    tn = pd.read_csv(input_file)
    tn.dropna(inplace=True)
    train_x = np.array(tn)[:, :-1]

    # Run outlier detection over all rows.
    clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.0001,
                          max_features=1.0, bootstrap=False, n_jobs=1,
                          random_state=None, verbose=0).fit(train_x)
    # pred holds one prediction per row: 1 or -1.
    pred = clf.predict(train_x)

    # Filter the SAME frame the predictions were computed on. Re-reading
    # the file and slicing its first len(pred) rows misaligns predictions
    # whenever dropna removed a row.
    cleaned = tn[pred == 1]

    # Write the cleaned data; the context manager closes the file.
    with open(output_file, 'w', newline='') as csvfile2:
        writer = csv.writer(csvfile2)
        writer.writerow(data_attribute)
        writer.writerows(cleaned.values)
def remove_outliers(df, field, contamination=0.01, verbose=False):
    '''Run an Isolation Forest on one field and drop the rows it flags.

    :param df: source dataframe (not modified)
    :param field: name of the column to test for outliers
    :param contamination: forwarded to the Isolation Forest
    :param verbose: when true, print how many outliers were removed
    :return: a deep copy of ``df`` restricted to rows predicted as 1
    '''
    # Use a deep copy of data to avoid making changes to original
    X = df[field].copy(deep=True)
    X = X.values.reshape(-1, 1)

    # Prepare and fit the Isolation Forest
    IsoFor = IsolationForest(bootstrap=True, n_jobs=-1, contamination=contamination)
    IsoFor.fit(X)

    # Make predictions
    y_pred = IsoFor.predict(X)

    if verbose:
        # Count the -1 labels explicitly. The first np.unique bucket is
        # only the outlier count when at least one outlier exists.
        num_outliers = int(np.sum(y_pred == -1))
        print('{} outliers detected and removed from dataframe.'.format(num_outliers))

    # Truth value of non_outliers (equal to 1)
    non_outliers = y_pred == 1

    # Return new df
    return df[non_outliers].copy(deep=True)
def compute_scores(o, n_iterations, pdb_ids, ab_truth, ab_coord, ab_X, ab_X_weights, precision, recall):
    """Accumulate mean precision/recall for outlier fraction index ``o``.

    For every entry of ``pdb_ids`` the forest is refitted ``n_iterations``
    times on ``ab_X[i]`` (weighted by ``ab_X_weights[i]``) and scored on
    ``ab_coord[i]`` against ``ab_truth[i]``. The per-entry averages are
    added to ``precision[o]`` / ``recall[o]``, which are finally divided by
    the number of entries. Both containers are updated in place.
    """
    fraction = outlier_fractions[o]
    print(fraction)
    forest = IsolationForest(contamination=fraction, n_jobs=4)
    n_structures = len(pdb_ids)

    for i in range(n_structures):
        print(pdb_ids[i])
        precision_sum = 0
        recall_sum = 0
        for _ in range(n_iterations):
            forest.fit(ab_X[i], sample_weight=ab_X_weights[i])
            patch_pred = forest.predict(ab_coord[i])
            p, r, _, _ = precision_recall_fscore_support(ab_truth[i], patch_pred, average='binary')
            precision_sum += p
            recall_sum += r
        precision_sum /= n_iterations
        recall_sum /= n_iterations
        precision[o] += precision_sum
        recall[o] += recall_sum

    precision[o] /= len(pdb_ids)
    recall[o] /= len(pdb_ids)
def do_isoForest(df, kwargs=None):
    """
    Runs an isolation forest looking for outliers.

    :param df: data to fit and predict
    :param kwargs: constructor arguments for IsolationForest; when None a
        built-in default configuration is used
    :return: ``(fitted forest, predictions for df)``
    """
    default_kwargs = {
        'behaviour': 'new',
        'random_state': 42,
        'contamination': 'auto',
        'max_features': 1,
    }
    forest_kwargs = default_kwargs if kwargs is None else kwargs
    forest = IsolationForest(**forest_kwargs).fit(df)
    return forest, forest.predict(df)
def outliers_isolation_forest(df, target_encoded, encoder=None, contamination=0.001):
    """Using 'Isolation Forest', filters the outliers data points

    The forest is (re)fitted on every column except ``target_encoded`` and
    rows predicted as -1 are removed.

    Args:
        df (DataFrame): Source data
        target_encoded (str): Target column name, excluded from the features
        encoder (obj, optional): Object of the type 'IsolationForest'. A new one
            is created when None. Defaults to None.
        contamination (float, optional): Threshold to remove the outliers; only
            used when a new encoder is created. Defaults to 0.001.

    Returns:
        tuple: (filtered DataFrame, fitted encoder)
    """
    if encoder is None:
        encoder = IsolationForest(contamination=contamination)

    encoder.fit(df.drop([target_encoded], axis=1))
    labels = encoder.predict(df.drop([target_encoded], axis=1))
    keep_rows = list(labels != -1)
    return df.loc[keep_rows, :], encoder
def isolutionforest(DO):
    """Remove the rows of ``DO`` that an IsolationForest flags as outliers.

    Each flagged row is drawn as a red scatter point and its index is
    printed. All flagged rows are then deleted in a single ``np.delete``
    call instead of one call (and one array copy) per outlier.

    :param DO: data to fit, predict and filter
    :return: ``DO`` itself when nothing was flagged, otherwise a new array
        without the flagged rows
    """
    rng = np.random.RandomState(42)
    # contamination is the expected share of anomalous samples
    clf = IsolationForest(random_state=rng, contamination=0.025)
    clf.fit(DO)
    pre = clf.predict(DO)

    outlier_rows = [i for i, label in enumerate(pre) if label == -1]
    for i in outlier_rows:
        plt.scatter(i, DO[i], c='red')
        print(i)

    if not outlier_rows:
        return DO
    return np.delete(DO, outlier_rows, 0)
def anomalies():
    """Render the list of connections flagged as anomalous.

    Reads data/cluster.csv, normalizes every column except 'connections',
    reduces it with PCA, labels the reduced points with an IsolationForest
    and passes the 'connections' values of the rows labelled -1 to the
    'anomalies.html' template.
    """
    cluster_dataframe = pd.read_csv("data/cluster.csv")
    features = normalize(cluster_dataframe.drop(['connections'], axis=1))
    clusters = 4
    reduced_data = PCA(n_components=clusters).fit_transform(features)

    # Outlier Test
    model = IsolationForest(contamination=0.05)
    model.fit(reduced_data)
    outliers = model.predict(reduced_data)

    outlier_frame = pd.DataFrame()
    outlier_frame['connections'] = cluster_dataframe['connections']
    outlier_frame['X'] = reduced_data[:, 0]
    outlier_frame['Y'] = reduced_data[:, 1]
    outlier_frame['isOutlier'] = outliers

    # Both subsets are selected as in the original; only the anomalous one
    # is rendered.
    is_normal = outlier_frame.isOutlier == 1
    is_anomalous = outlier_frame.isOutlier == -1
    normal_connection = outlier_frame.loc[is_normal]
    anomalous_connection = outlier_frame.loc[is_anomalous]
    flagged = list(anomalous_connection['connections'])
    return render_template('anomalies.html', items=flagged)
def test_isolationforest():
    """Check counterfactual generation for an IsolationForest.

    An inlier must be turned into an outlier and an outlier into an inlier;
    the dict return format and the ceml model wrapper are checked too.
    """
    # Load data
    X, _ = make_blobs(n_samples=400, centers=[[0, 0], [0, 0]], cluster_std=0.5,
                      n_features=2, random_state=42)
    X_outlier = np.random.RandomState(42).uniform(low=-6, high=6, size=(50, 2))

    # Create and fit model
    model = IsolationForest(random_state=42)
    model.fit(X)

    # Compute counterfactuals: (sample, its current label, requested label)
    cases = [
        (X[0, :], 1, -1),
        (X_outlier[1, :], -1, 1),
    ]
    for x, y_current, y_target in cases:
        assert model.predict([x]) == y_current

        x_cf, y_cf, _ = generate_counterfactual(model, x, y_target=y_target,
                                                return_as_dict=False)
        assert y_cf == y_target
        assert model.predict(np.array([x_cf])) == y_target

    # Same request as the last case, returned as a dictionary
    cf = generate_counterfactual(model, x, y_target=y_target, return_as_dict=True)
    assert cf["y_cf"] == y_target
    assert model.predict(np.array([cf["x_cf"]])) == y_target

    # Other stuff
    from ceml.sklearn import IsolationForest as IsolationForestCf
    model_cf = IsolationForestCf(model)
    assert model.predict([x]) == model_cf.predict(x)

    with pytest.raises(TypeError):
        IsolationForestCf(sklearn.linear_model.LogisticRegression())
def runIF(X, Y, model):
    """
    Isolation Forest model
    ...

    Inputs
    ------
    X: {array-like, matrix}, shape = [n_samples, n_features]
    Y: {1-D array-like} shape must equal number of rows in X
    model: output of filterK.kMeansModel function, a list with the
        following elements:
        model[0] = object of class 'sklearn.cluster.k_means_.KMeans'
        model[1] = shift for data normalisation
        model[2] = scale for data normalisation

    Outputs
    -------
    list with two elements:
        mask_outliers = boolean mask of the points labelled as outlier (-1) by IF
        y_pred = cluster label for each point

    Raises
    ------
    Exception if X and Y do not have the same number of rows
    """
    X = np.array(X)
    Y = np.array(Y)

    # Check parameters
    if Y.shape[0] != X.shape[0]:
        raise Exception('X dataset shape does not match Y')

    kmeans, shift, scale = model[0], model[1], model[2]

    # Normalise data
    X = X * scale + shift
    y_pred = kmeans.predict(X)

    forest = IsolationForest(n_estimators=500, max_samples='auto', random_state=0)
    forest.fit(X)
    mask_outliers = (forest.predict(X) == -1)
    return [mask_outliers, y_pred]
class iForest():
    """Thin wrapper around IsolationForest working on flattened samples.

    Samples may be passed either as numpy arrays or as objects exposing the
    array through a ``data`` attribute; both are reshaped to
    ``(n_samples, -1)`` before reaching the underlying classifier.
    """

    # Upper bound on training samples when none is given to __init__.
    MAX_N_SAMPLES = 32000

    def __init__(self, max_number_of_samples=None, outliers_fraction=0.1, n_estimators=100):
        """Store the configuration and create the underlying classifier."""
        self.max_number_of_samples = max_number_of_samples if max_number_of_samples else self.MAX_N_SAMPLES
        self.outliers_fraction = outliers_fraction
        self.n_estimators = n_estimators
        self.classifier = IsolationForest(n_estimators=self.n_estimators,
                                          max_samples=self.max_number_of_samples,
                                          contamination=self.outliers_fraction,
                                          random_state=None)

    def train(self, train_data):
        """Fit the classifier, subsampling when there are too many samples."""
        n_train_samples = train_data.shape[0]
        train_data = train_data.reshape(n_train_samples, -1)
        if n_train_samples > self.max_number_of_samples:
            logging.warning(
                'Discarding training data: using {} of {} chunks.'.format(self.max_number_of_samples,
                                                                          n_train_samples))
            train_data = self._subsample_data(train_data)
        self.classifier.fit(train_data)

    @staticmethod
    def _flatten(test_sample):
        """Return the sample values reshaped to (n_samples, -1).

        predict() used to call ``test_sample.reshape`` while
        decision_function() called ``test_sample.data.reshape``, so neither
        input kind worked with both methods. This helper accepts both.
        """
        values = test_sample if isinstance(test_sample, np.ndarray) else test_sample.data
        values = np.asarray(values)
        return values.reshape(values.shape[0], -1)

    def predict(self, test_sample):
        """Return the classifier's predictions for the flattened sample."""
        return self.classifier.predict(self._flatten(test_sample))

    def decision_function(self, test_sample):
        """Return the squeezed decision-function values for the flattened sample."""
        anomaly_score = self.classifier.decision_function(self._flatten(test_sample))
        return np.squeeze(anomaly_score)

    def _subsample_data(self, data):
        """Pick ``max_number_of_samples`` distinct rows at random."""
        return data[np.random.choice(data.shape[0], self.max_number_of_samples, replace=False)]

    @property
    def configuration(self):
        """Constructor settings as a dictionary."""
        return {
            'max_number_of_samples': self.max_number_of_samples,
            'outliers_fraction': self.outliers_fraction,
            'n_estimators': self.n_estimators
        }
def check_relative_anomaly_score(real_data=None, test_data=None, DIR=None, domain_dims=None):
    """Return the share of ``test_data`` rows an IsolationForest flags as outliers.

    The model is cached as ``saved_model/<DIR>/ad_if.pkl`` next to this
    module. It is fitted on the one-hot encoded ``real_data`` when no cached
    file exists, and loaded from the cache otherwise.

    :param real_data: training data; only needed when no cached model exists
    :param test_data: data to score
    :param DIR: sub-directory name of the model cache
    :param domain_dims: passed through to ``convert_to_01``
    :return: fraction of ``test_data`` rows predicted as -1, or None when
        there is no test data or no model
    """
    # Directory containing this module. dirname() replaces the earlier
    # '.py'-stripping string surgery, which corrupted any path that merely
    # contained '.py'.
    _cur_path_ = os.path.dirname(os.path.abspath(__file__))
    save_dir = os.path.join(_cur_path_, 'saved_model/{}'.format(DIR))
    Path(save_dir).mkdir(parents=True, exist_ok=True)
    f_path = os.path.join(save_dir, 'ad_if.pkl')
    print(f_path)

    AD_obj = None
    if not os.path.exists(f_path) and real_data is not None:
        AD_obj = IsolationForest(n_estimators=100, contamination=0.01,
                                 n_jobs=mp.cpu_count(), verbose=True)
        # Convert real data to one-hot encoded
        oh_data = convert_to_01(real_data, domain_dims)
        AD_obj.fit(oh_data)
        print("Model fitting done.")
        with open(f_path, 'wb') as model_file:
            pickle.dump(AD_obj, model_file, pickle.HIGHEST_PROTOCOL)
    elif os.path.exists(f_path):
        # NOTE(review): pickle.load executes code from the file; the cache
        # directory must not be writable by untrusted parties.
        with open(f_path, 'rb') as model_file:
            AD_obj = pickle.load(model_file)
        print(AD_obj)

    if test_data is not None and AD_obj is not None:
        print(test_data.shape)
        oh_data = convert_to_01(test_data, domain_dims)
        print(oh_data.shape)
        # -1 for outliers
        y = AD_obj.predict(oh_data)
        # percentage of data points predicted as anomalies
        count_outliers = np.where(y == -1.0)[0].shape[0]
        data_len = test_data.shape[0]
        return (count_outliers / data_len)
    return None
def findBestModel(X_train, X_test, Y_test, model):
    """
    Function to find the best parameters to use for a given model

    For 'iForest', every combination of the feature setting and a
    contamination of 1%..50% is fitted, and the ``evaluate`` score of each
    test set is printed. Other model names do nothing.

    components:
    X_train: numpy array of the input data
    X_test: list containing numpy arrays of different test data
    Y_test: list containing numpy array of different test outcomes (note that this is configured
            differently for different algorithms, for iForest, each column must have -1 or 1.
            -1 --> the anomaly, if 1 --> not an anomaly)
    model: string to determine model type
    """
    if model == 'iForest':
        for max_features in range(1, X_train.shape[1] + 1):
            # The contamination grid is expressed in percent and must be
            # handed over as a fraction. The original passed the raw
            # integers 0..100; the grid is now limited to 1%..50%.
            for contamination_pct in range(1, 51):
                # NOTE(review): `max_features / 1000` is kept from the
                # original; it looks like it was meant to scan the number
                # of features - confirm the intended scale.
                iForest = IsolationForest(n_estimators=100,
                                          max_features=max_features / 1000,
                                          contamination=contamination_pct / 100.0,
                                          random_state=0).fit(X_train)
                for x_test, y_test in zip(X_test, Y_test):
                    y_hat = iForest.predict(x_test)
                    score = evaluate(y_test, y_hat)  # returns accuracy score
                    print(score)
def compute_outliers(metrics, test_metrics, metric_key, sort_key_list, es):
    """Predict outliers for one metric and index one document per time stamp.

    An IsolationForest is fitted on ``metrics[metric_key]`` and used to
    predict ``test_metrics[metric_key]``. For every entry of
    ``sort_key_list`` a JSON document holding the time stamp, metric name,
    sampled value and prediction is indexed through ``es``. ``sort_key``
    and ``anomalies_index`` are read from the enclosing scope.
    """
    train_values = metrics[metric_key]
    test_values = test_metrics[metric_key]

    X_train = pd.DataFrame(train_values, columns=['sample'])
    X_test = pd.DataFrame(test_values, columns=['sample'])

    detector = IsolationForest(max_samples=1000)
    detector.fit(X_train)
    outliers = detector.predict(X_test)

    for i, time_stamp in enumerate(sort_key_list):
        # Store if the metric sampled at a particular time stamp was an
        # anomaly or not
        jsonString = ('{"' + sort_key + '": "' + time_stamp
                      + '", "metrics":"' + metric_key
                      + '", "value":' + str(test_values[i])
                      + ',"outlier":' + str(outliers[i]) + '}')
        document = json.loads(jsonString)
        es.index(index=anomalies_index, doc_type='predicted', body=document)
def test_isolation_forest_score_samples(self):
    """Check the ONNX conversion of IsolationForest with score_samples enabled.

    The converted model must expose the outputs 'label', 'scores' and
    'score_samples', matching ``predict``, ``decision_function`` and
    ``score_samples`` of the original estimator.
    """
    data = np.array([[-1.1, -1.2], [0.3, 0.2],
                     [0.5, 0.4], [100., 99.]], dtype=np.float32)
    isol = IsolationForest(n_estimators=3, random_state=0)
    model = isol.fit(data)

    model_onnx = to_onnx(model, data, target_opset=TARGET_OPSET,
                         options={'score_samples': True})
    sess = InferenceSession(model_onnx.SerializeToString())

    output_names = [output.name for output in sess.get_outputs()]
    self.assertEqual(output_names, ['label', 'scores', 'score_samples'])

    got = sess.run(None, {'X': data})
    self.assertEqual(len(got), 3)

    expected_label = isol.predict(data)
    expected_decif = isol.decision_function(data)
    expected_score = isol.score_samples(data)
    label_out, decif_out, score_out = got[0], got[1], got[2]
    assert_almost_equal(expected_label, label_out.ravel())
    assert_almost_equal(expected_decif, decif_out.ravel())
    assert_almost_equal(expected_score, score_out.ravel())
class IF_real_bogus:
    """Fit an isolation forest (``IF``) on a feature set and report its split."""

    def __init__(self, feature):
        """Keep the feature set used for both training and validation."""
        self.feature = feature

    def train(self):
        """Create the forest with 160 estimators and fit it on the features."""
        self.IFmod = IF(n_estimators=160)
        self.IFmod.fit(self.feature)

    def validation(self):
        """Predict the training features and print the normal/abnormal counts.

        Stores boolean masks in ``self.normal`` (prediction 1) and
        ``self.abnormal`` (prediction -1).
        """
        labels = self.IFmod.predict(self.feature)
        self.normal = (labels == 1)
        self.abnormal = (labels == -1)
        n_total = len(self.feature)
        n_normal = self.normal.sum()
        n_abnormal = self.abnormal.sum()
        print("total:{0}, normal:{1}, abnormal:{2}, normal/total:{3}".format(
            n_total, n_normal, n_abnormal, n_normal / float(n_total)))
def feature_engineering(datasets):
    """Load the train/test CSV files and add an outlier feature to the training set.

    The training features are all columns but the last; the target is the
    ``TARGET`` column. An IsolationForest fitted on the training features
    supplies the extra "outlier" column.

    :param datasets: mapping with the keys "train" and "test" (CSV paths)
    :return: ``(train_X, train_y, test)``
    """
    # read the files
    train = pd.read_csv(datasets["train"])
    feature_part = train.iloc[:, :-1]
    train_y = train.TARGET
    test = pd.read_csv(datasets["test"])

    # add outlier feature
    detector = IsolationForest(random_state=42)
    detector.fit(feature_part)
    outlier_flags = detector.predict(feature_part)

    train_X = pd.DataFrame(feature_part)
    train_X["outlier"] = outlier_flags
    return train_X, train_y, test
def outlier_detection(df):
    """Drop the rows that an IsolationForest flags as outliers.

    Features are all columns except 'AdoptionSpeed'. The number of outliers
    found by the forest and by a Local Outlier Factor are printed; only the
    forest's result is used for filtering.

    :return: the remaining rows with a fresh index
    """
    features = df.drop('AdoptionSpeed', axis=1)

    # Isolation Forest (8% outliers)
    forest = IsolationForest(contamination=0.08, max_samples=256, behaviour="new")
    forest.fit(features)
    forest_outliers = forest.predict(features) == -1
    print("Isolation Forest: ", sum(forest_outliers))

    # Local Outlier Factor, reported only
    lof = LocalOutlierFactor(contamination='auto')
    lof_outliers = lof.fit_predict(features) == -1
    print("LOF:", sum(lof_outliers))

    kept = df.loc[forest_outliers == 0]
    return kept.reset_index(drop=True)
def remove_outliers(df_train):
    """
    :objective: Remove outliers. Before dividing into X/y

    An IsolationForest is fitted on the two numeric columns; every row
    whose index label belongs to a row predicted as -1 is removed.

    :return: pandas dataframe with a reset index (the old index is kept as
        a column by ``reset_index()``)
    """
    numeric_colnum = df_train.columns.get_indexer(['판매단가', '취급액']).tolist()
    feature_set = df_train.iloc[:, numeric_colnum]

    # identify outliers in the training dataset
    iso = IsolationForest(n_estimators=50, max_samples=50,
                          contamination=float(0.05), max_features=1.0)
    iso.fit(feature_set)
    pred = iso.predict(feature_set)

    # Read the outlier labels off the index directly; writing an 'anomaly'
    # column into the iloc slice was a chained assignment.
    outlier_index = feature_set.index[pred == -1]
    df_train = df_train.loc[~df_train.index.isin(outlier_index)].reset_index()
    return df_train
def training_oulier_alldata(self, data, t_data, outlier_features, complete_row_idx, missing_lines):
    """Fit an IsolationForest on complete + test rows and flag outlying complete rows.

    :param data: training frame
    :param t_data: test frame
    :param outlier_features: columns used for outlier detection
    :param complete_row_idx: positions of the complete rows in ``data``
    :param missing_lines: dict whose values get the outlier row positions
        appended (updated in place)
    :return: ``(fitted forest, missing_lines)``
    """
    complete_data = data[outlier_features].iloc[complete_row_idx, :]
    test_data = t_data[outlier_features]
    all_data = pd.concat([complete_data, test_data], axis=0, ignore_index=True)

    ilf = IsolationForest(n_estimators=min(100, len(all_data)), n_jobs=-1, verbose=2)
    ilf.fit(all_data)

    if len(complete_data) > 0:
        pred = ilf.predict(complete_data)
        outlier_idx = [complete_row_idx[i] for i in np.where(pred == -1)[0]]
        # put these outlier data into the missing list; kept under the same
        # guard because outlier_idx only exists when there are complete rows
        for k, v in missing_lines.items():
            missing_lines[k] = np.append(v, outlier_idx)
    return ilf, missing_lines
def find_circle_R(kmeans):
    """Compute a center and a radius for each of the ``EMOTION_NUM`` clusters.

    For every cluster an IsolationForest is fitted on a random permutation
    of its points; the center is the mean of the points predicted as 1 and
    the radius is the largest cosine distance between the center and one of
    those points.

    :return: ``(R, centers)`` - list of radii and list of centers
    """
    R = []
    centers = []
    for cluster_id in range(EMOTION_NUM):
        points = kmeans[cluster_id]

        clf = IsolationForest(max_samples=60)
        random_idx = np.random.permutation(range(len(points)))
        clf.fit(points[random_idx])

        is_inlier = clf.predict(np.array(points)) == 1
        kmeans_in_circle = points[is_inlier]

        center = np.mean(kmeans_in_circle, 0)
        centers.append(center)

        d_point_center = [
            nltk.cluster.util.cosine_distance(center, kmeans_in_circle[j].tolist())
            for j in range(len(kmeans_in_circle))
        ]
        # Largest distance = last element after an ascending sort.
        R.append(np.sort(d_point_center)[len(d_point_center) - 1])
    return R, centers
def test_iforest_performance():
    """Test Isolation Forest performs well"""
    # Generate train/test data
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    # 0 = regular observation, 1 = outlier
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # Continuous anomaly scores (the lower, the more normal). The sign of
    # decision_function is flipped so that higher means more abnormal, which
    # matches y_test; the hard labels from predict() are +1 for inliers and
    # would rank the samples the wrong way round.
    y_pred = -clf.decision_function(X_test)

    # check that outliers are ranked above regular observations
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
# NOTE(review): the next two statements are the end of a function whose header
# lies outside this chunk (presumably ohEncoding, which is called just below).
    data = data.join(vecData)
    return data, vecData, vec

df, t, v = ohEncoding(df, col, replace=True)

print "Shape after encoding"
# NOTE(review): this prints the type of df.shape, not the shape itself.
print type(df.shape)
# Fit on every column except the "Anomaly" column.
df_unlabeled = df.drop("Anomaly", axis=1)
print "Shape of the dataframe without anomaly column: "
print df_unlabeled.shape
clf = IsolationForest(max_samples=6444, verbose=1, n_jobs=-1,
                      contamination=0.255555 , bootstrap=True, max_features=9)
clf.fit(df_unlabeled)
pred = clf.predict(df_unlabeled)
# print type(pred)
# print data.shape
# print len(pred)
# print pred
# Row positions predicted as outliers (-1) and as inliers (1).
anomalies = np.argwhere(pred == -1)
normal = np.argwhere(pred == 1)
# print anomalies
# print type(anomalies)
# Store the forest's prediction next to the original columns.
df['ISO1'] = pred
# iterate over rows
# Counters start at zero; the code that updates them is outside this chunk.
nLabAno = 0
nDetAno = 0
nFalsePositives = 0
# ## Improving the Predicition model ## # This part is about finding a better metric for predicting future house sales regarding their price. # # First, I will detect outliers and delete them from the dataset if needed. # ### Detecting Outliers ### # The first step to improve our learning behaviour is to find outliers and then remove them from the data set if needed. # To detect outliers I will use the Isolation Forest Algorithm which is good for high-dimensional data sets as we have present here. # In[ ]: from sklearn.ensemble import IsolationForest clf = IsolationForest(max_samples=100, random_state=rng) clf.fit(df) y = clf.predict(df) print y # ### Location based prices ### # House prices don't only depend on the size of the house or amount of rooms, but are also really dependant on the location of said house. To get an idea how the position might impact my data I analyse the relationship between location and price in my dataset. # In[ ]: import gmaps gmaps.configure(api_key="AIzaSyDPWAl8lcrK9q-tOkrl64sGkxDnbWz47Ko") locations = df[["lat", "long"]] prices = df["price"] heatmap_layer = gmaps.heatmap_layer(locations, weights=prices)
rng = np.random.RandomState(42) # Generate train data X = 0.3 * rng.randn(100, 2) X_train = np.r_[X + 2, X - 2] # Generate some regular novel observations X = 0.3 * rng.randn(20, 2) X_test = np.r_[X + 2, X - 2] # Generate some abnormal novel observations X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) # fit the model clf = IsolationForest(max_samples=100, random_state=rng) clf.fit(X_train) y_pred_train = clf.predict(X_train) y_pred_test = clf.predict(X_test) y_pred_outliers = clf.predict(X_outliers) # plot the line, the samples, and the nearest vectors to the plane xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50)) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.title("IsolationForest") plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r) b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white') b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green') c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red') plt.axis('tight')
# NOTE(review): the two appends below look like the tail of a loop whose header
# is outside this chunk; the nesting shown here is an assumption - confirm.
    Xtrain.append(column_train)
    Xtest.append(column_test)

# The collected columns are stacked and transposed.
Xtrain = np.transpose(np.array(Xtrain))
Xtest = np.transpose(np.array(Xtest))

# Trim the id lists to the number of rows of each matrix.
idx_train = idx_train[:Xtrain.shape[0]]
idx_test = idx_test[:Xtest.shape[0]]

# fit an iforest
iforest = IsolationForest(n_estimators=ntrees, max_samples=sample_frac,
                          max_features=feat_frac, n_jobs=-1,
                          random_state=rng, verbose=1)
iforest.fit(Xtrain)

# NOTE(review): the original comment called these "anomaly scores", but
# predict() is what is called here - confirm that labels, not continuous
# scores, are intended.
y_pred_train = iforest.predict(Xtrain)
y_pred_test = iforest.predict(Xtest)

# Pair every id with its prediction.
train_feature_values = [(gid, val) for gid, val in zip(idx_train, list(y_pred_train))]
test_feature_values = [(gid, val) for gid, val in zip(idx_test, list(y_pred_test))]

# NOTE(review): gid/100 is compared against scenario ids, which presumably
# relies on integer division (Python 2); under Python 3 "/" yields a float -
# confirm the target interpreter.
for i, scenario in enumerate(MALICIOUS_SCENARIOS):
    # All training values plus the test values of the benign scenarios and of
    # the current malicious scenario.
    all_feature_values = train_feature_values + \
        [(gid, feat_value) for gid, feat_value in test_feature_values
         if gid/100 in BENIGN_SCENARIOS or gid/100 == scenario]
    all_values = np.array([feat_value for gid, feat_value in all_feature_values]).reshape(-1,1)
    # Ground truth: 1 when the id maps to a malicious scenario, else 0.
    y_true = [1 if gid/100 in MALICIOUS_SCENARIOS else 0 for gid, feat_value in all_feature_values]
# merge vehicle = pd.merge(rpm, speed, how = 'outer', on = 'timestamp') # drop null values and zero speeds --> neutral gear # speed < 200 to remove outliers vh = vehicle.dropna(axis = 0) vh = vh[(vh['rpm'] > 0) & ((vh['speed'] > 0) & (vh['speed'] < 200))] # detect outliers using IsolationForest # assume contamination at 0.01 level distances = pairwise_distances(vh[['rpm','speed']],vh[['rpm','speed']], metric = 'cosine') clf = IsolationForest(max_samples = 100, contamination = 0.01, verbose = 1) clf.fit(distances) labels = clf.predict(distances) vh['outlier'] = labels # remove outliers found by IsolationForest vh = vh[['rpm','speed']][vh['outlier'] == 1] #recompute distances after outlier removal distances = pairwise_distances(vh[['rpm','speed']],vh[['rpm','speed']], metric = 'cosine') # initialize variable to keep best model, its silhouette score and predicted labels best_model = (None, -1, None) # iterate over possible number of gears # since we want to pick model with best silhouette score, can't start with single cluster (k=1) for k in range(2,7):
# Split X / y in half by position: first half for training, the rest for testing.
n_samples, n_features = np.shape(X)
n_samples_train = n_samples // 2
n_samples_test = n_samples - n_samples_train

X = X.astype(float)
X_train = X[:n_samples_train, :]
X_test = X[n_samples_train:, :]
y_train = y[:n_samples_train]
y_test = y[n_samples_train:]

print('IsolationForest processing...')
model = IsolationForest(bootstrap=True, n_jobs=-1)
# Time the fit.
tstart = time()
model.fit(X_train)
fit_time = time() - tstart
# Time the prediction.
tstart = time()

# NOTE(review): the original comment said "the lower, the more normal", but
# predict() is what is called here, not a continuous score. If y_test marks
# anomalies as the positive class the ROC below may be inverted - confirm the
# label encoding.
scoring = model.predict(X_test)
predict_time = time() - tstart
fpr, tpr, thresholds = roc_curve(y_test, scoring)
AUC = auc(fpr, tpr)
# `dat` is defined outside this chunk - presumably the dataset name.
plt.plot(fpr, tpr, lw=1,
         label='ROC for %s (area = %0.3f, train-time: %0.2fs, test-time: %0.2fs)' % (dat, AUC, fit_time, predict_time))

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
#create the isolation forest with the configured number of trees
clf = IsolationForest(n_estimators=opts.numtrees)

#train the isolation forest on the training set, dropping the class column (the forest is fitted without labels)
print('\nTraining')
clf.fit(train.drop('class', axis=1))

#remove the 'answers' from the test set
testnoclass = test.drop('class', axis=1)

print('\nPredicting (class 1 is normal, class -1 is malicious)')

#evaluate our results on the test set.
# NOTE(review): is_copy is presumably set to silence pandas' copy warning for
# the column assignment below; newer pandas no longer has this attribute - confirm.
test.is_copy = False
test['prediction'] = clf.predict(testnoclass)
# NOTE(review): a bare `print` emits a blank line only under Python 2 - confirm
# the target interpreter.
print

#group by class (the real answers) and prediction (what the forest said). we want these values to match for 'good' answers
results=test.groupby(['class', 'prediction'])
resultsagg = results.size()
print(resultsagg)

# -1 (malicious) is the positive class; combinations that never occur count as 0.
tp = float(resultsagg[-1,-1]) if (-1,-1) in resultsagg.index else 0
fp = float(resultsagg[1,-1]) if (1,-1) in resultsagg.index else 0
fn = float(resultsagg[-1,1]) if (-1,1) in resultsagg.index else 0
# F1 is undefined when there are no true positives, false positives or false
# negatives at all; report 0.0 instead of raising ZeroDivisionError.
f1_denominator = 2*tp + fp + fn
f1 = 2*tp/f1_denominator if f1_denominator else 0.0
print("F1 = %s" % f1)

#save the vectorizers to the configured file
joblib.dump(vectorizers, opts.vectorizerfile)