def train_model(X_train, characteristic, n_threads):
    """Fit an isolation forest on the training data and return it.

    The input is wrapped in a ``pandas.DataFrame`` and passed through
    ``lp.load_data_float`` before fitting.

    Args:
        X_train: Training observations, anything ``pd.DataFrame`` accepts.
        characteristic: Object providing ``ntrees`` and ``sample_size``;
            only read when there are more than 1000 training rows.
        n_threads: Forwarded to ``iso.iForest`` only when there are more
            than 1000 training rows.

    Returns:
        The ``iso.iForest`` instance built from the prepared data.
    """
    prepared = lp.load_data_float(pd.DataFrame(X_train))
    n_rows = len(prepared)

    if n_rows <= 1000:
        # Small data set: fixed 5000 trees, each sampling every row.
        # n_threads is intentionally not forwarded on this path.
        return iso.iForest(
            prepared,
            ntrees=5000,
            sample_size=n_rows,
            ExtensionLevel=1,
        )

    return iso.iForest(
        prepared,
        ntrees=characteristic.ntrees,
        sample_size=characteristic.sample_size,
        ExtensionLevel=1,
        n_threads=n_threads,
    )
def transform(self, X):
    """Score ``X`` with an extended isolation forest and flag outliers.

    A forest is fitted on the ``self.columns`` columns of ``X`` and then
    used to score those same rows. The index labels of the
    ``ceil(self.anomalies_ratio * n_rows)`` highest-scoring rows are stored
    in ``self.outliers``.

    Args:
        X: DataFrame containing at least the columns in ``self.columns``.

    Returns:
        ``X`` unchanged, or, when ``self.drop_outliers`` is truthy, ``X``
        with the outlier rows dropped and the index reset.
    """
    features = X[self.columns].values

    # Fit the forest and score the very same observations.
    forest = eif.iForest(
        X=features,
        ntrees=self.n_trees,
        sample_size=self.sample_size,
        ExtensionLevel=self.extension_level,
    )
    scores = forest.compute_paths(X_in=features)

    # Rank observations from most to least anomalous, keeping X's index.
    ranked = pd.DataFrame(scores, index=X.index, columns=["anomaly score"])
    ranked = ranked.sort_values(["anomaly score"], ascending=False)

    # The top (anomalies_ratio * total observations) rows are the outliers.
    n_outliers = int(np.ceil(self.anomalies_ratio * X.shape[0]))
    self.outliers = np.array(ranked[:n_outliers].index)

    if not self.drop_outliers:
        return X
    return X.drop(self.outliers, axis=0).reset_index(drop=True)
def extended_isolation_forest(self, contamination):
    """Run extended-isolation-forest outlier detection on ``self.training``.

    Appends ``'extended_isolation_forest'`` to ``self.report``, fits a forest
    (100 trees, sample size 256, extension level 2) on ``self.training`` and
    scores the same rows. The scores, indexed like ``self.training``, are
    handed to ``self.uni_boxplot_outlier_det``.

    Args:
        contamination: Not used by this method; kept so the signature stays
            compatible with existing callers.

    Returns:
        Whatever ``self.uni_boxplot_outlier_det`` returns for the scores.
    """
    self.report.append('extended_isolation_forest')

    # Build the float64 matrix once and use it for BOTH fitting and scoring.
    # Previously the forest was fitted on the float64 cast but scored on the
    # raw ``self.training.values``, so non-float64 training data was scored
    # with a different dtype than it was fitted with.
    features = self.training.astype('float64').values

    forest = iso.iForest(features, ntrees=100, sample_size=256,
                         ExtensionLevel=2)
    anomaly_scores = pd.Series(forest.compute_paths(X_in=features))
    anomaly_scores.index = self.training.index
    return self.uni_boxplot_outlier_det(anomaly_scores)
def _construct(self, X):
    """Build the model via the parent class and attach an ``eif`` forest.

    The parent's ``_construct`` result gets ``psi`` (``self.psi`` capped at
    the number of rows in ``X``), ``t`` and ``random_state`` copied onto it,
    followed by a ``forest`` attribute holding an ``eif.iForest`` built with
    extension level ``X.shape[1] - 1`` and any extra ``self.eif_params``.

    Args:
        X: Two-dimensional training data exposing ``.shape``.

    Returns:
        The model object produced by the parent class, with the attributes
        above set.
    """
    import eif

    model = super()._construct(X)

    # Sub-sample size can never exceed the number of available rows.
    model.psi = min(self.psi, X.shape[0])
    model.t = self.t
    model.random_state = self.random_state

    # One less than the number of columns of X.
    extension_level = X.shape[1] - 1
    model.forest = eif.iForest(
        X,
        ntrees=model.t,
        sample_size=model.psi,
        seed=model.random_state,
        ExtensionLevel=extension_level,
        **self.eif_params,
    )
    return model
def learning_process_prediction_ext_iso_f(self):
    """Fit the extended isolation forest and store its scores.

    Steps, in order: print a start message, call ``get_time()`` and
    ``self.get_x_values()``, fit ``iso.iForest`` on ``self.X`` with the
    tree count and sample size from ``self.params`` and an extension level
    of ``len(self.features) - 1``, write the scores of ``self.X`` into the
    ``self.data`` column named by ``self.model_params['args']['pred_field']``,
    call ``self.train_test_split()`` and print a completion message.
    """
    print("Extended isolation forest train process is initialized!!")
    get_time()
    self.get_x_values()

    forest = iso.iForest(
        self.X,
        ntrees=self.params['num_of_trees'],
        sample_size=self.params['sample_size'],
        ExtensionLevel=len(self.features) - 1,
    )
    self.model_e_iso_f = forest

    # Score the same observations the forest was fitted on.
    scores = forest.compute_paths(X_in=self.X)
    pred_field = self.model_params['args']['pred_field']
    self.data[pred_field] = scores

    self.train_test_split()
    print("Extended Isolation Forest Model Train Process Done!")
IsolationForest( n_estimators=500, behaviour="new", contamination=outliers_fraction, random_state=42, ), ), ( "Local Outlier Factor", LocalOutlierFactor( n_neighbors=35, contamination=outliers_fraction, novelty=False ), ), ( "Extended IF", iso.iForest(datasets3D[0], ntrees=500, sample_size=255, ExtensionLevel=1), ), ( "USPORF", UnsupervisedRandomForest( feature_combinations="auto", max_depth=None, max_features="auto", min_samples_split="auto", n_estimators=500, n_jobs=None, projection_matrix="RerF", ), ), ]
def AnomalyDetection(df, chamber, model, percent, x_train, x_test, scoring=True, contamination=0.001, show_params=False, show=True, save=False): slicing = int(len(df) * percent) if model == "extendedIsolationForest": import eif as iso # ExtensionLevel=0 is the same as regular Isolation Forest clf = iso.iForest(x_train.values, ntrees=200, sample_size=256, ExtensionLevel=1) print("fitting finished") train_pred = clf.compute_paths(X_in=x_train.values) test_pred = clf.compute_paths(X_in=x_test.values) print("scoring finished") else: if model == "IsolationForest": from sklearn.ensemble import IsolationForest # contamination : the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the sample clf = IsolationForest(n_estimators=50, contamination=contamination, random_state=0) elif model == "LocalOutlierFactor": from sklearn.neighbors import LocalOutlierFactor # If you really want to use neighbors.LocalOutlierFactor for novelty detection, # i.e. predict labels or compute the score of abnormality of new unseen data, # you can instantiate the estimator with the novelty parameter set to True before fitting the estimator. clf = LocalOutlierFactor(n_neighbors=5, novelty=True) elif model == "OneClassSVM": from sklearn.svm import OneClassSVM clf = OneClassSVM(gamma='auto') else: clf = None print("model selection error") return clf.fit(x_train) print("fitting finished") # pred = -(clf.predict(x_test)) # predict: Returns 1 for outliers and -1 for inliers # pred = -(clf.score_samples(x_test[features])) # score_samples : Returns anomaly score 0~1 (0 for normal, 1 for anomal) if scoring == True: train_pred = -(clf.score_samples(x_train)) test_pred = -(clf.score_samples(x_test)) else: train_pred = -(clf.predict(x_train)) test_pred = -(clf.predict(x_test)) print("scoring finished")
r = np.sort(np.array(r)) return r, theta if __name__ == "__main__": confusion_matrices = [] All_orbits = [] X_buffer = [] Y_buffer = [] buffer = False binary_set = True use_previously_saved_models = False categorical_num = True for index in range(SET_PARAMS.Number_of_multiple_orbits): Y, Y_buffer, X, X_buffer, Orbit = Dataset_order(index, binary_set, buffer, categorical_num, use_previously_saved_models) All_orbits.append(Orbit) F1 = iso.iForest(X, ntrees = 500, sample_size = 1000, ExtensionLevel=1) xxx = np.array([[0,0.]]) SL0 = F1.compute_paths_single_tree(xxx, 0) S1 = F1.compute_paths(X_in=X) ss1=np.argsort(S1) number_of_errors = np.sum(Y % 2 == 1) print(np.sum(Y[ss1[:number_of_errors]])/number_of_errors, index) """ To determine whether a single point within
# Load the measurements and keep only the last 100000 rows.
data = pd.read_csv(
    "C:/Users/Reinis Fisers/PycharmProjects/TF_TEST/HalfYearFilteredNoNAN.csv")
data = data.tail(100000)
x = data['WindSpeed_mps']
y = data['Power_kW']

### Create a two dimensional array with dataset ###
z = np.array(list(zip(x, y)))

### Create the dataframe ###
new_data = pd.DataFrame(z, columns=['A', 'B'])

### Fitting into Extended Isolation Forest Model ###
anomalies_ratio = 0.02
# ExtensionLevel has to be an integer between 0 and (number of columns - 1).
# The data has two columns, so the only valid values are 0 (standard
# isolation forest) and 1 (extended); the previous value 0.9 was neither.
eif = iso.iForest(new_data.values, ntrees=3000, sample_size=100,
                  ExtensionLevel=1)
anomaly_scores = eif.compute_paths(X_in=new_data.values)

# Positions ordered from lowest to highest anomaly score.
anomaly_scores_sorted = np.argsort(anomaly_scores)

# Take the n highest-scoring positions. Slicing from an explicit start
# avoids the ``[-0:]`` pitfall, which would select every row when n is 0.
n_outliers = int(np.ceil(anomalies_ratio * new_data.shape[0]))
indices_with_preds = anomaly_scores_sorted[
    len(anomaly_scores_sorted) - n_outliers:]

# 1 marks an outlier row (by position), 0 marks an inlier.
outliers = np.zeros_like(y)
outliers[indices_with_preds] = 1

### Getting the cleaned data from outliers ###
x_cleaned = data[outliers != 1]
x_cleaned.to_csv("EIF4.csv")

### Loading the created dataset ###
# NOTE(review): the file above is written relative to the current working
# directory but read back from an absolute path; this only round-trips
# when the script runs from that directory. Confirm this is intended.
data1 = pd.read_csv("C:/Users/Reinis Fisers/PycharmProjects/TF_TEST/EIF4.csv")
x1 = data1['WindSpeed_mps']
ax1.set_xlabel("Anomaly") ax1.set_ylim(0, forest.limit) ax1.axes.get_xaxis().set_visible(False) ax1.axes.get_yaxis().set_visible(False) plt.show() if __name__ == "__main__": X_train, X_test, y_train, y_test = load_data() print("training sample nums: ", len(X_train)) eifmodel = iso.iForest(X_train.values, ntrees=100, sample_size=256, ExtensionLevel=1, n_jobs=4) # save model joblib.dump(eifmodel, './eiforest.pkl') # eifmodel = joblib.load('./eiforest.pkl') print("test sample nums: ", len(X_test)) print("test anoamly sample nums: ", sum(y_test)) stime = time.time() y_pred_test = eifmodel.compute_paths(X_test.values, n_jobs=4) ctime = time.time() - stime print("cost time is: {:.4f} ".format(ctime)) fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_test) areaUnderROC = auc(fpr, tpr)