def choose_models():
    isolFor = {
        'name': 'Isolation Forest',
        'class': ensemble.IsolationForest(),
        'parameters': {
            'n_estimators': [5, 10, 20, 50, 100, 150, 200]
        }
    }
    locOutFac = {
        'name': 'Local Outlier Factor',
        'class': neighbors.LocalOutlierFactor(novelty=True),
        'parameters': {
            'n_neighbors': range(5, 50, 5)
        }
    }
    # ocSVM = {'name': 'One Class SVM',
    #          'class': svm.OneClassSVM(),
    #          'parameters': {
    #              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #              'nu': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
    #          }
    #          }
    elEnv = {
        'name': 'Elliptic Envelope',
        'class': covariance.EllipticEnvelope(),
        'parameters': {
            'contamination': np.linspace(0.05, 0.45, 9)
        }
    }
    return [isolFor, locOutFac, elEnv]
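# A minimal sketch (not from the original source) of feeding the specs returned
# by choose_models() into GridSearchCV. It assumes a labelled evaluation setting
# where y == 1 marks inliers, so that 'roc_auc' can score each detector via its
# decision_function; the helper name grid_search_all is hypothetical.
import numpy as np
from sklearn import covariance, ensemble, model_selection, neighbors


def grid_search_all(X, y):
    results = []
    for spec in choose_models():
        search = model_selection.GridSearchCV(
            spec['class'],
            spec['parameters'],
            scoring='roc_auc',  # decision_function is higher for inliers (y == 1)
            cv=3)
        search.fit(X, y)
        results.append((spec['name'], search.best_params_, search.best_score_))
    return results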
def remove_outlier(data, contamination=0.01):
    # `se` is assumed to alias sklearn.ensemble, e.g. `from sklearn import ensemble as se`
    outlier_map = se.IsolationForest(contamination=contamination).fit_predict(
        data[[
            "Sales", "CompetitionDistance", "aggregated_promo2",
            "days_since_competition"
        ]])
    return data[outlier_map == 1]
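# A hypothetical call site for remove_outlier() above, with made-up store-sales
# data that carries the four columns the function expects.
import numpy as np
import pandas as pd
from sklearn import ensemble as se  # alias assumed by remove_outlier

rng = np.random.default_rng(0)
sales_df = pd.DataFrame({
    "Sales": rng.normal(5000, 800, 200),
    "CompetitionDistance": rng.normal(1200, 300, 200),
    "aggregated_promo2": rng.integers(0, 2, 200),
    "days_since_competition": rng.integers(0, 3650, 200),
})
cleaned = remove_outlier(sales_df, contamination=0.02)
print(len(sales_df) - len(cleaned), "rows flagged as outliers")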
def test_score_samples_estimators():
    """Check the values of score_samples methods derived from sklearn.

    Check that the values are the same as sklearn's decision_function
    methods. This only concerns OCSVM and IsolationForest.
    """
    X = np.random.randn(50, 2)

    clf1 = IsolationForest(random_state=88)
    clf1.fit(X)

    clf2 = ensemble.IsolationForest(random_state=88)
    clf2.fit(X)

    assert_array_equal(clf1.score_samples(X), clf2.decision_function(X))

    nu = 0.4
    sigma = 3.0
    gamma = 1. / (2. * sigma**2)

    clf1 = OCSVM(sigma=sigma, nu=nu)
    clf1.fit(X)

    clf2 = OneClassSVM(gamma=gamma, nu=nu)
    clf2.fit(X)

    assert_array_equal(clf1.score_samples(X),
                       clf2.decision_function(X).ravel())
def set_isolation_forest_classifier(self):
    '''
    Deprecated for now, no meaningful results - performance metrics
    were similar to baseline results.
    '''
    return SkLearner(
        ensemble.IsolationForest(max_samples=100,
                                 random_state=42,
                                 contamination=0.1))
def RemoveAbnormal(BigFeatures, contamination=0.05):
    print('******************** Removing abnormal samples ********************\n')
    from sklearn import ensemble
    clf = ensemble.IsolationForest(max_samples='auto',
                                   contamination=contamination,
                                   max_features=1.0,
                                   bootstrap=False,
                                   random_state=42)
    clf.fit(BigFeatures)
    y_detection = clf.predict(BigFeatures)
    mask = (y_detection == -1)
    return mask  # boolean mask marking the abnormal samples
def occ_training(X_train, model_type, dict_params=None, val_split=0.25,
                 random_state=108):
    """Trains one-class classifier by grid search.

    Args:
      X_train: np array, input for training
      model_type: str, type of model, example: svm, isoforest
      dict_params: dict, key: parameter, value: list of hyperparameters,
        default: model_params[model_type]
      val_split: float, validation split, default=0.25
      random_state: int, seed for splitting and isolation forest classifier

    Returns:
      best_model: sklearn model, best model
      best_params: dict, hyperparameters of best model
      best_accuracy: float, accuracy of best model
    """
    X_train, X_val, _, _ = model_selection.train_test_split(
        X_train,
        np.zeros(len(X_train)),
        test_size=val_split,
        random_state=random_state)

    if dict_params is None:
        dict_params = model_params[model_type]
    all_params = list(model_selection.ParameterGrid(dict_params))

    best_model, best_params, best_accuracy = None, None, 0.0
    for tmp_params in all_params:
        if model_type == 'svm':
            tmp_model = svm.OneClassSVM(cache_size=5000)
            tmp_model.set_params(kernel=tmp_params['kernel'])
            tmp_model.set_params(nu=tmp_params['nu'])
        elif model_type == 'isoforest':
            tmp_model = ensemble.IsolationForest(n_jobs=-1,
                                                 warm_start=True,
                                                 random_state=random_state)
            tmp_model.set_params(n_estimators=tmp_params['n_estimators'])
            tmp_model.set_params(max_features=tmp_params['max_features'])
        tmp_model.fit(X_train)
        val_accuracy = occ_scorer(tmp_model, X_val)
        # keep the best-scoring model seen so far
        if val_accuracy > best_accuracy:
            best_model = tmp_model
            best_params = tmp_params
            best_accuracy = val_accuracy
    return best_model, best_params, best_accuracy
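# A minimal sketch of calling occ_training() above. The module-level
# model_params dict and occ_scorer function it references are not shown in the
# snippet, so placeholder versions are assumed here: occ_scorer is taken to be
# the fraction of validation points predicted as inliers (+1).
import numpy as np
from sklearn import ensemble, model_selection, svm

model_params = {  # assumed placeholder grid
    'svm': {'kernel': ['rbf', 'sigmoid'], 'nu': [0.01, 0.1]},
    'isoforest': {'n_estimators': [50, 100], 'max_features': [0.5, 1.0]},
}


def occ_scorer(model, X_val):  # assumed placeholder scorer
    return np.mean(model.predict(X_val) == 1)


X = np.random.randn(500, 4)
best_model, best_params, best_accuracy = occ_training(X, 'isoforest')
print(best_params, best_accuracy)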
def fit(self, X, y=None):
    if self.transformer is not None:
        print('Fit Transformer')
        self.transformer.fit(X, y)

    from sklearn import ensemble
    print('Fitting ISF')
    self.isf = ensemble.IsolationForest(n_estimators=self.n_estimators,
                                        max_samples='auto',
                                        contamination=self.contamination,
                                        n_jobs=-1,
                                        random_state=self.random_state)
    self.isf.fit(
        X if self.transformer is None else self.transformer.latent(X))
    self._estimate_threshold(X)
def handle_app(app_id, ids_entries, experiment):
    """ Full flow for one classifier. """

    verify_ids_entries(ids_entries, app_id, experiment.storer_printer)

    training, scoring = ids_tools.ids_entries_to_train_test(ids_entries)
    X_train, _ = IdsConverter().ids_entries_to_X_y(training)
    X_test, y_true = IdsConverter().ids_entries_to_X_y(scoring)

    classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
    for classifier in classifiers:
        classifier.fit(X_train)
        y_pred = classifier.predict(X_test)
        experiment.visualise_store("SPEC", app_id, classifier, y_true, y_pred)
def mapper(self, data):
    """Run the mapper algorithm on the data.

    Parameters
    ----------
    data : array-like
        The data to run the algorithm on; can have almost any shape.

    Returns
    -------
    graph :
        The graph output from km.KeplerMapper(...).map(...)
    """
    # Initialize
    logging.info("Applying the mapping algorithm.")
    mapper = km.KeplerMapper(verbose=2)

    # We create a custom 1-D lens with Isolation Forest
    model = ensemble.IsolationForest()
    model.fit(data)
    isolation_forest = model.decision_function(data).reshape(
        (data.shape[0], 1))

    # Fit to and transform the data
    tsne_projection = mapper.fit_transform(
        data,
        projection=sklearn.manifold.TSNE(n_components=2,
                                         perplexity=20,
                                         init='pca'))

    lens = np.c_[isolation_forest, tsne_projection]

    # Create dictionary called 'graph' with nodes, edges and meta-information
    graph = mapper.map(tsne_projection,
                       coverer=km.Cover(10, 0.2),
                       clusterer=sklearn.cluster.DBSCAN(eps=1.0,
                                                        min_samples=2))

    color_function = np.array(
        [self._label_to_color(self.labels[i]) for i in range(len(data))])

    # Visualize it
    mapper.visualize(graph,
                     path_html="actions.html",
                     title="chunk",
                     custom_tooltips=self.tooltips,
                     color_function=color_function)

    return graph
def handle_all(experiment):
    """ Full flow for a one-fits-all classifier. """

    from ids.TEMP_IDS_CONVERTER import IdsConverter as TEMPCONVERTER
    converter = TEMPCONVERTER()

    log_entries = []
    for line in Dir.yield_lines(experiment.file_path, ITEM_LIMIT):
        log_entry = LogEntry.from_log_string(line)
        log_entries.append(log_entry)

    all_entries = converter.LOG_ENTRIES_TO_IDS_ENTRIES(log_entries,
                                                       binary=True)

    training_entries, scoring_entries = ids_tools.ids_entries_to_train_test(
        all_entries)
    X_train, _ = IdsConverter().ids_entries_to_X_y(training_entries)

    scoring_dict = {}
    for ids_entry in scoring_entries:
        if ids_entry.app_id not in scoring_dict:
            scoring_dict[ids_entry.app_id] = []
        scoring_dict[ids_entry.app_id].append(ids_entry)

    # Classify with all entries: training_entries
    classifiers = [sk_svm.OneClassSVM(), sk_ens.IsolationForest()]
    for classifier in classifiers:
        classifier.fit(X_train)

    # Score for each app: scoring_dict
    for app_id, app_entries in util.seqr.yield_items_in_key_order(
            scoring_dict):
        X_test, y_true = IdsConverter().ids_entries_to_X_y(app_entries)
        y_preds = [clf.predict(X_test) for clf in classifiers]
        for clf, y_pred in zip(classifiers, y_preds):
            experiment.visualise_store("ALL", app_id, clf, y_true, y_pred)
def fit(self,
        df,
        cluster_alg_ls=['KMeans', 'DBSCAN'],
        dim_reduction_alg_ls=[],
        n_evaluations=30,
        run_obj='quality',
        seed=27,
        cutoff_time=50,
        optimizer='smac',
        evaluator=get_evaluator(evaluator_ls=['silhouetteScore'],
                                weights=[],
                                clustering_num=None,
                                min_proportion=.001,
                                min_relative_proportion=0.01),
        n_folds=3,
        preprocess_dict={},
        isolation_forest_contamination='auto',
        warmstart=False,
        warmstart_datasets_dir='silhouette',
        warmstart_metafeatures_table_path='metaknowledge/metafeatures_table.csv',
        warmstart_n_neighbors=3,
        warmstart_top_n=20,
        general_metafeatures=[],
        numeric_metafeatures=[],
        categorical_metafeatures=[],
        verbose_level=2):
    """
    ---------------------------------------------------------------------------
    Arguments
    ---------------------------------------------------------------------------
    df: a DataFrame
    n_folds: number of folds used in k-fold cross validation
    preprocess_dict: should be a dictionary with keys 'numeric_cols', 'ordinal_cols', 'categorical_cols' and 'y_col'
    isolation_forest_contamination: 'contamination' parameter in the IsolationForest outlier removal model, float expected
    optimizer: 'smac' or 'random'
    cluster_alg_ls: list of clustering algorithms to explore
    dim_reduction_alg_ls: list of dimension reduction algorithms to explore
    n_evaluations: max # of evaluations done during optimization, higher values yield better results
    run_obj: 'runtime' or 'quality'; cutoff_time must be provided if 'runtime' is chosen
    cutoff_time: maximum runtime, after which the target algorithm is cancelled; required if run_obj is 'runtime'
    shared_model: whether or not to use parallel SMAC
    evaluator: a function for evaluating a clustering result; must have the arguments X and y_pred
    verbose_level: integer, must be 0, 1 or 2; the higher the number, the more logs/print statements are used
    """
    #############################################################
    #                  Logging/Printing                         #
    #############################################################
    self._verbose_level = verbose_level

    #############################################################
    #                  Data preprocessing                       #
    #############################################################
    # rename, save preprocess_dict for later use
    raw_data = df
    self._preprocess_dict = preprocess_dict

    # encode categorical and ordinal columns
    preprocess_dict['df'] = raw_data
    raw_data_np = PreprocessedDataset(**preprocess_dict).X

    # perform outlier detection
    predicted_labels = ensemble.IsolationForest(
        n_estimators=100,
        warm_start=True,
        behaviour='new',
        contamination=isolation_forest_contamination).fit_predict(raw_data_np)
    idx_np = np.where(predicted_labels == 1)

    # remove outliers
    raw_data_cleaned = raw_data.iloc[idx_np].reset_index(drop=True)
    self._log("{}/{} datapoints remaining after outlier removal".format(
        len(raw_data_cleaned), len(raw_data_np)),
              min_verbose_level=1)

    # encode cleaned dataset
    preprocess_dict['df'] = raw_data_cleaned
    processed_data_np = PreprocessedDataset(**preprocess_dict).X

    #############################################################
    #                  Warmstarting (Optional)                  #
    #############################################################
    # construct desired configuration space
    cs = build_config_space(cluster_alg_ls, dim_reduction_alg_ls)
    self._log(cs, min_verbose_level=2)

    # calculate metafeatures
    metafeatures_np = None
    metafeatures_ls = general_metafeatures + numeric_metafeatures + categorical_metafeatures
    if len(metafeatures_ls) > 0:
        metafeatures_np = calculate_metafeatures(raw_data_cleaned,
                                                 preprocess_dict,
                                                 metafeatures_ls)

    # perform warmstart, if needed
    initial_cfgs_ls = []
    if warmstart and len(metafeatures_ls) > 0:
        # create and train warmstarter
        warmstarter = KDTreeWarmstarter(metafeatures_ls)
        warmstarter.fit(warmstart_metafeatures_table_path)

        # query for suitable configurations
        initial_configurations = warmstarter.query(
            metafeatures_np,
            warmstart_n_neighbors,
            warmstart_top_n,
            datasets_dir=warmstart_datasets_dir)

        # construct configuration objects
        for cfg in initial_configurations:
            try:
                initial_cfgs_ls.append(build_config_obj(cs, cfg[0]))
            except:
                pass

    # if too few configurations are available, just ignore them
    initial_cfgs_ls = None if len(initial_cfgs_ls) < 2 else initial_cfgs_ls

    if initial_cfgs_ls is not None:
        self._log(
            'Found {} relevant initial configurations from warmstarter.'.format(
                len(initial_cfgs_ls)),
            min_verbose_level=1)

    #############################################################
    #              Bayesian optimization (SMAC)                 #
    #############################################################
    # make sure n_evaluations is valid
    dim_reduction_min_size = 1 if len(dim_reduction_alg_ls) == 0 \
        else min([Mapper.getClass(alg).n_possible_cfgs for alg in dim_reduction_alg_ls])
    clustering_min_size = min(
        [Mapper.getClass(alg).n_possible_cfgs for alg in cluster_alg_ls])
    n_evaluations = min(n_evaluations,
                        clustering_min_size * dim_reduction_min_size)
    initial_cfgs_ls = initial_cfgs_ls[0:n_evaluations] \
        if initial_cfgs_ls is not None else None
    self._log('Truncated n_evaluations: {}'.format(n_evaluations),
              min_verbose_level=1)

    # define scenario object to be passed into SMAC
    scenario_params = {
        "run_obj": run_obj,
        "runcount-limit": n_evaluations,
        "cutoff_time": cutoff_time,
        "cs": cs,
        "deterministic": "true",
        "output_dir": LogUtils.create_new_directory('{}/smac'.format(self.log_dir)),
        "abort_on_first_run_crash": False,
    }
    scenario = Scenario(scenario_params)
    self._log('{}'.format(scenario_params), min_verbose_level=2)

    # functions required for SMAC optimization
    def fit_models(cfg, data):
        ################################################
        #               Preprocessing                  #
        ################################################
        # fit standard scaler
        scaler = preprocessing.StandardScaler()
        scaler.fit(data)

        # standardize data
        scaled_data = scaler.transform(data)

        ################################################
        #          Dimensionality reduction            #
        ################################################
        # get the dimension reduction method chosen
        dim_reduction_alg = Mapper.getClass(
            cfg.get("dim_reduction_choice", None))
        dim_reduction_model = None

        # fit dimension reduction model
        compressed_data = scaled_data
        if dim_reduction_alg:
            cfg_dim_reduction = {
                StringUtils.decode_parameter(k, dim_reduction_alg.name): v
                for k, v in cfg.items()
                if StringUtils.decode_parameter(k, dim_reduction_alg.name) is not None
            }

            # compress the data using the chosen configuration
            dim_reduction_model = dim_reduction_alg.model(**cfg_dim_reduction)
            compressed_data = dim_reduction_model.fit_transform(scaled_data)

        ################################################
        #                 Clustering                   #
        ################################################
        # get the model chosen
        clustering_alg = Mapper.getClass(cfg["clustering_choice"])

        # decode the encoded parameters
        cfg_clustering = {
            StringUtils.decode_parameter(k, clustering_alg.name): v
            for k, v in cfg.items()
            if StringUtils.decode_parameter(k, clustering_alg.name) is not None
        }

        # train clustering model
        clustering_model = clustering_alg.model(**cfg_clustering)
        clustering_model.fit(compressed_data)

        return scaler, dim_reduction_model, clustering_model

    def cfg_to_dict(cfg):
        # convert cfg into a dictionary
        cfg = {k: cfg[k] for k in cfg if cfg[k]}
        # remove keys with value == None
        return {k: v for k, v in cfg.items() if v is not None}

    def evaluate_model(cfg):
        # get cfg as dictionary
        cfg = cfg_to_dict(cfg)

        # logging
        self._log("Fitting configuration: \n{}".format(cfg),
                  min_verbose_level=1)

        ################################################
        #           K-fold cross validation            #
        ################################################
        kf = model_selection.KFold(n_splits=n_folds,
                                   shuffle=True,
                                   random_state=seed)
        kf.get_n_splits(processed_data_np)

        # store score obtained by each fold
        score_ls = []

        for train_idx, valid_idx in kf.split(processed_data_np):
            # split data into train and validation sets
            train_data = processed_data_np[train_idx]
            valid_data = processed_data_np[valid_idx]

            # fit clustering and dimension reduction models on training data
            scaler, dim_reduction_model, clustering_model = fit_models(
                cfg, train_data)

            # test on validation data
            scaled_valid_data = scaler.transform(valid_data)
            compressed_valid_data = scaled_valid_data

            if dim_reduction_model:
                try:
                    compressed_valid_data = dim_reduction_model.transform(
                        scaled_valid_data)
                except:
                    compressed_valid_data = dim_reduction_model.fit_transform(
                        scaled_valid_data)

            # predict on validation data
            if hasattr(clustering_model, 'fit_predict'):
                y_pred = clustering_model.fit_predict(compressed_valid_data)
            else:
                y_pred = clustering_model.predict(compressed_valid_data)

            # evaluate using the provided evaluator
            score = evaluator(X=compressed_valid_data, y_pred=y_pred)
            score_ls.append(score)

            # a non-finite fold score invalidates this configuration,
            # so there is no point continuing the evaluation
            if not np.isfinite(score):
                break

        if not np.all(np.isfinite(score_ls)):
            score = float('inf')
        else:
            score = np.mean(score_ls)

        self._log("Score obtained by this configuration: {}".format(score),
                  min_verbose_level=1)
        return score

    optimal_config = None

    if optimizer == 'smac':
        # reset
        self._random_optimizer_obj = None

        # run SMAC to optimize
        smac_params = {
            "scenario": scenario,
            "rng": np.random.RandomState(seed),
            "tae_runner": evaluate_model,
            "initial_configurations": initial_cfgs_ls,
        }
        self._smac_obj = SMAC(**smac_params)
        optimal_config = self._smac_obj.optimize()
        time_spent = round(self._smac_obj.stats.get_used_wallclock_time(), 2)

    elif optimizer == 'random':
        # reset
        self._smac_obj = None

        # run random optimizer
        t0 = time.time()
        self._random_optimizer_obj = RandomOptimizer(
            random_seed=seed,
            blackbox_function=evaluate_model,
            config_space=cs)
        optimal_config, score = self._random_optimizer_obj.optimize(
            n_evaluations=n_evaluations, cutoff=cutoff_time)
        time_spent = round(time.time() - t0, 2)

    # refit to get the optimal model
    self._scaler, self._dim_reduction_model, self._clustering_model = fit_models(
        cfg_to_dict(optimal_config), processed_data_np)

    self._log("Optimization is complete.", min_verbose_level=1)
    self._log("Took {} seconds.".format(time_spent), min_verbose_level=1)
    self._log("The optimal configuration is \n{}".format(optimal_config),
              min_verbose_level=1)

    # return a dictionary
    result = {
        "cluster_alg_ls": cluster_alg_ls,
        "dim_reduction_alg_ls": dim_reduction_alg_ls,
        "random_optimizer_obj": self._random_optimizer_obj,
        "smac_obj": self._smac_obj,
        "optimal_cfg": optimal_config,
        "metafeatures": metafeatures_np,
        "metafeatures_used": metafeatures_ls,
        "clustering_model": self._clustering_model,
        "dim_reduction_model": self._dim_reduction_model,
        "scaler": self._scaler
    }
    return result
print(grid_lsvm_estimator.score(X_train1, y_train))
final_model = grid_lsvm_estimator.best_estimator_
final_model.coef_
final_model.intercept_

gbm_estimator = GradientBoostingClassifier(random_state=2017)
gbm_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.001, 0.01, 0.2, 0.3]
}
grid_gbm_estimator = model_selection.GridSearchCV(gbm_estimator,
                                                  gbm_grid,
                                                  scoring="roc_auc",
                                                  cv=10,
                                                  n_jobs=1)
grid_gbm_estimator.fit(X_train, y_train)
# note: grid_scores_ exists only in older sklearn; newer versions use cv_results_
print(grid_gbm_estimator.grid_scores_)
print(grid_gbm_estimator.best_score_)
print(grid_gbm_estimator.best_params_)
print(grid_gbm_estimator.score(X_train, y_train))

isf = ensemble.IsolationForest(random_state=2017)

X_test = total_data2[train_data.shape[0]:]
pca = decomposition.PCA()
pca.fit(X_test)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())

pca = decomposition.PCA(150)
pca.fit(X_test)
X_test1 = pca.transform(X_test)
X_test.shape
X_test.info()

test_data['target'] = dt_estimator.predict_proba(X_test1)
def _get_best_detector(self, train):
    detector = ensemble.IsolationForest()
    detector.fit(train)
    return detector
def remove_outliers(G, terrorist_graph, threshold, train_set, predict_set):
    rng = np.random.RandomState(42)

    # fit the model
    # X_Red=[X_train[i] for i in range(len(X_train)) if Y_train[i]==1 ]
    X_Blue = {}
    X_Red = {}
    keys = []
    for point in train_set:
        if train_set[point][1] == 0:
            X_Blue[point] = list(train_set[point][0].values())
            keys = list(train_set[point][0].keys())
        else:
            X_Red[point] = list(train_set[point][0].values())

    blue_train = [X_Blue[i] for i in X_Blue.keys()]
    blue_keys = list(X_Blue.keys())

    if len(blue_train) > 0:
        # clf=neigh.LocalOutlierFactor()
        clf = en.IsolationForest(max_samples=100, random_state=rng)
        clf.fit(blue_train)
        y_pred_train = clf.predict(blue_train)

        no_red_among_blue = [n for n in blue_keys if G.node[n]['color'] == "Red"]
        no_identified = 0
        no_removed = 0
        for j in range(len(y_pred_train)):
            if y_pred_train[j] != 1:
                if blue_keys[j] in no_red_among_blue:
                    no_identified += 1
                no_monitors = terrorist_graph.node[blue_keys[j]]["MonitorNumber"]
                # print no_monitors
                # prob_red=pow(prob_lying,no_monitors)
                # print prob_red
                # if terrorist_graph.node[blue_keys[j]]["MonitorNumber"]<4:
                # or terrorist_graph.node[blue_keys[j]]["RedConfidence"]>0.1 :
                if no_monitors < threshold + 1:
                    # print no_monitors
                    predict_set[blue_keys[j]] = train_set[blue_keys[j]][0]
                    no_removed += 1
                    # predict_set.append()
                    del train_set[blue_keys[j]]
                    terrorist_graph.node[blue_keys[j]]["color"] = "Black"
                    terrorist_graph.node[blue_keys[j]]['tempColor'] = "Black"
                    terrorist_graph.node[blue_keys[j]]["IsMonitor"] = False

    # print (no_identified,len(no_red_among_blue),no_removed)
    return train_set, predict_set, terrorist_graph
def outliers_isolation_forest(sparse_data):
    iso_forest = ensemble.IsolationForest(contamination=0.15)
    iso_forest.fit(sparse_data)
    y = iso_forest.predict(sparse_data)
    # list of outlier samples
    print([i for i in range(len(y)) if y[i] < 0])
def ex_1():
    X, y = datasets.fetch_openml('diabetes', as_frame=True, return_X_y=True)
    # print(X)
    # print(X.info())
    # print(X.describe())
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=42)
    X_train_2 = X_train.copy()

    plt.figure()
    X_train.boxplot()
    X_train.hist(bins=20)
    plt.figure()
    sns.boxplot(x=X_train['mass'])

    imputer_mass = impute.SimpleImputer(missing_values=0.0, strategy='mean')
    imputer_skin = impute.SimpleImputer(missing_values=0.0, strategy='mean')
    X_train[['mass']] = imputer_mass.fit_transform(X_train[['mass']])
    X_train[['skin']] = imputer_skin.fit_transform(X_train[['skin']])
    X_test[['mass']] = imputer_mass.transform(X_test[['mass']])
    X_test[['skin']] = imputer_skin.transform(X_test[['skin']])
    df_mass = X_train[['mass']]
    # print(df_mass.head(5))

    # Anomaly detection, i.e. spotting outlying data
    X_train_isolation = X_train.values
    X_train_isolation = X_train_isolation[:, [1, 5]]
    X_test_isolation = X_test.values
    X_test_isolation = X_test_isolation[:, [1, 5]]
    isolation_forest = ensemble.IsolationForest(contamination=0.05)
    isolation_forest.fit(X_train_isolation)
    y_predicted_outliers = isolation_forest.predict(X_test_isolation)
    print(y_predicted_outliers)
    plot_iris2d(X_test_isolation, y_predicted_outliers)

    clf = svm.SVC(random_state=42)
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    print(metrics.classification_report(y_test, y_predicted))

    X_train.hist()
    # IterativeImputer also requires: from sklearn.experimental import enable_iterative_imputer
    imputer_it = impute.IterativeImputer(missing_values=0.0)
    X_train_2[['mass']] = imputer_it.fit_transform(X_train_2[['mass']])
    X_train_2[['skin']] = imputer_it.fit_transform(X_train_2[['skin']])
    X_train_2.hist(bins=20)
    plt.figure()
    X_train_2.boxplot()

    clf_rf = ensemble.RandomForestClassifier(random_state=42)
    clf_rf.fit(X_train, y_train)
    y_predicted = clf_rf.predict(X_test)
    print(metrics.classification_report(y_test, y_predicted))

    importances = clf_rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf_rf.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(X.shape[1]):
        print("%d. feature %d (%f)" %
              (f + 1, indices[f], importances[indices[f]]))

    # Plot the impurity-based feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]),
            importances[indices],
            color="r",
            yerr=std[indices],
            align="center")
    plt.xticks(range(X.shape[1]), indices)
    plt.xlim([-1, X.shape[1]])
    plt.show()
# Anomaly detection assuming an elliptical distribution
if 0:
    from sklearn import covariance
    contamination = 0.05  # expected proportion of outliers; must be set
    clf = covariance.EllipticEnvelope(assume_centered=False,
                                      support_fraction=None,
                                      contamination=contamination,
                                      random_state=42)
    clf.fit(BigFeatures)
    y_detection = clf.predict(BigFeatures)
    print(BigSamplenames[y_detection == -1])

# Isolation Forest anomaly detection, well suited to high-dimensional datasets
if 1:
    print('******************** Removing abnormal samples ********************\n')
    from sklearn import ensemble
    contamination = 0.05  # expected proportion of outliers; must be set
    clf = ensemble.IsolationForest(max_samples='auto',
                                   contamination=contamination,
                                   max_features=1.0,
                                   bootstrap=False,
                                   random_state=42)
    clf.fit(BigFeatures)
    y_detection = clf.predict(BigFeatures)
    print('Abnormal samples:\n', BigSamplenames[y_detection == -1])
    Samplenames, Labels, Features = \
        BigSamplenames[y_detection != -1], BigLabels[y_detection != -1], BigFeatures[y_detection != -1, :]

# OCSVM anomaly detection; its hyperparameters are hard to set well
if 0:
    from sklearn import svm
    clf = svm.OneClassSVM(kernel='rbf', nu=0.5, max_iter=-1, random_state=42)
    clf.fit(BigFeatures)
    y_detection = clf.predict(BigFeatures)
    print(BigSamplenames[y_detection == -1])

if 1:
import numpy as np
import pandas as pd
import kmapper as km
import sklearn
from sklearn import ensemble

# For data we use the Wisconsin Breast Cancer Dataset
# Via: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
df = pd.read_csv("data.csv")
feature_names = [c for c in df.columns if c not in ["id", "diagnosis"]]
df["diagnosis"] = df["diagnosis"].apply(lambda x: 1 if x == "M" else 0)

X = np.array(df[feature_names].fillna(0))  # quick and dirty imputation
y = np.array(df["diagnosis"])

# We create a custom 1-D lens with Isolation Forest
model = ensemble.IsolationForest(random_state=1729)
model.fit(X)
lens1 = model.decision_function(X).reshape((X.shape[0], 1))

# We create another 1-D lens with L2-norm
mapper = km.KeplerMapper(verbose=3)
lens2 = mapper.fit_transform(X, projection="l2norm")

# Combine both lenses to create a 2-D [Isolation Forest, L^2-Norm] lens
lens = np.c_[lens1, lens2]

# Create the simplicial complex
graph = mapper.map(lens,
                   X,
                   cover=km.Cover(n_cubes=15, perc_overlap=0.7),
                   clusterer=sklearn.cluster.KMeans(n_clusters=2,
def iso_forest(X):
    clf = ensemble.IsolationForest(max_samples=X.shape[0], random_state=None)
    return clf.fit(X)
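# Quick check of iso_forest() above on synthetic data (illustrative only).
# Note that max_samples=X.shape[0] draws every row for each tree, instead of
# the library's 'auto' default of min(256, n_samples).
import numpy as np
from sklearn import ensemble

X_demo = np.random.randn(300, 5)
clf = iso_forest(X_demo)
print((clf.predict(X_demo) == -1).sum(), "points flagged as anomalous")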
def trainAnomalyModel(self, data, logFolder, newPMMLFileName, lock, kwargs):
    print('here' * 20, kwargs)
    paramToTrainModel = kwargs['data']
    idforData = kwargs['idforData']
    dataPath = kwargs['filePath']
    try:
        targetVar = kwargs['target_variable']
    except:
        targetVar = None
    algorithmToUse = kwargs['parameters']['algorithm']

    projectName = idforData
    projectPath = logFolder + projectName
    dataFolder = projectPath + '/dataFolder/'
    statusfileLocation = dataFolder + 'status' + '.txt'

    def upDateStatus():
        lock.acquire()
        sFile = open(statusfileLocation, 'r')
        sFileText = sFile.read()
        lock.release()
        data_details = json.loads(sFileText)
        return data_details

    try:
        dataMapperInner = autoMLutilities.createDataMapper(
            paramToTrainModel, targetVar)
    except Exception as e:
        data_details = upDateStatus()
        data_details['status'] = 'Training Failed'
        data_details['errorMessage'] = \
            'Error while creating DataFrameMapper >> ' + str(e)
        data_details['errorTraceback'] = traceback.format_exc()
        with open(statusfileLocation, 'w') as filetosave:
            json.dump(data_details, filetosave)
        # sys.exit()
        return

    mapper1 = DataFrameMapper(dataMapperInner)
    featureVar = list(data.columns)

    if algorithmToUse == 'IsolationForest':
        print('came here')
        from sklearn import ensemble
        modelT = ensemble.IsolationForest()
    elif algorithmToUse == 'OneClassSVM':
        from sklearn import svm
        modelT = svm.OneClassSVM()
    elif algorithmToUse == 'LinearSVR':
        print('Came to SVR')
        from sklearn import svm
        modelT = svm.LinearSVR()
    # else:
    #     data_details=upDateStatus()
    #     data_details['status']='Training Failed'
    #     data_details['errorMessage']='Model not supported >> '
    #     data_details['errorTraceback']='None'
    #     with open(statusfileLocation,'w') as filetosave:
    #         json.dump(data_details, filetosave)
    #     # sys.exit()
    #     return

    try:
        print('training started')
        pipeline = Pipeline([('feature_mapper', mapper1), ('model', modelT)])
        pipelObj = pipeline.fit(data)
        print('training completed')
    except Exception as e:
        data_details = upDateStatus()
        data_details['status'] = 'Training Failed'
        data_details['errorMessage'] = \
            'Error while preparing Data and training model >> ' + str(e)
        data_details['errorTraceback'] = traceback.format_exc()
        with open(statusfileLocation, 'w') as filetosave:
            json.dump(data_details, filetosave)
        # sys.exit()
        return

    data_details = upDateStatus()
    data_details['listOfModelAccuracy'] = []
    data_details['pmmlFilelocation'] = ''
    with open(statusfileLocation, 'w') as filetosave:
        json.dump(data_details, filetosave)

    finalPMMLfile = '../ZMOD/Models/' + newPMMLFileName
    toExportDict = {
        'model1': {
            'data': None,
            'hyperparameters': None,
            'preProcessingScript': None,
            'pipelineObj': Pipeline(pipelObj.steps[:-1]),
            'modelObj': pipelObj.steps[-1][1],
            'featuresUsed': featureVar,
            'targetName': None,
            'postProcessingScript': None,
            'taskType': 'score'
        }
    }
    try:
        print('toExportDict >>>>>>>>>>>> ', toExportDict)
        from nyoka.skl.skl_to_pmml import model_to_pmml
        model_to_pmml(toExportDict, PMMLFileName=finalPMMLfile)
        print('>>>>>>>>>>>>>>>>>>>>>>> Success')
    except Exception as e:
        data_details = upDateStatus()
        data_details['status'] = 'Training Failed'
        data_details['errorMessage'] = 'Error while Saving Model >> ' + str(e)
        data_details['errorTraceback'] = traceback.format_exc()
        with open(statusfileLocation, 'w') as filetosave:
            json.dump(data_details, filetosave)
        # sys.exit()
        return
        print('>>>>>>>>>>>>>>>>>>>>>>> Failed Saving Trying again')

    with open(statusfileLocation, 'r') as sFile:
        sFileText = sFile.read()
    model_accuracy = []
    data_details = json.loads(sFileText)
    data_details['status'] = 'Complete'
    data_details['pmmlFilelocation'] = finalPMMLfile
    data_details['listOfModelAccuracy'] = model_accuracy
    with open(statusfileLocation, 'w') as filetosave:
        json.dump(data_details, filetosave)
def main():
    URLKeyword, URLchar, action, title = get_dic()

    # load training dataset
    mainfile = './data/file_list_20170430_new的副本.txt'
    WebDirectory = './data/file的副本/'
    MD5_list, flag_list, URL_list = traverse_directory(WebDirectory, mainfile)
    X_train = list()
    Y_train = flag_list
    for i in range(len(MD5_list)):
        URL = URL_list[i]
        Web_data = read_file(MD5_list[i])
        web_vec = Web_feature(Web_data, title, action, MD5_list[i])
        URL_vec = URL_feature(URL, URLKeyword, URLchar)
        feature = np.hstack((web_vec, URL_vec))
        X_train.append(feature)
        # print(len(feature))
    print(len(X_train), len(Y_train))
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    # feature selection
    X_train, Y_train, F_index = feature_selection(X_train, Y_train)
    print(F_index)

    # parameter selection
    # tuned_parameters = {'n_estimators':range(10,100,10),"max_depth":range(3,25,2),"max_features":range(3,20,2)}
    # split dataset
    # X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.25, random_state=0, stratify=Y_train)

    # train models
    clf1 = ensemble.RandomForestClassifier(bootstrap=True,
                                           criterion='gini',
                                           max_depth=21,
                                           max_features=15,
                                           n_estimators=10)
    clf1.fit(X_train, Y_train)
    clf2 = ensemble.IsolationForest(contamination=0.06,
                                    n_estimators=90,
                                    max_samples=150,
                                    bootstrap=True)
    clf2.fit(X_train, Y_train)  # IsolationForest.fit ignores the labels
    clf3 = XGBClassifier(learning_rate=0.5,
                         max_depth=6,
                         n_estimators=100,
                         objective="multi:softmax",
                         num_class=2)
    clf3.fit(X_train, Y_train)
    # print("best parameter:",clf.best_params_)
    # print(clf.grid_scores_)
    # joblib.dump(clf,'RF_model.m')
    """
    print("Traing Score:%f" % clf.score(X_train, Y_train))
    # print("Testing Score:%f"%clf.score(X_test,y_test))
    middle = time.clock()
    print(middle-start)
    """

    # load testing dataset
    mainfile1 = './data/file_list_10000.txt'
    WebDirectory1 = './data/file1/'
    MD5_list1, flag_list1, URL_list1 = traverse_directory_t(
        WebDirectory1, mainfile1)
    X_test = list()
    Y_test = flag_list1
    for h in range(len(MD5_list1)):
        s_fea = []
        URL1 = URL_list1[h]
        Web_data1 = read_file(MD5_list1[h])
        web_vec1 = Web_feature(Web_data1, title, action, MD5_list1[h])
        URL_vec1 = URL_feature(URL1, URLKeyword, URLchar)
        feature1 = np.hstack((web_vec1, URL_vec1))
        for j in F_index:
            s_fea.append(feature1[j])
        X_test.append(s_fea)
        # print("********")
    print(len(X_test), len(Y_test))

    # test the models
    y_score_1 = clf1.predict_proba(X_test)[:, 1]
    y_score_2 = clf2.decision_function(X_test)
    y_score_3 = clf3.predict_proba(X_test)[:, 1]
    fig_plot(Y_test, y_score_1, y_score_2, y_score_3)
def outliers_isolation_forest_dense(matrix_data):
    iso_forest = ensemble.IsolationForest(contamination=0.10, behaviour='new')
    iso_forest.fit(matrix_data)
    y = iso_forest.predict(matrix_data)
    # list of outlier samples
    print([i for i in range(len(y)) if y[i] < 0])
### Keep in mind the high percentage of overlap.                                                                    ###
#######################################################################################################################
# Create a custom 1-D lens with **Isolation Forest**                                                                ###
# Return the anomaly score of each sample using the IsolationForest algorithm.                                      ###
# The IsolationForest 'isolates' observations by randomly selecting a feature and then randomly                     ###
# selecting a split value between the maximum and minimum values of the selected feature.                           ###
# Since recursive partitioning can be represented by a tree structure, the number of splittings                     ###
# required to isolate a sample is equivalent to the path length from the root node to the terminating node.         ###
# This path length, averaged over a forest of such random trees, is a measure of normality and our decision         ###
# function. Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random     ###
# trees collectively produces shorter path lengths for particular samples, they are highly likely to be anomalies.  ###
#######################################################################################################################

# Create a custom 1-D lens with Isolation Forest
model = ensemble.IsolationForest(
    random_state=1729)  # if int, random_state is the seed used by the random number generator
model.fit(X)
lens1 = model.decision_function(X).reshape((X.shape[0], 1))

# Create another 1-D lens with L2-norm
mapper = km.KeplerMapper(verbose=0)
lens2 = mapper.fit_transform(X, projection="l2norm")

# Combine both lenses to get a 2-D [Isolation Forest, L^2-Norm] lens
lens = np.c_[lens1, lens2]

###########################################################################################################################################
### Application of Affinity Propagation clustering from the SKLearn library                                                             ##
#                                                                                                                                        ##
# AffinityPropagation creates clusters by sending messages between pairs of samples until convergence.                                  ##
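# A small self-contained illustration (not part of the original pipeline) of the
# decision_function behaviour described above: inliers drawn from a tight
# Gaussian receive higher scores than a handful of injected outliers.
import numpy as np
from sklearn import ensemble

rng = np.random.RandomState(1729)
inliers = rng.randn(200, 2)                              # dense Gaussian blob
outliers = rng.uniform(low=-6, high=6, size=(10, 2))     # scattered far points
X_demo = np.vstack([inliers, outliers])

clf = ensemble.IsolationForest(random_state=1729).fit(X_demo)
scores = clf.decision_function(X_demo)
print("mean inlier score :", scores[:200].mean())   # typically positive
print("mean outlier score:", scores[200:].mean())   # typically negative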
def filter_outliers(x, y, **kwargs):
    xy = np.column_stack((x, y))
    filter_estimator = ensemble.IsolationForest(random_state=42, **kwargs)
    filter_estimator.fit(xy)
    is_inlier = filter_estimator.predict(xy)
    return x[is_inlier == 1], y[is_inlier == 1]
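# Example call for filter_outliers() above with illustrative data: a noisy
# linear relationship with a few injected outliers. Extra IsolationForest
# keyword arguments pass straight through via **kwargs.
import numpy as np
from sklearn import ensemble

x = np.linspace(0, 10, 100)
y = 2 * x + np.random.normal(0, 0.5, 100)
y[::25] += 15  # inject a few outliers
x_clean, y_clean = filter_outliers(x, y, contamination=0.05)
print(len(x), "->", len(x_clean), "points after filtering")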
def iso_forest(self, label, result_list):
    x_train = self.train_test_split['x_train']
    clf = ensemble.IsolationForest(max_samples=x_train.shape[0],
                                   random_state=None)
    return execute_decision_function(clf, self.train_test_split, label,
                                     result_list, self.image_creator,
                                     unsupervised=True)
    return Y_pred_collection, Y


if __name__ == "__main__":
    np.random.seed(712)
    sz_t = 14
    sz_height_span = [1, ]
    sz_image_size = 24
    sz_downsample_size = 3

    rf_learner = ensemble.RandomForestClassifier(n_estimators=100, n_jobs=4)
    xgc_learner = xgb.XGBClassifier(max_depth=5,
                                    learning_rate=0.1,
                                    n_estimators=50,
                                    min_child_weight=1,
                                    subsample=1,
                                    colsample_bytree=1)
    isof_learner = ensemble.IsolationForest(contamination=0.001,
                                            max_samples=0.5)

    cross_validataion_classification_one_timeslot(sz_t, sz_height_span,
                                                  sz_image_size,
                                                  sz_downsample_size,
                                                  xgc_learner,
                                                  detect_outlier=False)
ax = mtp.axes()
xx, yy = make_meshgrid(outd5x[:, 0], outd5x[:, 1])
plot_contours(ax, model, xx, yy, cmap=mtp.cm.coolwarm)
ax.scatter(outd5x[:, 0], outd5x[:, 1], c=outd5y, edgecolors='black')
ax.set_xlabel('feature 1')
ax.set_ylabel('feature 2')
purple_patch = mpatches.Patch(color='purple', label='class 0')
yellow_patch = mpatches.Patch(color='yellow', label='class 1')
mtp.legend(handles=[purple_patch, yellow_patch])
mtp.title(
    "Outlier removed data 5 plot with decision boundary using One class SVM (linear kernel)"
)
mtp.show()

# Using Isolation Forest
# note: sklearn restricts contamination to the range (0, 0.5], so 0.52 raises
# a ValueError there
model = ensemble.IsolationForest(contamination=0.52)
out4_0 = model.fit_predict(d4x0)
# print(out4_0)
out4_1 = model.fit_predict(d4x1)
outd4x = np.zeros(shape=(np.count_nonzero(out4_0 == 1) +
                         np.count_nonzero(out4_1 == 1), 2))
outd4y = np.zeros(
    (np.count_nonzero(out4_0 == 1) + np.count_nonzero(out4_1 == 1)))
# d4o = np.zeros(shape=(np.count_nonzero(out4_0 == -1), 2))
# d4y0 = np.zeros(np.count_nonzero(d4y == 0))
# d4y1 = np.ones(np.count_nonzero(d4y == 1))
a = 0
for i in range(len(d4x0)):
    if out4_0[i] == 1:
import matplotlib.pyplot as plt
from sklearn import ensemble
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, plot_confusion_matrix
from bunch import Bunch

import database_config
import config as cfg

# Configuration
TEST_SET_SIZE = 1500
VIEW_GRAPH = False

# Model Setup
# isolation_forest = ensemble.IsolationForest(n_estimators=50, max_features=3, random_state=cfg.RANDOM_SEED_MODEL)
isolation_forest = ensemble.IsolationForest(n_estimators=50,
                                            max_features=3,
                                            contamination=0.10,
                                            random_state=cfg.RANDOM_SEED_MODEL)

# Get Data
database_config.db.load(cfg.STORAGE_BASE_PATH_SIMULATED_DATA)
changes_df = database_config.db.get_table('MTA_CHANGES').get_data()
update_changes_df = pd.DataFrame(
    changes_df[changes_df['change_type'] == 'update'])
update_changes_df['price_delta'] = update_changes_df.old_value.astype(
    float) - update_changes_df.new_value.astype(float)

dataset = Bunch(
    # data=update_changes_df[['price_delta']].values,
    # data=update_changes_df[['old_value', 'new_value']].values,
    data=update_changes_df[['old_value', 'new_value', 'price_delta']].values,
print(features.size)
features = sklearn.preprocessing.scale(features)
train_unlabeled = sklearn.preprocessing.scale(np.array(train_unlabeled))

# gnb = nb.MultinomialNB()
# gnb.fit(features,lables)
#
# yresult = gnb.predict(train_unlabeled)
# np.savetxt('gaussianNB.csv',yresult,delimiter=',')

validate = np.loadtxt(open('benchmark2100.csv'), delimiter=",")
valiStored = validate
train_unlabeledStored = train_unlabeled
# print(accuracy_score(validate,yresult))

# `en` is assumed to alias sklearn.ensemble
iso = en.IsolationForest(n_estimators=100,
                         max_samples='auto',
                         contamination=0.3,
                         max_features=128,
                         bootstrap=False,
                         n_jobs=-1,
                         random_state=None,
                         verbose=2)
iso.fit(train_unlabeled, validate)  # IsolationForest.fit ignores the labels
truthTable = iso.predict(train_unlabeled)

inlier = []
inlierLabel = []
count = 0
for i in range(21000):
    if truthTable[i] == 1:
        temp = train_unlabeledStored[i, ]
        inlier = np.append(inlier, temp)
        inlierLabel = np.append(inlierLabel, valiStored[i])
inlier = np.reshape(inlier, [int(len(inlier) / 128), 128])
all_features = np.concatenate((features, inlier), axis=0)
from common_utils import *
from outlier_utils import *
from feature_reduction_utils import *
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing, tree, covariance, linear_model, ensemble, neighbors, svm, model_selection, feature_selection, kernel_ridge
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
import numpy as np

card_data = pd.read_csv(os.path.join(path, 'creditcard.csv'))
card_data.info()

X = drop_features(card_data, ['Time', 'Amount', 'Class'])
y = card_data['Class']

tnse_data = feature_reduction_tsne(X, 3)
plot_data_3d_outliers(tnse_data, y, title="Credit card data")

iso_forest_estimator = ensemble.IsolationForest()
iso_forest_grid = {'contamination': [0.1, 0.2, 0.25, 0.3]}
grid_search_plot_models_outliers(iso_forest_estimator,
                                 iso_forest_grid,
                                 X,
                                 y,
                                 xlim=[-7, 7],
                                 ylim=[-7, 7])
iso_best_model = grid_search_best_model_outliers(iso_forest_estimator,
                                                 iso_forest_grid,
                                                 X,
                                                 y,
                                                 scoring='roc_auc')
plot_model_2d_outliers(iso_best_model, X, y)