def create_model_for_ensemble(x_train, y_train, params=None, times_to_repeat=1):
    """Build an XGBoost classifier for the ensemble.

    Over `times_to_repeat` train/validation splits, records the min/max/mean
    of the raw validation scores and the best fixed-FPR threshold (searched on
    min-max rescaled scores), then refits a final classifier on the full
    training set.

    Returns (model, mean_threshold, mean_min, mean_max, mean_score_mean).
    """

    def min_max_rescaling(values, min_, max_):
        # Map every score into [0, 1] relative to the observed extremes.
        return [(value - min_) / (max_ - min_) for value in values]

    def new_classifier():
        # Fresh classifier configured from the chosen hyper-parameters.
        return xgb.XGBClassifier(subsample=params["subsample"],
                                 min_child_weight=params["min_child_weight"],
                                 max_depth=params["max_depth"],
                                 learning_rate=params["learning_rate"],
                                 gamma=params["gamma"],
                                 colsample_bytree=params["colsample_bytree"])

    if not params:
        # Fall back to the tuned parameters for the active dataset/feature mode.
        if constants.USING_AGGREGATED_FEATURES:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_REAL_DATASET_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_OLD_DATASET_AGGREGATED
        else:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_REAL_DATASET_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_OLD_DATASET_NO_AGGREGATED

    thresholds, mins, maxes, score_means = [], [], [], []
    for iteration in range(times_to_repeat):
        print("iteration: ", iteration, "of", times_to_repeat)
        _x_val, _y_val, _x_train, _y_train = evaluation.get_val_test_set(
            x_train, y_train, 0.25)
        candidate = new_classifier()
        candidate.fit(_x_train, _y_train)
        scores = candidate.predict_proba(_x_val)[:, 1]
        lo, hi = min(scores), max(scores)
        mins.append(lo)
        maxes.append(hi)
        score_means.append(np.array(scores).mean())
        thresholds.append(evaluation.find_best_threshold_fixed_fpr(
            _y_val, min_max_rescaling(scores, lo, hi)))

    final_model = new_classifier()
    final_model.fit(x_train, y_train)
    return (final_model, np.array(thresholds).mean(), np.array(mins).mean(),
            np.array(maxes).mean(), np.array(score_means).mean())
def create_model_for_ensemble(x_train, y_train, params=None, times_to_repeat=1):
    """Build a random-forest classifier for the ensemble.

    Over `times_to_repeat` train/validation splits, records the min/max/mean
    of the raw validation scores and the best fixed-FPR threshold (searched on
    min-max rescaled scores), then refits a final forest on the full training
    set.

    Returns (model, mean_threshold, mean_min, mean_max, mean_score_mean).
    """

    def min_max_rescaling(values, min_, max_):
        # Map every score into [0, 1] relative to the observed extremes.
        return [(value - min_) / (max_ - min_) for value in values]

    def new_forest():
        # Fresh forest configured from the chosen hyper-parameters.
        return RandomForestClassifier(n_estimators=params["n_estimators"],
                                      min_samples_split=params["min_samples_split"],
                                      min_samples_leaf=params["min_samples_leaf"],
                                      max_features=params["max_features"],
                                      max_depth=params["max_depth"],
                                      bootstrap=params["bootstrap"])

    if not params:
        # Fall back to the tuned parameters for the active dataset/feature mode.
        if constants.USING_AGGREGATED_FEATURES:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_RF_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_RF_REAL_DATASET_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_RF_OLD_DATASET_AGGREGATED
        else:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_RF_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_RF_REAL_DATASET_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_RF_OLD_DATASET_NO_AGGREGATED

    thresholds, mins, maxes, score_means = [], [], [], []
    for iteration in range(times_to_repeat):
        print("iteration: ", iteration, "of", times_to_repeat)
        _x_val, _y_val, _x_train, _y_train = evaluation.get_val_test_set(
            x_train, y_train, 0.25)
        candidate = new_forest()
        candidate.fit(_x_train, _y_train)
        scores = candidate.predict_proba(_x_val)[:, 1]
        lo, hi = min(scores), max(scores)
        mins.append(lo)
        maxes.append(hi)
        score_means.append(np.array(scores).mean())
        thresholds.append(evaluation.find_best_threshold_fixed_fpr(
            _y_val, min_max_rescaling(scores, lo, hi)))

    final_model = new_forest()
    final_model.fit(x_train, y_train)
    return (final_model, np.array(thresholds).mean(), np.array(mins).mean(),
            np.array(maxes).mean(), np.array(score_means).mean())
def create_fit_model(x_train, y_train, look_back, params=None, times_to_repeat=1):
    """Train an LSTM classifier and estimate its decision threshold.

    Averages the best fixed-FPR threshold over `times_to_repeat`
    train/validation splits, then fits a final network on the full training
    set.

    Returns (model, mean_threshold).
    """
    if not params:
        # Fall back to the tuned parameters for the active dataset/feature mode.
        if constants.USING_AGGREGATED_FEATURES:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_LSTM_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_LSTM_REAL_DATASET_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_LSTM_OLD_DATASET_AGGREGATED
        else:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_LSTM_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_LSTM_REAL_DATASET_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_LSTM_OLD_DATASET_NO_AGGREGATED

    threshold_sum = 0
    for iteration in range(times_to_repeat):
        print("iteration: ", iteration, "of", times_to_repeat)
        _x_val, _y_val, _x_train, _y_train = evaluation.get_val_test_set(
            x_train, y_train, 0.25)
        # create_model here is the LSTM builder (layers, dropout, look_back,
        # number of features per time step).
        net = create_model(params["layers"], params["dropout_rate"],
                           look_back, len(x_train[0, 0]))
        net.fit(_x_train, _y_train, epochs=params["epochs"], verbose=1,
                batch_size=params["batch_size"])
        threshold_sum += evaluation.find_best_threshold_fixed_fpr(
            _y_val, net.predict(_x_val))

    # model = create_simple_model(x_train, y_train, look_back)
    net = create_model(params["layers"], params["dropout_rate"], look_back,
                       len(x_train[0, 0]))
    net.fit(x_train, y_train, epochs=params["epochs"], verbose=1,
            batch_size=params["batch_size"])
    return net, threshold_sum / times_to_repeat
def create_model(x_train, y_train, params=None, times_to_repeat=1):
    """Train an XGBoost classifier and estimate its decision threshold.

    The best fixed-FPR threshold is collected over `times_to_repeat`
    train/validation splits (mean and std are printed for inspection), then a
    final classifier is fitted on the full training set.

    Returns (model, mean_threshold).
    """

    def new_classifier():
        # Fresh classifier configured from the chosen hyper-parameters.
        return xgb.XGBClassifier(subsample=params["subsample"],
                                 min_child_weight=params["min_child_weight"],
                                 max_depth=params["max_depth"],
                                 learning_rate=params["learning_rate"],
                                 gamma=params["gamma"],
                                 colsample_bytree=params["colsample_bytree"])

    if not params:
        # Fall back to the tuned parameters for the active dataset/feature mode.
        if constants.USING_AGGREGATED_FEATURES:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_REAL_DATASET_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_OLD_DATASET_AGGREGATED
        else:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_REAL_DATASET_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_OLD_DATASET_NO_AGGREGATED

    thresholds = []
    for iteration in range(times_to_repeat):
        print("iteration: ", iteration, "of", times_to_repeat)
        _x_val, _y_val, _x_train, _y_train = evaluation.get_val_test_set(
            x_train, y_train, 0.25)
        candidate = new_classifier()
        candidate.fit(_x_train, _y_train)
        scores = candidate.predict_proba(_x_val)[:, 1]
        thresholds.append(
            evaluation.find_best_threshold_fixed_fpr(_y_val, scores))

    # Report threshold stability across repetitions.
    print(np.array(thresholds).mean(), np.array(thresholds).std())
    final_model = new_classifier()
    final_model.fit(x_train, y_train)
    return final_model, np.array(thresholds).mean()
def create_model(x_train, y_train, params=None, times_to_repeat=1):
    """Train a random-forest classifier and estimate its decision threshold.

    Averages the best fixed-FPR threshold over `times_to_repeat`
    train/validation splits, then fits a final forest on the full training
    set.

    Returns (model, mean_threshold).
    """

    def new_forest():
        # Fresh forest configured from the chosen hyper-parameters.
        return RandomForestClassifier(n_estimators=params["n_estimators"],
                                      min_samples_split=params["min_samples_split"],
                                      min_samples_leaf=params["min_samples_leaf"],
                                      max_features=params["max_features"],
                                      max_depth=params["max_depth"],
                                      bootstrap=params["bootstrap"])

    if not params:
        # Fall back to the tuned parameters for the active dataset/feature mode.
        if constants.USING_AGGREGATED_FEATURES:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_RF_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_RF_REAL_DATASET_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_RF_OLD_DATASET_AGGREGATED
        else:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_RF_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_RF_REAL_DATASET_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_RF_OLD_DATASET_NO_AGGREGATED

    threshold_sum = 0
    for iteration in range(times_to_repeat):
        print("iteration: ", iteration, "of", times_to_repeat)
        _x_val, _y_val, _x_train, _y_train = evaluation.get_val_test_set(
            x_train, y_train, 0.25)
        candidate = new_forest()
        candidate.fit(_x_train, _y_train)
        threshold_sum += evaluation.find_best_threshold_fixed_fpr(
            _y_val, candidate.predict_proba(_x_val)[:, 1])

    final_model = new_forest()
    final_model.fit(x_train, y_train)
    return final_model, threshold_sum / times_to_repeat
def create_fit_model_for_ensemble(x_train, y_train, look_back, params=None, times_to_repeat=1):
    """Train an LSTM classifier for the ensemble.

    Over `times_to_repeat` train/validation splits, records the min/max/mean
    of the raw validation predictions and the best fixed-FPR threshold
    (searched on min-max rescaled predictions), then fits a final network on
    the full training set.

    Returns (model, mean_threshold, mean_min, mean_max, mean_pred_mean).
    """

    def min_max_rescaling(values, min_, max_):
        # Map every prediction into [0, 1] relative to the observed extremes.
        return [(value - min_) / (max_ - min_) for value in values]

    def new_network():
        # create_model here is the LSTM builder (layers, dropout, look_back,
        # number of features per time step).
        return create_model(params["layers"], params["dropout_rate"],
                            look_back, len(x_train[0, 0]))

    if not params:
        # Fall back to the tuned parameters for the active dataset/feature mode.
        if constants.USING_AGGREGATED_FEATURES:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_LSTM_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_LSTM_REAL_DATASET_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_LSTM_OLD_DATASET_AGGREGATED
        else:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_LSTM_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_LSTM_REAL_DATASET_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_LSTM_OLD_DATASET_NO_AGGREGATED

    thresholds, mins, maxes, pred_means = [], [], [], []
    for iteration in range(times_to_repeat):
        print("iteration: ", iteration, "of", times_to_repeat)
        _x_val, _y_val, _x_train, _y_train = evaluation.get_val_test_set(
            x_train, y_train, 0.25)
        net = new_network()
        net.fit(_x_train, _y_train, epochs=params["epochs"], verbose=1,
                batch_size=params["batch_size"])
        predictions = net.predict(_x_val)
        lo, hi = min(predictions), max(predictions)
        mins.append(lo)
        maxes.append(hi)
        pred_means.append(np.array(predictions).mean())
        thresholds.append(evaluation.find_best_threshold_fixed_fpr(
            _y_val, min_max_rescaling(predictions, lo, hi)))

    net = new_network()
    net.fit(x_train, y_train, epochs=params["epochs"], verbose=1,
            batch_size=params["batch_size"])
    return (net, np.array(thresholds).mean(), np.array(mins).mean(),
            np.array(maxes).mean(), np.array(pred_means).mean())
def create_fit_model_for_ensemble_based_on_cdf(x_train, y_train, look_back, params=None, times_to_repeat=1):
    """Train an LSTM classifier plus an exponential-CDF score calibration.

    For each of `times_to_repeat` train/validation splits: fits a network,
    fits an exponential distribution to its validation predictions, maps the
    predictions through that CDF, and records the fitted scale/loc, the
    mean/std of the calibrated scores, and the best fixed-FPR threshold.
    Finally refits a network on the full training set.

    Returns (model, mean_scale, mean_loc, mean_mean, mean_std, mean_threshold).
    """
    if not params:
        # Fall back to the tuned parameters for the active dataset/feature mode.
        if constants.USING_AGGREGATED_FEATURES:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_LSTM_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_LSTM_REAL_DATASET_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_LSTM_OLD_DATASET_AGGREGATED
        else:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_LSTM_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_LSTM_REAL_DATASET_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_LSTM_OLD_DATASET_NO_AGGREGATED

    scales, locs, means, stds, thresholds = [], [], [], [], []
    for iteration in range(times_to_repeat):
        print("iteration: ", iteration, "of", times_to_repeat)
        _x_val, _y_val, _x_train, _y_train = evaluation.get_val_test_set(
            x_train, y_train, 0.25)
        net = create_model(params["layers"], params["dropout_rate"],
                           look_back, len(x_train[0, 0]))
        net.fit(_x_train, _y_train, epochs=params["epochs"], verbose=1,
                batch_size=params["batch_size"])
        raw_scores = net.predict(_x_val)
        # To inspect the shape of the score distribution (debug snippet):
        #   buckets = {i / 100: 0 for i in range(0, 101)}
        #   for s in raw_scores.tolist(): buckets[round(s, 2)] += 1
        #   plt.plot(list(buckets.keys()), list(buckets.values())); plt.show()
        loc, scale = stats.expon.fit(raw_scores)
        calibrated = stats.expon.cdf(raw_scores, scale=scale, loc=loc)
        scales.append(scale)
        locs.append(loc)
        means.append(calibrated.mean())
        stds.append(calibrated.std())
        thresholds.append(
            evaluation.find_best_threshold_fixed_fpr(_y_val, calibrated))

    net = create_model(params["layers"], params["dropout_rate"], look_back,
                       len(x_train[0, 0]))
    net.fit(x_train, y_train, epochs=params["epochs"], verbose=1,
            batch_size=params["batch_size"])
    return (net, np.array(scales).mean(), np.array(locs).mean(),
            np.array(means).mean(), np.array(stds).mean(),
            np.array(thresholds).mean())
def repeat_experiment_n_times(lstm, rf, xg_reg, scenario, times_to_repeat=100,
                              adversarial_attack=False, evasion_attack=False,
                              is_white_box_attack=True,
                              use_lstm_for_adversarial=False):
    """Evaluate the lstm/rf/xgboost ensemble over repeated test splits.

    Each repetition draws a fresh validation/test split from the test set.
    Optionally the frauds in the test set are perturbed with FGSM
    (adversarial attack, via an MLP or an LSTM surrogate) or pre-filtered by
    an attacker-side oracle (evasion attack) before scoring.  Mean metrics
    over all repetitions are printed at the end.

    Bug fixes vs. the previous version:
    * Failed repetitions were "retried" with `i -= 1` inside a `for` loop,
      which has no effect on the loop counter in Python, so a RuntimeError
      silently skipped an iteration while the final averages were still
      divided by `times_to_repeat`.  A `while` loop now genuinely retries.
    * The evasion-attack filter scored transactions with the defender's
      `rf` model although the threshold was computed from the attacker's
      `oracle`; the oracle is now used for both, as the comments intended.
    """
    tn_s, tp_s, fp_s, fn_s, f1_s = [], [], [], [], []
    balanced_accuracies, precisions, recalls, aucpr_s, roc_aucs = [], [], [], [], []
    num_decisions_taken_by_lstm, \
        num_decisions_taken_by_rf, \
        num_decisions_taken_by_xgb, \
        num_decisions_correctly_taken_from_lstm, \
        num_decisions_correctly_taken_from_lstm_and_not_from_xgb_or_rf = 0, 0, 0, 0, 0

    completed = 0
    while completed < times_to_repeat:
        print("Iteration", completed)
        x_test_set, y_test_set = sequences_crafting_for_classification.get_test_set(
            scenario=scenario)
        x_val, y_val, x_test, y_test = evaluation.get_val_test_set(
            x_test_set, y_test_set, val_size=0.25)
        # Supervised models only see the last transaction of each sequence.
        x_val_supervised = x_val[:, len(x_val[0]) - 1, :]
        x_test_supervised = x_test[:, len(x_val[0]) - 1, :]

        if adversarial_attack or evasion_attack:
            # getting train set for training
            if is_white_box_attack:
                print("Using as training set, the real one - whitebox attack")
                dataset_type = REAL_DATASET
            else:
                print("Using as training set, the old one - blackbox attack")
                dataset_type = OLD_DATASET
            x_train, y_train = sequences_crafting_for_classification.get_train_set(
                dataset_type=dataset_type)
            x_train_supervised = x_train[:, look_back, :]

            if adversarial_attack:
                print("Crafting an adversarial attack")
                if not use_lstm_for_adversarial:
                    print("The attacker will use a Multilayer perceptron")
                    # training multilayer perceptron
                    # todo: hyper param tuning multilayer perceptron
                    adversarial_model = MultiLayerPerceptron.create_fit_model(
                        x_train_supervised, y_train)
                    # Perturb only the frauds (y == 1) in the last time step.
                    x_test_supervised = x_test[:, len(x_test[0]) - 1, :]
                    frauds = x_test_supervised[np.where(y_test == 1)]
                    adversarial_samples = fgsm.craft_sample(
                        frauds, adversarial_model, epsilon=0.01)
                    x_test[np.where(y_test == 1), len(x_test[0]) - 1] = adversarial_samples
                    x_test_supervised = x_test[:, len(x_test[0]) - 1, :]
                else:
                    print("The attacker will use a LSTM network")
                    # train the network using the right params
                    if is_white_box_attack:
                        if USING_AGGREGATED_FEATURES:
                            params = BEST_PARAMS_LSTM_REAL_DATASET_AGGREGATED
                        else:
                            params = BEST_PARAMS_LSTM_REAL_DATASET_NO_AGGREGATED
                    else:
                        if USING_AGGREGATED_FEATURES:
                            params = BEST_PARAMS_LSTM_OLD_DATASET_AGGREGATED
                        else:
                            params = BEST_PARAMS_LSTM_OLD_DATASET_NO_AGGREGATED
                    adversarial_model = LSTM_classifier.create_fit_model(
                        x_train, y_train, look_back, params=params)
                    frauds = x_test[np.where(y_test == 1)]
                    adversarial_samples = fgsm.craft_sample(
                        frauds, adversarial_model, epsilon=0.01)
                    x_test[np.where(y_test == 1)] = adversarial_samples
                    x_test_supervised = x_test[:, len(x_test[0]) - 1, :]

            if evasion_attack:
                print("Crafting an evasion attack")
                # train the network using the right params
                if is_white_box_attack:
                    if USING_AGGREGATED_FEATURES:
                        params = BEST_PARAMS_RF_REAL_DATASET_AGGREGATED
                    else:
                        params = BEST_PARAMS_RF_REAL_DATASET_NO_AGGREGATED
                else:
                    if USING_AGGREGATED_FEATURES:
                        params = BEST_PARAMS_RF_OLD_DATASET_AGGREGATED
                    else:
                        params = BEST_PARAMS_RF_OLD_DATASET_NO_AGGREGATED
                # training the oracle
                # NOTE(review): RF.create_model elsewhere in this project
                # returns (model, threshold) — confirm it returns a bare
                # model here, as predict_proba is called on the result.
                oracle = RF.create_model(x_train_supervised, y_train,
                                         params=params)
                # get the oracle threshold
                y_val_pred_oracle = oracle.predict_proba(x_val_supervised)
                oracle_threshold = evaluation.find_best_threshold_fixed_fpr(
                    y_val, y_val_pred_oracle[:, 1])
                # If the oracle predicts the fraud as fraud -> discard it,
                # otherwise inject in real bank system.
                y_pred_oracle = oracle.predict_proba(x_test_supervised)
                y_pred_oracle = y_pred_oracle[:, 1].ravel()
                y_pred_oracle = np.array(
                    evaluation.adjusted_classes(y_pred_oracle, oracle_threshold))
                keep = np.where(((y_test == 1) & (y_pred_oracle == 0))
                                | (y_test == 0))
                x_test = x_test[keep]
                y_test = y_test[keep]
                x_test_supervised = x_test[:, len(x_test[0]) - 1, :]

        try:
            y_test_pred, a, b, c, d, e = predict_test_based_on_more_confident(
                lstm, rf, xg_reg, x_val, x_val_supervised, y_val, x_test,
                x_test_supervised, y_test)
            # Alternative combination strategies kept for reference:
            # y_test_pred, not_by_xgb, not_by_rf, not_found_by_others = predict_test_based_on_voting(lstm, rf, xg_reg, x_val, x_val_supervised, y_val, x_test, x_test_supervised, y_test)
            # y_test_pred, a, b, c, d, e = predict_test_based_on_expon(lstm, rf, xg_reg, x_val, x_val_supervised, y_val, x_test, x_test_supervised, y_test)
            # y_test_pred = predict_test_based_on_sum(lstm, rf, xg_reg, x_val, x_val_supervised, y_val, x_test, x_test_supervised)
            # y_test_pred = predict_test_based_on_more_confident_and_majority_voting(lstm, rf, xg_reg, x_val, x_val_supervised, y_val, x_test, x_test_supervised, y_test)
            y_test_pred = np.array(y_test_pred)
            confusion, f1, balanced_accuracy, precision, recall, aucpr, roc_auc = \
                evaluation.get_performance(y_test, y_test_pred, threshold=True)
            tn = confusion[0, 0]
            tp = confusion[1, 1]
            fp = confusion[0, 1]
            fn = confusion[1, 0]
            tn_s.append(tn)
            tp_s.append(tp)
            fp_s.append(fp)
            fn_s.append(fn)
            f1_s.append(f1)
            num_decisions_taken_by_lstm += a
            num_decisions_taken_by_rf += b
            num_decisions_taken_by_xgb += c
            num_decisions_correctly_taken_from_lstm += d
            num_decisions_correctly_taken_from_lstm_and_not_from_xgb_or_rf += e
            balanced_accuracies.append(balanced_accuracy)
            precisions.append(precision)
            recalls.append(recall)
            aucpr_s.append(aucpr)
            roc_aucs.append(roc_auc)
            completed += 1
        except RuntimeError:
            # Retry this repetition with a fresh split (the original
            # `i -= 1` inside a for loop had no effect).
            continue

    print("Num decisions taken from lstm: ",
          num_decisions_taken_by_lstm / times_to_repeat)
    print("Num decisions taken by rf: ",
          num_decisions_taken_by_rf / times_to_repeat)
    print("Num decisions taken by xgb: ",
          num_decisions_taken_by_xgb / times_to_repeat)
    print("Num decisions taken by lstm correctly taken: ",
          num_decisions_correctly_taken_from_lstm / times_to_repeat)
    print(
        "Num decisions taken by lstm correctly taken and not by others: ",
        num_decisions_correctly_taken_from_lstm_and_not_from_xgb_or_rf
        / times_to_repeat)
    evaluation.print_results(
        np.array(tn_s).mean(), np.array(fp_s).mean(), np.array(fn_s).mean(),
        np.array(tp_s).mean(), np.array(f1_s).mean(),
        np.array(balanced_accuracies).mean(), np.array(precisions).mean(),
        np.array(recalls).mean(), np.array(aucpr_s).mean(),
        np.array(roc_aucs).mean())
def create_fit_model_for_ensemble_based_on_cdf(x_train, y_train, params=None,
                                               times_to_repeat=1):
    """Train an XGBoost classifier plus an exponential-CDF score calibration.

    For each of `times_to_repeat` train/validation splits: fits a classifier,
    fits an exponential distribution to its validation scores, maps the
    scores through that CDF, and records the fitted scale/loc, the mean/std
    of the calibrated scores, and the best fixed-FPR threshold.  Finally
    refits a classifier on the full training set.

    Bug fixed: the per-iteration model was fitted on the FULL training set
    (`x_train, y_train`) instead of the held-out split (`_x_train,
    _y_train`), leaking the validation data into training and making every
    repetition identical — the sibling LSTM/RF variants fit the split.

    Returns (model, mean_scale, mean_loc, mean_mean, mean_std, mean_threshold).
    """

    def new_classifier():
        # Fresh classifier configured from the chosen hyper-parameters.
        return xgb.XGBClassifier(subsample=params["subsample"],
                                 min_child_weight=params["min_child_weight"],
                                 max_depth=params["max_depth"],
                                 learning_rate=params["learning_rate"],
                                 gamma=params["gamma"],
                                 colsample_bytree=params["colsample_bytree"])

    if not params:
        # Fall back to the tuned parameters for the active dataset/feature mode.
        if constants.USING_AGGREGATED_FEATURES:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_REAL_DATASET_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_OLD_DATASET_AGGREGATED
        else:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_REAL_DATASET_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_XGBOOST_OLD_DATASET_NO_AGGREGATED

    scales, locs, means, stds, thresholds = [], [], [], [], []
    for i in range(times_to_repeat):
        print("iteration: ", i, "of", times_to_repeat)
        _x_val, _y_val, _x_train, _y_train = evaluation.get_val_test_set(
            x_train, y_train, 0.25)
        model = new_classifier()
        # Fit on the held-out split (was: x_train/y_train — validation leak).
        model.fit(_x_train, _y_train)
        raw_scores = model.predict_proba(_x_val)[:, 1]
        # To inspect the shape of the score distribution (debug snippet):
        #   buckets = {i / 100: 0 for i in range(0, 101)}
        #   for s in raw_scores.tolist(): buckets[round(s, 2)] += 1
        #   plt.plot(list(buckets.keys()), list(buckets.values())); plt.show()
        loc, scale = stats.expon.fit(raw_scores)
        samples = stats.expon.cdf(raw_scores, scale=scale, loc=loc)
        scales.append(scale)
        locs.append(loc)
        means.append(samples.mean())
        stds.append(samples.std())
        thresholds.append(
            evaluation.find_best_threshold_fixed_fpr(_y_val, samples))

    xg_reg = new_classifier()
    xg_reg.fit(x_train, y_train)
    return (xg_reg, np.array(scales).mean(), np.array(locs).mean(),
            np.array(means).mean(), np.array(stds).mean(),
            np.array(thresholds).mean())
def create_fit_model_for_ensemble_based_on_cdf(x_train, y_train, params=None, times_to_repeat=1):
    """Train a random-forest classifier plus an exponential-CDF calibration.

    For each of `times_to_repeat` train/validation splits: fits a forest,
    fits an exponential distribution to its validation scores, maps the
    scores through that CDF, and records the fitted scale/loc, the mean/std
    of the calibrated scores, and the best fixed-FPR threshold.  Finally
    refits a forest on the full training set.

    Returns (model, mean_scale, mean_loc, mean_mean, mean_std, mean_threshold).
    """

    def new_forest():
        # Fresh forest configured from the chosen hyper-parameters.
        return RandomForestClassifier(n_estimators=params["n_estimators"],
                                      min_samples_split=params["min_samples_split"],
                                      min_samples_leaf=params["min_samples_leaf"],
                                      max_features=params["max_features"],
                                      max_depth=params["max_depth"],
                                      bootstrap=params["bootstrap"])

    if not params:
        # Fall back to the tuned parameters for the active dataset/feature mode.
        if constants.USING_AGGREGATED_FEATURES:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_RF_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_RF_REAL_DATASET_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_RF_OLD_DATASET_AGGREGATED
        else:
            if constants.DATASET_TYPE == constants.INJECTED_DATASET:
                params = constants.BEST_PARAMS_RF_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.REAL_DATASET:
                params = constants.BEST_PARAMS_RF_REAL_DATASET_NO_AGGREGATED
            if constants.DATASET_TYPE == constants.OLD_DATASET:
                params = constants.BEST_PARAMS_RF_OLD_DATASET_NO_AGGREGATED

    scales, locs, means, stds, thresholds = [], [], [], [], []
    for iteration in range(times_to_repeat):
        print("iteration: ", iteration, "of", times_to_repeat)
        _x_val, _y_val, _x_train, _y_train = evaluation.get_val_test_set(
            x_train, y_train, 0.25)
        model = new_forest()
        model.fit(_x_train, _y_train)
        raw_scores = model.predict_proba(_x_val)[:, 1]
        # To inspect the shape of the score distribution (debug snippet):
        #   buckets = {i / 100: 0 for i in range(0, 101)}
        #   for s in raw_scores.tolist(): buckets[round(s, 2)] += 1
        #   plt.plot(list(buckets.keys()), list(buckets.values())); plt.show()
        loc, scale = stats.expon.fit(raw_scores)
        calibrated = stats.expon.cdf(raw_scores, scale=scale, loc=loc)
        scales.append(scale)
        locs.append(loc)
        means.append(calibrated.mean())
        stds.append(calibrated.std())
        thresholds.append(
            evaluation.find_best_threshold_fixed_fpr(_y_val, calibrated))

    rf = new_forest()
    rf.fit(x_train, y_train)
    return (rf, np.array(scales).mean(), np.array(locs).mean(),
            np.array(means).mean(), np.array(stds).mean(),
            np.array(thresholds).mean())