Example #1

The tail of a build_audit test helper that verifies a fitted pipeline on a data sample, stores it, and writes its predictions to CSV, followed by the audit-dataset classifier configurations it is invoked with.
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
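# NOTE: the helper's signature is not shown in this fragment; judging from the
# calls below it is assumed to be roughly
#   build_audit(classifier, name, with_proba = True, predict_params = {},
#               predict_proba_params = {}, **pmml_options)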

if "Audit" in datasets:
	build_audit(DecisionTreeClassifier(min_samples_leaf = 2, random_state = 13), "DecisionTreeAudit", compact = False)
	build_audit(BaggingClassifier(DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAudit")
	build_audit(DummyClassifier(strategy = "most_frequent"), "DummyAudit")
	build_audit(ExtraTreesClassifier(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAudit")
	build_audit(GBDTLRClassifier(RandomForestClassifier(n_estimators = 17, random_state = 13), LogisticRegression(multi_class = "ovr", solver = "liblinear")), "GBDTLRAudit")
	build_audit(GBDTLRClassifier(XGBClassifier(n_estimators = 17, random_state = 13), LogisticRegression(multi_class = "ovr", solver = "liblinear")), "XGBLRAudit")
	build_audit(GBDTLRClassifier(XGBRFClassifier(n_estimators = 7, max_depth = 6, random_state = 13), SGDClassifier(loss = "log", penalty = "elasticnet", random_state = 13)), "XGBRFLRAudit")
	build_audit(GradientBoostingClassifier(loss = "exponential", init = None, random_state = 13), "GradientBoostingAudit")
	build_audit(HistGradientBoostingClassifier(max_iter = 71, random_state = 13), "HistGradientBoostingAudit")
	build_audit(LGBMClassifier(objective = "binary", n_estimators = 37), "LGBMAudit", predict_params = {"num_iteration" : 17}, predict_proba_params = {"num_iteration" : 17}, num_iteration = 17)
	build_audit(LinearDiscriminantAnalysis(solver = "lsqr"), "LinearDiscriminantAnalysisAudit")
	build_audit(LinearSVC(penalty = "l1", dual = False, random_state = 13), "LinearSVCAudit", with_proba = False)
	build_audit(LogisticRegression(multi_class = "multinomial", solver = "newton-cg", max_iter = 500), "MultinomialLogisticRegressionAudit")
	build_audit(LogisticRegressionCV(cv = 3, multi_class = "ovr"), "OvRLogisticRegressionAudit")
	build_audit(BaggingClassifier(LogisticRegression(multi_class = "ovr", solver = "liblinear"), n_estimators = 3, max_features = 0.5, random_state = 13), "LogisticRegressionEnsembleAudit")
	build_audit(GaussianNB(), "NaiveBayesAudit")
	build_audit(OneVsRestClassifier(LogisticRegression(multi_class = "ovr", solver = "liblinear")), "OneVsRestAudit")
	build_audit(RandomForestClassifier(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAudit", flat = True)
	build_audit(RidgeClassifierCV(), "RidgeAudit", with_proba = False)
	build_audit(BaggingClassifier(RidgeClassifier(random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "RidgeEnsembleAudit")
	build_audit(StackingClassifier([("lda", LinearDiscriminantAnalysis(solver = "lsqr")), ("lr", LogisticRegression(multi_class = "ovr", solver = "liblinear"))], final_estimator = GradientBoostingClassifier(n_estimators = 11, random_state = 13)), "StackingEnsembleAudit")
	build_audit(SVC(gamma = "auto"), "SVCAudit", with_proba = False)
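
Example #2

Cloud-pixel classification over two regions of interest: flatten and rescale the image arrays, then train and apply an XGBRFClassifier per region.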
clouds_L, _ = get_clouds(roiL)  # assumed; mirrors the right-ROI call below
X_clouds_L = clouds_L.reshape((clouds_L.shape[0], -1))
X_clouds_scl_L = X_clouds_L / scale_rnd_L
model_L_params = {
    'colsample_bytree': 0.07,
    'gamma': 0.005,
    'max_depth': 3,
    'min_child_weight': 3,
    'n_estimators': 500,
    'objective': 'binary:logistic',
    'random_state': 10,
    'reg_alpha': 9,
    'reg_lambda': 0,
    'subsample': 0.6,
    'verbosity': 0
}
model_L = XGBRFClassifier(**model_L_params)
# X_train_scl_L, y_train_L and eval_set are assumed from earlier in the original
# script; note that XGBRFClassifier fits all trees in a single boosting round,
# so early_stopping_rounds has little effect here
model_L.fit(X_train_scl_L,
            y_train_L,
            eval_set=eval_set,
            eval_metric='auc',
            early_stopping_rounds=20,
            verbose=False)
pred_L = model_L.predict(X_clouds_scl_L)

clouds_R, _ = get_clouds(roiR)
X_clouds_R = clouds_R.reshape((clouds_R.shape[0], -1))
X_clouds_scl_R = X_clouds_R / scale_rnd_R
model_R_params = {
    'colsample_bytree': 0.07,
    'gamma': 0.005,
    'max_depth': 3,
    # the source snippet breaks off here; the remaining keys are assumed to
    # mirror model_L_params above
    'min_child_weight': 3,
    'n_estimators': 500,
    'objective': 'binary:logistic',
    'random_state': 10,
    'reg_alpha': 9,
    'reg_lambda': 0,
    'subsample': 0.6,
    'verbosity': 0
}
model_R = XGBRFClassifier(**model_R_params)
# model_R would then be fitted and applied to X_clouds_scl_R as model_L was above
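
Example #3

Setting up a randomized hyper-parameter search for an XGBRFClassifier: a small grid over gamma and reg_alpha, repeated stratified cross-validation, and accuracy/precision/recall scorers.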
# search space: gamma in [0.001, 0.01], reg_alpha integers in [1, 10]
gamma = [i / 1000 for i in range(1, 11)]
reg_alpha = list(range(1, 11))
params_grid = [{'gamma': gamma, 'reg_alpha': reg_alpha, 'reg_lambda': [0]}]
param_dists = params_grid
n_iter = 25

early_stopping_rounds = 20
# X_eval_scl_L / y_eval_L are the evaluation arrays assumed from the original script
eval_set = [(X_eval_scl_L, y_eval_L)]
fixed_params = {
    'objective': 'binary:logistic',
    'n_estimators': 500,
    'random_state': 10,
    'verbosity': 0
}

# var_params (parameters held fixed but not searched) is assumed to be defined earlier
estimator = XGBRFClassifier(**fixed_params, **var_params)

crossval = RepeatedStratifiedKFold(n_splits=6, n_repeats=3, random_state=3)

# class_names is assumed to be defined earlier; its first entry is the positive class
my_prec_scorer = make_scorer(precision_score, pos_label=class_names[0])
my_recall_scorer = make_scorer(recall_score, pos_label=class_names[0])
metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': my_prec_scorer,
    'recall': my_recall_scorer
}

print('# Tuning hyper-parameters')
print()

search_params = {
    # the source snippet is truncated here; the keys below are an assumed
    # completion matching RandomizedSearchCV's signature
    'param_distributions': param_dists,
    'n_iter': n_iter,
    'scoring': metrics,
    'refit': 'accuracy',
    'cv': crossval,
    'random_state': 10
}
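
A minimal sketch of how the assembled pieces would be used (the training arrays X_train_scl_L / y_train_L are borrowed from the earlier cloud example and are an assumption):

from sklearn.model_selection import RandomizedSearchCV

# sample 25 candidates from the grid, score them with all three metrics,
# and refit the best model by accuracy
search = RandomizedSearchCV(estimator, **search_params)
search.fit(X_train_scl_L, y_train_L)
print(search.best_params_, search.best_score_)

Example #4

An end-to-end tabular workflow: encode and impute features in a pipeline, train an XGBRFClassifier with early stopping against a validation set, report ROC AUC, and compute permutation importances with eli5.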
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder  # assumed source of OrdinalEncoder
from eli5.sklearn import PermutationImportance
from xgboost import XGBRFClassifier

# df, target and the train_val_test_split helper are assumed to be defined earlier
X = df.drop(columns=target)
y = df[target]

train, val, test, y_train, y_val, y_test = train_val_test_split(X, y)

pipeline = make_pipeline(OrdinalEncoder(), SimpleImputer())

X_train = pipeline.fit_transform(train)
X_val = pipeline.transform(val)
X_test = pipeline.transform(test)

eval_set = [(X_train, y_train), (X_val, y_val)]

model = XGBRFClassifier(n_jobs=-1,
                        n_estimators=5000,
                        early_stopping_rounds=100,
                        random_state=42,
                        scale_pos_weight=15,
                        learning_rate=.005,
                        reg_lambda=.01,
                        verbosity=1)
print('fitting...')
model.fit(X_train, y_train, eval_set=eval_set, eval_metric='auc', verbose=True)

y_pred_proba = model.predict_proba(X_val)[:, 1]
print(f'Validation ROC AUC score: {roc_auc_score(y_val, y_pred_proba)}')

print('permuting...')
permuter = PermutationImportance(model,
                                 cv='prefit',
                                 n_iter=5,
                                 scoring='roc_auc',
                                 random_state=42)
permuter.fit(X_val, y_val)  # with cv='prefit', permutes X_val columns against the already-fitted model
print(permuter.feature_importances_)
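
Example #5

Training an XGBRFClassifier on a prepared dataset; an earlier GridSearchCV experiment is kept as commented-out code.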
# trainDataset, arg (parsed CLI options) and getTrainScores come from the surrounding script

# param_test = {
#     'max_depth': range(3, 10, 2),
#     'min_child_weight': range(1, 6, 2)
# }
# # metrics to consider: f1_micro, f1_macro, roc_auc_ovr
# gsearch1 = GridSearchCV(estimator=xgbrf_classifier, param_grid=param_test, scoring='f1_micro', n_jobs=1, verbose=10, cv=5)
# gsearch1.fit(trainDataset.X, trainDataset.Y[:, 0])
# results, best = getTrainScores(gsearch1)
# print(results)
# print(best)

# min_child_weight depends on whether the features are one-hot encoded
xgbrf_classifier = XGBRFClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=7,
    min_child_weight=5 if not arg.oneHot else 3
)

print('[LOG] Fitting model...')
xgbrf_classifier.fit(trainDataset.X, trainDataset.Y[:, 0])
print('[LOG] Fitting done!')
print('-- Model Report --')
print(f'XGBoost train Accuracy: {accuracy_score(trainDataset.Y[:, 0], xgbrf_classifier.predict(trainDataset.X))}')