Beispiel #1
0
def objective(trial):
    """Train and score one logistic-regression configuration on Iris.

    Args:
        trial: Object providing ``suggest_categorical`` and
            ``suggest_uniform`` (presumably an Optuna trial -- confirm
            against the caller).

    Returns:
        Whatever ``classifier.score`` reports on the held-out split.
    """
    dataset = load_iris()
    # Both features and labels are cast to float32 and routed through
    # pandas before being wrapped as cudf DataFrames.
    features = cudf.DataFrame(pd.DataFrame(dataset.data.astype('float32')))
    labels = cudf.DataFrame(pd.DataFrame(dataset.target.astype('float32')))

    # Sample hyper-parameters in a fixed order: solver, C, then penalty.
    solver = trial.suggest_categorical("solver", ["qn"])
    C = trial.suggest_uniform("C", 0.0, 1.0)
    # Only "qn" searches over the penalty; for any other solver the
    # penalty is not relevant, so 'l2' is passed as a dummy value.
    penalty = (trial.suggest_categorical("penalty", ["l1", "l2"])
               if solver == "qn" else "l2")

    model = LogisticRegression(
        max_iter=200,
        solver=solver,
        C=C,
        penalty=penalty,
    )

    X_train, X_valid, y_train, y_valid = train_test_split(features, labels)
    model.fit(X_train, y_train)
    return model.score(X_valid, y_valid)
def _best_row(results, column):
    """Return the row of *results* with the highest value in *column*."""
    return results.sort_values(by=column, ascending=False).iloc[0]


def _fit_best(best, X_train, y_train):
    """Fit a LogisticRegression with the C/penalty stored in row *best*."""
    clf = LogisticRegression(C=best.C, penalty=best.penalty, max_iter=100000)
    clf.fit(X_train, y_train)
    return clf


def _auc(clf, X, y):
    """Return ROC AUC of *clf* on (X, y) from positive-class probabilities.

    ROC AUC is a ranking metric, so it needs continuous scores; feeding it
    hard ``predict`` labels collapses the curve to a single threshold.
    """
    return roc_auc_score(y, clf.predict_proba(X)[:, 1])


def run_log_reg(scaled_df, n_trials=5, train_size=5000):
    """Grid-search logistic regression over repeated random splits.

    For each trial the data is split, a grid search over ``penalty`` and
    ``C`` is run, and the best configuration per metric (accuracy, f1,
    ROC AUC) is refit on the training split and scored on both splits.

    Args:
        scaled_df: DataFrame whose target is exposed as ``scaled_df.y``;
            every column except the last is used as a feature (so ``y``
            is presumably the last column -- confirm against the caller).
            The f1 metric is computed with default arguments and the AUC
            uses the second probability column, so a binary target is
            assumed.
        n_trials: Number of independent train/test splits to evaluate.
        train_size: Forwarded to ``train_test_split`` as ``train_size``.

    Returns:
        Tuple ``(raw_train_df, raw_test_df)``: two DataFrames with columns
        ``accuracy``, ``f1`` and ``auc`` and one row per trial.
    """
    # Metrics used by the grid search to rank configurations.
    metrics = ['accuracy', 'f1', 'roc_auc_ovr']

    # NOTE(review): if LogisticRegression is scikit-learn's with its
    # default solver, 'l1' is presumably unsupported and those grid points
    # fail to fit (scored as NaN, which sorts last) -- confirm.
    params = {
        'penalty': ['none', 'l1', 'l2'],
        'C': [10**val for val in range(-8, 5)],
    }

    # cv_results_ key -> column name in the per-trial results table.
    score_columns = {
        'mean_test_accuracy': 'mean_accuracy',
        'mean_test_f1': 'mean_f1',
        'mean_test_roc_auc_ovr': 'mean_auc',
    }

    train_rows = []
    test_rows = []

    for _ in range(n_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=train_size)

        # refit=False: with several metrics there is no single "best"
        # estimator, so the winners are refit by hand below.
        # NOTE(review): the search uses the default max_iter while the
        # refits use max_iter=100000 -- confirm this is intended.
        search = GridSearchCV(LogisticRegression(),
                              params,
                              scoring=metrics,
                              refit=False)
        search.fit(X_train, y_train)

        # One row per parameter combination plus its mean CV scores.
        results = pd.DataFrame(search.cv_results_['params'])
        for cv_key, column in score_columns.items():
            results[column] = search.cv_results_[cv_key]

        # Refit the best configuration for each metric on the full
        # training split.
        acc_clf = _fit_best(_best_row(results, 'mean_accuracy'),
                            X_train, y_train)
        f1_clf = _fit_best(_best_row(results, 'mean_f1'), X_train, y_train)
        auc_clf = _fit_best(_best_row(results, 'mean_auc'), X_train, y_train)

        train_rows.append([
            acc_clf.score(X_train, y_train),
            f1_score(y_train, f1_clf.predict(X_train)),
            _auc(auc_clf, X_train, y_train),
        ])
        test_rows.append([
            acc_clf.score(X_test, y_test),
            f1_score(y_test, f1_clf.predict(X_test)),
            _auc(auc_clf, X_test, y_test),
        ])

    # reshape(-1, 3) keeps a (0, 3) shape when n_trials is 0.
    columns = ['accuracy', 'f1', 'auc']
    raw_train_df = pd.DataFrame(
        data=np.asarray(train_rows, dtype=float).reshape(-1, 3),
        columns=columns)
    raw_test_df = pd.DataFrame(
        data=np.asarray(test_rows, dtype=float).reshape(-1, 3),
        columns=columns)

    return raw_train_df, raw_test_df