from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score


def objective_svm(x):
    # square the raw optimizer inputs so that C and gamma are always positive
    C_in, gamma_in = x[0]**2, x[1]**2
    # all other SVC arguments are left at their scikit-learn defaults
    svm_classifier = SVC(C=C_in, gamma=gamma_in, kernel='rbf')
    # stratified k-fold cross-validation: accumulate the F1 score on each
    # held-out fold and return the average as the objective value
    k = 5
    kf = StratifiedKFold(n_splits=k)
    score = 0.0
    for train_index, test_index in kf.split(X_train, Y_train):
        svm_classifier.fit(X_train[train_index], Y_train[train_index])
        Y_pred = svm_classifier.predict(X_train[test_index])
        score += f1_score(Y_train[test_index], Y_pred)
    return score / k
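# how an objective like this is usually driven (a sketch, not part of the original
# excerpt): a derivative-free optimizer searches the unconstrained (x[0], x[1])
# space, and the squaring inside objective_svm keeps C and gamma positive.
# scipy's Nelder-Mead is used purely as an illustration; the objective is negated
# because scipy minimizes while we want to maximize the cross-validated F1.
from scipy.optimize import minimize

result = minimize(lambda x: -objective_svm(x), x0=[1.0, 0.1], method='Nelder-Mead')
best_C, best_gamma = result.x[0]**2, result.x[1]**2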
Example no. 2
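# f1_score_lgbm, used as a custom metric below, is not defined in this excerpt; a
# minimal sketch of such an eval function (LightGBM expects a tuple of
# (name, value, is_higher_better); with a built-in binary objective, preds are
# predicted probabilities):
def f1_score_lgbm(preds, eval_data):
    y_true = eval_data.get_label()
    return "f1", f1_score(y_true, preds.round()), True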
# (this excerpt opens mid fold-loop; the loop header and the lgb_train dataset
#  are reconstructed from the surrounding examples)
for train_index, test_index in kf.split(X_train, Y_train):
    lgb_train = lgb.Dataset(X_train[train_index], Y_train[train_index])
    lgb_eval = lgb.Dataset(X_train[test_index],
                           Y_train[test_index],
                           reference=lgb_train)
    gbm = lgb.train(parameters,
                    train_set=lgb_train,
                    num_boost_round=100,
                    valid_sets=lgb_eval,
                    verbose_eval=40,
                    feval=f1_score_lgbm)
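    # per-fold outputs: probabilities on the full test set (stored column-wise),
    # on the held-out fold (for stacking) and on the training fold (for monitoring)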
    res = gbm.predict(X_test)
    Y_pred = gbm.predict(X_train[test_index])
    Y_pred_train = gbm.predict(X_train[train_index])
    predictions[:, i] = res
    predictions_train[test_index] = Y_pred
    print("train: " +
          str(f1_score(Y_train[train_index], Y_pred_train.round())))
    print("test: " + str(f1_score(Y_train[test_index], Y_pred.round())))
    i += 1

# save submission file: a point is labelled 1 when the summed fold probabilities
# exceed 2.5, i.e. the average predicted probability over the k folds is above 0.5
Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(path_or_buf=path_to_submissions +
                  "-".join(my_features_acronym) + "lgbm" + ".csv",
                  index=True,
                  index_label="id",
                  header=["category"])

# save probabilities for stacking
stacking_logits_test = np.sum(predictions, axis=1)
stacking_test = pd.DataFrame(stacking_logits_test)
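# random forest: repeat the k-fold procedure, keeping hard predictions and
# probabilities on the test set per fold plus out-of-fold probabilities.
# RF and k are not defined in this excerpt; plausible definitions are sketched
# here (the hyperparameter values are illustrative assumptions):
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=100, n_jobs=-1)
k = 5  # number of folds, consistent with the >2.5 voting threshold below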
kf = KFold(k)
predictions = np.zeros((X_test.shape[0], k))
predictions_test = np.zeros((X_test.shape[0], k))
predictions_train = np.zeros(X_train.shape[0])
i = 0

# for each fold store predictions on test set and print validation results
test_score = 0.0
for train_index, test_index in kf.split(X_train, Y_train):
    RF.fit(X_train[train_index], Y_train[train_index])
    Y_pred = RF.predict(X_train[test_index])
    Y_pred_train = RF.predict(X_train[train_index])
    predictions[:, i] = RF.predict(X_test)
    predictions_test[:, i] = RF.predict_proba(X_test)[:, 1]
    predictions_train[test_index] = RF.predict_proba(X_train[test_index])[:, 1]
    current_test_score = f1_score(Y_train[test_index], Y_pred)
    test_score += current_test_score
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(current_test_score))
    i += 1

print("CV test score: " + str(test_score / k))
# save submission file: majority vote across the k fold models
Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)
submission = pd.DataFrame(Y_test)
submission.to_csv(
    path_or_buf=path_to_submissions + "-".join(my_features_acronym) + "RF.csv",
    index=True,
    index_label="id",
    header=["category"]
)
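# save probabilities for stacking (a sketch mirroring the LightGBM block above;
# "path_to_stacking" is a hypothetical path, not taken from the original script)
stacking_train = pd.DataFrame(predictions_train)
stacking_test = pd.DataFrame(np.mean(predictions_test, axis=1))
stacking_train.to_csv(path_to_stacking + "RF_oof_train.csv", index=False)
stacking_test.to_csv(path_to_stacking + "RF_test.csv", index=False)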
Example no. 4
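# "parameters" is not defined in this excerpt; a plausible dictionary with the
# keys used below (the values are illustrative assumptions):
from sklearn.linear_model import LogisticRegressionCV

parameters = {"max_iter": 1000, "tol": 1e-4, "penalty": "l2"}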
print("parameters:")
print(parameters)
print("cross validation:")

LogReg = LogisticRegressionCV(max_iter=parameters['max_iter'],
                              tol=parameters['tol'],
                              penalty=parameters['penalty'])
k = 5
kf = KFold(k)
predictions = np.zeros((X_test.shape[0], k))
i = 0

for train_index, test_index in kf.split(X_train, Y_train):
    LogReg.fit(X_train[train_index], Y_train[train_index])
    Y_pred = LogReg.predict(X_train[test_index])
    Y_pred_train = LogReg.predict(X_train[train_index])
    predictions[:, i] = LogReg.predict(X_test)
    print("train: " + str(f1_score(Y_train[train_index], Y_pred_train)))
    print("test: " + str(f1_score(Y_train[test_index], Y_pred)))
    i += 1

Y_test = (np.sum(predictions, axis=1) > 2.5).astype(int)

# submission = pd.DataFrame(Y_test)
# submission.to_csv(
#     path_or_buf=path_to_submissions+"-".join(my_features_string)+"LogReg.csv",
#     index=True,
#     index_label="id",
#     header=["category"]
# )
Example no. 5
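# this excerpt opens inside a nested feature/hyperparameter search; "parameters"
# holds the candidate setting currently being evaluated. A plausible shape (the
# values are illustrative assumptions, not taken from the original):
parameters = {"n_estimators": 100, "criterion": "gini", "max_depth": 20,
              "min_samples_leaf": 2, "bootstrap": True, "n_jobs": -1}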
                # (constructor line reconstructed; the original excerpt opens mid-call)
                RF = RandomForestClassifier(
                    n_estimators=parameters["n_estimators"],
                    criterion=parameters["criterion"],
                    max_depth=parameters["max_depth"],
                    min_samples_leaf=parameters["min_samples_leaf"],
                    bootstrap=parameters["bootstrap"],
                    n_jobs=parameters["n_jobs"])
                k = 2
                kf = KFold(k)
                train_score = 0.0
                test_score = 0.0

                for train_index, test_index in kf.split(X_train, Y_train):
                    RF.fit(X_train[train_index], Y_train[train_index])
                    Y_pred = RF.predict(X_train[test_index])
                    Y_pred_train = RF.predict(X_train[train_index])
                    train_score += f1_score(Y_train[train_index], Y_pred_train)
                    test_score += f1_score(Y_train[test_index], Y_pred)

                train_score /= k
                test_score /= k

                if test_score > best_test_score:
                    best_index = i
                    best_train_score = train_score
                    best_test_score = test_score

                print("train score: " + str(train_score))
                print("test score: " + str(test_score))
                print("")

        print("for this round, the best feature was " +