import pandas as pd
from sklearn.linear_model import LogisticRegression
from utils import impute_nas, send_submission

train = pd.read_csv("input/dev.csv")   # DEV-SAMPLE
test = pd.read_csv("input/oot0.csv")   # OUT-OF-TIME SAMPLE
TARGET = 'ob_target'

model = LogisticRegression()
model.fit(train.drop(TARGET, axis=1), train[TARGET])

preds = model.predict_proba(impute_nas(test))[:, 1]
send_submission("logit_all_vars.csv", preds)
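The impute_nas and send_submission functions come from a local utils module that is not shown in these listings. As a rough idea of what the imputation step involves, here is a minimal sketch under the assumption that it simply fills numeric missing values with the column median (the real helper may be more elaborate):

import pandas as pd

def impute_nas(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical stand-in for utils.impute_nas:
    # fill missing values in numeric columns with each column's median.
    out = df.copy()
    for col in out.select_dtypes(include="number").columns:
        out[col] = out[col].fillna(out[col].median())
    return out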
from time import sleep
from sklearn.ensemble import RandomForestClassifier
from utils import send_submission, save_hyperparameters

# Hyperparameter grid; X, y (dev features/target), test and SEED come from the surrounding setup code.
params = {
    'bootstrap': [True, False],  # inferred: the loop below iterates over this key
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': ['log2', 'sqrt', None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

create_file = True
for sampling in params["bootstrap"]:
    for depth in params["max_depth"]:
        for max_feat in params["max_features"]:
            for min_leaf in params["min_samples_leaf"]:
                for min_split in params["min_samples_split"]:
                    p = {
                        'bootstrap': sampling,
                        'max_depth': depth,
                        'max_features': max_feat,
                        'min_samples_leaf': min_leaf,
                        'min_samples_split': min_split,
                        "random_state": SEED,
                        "n_estimators": 1000
                    }
                    model = RandomForestClassifier(**p).fit(X, y)
                    preds = model.predict_proba(test)[:, 1]
                    results = send_submission("doesnt_matter.csv", preds)
                    save_hyperparameters(p, results, "grid_search_rf.csv", create_file)
                    create_file = False
                    sleep(3)  # avoid hammering the submission endpoint
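save_hyperparameters is another helper from the same utils module. The loop only requires that it append each parameter combination and the grade returned by send_submission to a CSV, writing the header on the first call; a minimal sketch under that assumption:

import csv

def save_hyperparameters(params, grade, path, create_file):
    # Hypothetical stand-in for utils.save_hyperparameters:
    # append one row per grid-search trial, writing the header only once.
    with open(path, "w" if create_file else "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(params) + ["grade"])
        if create_file:
            writer.writeheader()
        writer.writerow({**params, "grade": grade})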
import xgboost as xgb
import pandas as pd
from utils import impute_nas, send_submission

train = pd.read_csv("input/dev.csv")   # DEV-SAMPLE
test = pd.read_csv("input/oot0.csv")   # OUT-OF-TIME SAMPLE
TARGET = 'ob_target'

model = xgb.XGBClassifier(random_state=20190626, n_estimators=1000)
model.fit(train.drop(TARGET, axis=1), train[TARGET])

preds = model.predict_proba(impute_nas(test))[:, 1]
send_submission("xgb_all_vars.csv", preds)
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from utils import impute_nas, send_submission, perform_rfe

TARGET = 'ob_target'
train = pd.read_csv("input/dev.csv").drop("id", axis=1)               # DEV-SAMPLE
test = impute_nas(pd.read_csv("input/oot0.csv").drop("id", axis=1))   # OUT-OF-TIME SAMPLE

model = RandomForestClassifier(random_state=20190628, n_estimators=1000)
results = perform_rfe(model, train, test, "random_forest_rfe.csv", to_remove=80)
#results = pd.read_csv("random_forest_rfe.csv")  # reload previously computed results instead

# 82 candidate features before elimination, so 82 - n = number of features removed
num_removed = 82 - results["n"]
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(x=num_removed[:70], y=results["grade"][:70], ax=ax)

# Refit on the best-graded feature subset and submit
best = results[results["grade"] == max(results["grade"])]
model = RandomForestClassifier(random_state=20190628, n_estimators=1000)
model.fit(train[best["feats"].item().split(";")], train[TARGET])
preds = model.predict_proba(test[best["feats"].item().split(";")])[:, 1]
result = send_submission("rf_after_rfe.csv", preds)
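perform_rfe is the last of the local helpers and is not shown either. Judging from how its output is used above (a DataFrame with an n column, a grade column and a semicolon-joined feats string), it likely refits the model repeatedly, dropping the least important feature on each pass and grading every subset. A rough sketch under those assumptions, reusing the dummy submission filename and sleep from the grid-search snippet:

from time import sleep
import pandas as pd

def perform_rfe(model, train, test, out_path, to_remove):
    # Hypothetical stand-in for utils.perform_rfe: recursive feature elimination,
    # grading each feature subset through send_submission.
    # TARGET and send_submission are assumed to be in scope, as in the snippets above.
    feats = [c for c in train.columns if c != TARGET]
    rows = []
    for _ in range(to_remove + 1):
        model.fit(train[feats], train[TARGET])
        preds = model.predict_proba(test[feats])[:, 1]
        grade = send_submission("doesnt_matter.csv", preds)
        rows.append({"n": len(feats), "grade": grade, "feats": ";".join(feats)})
        # drop the feature with the lowest impurity-based importance
        worst = feats[int(pd.Series(model.feature_importances_).idxmin())]
        feats.remove(worst)
        sleep(3)
    results = pd.DataFrame(rows)
    results.to_csv(out_path, index=False)
    return results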