Example #1
import pandas as pd
from sklearn.linear_model import LogisticRegression
from utils import impute_nas, send_submission

train = pd.read_csv("input/dev.csv")  #DEV-SAMPLE
test = pd.read_csv("input/oot0.csv")  #OUT-OF-TIME SAMPLE

TARGET = 'ob_target'

model = LogisticRegression()
model.fit(train.drop(TARGET, axis=1), train[TARGET])

preds = model.predict_proba(impute_nas(test))[:, 1]

send_submission("logit_all_vars.csv", preds)
Example #2
from time import sleep

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from utils import impute_nas, send_submission, save_hyperparameters  # save_hyperparameters assumed to live in utils as well

train = pd.read_csv("input/dev.csv")  #DEV-SAMPLE
test = impute_nas(pd.read_csv("input/oot0.csv"))  #OUT-OF-TIME SAMPLE

TARGET = 'ob_target'
SEED = 20190626  # assumed seed value; any fixed integer keeps the runs reproducible

X, y = train.drop(TARGET, axis=1), train[TARGET]

params = {
    'bootstrap': [True, False],  # implied by the loop over params["bootstrap"]
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': ['log2', 'sqrt', None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

create_file = True
for sampling in params["bootstrap"]:
    for depth in params["max_depth"]:
        for max_feat in params["max_features"]:
            for min_leaf in params["min_samples_leaf"]:
                for min_split in params["min_samples_split"]:
                    p = {
                        'bootstrap': sampling,
                        'max_depth': depth,
                        'max_features': max_feat,
                        'min_samples_leaf': min_leaf,
                        'min_samples_split': min_split,
                        "random_state": SEED,
                        "n_estimators": 1000
                    }

                    model = RandomForestClassifier(**p).fit(X, y)
                    preds = model.predict_proba(test)[:, 1]
                    results = send_submission("doesnt_matter.csv", preds)

                    save_hyperparameters(p, results, "grid_search_rf.csv",
                                         create_file)
                    create_file = False
                    sleep(3)
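
save_hyperparameters is not shown either; the way it is called suggests it appends each hyperparameter combination together with its grade to a CSV, writing the header only on the first call. A possible sketch under those assumptions:

import csv

def save_hyperparameters(p, results, path, create_file):
    # Hypothetical stand-in: write one row of hyperparameters plus the value
    # returned by send_submission, creating the file with a header the first
    # time and appending afterwards.
    row = {**p, "results": results}
    with open(path, "w" if create_file else "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(row))
        if create_file:
            writer.writeheader()
        writer.writerow(row)

As a design note, sklearn.model_selection.ParameterGrid could replace the five nested loops with a single "for p in ParameterGrid(params)", with the fixed random_state and n_estimators merged in afterwards.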
Example #3
import xgboost as xgb
import pandas as pd
from utils import impute_nas, send_submission

train = pd.read_csv("input/dev.csv")  #DEV-SAMPLE
test = pd.read_csv("input/oot0.csv")  #OUT-OF-TIME SAMPLE

TARGET = 'ob_target'

model = xgb.XGBClassifier(random_state=20190626, n_estimators=1000)
model.fit(train.drop(TARGET, axis=1), train[TARGET])

preds = model.predict_proba(impute_nas(test))[:, 1]

send_submission("xgb_all_vars.csv", preds)
Example #4
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from utils import impute_nas, perform_rfe, send_submission  # perform_rfe assumed to live in utils alongside the others

TARGET = 'ob_target'

train = pd.read_csv("input/dev.csv").drop("id", axis=1)  #DEV-SAMPLE
test = impute_nas(pd.read_csv("input/oot0.csv").drop(
    "id", axis=1))  #OUT-OF-TIME SAMPLE

model = RandomForestClassifier(random_state=20190628, n_estimators=1000)

results = perform_rfe(model,
                      train,
                      test,
                      "random_forest_rfe.csv",
                      to_remove=80)

#results = pd.read_csv("random_forest_rfe.csv")
num_removed = 82 - results["n"]  # results["n"] = features kept; 82 is presumably the starting feature count

fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(x=num_removed[:70], y=results["grade"][:70], ax=ax)

best = results[results["grade"] == max(results["grade"])]

model = RandomForestClassifier(random_state=20190628, n_estimators=1000)
model.fit(train[best["feats"].item().split(";")], train[TARGET])

preds = model.predict_proba(test[best["feats"].item().split(";")])[:, 1]

result = send_submission("rf_after_rfe.csv", preds)
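
perform_rfe is imported rather than shown. Judging by how its result is used above (columns "n", "grade" and a semicolon-joined "feats"), it looks like a backwards feature-elimination loop that refits the model, records the grade, and drops the least important feature each round. A hedged sketch along those lines, reusing send_submission and TARGET from above:

import pandas as pd

def perform_rfe(model, train, test, out_path, to_remove, target=TARGET):
    # Hypothetical stand-in for the real helper: eliminate features one at a
    # time, guided by the model's feature_importances_ and the submission grade.
    feats = [c for c in train.columns if c != target]
    rows = []
    for _ in range(to_remove + 1):
        model.fit(train[feats], train[target])
        preds = model.predict_proba(test[feats])[:, 1]
        grade = send_submission("doesnt_matter.csv", preds)
        rows.append({"n": len(feats), "grade": grade, "feats": ";".join(feats)})
        pd.DataFrame(rows).to_csv(out_path, index=False)  # checkpoint each round
        if len(feats) == 1:
            break
        feats.pop(int(model.feature_importances_.argmin()))  # drop the weakest feature
    return pd.DataFrame(rows)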