Example #1
        (LogisticRegression(**utils.read_estimator_params(s, "lr")), "lr"),
        (RandomForestClassifier(**utils.read_estimator_params(s, "rf")), "rf")
    ]
    results = []
    for clf in clfs:
        ts = time.time()
        model, log_loss = train_model(X_train, y_train, X_val, y_val, clf[0])
        results.append((clf[1], model, log_loss))
        logger.info("Trained {} in {:.2f} seconds, Log loss : {:.6f}"
            .format(type(clf[0]).__name__, (time.time() - ts), log_loss))
    # Sort by log_loss
    results.sort(key=lambda tup: tup[2])

    # Prepare the DataFrame containing the predicted probabilities
    model = results[0][1]
    predicted_probabilities = model.predict_proba(test_sp)
    df = pd.DataFrame(predicted_probabilities)
    subm = pd.read_csv(os.path.join("data", "sample_submission.csv.gz"),
                       dtype={"device_id": np.str})
    classes = subm.columns.values.tolist()[1:]

    df["device_id"] = subm["device_id"]
    df = df[["device_id"] + np.arange(0, 12).tolist()]
    new_names = dict(zip(np.arange(0, 12).tolist(), classes))
    df.rename(columns=new_names, inplace=True)

    # Submission file
    logger.info(tabulate(zip([r[0] for r in results], [r[2] for r in results]),
                         floatfmt=".4f", headers=("model", "log_loss")))
    utils.make_submission_file(model, df, "%s_" % results[0][0])
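
utils.make_submission_file is not shown in this listing. A minimal sketch of a helper matching this call signature (model, DataFrame, filename prefix) might look as follows; the output directory, the timestamped filename and the gzip compression are assumptions, not details taken from the original project.

import os
import time


def make_submission_file(model, df, prefix, out_dir="submissions"):
    # Hypothetical helper, not the original utils.make_submission_file:
    # writes the prediction DataFrame to a timestamped, gzip-compressed CSV.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    filename = "{}{}_{}.csv.gz".format(prefix, type(model).__name__,
                                       time.strftime("%Y%m%d-%H%M%S"))
    path = os.path.join(out_dir, filename)
    df.to_csv(path, index=False, compression="gzip")
    return path
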
Example #2
    logger.info("%d columns after dropping remaining categorical columns." %
                df_all.shape[1])

    # Separating the train and test
    train = df_all[df_all["ID"].isin(id_train)]
    test = df_all[df_all["ID"].isin(id_test)]

    logger.info("Training model. Train dataset shape : %s" % str(train.shape))
    X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)
    preds = None
    clf = ensemble.RandomForestClassifier()
    clf.set_params(**cfg[section]["best_estimator"])
    if cfg[section]["find_best"]:
        model = utils.find_best_estimator(clf,
                                          X,
                                          y,
                                          cfg,
                                          section=section,
                                          grid_search_params_key="param_dist",
                                          scoring="f1",
                                          verbosity=2)
    else:
        model = clf.fit(X, y)
        preds = model.predict_proba(X_eval)[:, 1]
        log_loss = metrics.log_loss(y_eval, preds)
        logger.info("Trained model %s" % model)
        logger.info("Log loss : %.6f" % log_loss)

    logger.info("Making predictions..")
    predicted_probabilities = model.predict_proba(test)[:, 1]
    utils.make_submission_file(predicted_probabilities, "simple-randomforest")
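
utils.find_best_estimator is not shown either. Judging by the grid_search_params_key, scoring and verbosity arguments, it most likely wraps a scikit-learn hyper-parameter search; the sketch below is a guess under that assumption (the randomized search, the modern import path and the config layout are not taken from the original code).

from sklearn.model_selection import RandomizedSearchCV


def find_best_estimator(clf, X, y, cfg, section, grid_search_params_key,
                        scoring="f1", verbosity=2):
    # Hypothetical sketch: assumes cfg[section][grid_search_params_key]
    # holds the parameter distribution for a randomized search.
    search = RandomizedSearchCV(
        clf,
        param_distributions=cfg[section][grid_search_params_key],
        n_iter=20,
        scoring=scoring,
        verbose=verbosity)
    search.fit(X, y)
    return search.best_estimator_
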
Example #3
if args['load']:
    with open(args['load'], 'rb') as f:
        loaded_net = cPickle.load(f)
    conv_net.load_params_from(loaded_net)

conv_net.fit(X, y)

name = exp_name + '_' + str(date.today())
with open('models/conv_net_'+name+'.pkl', 'wb') as f:
    cPickle.dump(conv_net, f, -1)
conv_net.save_params_to('models/params_'+name)

# ----- Train set ----
train_predictions = conv_net.predict_proba(X)
make_submission_file(train_predictions[:sample_size], images_id[:sample_size],
                     output_filepath='models/training_'+name+'.csv')

# ----- Test set ----
X_test, _, images_id_test = load_numpy_arrays(args['test_file'])
print "Test:"
print "X_test.shape:", X_test.shape
predictions = conv_net.predict_proba(X_test)
make_submission_file(predictions, images_id_test,
                     output_filepath='submissions/submission_'+name+'.csv')

# ----- Make plots ----
plot_loss(conv_net, "models/loss_"+name+".png", show=False)

plot_conv_weights(conv_net.layers_[1], figsize=(4, 4))
plt.savefig('models/weights_'+name+'.png')
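
load_numpy_arrays is used above but its body is not shown; given the .npz files and the (X, y, images_id) unpacking, it is probably a thin wrapper around numpy.load. A sketch in which the array key names are assumptions:

import numpy as np


def load_numpy_arrays(filepath):
    # Hypothetical loader; the keys 'X', 'y' and 'images_id' are assumptions,
    # not taken from the original repository.
    data = np.load(filepath)
    return data['X'], data['y'], data['images_id']
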
Example #4
    for i in xrange(cfg[s]["n_blends"]):
        print("Iteration {}".format(i))
        bclf, b_t, log_loss = run_stacked_generalization(clfs, train, target)
        results_2.append((bclf, b_t, log_loss))
        logger.info("Iteration {}, Log loss : {:.4f}".format(i, log_loss))
    # Sort by log_loss
    results_2.sort(key=lambda tup: tup[2])

    # Prepare the DataFrame containing the predicted probabilities
    log_loss_1, log_loss_2 = results_1[0][2], results_2[0][2]
    model, predicted_probabilities = None, None
    if log_loss_1 < log_loss_2:
        logger.info("Method 1 has lower log loss {:.4f}".format(log_loss_1))
        model = results_1[0][1]
        predicted_probabilities = model.predict_proba(test)
    else:
        logger.info("Method 2 has lower log loss {:.4f}".format(log_loss_2))
        model = results_2[0][0]
        blend_test = results_2[0][1]
        predicted_probabilities = model.predict_proba(blend_test)

    df = pd.DataFrame(predicted_probabilities)
    df["device_id"] = test["device_id"]
    df = df[["device_id"] + np.arange(0, 12).tolist()]
    new_names = dict(zip(np.arange(0, 12).tolist(), model.classes_.tolist()))
    df.rename(columns=new_names, inplace=True)

    # Submission file
    prefix = utils.get_key(type(model).__name__)
    utils.make_submission_file(model, df, "{}_".format(prefix))
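
run_stacked_generalization is not shown here either. Its return values suggest that it builds blended (stacked) features from the base classifiers and fits a second-level model on them. The sketch below only illustrates that idea: the explicit test argument, the 80/20 split and the logistic-regression blender are assumptions, and the original apparently obtains the blended test features without receiving test as a parameter.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_val_predict, train_test_split


def run_stacked_generalization(clfs, train, target, test):
    # Rough sketch of blending, not the original helper.
    X, X_val, y, y_val = train_test_split(train, target, test_size=0.2)
    # Out-of-fold class probabilities of every base classifier become the
    # features of the second-level (blending) classifier.
    blend_train = np.hstack([cross_val_predict(clf, X, y, cv=5,
                                               method="predict_proba")
                             for clf in clfs])
    blend_val = np.hstack([clf.fit(X, y).predict_proba(X_val) for clf in clfs])
    blend_test = np.hstack([clf.predict_proba(test) for clf in clfs])
    bclf = LogisticRegression().fit(blend_train, y)
    return bclf, blend_test, log_loss(y_val, bclf.predict_proba(blend_val))
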
Example #5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from utils import make_submission_file
from utils import load_numpy_arrays
from datetime import date
import cPickle
import sys

with open(sys.argv[1], 'rb') as f:
    conv_net = cPickle.load(f)

# ----- Test set ----
X_test, _, images_id_test = load_numpy_arrays('test.npz')
print "Test:"
print "X_test.shape:", X_test.shape
predictions = conv_net.predict_proba(X_test)
make_submission_file(predictions, images_id_test,
                     output_filepath='submissions/submission_' + str(date.today()) + '.csv')
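
This make_submission_file variant takes raw predictions and image ids rather than a prepared DataFrame; a minimal sketch consistent with that call, in which the column names are assumptions:

import csv


def make_submission_file(predictions, images_id, output_filepath):
    # Hypothetical sketch: one row per image id with its predicted class
    # probabilities; the header names are assumptions.
    with open(output_filepath, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['id'] + ['class_{}'.format(c)
                                  for c in range(predictions.shape[1])])
        for image_id, probs in zip(images_id, predictions):
            writer.writerow([image_id] + list(probs))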

Example #6
    # Drop the remaining categorical columns (ones we did not convert)
    col_names = list(df_all.columns.values)
    logger.info("Categorical columns not converted : %s" % remaining_cols)
    df_all = df_all.drop(remaining_cols, axis=1)
    logger.info("%d columns after dropping remaining categorical columns." %
                df_all.shape[1])

    # Separating the train and test
    train = df_all[df_all["ID"].isin(id_train)]
    test = df_all[df_all["ID"].isin(id_test)]

    logger.info("Training model. Train dataset shape : %s" % str(train.shape))
    X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)
    preds = None
    clf = ensemble.RandomForestClassifier()
    clf.set_params(**cfg[section]["best_estimator"])
    if cfg[section]["find_best"]:
        model = utils.find_best_estimator(clf, X, y, cfg, section=section,
                                          grid_search_params_key="param_dist",
                                          scoring="f1", verbosity=2)
    else:
        model = clf.fit(X, y)
        preds = model.predict_proba(X_eval)[:, 1]
        log_loss = metrics.log_loss(y_eval, preds)
        logger.info("Trained model %s" % model)
        logger.info("Log loss : %.6f" % log_loss)

    logger.info("Making predictions..")
    predicted_probabilities = model.predict_proba(test)[:, 1]
    utils.make_submission_file(predicted_probabilities, "simple-randomforest")
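
cv.train_test_split suggests that cv is an alias for the old sklearn.cross_validation module, which has since been removed from scikit-learn; with a current version the equivalent call would be:

from sklearn.model_selection import train_test_split

X, X_eval, y, y_eval = train_test_split(train, target, test_size=0.4)
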
Example #7
            # For numeric columns, replace missing values with -999
            tmp_len = len(train[a_vals.isnull()])
            if tmp_len > 0:
                train.loc[a_vals.isnull(), a] = -999
            tmp_len = len(test[b_vals.isnull()])
            if tmp_len > 0:
                test.loc[b_vals.isnull(), b] = -999

    # Training
    t0 = time.time()
    clf = ExtraTreesClassifier()
    clf.set_params(**cfg[s]["estimator_params_etc"])
    X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)

    if cfg[s]["find_best"]:
        model = utils.find_best_estimator(clf, X, y, cfg, section=s,
                                          grid_search_params_key="gs_params_etc",
                                          scoring="log_loss", verbosity=2)
        logger.info(model)
    else:
        model = clf.fit(X, y)
        logger.info("%.2f seconds to train %s" % ((time.time() - t0), model))

    preds = model.predict_proba(X_eval)[:, 1]
    log_loss = metrics.log_loss(y_eval, preds)
    logger.info("Log loss : %.6f" % log_loss)

    logger.info("Making predictions..")
    y_pred = model.predict_proba(test)
    utils.make_submission_file(y_pred[:, 1], "etc_")
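
The cfg mapping and its section key s come from outside this snippet. For orientation only, a hypothetical section matching the keys used above could look like this; every value is a made-up placeholder.

# Hypothetical config section; all values are placeholders, not taken from
# the original project.
cfg = {
    "etc": {
        "estimator_params_etc": {"n_estimators": 200, "n_jobs": -1},
        "find_best": False,
        "gs_params_etc": {"n_estimators": [100, 200, 500],
                          "max_features": ["sqrt", "log2"]},
    }
}
s = "etc"
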
Example #8
                test.loc[b_vals.isnull(), b] = -999

    # Training
    t0 = time.time()
    clf = ExtraTreesClassifier()
    clf.set_params(**cfg[s]["estimator_params_etc"])
    X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)

    if cfg[s]["find_best"]:
        model = utils.find_best_estimator(
            clf,
            X,
            y,
            cfg,
            section=s,
            grid_search_params_key="gs_params_etc",
            scoring="log_loss",
            verbosity=2)
        logger.info(model)
    else:
        model = clf.fit(X, y)
        logger.info("%.2f seconds to train %s" % ((time.time() - t0), model))

    preds = model.predict_proba(X_eval)[:, 1]
    log_loss = metrics.log_loss(y_eval, preds)
    logger.info("Log loss : %.6f" % log_loss)

    logger.info("Making predictions..")
    y_pred = model.predict_proba(test)
    utils.make_submission_file(y_pred[:, 1], "etc_")