Example #1
from sklearn import metrics  # utils is a project-local helper module, not shown here

def train_model(X, y, X_, y_, clf):
    # Tune clf on the training split (X, y), then score the tuned model's
    # class probabilities on the held-out split (X_, y_).
    model = utils.find_best_estimator(clf, X, y, section="approach2")
    preds = model.predict_proba(X_)
    log_loss = metrics.log_loss(y_, preds)  # lower is better
    return model, log_loss
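All of these examples revolve around utils.find_best_estimator, a project-local helper whose source is not shown (its signature even varies between snippets: this one omits the cfg argument). Judging from the call sites, it wraps scikit-learn's randomized or grid search and reads the search space from a config dict. A minimal sketch under those assumptions, not the project's actual code:

# Hypothetical sketch of utils.find_best_estimator; the cfg layout and the
# use of RandomizedSearchCV are assumptions inferred from the call sites.
from sklearn.model_selection import RandomizedSearchCV  # sklearn.grid_search pre-0.18

def find_best_estimator(base_estimator, X, y, cfg, section,
                        grid_search_params_key="param_dist",
                        scoring="neg_log_loss", verbosity=0):
    search = RandomizedSearchCV(
        base_estimator,
        param_distributions=cfg[section][grid_search_params_key],
        scoring=scoring,
        verbose=verbosity)
    search.fit(X, y)
    return search.best_estimator_  # refit on all of (X, y) by default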
Example #2
    num_classes = len(label_encoder.classes_)
    y = label_encoder.transform(target)

    # Level 0 classifiers
    clfs = [
        ExtraTreesClassifier(**utils.read_estimator_params(s, "et")),
        LogisticRegression(**utils.read_estimator_params(s, "lr")),
        RandomForestClassifier(**utils.read_estimator_params(s, "rf"))
    ]

    # First, run grid search (if enabled) to find the best estimator
    results_1 = []
    for clf in clfs:
        ts = time.time()
        clf_name = type(clf).__name__
        model = utils.find_best_estimator(clf, X_train, y_train, section=s)
        preds = model.predict_proba(X_valid)
        log_loss = metrics.log_loss(y_valid, preds)
        results_1.append((utils.get_key(clf_name), model, log_loss))
        logger.info("Trained {} in {:.2f} seconds, Log loss : {:.6f}"
            .format(clf_name, (time.time() - ts), log_loss))
    # Sort by log_loss
    results_1.sort(key=lambda tup: tup[2])
    logger.info(tabulate(zip([r[0] for r in results_1],
                             [r[2] for r in results_1]),
                         floatfmt=".4f", headers=("model", "log_loss")))
    clfs = [r[1] for r in results_1]  # fitted models, needed for the blending stage

    # Next, run stacked generalization (blending)
    logger.info("Start blending")
    results_2 = []
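This snippet is cut off just as the blending stage begins. For context: stacked generalization typically turns the level-0 models' predicted probabilities into features for a level-1 learner. A simplified generic sketch, fitting the blender on the held-out split (production blending would normally use out-of-fold predictions, and none of these names come from the project):

import numpy as np
from sklearn.linear_model import LogisticRegression

# Level-1 features: each fitted level-0 model's class probabilities,
# stacked side by side.
blend_features = np.hstack([clf.predict_proba(X_valid) for clf in clfs])
blender = LogisticRegression()
blender.fit(blend_features, y_valid)  # level-1 model on level-0 outputs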
Example #3
    # Drop the remaining categorical columns (ones we did not convert)
    col_names = list(df_all.columns.values)
    logger.info("Categorical columns not converted : %s" % remaining_cols)
    df_all = df_all.drop(remaining_cols, axis=1)
    logger.info("%d columns after dropping remaining categorical columns." %
                df_all.shape[1])

    # Separating the train and test
    train = df_all[df_all["ID"].isin(id_train)]
    test = df_all[df_all["ID"].isin(id_test)]

    logger.info("Training model. Train dataset shape : %s" % str(train.shape))
    # cv is presumably sklearn.cross_validation (moved to model_selection
    # in scikit-learn 0.18)
    X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)
    preds = None
    clf = ensemble.RandomForestClassifier()
    clf.set_params(**cfg[section]["best_estimator"])
    if cfg[section]["find_best"]:
        model = utils.find_best_estimator(clf, X, y, cfg, section=section,
                                          grid_search_params_key="param_dist",
                                          scoring="f1", verbosity=2)
    else:
        model = clf.fit(X, y)
        preds = model.predict_proba(X_eval)[:, 1]
        log_loss = metrics.log_loss(y_eval, preds)
        logger.info("Trained model %s" % model)
        logger.info("Log loss : %.6f" % log_loss)

    logger.info("Making predictions..")
    predicted_probabilities = model.predict_proba(test)[:, 1]
    utils.make_submission_file(predicted_probabilities, "simple-randomforest")
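The cfg object these snippets index into is never shown; it is presumably parsed from a configuration file and keyed by section name. A plausible layout, inferred purely from the keys read above (every concrete value here is invented):

# Hypothetical cfg layout; keys match the snippets, values are made up.
cfg = {
    "randomforest": {
        "find_best": False,        # True -> run the randomized/grid search
        "best_estimator": {        # applied via clf.set_params(**...)
            "n_estimators": 200,
            "max_depth": 8,
        },
        "param_dist": {            # search space for find_best_estimator
            "n_estimators": [100, 200, 500],
            "max_depth": [4, 8, 16],
        },
    },
}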
Example #4
            # For numeric columns, replace missing values with -999
            # (a_vals / b_vals hold the current train/test column values
            # from the enclosing loop over columns, not shown here)
            tmp_len = len(train[a_vals.isnull()])
            if tmp_len > 0:
                train.loc[a_vals.isnull(), a] = -999
            tmp_len = len(test[b_vals.isnull()])
            if tmp_len > 0:
                test.loc[b_vals.isnull(), b] = -999

    # Training
    t0 = time.time()
    clf = ExtraTreesClassifier()
    clf.set_params(**cfg[s]["estimator_params_etc"])
    X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)

    if cfg[s]["find_best"]:
        # scoring="log_loss" is the pre-0.18 scikit-learn scorer name;
        # newer releases call it "neg_log_loss"
        model = utils.find_best_estimator(clf, X, y, cfg, section=s,
                                          grid_search_params_key="gs_params_etc",
                                          scoring="log_loss", verbosity=2)
        logger.info(model)
    else:
        model = clf.fit(X, y)
        logger.info("%.2f seconds to train %s" % ((time.time() - t0), model))

    preds = model.predict_proba(X_eval)[:, 1]
    log_loss = metrics.log_loss(y_eval, preds)
    logger.info("Log loss : %.6f" % log_loss)

    logger.info("Making predictions..")
    y_pred = model.predict_proba(test)
    utils.make_submission_file(y_pred[:, 1], "etc_")
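utils.make_submission_file is another unseen project helper. From the call sites it takes an array of predicted probabilities and a name prefix, and presumably writes a Kaggle-style submission CSV; a sketch under that assumption (the timestamped filename and column name are guesses, and the real helper presumably also attaches the test-set IDs):

import time
import pandas as pd

def make_submission_file(probabilities, name):
    # Hypothetical: write the predictions as a timestamped submission CSV.
    sub = pd.DataFrame({"PredictedProb": probabilities})
    sub.to_csv("%s%d.csv" % (name, int(time.time())), index=False)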