def train_model(X, y, X_, y_, clf):
    """Tune ``clf`` on the training split and score it on held-out data.

    Runs the project's estimator search (``utils.find_best_estimator`` with
    the "approach2" config section) on (X, y), then evaluates the fitted
    model's predicted class probabilities on (X_, y_) with log loss.

    Returns:
        tuple: ``(model, log_loss)`` — the fitted estimator and its
        validation log loss.
    """
    best = utils.find_best_estimator(clf, X, y, section="approach2")
    proba = best.predict_proba(X_)
    loss = metrics.log_loss(y_, proba)
    return best, loss
# NOTE(review): this chunk began mid-statement ("df_all.shape[1])"); the
# statement head is restored from the identical code in the duplicate
# chunk later in this file — confirm against the original source.
logger.info("%d columns after dropping remaining categorical columns."
            % df_all.shape[1])

# Separating the train and test
train = df_all[df_all["ID"].isin(id_train)]
test = df_all[df_all["ID"].isin(id_test)]
logger.info("Training model. Train dataset shape : %s" % str(train.shape))

# Hold out 40% of the training rows for validation.
X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)

clf = ensemble.RandomForestClassifier()
clf.set_params(**cfg[section]["best_estimator"])
# Fix: truthiness test instead of `== True`; also dropped the dead
# `preds = None` (it was unconditionally reassigned below).
if cfg[section]["find_best"]:
    model = utils.find_best_estimator(clf, X, y, cfg, section=section,
                                      grid_search_params_key="param_dist",
                                      scoring="f1", verbosity=2)
else:
    model = clf.fit(X, y)

# Score with the positive-class column of predict_proba (binary task,
# per the [:, 1] indexing).
preds = model.predict_proba(X_eval)[:, 1]
log_loss = metrics.log_loss(y_eval, preds)
logger.info("Trained model %s" % model)
logger.info("Log loss : %.6f" % log_loss)

logger.info("Making predictions..")
predicted_probabilities = model.predict_proba(test)[:, 1]
utils.make_submission_file(predicted_probabilities, "simple-randomforest")
# Encode the string targets as integer class labels.
num_classes = len(label_encoder.classes_)
y = label_encoder.transform(target)

# Level 0 classifiers
clfs = [
    ExtraTreesClassifier(**utils.read_estimator_params(s, "et")),
    LogisticRegression(**utils.read_estimator_params(s, "lr")),
    RandomForestClassifier(**utils.read_estimator_params(s, "rf")),
]

# First, run grid search (if enabled) to find the best estimator
results_1 = []
for clf in clfs:
    started = time.time()
    clf_name = type(clf).__name__
    model = utils.find_best_estimator(clf, X_train, y_train, section=s)
    preds = model.predict_proba(X_valid)
    log_loss = metrics.log_loss(y_valid, preds)
    results_1.append((utils.get_key(clf_name), model, log_loss))
    logger.info("Trained {} in {:.2f} seconds, Log loss : {:.6f}"
                .format(clf_name, (time.time() - started), log_loss))

# Sort by log_loss (third element of each result tuple, ascending).
results_1.sort(key=lambda tup: tup[2])
model_names = [r[0] for r in results_1]
model_losses = [r[2] for r in results_1]
logger.info(tabulate(zip(model_names, model_losses),
                     floatfmt=".4f", headers=("model", "log_loss")))
clfs = [clf[1] for clf in results_1]  # required for blending stage

# Next, run stacked generalization (blending)
logger.info("Start blending")
results_2 = []
# Drop the remaining categorical columns (ones we did not convert)
col_names = list(df_all.columns.values)
logger.info("Categorical columns not converted : %s" % remaining_cols)
df_all = df_all.drop(remaining_cols, axis=1)
logger.info("%d columns after dropping remaining categorical columns."
            % df_all.shape[1])

# Separating the train and test
train = df_all[df_all["ID"].isin(id_train)]
test = df_all[df_all["ID"].isin(id_test)]
logger.info("Training model. Train dataset shape : %s" % str(train.shape))

# Hold out 40% of the training rows for validation.
X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)

clf = ensemble.RandomForestClassifier()
clf.set_params(**cfg[section]["best_estimator"])
# Fix: truthiness test instead of `== True`; also dropped the dead
# `preds = None` (it was unconditionally reassigned below).
if cfg[section]["find_best"]:
    model = utils.find_best_estimator(clf, X, y, cfg, section=section,
                                      grid_search_params_key="param_dist",
                                      scoring="f1", verbosity=2)
else:
    model = clf.fit(X, y)

# Score with the positive-class column of predict_proba (binary task,
# per the [:, 1] indexing).
preds = model.predict_proba(X_eval)[:, 1]
log_loss = metrics.log_loss(y_eval, preds)
logger.info("Trained model %s" % model)
logger.info("Log loss : %.6f" % log_loss)

logger.info("Making predictions..")
predicted_probabilities = model.predict_proba(test)[:, 1]
utils.make_submission_file(predicted_probabilities, "simple-randomforest")
# For numeric columns, replace missing values with -999
# Fix: replaced `tmp_len = len(df[mask]); if tmp_len > 0:` with the
# direct (and cheaper) `mask.any()` — the temp was dead afterwards.
if a_vals.isnull().any():
    train.loc[a_vals.isnull(), a] = -999
if b_vals.isnull().any():
    test.loc[b_vals.isnull(), b] = -999

# Training
t0 = time.time()
clf = ExtraTreesClassifier()
clf.set_params(**cfg[s]["estimator_params_etc"])
X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)
# Fix: truthiness test instead of `== True`.
# NOTE(review): scoring="log_loss" was renamed "neg_log_loss" in newer
# scikit-learn releases — confirm against the pinned sklearn version.
if cfg[s]["find_best"]:
    model = utils.find_best_estimator(clf, X, y, cfg, section=s,
                                      grid_search_params_key="gs_params_etc",
                                      scoring="log_loss", verbosity=2)
    logger.info(model)
else:
    model = clf.fit(X, y)
logger.info("%.2f seconds to train %s" % ((time.time() - t0), model))

# Validation log loss on the positive-class probabilities (binary task,
# per the [:, 1] indexing).
preds = model.predict_proba(X_eval)[:, 1]
log_loss = metrics.log_loss(y_eval, preds)
logger.info("Log loss : %.6f" % log_loss)

logger.info("Making predictions..")
y_pred = model.predict_proba(test)
utils.make_submission_file(y_pred[:, 1], "etc_")
# NOTE(review): near-duplicate of the ExtraTrees training chunk above;
# this copy starts at the test-set imputation and ends before the
# submission call.
missing_b = b_vals.isnull()
tmp_len = len(test[missing_b])
if tmp_len > 0:
    # Replace missing numeric values with the -999 sentinel.
    test.loc[missing_b, b] = -999

# Training
t0 = time.time()
clf = ExtraTreesClassifier()
clf.set_params(**cfg[s]["estimator_params_etc"])
X, X_eval, y, y_eval = cv.train_test_split(train, target, test_size=0.4)
if cfg[s]["find_best"] == True:
    model = utils.find_best_estimator(
        clf, X, y, cfg, section=s,
        grid_search_params_key="gs_params_etc",
        scoring="log_loss", verbosity=2)
    logger.info(model)
else:
    model = clf.fit(X, y)
logger.info("%.2f seconds to train %s" % ((time.time() - t0), model))

# Validation log loss on the positive-class probability column.
preds = model.predict_proba(X_eval)[:, 1]
log_loss = metrics.log_loss(y_eval, preds)
logger.info("Log loss : %.6f" % log_loss)
logger.info("Making predictions..")
y_pred = model.predict_proba(test)