def train_it(train, holdout_df, filename):
    """Fit a model on ``train`` and evaluate it against ``holdout_df``.

    The ``is_attributed`` column of ``train`` is the target and the columns
    named by the module-level ``predict_col`` are the features. 20% of the
    rows are held out (fixed seed 99) and passed to the trainer as the
    validation set.

    Args:
        train: DataFrame holding ``is_attributed`` and the ``predict_col``
            feature columns.
        holdout_df: Frame handed to ``HoldoutValidator`` together with the
            fitted model and ``predict_col``.
        filename: Passed to ``validator.output_prediction``.

    Returns:
        The model object returned by ``GoldenLgb.do_train_sk``.
    """
    target = train["is_attributed"]
    features = train[predict_col]
    split = model_selection.train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=99,
    )
    fit_x, valid_x, fit_y, valid_y = split
    timer.time("prepare train in ")

    trainer = pocket_lgb.GoldenLgb()
    model = trainer.do_train_sk(fit_x, valid_x, fit_y, valid_y)
    trainer.show_feature_importance(model)

    # Drop the local references to the training data before validation so the
    # collector can reclaim whatever is no longer referenced elsewhere.
    del train, split, fit_x, valid_x, fit_y, valid_y
    gc.collect()
    timer.time("end train in ")

    validator = holdout_validator2.HoldoutValidator(
        model, holdout_df, predict_col
    )
    validator.validate()
    validator.output_prediction(filename)
    timer.time("done validation in ")

    return model
def train_it(train, pred_df, output_file):
    """Fit a model on ``train``, predict for ``pred_df`` and write a CSV.

    The ``is_attributed`` column of ``train`` is the target and the columns
    named by the module-level ``predict_col`` are the features. 10% of the
    rows are held out (fixed seed 99) and passed to the trainer as the
    validation set.

    Args:
        train: DataFrame holding ``is_attributed`` and the ``predict_col``
            feature columns.
        pred_df: DataFrame with a ``click_id`` column and the ``predict_col``
            feature columns to predict on.
        output_file: Destination passed to ``DataFrame.to_csv`` (written
            without the index).

    Returns:
        The model object returned by ``GoldenLgb.do_train_sk``.
    """
    target = train["is_attributed"]
    features = train[predict_col]
    split = model_selection.train_test_split(
        features,
        target,
        test_size=0.1,
        random_state=99,
    )
    fit_x, valid_x, fit_y, valid_y = split
    timer.time("prepare train in ")

    trainer = pocket_lgb.GoldenLgb()
    model = trainer.do_train_sk(fit_x, valid_x, fit_y, valid_y)
    trainer.show_feature_importance(model)

    # Drop the local references to the training data before predicting so the
    # collector can reclaim whatever is no longer referenced elsewhere.
    del train, split, fit_x, valid_x, fit_y, valid_y
    gc.collect()
    timer.time("end train in ")

    click_ids = pred_df["click_id"].astype("int")
    submission = pd.DataFrame({"click_id": click_ids})
    submission["is_attributed"] = model.predict(pred_df[predict_col])
    print(submission.describe())
    timer.time("done prediction in ")

    submission.to_csv(output_file, index=False)
    timer.time("submission in ")

    return model
# --- Load the test set and assemble the full training frame -----------------
test = pd.read_feather(OUTPUT_TEST)
timer.time("load csv in ")

# pd.concat replaces the chained DataFrame.append calls: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and chaining it built a
# throw-away intermediate frame. Row order and index labels are unchanged.
train = pd.concat([train7, train8, train9])
print(train.info())
print(test.info())

# The per-part frames are no longer needed once they are concatenated.
del train7, train8, train9
gc.collect()

# --- Split into fit / validation sets (80/20, fixed seed) -------------------
train_y = train["is_attributed"]
train_x = train[predict_col]
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    train_x, train_y, test_size=0.2, random_state=99
)
timer.time("prepare train in ")

# --- Train -------------------------------------------------------------------
lgb = pocket_lgb.GoldenLgb()
model = lgb.do_train_sk(X_train, X_valid, y_train, y_valid)
lgb.show_feature_importance(model)
timer.time("end train in ")

del train, X_train, X_valid, y_train, y_valid
gc.collect()

# --- Predict and build the submission frame ---------------------------------
# Predictions are computed on the *unfiltered* test frame ...
y_pred = model.predict(test[predict_col])
mamas_idx = np.load(MAMAS_INDEX)

# ... and only afterwards is `test` reduced to rows with click_id >= 0.
# watch out this line for bugs
# NOTE(review): `y_pred[mamas_idx]` must have exactly as many entries, in the
# same order, as the filtered `test` rows for the assignment below to line up;
# presumably MAMAS_INDEX was built for that purpose -- TODO confirm.
test = test[test["click_id"] >= 0]
submission = pd.DataFrame({"click_id": test["click_id"].astype("int")})
submission["is_attributed"] = y_pred[mamas_idx]
print(submission.describe())
timer.time("done prediction in ")