('data/original/wc_2010_games_real.csv', 'data/original/wc_2010_bets.csv', "2010-06-11")]  # tail of a list literal opened before this excerpt

# Each entry pairs a label (used in log messages and output file names)
# with the feature list handed to DataLoader.
feature_sets = [
    ("all_features", all_features),
    ("general_features", other_features),
    ("player_features", player_features),
]

file_name = "outcome_report_full.txt"
reports = []

for name, feature_set in feature_sets:
    # Timestamp the start of this feature set's run in the report file.
    write_log(file_name, str(datetime.datetime.now()))
    write_log(file_name, f"Running test for feature set: {name}", print_text=True)

    # Grid-search over the full data set with "home_win" as the target.
    data_loader = DataLoader(feature_set)
    X, y = data_loader.get_all_data("home_win")
    arguments = get_grid_search_arguments(X)
    results = run_grid_search(arguments, X, y)
    results.to_csv(f"gboost_hyperparam_optimization_{name}.csv")

    # n_estimators is pinned to 250; every other setting is copied from
    # the best grid-search row.
    best_params_dict = get_best_params(results)
    optimal_params = {
        'n_estimators': 250,
        "learning_rate": best_params_dict["learning_rate"],
        "max_depth": best_params_dict["max_depth"],
        "min_samples_leaf": best_params_dict["min_samples_leaf"],
        "max_features": best_params_dict["max_features"],
    }
    write_log(file_name, str(optimal_params), print_text=True)

    # NOTE(review): the body of this loop is not visible in this excerpt.
    for tt_file, bet_file, filter_start in tournament_parameters:
]  # closes a list literal opened before this excerpt

# Each entry is (label, feature list, CSV path used to cache the
# hyper-parameter search results for that feature list).
feature_sets = [
    # Other configurations, currently disabled:
    # ("all_features", all_features, "score_hyperparam_optimization_all_features.csv"),
    # ("general_features", other_features, "score_hyperparam_optimization_general_features.csv"),
    # ("player_features", player_features, "score_hyperparam_optimization_player_features.csv"),
    ("rfe_features", rfe_feature, "score_hyperparam_optimization_rfe.csv"),
]

file_name = "score_report_full.txt"
reports = []

for name, feature_set, fname in feature_sets:
    # Timestamp the start of this feature set's run in the report file.
    write_log(file_name, str(datetime.datetime.now()))
    write_log(file_name, f"Running test for feature set: {name}", print_text=True)

    data_loader = DataLoader(feature_set)
    params = get_default_parameters()

    if os.path.isfile(fname):
        # A previous run already saved the search results; reuse them.
        write_log(file_name, f"Hyperparameters found for: {name}", print_text=True)
        results = pd.read_csv(fname)
    else:
        Xhome, yhome, Xaway, yaway = data_loader.get_all_data(["home_score", "away_score"])
        _, outcomes = data_loader.get_all_data("home_win")
        arguments = get_cv_grid_search_arguments(params, Xhome)
        results = run_grid_search_for_score(arguments, Xhome, yhome, Xaway, yaway, outcomes)
        # Save under the same path the cache check above looks at, so the
        # next run finds the file instead of repeating the grid search.
        results.to_csv(fname)

    best_params_dict = get_best_params(results)
    write_log(file_name, str(best_params_dict), print_text=True)
# Fixed RandomForestClassifier settings used for every fit below.
params = dict(
    oob_score=True,
    bootstrap=True,
    n_jobs=-1,
    n_estimators=1000,
    max_features="sqrt",
    max_depth=8,
    min_samples_leaf=3,
)

avg_accuracies = []
avg_log_lossss = []
features = []

# NOTE(review): nothing in the visible part of this loop shrinks
# feature_set; the loop body continues beyond this excerpt.
while len(feature_set) > 0:
    data_loader = DataLoader(feature_set)
    accuracies = []
    log_losses = []
    feature_values = {}

    # Fit 100 models on freshly drawn splits (random_state=None) and
    # record accuracy and log-loss on each held-out set.
    for i in range(100):
        model = RandomForestClassifier(**params)
        X_train, y_train, X_test, y_test = data_loader.get_train_and_test_dataset(
            "home_win", random_state=None
        )
        model.fit(X_train, y_train)

        y_true = y_test
        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_true, y_pred))

        y_prob = model.predict_proba(X_test)
        log_losses.append(log_loss(y_true, y_prob))
# Choose the games CSV, bets CSV and filter start date for the requested
# year. Any value of args.y other than 2010 or 2014 uses the 2018 files.
if args.y == 2010:
    tt_file, mb_file, filter_start = (
        'data/original/wc_2010_games_real.csv',
        'data/original/wc_2010_bets.csv',
        "2010-06-11",
    )
elif args.y == 2014:
    tt_file, mb_file, filter_start = (
        'data/original/wc_2014_games_real.csv',
        'data/original/wc_2014_bets.csv',
        "2014-06-12",
    )
else:
    tt_file, mb_file, filter_start = (
        'data/original/wc_2018_games_real.csv',
        'data/original/wc_2018_bets.csv',
        "2018-06-13",
    )

# Prefix for the label passed to every simulate() call.
prefix = f"{args.f}_{args.y}"

# The same parameter mapping is reused and modified in place between runs.
model_parameters = get_default_parameters()

# Run 1: all features.
dl = DataLoader(all_features, filter_start=filter_start)
model_parameters.update(
    {"max_depth": 8, "max_features": "sqrt", "min_samples_leaf": 1}
)
af_data = simulate(tt_file, mb_file, dl, model_parameters, f"{prefix}_all_features")

# Run 2: general (non-player) features.
dl = DataLoader(other_features, filter_start=filter_start)
model_parameters.update(
    {"max_depth": 8, "max_features": "log2", "min_samples_leaf": 10}
)
gf_data = simulate(tt_file, mb_file, dl, model_parameters, f"{prefix}_general_features")

# Loader for the player-feature run; its use lies beyond this excerpt.
dl = DataLoader(player_features, filter_start=filter_start)