def end_to_end_test():
    """Train (or reload) a match-outcome network and back-test betting strategies.

    Steps:
      1. load the prepared data through ``simple_data_prep``;
      2. randomly split it into train / validation sets (15% validation);
      3. build the network selected by ``convolution_model``;
      4. reload a saved model with the same label from ``MODEL_PATH`` or,
         when loading raises ``OSError``, compile and fit a new one;
      5. analyse train / validation predictions when ``VERBOSE`` is set;
      6. apply several investment strategies on the validation matches whose
         bookmaker quotes contain no NaN and print a recap for each.

    The loss curves are only plotted when a model was actually trained during
    this call: a model reloaded from disk comes without a training history.
    """
    np.random.seed(0)

    # load data
    _, match_features, match_labels, bkm_quotes = simple_data_prep(
        verbose=1, fable_observed_matches=40, padding=False, fable="match_hist", label_format="hot_vectors")

    # split data
    split_ratio = 0.15
    X_train, X_val, (indices_train, indices_val) = split_input(match_features, 1. - split_ratio, random=True,
                                                               return_indices=True)
    Y_train, Y_val = match_labels.iloc[indices_train], match_labels.iloc[indices_val]
    bkm_quotes_train, bkm_quotes_val = bkm_quotes.iloc[indices_train], bkm_quotes.iloc[indices_val]
    display_shapes(X_train, X_val, Y_train, Y_val)

    epochs = 200
    convolution_model = False

    # define and configure model
    if convolution_model:
        add_tag = "conv"
        n_activations = 16  # previously tried: 64
        n_conv_filter = 4
        activation_fct = "relu"  # previously tried: "sigmoid"
        dropout = 0.45
        l2_reg = 0.07  # previously tried: 0.003
        model = prepare_simple_nn_model_conv(X_train.shape[1:], n_activations=n_activations,
                                             n_conv_filter=n_conv_filter, activation_fct=activation_fct,
                                             base_dropout=dropout, l2_regularization_factor=l2_reg)
    else:
        add_tag = "simple"
        n_activations = 50  # relu (128 was used with sigmoid)
        activation_fct = "relu"  # previously tried: "sigmoid"
        dropout = 0.45
        l2_reg = 0.05  # relu (0.002 was used with sigmoid)
        model = prepare_simple_nn_model(X_train.shape[1:], n_activations=n_activations,
                                        activation_fct=activation_fct, base_dropout=dropout,
                                        l2_regularization_factor=l2_reg)

    # creates a model label containing most of its params (used to load / save it)
    shape_tag = ''.join(str(e) + '_' for e in X_train.shape[1:] if e > 1)
    model_label = add_tag + '_model_' + str(n_activations) + '_' + activation_fct + '_d' + str(dropout) + \
        '_reg' + str(l2_reg) + '_shape_' + shape_tag + 'e' + str(epochs)
    # dots are stripped from the parameter part before the extension is appended
    model_label = model_label.replace('.', '')
    model_label += '.h5py'

    # A decreasing learning rate helps to converge: start with a high LR for speed and
    # multiply it by 0.6 whenever 'val_loss' has stopped improving for 60 epochs, never
    # going below 1e-4.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=60, verbose=1, factor=0.6,
                                                min_lr=0.0001)

    # Define the optimizer
    # optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0)
    batch_size = 256

    # stays None when a saved model is reloaded: there is then nothing to plot
    history = None
    try:
        model = load_model(MODEL_PATH + model_label)
    except OSError:
        # no usable saved model: compile and train the freshly built one
        model.compile(optimizer=optimizer, loss="categorical_crossentropy")
        if VERBOSE:
            model.summary()
        history = model.fit(x=X_train, y=Y_train, epochs=epochs, batch_size=batch_size,
                            validation_data=(X_val, Y_val), verbose=2, callbacks=[learning_rate_reduction])
        if SAVE_MODEL:
            model.save(MODEL_PATH + model_label)

    if history is not None:
        # Plot the loss curves for training and validation (first 5 epochs skipped)
        _, ax = plt.subplots(2, 1)
        ax[0].plot(history.history['loss'][5:], color='b', label="Training loss")
        ax[0].plot(history.history['val_loss'][5:], color='r', label="validation loss", axes=ax[0])
        ax[0].legend(loc='best', shadow=True)
        if DISPLAY_GRAPH:
            plt.show()

    # model predictions
    predictions_val = model.predict(X_val)  # to get percentages
    predictions_train = model.predict(X_train)  # to get percentages

    if VERBOSE:
        print("\n --- TRAIN ANALYSIS --- ")
        analyze_predictions(Y_train, predictions_train, bkm_quotes_train, nb_max_matchs_displayed=0)
        print("\n --- VAL ANALYSIS --- ")
        analyze_predictions(Y_val, predictions_val, bkm_quotes_val, nb_max_matchs_displayed=0)

    # on the below, reduce universe to matches with quotes
    remove_nan_mask_val = [not contain_nan(bkm_quotes_val.iloc[i]) for i in range(bkm_quotes_val.shape[0])]
    bkm_quotes_val_r = bkm_quotes_val.iloc[remove_nan_mask_val]
    Y_val_r = Y_val.iloc[remove_nan_mask_val]
    predictions_val_r = predictions_val[remove_nan_mask_val]

    invest_strategies = [
        ConstantAmountInvestStrategy(1.),  # invest 1 in each match (if expected return > 1% actually)
        ConstantStdDevInvestStrategy(0.01),  # stdDev of each bet is 1% of wealth
        KellyInvestStrategy(),  # Kelly's ratio investment to maximize wealth long term return
        ConstantPercentInvestStrategy(0.01),  # invest 1% of money each time
    ]
    for invest_stgy in invest_strategies:
        print("\n#### results for ", invest_stgy.__class__.__name__, "####")
        init_wealth = 100
        df_recap_stgy = invest_stgy.apply_invest_strategy(predictions_val_r, bkm_quotes_val_r, Y_val_r,
                                                          init_wealth=init_wealth)
        print(df_recap_stgy[['invested_amounts', 'exp_gain_amounts', 'gain_amounts']].sum())
        print('wealth: from', init_wealth, 'to', round(df_recap_stgy['wealth'].iloc[-1], 4))
def stacking_predictions():
    """Fit a small stacking network on top of stored model predictions.

    Reads the neural-network predictions, the Dixon-Coles predictions, the
    bookmaker quotes and the actual results from CSV files, keeps only the
    matches whose bookmaker probabilities contain no NaN, randomly splits
    them 80/20, trains ``simple_stacking_nn_model`` on the concatenation of
    the three probability sources and prints the log-loss of the stacked
    model next to the log-loss of each individual source.
    """
    np.random.seed(2)

    predictions_dir = "D:/Football_betting/predictions/"
    nn_pred = np.genfromtxt(predictions_dir + 'conv_nn_predictions.csv', delimiter=',')
    dixon_pred = np.genfromtxt(predictions_dir + 'dixon_coles_predictions.csv', delimiter=',')
    bkm_quotes = pd.read_csv(predictions_dir + 'bookmaker_quotes.csv', header=0)
    result_labels = pd.read_csv(predictions_dir + 'actual_results.csv', header=0)

    bkm_probas = bkm_quote_to_probas(bkm_quotes)

    # restrict the universe to matches with complete quotes
    nb_matches = bkm_probas.shape[0]
    has_quotes = [not contain_nan(bkm_probas[row_idx]) for row_idx in range(nb_matches)]
    bkm_probas = bkm_probas[has_quotes]
    nn_pred = nn_pred[has_quotes]
    dixon_pred = dixon_pred[has_quotes]
    result_labels = result_labels.iloc[has_quotes]

    # one random split drives every source so that rows stay aligned
    y_train, y_val, (indices_train, indices_val) = split_input(result_labels, split_ratio=0.8, random=True,
                                                               return_indices=True)
    bkm_probas_train = bkm_probas[indices_train]
    bkm_probas_val = bkm_probas[indices_val]
    nn_pred_train = nn_pred[indices_train]
    nn_pred_val = nn_pred[indices_val]
    dixon_pred_train = dixon_pred[indices_train]
    dixon_pred_val = dixon_pred[indices_val]

    # stacked inputs: bookmaker, neural network and Dixon-Coles columns side by side
    x_train = np.concatenate((bkm_probas_train, nn_pred_train, dixon_pred_train), axis=1)
    x_val = np.concatenate((bkm_probas_val, nn_pred_val, dixon_pred_val), axis=1)

    print('inputs shapes')
    for tag, values in (('x_train', x_train), ('x_val', x_val), ('y_train', y_train), ('y_val', y_val)):
        print(tag, values.shape)

    n_activations = 20
    model = simple_stacking_nn_model(n_activations, x_train.shape[1:], l2_regularization_factor=0.00005,
                                     dropout_factor=0.3)

    # Define the optimizer
    optimizer = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0)
    model.compile(optimizer=optimizer, loss="categorical_crossentropy")

    # learning-rate schedule: monitors 'val_loss' with patience 60, factor 0.6, floor 1e-4
    lr_schedule = ReduceLROnPlateau(monitor='val_loss', patience=60, verbose=1, factor=0.6, min_lr=0.0001)

    epochs = 800
    batch_size = 512  # all ?
    model.summary()
    history = model.fit(x=x_train, y=y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(x_val, y_val), verbose=2, callbacks=[lr_schedule])

    # Plot the loss curves for training and validation (first 5 epochs skipped)
    _, axes_grid = plt.subplots(2, 1)
    loss_axis = axes_grid[0]
    loss_axis.plot(history.history['loss'][5:], color='b', label="Training loss")
    loss_axis.plot(history.history['val_loss'][5:], color='r', label="validation loss", axes=loss_axis)
    loss_axis.legend(loc='best', shadow=True)
    if DISPLAY_GRAPH:
        plt.show()

    # model predictions
    predictions_val = model.predict(x_val)  # to get percentages
    predictions_train = model.predict(x_train)  # to get percentages

    # stacked model and bookmaker first, then the two individual models
    for tag, targets, probas in (('predictions_train ; ', y_train, predictions_train),
                                 ('predictions_val ; ', y_val, predictions_val),
                                 ('bkm_train ; ', y_train, bkm_probas_train),
                                 ('bkm_val ; ', y_val, bkm_probas_val)):
        print(tag, log_loss(targets, probas))
    print('--')
    for tag, targets, probas in (('nn_train ; ', y_train, nn_pred_train),
                                 ('nn_val ; ', y_val, nn_pred_val),
                                 ('dx_train ; ', y_train, dixon_pred_train),
                                 ('dx_val ; ', y_val, dixon_pred_val)):
        print(tag, log_loss(targets, probas))
def test_train_classifier():
    """Grid-search, calibrate and score a random-forest classifier on generated data.

    Data come from ``full_data_creation`` (20 teams, 20 seasons, labels as
    indices). The training part is split again 70/30 into a training and a
    calibration set, then ``train_classifier`` and ``train_calibrate_predict``
    are run on a PCA + random-forest pipeline and their results are printed.

    Notes on the search configuration:
      * the PCA ``n_components`` grid is built with an integer step of at
        least 1, so it only contains integers and a small feature count
        cannot produce a zero step;
      * the logistic-regression ``C`` grid spans 1 .. 1000 (exponents 0 to 3);
      * the scorer is the log-loss computed on predicted probabilities and
        declared as a loss (``greater_is_better=False``), so model selection
        minimises it; scores it reports are therefore negated log-losses.
    """
    nb_teams = 20
    nb_seasons = 20

    # load everything
    data = full_data_creation(nb_teams, nb_seasons, dynamic_tag="dynamic", nb_seasons_val=2,
                              fable_observed_seasons=1, bkm_noise=0.03, label_format="indices",
                              horizontal_fable_features=True)

    # split data: only features and labels are needed here, the actual
    # probabilities and bookmaker quotes that follow in the tuple are ignored
    X_train, X_val, Y_train, Y_val = data[:4]
    X_train, X_calib, (indices_train, indices_calib) = split_input(X_train, split_ratio=0.7, random=True,
                                                                   return_indices=True)
    # Y_calib must be taken before Y_train is overwritten by its own subset
    Y_calib = Y_train.iloc[indices_calib]
    Y_train = Y_train.iloc[indices_train]

    # alternative classifier, see the commented train_classifier call below
    LOG_clf = linear_model.LogisticRegression(multi_class="ovr", solver="sag", class_weight='balanced')

    dm_reduction = PCA()
    RF_clf = RandomForestClassifier(n_estimators=200, random_state=1, class_weight='balanced')

    # Creating cross validation data splits
    cv_sets = model_selection.StratifiedShuffleSplit(n_splits=5, test_size=0.20, random_state=5)

    n_features = X_train.shape[1]
    # integer step (>= 1): a float step would yield float n_components values
    n_components_step = max(1, int(np.around(n_features / 5)))
    n_components_grid = np.arange(5, n_features, n_components_step)

    parameters_RF = {
        'clf__max_features': ['auto', 'log2'],
        'dm_reduce__n_components': n_components_grid
    }
    parameters_LOG = {
        # logspace takes exponents: 10**0 .. 10**3
        'clf__C': np.logspace(0, 3, 5),
        'dm_reduce__n_components': n_components_grid
    }

    # scorer = make_scorer(accuracy_score)
    scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

    # computations core to use
    jobs = 1

    # best_pipe = train_classifier(LOG_clf, dm_reduction, X_train, Y_train, cv_sets, parameters_LOG, scorer, jobs,
    #                              use_grid_search=True, best_components=None, best_params=None)
    best_pipe = train_classifier(RF_clf, dm_reduction, X_train, Y_train, cv_sets, parameters_RF, scorer, jobs,
                                 use_grid_search=True, best_components=None, best_params=None)
    print(best_pipe)

    clf, dm_reduce, train_score, test_score = train_calibrate_predict(
        RF_clf, dm_reduction, X_train, Y_train, X_calib, Y_calib, X_val, Y_val, cv_sets, parameters_RF, scorer,
        jobs, use_grid_search=True)
    print(clf)
    print(dm_reduce)
    print(train_score)
    print(test_score)
def test_stacking_model():
    """Check the stacking network on stored perfect / noisy / wrong predictions.

    Loads the labels and three prediction sets from CSV files under ``PATH``,
    prints their log-losses, trains ``simple_stacking_nn_model`` on the
    concatenated noisy and wrong predictions (random 80/20 split) and finally
    prints the log-loss of the stacked model next to the reference ones.
    """
    # load data
    y_hot_vectors = pd.read_csv(PATH + "labels.csv")
    perfect_preds = pd.read_csv(PATH + "perfect_pred.csv")
    noisy_preds = pd.read_csv(PATH + "noisy_pred.csv")
    wrong_preds = pd.read_csv(PATH + "wrong_pred.csv")

    def report_baselines():
        # log-loss of each stored prediction set over the full data
        print('perfect; ', log_loss(y_hot_vectors, perfect_preds))
        print('noisy ; ', log_loss(y_hot_vectors, noisy_preds))
        print('wrong ; ', log_loss(y_hot_vectors, wrong_preds))

    report_baselines()

    np.random.seed(2)
    # the split is drawn once on the perfect predictions and its indices reused everywhere
    _, perfect_val, (train_idx, val_idx) = split_input(perfect_preds, split_ratio=0.8, random=True,
                                                       return_indices=True)
    y_train = y_hot_vectors.iloc[train_idx]
    y_val = y_hot_vectors.iloc[val_idx]
    noisy_train = noisy_preds.iloc[train_idx]
    noisy_val = noisy_preds.iloc[val_idx]
    wrong_train = wrong_preds.iloc[train_idx]
    wrong_val = wrong_preds.iloc[val_idx]

    # stacked inputs: noisy and wrong predictions side by side
    x_train = np.concatenate((noisy_train, wrong_train), axis=1)
    x_val = np.concatenate((noisy_val, wrong_val), axis=1)

    n_activations = 20
    model = simple_stacking_nn_model(n_activations, x_train.shape[1:], l2_regularization_factor=0.00005,
                                     dropout_factor=0.3)

    # Define the optimizer
    optimizer = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0)
    model.compile(optimizer=optimizer, loss="categorical_crossentropy")

    # learning-rate schedule: monitors 'val_loss' with patience 60, factor 0.6, floor 1e-4
    lr_schedule = ReduceLROnPlateau(monitor='val_loss', patience=60, verbose=1, factor=0.6, min_lr=0.0001)

    epochs = 800
    batch_size = 512  # all ?
    model.summary()
    history = model.fit(x=x_train, y=y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(x_val, y_val), verbose=2, callbacks=[lr_schedule])

    # Plot the loss curves for training and validation (first 5 epochs skipped)
    _, axes_grid = plt.subplots(2, 1)
    loss_axis = axes_grid[0]
    loss_axis.plot(history.history['loss'][5:], color='b', label="Training loss")
    loss_axis.plot(history.history['val_loss'][5:], color='r', label="validation loss", axes=loss_axis)
    loss_axis.legend(loc='best', shadow=True)
    if DISPLAY_GRAPH:
        plt.show()

    # model predictions
    predictions_val = model.predict(x_val)  # to get percentages
    predictions_train = model.predict(x_train)  # to get percentages

    report_baselines()
    print('predictions_train ; ', log_loss(y_train, predictions_train))
    print('predictions_val ; ', log_loss(y_val, predictions_val))
    print('perfect_val ; ', log_loss(y_val, perfect_val))
def full_data_creation(nb_teams, nb_seasons, dynamic_tag="dynamic", nb_seasons_val=2, fable_observed_seasons=1,
                       bkm_noise=0.03, bkm_fees=0.05, nb_fixed_seasons=0, fable='match_hist',
                       label_format="hot_vectors", horizontal_fable_features=False, verbose=1,
                       data_path=DATA_PATH):
    """Load or generate match results and build train / validation sets.

    Match results and their actual probabilities are read from two CSV files
    under ``data_path`` whose names are derived from ``nb_teams``,
    ``nb_seasons`` and ``dynamic_tag``. When they are missing, the data are
    generated with the dynamic or stationary Poisson generator (called with
    ``export=True``). Noisy bookmaker quotes, a ``date`` column, the fables
    (features) and the labels are then derived, and everything is split
    chronologically (``random=False``): the last part is the validation set
    and the first part of the remaining data is dropped from the training set
    because it has no fable.

    Side effect: the numpy random generator is re-seeded with 0.

    :param nb_teams: number of teams.
    :param nb_seasons: total number of seasons.
    :param dynamic_tag: "dynamic" or "stationary" when data must be generated;
        also part of the CSV file names.
    :param nb_seasons_val: number of seasons used for the validation split ratio.
    :param fable_observed_seasons: number of seasons observed to build a fable.
    :param bkm_noise: ``std_dev`` given to ``create_noisy_bookmaker_quotes``.
    :param bkm_fees: ``fees`` given to ``create_noisy_bookmaker_quotes``.
    :param nb_fixed_seasons: forwarded to the dynamic generator.
    :param fable: "match_hist" or "stats", selects the fable builder.
    :param label_format: "hot_vectors", "indices" or "labels".
    :param horizontal_fable_features: forwarded to ``simple_fable``.
    :param verbose: when truthy, print progress, shapes and reference log-losses.
    :param data_path: prefix of the CSV files to load.
    :return: tuple ``(X_train, X_val, Y_train, Y_val, actual_probas_train,
        actual_probas_val, bkm_quotes_train, bkm_quotes_val)``.
    :raises AssertionError: on inconsistent season counts or unknown
        ``label_format`` / ``fable``.
    :raises ValueError: when data must be generated and ``dynamic_tag`` is
        neither "dynamic" nor "stationary".
    """
    # Check inputs
    assert nb_seasons_val + fable_observed_seasons < nb_seasons, \
        "nb_seasons_val + fable_observed_seasons must be lower than nb_seasons"
    assert label_format in ("hot_vectors", "indices", "labels"), "unknown label_format"
    assert fable in ("match_hist", "stats"), "unknown fable"

    params_str = 't' + str(nb_teams) + '_s' + str(nb_seasons) + '_'
    np.random.seed(0)
    try:
        match_results = pd.read_csv(data_path + params_str + dynamic_tag + "_poisson_results.csv")
        actual_probas = pd.read_csv(data_path + params_str + dynamic_tag + "_poisson_results_probabilities.csv")
        print(" ... data files have been loaded ...")
    except FileNotFoundError:
        print("no data files found: ... creating data ...")
        if dynamic_tag == "dynamic":
            match_results, actual_probas, _ = create_dynamic_poisson_match_results(
                nb_teams, nb_seasons, nb_fixed_seasons=nb_fixed_seasons, export=True)
        elif dynamic_tag == "stationary":
            match_results, actual_probas, _ = create_stationary_poisson_match_results(
                nb_teams, nb_seasons, export=True)
        else:
            # fail with a clear message instead of an unbound-variable error further down
            raise ValueError("cannot create data for dynamic_tag=%r: expected 'dynamic' or 'stationary'"
                             % (dynamic_tag,))

    bkm_quotes = create_noisy_bookmaker_quotes(actual_probas, std_dev=bkm_noise, fees=bkm_fees)
    match_results['date'] = create_time_feature_from_season_and_stage(match_results, base=100)

    if verbose:
        print(" ... creating fables ...")
    # every team meets the (nb_teams - 1) others twice per observed season
    nb_observed_match = (nb_teams - 1) * fable_observed_seasons * 2
    if fable == "match_hist":
        match_fables = simple_fable(match_results, nb_observed_match=nb_observed_match,
                                    horizontal_features=horizontal_fable_features)
    elif fable == "stats":
        match_fables = simple_stats_fable(match_results, nb_observed_match=nb_observed_match)

    if label_format == "hot_vectors":
        match_labels = match_outcomes_hot_vectors(match_results)
    elif label_format == "indices":
        match_labels = match_outcomes_indices(match_results)
    elif label_format == "labels":
        match_labels = match_results.apply(get_match_label, axis=1)

    # Split the train and the validation set for the fitting
    split_ratio_1 = 1. - nb_seasons_val / nb_seasons
    X_train, X_val, (indices90, indices10) = split_input(match_fables, split_ratio=split_ratio_1, random=False,
                                                         return_indices=True)

    # eliminate first season (no fable)
    split_ratio_2 = fable_observed_seasons / (nb_seasons - nb_seasons_val)
    _, X_train, (_, remaining_train_indices) = split_input(X_train, split_ratio=split_ratio_2, random=False,
                                                           return_indices=True)

    # the same two-step selection is applied to labels, quotes and probabilities
    Y_train = match_labels.iloc[indices90].iloc[remaining_train_indices]
    Y_val = match_labels.iloc[indices10]
    bkm_quotes_train = bkm_quotes.iloc[indices90].iloc[remaining_train_indices]
    bkm_quotes_val = bkm_quotes.iloc[indices10]
    if verbose:
        display_shapes(X_train, X_val, Y_train, Y_val)

    # get actual probabilities of issues for the train and validation sets of matches
    actual_probas_train = actual_probas.iloc[indices90].iloc[remaining_train_indices]
    actual_probas_val = actual_probas.iloc[indices10]
    if verbose:
        print("best possible honest score on train set:", log_loss(Y_train, actual_probas_train))
        print("best possible honest score on validation set:", log_loss(Y_val, actual_probas_val))

    return X_train, X_val, Y_train, Y_val, actual_probas_train, actual_probas_val, bkm_quotes_train, bkm_quotes_val