def optimize_parameters(self,
                        matches,
                        current_time,
                        init_params=None,
                        padding=True,
                        verbose=1,
                        home_team_key='home_team_id',
                        away_team_key='away_team_id',
                        home_goals_key='home_goals',
                        away_goals_key='away_goals',
                        time_key='date',
                        control_dates=True):
    """Calibrate the model parameters on ``matches``.

    Minimises ``-self._likelihood(...)`` (gradient ``-self._likelihood_jac(...)``)
    with ``scipy.optimize.minimize``: ``Newton-CG`` first, then ``TNC`` if the
    first attempt does not report success.

    Args:
        matches: DataFrame of matches used for the calibration.
        current_time: reference time forwarded to the likelihood; when
            ``control_dates`` is true every match must be strictly older.
        init_params: starting point of the optimisation. Defaults to a vector
            of ones of length ``self.nb_params``. Any array-like is accepted
            and flattened to one dimension.
        padding: forwarded unchanged to the likelihood functions.
        verbose: verbosity level handed to ``printv``.
        home_team_key, away_team_key, home_goals_key, away_goals_key, time_key:
            column names, forwarded to the likelihood functions.
        control_dates: when true, assert that ``matches`` only holds past data.

    Returns:
        The optimised parameter vector (``res.x``), or ``None`` when both
        optimisation attempts fail.

    Raises:
        AssertionError: if ``control_dates`` is true and the latest match date
            is not strictly before ``current_time``.
    """
    if control_dates:
        # control we calibrate params on past data
        assert matches[time_key].max() < current_time, \
            "calibration matches must be strictly older than current_time"

    # init params: always hand a 1-D vector to scipy (recent versions reject
    # an (n, 1) column as starting point)
    if init_params is None:
        init_params = np.ones(self.nb_params)
    else:
        init_params = np.asarray(init_params, dtype=float).ravel()
    bounds = ((0.2, 5), ) * self.nb_params

    nb_teams = self.nb_teams

    # define local functions involved in optimization
    def constraint_fct(params):
        # avg of alphas and betas must be one
        alpha_gap = np.sum(params[0:nb_teams]) - nb_teams
        beta_gap = np.sum(params[nb_teams:2 * nb_teams]) - nb_teams
        return alpha_gap ** 2 + beta_gap ** 2

    def constraint_fct_der(params):
        # d/dp_i (sum(p) - n)^2 = 2 * (sum(p) - n): the same value for every
        # parameter of a group (it does not depend on p_i itself)
        jac = np.zeros_like(params, dtype=float)
        jac[0:nb_teams] = 2. * (np.sum(params[0:nb_teams]) - nb_teams)
        jac[nb_teams:2 * nb_teams] = 2. * (
            np.sum(params[nb_teams:2 * nb_teams]) - nb_teams)
        return jac

    likelihood_kwargs = dict(padding=padding,
                             home_team_key=home_team_key,
                             away_team_key=away_team_key,
                             home_goals_key=home_goals_key,
                             away_goals_key=away_goals_key,
                             time_key=time_key)

    def likelihood_m(params):
        return -self._likelihood(matches, params, current_time,
                                 **likelihood_kwargs)

    def likelihood_jac_m(params):
        return -self._likelihood_jac(matches, params, current_time,
                                     **likelihood_kwargs)

    constraints = ({
        'type': 'eq',
        'fun': constraint_fct,
        'jac': constraint_fct_der
    }, )

    def run_minimizer(method):
        # NOTE(review): according to the scipy.optimize.minimize documentation
        # Newton-CG handles neither bounds nor constraints and TNC handles
        # bounds only, so the equality constraint is presumably not enforced
        # by either method. The arguments are passed as before; switching to a
        # constraint-aware method (e.g. SLSQP) would change the calibration.
        # 'xtol': 10e-3 is 0.01.
        return minimize(likelihood_m,
                        init_params,
                        jac=likelihood_jac_m,
                        method=method,
                        bounds=bounds,
                        options={
                            'xtol': 10e-3,
                            'disp': False
                        },
                        constraints=constraints)

    # other ok methods; TNC or L-BFGS-B
    res = run_minimizer('Newton-CG')
    if not res.success:
        printv(
            1, verbose,
            " fail to calibrate parameters with method Newton-CG. trying another method (TNC)"
        )
        res = run_minimizer('TNC')

    if not res.success:
        print('\033[91m' + "fail to calibrate parameters on date " +
              str(current_time) + '\033[0m')
        return None
    return res.x
def fit_and_predict(self,
                    matches_to_predict,
                    full_matches_history,
                    nb_obs_years=3,
                    padding=True,
                    verbose=1,
                    home_team_key='home_team_id',
                    away_team_key='away_team_id',
                    home_goals_key='home_goals',
                    away_goals_key='away_goals',
                    time_key='date',
                    season_key='season',
                    stage_key='stage'):
    """Calibrate parameters over time and predict every match outcome.

    Matches are walked in (season, stage, time) order. Parameters are
    calibrated once for the first match and again each time the season or the
    stage changes, always on the history window
    ``[t - nb_obs_years years, t)`` where ``t`` is the time of the match that
    triggered the calibration. Each match is then predicted with the latest
    calibration whose time is not after the match time.

    Args:
        matches_to_predict: DataFrame of matches to predict (must not be
            empty: the first row is used for the initial calibration).
        full_matches_history: DataFrame of matches available for calibration.
        nb_obs_years: length, in years, of the calibration window.
        padding: forwarded to ``self.optimize_parameters``.
        verbose: verbosity level handed to ``printv``.
        home_team_key, away_team_key, home_goals_key, away_goals_key,
        time_key, season_key, stage_key: column names.

    Returns:
        A tuple ``(predictions, sorted_params_history)``: ``predictions`` is
        an array with one row per match of ``matches_to_predict`` (input
        order) and the columns W, D, L; ``sorted_params_history`` is the list
        of ``[calibration_time, params]`` pairs sorted by time.
    """
    start_time = time()

    def calibrate(calibration_time, init_params=None):
        # keep history inside [calibration_time - nb_obs_years, calibration_time)
        min_hist_time = calibration_time - relativedelta(years=nb_obs_years)
        history_times = full_matches_history[time_key]
        in_window = (min_hist_time <= history_times) & (
            history_times < calibration_time)
        return self.optimize_parameters(full_matches_history[in_window],
                                        calibration_time,
                                        init_params=init_params,
                                        padding=padding,
                                        verbose=verbose,
                                        home_team_key=home_team_key,
                                        away_team_key=away_team_key,
                                        home_goals_key=home_goals_key,
                                        away_goals_key=away_goals_key,
                                        time_key=time_key)

    sorted_matches = matches_to_predict.sort_values(
        by=[season_key, stage_key, time_key])

    # first parameters calibration
    pred_season = sorted_matches[season_key].iloc[0]
    pred_stage = sorted_matches[stage_key].iloc[0]
    pred_time = sorted_matches[time_key].iloc[0]
    printv(2, verbose, "current calibration; season", pred_season, " day",
           pred_stage, " time", pred_time)
    opti_params = calibrate(pred_time)
    if verbose >= 3 and opti_params is not None:
        self.print_params(opti_params)
    # NOTE(review): the first calibration is recorded even when it failed
    # (None), as before; matches relying on it are then predicted with None.
    params_history = [
        [pred_time, opti_params],
    ]

    for _, match in sorted_matches.iterrows():
        cur_season = match[season_key]
        cur_stage = match[stage_key]
        cur_time = match[time_key]
        if cur_season != pred_season or cur_stage != pred_stage:
            printv(2, verbose, "current calibration; season", cur_season,
                   " day", cur_stage, " time", cur_time)
            pred_season, pred_stage, pred_time = cur_season, cur_stage, cur_time
            # we start optimization by last most relevant params
            opti_params = calibrate(pred_time, init_params=opti_params)
            if opti_params is not None:
                params_history.append([pred_time, opti_params])
                if verbose >= 3:
                    self.print_params(opti_params)

    # make prediction by finding adapted param and use it to predict outcome
    sorted_params_history = sorted(params_history, key=lambda x: x[0])
    # rows are gathered in a list: DataFrame.append no longer exists in
    # pandas >= 2.0 and copied the whole frame at every call
    outcome_rows = []
    for _, match in matches_to_predict.iterrows():
        # find adapted params for given match: the latest calibration that is
        # not after the match (the earliest one if none qualifies)
        match_t = match[time_key]
        params_t = sorted_params_history[0][1]
        for next_t, params_next_t in sorted_params_history:
            if next_t > match_t:
                break
            params_t = params_next_t

        # predictions using params
        p_w, p_d, p_l = self.predict_match_outcome(match[home_team_key],
                                                   match[away_team_key],
                                                   params_t)
        outcome_rows.append((p_w, p_d, p_l))
        printv(3, verbose, "prediction;", match[home_team_key],
               match[away_team_key], " --> ", round(p_w, 4), round(p_d, 4),
               round(p_l, 4))

    df_predictions = pd.DataFrame(outcome_rows, columns=['W', 'D', 'L'])

    end_time = time()
    printv(1, verbose, " ... fit_and_predict computations performed in",
           round(end_time - start_time, 2), "seconds ...")
    return df_predictions.values, sorted_params_history
def dixon_coles_predictions(matches_to_predict,
                            full_match_history,
                            nb_obs_years=3,
                            dixon_coles_params=None,
                            verbose=1,
                            intermediary_analysis=True,
                            home_team_key='home_team_id',
                            away_team_key='away_team_id',
                            home_goals_key='home_team_goal',
                            away_goals_key='away_team_goal',
                            time_key='date',
                            season_key='season',
                            stage_key='stage',
                            bkm_home_win_key='B365H',
                            bkm_draw_key='B365D',
                            bkm_away_win_key='B365A'):
    """Fit one ``DixonColes`` model per country and predict all matches.

    For every distinct value of the ``league_country`` column, a model is
    built on the teams found in that country's history, fitted and used to
    predict that country's matches. Predictions are then put back in the row
    order of ``matches_to_predict`` and analysed globally with
    ``analyze_predictions``.

    Args:
        matches_to_predict: DataFrame of matches to predict; needs a
            ``league_country`` column and the bookmaker quote columns. It is
            not modified (a copy carries the temporary ordering column).
        full_match_history: DataFrame of past matches, with a
            ``league_country`` column.
        nb_obs_years: forwarded to ``fit_and_predict``.
        dixon_coles_params: keyword arguments for the ``DixonColes``
            constructor (defaults to none).
        verbose: verbosity level handed to ``printv`` and callees.
        intermediary_analysis: when true and several countries are present,
            also run ``analyze_predictions`` per country.
        home_team_key, away_team_key, home_goals_key, away_goals_key,
        time_key, season_key, stage_key: column names.
        bkm_home_win_key, bkm_draw_key, bkm_away_win_key: bookmaker quote
            column names.

    Returns:
        Array of predictions, one row per match, in input order.
    """
    # default model params
    if dixon_coles_params is None:
        dixon_coles_params = dict()

    # create an index to be able to return predictions in the order of the
    # input (not the order it s been computed); assign() works on a copy so
    # the caller's DataFrame does not receive the temporary column
    matches_to_predict = matches_to_predict.assign(
        tmp_index=range(len(matches_to_predict)))

    countries = list(matches_to_predict['league_country'].unique())
    indexed_predictions = []
    for country in countries:
        printv(1, verbose, "\n #### WORKING WITH DATA FROM", country, " #### ")
        match_data = matches_to_predict[
            matches_to_predict['league_country'].isin([
                country,
            ])]
        match_history = full_match_history[
            full_match_history['league_country'].isin([
                country,
            ])]

        # on the below: define our team universe (teams we calibrate parameters on)
        team_universe = set(match_history[home_team_key].unique()) | set(
            match_history[away_team_key].unique())
        printv(1, verbose, ' ...', len(team_universe), ' teams involved:',
               *team_universe, '...')
        printv(1, verbose, ' ...', match_data.shape[0],
               'matches to predict ...')

        model = DixonColes(team_universe, **dixon_coles_params)
        printv(
            1, verbose,
            " ... fit dixon coles parameters and predict match outcomes ... ")
        predictions, _ = model.fit_and_predict(match_data,
                                               match_history,
                                               nb_obs_years=nb_obs_years,
                                               verbose=verbose,
                                               home_team_key=home_team_key,
                                               away_team_key=away_team_key,
                                               home_goals_key=home_goals_key,
                                               away_goals_key=away_goals_key,
                                               time_key=time_key,
                                               season_key=season_key,
                                               stage_key=stage_key)
        printv(1, verbose, " ... match outcomes predicted ... ")

        # display or not intermediary predictions quality
        if len(countries) > 1 and intermediary_analysis:
            match_outcomes = match_outcomes_hot_vectors(match_data)
            bkm_quotes = pd.DataFrame()
            bkm_quotes['W'] = match_data[bkm_home_win_key]
            bkm_quotes['D'] = match_data[bkm_draw_key]
            bkm_quotes['L'] = match_data[bkm_away_win_key]
            # called for what it reports through printv; result not needed
            analyze_predictions(match_outcomes,
                                predictions,
                                bkm_quotes,
                                nb_max_matchs_displayed=40,
                                fully_labelled_matches=match_data,
                                verbose=verbose,
                                home_team_key=home_team_key,
                                away_team_key=away_team_key,
                                home_goals_key=home_goals_key,
                                away_goals_key=away_goals_key)

        # add predictions to those already made, prefixed by their input rank
        indexed_predictions.append(
            np.concatenate((match_data['tmp_index'].values.reshape(
                (-1, 1)), predictions),
                           axis=1))

    if not indexed_predictions:
        # no country, hence no match to predict and nothing to analyse.
        # NOTE(review): 3 columns assumes (W, D, L) predictions - confirm.
        return np.empty((0, 3))

    # exctract all predictions, resort them by their index, and remove the index
    all_predictions = np.concatenate(indexed_predictions, axis=0)
    all_predictions = all_predictions[all_predictions[:, 0].argsort()][:, 1:]

    # perform a global analysis
    all_match_outcomes = match_outcomes_hot_vectors(matches_to_predict)
    all_bkm_quotes = pd.DataFrame()
    all_bkm_quotes['W'] = matches_to_predict[bkm_home_win_key]
    all_bkm_quotes['D'] = matches_to_predict[bkm_draw_key]
    all_bkm_quotes['L'] = matches_to_predict[bkm_away_win_key]
    analyze_predictions(all_match_outcomes,
                        all_predictions,
                        all_bkm_quotes,
                        nb_max_matchs_displayed=40,
                        fully_labelled_matches=matches_to_predict,
                        verbose=verbose,
                        home_team_key=home_team_key,
                        away_team_key=away_team_key,
                        home_goals_key=home_goals_key,
                        away_goals_key=away_goals_key)

    print("final_pred shape", all_predictions.shape)
    return all_predictions
def analyze_predictions(y,
                        pred,
                        bkm_quotes,
                        nb_max_matchs_displayed=10,
                        compare_to_dummy_pred=True,
                        fully_labelled_matches=None,
                        verbose=1,
                        home_team_key='home_team_id',
                        away_team_key='away_team_id',
                        home_goals_key='home_team_goal',
                        away_goals_key='away_team_goal'):
    """Report log-loss and rank-probability scores of model vs bookmaker.

    Bookmaker quotes are converted to probabilities with
    ``bkm_quote_to_probas``. Scores are computed for the model on all matches,
    and for both model and bookmaker on the matches whose bookmaker
    probabilities contain no NaN. Everything is reported through ``printv``.

    Args:
        y: actual outcomes (indexed with ``.iloc``), one row per match.
        pred: model predictions, one row per match.
        bkm_quotes: bookmaker quotes (indexed with ``.iloc``), one row per match.
        nb_max_matchs_displayed: maximum number of example matches printed;
            a falsy value disables the examples.
        compare_to_dummy_pred: when true, also print the log loss of a
            constant 1/3 prediction.
        fully_labelled_matches: optional DataFrame used to print team and
            goal columns next to each example.
        verbose: verbosity level handed to ``printv``.
        home_team_key, away_team_key, home_goals_key, away_goals_key:
            column names read from ``fully_labelled_matches``.

    Returns:
        ``(model_log_loss, model_rps, [[model_log_loss_on_quoted,
        bkm_log_loss], [model_rps_on_quoted, bkm_rps]])``.
    """
    nb_matches = y.shape[0]
    assert (nb_matches == pred.shape[0] == bkm_quotes.shape[0])
    has_labels = fully_labelled_matches is not None
    if has_labels:
        assert (nb_matches == fully_labelled_matches.shape[0])

    bkm_probas = bkm_quote_to_probas(bkm_quotes)

    # a few examples, shown side by side with the bookmaker view
    if nb_max_matchs_displayed:
        printv(2, verbose, "--- on the below, few prediction examples")
        nb_shown = min(nb_matches, nb_max_matchs_displayed)
        for row in range(nb_shown):
            if has_labels:
                home_team = fully_labelled_matches[home_team_key].iloc[row]
                home_goals = fully_labelled_matches[home_goals_key].iloc[row]
                away_goals = fully_labelled_matches[away_goals_key].iloc[row]
                away_team = fully_labelled_matches[away_team_key].iloc[row]
                printv(2, verbose, home_team, ' ', home_goals, away_goals,
                       ' ', away_team)
            printv(2, verbose, '\nmodel predictions :', pred[row])
            printv(2, verbose, 'bkm probas:', list(bkm_probas[row]))
            printv(2, verbose, 'bkm quote:', list(bkm_quotes.iloc[row]))

    # log loss analysis
    model_log_loss = log_loss(y, pred)
    # True for matches whose bookmaker probabilities hold no NaN
    remove_nan_mask = [
        not contain_nan(bkm_probas[row])
        for row in range(bkm_probas.shape[0])
    ]
    y_quoted = y.iloc[remove_nan_mask]
    bkm_quoted = bkm_probas[remove_nan_mask]
    bkm_log_loss = log_loss(y_quoted, bkm_quoted)
    pred_quoted = pred[remove_nan_mask]
    model_log_loss_bkm_comparison = log_loss(y_quoted, pred_quoted)
    log_loss_report = (
        ("\ntotal model log loss score :", model_log_loss),
        ("model log loss score on matches with bkm data:",
         model_log_loss_bkm_comparison),
        ("bkm log loss score (on matches with bkm data):", bkm_log_loss),
    )
    for label, score in log_loss_report:
        printv(1, verbose, label, round(score, 4))

    # rank probability score analysis
    model_rps = rank_prob_score(pred, y)
    bkm_rps = rank_prob_score(bkm_quoted, y_quoted)
    model_rps_bkm_comparison = rank_prob_score(pred_quoted, y_quoted)
    rps_report = (
        ("total model rps score :", model_rps),
        ("model rps score on matches with bkm data:",
         model_rps_bkm_comparison),
        ("bkm rps score (on matches with bkm data):", bkm_rps),
    )
    for label, score in rps_report:
        printv(1, verbose, label, round(score, 4))

    if compare_to_dummy_pred:
        equiprobable = np.full(y.shape, 1. / 3)
        printv(2, verbose, "score of equiprobability prediction :",
               round(log_loss(y, equiprobable), 4))

    log_loss_pair = [model_log_loss_bkm_comparison, bkm_log_loss]
    rps_pair = [model_rps_bkm_comparison, bkm_rps]
    return model_log_loss, model_rps, [log_loss_pair, rps_pair]
def test_case_dixon_coles_one_country_predictions():
    """Run the Dixon-Coles pipeline on one country and report the results.

    Loads the data with ``first_data_preparation``, keeps the matches of the
    selected countries, predicts those played on or after ``min_date`` with a
    ``DixonColes`` model fitted on the full selected history, analyses the
    predictions against the B365 quotes and finally prints the outcome of
    several investment strategies.
    """
    # only team_data and match_data are used here
    _player_data, _player_stats_data, team_data, match_data = \
        first_data_preparation()
    countries = [
        'France',
    ]
    # countries = ['England', ]
    min_date = datetime.strptime('2016-04-30', "%Y-%m-%d").date()

    # .copy(): columns are (re)written below; writing into a boolean-filtered
    # selection of the loaded frame triggers pandas' SettingWithCopyWarning
    match_data = match_data[match_data['league_country'].isin(countries)].copy()

    # convert date input string to actual python date
    match_data['date'] = match_data['date'].map(
        lambda raw: datetime.strptime(raw, "%Y-%m-%d %H:%M:%S").date())

    # on the below: use team names as id (easier for human-checking and
    # debugging); unknown api ids still raise KeyError
    team_id_to_name, team_name_to_id = create_dict_involved_teams(
        match_data, team_data)
    match_data['home_team_id'] = match_data['home_team_api_id'].map(
        lambda api_id: team_id_to_name[api_id])
    match_data['away_team_id'] = match_data['away_team_api_id'].map(
        lambda api_id: team_id_to_name[api_id])

    # on the below: we define our team universe (teams we calibrate parameters on)
    mask_home = team_data['team_api_id'].isin(match_data['home_team_api_id'])
    mask_away = team_data['team_api_id'].isin(match_data['away_team_api_id'])
    team_universe = list(team_data[mask_home | mask_away]['team_long_name'])
    printv(1, VERBOSE, len(team_universe), team_universe)
    printv(1, VERBOSE, 'nb matches', match_data.shape[0])

    # save full_history before selecting recent matches to predict
    full_history = match_data
    match_data = match_data[match_data['date'] >= min_date]

    def exp_weight_fct(t1, t2):
        # exponential decay in the number of years between t1 and t2
        return np.exp(-0.3 * (t2 - t1).days / 365.25)

    model = DixonColes(team_universe, weight_fct=exp_weight_fct)
    printv(1, VERBOSE,
           " ... fit dixon coles parameters and predict match outcomes ... ")
    predictions, _ = model.fit_and_predict(match_data,
                                           full_history,
                                           nb_obs_years=1,
                                           verbose=VERBOSE,
                                           home_goals_key='home_team_goal',
                                           away_goals_key='away_team_goal')
    printv(1, VERBOSE, " ... match outcomes predicted ... ")

    match_outcomes = match_outcomes_hot_vectors(match_data)
    bkm_quotes = pd.DataFrame()
    bkm_quotes['W'] = match_data['B365H']
    bkm_quotes['D'] = match_data['B365D']
    bkm_quotes['L'] = match_data['B365A']

    # called for what it reports through printv; the scores are not reused
    analyze_predictions(match_outcomes,
                        predictions,
                        bkm_quotes,
                        verbose=VERBOSE,
                        nb_max_matchs_displayed=40,
                        fully_labelled_matches=match_data)

    # strategies are only evaluated on matches having complete quotes
    remove_nan_mask = [
        not contain_nan(bkm_quotes.iloc[i])
        for i in range(bkm_quotes.shape[0])
    ]
    bkm_quotes_r = bkm_quotes.iloc[remove_nan_mask]
    match_outcomes_r = match_outcomes.iloc[remove_nan_mask]
    predictions_r = predictions[remove_nan_mask]

    invest_strategies = [
        # invest 1 in each match (if expected return > 1% actually)
        ConstantAmountInvestStrategy(1.),
        # stdDev of each bet is 1% of wealth
        ConstantStdDevInvestStrategy(0.01),
        # Kelly's ratio investment to maximize's wealth long term return
        KellyInvestStrategy(),
        # invest 1% of money each time
        ConstantPercentInvestStrategy(0.01),
    ]
    init_wealth = 100
    for invest_stgy in invest_strategies:
        printv(1, VERBOSE, "\n#### results for ",
               invest_stgy.__class__.__name__, "####")
        df_recap_stgy = invest_stgy.apply_invest_strategy(
            predictions_r,
            bkm_quotes_r,
            match_outcomes_r,
            init_wealth=init_wealth)

        printv(
            1, VERBOSE, df_recap_stgy[[
                'invested_amounts', 'exp_gain_amounts', 'gain_amounts'
            ]].sum())
        printv(1, VERBOSE, 'wealth: from', init_wealth, 'to',
               round(df_recap_stgy['wealth'].iloc[-1], 4))
def optimize_parameters(self,
                        matches,
                        current_time,
                        init_params=None,
                        padding=True,
                        verbose=1,
                        home_team_key='home_team_id',
                        away_team_key='away_team_id',
                        home_goals_key='home_goals',
                        away_goals_key='away_goals',
                        time_key='date',
                        control_dates=True):
    """Calibrate the model parameters on ``matches``.

    Minimises ``-self._likelihood(...)`` (gradient ``-self._likelihood_jac(...)``)
    with ``scipy.optimize.minimize``, trying ``Newton-CG`` and then ``TNC``.
    The first ``self.nb_teams`` parameters and the last one start at 0 with
    bounds (-3, 3); the others start at 1 with bounds (0.01, 3).

    Args:
        matches: DataFrame of matches used for the calibration.
        current_time: reference time forwarded to the likelihood; when
            ``control_dates`` is true every match must be strictly older.
        init_params: starting point; built as described above when ``None``.
        padding: forwarded unchanged to the likelihood functions.
        verbose: verbosity level handed to ``printv``.
        home_team_key, away_team_key, home_goals_key, away_goals_key, time_key:
            column names, forwarded to the likelihood functions.
        control_dates: when true, assert that ``matches`` only holds past data.

    Returns:
        The optimised parameter vector (``res.x``), or ``None`` when both
        optimisation attempts fail.
    """
    if control_dates:
        # calibration must only rely on data older than current_time
        assert (matches[time_key].max() < current_time)

    nb_teams = self.nb_teams
    nb_params = self.nb_params
    # True for the first nb_teams parameters and for the last one
    zero_start = [(k < nb_teams or k == nb_params - 1)
                  for k in range(nb_params)]

    if init_params is None:
        init_params = np.array(
            [0. if starts_at_zero else 1. for starts_at_zero in zero_start])
    bounds = tuple((-3, 3) if starts_at_zero else (0.01, 3)
                   for starts_at_zero in zero_start)

    def constraint_fct(params):
        # sum of alphas must be 0
        return np.sum(params[0:nb_teams])

    def constraint_fct_der(params):
        grad = np.zeros_like(params)
        grad[0:nb_teams] = 1.
        return grad

    forwarded = dict(padding=padding,
                     home_team_key=home_team_key,
                     away_team_key=away_team_key,
                     home_goals_key=home_goals_key,
                     away_goals_key=away_goals_key,
                     time_key=time_key)

    def likelihood_m(params):
        return -self._likelihood(matches, params, current_time, **forwarded)

    def likelihood_jac_m(params):
        return -self._likelihood_jac(matches, params, current_time,
                                     **forwarded)

    def attempt(method):
        # same objective, bounds, options and constraint for every method
        return minimize(likelihood_m,
                        init_params,
                        jac=likelihood_jac_m,
                        method=method,
                        bounds=bounds,
                        options={
                            'xtol': 10e-3,
                            'disp': False
                        },
                        constraints=({
                            'type': 'eq',
                            'fun': constraint_fct,
                            'jac': constraint_fct_der
                        }, ))

    # other ok methods; TNC or L-BFGS-B
    res = attempt('Newton-CG')
    if res.success:
        return res.x

    printv(
        1, verbose,
        " fail to calibrate parameters with method Newton-CG. trying another method (TNC)"
    )
    res = attempt('TNC')
    if res.success:
        return res.x

    print('\033[91m' + "fail to calibrate parameters on date " +
          str(current_time) + '\033[0m')
    return None