def _get_unique_teams_from_datasets(self, df_train: pd.DataFrame, df_test: pd.DataFrame,
                                    df_predict: pd.DataFrame) -> None:
    """
    Gets unique team names from the dataframes of each dataset type.

    :param df_train: Train dataset.
    :param df_test: Test dataset.
    :param df_predict: Predict dataset.
    """
    self.train_teams = get_unique_teams(df_train)
    self.test_teams = get_unique_teams(df_test)
    self.predict_teams = get_unique_teams(df_predict)
    # Teams that appear only in the train dataset
    self.train_teams_exclusively = sorted(set(self.train_teams) - set(self.test_teams))
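# For reference, a minimal sketch of what a helper like `get_unique_teams` could look like
# (hypothetical implementation - the real helper is imported from elsewhere in the codebase).
# It assumes fixtures dataframes carry "home" and "away" columns, as they do in this module.
def _sketch_get_unique_teams(df: pd.DataFrame) -> list:
    # Union of both fixture sides, sorted for deterministic ordering
    return sorted(set(df["home"]) | set(df["away"]))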
def run(self) -> None:
    """
    Runs training of the model for the given number of epochs.
    """
    st = time.time()
    df_train, df_test, df_predict = self._preload()
    # Just load models and make predictions
    if self._predict:
        teams = spc.get_unique_teams(df_train)
        self._load_models(teams, include_optimizer=False)
        self.predict(df_test, Dataset.Test, revert_to_best_params=True,
                     restore_states_after_training=True)
        self.predict(df_predict, Dataset.Predict)
        self.save(predict=True)
    else:
        self.train(df_train, df_test)
        self.save(models=True, train=True, test=True)
        # Predict both test and predict datasets with the best params after training
        self.predict(df_test, Dataset.Test, revert_to_best_params=True,
                     restore_states_after_training=True)
        self.predict(df_predict, Dataset.Predict)
        self.save(predict=True)
    print(f"Run time: {((time.time() - st) / 60):.2f} mins.")
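# Illustrative entry point (hypothetical class name and constructor arguments - the real
# runner wrapping these methods may be configured differently; treat this as a sketch only):
#
#     if __name__ == "__main__":
#         runner = SPRunner(seasons=SEASONS, ntest=NTEST, predict=False, resume=False)
#         runner.run()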
def load_and_process_fixtures_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Loads fixtures data ordered by date.

    The split guarantees at least N samples for the predict dataset, given by NPREDICT, and
    approximately the same number of test samples, given by the ntest argument. The exact
    number of samples per team for the test/predict datasets cannot be guaranteed: matches
    are not always played successively, so some teams play more often within a given period,
    and the number of predict/test/discard samples varies per team. Thus, to avoid
    overlapping datasets, it is probably impossible to ensure exact sample counts when
    backtesting in general.

    :return: Train, test, and predict fixtures datasets.
    """
    df = self._dbmanager.query_fixtures_data(self._seasons)
    if df.empty:
        raise ValueError("Empty fixtures dataframe.")
    df = self._drop_last_season_championship_matches(df)
    self.teams = get_unique_teams(df)
    self.last_season_teams = get_last_season_unique_teams(df)
    # Get fixtures ids for each team
    teams_fixtures_ids = {
        t: df[(df["home"] == t) | (df["away"] == t)].loc[:, "id"].tolist()
        for t in self.teams
    }
    self._check_missing_columns(df)
    df = self._check_nan_values(df, teams_fixtures_ids)
    teams_fixtures_ids = self._discard_matches(df, teams_fixtures_ids)
    if not self._resume:
        # Use the last n ids for predictions and the last m ids for testing
        for t in self.last_season_teams:
            self.predict_fixtures_ids[t] = teams_fixtures_ids[t][-NPREDICT:]
            teams_fixtures_ids[t] = teams_fixtures_ids[t][:-NPREDICT]
            self.test_fixtures_ids[t] = teams_fixtures_ids[t][-self._ntest:]
            teams_fixtures_ids[t] = teams_fixtures_ids[t][:-self._ntest]
        # The rest of the ids is counted as the train set
        self.train_fixtures_ids = teams_fixtures_ids
    else:
        if self.teams_names_bitlen != self._model_settings["teams_names_bitlen"]:
            raise ValueError("Current bitlength required to encode all team names is higher "
                             "than the previous one.")
        # Check whether teams have not changed
        if self.teams != self._model_settings["teams"]:
            raise ValueError("Teams differ from previous run. \n"
                             f"New: {self.teams} \n"
                             f"Old: {self._model_settings['teams']}")
        if self.last_season_teams != self._model_settings["last_season_teams"]:
            raise ValueError("Last season teams differ from previous run. \n"
                             f"New: {self.last_season_teams} \n"
                             f"Old: {self._model_settings['last_season_teams']}")
        # Check whether fixtures ids match those from the previous run
        for t in self.last_season_teams:
            predict_fixtures_ids = teams_fixtures_ids[t][-NPREDICT:]
            teams_fixtures_ids[t] = teams_fixtures_ids[t][:-NPREDICT]
            test_fixtures_ids = teams_fixtures_ids[t][-self._ntest:]
            teams_fixtures_ids[t] = teams_fixtures_ids[t][:-self._ntest]
            if predict_fixtures_ids != self._model_settings["predict_fixtures_ids"][t]:
                raise ValueError(f"{t} predict fixtures ids differ from previous run. \n"
                                 f"New: {predict_fixtures_ids} \n"
                                 f"Old: {self._model_settings['predict_fixtures_ids'][t]}")
            if test_fixtures_ids != self._model_settings["test_fixtures_ids"][t]:
                raise ValueError(f"{t} test fixtures ids differ from previous run. \n"
                                 f"New: {test_fixtures_ids} \n"
                                 f"Old: {self._model_settings['test_fixtures_ids'][t]}")
            if teams_fixtures_ids[t] != self._model_settings["train_fixtures_ids"][t]:
                raise ValueError(f"{t} train fixtures ids differ from previous run. \n"
                                 f"New: {teams_fixtures_ids[t]} \n"
                                 f"Old: {self._model_settings['train_fixtures_ids'][t]}")
        # Checks passed, load previously saved data
        self.teams = self._model_settings["teams"]
        self.last_season_teams = self._model_settings["last_season_teams"]
        self.train_fixtures_ids = self._model_settings["train_fixtures_ids"]
        self.test_fixtures_ids = self._model_settings["test_fixtures_ids"]
        self.predict_fixtures_ids = self._model_settings["predict_fixtures_ids"]
    self._check_season_gaps_in_teams_matches(df)
    # Split the original dataset into train, test, and predict datasets
    df_train, df_test, df_predict = self._mask_out_dataset(df)
    self._get_unique_teams_from_datasets(df_train, df_test, df_predict)
    self._check_changes_in_teams()
    self._count_samples(df_train, df_test, df_predict)
    return df_train, df_test, df_predict
def predict(self,
            df: pd.DataFrame,
            predict_dataset: Dataset,
            revert_to_best_params: bool = False,
            restore_states_after_training: bool = False,
            verbose: bool = False) -> None:
    """
    Performs a single iteration of predict_on_batch for every sample in the given dataset.
    The logic of setting weights is the same as for training.

    :param df: Portion of data used for prediction.
    :param predict_dataset: Which type of dataset is used for prediction.
    :param revert_to_best_params: Whether to revert back to the best weights.
    :param restore_states_after_training: Whether to restore states to the moment after training.
    :param verbose: Whether to print the matches being predicted.
    """
    print(f"Predicting dataset: {predict_dataset.value}...")
    teams = spc.get_unique_teams(df)
    predict_metrics = defaultdict(lambda: defaultdict(list))
    for t in teams:
        self.models[t].matches_data[predict_dataset]["idx"] = 0
        # Use only the best params for prediction
        if revert_to_best_params:
            self.models[t].snapshot.revert_to_best_params()
            self.models[t].revert_to_best_params(include_optimizer=False)
        if restore_states_after_training:
            self.models[t].snapshot.restore_states_after_training()
            self.models[t].restore_states_after_training()
    # Loop over matches
    for i, r in df.iterrows():
        if verbose and self._verbose > 0:
            print(f"{i:04d}: {r['id']:04d} {r['date']} {r['season']:02d} "
                  f"{r['league']} {r['home']} {r['away']}")
        team1 = r["home"]
        team2 = r["away"]
        team1_preds = None
        team2_preds = None
        self.models[team1].set_network_head2_params(team2)
        self.models[team2].set_network_head2_params(team1)
        team1_xinput, team1_yinput = self.models[team1].form_input(predict_dataset,
                                                                   self.models[team2])
        if (predict_dataset == Dataset.Predict and team1_xinput) or \
                (team1_xinput and team1_yinput):
            team1_preds = self.models[team1].network.predict_on_batch(team1_xinput)
            self.models[team1].store_network_head2_states(team2)
        team2_xinput, team2_yinput = self.models[team2].form_input(predict_dataset,
                                                                   self.models[team1])
        if (predict_dataset == Dataset.Predict and team2_xinput) or \
                (team2_xinput and team2_yinput):
            team2_preds = self.models[team2].network.predict_on_batch(team2_xinput)
            self.models[team2].store_network_head2_states(team1)
        emsg = "There are probably some missing data in the dataset."
        if team1_preds is None:
            raise ValueError(f"Predictions for model1 are nan. \n{emsg}")
        elif team2_preds is None:
            raise ValueError(f"Predictions for model2 are nan. \n{emsg}")
        # Log new metrics
        predict_metrics = self._log_predict_metrics(predict_metrics,
                                                    r,
                                                    teams=(team1, team2),
                                                    x_inputs=(team1_xinput, team2_xinput),
                                                    y_inputs=(team1_yinput, team2_yinput),
                                                    preds=(team1_preds, team2_preds))
        self.models[team1].matches_data[predict_dataset]["idx"] += 1
        self.models[team2].matches_data[predict_dataset]["idx"] += 1
    # Get max number of indices depending on the length of the datasets
    if predict_dataset == Dataset.Test:
        max_range = self.data_loader.max_ntest_len
    else:
        max_range = self.data_loader.max_npredict_len
    # Create stats file for prediction
    metrics = list(predict_metrics[teams[0]].keys())
    multiindex = pd.MultiIndex.from_product([teams, metrics], names=["team", "metric"])
    self.predictions[predict_dataset] = pd.DataFrame([], index=range(0, max_range),
                                                     columns=multiindex)
    # Save stats
    for t in teams:
        for m in metrics:
            self.predictions[predict_dataset].loc[0:len(predict_metrics[t]), (t, m)] = \
                pd.Series(predict_metrics[t][m])
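# A small self-contained sketch of the stats layout built above (hypothetical toy data):
# one column per (team, metric) pair, one row per match index, so per-team series of
# different lengths can sit side by side, padded with trailing NaNs by index alignment.
def _sketch_predictions_frame() -> pd.DataFrame:
    toy_metrics = {"TeamA": {"loss": [0.7, 0.6], "acc": [0.5, 1.0]},
                   "TeamB": {"loss": [0.8], "acc": [0.0]}}
    teams = list(toy_metrics)
    metrics = list(toy_metrics[teams[0]])
    cols = pd.MultiIndex.from_product([teams, metrics], names=["team", "metric"])
    frame = pd.DataFrame([], index=range(0, 2), columns=cols)
    for t in teams:
        for m in metrics:
            # Shorter series leave NaN in the remaining rows
            frame.loc[:, (t, m)] = pd.Series(toy_metrics[t][m])
    return frame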
def train(self, df_train: pd.DataFrame, df_test: pd.DataFrame) -> None:
    """
    Loops over the train dataset for the given number of epochs. The model's performance is
    evaluated against the test dataset after each epoch.

    During each loop over matches within an epoch:
    1) Params of head2 for both current models are set - this needs to be done at the start
       to prevent using already updated weights when training the second model.
    2) If the input is fetched correctly, the model of team1 is trained on the input and the
       updated states of head2 are stored into the snapshot. The same applies to the second
       model.
    3) The index to data is incremented for both models.
    4) Advance to the next match and repeat.

    :param df_train: Portion of data used for training.
    :param df_test: Portion of data used for testing.
    """
    teams = spc.get_unique_teams(df_train)
    dataset = Dataset.Train
    # Load previously saved models if we continue in training
    # The optimizer is necessary for training
    if self._resume:
        self._load_models(teams, include_optimizer=True)
    for epoch in range(self._previous_epochs, self._total_epochs):
        st = time.time()
        print("---")
        print(f"Epoch: {epoch + 1} of {self._total_epochs}")
        print("Training model...")
        # Verbose print only for the first epoch
        verbose = (epoch == self._previous_epochs)
        train_metrics = defaultdict(lambda: defaultdict(list))
        # Reset states of the RNNs at the beginning of the epoch, so they can be saved at
        # the end. Also reset the data index position.
        for t in teams:
            self.models[t].snapshot.reset_states()
            self.models[t].network.reset_states()
            self.models[t].matches_data[dataset]["idx"] = 0
        # Loop over matches
        for i, r in df_train.iterrows():
            if verbose and self._verbose > 0:
                print(f"{i:04d}: {r['id']:04d} {r['date']} {r['season']:02d} "
                      f"{r['league']} {r['home']} {r['away']}")
            team1 = r["home"]
            team2 = r["away"]
            # Set team2 weights for both teams to avoid using newly changed weights of the
            # home team for the away team and vice versa
            self.models[team1].set_network_head2_params(team2)
            self.models[team2].set_network_head2_params(team1)
            # Train home model
            x_input, y_input = self.models[team1].form_input(dataset, self.models[team2])
            if x_input and y_input:
                loss, acc = self.models[team1].train_on_batch(x_input, y_input)
                self.models[team1].store_network_head2_states(team2)
                train_metrics[team1]["loss"].append(loss)
                train_metrics[team1]["acc"].append(acc)
            # Train away model
            x_input, y_input = self.models[team2].form_input(dataset, self.models[team1])
            if x_input and y_input:
                loss, acc = self.models[team2].train_on_batch(x_input, y_input)
                self.models[team2].store_network_head2_states(team1)
                train_metrics[team2]["loss"].append(loss)
                train_metrics[team2]["acc"].append(acc)
            # Increment index to data
            self.models[team1].matches_data[dataset]["idx"] += 1
            self.models[team2].matches_data[dataset]["idx"] += 1
        # Track epochs passed
        self._total_epochs_passed += 1
        # Append metrics per current epoch
        for t in teams:
            self.models[t].snapshot.save_states_after_training()
            self.models[t].save_states_after_training()
            self.train_stats.loc[epoch, (t, "loss")] = np.mean(train_metrics[t]["loss"])
            self.train_stats.loc[epoch, (t, "acc")] = np.mean(train_metrics[t]["acc"])
        # Test models after every epoch
        self.test(df_test, epoch, verbose)
        # Call on-epoch-end processing
        self._on_epoch_end(epoch)
        # Measure training time and the approximate remaining time to finish
        et = time.time() - st
        self._runtimes_per_epoch.append(et)
        estimate = et * (self._total_epochs - self._total_epochs_passed)
        runtime = f"{estimate / 60:.2f} mins" if epoch else "<inaccurate at first epoch>"
        print(f"Epoch took: {et:.2f} secs. Estimated time to finish: {runtime}")
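# The per-match protocol above, condensed (illustrative only; `home` and `away` stand for
# the two SPModel instances of one fixture, and the `.team` attribute is a hypothetical
# stand-in for the team-name arguments used in the real code). Both head2 param sets are
# fixed up front so neither side trains against weights already updated within this match.
def _sketch_match_step(home, away, dataset) -> None:
    home.set_network_head2_params(away.team)   # 1) freeze opponents' head2 params first
    away.set_network_head2_params(home.team)
    for model, opponent in ((home, away), (away, home)):
        x_input, y_input = model.form_input(dataset, opponent)
        if x_input and y_input:                # 2) train only on fully fetched inputs
            model.train_on_batch(x_input, y_input)
            model.store_network_head2_states(opponent.team)
    home.matches_data[dataset]["idx"] += 1     # 3) advance both data indices
    away.matches_data[dataset]["idx"] += 1     # 4) the caller moves on to the next match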
def _preload(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Loads train, test, and predict teams data for each model, and builds the models.

    :return: Train, test, and predict datasets.
    """
    print("Loading data...")
    # Load fixtures for all three datasets
    df_train, df_test, df_predict = self.data_loader.load_and_process_fixtures_data()
    # Fit scalers on the train dataset only
    self.data_loader.fit_scalers(df_train)
    # Build models for all teams
    all_teams = spc.get_unique_teams(pd.concat([df_train, df_test, df_predict]))
    for t in all_teams:
        self.models[t] = SPModel(t, self.data_loader.test_teams,
                                 self.data_loader.teams_names_bitlen,
                                 f"{self._folder_prefix}")
    # Get fixtures ids where each team played (separately for each dataset) and store them
    # Ids for test and predict datasets are properly aligned to fit match sequences
    for t in self.data_loader.train_teams:
        fixtures_ids = spc.get_fixtures_ids_from_df(df_train, t)
        team_matches_data = self.data_loader.load_and_process_team_data(
            Dataset.Train, self._teams_tuples[t], fixtures_ids)
        self.models[t].prepare_matches_data(Dataset.Train, team_matches_data)
        # Compute class weights for the train dataset (remove the last id from each team's
        # fixtures, which will not be used for training, to properly offset the test dataset)
        self.models[t].compute_class_weights(team_matches_data, fixtures_ids[:-1],
                                             verbose=False)
    for t in self.data_loader.test_teams:
        fixtures_ids = spc.get_fixtures_ids_from_df(df_test, t)
        aligned_fixtures_ids = spc.align_fixtures_ids(df_train, t, fixtures_ids,
                                                      self._timesteps)
        team_matches_data = self.data_loader.load_and_process_team_data(
            Dataset.Test, self._teams_tuples[t], aligned_fixtures_ids)
        self.models[t].prepare_matches_data(Dataset.Test, team_matches_data)
    for t in self.data_loader.predict_teams:
        combined_df_train = pd.concat((df_train, df_test), ignore_index=True)
        fixtures_ids = spc.get_fixtures_ids_from_df(df_predict, t)
        # Use the combined train+test dataset in case there are fewer test samples than
        # timesteps, so the rest of the sequence can be filled from the train dataset
        aligned_fixtures_ids = spc.align_fixtures_ids(combined_df_train, t, fixtures_ids,
                                                      self._timesteps)
        team_matches_data = self.data_loader.load_and_process_team_data(
            Dataset.Predict, self._teams_tuples[t], aligned_fixtures_ids)
        self.models[t].prepare_matches_data(Dataset.Predict, team_matches_data)
    # Assemble the network for each model
    print(f"Assembling {len(self.models)} models...")
    t0 = all_teams[0]
    self.models[t0].build_model()
    for t in all_teams[1:]:
        if RANDOM_WEIGHTS and not self._resume:
            self.models[t].build_model()
        else:
            self.models[t].build_model_from(self.models[t0])
    if not self._resume:
        self._set_default_snapshots()
        self._create_stats_files()
    # Reset indices of the dataframes
    df_train.reset_index(inplace=True, drop=True)
    df_test.reset_index(inplace=True, drop=True)
    df_predict.reset_index(inplace=True, drop=True)
    return df_train, df_test, df_predict
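# A hedged sketch of what the id alignment plausibly does (the real `spc.align_fixtures_ids`
# is defined elsewhere; this is an assumption, not its actual implementation): prepend the
# team's last `timesteps - 1` fixture ids from the preceding dataset so that a full RNN input
# sequence of length `timesteps` exists even for the team's first test/predict sample.
def _sketch_align_fixtures_ids(df_prev: pd.DataFrame, team: str, ids: list,
                               timesteps: int) -> list:
    prev_ids = df_prev[(df_prev["home"] == team) |
                       (df_prev["away"] == team)].loc[:, "id"].tolist()
    return prev_ids[-(timesteps - 1):] + ids if timesteps > 1 else ids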