def test_get_rankings_all():
    """Check that ``get_rankings_all`` reproduces the stored standings files.

    Standings are generated for every season from 1993-1994 through
    2019-2020 into a temporary location, and each generated CSV is compared
    with the file of the same name under ``STANDINGS_PATH``.
    """
    # NOTE(review): this path ends in 'file.csv' yet is used as a folder
    # below -- presumably make_directory() expects a file-like path; confirm
    # against the helper before simplifying it.
    temp_folder = os.path.join(os.getcwd(), 'temp/file.csv')
    make_directory(temp_folder)
    from_year, to_year = 1993, 2019
    try:
        get_rankings_all(from_year, to_year,
                         RAW_CLEANED_DATA_FILE_PATH, temp_folder)
        for year in range(from_year, to_year + 1):
            csv_file = '{}-{}.csv'.format(year, year + 1)
            created_file = os.path.join(temp_folder, csv_file)
            cmp_file = os.path.join(STANDINGS_PATH, csv_file)
            assert compare_csv(cmp_file, created_file), \
                'standings mismatch for {}'.format(csv_file)
    finally:
        # Always clean up, so a failing season does not leave generated
        # files behind for subsequent tests.
        remove_directory(temp_folder)
def test_get_rankings_year_after_sofifa():
    """Check ``get_rankings`` output for the 2008-2009 season file.

    The standings are written to a temporary folder, using
    ``'2009-12-31'`` as the date argument and predictions disabled, and
    compared with the file of the same name under ``STANDINGS_PATH``.
    """
    year = 2008
    temp_folder = os.path.join(os.getcwd(), 'temp')
    csv_file = '{}-{}.csv'.format(year, year + 1)
    from_file = os.path.join(RAW_CLEANED_DATA_FILE_PATH, csv_file)
    to_file = os.path.join(temp_folder, csv_file)
    cmp_file = os.path.join(STANDINGS_PATH, csv_file)

    make_directory(temp_folder)
    try:
        get_rankings(from_file, to_file, '{}-12-31'.format(year + 1),
                     include_prediction=False)
        assert compare_csv(cmp_file, to_file)
    finally:
        # Remove the temporary output even when the comparison fails.
        remove_directory(temp_folder)
def test_compare_csv():
    """Check that ``compare_csv`` accepts a file, its copy, and itself.

    ``FINAL_FILE`` is copied into a temporary folder; the test then compares
    original vs. copy, copy vs. copy, and original vs. original.
    """
    temp_folder = os.path.join(os.getcwd(), 'temp')
    temp_file = os.path.join(temp_folder, 'temp.csv')

    # Start from a clean slate: nothing from an earlier run may remain.
    remove_directory(temp_folder)
    assert not os.path.isfile(temp_file)
    assert not os.path.isdir(temp_folder)

    try:
        copy_csv(FINAL_FILE, temp_file)
        assert compare_csv(FINAL_FILE, temp_file)
        assert compare_csv(temp_file, temp_file)
        assert compare_csv(FINAL_FILE, FINAL_FILE)
    finally:
        # Remove the copy even when one of the comparisons fails.
        remove_directory(temp_folder)
def delete_images_directory(reset_downloaded_images_list: bool = True) -> None:
    """Remove the ``content/images`` directory and optionally reset state.

    Args:
        reset_downloaded_images_list: When true, the stored content is
            loaded via ``load()``, its ``reset_downloaded_images_list()``
            method is called, and the object is written back with
            ``save()``.
    """
    # A truthy result from remove_directory() triggers the confirmation
    # message -- presumably it signals that something was actually removed.
    if remove_directory(os.path.join('content', 'images'), 'State'):
        print('> [State Robot] Images directory successfully removed.')
    # NOTE(review): the reset is treated here as independent of whether the
    # directory removal reported success -- confirm this matches the
    # intended nesting.
    if reset_downloaded_images_list:
        content = load()
        content.reset_downloaded_images_list()
        save(content)
def test_get_current_fixtures():
    """Check that ``get_current_fixtures`` writes a CSV with match columns.

    The fixtures are written to ``temp/temp.csv`` under the current working
    directory; the file must exist afterwards and contain the columns
    ``Date``, ``HomeTeam``, ``AwayTeam``, ``FTHG``, ``FTAG`` and ``FTR``.
    """
    temp_folder = os.path.join(os.getcwd(), 'temp')
    temp_file = os.path.join(temp_folder, 'temp.csv')
    expected_columns = ('Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR')

    make_directory(temp_file)
    try:
        get_current_fixtures(temp_file)
        assert os.path.isfile(temp_file)

        df = pd.read_csv(temp_file)
        # Iterating a DataFrame yields its column labels.
        df_columns_list = list(df)
        for column in expected_columns:
            assert column in df_columns_list, \
                'missing column: {}'.format(column)
    finally:
        # Remove the downloaded file even when a check above fails.
        remove_directory(temp_folder)
def start_experiment(self, num_epochs, num_epoch_iterations, test_phase_length, resume_from_epoch=None, show_final_plots=False):
    """Run the training epochs followed by a final testing phase.

    For every epoch (numbered from 1) that is not skipped, the simulator is
    run up to ``num_epoch_iterations * epoch`` and the state is saved under
    ``experiment_results/<experiment_name>/epoch_<epoch>``. From the second
    epoch on, the previous epoch's folder is passed to ``save_state`` and
    removed afterwards. Finally the simulator is switched to its testing
    phase, run up to ``num_epochs * num_epoch_iterations +
    test_phase_length`` and the state is saved under ``.../test``.

    Args:
        num_epochs: Number of epochs to iterate over.
        num_epoch_iterations: Multiplier giving the value passed to
            ``run_simulation`` for each epoch.
        test_phase_length: Amount added on top of all epochs for the
            final ``run_simulation`` call.
        resume_from_epoch: When not ``None``, epochs numbered up to and
            including this value are skipped.
        show_final_plots: Forwarded to the final ``save_state`` call as
            ``show_generated_plots``.
    """
    results_root = 'experiment_results/{}'.format(self.experiment_name)

    def epoch_dir(number):
        # Save location used for the given 1-based epoch number.
        return '{}/epoch_{}'.format(results_root, number)

    for epoch_index in range(1, num_epochs + 1):
        already_done = (resume_from_epoch is not None
                        and epoch_index <= resume_from_epoch)
        if already_done:
            continue

        self.dynamics_simulator.run_simulation(
            num_epoch_iterations * epoch_index)
        current_dir = epoch_dir(epoch_index)

        if epoch_index == 1:
            # The first epoch has no predecessor to reference or delete.
            self.save_state(new_state_save_location=current_dir)
            continue

        previous_dir = epoch_dir(epoch_index - 1)
        self.save_state(new_state_save_location=current_dir,
                        prev_state_save_location=previous_dir)
        remove_directory(location=previous_dir)

    self.dynamics_simulator.set_testing_phase(True)
    total_length = num_epochs * num_epoch_iterations + test_phase_length
    self.dynamics_simulator.run_simulation(total_length)
    self.save_state(new_state_save_location=results_root + '/test',
                    prev_state_save_location=epoch_dir(num_epochs),
                    show_generated_plots=show_final_plots)
def delete_content_directory():
    """Remove the ``content`` directory and report when that succeeds."""
    removed = remove_directory('content', 'State')
    if not removed:
        return
    print('> [State Robot] Content directory successfully removed.')
def magic(should_train=True, should_scrape=False, data_year_available_from=1993, data_year_collect_from=2006):
    """Run the whole pipeline: collect data, preprocess, train and predict.

    Args:
        should_train: Passed to ``get_clf`` as ``recalculate``; also decides
            whether the summary is written to the database at the end.
        should_scrape: When true, ``scrape_team_ova_all`` is run first.
        data_year_available_from: First year handed to the standings,
            cleaning, current-details and head-to-head steps.
        data_year_collect_from: First year handed to the OVA scraping/merge
            and match-combining steps.
    """
    # Function(s) that don't have to be executed every time
    # 1. OVA data from sofifa_scraper (Warning: This takes a long time to run)
    #    SOFIFA updates their stats two or three times every month, but the
    #    data doesn't change much.
    #    Only executed when should_scrape is set.
    if should_scrape:
        scrape_team_ova_all(OVA_FILE_PATH, data_year_collect_from, CURRENT_YEAR)

    # Preprocessing
    # 1. Latest premier league results
    #    This data can also be retrieved from
    #    http://www.football-data.co.uk/englandm.php
    get_current_fixtures(RAW_DATA_FILE_PATH_CURRENT)

    # 2. Standings (from data_year_available_from to the current year)
    # NOTE(review): this reads RAW_CLEANED_DATA_FILE_PATH before clean_all()
    # below writes to it in this run -- presumably it relies on files from an
    # earlier run; confirm the intended order.
    get_rankings_all(data_year_available_from, CURRENT_YEAR, RAW_CLEANED_DATA_FILE_PATH, STANDINGS_PATH)

    # Run the functions below to start generating necessary data
    # 1. From raw data, remove all data but the selected columns.
    #    Produces: cleaned data csv located in RAW_CLEANED_DATA_FILE_PATH
    clean_all(RAW_DATA_FILE_PATH, RAW_CLEANED_DATA_FILE_PATH, data_year_available_from, CURRENT_YEAR)

    # 2. From 1, add Overall Rating columns
    #    Produces: cleaned csv modified, located in RAW_CLEANED_DATA_FILE_PATH.
    #    Now all cleaned csv from data_year_collect_from onwards have an OVA
    #    column.
    merge_ova_to_cleaned_all(OVA_FILE_PATH, RAW_CLEANED_DATA_FILE_PATH, data_year_collect_from, CURRENT_YEAR)

    # 3. From 2, copy cleaned raw data to cleaned data for prediction purposes
    #    Produces: copy csv from RAW_CLEANED_DATA_FILE_PATH to
    #    CLEANED_DATA_FILE_PATH
    copy_csv(RAW_CLEANED_DATA_FILE_PATH, CLEANED_DATA_FILE_PATH)

    # 4. From 3, add current status columns (current point, current goal
    #    for/against/difference, match played, losing/winning streaks,
    #    last 5 games)
    #    Produces: cleaned csv modified, located in CLEANED_DATA_FILE_PATH.
    #    Now all cleaned csv from data_year_available_from onwards have the
    #    additional columns.
    add_current_details_all(CLEANED_DATA_FILE_PATH, CLEANED_DATA_FILE_PATH, STANDINGS_PATH, data_year_available_from, CURRENT_YEAR, data_year_available_from)

    # 5. From 4, merge all csv files from startYear to endYear together.
    #    FOR NOW, data is only collected from 2006 because sofifa only
    #    provides OVA data from 2006, and the model tends to perform better
    #    with this approach.
    #    Produces: new csv file on FINAL_FILE
    combine_matches(CLEANED_DATA_FILE_PATH, FINAL_FILE, data_year_collect_from, CURRENT_YEAR)

    # 6. From 5, get all head-to-head results (match results against the
    #    other team over time)
    #    Produces: edited final.csv file under DATA_PATH
    get_match_results_against(FINAL_FILE, CLEANED_DATA_FILE_PATH, DATA_PATH, data_year_available_from, CURRENT_YEAR)

    # 7. Once all data is aggregated, we can now build a classifier that
    #    makes predictions.
    #    If 'recalculate' is set True, it runs multiple classifiers on this
    #    data, and does some grid search on it if necessary, and finally
    #    generates 'model confidence.csv' that records the confidence score
    #    of each classifier.
    #    If 'recalculate' is set False, and if clf_file exists, then it
    #    simply loads the clf from clf_file.
    #    Produces: returns the best clf.
    best_clf, _, best_clf_average = get_clf(FINAL_FILE, CONFIDENCE_FILE, CLF_FILE, recalculate=should_train)

    # 8. Now we make predictions. This is done by first predicting the
    #    upcoming round, then aggregating the result, then predicting the
    #    next, and repeating the process until there are no more games to
    #    predict. "predict_next_round" also produces prediction
    #    probabilities for each match on stat_path.
    #    - 1. predict_next_round predicts the next round and saves the result
    #         in RAW_CLEANED_DATA_FILE_PATH_CURRENT.
    #    - 2. add_current_details, as its name suggests, adds current details.
    #    - 3. combine_matches combines all matches from
    #         data_year_collect_from to CURRENT_YEAR
    #    - 4. get_match_results_against adds head-to-head results between two
    #         teams for each match
    is_first = True

    # First save current ranking before predicting results
    remove_directory(STATISTICS_PATH)
    now = datetime.datetime.now().date().strftime('%Y-%m-%d')
    pred_ranking_round_file = os.path.join(
        PRED_RANKING_ROUND_PATH, 'prediction_ranking_{}.csv'.format(now))
    get_rankings(RAW_CLEANED_DATA_FILE_PATH_CURRENT, pred_ranking_round_file, include_prediction=True, predicted_date_so_far=now, ranking_summary_file=PRED_RANKING_ROUND_SUMMARY_FILE)

    # Predict round by round until predict_next_round reports that there is
    # no further round.
    while True:
        is_next_round, date = predict_next_round(
            best_clf, FINAL_FILE, RAW_CLEANED_DATA_FILE_PATH_CURRENT, statistics=True, stat_path=PREDICTION_FILE, first=is_first)
        if not is_next_round:
            break
        add_current_details(RAW_CLEANED_DATA_FILE_PATH_CURRENT, CLEANED_DATA_FILE_PATH_CURRENT, STANDINGS_PATH, data_year_available_from)
        combine_matches(CLEANED_DATA_FILE_PATH, FINAL_FILE, data_year_collect_from, CURRENT_YEAR)
        get_match_results_against(FINAL_FILE, CLEANED_DATA_FILE_PATH, DATA_PATH, data_year_available_from, CURRENT_YEAR)
        pred_ranking_round_file = os.path.join(
            PRED_RANKING_ROUND_PATH, 'prediction_ranking_{}.csv'.format(date))
        get_rankings(PREDICTION_FILE, pred_ranking_round_file, include_prediction=True, predicted_date_so_far=date, ranking_summary_file=PRED_RANKING_ROUND_SUMMARY_FILE)
        # Only the very first predict_next_round call is flagged as first.
        is_first = False

    # 9. Now prediction is done. Produce a season standing using the
    #    prediction result.
    winning_team = get_rankings(PREDICTION_FILE, PRED_RANKING_FILE, include_prediction=True)

    # 10. Put previous results, prediction results, standing predictions to
    #     the database
    save_new_data_to_database(DATABASE_PATH, FINAL_FILE, PREDICTION_FILE, PRED_RANKING_ROUND_SUMMARY_FILE)

    # 11. Summary to database (only when should_train is set)
    if should_train:
        save_summary_to_database(DATABASE_PATH, best_clf_average, winning_team)