def consolidate_data(year):
    """Roll up every team's season data for `year` into per-team roster info.

    For each team-year row in the database, gathers hitter lineup spots,
    player positions, and batting/pitching/fielding stats, then hands the
    bundle to write_roster_info(). Progress and timings go to the module
    loggers.
    """
    driver_logger.log("\tConsolidating data")
    print("Consolidating data")
    overall_start = time.time()
    timestamp = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    logger.log(f"Consolidating team data || Timestamp: {timestamp}")
    db = DatabaseConnection(sandbox_mode)
    team_year_rows = db.read(f'select ty_uniqueidentifier from team_years where year = {year};')
    for row in team_year_rows:
        per_team_start = time.time()
        ty_uid = row[0]
        team_id = db.read(f'select teamId from team_years where ty_uniqueidentifier = {ty_uid};')[0][0]
        logger.log('\t' + team_id)
        roster = {
            'hitter_spots': consolidate_hitter_spots(ty_uid),
            'player_positions': consolidate_player_positions(ty_uid),
            'batter_stats': consolidate_player_stats(ty_uid, 'batting', year),
            'pitcher_stats': consolidate_player_stats(ty_uid, 'pitching', year),
            'fielder_stats': consolidate_player_stats(ty_uid, 'fielding', year),
        }
        write_roster_info(ty_uid, roster)
        logger.log('\t\tTime = ' + time_converter(time.time() - per_team_start))
    db.close()
    elapsed = time_converter(time.time() - overall_start)
    logger.log(f"Done consolidating team data: Time = {elapsed}\n\n")
    driver_logger.log(f"\t\tTime = {elapsed}")
def simulation(away_team_id, away_year, away_year_info, home_team_id, home_year,
               home_year_info, games):
    """Simulate a `games`-game series between two clubs and report the winner.

    Builds a Team object for each club, computes the strike-zone coordinates
    once, simulates every game, tallies wins per side, and defers the final
    verdict to determine_series_winner().
    """
    series_start = time.time()
    clear_logs('controller')
    build_start = time.time()
    logger.log("Creating team objects")
    away_team = Team(away_team_id, away_year)
    home_team = Team(home_team_id, home_year)
    logger.log("\t" + time_converter(time.time() - build_start))
    wins = {'away': 0, 'home': 0}
    # Strike zone geometry is identical for every game, so compute it once.
    strike_zone = {axis: strike_zone_coordinate(axis) for axis in ('x', 'y')}
    for game_number in range(1, games + 1):
        outcome = simulate_game(game_number, away_team, away_team.get_team_info(),
                                away_year_info, home_team, home_team.get_team_info(),
                                home_year_info, away_year, home_year, strike_zone)
        if outcome['winner'] == away_team.get_team_id():
            wins['away'] += 1
        else:
            wins['home'] += 1
    logger.log('Simulation complete: Time = ' + time_converter(time.time() - series_start))
    return determine_series_winner(away_team, wins['away'], home_team, wins['home'], games)
def pitcher_spray_chart_constructor(year):
    """Download and store pitcher spray-chart data for `year` (1988 onward only)."""
    print("creating pitcher spray charts")
    start_time = time.time()
    # Module-level accumulator: worker threads presumably append entries for
    # requests that failed with HTTP 502 so they can be retried below —
    # TODO confirm against reduce_functionality's definition.
    global bad_gateway_data
    bad_gateway_data = []
    logger.log("Downloading " + str(year) + " pitcher spray charts || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    if year >= 1988:
        driver_logger.log("\tCreating pitcher spray charts")
        db = DatabaseConnection(sandbox_mode)
        # Only pitchers whose pa_infield column is still unset; set() drops
        # any duplicate uid rows before fanning out work.
        pt_uid_players = set(
            db.read('select PT_uniqueidentifier from player_pitching where year = ' +
                    str(year) + ' and pa_infield is NULL;'))
        db.close()
        # One download task per pitcher, bounded by the CPU count.
        with ThreadPoolExecutor(os.cpu_count()) as executor:
            for ent in pt_uid_players:
                executor.submit(reduce_functionality, year, ent)
        driver_logger.log("\t\tTime = " + time_converter(time.time() - start_time))
    else:
        # No spray-chart source data exists before 1988; bail out entirely.
        driver_logger.log("\tNo pitcher spray chart data before 1988")
        logger.log("\tNo spray pitcher chart data before 1988")
        return
    # Retry any requests the workers recorded as 502 failures.
    if len(bad_gateway_data) > 0:
        revisit_bad_gateways(year, bad_gateway_data)
    logger.log("Done downloading pitcher spray charts: time = " +
               time_converter(time.time() - start_time) + '\n\n')
def manager_tendencies(year):
    """Download, parse, and persist per-manager tendency stats for `year`."""
    driver_logger.log("\tStoring manager tendencies")
    print("storing manager tendencies")
    start_time = time.time()
    logger.log("Downloading " + str(year) + " manager tendencies || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    logger.log('\tMaking HTTP requests')
    db = DatabaseConnection(sandbox_mode)
    # (managerid, teamid) pairs for every manager who ran a team this year.
    managers = db.read(
        'select manager_teams.managerid, manager_teams.teamid from manager_teams, manager_year where '
        'manager_year.year = ' + str(year) + ' and manager_year.mt_uniqueidentifier = manager_teams.'
        'mt_uniqueidentifier;')
    db.close()
    # load_url presumably fills the module-level `pages` dict that
    # process_manager_tendencies() consumes — TODO confirm.
    with ThreadPoolExecutor(os.cpu_count()) as executor:
        for manager in managers:
            executor.submit(load_url, manager[0], manager[1])
    logger.log('\t\tTime = ' + time_converter(time.time() - start_time))
    # Turn the downloaded pages into the module-level `stats` dict.
    process_manager_tendencies(year)
    write_time = time.time()
    logger.log('\tWriting data to database')
    global stats
    # Persist only managers for whom at least one tendency was parsed.
    with ThreadPoolExecutor(os.cpu_count()) as executor2:
        for manager_team, tendencies in stats.items():
            if len(tendencies) > 0:
                executor2.submit(write_to_file, year, manager_team, tendencies)
    logger.log('\t\tTime = ' + time_converter(time.time() - write_time))
    total_time = time_converter(time.time() - start_time)
    driver_logger.log("\t\tTime = " + total_time)
    logger.log("Done storing manager tendencies: time = " + total_time + '\n\n')
def populate_teams_table(year):
    """Insert every team that played in `year` into the `teams` table.

    Reads the season's team list from background/yearTeams.txt, drops the
    teamId index for faster bulk inserts, writes one row per team, then
    restores the index.

    Fix: the apostrophe escape ``.replace("'", "\\'")`` was previously
    written as ``.replace("'", "\'")``, which is a no-op because the Python
    literal ``"\'"`` equals ``"'"`` — the intended SQL escaping never
    happened.
    """
    driver_logger.log('\tPopulating teams table')
    print("Populating teams table")
    start_time = time.time()
    logger.log('Begin populating teams table for ' + str(year) + ' || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    with open(os.path.join("..", "background", "yearTeams.txt"), 'rt') as file:
        db = DatabaseConnection(sandbox_mode)
        db.write('ALTER TABLE teams DROP INDEX teamId;')
        for line in file:
            if str(year) in line:
                # Line layout: year,<id>;<extra>,...,<trailer> — keep only the
                # team fields between the leading year and the trailer.
                temp_line = line.split(',')[1:-1]
                for team in temp_line:
                    team_id = team.split(';')[0]
                    # NOTE(review): values are concatenated into the SQL text;
                    # acceptable for this trusted local file, but a
                    # parameterized query would be safer.
                    db.write('insert into teams (teamId, teamName) values ("' + team_id + '", "' +
                             translate_team_name(team_id).replace("'", "\\'") + '");')
                break  # only one line matches the requested year
        db.write('ALTER TABLE teams ADD INDEX(teamId);')
        db.close()
    total_time = time.time() - start_time
    logger.log('Populating teams table completed: ' + time_converter(total_time))
    driver_logger.log('\t\tTime = ' + time_converter(total_time))
def rank_driver(year):
    """Compute per-year and overall (all-time) team ranks for offense,
    defense, and run differential, from `year` back to the oldest year on
    record, then persist them via team_ranker_ovr().
    """
    print("\n\ncalculating team ranks (year)")
    driver_logger.log("\tBeginning rank driver")
    start_time = time.time()
    logger.log("Beginning rank driver || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    logger.log("\tCalculating team ranks (year)")
    # Per-year results keyed by int year; the stdev dicts are keyed by
    # str(year) because team_ranker_ovr indexes them with str(year).
    runs = {}
    allowed = {}
    difference = {}
    standard_deviation_for = {}
    standard_deviation_against = {}
    standard_deviation_ovr = {}
    ws_winners = {}
    driver_logger.log("\t\tCalculating team ranks (year)")
    # Walk backwards from the requested year to the oldest recorded year.
    for data_year in range(year, get_oldest_year() - 1, -1):
        runs[data_year], allowed[data_year], difference[
            data_year] = team_ranker_year(data_year)
        # Spread of team run totals within this season; entries are
        # [team_id, value] pairs, so [1] picks the numeric value.
        standard_deviation_for[str(data_year)] = stdev(
            [team_runs_for[1] for team_runs_for in runs[data_year]])
        standard_deviation_against[str(data_year)] = stdev(
            [team_runs_against[1] for team_runs_against in allowed[data_year]])
        standard_deviation_ovr[str(data_year)] = stdev(
            [team_runs_diff[1] for team_runs_diff in difference[data_year]])
        ws_winners[data_year] = get_ws_winner(data_year)
    total_time = time_converter(time.time() - start_time)
    logger.log("\t\tTime = " + total_time)
    driver_logger.log("\t\t\tTime = " + total_time)
    second_time = time.time()
    driver_logger.log("\t\tCalculating team ranks (overall)")
    logger.log("\tCalculating team ranks (overall)")
    print("calculating team ranks (overall)")
    # NOTE(review): total_list is built here but never read afterwards —
    # looks like dead code left over from an earlier version.
    total_list = []
    years = [value for key, value in runs.items()]
    for ent in years:
        for team_total in ent:
            total_list.append(team_total[1])
    # Average season spread, used to weight each season's spread.
    average_deviation_for = mean(
        [value for key, value in standard_deviation_for.items()])
    average_deviation_against = mean(
        [value for key, value in standard_deviation_against.items()])
    average_deviation_diff = mean(
        [value for key, value in standard_deviation_ovr.items()])
    all_time_rpg = get_all_time_rpg()
    team_ranker_ovr(runs, True, "offRank_ovr", all_time_rpg,
                    standard_deviation_for, average_deviation_for)
    team_ranker_ovr(allowed, False, "defRank_ovr", all_time_rpg,
                    standard_deviation_against, average_deviation_against)
    # Only the overall rank gets the World Series winners (playoff bump).
    team_ranker_ovr(difference, True, "ovrRank_ovr", all_time_rpg,
                    standard_deviation_ovr, average_deviation_diff, ws_winners)
    second = time_converter(time.time() - second_time)
    logger.log("\t\tTime = " + second)
    driver_logger.log("\t\t\tTime = " + second)
    total_time = time_converter(time.time() - start_time)
    logger.log("Rank driver complete: time = " + total_time + '\n\n')
    driver_logger.log("\t\tRank driver time = " + total_time)
def get_pitch_fx_data(year, month=None, day=None):
    """Download pitch-fx data for a whole season, or for a single date.

    With only `year` given, walks every calendar day from the season's
    opening day through November and fetches each day's data; with `month`
    and `day` given, fetches just that date. Aggregation runs after the
    download in both cases. No pitch-fx data exists before 2008.
    """
    if year < 2008:
        driver_logger.log("\tNo pitch fx data to download before 2008")
        return
    start_time = time.time()
    if month is None and day is None:
        # Season mode: fetch every day from opening day onward.
        driver_logger.log("\tFetching " + str(year) + " pitch fx data")
        print("Fetching " + str(year) + " pitch fx data")
        logger.log("Downloading pitch fx data for " + str(year) + " || Timestamp: " +
                   datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        db = DatabaseConnection(sandbox_mode)
        # opening_day is a 'MM-DD' style string — TODO confirm schema.
        opening_day = db.read('select opening_day from years where year = ' +
                              str(year) + ';')[0][0]
        db.close()
        # NOTE: the loop variables deliberately reuse the `month`/`day`
        # parameter names (both are None on this branch, so nothing is lost).
        for month in range(3, 12, 1):
            # if month > 11:
            if month >= int(opening_day.split('-')[0]):
                for day in range(1, 32, 1):
                    # if day > 14:
                    # Skip days in the opening month that precede opening day.
                    if month == int(
                            opening_day.split('-')[0]) and int(day) < int(
                            opening_day.split('-')[1]):
                        continue
                    # Zero-pad day and month to two digits for the URL scheme.
                    if len(str(day)) == 1:
                        this_day = '0' + str(day)
                    else:
                        this_day = str(day)
                    if len(str(month)) == 1:
                        this_month = '0' + str(month)
                    else:
                        this_month = str(month)
                    get_day_data(this_day, this_month, str(year))
        logger.log("Done fetching " + str(year) + " pitch fx data: time = " +
                   time_converter(time.time() - start_time) + '\n\n\n\n')
        driver_logger.log("\t\tTime = " + time_converter(time.time() - start_time))
        aggregate_pitch_fx(year)
    else:
        # Single-date mode.
        driver_logger.log("\tFetching " + str(month) + "-" + str(day) + "-" +
                          str(year) + " pitch fx data")
        print("Fetching " + str(month) + "-" + str(day) + "-" + str(year) +
              " pitch fx data")
        logger.log("Downloading pitch fx data for " + str(month) + "-" + str(day) +
                   "-" + str(year) + " || Timestamp: " +
                   datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
        get_day_data(str(day), str(month), str(year))
        driver_logger.log("\t\tTime = " + time_converter(time.time() - start_time))
        aggregate_pitch_fx(year, month, day)
def simulate_game(game_num, away_team, away_team_info, away_year_info, home_team,
                  home_team_info, home_year_info, away_year, home_year, strike_zone):
    """Simulate one complete game and return a dict with its 'winner' team id.

    Sets both lineups (DH use decided by the home club's league rules), then
    plays innings until at least nine are complete and the game is not tied.
    """
    driver_logger.log("Starting game " + str(game_num) + " simulation: " +
                      away_team.get_team_id() + " @ " + home_team.get_team_id())
    start_time = time.time()
    logger.log("Starting game " + str(game_num) + " simulation: " +
               away_team.get_team_id() + " @ " + home_team.get_team_id())
    game_data = {}
    game = Game(away_team.get_team_id(), home_team.get_team_id())
    # League rules (e.g. DH) follow the home team's league.
    league = League(home_team.get_team_id(), home_team.get_year())
    lineup_time = time.time()
    logger.log("\tCreating lineups")
    # Rotation slot is derived from the game number within the series.
    away_pitcher = get_starting_pitcher(away_team.get_team_id(), away_year, game_num)
    home_pitcher = get_starting_pitcher(home_team.get_team_id(), home_year, game_num)
    away_team.set_lineup(away_pitcher, home_pitcher, use_dh=league.get_rules())
    home_team.set_lineup(home_pitcher, away_pitcher, use_dh=league.get_rules())
    logger.log("\t\t" + time_converter(time.time() - lineup_time))
    # Keep playing while fewer than nine innings are done, or extra innings
    # are needed to break a tie.
    while game.get_inning() <= 9 or game.get_away_score(
    ) == game.get_home_score():
        inning_data = simulate_inning(
            game, away_team_info, home_team_info, away_year_info, home_year_info,
            {
                # Away bats in the top half, home in the bottom half.
                'top': away_team.get_batting_order(),
                'bottom': home_team.get_batting_order()
            }, {
                'top': away_team.get_lineup_place(),
                'bottom': home_team.get_lineup_place()
            }, {
                # The pitcher facing each half belongs to the *other* club.
                'top': home_team.get_pitcher(),
                'bottom': away_team.get_pitcher()
            }, strike_zone, logger)
        # Carry lineup state and runs from the inning back onto the teams/game.
        away_team.set_batting_order(inning_data['top']['lineup'])
        home_team.set_batting_order(inning_data['bottom']['lineup'])
        away_team.set_lineup_place(inning_data['top']['place'])
        home_team.set_lineup_place(inning_data['bottom']['place'])
        game.increment_away_score(inning_data['top']['runs'])
        game.increment_home_score(inning_data['bottom']['runs'])
    game_data['winner'] = determine_winner(game, away_team, home_team)
    total_time = time_converter(time.time() - start_time)
    logger.log("Done simulating game: " + away_team.get_team_id() + " @ " +
               home_team.get_team_id() + ": Time = " + total_time + '\n\n')
    driver_logger.log("\tDone simulating game: " + away_team.get_team_id() + " @ " +
                      home_team.get_team_id() + ": Time = " + total_time)
    return game_data
def process_manager_tendencies(year):
    """Parse the downloaded manager pages (module-level `pages`) and fill the
    module-level `stats` dict with each manager-team's tendency values for
    `year`.
    """
    start_time = time.time()
    logger.log('\tProcessing manager tendencies')
    global pages
    global stats
    for manager_team, tendencies in pages.items():
        stats[manager_team] = {}
        # Column ids (data-stat attributes) worth extracting from the table.
        stats_to_consider = [
            'steal_2b_chances', 'steal_2b_attempts', 'steal_3b_chances',
            'steal_3b_attempts', 'sac_bunt_chances', 'sac_bunts', 'ibb_chances',
            'ibb', 'pinch_hitters', 'pinch_runners', 'pitchers_used_per_game'
        ]
        try:
            # Isolate the Managerial Tendencies table body, one <tr> per year.
            rows = str(tendencies).split('<h2>Managerial Tendencies</h2>'
                                         )[1].split('tbody>')[1].split('<tr>')
            for row in rows:
                try:
                    # Row's first link text is the season year; only keep the
                    # requested season.
                    if row.split('.shtml">')[1].split('</a>')[0] == str(year):
                        for stat in stats_to_consider:
                            for datum in row.split('<td'):
                                if stat in datum:
                                    # Extract the cell value for this data-stat.
                                    stats[manager_team][stat] = row.split(
                                        'data-stat="' + stat +
                                        '">')[1].split('</td>')[0]
                                    break
                        break  # requested year found; stop scanning rows
                except IndexError:
                    # Header/malformed row without the expected markup.
                    continue
        except IndexError:
            # Page without a Managerial Tendencies section.
            continue
    logger.log('\t\tTime = ' + time_converter(time.time() - start_time))
def simulate_inning(game, away_team_info, home_team_info, away_year_info,
                    home_year_info, lineup, place, pitcher, strike_zone,
                    driver_logger):
    """Simulate both halves of the current inning and return their results.

    Returns {'top': {...}, 'bottom': {...}} where each half holds whatever
    simulate_half_inning() produced. Advances the game's inning counter
    before returning.
    """
    inning_num = str(game.get_inning())
    driver_logger.log('\tInning ' + inning_num)
    start_time = time.time()
    inning = Inning()
    # Per-half views of the inputs: the away club bats in the top half and
    # pitches in the bottom half, and vice versa for the home club.
    batting_team_info = {'top': away_team_info, 'bottom': home_team_info}
    pitching_team_info = {'bottom': away_team_info, 'top': home_team_info}
    batting_year_info = {'top': away_year_info, 'bottom': home_year_info}  # pitchers' batting stats for a given year
    pitching_year_info = {'top': home_year_info, 'bottom': away_year_info}  # league pitch_fx data
    logger.log("Starting inning simulation: " + game.get_away_team() + " @ " +
               game.get_home_team() + " - " + inning_num)
    inning_data = {}
    for half in ('top', 'bottom'):
        inning.set_half_inning(half)
        half_result = simulate_half_inning(game, batting_team_info[half],
                                           pitching_team_info[half],
                                           batting_year_info[half],
                                           pitching_year_info[half], inning,
                                           lineup[half], place[half],
                                           strike_zone, pitcher[half])
        # Copy the half-inning result into this inning's data.
        inning_data[half] = dict(half_result)
    game.increment_inning()
    total_time = time_converter(time.time() - start_time)
    driver_logger.log('\t\tTime = ' + total_time)
    logger.log("Done simulating inning: " + game.get_away_team() + " @ " +
               game.get_home_team() + " - " + inning_num + ": Time = " +
               total_time + '\n\n')
    return inning_data
def all_star_finder(year, normal, driver_logger):
    """Scrape the year's All-Stars from baseball-reference and persist them.

    `normal` selects the source: True uses the All-Star Game page (both
    league rosters); False uses each league's "other leaders" page instead
    (for seasons without a normal game — e.g. only one table exists in 1945).
    """
    driver_logger.log("\tFinding " + str(year) + " all stars")
    start_time = time.time()
    logger.log("Finding All Stars || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    all_stars = []
    if normal:
        all_star_table = str(
            BeautifulSoup(
                urlopen('https://www.baseball-reference.com/allstar/' +
                        str(year) + '-allstar-game.shtml'), 'html.parser'))
        # Table 1 is the AL roster, table 2 the NL roster on this page.
        nl_table = all_star_table.split('<table>')[2].split('</table>')[0]
        al_table = all_star_table.split('<table>')[1].split('</table>')[0]
        all_stars += get_all_stars(nl_table, '<tr class="">')
        all_stars += get_all_stars(al_table, '<tr class="">')
        write_to_file(year, all_stars)
    else:
        leagues = ['NL', 'AL']
        for league in leagues:
            all_star_table = str(
                BeautifulSoup(
                    urlopen('https://www.baseball-reference.com/leagues/' +
                            league + '/' + str(year) + '-other-leaders.shtml'),
                    'html.parser'))
            all_star_table1 = all_star_table.split('<h2>League All-Stars</h2>')[1].split('<tbody>')[1].\
                split('</tbody')[0]
            all_stars += get_all_stars(all_star_table1, '<tr >')
            # 1945's page carries only a single All-Stars table.
            if year != 1945:
                all_star_table2 = all_star_table.split('<h2>League All-Stars</h2>')[2].split('<tbody>')[1].\
                    split('</tbody')[0]
                all_stars += get_all_stars(all_star_table2, '<tr >')
        write_to_file(year, all_stars)
    total_time = time_converter(time.time() - start_time)
    logger.log("All star finder complete: time = " + total_time)
    driver_logger.log("\t\tTime = " + total_time)
def mvp_cy_young(year, driver_logger):
    """Scrape the year's MVP and Cy Young winners from baseball-reference.

    Returns a dict with any of the keys 'nl_mvp', 'al_mvp', 'nl_cyYoung',
    'al_cyYoung' mapped to player ids.
    """
    driver_logger.log("\tFinding " + str(year) + " MVPs and Cy Youngs")
    start_time = time.time()
    logger.log("Finding " + str(year) + " MVPs and Cy Youngs || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    table = str(BeautifulSoup(urlopen('https://www.baseball-reference.com/awards/mvp_cya.shtml'),
                              'html.parser'))
    # One <tr valign=...> row per season between the header and table end.
    rows = table.split('</tr></thead>')[1].split('</table>')[0].split('<tr valign=')
    award_winners = {}
    for row in rows:
        try:
            # First cell is the season year; skip every other season's row.
            if str(year) == row.split('"top"><td>')[1].split('</td>')[0]:
                pass
            else:
                continue
        except IndexError:
            continue
        # For each award present, the winner is the last player link before
        # the award marker; the player id is the file stem of the link.
        if 'NLmvp' in row:
            award_winners['nl_mvp'] = \
                row.split('NLmvp')[0].split('<a href="/players/')[-1].split('/')[1].split('.shtml"')[0]
        if 'ALmvp' in row:
            award_winners['al_mvp'] = \
                row.split('ALmvp')[0].split('<a href="/players/')[-1].split('/')[1].split('.shtml"')[0]
        if 'NLcya' in row:
            award_winners['nl_cyYoung'] = \
                row.split('NLcya')[0].split('<a href="/players/')[-1].split('/')[1].split('.shtml"')[0]
        if 'ALcya' in row:
            award_winners['al_cyYoung'] = \
                row.split('ALcya')[0].split('<a href="/players/')[-1].split('/')[1].split('.shtml"')[0]
    total_time = time_converter(time.time() - start_time)
    logger.log("MVP and Cy Young finder complete: time = " + total_time)
    driver_logger.log("\t\tTime = " + total_time)
    return award_winners
def main(from_server, begin_year, end_year, frame=None):
    """Yearly driver entry point: import data for every season in
    [begin_year, end_year), then run the post-import steps.

    `begin_year` must be 1876 or later and strictly less than `end_year`;
    otherwise a message is printed and the process exits.
    """
    print('\n')
    # Guard clause: reject invalid year ranges up front.
    if not end_year > begin_year >= 1876:
        print(
            'Begin year must be lower than End year, but cannot be lower than 1876.'
        )
        exit()
    driver_logger.log(
        'Begin Yearly Driver || Timestamp: ' +
        datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    start_time = time.time()
    if not from_server:
        # Interactive mode: hide the Tk window while the import runs.
        frame.withdraw()
    league_table_constructor()
    manager_table_constructor()
    years = list(range(begin_year, end_year))
    for season in years:
        driver(season)
    create_strike_zone()
    # Post-import steps run once, against the most recent season imported.
    rank_driver(years[-1])
    comparisons_driver(years[-1])
    hof_finder()
    clean_up_deadlocked_file()
    auto_migrate()
    driver_logger.log('Driver complete for year' + stringify_list(years) +
                      ': time = ' + time_converter(time.time() - start_time) +
                      '\n\n\n')
def main(from_server, day, month, year, frame=None):
    """Daily driver entry point: import data for one specific date.

    Any exception from the import is logged, results are sent, and the
    exception re-raised. An invalid date prints a message, sends results,
    and exits.

    NOTE(review): reconstructed from a collapsed source line — the placement
    of the trailing send_results()/exit() inside the else branch mirrors the
    yearly main()'s shape; confirm against the original layout.
    """
    print('\n')
    if 0 < day <= 31 and 0 < month <= 12 and year >= 1876:
        try:
            driver_logger.log('Begin Daily Driver || Timestamp: ' +
                              datetime.datetime.today().
                              strftime('%Y-%m-%d %H:%M:%S'))
            start_time = time.time()
            if not from_server:
                # Interactive mode: hide the Tk window while the import runs.
                frame.withdraw()
            league_table_constructor()
            manager_table_constructor()
            driver(day, month, year)
            create_strike_zone()
            clean_up_deadlocked_file()
            auto_migrate()
            driver_logger.log('Driver complete for year ' + str(year) +
                              ': time = ' +
                              time_converter(time.time()-start_time) + '\n')
        except Exception as e:
            # Top-level boundary: record the failure, notify, then propagate.
            driver_logger.log("ERROR:\t" + str(e))
            send_results()
            raise e
    else:
        print('Must enter a valid date.')
        send_results()
        exit()
def driver(day, month, year):
    """Run every per-day import step for the given date, in their required
    order, logging the total elapsed time at the end.
    """
    date_label = str(month) + '/' + str(day) + '/' + str(year)
    driver_logger.log(date_label)
    driver_time = time.time()
    print('\n\n' + date_label)
    # Year-level steps that must run before the pitch-fx download.
    for step in (populate_teams_table, get_year_data, ballpark_and_manager_data,
                 league_standings, team_offensive_statistics,
                 team_defensive_statistics, batting_constructor,
                 pitching_constructor, fielding_constructor,
                 team_fielding_file_constructor,
                 team_pitching_rotation_constructor,
                 team_batting_order_constructor,
                 primary_and_secondary_positions,
                 determine_pitcher_roles_year):
        step(year)
    # The only step keyed to the specific calendar date.
    get_pitch_fx_data(year, month, day)
    # Remaining year-level steps, ending with the roster consolidation.
    for step in (hitter_tendencies, pitcher_tendencies, manager_tendencies,
                 hitter_spray_chart_constructor,
                 pitcher_spray_chart_constructor, team_certainties,
                 consolidate_data):
        step(year)
    driver_logger.log('Time taken to download ' + date_label + ' data: ' +
                      time_converter(time.time() - driver_time) + '\n')
def team_ranker_ovr(data, greater_than, field, all_time_rpg, standard_deviation,
                    average_deviation, playoff_data=None):
    """Compute overall (cross-year) team ranks for one stat family and persist
    them via write_to_file().

    data: {year: [[team_id, value], ...]} per-season team values.
    standard_deviation: {str(year): stdev} for the same seasons.
    average_deviation: mean of those per-season deviations.
    playoff_data: only consulted for the "ovrRank_ovr" field; entries whose
        value matches a team id grant a small multiplicative bump
        (NOTE(review): the 'ws_champ' key check mirrors the original; confirm
        the shape callers actually pass here).

    Fix: the "ovrRank_ovr" branch previously contained a redundant inner loop
    that rescanned the year's team list to find the entry it was already
    iterating over — O(n^2), and it would have emitted duplicate rows had a
    team id ever appeared twice. Each entry is now processed directly.
    """
    # Deliberately shadows the module logger: this routine writes to its own
    # dedicated log file.
    logger = Logger(os.path.join(log_prefix, "import_data", "team_ranker_ovr.log"))
    logger.log("Calculating overall team ranks: " + field)
    start_time = time.time()
    final_data = {}
    if field != "ovrRank_ovr":
        for year, value in data.items():
            final_data[year] = []
            for ent in value:
                # Normalize by the all-time runs-per-game, then weight by how
                # spread-out that season was relative to the average season
                # (divide for offense, multiply for defense).
                if field == "offRank_ovr":
                    final_data[year].append([
                        ent[0], (ent[1]/all_time_rpg) /
                        (standard_deviation[str(year)]/average_deviation)])
                else:
                    final_data[year].append([
                        ent[0], (ent[1]/all_time_rpg) *
                        (standard_deviation[str(year)]/average_deviation)])
    else:
        for year, value in data.items():
            final_data[year] = []
            for ent in value:
                # Accumulate a small multiplier for playoff accomplishments;
                # a World Series championship counts double.
                playoff_bump = 1.0
                for accomplishment, team_id in playoff_data.items():
                    if team_id == ent[0]:
                        if accomplishment == 'ws_champ':
                            playoff_bump += 0.005
                        playoff_bump += 0.005
                final_data[year].append([
                    ent[0],
                    (ent[1]/(standard_deviation[str(year)]/average_deviation)) *
                    playoff_bump])
    write_to_file(final_data, greater_than, field)
    total_time = time_converter(time.time() - start_time)
    logger.log("\tTime = " + total_time + '\n\n')
def moy_gatherer(year, driver_logger):
    """Scrape the year's Managers of the Year from baseball-reference.

    Returns a dict with keys 'NL_moy' and 'AL_moy' mapped to manager ids
    (empty if no row matches the year).
    """
    driver_logger.log("\tFinding " + str(year) + " managers of the year")
    start_time = time.time()
    logger.log("Finding " + str(year) + " managers of the year || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    award_winners = {}
    # The visible table is inside an HTML comment after the page shell; take
    # the commented-out section and split it into rows.
    table = str(BeautifulSoup(urlopen('https://www.baseball-reference.com/awards/manage.shtml'),
                              'html.parser')).\
        split('</thead></table></div></div></div></div></div></body></html>')[1].split('<!--')[0]
    rows = table.split('<tr>')
    for row in rows:
        try:
            # First cell is the season year; skip all other rows.
            if str(year) == row.split('<td valign="top">')[1].split('<')[0]:
                pass
            else:
                continue
        except IndexError:
            continue
        # Winner id = file stem of the last manager link before each marker.
        award_winners['NL_moy'] = row.split('NLmoy')[0].split(
            '<a href="/managers/')[-1].split('.shtml')[0]
        award_winners['AL_moy'] = row.split('ALmoy')[0].split(
            '<a href="/managers/')[-1].split('.shtml')[0]
    total_time = time_converter(time.time() - start_time)
    logger.log("Manager of the year finder complete: time = " + total_time)
    driver_logger.log("\t\tTime = " + total_time)
    return award_winners
def roy_gatherer(year, driver_logger):
    """Scrape the year's Rookies of the Year from baseball-reference.

    Returns a dict with any of the keys 'nl_roy' and 'al_roy' mapped to
    player ids.
    """
    driver_logger.log("\tFinding " + str(year) + " Rookies of the year")
    start_time = time.time()
    logger.log("Finding " + str(year) + " Rookies of the year || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    award_winners = {}
    table = str(BeautifulSoup(urlopen('https://www.baseball-reference.com/awards/roy_rol.shtml'),
                              'html.parser')).\
        split('</tr></thead>')[1].split('</table>')[0]
    rows = table.split('<tr ')
    for row in rows:
        try:
            # First cell is the season year; skip all other rows.
            if str(year) == row.split('valign="top"><td>')[1].split(
                    '</td>')[0]:
                pass
            else:
                continue
            # Winner id = file stem of the last player link before the marker.
            if 'NLroy' in row:
                award_winners['nl_roy'] = \
                    row.split('NLroy')[0].split('<a href="/players/')[-1].split('/')[1].split('.shtml"')[0]
            if 'ALroy' in row:
                award_winners['al_roy'] = \
                    row.split('ALroy')[0].split('<a href="/players/')[-1].split('/')[1].split('.shtml"')[0]
        except IndexError:
            # Header/malformed row without the expected markup.
            continue
    total_time = time_converter(time.time() - start_time)
    logger.log("Rookie of the year finder complete: time = " + total_time)
    driver_logger.log("\t\tTime = " + total_time)
    return award_winners
def team_certainties(year):
    """Compute and store a per-team data-certainty score for `year`.

    For each team, sums the "uncertain" plate appearances across its roster
    (pa * (1 - certainty) per player) and writes 1 - uncertain/total back to
    team_years.certainty.

    NOTE(review): the outer loop runs once for "batting" and once for
    "pitching", and each pass updates the same certainty column — so the
    pitching pass overwrites the batting result. Confirm this is intended.
    """
    print('aggregating team statistic certainties')
    driver_logger.log("\tAggregating team statistic certainties")
    start_time = time.time()
    logger.log("Calculating team certainties || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    db = DatabaseConnection(sandbox_mode)
    stat_types = ["batting", "pitching"]
    for stat_type in stat_types:
        ty_uids = db.read('select ty_uniqueidentifier, teamid from team_years where year = ' + str(year))
        for ty_uid in ty_uids:
            # pau accumulates the team's uncertain plate appearances.
            pau = 0
            player_list = list(db.read('select playerid from player_positions where ty_uniqueidentifier = ' +
                                       str(ty_uid[0]) + ';'))
            for player in player_list:
                pt_uid = db.read('select pt_uniqueidentifier from player_teams where playerid = "' + player[0] + '" and'
                                 ' teamid = "' + ty_uid[1] + '";')[0][0]
                try:
                    ent = db.read('select pa, certainty from player_' + stat_type + ' where year = ' + str(year) +
                                  ' and pt_uniqueidentifier = ' + str(pt_uid) + ';')
                    # pa minus the certain share of pa = uncertain pa.
                    pau += int(ent[0][0]) - (int(ent[0][0]) * float(ent[0][1]))
                except IndexError:
                    # Player has no row for this stat type this year.
                    continue
                except TypeError:
                    # pa or certainty is NULL; skip this player.
                    continue
            pa = int(db.read('select pa from team_years where ty_uniqueidentifier = ' + str(ty_uid[0]) + ';')[0][0])
            db.write('update team_years set certainty = ' + str((pa - pau) / pa) +
                     ' where ty_uniqueidentifier = ' + str(ty_uid[0]) + ';')
    db.close()
    total_time = time_converter(time.time() - start_time)
    logger.log("Done calculating team certainties: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def hof_finder():
    """Scrape the Hall of Fame list and record each inductee's year.

    Players update players.HOF, managers update managers.HOF; other
    induction categories (executives, umpires, ...) are skipped.
    """
    print("adding HOF data")
    driver_logger.log("\tAdding HOF data")
    start_time = time.time()
    logger.log("Begin finding hall of famers || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    hof_table = str(BeautifulSoup(urlopen('https://www.baseball-reference.com/awards/hof.shtml'),
                                  'html.parser')).\
        split('<tbody>')[1].split('</tbody>')[0]
    # Drop the fragment before the first row.
    rows = hof_table.split('<tr>')[1:]
    db = DatabaseConnection(sandbox_mode)
    for row in rows:
        # Inductee's site id, induction year, and category from row markup.
        person = row.split('data-append-csv="')[1].split('"')[0]
        year = row.split('<a href="/awards/hof_')[1].split('.shtml')[0]
        induction_type = row.split('data-stat="category_hof">')[1].split(
            '<')[0]
        if induction_type == 'Player':
            db.write('update players set HOF = ' + str(year) +
                     ' where playerId = "' + person + '";')
        elif induction_type == 'Manager':
            db.write('update managers set HOF = ' + str(year) +
                     ' where managerId = "' + person + '";')
        else:
            continue
    db.close()
    total_time = time_converter(time.time() - start_time)
    logger.log("Done finding hall of famers: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def get_day_data(day, month, year):
    """Download and parse every game's pitch-fx XML for one calendar date.

    Walks the gd2.mlb.com directory listing for the date, and for each game
    directory (links starting with 'gid') fetches players.xml, downloads
    each inning's XML concurrently, then parses and clears the local XML
    files. All three arguments are strings; day/month are zero-padded here.
    """
    if len(day) == 1:
        day = '0' + day
    if len(month) == 1:
        month = '0' + month
    logger.log("\tDownloading data for " + month + '-' + day + '-' + year)
    day_time = time.time()
    home_page_url = 'http://gd2.mlb.com/components/game/mlb/year_' + year + '/month_' + month + '/day_' + day
    home_page = str(BeautifulSoup(urlopen(home_page_url),
                                  'html.parser')).split('<li>')
    for line in home_page:
        try:
            # Only directory links of the form day_DD/gid_... are games.
            if str(line.split('"day_' + str(day) + '/')[1])[:3] == 'gid':
                # [:-6] strips the trailing 'day_DD' so the game's own
                # relative href can be appended to the base URL.
                if not get_data_from_this_game(home_page_url[:-6] + line.split(
                        '<a href="')[1].split('">')[0] + 'game.xml'):
                    continue
                # Module-level dict reset per game; filled by the load_xml
                # workers — TODO confirm against load_xml's definition.
                global innings
                innings = {}
                innings_url = home_page_url[:-6] + line.split(
                    '<a href="')[1].split('">')[0] + 'inning/'
                players_url = home_page_url[:-6] + line.split(
                    '<a href="')[1].split('">')[0] + 'players.xml'
                logger.log("\t\tDownloading data for game: " +
                           line.split('gid_')[1].split('_')[3] + '_' +
                           line.split('gid_')[1].split('_')[4] + ' - ' + innings_url)
                try:
                    innings_page = str(
                        BeautifulSoup(urlopen(innings_url),
                                      'html.parser')).split('<li>')
                    # Save the roster file locally for the parser to consume.
                    urlretrieve(
                        players_url,
                        os.path.join("..", "..", "baseball-sync", "src",
                                     "import_data", "player_data", "pitch_fx",
                                     "xml", "players.xml"))
                except Exception:
                    # Missing inning listing or roster: treat as no innings.
                    innings_page = []
                # Fetch each inning's XML concurrently.
                with ThreadPoolExecutor(os.cpu_count()) as executor:
                    for inning in innings_page:
                        try:
                            # Numeric inning files only (skips inning_Scores etc.).
                            if inning.split('<a href="inning_')[1].split(
                                    '.')[0].isdigit():
                                individual_inning_url = inning.split(
                                    '.xml"> ')[1].split('</a>')[0]
                                executor.submit(
                                    load_xml,
                                    innings_url + individual_inning_url,
                                    individual_inning_url.split('_')[1].split(
                                        '.xml')[0])
                        except IndexError:
                            continue
                parse_innings(year, month, day, innings_url)
                clear_xmls()
        except (IndexError, KeyError):
            # Non-game listing entry or malformed markup: clean up and move on.
            clear_xmls()
            continue
    logger.log("\tDone downloading data for " + month + '-' + day + '-' + year +
               ": time = " + time_converter(time.time() - day_time) + '\n\n')
def catcher_defense(year, logger):
    """Scrape the season's catcher-fielding table from baseball-reference and
    return it parsed via parse_table().
    """
    logger.log('\tDownloading catcher data')
    start_time = time.time()
    url = ('https://www.baseball-reference.com/leagues/MLB/' + str(year) +
           '-specialpos_c-fielding.shtml')
    html = str(BeautifulSoup(urlopen(url), 'html.parser'))
    # Isolate the table body after the section heading, one entry per <tr>.
    table_body = html.split('Player Fielding - C</h2>')[1].split('<tbody>')[1].split('</tbody>')[0]
    rows = table_body.split('<tr ')
    data = parse_table(rows)
    logger.log('\t\tTime = ' + time_converter(time.time() - start_time))
    return data
def primary_and_secondary_positions(year):
    """Determine each player's primary/secondary positions for `year`.

    Each player's position history over the trailing 25 seasons (inclusive)
    is aggregated and fed to determine_primary_position(); the result is
    persisted per player via write_to_file().

    Fix: the apostrophe escape on the player id was written as
    ``.replace("'", "\'")``, a no-op because the literal ``"\'"`` equals
    ``"'"``; it now emits a backslash-escaped quote so ids like O'Neill are
    escaped as evidently intended (NOTE(review): confirm write_to_file
    expects SQL-escaped ids).
    """
    print("adding primary and secondary positions")
    driver_logger.log("\tAdding primary and secondary positions")
    start_time = time.time()
    logger.log("Downloading " + str(year) + " primary and secondary data || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    db = DatabaseConnection(sandbox_mode)
    logger.log("\tAssembling list of players")
    assembly_time = time.time()
    # Team-year uids for the target season and for the trailing 25 seasons.
    teams_from_year = db.read(
        "select TY_uniqueidentifier from team_years where year=" + str(year) + ';')
    teams_from_year_range = db.read(
        "select TY_uniqueidentifier from team_years where year between " +
        str(year - 25) + ' and ' + str(year) + ';')
    player_positions = []
    player_positions_range = []
    for team in teams_from_year:
        player_positions += db.read(
            'select playerId, positions from player_positions where ' +
            'TY_uniqueidentifier = ' + str(team[0]) + ';')
    for team in teams_from_year_range:
        player_positions_range += db.read(
            'select playerId, positions from player_positions where ' +
            'TY_uniqueidentifier = ' + str(team[0]) + ';')
    logger.log("\t\tTime = " + time_converter(time.time() - assembly_time))
    logger.log("\tDetermining positions")
    determination_time = time.time()
    for player in player_positions:
        # The 25-year history drives the primary/secondary determination.
        player_position_string = get_player_positions(player, player_positions_range)
        player_positions_dict = determine_primary_position(player_position_string)
        write_to_file(player[0].replace("'", "\\'"), player_positions_dict)
    db.close()
    logger.log("\t\tTime = " + time_converter(time.time() - determination_time))
    total_time = time_converter(time.time() - start_time)
    logger.log("Done downloading primary and secondary positions: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def manager_table_constructor():
    """Scrape the all-time managers index from baseball-reference and
    (re)populate the managers table via write_to_file().

    Fixes:
    - The four apostrophe escapes were written as ``.replace("'", "\'")``,
      a no-op since the literal ``"\'"`` equals ``"'"``; they now emit a
      backslash-escaped quote as evidently intended.
    - The per-row guard caught only AttributeError, but the chained
      split()[i] lookups raise IndexError on malformed rows — both are now
      caught so a bad row is skipped instead of crashing the import.
    - The opening log message said "populating teams table" (copy-paste from
      populate_teams_table); corrected to describe this function.
    """
    driver_logger.log('\tGathering manager data (all-time)')
    print("Gathering manager data (all-time)")
    start_time = time.time()
    logger.log('Begin constructing manager table || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    table = str(
        bs(
            urllib.request.urlopen(
                'https://www.baseball-reference.com/managers/'),
            'html.parser'))
    rows = table.split('<tr')
    # Drop the index during the bulk insert, restore it afterwards.
    db = DatabaseConnection(sandbox_mode=True)
    db.write('ALTER TABLE managers DROP INDEX managerId;')
    db.close()
    with ThreadPoolExecutor(os.cpu_count()) as executor:
        for row in rows:
            # Only data rows carry the name cell marker.
            if '<td class="left" csk="' in row:
                this_row = row.split('</tr>')[0]
                try:
                    manager_id = this_row.split(
                        '<a href="/managers/')[1].split('.shtml')[0].replace(
                            "'", "\\'")
                    # csk attribute holds "Last, First".
                    last_first = this_row.split('</tr>')[0].split(
                        '<td class="left" csk="')[1].split('"')[0]
                    last = last_first.split(',')[0].replace("'", "\\'")
                    first = last_first.split(',')[1].replace("'", "\\'")
                    wins = this_row.split('data-stat="W">')[1].split('<')[0]
                    loses = this_row.split('data-stat="L">')[1].split('<')[0]
                    executor.submit(
                        write_to_file, '"' + manager_id + '","' + last +
                        '","' + first + '",' + wins + ',' + loses)
                except (AttributeError, IndexError):
                    # Malformed row: skip it, keep importing the rest.
                    continue
    db = DatabaseConnection(sandbox_mode=True)
    db.write('ALTER TABLE managers ADD INDEX(managerId);')
    db.close()
    total_time = time.time() - start_time
    logger.log('Constructing manager table completed: time = ' +
               time_converter(total_time))
    driver_logger.log('\t\tTime = ' + time_converter(total_time))
def team_batting_order_constructor(year):
    """Download and organize team batting-order data for `year` (1908+ only).

    Team pages are fetched concurrently (load_url presumably fills the
    module-level `pages` dict — TODO confirm), then get_hitters() organizes
    the batting orders.

    Fix: yearTeams.txt was opened but never closed; the handle is now
    managed with a `with` block so it is released even on error.
    """
    if year < 1908:
        logger.log("\tNo team batting order data to download before 1908.")
        driver_logger.log(
            "\tNo team batting order data to download before 1908.")
        return
    print("getting team batting order data")
    driver_logger.log("\tGetting team batting order data")
    start_time = time.time()
    global pages
    pages = {}
    logger.log("Downloading " + str(year) + " team batting order data || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    logger.log("\tDownloading team pages")
    try:
        year_file = open(os.path.join("..", "background", "yearTeams.txt"), 'r')
    except FileNotFoundError:
        # Fallback path when running from a deeper working directory.
        year_file = open(
            os.path.join("..", "..", "..", "background", "yearTeams.txt"), 'r')
    with year_file, ThreadPoolExecutor(os.cpu_count()) as executor:
        for line in year_file:
            if str(year) in line:
                # Line layout: year,<id>;<name>,...,<trailer>.
                temp_line = line.split(',')[1:-1]
                for team in temp_line:
                    # "TOT" is the league-total pseudo-team; skip it.
                    if "TOT" not in team:
                        executor.submit(load_url, year,
                                        team.split(';')[0],
                                        team.split(';')[1])
                break  # only one line matches the requested year
    logger.log("\t\t\tTime = " + time_converter(time.time() - start_time))
    logger.log("\tOrganizing batting orders")
    write_time = time.time()
    get_hitters(year)
    logger.log("\t\t\tTime = " + time_converter(time.time() - write_time))
    total_time = time_converter(time.time() - start_time)
    logger.log("Done downloading team batting order data: time = " + total_time + '\n\n')
    driver_logger.log("\t\tTime = " + total_time)
def pitcher_tendencies(year):
    """Download and store per-pitcher pitch-tendency data for one season.

    Tendency data only exists from 1988 on; earlier seasons are populated
    with defaults via fill_fields instead of being downloaded.

    :param year: season to process (int)
    """
    print("storing pitcher tendencies")
    start_time = time.time()
    logger.log("Downloading " + str(year) +
               " pitcher tendencies || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    if year < 1988:
        # No per-pitch data exists this far back; write defaults instead.
        driver_logger.log("\tNo pitcher tendency data before 1988")
        logger.log("\tNo pitcher tendency data before 1988")
        fill_fields(year)
    else:
        driver_logger.log("\tStoring pitcher tendencies")
        logger.log("\tDownloading data")
        url = ('https://www.baseball-reference.com/leagues/MLB/' +
               str(year) + '-pitches-pitching.shtml')
        page = str(BeautifulSoup(urlopen(url), 'html.parser'))
        # Isolate the player table body and split it into <tr rows.
        section = page.split('<h2>Player Pitching Pitches</h2>')[1]
        table_body = section.split('<tbody>')[1].split('</tbody>')[0]
        rows = table_body.split('<tr')
        logger.log("\t\tTime = " + time_converter(time.time() - start_time))
        logger.log("\tFormatting data")
        format_time = time.time()
        stat_dictionary = {}
        previous_id = ""
        for row in rows:
            pid, row_stats = intermediate(row, previous_id)
            if pid is None:
                continue
            stat_dictionary[pid] = row_stats
            previous_id = pid
        for pid, pitcher_stats in stat_dictionary.items():
            write_to_file(year, pid, pitcher_stats)
        fill_pitchers_with_0_pa(year)
        total_time = time_converter(time.time() - format_time)
        logger.log("\t\tTime = " + total_time)
        driver_logger.log("\t\tTime = " + total_time)
    logger.log("Done storing pitcher tendencies: time = " +
               time_converter(time.time() - start_time) + '\n\n')
def team_defensive_statistics(year):
    """Download team-level defensive (pitching-against) statistics for one season.

    Pulls the team standard-pitching and team batting-against tables from
    baseball-reference.com and passes their rows to extract_data along with a
    mapping from page stat names to database column names.

    :param year: season to download (int)
    """
    driver_logger.log("\tGathering team defensive statistics")
    print('Gathering team defensive statistics')
    start_time = time.time()
    logger.log('Downloading team defensive data for ' + str(year) +
               ' || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    page1 = str(
        BeautifulSoup(
            urlopen("https://www.baseball-reference.com/leagues/MLB/" +
                    str(year) + "-standard-pitching.shtml"), "html.parser"))
    # Best effort: the batting-against page/table does not exist for every
    # season, so a failed download or parse yields no rows rather than aborting.
    try:
        page2 = str(
            BeautifulSoup(
                urlopen("https://www.baseball-reference.com/leagues/MLB/" +
                        str(year) + "-batting-pitching.shtml"),
                "html.parser"))
        batting_against_rows = page2.split('Player Batting Against')[0].\
            split('<h2>Team Batting Against')[1].\
            split('<tbody>')[1].split('</tbody>')[0].split('<tr>')
    except Exception:
        batting_against_rows = []
    standard_pitching_rows = page1.split('Player Standard Pitching')[0].\
        split('<h2>Team Standard Pitching')[1].\
        split('<tbody>')[1].split('</tbody>')[0].split('<tr>')
    # Page stat name -> database column name.
    stats1 = {
        'R': 'RA',
        'ER': 'ER',
        'H': "HA",
        'HR': 'HRA',
        'BB': 'BBA',
        'HBP': 'HBPA',
        'IBB': 'IBBA',
        'SO': 'K',
        'ERA': 'ERA',
        'whip': 'WHIP'
    }
    stats2 = {
        'PA': 'PAA',
        'AB': 'ABA',
        '2B': '2BA',
        '3B': '3BA',
        'batting_avg': 'BAA',
        'onbase_perc': 'OBA',
        'slugging_perc': 'SLGA',
        'onbase_plus_slugging': 'OPSA',
        'batting_avg_bip': 'BABIPA'
    }
    extract_data(standard_pitching_rows, stats1, year)
    extract_data(batting_against_rows, stats2, year)
    total_time = time_converter(time.time() - start_time)
    # BUG FIX: corrected "donwloading" typo in the completion log message.
    logger.log("Done downloading team defensive data for " + str(year) +
               ': time = ' + total_time + '\n\n')
    driver_logger.log('\t\tTime = ' + total_time)
def triple_crown_winners(year, driver_logger):
    """Find the hitting and pitching triple crown winners for a season.

    Downloads the baseball-reference triple-crowns page, carves out the
    batting and pitching tables, and delegates winner extraction to
    get_winners.

    :param year: season to search (int)
    :param driver_logger: logger for top-level progress messages
    :return: tuple of (hitters, pitchers) as produced by get_winners
    """
    driver_logger.log("\tFinding " + str(year) + " triple crown winners")
    start_time = time.time()
    logger.log("Finding " + str(year) +
               " triple crown winners || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    page = str(
        BeautifulSoup(
            urlopen(
                'https://www.baseball-reference.com/awards/triple_crowns.shtml'
            ), 'html.parser'))
    # Everything between the table caption and its closing tag, per category.
    tables = {
        label:
        page.split(label + ' Triple Crowns Table')[1].split('</table>')[0]
        for label in ('Batting', 'Pitching')
    }
    hitters = get_winners(year, tables['Batting'], "hitting")
    pitchers = get_winners(year, tables['Pitching'], "pitching")
    total_time = time_converter(time.time() - start_time)
    logger.log("Triple crown finder complete: time = " + total_time)
    driver_logger.log("\t\tTime = " + total_time)
    return hitters, pitchers
def ballpark_and_manager_data(year):
    """Download ballpark and manager data for every team in a season.

    Reads the season's team list from yearTeams.txt, downloads each team page
    concurrently via load_url, then computes and writes ballpark numbers and
    downloads manager data per team via gather_team_home_numbers.

    :param year: season to process (int)
    """
    driver_logger.log('\tGathering ballpark and manager data')
    print("Gathering ballpark and manager data")
    start_time = time.time()
    global pages
    pages = {}
    logger.log('Beginning ballpark and manager data download for ' +
               str(year) + ' || Timestamp: ' +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    # Map of team abbreviation -> team id for the requested season.
    teams = {}
    with open(os.path.join("..", "background", "yearTeams.txt"),
              'rt') as file:
        for line in file:
            if str(year) not in line:
                continue
            for entry in line.split(',')[1:-1]:
                pieces = entry.split(';')
                if 'TOT' not in pieces:
                    teams[pieces[1]] = pieces[0]
            break
    logger.log('Begin downloading team pages')
    download_time = time.time()
    with ThreadPoolExecutor(os.cpu_count()) as downloader:
        for abbreviation in teams:
            downloader.submit(load_url, year, abbreviation)
    logger.log('\tDone downloading team pages: time = ' +
               time_converter(time.time() - download_time))
    logger.log(
        "Calculating and writing ballpark numbers and downloading images")
    calc_and_download_time = time.time()
    team_count = len(teams)
    with ThreadPoolExecutor(os.cpu_count()) as calculator:
        for abbreviation, team_id in teams.items():
            calculator.submit(gather_team_home_numbers, team_id,
                              abbreviation, year, team_count)
    logger.log("\tDone calculating and writing ballpark numbers and "
               "downloading manager data: time = " +
               time_converter(time.time() - calc_and_download_time))
    total_time = time_converter(time.time() - start_time)
    logger.log('Ballpark and manager data download completed: time = ' +
               total_time + '\n\n')
    driver_logger.log('\t\tTime = ' + total_time)
def team_fielding_file_constructor(year):
    """Download and organize team fielding-position data for one season.

    Team pages are fetched concurrently via load_url, then write_to_file(year)
    organizes the downloaded pages into the fielding file.

    :param year: season to download (int)
    """
    print('getting team fielding positions')
    driver_logger.log("\tGetting team fielding positions")
    start_time = time.time()
    global pages
    pages = {}
    logger.log("Downloading " + str(year) +
               " team fielding positions || Timestamp: " +
               datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    logger.log("\tDownloading team pages")
    # The background file lives at a different relative depth depending on the
    # working directory, so fall back to the deeper path.
    try:
        year_file = open(os.path.join("..", "background", "yearTeams.txt"),
                         'r')
    except FileNotFoundError:
        year_file = open(
            os.path.join("..", "..", "..", "background", "yearTeams.txt"),
            'r')
    # BUG FIX: the original only closed year_file when the year line was found
    # (the close sat inside the matching branch); the context manager closes
    # the handle on every path, including when the year is absent or a submit
    # raises.
    with year_file:
        with ThreadPoolExecutor(os.cpu_count()) as executor:
            for line in year_file:
                if str(year) in line:
                    temp_line = line.split(',')[1:-1]
                    for team in temp_line:
                        split_team = team.split(';')
                        if "TOT" not in split_team:
                            executor.submit(load_url, year, split_team[0],
                                            split_team[1])
                    break
    logger.log("\t\tTime = " + time_converter(time.time() - start_time))
    logger.log("\tOrganizing team position data")
    write_time = time.time()
    write_to_file(year)
    logger.log("\t\tTime = " + time_converter(time.time() - write_time))
    total_time = time_converter(time.time() - start_time)
    logger.log("Done downloading team fielding data: time = " + total_time +
               '\n\n')
    driver_logger.log("\t\tTime = " + total_time)