def update_data(date, team):
    """Load game data using pybaseball's scraper.

    Uses pybaseball.statcast and pybaseball.playerid_reverse_lookup to load
    data from a specific game, and stores it as json in the user's browser
    session in a hidden div.
    """
    print("Loading data from statcast... ")
    raw = statcast(start_dt=date, end_dt=date, team=team)
    raw = raw.rename(columns={"player_name": "pitcher_name"})

    print("Adding batter names... ")
    # Resolve each unique MLBAM batter id to a human-readable name.
    batters = playerid_reverse_lookup(raw["batter"].unique(), key_type="mlbam")
    first = batters["name_first"].str.capitalize()
    last = batters["name_last"].str.capitalize()
    batters["batter_name"] = first + " " + last

    raw = raw.merge(
        batters[["key_mlbam", "batter_name"]],
        how="left",
        left_on="batter",
        right_on="key_mlbam",
    )
    print("Done.")
    return raw.to_json(date_format="iso", orient="split")
def finalize(input_files):
    """Post-process aggregate CSVs: resolve player names and add a percentage.

    For each input file, writes ``<name>_final.csv`` with the player's name
    (via pybaseball.playerid_reverse_lookup), position, total_bases,
    opportunities and total_bases/opportunities rounded to 3 places.
    Rows with an empty player_id are skipped.
    """
    header_dict = {"last_name": 1, "first_name": 1, "position": 1,
                   "total_bases": 1, "opportunities": 1,
                   "percentage": 1, "player_id": 1}
    for file in input_files:
        output_file = file[:-4] + "_final.csv"
        # NOTE(review): append mode means re-running duplicates the header
        # and all rows -- confirm 'a' (vs 'w') is intended.
        with open(file) as file_r, open(output_file, 'a') as file_w:
            reader = csv.DictReader(file_r)
            writer = csv.DictWriter(file_w, header_dict.keys())
            writer.writeheader()
            for old_row in reader:
                if old_row['player_id'] == "":
                    continue
                lookup = int(float(old_row['player_id']))
                # One lookup per row. Renamed from `data`, which shadowed
                # the input rows and made the loop confusing to read.
                names = playerid_reverse_lookup([lookup], key_type="mlbam")
                new_row = {
                    'player_id': lookup,
                    'position': old_row['position'],
                    'last_name': names['name_last'][0],
                    'first_name': names['name_first'][0],
                    'total_bases': old_row['total_bases'],
                    'opportunities': old_row['opportunities'],
                }
                # NOTE(review): raises ZeroDivisionError if opportunities
                # is 0 -- confirm upstream guarantees it is positive.
                percentage = int(new_row['total_bases']) / int(new_row['opportunities'])
                new_row['percentage'] = round(percentage, 3)
                writer.writerow(new_row)
        # The `with` block closes both files; the original's explicit
        # close() calls were redundant.
def make_efp_json(statcast):
    """Build a JSON blob of per-pitcher stats joined with EFP scores and names.

    Parameters:
        statcast -- raw statcast pitch-level DataFrame.
    Returns:
        Column-oriented JSON of the pitcher frame merged with an "EFP"
        column and name_last/name_first, indexed by key_mlbam.
    """
    # Build the pitcher frame and EFP series exactly once -- the original
    # computed make_pitcher_df/make_efp_series twice each.
    p_df = make_pitcher_df(statcast)
    efp_s = make_efp_series(p_df).rename("EFP")
    # The series index holds the pitchers' MLBAM ids; no manual copy loop.
    pitcher_ids = playerid_reverse_lookup(list(efp_s.index))
    combined = p_df.merge(efp_s.to_frame(), left_index=True, right_index=True)
    with_ids = combined.merge(
        pitcher_ids[["name_last", "name_first", "key_mlbam"]],
        left_index=True,
        right_on="key_mlbam").set_index("key_mlbam", drop=True)
    return with_ids.to_json(orient="columns")
def pitcher(self, name, team):
    """Look up a pitcher by name/team and classify his pitches with k-means.

    Returns (mlbam id, {cluster id: % of pitches thrown, for clusters used
    more than 5% of the time}, throwing hand 'R'/'L'). Requires self.fgp
    (fangraphs pitching frame) and the fitted scalerR/modelR and
    scalerL/modelL pairs.
    """
    # Features fed to the scaler + k-means model.
    Xcols = ['pfx_x', 'pfx_z', 'release_speed', 'release_spin_rate']
    fgp = self.fgp
    # Case-insensitive match on name and team to find the fangraphs id.
    player = fgp[(fgp.Name.str.lower() == name.lower())
                 & (fgp.Team.str.lower() == team.lower())].playerid
    # Translate the fangraphs id to an MLBAM id for the statcast query.
    pid = int(playerid_reverse_lookup(player, 'fangraphs').key_mlbam)
    pitch = statcast_pitcher(start_dt='2015-03-28', end_dt='2019-09-29',
                             player_id=pid)
    # Choose the model trained for the pitcher's throwing hand; anyone who
    # is not exclusively right-handed falls into the left-handed bucket.
    if set(pitch.p_throws) == {'R'}:
        throws = 'R'
        scaler = self.scalerR
        kmeans = self.modelR
    else:
        throws = 'L'
        scaler = self.scalerL
        kmeans = self.modelL
    pitch.dropna(subset=Xcols, inplace=True)
    pitch.reset_index(drop=True, inplace=True)
    pitch['p_type'] = kmeans.predict(scaler.transform(pitch[Xcols]))
    pitchdict = {}
    for i in range(13):
        # Clusters 7/12 (righties) and 0/4 (lefties) are folded together --
        # presumably the same pitch type split across two clusters (TODO:
        # confirm against the clustering notebook); the duplicate slot is
        # zeroed out by using an empty list.
        if throws == 'R':
            if i == 7:
                pitchernum = pitch[(pitch.p_type == 7) | (pitch.p_type == 12)]
            elif i == 12:
                pitchernum = []
            else:
                pitchernum = pitch[pitch.p_type == i]
        else:
            if i == 0:
                pitchernum = pitch[(pitch.p_type == 0) | (pitch.p_type == 4)]
            elif i == 4:
                pitchernum = []
            else:
                pitchernum = pitch[pitch.p_type == i]
        cutoff = len(pitchernum) / len(pitch)
        # Only report pitch types used more than 5% of the time.
        if cutoff > (1 / 20):
            pitchdict[i] = round((cutoff * 100), 1)
    return pid, pitchdict, throws
def batter(self, name, team, throws='R'):
    """Look up a batter and summarize his results against each pitch cluster.

    Returns (mlbam id, {cluster id: [pitch count, wOBA]}) versus pitchers
    of the given hand ('R' or 'L'). Requires self.fgh (fangraphs hitting
    frame) and the fitted scaler/model pairs for each hand.
    """
    # Features fed to the scaler + k-means model.
    Xcols = ['pfx_x', 'pfx_z', 'release_speed', 'release_spin_rate']
    # Choose the model trained on the requested pitcher hand.
    if throws == 'R':
        scaler = self.scalerR
        kmeans = self.modelR
    else:
        scaler = self.scalerL
        kmeans = self.modelL
    fgh = self.fgh
    # Case-insensitive name/team match to find the fangraphs id.
    player = fgh[(fgh.Name.str.lower() == name.lower())
                 & (fgh.Team.str.lower() == team.lower())].playerid
    pid = int(playerid_reverse_lookup(player, 'fangraphs').key_mlbam)
    bat = statcast_batter(start_dt='2015-03-28', end_dt='2019-09-29',
                          player_id=pid)
    bat.dropna(subset=Xcols, inplace=True)
    bat.reset_index(drop=True, inplace=True)
    # Keep only pitches from the requested pitcher hand.
    bat = bat[bat.p_throws == throws]
    bat['p_type'] = kmeans.predict(scaler.transform(bat[Xcols]))
    batdict = {}
    for i in range(12):
        # Clusters 7/12 (vs RHP) and 0/4 (vs LHP) are folded together and
        # the duplicate slot skipped. NOTE(review): range(12) never reaches
        # i == 12, so that branch is dead -- compare with pitcher(), which
        # loops range(13).
        if throws == 'R':
            if i == 7:
                batnum = bat[(bat.p_type == 7) | (bat.p_type == 12)]
            elif i == 12:
                continue
            else:
                batnum = bat[bat.p_type == i]
        else:
            if i == 0:
                batnum = bat[(bat.p_type == 0) | (bat.p_type == 4)]
            elif i == 4:
                continue
            else:
                batnum = bat[bat.p_type == i]
        batdict[i] = [len(batnum)]
        # wOBA for this cluster = sum of wOBA values / sum of denominators.
        # NOTE(review): yields NaN (0/0) when the batter saw no pitches of
        # this cluster -- confirm downstream handles NaN.
        batdict[i] += [
            round((np.sum(batnum.woba_value) / np.sum(batnum.woba_denom)), 3)
        ]
    return pid, batdict
def calc_batting_fd_score(self, start_date='2015-04-01',
                          end_date='2018-07-19', preload=True,
                          write_csv=False, path2017="", path2018=""):
    """Merge bbref and statcast batting data and score each game for FanDuel.

    Parameters:
        start_date / end_date -- statcast pull window and final date filter.
        preload, write_csv, path2017, path2018 -- forwarded to self.load_data.
    Returns:
        DataFrame of per-game batting stats with an 'fd_score' column and a
        'roto_game_id' join key.
    Note:
        There are no dates for bbref data because the scraping system has
        what it has; as we scrape more we can hone in on seasons using the
        date variables.
    """
    # PART 1 - pull in bbref data and store as a df to be merged later
    batting_df, pitching_df = self.load_data(path2017=path2017,
                                             path2018=path2018,
                                             preload=preload,
                                             write_csv=write_csv)
    # PART 2 - get statcast data, preferring the on-disk cache
    try:
        print("Accessing statcast_cache...")
        print("If dates are missing try rebuilding cache...")
        statcast_input_frame = pd.read_csv('statcast_cache.csv')
    # Fix: was a bare `except:` that hid every error type (e.g. a corrupt
    # cache would silently trigger a full re-download).
    except FileNotFoundError:
        print("Getting raw statcast data...")
        statcast_input_frame = self.pull_raw_statcast_data(
            start_date=start_date, end_date=end_date)
        statcast_input_frame.to_csv('statcast_cache.csv')
    # Fix: 'hit_by_pitch' was listed twice in the original.
    events_worth_points = [
        'single', 'double', 'triple', 'walk', 'hit_by_pitch', 'home_run'
    ]
    statcast_df = statcast_input_frame[statcast_input_frame['events'].isin(
        events_worth_points)]
    # Unique batter keys (prevents repeat occurrences).
    player_list = list(statcast_df['batter'].unique().astype(int))
    # Lookup keys to get each player's various ids (mlb, bbref, etc.).
    player_id_values = playerid_reverse_lookup(player_list, key_type='mlbam')
    cols_to_merge = [
        'name_last', 'name_first', 'key_mlbam', 'key_bbref', 'key_fangraphs',
        'key_retro'
    ]
    statcast_df_2 = statcast_df.merge(player_id_values[cols_to_merge],
                                      how='inner',
                                      left_on='batter',
                                      right_on='key_mlbam')
    # Stadium codes join on "home team" to determine where the game was played.
    try:
        stadium_codes = pd.read_csv(self.key_join_path)
    except FileNotFoundError:
        print("Couldn't find baseball_key_joiner.csv in the same directory.")
        # Fix: the original fell through and crashed with a NameError on
        # stadium_codes below; re-raise so the caller sees the real problem.
        raise
    statcast_df_3 = statcast_df_2.merge(stadium_codes,
                                        how='left',
                                        left_on='home_team',
                                        right_on='team_abbr')
    # Ad hoc key: date + stadium + bbref key (statcast has no 'start_time').
    statcast_df_3['game_date'] = pd.to_datetime(statcast_df_3['game_date'])
    statcast_df_3['game_date'] = statcast_df_3['game_date'].astype(str)
    statcast_df_3['stadium'] = statcast_df_3['stadium'].astype(str)
    statcast_df_3['game_id'] = statcast_df_3['game_date'] + \
        statcast_df_3['stadium'] + \
        statcast_df_3['key_bbref'].astype(str)
    print("Aggregating data...")
    # Count events per game_id/event type, then unstack into one column
    # per event.
    batter_agg = statcast_df_3.groupby(
        ['batter', 'home_team', 'game_date', 'game_id', 'events']).size() \
        .unstack(fill_value=0)
    batter_agg2 = batter_agg.reset_index()
    # Aggregate fan duel scoring events. Fix: the original dict literal had
    # 'hit_by_pitch' twice (duplicate keys silently collapse).
    batter_agg3 = batter_agg2.groupby(
        ['batter', 'home_team', 'game_date', 'game_id']).agg({
            'hit_by_pitch': 'sum',
            'home_run': 'sum',
            'single': 'sum',
            'double': 'sum',
            'triple': 'sum',
            'walk': 'sum'
        })
    statcast_data = batter_agg3.reset_index()
    print("Merging bbref and statcast data...")
    # Merge statcast and bbref (statcast has +1 year of data with no bbref
    # values, hence the left join and NaN handling below).
    batter_dataframe_final = batting_df.merge(statcast_data,
                                              how='left',
                                              left_on='game_id',
                                              right_on='game_id')
    batter_dataframe_final.drop(columns=['home_team_y', 'game_date_y'],
                                inplace=True)
    batter_dataframe_final.rename(columns={
        "home_team_x": "home_team",
        "game_date_x": "game_date"
    }, inplace=True)
    # Score game performance.
    batter_dataframe_final['fd_score'] = batter_dataframe_final.apply(
        self.fd_batting_score, axis=1)
    # NaN after the join means the player had at least one at-bat but
    # generated no FD scoring event -- that is worth zero points.
    batter_dataframe_final[[
        'hit_by_pitch', 'home_run', 'single', 'double', 'triple'
    ]] = batter_dataframe_final[[
        'hit_by_pitch', 'home_run', 'single', 'double', 'triple'
    ]].fillna(value=0)
    batter_dataframe_final = batter_dataframe_final[
        (batter_dataframe_final['game_date'] < end_date)
        & (batter_dataframe_final['game_date'] > start_date)]
    # Fall back to bbref 'BB' when 'walk' failed to join from statcast.
    batter_dataframe_final['walk'] = np.where(
        batter_dataframe_final['walk'].isnull(),
        batter_dataframe_final['BB'], batter_dataframe_final['walk'])
    batter_dataframe_final[
        'roto_game_id'] = batter_dataframe_final['game_date'].astype(
            str) + batter_dataframe_final['player'].astype(str)
    print("Batting FD Score calculated! Returning data..")
    return batter_dataframe_final
def calc_fd_scores_roto(self, start_date='2015-04-01', end_date='2018-07-19',
                        preload=True, write_csv=False, path2017="",
                        path2018=""):
    """Attach rotoguru FanDuel points to the bbref batting/pitching frames.

    Parameters:
        start_date / end_date -- currently unused here (TODO(review):
            either apply them as a date filter or drop them).
        preload, write_csv, path2017, path2018 -- forwarded to self.load_data.
    Returns:
        (batting_df, pitching_df) with 'FD_points' and 'Pos' merged on;
        pitchers are removed from batting_df.
    """
    # The dates in rotoguru are in a weird format and are cleaned below.
    # PART 1 - pull in bbref data. Fix: the original hard-coded
    # preload=True, write_csv=False and empty paths, silently ignoring the
    # caller's arguments; forward them instead.
    batting_df, pitching_df = self.load_data(preload=preload,
                                             write_csv=write_csv,
                                             path2017=path2017,
                                             path2018=path2018)
    # Our batting and pitching df need to match on this new game_id.
    batting_df[
        'roto_game_id'] = batting_df['game_date'] + batting_df['player']
    pitching_df[
        'roto_game_id'] = pitching_df['game_date'] + pitching_df['player']
    print("Loading rotoguru data..")
    try:
        rotoguru = pd.read_csv("roto_data_2015-2018.csv")
    except FileNotFoundError:
        print("Couldn't find the rotoguru csv!")
        # Fix: the original continued and crashed with a NameError on
        # `rotoguru`; re-raise so the failure is explicit.
        raise
    # Match rotoguru to baseball reference via the MLBAM -> bbref key map.
    print("Getting bbref key to merge rotoguru and bbref data")
    unique_players = list(rotoguru['MLB_ID'].unique())
    lookup = playerid_reverse_lookup(unique_players)
    rotoguru = pd.merge(rotoguru,
                        lookup[['key_mlbam', 'key_bbref']],
                        left_on='MLB_ID',
                        right_on='key_mlbam')
    print("Cleaning up rotoguru dates..")
    # Make the rotoguru dates match the bbref date format.
    rotoguru['game_date'] = rotoguru.apply(self.clean_rotoguru_dates, axis=1)
    rotoguru.drop('Date', axis=1, inplace=True)
    print("Merging bbref and rotoguru data to get FD scores")
    # Unique id to merge rotoguru and bbref data.
    rotoguru[
        'roto_game_id'] = rotoguru['game_date'] + rotoguru['key_bbref']
    print("Batting df pre merge: ", batting_df.shape)
    rotoguru = rotoguru[['roto_game_id', 'FD_points', 'Pos']]
    batting_df = pd.merge(batting_df, rotoguru, on='roto_game_id')
    print("Batting df post merge: ", batting_df.shape)
    print("Pitching df pre merge: ", pitching_df.shape)
    pitching_df = pd.merge(pitching_df, rotoguru, on='roto_game_id')
    print("Pitching df post merge: ", pitching_df.shape)
    # Remove pitchers from the batting frame -- they have their own df.
    batting_df = batting_df[batting_df['Pos'] != 'P']
    batting_df.dropna(inplace=True)
    return batting_df, pitching_df
def main():
    """Incrementally update the local at-bat pickle from statcast and push
    per-player hit/AB history plus upcoming 2018 schedules to Google Sheets.
    """
    pth = "/Users/irarickman/Google Drive/Data Science/Projects/MLB Projections/Batting Average"
    td = format(datetime.today(), "%Y-%m-%d")
    old_data = pd.read_pickle(pth + "/lastabs.pkl")
    old_data.game_date = pd.to_datetime(old_data.game_date,
                                        infer_datetime_format=True)
    # Last date already cached; only pull statcast data newer than this.
    prev_date = old_data.game_date.max()
    od = format(prev_date, "%Y-%m-%d")
    if od != td:
        new_d = statcast(od, td)
        # Keep only pitches that ended a plate appearance.
        new_data = new_d[new_d.events.notnull()]
        players_ids = playerid_reverse_lookup(new_data.batter.unique())
        id_df = players_ids[['name_last', 'name_first', 'key_mlbam']]
        new_names = new_data.merge(id_df,
                                   how='left',
                                   left_on='batter',
                                   right_on='key_mlbam')
        df = pd.concat([old_data, new_names])
    else:
        df = old_data
    # Re-pulling the overlap day can duplicate rows; drop them before caching.
    df.drop_duplicates(inplace=True)
    df.to_pickle(pth + "/lastabs.pkl")
    # 1 if the plate appearance ended in a hit.
    df['hit'] = df.events.apply(
        lambda x: 1 if x in ["single", 'double', 'home_run', 'triple'] else 0)
    # Events that do NOT count as an official at-bat.
    df['ab'] = df.events.apply(lambda x: 0 if x in [
        'walk', 'hit_by_pitch', "caught_stealing_2b",
        "pickoff_caught_stealing_2b", 'pickoff_1b', 'catcher_interf',
        'pickoff_caught_stealing_3b', 'pickoff_2b',
        'pickoff_caught_stealing_home', 'caught_stealing_3b',
        'caught_stealing_home', "sac_fly", 'sac_bunt', 'sac_fly_double_play',
        'sac_bunt_double_play'
    ] else 1)
    # Infer the batter's team / opponent / venue from the inning half:
    # batters hit in the bottom half at home.
    df['player_team'] = df.apply(lambda x: x.home_team
                                 if x.inning_topbot == "Bot" else x.away_team,
                                 axis=1)
    df['Opp'] = df.apply(lambda x: x.away_team
                         if x.inning_topbot == "Bot" else x.home_team,
                         axis=1)
    df['Place'] = df.apply(lambda x: "Home"
                           if x.inning_topbot == "Bot" else "Away",
                           axis=1)
    teams = df.player_team.unique()
    # statcast and bbref abbreviations differ for these clubs.
    fixers = {"WSH": "WSN", "CWS": "CHW"}
    teams_fixed = [x if x not in fixers.keys() else fixers[x] for x in teams]
    team_schedule = {}
    missed = []
    for t in teams_fixed:
        try:
            d = schedule_and_record(2018, t)
            # Strip doubleheader markers like "(1)" and append the year so
            # the date string parses.
            d['fix_date'] = d.Date.str.replace("\(\d\)",
                                               "").str.strip() + " 2018"
            d['game_date'] = pd.to_datetime(d.fix_date.apply(
                lambda x: datetime.strptime(x, "%A, %b %d %Y")).apply(
                    lambda x: x.strftime("%Y-%m-%d")),
                                            infer_datetime_format=True)
            d['Place'] = d.Home_Away.apply(lambda x: "Home"
                                           if x == "Home" else "Away")
            # Keep only future games (the remaining schedule).
            d2 = d[d.game_date >= datetime.today()][[
                'Place', "Opp", "game_date"
            ]]
            team_schedule[t] = d2
        except ValueError:
            # schedule_and_record raises ValueError for unknown teams;
            # record and continue.
            print(t)
            missed.append(t)
    df['name_last'] = df['name_last'].str.capitalize()
    df['name_first'] = df['name_first'].str.capitalize()
    df['player_name'] = df.name_first + " " + df.name_last
    sm_df = df[[
        'game_date', 'game_pk', 'hit', 'ab', 'Opp', 'Place', 'player_name',
        'player_team', 'key_mlbam'
    ]]
    sm_df.sort_values(['player_name', 'game_date', 'game_pk'], inplace=True)
    # Aggregate to one row per player-game (sums hit/ab across pitches).
    trim_df = sm_df.groupby([
        'player_name', 'game_date', 'game_pk', 'Opp', 'Place', 'player_team',
        'key_mlbam'
    ]).sum().reset_index()

    def player_df(player, d=trim_df):
        # Append the player's remaining team schedule to his game history,
        # filling schedule-only rows with zeros / his identity.
        temp = d[d.player_name == player]
        temp = temp.sort_values(['game_date']).reset_index(drop=True)
        # Most recent team the player appeared for.
        tm = temp.loc[len(temp) - 1, 'player_team']
        if tm in fixers.keys():
            sched = team_schedule[fixers[tm]]
        else:
            sched = team_schedule[tm]
        tdf = pd.concat([temp, sched])
        tdf.ab.fillna(0, inplace=True)
        tdf.hit.fillna(0, inplace=True)
        tdf.player_name.fillna(player, inplace=True)
        tdf.player_team.fillna(tm, inplace=True)
        return tdf

    master_df = player_df(trim_df.player_name.unique()[0])
    for p in trim_df.player_name.unique()[1:]:
        got = player_df(p)
        master_df = pd.concat([master_df, got])
    master_df.game_date = master_df.game_date.apply(
        lambda x: format(x, "%Y-%m-%d"))
    ## now write to the google sheet
    # authorization
    gc = pygsheets.authorize(outh_file='/Users/irarickman/client_secret.json')
    mlb = 'MLB At Bats'
    sh = gc.open(mlb)
    # select the first sheet
    wks = sh[0]
    wks.set_dataframe(master_df, (1, 1))
from luigi import Task, LocalTarget, IntParameter, Parameter
from pybaseball import pitching_stats, batting_stats, playerid_reverse_lookup, schedule_and_record, home_games

# a list of mlbam ids
player_ids = [116539, 116541, 641728, 116540]
# find the names of the players in player_ids, along with their ids from
# other data sources
# NOTE(review): these two lookups run at module import time and hit the
# network; `data` is also immediately overwritten below -- confirm both
# calls are intentional (they look like copied documentation examples).
data = playerid_reverse_lookup(player_ids, key_type='mlbam')
# find their names and ids from other data sources
fg_ids = [826, 5417, 210, 1101]
data = playerid_reverse_lookup(fg_ids, key_type='fangraphs')


class GetData(Task):
    """Luigi task that pulls season-level pitching or batting stats to CSV."""
    start_year = IntParameter(default=2015)
    end_year = IntParameter(default=2018)
    # Selects the stats source: 'pitching' or 'batting' (keys of
    # data_function). NOTE(review): `type` shadows the builtin.
    type = Parameter()
    # Dispatch table from task type to the pybaseball fetch function.
    data_function = {'pitching': pitching_stats, 'batting': batting_stats}

    def requires(self):
        # No upstream dependencies.
        return None

    def output(self):
        return LocalTarget(f'./data/external/{self.type}_data.csv')

    def run(self):
        # Fetch the configured season range and write it to the target path.
        params = {'start_season': self.start_year, 'end_season': self.end_year}
        pitching_data = self.data_function[str(self.type)](**params)
        pitching_data.to_csv(self.output().path)
# Load the fitted kNN hit/out classifier. Fix: use a context manager so the
# file handle is closed (the original leaked it).
with open('xBA_knn_model.pickle', 'rb') as model_file:
    clf = pickle.load(model_file)

# get 2019 batting data
data2019 = pd.read_csv('data/all_outcomes_2019.csv', index_col=0)

# remove players with fewer than 300 ABs
AB300 = []
for pid in data2019['batter'].unique():
    if data2019.loc[data2019['batter'] == pid].shape[0] >= 300 and pid not in AB300:
        AB300.append(pid)

# Fix: the original initialized `cBA, xBA = [], []` and then immediately
# rebound both names to floats inside the loop -- the lists were dead code.
xBA_dict = {}
for i, pid in enumerate(AB300):
    print('Processing {:d}/{:d}'.format(i + 1, len(AB300)) + ' Player IDs...')
    # get player name from player_id
    plast, pfirst = playerid_reverse_lookup([pid], key_type='mlbam').iloc[0, :2]
    print('Player ID {} --> {} {}'.format(pid, pfirst, plast))
    # filter player data from league-wide batting data
    pdata = data2019.loc[data2019['batter'] == pid].copy()
    Xp, yp, dfp = pre_process(pdata)
    # calculate xBA from model predictions (hit=1, out=0)
    predicted_outcomes = clf.predict(Xp)
    unique, counts = np.unique(predicted_outcomes, return_counts=True)
    d = dict(zip(unique, counts))
    # Fix: d[1] / (d[0] + d[1]) raised KeyError when every prediction was
    # the same class; default missing classes to 0.
    xBA = d.get(1, 0) / (d.get(0, 0) + d.get(1, 0))
    # calculate standard BA
    cBA = calc_BA(dfp, from_file=False)
    xBA_dict[pid] = [pfirst, plast, round(cBA, 3), round(xBA, 3)]
# convert to dataframe and save results to csv
def rotoguru_features(self, batting=True):
    '''Rotoguru contains data on weather, batting order and windspeed/direction.

    Builds a one-hot-encoded feature frame keyed by 'roto_game_id' --
    batters when `batting` is True, pitchers otherwise.
    '''
    try:
        rotoguru = pd.read_csv("roto_data_2015-2018.csv")
    except FileNotFoundError:
        print("Couldn't find the rotoguru csv!")
        # Fix: the original continued and crashed with a NameError on
        # `rotoguru`; re-raise so the failure is explicit.
        raise
    # Match rotoguru to baseball reference via the MLBAM -> bbref key map.
    print("Getting bbref key to merge rotoguru and bbref data")
    unique_players = list(rotoguru['MLB_ID'].unique())
    lookup = playerid_reverse_lookup(unique_players)
    rotoguru = pd.merge(rotoguru,
                        lookup[['key_mlbam', 'key_bbref']],
                        left_on='MLB_ID',
                        right_on='key_mlbam')
    print("Cleaning up rotoguru dates..")
    # Make the rotoguru dates match the bbref date format.
    rotoguru['game_date'] = rotoguru.apply(self.clean_rotoguru_dates, axis=1)
    rotoguru.drop('Date', axis=1, inplace=True)
    print("Merging bbref and rotoguru data to get FD scores")
    # Unique id used to merge rotoguru and bbref data downstream.
    rotoguru[
        'roto_game_id'] = rotoguru['game_date'] + rotoguru['key_bbref']
    if batting:
        # Only certain columns are relevant for batters.
        batter_cols = ['Condition', 'Hand', 'FD_points', 'FD_salary',
                       'Gametime_ET', 'Home_Ump', 'H/A', 'Oppt',
                       'Oppt_pitch_Name', 'Oppt_pitch_MLB_ID',
                       'Oppt_pitch_hand', 'Order', 'Pos', 'Temp',
                       'W_dir', 'W_speed', 'roto_game_id']
        batters = rotoguru[rotoguru['Pos'] != 'P']
        batters = batters[batter_cols]
        # Batting order is categorical, not numeric.
        batters['Order'] = batters['Order'].astype(str)
        ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
        batters_ohe = ohe.fit_transform(
            batters[['H/A', 'Condition', 'W_dir', 'Order']])
        # Drop redundant/NaN indicator columns.
        batters_ohe.drop(
            ['H/A_a', 'Condition_nan', 'W_dir_nan', 'Order_nan'],
            axis=1,
            inplace=True)
        # Add in the relevant game_id.
        batters_ohe['roto_game_id'] = batters['roto_game_id']
        return batters_ohe
    else:
        # Return the pitching df instead.
        pitcher_cols = ['Condition', 'FD_points', 'FD_salary', 'Gametime_ET',
                        'Home_Ump', 'IP', 'H/A', 'Oppt', 'Oppt_pitch_Name',
                        'Oppt_pitch_MLB_ID', 'Oppt_pitch_hand', 'QS', 'Temp',
                        'W_dir', 'W_speed', 'roto_game_id']
        pitchers = rotoguru[rotoguru['Pos'] == 'P']
        pitchers = pitchers[pitcher_cols]
        ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
        pitchers_ohe = ohe.fit_transform(
            pitchers[['H/A', 'Condition', 'W_dir']])
        pitchers_ohe.drop(['H/A_a', 'Condition_nan', 'W_dir_nan'],
                          axis=1,
                          inplace=True)
        pitchers_ohe['roto_game_id'] = pitchers['roto_game_id']
        return pitchers_ohe
def stadium_batter_avg(self, switch_cutoff=0.05, preload=False,
                       filepath_statcast_cache='../statcast_cache.csv'):
    """Compute per-stadium batting averages split by batter handedness.

    Parameters:
        switch_cutoff -- if a batter takes more than this fraction of his
            at-bats from the minority side he is classified 'S' (switch).
        preload -- when True, skip recomputation and read the existing
            'batter_hand.csv' cache.
        filepath_statcast_cache -- raw statcast cache location.
    Returns:
        DataFrame of ['game_id', 'stadium_batting_avg_<cutoff>'], or ""
        when the statcast cache is missing (kept from the original API).
    """
    batting_df = self.avg_df.copy()
    batting_column = 'batting_hand_' + str(switch_cutoff)
    if not preload:
        try:
            statcast_frame_raw = pd.read_csv(filepath_statcast_cache)
        # Fix: was a bare `except:` that swallowed every error type.
        except FileNotFoundError:
            print(
                "Could not locate statcast_cache.csv, please check filepath_statcast_cache value"
            )
            return ""
        # Count at-bats per batter per side of the plate.
        left_right_hand = statcast_frame_raw.groupby(['batter',
                                                      'stand']).size()
        left_right_hand = left_right_hand.reset_index()
        left_right_hand.rename(columns={0: "bat_count"}, inplace=True)
        # Wide format with L / R columns (pitches received by hand).
        lr_pivot = left_right_hand.pivot(columns='stand',
                                         index="batter",
                                         values='bat_count')
        lr_pivot = lr_pivot.reset_index()
        lr_pivot = lr_pivot.fillna(0)
        # Helper columns: primary hand, count per hand, switch-hit fraction.
        lr_pivot['primary_hand'] = np.where(lr_pivot['L'] > lr_pivot['R'],
                                            'L', 'R')
        lr_pivot['major_count'] = np.where(lr_pivot['L'] > lr_pivot['R'],
                                           lr_pivot['L'], lr_pivot['R'])
        lr_pivot['minor_count'] = np.where(lr_pivot['L'] > lr_pivot['R'],
                                           lr_pivot['R'], lr_pivot['L'])
        lr_pivot['switch_perc'] = (lr_pivot['minor_count'] /
                                   lr_pivot['major_count'])
        lr_pivot = lr_pivot.replace(np.inf, np.nan).fillna(0)
        # Above the cutoff the batter is classified as a switch hitter.
        lr_pivot[batting_column] = np.where(
            lr_pivot['switch_perc'] < switch_cutoff,
            lr_pivot['primary_hand'], "S")
        # NOTE(review): this sort result is discarded (no inplace, no
        # assignment) -- harmless for the merges below, but confirm it
        # wasn't meant to be kept.
        lr_pivot.sort_values(by='switch_perc', ascending=False)
        left_right = lr_pivot[['batter', batting_column]]
        # Lookup batter keys to get each player's various ids.
        player_list = left_right['batter'].tolist()
        player_id_values = playerid_reverse_lookup(player_list,
                                                   key_type='mlbam')
        cols_to_merge = [
            'name_last', 'name_first', 'key_mlbam', 'key_bbref',
            'key_fangraphs', 'key_retro'
        ]
        left_right_with_keys = left_right.merge(
            player_id_values[cols_to_merge],
            how='inner',
            left_on='batter',
            right_on='key_mlbam')
        # Cache
        print('Creating cache "batter_hand.csv" in current directory...')
        left_right_with_keys.to_csv('batter_hand.csv', index=False)
    # Load the cache (just written, or from a prior run when preload=True).
    left_right = pd.read_csv('batter_hand.csv')
    batting_df2 = batting_df.merge(left_right[[batting_column, 'key_bbref']],
                                   how="left",
                                   left_on="player",
                                   right_on="key_bbref")
    # Fix: positional `axis` argument to drop() is deprecated (removed in
    # pandas 2.0); use the keyword.
    batting_df2.drop('key_bbref', axis=1, inplace=True)
    # Group by stadium and batting hand. Possible future expansion by date.
    stadium_hand_averages = batting_df2.groupby(
        ['stadium', batting_column])['batting_avg'].mean()
    stadium_hand_averages = stadium_hand_averages.reset_index()
    stadium_hand_averages.rename(
        {"batting_avg": 'stadium_batting_avg_' + str(switch_cutoff)},
        axis=1,
        inplace=True)
    # Bring the stadium averages back so we link up with game_id.
    batting_df3 = batting_df2.merge(stadium_hand_averages,
                                    how="left",
                                    left_on=["stadium", batting_column],
                                    right_on=["stadium", batting_column])
    return_frame = batting_df3[[
        'game_id', 'stadium_batting_avg_' + str(switch_cutoff)
    ]]
    return return_frame
def run_pull(start_date, yr=2021):
    """Incrementally pull statcast at-bat data, update the local pickle
    cache, and push per-player hit/AB history plus each team's remaining
    schedule to a Google Sheet.

    start_date -- first date to pull when no local cache exists.
    yr -- season year passed to schedule_and_record.
    """
    pth = "/home/irarickman/data"
    # Yesterday in US/Eastern -- today's statcast data would be incomplete.
    yd = (datetime.now(pytz.timezone('US/Eastern')) -
          timedelta(1)).strftime('%Y-%m-%d')
    if path.exists(pth + '/lastabs.pkl'):
        old_data = pd.read_pickle(pth + "/lastabs.pkl")
        old_data.game_date = pd.to_datetime(old_data.game_date,
                                            infer_datetime_format=True)
        # Last date already cached; only pull newer data.
        prev_date = old_data.game_date.max()
        od = prev_date.strftime("%Y-%m-%d")
        if od == yd:
            # Cache is already current -- nothing to do.
            return
        else:
            ## if the entered date equals yesterday (which it will in the dag),
            ## we need to check the previous day's data to make sure that we
            ## didn't miss anything
            new_d = statcast(od, yd)
            # Keep only pitches that ended a plate appearance.
            new_data = new_d[new_d.events.notnull()]
            players_ids = playerid_reverse_lookup(new_data.batter.unique())
            id_df = players_ids[['name_last', 'name_first', 'key_mlbam']]
            new_names = new_data.merge(id_df,
                                       how='left',
                                       left_on='batter',
                                       right_on='key_mlbam')
            df = pd.concat([old_data, new_names])
    else:
        # No cache: pull the full window from start_date.
        new_d = statcast(start_date, yd)
        new_data = new_d[new_d.events.notnull()]
        players_ids = playerid_reverse_lookup(new_data.batter.unique())
        id_df = players_ids[['name_last', 'name_first', 'key_mlbam']]
        new_names = new_data.merge(id_df,
                                   how='left',
                                   left_on='batter',
                                   right_on='key_mlbam')
        df = new_names
    # Re-pulling the overlap day can duplicate rows; drop before caching.
    df.drop_duplicates(inplace=True)
    df.to_pickle(pth + "/lastabs.pkl")
    # 1 if the plate appearance ended in a hit.
    df['hit'] = df.events.apply(
        lambda x: 1 if x in ["single", 'double', 'home_run', 'triple'] else 0)
    # Events that do NOT count as an official at-bat.
    df['ab'] = df.events.apply(lambda x: 0 if x in [
        'walk', 'hit_by_pitch', "caught_stealing_2b",
        "pickoff_caught_stealing_2b", 'pickoff_1b', 'catcher_interf',
        'pickoff_caught_stealing_3b', 'pickoff_2b',
        'pickoff_caught_stealing_home', 'caught_stealing_3b',
        'caught_stealing_home', "sac_fly", 'sac_bunt', 'sac_fly_double_play',
        'sac_bunt_double_play'
    ] else 1)
    # Infer the batter's team / opponent / venue from the inning half:
    # batters hit in the bottom half at home.
    df['player_team'] = df.apply(lambda x: x.home_team
                                 if x.inning_topbot == "Bot" else x.away_team,
                                 axis=1)
    df['Opp'] = df.apply(lambda x: x.away_team
                         if x.inning_topbot == "Bot" else x.home_team,
                         axis=1)
    df['Place'] = df.apply(lambda x: "Home"
                           if x.inning_topbot == "Bot" else "Away",
                           axis=1)
    teams = df.player_team.unique()
    # statcast and bbref abbreviations differ for these clubs.
    fixers = {"WSH": "WSN", "CWS": "CHW"}
    teams_fixed = [x if x not in fixers.keys() else fixers[x] for x in teams]
    team_schedule = {}
    missed = []
    for t in teams_fixed:
        try:
            d = schedule_and_record(yr, t)
            # Strip doubleheader markers like "(1)" and append the year so
            # the date string parses.
            d['fix_date'] = d.Date.str.replace("\(\d\)",
                                               "").str.strip() + " " + str(yr)
            d['game_date'] = pd.to_datetime(d.fix_date.apply(
                lambda x: datetime.strptime(x, "%A, %b %d %Y")).apply(
                    lambda x: x.strftime("%Y-%m-%d")),
                                            infer_datetime_format=True)
            d['Place'] = d.Home_Away.apply(lambda x: "Home"
                                           if x == "Home" else "Away")
            # Keep only games after the last cached game (the remaining
            # schedule).
            d2 = d[d.game_date > df.game_date.max()][[
                'Place', "Opp", "game_date"
            ]]
            team_schedule[t] = d2
        except ValueError:
            # schedule_and_record raises ValueError for unknown teams;
            # record and continue.
            print(t)
            missed.append(t)
    df['name_last'] = df['name_last'].str.capitalize()
    df['name_first'] = df['name_first'].str.capitalize()
    df['player_name'] = df.name_first + " " + df.name_last
    sm_df = df[[
        'game_date', 'game_pk', 'hit', 'ab', 'Opp', 'Place', 'player_name',
        'player_team', 'key_mlbam'
    ]]
    sm_df.sort_values(['player_name', 'game_date', 'game_pk'], inplace=True)
    # Aggregate to one row per player-game (sums hit/ab across pitches).
    trim_df = sm_df.groupby([
        'player_name', 'game_date', 'game_pk', 'Opp', 'Place', 'player_team',
        'key_mlbam'
    ]).sum().reset_index()

    def player_df(player, d=trim_df):
        # Append the player's remaining team schedule to his game history,
        # filling schedule-only rows with zeros / his identity.
        temp = d[d.player_name == player]
        temp = temp.sort_values(['game_date']).reset_index(drop=True)
        # Most recent team the player appeared for.
        tm = temp.loc[len(temp) - 1, 'player_team']
        if tm in fixers.keys():
            sched = team_schedule[fixers[tm]]
        else:
            sched = team_schedule[tm]
        tdf = pd.concat([temp, sched], sort=False)
        tdf.ab.fillna(0, inplace=True)
        tdf.hit.fillna(0, inplace=True)
        tdf.player_name.fillna(player, inplace=True)
        tdf.player_team.fillna(tm, inplace=True)
        return tdf

    master_df = player_df(trim_df.player_name.unique()[0])
    for p in trim_df.player_name.unique()[1:]:
        got = player_df(p)
        master_df = pd.concat([master_df, got], sort=False)
    master_df.game_date = master_df.game_date.apply(
        lambda x: format(x, "%Y-%m-%d"))
    ## now write to the google sheet
    # authorization
    gc = pygsheets.authorize(
        service_file='/home/irarickman/formal-thunder-186123-ab6b0fb6bc46.json'
    )
    mlb = 'MLB At Bats'
    sh = gc.open(mlb)
    # select the first sheet
    wks = sh[0]
    wks.set_dataframe(master_df, (1, 1))