def cmd_statcast_echo(start_date, end_date): dates = pd.date_range(start_date, end_date).tolist() data = statcast(start_dt=start_date, end_dt=end_date) click.echo(data.head())
def update_data(date, team): """Load game data using pybaseball's scraper. Uses pybaseball.statcast and pybaseball.playerid_reverse_lookup to load data from a specific game, and stores it as json in the user's browser session in a hidden div. """ print("Loading data from statcast... ") raw = statcast(start_dt=date, end_dt=date, team=team) raw = raw.rename(columns={"player_name": "pitcher_name"}) print("Adding batter names... ") batter_ids = raw["batter"].unique() batters = playerid_reverse_lookup(batter_ids, key_type="mlbam") batters["batter_name"] = (batters["name_first"].str.capitalize() + " " + batters["name_last"].str.capitalize()) raw = raw.merge( batters[["key_mlbam", "batter_name"]], how="left", left_on="batter", right_on="key_mlbam", ) print("Done.") return raw.to_json(date_format="iso", orient="split")
def import_statcast(self, table, start, end, team=None, verbose=True, cache=False): if cache: from pybaseball import cache cache.enable() if team is not None: data = statcast(start, end, team, verbose) else: data = statcast(start, end, verbose) data.to_sql(table, self.db_connection.get_connection(), if_exists='append')
def get_batting_league( start_dt=None, end_dt=None, team=None, verbose=True, fname_all=None, fname_bb=None, features=[ 'events', 'description', 'batter', 'stand', 'launch_angle', 'launch_speed', 'hc_x', 'hc_y', 'pitcher', 'p_throws', 'pitch_type', 'release_speed', 'release_spin_rate' ] ): """ Pull league-wide statcast batting data from baseballsavant using pybaseball https://github.com/jldbc/pybaseball https://baseballsavant.mlb.com/statcast_search Arguments start_dt: get data from start_dt forward stop_dt: get data up to stop_dt team: search team only ('SEA', 'TEX', etc) verbose: display query status messages fname_all: export csv of all statcast at bat outcomes to this file **must be .csv** fname_bb: export csv of all outcomes with a batted ball to this file **must be .csv** Returns (all_outcomes, batted_balls) tuple of dataframes Saves to files 'fname_all' and 'fname_bb' if fname is not None. Note that all_outcomes DOES NOT include plate appearances that do not count as at bats. TODO: output truly all outcomes, all at bats, and all batted balls as three separate files. """ # get statcast data (this can take awhile) df = statcast(start_dt, end_dt, team, verbose) # discard null events all_outcomes = df[df['events'].notnull()] # get the specified features only all_outcomes = all_outcomes[features] if fname_all is not None: # export to csv all_outcomes.to_csv(fname_all, index=False) print('Exported: {}'.format(fname_all)) # get batted balls only batted_balls = filter_batted_balls(all_outcomes) if fname_bb is not None: # export data batted_balls.to_csv(fname_bb, index=False) print('Exported: {}'.format(fname_bb)) return(all_outcomes, batted_balls)
def main(): args = _parse_args() start_time = time.time() _ = statcast(args.start_date, args.end_date) end_time = time.time() query_time = end_time - start_time threshold_exceeded = query_time > args.time_threshold print( f"query took {query_time: .1f} seconds (expected less than {args.time_threshold: .1f})" ) sys.exit(int(threshold_exceeded))
def new_statcast(self, current_statcast): latest = current_statcast['game_date'].max() dates = self.get_dates() statbat = pd.DataFrame() for date in dates: if date > datetime.strftime(latest, "%Y-%m-%d"): bat = statcast(date) print("\r", date, end="") statbat = statbat.append(bat) return statbat
def get_statcast_data(start_dt=None, end_dt=None): if start_dt is None: start_dt = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") if end_dt is None: end_dt = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") df = statcast(start_dt=start_dt, end_dt=end_dt) # clean up the df df = df.drop(["index"], axis=1) df.columns = [c.replace(".", "_") for c in df.columns] return df
def statcast_data(year, end_month_range): df_dict = {} for i in range(3, end_month_range): start_date = datetime.date(year, i, 1).strftime("%Y-%m-%d") if year == today.year and i == today.month: end_date = today.strftime("%Y-%m-%d") else: end_date = datetime.date(year, i, month_end_dates[i]).strftime("%Y-%m-%d") try: df = statcast(start_date, end_date) except: pass df_dict[i] = df return df_dict
def get_statcast_data(directory, year): year = str(year) data_path = join(directory, 'statcast_data_{}.csv'.format(year)) sc_data = None if not isfile(data_path): date_ranges = [('$-01-01', '$-03-31'), ('2018-04-01', '$-06-30'), ('$-07-01', '$-09-30'), ('$-10-01', '$-12-31')] date_ranges = [(date[0].replace('$', year), date[1].replace('$', year)) for date in date_ranges] data = [] for dr in date_ranges: data.append(statcast(start_dt=dr[0], end_dt=dr[1])) sc_data = pd.concat(data, ignore_index=True) sc_data.to_csv(data_path) else: sc_data = pd.read_csv(data_path, index_col=0) return sc_data.copy()
def get_statcast( filename: str, start: str, end: str = None, fmt: str = "%Y-%m-%d", mode="append", ): """Download data to an existing parquet file.""" filename = os.path.expanduser(filename) if end is None: end = start start_dt = datetime.strptime(start, fmt).date() end_dt = datetime.strptime(end, fmt).date() for dt in daterange(start_dt, end_dt): datestr = dt.strftime("%Y-%m-%d") print("Downloading data from " + datestr + "... ", end="") pdf = statcast(start_dt=datestr, end_dt=datestr) if pdf.empty: print("No data.") continue # fix type of string columns with NaNs fields = [ x for x in pdf.columns if pdf[x].dtype == "object" and pdf[x].isna().sum() > 0 ] for field in fields: pdf[field] = pdf[field].astype(str) # convert to spark and save as parquet print("Saving as parquet ({})... ".format(mode), end="") sdf = get_spark().createDataFrame(pdf) sdf.write.parquet(filename, mode=mode, partitionBy="pitcher") # overwrite on first pass, append after that if mode == "overwrite": mode = "append" print("Done.")
def pull_clean_and_pickle(start_date, end_date, year): # Pull in DataFrame from statcast df = statcast(start_dt=start_date, end_dt=end_date) # Drop depriciated and extraneous features df = df.drop(columns=[ 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated', 'game_type', 'tfs_deprecated', 'tfs_zulu_deprecated', 'umpire' ]) # Sort pitches chronologically df = df.sort_values( by=['game_date', 'game_pk', 'at_bat_number', 'pitch_number']) # Pickle DataFrame df.to_pickle(path=( "/Users/mattkirby/repos/RC-v1.0--Data-Gathering/season_pickles/pitches_" + year + ".pkl"), compression='zip')
def cmd_statcast_upload(start_date, end_date, db_username, db_password, db_hostname, db_name, db_tablename): click.echo('[[[ PULLING STATCAST DATAFRAME ]]]') statcast_engine = initdb_statcast(db_username, db_password, db_hostname, db_name, db_tablename) click.echo("Breaking up into queries by day to avoid the 30k row cap...") for day in pd.date_range(start_date, end_date): click.echo(str(day.date()) + ' - Pulling data...') try: data = statcast(start_dt=str(day.date()), end_dt=str(day.date())) upload_block(data, statcast_engine, db_tablename) except Exception as exc: click.echo( str(day.date()) + " - ERROR pulling down data - Error was = " + str(exc)) else: click.echo( str(day.date()) + ' - SUCCESS, pulled ' + str(data.shape[0]) + ' records')
def download_season(self, years): """ Downloads the statcast data for a season as a csv. It adds a unique id (uid) for each pitch and then saves the csv in the folder statcast_data/[year].csv. Parameters ---------- years (str, int, list of strings, list of ints) : the season(s) to download Returns ------- None """ dh = DataHandler() if not isinstance(years, list): years = [years] for year in years: data = statcast(start_dt=self.season_dates[year]["start"], end_dt=self.season_dates[year]["end"]) data["index"] = data.apply(lambda row: self._create_uid(row), axis=1) data = data.set_index("index") data = data.sort_index().reset_index(drop=False) data = dh.set_data_types(data) data.to_feather(f"statcast_data/{year}")
# Qualified pitcher data and eliminating unneeded rows df_qualified_pitchers = pd.read_csv(data_dir + '200IPData_Advanced.csv') df_qualified_pitchers = df_qualified_pitchers[['Name', 'FIP']] df_qualified_pitchers['Tag'] = np.nan df_qualified_pitchers.Tag[(df_qualified_pitchers['FIP'] > 4.69)] = 'Bad' df_qualified_pitchers.Tag[(df_qualified_pitchers['FIP'] < 3.78)] = 'Good' df_qualified_pitchers.Tag[(df_qualified_pitchers['FIP'] <= 4.69) & (df_qualified_pitchers['FIP'] >= 3.78)] = 'Average' pitcher_tag = dict(zip(df_qualified_pitchers.Name, df_qualified_pitchers.Tag)) # print(df_qualified_pitchers.describe()) print("Number of Pitchers in Each Category: \n", df_qualified_pitchers.Tag.value_counts()) # Raw play by play data and eliminating non-BBIP events df_raw = statcast(start_dt='2017-03-01', end_dt='2019-10-01') df_raw = df_raw[['pitcher', 'batter', 'woba_value', 'woba_denom']] df_raw = df_raw.dropna() df_raw['pitcher_name'] = df_raw['pitcher'].map(player_id) df_raw['batter_name'] = df_raw['batter'].map(player_id) df_raw['pitcher_tag'] = df_raw['pitcher_name'].map(pitcher_tag) df_raw['batter_tag'] = df_raw['batter_name'].map(batter_tag) df_raw = df_raw.dropna() # Splits data up by match ups, pitcher tendency is listed first df_G_G = df_raw[(df_raw.pitcher_tag == 'Good') & (df_raw.batter_tag == 'Good')] df_G_A = df_raw[(df_raw.pitcher_tag == 'Good') & (df_raw.batter_tag == 'Average')] df_G_B = df_raw[(df_raw.pitcher_tag == 'Good') & (df_raw.batter_tag == 'Bad')] df_A_G = df_raw[(df_raw.pitcher_tag == 'Average') & (df_raw.batter_tag == 'Good')]
block_start_date = start_date group_start_date = start_date group_count = 0 group_data = pd.DataFrame() while end_date >= block_start_date: block_end_date = block_start_date + timedelta(days=block_days - 1) if block_end_date > end_date: block_end_date = end_date print('Getting data for %s to %s' % (block_start_date.strftime(date_format), block_end_date.strftime(date_format))) block_data = statcast(start_dt=block_start_date.strftime(date_format), end_dt=block_end_date.strftime(date_format)) block_start_date = block_end_date + timedelta(days=1) if len(block_data): group_data = pd.concat((group_data, block_data)) group_count += 1 if group_count == group_blocks: group_filename = '%s_%s.csv' % (group_start_date.strftime( date_format), block_end_date.strftime(date_format)) group_data.to_csv(os.path.join(data_path, group_filename)) group_start_date = block_start_date group_count = 0 group_data = pd.DataFrame()
from pybaseball import statcast import pandas as pd # Scraping Statcast Data With Python called_pitches = statcast(start_dt='2016-03-15', end_dt='2016-05-02') called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']] called_pitches.to_csv('statcast-all-2016-april.csv') called_pitches = "" # Scraping Statcast Data With Python called_pitches = statcast(start_dt='2016-05-02', end_dt='2016-06-01') called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']] called_pitches.to_csv('statcast-all-2016-may.csv') called_pitches = "" # Scraping Statcast Data With Python called_pitches = statcast(start_dt='2016-06-02', end_dt='2016-07-01') called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']] called_pitches.to_csv('statcast-all-2016-june.csv') called_pitches = "" # Scraping Statcast Data With Python called_pitches = statcast(start_dt='2016-07-02', end_dt='2016-08-01') called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']] called_pitches.to_csv('statcast-all-2016-july.csv') called_pitches = "" # Scraping Statcast Data With Python called_pitches = statcast(start_dt='2016-08-02', end_dt='2016-09-01') called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']] called_pitches.to_csv('statcast-all-2016-august.csv')
def pull_statcast_data(start_date, end_date, year): """ Date Format: YYYY-MM-DD """ df = statcast(start_dt=start_date, end_dt=end_date) return df
def getInfo(input_file): print("In GET INFO") data = statcast(start_dt='2017-03-28', end_dt='2017-10-1') # print(data.head(data.size)) data.to_csv(input_file)
import pandas as pd # people from: https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv # sc from: from pybaseball import statcast sc = statcast(start_dt='2012-01-01', end_dt='2020-01-01') sc.to_parquet("statcast_dump.parquet", engine="fastparquet") # full statcast.csv.gzip file available upon request. people = pd.read_csv("people.csv") # sc = pd.read_parquet("statcast_dump.parquet", engine="fastparquet") people["batter_name"] = people.name_first + " " + people.name_last merged = pd.merge( sc, people.loc[:, ["key_mlbam", "batter_name"]], how="left", left_on="batter", right_on="key_mlbam", ) cols2keep = [ "player_name", "batter_name", "pitch_type", "game_date", "release_speed", "events", "launch_speed", "woba_value", "bb_type", "balls", "strikes",
def stream_data(self, date): '''Appends new MLB statcast data for a specified date. Designed to be used as a streaming pipline for new daily data during in-season. Once the database is built, this method can be used to add new data to the RAW and WRK tables. This method should be used while scheduling a task to send new daily data through the pipeline during in-season. Args: No arguments Returns: Does not return a parameter Raises: No exceptions ''' logging.basicConfig( filename=f'{self.parent_path}/logs/stream_data.log', filemode='w', format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO) startTime = datetime.now() self.dbtype = self.lst_config[0]['Database_System']['db_type'] self.connect_db() try: self.engine.connect() print( f'Connected to: \nServer: {self.server}\nDatabase: {self.database}' ) logging.info( f'\nConnection Successful \nServer: {self.server}\nDatabase: {self.database}' ) except Exception as e: Print('Unable to connect to Server: {self.sever]}') logging.exception('Exception Occured') try: self.df = pd.DataFrame( statcast(start_dt=f'{date}', end_dt=f'{date}')) except Exception as e: logging.exception("Exception occurred") self.df = self.df.replace({np.nan: None}) self.dct_parks = dict( zip(self.team_atts['Team'], self.team_atts[f'Ball_Park_{int(self.df.game_year.loc[1])}'])) if self.df.empty: print(f'{date}: NO DATA FOR THIS DATE') logging.warning(f'{date}: NO DATA FOR THIS DATE') else: print('............') print(f'|{date}|') print('............') print(f'Completed: RAW data imported from Baseball Savant') logging.info(f'{date}: RAW data imported from Baseball Savant') try: self.df.drop(['pitcher.1', 'fielder_2.1'], axis=1, inplace=True) self.df.to_sql(f'raw_statcast_{int(self.df.game_year.loc[1])}', self.engine, index=False, if_exists='append') print( f'Completed: Raw data inserted into DB: STATCAST , TABLE: raw_statcast_{int(self.df.game_year.loc[1])}' ) logging.info( f'{date}: Raw data inserted into DB: STATCAST , TABLE: raw_statcast_{int(self.df.game_year.loc[1])}' ) except Exception as e: logging.exception("Exception occurred") try: self.reorder_columns() self.rename_columns() self.add_batter_name() self.add_player_names() self.add_teams() self.add_lg_div() self.add_ballparks(self.df['Game_Year'][0]) self.stream_pos_bop_dct(date) self.add_batter_pos() self.add_batter_bop() print(f'Completed: Data transformation') logging.info(f'{date}: Data transformation complete') except: logging.exception("Exception occurred") try: self.df.to_sql(f'wrk_statcast_{int(self.df.Game_Year.loc[1])}', self.engine, index=False, if_exists='append') print( f'Completed: Working data inserted into DB: STATCAST , TABLE: wrk_statcast_{int(self.df.Game_Year.loc[1])}\n' ) logging.info( f'''{date}: Working data inserted into DB: STATCAST , TABLE: wrk_statcast_{int(self.df.Game_Year.loc[1])}\nTime Elapsed: {datetime.now() - startTime}''' ) except Exception as e: logging.exception("Exception occurred") print(f'--------{date} Complete--------') print(f'Total Time Elapsed: {datetime.now() - startTime}') logging.info( f'--------{date} Complete--------\nTotal Time Elapsed: {datetime.now() - startTime}' )
# inputs are valid if either both or zero dates are supplied. Not valid of only one given. if start_dt and end_dt: # how many days worth of data are needed? date_format = "%Y-%m-%d" d1 = datetime.datetime.strptime(start_dt, date_format) d2 = datetime.datetime.strptime(end_dt, date_format) days_in_query = (d2 - d1).days if days_in_query <= small_query_threshold: data = small_request(start_dt,end_dt) else: data = large_request(start_dt,end_dt,d1,d2,step=small_query_threshold,verbose=verbose) data = postprocessing(data, team) return data start_dates = ['2012-03-25', '2012-07-02', '2013-03-25', '2013-07-02', '2014-03-25', '2014-07-02', '2015-03-25','2015-07-02', '2016-03-25','2016-07-02', '2017-03-25', '2017-07-02'] end_dates = ['2012-07-01', '2012-11-01', '2013-07-01', '2013-11-01', '2014-07-01', '2014-11-01', '2015-07-01', '2015-11-01', '2016-07-01', '2016-11-01', '2017-07-01', '2017-11-01'] for i in range(len(start_dates)) : start = start_dates[i] end = end_dates[i] df = statcast(start_dt=start, end_dt=end) df = df[['pitcher', 'player_name', 'pitchtype', 'pitch_name', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_y', 'release_pos_z', 'spin_dir', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'vx0', 'vy0', 'vz0' , 'ax', 'ay', 'az']] df.to_csv('C:/Users/Colin/Documents/GitHub/baseball-injuries/datasets/statcast_{0}_{1}.csv'.format(start, end))
#imports from pybaseball import statcast import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import warnings warnings.filterwarnings('ignore') # collect Statcast data on all pitches from the months of May and June data = statcast('2017-05-01', '2017-05-03') print(data.shape) data2 = data.dropna( subset=['launch_angle', 'launch_speed', 'estimated_ba_using_speedangle']) data2.shape fig, ax = plt.subplots(figsize=(8, 8)) sns.despine(fig, left=True, bottom=True) sns.scatterplot(x="launch_speed", y="launch_angle", hue="estimated_ba_using_speedangle", palette='viridis', data=data2, ax=ax) ax.set_title("Hit probability by Launch Angle and Exit Velocity") data2['hr'] = data2.events == 'home_run' fig, ax = plt.subplots(figsize=(8, 8)) sns.despine(fig, left=True, bottom=True) sns.scatterplot(x="launch_speed",
You can see by the pairplot below that stolen bases do not correlate well with any of hits, homeruns, or at bats. """ batting_copy = batting[['AB', 'H', 'HR', 'SB']] sns.pairplot(batting_copy) plt.show() """ Strike zone Application Now that we got through the basics of plotting, lets apply this to an example where we plot the strike zone and pitch locations. Essentially we are taking the variables plate_x and plate_z and plotting them. You can imagine our plot like you are looking at the pitcher from home plate and we are plotting where each pitch is. So to do this we need to create out strike zone and it is also helpful to map each pitch type to a color so we can see if location changes depending on the pitch type. """ statcast = pyb.statcast(start_dt='2020-09-22', end_dt='2020-09-23') print('{} Rows imported'.format(str(len(statcast)))) def plotPitches(plotdf): strikezone = plt.Rectangle((-.95, 1.6), 1.65, 1.8, color='red', fill=False) groups = plotdf.groupby("pitch_name") for name, group in groups: plt.plot(group["plate_x"], group["plate_z"], marker="o", linestyle="", label=name, alpha=.55) plt.ylim((0, 5))
def query_statcast( start_dt: Param(help="Beginning date to pull data from", type=str) = None, end_dt: Param(help="End date to pull data from", type=str) = None, team: Param(help="Abbreviation for team of interest", type=str) = None, verbose: Param(help="Whether or not to print verbose updates", type=bool_arg) = True, output_type: Param(help="What format to save data in", type=str) = "db", overwrite: Param(help="Whether or not to overwrite the db table if it already exists", type=bool_arg,) = False, output_path: Param(help="path to location that data should be saved", type=str) = "."): """ Callable from the command-line or in Python. Pulls pitch-level MLB data from [statcast](https://baseballsavant.mlb.com/statcast_search). Saves as either a sqlite db file, or csv. * inputs: - `start_dt`: `str`, Beginning date to pull data from = None - `end_dt`: `str`, End date to pull data from = None - `team`: `str`, abbreviation for team of interest = None - `verbose`: `bool`, Whether or not to print verbose updates - `output_type`: `str`, What format to save data in (must be one of {'db', 'csv'}) = 'db' - `overwrite`: `bool`, Whether or not to overwrite the db table if it already exists = False - `output_path`: `str`, Path to the location that the data should be saved at = '.' * outputs: - None """ # checking for correct output type if output_type not in ("db", "csv"): raise ValueError("output_type must be one of {'db', 'csv'}") if output_type == "db": # creating db connection conn = sqlite3.connect(f"{output_path}/statcast_pitches.db") # Checking if year is already in db cursor = conn.execute(f"select name from sqlite_master where type='table' and name='statcast_{start_dt[:4]}'") # if table exists in db if cursor.fetchone(): if overwrite: conn.execute(f"DROP TABLE IF EXISTS statcast_{start_dt[:4]}") else: # don't want to overwrite, pop out of function print(f"Table named 'statcast_{start_dt[:4]}' already exists in db saved at `{output_path}/statcast_{start_dt[:4]}`.") return None # if table does not already exist in db or it was just dropped # pulling data from statcast data = statcast(start_dt=start_dt, end_dt=end_dt, team=team, verbose=verbose) data.to_sql(f"statcast_{start_dt[:4]}", conn) conn.close() # output type is csv else: # Checking if file is already saved as csv if path.exists(f"{output_path}/statcast_{start_dt[:4]}.csv"): print(f"File named `{output_path}/statcast_{start_dt[:4]}.csv` already exists.") return None # pulling data from statcast data = statcast(start_dt=start_dt, end_dt=end_dt, team=team, verbose=verbose) # saving to csv data.to_csv(f"{output_path}/statcast_{start_dt[:4]}.csv", index=False) return None
def main(): pth = "/Users/irarickman/Google Drive/Data Science/Projects/MLB Projections/Batting Average" td = format(datetime.today(), "%Y-%m-%d") old_data = pd.read_pickle(pth + "/lastabs.pkl") old_data.game_date = pd.to_datetime(old_data.game_date, infer_datetime_format=True) prev_date = old_data.game_date.max() od = format(prev_date, "%Y-%m-%d") if od != td: new_d = statcast(od, td) new_data = new_d[new_d.events.notnull()] players_ids = playerid_reverse_lookup(new_data.batter.unique()) id_df = players_ids[['name_last', 'name_first', 'key_mlbam']] new_names = new_data.merge(id_df, how='left', left_on='batter', right_on='key_mlbam') df = pd.concat([old_data, new_names]) else: df = old_data df.drop_duplicates(inplace=True) df.to_pickle(pth + "/lastabs.pkl") df['hit'] = df.events.apply( lambda x: 1 if x in ["single", 'double', 'home_run', 'triple'] else 0) df['ab'] = df.events.apply(lambda x: 0 if x in [ 'walk', 'hit_by_pitch', "caught_stealing_2b", "pickoff_caught_stealing_2b", 'pickoff_1b', 'catcher_interf', 'pickoff_caught_stealing_3b', 'pickoff_2b', 'pickoff_caught_stealing_home', 'caught_stealing_3b', 'caught_stealing_home', "sac_fly", 'sac_bunt', 'sac_fly_double_play', 'sac_bunt_double_play' ] else 1) df['player_team'] = df.apply(lambda x: x.home_team if x.inning_topbot == "Bot" else x.away_team, axis=1) df['Opp'] = df.apply(lambda x: x.away_team if x.inning_topbot == "Bot" else x.home_team, axis=1) df['Place'] = df.apply(lambda x: "Home" if x.inning_topbot == "Bot" else "Away", axis=1) teams = df.player_team.unique() fixers = {"WSH": "WSN", "CWS": "CHW"} teams_fixed = [x if x not in fixers.keys() else fixers[x] for x in teams] team_schedule = {} missed = [] for t in teams_fixed: try: d = schedule_and_record(2018, t) d['fix_date'] = d.Date.str.replace("\(\d\)", "").str.strip() + " 2018" d['game_date'] = pd.to_datetime(d.fix_date.apply( lambda x: datetime.strptime(x, "%A, %b %d %Y")).apply( lambda x: x.strftime("%Y-%m-%d")), infer_datetime_format=True) d['Place'] = d.Home_Away.apply(lambda x: "Home" if x == "Home" else "Away") d2 = d[d.game_date >= datetime.today()][[ 'Place', "Opp", "game_date" ]] team_schedule[t] = d2 except ValueError: print(t) missed.append(t) df['name_last'] = df['name_last'].str.capitalize() df['name_first'] = df['name_first'].str.capitalize() df['player_name'] = df.name_first + " " + df.name_last sm_df = df[[ 'game_date', 'game_pk', 'hit', 'ab', 'Opp', 'Place', 'player_name', 'player_team', 'key_mlbam' ]] sm_df.sort_values(['player_name', 'game_date', 'game_pk'], inplace=True) trim_df = sm_df.groupby([ 'player_name', 'game_date', 'game_pk', 'Opp', 'Place', 'player_team', 'key_mlbam' ]).sum().reset_index() def player_df(player, d=trim_df): temp = d[d.player_name == player] temp = temp.sort_values(['game_date']).reset_index(drop=True) tm = temp.loc[len(temp) - 1, 'player_team'] if tm in fixers.keys(): sched = team_schedule[fixers[tm]] else: sched = team_schedule[tm] tdf = pd.concat([temp, sched]) tdf.ab.fillna(0, inplace=True) tdf.hit.fillna(0, inplace=True) tdf.player_name.fillna(player, inplace=True) tdf.player_team.fillna(tm, inplace=True) return tdf master_df = player_df(trim_df.player_name.unique()[0]) for p in trim_df.player_name.unique()[1:]: got = player_df(p) master_df = pd.concat([master_df, got]) master_df.game_date = master_df.game_date.apply( lambda x: format(x, "%Y-%m-%d")) ## now write to the google sheet # #authorization gc = pygsheets.authorize(outh_file='/Users/irarickman/client_secret.json') mlb = 'MLB At Bats' sh = gc.open(mlb) #select the first sheet wks = sh[0] wks.set_dataframe(master_df, (1, 1))
def build_db(self): '''Builds MLB Statcast tables within a pre-defined SQL database. Through a pipeline, this method will build tables containing all available MLB statcast data. It will will use the SQL sever and database information provided by the user within the GUI prompt that pops-up upon calling the method. Each year with have a RAW table created and a WRK table created. The RAW data is the untouched data from Baseball Savant, while the WRK table is the organized data with increased level of detail. This method will likely take several hours to complete. There is a log file that is created to track the progress. The data tables are created day-by-day, with each day appended. This should limit memory usage. Args: No arguments Returns: Does not return a parameter Raises: Exception: if the server information is not correct/exists ''' startTime = datetime.now() logging.basicConfig(filename=f'{self.parent_path}/logs/build_db.log', filemode='w', format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO) try: self.prompt_window() logging.info( f'Connected to: \nServer: {self.server}\nDatabase: {self.database}' ) except Exception as e: logging.exception('Exception occured') lst_year = ['2016', '2017', '2018', '2019'] lst_month = ['03', '09'] lst_day_31 = ['01', '31'] lst_day_30 = ['01', '30'] # lst_year = list(reversed([str(i) for i in range(2008,2021)])) # lst_month = ['0'+str(i) for i in range(3,10)] + [str(i) for i in range(10,12)] # lst_day_31 = ['0'+str(i) for i in range(1,10)] + [str(i) for i in range(10,32)] # lst_day_30 = ['0'+str(i) for i in range(1,10)] + [str(i) for i in range(10,31)] for y in range(0, len(lst_year)): self.dct_pos = ujson.load( open(f'{self.parent_path}/dicts/dct_pos_{lst_year[y]}.txt')) self.dct_bop = ujson.load( open(f'{self.parent_path}/dicts/dct_bop_{lst_year[y]}.txt')) self.dct_ss = ujson.load( open(f'{self.parent_path}/dicts/dct_ss_{lst_year[y]}.txt')) self.dct_parks = dict( zip(self.team_atts['Team'], self.team_atts[f'Ball_Park_{lst_year[y]}'])) for m in range(0, len(lst_month)): if any(lst_month[m] == x for x in ['03', '05', '07', '08', '10']): lst_day = lst_day_31 else: lst_day = lst_day_30 for d in range(0, len(lst_day)): try: self.df = pd.DataFrame( statcast( start_dt= f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}', end_dt= f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}')) except Exception as e: logging.exception("Exception occurred") self.df = self.df.replace({np.nan: None}) if self.df.empty: print( f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: NO DATA FOR THIS DATE' ) logging.warning( f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: NO DATA FOR THIS DATE' ) else: print('............') print(f'|{lst_year[y]}/{lst_month[m]}/{lst_day[d]}|') print('............') print( f'Completed: RAW data imported from Baseball Savant' ) logging.info( f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: RAW data imported from Baseball Savant' ) try: self.df.drop([ 'pitcher.1', 'fielder_2.1', 'post_away_score', 'post_home_score', 'post_bat_score', 'post_fld_score' ], axis=1, inplace=True) self.df.to_sql(f'raw_statcast_{lst_year[y]}', self.engine, index=False, if_exists='append') print( f'Completed: Raw data inserted into DB: STATCAST , TABLE: raw_statcast_{lst_year[y]}' ) logging.info( f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: Raw data inserted into DB: STATCAST , TABLE: raw_statcast_{lst_year[y]}' ) except Exception as e: logging.exception("Exception occurred") try: self.reorder_columns() self.rename_columns() self.add_batter_name() self.add_player_names() self.add_teams() self.add_lg_div() self.add_ballparks(lst_year[y]) self.add_batter_pos() self.add_batter_bop() self.add_batter_ss() self.add_post_scores() self.add_cnt_pa() self.add_cnt_ab() self.add_cnt_hit() self.add_cnt_single() self.add_cnt_double() self.add_cnt_triple() self.add_cnt_home_run() self.add_cnt_k() self.add_cnt_backwardsk() self.add_cnt_walk() self.add_cnt_hbp() self.add_cnt_rbi() self.add_yahoo_pnts() print( f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: Data transformation complete' ) logging.info( f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: Data transformation complete' ) except Exception as e: logging.exception('Exception Occured') try: self.df.to_sql(f'wrk_statcast_{lst_year[y]}', self.engine, index=False, if_exists='append') print( f'Completed: Working data inserted into DB: STATCAST , TABLE: wrk_statcast_{lst_year[y]}' ) print( f'Time Elapsed: {datetime.now() - startTime}\n' ) logging.info( f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: Working data inserted into DB: STATCAST , TABLE: wrk_statcast_{lst_year[y]}\nTime Elapsed: {datetime.now() - startTime}' ) except Exception as e: logging.exception("Exception occurred") print('\n\n--------All Dates Complete--------') print(f'Total Time Elapsed: {datetime.now() - startTime}') logging.info( f'--------All Dates Complete--------\nTotal Time Elapsed: {datetime.now() - startTime}' )
except: print(team, "didn't work") err = np.append(err, team) # In[43]: results # There are only 19 teams in this array, there should be 30 # In[69]: from pybaseball import statcast # get all statcast data for July 4th, 2017 data = statcast('2017-07-04') #get data for the first seven days of August in 2016 data = statcast('2016-08-01', '2016-08-07') # In[70]: data.columns.tolist() # In[71]: data = data[data['launch_angle'].notna()] data = data[data['launch_speed'].notna()] # In[76]:
ncol=2, prop={'size': 8}) plt.show() def dataFormatPlayer(player, data): df = data[[ 'pitch_type', 'release_speed', 'player_name', 'events', 'release_spin_rate', 'release_pos_x', 'release_pos_y' ]] df = df[::-1] df = df.loc[(data['player_name'] == player), [ 'pitch_type', 'release_speed', 'player_name', 'events', 'release_spin_rate', 'release_pos_x', 'release_pos_y' ]] df = df.reset_index(drop=True) df.index = df.index + 1 return df if __name__ == '__main__': day = '2019-07-03' data = statcast(start_dt=day, team='CHC') for player in data.player_name.unique(): spin_rate_scatter(player, dataFormatPlayer(player, data), day) print_full(data.events.unique()) # spin_rate_scatter(player, dataFormatPlayer(player, data), day) # spin_rate_scatter(player, dataFormatPlayer(player, data), day) #date = ['2019-05-01', '2019-04-29', '2019-04-27', '2019-04-23', '2019-04-22', '2019-03-20']
import pandas as pd from pybaseball import statcast # Data: Statcast # Overview: https://www.mlb.com/glossary/statcast # Package to pull data: # https://pypi.org/project/pybaseball/ # Other Packages considered: # https://pypi.org/project/baseball-scraper/ # https://pypi.org/project/vigorish/ # Useful source to compare packages: # https://snyk.io/advisor/python data = statcast(start_dt='2017-06-24', end_dt='2017-06-24')
# baseball-scraper package from baseball_scraper import espn # Data: Statcast # Overview: https://www.mlb.com/glossary/statcast # Package to pull data: # https://pypi.org/project/pybaseball/ # Other Packages considered: # https://pypi.org/project/baseball-scraper/ # https://pypi.org/project/vigorish/ # Useful source to compare packages: # https://snyk.io/advisor/python ############## Statcast data ############## data = statcast(start_dt='2021-04-01', end_dt='2021-04-02', team='SEA') data.loc[data.game_date == '2021-04-01', :] data.head() ############## Team Crosswalk ############# team_cross = teams() team_cross = team_ids() ############# Player Crosswalk ############ player = playerid_lookup('Sheffield', 'Justus') ############## Date Range ################# dt_lst = pd.date_range(start='2021-04-01', end='2021-05-31', freq='D') dt_range = [] for i in dt_lst: dt_range.append(i.strftime('%Y-%m-%d'))