Example #1
def cmd_statcast_echo(start_date, end_date):

    # pull every pitch in the date range and preview the first rows
    data = statcast(start_dt=start_date, end_dt=end_date)

    click.echo(data.head())
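The snippet omits the click wiring and imports it depends on; a minimal sketch of the missing pieces (argument names taken from the signature, everything else assumed):

import click
from pybaseball import statcast

@click.command()
@click.argument('start_date')
@click.argument('end_date')
def cmd_statcast_echo(start_date, end_date):
    # pull every pitch in the date range and preview the first rows
    data = statcast(start_dt=start_date, end_dt=end_date)
    click.echo(data.head())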
Example #2
def update_data(date, team):
    """Load game data using pybaseball's scraper.

    Uses pybaseball.statcast and pybaseball.playerid_reverse_lookup to load
    data from a specific game, and stores it as json in the user's browser
    session in a hidden div.

    """
    print("Loading data from statcast... ")
    raw = statcast(start_dt=date, end_dt=date, team=team)
    raw = raw.rename(columns={"player_name": "pitcher_name"})

    print("Adding batter names... ")
    batter_ids = raw["batter"].unique()
    batters = playerid_reverse_lookup(batter_ids, key_type="mlbam")
    batters["batter_name"] = (batters["name_first"].str.capitalize() + " " +
                              batters["name_last"].str.capitalize())

    raw = raw.merge(
        batters[["key_mlbam", "batter_name"]],
        how="left",
        left_on="batter",
        right_on="key_mlbam",
    )

    print("Done.")

    return raw.to_json(date_format="iso", orient="split")
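The JSON payload can be read back with the same orient; a usage sketch (the date and team values are illustrative):

import io
import pandas as pd

payload = update_data("2019-07-04", "BOS")  # illustrative date/team
df = pd.read_json(io.StringIO(payload), orient="split")  # mirror of to_json(orient="split")
print(df[["pitcher_name", "batter_name"]].head())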
Example #3
    def import_statcast(self,
                        table,
                        start,
                        end,
                        team=None,
                        verbose=True,
                        cache=False):
        if cache:
            from pybaseball import cache
            cache.enable()
        if team is not None:
            data = statcast(start, end, team=team, verbose=verbose)
        else:
            # keyword arg needed: passed positionally, verbose would land in the `team` slot
            data = statcast(start, end, verbose=verbose)

        data.to_sql(table,
                    self.db_connection.get_connection(),
                    if_exists='append')
Example #4
def get_batting_league(
    start_dt=None, end_dt=None, team=None, verbose=True, fname_all=None,
    fname_bb=None, features=[
        'events', 'description', 'batter', 'stand', 'launch_angle',
        'launch_speed', 'hc_x', 'hc_y', 'pitcher', 'p_throws', 'pitch_type',
        'release_speed', 'release_spin_rate'
    ]
):

    """
    Pull league-wide statcast batting data from baseballsavant using pybaseball
    https://github.com/jldbc/pybaseball
    https://baseballsavant.mlb.com/statcast_search

    Arguments
        start_dt: get data from start_dt forward
        end_dt:   get data up to end_dt
        team: search team only ('SEA', 'TEX', etc)
        verbose: display query status messages
        fname_all: export csv of all statcast at bat outcomes to this file
            **must be .csv**
        fname_bb: export csv of all outcomes with a batted ball to this file
            **must be .csv**

    Returns
        (all_outcomes, batted_balls) tuple of dataframes
            Saves to files 'fname_all' and 'fname_bb' if fname is not None.
            Note that all_outcomes DOES NOT include plate appearances that do
            not count as at bats.
            TODO: output truly all outcomes, all at bats, and all batted balls
            as three separate files.
    """

    # get statcast data (this can take a while)
    df = statcast(start_dt, end_dt, team, verbose)
    # discard null events
    all_outcomes = df[df['events'].notnull()]

    # get the specified features only
    all_outcomes = all_outcomes[features]

    if fname_all is not None:
        # export to csv
        all_outcomes.to_csv(fname_all, index=False)
        print('Exported: {}'.format(fname_all))

    # get batted balls only
    batted_balls = filter_batted_balls(all_outcomes)

    if fname_bb is not None:
        # export data
        batted_balls.to_csv(fname_bb, index=False)
        print('Exported: {}'.format(fname_bb))

    return all_outcomes, batted_balls
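A usage sketch with illustrative dates, team, and file names:

all_df, bb_df = get_batting_league(
    start_dt='2021-04-01', end_dt='2021-04-07', team='SEA',
    fname_all='all_outcomes.csv', fname_bb='batted_balls.csv')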
Example #5
def main():
    args = _parse_args()
    start_time = time.time()
    _ = statcast(args.start_date, args.end_date)
    end_time = time.time()
    query_time = end_time - start_time
    threshold_exceeded = query_time > args.time_threshold
    print(
        f"query took {query_time: .1f} seconds (expected less than {args.time_threshold: .1f})"
    )
    sys.exit(int(threshold_exceeded))
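_parse_args is not shown; a minimal argparse sketch consistent with the three fields main() reads (the option name and default are assumptions):

import argparse

def _parse_args():
    # start_date, end_date, and time_threshold are the only fields main() touches
    parser = argparse.ArgumentParser(description="Time a statcast query.")
    parser.add_argument("start_date")
    parser.add_argument("end_date")
    parser.add_argument("--time-threshold", type=float, default=120.0)
    return parser.parse_args()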
Example #6
    def new_statcast(self, current_statcast):
        latest = current_statcast['game_date'].max()
        dates = self.get_dates()

        statbat = pd.DataFrame()
        for date in dates:
            if date > datetime.strftime(latest, "%Y-%m-%d"):
                bat = statcast(date)
                print("\r", date, end="")
                # DataFrame.append was removed in pandas 2.0; use concat instead
                statbat = pd.concat([statbat, bat])

        return statbat
Example #7
def get_statcast_data(start_dt=None, end_dt=None):
    if start_dt is None:
        start_dt = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
    if end_dt is None:
        end_dt = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")

    df = statcast(start_dt=start_dt, end_dt=end_dt)

    # clean up the df
    df = df.drop(["index"], axis=1)
    df.columns = [c.replace(".", "_") for c in df.columns]
    return df
Example #8
def statcast_data(year, end_month_range):
    df_dict = {}
    for i in range(3, end_month_range):
        start_date = datetime.date(year, i, 1).strftime("%Y-%m-%d")
        if year == today.year and i == today.month:
            end_date = today.strftime("%Y-%m-%d")
        else:
            end_date = datetime.date(year, i,
                                     month_end_dates[i]).strftime("%Y-%m-%d")
        try:
            df = statcast(start_date, end_date)
        except Exception:
            # skip failed months instead of re-storing the previous month's frame
            continue
        df_dict[i] = df
    return df_dict
Example #9
def get_statcast_data(directory, year):
    year = str(year)
    data_path = join(directory, 'statcast_data_{}.csv'.format(year))
    sc_data = None
    if not isfile(data_path):
        date_ranges = [('$-01-01', '$-03-31'), ('$-04-01', '$-06-30'),
                       ('$-07-01', '$-09-30'), ('$-10-01', '$-12-31')]
        date_ranges = [(date[0].replace('$', year), date[1].replace('$', year))
                       for date in date_ranges]
        data = []
        for dr in date_ranges:
            data.append(statcast(start_dt=dr[0], end_dt=dr[1]))
        sc_data = pd.concat(data, ignore_index=True)
        sc_data.to_csv(data_path)
    else:
        sc_data = pd.read_csv(data_path, index_col=0)
    return sc_data.copy()
Example #10
def get_statcast(
    filename: str,
    start: str,
    end: str = None,
    fmt: str = "%Y-%m-%d",
    mode="append",
):
    """Download data to an existing parquet file."""
    filename = os.path.expanduser(filename)
    if end is None:
        end = start
    start_dt = datetime.strptime(start, fmt).date()
    end_dt = datetime.strptime(end, fmt).date()

    for dt in daterange(start_dt, end_dt):
        datestr = dt.strftime("%Y-%m-%d")
        print("Downloading data from " + datestr + "... ", end="")

        pdf = statcast(start_dt=datestr, end_dt=datestr)

        if pdf.empty:
            print("No data.")
            continue

        # fix type of string columns with NaNs
        fields = [
            x for x in pdf.columns
            if pdf[x].dtype == "object" and pdf[x].isna().sum() > 0
        ]

        for field in fields:
            pdf[field] = pdf[field].astype(str)

        # convert to spark and save as parquet
        print("Saving as parquet ({})... ".format(mode), end="")
        sdf = get_spark().createDataFrame(pdf)

        sdf.write.parquet(filename, mode=mode, partitionBy="pitcher")

        # overwrite on first pass, append after that
        if mode == "overwrite":
            mode = "append"

        print("Done.")
Example #11
def pull_clean_and_pickle(start_date, end_date, year):

    # Pull in DataFrame from statcast
    df = statcast(start_dt=start_date, end_dt=end_date)

    # Drop deprecated and extraneous features
    df = df.drop(columns=[
        'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated',
        'break_length_deprecated', 'game_type', 'tfs_deprecated',
        'tfs_zulu_deprecated', 'umpire'
    ])

    # Sort pitches chronologically
    df = df.sort_values(
        by=['game_date', 'game_pk', 'at_bat_number', 'pitch_number'])

    # Pickle DataFrame
    df.to_pickle(path=(
        "/Users/mattkirby/repos/RC-v1.0--Data-Gathering/season_pickles/pitches_"
        + year + ".pkl"),
                 compression='zip')
Example #12
def cmd_statcast_upload(start_date, end_date, db_username, db_password,
                        db_hostname, db_name, db_tablename):

    click.echo('[[[ PULLING STATCAST DATAFRAME ]]]')

    statcast_engine = initdb_statcast(db_username, db_password, db_hostname,
                                      db_name, db_tablename)

    click.echo("Breaking up into queries by day to avoid the 30k row cap...")

    for day in pd.date_range(start_date, end_date):
        click.echo(str(day.date()) + ' - Pulling data...')
        try:
            data = statcast(start_dt=str(day.date()), end_dt=str(day.date()))
            upload_block(data, statcast_engine, db_tablename)
        except Exception as exc:
            click.echo(
                str(day.date()) + " - ERROR pulling down data - Error was = " +
                str(exc))
        else:
            click.echo(
                str(day.date()) + ' - SUCCESS, pulled ' + str(data.shape[0]) +
                ' records')
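upload_block is not shown; a plausible sketch given how it is called, assuming a SQLAlchemy engine:

def upload_block(data, engine, tablename):
    # append one day's pull to the target table
    data.to_sql(tablename, engine, if_exists='append', index=False)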
Example #13
    def download_season(self, years):
        """ Downloads the statcast data for a season as a csv. It adds a unique id (uid) for each pitch and then saves the csv in the folder statcast_data/[year].csv.
        
        Parameters
        ----------
        years (str, int, list of strings, list of ints) : the season(s) to download

        Returns
        -------
        None
        """
        dh = DataHandler()
        if not isinstance(years, list):
            years = [years]
        for year in years:
            data = statcast(start_dt=self.season_dates[year]["start"],
                            end_dt=self.season_dates[year]["end"])
            data["index"] = data.apply(lambda row: self._create_uid(row),
                                       axis=1)
            data = data.set_index("index")
            data = data.sort_index().reset_index(drop=False)
            data = dh.set_data_types(data)
            data.to_feather(f"statcast_data/{year}")
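_create_uid is also not shown; a hypothetical implementation that builds a sortable per-pitch id from columns present in every statcast row:

    def _create_uid(self, row):
        # hypothetical scheme: game_pk + at_bat_number + pitch_number
        # uniquely identify a pitch and sort chronologically within a game
        return (f"{int(row['game_pk'])}_"
                f"{int(row['at_bat_number']):03d}_"
                f"{int(row['pitch_number']):02d}")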
Example #14
# Qualified pitcher data and eliminating unneeded rows
df_qualified_pitchers = pd.read_csv(data_dir + '200IPData_Advanced.csv')
df_qualified_pitchers = df_qualified_pitchers[['Name', 'FIP']]
df_qualified_pitchers['Tag'] = np.nan
df_qualified_pitchers.loc[df_qualified_pitchers['FIP'] > 4.69, 'Tag'] = 'Bad'
df_qualified_pitchers.loc[df_qualified_pitchers['FIP'] < 3.78, 'Tag'] = 'Good'
df_qualified_pitchers.loc[(df_qualified_pitchers['FIP'] <= 4.69)
                          & (df_qualified_pitchers['FIP'] >= 3.78), 'Tag'] = 'Average'
pitcher_tag = dict(zip(df_qualified_pitchers.Name, df_qualified_pitchers.Tag))
# print(df_qualified_pitchers.describe())
print("Number of Pitchers in Each Category: \n",
      df_qualified_pitchers.Tag.value_counts())

# Raw play by play data and eliminating non-BBIP events
df_raw = statcast(start_dt='2017-03-01', end_dt='2019-10-01')
df_raw = df_raw[['pitcher', 'batter', 'woba_value', 'woba_denom']]
df_raw = df_raw.dropna()
df_raw['pitcher_name'] = df_raw['pitcher'].map(player_id)
df_raw['batter_name'] = df_raw['batter'].map(player_id)
df_raw['pitcher_tag'] = df_raw['pitcher_name'].map(pitcher_tag)
df_raw['batter_tag'] = df_raw['batter_name'].map(batter_tag)
df_raw = df_raw.dropna()

# Splits data up by match ups, pitcher tendency is listed first
df_G_G = df_raw[(df_raw.pitcher_tag == 'Good') & (df_raw.batter_tag == 'Good')]
df_G_A = df_raw[(df_raw.pitcher_tag == 'Good')
                & (df_raw.batter_tag == 'Average')]
df_G_B = df_raw[(df_raw.pitcher_tag == 'Good') & (df_raw.batter_tag == 'Bad')]
df_A_G = df_raw[(df_raw.pitcher_tag == 'Average')
                & (df_raw.batter_tag == 'Good')]
Example #15
    block_start_date = start_date
    group_start_date = start_date
    group_count = 0
    group_data = pd.DataFrame()

    while end_date >= block_start_date:

        block_end_date = block_start_date + timedelta(days=block_days - 1)
        if block_end_date > end_date:
            block_end_date = end_date

        print('Getting data for %s to %s' %
              (block_start_date.strftime(date_format),
               block_end_date.strftime(date_format)))

        block_data = statcast(start_dt=block_start_date.strftime(date_format),
                              end_dt=block_end_date.strftime(date_format))

        block_start_date = block_end_date + timedelta(days=1)

        if len(block_data):
            group_data = pd.concat((group_data, block_data))
            group_count += 1
            if group_count == group_blocks:
                group_filename = '%s_%s.csv' % (group_start_date.strftime(
                    date_format), block_end_date.strftime(date_format))
                group_data.to_csv(os.path.join(data_path, group_filename))

                group_start_date = block_start_date
                group_count = 0
                group_data = pd.DataFrame()
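The fragment ends without writing out a final partial group; a sketch of the missing tail, reusing the snippet's own names:

    # flush any remaining partial group once the loop ends
    if group_count:
        group_filename = '%s_%s.csv' % (group_start_date.strftime(date_format),
                                        block_end_date.strftime(date_format))
        group_data.to_csv(os.path.join(data_path, group_filename))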
Example #16
from pybaseball import statcast
import pandas as pd

# Scraping Statcast Data With Python
called_pitches = statcast(start_dt='2016-03-15', end_dt='2016-05-02')
called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']]
called_pitches.to_csv('statcast-all-2016-april.csv')
called_pitches = ""

# Scraping Statcast Data With Python
called_pitches = statcast(start_dt='2016-05-02', end_dt='2016-06-01')
called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']]
called_pitches.to_csv('statcast-all-2016-may.csv')
called_pitches = ""

# Scraping Statcast Data With Python
called_pitches = statcast(start_dt='2016-06-02', end_dt='2016-07-01')
called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']]
called_pitches.to_csv('statcast-all-2016-june.csv')
called_pitches = ""

# Scraping Statcast Data With Python
called_pitches = statcast(start_dt='2016-07-02', end_dt='2016-08-01')
called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']]
called_pitches.to_csv('statcast-all-2016-july.csv')
called_pitches = ""

# Scraping Statcast Data With Python
called_pitches = statcast(start_dt='2016-08-02', end_dt='2016-09-01')
called_pitches = called_pitches[['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x', 'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']]
called_pitches.to_csv('statcast-all-2016-august.csv')
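The five blocks differ only in date range and output name; an equivalent loop (same columns and ranges), which also makes the `called_pitches = ""` resets unnecessary:

ranges = [
    ('2016-03-15', '2016-05-02', 'april'),
    ('2016-05-02', '2016-06-01', 'may'),
    ('2016-06-02', '2016-07-01', 'june'),
    ('2016-07-02', '2016-08-01', 'july'),
    ('2016-08-02', '2016-09-01', 'august'),
]
cols = ['game_date', 'description', 'stand', 'balls', 'strikes', 'plate_x',
        'plate_z', 'sz_top', 'sz_bot', 'inning', 'player_name', 'game_pk']
for start, end, label in ranges:
    pitches = statcast(start_dt=start, end_dt=end)
    pitches[cols].to_csv('statcast-all-2016-{}.csv'.format(label))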
Example #17
def pull_statcast_data(start_date, end_date, year):
    """
    Date Format: YYYY-MM-DD
    """
    df = statcast(start_dt=start_date, end_dt=end_date)
    return df
Example #18
def getInfo(input_file):
    print("In GET INFO")

    data = statcast(start_dt='2017-03-28', end_dt='2017-10-01')  # zero-padded YYYY-MM-DD
    # print(data.head(data.size))
    data.to_csv(input_file)
Example #19
import pandas as pd

# people from: https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv
# sc from:
from pybaseball import statcast
sc = statcast(start_dt='2012-01-01', end_dt='2020-01-01')
sc.to_parquet("statcast_dump.parquet", engine="fastparquet")
# full statcast.csv.gzip file available upon request.

people = pd.read_csv("people.csv")
# sc = pd.read_parquet("statcast_dump.parquet", engine="fastparquet")
people["batter_name"] = people.name_first + " " + people.name_last
merged = pd.merge(
    sc,
    people.loc[:, ["key_mlbam", "batter_name"]],
    how="left",
    left_on="batter",
    right_on="key_mlbam",
)
cols2keep = [
    "player_name",
    "batter_name",
    "pitch_type",
    "game_date",
    "release_speed",
    "events",
    "launch_speed",
    "woba_value",
    "bb_type",
    "balls",
    "strikes",
Example #20
    def stream_data(self, date):
        '''Appends new MLB statcast data for a specified date. Designed to
        be used as a streaming pipeline for new daily data during the season.

        Once the database is built, this method can be used to add new data
        to the RAW and WRK tables. This method should be used while scheduling
        a task to send new daily data through the pipeline during the season.

        Args:
            date (str): YYYY-MM-DD date to pull and load

        Returns:
            None

        Raises:
            No exceptions
        '''
        logging.basicConfig(
            filename=f'{self.parent_path}/logs/stream_data.log',
            filemode='w',
            format='%(asctime)s - %(levelname)s - %(message)s',
            datefmt='%d-%b-%y %H:%M:%S',
            level=logging.INFO)

        startTime = datetime.now()

        self.dbtype = self.lst_config[0]['Database_System']['db_type']
        self.connect_db()
        try:
            self.engine.connect()
            print(
                f'Connected to: \nServer: {self.server}\nDatabase: {self.database}'
            )
            logging.info(
                f'\nConnection Successful \nServer: {self.server}\nDatabase: {self.database}'
            )
        except Exception as e:
            print(f'Unable to connect to Server: {self.server}')
            logging.exception('Exception occurred')

        try:
            self.df = pd.DataFrame(
                statcast(start_dt=f'{date}', end_dt=f'{date}'))
        except Exception as e:
            logging.exception("Exception occurred")

        self.df = self.df.replace({np.nan: None})
        self.dct_parks = dict(
            zip(self.team_atts['Team'],
                self.team_atts[f'Ball_Park_{int(self.df.game_year.loc[1])}']))

        if self.df.empty:
            print(f'{date}: NO DATA FOR THIS DATE')
            logging.warning(f'{date}: NO DATA FOR THIS DATE')
        else:
            print('............')
            print(f'|{date}|')
            print('............')
            print(f'Completed: RAW data imported from Baseball Savant')
            logging.info(f'{date}: RAW data imported from Baseball Savant')
            try:
                self.df.drop(['pitcher.1', 'fielder_2.1'],
                             axis=1,
                             inplace=True)
                self.df.to_sql(f'raw_statcast_{int(self.df.game_year.loc[1])}',
                               self.engine,
                               index=False,
                               if_exists='append')
                print(
                    f'Completed: Raw data inserted into DB: STATCAST , TABLE: raw_statcast_{int(self.df.game_year.loc[1])}'
                )
                logging.info(
                    f'{date}: Raw data inserted into DB: STATCAST , TABLE: raw_statcast_{int(self.df.game_year.loc[1])}'
                )
            except Exception as e:
                logging.exception("Exception occurred")
            try:
                self.reorder_columns()
                self.rename_columns()
                self.add_batter_name()
                self.add_player_names()
                self.add_teams()
                self.add_lg_div()
                self.add_ballparks(self.df['Game_Year'][0])
                self.stream_pos_bop_dct(date)
                self.add_batter_pos()
                self.add_batter_bop()
                print(f'Completed: Data transformation')
                logging.info(f'{date}: Data transformation complete')
            except Exception:
                logging.exception("Exception occurred")

            try:
                self.df.to_sql(f'wrk_statcast_{int(self.df.Game_Year.loc[1])}',
                               self.engine,
                               index=False,
                               if_exists='append')
                print(
                    f'Completed: Working data inserted into DB: STATCAST , TABLE: wrk_statcast_{int(self.df.Game_Year.loc[1])}\n'
                )
                logging.info(
                    f'''{date}: Working data inserted into DB: STATCAST , TABLE:
                              wrk_statcast_{int(self.df.Game_Year.loc[1])}\nTime Elapsed: {datetime.now() - startTime}'''
                )
            except Exception as e:
                logging.exception("Exception occurred")

        print(f'--------{date} Complete--------')
        print(f'Total Time Elapsed: {datetime.now() - startTime}')
        logging.info(
            f'--------{date} Complete--------\nTotal Time Elapsed: {datetime.now() - startTime}'
        )
Example #21
    # inputs are valid if either both or zero dates are supplied; not valid if only one is given.

    if start_dt and end_dt:
        # how many days worth of data are needed?
        date_format = "%Y-%m-%d"
        d1 = datetime.datetime.strptime(start_dt, date_format)
        d2 = datetime.datetime.strptime(end_dt, date_format)
        days_in_query = (d2 - d1).days
        if days_in_query <= small_query_threshold:
            data = small_request(start_dt, end_dt)
        else:
            data = large_request(start_dt, end_dt, d1, d2, step=small_query_threshold, verbose=verbose)

        data = postprocessing(data, team)
        return data




start_dates = ['2012-03-25', '2012-07-02', '2013-03-25', '2013-07-02', '2014-03-25', '2014-07-02', '2015-03-25','2015-07-02', '2016-03-25','2016-07-02',  '2017-03-25', '2017-07-02']
end_dates = ['2012-07-01', '2012-11-01', '2013-07-01', '2013-11-01', '2014-07-01', '2014-11-01', '2015-07-01', '2015-11-01', '2016-07-01', '2016-11-01', '2017-07-01', '2017-11-01']

for i in range(len(start_dates)) :
    start = start_dates[i]
    end = end_dates[i]
    df = statcast(start_dt=start, end_dt=end)
    df = df[['pitcher', 'player_name', 'pitch_type', 'pitch_name', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_y', 'release_pos_z', 'spin_dir', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'vx0', 'vy0', 'vz0',
             'ax', 'ay', 'az']]

    df.to_csv('C:/Users/Colin/Documents/GitHub/baseball-injuries/datasets/statcast_{0}_{1}.csv'.format(start, end))
Example #22
#imports
from pybaseball import statcast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# collect Statcast data on all pitches from May 1-3, 2017
data = statcast('2017-05-01', '2017-05-03')
print(data.shape)

data2 = data.dropna(
    subset=['launch_angle', 'launch_speed', 'estimated_ba_using_speedangle'])
data2.shape
fig, ax = plt.subplots(figsize=(8, 8))
sns.despine(fig, left=True, bottom=True)
sns.scatterplot(x="launch_speed",
                y="launch_angle",
                hue="estimated_ba_using_speedangle",
                palette='viridis',
                data=data2,
                ax=ax)
ax.set_title("Hit probability by Launch Angle and Exit Velocity")

data2['hr'] = data2.events == 'home_run'

fig, ax = plt.subplots(figsize=(8, 8))
sns.despine(fig, left=True, bottom=True)
sns.scatterplot(x="launch_speed",
Example #23
You can see from the pairplot below that stolen bases do not correlate well with hits, home runs, or at bats.

"""
batting_copy = batting[['AB', 'H', 'HR', 'SB']]
sns.pairplot(batting_copy)
plt.show()
"""
Strike Zone Application

Now that we've covered the basics of plotting, let's apply this to an example where we plot the strike zone and pitch locations. Essentially we are taking the
variables plate_x and plate_z and plotting them. You can imagine the plot as if you are looking at the pitcher from home plate, with each pitch plotted where
it crossed. To do this we need to create our strike zone, and it is also helpful to map each pitch type to a color so we can see whether location changes
depending on the pitch type.
"""

statcast = pyb.statcast(start_dt='2020-09-22', end_dt='2020-09-23')
print('{} Rows imported'.format(str(len(statcast))))


def plotPitches(plotdf):
    strikezone = plt.Rectangle((-.95, 1.6), 1.65, 1.8, color='red', fill=False)
    groups = plotdf.groupby("pitch_name")
    for name, group in groups:
        plt.plot(group["plate_x"],
                 group["plate_z"],
                 marker="o",
                 linestyle="",
                 label=name,
                 alpha=.55)

    plt.ylim((0, 5))
Example #24
def query_statcast(
        start_dt: Param(help="Beginning date to pull data from", type=str) = None,
        end_dt: Param(help="End date to pull data from", type=str) = None,
        team: Param(help="Abbreviation for team of interest", type=str) = None,
        verbose: Param(help="Whether or not to print verbose updates", type=bool_arg) = True,
        output_type: Param(help="What format to save data in", type=str) = "db",
        overwrite: Param(help="Whether or not to overwrite the db table if it already exists", type=bool_arg,) = False,
        output_path: Param(help="path to location that data should be saved", type=str) = "."):
    """
    Callable from the command-line or in Python. Pulls pitch-level MLB data from [statcast](https://baseballsavant.mlb.com/statcast_search).
    Saves as either a sqlite db file, or csv.

    * inputs:
        - `start_dt`: `str`, Beginning date to pull data from = None
        - `end_dt`: `str`, End date to pull data from = None
        - `team`: `str`, abbreviation for team of interest = None
        - `verbose`: `bool`, Whether or not to print verbose updates
        - `output_type`: `str`, What format to save data in (must be one of {'db', 'csv'}) = 'db'
        - `overwrite`: `bool`, Whether or not to overwrite the db table if it already exists = False
        - `output_path`: `str`, Path to the location that the data should be saved at = '.'

    * outputs:
        - None
    """
    # checking for correct output type
    if output_type not in ("db", "csv"):
        raise ValueError("output_type must be one of {'db', 'csv'}")

    if output_type == "db":
        # creating db connection
        conn = sqlite3.connect(f"{output_path}/statcast_pitches.db")

        # Checking if year is already in db
        cursor = conn.execute(f"select name from sqlite_master where type='table' and name='statcast_{start_dt[:4]}'")

        # if table exists in db
        if cursor.fetchone():
            if overwrite:
                conn.execute(f"DROP TABLE IF EXISTS statcast_{start_dt[:4]}")
            else:
                # don't want to overwrite, pop out of function
                print(f"Table named 'statcast_{start_dt[:4]}' already exists in db saved at `{output_path}/statcast_{start_dt[:4]}`.")
                return None

        # if table does not already exist in db or it was just dropped
        # pulling data from statcast
        data = statcast(start_dt=start_dt, end_dt=end_dt, team=team, verbose=verbose)
        data.to_sql(f"statcast_{start_dt[:4]}", conn)
        conn.close()

    # output type is csv
    else:
        # Checking if file is already saved as csv
        if path.exists(f"{output_path}/statcast_{start_dt[:4]}.csv"):
            print(f"File named `{output_path}/statcast_{start_dt[:4]}.csv` already exists.")
            return None

        # pulling data from statcast
        data = statcast(start_dt=start_dt, end_dt=end_dt, team=team, verbose=verbose)

        # saving to csv
        data.to_csv(f"{output_path}/statcast_{start_dt[:4]}.csv", index=False)

    return None
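A usage sketch with illustrative dates, saving one month of pitches to csv:

# illustrative dates; writes ./statcast_2019.csv
query_statcast(start_dt="2019-06-01", end_dt="2019-06-30", output_type="csv")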
Example #25
def main():

    pth = "/Users/irarickman/Google Drive/Data Science/Projects/MLB Projections/Batting Average"
    td = format(datetime.today(), "%Y-%m-%d")
    old_data = pd.read_pickle(pth + "/lastabs.pkl")
    old_data.game_date = pd.to_datetime(old_data.game_date,
                                        infer_datetime_format=True)
    prev_date = old_data.game_date.max()
    od = format(prev_date, "%Y-%m-%d")
    if od != td:
        new_d = statcast(od, td)
        new_data = new_d[new_d.events.notnull()]
        players_ids = playerid_reverse_lookup(new_data.batter.unique())
        id_df = players_ids[['name_last', 'name_first', 'key_mlbam']]
        new_names = new_data.merge(id_df,
                                   how='left',
                                   left_on='batter',
                                   right_on='key_mlbam')
        df = pd.concat([old_data, new_names])
    else:
        df = old_data
    df.drop_duplicates(inplace=True)
    df.to_pickle(pth + "/lastabs.pkl")
    df['hit'] = df.events.apply(
        lambda x: 1 if x in ["single", 'double', 'home_run', 'triple'] else 0)
    df['ab'] = df.events.apply(lambda x: 0 if x in [
        'walk', 'hit_by_pitch', "caught_stealing_2b",
        "pickoff_caught_stealing_2b", 'pickoff_1b', 'catcher_interf',
        'pickoff_caught_stealing_3b', 'pickoff_2b',
        'pickoff_caught_stealing_home', 'caught_stealing_3b',
        'caught_stealing_home', "sac_fly", 'sac_bunt', 'sac_fly_double_play',
        'sac_bunt_double_play'
    ] else 1)
    df['player_team'] = df.apply(lambda x: x.home_team
                                 if x.inning_topbot == "Bot" else x.away_team,
                                 axis=1)
    df['Opp'] = df.apply(lambda x: x.away_team
                         if x.inning_topbot == "Bot" else x.home_team,
                         axis=1)
    df['Place'] = df.apply(lambda x: "Home"
                           if x.inning_topbot == "Bot" else "Away",
                           axis=1)
    teams = df.player_team.unique()
    fixers = {"WSH": "WSN", "CWS": "CHW"}
    teams_fixed = [x if x not in fixers.keys() else fixers[x] for x in teams]

    team_schedule = {}
    missed = []
    for t in teams_fixed:
        try:
            d = schedule_and_record(2018, t)
            d['fix_date'] = d.Date.str.replace(r"\(\d\)", "",
                                               regex=True).str.strip() + " 2018"
            d['game_date'] = pd.to_datetime(d.fix_date.apply(
                lambda x: datetime.strptime(x, "%A, %b %d %Y")).apply(
                    lambda x: x.strftime("%Y-%m-%d")),
                                            infer_datetime_format=True)
            d['Place'] = d.Home_Away.apply(lambda x: "Home"
                                           if x == "Home" else "Away")
            d2 = d[d.game_date >= datetime.today()][[
                'Place', "Opp", "game_date"
            ]]
            team_schedule[t] = d2
        except ValueError:
            print(t)
            missed.append(t)

    df['name_last'] = df['name_last'].str.capitalize()
    df['name_first'] = df['name_first'].str.capitalize()
    df['player_name'] = df.name_first + " " + df.name_last
    sm_df = df[[
        'game_date', 'game_pk', 'hit', 'ab', 'Opp', 'Place', 'player_name',
        'player_team', 'key_mlbam'
    ]]
    sm_df.sort_values(['player_name', 'game_date', 'game_pk'], inplace=True)
    trim_df = sm_df.groupby([
        'player_name', 'game_date', 'game_pk', 'Opp', 'Place', 'player_team',
        'key_mlbam'
    ]).sum().reset_index()

    def player_df(player, d=trim_df):
        temp = d[d.player_name == player]
        temp = temp.sort_values(['game_date']).reset_index(drop=True)
        tm = temp.loc[len(temp) - 1, 'player_team']
        if tm in fixers.keys():
            sched = team_schedule[fixers[tm]]
        else:
            sched = team_schedule[tm]
        tdf = pd.concat([temp, sched])
        tdf.ab.fillna(0, inplace=True)
        tdf.hit.fillna(0, inplace=True)
        tdf.player_name.fillna(player, inplace=True)
        tdf.player_team.fillna(tm, inplace=True)
        return tdf

    master_df = player_df(trim_df.player_name.unique()[0])
    for p in trim_df.player_name.unique()[1:]:
        got = player_df(p)
        master_df = pd.concat([master_df, got])
    master_df.game_date = master_df.game_date.apply(
        lambda x: format(x, "%Y-%m-%d"))

    ## now write to the google sheet

    # authorization
    gc = pygsheets.authorize(outh_file='/Users/irarickman/client_secret.json')
    mlb = 'MLB At Bats'
    sh = gc.open(mlb)

    #select the first sheet
    wks = sh[0]

    wks.set_dataframe(master_df, (1, 1))
Example #26
    def build_db(self):
        '''Builds MLB Statcast tables within a pre-defined SQL database.

        Through a pipeline, this method will build tables containing all available
        MLB statcast data. It will use the SQL server and database
        information provided by the user within the GUI prompt that pops up upon
        calling the method. Each year will have a RAW table and a WRK
        table created. The RAW data is the untouched data from Baseball Savant,
        while the WRK table is the organized data with an increased level of detail.
        This method will likely take several hours to complete. There is a log file
        that is created to track the progress. The data tables are created day-by-day,
        with each day appended. This should limit memory usage.

        Args:
            No arguments

        Returns:
            Does not return a parameter

        Raises:
            Exception: if the server information is not correct or does not exist

        '''
        startTime = datetime.now()

        logging.basicConfig(filename=f'{self.parent_path}/logs/build_db.log',
                            filemode='w',
                            format='%(asctime)s - %(levelname)s - %(message)s',
                            datefmt='%d-%b-%y %H:%M:%S',
                            level=logging.INFO)

        try:
            self.prompt_window()
            logging.info(
                f'Connected to: \nServer: {self.server}\nDatabase: {self.database}'
            )
        except Exception as e:
            logging.exception('Exception occurred')

        lst_year = ['2016', '2017', '2018', '2019']
        lst_month = ['03', '09']
        lst_day_31 = ['01', '31']
        lst_day_30 = ['01', '30']

        #         lst_year =  list(reversed([str(i) for i in range(2008,2021)]))
        #         lst_month = ['0'+str(i) for i in range(3,10)] + [str(i) for i in range(10,12)]
        #         lst_day_31 = ['0'+str(i) for i in range(1,10)] + [str(i) for i in range(10,32)]
        #         lst_day_30 = ['0'+str(i) for i in range(1,10)] + [str(i) for i in range(10,31)]

        for y in range(0, len(lst_year)):
            self.dct_pos = ujson.load(
                open(f'{self.parent_path}/dicts/dct_pos_{lst_year[y]}.txt'))
            self.dct_bop = ujson.load(
                open(f'{self.parent_path}/dicts/dct_bop_{lst_year[y]}.txt'))
            self.dct_ss = ujson.load(
                open(f'{self.parent_path}/dicts/dct_ss_{lst_year[y]}.txt'))
            self.dct_parks = dict(
                zip(self.team_atts['Team'],
                    self.team_atts[f'Ball_Park_{lst_year[y]}']))

            for m in range(0, len(lst_month)):
                if any(lst_month[m] == x
                       for x in ['03', '05', '07', '08', '10']):
                    lst_day = lst_day_31
                else:
                    lst_day = lst_day_30
                for d in range(0, len(lst_day)):

                    try:
                        self.df = pd.DataFrame(
                            statcast(
                                start_dt=
                                f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}',
                                end_dt=
                                f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}'))
                    except Exception as e:
                        logging.exception("Exception occurred")

                    self.df = self.df.replace({np.nan: None})

                    if self.df.empty:
                        print(
                            f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: NO DATA FOR THIS DATE'
                        )
                        logging.warning(
                            f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: NO DATA FOR THIS DATE'
                        )
                    else:
                        print('............')
                        print(f'|{lst_year[y]}/{lst_month[m]}/{lst_day[d]}|')
                        print('............')
                        print(
                            f'Completed: RAW data imported from Baseball Savant'
                        )
                        logging.info(
                            f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: RAW data imported from Baseball Savant'
                        )

                        try:
                            self.df.drop([
                                'pitcher.1', 'fielder_2.1', 'post_away_score',
                                'post_home_score', 'post_bat_score',
                                'post_fld_score'
                            ],
                                         axis=1,
                                         inplace=True)
                            self.df.to_sql(f'raw_statcast_{lst_year[y]}',
                                           self.engine,
                                           index=False,
                                           if_exists='append')
                            print(
                                f'Completed: Raw data inserted into DB: STATCAST , TABLE: raw_statcast_{lst_year[y]}'
                            )
                            logging.info(
                                f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: Raw data inserted into DB: STATCAST , TABLE: raw_statcast_{lst_year[y]}'
                            )
                        except Exception as e:
                            logging.exception("Exception occurred")

                        try:
                            self.reorder_columns()
                            self.rename_columns()
                            self.add_batter_name()
                            self.add_player_names()
                            self.add_teams()
                            self.add_lg_div()
                            self.add_ballparks(lst_year[y])
                            self.add_batter_pos()
                            self.add_batter_bop()
                            self.add_batter_ss()
                            self.add_post_scores()
                            self.add_cnt_pa()
                            self.add_cnt_ab()
                            self.add_cnt_hit()
                            self.add_cnt_single()
                            self.add_cnt_double()
                            self.add_cnt_triple()
                            self.add_cnt_home_run()
                            self.add_cnt_k()
                            self.add_cnt_backwardsk()
                            self.add_cnt_walk()
                            self.add_cnt_hbp()
                            self.add_cnt_rbi()
                            self.add_yahoo_pnts()

                            print(
                                f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: Data transformation complete'
                            )
                            logging.info(
                                f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: Data transformation complete'
                            )
                        except Exception as e:
                            logging.exception('Exception occurred')

                        try:
                            self.df.to_sql(f'wrk_statcast_{lst_year[y]}',
                                           self.engine,
                                           index=False,
                                           if_exists='append')
                            print(
                                f'Completed: Working data inserted into DB: STATCAST , TABLE: wrk_statcast_{lst_year[y]}'
                            )
                            print(
                                f'Time Elapsed: {datetime.now() - startTime}\n'
                            )
                            logging.info(
                                f'{lst_year[y]}-{lst_month[m]}-{lst_day[d]}: Working data inserted into DB: STATCAST , TABLE: wrk_statcast_{lst_year[y]}\nTime Elapsed: {datetime.now() - startTime}'
                            )
                        except Exception as e:
                            logging.exception("Exception occurred")

        print('\n\n--------All Dates Complete--------')
        print(f'Total Time Elapsed: {datetime.now() - startTime}')
        logging.info(
            f'--------All Dates Complete--------\nTotal Time Elapsed: {datetime.now() - startTime}'
        )
Example #27
    except:
        print(team, "didn't work")
        err = np.append(err, team)

# In[43]:

results

# There are only 19 teams in this array, there should be 30

# In[69]:

from pybaseball import statcast

# get all statcast data for July 4th, 2017
data = statcast('2017-07-04')

#get data for the first seven days of August in 2016
data = statcast('2016-08-01', '2016-08-07')

# In[70]:

data.columns.tolist()

# In[71]:

data = data[data['launch_angle'].notna()]
data = data[data['launch_speed'].notna()]

# In[76]:
Example #28
               ncol=2,
               prop={'size': 8})
    plt.show()


def dataFormatPlayer(player, data):
    df = data[[
        'pitch_type', 'release_speed', 'player_name', 'events',
        'release_spin_rate', 'release_pos_x', 'release_pos_y'
    ]]
    df = df[::-1]
    df = df.loc[(data['player_name'] == player), [
        'pitch_type', 'release_speed', 'player_name', 'events',
        'release_spin_rate', 'release_pos_x', 'release_pos_y'
    ]]
    df = df.reset_index(drop=True)
    df.index = df.index + 1
    return df


if __name__ == '__main__':

    day = '2019-07-03'
    data = statcast(start_dt=day, team='CHC')
    for player in data.player_name.unique():
        spin_rate_scatter(player, dataFormatPlayer(player, data), day)
    print_full(data.events.unique())
    # spin_rate_scatter(player, dataFormatPlayer(player, data), day)
    # spin_rate_scatter(player, dataFormatPlayer(player, data), day)
    #date = ['2019-05-01', '2019-04-29', '2019-04-27', '2019-04-23', '2019-04-22', '2019-03-20']
Example #29
import pandas as pd
from pybaseball import statcast

# Data: Statcast
# Overview: https://www.mlb.com/glossary/statcast
# Package to pull data:
# https://pypi.org/project/pybaseball/
# Other Packages considered:
# https://pypi.org/project/baseball-scraper/
# https://pypi.org/project/vigorish/
# Useful source to compare packages:
# https://snyk.io/advisor/python

data = statcast(start_dt='2017-06-24', end_dt='2017-06-24')
Example #30
# baseball-scraper package
from baseball_scraper import espn
# imports needed by the calls below; statcast and playerid_lookup are assumed
# to come from pybaseball (per the comments), teams()/team_ids() are not shown
import pandas as pd
from pybaseball import statcast, playerid_lookup

# Data: Statcast
# Overview: https://www.mlb.com/glossary/statcast
# Package to pull data:
# https://pypi.org/project/pybaseball/
# Other Packages considered:
# https://pypi.org/project/baseball-scraper/
# https://pypi.org/project/vigorish/
# Useful source to compare packages:
# https://snyk.io/advisor/python

############## Statcast data ##############
data = statcast(start_dt='2021-04-01', end_dt='2021-04-02', team='SEA')
data.loc[data.game_date == '2021-04-01', :]
data.head()

############## Team Crosswalk #############
team_cross = teams()
team_cross = team_ids()

############# Player Crosswalk ############
player = playerid_lookup('Sheffield', 'Justus')

############## Date Range #################
dt_lst = pd.date_range(start='2021-04-01', end='2021-05-31', freq='D')
dt_range = []
for i in dt_lst:
    dt_range.append(i.strftime('%Y-%m-%d'))
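The trailing loop can be collapsed into a comprehension; an equivalent one-liner:

dt_range = [d.strftime('%Y-%m-%d') for d in dt_lst]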