Esempio n. 1
0
def process_data(time, firstname, lastname, pos):
    """Fetch Statcast data for one player, either as a batter or a pitcher.

    Args:
        time: A single date string, or a range string. Any value containing
            "to" is treated as a range whose first 10 characters are the
            start date and whose characters from index 14 onwards are the
            end date.
        firstname: First name handed to ``playerid_lookup``.
        lastname: Last name handed to ``playerid_lookup``.
        pos: ``'batter'`` or ``'pitcher'``.

    Returns:
        The result of ``statcast_batter`` / ``statcast_pitcher`` for the
        first ``key_mlbam`` in the lookup result, or ``None`` when ``pos``
        is neither ``'batter'`` nor ``'pitcher'``.
    """
    playerid = playerid_lookup(lastname, firstname)
    is_range = "to" in time

    if pos == 'batter':
        fetch = statcast_batter
    elif pos == 'pitcher':
        fetch = statcast_pitcher
    else:
        return None

    # A range is split by fixed offsets; a single date is passed through.
    date_args = (time[0:10], time[14:]) if is_range else (time,)
    return fetch(*date_args, player_id=int(playerid['key_mlbam'][0]))
Esempio n. 2
0
def get_data(first_name, last_name, start_date, end_date):
    """Return a pitcher's Statcast pitches with display-friendly column names.

    Args:
        first_name: Pitcher's first name, passed to ``pb.playerid_lookup``.
        last_name: Pitcher's last name, passed to ``pb.playerid_lookup``.
        start_date: Start date forwarded to ``pb.statcast_pitcher``.
        end_date: End date forwarded to ``pb.statcast_pitcher``.

    Returns:
        A DataFrame sorted by ``pitch_number``, without rows that are null in
        ``pitch_type``, ``des``, ``description`` or ``release_spin_rate``,
        with an added 0-based running counter and several columns renamed
        (see the mapping at the end of the function).

    Raises:
        ValueError: If the lookup yields no ``key_mlbam`` for the name.
    """
    try:
        # First matching row's MLBAM id identifies the pitcher.
        key = pb.playerid_lookup(last_name, first_name)["key_mlbam"].values[0]
    except (IndexError, KeyError) as err:
        # Previously swallowed by a bare ``except``, which left ``key``
        # unbound and failed later with an unrelated UnboundLocalError.
        raise ValueError(
            f"no MLBAM id found for {first_name} {last_name}") from err

    data = pb.statcast_pitcher(start_date, end_date, key)
    data = data.sort_values(["pitch_number"])
    data = data.dropna(
        subset=["pitch_type", "des", "description", "release_spin_rate"])

    # Positional counter over the rows that survived the filtering above.
    data["order"] = range(len(data))

    return data.rename(
        {
            "des": "Play by Play",
            "description": "Result of Pitch",
            "order": "Pitch Number",
            "pitch_name": "Pitch Type",
            "release_speed": "Pitch Speed",
        },
        axis=1,
    )
Esempio n. 3
0
def get_atbats(first, last):
    """Build an ``AtBat`` object for every strikeout row of a pitcher.

    The pitcher is taken from the first row of ``playerid_lookup(last,
    first)``. Data is requested from 1 January of ``mlb_played_first`` to
    31 December of ``mlb_played_last``, except that a last year of 2019 is
    replaced by 2018.

    Returns:
        A tuple ``(at_bats, ab_arrays)``: the ``AtBat`` objects and, in the
        same order, each object's ``np`` attribute.
    """
    lookup = playerid_lookup(last, first)
    # Only the first lookup row is used.
    mlbam_id = lookup["key_mlbam"].iloc[0]
    first_season = int(lookup["mlb_played_first"].iloc[0])
    last_season = int(lookup["mlb_played_last"].iloc[0])
    # 2019 is deliberately excluded.
    last_season = 2018 if last_season == 2019 else last_season

    start_date = "{0}-01-01".format(first_season)
    end_date = "{0}-12-31".format(last_season)
    print("Scraping from {0} to {1}".format(start_date, end_date))

    d_all_stats = statcast_pitcher(start_date, end_date, mlbam_id)
    d_features = d_all_stats[features]

    # Index labels of the rows whose event is a strikeout.
    is_strikeout = d_all_stats["events"] == "strikeout"
    strikeout_labels = d_all_stats.index[is_strikeout].to_list()

    at_bats = [AtBat(d_features, label) for label in strikeout_labels]
    ab_arrays = [ab.np for ab in at_bats]
    return at_bats, ab_arrays
Esempio n. 4
0
def statcastData(pitcherId, stats, dateRange):
    """Return the requested Statcast columns for one pitcher.

    Args:
        pitcherId: Player id forwarded to ``bball.statcast_pitcher``.
        stats: Column label(s) to select from the result.
        dateRange: Indexable whose items 0 and 1 are the start and end date.

    Returns:
        The fetched data restricted to ``stats``.

    Raises:
        ValueError: If ``pitcherId`` is ``None``.
    """
    if pitcherId is None:
        # ValueError is still an ``Exception`` subclass, so callers that
        # caught the previous bare ``Exception`` keep working.
        raise ValueError("pitcherId must not be None")
    data = bball.statcast_pitcher(dateRange[0], dateRange[1], pitcherId)
    statcastDF = pd.DataFrame(data)
    return statcastDF[stats]
Esempio n. 5
0
def pand(df):
    """Fetch and stack Statcast pitcher data for every row of ``df``.

    For each index label of ``df`` the value ``df['key_mlbam'][label]`` is
    used as ``player_id`` in a ``statcast_pitcher`` call covering
    2018-02-01 to 2018-12-01. The per-player frames are concatenated in
    row order and returned.
    """
    frames = [
        statcast_pitcher('2018-02-01',
                         '2018-12-01',
                         player_id=df['key_mlbam'][label])
        for label in df.index
    ]
    return pd.concat(frames)
Esempio n. 6
0
def dataGrab(number, start, end):
    """Fetch a pitcher's Statcast data and keep a fixed set of columns.

    Args:
        number: Player id passed as ``player_id``.
        start: Passed as ``start_dt``.
        end: Passed as ``end_dt``.

    Returns:
        The selected columns with the index replaced by 0..n-1.
    """
    wanted = [
        'pitch_type', 'release_speed', 'effective_speed', 'release_pos_x',
        'plate_x', 'release_pos_z', 'plate_z', 'release_extension', 'zone',
        'launch_speed', 'launch_angle', 'estimated_woba_using_speedangle',
    ]
    fetched = statcast_pitcher(start_dt=start, end_dt=end, player_id=number)
    subset = fetched[wanted]
    # Renumber the rows consecutively from zero.
    subset.index = range(len(subset))
    return subset
Esempio n. 7
0
def import_data(number, start, end):
    """Fetch a pitcher's Statcast data and keep a fixed set of columns.

    Args:
        number: Player id passed as ``player_id``.
        start: Passed as ``start_dt``.
        end: Passed as ``end_dt``.

    Returns:
        The selected columns with the index replaced by 0..n-1.
    """
    wanted = [
        'pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z',
        'pfx_x', 'pfx_z', 'release_spin_rate', 'plate_x', 'plate_z',
        'estimated_woba_using_speedangle', 'woba_value', 'description',
        'launch_speed_angle', 'launch_angle', 'launch_speed', 'bb_type',
        'effective_speed', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
        'release_extension',
    ]
    fetched = statcast_pitcher(start_dt=start, end_dt=end, player_id=number)
    subset = fetched[wanted]
    # Renumber the rows consecutively from zero.
    subset.index = range(len(subset))
    return subset
Esempio n. 8
0
def collect_statcast(sample_size, target, features, pitcher_names):
    """Scrape a fixed-size Statcast sample for each pitcher.

    Arguments:
        sample_size {int} -- the number of pitches to sample for each pitcher
        target {list} -- values of the 'description' column to keep
        features {list} -- the columns to keep in the resulting data
        pitcher_names {list} -- one sequence of name parts per pitcher; the
            first part is used as the first name and the remaining parts,
            joined by spaces, as the last name

    Returns:
        pandas dataframe -- one row per sampled pitch, restricted to the
        columns in 'features'. Empty (but with those columns) when nothing
        could be collected.

    Pitchers with fewer than two name parts are skipped. A pitcher is also
    skipped when any step inside the ``try`` raises ``ValueError``, for
    example when the id lookup is empty or fewer than ``sample_size``
    matching pitches exist.
    """
    print('Begin scraping \n')

    # Collected per-pitcher samples; concatenated once at the end
    # (DataFrame.append no longer exists in pandas 2.x).
    sampled_frames = []

    for i, pitcher in enumerate(pitcher_names):
        if len(pitcher) < 2:
            # No separate first/last name available. Previously this fell
            # through and reused the previous pitcher's name, or raised on
            # the first iteration.
            continue
        fname, lname = pitcher[0], " ".join(pitcher[1:])

        print(
            f'\n Pitcher Name: {fname} {lname}, #: {i+1}/{len(pitcher_names)}  \n'
        )
        player = playerid_lookup(lname, fname)

        try:
            # Pick the row with the largest key_mlbam among the matches.
            ID = player['key_mlbam'].iloc[player['key_mlbam'].argmax()]
            df = statcast_pitcher('2018-03-29', '2018-09-30', player_id=ID)
            df = df[df['description'].isin(target)].sample(sample_size,
                                                           random_state=2019)
            sampled_frames.append(df[features])
        except ValueError:
            # Best effort: move on to the next pitcher.
            continue

    print('Finsihed Scraping')
    if not sampled_frames:
        return pd.DataFrame(columns=features)
    return pd.concat(sampled_frames, ignore_index=True)
Esempio n. 9
0
def statcast_pitcher_spin(start_dt=None, end_dt=None, player_id=None):
    """Fetch pitcher Statcast data and attach four derived columns.

    A copy of the release-extension, velocity, acceleration and spin-rate
    columns is passed through ``find_intermediate_values``. The ``Mx``,
    ``Mz``, ``phi`` and ``theta`` columns of its result are then written
    back onto the fetched frame, which is returned.
    """
    input_cols = [
        'release_extension', 'vx0', 'vy0', 'vz0', 'ax',
        'ay', 'az', 'release_spin_rate',
    ]
    derived_cols = ['Mx', 'Mz', 'phi', 'theta']

    pitcher_data = statcast_pitcher(start_dt, end_dt, player_id)
    # Work on a copy so the helper cannot modify the fetched frame.
    intermediate = find_intermediate_values(pitcher_data[input_cols].copy())
    pitcher_data[derived_cols] = intermediate[derived_cols].copy()
    return pitcher_data
def player(first_name, last_name, start_date, end_date):
    """Return a pitcher's Statcast data for a date range.

    Args:
        first_name: First name passed to ``pybaseball.playerid_lookup``.
        last_name: Last name passed to ``pybaseball.playerid_lookup``.
        start_date: Passed as ``start_dt`` to ``statcast_pitcher``.
        end_date: Passed as ``end_dt`` to ``statcast_pitcher``.

    Returns:
        The fetched data with its index reset to 0..n-1.
    """
    player_info = pybaseball.playerid_lookup(last_name, first_name)
    # Row label 0 of the lookup result is used.
    player_id = player_info['key_mlbam'][0]

    # Pass the id itself; ``player_info[0]`` looked up a column named 0
    # on the lookup frame and raised KeyError.
    data = pybaseball.statcast_pitcher(start_dt=start_date,
                                       end_dt=end_date,
                                       player_id=player_id)
    return data.reset_index(drop=True)
def get_player_stats(id):
    """
    Fetch the pitches for pitcher ``id`` between 2019-03-28 and 2019-09-29,
    keep only the columns used for the analysis, and return the rows whose
    ``type`` column equals ``'X'``.
    """
    analysis_cols = [
        'pitch_type', 'release_speed', 'release_spin_rate',
        'if_fielding_alignment', 'launch_angle', 'launch_speed', 'hc_x',
        'hc_y', 'stand', 'type', 'events',
    ]
    season = statcast_pitcher('2019-03-28', '2019-09-29', id)
    trimmed = season[analysis_cols]
    in_play = trimmed['type'] == 'X'
    return trimmed[in_play]
Esempio n. 12
0
    def pitcher(self, name, team):
        """Classify a pitcher's pitches and report the share of each class.

        The pitcher is found in ``self.fgp`` by case-insensitive name and
        team, and the matching ``playerid`` is mapped to an MLBAM id through
        ``playerid_reverse_lookup(..., 'fangraphs')``. Pitches from
        2015-03-28 to 2019-09-29 are fetched, rows missing any feature
        column are dropped, and each pitch is labelled with
        ``model.predict(scaler.transform(...))``. The right-handed
        scaler/model pair is used when every ``p_throws`` value is ``'R'``;
        otherwise the left-handed pair is used.

        For right-handers label 12 is counted together with label 7; for
        everyone else label 4 is counted together with label 0.

        Returns:
            ``(pid, pitchdict, throws)`` where ``pitchdict`` maps each label
            in 0..12 whose share exceeds 1/20 to its percentage rounded to
            one decimal.
        """
        feature_cols = ['pfx_x', 'pfx_z', 'release_speed', 'release_spin_rate']

        roster = self.fgp
        name_hit = roster.Name.str.lower() == name.lower()
        team_hit = roster.Team.str.lower() == team.lower()
        fg_ids = roster[name_hit & team_hit].playerid
        pid = int(playerid_reverse_lookup(fg_ids, 'fangraphs').key_mlbam)
        pitch = statcast_pitcher(start_dt='2015-03-28',
                                 end_dt='2019-09-29',
                                 player_id=pid)

        # ``kept`` absorbs the pitches labelled ``absorbed``.
        if set(pitch.p_throws) == {'R'}:
            throws, scaler, kmeans = 'R', self.scalerR, self.modelR
            kept, absorbed = 7, 12
        else:
            throws, scaler, kmeans = 'L', self.scalerL, self.modelL
            kept, absorbed = 0, 4

        pitch.dropna(subset=feature_cols, inplace=True)
        pitch.reset_index(drop=True, inplace=True)
        pitch['p_type'] = kmeans.predict(scaler.transform(pitch[feature_cols]))

        total = len(pitch)
        pitchdict = {}
        for label in range(13):
            if label == absorbed:
                count = 0
            elif label == kept:
                merged = (pitch.p_type == kept) | (pitch.p_type == absorbed)
                count = len(pitch[merged])
            else:
                count = len(pitch[pitch.p_type == label])
            share = count / total
            if share > (1 / 20):
                pitchdict[label] = round((share * 100), 1)
        return pid, pitchdict, throws
Esempio n. 13
0
# Exploratory script: sample pybaseball calls, then stack the 2019 Statcast
# data of the pitchers listed in '2019pitchers.csv'.
from pybaseball import statcast
data = statcast(start_dt='2017-06-24', end_dt='2017-06-27')
data.head(2)

from pybaseball import pitching_stats
data = pitching_stats(2012, 2016)
data.head()

from pybaseball import playerid_lookup
from pybaseball import statcast_pitcher
import pandas as pd
csv = '2019pitchers.csv'
df = pd.read_csv(csv)
print(df)

import pandas as pd
# ``DataFrame.get_value`` was removed from pandas; ``.at`` is the
# label-based scalar accessor that replaces it.
frames = [statcast_pitcher('2019-03-27', '2019-11-01', df.at[0, 'MLBID'])]
# Rows 1..120 of the CSV. Each of these frames is reversed before stacking;
# the frame for row 0 is not.
for i in range(1, 121):
    data = statcast_pitcher('2019-03-27', '2019-11-01', df.at[i, 'MLBID'])
    data = data[::-1]
    frames.append(data)
# Concatenate once instead of re-copying the growing frame every iteration.
alldata = pd.concat(frames)
print(alldata)

import pandas as pd
kershaw = pd.DataFrame(alldata)
print(kershaw)

# Split the stacked data on at_bat_number at threshold ``a``.
a = 18
df1 = kershaw[kershaw['at_bat_number'] <= a]
df1_ = kershaw[kershaw['at_bat_number'] > a]
print(df1)
print(df1_)
Esempio n. 14
0
def SpinRate(names, start_date, end_date, dictionary):
    """Summarise spin rate and wOBA per pitcher from Statcast data.

    Args:
        names: List of pitcher names. Anything other than a list yields an
            empty result.
        start_date: Either one start date used for every pitcher, or a list
            of start dates paired element-wise with ``names``.
        end_date: Single end date, or a list paired with ``names`` when
            ``start_date`` is a list.
        dictionary: Mapping from pitcher name to MLB player id.

    Returns:
        A DataFrame with one row per pitcher and the columns 'Name',
        'Total wOBA', 'FF Spin', 'CU/KC Spin', 'FT/SI Spin', 'FT/SI Use'
        and 'FT/SI wOBA'. Missing values are replaced by 0.0.

    Note:
        Calling this function installs a process-wide filter that ignores
        all warnings.
    """
    # import warnings filter and ignore warnings
    from warnings import simplefilter
    simplefilter(action='ignore', category=Warning)

    # Load packages for analysis
    import pandas as pd
    import pybaseball as pb
    import numpy as np

    columns = [
        'Name', 'Total wOBA', 'FF Spin', 'CU/KC Spin', 'FT/SI Spin',
        'FT/SI Use', 'FT/SI wOBA'
    ]

    def summarise(name, player_ID, sdt, edt):
        """Fetch one pitcher's data and return its one-row summary frame."""
        df_data = pb.statcast_pitcher(start_dt=sdt,
                                      end_dt=edt,
                                      player_id=player_ID)
        total_pitches = len(df_data)

        FF_data = df_data[df_data.pitch_type == 'FF']
        CU_data = df_data[(df_data.pitch_type == 'KC') |
                          (df_data.pitch_type == 'CU')]
        FT_data = df_data[(df_data.pitch_type == 'FT') |
                          (df_data.pitch_type == 'SI')]

        # Guard against a pitcher with no pitches in the window.
        FT_use = len(FT_data) / total_pitches if total_pitches else 0.0

        row = [
            name,
            np.mean(df_data.woba_value),
            np.mean(FF_data.release_spin_rate),
            np.mean(CU_data.release_spin_rate),
            np.mean(FT_data.release_spin_rate),
            FT_use,
            np.mean(FT_data.woba_value),
        ]
        return pd.DataFrame([row], columns=columns)

    df_final = pd.DataFrame(columns=columns)
    if not isinstance(names, list):
        return df_final

    frames = []
    if isinstance(start_date, list):
        # One date window per pitcher.
        for name, sdt, edt in zip(names, start_date, end_date):
            frames.append(summarise(name, dictionary[name], sdt, edt))
    else:
        # Same date window for every pitcher. This used to be a second
        # ``if`` and therefore also ran, with list-valued dates, after the
        # per-pitcher branch above.
        for name in names:
            player_ID = dictionary[name]
            if name == 'Will Smith':
                # Hard-coded id override for this name.
                player_ID = 519293
            frames.append(summarise(name, player_ID, start_date, end_date))

    df_final = pd.concat([df_final] + frames, axis=0)
    return df_final.fillna(0.0)
Esempio n. 15
0
from pybaseball import statcast_pitcher
from pybaseball import playerid_lookup

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from keras.utils import to_categorical

from tensorflow import feature_column
from tensorflow.keras import layers
import tensorflow as tf

# Look up the player id (row label 0 of the lookup result) and fetch the
# pitcher's Statcast data for the chosen window.
pid = playerid_lookup('berrios', 'jose')["key_mlbam"][0]
print(pid)
# get all available data
data = statcast_pitcher('2017-03-01', '2019-10-10', player_id=pid)

data = data[[
    "pitch_type", "bat_score", "fld_score", "on_3b", "on_2b", "on_1b",
    "outs_when_up", "inning", "inning_topbot", "pitch_number", "p_throws",
    "balls", "strikes", "stand", "batter", "release_speed", "description"
]]

# Drop 'EP' and 'PO' rows. The explicit copy makes the assignments below
# operate on an independent frame rather than on a filtered slice.
data = data[(data.pitch_type != 'EP') & (data.pitch_type != 'PO')].copy()

# Turn the on-base columns into 0/1 flags: missing -> 0, positive -> 1.
# ``fillna`` replaces ``replace(np.nan, 0)``, which needed a numpy import
# this snippet does not have.
on_base_cols = ["on_3b", "on_2b", "on_1b"]
data[on_base_cols] = data[on_base_cols].fillna(0)
for base_col in on_base_cols:
    data.loc[data[base_col] > 0, base_col] = 1
Esempio n. 16
0
def _lookup_name_parts(name):
    """Split a stats-table name into the parts used for the player-id lookup.

    Item 0 of the result is used as the first name and item 1 as the last
    name. The hand-maintained corrections below cover names whose spelling
    differs between the stats table and the id lookup table.
    """
    splitname = name.split(' ')
    # Insert a space after the first '.' of the first token.
    splitname[0] = splitname[0].replace('.', '. ', 1)
    if splitname[0] == 'J.A.':
        splitname[0] = 'J. A.'
    if name == 'Zack Wheeler':
        splitname[0] = 'Zach'
    if name == 'Matthew Boyd':
        splitname[0] = 'Matt'
    if name == 'C.J. Wilson':
        splitname[0] = 'c. j.'
    if name == 'R.A. Dickey':
        splitname[0] = 'R. A.'
    if name == 'Jon Niese':
        splitname[0] = 'Jonathon'
    if name == 'A.J. Burnett':
        splitname[0] = 'A. J.'
    if name == 'Jorge De La Rosa':
        splitname[0] = 'Jorge'
        splitname[1] = 'De La Rosa'
    if name == 'Rubby de la Rosa':
        splitname[0] = 'Rubby'
        splitname[1] = 'de la Rosa'
    if name == 'Cole DeVries':
        splitname[1] = 'De Vries'
    if name == 'Samuel Deduno':
        splitname[0] = 'Sam'
    if name == 'JC Ramirez':
        splitname[0] = 'J. C.'
    if name == 'Nathan Karns':
        splitname[0] = 'Nate'
    if name == 'Daniel Ponce de Leon':
        splitname[1] = 'Ponce de Leon'
    if name == 'Chi Chi Gonzalez':
        splitname[0] = 'Chi Chi'
        splitname[1] = 'Gonzalez'
    if name == 'Josh A. Smith':
        splitname[0] = 'Josh'
        splitname[1] = 'Smith'
    if name == 'Joel De La Cruz':
        splitname[1] = 'De La Cruz'
    return splitname


def _nan_to_zero(value):
    """Return 0 for a NaN statistic, otherwise the value unchanged."""
    return 0 if math.isnan(value) else value


def get_data(year = 2018, minimum_starts = 5):
    """Build a per-pitcher table of pitch statistics for one season.

    Season pitching stats (pitchers with more than ``minimum_starts`` in the
    'GS' column) and each pitcher's Statcast data are cached as CSV files
    under ``<year>/`` and ``<year>/player/``; existing files are read back
    instead of being downloaded again. For every pitcher and every pitch
    type the std, mean, min and max of each tracked column are computed
    (NaN replaced by 0), and the resulting one-row-per-pitcher table is
    written to ``<year>.csv``.

    Args:
        year: Season to process.
        minimum_starts: Exclusive lower bound on 'GS'.

    Returns:
        None. The result is written to disk.
    """
    year_dir = str(year)
    player_dir = os.path.join(year_dir, 'player')
    stats_path = os.path.join(year_dir, "Players_Stats_"+str(year)+".csv")
    # Creates both the season directory and its 'player' subdirectory.
    os.makedirs(player_dir, exist_ok=True)

    if not os.path.exists(stats_path):
        player_stats = pitching_stats(year, year)
        player_stats = player_stats[player_stats['GS']>minimum_starts]
        player_stats.to_csv(stats_path)
    else:
        player_stats = pd.read_csv(stats_path)

    # ['SL' 'FF' 'CU' 'FT' 'CH' nan 'FC' 'KC' 'SI' 'PO' 'FS' 'EP' 'SC']
    pitch_types = ['SL','FF','CU','FT','CH','FC','KC','SI','PO','FS','EP','SC','KN']
    soi = ['release_speed','release_pos_x','release_pos_z','pfx_x','pfx_z','vx0','vy0','vz0','ax','ay','az','effective_speed','release_spin_rate']

    player_rows = []
    for name in player_stats['Name']:
        splitname = _lookup_name_parts(name)
        player_path = os.path.join(player_dir, name+'-'+str(year)+'.csv')

        if not os.path.exists(player_path):
            player_id = playerid_lookup(splitname[1], splitname[0])
            print(year)
            # Keep only lookup rows whose career spans the requested season.
            player_id = player_id[player_id['mlb_played_first'] <= year]
            player_id = player_id[player_id['mlb_played_last'] >= year]

            print(player_id)
            print(len(player_id))
            if len(player_id) != 1:
                # Ambiguous or missing match: report it, then carry on with
                # the first remaining row.
                print(player_id)
                print(name)
                print("Concerning")

            player = statcast_pitcher(str(year)+'-1-01', str(year)+'-12-31', player_id['key_mlbam'].iloc[0])
            player.to_csv(player_path)
        else:
            player = pd.read_csv(player_path)

        # Collect all statistics in a dict first; inserting hundreds of
        # columns one at a time into a DataFrame is slow.
        row = {'Name': name}
        for pitch in pitch_types:
            pitches = player[player['pitch_type'] == pitch][soi]
            for stat in soi:
                values = pitches[stat]
                row[pitch+"_"+stat + '_std'] = _nan_to_zero(np.std(values))
                row[pitch+"_"+stat + '_mean'] = _nan_to_zero(np.mean(values))
                row[pitch + "_" + stat + '_min'] = _nan_to_zero(np.min(values))
                row[pitch + "_" + stat + '_max'] = _nan_to_zero(np.max(values))
        player_rows.append(pd.DataFrame([row]))

    # With no pitchers at all, still write a (header-only) file instead of
    # failing on ``None``.
    out = pd.concat(player_rows) if player_rows else pd.DataFrame(columns=['Name'])
    out.to_csv(str(year)+".csv")
# Made by Noah Mitchem for MLB Pitchers
# Vertical pitch breaks seem off, don't know what other data can be used
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import plot
import numpy as np
from matplotlib import cm
from pybaseball import playerid_lookup
from pybaseball import statcast_pitcher
# Statcast pitcher data from 2019-03-25 to 2019-10-01 for the player at row
# label 0 of playerid_lookup("scherzer", "max"). Fetched at import time.
file = statcast_pitcher("2019-03-25", "2019-10-01",
                        playerid_lookup("scherzer", "max")["key_mlbam"][0])


def colorcode(speed):
    """Return an ``rgb(r,g,b)`` string for ``speed`` from the magma colormap.

    The colormap is indexed with ``int((speed - 50) * 4.3)`` and the first
    three channels are scaled to 0-255 and truncated to integers.
    """
    index = int((speed - 50) * 4.3)
    scaled = np.array(cm.magma(index)) * 255
    channels = [str(int(scaled[k])) for k in range(3)]
    return "rgb(" + ",".join(channels) + ")"


# Module-level state, initialised empty or zero here.
# NOTE(review): presumably filled in by plotting code later in the original
# script, which is not visible in this chunk - confirm.
data = []
data1 = []
pitchTrack = 0
breaks = 0
x = {}
extremes = []
# Number of distinct values in the pitch_type column.
differentPitches = file["pitch_type"].unique().size
# Number of rows (pitches) in the fetched data.
totalPitches = file.index.size
color = [
    "rgb(102, 204, 0)", "rgb(0, 214, 214)", "rgb(204, 0, 0)",
    "rgb(255, 153, 0)", "rgb(153, 0, 255)"
Esempio n. 18
0
def retrieve_data():
    """
    Fetch Statcast data for the selected pitcher and hitter, then refresh
    the matchup plot, the strike zone and both pitch-frequency charts.

    The run button reads 'Running...' while the function works and 'Run'
    once it has finished. When the pitcher never faced the hitter in the
    selected window only the warning label is updated.
    """
    run_button.label = 'Running...'
    reset_data()
    global p_dict, h_dict
    global data, data_cds
    global pitch_cds_p, pitches_p
    global pitcher_data, batter_data, sub_batter

    # update plot title
    pitcher_choice = pitcherselect.value
    hitter_choice = hitterselect.value
    pitchername = pitcher_choice.split(' -')[0]
    battername = hitter_choice.split(' -')[0]
    plot.title.text = f'{pitchername} vs. {battername}'
    pitcher_id = p_dict[pitcher_choice]
    hitter_id = h_dict[hitter_choice]

    # everything for the batter, then everything for the pitcher
    batter_data = pitch_info(
        statcast_batter(str(start_date.value), str(end_date.value),
                        hitter_id))
    pitcher_data = pitch_info(
        statcast_pitcher(str(start_date.value), str(end_date.value),
                         pitcher_id))

    # restrict each side to the selected opponent
    data = pitcher_data[pitcher_data['batter'] == hitter_id].copy()
    sub_batter = batter_data[batter_data['pitcher'] == pitcher_id].copy()

    if len(data) == 0:
        warning_label.text = 'No matchups in specified time frame'
        run_button.label = 'Run'
        return

    warning_label.text = ''

    # event name and "balls, strikes" count string for every pitch
    labelled = [
        (results(row['events'], row['description']),
         f"{row['balls']}, {row['strikes']}")
        for _, row in data.iterrows()
    ]
    data['result'] = [event for event, _ in labelled]
    data['count'] = [count_str for _, count_str in labelled]

    # update column data source
    data_cds.data = {
        'pitch': data['pitch_name'],
        'speed': data['release_speed'],
        'result': data['result'],
        'count': data['count'],
        'color': data['color'],
        'plate_x': data['plate_x'],
        'plate_z': data['plate_z']
    }

    # update strike zone: sum divided by row count for top and bottom
    new_top = data.sz_top.sum() / len(data.sz_top)
    new_bottom = data.sz_bot.sum() / len(data.sz_bot)
    half_width = 8.5 / 12
    strike_zone_cds.data = {
        'x': [-half_width, half_width],
        'x_side1': [-half_width, -half_width],
        'x_side2': [half_width, half_width],
        'top': [new_top, new_top],
        'bottom': [new_bottom, new_bottom],
        'side1': [new_top, new_bottom],
        'side2': [new_bottom, new_top]
    }

    # update pitch plots, pitcher first and batter second
    p_unique, p_matchup, p_overall = pitch_frequency(pitcher_data, data)
    pitches_p.x_range.factors = p_unique
    pitch_cds_p.data = {
        'pitches': p_unique,
        'matchup': p_matchup,
        'overall': p_overall
    }
    b_unique, b_matchup, b_overall = pitch_frequency(batter_data, sub_batter)
    pitches_b.x_range.factors = b_unique
    pitch_cds_b.data = {
        'pitches': b_unique,
        'matchup': b_matchup,
        'overall': b_overall
    }
    run_button.label = 'Run'
    def get_data(first_name, last_name):
        """Load (or download and cache) a pitcher's train and test data.

        Training data covers 2015-01-01 to 2017-12-31 and test data covers
        2018-01-01 to 2019-12-31. Both are cached as CSV files under
        ``Data/`` named after the player; when both files exist they are
        read instead of downloaded.

        Both sets are restricted to the pitch types in ``pitcher_pitches``,
        given a ``pitch_code`` via ``get_pitch_code``, passed through
        ``get_prev_pitch``, and have their on-base columns turned into 0/1
        flags.

        Args:
            first_name: Player's first name.
            last_name: Player's last name.

        Returns:
            ``(train_data_input, train_data_result, test_data)``: the
            training feature columns, the matching ``pitch_code`` column,
            and the test frame with feature columns plus ``pitch_code``.
            Rows containing nulls in those columns are dropped.
        """
        file_stem = 'Data/' + str(last_name) + "_" + str(first_name)
        train_filename = file_stem + "_train.csv"
        test_filename = file_stem + "_test.csv"

        if os.path.isfile(train_filename) and os.path.isfile(test_filename):
            # Already cached by an earlier run.
            train_data = pd.read_csv(train_filename)
            test_data = pd.read_csv(test_filename)
        else:
            # Look up the requested player; this used to be hard-coded to
            # one pitcher regardless of the arguments. The first lookup row
            # is used.
            lookup = playerid_lookup(str(last_name), str(first_name))
            player_id = int(lookup['key_mlbam'].iloc[0])
            os.makedirs('Data', exist_ok=True)

            # training is done on data from 2015 through 2017
            train_data = statcast_pitcher(start_dt='2015-01-01',
                                          end_dt='2017-12-31',
                                          player_id=player_id)
            train_data.to_csv(train_filename)
            # testing is done on data from 2018 through 2019
            test_data = statcast_pitcher(start_dt='2018-01-01',
                                         end_dt='2019-12-31',
                                         player_id=player_id)
            test_data.to_csv(test_filename)

        def encode(frame):
            """Keep known pitch types and add the ``pitch_code`` column."""
            frame = frame[frame['pitch_type'].isin(pitcher_pitches)]
            frame = frame.dropna(subset=['pitch_type'])
            frame['pitch_code'] = frame.apply(
                lambda row: get_pitch_code(row, pitcher_pitches), axis=1)
            return frame

        train_data = encode(train_data)
        test_data = encode(test_data)
        train_data = get_prev_pitch(train_data)
        test_data = get_prev_pitch(test_data)

        # Missing -> 0, anything else -> 1 for each on-base column.
        for frame in (train_data, test_data):
            for base in ('on_3b', 'on_2b', 'on_1b'):
                frame[base] = frame[base].fillna(
                    value=0).astype(bool).astype(int)

        feature_cols = [
            'prev_pitch_3', 'prev_pitch_2', 'prev_pitch_1', 'balls', 'strikes',
            'stand', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'pitch_number'
        ]
        # Drop nulls with the target column included so that inputs and
        # target stay aligned row for row.
        train_complete = train_data[feature_cols + ['pitch_code']].dropna()
        train_data_result = train_complete[['pitch_code']]
        train_data_input = train_complete[feature_cols]

        test_data = test_data[feature_cols + ['pitch_code']].dropna()

        return train_data_input, train_data_result, test_data