예제 #1
0
def get_bbref_pitch(url, year=2015):
    """
    Fetch a baseball-reference.com pitching game log as a pandas dataframe.

    usage: get_bbref_pitch(url)

    url  -- page containing a table with id "pitching_gamelogs"
    year -- season, forwarded to bbref_date_to_gdl_date to build "GDL_Date"

    Returns the game log with the trailing summary row, "Tm" rows and rows
    without a "Gcar" value removed. Columns that parse cleanly as numbers are
    converted; the others are left untouched. "IP" is converted with
    convert_bbref_ip, and "GDL_Date", "WHIP" and "ERIP" columns are added.

    Raises ValueError if the page has no "pitching_gamelogs" table.
    """
    from io import StringIO

    import Baseball
    import requests
    from bs4 import BeautifulSoup

    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    tbl = soup.find("table", id="pitching_gamelogs")
    if tbl is None:
        raise ValueError('No table with id "pitching_gamelogs" found at %s' % url)

    # read_html wants a file-like object; passing literal HTML is deprecated.
    bbref = pd.read_html(StringIO(str(tbl)))[0]
    # Drop the last row = summary row
    bbref = bbref.iloc[:-1]
    try:
        bbref = bbref[bbref["Gcar"] != "Tm"]
    except TypeError:
        # NOTE(review): presumably guards pandas versions that refuse to
        # compare a numeric column with a string; nothing to drop then.
        pass
    # .copy() so the column assignments below do not write into a slice.
    bbref = bbref.dropna(subset=["Gcar"], axis=0).copy()
    for param in bbref.columns:
        # Explicit replacement for the deprecated errors="ignore":
        # a column that does not parse is kept exactly as it was.
        try:
            bbref[param] = pd.to_numeric(bbref[param])
        except (ValueError, TypeError):
            pass

    bbref["GDL_Date"] = bbref["Date"].apply(lambda x: bbref_date_to_gdl_date(x, year))
    bbref["IP"] = bbref["IP"].apply(convert_bbref_ip)

    bbref["WHIP"] = Baseball.get_whip(bbref)
    bbref["ERIP"] = Baseball.get_erip(bbref)

    return bbref
예제 #2
0
파일: batting.py 프로젝트: iayork/Baseball
def get_ops_pfx(df):
    """
    Return OPS for a pitchab dataframe.

    OPS is the sum of slugging (Baseball.get_slg_pfx) and on-base
    percentage (Baseball.get_obp_pfx).
    """
    # Slugging is evaluated first, then on-base percentage.
    return Baseball.get_slg_pfx(df) + Baseball.get_obp_pfx(df)
예제 #3
0
파일: batting.py 프로젝트: iayork/Baseball
def get_slg_pfx(df):
    """
    Return slugging for a pitchab dataframe.

    SLG = total bases (Baseball.get_tb) divided by the at-bat count
    (Baseball.get_atbats_count_pfx).
    """
    total_bases = Baseball.get_tb(df)
    at_bats = Baseball.get_atbats_count_pfx(df)
    return total_bases / at_bats
예제 #4
0
파일: batting.py 프로젝트: iayork/Baseball
def get_obp_pfx(df):
    """
    Return on-base percentage for a pitchab dataframe.

    OBP = (Hits + Walks + Hit by Pitch) /
          (At Bats + Walks + Hit by Pitch + Sacrifice Flies)
    The denominator comes from Baseball.get_pa_for_obp.
    """
    ob_events = ['Single', 'Double', 'Triple', 'Home Run', 'Walk',
                 'Intent Walk', 'Hit By Pitch']
    reached = df[df['event'].isin(ob_events)]
    # One row per (gameday_link, num) pair, so each plate appearance counts once.
    times_on_base = len(reached.groupby(['gameday_link', 'num']).first())
    return times_on_base / Baseball.get_pa_for_obp(df)
예제 #5
0
파일: batting.py 프로젝트: iayork/Baseball
def hits_per_pitch_subzone(df):
    """
    Compute hits per pitch for each 0.6 x 0.6 cell of a 5 x 5 grid.

    Input: dataframe of pitches with 'px' and 'pz' columns
    Output: dataframe with
        rows = vertical position (lower 'pz' edge of the cell),
        columns = horizontal position (lower 'px' edge of the cell),
        values = hits per pitch in that cell (0 when the cell is empty)
    """
    px_starts = np.arange(-1.5, 1.5, 0.6)
    pz_starts = np.arange(1, 4, 0.6)

    def _hit_rate(px0, pz0):
        # Pitches whose location falls inside the half-open cell.
        cell = df[(df['px'] >= px0) &
                  (df['px'] < px0 + 0.6) &
                  (df['pz'] >= pz0) &
                  (df['pz'] < pz0 + 0.6)]
        counts = Baseball.hits_tb_per_pitch(cell)
        n_hits = counts['Hits']
        n_pitches = counts['Pitches']
        try:
            return n_hits / n_pitches
        except ZeroDivisionError:
            return 0

    rates = {px0: [_hit_rate(px0, pz0) for pz0 in pz_starts]
             for px0 in px_starts}
    return pd.DataFrame(rates, index=pz_starts)
예제 #6
0
파일: batting.py 프로젝트: iayork/Baseball
def get_pa_count_pfx(df):
    """
    Return the number of plate appearances in a pitchab dataframe.

    PA = AB + BB + HBP + SH + SF + Times Reached on Defensive Interference
    Computed as the at-bat count plus the number of distinct
    (gameday_link, num) pairs whose event is one of the extra PA events.
    """
    extra_events = ['Walk', 'Sac Fly', 'Hit By Pitch', 'Intent Walk',
                    'Sac Bunt', 'Catcher Interference', 'Fan interference',
                    'Batter Interference', 'Sac Fly DP', 'Sacrifice Bunt DP']
    at_bats = Baseball.get_atbats_count_pfx(df)
    extra_rows = df[df['event'].isin(extra_events)]
    extras = len(extra_rows.groupby(['gameday_link', 'num']).first())
    return at_bats + extras
예제 #7
0
파일: batting.py 프로젝트: iayork/Baseball
def get_slg_pfx(df):
    """
    Return slugging for a pitchab dataframe.

    Takes the at-bat rows from Baseball.get_atbats_df_pfx, keeps the first
    'event' of each (gameday_link, num) pair, and divides the total bases
    of those events by the number of at-bats.
    """
    bases_per_event = {'Single': 1, 'Double': 2, 'Triple': 3, 'Home Run': 4}

    atbat_rows = Baseball.get_atbats_df_pfx(df)
    per_atbat = atbat_rows.groupby(['gameday_link', 'num']).first()
    outcomes = list(per_atbat['event'].values)

    total_bases = sum(outcomes.count(name) * bases
                      for name, bases in bases_per_event.items())
    return total_bases / len(outcomes)
예제 #8
0
파일: batting.py 프로젝트: iayork/Baseball
def get_pa_for_obp(df):
    """
    Return the denominator used for on-base percentage.

    Intended as At Bats + Walks + Hit by Pitch + Sacrifice Flies: the at-bat
    count plus the number of distinct (gameday_link, num) pairs whose event
    is in the list below.
    """
    # NOTE(review): the list also holds interference events and
    # 'Sacrifice Bunt DP' -- confirm these belong in an OBP denominator.
    extra_events = ['Walk', 'Sac Fly', 'Hit By Pitch', 'Intent Walk',
                    'Catcher Interference', 'Fan interference',
                    'Batter Interference', 'Sac Fly DP', 'Sacrifice Bunt DP']
    at_bats = Baseball.get_atbats_count_pfx(df)
    extra_rows = df[df['event'].isin(extra_events)]
    extras = len(extra_rows.groupby(['gameday_link', 'num']).first())
    return at_bats + extras
예제 #9
0
파일: batting.py 프로젝트: iayork/Baseball
def get_obp_pfx(df):
    """
    Calculate on-base percentage from a pitchab dataframe
    OBP = (Hits + Walks + Hit by Pitch) / (At Bats + Walks + Hit by Pitch + Sacrifice Flies)

    The numerator counts distinct (gameday_link, num) pairs whose event is a
    hit, walk or hit-by-pitch. The denominator comes from
    Baseball.get_pa_for_obp rather than the full plate-appearance count,
    because plate appearances also include sacrifice bunts, which the OBP
    formula above leaves out.
    """
    ob_events = ['Single', 'Double', 'Triple', 'Home Run', 'Walk',
                 'Intent Walk', 'Hit By Pitch']
    reached = df[df['event'].isin(ob_events)]
    on_base = len(reached.groupby(['gameday_link', 'num']).first())
    obp_pa = Baseball.get_pa_for_obp(df)
    return on_base / obp_pa
예제 #10
0
파일: batting.py 프로젝트: iayork/Baseball
def get_pa_count_pfx(df):
    """
    Count official plate appearances in a pitchab dataframe.

    PA = AB + BB + HBP + SH + SF + Times Reached on Defensive Interference
    The at-bats are counted first; every additional (gameday_link, num)
    pair whose event counts toward a plate appearance is then added.
    """
    pa_extras = ['Walk', 'Sac Fly', 'Hit By Pitch', 'Intent Walk', 'Sac Bunt',
                 'Catcher Interference', 'Fan interference',
                 'Batter Interference', 'Sac Fly DP', 'Sacrifice Bunt DP']
    n_at_bats = Baseball.get_atbats_count_pfx(df)
    is_extra = df['event'].isin(pa_extras)
    n_extras = len(df[is_extra].groupby(['gameday_link', 'num']).first())
    return n_at_bats + n_extras
예제 #11
0
파일: batting.py 프로젝트: iayork/Baseball
def get_box_info(df):
    """
    Summarise a pitchFX dataframe per sub-region of the zone.

    The zone limits and step sizes come from Baseball.official_zone_25_boxes.
    For every box the pitches with 'px'/'pz' inside it are selected and
    get_slg_in_box supplies (total bases, hits) for them.

    Returns a dict box_infoD[(ctr_x, ctr_y)] containing
             'pitch_pct': percent of all pitches that are in the box
             'slgs': total bases per hit in the box (0 when there are no hits)
             'hits': total hits in the box
             'hits_per_pitch': hits in the box / pitches in the box
    An empty box gets 0 for all four values.

    Raises AssertionError ('Empty dataframe') when df has no rows.
    """
    (top, bottom, left, right, x_step, y_step) = Baseball.official_zone_25_boxes()

    assert len(df) > 0, 'Empty dataframe'

    # NOTE(review): linspace(..., 6) yields 6 starting edges per axis, i.e.
    # 36 boxes, the last row/column starting on the far edge -- confirm
    # this is intended for a "25 boxes" zone.
    x_edges = np.linspace(left, right, 6)
    y_edges = np.linspace(bottom, top, 6)

    box_infoD = {}
    for x in x_edges:
        for y in y_edges:
            ctr_x, ctr_y = get_center_point(x, y, x_step, y_step)
            key = (ctr_x, ctr_y)

            box = df[(df['px'] >= x) & (df['px'] < x + x_step) &
                     (df['pz'] >= y) & (df['pz'] < y + y_step)]
            n_in_box = len(box)

            if n_in_box == 0:
                box_infoD[key] = {'slgs': 0,
                                  'hits': 0,
                                  'hits_per_pitch': 0,
                                  'pitch_pct': 0}
                continue

            (tb_in_box, hits_in_box) = get_slg_in_box(box)
            no_hits = hits_in_box == 0
            box_infoD[key] = {
                'hits_per_pitch': hits_in_box / n_in_box,
                'pitch_pct': n_in_box / len(df) * 100,
                'slgs': 0 if no_hits else tb_in_box / hits_in_box,
                'hits': 0 if no_hits else hits_in_box,
            }

    return box_infoD
예제 #12
0
파일: batting.py 프로젝트: iayork/Baseball
def hits_tb_per_pitch(df):
    """
    Summarise hits and total bases for a pitchab dataframe.

    Returns a dict with
        'Pitches'      -- number of rows in df
        'Hits'         -- number of entries returned by get_hits(df)
        'TB'           -- total bases from Baseball.get_tb(df)
        'TB_per_pitch' -- TB divided by Pitches (0 for an empty df)
    """
    total_bases = Baseball.get_tb(df)
    hit_rows = get_hits(df)
    n_pitches = len(df)
    # Guard the division so an empty dataframe reports 0 instead of failing.
    tb_rate = total_bases / n_pitches if n_pitches != 0 else 0
    return {'Pitches': n_pitches,
            'Hits': len(hit_rows),
            'TB': total_bases,
            'TB_per_pitch': tb_rate}
예제 #13
0
파일: batting.py 프로젝트: iayork/Baseball
def zone_as_polygon(year):
    """
    Convert a strike zone (as a series of points) into matplotlib paths.

    Allows use of path.contains_points to test if pitches are inside a
    strike zone.

    year -- season passed to Baseball.get_50pct_zone (previously ignored in
            favour of a hard-coded 2016)

    Returns {'R': path, 'L': path}, built from the 'xRs'/'yRs' and
    'xLs'/'yLs' point lists of the zone dict respectively.
    """
    import matplotlib.patches as patches

    zone_dict = Baseball.get_50pct_zone(year)

    paths = {}
    for side, x_key, y_key in (('R', 'xRs', 'yRs'), ('L', 'xLs', 'yLs')):
        # The last point of each list is dropped.
        # NOTE(review): presumably it repeats the first point -- confirm.
        xy = np.array(list(zip(zone_dict[x_key][:-1], zone_dict[y_key][:-1])))
        polygon = patches.Polygon(xy, closed=True,
                                  facecolor='grey', alpha=0.1)
        paths[side] = polygon.get_path()
    return paths
예제 #14
0
def get_bbref_bat(url, year=2015):
    """
    Fetch a baseball-reference.com batting game log as a pandas dataframe.

    usage: get_bbref_bat(url)

    url  -- page containing a table with id "batting_gamelogs"
    year -- season, forwarded to Baseball.bbref_date_to_gdl_date to build
            the "GDL_Date" column

    Returns the game log without the trailing summary row, without rows
    whose "Gcar" contains "Tm" or "Gcar", and without rows lacking a "Gcar"
    value. Columns that parse cleanly as numbers are converted; the others
    are left untouched.

    Raises ValueError if the page has no "batting_gamelogs" table.
    """
    from io import StringIO

    from bs4 import BeautifulSoup
    import requests

    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    tbl = soup.find("table", id="batting_gamelogs")
    if tbl is None:
        raise ValueError('No table with id "batting_gamelogs" found at %s' % url)

    # read_html wants a file-like object; passing literal HTML is deprecated.
    bbref = pd.read_html(StringIO(str(tbl)))[0]
    # Drop the last row = summary row
    bbref = bbref.iloc[:-1]

    # Compare on a string view of the column so the filter works whether
    # "Gcar" was parsed as text, numbers, or text mixed with missing values
    # (the filter used to be skipped or to crash in those cases).
    gcar = bbref["Gcar"].astype(str)
    is_header = (gcar.str.contains("Tm", regex=False) |
                 gcar.str.contains("Gcar", regex=False))
    # .copy() so the column assignments below do not write into a slice.
    bbref = bbref[~is_header].dropna(subset=["Gcar"], axis=0).copy()

    for param in bbref.columns:
        # Explicit replacement for the deprecated errors="ignore":
        # a column that does not parse is kept exactly as it was.
        try:
            bbref[param] = pd.to_numeric(bbref[param])
        except (ValueError, TypeError):
            pass

    bbref["GDL_Date"] = bbref["Date"].apply(lambda x: Baseball.bbref_date_to_gdl_date(x, year))
    return bbref
예제 #15
0
파일: batting.py 프로젝트: iayork/Baseball
def tb_per_pitch_subzone(df):
    """
    Compute total bases per pitch for each 0.6 x 0.6 cell of a 5 x 5 grid.

    Input: dataframe of pitches with 'px' and 'pz' columns
    Output: dataframe with
        rows = vertical position (lower 'pz' edge of the cell),
        columns = horizontal position (lower 'px' edge of the cell),
        values = TB per pitch in that cell
    """
    pz_starts = np.arange(1, 4, 0.6)

    tb_subzoneD = {}
    for px0 in np.arange(-1.5, 1.5, 0.6):
        column = []
        for pz0 in pz_starts:
            # Pitches whose location falls inside the half-open cell.
            in_cell = ((df['px'] >= px0) & (df['px'] < px0 + 0.6) &
                       (df['pz'] >= pz0) & (df['pz'] < pz0 + 0.6))
            summary = Baseball.hits_tb_per_pitch(df[in_cell])
            column.append(summary['TB_per_pitch'])
        tb_subzoneD[px0] = column
    return pd.DataFrame(tb_subzoneD, index=pz_starts)
예제 #16
0
def get_con(year, dbFolder="/Users/iayork/Documents/Baseball/PitchFX", db=False):
    """
    Open an SQLAlchemy connection to a PitchFX sqlite database.

    year     -- season; used to build the default file name 'pitchfx<year>.db'
    dbFolder -- folder that holds the database file
    db       -- database file name; when falsy the per-year default is used

    Prints the resolved database path and returns the open connection.
    """
    if not db:
        db = 'pitchfx%s.db' % year
    db_path = os.path.join(dbFolder, db)
    print(db_path)

    engine = create_engine('sqlite:///%s' % db_path)
    # Hand the connection back to the caller; it used to be dropped.
    return engine.connect()
    
def get_pitchab(con, reg=True):
    """
    Get everything from pitch and atbat, merge on gameday_link + num
    usage: get_pitchab(con, reg=True)
    set "reg=False" to get spring training, all-star, post-season games

    con -- database connection handed to pd.read_sql_table (which needs an
           SQLAlchemy connectable, not a raw sqlite3 connection)

    Returns the merged dataframe. Columns that exist in both tables are kept
    once (the atbat copy is dropped) and 'break_angle', 'break_length' and
    'break_y' are converted to numeric.
    """
    # Read through the connection that was passed in, not a global.
    atbatdf = pd.read_sql_table('atbat', con)
    pitchdf = pd.read_sql_table('pitch', con)
    # Tag the atbat copy of shared columns so it can be dropped below.
    pitchab = pitchdf.merge(atbatdf, on=['gameday_link', 'num'],
                            suffixes=('', '_duplicate_delete'))

    if reg:
        gamedf = pd.read_sql_table('game', con)
        regdf = gamedf[gamedf['game_type'] == 'R']
        # The game table stores the link without the 'gid_' prefix.
        reg_gdls = ['gid_%s' % x for x in regdf['gameday_link'].values]
        pitchab = pitchab[pitchab['gameday_link'].isin(reg_gdls)]

    drop_cols = [x for x in pitchab.columns if '_duplicate_delete' in x]
    # drop() returns a new frame, so the conversions below never write
    # into a filtered slice.
    pitchab = pitchab.drop(drop_cols, axis=1)
    for param in ('break_angle', 'break_length', 'break_y'):
        pitchab[param] = pd.to_numeric(pitchab[param])
    return pitchab
    
    

"""
def get_con(year, dbFolder="/Users/iayork/Documents/Baseball/PitchFX", db=False):
    # dbFolder default="/Users/iayork/Documents/Baseball/PitchFX"
    if not db:
        db = 'pitchfx%s.db' % year
    
    print(os.path.join(dbFolder, db))
    return  sql.connect(os.path.join(dbFolder, db)) """
    
    
def get_pitchab_for_pitcher(pitcher_name, con, reg=True):
    """
    Get everything from pitch and atbat for a specific pitcher,
    merge on gameday_link + num
    usage: get_pitchab_for_pitcher(pitcher_name, con, reg=True)
    set "reg=False" to get spring training, all-star, post-season games

    pitcher_name -- value matched against atbat.pitcher_name
    con          -- database connection handed to pd.read_sql

    Rows without a 'px' value are dropped, and 'break_angle',
    'break_length' and 'break_y' are converted to numeric.

    The pitcher name is bound as a query parameter instead of being pasted
    into the SQL text, so names containing quotes work and cannot alter the
    statement. The '?' placeholder is the sqlite style used by this project.
    """
    atbat_sql = "select * from atbat where pitcher_name = ?"
    pitch_sql = ("select * from pitch where gameday_link in "
                 "(select gameday_link from atbat where pitcher_name = ?)")

    atbat = pd.read_sql(atbat_sql, con, params=(pitcher_name,))
    pitch = pd.read_sql(pitch_sql, con, params=(pitcher_name,))

    # The pitch query returns every pitch of the pitcher's games; the inner
    # merge keeps only the plate appearances found in his atbat rows.
    pitchab = pitch.merge(atbat, on=['gameday_link', 'num'])
    pitchab = pitchab.dropna(subset=['px'])

    if reg:
        game_sql = "select gameday_link from game where game_type = 'R'"
        reg_gdls_df = pd.read_sql(game_sql, con)
        # The game table stores the link without the 'gid_' prefix.
        reg_gdls = ['gid_%s' % x for x in reg_gdls_df['gameday_link'].values]
        pitchab = pitchab[pitchab['gameday_link'].isin(reg_gdls)]

    # Work on an independent frame so the conversions do not write into a
    # filtered slice.
    pitchab = pitchab.copy()
    for param in ('break_angle', 'break_length', 'break_y'):
        pitchab[param] = pd.to_numeric(pitchab[param])
    return pitchab

# -------- Convert between formats ----------

def bbref_date_to_gdl_date(bbref_date, year=2016):
    """
    take date in format "Apr 8", "Jul 7(1)" or "May 6 (1)" and convert to
    "04-08-16" format
    usage: bbref_date_to_gdl_date(bbref_date, year)
    year default = 2016

    A four-character year is shortened to its last two digits; any other
    value is used as given. A trailing "(n)" game marker is ignored.
    Raises KeyError for a month abbreviation outside Mar..Nov.
    """
    dateD = {'Mar': 3,
             'Apr': 4,
             'May': 5,
             'Jun': 6,
             'Jul': 7,
             'Aug': 8,
             'Sep': 9,
             'Oct': 10,
             'Nov': 11}
    if len(str(year)) == 4:
        year = str(year)[-2:]

    # Both date styles are parsed the same way: the month is the first
    # token, the day is the second token up to an optional "(".
    tokens = bbref_date.split()
    month = dateD[tokens[0]]
    day = int(tokens[1].split('(')[0])
    return '%02d-%02d-%s' % (month, day, year)
    
def bbref_dates_to_gdls(df, year=2016):
    """ Convert baseball-reference.com date and team information
        to a series of gameday_link
        gid_2015_05_02_nyamlb_bosmlb_1
        Return a list of gameday_link

        df   -- dataframe with 'Date', 'Tm', 'Unnamed: 4' and 'Opp' columns;
                'Unnamed: 4' holds '@' when 'Tm' is the visiting team
        year -- season written into every gameday_link

        Dates may carry a game number, written either "Jul 7 (1)" or
        "Jul 7(1)"; without one the game number is '1'.
        Raises KeyError for an unknown month or team abbreviation.
    """

    gdls = []
    monthD = {'Mar':'03','Apr':'04', 'May':'05', 'Jun':'06',
              'Jul':'07', 'Aug':'08', 'Sep':'09', 'Oct':'10',
              'Nov':'11'}

    teamD = {'ATL':'atl','BAL':'bal', 'BOS':'bos', 'CHW':'cha',
             'DET':'det','HOU':'hou', 'KCR':'kca', 'LAA':'ana',
             'MIN':'min','NYY':'nya', 'OAK':'oak', 'SEA':'sea',
             'TBR':'tba', 'TEX':'tex', 'TOR':'tor','MIA':'mia',
             'NYM':'nyn','PHI':'phi', 'CLE':'cle','WSN':'was',
             'CHC':'chn','PIT':'pit','STL':'sln','MIL':'mil',
             'CIN':'cin','ARI':'ari','COL':'col','LAD':'lan',
             'SDP':'sdn', 'SFG':'sfn', 'FLA':'flo'}

    for (date, tm, at_, opp) in df[['Date','Tm','Unnamed: 4','Opp']].values:
        # Split off an optional "(n)" game marker first, so the day is
        # parsed correctly even when the marker is glued to it ("7(1)").
        date_part, _, game_part = date.partition('(')
        month_name, day_text = date_part.split()[:2]
        month = monthD[month_name]
        day = day_text.zfill(2)
        game_no = game_part.replace(')', '').strip() if game_part else '1'

        # gameday_link lists the visiting team first, then the home team.
        if at_ == '@':
            away, home = tm, opp
        else:
            away, home = opp, tm
        gdls.append('gid_%s_%s_%s_%smlb_%smlb_%s' % (year, month, day,
                                                      teamD[away],
                                                      teamD[home], game_no))
    return gdls
    

def gdl_to_date(gdl):
    """
    Turn a gameday_link into a date string such as 05-01-16.

    The link is split on '_'; fields 1, 2 and 3 are read as year, month
    and day, and the year is cut to its last two characters.
    """
    fields = gdl.split('_')
    year, month, day = fields[1], fields[2], fields[3]
    return '-'.join((month, day, year[-2:]))
    
def gdl_to_datetime(x):
    """
    Turn a gameday_link into a pandas datetime.

    The link is first reduced to an mm-dd-yy string by gdl_to_date and
    then parsed with that exact format.
    """
    date_text = gdl_to_date(x)
    return pd.to_datetime(date_text, format='%m-%d-%y')
                             
    
def convert_bbref_ip(x):
    """
    Convert one innings-pitched value from the ".1", ".2" notation
    to fractional innings (".33", ".67").

    The whole innings are kept and the decimal remainder is divided by 0.3,
    so 6.1 becomes roughly 6.33 and 6.2 roughly 6.67.
    """
    whole_innings = int(x)
    remainder = x - whole_innings
    return whole_innings + remainder / 0.3
    
# ----------------- Misc --------------------    
def get_bbref_pitch(url, year=2016):
    """
    returns a pandas dataframe containing bbref info (not all numeric?)
    usage get_bbref_pitch(url)

    url  -- page containing a table with id 'pitching_gamelogs'
    year -- season, forwarded to bbref_date_to_gdl_date to build 'GDL_Date'

    The trailing summary row, 'Tm' rows and rows without a 'Gcar' value are
    removed. Columns that parse cleanly as numbers are converted; the others
    are left untouched. 'IP' is converted with convert_bbref_ip, and
    'GDL_Date', 'WHIP' and 'ERIP' columns are added.

    Raises ValueError if the page has no 'pitching_gamelogs' table.
    """
    from io import StringIO

    import Baseball
    import requests
    from bs4 import BeautifulSoup

    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    tbl = soup.find('table', id='pitching_gamelogs')
    if tbl is None:
        raise ValueError("No table with id 'pitching_gamelogs' found at %s" % url)

    # read_html wants a file-like object; passing literal HTML is deprecated.
    bbref = pd.read_html(StringIO(str(tbl)))[0]
    # Drop the last row = summary row
    bbref = bbref.iloc[:-1]
    try:
        bbref = bbref[bbref['Gcar'] != 'Tm']
    except TypeError:
        # NOTE(review): presumably guards pandas versions that refuse to
        # compare a numeric column with a string; nothing to drop then.
        pass
    # .copy() so the column assignments below do not write into a slice.
    bbref = bbref.dropna(subset=['Gcar',], axis=0).copy()
    for param in bbref.columns:
        # Explicit replacement for the deprecated errors='ignore':
        # a column that does not parse is kept exactly as it was.
        try:
            bbref[param] = pd.to_numeric(bbref[param])
        except (ValueError, TypeError):
            pass

    bbref['GDL_Date'] = bbref['Date'].apply(lambda x: bbref_date_to_gdl_date(x, year))
    bbref['IP'] = bbref['IP'].apply(convert_bbref_ip)

    bbref['WHIP'] = Baseball.get_whip(bbref)
    bbref['ERIP'] = Baseball.get_erip(bbref)

    return bbref

    
def get_bbref_bat(url, year=2016):
    """
    returns a pandas dataframe containing bbref info (not all numeric?)
    usage get_bbref_bat(url)

    url  -- page containing a table with id 'batting_gamelogs'
    year -- season, forwarded to Baseball.bbref_date_to_gdl_date to build
            the 'GDL_Date' column

    The trailing summary row, rows whose 'Gcar' contains 'Tm' or 'Gcar', and
    rows without a 'Gcar' value are removed. Columns that parse cleanly as
    numbers are converted; the others are left untouched.

    Raises ValueError if the page has no 'batting_gamelogs' table.
    """
    from io import StringIO

    from bs4 import BeautifulSoup
    import requests

    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    tbl = soup.find('table', id='batting_gamelogs')
    if tbl is None:
        raise ValueError("No table with id 'batting_gamelogs' found at %s" % url)

    # read_html wants a file-like object; passing literal HTML is deprecated.
    bbref = pd.read_html(StringIO(str(tbl)))[0]
    # Drop the last row = summary row
    bbref = bbref.iloc[:-1]

    # Compare on a string view of the column so the filter works whether
    # 'Gcar' was parsed as text, numbers, or text mixed with missing values
    # (the filter used to be skipped or to crash in those cases).
    gcar = bbref['Gcar'].astype(str)
    is_header = (gcar.str.contains('Tm', regex=False) |
                 gcar.str.contains('Gcar', regex=False))
    # .copy() so the column assignments below do not write into a slice.
    bbref = bbref[~is_header].dropna(subset=['Gcar',], axis=0).copy()

    for param in bbref.columns:
        # Explicit replacement for the deprecated errors='ignore':
        # a column that does not parse is kept exactly as it was.
        try:
            bbref[param] = pd.to_numeric(bbref[param])
        except (ValueError, TypeError):
            pass

    bbref['GDL_Date'] = bbref['Date'].apply(lambda x: Baseball.bbref_date_to_gdl_date(x, year))
    return bbref
예제 #17
0
파일: batting.py 프로젝트: iayork/Baseball
def get_atbats_count_pfx(df):
    """
    Count the official at-bats in a pitchab dataframe.

    The at-bat rows come from Baseball.get_atbats_df_pfx; each distinct
    (gameday_link, num) pair among them counts as one at-bat.
    """
    atbat_rows = Baseball.get_atbats_df_pfx(df)
    one_row_per_atbat = atbat_rows.groupby(['gameday_link', 'num']).first()
    return len(one_row_per_atbat)
예제 #18
0
파일: batting.py 프로젝트: iayork/Baseball
def get_slg_per_atbat_pfx(df):
    """
    Calculate total bases per at-bat for a pitchab dataframe.

    Total bases come from Baseball.get_tb(df). The at-bat count is the
    number of distinct (gameday_link, num) pairs among the rows returned by
    Baseball.get_atbats_df_pfx(df).
    """
    atbat_rows = Baseball.get_atbats_df_pfx(df)
    # One entry per at-bat; the previous code divided by an undefined name.
    n_atbats = len(atbat_rows.groupby(['gameday_link', 'num']).first())
    tb = Baseball.get_tb(df)
    return tb / n_atbats
예제 #19
0
파일: batting.py 프로젝트: iayork/Baseball
def get_ops_pfx(df):
    """
    Return OPS (slugging plus on-base percentage) for a pitchab dataframe,
    using Baseball.get_slg_pfx and Baseball.get_obp_pfx.
    """
    slugging = Baseball.get_slg_pfx(df)
    on_base_pct = Baseball.get_obp_pfx(df)
    total = slugging + on_base_pct
    return total
예제 #20
0
파일: batting.py 프로젝트: iayork/Baseball
def get_atbats_count_pfx(df):
    """
    Return the number of at-bats in a pitchab dataframe: the count of
    distinct (gameday_link, num) pairs among the rows returned by
    Baseball.get_atbats_df_pfx.
    """
    grouped = Baseball.get_atbats_df_pfx(df).groupby(['gameday_link', 'num'])
    first_rows = grouped.first()
    return len(first_rows)