Ejemplo n.º 1
0
    def add_lineups(self, status='auto'):
        """
        Load normalized lineup files and store them on ``self.lineups``.

        Files under the configured ``game_lineup`` path are read when their
        name (minus ``.parquet``) falls inside
        ``[self.min_date_gte, self.max_date_lte]`` (string comparison).

        The resulting frame holds one row per (gameId, playerId):
          * rows whose ``positionStatus`` is 'actual' are always kept;
          * rows whose ``positionStatus`` is 'expected' are kept only for
            games that have no 'actual' rows at all.
        The ``teamDisposition`` column is renamed to ``batterDisposition``.

        Args:
            status: 'auto' - expected/actual. Currently not read by this
                method; kept so existing callers keep working.

        Raises:
            ValueError: from ``pd.concat`` when no file falls inside the
                date window (nothing to concatenate).
        """
        helper.progress("Adding Lineups Attribute")

        lineups_path = CONFIG.get(self.league)\
                             .get('paths')\
                             .get('normalized')\
                             .format(f='game_lineup')

        # os.listdir returns names in arbitrary order. Sort them so that the
        # "keep first" de-duplication below is reproducible between runs.
        fnames = sorted(
            fname for fname in os.listdir(lineups_path)
            if self.min_date_gte
            <= fname.replace(".parquet", "")
            <= self.max_date_lte
        )
        df_lineup = pd.concat(
            objs=[pd.read_parquet(lineups_path + fname) for fname in fnames],
            axis=0
        )

        # Actual lineups (completed games)
        is_actual = df_lineup['positionStatus'] == 'actual'
        actual = df_lineup.loc[is_actual, :]\
            .drop_duplicates(subset=['gameId', 'playerId'])
        actual_ids = actual['gameId'].unique()

        # Expected lineups, only for games without any actual rows
        is_expected_only = (
            (df_lineup['positionStatus'] == 'expected')
            &
            ~(df_lineup['gameId'].isin(actual_ids))
        )
        expected = df_lineup.loc[is_expected_only, :]\
            .drop_duplicates(subset=['gameId', 'playerId'])

        # Stack both and expose under the batter-centric column name
        lineups = pd.concat(objs=[actual, expected], axis=0)
        self.lineups = lineups.rename(
            columns={'teamDisposition': 'batterDisposition'}
        )
Ejemplo n.º 2
0
    def add_starting_pitchers(self, dispositions=['home', 'away']):
        """
        Merge ``homeStartingPitcherId`` / ``awayStartingPitcherId`` into
        ``self.summary`` (left join on ``gameId``).

        Two sources are combined:
          * Games dated before the cutoff: the starter is taken from the
            at-bat data as the pitcher of the first row found with
            inning 1 and outCount 0 (TOP half -> home pitcher,
            BOTTOM half -> away pitcher).
          * Games dated on/after the cutoff: the starter is taken from
            ``self.lineups`` (rows with ``playerPositionGeneral == 'P'``).
            ``self.add_lineups()`` is called first if the attribute is
            missing.

        The cutoff is ``self.upcoming_start_gte`` when truthy, otherwise the
        current local time. ``gameStartTime`` is truncated to its first 10
        characters (the date part) before comparison.

        Args:
            dispositions: not read by this method; kept for interface
                compatibility with callers that pass it.

        Raises:
            ValueError: from ``pd.concat`` when no at-bat file is found in
                the date window.
            pandas.errors.MergeError: when a ``validate='1:1'`` merge sees
                duplicate ``gameId`` keys.
        """
        helper.progress("Adding Starting Pitchers Attribute")

        # Paths
        atbats_path = CONFIG.get(self.league)\
            .get('paths')\
            .get('normalized').format(
                f='game_atbats'
            )
        # os.listdir order is arbitrary; sort so that "first row per game"
        # below does not depend on filesystem enumeration order.
        date_dirs = sorted(
            d for d in os.listdir(atbats_path)
            if self.min_date_gte <= d <= self.max_date_lte
        )
        atbats_paths_full = []
        for d in date_dirs:
            day_path = atbats_path + d + "/"
            atbats_paths_full.extend(
                day_path + fname for fname in sorted(os.listdir(day_path))
            )

        # Get atbats
        df_ab = pd.concat(
            objs=[pd.read_parquet(p) for p in atbats_paths_full],
            axis=0
        )
        # Plain column assignment replaces the column (and its dtype).
        # `df.loc[:, col] = ...` may instead write in place on newer pandas
        # and leave the column as object dtype.
        df_ab['gameStartTime'] = pd.to_datetime(df_ab['gameStartTime'].str[:10])

        # Split once on a single cutoff so both halves use the same instant
        if self.upcoming_start_gte:
            cutoff = self.upcoming_start_gte
        else:
            cutoff = dt.datetime.now()
        # Save upcoming to use lineup approach with later
        df_upc = df_ab.loc[df_ab['gameStartTime'] >= cutoff, :]
        df_ab = df_ab.loc[df_ab['gameStartTime'] < cutoff, :]

        # -------------------------
        # Past games: use atbats to get starter (in case lineup wrong)
        first_batter = (df_ab['inning'] == 1) & (df_ab['outCount'] == 0)
        hist = {}
        for half, col in (('TOP', 'homeStartingPitcherId'),
                          ('BOTTOM', 'awayStartingPitcherId')):
            # Home team pitches the TOP half, away team the BOTTOM half
            in_half = first_batter & (df_ab['inningHalf'] == half)
            hist[col] = df_ab.loc[in_half, ['gameId', 'pitcherId']]\
                .drop_duplicates(subset=['gameId'])\
                .rename(columns={'pitcherId': col})

        # Assemble starters
        df_hist_starters = pd.merge(
            hist['homeStartingPitcherId'],
            hist['awayStartingPitcherId'],
            how='outer',
            on=['gameId'],
            validate='1:1'
        )

        # -------------------------
        # Current/future games: use lineups to get starter
        if not hasattr(self, 'lineups'):
            self.add_lineups()
        lineups = self.lineups
        upcoming_ids = df_upc['gameId'].unique()
        is_upc_pitcher = (
            (lineups['playerPositionGeneral'] == 'P')
            &
            (lineups['gameId'].isin(upcoming_ids))
        )
        side = lineups['batterDisposition'].str.lower()

        upc = {}
        for disp in ('home', 'away'):
            col = '{}StartingPitcherId'.format(disp)
            # rename() returns a new frame, so no write hits a .loc slice
            upc[disp] = lineups.loc[
                is_upc_pitcher & (side == disp), ['gameId', 'playerId']]\
                .rename(columns={'playerId': col})\
                .drop_duplicates(subset=['gameId'])

        # Combine to one game per row. Outer join (as for the historical
        # starters) so a game with only an away pitcher listed is not lost.
        df_upc_starters = pd.merge(
            upc['home'],
            upc['away'],
            how='outer',
            on=['gameId'],
            validate='1:1'
        )

        # Concat hist and upc vertically to merge back to summary attrib
        df_starters = pd.concat(
            objs=[df_hist_starters, df_upc_starters],
            axis=0
        )

        # Merge to summary attribute
        self.summary = pd.merge(
            self.summary,
            df_starters,
            how='left',
            on=['gameId'],
            validate='1:1'
        )
Ejemplo n.º 3
0
    def add_pitcher_rolling_stats(
        self,
        dispositions=['home', 'away'],
        pitcher_roll_types=['starter', 'reliever', 'closer'],
        shift_back=True
    ):
        """
        Attach per-pitcher rolling stats to the pitching-related attributes.

        Rolling-stat files under the configured ``rolling_stats`` pitching
        ``player/`` path are read when their name (minus ``.parquet``) falls
        inside ``[self.min_date_gte, self.max_date_lte]``. Only ``gameId``,
        ``gameStartDate``, ``playerId`` and ``self.pitching_roll_stats``
        columns are kept. Rows holding +/-inf in any stat are dropped.

        Depending on ``pitcher_roll_types``:
          * 'starter'  -> sets ``self.starting_pitcher_summary`` (one row per
            game and starting pitcher, with the rolling stats joined on).
          * 'reliever' -> replaces ``self.bullpen_reliever_summary`` with a
            per (gameId, gameStartTime, teamId, bullpenDisposition)
            mean/max/min aggregate of the joined stats.
          * 'closer'   -> same for ``self.bullpen_closer_summary``.

        Args:
            dispositions: team sides to process; either contains both
                'home' and 'away', or is exactly one of them.
            pitcher_roll_types: which of 'starter', 'reliever', 'closer'
                to build.
            shift_back: when True each pitcher's stats are shifted down one
                row within that pitcher (ordered by ``gameStartDate``), so a
                game only sees values from the pitcher's previous row. The
                first row of every pitcher becomes NaN.

        Raises:
            ValueError: from ``pd.concat`` when no rolling-stat file is in
                the date window.
            AssertionError: when ``dispositions`` is not a supported
                combination (reliever / closer branches).
            pandas.errors.MergeError: when a ``validate='1:1'`` merge sees
                duplicate keys.
        """
        helper.progress("Adding Pitcher Rolling Stats to pitching-related attributes")

        # Path
        ptch_roll_path = CONFIG.get(self.league)\
            .get('paths')\
            .get('rolling_stats').format('pitching')+"player/"

        # Read in (sorted: os.listdir order is arbitrary)
        fnames = sorted(
            fname for fname in os.listdir(ptch_roll_path)
            if self.min_date_gte
            <= fname.replace(".parquet", "")
            <= self.max_date_lte
        )
        ptch_roll = pd.concat(
            objs=[pd.read_parquet(ptch_roll_path+fname) for fname in fnames],
            axis=0
        )

        # Subset to the id columns plus the configured rolling metrics
        stats = list(self.pitching_roll_stats)
        ptch_roll = ptch_roll.loc[:, ['gameId', 'gameStartDate', 'playerId'] + stats]

        # Sort (stable, so equal keys keep their file order)
        ptch_roll = ptch_roll.sort_values(
            by=['playerId', 'gameStartDate'],
            ascending=True,
            kind='mergesort'
        ).reset_index(drop=True)

        # Shift back if interested in rolling stats leading up to game.
        # Shifting inside each player group also blanks the player's first
        # row; leaving it unshifted would expose same-game values there.
        if shift_back and stats:
            ptch_roll[stats] = ptch_roll.groupby('playerId')[stats].shift(1)

        # Handle Infs: drop any row with +/-inf in at least one stat
        if stats:
            has_inf = ptch_roll[stats].isin([np.inf, -np.inf]).any(axis=1)
            ptch_roll = ptch_roll.loc[~has_inf, :]

        # Check if starter / all designation
        if 'starter' in pitcher_roll_types:
            print("    Adding stats for starters")

            # Check that summary attribute has starting pitchers
            if not any('StartingPitcherId' in col for col in
                       self.summary.columns):
                self.add_starting_pitchers(dispositions=dispositions)

            # One long frame: (gameId, pitcherId, pitcherDisposition)
            frames = []
            for disp in dispositions:
                pc = '{}StartingPitcherId'.format(disp)
                df = self.summary.loc[self.summary[pc].notnull(), ['gameId', pc]]\
                    .rename(columns={pc: 'pitcherId'})
                # First four characters of the column name: 'home' / 'away'
                df['pitcherDisposition'] = pc[:4].lower()
                frames.append(df)

            # concatenate to form attribute
            starters = pd.concat(objs=frames, axis=0)
            starters = pd.merge(
                starters,
                ptch_roll,
                how='left',
                left_on=['gameId', 'pitcherId'],
                right_on=['gameId', 'playerId'],
                validate='1:1'
            )
            self.starting_pitcher_summary = starters.drop(
                labels=['playerId'],
                axis=1
            )

        # Check if reliever / all designation
        if 'reliever' in pitcher_roll_types:
            print("    Adding stats for relievers")
            self._add_bullpen_role_rolling_stats(
                attr='bullpen_reliever_summary',
                role='reliever',
                empty_label='relief',
                ptch_roll=ptch_roll,
                dispositions=dispositions
            )

        # TODO FIX CLOSER MERGE _x _y
        if 'closer' in pitcher_roll_types:
            print("    Adding stats for closers")
            self._add_bullpen_role_rolling_stats(
                attr='bullpen_closer_summary',
                role='closer',
                empty_label='closing',
                ptch_roll=ptch_roll,
                dispositions=dispositions
            )

    def _add_bullpen_role_rolling_stats(self, attr, role, empty_label,
                                        ptch_roll, dispositions):
        """
        Join rolling stats onto one bullpen attribute and aggregate it.

        Args:
            attr: name of the attribute to read and overwrite
                ('bullpen_reliever_summary' or 'bullpen_closer_summary').
            role: ``pitcherRoleType`` value (lower case) to select.
            empty_label: word used in the warning when no pitcher matches.
            ptch_roll: prepared rolling-stat frame keyed by gameId/playerId.
            dispositions: team sides to process.

        NOTE(review): after this runs the attribute holds the aggregated
        frame, which appears to no longer carry ``pitcherRoleType``; a
        second call would then fail on the role mask - confirm.
        """
        # Check attribute (try / except cheaper but less readable)
        if not hasattr(self, attr):
            self.add_bullpen_summary(dispositions=dispositions)
        summary = getattr(self, attr)

        # Pitchers of the requested role
        msk = (summary['pitcherRoleType'].str.lower() == role)
        bullpen = summary.loc[msk, :]
        if bullpen.shape[0] == 0:
            warnings.warn(
                "    No {} pitchers found in bullpen_summary attribute".format(empty_label)
            )

        if not all(d in dispositions for d in ['home', 'away']):
            assert len(dispositions) == 1 and dispositions[0] in ['home', 'away']

        # Join stats side by side so each 1:1 validation is per disposition
        per_side = []
        for disp in dispositions:
            side = bullpen.loc[
                bullpen['bullpenDisposition'] == disp, ['gameId', 'pitcherId']]
            side = pd.merge(
                side,
                ptch_roll,
                how='left',
                left_on=['gameId', 'pitcherId'],
                right_on=['gameId', 'playerId'],
                validate='1:1'
            )
            per_side.append(side.drop(labels=['playerId'], axis=1))
        with_stats = pd.concat(objs=per_side, axis=0)

        # Add back to summary / detail
        summary = pd.merge(
            summary,
            with_stats,
            how='left',
            on=['gameId', 'pitcherId'],
            validate='1:1'
        )
        setattr(self, attr, summary)

        # Aggregate every column whose name contains a pitching stat name
        # TODO Standard Deviation might not be best here
        aggDict = {stat: ['mean', 'max', 'min'] for stat in [
            x for x in summary.columns if
            any(y in x for y in self.pitching_stats)
        ]}
        df = summary.groupby(
            by=['gameId', 'gameStartTime', 'teamId', 'bullpenDisposition'],
            as_index=False
        ).agg(aggDict)
        # Flatten the (column, aggregate) pairs into "column~aggregate"
        df.columns = [
            x[0] if x[1] == '' else x[0]+"~"+x[1] for x in
            df.columns
        ]
        setattr(self, attr, df)
Ejemplo n.º 4
0
    def add_bullpen_summary(self, dispositions=['home', 'away']):
        """
        Build the bullpen attributes ``self.bullpen_reliever_summary`` and
        ``self.bullpen_closer_summary`` from at-bat data.

        For each side (TOP half -> home pitchers, BOTTOM half -> away
        pitchers) the at-bats are ordered by game, inning and outCount, the
        starting pitcher recorded in ``self.summary`` is removed, and every
        remaining pitcher gets:
          * ``pitcherAppearOrder``: 1-based order of first appearance
            within the game (float, as returned by ``rank``);
          * ``pitcherRoleType``: 'closer' for the last pitcher to appear,
            'reliever' for the others.
        Output columns: gameId, gameStartTime, pitcherId, pitcherRoleType,
        teamId, pitcherAppearOrder, bullpenDisposition.

        Both sides are always processed. ``dispositions`` is only used to
        check whether the starting-pitcher columns already exist;
        ``self.add_starting_pitchers()`` is called when they do not.

        NOTE(review): games with no starter in ``self.summary`` map to NaN,
        so none of their pitchers is filtered out - confirm that is wanted.

        Args:
            dispositions: sides whose ``<side>StartingPitcherId`` column
                must be present on ``self.summary``.

        Raises:
            ValueError: from ``pd.concat`` when no at-bat file is found in
                the date window.
        """
        helper.progress("Adding Bullpen Summary Attribute")

        # Starters are needed to tell bullpen pitchers apart
        starter_cols = ['{}StartingPitcherId'.format(d) for d in dispositions]
        if not all(s in self.summary.columns for s in starter_cols):
            self.add_starting_pitchers()

        # Paths
        atbats_path = CONFIG.get(self.league)\
            .get('paths')\
            .get('normalized').format(
                f='game_atbats'
            )
        # os.listdir order is arbitrary; sort for reproducible tie order
        date_dirs = sorted(
            d for d in os.listdir(atbats_path)
            if self.min_date_gte <= d <= self.max_date_lte
        )
        atbats_paths_full = []
        for d in date_dirs:
            day_path = atbats_path + d + "/"
            atbats_paths_full.extend(
                day_path + fname for fname in sorted(os.listdir(day_path))
            )

        # Get atbats
        df_ab = pd.concat(
            objs=[pd.read_parquet(p) for p in atbats_paths_full],
            axis=0
        )
        df_ab = df_ab.loc[:, ['gameId', 'gameStartTime', 'pitcherId', 'homeTeamId', 'awayTeamId',
                              'inning', 'inningHalf', 'outCount']]

        bullpen_summary = []
        sides = {'TOP': 'home', 'BOTTOM': 'away'}
        for half_, disp in sides.items():
            starter_col = '{}StartingPitcherId'.format(disp)
            team_col = '{}TeamId'.format(disp)

            # Set up starter map for later mask
            startingPitcherMap = self.summary.set_index('gameId')\
                [starter_col].to_dict()

            # Chronological order inside each game (stable sort keeps the
            # file order for rows with equal inning / outCount)
            df_ab_h = df_ab.loc[df_ab['inningHalf'] == half_, :].sort_values(
                by=['gameId', 'gameStartTime', 'inning', 'outCount'],
                ascending=True,
                kind='mergesort'
            )
            df_ab_h = df_ab_h.drop(labels=['inning', 'outCount'], axis=1)

            # Remove pitcher who was already identified as starter
            starter_ids = df_ab_h['gameId'].map(startingPitcherMap)
            df_ab_h = df_ab_h.loc[df_ab_h['pitcherId'] != starter_ids, :]\
                .reset_index(drop=True)

            # Handle ordering. The row position within the game is the
            # appearance clock; ranking pitcherId itself would order
            # pitchers by id value rather than by when they entered.
            df_ab_h['firstSeen'] = df_ab_h.groupby(by=['gameId']).cumcount()
            df_ab_h = df_ab_h.groupby(
                by=['gameId', 'gameStartTime', team_col, 'pitcherId'],
                as_index=False).agg({'firstSeen': 'min'})
            df_ab_h['pitcherAppearOrder'] = df_ab_h\
                .groupby(by=['gameId'])['firstSeen'].rank(method='first')
            df_ab_h['pitcherAppearOrderMax'] = df_ab_h\
                .groupby('gameId')['pitcherAppearOrder'].transform('max')

            # Label middle pitchers relief role and last pitcher closer role
            is_last = (df_ab_h['pitcherAppearOrder'] == df_ab_h['pitcherAppearOrderMax'])
            df_ab_h['pitcherRoleType'] = is_last.map(
                {True: 'closer', False: 'reliever'}
            )

            # Subset (TODO add first inning appeared)
            df_ab_h = df_ab_h.loc[:, ['gameId', 'gameStartTime', 'pitcherId', 'pitcherRoleType',
                                      team_col, 'pitcherAppearOrder']]
            df_ab_h = df_ab_h.rename(columns={team_col: 'teamId'})
            df_ab_h['bullpenDisposition'] = disp
            bullpen_summary.append(df_ab_h)

        # Split the stacked result by role into the two attributes
        bullpen_summary = pd.concat(objs=bullpen_summary, axis=0)
        self.bullpen_reliever_summary = bullpen_summary.loc[
            bullpen_summary['pitcherRoleType'] == 'reliever', :]
        self.bullpen_closer_summary = bullpen_summary.loc[
            bullpen_summary['pitcherRoleType'] == 'closer', :]