Beispiel #1
0
class CleanData(object):
    """Row filters and name normalisation for player box-score data frames."""

    # COMMENT value that the filters below treat specially.
    _DNP_COACHS_DECISION = "DNP - Coach's Decision"

    def __init__(self):
        self.feature_creation = FeatureCreation()

    def drop_rows_player_inactive(self, df):
        """Return only the rows of ``df`` with ``SECONDSPLAYED`` greater than zero."""
        return df.loc[df['SECONDSPLAYED'] > 0]

    def drop_rows_player_injured(self, df):
        """Return the rows of ``df`` that are either played games or coach's-decision DNPs.

        A row is kept when ``SECONDSPLAYED`` is non-zero or ``COMMENT`` equals
        "DNP - Coach's Decision"; every other zero-second row is dropped
        (presumably injuries and other absences, judging by the method name).
        """
        played = df['SECONDSPLAYED'] != 0
        coachs_decision = df['COMMENT'] == self._DNP_COACHS_DECISION
        return df.loc[played | coachs_decision]

    def drop_rows_player_rest(self, df, thresh=1200):
        """Drop coach's-decision DNP rows of players whose average playing time exceeds ``thresh``.

        Args:
            df: frame with ``SEASON``, ``PLAYERID``, ``SECONDSPLAYED`` and
                ``COMMENT`` columns.
            thresh: average ``SECONDSPLAYED`` (per season and player, as
                computed by ``FeatureCreation.expanding_mean``) above which a
                "DNP - Coach's Decision" row is removed.

        Returns:
            The filtered frame, without the temporary ``AVG_SP`` column.
        """
        df = self.feature_creation.expanding_mean(
            df=df,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='SECONDSPLAYED',
            new_col_name='AVG_SP')
        rested = (df['AVG_SP'] > thresh) & (df['COMMENT'] == self._DNP_COACHS_DECISION)
        df = df.loc[~rested]
        return df.drop(columns=['AVG_SP'])

    def roto_name_to_nba_name(self, name):
        """Convert a "Last, First" name to "First Last" and apply ``ROTO_NAME_TO_NBA_NAME``.

        The text after the final comma is moved to the front. The name built
        exactly as before is looked up first so existing mapping keys keep
        working; otherwise surrounding/duplicate whitespace is removed (a name
        without a comma used to come back with a trailing space) and the
        cleaned name is looked up, falling back to the cleaned name itself.
        """
        name_list = name.split(',')
        legacy_name = "{} {}".format(name_list[-1].lstrip(), ' '.join(name_list[:-1]))
        if legacy_name in ROTO_NAME_TO_NBA_NAME:
            return ROTO_NAME_TO_NBA_NAME[legacy_name]

        # Collapse runs of whitespace and strip both ends.
        clean_name = ' '.join(legacy_name.split())
        if clean_name in ROTO_NAME_TO_NBA_NAME:
            return ROTO_NAME_TO_NBA_NAME[clean_name]
        return clean_name
Beispiel #2
0
class RPSModel(object):
    """Regression model for a player's rebounds per second played (``RPS``).

    Intended call order: ``create_features`` -> ``train_model`` -> ``predict``.
    ``create_features`` replaces ``train_data`` / ``test_data`` with
    feature-enriched frames and fills ``regressors`` with the feature names.

    NOTE(review): the exact semantics of the ``FeatureCreation`` helpers
    (e.g. whether the current game is excluded from expanding statistics) are
    not visible here - confirm against that class.
    """

    def __init__(self, train_data, test_data):
        """Store the raw box-score frames and build an untrained CatBoost model.

        Args:
            train_data: rows used to fit the model.
            test_data: rows to generate predictions for.
        """
        self.feature_creation = FeatureCreation()

        self.train_data = train_data
        self.test_data = test_data
        self.model = CatBoostRegressionModel(RPS_MODEL_PARAMS)

        self.regressors = []
        self.regressand = 'RPS'
        # Name of the sample-weight column filled in by generate_weights().
        self.weight = 'WEIGHT'

        self.created_features = False
        self.generated_weights = False
        self.trained_model = False

    @staticmethod
    def _joined_player_ids(data, mask):
        """Build a per-(GAMEID, TEAM) key from the PLAYERIDs of the rows selected by ``mask``.

        Every row of a game/team gets the same string: the selected player ids
        sorted and joined with '_' ('' when no row of the group is selected).
        """
        selected_ids = data['PLAYERID'].where(mask)
        return selected_ids.groupby([data['GAMEID'], data['TEAM']]).transform(
            lambda ids: '_'.join(sorted(ids.dropna()))
        )

    @staticmethod
    def _select_rows(data, keys):
        """Return the rows of ``data`` whose index is in ``keys``, keeping ``data``'s row order."""
        return data.loc[data.index.isin(keys)].reset_index()

    def create_features(self, odds_data, sp_threshold=60):
        """Compute all model features on train+test and split them back apart.

        Args:
            odds_data: frame with at least ``PERIOD``, ``TOTAL``, ``DATE`` and
                ``TEAM`` columns; only ``PERIOD == 'Full Game'`` rows are used.
                It is not modified.
            sp_threshold: games with ``SECONDSPLAYED`` at or below this value
                are blanked out of the ``CLEAN_*`` per-second rates.

        Side effects:
            Appends the feature names to ``self.regressors``, replaces
            ``self.train_data`` / ``self.test_data`` and sets
            ``self.created_features``.
        """
        data = pd.concat([self.train_data, self.test_data])

        data['REB'] = data['DREB'] + data['OREB']
        data[self.regressand] = data['REB']/data['SECONDSPLAYED']
        data['ORPS'] = data['OREB']/data['SECONDSPLAYED']
        data['DRPS'] = data['DREB']/data['SECONDSPLAYED']

        # Rates from very short appearances are treated as missing.
        # (CLEAN_ORPS is computed but not used by any feature below.)
        too_short = data['SECONDSPLAYED'] <= sp_threshold
        data['CLEAN_DRPS'] = data['DRPS']
        data.loc[too_short, 'CLEAN_DRPS'] = np.nan
        data['CLEAN_ORPS'] = data['ORPS']
        data.loc[too_short, 'CLEAN_ORPS'] = np.nan

        # Remember which (game, player) rows belong to which set before features are added.
        train_index = self.train_data.set_index(['GAMEID', 'PLAYERID']).index
        test_index = self.test_data.set_index(['GAMEID', 'PLAYERID']).index

        # season averages
        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DRPS', weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_DRPS'
        )
        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='ORPS', weight_col_name='SECONDSPLAYED',
            new_col_name='AVG_ORPS'
        )
        self.regressors.append('AVG_DRPS')
        self.regressors.append('AVG_ORPS')

        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'OPP_TEAM', 'PLAYERID'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_OPP_TEAM'
        )
        self.regressors.append('AVG_Y_OPP_TEAM')

        # 1 game lags
        data = self.feature_creation.lag(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='CLEAN_DRPS', new_col_name='L1_DRPS',
            n_shift=1
        )
        self.regressors.append('L1_DRPS')

        # exponentially weighted means
        data = self.feature_creation.expanding_ewm(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DRPS', new_col_name='EWM_DRPS',
            alpha=0.90
        )
        data = self.feature_creation.expanding_ewm(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='ORPS', new_col_name='EWM_ORPS',
            alpha=0.90
        )
        self.regressors.append('EWM_DRPS')
        self.regressors.append('EWM_ORPS')

        # moving averages
        data = self.feature_creation.rolling_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DRPS', new_col_name='MA2_DRPS',
            weight_col_name='SECONDSPLAYED', n_rolling=2, min_periods=1
        )
        data = self.feature_creation.rolling_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DRPS', new_col_name='MA15_DRPS',
            weight_col_name='SECONDSPLAYED', n_rolling=15, min_periods=8
        )
        data = self.feature_creation.rolling_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='ORPS', new_col_name='MA6_ORPS',
            weight_col_name='SECONDSPLAYED', n_rolling=6, min_periods=3
        )
        data = self.feature_creation.rolling_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='ORPS', new_col_name='MA18_ORPS',
            weight_col_name='SECONDSPLAYED', n_rolling=18, min_periods=9
        )
        self.regressors.append('MA2_DRPS')
        self.regressors.append('MA15_DRPS')
        self.regressors.append('MA6_ORPS')
        self.regressors.append('MA18_ORPS')

        # start
        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID', 'START'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_R'
        )
        self.regressors.append('AVG_Y_R')

        # position: keep only the primary position of hyphenated values (text before the first '-')
        data['NORM_POS'] = data['POSITION'].apply(lambda x: x if '-' not in x else x.split('-')[0])
        data['GUARD'] = 0
        data.loc[data['NORM_POS'] == 'Guard', 'GUARD'] = 1
        self.regressors.append('GUARD')

        # defense: how much more/less than their own average players rebound against each opponent
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='DREB', new_col_name='AVG_DREB'
        )
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='OREB', new_col_name='AVG_OREB'
        )
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='SECONDSPLAYED', new_col_name='AVG_SP'
        )

        # ... per opponent
        temp = data.dropna(subset=['DREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG': x['AVG_DREB'].sum()/x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED'] = grouped_defensive_boxscores['TEAM_DRPS_ALLOWED'] - \
            grouped_defensive_boxscores['TEAM_DRPS_AVG']
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'OPP_TEAM'], col_name='TEAM_DRPS_DIFF_ALLOWED',
            new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED', order_idx_name='DATE', min_periods=5
        )
        data = data.merge(grouped_defensive_boxscores, on=['SEASON', 'DATE', 'OPP_TEAM'], how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED')

        # ... per opponent, split by the START flag
        temp = data.dropna(subset=['DREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'START', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED_R': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG_R': x['AVG_DREB'].sum()/x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED_R'] = grouped_defensive_boxscores['TEAM_DRPS_ALLOWED_R'] - \
            grouped_defensive_boxscores['TEAM_DRPS_AVG_R']
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'START', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED_R', new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED_R', order_idx_name='DATE',
            min_periods=5
        )
        data = data.merge(grouped_defensive_boxscores, on=['SEASON', 'DATE', 'START', 'OPP_TEAM'], how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED_R')

        # ... per opponent, split by normalised position (defensive and offensive rebounds)
        temp = data.dropna(subset=['DREB', 'OREB', 'SECONDSPLAYED', 'AVG_DREB', 'AVG_OREB', 'AVG_SP'])
        grouped_defensive_boxscores = temp.groupby(['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_DRPS_ALLOWED_P': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_DRPS_AVG_P': x['AVG_DREB'].sum()/x['AVG_SP'].sum(),
                'TEAM_ORPS_ALLOWED_P': x['OREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_ORPS_AVG_P': x['AVG_OREB'].sum()/x['AVG_SP'].sum()
            })
        ).reset_index()
        grouped_defensive_boxscores['TEAM_DRPS_DIFF_ALLOWED_P'] = grouped_defensive_boxscores['TEAM_DRPS_ALLOWED_P'] - \
            grouped_defensive_boxscores['TEAM_DRPS_AVG_P']
        grouped_defensive_boxscores['TEAM_ORPS_DIFF_ALLOWED_P'] = grouped_defensive_boxscores['TEAM_ORPS_ALLOWED_P'] - \
            grouped_defensive_boxscores['TEAM_ORPS_AVG_P']
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'NORM_POS', 'OPP_TEAM'],
            col_name='TEAM_DRPS_DIFF_ALLOWED_P', new_col_name='AVG_TEAM_DRPS_DIFF_ALLOWED_P', order_idx_name='DATE',
            min_periods=5
        )
        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores, group_col_names=['SEASON', 'NORM_POS', 'OPP_TEAM'],
            col_name='TEAM_ORPS_DIFF_ALLOWED_P', new_col_name='AVG_TEAM_ORPS_DIFF_ALLOWED_P', order_idx_name='DATE',
            min_periods=5
        )
        data = data.merge(grouped_defensive_boxscores, on=['SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM'], how='left')
        self.regressors.append('AVG_TEAM_DRPS_DIFF_ALLOWED_P')
        self.regressors.append('AVG_TEAM_ORPS_DIFF_ALLOWED_P')

        # total
        # Copy the filtered rows so the assignment below neither touches the
        # caller's odds_data nor triggers a SettingWithCopy warning.
        full_game_odds = odds_data.loc[odds_data['PERIOD'] == 'Full Game'].copy()
        full_game_odds['TOTAL'] = full_game_odds['TOTAL'].replace(['PK', '-'], np.nan)
        data = data.merge(full_game_odds, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TOTAL')

        # injuries
        data = self.feature_creation.expanding_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name='REB', new_col_name='AVG_REB'
        )

        # NOTE(review): unlike the aggregates above, this one is not
        # reset_index()-ed and passes no order_idx_name, so SEASON/DATE/TEAM
        # stay index levels for expanding_mean and the merge - confirm that
        # FeatureCreation.expanding_mean handles that as intended.
        temp = data.dropna(subset=['DREB', 'AVG_DREB', 'SECONDSPLAYED', 'AVG_SP'])
        temp = temp.groupby(['SEASON', 'DATE', 'TEAM']).apply(
            lambda x: pd.Series({
                'TEAM_ACTIVE_AVG_DRPS': x['AVG_DREB'].sum()/x['AVG_SP'].sum(),
                'TEAM_DRPS': x['DREB'].sum()/x['SECONDSPLAYED'].sum(),
                'TEAM_ACTIVE_AVG_RPS': x['AVG_REB'].sum()/x['AVG_SP'].sum(),
                'TEAM_RPS': x['REB'].sum()/x['SECONDSPLAYED'].sum()
            })
        )
        temp = self.feature_creation.expanding_mean(
            df=temp, group_col_names=['SEASON', 'TEAM'], col_name='TEAM_DRPS', new_col_name='AVG_TEAM_DRPS'
        )
        temp['TEAM_ACTIVE_AVG_DRPS_DIFF'] = temp['TEAM_ACTIVE_AVG_DRPS'] - temp['AVG_TEAM_DRPS']
        data = data.merge(temp, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TEAM_ACTIVE_AVG_DRPS_DIFF')

        # regressand by lineup
        # One vectorised pass per key instead of a .loc assignment per
        # (game, team); also avoids writing strings into a float NaN column.
        data['START_LINEUP'] = self._joined_player_ids(data, data['START'] == 1)
        data['STARS'] = self._joined_player_ids(data, data['AVG_DREB'] >= 7)

        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'START_LINEUP', 'PLAYERID'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_STARTERS'
        )

        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'STARS', 'PLAYERID'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y_STARS'
        )
        self.regressors.append('AVG_Y_STARTERS')
        self.regressors.append('AVG_Y_STARS')

        # misc
        data['GP'] = 1
        data = self.feature_creation.expanding_sum(
            df=data, group_col_names=['SEASON', 'PLAYERID'], col_name='GP', new_col_name='COUNT_GP'
        )
        self.regressors.append('COUNT_GP')
        self.regressors.append('AVG_SP')

        # to fill
        data = self.feature_creation.expanding_weighted_mean(
            df=data, group_col_names=['SEASON', 'TEAM', 'PLAYERID'], col_name=self.regressand,
            weight_col_name='SECONDSPLAYED', new_col_name='AVG_Y'
        )

        data = self.generate_weights(data)
        data = self.preprocess(data)
        data = data.set_index(['GAMEID', 'PLAYERID'])

        # Split back into train/test. Selecting with a mask keeps the row
        # order of `data`; going through a set made the order arbitrary and
        # therefore the fit non-reproducible between runs.
        self.train_data = self._select_rows(data, train_index)
        self.test_data = self._select_rows(data, test_index)

        self.created_features = True

    def preprocess(self, data):
        """Fill missing feature values with fallbacks and drop unpredictable rows.

        Each feature falls back to a coarser statistic of the same quantity
        (ultimately the season averages), league-neutral values (0 for the
        "diff" features, 200 for ``TOTAL``), and rows without ``AVG_Y`` are
        removed. ``data`` is modified in place and the filtered frame returned.
        """
        data['AVG_Y_R'] = data['AVG_Y_R'].fillna(data['AVG_Y'])
        data['AVG_Y_OPP_TEAM'] = data['AVG_Y_OPP_TEAM'].fillna(data['AVG_Y'])

        data['L1_DRPS'] = data['L1_DRPS'].fillna(data['AVG_DRPS'])

        data['EWM_DRPS'] = data['EWM_DRPS'].fillna(data['AVG_DRPS'])
        data['EWM_ORPS'] = data['EWM_ORPS'].fillna(data['AVG_ORPS'])

        # Long windows fall back to the (already filled) short windows.
        data['MA2_DRPS'] = data['MA2_DRPS'].fillna(data['AVG_DRPS'])
        data['MA15_DRPS'] = data['MA15_DRPS'].fillna(data['MA2_DRPS'])
        data['MA6_ORPS'] = data['MA6_ORPS'].fillna(data['AVG_ORPS'])
        data['MA18_ORPS'] = data['MA18_ORPS'].fillna(data['MA6_ORPS'])

        data['AVG_TEAM_DRPS_DIFF_ALLOWED'] = data['AVG_TEAM_DRPS_DIFF_ALLOWED'].fillna(0)
        data['AVG_TEAM_DRPS_DIFF_ALLOWED_R'] = data['AVG_TEAM_DRPS_DIFF_ALLOWED_R'].fillna(0)
        data['AVG_TEAM_DRPS_DIFF_ALLOWED_P'] = data['AVG_TEAM_DRPS_DIFF_ALLOWED_P'].fillna(0)
        data['AVG_TEAM_ORPS_DIFF_ALLOWED_P'] = data['AVG_TEAM_ORPS_DIFF_ALLOWED_P'].fillna(0)

        data['TOTAL'] = data['TOTAL'].fillna(200)

        data['TEAM_ACTIVE_AVG_DRPS_DIFF'] = data['TEAM_ACTIVE_AVG_DRPS_DIFF'].fillna(0)
        data['AVG_Y_STARS'] = data['AVG_Y_STARS'].fillna(data['AVG_Y'])
        data['AVG_Y_STARTERS'] = data['AVG_Y_STARTERS'].fillna(data['AVG_Y_STARS'])

        data['COUNT_GP'] = data['COUNT_GP'].fillna(0)

        # we can predict Y for a player as long as AVG_Y is not nan
        data = data.dropna(subset=['AVG_Y'])

        return data

    def generate_weights(self, data):
        """Add the sample-weight column (``self.weight``) to ``data`` and return it.

        The weight is the product of a per-game weight derived from
        ``SECONDSPLAYED`` and a per-season weight derived from ``SUM_SP`` (the
        expanding sum of ``SECONDSPLAYED`` per season and player); both
        mappings come from ``WeightFunctions``.
        """
        data = self.feature_creation.expanding_sum(
            df=data, group_col_names=['SEASON', 'PLAYERID'], col_name='SECONDSPLAYED', new_col_name='SUM_SP'
        )

        self.weight = 'WEIGHT'
        data[self.weight] = data['SECONDSPLAYED'].apply(WeightFunctions.game_seconds_played_weight) * \
            data['SUM_SP'].apply(WeightFunctions.season_seconds_played_weight)

        self.generated_weights = True

        return data

    def train_model(self):
        """Fit the model on the training rows, weighted by ``self.weight``.

        Raises:
            RuntimeError: if ``create_features`` has not been run.
        """
        if not self.created_features:
            raise RuntimeError('Must create features before training model')

        # drop games in which players played a minute or less
        # NOTE(review): fixed at 60 seconds, independent of the sp_threshold
        # passed to create_features.
        self.train_data = self.train_data.loc[self.train_data['SECONDSPLAYED'] > 60]

        X = self.train_data[self.regressors]
        y = self.train_data[self.regressand]
        w = self.train_data[self.weight]
        self.model.fit(X, y, sample_weight=w, test_size=0.25, early_stopping_rounds=25)

        self.trained_model = True

    def predict(self):
        """Predict the regressand for ``test_data``.

        The predictions are also stored on ``self.test_data`` in the
        ``'<regressand>_HAT'`` column.

        Returns:
            Frame with ``GAMEID``, ``PLAYERID`` and the prediction column.

        Raises:
            RuntimeError: if ``train_model`` has not been run.
        """
        if not self.trained_model:
            raise RuntimeError('Must train model before generating predictions')

        output_column = '{}_HAT'.format(self.regressand)
        self.test_data[output_column] = self.model.predict(self.test_data[self.regressors])

        return self.test_data[['GAMEID', 'PLAYERID', output_column]]
Beispiel #3
0
class OwnershipModel(object):
    """Regression model for a player's contest ownership (``OWNERSHIP``) on a DFS site.

    Intended call order: ``create_features`` -> ``train_model`` -> ``predict``.
    ``create_features`` replaces ``train_data`` / ``test_data`` with
    feature-enriched frames (one row per date, slate and player) and fills
    ``regressors`` with the feature names.

    NOTE(review): the exact semantics of the ``FeatureCreation`` helpers
    (e.g. whether the current game is excluded from expanding statistics) are
    not visible here - confirm against that class.
    """

    def __init__(self, train_data, test_data, site):
        """Store the raw frames and build an untrained XGBoost model.

        Args:
            train_data: box-score rows used to fit the model.
            test_data: box-score rows to generate predictions for.
            site: value matched against the ``SITE`` column of the salary and
                contest data, and passed to ``FPCalculator``.
        """
        self.feature_creation = FeatureCreation()
        self.clean_data = CleanData()

        self.train_data = train_data
        self.test_data = test_data
        self.site = site
        self.model = XGBoostRegressionModel(OWNERSHIP_MODEL_PARAMS)

        self.regressors = []
        self.regressand = 'OWNERSHIP'

        self.created_features = False
        self.trained_model = False

    @staticmethod
    def _select_rows(data, keys):
        """Return the rows of ``data`` whose index is in ``keys``, keeping ``data``'s row order."""
        return data.loc[data.index.isin(keys)].reset_index()

    def create_features(self, salary_data, contest_data, ownership_data,
                        odds_data):
        """Compute all model features on train+test and split them back apart.

        Args:
            salary_data: per-site salaries; joined on ``DATE`` and ``NAME``.
            contest_data: contest/slate metadata (``SLATEID``,
                ``CONTESTNAME``, ``SITE``, ``TEAMS`` ...).
            ownership_data: per-contest ownership rows (``PLAYERNAME``,
                ``OWNERSHIP``, ``SLATEID``, ``CONTESTNAME`` ...).
            odds_data: betting lines; only ``PERIOD == 'Full Game'`` rows are
                used.

        None of the input frames is modified.

        Side effects:
            Appends the feature names to ``self.regressors``, replaces
            ``self.train_data`` / ``self.test_data`` and sets
            ``self.created_features``.
        """
        data = pd.concat([self.train_data, self.test_data])

        # Remember which (game, player) rows belong to which set before features are added.
        train_index = self.train_data.set_index(['GAMEID', 'PLAYERID']).index
        test_index = self.test_data.set_index(['GAMEID', 'PLAYERID']).index

        salary_data = salary_data.loc[salary_data['SITE'] == self.site]
        data = data.merge(salary_data, on=['DATE', 'NAME'], how='inner')

        # player stat features
        fp_calculator = FPCalculator(self.site)

        data['REB'] = data['DREB'] + data['OREB']
        data['DKFP'] = data.apply(
            lambda x: fp_calculator.calculate_fantasy_points(
                x['SEASON'], x['PTS'], x['REB'], x['AST'], x['TOV'], x['BLK'],
                x['STL'], x['FG3M']),
            axis=1)

        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='AVG_DKFP')
        self.regressors.append('AVG_DKFP')

        # average fantasy points per salary unit
        data['VALUE'] = data['AVG_DKFP'] / data['SALARY']
        self.regressors.append('VALUE')

        data = self.feature_creation.lag(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='L1_DKFP',
            n_shift=1)
        self.regressors.append('L1_DKFP')

        data = self.feature_creation.rolling_mean(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='MA5_DKFP',
            n_rolling=5)
        self.regressors.append('MA5_DKFP')

        data = self.feature_creation.lag(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='SALARY',
            new_col_name='L1_SALARY',
            n_shift=1)
        data['SALARY_CHANGE'] = data['SALARY'] - data['L1_SALARY']
        self.regressors.append('SALARY')
        self.regressors.append('SALARY_CHANGE')

        data = self.feature_creation.expanding_standard_deviation(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='DKFP',
            new_col_name='STD_DKFP',
            min_periods=5)
        self.regressors.append('STD_DKFP')

        self.regressors.append('START')

        # DFS_POSITION holds '_'-separated positions; non-string values stay NaN.
        data['DFS_POSITIONS'] = data['DFS_POSITION'].apply(
            lambda x: x.split('_') if isinstance(x, str) else np.nan)
        data['NUM_POSITIONS'] = data['DFS_POSITIONS'].apply(
            lambda x: len(x) if isinstance(x, list) else np.nan)
        self.regressors.append('NUM_POSITIONS')

        for position in ['SG', 'PG', 'C']:
            data[position] = 0
            # na=False: a missing DFS_POSITION means "not this position";
            # without it the NaN results cannot be used as a boolean mask.
            data.loc[data['DFS_POSITION'].str.contains(position, na=False), position] = 1
            self.regressors.append(position)

        # historical ownership of player
        # assign() builds a new frame, so the caller's ownership_data is not modified.
        ownership_data = ownership_data.assign(
            NAME=ownership_data['PLAYERNAME'].apply(
                lambda x: x if x not in OWNERSHIP_NAME_TO_NBA_NAME else
                OWNERSHIP_NAME_TO_NBA_NAME[x]))
        ownership_data = ownership_data.merge(contest_data,
                                              on=['SLATEID', 'CONTESTNAME'],
                                              how='inner')
        # entry-weighted ownership per slate and player
        ownership_data = ownership_data.groupby(
            ['DATE', 'SLATEID', 'GAMECOUNT',
             'NAME']).apply(lambda x: pd.Series({
                 'OWNERSHIP': (x['OWNERSHIP'] * x['TOTALENTRIES']).sum() / x[
                     'TOTALENTRIES'].sum()
             })).reset_index()

        # mean over all slates of a date
        aggregated_ownership = ownership_data.groupby(
            ['DATE', 'NAME']).apply(lambda x: pd.Series(
                {'TOTAL_OWNERSHIP': x['OWNERSHIP'].mean()})).reset_index()
        data = data.merge(aggregated_ownership,
                          on=['DATE', 'NAME'],
                          how='inner')

        data = self.feature_creation.expanding_mean(
            df=data,
            group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP',
            new_col_name='AVG_OWNERSHIP')
        self.regressors.append('AVG_OWNERSHIP')

        data = self.feature_creation.lag(df=data,
                                         group_col_names=['SEASON', 'NAME'],
                                         col_name='TOTAL_OWNERSHIP',
                                         new_col_name='L1_OWNERSHIP',
                                         n_shift=1)
        self.regressors.append('L1_OWNERSHIP')

        data = self.feature_creation.rolling_mean(
            df=data,
            group_col_names=['SEASON', 'NAME'],
            col_name='TOTAL_OWNERSHIP',
            new_col_name='MA5_OWNERSHIP',
            n_rolling=5)
        self.regressors.append('MA5_OWNERSHIP')

        # defense: fantasy points allowed to a position relative to the players' averages
        data['NORM_POS'] = data['POSITION'].apply(
            lambda x: x if '-' not in x else x.split('-')[0])

        temp = data.dropna(subset=['DKFP', 'AVG_DKFP'])
        grouped_defensive_boxscores = temp.groupby([
            'SEASON', 'DATE', 'NORM_POS', 'OPP_TEAM'
        ]).apply(lambda x: pd.Series({
            'TEAM_DKFP_ALLOWED_P': x['DKFP'].sum(),
            'TEAM_DKFP_AVG_P': x['AVG_DKFP'].sum()
        })).reset_index()

        grouped_defensive_boxscores['DvP'] = grouped_defensive_boxscores['TEAM_DKFP_ALLOWED_P'] - \
            grouped_defensive_boxscores['TEAM_DKFP_AVG_P']

        grouped_defensive_boxscores = self.feature_creation.expanding_mean(
            df=grouped_defensive_boxscores,
            group_col_names=['SEASON', 'OPP_TEAM', 'NORM_POS'],
            col_name='DvP',
            new_col_name='AVG_DvP',
            order_idx_name='DATE',
            min_periods=5)
        self.regressors.append('AVG_DvP')

        data = data.merge(grouped_defensive_boxscores,
                          on=['SEASON', 'DATE', 'OPP_TEAM', 'NORM_POS'],
                          how='left')

        # vegas lines
        # Filter first and copy, so the replacements below do not modify the
        # caller's odds_data.
        full_game_odds = odds_data.loc[odds_data['PERIOD'] == 'Full Game'].copy()
        full_game_odds['TOTAL'] = full_game_odds['TOTAL'].replace(
            ['PK', '-'], np.nan)
        full_game_odds['POINTSPREAD'] = full_game_odds['POINTSPREAD'].replace(
            ['PK', '-'], 0)
        data = data.merge(full_game_odds, on=['DATE', 'TEAM'], how='left')
        self.regressors.append('TOTAL')
        self.regressors.append('POINTSPREAD')

        # slate info
        # GAMECOUNT comes from ownership_data and is joined in by preprocess().
        self.regressors.append('GAMECOUNT')

        # one row per (slate, team)
        slates = contest_data.loc[
            contest_data['SITE'] == self.site,
            ['DATE', 'SLATEID', 'TEAMS']].drop_duplicates()
        slates['TEAMS'] = slates['TEAMS'].apply(lambda x: x.split('_'))
        slates = slates.explode('TEAMS').rename(columns={"TEAMS": "TEAM"})
        slates['TEAM'] = slates['TEAM'].apply(
            lambda x: x
            if x not in DB_TEAM_TO_NBA_TEAM else DB_TEAM_TO_NBA_TEAM[x])

        slate_players = data[[
            'DATE', 'TEAM', 'NAME', 'DFS_POSITIONS', 'SALARY', 'VALUE'
        ]].merge(slates, on=['DATE', 'TEAM'], how='inner')
        # 1000-wide salary bins with edges 3000 ... 14000; salaries outside
        # that range fall in no bin (NaN).
        slate_players['SALARY_BIN'] = pd.cut(slate_players['SALARY'],
                                             bins=list(range(
                                                 3000, 15000, 1000)),
                                             duplicates='drop',
                                             include_lowest=True)
        # one row per (slate, player, eligible position)
        slate_players = slate_players.explode('DFS_POSITIONS').rename(
            columns={'DFS_POSITIONS': 'SINGLE_DFS_POSITION'})

        # Players at or below this VALUE are not counted as competition.
        MIN_VALUE = 0.002

        # Feature naming: L1P = same single position, L2P = same position
        # family (G/F/C), L3P = whole slate; SB = same salary bin;
        # COUNT = number of competing players, RANK = rank by VALUE.
        all_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION']).apply(lambda x: pd.Series(
                {'L1P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
                                                      ).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'SINGLE_DFS_POSITION'], how='left')

        # NOTE(review): unlike the other counts, this one is not restricted
        # to VALUE > MIN_VALUE - confirm whether that is intended.
        sb_temp = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION',
             'SALARY_BIN']).apply(lambda x: pd.Series(
                 {'L1P_SB_COUNT': x['NAME'].count()})).reset_index().dropna()
        slate_players = slate_players.merge(
            sb_temp,
            on=['SLATEID', 'SINGLE_DFS_POSITION', 'SALARY_BIN'],
            how='left')

        L1_TO_L2 = {'PG': 'G', 'SG': 'G', 'SF': 'F', 'PF': 'F', 'C': 'C'}
        slate_players['LEVEL2_DFS_POSITION'] = slate_players[
            'SINGLE_DFS_POSITION'].apply(lambda x: L1_TO_L2[x]
                                         if isinstance(x, str) else np.nan)

        all_temp = slate_players.groupby(
            ['SLATEID', 'LEVEL2_DFS_POSITION']).apply(lambda x: pd.Series(
                {'L2P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
                                                      ).reset_index().dropna()
        slate_players = slate_players.merge(
            all_temp, on=['SLATEID', 'LEVEL2_DFS_POSITION'], how='left')

        all_temp = slate_players.groupby(
            ['SLATEID']).apply(lambda x: pd.Series(
                {'L3P_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
                               ).reset_index().dropna()
        slate_players = slate_players.merge(all_temp,
                                            on=['SLATEID'],
                                            how='left')

        sb_temp = slate_players.groupby([
            'SLATEID', 'SALARY_BIN'
        ]).apply(lambda x: pd.Series(
            {'L3P_SB_COUNT': x.loc[x['VALUE'] > MIN_VALUE, 'NAME'].count()})
                 ).reset_index().dropna()
        slate_players = slate_players.merge(sb_temp,
                                            on=['SLATEID', 'SALARY_BIN'],
                                            how='left')

        # lower edge of the salary bin, used as the grouping key for the ranks
        slate_players['SALARY_FLOOR'] = slate_players['SALARY_BIN'].apply(
            lambda x: x.left)

        slate_players['L1P_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION'])['VALUE'].rank(method='min',
                                                              ascending=False)

        slate_players['L1P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SINGLE_DFS_POSITION',
             'SALARY_FLOOR'])['VALUE'].rank(method='min', ascending=False)

        slate_players['L3P_RANK'] = slate_players.groupby(
            ['SLATEID'])['VALUE'].rank(method='min', ascending=False)

        slate_players['L3P_SB_RANK'] = slate_players.groupby(
            ['SLATEID', 'SALARY_FLOOR'])['VALUE'].rank(method='min',
                                                       ascending=False)

        # Collapse back to one row per (slate, player) by averaging over the
        # player's eligible positions.
        slate_data = slate_players.groupby([
            'DATE', 'SLATEID', 'NAME'
        ]).apply(lambda x: pd.Series({
            'L1P_COUNT': x['L1P_COUNT'].mean(),
            'L1P_RANK': x['L1P_RANK'].mean(),
            'L1P_SB_COUNT': x['L1P_SB_COUNT'].mean(),
            'L1P_SB_RANK': x['L1P_SB_RANK'].mean(),
            'L2P_COUNT': x['L2P_COUNT'].mean(),
            'L3P_COUNT': x['L3P_COUNT'].mean(),
            'L3P_RANK': x['L3P_RANK'].mean(),
            'L3P_SB_COUNT': x['L3P_SB_COUNT'].mean(),
            'L3P_SB_RANK': x['L3P_SB_RANK'].mean()
        })).reset_index()

        self.regressors.extend([
            'L1P_COUNT',
            'L1P_RANK',
            'L1P_SB_COUNT',
            'L1P_SB_RANK',
            'L2P_COUNT',
            'L3P_COUNT',
            'L3P_RANK',
            'L3P_SB_COUNT',
            'L3P_SB_RANK',
        ])

        data['GP'] = 1
        data = self.feature_creation.expanding_sum(
            df=data,
            group_col_names=['SEASON', 'PLAYERID'],
            col_name='GP',
            new_col_name='COUNT_GP')
        self.regressors.append('COUNT_GP')

        data = self.preprocess(data, slate_data, ownership_data)
        data = data.set_index(['GAMEID', 'PLAYERID'])

        # Split back into train/test. Selecting with a mask keeps the row
        # order of `data`; going through a set made the order arbitrary and
        # therefore the fit non-reproducible between runs.
        self.train_data = self._select_rows(data, train_index)
        self.test_data = self._select_rows(data, test_index)

        self.created_features = True

    def preprocess(self, data, slate_data, ownership_data):
        """Join player, slate and ownership data and fill missing feature values.

        Args:
            data: per-game player features.
            slate_data: per (DATE, SLATEID, NAME) slate features.
            ownership_data: per (DATE, SLATEID, NAME) ownership (the
                regressand) and ``GAMECOUNT``.

        Returns:
            One row per date, slate and player present in all three inputs
            (inner joins), with fallbacks filled in and rows without
            ``AVG_OWNERSHIP`` removed.
        """
        ownership_data = ownership_data.merge(slate_data,
                                              on=['DATE', 'SLATEID', 'NAME'],
                                              how='inner')
        data = ownership_data.merge(data, on=['DATE', 'NAME'], how='inner')

        data['L1_DKFP'] = data['L1_DKFP'].fillna(data['AVG_DKFP'])
        data['MA5_DKFP'] = data['MA5_DKFP'].fillna(data['AVG_DKFP'])

        data['SALARY_CHANGE'] = data['SALARY_CHANGE'].fillna(0)

        # default spread expressed as a fraction of the player's average
        data['STD_DKFP'] = data['STD_DKFP'].fillna(DEFAULT_STD *
                                                   data['AVG_DKFP'])

        data['L1_OWNERSHIP'] = data['L1_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])
        data['MA5_OWNERSHIP'] = data['MA5_OWNERSHIP'].fillna(
            data['AVG_OWNERSHIP'])

        data['AVG_DvP'] = data['AVG_DvP'].fillna(0)

        data['TOTAL'] = data['TOTAL'].fillna(data['TOTAL'].mean())
        data['POINTSPREAD'] = data['POINTSPREAD'].fillna(0)

        data['L1P_SB_COUNT'] = data['L1P_SB_COUNT'].fillna(0)
        data['L3P_SB_COUNT'] = data['L3P_SB_COUNT'].fillna(0)

        # we can predict ownership for a player as long as AVG_OWNERSHIP is not nan
        data = data.dropna(subset=['AVG_OWNERSHIP'])

        return data

    def train_model(self):
        """Fit the model on the training rows.

        Raises:
            RuntimeError: if ``create_features`` has not been run.
        """
        if not self.created_features:
            raise RuntimeError('Must create features before training model')

        X = self.train_data[self.regressors]
        y = self.train_data[self.regressand]
        self.model.fit(X, y, test_size=0.25, early_stopping_rounds=25)

        self.trained_model = True

    def predict(self):
        """Predict the regressand for ``test_data``.

        The predictions are also stored on ``self.test_data`` in the
        ``'<regressand>_HAT'`` column.

        Returns:
            Tuple ``(frame, output_column)`` where ``frame`` has ``DATE``,
            ``SLATEID``, ``NAME`` and the prediction column, and
            ``output_column`` is that column's name.

        Raises:
            RuntimeError: if ``train_model`` has not been run.
        """
        if not self.trained_model:
            raise RuntimeError('Must train model before generating predictions')

        output_column = '{}_HAT'.format(self.regressand)

        self.test_data[output_column] = self.model.predict(
            self.test_data[self.regressors])

        return self.test_data[['DATE', 'SLATEID', 'NAME',
                               output_column]], output_column