import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

# create_team_lagged_games, identify_first_sequence_games_missed,
# create_stat_out_lag_variables, create_stat_defopp_lag_variables,
# identify_nl_plays, detail_loc_mapping, normalize_locs,
# calculate_def_window_sos, calculate_def_ftp_window_sos,
# calc_exp_state_pts_by_year, and smooth_exp_states_values are assumed to be
# defined elsewhere in this repo.


def find_missing_player_observations(player_results_df):
    # Work on a copy so the rename below does not hit a view of the caller's
    # frame (the original renamed a slice in place).
    player_gid = player_results_df[['player', 'gid', 'team', 'year', 'wk']].copy()
    player_gid = player_gid.rename(columns={'gid': 'gid_player'})
    team_lag_games = create_team_lagged_games(player_results_df)
    use_cpu = cpu_count()
    pool = Pool(use_cpu)
    player_gid = player_gid.sort_values(['team', 'year', 'player'],
                                        ascending=True)
    # One work item per chunk of players, paired with only the lagged team
    # games those players can appear in.
    player_input_list = []
    for data_chunk in chunks(player_gid['player'].unique(), use_cpu):
        temp_player_df = player_gid[player_gid['player'].isin(data_chunk)]
        player_input_list.append([
            temp_player_df,
            team_lag_games[team_lag_games['team'].isin(
                temp_player_df['team'].unique())]
        ])
    df_pool = pool.map(identify_first_sequence_games_missed, player_input_list)
    player_missing_games = (pd.concat(df_pool)
                            .drop(['gid(t-1)', 'gid_player', 'wk', 'year'],
                                  axis=1)
                            .fillna(0))
    pool.close()
    pool.join()
    return player_missing_games

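# Every pool helper in this module splits its work with a chunks() utility.
# The real implementation lives elsewhere in the repo; a minimal stand-in,
# assuming it yields positional slices of any sliceable sequence (which also
# covers the DataFrame passed to it in calculate_exp_points), would be:
def chunks(seq, n):
    """Yield n roughly equal positional slices of seq (list, array, Index, or DataFrame)."""
    size = max(1, -(-len(seq) // n))  # ceiling division
    for i in range(0, len(seq), size):
        yield seq[i:i + size]
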
def assign_lagged_out_vars(player_results_df, n_games_):
    df = player_results_df.copy()
    # Columns carrying the per-game "_out" stats to be lagged.
    out_game_stats = [col for col in df.columns if '_out' in col]
    use_cpu = cpu_count()
    pool = Pool(use_cpu)
    df = df[['team', 'gid'] + out_game_stats].drop_duplicates()
    unique_teams = list(df['team'].unique())
    df = df.sort_values('gid', ascending=True).reset_index(drop=True)
    # One work item per chunk of teams.
    team_input_list = []
    for data_chunk in chunks(unique_teams, use_cpu):
        team_input_list.append([df[df['team'].isin(data_chunk)], data_chunk,
                                out_game_stats, n_games_])
    df_pool = pool.map(create_stat_out_lag_variables, team_input_list)
    df_concat = pd.concat(df_pool)
    pool.close()
    pool.join()
    player_results_df = pd.merge(player_results_df, df_concat,
                                 on=['team', 'gid'], how='left')
    return player_results_df

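# create_stat_out_lag_variables is defined elsewhere in the repo. Purely as
# an illustration of the assumed contract (a single [df, teams, cols, n]
# list in, a frame with lagged columns out), a hypothetical sketch built on
# groupby().shift() could look like this; the "(t-k)" column naming follows
# the "gid(t-1)" convention seen in find_missing_player_observations:
def create_stat_out_lag_variables_sketch(args):
    df, team_chunk, out_game_stats, n_games_ = args
    df = df.sort_values('gid').copy()
    for lag in range(1, n_games_ + 1):
        for col in out_game_stats:
            # One lagged copy of each "_out" column per team, per lag.
            df[f'{col}(t-{lag})'] = df.groupby('team')[col].shift(lag)
    return df
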
def calculate_opp_lags(def_sos_merged, n_games_):
    def_sos_index_cols = ['year', 'wk', 'team']
    # Every non-index column is a defensive stat to lag.
    def_sos_cols = [col for col in def_sos_merged.columns
                    if col not in def_sos_index_cols]
    df = def_sos_merged.copy()
    use_cpu = cpu_count()
    pool = Pool(use_cpu)
    df = df.sort_values(['year', 'wk'], ascending=True)
    unique_teams = list(df['team'].unique())
    team_input_list = []
    for data_chunk in chunks(unique_teams, use_cpu):
        team_input_list.append([
            df[df['team'].isin(data_chunk)], data_chunk,
            def_sos_cols, def_sos_index_cols, n_games_
        ])
    # Fan the per-team work out across the pool (not the builtin map(),
    # which would leave the worker pool idle).
    df_pool = pool.map(create_stat_defopp_lag_variables, team_input_list)
    df_concat = pd.concat(df_pool)
    pool.close()
    pool.join()
    # Prefix everything past the three index columns with 'opp_var_'.
    def_cols = df_concat.columns[3:]
    def_cols_rename = {col: 'opp_var_' + col for col in def_cols}
    df_concat = df_concat.rename(columns=def_cols_rename)
    return df_concat

def identify_nl_plays_pool(pbp_df):
    use_cpu = cpu_count()
    pool = Pool(use_cpu)
    # Rows flagged 'NL' in the loc column get re-identified in parallel.
    nl_index = pbp_df[pbp_df['loc'] == 'NL'].index
    loc_nl_list = []
    for nl_chunk in chunks(nl_index, use_cpu):
        loc_nl_list.append([pbp_df[pbp_df.index.isin(nl_chunk)],
                            detail_loc_mapping])
    df_pool = pool.map(identify_nl_plays, loc_nl_list)
    df_concat = pd.concat(df_pool).fillna(0)
    pool.close()
    pool.join()
    # Stack the repaired NL rows back under the untouched rows (row-wise;
    # the two frames cover disjoint index ranges, so a column-wise concat
    # would only produce NaN padding).
    pbp_df = pd.concat((pbp_df[~pbp_df.index.isin(nl_index)], df_concat),
                       axis=0)
    return pbp_df

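# identify_nl_plays itself lives elsewhere in the repo. A hypothetical
# stand-in for its assumed contract ([nl_rows, mapping] in, the same rows
# with loc resolved out); the 'detail' column used for the lookup is an
# assumption, not something this module shows:
def identify_nl_plays_sketch(args):
    nl_rows, loc_mapping = args
    nl_rows = nl_rows.copy()
    # Map a play-detail string to a concrete location label.
    nl_rows['loc'] = nl_rows['detail'].map(loc_mapping)
    return nl_rows
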
def normalize_def_stats(def_sos_merged, def_cols, year_wk_dict):
    use_cpu = cpu_count()
    pool = Pool(use_cpu)
    def_sos_normalized_input = []
    # One work item per chunk of (year, week) keys; list() so the dict view
    # can be sliced.
    for year_wk_chunk in chunks(list(year_wk_dict.keys()), use_cpu):
        yr_wk_values = [year_wk_dict[a] for a in year_wk_chunk]
        year_wk_chunk_dict = dict(zip(year_wk_chunk, yr_wk_values))
        def_sos_normalized_input.append([
            def_sos_merged[def_cols + ['team', 'wk', 'year']],
            year_wk_chunk_dict
        ])
    df_pool = pool.map(normalize_locs, def_sos_normalized_input)
    df_concat = pd.concat(df_pool)
    pool.close()
    pool.join()
    return df_concat

def calculate_def_window_sos_pool(pbp_data_team_def, year_wk_dict,
                                  team_def_stats):
    use_cpu = cpu_count()
    pool = Pool(use_cpu)
    def_sos_input = []
    # One work item per chunk of (year, week) keys.
    for year_wk_chunk in chunks(list(year_wk_dict.keys()), use_cpu):
        yr_wk_values = [year_wk_dict[a] for a in year_wk_chunk]
        year_wk_chunk_dict = dict(zip(year_wk_chunk, yr_wk_values))
        def_sos_input.append(
            [pbp_data_team_def, year_wk_chunk_dict, team_def_stats])
    df_pool = pool.map(calculate_def_window_sos, def_sos_input)
    df_concat = pd.concat(df_pool).fillna(0)
    pool.close()
    pool.join()
    return df_concat

def calculate_def_ftp_window_sos_pool(pbp_data, player_results_df,
                                      year_wk_dict, ftps_stats,
                                      groupby_off_cols, index_cols,
                                      rename_cols, start_yr=2010):
    posd_cols = [col for col in player_results_df.columns if 'posd_' in col]
    # Attach team/opponent/position-depth info to each play, then roll the
    # dk/fd differentials up to the game/location/team level.
    pbp_data_team = pd.merge(
        pbp_data[['gid', 'seas_wk', 'player', 'loc', 'dk_diff', 'fd_diff']],
        player_results_df[['gid', 'player', 'team', 'opp'] + posd_cols],
        on=['gid', 'player'], how='left'
    ).groupby(['gid', 'seas_wk', 'loc', 'team', 'opp'])[
        ['dk_diff', 'fd_diff']].sum().reset_index()
    pbp_data_team = pbp_data_team[pbp_data_team['seas_wk'] >= start_yr - 1]
    # seas_wk encodes year.week as a float (e.g. 2010.05); right-pad so the
    # split always yields a two-digit week, then recover integer year/wk.
    pbp_data_team[['year', 'wk']] = (
        pbp_data_team['seas_wk'].astype(str)
        .str.pad(7, side='right', fillchar='0')
        .str.split('.', expand=True).astype(int))
    use_cpu = cpu_count()
    pool = Pool(use_cpu)
    def_sos_input = []
    for year_wk_chunk in chunks(list(year_wk_dict.keys()), use_cpu):
        yr_wk_values = [year_wk_dict[a] for a in year_wk_chunk]
        year_wk_chunk_dict = dict(zip(year_wk_chunk, yr_wk_values))
        def_sos_input.append([
            pbp_data_team, year_wk_chunk_dict, ftps_stats,
            groupby_off_cols, index_cols, rename_cols, posd_cols
        ])
    df_pool = pool.map(calculate_def_ftp_window_sos, def_sos_input)
    df_concat = pd.concat(df_pool)
    pool.close()
    pool.join()
    # Second pass: normalize the windowed stats by location.
    pool = Pool(use_cpu)
    def_sos_normalized_input = []
    for year_wk_chunk in chunks(list(year_wk_dict.keys()), use_cpu):
        yr_wk_values = [year_wk_dict[a] for a in year_wk_chunk]
        year_wk_chunk_dict = dict(zip(year_wk_chunk, yr_wk_values))
        def_sos_normalized_input.append([df_concat, year_wk_chunk_dict])
    df_pool = pool.map(normalize_locs, def_sos_normalized_input)
    df_concat = pd.concat(df_pool)
    pool.close()
    pool.join()
    return df_concat

def calculate_exp_points(pbp_data, n_periods_):
    use_cpu = cpu_count()
    pool = Pool(use_cpu)
    # Every distinct game state: down, yards to go, yards from own goal.
    unique_states = (pbp_data[['dwn', 'ytg', 'yfog']]
                     .drop_duplicates().reset_index(drop=True))
    unique_states = unique_states.sort_values('dwn', ascending=False)
    unique_states_list = []
    for state_chunk in chunks(unique_states, use_cpu):
        dwn_list = list(state_chunk['dwn'].unique())
        unique_states_list.append([
            pbp_data[(pbp_data['dwn'].isin(dwn_list)) &
                     (pbp_data['dk_ftps_field_bin'] >= -.10)],
            state_chunk, n_periods_
        ])
    df_pool = pool.map(calc_exp_state_pts_by_year, unique_states_list)
    df_concat = pd.concat(df_pool).fillna(0)
    pool.close()
    pool.join()
    # Join the expected values back onto the plays by game state.
    df_concat = pd.merge(
        pbp_data, df_concat,
        left_on=['seas_wk', 'dwn', 'ytg', 'yfog', 'loc'],
        right_on=['seas_wk', 'dwn_state', 'ytg_state', 'yfog_state', 'loc'],
        how='left')
    df_concat = df_concat.rename(columns={
        'expected_dk_ftps_field_bin': 'dk_expected',
        'expected_fd_ftps_field_bin': 'fd_expected',
        'dk_ftps_field_bin': 'dk_actual',
        'fd_ftps_field_bin': 'fd_actual',
        'value': 'player',
    })
    # Drop the burn-in seasons (n_periods_ counts weeks, 21 per season).
    df_concat = df_concat[df_concat['seas_wk'] > (2000 + (n_periods_ / 21)) - 2]
    df_concat.loc[df_concat['dk_expected'] < 0, 'dk_expected'] = 0
    seas_list = list(df_concat['seas_wk'].unique())
    # Collapse yfog out of the state space and cap ytg at 15 for smoothing.
    unique_states_agg = unique_states.drop('yfog', axis=1)
    unique_states_agg.loc[unique_states_agg['ytg'] > 15., 'ytg'] = 15.
    unique_states_agg = (unique_states_agg.drop_duplicates()
                         .reset_index(drop=True))
    smooth_exp_state_pts = []
    for seas_chunk in chunks(seas_list[21:], use_cpu):
        min_seas_idx = seas_list.index(np.min(seas_chunk))
        smooth_exp_state_pts.append([
            df_concat[df_concat['seas_wk'] > min_seas_idx],
            unique_states_agg, seas_chunk, seas_list
        ])
    pool = Pool(use_cpu)
    df_pool = pool.map(smooth_exp_states_values, smooth_exp_state_pts)
    df_concat = pd.concat(df_pool)
    pool.close()
    pool.join()
    # Actual minus smoothed-expected points per play.
    df_concat.loc[:, 'dk_diff'] = (df_concat['dk_actual'] -
                                   df_concat['dk_expected_smoothed'])
    df_concat.loc[:, 'fd_diff'] = (df_concat['fd_actual'] -
                                   df_concat['fd_expected_smoothed'])
    df_concat = df_concat.drop(['yfog_state', 'ytg_state', 'dwn_state',
                                'dk_expected', 'fd_expected',
                                'dk_expected_mean', 'fd_expected_mean'],
                               axis=1)
    df_concat = df_concat.rename(columns={
        'dk_expected_smoothed': 'dk_expected',
        'fd_expected_smoothed': 'fd_expected'})
    # Roll play-level values up to one row per player-game-location.
    df_concat = df_concat.groupby(['player', 'gid', 'seas_wk', 'loc'])[
        ['dk_expected', 'fd_expected', 'dk_actual', 'fd_actual',
         'dk_diff', 'fd_diff']].sum().reset_index()
    return df_concat

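# A hypothetical driver sketch; player_results.csv is a placeholder path, not
# a file this repo is known to ship. multiprocessing pools must be started
# from a guarded entry point on spawn-based platforms (Windows, macOS), so
# these helpers belong under __main__:
if __name__ == '__main__':
    player_results = pd.read_csv('player_results.csv')  # placeholder input
    missing = find_missing_player_observations(player_results)
    lagged = assign_lagged_out_vars(player_results, n_games_=4)
    print(missing.shape, lagged.shape)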