def wrangle_basic(p_raw, p_wrangled, filename):
    """Basic Wrangle: converts fieldnames, optimizes datatypes and persists data.

    Args:
        p_raw: Path of the directory containing the raw csv file.
        p_wrangled: Path of the directory to write the wrangled csv file to.
        filename: name of the csv file to wrangle.
    """
    # wrangled files are persisted with lower-case names
    filename_lower = str(filename).lower()
    wrangled_file = p_wrangled.joinpath(filename_lower)
    if wrangled_file.exists():
        # fix: the original f-string had no placeholder and logged the
        # literal text "(unknown)" instead of the actual filename
        logger.info(f'Skipping wrangle of {filename} - already performed')
        return

    os.chdir(p_raw)
    df = pd.read_csv(filename)
    df.rename(columns=get_fieldname_mapping(), inplace=True)
    df.columns = df.columns.str.lower()

    # downcast integers and convert float to Int64, if data permits
    dh.optimize_df_dtypes(df)
    msg = dh.df_info(df)
    # fix: interpolate the filename (original logged literal "(unknown)")
    logger.info(f'{filename}\n{msg}')

    # persist with optimized datatypes
    os.chdir(p_wrangled)
    dh.to_csv_with_types(df, wrangled_file)
def wrangle_fielding(p_raw, p_wrangled):
    """Drops cols > 90% null, converts fieldnames, optimizes datatypes and persists data.

    Args:
        p_raw: Path of the directory containing Fielding.csv.
        p_wrangled: Path of the directory to write the wrangled csv file to.
    """
    # nothing to do if the wrangled output already exists
    if p_wrangled.joinpath('fielding.csv').exists():
        logger.info('Skipping wrangle of Fielding.csv - already performed')
        return

    os.chdir(p_raw)
    fielding = pd.read_csv('Fielding.csv')

    # standardize the field names, then lower-case them
    fielding.rename(columns=get_fieldname_mapping(), inplace=True)
    fielding.columns = fielding.columns.str.lower()

    # drop any column that is more than 90% null
    mostly_null = fielding.isna().mean() > 0.90
    if mostly_null.any():
        drop_cols = fielding.columns[mostly_null]
        logger.warning(
            f'Cols > 90% missing being dropped: {" ".join(drop_cols)}')
        fielding.drop(drop_cols, axis=1, inplace=True)

    # downcast datatypes where the data permits, then log the result
    dh.optimize_df_dtypes(fielding)
    msg = dh.df_info(fielding)
    logger.info('fielding\n{}'.format(msg))

    # persist
    os.chdir(p_wrangled)
    dh.to_csv_with_types(fielding, 'fielding.csv')
def create_batting(player_game, game_start, p_retrosheet_wrangled):
    """Create batting.csv for batting attributes per player per game."""
    # batting attribute columns all carry the b_ prefix
    b_cols = [col for col in player_game.columns if col.startswith('b_')]

    # Note: any player who is in a game in any role, will have b_g = 1
    # even if b_pa == 0 (no plate appearances)

    pkey = ['game_id', 'player_id']  # uniquely identifies a record
    fkey = ['team_id']               # joins to other "tables"
    batting = player_game.loc[:, pkey + fkey + b_cols].copy()

    # strip the b_ prefix from the column names, then patch the
    # few names that need special handling
    renames = {col: col[2:] for col in b_cols}
    renames.update({
        'b_2b': 'double',   # 2b/3b are not valid identifiers
        'b_3b': 'triple',
        'b_gdp': 'gidp',    # to match Lahman
        'b_hp': 'hbp',      # to match Lahman
    })
    batting.rename(columns=renames, inplace=True)

    # add game_start.dt.year as many queries use year
    batting = pd.merge(batting, game_start[['game_id', 'game_start']])
    batting['year'] = batting['game_start'].dt.year.astype('int16')

    dh.optimize_df_dtypes(batting, ignore=['year'])
    logger.info('Writing and compressing batting. This could take several minutes ...')
    dh.to_csv_with_types(batting, p_retrosheet_wrangled / 'batting.csv.gz')
def create_fielding(player_game, game_start, p_retrosheet_wrangled):
    """Create fielding.csv for fielding attributes per player per game.

    For each record created by cwdaily, create up to 9 new records, one per
    position.  Each record will temporarily go in its own dataframe and then
    be concatenated.  Each dataframe has the same columns.
    """
    # column names for fielding attributes
    # column name pattern: f_{pos}_{stat}
    # pos: P, C, 1B, 2B, 3B, SS, LF, CF, RF
    f_cols = [col for col in player_game.columns if col.startswith('f_')]

    # orig_cols maps fielder's pos to original fielding column names
    # new_cols maps fielder's pos to new fielding column names
    orig_cols = collections.defaultdict(list)
    new_cols = collections.defaultdict(list)
    for col in f_cols:
        match = re.search(r'f_(\w{1,2})_(\w*)', col)
        pos = match.group(1)
        stat = match.group(2)
        orig_cols[pos].append(col)
        stat = stat.replace('out', 'inn_outs')  # to match Lahman
        new_cols[pos].append(stat)

    # full pkey will be: ['game_id', 'player_id', 'pos']
    pkey = ['game_id', 'player_id']
    # fields to join to other "tables"
    fkey = ['team_id']

    dfs = []
    for pos in orig_cols.keys():
        # if all fielding attributes for this pos are 0 then the player did not play that pos
        # note: all attributes are unsigned integers
        f_filt = player_game[orig_cols[pos]].sum(axis=1) == 0

        df = pd.DataFrame()
        df[pkey + fkey + new_cols[pos]] = \
            player_game.loc[~f_filt, pkey + fkey + orig_cols[pos]].copy()

        # add the position column to the df
        # use upper case to match Lahman position values
        df.insert(2, 'pos', pos.upper())

        # orig_cols['c'] has pb and xi columns; all other positions do not,
        # so add zero-filled pb/xi to keep every dataframe's columns identical
        if pos != 'c':
            # fix: these were needless f-strings (f'pb', f'xi') with no placeholders
            df['pb'] = 0
            df['xi'] = 0
        dfs.append(df)
    fielding = pd.concat(dfs, ignore_index=True)

    # add game_start.dt.year as many queries use year
    fielding = pd.merge(fielding, game_start[['game_id', 'game_start']])
    fielding['year'] = fielding['game_start'].dt.year.astype('int16')

    dh.optimize_df_dtypes(fielding, ignore=['year'])
    logger.info('Writing and compressing fielding. This could take several minutes ...')
    dh.to_csv_with_types(fielding, p_retrosheet_wrangled / 'fielding.csv.gz')
def create_pitching(player_game, game_start, p_retrosheet_wrangled):
    """Create pitching.csv for pitching attributes per player per game."""
    # pitching attribute columns all carry the p_ prefix
    p_cols = [col for col in player_game.columns if col.startswith('p_')]

    # a player pitched only if some pitching attribute is non-zero
    # note: all attributes are unsigned integers, so if their sum is zero, all are zero
    did_not_pitch = player_game[p_cols].sum(axis=1) == 0

    pkey = ['game_id', 'player_id']  # uniquely identifies a record
    fkey = ['team_id']               # joins to other "tables"

    # keep only the rows with some non-zero pitching attribute
    pitching = player_game.loc[~did_not_pitch, pkey + fkey + p_cols].copy()

    # strip the p_ prefix from the column names, then patch the
    # few names that need special handling
    renames = {col: col[2:] for col in p_cols}
    renames.update({
        'p_2b': 'double',   # 2b/3b are not valid identifiers
        'p_3b': 'triple',
        'p_gdp': 'gidp',    # to match Lahman
        'p_hp': 'hbp',      # to match Lahman
    })
    pitching.rename(columns=renames, inplace=True)

    # add game_start.dt.year as many queries use year
    pitching = pd.merge(pitching, game_start[['game_id', 'game_start']])
    pitching['year'] = pitching['game_start'].dt.year.astype('int16')

    dh.optimize_df_dtypes(pitching, ignore=['year'])
    logger.info('Writing and compressing pitching. This could take several minutes ...')
    dh.to_csv_with_types(pitching, p_retrosheet_wrangled / 'pitching.csv.gz')
def collect_parsed_files(parse_dir, collect_dir, parser, use_datatypes):
    """Collect all parsed files into one dataframe, optimize datatypes and persist.

    Args:
        parse_dir: directory containing the per-file parser output.
        collect_dir: directory to write the collected csv.gz file to.
        parser: one of 'cwdaily', 'cwgame', 'cwevent'.
        use_datatypes: if True, read with precomputed datatypes to save RAM.

    Raises:
        ValueError: if parser is not recognized.
    """
    os.chdir(parse_dir)

    # read the augmented files, not the ones created by cwevent
    if parser == 'cwevent':
        pattern = f'{parser}*_plus.csv'
    else:
        pattern = f'{parser}*.csv'
    dailyfiles = sorted(glob.glob(pattern))

    logger.info(
        f'Collecting {len(dailyfiles)} {parser} parsed csv files into single dataframe ...'
    )

    if use_datatypes:
        # this can save gigabytes of RAM by using precomputed datatypes
        logger.info('Using precomputed data types')
        type_files = {
            'cwdaily': '../player_game_types.csv',
            'cwgame': '../game_types.csv',
            'cwevent': '../event_types.csv',
        }
        if parser not in type_files:
            raise ValueError(f'Unrecognized parser: {parser}')
        dates, dtypes = dh.read_types(type_files[parser])
        # parser output uses upper-case column names
        dtypes = {key.upper(): value for key, value in dtypes.items()}
        df = pd.concat(
            (pd.read_csv(f, parse_dates=dates, dtype=dtypes) for f in dailyfiles),
            ignore_index=True, copy=False)
        logger.info(f'Optimized Memory Usage: {dh.mem_usage(df)}')
    else:
        # This could use twice the RAM required to hold the unoptimized DataFrame!
        # cwgame parser will output the line score (line_tx) like: 001001001
        # but without double quotes around it, so it gets interpreted as a number.
        # Specify dtype for line score fields to get around this.
        df = pd.concat(
            (pd.read_csv(f, dtype={'AWAY_LINE_TX': str, 'HOME_LINE_TX': str})
             for f in dailyfiles),
            ignore_index=True, copy=False)
        logger.info(f'Unoptimized Memory Usage: {dh.mem_usage(df)}')
        logger.info('Optimizing Data Types to reduce memory ...')
        # for cwdaily, optimize_df_dtypes reduces the size of the dataframe by a factor of 3
        dh.optimize_df_dtypes(df)
        logger.info(f'Optimized Memory Usage: {dh.mem_usage(df)}')

    # convert to lower case
    df.columns = df.columns.str.lower()

    # drop any column that is more than 99% null
    mostly_null = df.isna().mean() > 0.99
    if mostly_null.any():
        drop_cols = df.columns[mostly_null]
        logger.warning(
            f'Cols > 99% missing being dropped: {" ".join(drop_cols)}')
        df.drop(drop_cols, axis=1, inplace=True)

    # persist optimized dataframe
    # gzip chosen over xz because this runs on client computer and gzip is faster
    logger.info(
        'persisting dataframe using compression - this could take several minutes ...'
    )
    os.chdir(collect_dir)
    out_names = {
        'cwdaily': 'player_game.csv.gz',
        'cwgame': 'game.csv.gz',
        # cwevent was wrangled in parser to save RAM, write to wrangled dir
        'cwevent': 'event.csv.gz',
    }
    if parser not in out_names:
        raise ValueError(f'Unrecognized parser: {parser}')
    dh.to_csv_with_types(df, out_names[parser])
    logger.info(f'{parser} data persisted')
def wrangle_game(game, p_retrosheet_wrangled):
    """Tidy the Game Data.

    There are 3 types of data:

    * data specific to a game -- the 'game' columns below
    * data specific to the home team for that game -- the 'home' columns below
    * data specific to the away team for that game -- the 'away' columns below

    The attributes for the home team are identical to the attributes for the
    away team.  This suggests breaking this out into 2 csv files.

    1. team_game.csv with key (game_id, team_id) -- stats per team per game
       (e.g. runs scored)
    2. game.csv with key (game_id) -- stats per game (e.g. attendance)

    Args:
        game: dataframe of per-game data (cwgame output, see the code
            documentation links in the comments below).
        p_retrosheet_wrangled: Path of directory to write the csv.gz files to.

    Returns:
        dataframe with columns ['game_id', 'game_start'] so callers can add
        the game date to other tables.
    """
    # split the columns into the three groups described above
    home_cols = [col for col in game.columns if col.startswith('home')]
    away_cols = [col for col in game.columns if col.startswith('away')]
    game_cols = [col for col in game.columns
                 if not col.startswith('home') and not col.startswith('away')]

    game_tidy = game[game_cols].copy()
    home_team_game = game[['game_id'] + home_cols].copy()
    away_team_game = game[['game_id'] + away_cols].copy()

    # the home team bats last; record this so home/away can be reconstructed
    # after the two per-team frames are concatenated
    home_team_game['bat_last'] = True
    away_team_game['bat_last'] = False
    home_team_game = dh.move_column_after(home_team_game, 'game_id', 'bat_last')
    away_team_game = dh.move_column_after(away_team_game, 'game_id', 'bat_last')

    # remove leading 'home_' and 'away_' prefix from fields so both frames
    # end up with identical column names
    home_team_game.rename(
        columns=lambda col: col[5:] if col.startswith('home_') else col,
        inplace=True)
    away_team_game.rename(
        columns=lambda col: col[5:] if col.startswith('away_') else col,
        inplace=True)

    # include opponent team_id in each row
    home_team_game.insert(4, 'opponent_team_id', away_team_game['team_id'])
    away_team_game.insert(4, 'opponent_team_id', home_team_game['team_id'])
    team_game = pd.concat([home_team_game, away_team_game])

    # improve column names: drop the '_ct' suffix
    names = {col: col.replace('_ct', '')
             for col in team_game.columns if col.endswith('_ct')}

    # handle invalid identifiers (2b/3b cannot start with a digit)
    names['2b_ct'] = 'double'
    names['3b_ct'] = 'triple'

    # pitcher_ct (number of pitchers) is a good name though, keep it
    names.pop('pitcher_ct')

    # additional fields to rename for consistency (with Lahman abbreviations)
    names['bi_ct'] = 'rbi'
    names['gdp_ct'] = 'gidp'
    names['hits_ct'] = 'h'
    names['hp_ct'] = 'hbp'
    names['err_ct'] = 'e'
    names['score_ct'] = 'r'
    team_game = team_game.rename(columns=names)

    # create new datetime column from the date/time fields
    game_tidy['game_start'] = game_tidy.apply(parse_datetime, axis=1)
    game_tidy = dh.move_column_after(game_tidy, 'game_id', 'game_start')

    # these fields are no longer necessary (folded into game_start)
    game_tidy = game_tidy.drop(['start_game_tm', 'game_dt', 'game_dy'], axis=1)

    # add the game_start column to team_game to simplify queries
    team_game = pd.merge(team_game, game_tidy[['game_id', 'game_start']])
    team_game['year'] = team_game['game_start'].dt.year.astype('int16')

    logger.info('Writing and compressing team_game. This could take several minutes ...')
    dh.optimize_df_dtypes(team_game, ignore=['year'])
    dh.to_csv_with_types(team_game, p_retrosheet_wrangled / 'team_game.csv.gz')

    # convert designated hitter flag to True/False and rename
    game_tidy['dh'] = False
    filt = game_tidy['dh_fl'] == 'T'
    game_tidy.loc[filt, 'dh'] = True
    game_tidy.drop('dh_fl', axis=1, inplace=True)

    # convert impossible attendance values (<= 0) to null and rename
    filt = game_tidy['attend_park_ct'] <= 0
    impossible_values = game_tidy.loc[filt, 'attend_park_ct'].unique()
    game_tidy['attendance'] = \
        game_tidy['attend_park_ct'].replace(impossible_values, np.nan)
    game_tidy.drop('attend_park_ct', axis=1, inplace=True)

    # convert impossible temperature values (<= 0) to null and rename
    filt = game_tidy['temp_park_ct'] <= 0
    impossible_values = game_tidy.loc[filt, 'temp_park_ct'].unique()
    game_tidy['temperature'] = \
        game_tidy['temp_park_ct'].replace(impossible_values, np.nan)
    game_tidy.drop('temp_park_ct', axis=1, inplace=True)

    # replace code values with strings; 'unknown' becomes null
    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-winddirection
    direction = {
        0: 'unknown', 1: 'to_lf', 2: 'to_cf', 3: 'to_rf', 4: 'l_to_r',
        5: 'from_lf', 6: 'from_cf', 7: 'from_rf', 8: 'r_to_l'}
    game_tidy['wind_direction'] = \
        game_tidy['wind_direction_park_cd'].map(direction).replace('unknown', np.nan)
    game_tidy.drop('wind_direction_park_cd', axis=1, inplace=True)

    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-windspeed
    # convert impossible wind speed values (< 0) to null and rename
    filt = game_tidy['wind_speed_park_ct'] < 0
    impossible_values = game_tidy.loc[filt, 'wind_speed_park_ct'].unique()
    game_tidy['wind_speed'] = \
        game_tidy['wind_speed_park_ct'].replace(impossible_values, np.nan)
    game_tidy.drop('wind_speed_park_ct', axis=1, inplace=True)

    # replace code values with strings; 'unknown' becomes null
    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-fieldcondition
    condition = {
        0: 'unknown', 1: 'soaked', 2: 'wet', 3: 'damp', 4: 'dry'}
    game_tidy['field_condition'] = \
        game_tidy['field_park_cd'].map(condition).replace('unknown', np.nan)
    game_tidy.drop('field_park_cd', axis=1, inplace=True)

    # replace code values with strings; 'unknown' becomes null
    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-precipitation
    precip = {
        0: 'unknown', 1: 'none', 2: 'drizzle', 3: 'showers', 4: 'rain', 5: 'snow'}
    game_tidy['precip_type'] = \
        game_tidy['precip_park_cd'].map(precip).replace('unknown', np.nan)
    game_tidy.drop('precip_park_cd', axis=1, inplace=True)

    # replace code values with strings; 'unknown' becomes null
    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-sky
    sky = {
        0: 'unknown', 1: 'sunny', 2: 'cloudy', 3: 'overcast', 4: 'night', 5: 'dome'}
    game_tidy['sky_condition'] = \
        game_tidy['sky_park_cd'].map(sky).replace('unknown', np.nan)
    game_tidy.drop('sky_park_cd', axis=1, inplace=True)

    logger.info('Writing and compressing game. This could take several minutes ...')
    dh.optimize_df_dtypes(game_tidy)
    dh.to_csv_with_types(game_tidy, p_retrosheet_wrangled / 'game.csv.gz')

    # to add game date to other tables
    return game_tidy[['game_id', 'game_start']]