def wrangle_basic(p_raw, p_wrangled, filename):
    """Basic Wrangle: converts fieldnames, optimizes datatypes and persists data.

    Args:
        p_raw: Path of the directory containing the raw csv file.
        p_wrangled: Path of the directory to write the wrangled csv file to.
        filename: name of the csv file to wrangle.
    """
    # wrangled files are persisted with lower-case names
    filename_lower = str(filename).lower()
    wrangled_file = p_wrangled.joinpath(filename_lower)
    if wrangled_file.exists():
        # fix: the original f-string had no placeholder and logged the
        # literal text "(unknown)" instead of the actual filename
        logger.info(f'Skipping wrangle of {filename} - already performed')
        return

    os.chdir(p_raw)
    df = pd.read_csv(filename)
    df.rename(columns=get_fieldname_mapping(), inplace=True)
    df.columns = df.columns.str.lower()

    # downcast integers and convert float to Int64, if data permits
    dh.optimize_df_dtypes(df)
    msg = dh.df_info(df)
    # fix: interpolate the filename (original logged literal "(unknown)")
    logger.info(f'{filename}\n{msg}')

    # persist with optimized datatypes
    os.chdir(p_wrangled)
    dh.to_csv_with_types(df, wrangled_file)
def wrangle_fielding(p_raw, p_wrangled):
    """Drops cols > 90% null, converts fieldnames, optimizes datatypes and persists data.

    Args:
        p_raw: Path of the directory containing Fielding.csv.
        p_wrangled: Path of the directory to write the wrangled csv file to.
    """
    # nothing to do if the wrangled output already exists
    if p_wrangled.joinpath('fielding.csv').exists():
        logger.info('Skipping wrangle of Fielding.csv - already performed')
        return

    os.chdir(p_raw)
    fielding = pd.read_csv('Fielding.csv')

    # standardize the field names, then lower-case them
    fielding.rename(columns=get_fieldname_mapping(), inplace=True)
    fielding.columns = fielding.columns.str.lower()

    # drop any column that is more than 90% null
    mostly_null = fielding.isna().mean() > 0.90
    if mostly_null.any():
        drop_cols = fielding.columns[mostly_null]
        logger.warning(
            f'Cols > 90% missing being dropped: {" ".join(drop_cols)}')
        fielding.drop(drop_cols, axis=1, inplace=True)

    # downcast datatypes where the data permits, then log the result
    dh.optimize_df_dtypes(fielding)
    msg = dh.df_info(fielding)
    logger.info('fielding\n{}'.format(msg))

    # persist
    os.chdir(p_wrangled)
    dh.to_csv_with_types(fielding, 'fielding.csv')
def create_batting(player_game, game_start, p_retrosheet_wrangled):
    """Create batting.csv for batting attributes per player per game."""
    # batting attribute columns all carry the b_ prefix
    b_cols = [col for col in player_game.columns if col.startswith('b_')]

    # Note: any player who is in a game in any role, will have b_g = 1
    # even if b_pa == 0 (no plate appearances)

    pkey = ['game_id', 'player_id']  # uniquely identifies a record
    fkey = ['team_id']               # joins to other "tables"
    batting = player_game.loc[:, pkey + fkey + b_cols].copy()

    # strip the b_ prefix from the column names, then patch the
    # few names that need special handling
    renames = {col: col[2:] for col in b_cols}
    renames.update({
        'b_2b': 'double',   # 2b/3b are not valid identifiers
        'b_3b': 'triple',
        'b_gdp': 'gidp',    # to match Lahman
        'b_hp': 'hbp',      # to match Lahman
    })
    batting.rename(columns=renames, inplace=True)

    # add game_start.dt.year as many queries use year
    batting = pd.merge(batting, game_start[['game_id', 'game_start']])
    batting['year'] = batting['game_start'].dt.year.astype('int16')

    dh.optimize_df_dtypes(batting, ignore=['year'])
    logger.info('Writing and compressing batting. This could take several minutes ...')
    dh.to_csv_with_types(batting, p_retrosheet_wrangled / 'batting.csv.gz')
def create_fielding(player_game, game_start, p_retrosheet_wrangled):
    """Create fielding.csv for fielding attributes per player per game.

    For each record created by cwdaily, create up to 9 new records, one per
    position.  Each record will temporarily go in its own dataframe and then
    be concatenated.  Each dataframe has the same columns.
    """
    # column names for fielding attributes
    # column name pattern: f_{pos}_{stat}
    # pos: P, C, 1B, 2B, 3B, SS, LF, CF, RF
    f_cols = [col for col in player_game.columns if col.startswith('f_')]

    # orig_cols maps fielder's pos to original fielding column names
    # new_cols maps fielder's pos to new fielding column names
    orig_cols = collections.defaultdict(list)
    new_cols = collections.defaultdict(list)
    for col in f_cols:
        match = re.search(r'f_(\w{1,2})_(\w*)', col)
        pos = match.group(1)
        stat = match.group(2)
        orig_cols[pos].append(col)
        stat = stat.replace('out', 'inn_outs')  # to match Lahman
        new_cols[pos].append(stat)

    # full pkey will be: ['game_id', 'player_id', 'pos']
    pkey = ['game_id', 'player_id']
    # fields to join to other "tables"
    fkey = ['team_id']

    dfs = []
    for pos in orig_cols.keys():
        # if all fielding attributes for this pos are 0 then the player did not play that pos
        # note: all attributes are unsigned integers
        f_filt = player_game[orig_cols[pos]].sum(axis=1) == 0

        df = pd.DataFrame()
        df[pkey + fkey + new_cols[pos]] = \
            player_game.loc[~f_filt, pkey + fkey + orig_cols[pos]].copy()

        # add the position column to the df
        # use upper case to match Lahman position values
        df.insert(2, 'pos', pos.upper())

        # orig_cols['c'] has pb and xi columns; all other positions do not,
        # so add zero-filled pb/xi to keep every dataframe's columns identical
        if pos != 'c':
            # fix: these were needless f-strings (f'pb', f'xi') with no placeholders
            df['pb'] = 0
            df['xi'] = 0
        dfs.append(df)
    fielding = pd.concat(dfs, ignore_index=True)

    # add game_start.dt.year as many queries use year
    fielding = pd.merge(fielding, game_start[['game_id', 'game_start']])
    fielding['year'] = fielding['game_start'].dt.year.astype('int16')

    dh.optimize_df_dtypes(fielding, ignore=['year'])
    logger.info('Writing and compressing fielding. This could take several minutes ...')
    dh.to_csv_with_types(fielding, p_retrosheet_wrangled / 'fielding.csv.gz')
def create_pitching(player_game, game_start, p_retrosheet_wrangled):
    """Create pitching.csv for pitching attributes per player per game."""
    # pitching attribute columns all carry the p_ prefix
    p_cols = [col for col in player_game.columns if col.startswith('p_')]

    # a player pitched only if some pitching attribute is non-zero
    # note: all attributes are unsigned integers, so if their sum is zero, all are zero
    did_not_pitch = player_game[p_cols].sum(axis=1) == 0

    pkey = ['game_id', 'player_id']  # uniquely identifies a record
    fkey = ['team_id']               # joins to other "tables"

    # keep only the rows with some non-zero pitching attribute
    pitching = player_game.loc[~did_not_pitch, pkey + fkey + p_cols].copy()

    # strip the p_ prefix from the column names, then patch the
    # few names that need special handling
    renames = {col: col[2:] for col in p_cols}
    renames.update({
        'p_2b': 'double',   # 2b/3b are not valid identifiers
        'p_3b': 'triple',
        'p_gdp': 'gidp',    # to match Lahman
        'p_hp': 'hbp',      # to match Lahman
    })
    pitching.rename(columns=renames, inplace=True)

    # add game_start.dt.year as many queries use year
    pitching = pd.merge(pitching, game_start[['game_id', 'game_start']])
    pitching['year'] = pitching['game_start'].dt.year.astype('int16')

    dh.optimize_df_dtypes(pitching, ignore=['year'])
    logger.info('Writing and compressing pitching. This could take several minutes ...')
    dh.to_csv_with_types(pitching, p_retrosheet_wrangled / 'pitching.csv.gz')
def collect_parsed_files(parse_dir, collect_dir, parser, use_datatypes):
    """Collect all parsed files into one dataframe, optimize datatypes and persist.

    Args:
        parse_dir: directory containing the per-file parser output.
        collect_dir: directory to write the collected csv.gz file to.
        parser: one of 'cwdaily', 'cwgame', 'cwevent'.
        use_datatypes: if True, read with precomputed datatypes to save RAM.

    Raises:
        ValueError: if parser is not recognized.
    """
    os.chdir(parse_dir)

    # read the augmented files, not the ones created by cwevent
    if parser == 'cwevent':
        pattern = f'{parser}*_plus.csv'
    else:
        pattern = f'{parser}*.csv'
    dailyfiles = sorted(glob.glob(pattern))

    logger.info(
        f'Collecting {len(dailyfiles)} {parser} parsed csv files into single dataframe ...'
    )

    if use_datatypes:
        # this can save gigabytes of RAM by using precomputed datatypes
        logger.info('Using precomputed data types')
        type_files = {
            'cwdaily': '../player_game_types.csv',
            'cwgame': '../game_types.csv',
            'cwevent': '../event_types.csv',
        }
        if parser not in type_files:
            raise ValueError(f'Unrecognized parser: {parser}')
        dates, dtypes = dh.read_types(type_files[parser])
        # parser output uses upper-case column names
        dtypes = {key.upper(): value for key, value in dtypes.items()}
        df = pd.concat(
            (pd.read_csv(f, parse_dates=dates, dtype=dtypes) for f in dailyfiles),
            ignore_index=True, copy=False)
        logger.info(f'Optimized Memory Usage: {dh.mem_usage(df)}')
    else:
        # This could use twice the RAM required to hold the unoptimized DataFrame!
        # cwgame parser will output the line score (line_tx) like: 001001001
        # but without double quotes around it, so it gets interpreted as a number.
        # Specify dtype for line score fields to get around this.
        df = pd.concat(
            (pd.read_csv(f, dtype={'AWAY_LINE_TX': str, 'HOME_LINE_TX': str})
             for f in dailyfiles),
            ignore_index=True, copy=False)
        logger.info(f'Unoptimized Memory Usage: {dh.mem_usage(df)}')
        logger.info('Optimizing Data Types to reduce memory ...')
        # for cwdaily, optimize_df_dtypes reduces the size of the dataframe by a factor of 3
        dh.optimize_df_dtypes(df)
        logger.info(f'Optimized Memory Usage: {dh.mem_usage(df)}')

    # convert to lower case
    df.columns = df.columns.str.lower()

    # drop any column that is more than 99% null
    mostly_null = df.isna().mean() > 0.99
    if mostly_null.any():
        drop_cols = df.columns[mostly_null]
        logger.warning(
            f'Cols > 99% missing being dropped: {" ".join(drop_cols)}')
        df.drop(drop_cols, axis=1, inplace=True)

    # persist optimized dataframe
    # gzip chosen over xz because this runs on client computer and gzip is faster
    logger.info(
        'persisting dataframe using compression - this could take several minutes ...'
    )
    os.chdir(collect_dir)
    out_names = {
        'cwdaily': 'player_game.csv.gz',
        'cwgame': 'game.csv.gz',
        # cwevent was wrangled in parser to save RAM, write to wrangled dir
        'cwevent': 'event.csv.gz',
    }
    if parser not in out_names:
        raise ValueError(f'Unrecognized parser: {parser}')
    dh.to_csv_with_types(df, out_names[parser])
    logger.info(f'{parser} data persisted')
def wrangle_game(game, p_retrosheet_wrangled):
    """Tidy the Game Data.

    There are 3 types of data:

    * data specific to a game -- the 'game' columns below
    * data specific to the home team for that game -- the 'home' columns below
    * data specific to the away team for that game -- the 'away' columns below

    The attributes for the home team are identical to the attributes for the
    away team.  This suggests breaking this out into 2 csv files.

    1. team_game.csv with key (game_id, team_id) -- stats per team per game
       (e.g. runs scored)
    2. game.csv with key (game_id) -- stats per game (e.g. attendance)

    Args:
        game: dataframe of per-game data (cwgame output, see the code
            documentation links in the comments below).
        p_retrosheet_wrangled: Path of directory to write the csv.gz files to.

    Returns:
        dataframe with columns ['game_id', 'game_start'] so callers can add
        the game date to other tables.
    """
    # split the columns into the three groups described above
    home_cols = [col for col in game.columns if col.startswith('home')]
    away_cols = [col for col in game.columns if col.startswith('away')]
    game_cols = [col for col in game.columns
                 if not col.startswith('home') and not col.startswith('away')]

    game_tidy = game[game_cols].copy()
    home_team_game = game[['game_id'] + home_cols].copy()
    away_team_game = game[['game_id'] + away_cols].copy()

    # the home team bats last; record this so home/away can be reconstructed
    # after the two per-team frames are concatenated
    home_team_game['bat_last'] = True
    away_team_game['bat_last'] = False
    home_team_game = dh.move_column_after(home_team_game, 'game_id', 'bat_last')
    away_team_game = dh.move_column_after(away_team_game, 'game_id', 'bat_last')

    # remove leading 'home_' and 'away_' prefix from fields so both frames
    # end up with identical column names
    home_team_game.rename(
        columns=lambda col: col[5:] if col.startswith('home_') else col,
        inplace=True)
    away_team_game.rename(
        columns=lambda col: col[5:] if col.startswith('away_') else col,
        inplace=True)

    # include opponent team_id in each row
    home_team_game.insert(4, 'opponent_team_id', away_team_game['team_id'])
    away_team_game.insert(4, 'opponent_team_id', home_team_game['team_id'])
    team_game = pd.concat([home_team_game, away_team_game])

    # improve column names: drop the '_ct' suffix
    names = {col: col.replace('_ct', '')
             for col in team_game.columns if col.endswith('_ct')}

    # handle invalid identifiers (2b/3b cannot start with a digit)
    names['2b_ct'] = 'double'
    names['3b_ct'] = 'triple'

    # pitcher_ct (number of pitchers) is a good name though, keep it
    names.pop('pitcher_ct')

    # additional fields to rename for consistency (with Lahman abbreviations)
    names['bi_ct'] = 'rbi'
    names['gdp_ct'] = 'gidp'
    names['hits_ct'] = 'h'
    names['hp_ct'] = 'hbp'
    names['err_ct'] = 'e'
    names['score_ct'] = 'r'
    team_game = team_game.rename(columns=names)

    # create new datetime column from the date/time fields
    game_tidy['game_start'] = game_tidy.apply(parse_datetime, axis=1)
    game_tidy = dh.move_column_after(game_tidy, 'game_id', 'game_start')

    # these fields are no longer necessary (folded into game_start)
    game_tidy = game_tidy.drop(['start_game_tm', 'game_dt', 'game_dy'], axis=1)

    # add the game_start column to team_game to simplify queries
    team_game = pd.merge(team_game, game_tidy[['game_id', 'game_start']])
    team_game['year'] = team_game['game_start'].dt.year.astype('int16')

    logger.info('Writing and compressing team_game. This could take several minutes ...')
    dh.optimize_df_dtypes(team_game, ignore=['year'])
    dh.to_csv_with_types(team_game, p_retrosheet_wrangled / 'team_game.csv.gz')

    # convert designated hitter flag to True/False and rename
    game_tidy['dh'] = False
    filt = game_tidy['dh_fl'] == 'T'
    game_tidy.loc[filt, 'dh'] = True
    game_tidy.drop('dh_fl', axis=1, inplace=True)

    # convert impossible attendance values (<= 0) to null and rename
    filt = game_tidy['attend_park_ct'] <= 0
    impossible_values = game_tidy.loc[filt, 'attend_park_ct'].unique()
    game_tidy['attendance'] = \
        game_tidy['attend_park_ct'].replace(impossible_values, np.nan)
    game_tidy.drop('attend_park_ct', axis=1, inplace=True)

    # convert impossible temperature values (<= 0) to null and rename
    filt = game_tidy['temp_park_ct'] <= 0
    impossible_values = game_tidy.loc[filt, 'temp_park_ct'].unique()
    game_tidy['temperature'] = \
        game_tidy['temp_park_ct'].replace(impossible_values, np.nan)
    game_tidy.drop('temp_park_ct', axis=1, inplace=True)

    # replace code values with strings; 'unknown' becomes null
    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-winddirection
    direction = {
        0: 'unknown', 1: 'to_lf', 2: 'to_cf', 3: 'to_rf', 4: 'l_to_r',
        5: 'from_lf', 6: 'from_cf', 7: 'from_rf', 8: 'r_to_l'}
    game_tidy['wind_direction'] = \
        game_tidy['wind_direction_park_cd'].map(direction).replace('unknown', np.nan)
    game_tidy.drop('wind_direction_park_cd', axis=1, inplace=True)

    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-windspeed
    # convert impossible wind speed values (< 0) to null and rename
    filt = game_tidy['wind_speed_park_ct'] < 0
    impossible_values = game_tidy.loc[filt, 'wind_speed_park_ct'].unique()
    game_tidy['wind_speed'] = \
        game_tidy['wind_speed_park_ct'].replace(impossible_values, np.nan)
    game_tidy.drop('wind_speed_park_ct', axis=1, inplace=True)

    # replace code values with strings; 'unknown' becomes null
    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-fieldcondition
    condition = {
        0: 'unknown', 1: 'soaked', 2: 'wet', 3: 'damp', 4: 'dry'}
    game_tidy['field_condition'] = \
        game_tidy['field_park_cd'].map(condition).replace('unknown', np.nan)
    game_tidy.drop('field_park_cd', axis=1, inplace=True)

    # replace code values with strings; 'unknown' becomes null
    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-precipitation
    precip = {
        0: 'unknown', 1: 'none', 2: 'drizzle', 3: 'showers', 4: 'rain', 5: 'snow'}
    game_tidy['precip_type'] = \
        game_tidy['precip_park_cd'].map(precip).replace('unknown', np.nan)
    game_tidy.drop('precip_park_cd', axis=1, inplace=True)

    # replace code values with strings; 'unknown' becomes null
    # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-sky
    sky = {
        0: 'unknown', 1: 'sunny', 2: 'cloudy', 3: 'overcast', 4: 'night', 5: 'dome'}
    game_tidy['sky_condition'] = \
        game_tidy['sky_park_cd'].map(sky).replace('unknown', np.nan)
    game_tidy.drop('sky_park_cd', axis=1, inplace=True)

    logger.info('Writing and compressing game. This could take several minutes ...')
    dh.optimize_df_dtypes(game_tidy)
    dh.to_csv_with_types(game_tidy, p_retrosheet_wrangled / 'game.csv.gz')

    # to add game date to other tables
    return game_tidy[['game_id', 'game_start']]