Example 1
def get_player_options():
    """Returns list of First Last (DOB)"""
    df = players.get_player_ids_file()[['Name', 'DOB', 'ID']].sort_values('Name')
    names = list(df.Name)
    dobs = list(df.DOB)
    pids = list(df.ID)
    namedob = [{'label': '{0:s} ({1:s})'.format(name, dob), 'value': pid} for name, dob, pid in zip(names, dobs, pids)]
    return namedob
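The label/value dicts above match the shape a Dash dropdown expects, so a typical consumer would look something like this (a minimal sketch; the component id and the fact that this feeds Dash are assumptions, not from the source):

import dash_core_components as dcc

# Hypothetical usage: feed the option dicts straight into a Dropdown
player_dropdown = dcc.Dropdown(id='player-select',  # id is an assumption
                               options=get_player_options())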
Example 2
def drop_duplicate_pairs(rates):
    """
    The shot rates dataframe has duplicates--e.g. in one row Orlov is PlayerID1 and Niskanen PlayerID2, but in
    another Niskanen is PlayerID1 and Orlov is PlayerID2. This method will select only one, using the following rules:

    - For mixed-hand pairs, pick the one where P1 is the lefty and P2 is the righty
    - For other pairs, arrange by PlayerID. The one with the smaller ID is P1 and the larger, P2.

    :param rates: dataframe as created by get_dpair_shot_rates

    :return: dataframe, rates with half of rows dropped
    """

    handedness = players.get_player_ids_file().query('Pos == "D"')[[
        'ID', 'Hand'
    ]]
    rates = rates.merge(handedness.rename(columns={'ID': 'PlayerID1', 'Hand': 'Hand1'})) \
        .merge(handedness.rename(columns={'ID': 'PlayerID2', 'Hand': 'Hand2'}))

    # Drop the R-L orientation of mixed-hand pairs; the L-R copy of each survives below
    rates = rates[~((rates.Hand1 == "R") & (rates.Hand2 == "L"))]

    lr_pairs = rates.query('Hand1 == "L" & Hand2 == "R"')  # Will keep these
    ll_rr_pairs = rates[~((rates.Hand1 == "L") & (rates.Hand2 == "R"))]

    # Melt and arrange, and pick first
    ll_rr_pairs = ll_rr_pairs[['PlayerID1', 'PlayerID2']].assign(PairIndex=1)
    ll_rr_pairs.loc[:, 'PairIndex'] = ll_rr_pairs.PairIndex.cumsum()
    melted = helper.melt_helper(ll_rr_pairs,
                                id_vars='PairIndex',
                                var_name='P1P2',
                                value_name='PlayerID')

    firsts = melted.sort_values(['PairIndex', 'PlayerID']) \
        .groupby('PairIndex', as_index=False) \
        .first() \
        .drop('P1P2', axis=1) \
        .rename(columns={'PlayerID': 'PlayerID1'})
    lasts = melted.sort_values(['PairIndex', 'PlayerID']) \
        .groupby('PairIndex', as_index=False) \
        .last() \
        .drop('P1P2', axis=1) \
        .rename(columns={'PlayerID': 'PlayerID2'})

    joined = firsts.merge(lasts, how='outer', on='PairIndex').drop('PairIndex',
                                                                   axis=1)

    # Inner join back on
    df = pd.concat([lr_pairs,
                    rates.merge(joined, how='inner', on=['PlayerID1', 'PlayerID2'])]) \
        .drop(['Hand1', 'Hand2'], axis=1)

    return df
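A self-contained toy illustration of the two ordering rules above, using made-up IDs and hands rather than the real player-IDs file:

import pandas as pd

# Each pair appears twice, once per orientation (hands already merged on)
pairs = pd.DataFrame({'PlayerID1': [101, 102, 103, 104],
                      'PlayerID2': [102, 101, 104, 103],
                      'Hand1': ['L', 'R', 'L', 'L'],
                      'Hand2': ['R', 'L', 'L', 'L']})

keep_lr = (pairs.Hand1 == 'L') & (pairs.Hand2 == 'R')  # rule 1: lefty first
keep_sorted = (pairs.Hand1 == pairs.Hand2) & (pairs.PlayerID1 < pairs.PlayerID2)  # rule 2: smaller ID first
print(pairs[keep_lr | keep_sorted])  # keeps only (101, 102) and (103, 104)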
Example 3
def _add_xy_names_for_dpair_graph(df, delta_small=0.25, delta_large=0.75):
    """
    X is CF60 and Y is CA60. Pushes PlayerID1 a little to the left and PlayerID2 a little to the right in X. Also
    adds player names.

    :param df: dataframe with CF60 and CA60. This df will be wide.
    :param delta_small: amount to move by, in data coordinates, for LL and RR pairs
    :param delta_large: amount to move by, in data coordinates, for LR pairs. Two deltas are needed because the plot
        uses triangle markers, which are anchored at the vertex opposite the short side rather than at the center of
        the short side.

    :return: dataframe with X and Y and names added on, melted version of original df
    """
    df = df.assign(PairIndex=1)
    df.loc[:, 'PairIndex'] = df.PairIndex.cumsum()

    melted = helper.melt_helper(
        df[['PlayerID1', 'PlayerID2', 'CF60', 'CA60', 'TOI', 'PairIndex']],
        id_vars=['CF60', 'CA60', 'TOI', 'PairIndex'],
        var_name='P1P2',
        value_name='PlayerID')

    handedness = players.get_player_ids_file().query('Pos == "D"')[[
        'ID', 'Hand'
    ]]
    deltadf = df[['PlayerID1', 'PlayerID2', 'PairIndex']] \
        .merge(handedness.rename(columns={'ID': 'PlayerID1', 'Hand': 'Hand1'}), how='left', on='PlayerID1') \
        .merge(handedness.rename(columns={'ID': 'PlayerID2', 'Hand': 'Hand2'}), how='left', on='PlayerID2')
    deltadf.loc[((deltadf.Hand1 == 'L') & (deltadf.Hand2 == 'R')),
                'DeltaReq'] = delta_large
    deltadf.loc[:, 'DeltaReq'] = deltadf.DeltaReq.fillna(delta_small)
    deltadf = deltadf[['PairIndex', 'DeltaReq']]

    melted = melted.merge(deltadf, how='left', on='PairIndex')

    melted.loc[:, 'Name'] = melted.PlayerID.apply(
        lambda x: players.player_as_str(x))

    temp1 = melted[melted.P1P2 == 'PlayerID1'].copy()  # copy to avoid SettingWithCopyWarning
    temp2 = melted[melted.P1P2 == 'PlayerID2'].copy()

    # Push P1 left and P2 right so the two markers flank the pair's actual CF60
    temp1.loc[:, 'X'] = temp1.CF60 - temp1.DeltaReq
    temp2.loc[:, 'X'] = temp2.CF60 + temp2.DeltaReq

    melted = pd.concat([temp1, temp2])
    melted.loc[:, 'Y'] = melted.CA60

    return melted
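To make the delta logic concrete, a quick numeric check under the default arguments (values are made up):

cf60 = 55.0
# An L-R pair uses delta_large: P1 (the lefty) lands at 54.25, P2 at 55.75,
# so the two triangle markers flank the pair's true CF60 of 55.0
x_p1, x_p2 = cf60 - 0.75, cf60 + 0.75
# A same-hand pair uses delta_small instead: 54.75 and 55.25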
Example 4
def get_fline_shot_rates(team, startdate, enddate):
    """
    Gets CF/60 and CA/60 by forward line (5v5 only) for this team over the given date range

    :param team: int or str, team
    :param startdate: str, start date
    :param enddate: str, end date (inclusive)

    :return: dataframe with PlayerID1, PlayerID2, PlayerID3, CF, CA, TOI (in secs), CF60 and CA60
    """
    # TODO this method is so slow

    startseason, endseason = [
        helper.infer_season_from_date(x) for x in (startdate, enddate)
    ]

    dflst = []
    for season in range(startseason, endseason + 1):
        games_played = schedules.get_team_games(season, team, startdate,
                                                enddate)
        games_played = [g for g in games_played if 20001 <= g <= 30417]  # regular season and playoffs only

        toi = combos.get_team_combo_toi(season, team, games_played, n_players=3) \
            .rename(columns={'Secs': 'TOI'})

        cfca = combos.get_team_combo_corsi(season,
                                           team,
                                           games_played,
                                           n_players=3)

        joined = toi.merge(cfca, how='outer', on=['PlayerID1', 'PlayerID2', 'PlayerID3']) \
            .assign(Season=season)
        dflst.append(joined)

    df = pd.concat(dflst) \
        .groupby(['PlayerID1', 'PlayerID2', 'PlayerID3'], as_index=False).sum()
    df.loc[:, 'CF60'] = df.CF * 3600 / df.TOI
    df.loc[:, 'CA60'] = df.CA * 3600 / df.TOI

    forwards = players.get_player_ids_file().query('Pos != "D"')[['ID']]
    df = df.merge(forwards.rename(columns={'ID': 'PlayerID1'}), how='inner', on='PlayerID1') \
        .merge(forwards.rename(columns={'ID': 'PlayerID2'}), how='inner', on='PlayerID2') \
        .merge(forwards.rename(columns={'ID': 'PlayerID3'}), how='inner', on='PlayerID3')

    return df
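A hypothetical call, assuming the team abbreviation and ISO date strings that the docstring's str parameters suggest:

rates = get_fline_shot_rates('WSH', '2017-10-01', '2018-04-07')
# e.g. keep lines with at least an hour together, best shot rate first
regulars = rates[rates.TOI >= 3600].sort_values('CF60', ascending=False)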
Example 5
def _finish_toidf_manipulations(df, season, game):
    """
    Takes a dataframe of shifts (one row per shift) and turns it into a matrix of players on ice for each second.

    :param df: dataframe
    :param season: int, the season
    :param game: int, the game

    :return: dataframe
    """
    gameinfo = schedules.get_game_data_from_schedule(season, game)

    # TODO don't read end times. Use duration, which has good coverage, to infer end. Then end + 1200 not needed below.
    # Sometimes shifts have the same start and end time.
    # By the time we're here, they'll have start = end + 1
    # So let's remove shifts with duration -1
    df = df[df.Start != df.End + 1]

    # Sometimes you see goalies with a shift starting in one period and ending in another
    # This is to help in those cases.
    if sum(df.End < df.Start) > 0:
        # ed.print_and_log('Have to adjust a shift time', 'warn')
        # TODO I think I'm making a mistake with overtime shifts--end at 3900!
        # TODO also, maybe only go to the end of the period, not to 1200
        # ed.print_and_log(df[df.End < df.Start])
        df.loc[df.End < df.Start,
               'End'] = df.loc[df.End < df.Start, 'End'] + 1200
    # One issue coming up is when the above line comes into play--missing times are filled in as 0:00
    tempdf = df[['PlayerID', 'Start', 'End', 'Team',
                 'Duration']].query("Duration > 0")
    tempdf = tempdf.assign(Time=tempdf.Start)
    # print(tempdf.head(20))

    # Let's filter out goalies for now. We can add them back in later.
    # This will make it easier to get the strength later
    pids = players.get_player_ids_file()
    tempdf = tempdf.merge(pids[['ID', 'Pos']],
                          how='left',
                          left_on='PlayerID',
                          right_on='ID')

    # toi = pd.DataFrame({'Time': [i for i in range(0, max(df.End) + 1)]})
    toi = pd.DataFrame(
        {'Time': [i for i in range(0, int(round(max(df.End))))]})

    # Originally used a hacky way to fill in times between shift start and end: increment tempdf by one, filter, join
    # Faster to work with base structures
    # Or what if I join each player to full df, fill backward on start and end, and filter out rows where end > time
    # toidict = toi.to_dict(orient='list')
    # players_by_sec = [[] for _ in range(min(toidict['Start'], toidict['End'] + 1))]
    # for i in range(len(players_by_sec)):
    #    for j in range(toidict['Start'][i], toidict['End'][i] + 1):
    #        players_by_sec[j].append(toidict['PlayerID'][i])
    # Maybe I can create a matrix with rows = time and columns = players
    # Loop over start and end, and use iloc[] to set booleans en masse.
    # Then melt and filter

    # Create one row per second
    alltimes = toi.Time
    newdf = pd.DataFrame(index=alltimes)

    # Add rows and set times to True simultaneously
    # (tempdf columns are PlayerID, Start, End, Team, Duration, Time, ID, Pos;
    # ID duplicates PlayerID thanks to the merge above, so unpack it to a throwaway)
    for i, (pid, start, end, team, duration, time, _id,
            pos) in tempdf.iterrows():
        newdf.loc[start:end, pid] = True

    # Fill NAs to False
    for col in newdf:
        newdf.loc[:, col] = newdf[col].fillna(False)

    # Go wide to long and then drop unneeded rows
    newdf = helpers.melt_helper(
        newdf.reset_index(),
        id_vars='Time',  # value_vars=newdf.columns omitted; numeric colnames were causing errors
        var_name='PlayerID',
        value_name='OnIce')
    newdf = newdf[newdf.OnIce].drop('OnIce', axis=1)
    newdf = newdf.merge(tempdf.drop('Time', axis=1), how='left', on='PlayerID') \
        .query("Time <= End & Time >= Start") \
        .drop('ID', axis=1)

    # In case there were rows that were all missing, join onto TOI
    tempdf = toi.merge(newdf, how='left', on='Time')
    # TODO continue here--does newdf match tempdf after sort_values?

    # Old method
    # toidfs = []
    # while len(tempdf.index) > 0:
    #    temptoi = toi.merge(tempdf, how='inner', on='Time')
    #    toidfs.append(temptoi)

    #    tempdf = tempdf.assign(Time=tempdf.Time + 1)
    #    tempdf = tempdf.query('Time <= End')

    # tempdf = pd.concat(toidfs)
    # tempdf = tempdf.sort_values(by='Time')

    goalies = tempdf[tempdf.Pos == 'G'].drop('Pos', axis=1)
    tempdf = tempdf[tempdf.Pos != 'G'].drop('Pos', axis=1)

    # Append team name to start of columns by team
    home = str(gameinfo['Home'])
    road = str(gameinfo['Road'])

    # Goalies
    # Let's assume we get only one goalie per second per team.
    # TODO: flag if there are multiple listed and pick only one
    goalies.loc[:, 'GTeam'] = goalies.Team.apply(
        lambda x: 'HG' if str(int(x)) == home else 'RG')
    try:
        goalies2 = goalies[['Time', 'PlayerID', 'GTeam']] \
            .pivot(index='Time', columns='GTeam', values='PlayerID') \
            .reset_index()
    except ValueError:
        # Duplicate entries in index error.
        # ed.print_and_log('Multiple goalies for a team in {0:d} {1:d}, picking one with the most TOI'.format(
        #    season, game), 'warn')

        # Find times with multiple goalies
        too_many_goalies_h = goalies[goalies.GTeam == 'HG'][['Time']] \
            .assign(GoalieCount=1) \
            .groupby('Time').count() \
            .reset_index() \
            .query('GoalieCount > 1')

        too_many_goalies_r = goalies[goalies.GTeam == 'RG'][['Time']] \
            .assign(GoalieCount=1) \
            .groupby('Time').count() \
            .reset_index() \
            .query('GoalieCount > 1')

        # Find most common goalie for each team
        if len(too_many_goalies_h) == 0:
            problem_times_revised_h = goalies
        else:  # i.e. if len(too_many_goalies_h) > 0:
            top_goalie_h = goalies[goalies.GTeam == 'HG'][['PlayerID']] \
                .assign(GoalieCount=1) \
                .groupby('PlayerID').count() \
                .reset_index() \
                .sort_values('GoalieCount', ascending=False) \
                .PlayerID.iloc[0]
            # and now finally drop problem times
            problem_times_revised_h = goalies \
                .merge(too_many_goalies_h[['Time']], how='outer', on='Time', indicator=True)
            problem_times_revised_h.loc[:, 'ToDrop'] = (problem_times_revised_h._merge == 'both') & \
                                                       (problem_times_revised_h.PlayerID != top_goalie_h)
            problem_times_revised_h = problem_times_revised_h[~problem_times_revised_h.ToDrop] \
                .drop(['_merge', 'ToDrop'], axis=1)

        if len(too_many_goalies_r) == 0:
            problem_times_revised_r = problem_times_revised_h
        else:  # i.e. if len(too_many_goalies_r) > 0:
            top_goalie_r = goalies[goalies.GTeam == 'RG'][['PlayerID']] \
                .assign(GoalieCount=1) \
                .groupby('PlayerID').count() \
                .reset_index() \
                .sort_values('GoalieCount', ascending=False) \
                .PlayerID.iloc[0]
            problem_times_revised_r = problem_times_revised_h \
                .merge(too_many_goalies_r[['Time']], how='outer', on='Time', indicator=True)
            problem_times_revised_r.loc[:, 'ToDrop'] = (problem_times_revised_r._merge == 'both') & \
                                                       (problem_times_revised_r.PlayerID != top_goalie_r)
            problem_times_revised_r = problem_times_revised_r[~problem_times_revised_r.ToDrop] \
                .drop(['_merge', 'ToDrop'], axis=1)

        # Pivot again
        goalies2 = problem_times_revised_r[['Time', 'PlayerID', 'GTeam']] \
            .pivot(index='Time', columns='GTeam', values='PlayerID') \
            .reset_index()

    # Home
    hdf = tempdf.query('Team == "' + home + '"').sort_values(
        ['Time', 'Duration'], ascending=[True, False])
    if len(hdf) == 0:
        # Earlier versions of pandas can have diff behavior
        hdf = tempdf.query('Team == ' + home).sort_values(
            ['Time', 'Duration'], ascending=[True, False])
    hdf2 = hdf[['Time', 'Duration']].groupby('Time') \
        .rank(method='first', ascending=False) \
        .rename(columns={'Duration': 'rank'})
    hdf2.loc[:, 'rank'] = hdf2['rank'].astype(int)
    hdf.loc[:, 'rank'] = 'H' + hdf2['rank'].astype(str)

    rdf = tempdf.query('Team == "' + road + '"').sort_values(
        ['Time', 'Duration'], ascending=[True, False])
    if len(rdf) == 0:
        rdf = tempdf.query('Team == ' + road).sort_values(
            ['Time', 'Duration'], ascending=[True, False])
    rdf2 = rdf[['Time', 'Duration']].groupby('Time') \
        .rank(method='first', ascending=False) \
        .rename(columns={'Duration': 'rank'})
    rdf2.loc[:, 'rank'] = rdf2['rank'].astype(int)
    rdf.loc[:, 'rank'] = 'R' + rdf2['rank'].astype(str)

    # Remove values above 6--looking like there won't be many
    # But in those cases take shifts with longest durations
    # That's why we create hdf and rdf by also sorting by Time and Duration above, and select duration for rank()
    if len(hdf[hdf['rank'] == "H7"]) > 0:
        # ed.print_and_log('Some times from {0:d} {1:d} have too many home players; cutting off at 6'.format(
        #    season, game), 'warn')
        # ed.print_and_log('Longest shift being lost was {0:d} seconds'.format(
        #    hdf[hdf['rank'] == "H7"].Duration.max()), 'warn')
        pass
    if len(rdf[rdf['rank'] == "R7"]) > 0:
        # ed.print_and_log('Some times from {0:d} {1:d} have too many road players; cutting off at 6'.format(
        #    season, game), 'warn')
        # ed.print_and_log('Longest shift being lost was {0:d} seconds'.format(
        #    rdf[rdf['rank'] == "R7"].Duration.max()), 'warn')
        pass

    hdf = hdf.pivot(index='Time', columns='rank', values='PlayerID').iloc[:, 0:6]
    hdf.reset_index(inplace=True)  # get time back as a column
    rdf = rdf.pivot(index='Time', columns='rank', values='PlayerID').iloc[:, 0:6]
    rdf.reset_index(inplace=True)

    toi = toi.merge(hdf, how='left', on='Time') \
        .merge(rdf, how='left', on='Time') \
        .merge(goalies2, how='left', on='Time')

    column_order = list(toi.columns.values)
    column_order = ['Time'] + sorted(column_order[1:])  # first entry is Time; sort the rest
    toi = toi[column_order]
    # Now should be Time, H1, H2, ... HG, R1, R2, ..., RG

    # For games still in the first period, HG and RG may not exist yet, so put dummy replacements in.
    # Will be wrong when a goalie is pulled in the first, but oh well...
    if 'HG' not in toi.columns:
        newcol = [0 for _ in range(len(toi))]
        toi.insert(loc=toi.columns.get_loc('R1'), column='HG', value=newcol)
    if 'RG' not in toi.columns:
        toi.loc[:, 'RG'] = 0

    toi.loc[:, 'HomeSkaters'] = 0
    for col in toi.loc[:, 'H1':'HG'].columns[:-1]:
        toi.loc[:, 'HomeSkaters'] = toi[col].notnull() + toi.HomeSkaters
    # Adding 100 when the goalie column is filled is a hack to make strengths easy to recognize
    toi.loc[:, 'HomeSkaters'] = 100 * toi['HG'].notnull() + toi.HomeSkaters
    toi.loc[:, 'RoadSkaters'] = 0
    for col in toi.loc[:, 'R1':'RG'].columns[:-1]:
        toi.loc[:, 'RoadSkaters'] = toi[col].notnull() + toi.RoadSkaters
    toi.loc[:, 'RoadSkaters'] = 100 * toi['RG'].notnull() + toi.RoadSkaters

    # This is how we label strengths: 5 means 5 skaters plus goalie; five skaters w/o goalie is 4+1.
    toi.loc[:, 'HomeStrength'] = toi.HomeSkaters.apply(
        lambda x: '{0:d}'.format(x - 100) if x >= 100 else '{0:d}+1'.format(x - 1))
    toi.loc[:, 'RoadStrength'] = toi.RoadSkaters.apply(
        lambda x: '{0:d}'.format(x - 100) if x >= 100 else '{0:d}+1'.format(x - 1))

    toi.drop(['HomeSkaters', 'RoadSkaters'], axis=1, inplace=True)

    # Also drop -1+1 and 0+1 cases, which are clearly errors, and the like.
    # Need at least 3 skaters apiece, 1 goalie apiece, time, and strengths to be non-NA = 11 non NA values
    toi2 = toi.dropna(axis=0, thresh=11)  # drop rows without at least 11 non-NA values
    if len(toi2) < len(toi):
        # ed.print_and_log('Dropped {0:d}/{1:d} times in {2:d} {3:d} because of invalid strengths'.format(
        #    len(toi) - len(toi2), len(toi), season, game), 'warn')
        pass

    # TODO data quality check that I don't miss times in the middle of the game

    return toi2
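The skater-count encoding near the end is the subtle part: a goalie on the ice adds 100 to the count, which the strength lambda then decodes. A standalone restatement of that decoding (same logic as above, extracted here purely for illustration):

def strength_label(skaters):
    # 105 -> '5' (5 skaters, goalie in net); 5 -> '4+1' (goalie pulled, extra attacker)
    return '{0:d}'.format(skaters - 100) if skaters >= 100 \
        else '{0:d}+1'.format(skaters - 1)

assert strength_label(105) == '5'
assert strength_label(5) == '4+1'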
Example 6
def get_dpair_shot_rates(team, startdate, enddate):
    """
    Gets CF/60 and CA/60 by defenseman duo (5v5 only) for this team over the given date range

    :param team: int or str, team
    :param startdate: str, start date
    :param enddate: str, end date (inclusive)

    :return: dataframe with PlayerID1, PlayerID2, CF, CA, TOI (in secs), CF/60 and CA/60
    """
    startseason, endseason = [
        helper.infer_season_from_date(x) for x in (startdate, enddate)
    ]

    dflst = []
    for season in range(startseason, endseason + 1):
        games_played = schedules.get_team_games(season, team, startdate,
                                                enddate)
        games_played = [g for g in games_played if 20001 <= g <= 30417]  # regular season and playoffs only
        toi = manip.get_game_h2h_toi(
            season, games_played).rename(columns={'Secs': 'TOI'})
        cf = manip.get_game_h2h_corsi(season, games_played,
                                      'cf').rename(columns={'HomeCorsi': 'CF'})
        ca = manip.get_game_h2h_corsi(season, games_played,
                                      'ca').rename(columns={'HomeCorsi': 'CA'})

        # TOI, CF, and CA have columns designating which team--H or R
        # Use schedule to find appropriate ones to filter for
        sch = schedules.get_team_schedule(season, team, startdate, enddate)
        sch = helper.melt_helper(sch[['Game', 'Home', 'Road']],
                                 id_vars='Game',
                                 var_name='HR',
                                 value_name='Team')
        sch = sch.query('Team == {0:d}'.format(int(
            team_info.team_as_id(team))))
        sch.loc[:, 'HR'] = sch.HR.apply(lambda x: x[0])
        sch = sch.assign(Team1=sch.HR, Team2=sch.HR).drop(['Team', 'HR'], axis=1)

        toi = toi.merge(sch, how='inner', on=['Game', 'Team1', 'Team2'])
        cf = cf.merge(sch, how='inner', on=['Game', 'Team1', 'Team2'])
        ca = ca.merge(sch, how='inner', on=['Game', 'Team1', 'Team2'])

        # CF and CA from home perspective, so switch if necessary
        cfca = cf.merge(
            ca,
            how='outer',
            on=['Game', 'PlayerID1', 'PlayerID2', 'Team1', 'Team2'])
        cfca.loc[:, 'tempcf'] = cfca.CF
        cfca.loc[:, 'tempca'] = cfca.CA
        # Mask on cfca itself (not on cf/ca) so the rows line up after the outer merge
        cfca.loc[cfca.Team1 == 'R', 'CF'] = cfca[cfca.Team1 == 'R'].tempca
        cfca.loc[cfca.Team1 == 'R', 'CA'] = cfca[cfca.Team1 == 'R'].tempcf

        cfca = cfca.drop(['Team1', 'Team2', 'tempcf', 'tempca'], axis=1)
        toi = toi.drop(['Team1', 'Team2', 'Min'], axis=1)

        joined = toi.merge(cfca, how='outer', on=['PlayerID1', 'PlayerID2', 'Game']) \
            .assign(Season=season)
        dflst.append(joined)

    df = pd.concat(dflst) \
        .groupby(['PlayerID1', 'PlayerID2'], as_index=False).sum()
    df.loc[:, 'CF60'] = df.CF * 3600 / df.TOI
    df.loc[:, 'CA60'] = df.CA * 3600 / df.TOI

    defensemen = players.get_player_ids_file().query('Pos == "D"')[['ID']]
    df = df.merge(defensemen.rename(columns={'ID': 'PlayerID1'}), how='inner', on='PlayerID1') \
        .merge(defensemen.rename(columns={'ID': 'PlayerID2'}), how='inner', on='PlayerID2')

    return df
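Tying this back to drop_duplicate_pairs in Example 2, a hypothetical end-to-end use (team and dates are made up):

rates = get_dpair_shot_rates('WSH', '2017-10-01', '2018-04-07')
pairs = drop_duplicate_pairs(rates)  # one row per defense pair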