Ejemplo n.º 1
0
def generate_game_data(filename, teamno, folderpath=PLAY_BY_PLAY_DIRECTORY):
    '''Aggregate the stats.nba.com lineup data of one team over a single game.

    The play-by-play CSV ``folderpath + filename`` is loaded and wrapped in a
    ``BasketballGame``.  For every lineup combination of that team, the data
    returned by ``get_lineup_data`` is collected and the rows are summed into
    a single game-level Series.

    Args:
        filename: Play-by-play CSV file name.  ``filename[9:12]`` and
            ``filename[12:15]`` are read as the abbreviations of team 1 and
            team 2 respectively.
        teamno: 1 selects the first team in the file name; any other value
            selects the second.
        folderpath: Prefix prepended to ``filename`` by plain string
            concatenation.

    Returns:
        A pandas Series with the summed lineup data, the entries
        "Actual PTS", "Actual PTS scaled", "Actual PTD scaled" and
        "Actual PTD", followed by the opponent's season-mean entries from
        "OPP_FGM" onwards.  An empty Series is returned as soon as one
        lineup has no data.
    '''
    df = pd.read_csv(folderpath + filename)
    teamname = filename[9:12] if teamno == 1 else filename[12:15]
    team = BasketballGame(filename, df, teamname, teamno)

    # Season mean data of the teams: keep only the opponent's row and prefix
    # every label with "OPP_" so it cannot clash with the team's own labels.
    ref = pd.read_csv(STATS_NBA_PATH % "2008-09_team_mean.csv")
    opponent_team_mean = ref[ref['TEAM_ABBREVIATION'] == team.oppteamname].squeeze()
    opponent_team_mean.index = ['OPP_%s' % elt for elt in opponent_team_mean.index]

    gamestats = []
    for lineup_no in range(1, team.totalcombi + 1):  # every lineup of the game
        lineup = team.give_nth_combination(lineup_no)
        player_names = get_player_list(lineup, team.teamno)
        interval = calculate_interval(team, lineup_no)

        lineup_data = get_lineup_data(teamname, player_names, interval, type="none")
        if lineup_data.empty:
            # One lineup without data makes the whole game unusable.
            return pd.Series()

        # The stats.nba.com labels are kept as they are; the renaming to the
        # project's own short names (PTD, P, FTP, ...) is currently disabled.
        lineup_data.name = lineup_no
        lineup_data["interval"] = interval
        gamestats.append(lineup_data)

    # Sum the per-lineup rows into one row for the whole game.
    game_agg = pd.DataFrame(gamestats).sum()

    # Actual game results, plus the same values scaled by the share of the
    # game's interval that the summed lineup intervals account for.
    # float() guards against integer division on Python 2.
    covered = game_agg["interval"] / float(team.interval)
    game_agg["Actual PTS"] = team.P
    game_agg["Actual PTS scaled"] = team.P * covered
    game_agg["Actual PTD scaled"] = team.PTD * covered
    game_agg["Actual PTD"] = team.PTD

    # Series.append was removed in pandas 2.0; pd.concat gives the same result.
    return pd.concat([game_agg, opponent_team_mean["OPP_FGM":]])
Ejemplo n.º 2
0
def opplineup(filename):
    '''Collect the stats.nba.com lineup data of both teams for one game.

    The play-by-play CSV ``PLAY_BY_PLAY_DIRECTORY + filename`` is read and
    walked lineup by lineup.  For each team present in a lineup, the data
    returned by ``get_lineup_data`` is extended with the lineup's
    ``interval`` and the team's ``PPM`` (points per minute) and collected.

    Args:
        filename: Play-by-play CSV file name.  ``filename[9:12]`` and
            ``filename[12:15]`` are read as the two team abbreviations.

    Returns:
        A DataFrame with one row per (lineup, team) pair for which data was
        found.  An empty DataFrame is returned when the game is skipped, and
        an empty Series when no lineup produced any data.
    '''
    df = pd.read_csv(PLAY_BY_PLAY_DIRECTORY + filename)
    team_name = filename[9:12]
    opp_team_name = filename[12:15]

    # NOTE(review): the game is skipped only when BOTH teams are missing from
    # ALLTEAMS, while the message says "one of the team" -- confirm which of
    # the two is intended.
    if (opp_team_name not in ALLTEAMS) and (team_name not in ALLTEAMS):
        print('%s, one of the team not available, skip!' % filename)
        return pd.DataFrame()

    add_aid_rows(df, [team_name, opp_team_name])
    total_lineup = df.head(1).lineup_no.item() + 1

    out = []
    for lineup_no in range(1, int(total_lineup)):  # every lineup of the game
        lineup_df = df[df.lineup_no == lineup_no]
        # Length of this lineup: first minus last "timeleft" value.
        interval = lineup_df.head(1).timeleft.item() - lineup_df.tail(1).timeleft.item()

        sides = [
            (team_name, lineup_df[lineup_df.team == team_name]),
            (opp_team_name, lineup_df[lineup_df.team == opp_team_name]),
        ]
        # The loop variable is deliberately not called team_name, so the
        # outer team_name is still correct for the next lineup.
        for team_no, (side_name, team_data) in enumerate(sides, 1):
            if team_data.empty:  # this team has no rows in this lineup
                print('%s, %s, %s, team not in this lineup data' % (filename, lineup_no, side_name))
                continue

            team_players = get_player_list(team_data, team_no)
            lineup_data = get_lineup_data(side_name, team_players, type="none")
            if lineup_data.empty:
                continue

            team_points = score_calculate(team_data)
            lineup_data['interval'] = interval
            # Points per minute; presumably interval is in seconds -- TODO confirm.
            # NOTE(review): divides by zero when a lineup's first and last
            # timeleft values are equal -- confirm that cannot occur.
            lineup_data['PPM'] = float(team_points) / interval * 60
            out.append(lineup_data)

    # Return only after ALL lineups have been processed.
    if out:
        return pd.concat(out, axis=1).T  # one row per collected lineup
    return pd.Series()