Beispiel #1
0
def crawl_nbastats_by_year(year, champion_team_name='dal', num_player_of_interest=10):
    """Scrape per-player ESPN team statistics for one season and normalise them.

    For every (abbreviation, full name) pair taken from the module-level
    ``teams_abbr`` / ``teams_full_name`` sequences, the team's stats page
    (players ordered by average minutes) is downloaded and parsed into a
    DataFrame indexed by player name with one column per statistic.  A team
    is skipped when its URL is redirected or when it lists fewer than
    ``num_player_of_interest`` players.

    Every non-champion table (columns from 'GP' onwards) is divided by the
    per-statistic best value collected across the parsed teams, cut down to
    its first ``num_player_of_interest`` rows and written to
    ``non_champions/<year>_<team name>.csv``.

    Args:
        year: Season year; it is inserted into the request URL and the
            output file names.
        champion_team_name: Team abbreviation.  It is translated through the
            module-level ``team_abbr_full`` mapping and the result is
            compared with the names in ``teams_full_name``.
        num_player_of_interest: Minimum roster size for a team to be
            processed, and the number of rows kept per non-champion CSV.

    Returns:
        The champion team's DataFrame divided by the best statistics (all of
        its rows are kept).  An empty DataFrame is returned when the champion
        team was never parsed.

    Raises:
        KeyError: if ``champion_team_name`` is not a key of ``team_abbr_full``.
    """
    champion_team_name = team_abbr_full[champion_team_name]
    url_root = 'http://espn.go.com/nba/team/stats/_/name/'
    # Ordering players by their avg. minutes; identical for every team.
    category = '/cat/avgMinutes/'
    best_stats = []
    champion_teamstats = pd.DataFrame()
    non_champion_teamstats = []
    non_champion_teamlist = []

    for team_abbr, team_name in zip(teams_abbr, teams_full_name):
        url = url_root + team_abbr + '/year/' + str(year) + category + team_name
        print('parsing ' + url + ' ...')
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            # A differing final URL means the requested page was redirected.
            if response.url != url:
                print('%s %s' % ('no response on this address, redirect to: ',
                                 response.url))
                continue
            html = response.read()
        finally:
            # Release the connection on both the normal and the skip path.
            response.close()
        soup = BeautifulSoup(html, 'html.parser')

        players = soup.findAll('tr', {'class': re.compile('^player-')})
        header_rows = soup.findAll('tr', {'class': ['colhead']})
        print(soup.title.string)

        # One header row per table: [0] game statistics, [1] shooting statistics.
        stat_labels = [header_rows[0].select('td'), header_rows[1].select('td')]

        # OrderedDict keeps the statistics in page order and merges labels
        # that occur in both tables.
        column_names = OrderedDict()
        for table_labels in stat_labels:
            for label in table_labels:
                column_names[label.get_text()] = 0.0

        # The player rows are treated as two consecutive passes over the same
        # roster (one pass per table), so half of them are distinct players.
        num_players = len(players) // 2

        # Teams with fewer players than requested are not included in the study.
        if num_players < num_player_of_interest:
            print('warning: players less than ' + str(num_player_of_interest) + ' !')
            continue

        player_namelist = [
            row.findAll('td')[0].get_text().encode('ascii', 'ignore')
            for row in players[:num_players]
        ]
        team_stats = pd.DataFrame(
            np.zeros([num_players, len(column_names)]),
            index=player_namelist, columns=list(column_names))
        team_stats = team_stats.drop('PLAYER', axis=1)

        for i, player in enumerate(players):
            player_idx = i % num_players   # row of this player in team_stats
            table_idx = i // num_players   # which table the row belongs to
            cells = player.findAll('td')
            for stat_label, cell in zip(stat_labels[table_idx], cells):
                text = cell.get_text().encode('ascii', 'ignore')
                # Non-numeric cells (e.g. the player name) are left untouched.
                if isfloat(text):
                    team_stats.at[player_namelist[player_idx],
                                  stat_label.get_text()] = float(text)

        # Keep track of the champion team of the specified year.
        team_stats.index.name = 'Players'
        team_stats.columns.name = 'Statistics'
        if team_name == champion_team_name:
            champion_teamstats = team_stats
        else:
            non_champion_teamstats.append(team_stats)
            non_champion_teamlist.append(team_name)

        # Keep track of the best value of each statistic.
        team_best = team_stats.max(axis=0, numeric_only=True).values
        if len(best_stats) == 0:
            best_stats = team_best
        elif not np.isnan(team_best).any():
            # Only take the max if all entries of this team are non-NaN.
            best_stats = np.maximum(best_stats, team_best)  # element-wise max

    # Normalise the champion team's stats by the best stats among all teams.
    if (not champion_teamstats.empty) and (len(best_stats) != 0):
        champion_teamstats = champion_teamstats.loc[:, 'GP':].divide(
            best_stats, axis='columns')

    for team_stat, team_name in zip(non_champion_teamstats, non_champion_teamlist):
        team_stat = team_stat.loc[:, 'GP':].divide(best_stats, axis='columns')
        team_stat = team_stat.iloc[0:num_player_of_interest]
        filename = 'non_champions/' + str(year) + '_' + team_name + '.csv'
        team_stat.to_csv(filename)

    return champion_teamstats