def crawl_nbastats_by_year(year, champion_team_name='dal', num_player_of_interest=10):
    """Scrape ESPN per-player team statistics for one season and normalise them.

    For every (abbreviation, full name) pair in the module-level
    ``teams_abbr`` / ``teams_full_name`` sequences, the team's statistics
    page is downloaded and its two tables (game statistics and shooting
    statistics) are merged into one DataFrame indexed by player name.
    A team is skipped when the request is redirected to another URL, when
    the two table header rows cannot be found, or when it lists fewer than
    ``num_player_of_interest`` players.

    Every statistic is then divided by the best (maximum) per-player value
    observed for that statistic among the processed teams.

    Side effects:
        Prints progress messages, and writes one CSV per non-champion team
        to ``non_champions/<year>_<team name>.csv`` holding the first
        ``num_player_of_interest`` rows of that team's normalised table.

    Args:
        year: Season to crawl; it is inserted into the URL via ``str(year)``.
        champion_team_name: Key into the module-level ``team_abbr_full``
            mapping identifying the champion team.
        num_player_of_interest: Minimum roster size for a team to be
            included, and the number of rows saved per non-champion team.

    Returns:
        DataFrame with the champion team's normalised statistics (columns
        from 'GP' onwards). It is an empty DataFrame when the champion team
        was not among the processed teams.

    Raises:
        KeyError: if ``champion_team_name`` is not in ``team_abbr_full``.
    """
    champion_team_name = team_abbr_full[champion_team_name]
    url_root = 'http://espn.go.com/nba/team/stats/_/name/'
    category = '/cat/avgMinutes/'  # order players by their average minutes

    # None until the first team has been processed; afterwards a 1-D array
    # holding the element-wise maximum of every statistic seen so far.
    best_stats = None
    champion_teamstats = pd.DataFrame()
    non_champion_teamstats = []
    non_champion_teamlist = []

    # Single-argument parenthesised prints below behave identically under the
    # Python 2 print statement this file uses.
    for team_abbr, team_name in zip(teams_abbr, teams_full_name):
        url = url_root + team_abbr + '/year/' + str(year) + category + team_name
        print('parsing ' + url + ' ...')

        response = urllib2.urlopen(urllib2.Request(url))
        try:
            # A differing final URL means the request was redirected, i.e.
            # the page for this team/year does not exist.
            if response.url != url:
                print('no response on this address, redirect to:  ' + response.url)
                continue
            html = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(html, 'html.parser')
        players = soup.findAll('tr', {'class': re.compile('^player-')})
        header_rows = soup.findAll('tr', {'class': ['colhead']})
        print(soup.title.string)

        if len(header_rows) < 2:
            print('warning: statistics tables not found !')
            continue

        # Header cells of table 1 (game statistics) and table 2 (shooting
        # statistics).
        stat_labels = [header_rows[0].select('td'), header_rows[1].select('td')]

        # An OrderedDict keeps the column order of the page while collapsing
        # labels that appear in both tables (e.g. 'PLAYER').
        column_labels = OrderedDict()
        for header_cells in stat_labels:
            for cell in header_cells:
                column_labels[cell.get_text()] = 0.0
        column_labels = list(column_labels)

        # Every player has one row in each of the two tables.
        num_players = len(players) // 2

        # Teams with too few players are not included in the study.
        if num_players < num_player_of_interest:
            print('warning: players less than ' + str(num_player_of_interest) + ' !')
            continue

        player_namelist = [
            row.findAll('td')[0].get_text().encode('ascii', 'ignore')
            for row in players[:num_players]
        ]

        team_stats = pd.DataFrame(
            np.zeros([num_players, len(column_labels)]),
            index=player_namelist,
            columns=column_labels,
        )
        team_stats = team_stats.drop('PLAYER', axis=1)

        for i, player in enumerate(players):
            player_idx = i % num_players   # row of this player in the table
            table_idx = i // num_players   # 0: game stats, 1: shooting stats
            cells = player.findAll('td')
            for stat_label, player_stat in zip(stat_labels[table_idx], cells):
                text = player_stat.get_text().encode('ascii', 'ignore')
                # NOTE(review): the source formatting was lost; the assignment
                # is assumed to apply only to numeric cells, otherwise the
                # dropped 'PLAYER' column would be re-created.
                if isfloat(text):
                    team_stats.at[player_namelist[player_idx],
                                  stat_label.get_text()] = float(text)

        team_stats.index.name = 'Players'
        team_stats.columns.name = 'Statistics'

        # Keep track of the champion team of the specified year.
        if team_name == champion_team_name:
            champion_teamstats = team_stats
        else:
            non_champion_teamstats.append(team_stats)
            non_champion_teamlist.append(team_name)

        # Keep track of the best value of each statistic.
        team_best = team_stats.max(axis=0, numeric_only=True).values
        if best_stats is None:
            best_stats = team_best
        elif not np.isnan(team_best).any():
            # Only take the maximum when all entries are non-NaN.
            best_stats = np.maximum(best_stats, team_best)

    # Normalise by dividing each team's statistics by the best among all teams.
    if not champion_teamstats.empty and best_stats is not None:
        champion_teamstats = champion_teamstats.loc[:, 'GP':].divide(
            best_stats, axis='columns')

    # NOTE(review): the source formatting was lost; this loop is assumed to
    # run whether or not the champion team was found -- confirm.
    for team_stat, team_name in zip(non_champion_teamstats, non_champion_teamlist):
        team_stat = team_stat.loc[:, 'GP':].divide(best_stats, axis='columns')
        team_stat = team_stat.iloc[0:num_player_of_interest]
        filename = 'non_champions/' + str(year) + '_' + team_name + '.csv'
        team_stat.to_csv(filename)

    return champion_teamstats