import pandas as pd
import numpy as np
import requests
import re
import pickle
from bs4 import BeautifulSoup
import get_teams

teams = get_teams.retrieve()

# Example of a salary page URL:
# http://data.shamsports.com/content/pages/data/salaries/2012/warriors.jsp
base = 'http://data.shamsports.com/content/pages/data/salaries/'
years = range(2000, 2015)

# 2d np array recording, for every (team "name", year) combination, whether
# the corresponding url answered with a 404 (0) or not (1).
# The builtin int replaces np.int, a deprecated alias that newer NumPy
# releases no longer provide; the resulting dtype is the same.
x = np.ones((len(teams), len(years)), dtype=int)

# candidate url slug for each team: the last word of its name, lower-cased
test_url = [t.split()[-1].lower() for t in teams]

# if one of our rows is all 0s, that means the team url is actually wrong
for i, tu in enumerate(test_url):
    for j, year in enumerate(years):
        url = base + str(year) + '/' + tu + '.jsp'
        r = requests.get(url)
        if r.status_code == 404:
            x[i, j] = 0  # mark it as a failed request
            # a parenthesised single-argument print behaves the same on
            # Python 2 and Python 3
            print('404 for team "' + tu + '" and year ' + str(year))
        else:
            # NOTE(review): no statements for the non-404 branch are visible
            # at this point in the file; x[i, j] already holds its initial 1.
            pass
# 2003 - 2006 NBA New Orleans Hornets
#
# thunder:
# 2009 - present NBA Oklahoma City Thunder
# 1968 - 2008 NBA Seattle Supersonics

# this means that we need to retrieve the following salary data to make
# our data frame 30 (number of teams) * 8 (number of seasons we have)
# = 240 rows, and also label the team names as 'Charlotte Hornets'
# instead of just 'hornets' so our data frame will look more like:
# new_york_knicks 2007 x0
# new_orleans_pelicans 2014 x1
# new_orleans_hornets 2013 x2, etc.

# names of current teams, lower-cased with words joined by underscores
teams = ['_'.join(t.lower().split()) for t in get_teams.retrieve()]

ncol = 42
num_seasons = 8  # 2007 - 2014

# One row per (team, season). Deriving the row count keeps the frame in step
# with the team list; it equals the 240 described above for 30 teams.
n_rows = len(teams) * num_seasons

# Columns are 'team', 'year', then 'p1' .. 'p40'; every cell starts as 1.0.
sal_perc_df_full = pd.DataFrame(
    np.ones((n_rows, ncol), dtype=np.float64),
    columns=['team', 'year'] + ['p' + str(i) for i in range(1, ncol - 1)])

# 'team' is filled with strings below, so it cannot remain a float64 column.
sal_perc_df_full['team'] = sal_perc_df_full['team'].astype(object)

old_count = 0  # tracks where we are in sal_perc_df

for i in range(len(teams)):
    # row labels of the num_seasons consecutive rows reserved for team i
    inds = list(range(i * num_seasons, (i + 1) * num_seasons))
    if sal_perc_df['team'][old_count] == 'hornets':
        # Single .loc assignments write into the frame itself; the chained
        # form df[col][rows] = ... may only modify a temporary copy.
        sal_perc_df_full.loc[inds, 'team'] = u'charlotte_bobcats'
        sal_perc_df_full.loc[inds, 'year'] = list(range(2007, 2015))
        # skip past every 'hornets' row of sal_perc_df
        old_count += sal_perc_df[sal_perc_df['team'] == 'hornets'].shape[0]