Ejemplo n.º 1
0
import pandas as pd
import numpy as np
import requests
import re
import pickle
from bs4 import BeautifulSoup

import get_teams

teams = get_teams.retrieve()

# example url:
# http://data.shamsports.com/content/pages/data/salaries/2012/warriors.jsp

# 2d np array (one row per team, one column per year) recording whether the
# url built from the team "name" and the year is valid: every entry starts
# at 1 and is set to 0 when the request for that url comes back as a 404
base = 'http://data.shamsports.com/content/pages/data/salaries/'
years = range(2000, 2015)
# builtin `int` rather than `np.int`: the alias was deprecated in NumPy 1.20
# and removed in 1.24, and it only ever referred to the builtin int
x = np.ones((len(teams), len(years)), dtype=int)
# the url only uses the last word of each team name, lowercased
test_url = [t.split()[-1].lower() for t in teams]

# if one of our rows is all 0s, that means the team url is actually wrong
i = 0
for tu in test_url:
    j = 0
    for year in years:
        url = base + str(year) + '/' + tu + '.jsp'
        r = requests.get(url)
        if r.status_code == 404:
            x[i, j] = 0 # mark it as a failed request
            print '404 for team "' + tu + '" and year ' + str(year)
        else:
Ejemplo n.º 2
0
# 2003 - 2006       NBA     New Orleans Hornets
#
# thunder:
# 2009 - present    NBA     Oklahoma City Thunder
# 1968 - 2008       NBA     Seattle Supersonics

# this means that we need to retrieve the following salary data so that
# our data frame has 30 (number of teams) * 8 (number of seasons we have)
# = 240 rows, and also label each row with the full team name (e.g.
# 'charlotte_hornets' instead of just 'hornets') so it will look more like:
# new_york_knicks         2007    x0
# new_orleans_pelicans    2014    x1
# new_orleans_hornets     2013    x2, etc.

# names of current teams, lowercased and with the words joined by underscores
teams = []
for full_name in get_teams.retrieve():
    words = full_name.lower().split()
    teams.append('_'.join(words))

# bookkeeping for the fill loop
num_seasons = 8  # 2007 - 2014
old_count = 0  # tracks where we are in sal_perc_df

# the full frame has 240 rows and 42 columns: 'team', 'year', then
# 'p1' .. 'p40'; every cell starts out as 1.0 (float64)
ncol = 42
sal_perc_df_full = pd.DataFrame(
    np.ones((240, ncol), dtype=np.float64),
    columns=['team', 'year'] + ['p%d' % i for i in range(1, ncol - 1)])
for i in range(0, len(teams)):

    inds = range((i * num_seasons),((i + 1) * num_seasons))

    if sal_perc_df['team'][old_count] == 'hornets':
        sal_perc_df_full['team'][inds] = u'charlotte_bobcats'
        sal_perc_df_full['year'][inds] = range(2007, 2015)
        old_count += sal_perc_df[sal_perc_df['team'] == 'hornets'].shape[0]