# Build and fill the per-season salary-percentage table.
#
# Reads (defined elsewhere in this file): ``df`` (team x year grid where a 1
# marks a page to scrape), ``base`` (URL prefix) and ``cs`` (scraper module).
# Produces: ``ncol``, ``sal_perc_df``, ``rc`` (number of rows filled) and
# ``scrape_bad`` (True when scraping was aborted by an error).

# say that maximum of 40 players are being paid (should be ~ 12);
# the two extra columns hold the team and the year
ncol = 42
sal_perc_df = pd.DataFrame(
    np.ones((df.values[df.values == 1].size, ncol), dtype=np.float64),
    columns=['team', 'year'] + ['p' + str(i) for i in range(1, ncol - 1)])
# The frame starts out all-float64, but 'team' receives the (string) index
# labels of ``df``; make the column object-typed up front so those writes
# do not depend on implicit upcasting.
sal_perc_df['team'] = sal_perc_df['team'].astype(object)
rc = 0  # next row of sal_perc_df to fill

# fill the salary df
scrape_bad = False
for i in range(0, df.shape[0]):
    for j in range(0, df.shape[1]):
        if df.iloc[i, j] == 1:
            # Single-step .loc writes: the chained form
            # ``frame['col'][row] = value`` may assign to a temporary copy.
            sal_perc_df.loc[rc, 'team'] = df.index[i]
            sal_perc_df.loc[rc, 'year'] = df.columns[j]

            # scrape salary data
            url = base + str(df.columns[j]) + '/' + df.index[i] + '.jsp'
            try:
                sal_percs = cs.extractSalaryPercents(url)
            except Exception as err:
                # ``Exception`` (not a bare except) so that Ctrl-C is not
                # reported as a scraping failure.
                print('Scraping error on url <' + url + '>')
                print('Cause: ' + repr(err))
                print('Improve your extract function, you noob.')
                scrape_bad = True
                break

            # pad the zeros so the list matches the ncol - 2 player columns
            sal_percs += [0] * ((ncol - 2) - len(sal_percs))
            sal_perc_df.iloc[rc, 2:] = sal_percs  # fill that row in
            rc += 1
            print('Finished payroll for ' + df.index[i] + ', '
                  + str(df.columns[j]))
        else:
            print('Skipping 404 page')
    # the inner ``break`` only leaves the year loop; stop the team loop too
    if scrape_bad:
        break
2:ncol] sal_perc_df_full.iloc[inds, 2:ncol] = old_percs old_count = old_count + num_seasons # now we need to fill in salary data for Charlotte Bobcats, # New Orleans Hornets/New Orleans Pelicans, Seattle Supersonics/ # Oklahoma City Thunder # NO Hornets sal_perc_df_full.iloc[144:150, 2:] = sal_perc_df[sal_perc_df['team'] == 'hornets'].iloc[0:-1, 2:] # NO Pelicans sal_perc_df_full.iloc[150:152, 2:] = sal_perc_df[sal_perc_df['team'] == 'pelicans'].iloc[:, 2:] # OKC sal_perc_df_full.iloc[218:224, 2:] = sal_perc_df[sal_perc_df['team'] == 'thunder'].iloc[:, 2:] # now salaries that we haven't collected: # Supersonics data can't be found on ShamSports, so we're making it nan sal_perc_df_full.iloc[216:218, 2:] = float('nan') # Charlotte Bobcats (only changing to Charlotte Hornets for # 2014-2015 season) sal_perc_df_full.iloc[168:176, 0:3] i = 0 for year in range(2007, 2015): sal_percs = cs.extractSalaryPercents('http://data.shamsports.com/content/pages/data/salaries/' + str(year) + '/bobcats.jsp') sal_percs += [0] * ((ncol - 2) - len(sal_percs)) sal_perc_df_full.iloc[168 + i, 2:] = sal_percs i += 1 pickle.dump(sal_perc_df_full, open('fixed_payroll_shamsports.p', 'wb'))