def merge_salaries_stats(salaries, year): """ This function is to merge salaries dataset and nba stats dataset for a given year. Since this function is called for preparing for the regression analysis and we are trying regress next year's salaries on this year's statistics, therefore, *statistics year = salaries year - 1* Attributes: salaries: a preprocessed salaries dataset year: a salaries year, from 2000 to 2015 Return: a merged dataframe with salaries and nba stats dataset for the selected year. """ try: salaries_year = salaries[year].dropna(subset=['SALARY']) stats_year = pd.read_csv( os.path.dirname(os.path.realpath(__file__)) + '/../static/data/stats_{}.csv'.format(year - 1)) #load stats data except: raise cannotLoadDataError('NBA Stats Data Not Found!') stats_year = stats_year[stats_year['POS'].isin( ['C', 'PF', 'PG', 'SF', 'SG'])] #check positions stats_year = stats_year.sort(columns='PLAYER') stats_year = stats_year.drop_duplicates(subset=['PLAYER', 'POS']) stats_year = stats_year.set_index(['PLAYER', 'POS']) stats_year = stats_year.drop('TEAM', 1) return pd.merge(salaries_year, stats_year, right_index=True, left_index=True)
def salaries_preprocessing(): """ This function is to preprocess 2000 - 2015 salaries dataset. Return: a merged dataset containing 2000-2015 salaries data. """ s_year = xrange(2000, 2016) #load 2000-2015 salaries data into a dictionary try: dfs = {year:pd.read_csv(os.path.dirname(os.path.realpath(__file__))+"/../static/data/salaries_"+str(year)+".csv").dropna(subset = ['SALARY']) for year in s_year} except: raise cannotLoadDataError('Salaries Data Not Found!') for year in s_year: dfs[year]['POS'].fillna('N', inplace=True) dfs[year] = dfs[year].set_index(['PLAYER','POS']) return pd.concat(dfs, axis=1, join='outer') #merge 2000-2015 dataframes into a dataframe
def salaries_preprocessing(): """ This function is to preprocess 2000 - 2015 salaries dataset. Return: a merged dataset containing 2000-2015 salaries data. """ s_year = xrange(2000, 2016) #load 2000-2015 salaries data into a dictionary try: dfs = { year: pd.read_csv( os.path.dirname(os.path.realpath(__file__)) + "/../static/data/salaries_" + str(year) + ".csv").dropna(subset=['SALARY']) for year in s_year } except: raise cannotLoadDataError('Salaries Data Not Found!') for year in s_year: dfs[year]['POS'].fillna('N', inplace=True) dfs[year] = dfs[year].set_index(['PLAYER', 'POS']) return pd.concat( dfs, axis=1, join='outer') #merge 2000-2015 dataframes into a dataframe
def merge_salaries_stats(salaries, year): """ This function is to merge salaries dataset and nba stats dataset for a given year. Since this function is called for preparing for the regression analysis and we are trying regress next year's salaries on this year's statistics, therefore, *statistics year = salaries year - 1* Attributes: salaries: a preprocessed salaries dataset year: a salaries year, from 2000 to 2015 Return: a merged dataframe with salaries and nba stats dataset for the selected year. """ try: salaries_year = salaries[year].dropna(subset = ['SALARY']) stats_year = pd.read_csv(os.path.dirname(os.path.realpath(__file__))+'/../static/data/stats_{}.csv'.format(year-1)) #load stats data except: raise cannotLoadDataError('NBA Stats Data Not Found!') stats_year = stats_year[stats_year['POS'].isin(['C','PF','PG','SF','SG'])] #check positions stats_year = stats_year.sort(columns='PLAYER') stats_year = stats_year.drop_duplicates(subset=['PLAYER','POS']) stats_year = stats_year.set_index(['PLAYER','POS']) stats_year = stats_year.drop('TEAM',1) return pd.merge(salaries_year,stats_year,right_index=True, left_index=True)