def createTargetDonors(params, stats):
    """Build the donor object, the target metric tables and the target names.

    Args:
        params: Indexable of five entries
            ``(starting_year, min_games, min_years, pred_year, pred_interval)``.
            Only ``min_games``, ``min_years`` and ``pred_year`` are read here.
        stats: Season-level DataFrame; the columns ``Year``, ``Player`` and
            ``G`` are used directly, the rest is consumed by the helpers.

    Returns:
        Tuple ``(donor, allPivotedTableDict, targetNames)``: the ``Donor``
        built from seasons strictly before ``pred_year``, the pivoted tables
        returned by ``prepareData`` for seasons up to and including
        ``pred_year``, and the sorted result of ``get_active_players``.
    """
    min_games = params[1]
    min_years = params[2]
    pred_year = params[3]

    # --- donor setup ---
    # only consider years prior to 'pred_year'
    stats_donor = stats[stats.Year < pred_year]
    stats_donor = stats_donor.sort_values(by=['Player', 'Year'])
    # only consider years in which at least "min_games" games were played
    stats_donor = stats_donor[stats_donor.G >= min_games]
    # edit 'year_count'
    stats_donor = edit_year_count(stats_donor)
    # create donor object
    donor_tables, _ = prepareData(stats_donor)
    donor = Donor(donor_tables)

    # --- target setup ---
    # consider all years up to (and including) 'pred_year'
    stats_target = stats[stats.Year <= pred_year]
    # exclude years prior to 'pred_year' in which < 'min_games' were played
    too_few_games = stats_target.loc[
        (stats_target.G < min_games) & (stats_target.Year < pred_year)
    ].index
    # Non-inplace drop: an in-place drop on a filtered slice of the caller's
    # frame triggers SettingWithCopyWarning.
    stats_target = stats_target.drop(too_few_games)
    # edit 'year_count'
    stats_target = edit_year_count(stats_target)
    # create target dictionary of values
    allPivotedTableDict, _ = prepareData(stats_target)

    # get target player names
    targetNames = get_active_players(stats_target, pred_year, min_years, min_games)
    targetNames.sort()

    return donor, allPivotedTableDict, targetNames
def test():
    """Predict 2016 stats with separate offensive and defensive mRSC fits.

    Loads the player/season CSVs, builds the pivoted metric tables, fits one
    model on the offensive metrics and one on the defensive metrics for every
    active player, and prints MAPE and RMSE summaries.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980; copy so the column
    # assignment below does not write into a slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only after 1980
    stats = stats[stats.Year >= 1980]
    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat;
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict,
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    activePlayers = getActivePlayers(stats, 2016, 4)
    activePlayers.sort()
    # guarded removal: a missing name must not abort the experiment
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in activePlayers:
            activePlayers.remove(excluded)

    offMetrics = ["PTS_G", "AST_G", "TOV_G", "3P_G", "PER_w", "FG%", "FT%"]
    defMetrics = ["TRB_G", "STL_G", "BLK_G"]

    # Earlier runs used fixed weights instead of getWeitghts:
    #   uniform:           weightsOff = [1.] * 7, weightsDef = [1.] * 3
    #   mean-standardized: weightsOff = [0.12623068620631453,
    #                        0.55687314142618904, 0.82115849366536209,
    #                        0.080245455622805287, 2.2838580004246301,
    #                        1.4304474472757014, 4.7552939398878413]
    #                      weightsDef = [0.28744431242409424,
    #                        1.5323016513327052, 2.4985245915220626]
    metrics_list = [offMetrics, defMetrics]

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    print("* start experiment")
    pred_frames = []
    true_frames = []
    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)

        weights_list = getWeitghts(target, donor, metrics_list, expSetup,
                                   method="mean")

        mrsc = mRSC(donor, target, probObservation=1)

        # offensive metrics
        mrsc.fit_threshold(offMetrics, weights_list[0], 2016, pred_length=1,
                           threshold=threshold, setup=expSetup)
        predOff = mrsc.predict()
        trueOff = mrsc.getTrue()
        predOff.columns = [playerName]
        trueOff.columns = [playerName]

        # defensive metrics
        mrsc.fit_threshold(defMetrics, weights_list[1], 2016, pred_length=1,
                           threshold=threshold, setup=expSetup)
        predDef = mrsc.predict()
        trueDef = mrsc.getTrue()
        predDef.columns = [playerName]
        trueDef.columns = [playerName]

        pred_frames.append(pd.concat([predOff, predDef], axis=0))
        true_frames.append(pd.concat([trueOff, trueDef], axis=0))

    # one concat at the end instead of growing a frame inside the loop
    pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    ###################
    # Zero truths are masked to NaN; the absolute value is taken over the
    # whole ratio so negative truths do not yield negative errors.
    mask = (true_all != 0)
    mape = np.abs((pred_all - true_all) / true_all[mask])
    print()
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(true_all, pred_all)
    print()
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    # players whose PTS_G error ratio exceeds 100
    weirdo = mape.T[mape.T.PTS_G > 100].T
    print()
    print(weirdo)
    print(weirdo.shape)
def test():
    """Predict 2016 stats using donor pools restricted by playing position.

    For every active player the donor pool is limited to players who ever
    appeared with a position from the same position group as the target's
    position in ``pred_year``. Five metric groups are fitted separately and
    the per-metric MAPE is printed.

    Raises:
        Exception: if a target's position belongs to none of the groups.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980; copy so the column
    # assignment below does not write into a slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only after 1980
    stats = stats[stats.Year >= 1980]
    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat;
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict,
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    pred_year = 2016
    activePlayers = getActivePlayers(stats, pred_year, 4)
    activePlayers.sort()
    # guarded removal: a missing name must not abort the experiment
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in activePlayers:
            activePlayers.remove(excluded)

    metrics1 = ["PTS_G", "PER_w"]
    metrics2 = ["3P_G", "FG%", "FT%"]
    metrics3 = ["TOV_G"]
    metrics4 = ["TRB_G", "STL_G"]
    metrics5 = ["AST_G", "BLK_G"]
    metrics_list = [metrics1, metrics2, metrics3, metrics4, metrics5]
    print(metrics_list)

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    #### position groups (checked in this order; the first match wins)
    group1 = ["C", "SF", "PF", "C-PF", "PF-C", "C-SF", "SF-C", "SF-PF", "PF-SF"]
    # "PG-SG" replaces a second, duplicated "PG-SF" entry
    group2 = ["SG", "SF", "PG", "SG-SF", "SF-SG", "SF-PG", "PG-SF", "SG-PG", "PG-SG"]
    group3 = ["SG", "SF", "PF", "SG-SF", "SF-SG", "SF-PF", "PF-SF", "SG-PF", "PF-SG"]
    position_groups = [group1, group2, group3]
    # The donor pool of a group does not depend on the target, so compute it
    # once per group. Each pool is built from its own group (the third pool
    # used to be built from group2 by mistake).
    group_pools = [
        stats[stats.Pos.isin(group)].Player.unique()
        for group in position_groups
    ]

    print("* start experiment")
    pred_frames = []
    true_frames = []
    for playerName in activePlayers:
        playerPos = stats.loc[
            (stats.Player == playerName) & (stats.Year == pred_year), "Pos"
        ].values[-1]

        for group, pool in zip(position_groups, group_pools):
            if playerPos in group:
                players_in_group = pool
                break
        else:
            raise Exception("invalid position")

        donorPivotedTableDict = filterDonor(allPivotedTableDict, players_in_group)

        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(donorPivotedTableDict, df_year)

        weights_list = getWeitghts(target, donor, metrics_list, expSetup,
                                   method="var")

        mrsc = mRSC(donor, target, probObservation=1)

        player_pred = []
        player_true = []
        for i, metrics in enumerate(metrics_list):
            mrsc.fit_threshold(metrics, weights_list[i], pred_year,
                               pred_length=1, threshold=threshold,
                               setup=expSetup)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            player_pred.append(pred)
            player_true.append(true)

        pred_frames.append(pd.concat(player_pred, axis=0))
        true_frames.append(pd.concat(player_true, axis=0))

    # one concat at the end instead of growing a frame inside the loop
    pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    ###################
    # Zero truths are masked to NaN; the absolute value is taken over the
    # whole ratio so negative truths do not yield negative errors.
    mask = (true_all != 0)
    mape = np.abs((pred_all - true_all) / true_all[mask])
    print()
    print("******* MAPE *******")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())
def test():
    """Compare uniform, mean- and variance-standardized metric weights.

    Runs the same single-model 2016 prediction over all active players three
    times, once per weighting scheme, printing MAPE and RMSE after each pass.

    Raises:
        ValueError: if there are no active players, so no target/donor pair
            exists to derive the standardized weights from.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980; copy so the column
    # assignment below does not write into a slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only after 1980
    stats = stats[stats.Year >= 1980]
    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat;
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict,
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    activePlayers = getActivePlayers(stats, 2016, 4)
    activePlayers.sort()
    # guarded removal: a missing name must not abort the experiment
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in activePlayers:
            activePlayers.remove(excluded)

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    metrics_to_use = [
        "PTS_G", "AST_G", "TOV_G", "3P_G", "PER_w",
        "FG%", "FT%", "TRB_G", "STL_G", "BLK_G",
    ]

    # (label printed before the pass, getWeitghts method or None for uniform)
    schemes = [
        ("uniform weights", None),
        ("mean - standardized weights", "mean"),
        ("var - standardized weights", "var"),
    ]

    print("* start experiment")
    # Target/donor objects of the last player processed by the previous pass.
    last_target = None
    last_donor = None
    for label, method in schemes:
        print("*******************")
        print(label)

        if method is None:
            weights = [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]
        else:
            # NOTE: the standardized weights are computed once per pass, from
            # the target/donor pair left over from the last player of the
            # previous pass, and then shared by every player. This keeps the
            # historical behaviour but no longer relies on leaked loop names.
            if last_target is None:
                raise ValueError("no active players to derive weights from")
            weights = getWeitghts(last_target, last_donor, [metrics_to_use],
                                  expSetup, method=method)[0]

        pred_frames = []
        true_frames = []
        for playerName in activePlayers:
            target = Target(playerName, allPivotedTableDict, df_year)
            donor = Donor(allPivotedTableDict, df_year)

            mrsc = mRSC(donor, target, probObservation=1)
            mrsc.fit_threshold(metrics_to_use, weights, 2016, pred_length=1,
                               threshold=threshold, setup=expSetup)

            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]

            pred_frames.append(pred)
            true_frames.append(true)
            last_target, last_donor = target, donor

        # one concat at the end instead of growing a frame inside the loop
        pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
        true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

        print()
        print("*** MAPE ***")
        # Zero truths are masked to NaN; the absolute value is taken over the
        # whole ratio so negative truths do not yield negative errors.
        mask = (true_all != 0)
        mape = np.abs((pred_all - true_all) / true_all[mask])
        print(mape.mean(axis=1))
        print("MAPE for all: ", mape.mean().mean())

        rmse = utils.rmse_2d(true_all, pred_all)
        print()
        print("*** RMSE ***")
        print(rmse)
        print("RMSE for all: ", rmse.mean())
def test():
    """Predict one year ahead of 2015 with a donor pool frozen at 2015.

    The donor tables are built from seasons up to ``pred_year``; the target
    tables additionally include the ``pred_interval`` following season(s).
    Prints the prediction table plus MAPE and RMSE summaries.
    """
    # ---- import data ----
    pred_year = 2015    # the year that we are living in
    pred_interval = 1   # we are making predictions for pred_year + 1

    print("*** importing data ***")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980
    players = players[players.year_start >= 1980]

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only after 1980
    stats = stats[stats.Year >= 1980]
    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("*** preparing data ***")
    ########### Donor ##########
    # filter stats by the year
    stats_donor = stats[stats.Year <= pred_year]
    donor_tables, _ = prepareData(stats_donor)
    donor = Donor(donor_tables)

    ########### Target ##########
    # filter stats by the year
    stats_target = stats[stats.Year <= pred_year + pred_interval]
    allPivotedTableDict, allMetrics = prepareData(stats_target)

    # ---- experiment setup ----
    # donorSetup   = [weighting (None / "normalize"), mat_form_method, skipNan]
    donorSetup = [None, "fixed", True]
    # denoiseSetup = [denoise_method, denoise_mat_method]
    denoiseSetup = ["SVD", "all"]
    regression_method = "pinv"
    threshold = 0.97
    verbose = False

    # all metrics are fitted jointly in a single model
    metrics_list = [allMetrics]

    ##############################################################
    # test 1
    ##############################################################
    playerNames = getActivePlayers(stats, pred_year, buffer=4)
    # guarded removal: a missing name must not abort the experiment
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in playerNames:
            playerNames.remove(excluded)

    pred_frames = []
    true_frames = []
    for playerName in playerNames:
        target = Target(playerName, allPivotedTableDict)

        mrsc = mRSC(donor, target, pred_interval, probObservation=1)

        player_pred = []
        player_true = []
        for metrics in metrics_list:
            mrsc.fit_threshold(metrics, pred_interval, threshold, donorSetup,
                               denoiseSetup, regression_method, verbose)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            player_pred.append(pred)
            player_true.append(true)

        pred_frames.append(pd.concat(player_pred, axis=0))
        true_frames.append(pd.concat(player_true, axis=0))

    # one concat at the end instead of growing a frame inside the loop
    all_pred = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    all_true = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    ###################
    print(all_pred)
    print(all_pred.shape)
    # Zero truths are masked to NaN; the absolute value is taken over the
    # whole ratio so negative truths do not yield negative errors.
    mask = (all_true != 0)
    mape = np.abs((all_pred - all_true) / all_true[mask])
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(all_true, all_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())
def test():
    """Sweep the number of retained singular values and print per-metric MAPE.

    For each value in ``singvals_list`` a single uniform-weight model is
    fitted for every active player on the 2016 prediction task.
    """
    # ---- import data ----
    print("importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980; copy so the column
    # assignment below does not write into a slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only after 1980
    stats = stats[stats.Year >= 1980]
    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("preparing data")
    # transform stats to a dictionary composed of df's for each stat;
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT", "3P"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict,
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    activePlayers = getActivePlayers(stats, 2016, 5)
    activePlayers.sort()

    metrics_to_use = [
        "PTS_G", "AST_G", "TOV_G", "PER_w", "FG%",
        "FT%", "3P%", "TRB_G", "STL_G", "BLK_G",
    ]
    weights = [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]
    expSetup = ["fixed", "SVD", "all", "pinv", False]

    singvals_list = [1, 2, 4, 8, 16, 32]

    print("start experiment")
    for singvals in singvals_list:
        pred_frames = []
        true_frames = []
        for playerName in activePlayers:
            target = Target(playerName, allPivotedTableDict, df_year)
            donor = Donor(allPivotedTableDict, df_year)

            mrsc = mRSC(donor, target, probObservation=1)
            mrsc.fit(metrics_to_use, weights, 2016, pred_length=1,
                     singvals=singvals, setup=expSetup)

            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]

            pred_frames.append(pred)
            true_frames.append(true)

        # one concat at the end instead of growing a frame inside the loop
        pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
        true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

        # Zero truths are masked to NaN; the absolute value is taken over the
        # whole ratio so negative truths do not yield negative errors.
        mask = (true_all != 0)
        mape = np.abs((pred_all - true_all) / true_all[mask])
        print(singvals)
        print(mape.mean(axis=1))
def test():
    """Prepare donor/target data from the raw CSVs and run annual_predictions.

    Steps: load and clean the players and season-stats tables, disambiguate
    duplicated player names, merge the two tables, build the donor object
    (seasons before ``pred_year``) and the target tables (seasons up to
    ``pred_year``), print the experiment configuration and call
    ``annual_predictions``.
    """
    # ---- user parameters ----
    starting_year = 1970
    min_games = 30
    train_year = 2015
    pred_interval = 1
    pred_year = train_year + pred_interval
    buffer = 4

    parameters = {'starting_year': starting_year,
                  'min_games': min_games,
                  'pred_year': pred_year,
                  'pred_interval': pred_interval,
                  'min_num_years_played': buffer}

    # ---- players dataframe ----
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # sort players by (name, year_start)
    players = players.sort_values(by=['name', 'year_start'])
    # filter players by years considered
    players = players[players.year_start >= starting_year]

    # ---- stats dataframe ----
    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    # fix the name* issue (raw string: '\*' is an invalid escape otherwise)
    stats = stats.replace(r'\*', '', regex=True)
    # sort players by (name, year)
    stats = stats.sort_values(by=['Player', 'Year'])

    # remove multiple rows for the same [Year, Player]: keep the rows that
    # were never duplicated plus the "TOT" rows
    totals = stats[stats.Tm == "TOT"]
    duplicates_removed = stats.drop_duplicates(subset=["Year", "Player"],
                                               keep=False)
    stats = pd.concat([duplicates_removed, totals],
                      axis=0).sort_values("Unnamed: 0")

    # filter players by years considered
    stats = stats[stats.Year >= starting_year]

    # ---- players + stats dataframes ----
    valid_players = list(set(stats.Player) & set(players.name))
    stats = stats[stats['Player'].isin(valid_players)]
    # copy: the names are edited in place below
    players = players[players['name'].isin(valid_players)].copy()

    # correct names in "players" dataframe
    duplicate_names = []
    # Iterate over a snapshot of the names: the frame is modified inside the
    # loop, and a snapshot keeps the iteration independent of those edits.
    for name in players['name'].tolist():
        same_name = players['name'] == name
        numrows = same_name.sum()
        if numrows == 2:
            duplicate_names.append(name)
            # rename the row(s) carrying the second entry's birth date
            second_birth_date = players.loc[same_name, 'birth_date'].iloc[1]
            players.loc[
                same_name & (players['birth_date'] == second_birth_date),
                'name'
            ] = name + ' Jr'
        elif numrows == 3:
            # three players share the name: drop them all
            players = players[players.name != name]

    # correct names in "stats" dataframe
    stats = fix_duplicates(stats, duplicate_names)

    # merge
    players = players.rename(columns={"name": "Player"})
    stats = pd.merge(stats, players, on='Player', how='left')

    # sanity check: keep seasons inside the player's career span
    # (copy so the column assignments below do not write into a slice)
    stats = stats[(stats.Year >= stats.year_start)
                  & (stats.Year <= stats.year_end)].copy()

    stats.Year = stats.Year.astype(int)
    stats.year_start = stats.year_start.astype(int)
    stats['year_count'] = stats.Year - stats.year_start

    # ---- donor setup ----
    # only consider years prior to 'pred_year'
    stats_donor = stats[stats.Year < pred_year]
    stats_donor = stats_donor.sort_values(by=['Player', 'Year'])
    # only consider years in which at least "min_games" games were played
    stats_donor = stats_donor[stats_donor.G >= min_games]
    # edit 'year_count'
    stats_donor = edit_year_count(stats_donor)
    # create donor object
    allPivotedTableDict_d, allMetrics = prepareData(stats_donor)
    donor = Donor(allPivotedTableDict_d)

    # ---- target setup ----
    # consider all years up to (and including) 'pred_year'
    stats_target = stats[stats.Year <= pred_year]
    # exclude years prior to 'pred_year' in which < 'min_games' were played
    idx = stats_target.loc[(stats_target.G < min_games)
                           & (stats_target.Year < pred_year)].index
    # non-inplace drop: avoids mutating a slice of 'stats'
    stats_target = stats_target.drop(idx)
    # edit 'year_count'
    stats_target = edit_year_count(stats_target)
    # create target dictionary of values
    allPivotedTableDict, allMetrics = prepareData(stats_target)

    # get target player names
    targetNames = get_active_players(stats_target, pred_year, buffer, min_games)
    targetNames.sort()

    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in targetNames:
            targetNames.remove(excluded)

    print("*** DATA PREP DONE! ***")

    predMetrics = ["PTS_G", "AST_G", "TOV_G", "FG%", "FT%", "3P_G",
                   "TRB_G", "STL_G", "BLK_G"]

    # ---- setup ----
    # user input
    donor_window_type = 'sliding'
    normalize_metric = 'variance'
    threshold = 0.97
    # e.g. helper_metrics = ['MP']
    helper_metrics = []
    num_top = 200
    top_players = getTopPlayers(stats, pred_year, 'PTS',
                                num_top).values.flatten().tolist()

    # setup
    donorSetup = [normalize_metric, donor_window_type, True]
    denoiseSetup = ["SVD", "all"]
    regression_method = "pinv"
    verbose = False

    # one model per predicted metric, each augmented with the helper metrics
    metrics_list = [[metric] + helper_metrics for metric in predMetrics]
    print(metrics_list)

    # All targets are tested. An earlier restriction to the intersection with
    # 'top_players' was immediately overwritten and has been removed.
    selected_targetNames = targetNames
    print("the number of targets tested: ", len(selected_targetNames))

    # ---- directory path ----
    # donor window type
    donor_window_label = donor_window_type + '/'

    # prediction method
    if helper_metrics:
        pred_method = 'mrsc/'
        helper_metrics_label = ''.join(
            helper_metric + '_' for helper_metric in helper_metrics) + '/'
    else:
        pred_method = 'rsc/'
        helper_metrics_label = ''

    # prediction year
    pred_year_label = str(pred_year) + '/'

    # metric normalizing type
    if normalize_metric is None:
        normalize_metric_label = 'no_normalization/'
    else:
        normalize_metric_label = normalize_metric + '/'

    # prediction length
    pred_length_label = str(pred_interval) + 'year/'

    dir_name = ('plots/' + pred_method + pred_year_label + donor_window_label
                + normalize_metric_label + helper_metrics_label
                + pred_length_label)
    print(dir_name)

    # ---- computation ----
    print("Computing...")
    print()
    print("*** SETUP ***")
    for key, value in parameters.items():
        print("{}: {}".format(key, value))
    print("donor window type: ", donor_window_type)
    print("normalization metric: ", normalize_metric)
    print("threshold: ", threshold)
    print("helper metrics: ", helper_metrics)
    print("denoise setup: ", denoiseSetup)
    print("regression method: ", regression_method)
    print("metrics list: ", metrics_list)
    print()
    print("Experiment: {}".format(dir_name))
    print()

    all_pred, all_true, all_R2, all_bench = annual_predictions(
        selected_targetNames, allPivotedTableDict, donor, pred_interval,
        metrics_list, predMetrics, threshold, donorSetup, denoiseSetup,
        regression_method, verbose, dir_name, top_players)
def test():
    """Predict 2016 stats with per-player metric groupings from getMetrics.

    For every active player ``getMetrics`` returns one metric group per entry
    of ``allMetrics``; a model is fitted per group and only the row belonging
    to the group's own metric is kept. The chosen groupings are pickled to
    ``metrics_all.pkl`` and MAPE / RMSE summaries are printed.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980; copy so the column
    # assignment below does not write into a slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only after 1980
    stats = stats[stats.Year >= 1980]
    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat;
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict,
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    pred_year = 2016

    # targets
    activePlayers = getActivePlayers(stats, pred_year, 4)
    activePlayers.sort()
    # guarded removal: a missing name must not abort the experiment
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in activePlayers:
            activePlayers.remove(excluded)

    # overall setup
    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    print("* start experiment")
    pred_frames = []
    true_frames = []
    metrics_all = []
    # The (still empty) list is written before the experiment starts,
    # presumably so an unwritable path fails early - TODO confirm.
    with open('metrics_all.pkl', 'wb') as f:
        pickle.dump(metrics_all, f, pickle.HIGHEST_PROTOCOL)

    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)

        metrics_list = getMetrics(target, donor, pred_year, allMetrics,
                                  threshold, expSetup, boundary="threshold")
        weights_list = getWeitghts(target, donor, metrics_list, expSetup,
                                   method="mean")
        metrics_all.append(metrics_list)

        mrsc = mRSC(donor, target, probObservation=1)

        player_pred = []
        player_true = []
        for i, metrics in enumerate(metrics_list):
            mrsc.fit_threshold(metrics, weights_list[i], pred_year,
                               pred_length=1, threshold=threshold,
                               setup=expSetup)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]

            # keep only the row of the metric this group was built for
            c = metrics.index(allMetrics[i])
            player_pred.append(pred.iloc[c:(c + 1), :])
            player_true.append(true.iloc[c:(c + 1), :])

        # An unused per-iteration MAPE over all players used to be computed
        # here; it has been dropped.
        pred_frames.append(
            pd.concat(player_pred, axis=0) if player_pred else pd.DataFrame())
        true_frames.append(
            pd.concat(player_true, axis=0) if player_true else pd.DataFrame())

    # one concat at the end instead of growing a frame inside the loop
    pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    ###################
    print("******** RESULT ********")
    # Zero truths are masked to NaN; the absolute value is taken over the
    # whole ratio so negative truths do not yield negative errors.
    mask = (true_all != 0)
    mape = np.abs((pred_all - true_all) / true_all[mask])
    print()
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    # (true, pred) argument order, as in the other experiments
    rmse = utils.rmse_2d(true_all, pred_all)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    with open('metrics_all.pkl', 'wb') as f:
        pickle.dump(metrics_all, f, pickle.HIGHEST_PROTOCOL)
def test():
    """Fit and plot the 2016 prediction for a single player.

    Three metric groups are fitted separately for "Ryan Anderson". For each
    metric the observed trajectory and the donor-based reconstruction are
    drawn on a 3 x 5 subplot grid (row = metric group, column = metric within
    the group), then MAPE and RMSE are printed.
    """
    # ---- import data ----
    print("importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started after 1980; copy so the column
    # assignment below does not write into a slice
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]
    # only after 1980
    stats = stats[stats.Year >= 1980]
    # without duplicated names
    # TODO: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("preparing data")
    # transform stats to a dictionary composed of df's for each stat;
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats, metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict,
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats, values="Year", index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    # overall setup
    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    metrics1 = ["PTS_G", "PER_w", "TRB_G", "3P_G"]
    metrics2 = ["FG%", "FT%"]
    metrics3 = ["BLK_G", "AST_G", "TOV_G", "STL_G"]
    metrics_list = [metrics1, metrics2, metrics3]

    playerName = "Ryan Anderson"

    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    weights_list = getWeitghts(target, donor, metrics_list, expSetup,
                               method="var")

    mrsc = mRSC(donor, target, probObservation=1)

    fig, axs = plt.subplots(3, 5)
    pred_parts = []
    true_parts = []
    for i, metrics in enumerate(metrics_list):
        mrsc.fit_threshold(metrics, weights_list[i], 2016, pred_length=1,
                           threshold=threshold, setup=expSetup)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]
        pred_parts.append(pred)
        true_parts.append(true)

        for j, metric in enumerate(metrics):
            true_trajectory = target.data[metric].dropna(
                axis='columns').iloc[:, :mrsc.total_index]
            # donor columns of the j-th metric, combined with the fitted
            # regression coefficients
            block_width = mrsc.model.total_index
            donor_block = mrsc.model.donor_data.iloc[
                :, j * block_width:((j + 1) * block_width)]
            pred_val = np.dot(donor_block.T, mrsc.model.beta).T
            pred_trajectory = pd.DataFrame(pred_val,
                                           columns=true_trajectory.columns,
                                           index=true_trajectory.index)

            # put the marker on the last point of the trajectory only
            markers_on = [true_trajectory.shape[1] - 1]

            axs[i, j].plot(true_trajectory.T, marker='o', color='red',
                           label='true')
            axs[i, j].plot(pred_trajectory.T, marker='o',
                           markevery=markers_on, color='blue',
                           label='prediction')
            axs[i, j].set_title(playerName + ": " + metric)

    for ax in axs.flat:
        ax.set(xlabel='years played in NBA')
    plt.subplots_adjust(wspace=0.5, hspace=0.5)
    plt.show()

    player_pred = pd.concat(pred_parts, axis=0)
    player_true = pd.concat(true_parts, axis=0)

    ###################
    # Zero truths are masked to NaN; the absolute value is taken over the
    # whole ratio so negative truths do not yield negative errors.
    mask = (player_true != 0)
    mape = np.abs((player_pred - player_true) / player_true[mask])
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(player_true, player_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())