Beispiel #1
0
def createTargetDonors(params, stats):
    """Build the donor object, the target pivot tables and the target names.

    Args:
        params: positional sequence laid out as
            ``[starting_year, min_games, min_years, pred_year, pred_interval]``.
            Only ``min_games``, ``min_years`` and ``pred_year`` are used here.
        stats: season-level DataFrame. This function reads its ``Year``,
            ``G`` and ``Player`` columns directly; the helpers it calls may
            need more.

    Returns:
        A tuple ``(donor, allPivotedTableDict, targetNames)`` where ``donor``
        is a ``Donor`` built from seasons strictly before ``pred_year``,
        ``allPivotedTableDict`` is the first value returned by
        ``prepareData`` for the target seasons (up to and including
        ``pred_year``), and ``targetNames`` is the sorted list returned by
        ``get_active_players``.
    """
    min_games = params[1]
    min_years = params[2]
    pred_year = params[3]

    # ---- donor setup ----
    # only consider years prior to 'pred_year'
    stats_donor = stats[stats.Year < pred_year]
    stats_donor = stats_donor.sort_values(by=['Player', 'Year'])

    # only consider years in which at least 'min_games' games were played
    stats_donor = stats_donor[stats_donor.G >= min_games]

    # edit 'year_count'
    stats_donor = edit_year_count(stats_donor)

    # create donor object (the metric list returned alongside is not needed)
    allPivotedTableDict_d, _ = prepareData(stats_donor)
    donor = Donor(allPivotedTableDict_d)

    # ---- target setup ----
    # consider all years up to (and including) 'pred_year'
    stats_target = stats[stats.Year <= pred_year]

    # exclude years prior to 'pred_year' in which < 'min_games' were played.
    # Filtering with a mask (instead of an in-place drop on a slice) avoids
    # pandas' chained-assignment warning and removes exactly the flagged rows.
    too_few_games = ((stats_target.G < min_games)
                     & (stats_target.Year < pred_year))
    stats_target = stats_target[~too_few_games]

    # edit 'year_count'
    stats_target = edit_year_count(stats_target)

    # create target dictionary of values
    allPivotedTableDict, _ = prepareData(stats_target)

    # get target player names
    targetNames = get_active_players(stats_target, pred_year, min_years,
                                     min_games)
    targetNames.sort()

    return donor, allPivotedTableDict, targetNames
def test():
    """Run the 2016 offense/defense mRSC experiment and print error metrics.

    Steps, as performed below:
      1. Load the player and season CSV files and keep rows from 1980 on.
      2. Build per-game, percentage and weighted metric tables and pivot them.
      3. For every active player, fit ``mRSC`` once on the offensive metrics
         and once on the defensive metrics (weights come from
         ``getWeitghts(..., method="mean")``) and collect predictions and
         true values.
      4. Print MAPE, RMSE and the players whose ``PTS_G`` MAPE exceeds 100.

    Takes no arguments and returns ``None``; all results are printed.
    """
    pred_year = 2016

    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started in 1980 or later; copy so that adding
    # the id column below does not write into a slice of the loaded frame
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only 1980 and later
    stats = stats[stats.Year >= 1980]

    # without duplicated names --> to do: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats,
                                                 metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    activePlayers = getActivePlayers(stats, pred_year, 4)
    activePlayers.sort()
    # these two players are excluded from the experiment; guard the removal
    # so a data set that lacks them does not raise ValueError
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in activePlayers:
            activePlayers.remove(excluded)

    offMetrics = ["PTS_G", "AST_G", "TOV_G", "3P_G", "PER_w", "FG%", "FT%"]
    defMetrics = ["TRB_G", "STL_G", "BLK_G"]
    metrics_list = [offMetrics, defMetrics]

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    print("* start experiment")
    # collect one column per player and concatenate once at the end; growing
    # a DataFrame with pd.concat inside the loop copies all data every time
    pred_frames = []
    true_frames = []
    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)
        weights_list = getWeitghts(target,
                                   donor,
                                   metrics_list,
                                   expSetup,
                                   method="mean")

        mrsc = mRSC(donor, target, probObservation=1)

        player_pred = []
        player_true = []
        # offensive metrics first, then defensive metrics
        for i, metrics in enumerate(metrics_list):
            mrsc.fit_threshold(metrics,
                               weights_list[i],
                               pred_year,
                               pred_length=1,
                               threshold=threshold,
                               setup=expSetup)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            player_pred.append(pred)
            player_true.append(true)

        pred_frames.append(pd.concat(player_pred, axis=0))
        true_frames.append(pd.concat(player_true, axis=0))

    pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    # ---- evaluation ----
    # zero true values are masked out (NaN) so they do not divide by zero;
    # the denominator is |true| so negative true values (e.g. a negative
    # PER) cannot produce negative "absolute" percentage errors
    mask = (true_all != 0)
    mape = np.abs(pred_all - true_all) / true_all[mask].abs()
    print()
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(true_all, pred_all)
    print()
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    # players whose points-per-game error ratio is above 100
    weirdo = mape.T[mape.T.PTS_G > 100].T
    print()
    print(weirdo)
    print(weirdo.shape)
Beispiel #3
0
def test():
    """Run the 2016 mRSC experiment with position-based donor pools.

    Each target player's donor pool is restricted to players who ever held a
    position in the same position group as the target's ``pred_year``
    position. Five metric groups are fitted per player with weights from
    ``getWeitghts(..., method="var")``; the MAPE per metric and overall is
    printed.

    Takes no arguments and returns ``None``.

    Raises:
        Exception: if a target's position belongs to none of the groups.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started in 1980 or later; copy so that adding
    # the id column below does not write into a slice of the loaded frame
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only 1980 and later
    stats = stats[stats.Year >= 1980]

    # without duplicated names --> to do: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats,
                                                 metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    pred_year = 2016
    activePlayers = getActivePlayers(stats, pred_year, 4)
    activePlayers.sort()
    # these two players are excluded from the experiment; guard the removal
    # so a data set that lacks them does not raise ValueError
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in activePlayers:
            activePlayers.remove(excluded)

    metrics1 = ["PTS_G", "PER_w"]
    metrics2 = ["3P_G", "FG%", "FT%"]
    metrics3 = ["TOV_G"]
    metrics4 = ["TRB_G", "STL_G"]
    metrics5 = ["AST_G", "BLK_G"]
    metrics_list = [metrics1, metrics2, metrics3, metrics4, metrics5]
    print(metrics_list)

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    #### position groups
    group1 = [
        "C", "SF", "PF", "C-PF", "PF-C", "C-SF", "SF-C", "SF-PF", "PF-SF"
    ]
    # the original list repeated "PG-SF" and never listed "PG-SG"
    group2 = [
        "SG", "SF", "PG", "SG-SF", "SF-SG", "SF-PG", "PG-SF", "SG-PG", "PG-SG"
    ]
    group3 = [
        "SG", "SF", "PF", "SG-SF", "SF-SG", "SF-PF", "PF-SF", "SG-PF", "PF-SG"
    ]
    # Groups are tested in this order and overlap, so a position is assigned
    # to the first group that contains it (group3 is only reached by
    # "SG-PF" and "PF-SG").
    position_groups = [group1, group2, group3]
    # the donor candidates per group do not depend on the target player, so
    # compute them once instead of once per player
    group_players = [
        stats[stats.Pos.isin(positions)].Player.unique()
        for positions in position_groups
    ]

    print("* start experiment")
    # collect one column per player and concatenate once at the end; growing
    # a DataFrame with pd.concat inside the loop copies all data every time
    pred_frames = []
    true_frames = []
    for playerName in activePlayers:
        playerPos = stats.loc[(stats.Player == playerName) &
                              (stats.Year == pred_year), "Pos"].values[-1]

        for positions, candidates in zip(position_groups, group_players):
            if playerPos in positions:
                # the original used group2's players for the group3 branch;
                # each group now maps to its own candidates
                players_in_group = candidates
                break
        else:
            raise Exception("invalid position")

        donorPivotedTableDict = filterDonor(allPivotedTableDict,
                                            players_in_group)

        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(donorPivotedTableDict, df_year)

        weights_list = getWeitghts(target,
                                   donor,
                                   metrics_list,
                                   expSetup,
                                   method="var")

        mrsc = mRSC(donor, target, probObservation=1)

        player_pred = []
        player_true = []
        for i, metrics in enumerate(metrics_list):
            mrsc.fit_threshold(metrics,
                               weights_list[i],
                               pred_year,
                               pred_length=1,
                               threshold=threshold,
                               setup=expSetup)

            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            player_pred.append(pred)
            player_true.append(true)

        pred_frames.append(pd.concat(player_pred, axis=0))
        true_frames.append(pd.concat(player_true, axis=0))

    pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    # ---- evaluation ----
    # zero true values are masked out (NaN) so they do not divide by zero;
    # the denominator is |true| so negative true values (e.g. a negative
    # PER) cannot produce negative "absolute" percentage errors
    mask = (true_all != 0)
    mape = np.abs(pred_all - true_all) / true_all[mask].abs()
    print()
    print("******* MAPE *******")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())
def test():
    """Compare uniform, mean- and var-standardized weights for 2016.

    Loads and prepares the data, then runs the same per-player mRSC
    fit/predict loop three times over all ten metrics: with uniform weights,
    with ``getWeitghts(..., method="mean")`` weights and with
    ``getWeitghts(..., method="var")`` weights. MAPE and RMSE are printed
    after each run.

    Takes no arguments and returns ``None``.
    """
    pred_year = 2016

    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started in 1980 or later; copy so that adding
    # the id column below does not write into a slice of the loaded frame
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only 1980 and later
    stats = stats[stats.Year >= 1980]

    # without duplicated names --> to do: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats,
                                                 metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    activePlayers = getActivePlayers(stats, pred_year, 4)
    activePlayers.sort()
    # these two players are excluded from the experiment; guard the removal
    # so a data set that lacks them does not raise ValueError
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in activePlayers:
            activePlayers.remove(excluded)

    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97
    metrics_to_use = [
        "PTS_G", "AST_G", "TOV_G", "3P_G", "PER_w", "FG%", "FT%", "TRB_G",
        "STL_G", "BLK_G"
    ]
    metrics_list = [metrics_to_use]

    def run_experiment(weights):
        """Fit/predict every active player with *weights* and print errors.

        Returns the ``(target, donor)`` pair built for the last player, or
        ``(None, None)`` when there are no active players.
        """
        # collect one column per player and concatenate once at the end;
        # growing a DataFrame with pd.concat in the loop copies all data
        pred_frames = []
        true_frames = []
        target = donor = None
        for playerName in activePlayers:
            target = Target(playerName, allPivotedTableDict, df_year)
            donor = Donor(allPivotedTableDict, df_year)

            mrsc = mRSC(donor, target, probObservation=1)
            mrsc.fit_threshold(metrics_to_use,
                               weights,
                               pred_year,
                               pred_length=1,
                               threshold=threshold,
                               setup=expSetup)

            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            pred_frames.append(pred)
            true_frames.append(true)

        pred_all = (pd.concat(pred_frames, axis=1)
                    if pred_frames else pd.DataFrame())
        true_all = (pd.concat(true_frames, axis=1)
                    if true_frames else pd.DataFrame())

        print()
        print("*** MAPE ***")
        # zero true values are masked out (NaN) to avoid dividing by zero;
        # the denominator is |true| so negative true values (e.g. a negative
        # PER) cannot produce negative "absolute" percentage errors
        mask = (true_all != 0)
        mape = np.abs(pred_all - true_all) / true_all[mask].abs()
        print(mape.mean(axis=1))
        print("MAPE for all: ", mape.mean().mean())
        rmse = utils.rmse_2d(true_all, pred_all)
        print()
        print("*** RMSE ***")
        print(rmse)
        print("RMSE for all: ", rmse.mean())
        return target, donor

    print("* start experiment")

    print("*******************")
    print("uniform weights")
    target, donor = run_experiment([1.] * len(metrics_to_use))

    # NOTE(review): as in the original code, the standardized weights below
    # are derived from the target/donor of the LAST player of the previous
    # run and then applied to every player — confirm this is intended.
    print("*******************")
    print("mean - standardized weights")
    weights = getWeitghts(target, donor, metrics_list, expSetup,
                          method="mean")[0]
    target, donor = run_experiment(weights)

    print("*******************")
    print("var - standardized weights")
    weights = getWeitghts(target, donor, metrics_list, expSetup,
                          method="var")[0]
    run_experiment(weights)
def test():
    """Predict ``pred_interval`` year(s) past 2015 and print MAPE/RMSE.

    The donor pool is built from seasons up to ``pred_year``; the target
    tables additionally contain the ``pred_interval`` following season(s).
    One mRSC model is fitted per active player over all metrics returned by
    ``prepareData``.

    Takes no arguments and returns ``None``; results are printed.
    """
    pred_year = 2015  # the year that we are living in
    pred_interval = 1  # number of years after pred_year to predict

    # ---- import data ----
    print("*** importing data ***")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started in 1980 or later
    players = players[players.year_start >= 1980]

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only 1980 and later
    stats = stats[stats.Year >= 1980]

    # without duplicated names --> to do: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("*** preparing data ***")

    ########### Donor ##########
    # filter stats by the year
    stats_donor = stats[stats.Year <= pred_year]
    donorPivotedTableDict, _ = prepareData(stats_donor)
    donor = Donor(donorPivotedTableDict)

    ########### Target ##########
    # filter stats by the year
    stats_target = stats[stats.Year <= pred_year + pred_interval]
    allPivotedTableDict, allMetrics = prepareData(stats_target)

    # ---- experiment setup ----
    # donorSetup = [weighting (None / "normalize"),
    #               mat_form_method ("fixed"),
    #               skipNan (bool)]
    donorSetup = [None, "fixed", True]
    # denoiseSetup = [denoise_method ("SVD"), denoise_mat_method ("all")]
    denoiseSetup = ["SVD", "all"]
    regression_method = "pinv"

    threshold = 0.97
    verbose = False

    # all metrics are fitted together as a single group; an offense/defense
    # split would be
    #   ["PTS_G","AST_G","TOV_G","PER_w","FG%","FT%","3P_G"] and
    #   ["TRB_G","STL_G","BLK_G"]
    metrics_list = [allMetrics]

    ##############################################################
    # test 1
    ##############################################################
    playerNames = getActivePlayers(stats, pred_year, buffer=4)
    # these two players are excluded from the experiment; guard the removal
    # so a data set that lacks them does not raise ValueError
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in playerNames:
            playerNames.remove(excluded)

    # collect one column per player and concatenate once at the end; growing
    # a DataFrame with pd.concat inside the loop copies all data every time
    pred_frames = []
    true_frames = []
    for playerName in playerNames:
        target = Target(playerName, allPivotedTableDict)

        mrsc = mRSC(donor, target, pred_interval, probObservation=1)

        player_pred = []
        player_true = []
        for metrics in metrics_list:
            mrsc.fit_threshold(metrics, pred_interval, threshold, donorSetup,
                               denoiseSetup, regression_method, verbose)
            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            player_pred.append(pred)
            player_true.append(true)
        pred_frames.append(pd.concat(player_pred, axis=0))
        true_frames.append(pd.concat(player_true, axis=0))

    all_pred = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    all_true = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    # ---- evaluation ----
    print(all_pred)
    print(all_pred.shape)
    # zero true values are masked out (NaN) so they do not divide by zero;
    # the denominator is |true| so negative true values cannot produce
    # negative "absolute" percentage errors
    mask = (all_true != 0)
    mape = np.abs(all_pred - all_true) / all_true[mask].abs()
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(all_true, all_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())
Beispiel #6
0
def test():
    """Sweep the number of singular values kept by ``mRSC.fit`` for 2016.

    After loading and preparing the data, every value in ``singvals_list``
    is used to fit one model per active player over ten metrics with uniform
    weights; the per-metric MAPE is printed for each value.

    Takes no arguments and returns ``None``.
    """
    pred_year = 2016

    # ---- import data ----
    print("importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started in 1980 or later; copy so that adding
    # the id column below does not write into a slice of the loaded frame
    players = players[players.year_start >= 1980].copy()
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only 1980 and later
    stats = stats[stats.Year >= 1980]

    # without duplicated names --> to do: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    # (here "3P" is treated as a percentage metric, not a per-game one)
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT", "3P"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats,
                                                 metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    activePlayers = getActivePlayers(stats, pred_year, 5)
    activePlayers.sort()

    metrics_to_use = [
        "PTS_G", "AST_G", "TOV_G", "PER_w", "FG%", "FT%", "3P%", "TRB_G",
        "STL_G", "BLK_G"
    ]

    # uniform weights, one per metric
    weights = [1.] * len(metrics_to_use)
    expSetup = ["fixed", "SVD", "all", "pinv", False]

    singvals_list = [1, 2, 4, 8, 16, 32]

    print("start experiment")
    for singvals in singvals_list:
        # collect one column per player and concatenate once per sweep value;
        # growing a DataFrame with pd.concat in the loop copies all data
        pred_frames = []
        true_frames = []
        for playerName in activePlayers:
            target = Target(playerName, allPivotedTableDict, df_year)
            donor = Donor(allPivotedTableDict, df_year)

            mrsc = mRSC(donor, target, probObservation=1)
            mrsc.fit(metrics_to_use,
                     weights,
                     pred_year,
                     pred_length=1,
                     singvals=singvals,
                     setup=expSetup)

            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]
            pred_frames.append(pred)
            true_frames.append(true)

        pred_all = (pd.concat(pred_frames, axis=1)
                    if pred_frames else pd.DataFrame())
        true_all = (pd.concat(true_frames, axis=1)
                    if true_frames else pd.DataFrame())

        # zero true values are masked out (NaN) to avoid dividing by zero;
        # the denominator is |true| so negative true values (e.g. a negative
        # PER) cannot produce negative "absolute" percentage errors
        mask = (true_all != 0)
        mape = np.abs(pred_all - true_all) / true_all[mask].abs()
        print(singvals)
        print(mape.mean(axis=1))
Beispiel #7
0
def test():
    """Prepare donor/target data and run ``annual_predictions`` for one year.

    The function loads and cleans the player and season CSV files,
    disambiguates duplicated player names, builds the donor pool (seasons
    before ``pred_year`` with at least ``min_games`` games) and the target
    tables (seasons up to and including ``pred_year``), prints the
    experiment configuration and finally calls ``annual_predictions``.

    Takes no arguments and returns ``None``.
    """
    # ---- user parameters ----
    starting_year = 1970
    min_games = 30
    train_year = 2015
    pred_interval = 1
    pred_year = train_year + pred_interval
    buffer = 4

    # only used for the configuration print-out further down
    parameters = {'starting_year': starting_year,
                  'min_games': min_games,
                  'pred_year': pred_year,
                  'pred_interval': pred_interval,
                  'min_num_years_played': buffer}

    # ---- players dataframe ----
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")

    # sort players by (name, year_start)
    players = players.sort_values(by=['name', 'year_start'])

    # filter players by years considered
    players = players[players.year_start >= starting_year]

    # ---- stats dataframe ----
    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")

    # fix the name* issue (raw string: '\*' is an invalid escape sequence in
    # a normal string literal)
    stats = stats.replace(r'\*', '', regex=True)

    # sort players by (name, year)
    stats = stats.sort_values(by=['Player', 'Year'])

    # remove multiple rows for the same [Year, Player]: keep the rows that
    # were unique to begin with plus the "TOT" (team total) rows
    totals = stats[stats.Tm == "TOT"]
    duplicates_removed = stats.drop_duplicates(subset=["Year", "Player"], keep=False)
    stats = pd.concat([duplicates_removed, totals], axis=0).sort_values("Unnamed: 0")

    # filter players by years considered
    stats = stats[stats.Year >= starting_year]

    # ---- players + stats dataframes ----
    valid_players = list(set(stats.Player) & set(players.name))
    stats = stats[stats['Player'].isin(valid_players)]
    players = players[players['name'].isin(valid_players)]

    # correct names in "players" dataframe: for a name shared by exactly two
    # rows, the second row gets a ' Jr' suffix; a name shared by exactly
    # three rows is dropped altogether
    duplicate_names = []
    for name in players.name:
        numrows = len(players[players['name'] == name])
        if numrows == 2:
            duplicate_names.append(name)
            birth_dates = players.loc[players['name'] == name, 'birth_date']
            for i, birth_date in enumerate(birth_dates):
                if i == 1:
                    players.loc[(players['name'] == name) & (players['birth_date'] == birth_date), 'name'] = name + ' Jr'
        elif numrows == 3:
            players = players[players.name != name]

    # correct names in "stats" dataframe
    stats = fix_duplicates(stats, duplicate_names)

    # merge
    players = players.rename(columns={"name": "Player"})
    stats = pd.merge(stats, players, on='Player', how='left')
    # sanity check: keep only seasons inside the player's career span
    stats = stats[(stats.Year >= stats.year_start) & (stats.Year <= stats.year_end)]

    stats.Year = stats.Year.astype(int)
    stats.year_start = stats.year_start.astype(int)
    stats['year_count'] = stats.Year - stats.year_start

    # ---- donor setup ----
    # only consider years prior to 'pred_year'
    stats_donor = stats[stats.Year < pred_year]
    stats_donor = stats_donor.sort_values(by=['Player', 'Year'])

    # only consider years in which at least "min_games" number of games were played
    stats_donor = stats_donor[stats_donor.G >= min_games]

    # edit 'year_count'
    stats_donor = edit_year_count(stats_donor)

    # create donor object (the metric list returned alongside is not needed)
    allPivotedTableDict_d, _ = prepareData(stats_donor)
    donor = Donor(allPivotedTableDict_d)

    # ---- target setup ----
    # consider all years up to (and including) 'pred_year'
    stats_target = stats[stats.Year <= pred_year]

    # exclude years prior to 'pred_year' in which < 'min_games' were played.
    # Filtering with a mask (instead of an in-place drop on a slice) avoids
    # pandas' chained-assignment warning.
    too_few_games = (stats_target.G < min_games) & (stats_target.Year < pred_year)
    stats_target = stats_target[~too_few_games]

    # edit 'year_count'
    stats_target = edit_year_count(stats_target)

    # create target dictionary of values
    allPivotedTableDict, _ = prepareData(stats_target)

    # get target player names
    targetNames = get_active_players(stats_target, pred_year, buffer, min_games)
    targetNames.sort()

    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in targetNames:
            targetNames.remove(excluded)

    print("*** DATA PREP DONE! ***")

    predMetrics = ["PTS_G","AST_G","TOV_G","FG%","FT%","3P_G","TRB_G","STL_G","BLK_G"]

    # ---- setup ----
    # user input
    donor_window_type = 'sliding'
    normalize_metric = 'variance'
    threshold = 0.97
    # e.g. ['MP']; an empty list means each metric is predicted on its own
    helper_metrics = []
    num_top = 200
    top_players = getTopPlayers(stats, pred_year, 'PTS', num_top).values.flatten().tolist()

    # setup
    donorSetup = [normalize_metric, donor_window_type, True]
    denoiseSetup = ["SVD", "all"]
    regression_method = "pinv"
    verbose = False
    metrics_list = [[metric] + helper_metrics for metric in predMetrics]
    print(metrics_list)

    # All targets are tested. The original code first intersected the
    # targets with top_players and then immediately overwrote the result;
    # that dead assignment has been removed.
    selected_targetNames = targetNames

    print("the number of targets tested: ", len(selected_targetNames))

    # ---- directory path ----
    # donor window type
    donor_window_label = donor_window_type + '/'

    # prediction method
    if helper_metrics:
        pred_method = 'mrsc/'
        helper_metrics_label = ''.join(
            helper_metric + '_' for helper_metric in helper_metrics) + '/'
    else:
        pred_method = 'rsc/'
        helper_metrics_label = ''

    # prediction year
    pred_year_label = str(pred_year) + '/'

    # metric normalizing type
    if normalize_metric is None:
        normalize_metric_label = 'no_normalization/'
    else:
        normalize_metric_label = normalize_metric + '/'

    # prediction length
    pred_length_label = str(pred_interval) + 'year/'

    # NOTE: the singular value threshold is not encoded in the path
    dir_name = 'plots/' + pred_method + pred_year_label + donor_window_label + normalize_metric_label + helper_metrics_label + pred_length_label

    print(dir_name)

    # ---- computation ----
    print("Computing...")
    print()

    print("*** SETUP ***")
    for key, value in parameters.items():
        print("{}: {}".format(key, value))

    print("donor window type: ", donor_window_type)
    print("normalization metric: ", normalize_metric)
    print("threshold: ", threshold)
    print("helper metrics: ", helper_metrics)
    print("denoise setup: ", denoiseSetup)
    print("regression method: ", regression_method)
    print("metrics list: ", metrics_list)
    print()
    print("Experiment: {}".format(dir_name))
    print()

    all_pred, all_true, all_R2, all_bench = annual_predictions(selected_targetNames, allPivotedTableDict, donor, pred_interval, metrics_list, predMetrics,
                       threshold, donorSetup, denoiseSetup, regression_method, verbose, dir_name, top_players)
Beispiel #8
0
def test():
    """Run a one-year-ahead mRSC experiment for a set of players and report errors.

    Steps performed:

    1. Read the player and season CSV files and keep players/seasons from
       1980 onwards.
    2. Build per-game, percentage and weighted metric tables and pivot them
       into one table per metric.
    3. For every player returned by ``getActivePlayers(stats, pred_year, 4)``
       (minus two explicitly excluded names), select metric groups with
       ``getMetrics``, weight them with ``getWeitghts`` and fit one ``mRSC``
       model per group. From the i-th group only the row belonging to
       ``allMetrics[i]`` is kept as that metric's prediction / ground truth.
    4. Print MAPE and RMSE over all players and pickle the selected metric
       groups to ``metrics_all.pkl``.

    The function takes no arguments and returns nothing; its results are the
    printed report and the pickle file written to the working directory.
    """
    # ---- import data ----
    print("* importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started in 1980 or later
    players = players[players.year_start >= 1980]
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only seasons from 1980 onwards
    stats = stats[stats.Year >= 1980]

    # without duplicated names --> to do: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("* preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats,
                                                 metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    pred_year = 2016
    # targets
    activePlayers = getActivePlayers(stats, pred_year, 4)
    activePlayers.sort()
    # These two players are deliberately left out of the target set. Guard
    # the removal so a changed data set (name absent) does not abort the run.
    for excluded in ("Kevin Garnett", "Kobe Bryant"):
        if excluded in activePlayers:
            activePlayers.remove(excluded)

    # overall setup
    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    print("* start experiment")
    # One single-column frame per player; joined once after the loop instead
    # of growing a DataFrame (and concatenating onto an empty one) each pass.
    pred_frames = []
    true_frames = []
    metrics_all = []
    # Written before the loop as well, so the file exists (holding an empty
    # list) even if the experiment below fails part-way through.
    with open('metrics_all.pkl', 'wb') as f:
        pickle.dump(metrics_all, f, pickle.HIGHEST_PROTOCOL)
    for playerName in activePlayers:
        target = Target(playerName, allPivotedTableDict, df_year)
        donor = Donor(allPivotedTableDict, df_year)

        metrics_list = getMetrics(target,
                                  donor,
                                  pred_year,
                                  allMetrics,
                                  threshold,
                                  expSetup,
                                  boundary="threshold")
        weights_list = getWeitghts(target,
                                   donor,
                                   metrics_list,
                                   expSetup,
                                   method="mean")

        metrics_all.append(metrics_list)

        mrsc = mRSC(donor, target, probObservation=1)

        pred_rows = []
        true_rows = []
        for i, metrics in enumerate(metrics_list):
            mrsc.fit_threshold(metrics,
                               weights_list[i],
                               pred_year,
                               pred_length=1,
                               threshold=threshold,
                               setup=expSetup)

            pred = mrsc.predict()
            true = mrsc.getTrue()
            pred.columns = [playerName]
            true.columns = [playerName]

            # Keep only the row of the metric this group was selected for.
            c = metrics.index(allMetrics[i])
            pred_rows.append(pred.iloc[c:(c + 1), :])
            true_rows.append(true.iloc[c:(c + 1), :])

        pred_frames.append(
            pd.concat(pred_rows, axis=0) if pred_rows else pd.DataFrame())
        true_frames.append(
            pd.concat(true_rows, axis=0) if true_rows else pd.DataFrame())

    # pd.concat rejects an empty list, so fall back to an empty frame.
    pred_all = pd.concat(pred_frames, axis=1) if pred_frames else pd.DataFrame()
    true_all = pd.concat(true_frames, axis=1) if true_frames else pd.DataFrame()

    # ---- results ----
    print("******** RESULT ********")
    # Entries whose true value is 0 become NaN and are skipped by mean().
    # Divide by the magnitude of the true value so that a negative ground
    # truth cannot produce a negative "absolute" percentage error.
    mask = (true_all != 0)
    mape = np.abs(pred_all - true_all) / true_all[mask].abs()

    print()
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(pred_all, true_all)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())

    with open('metrics_all.pkl', 'wb') as f:
        pickle.dump(metrics_all, f, pickle.HIGHEST_PROTOCOL)
def test():
    """Fit mRSC for a single player on three metric groups, plot and score it.

    Steps performed:

    1. Read the player and season CSV files and keep players/seasons from
       1980 onwards.
    2. Build per-game, percentage and weighted metric tables and pivot them
       into one table per metric.
    3. For the player "Ryan Anderson", weight three fixed metric groups with
       ``getWeitghts(method="var")`` and fit one ``mRSC`` model per group for
       ``pred_year``.
    4. Plot, per metric, the player's true trajectory against the trajectory
       reconstructed from the donor data and the fitted coefficients.
    5. Print MAPE and RMSE of the predictions.

    The function takes no arguments and returns nothing; it shows a
    matplotlib figure (``plt.show()``) and prints the error report.
    """
    # ---- import data ----
    print("importing data")
    players = pd.read_csv("../data/nba-players-stats/player_data.csv")
    # only choose players who started in 1980 or later
    players = players[players.year_start >= 1980]
    players["player_id"] = range(0, len(players.name))  # assign id

    stats = pd.read_csv("../data/nba-players-stats/Seasons_Stats.csv")
    stats = stats[stats.Player.isin(players.name)]

    # only seasons from 1980 onwards
    stats = stats[stats.Year >= 1980]

    # without duplicated names --> to do: how to distinguish multiple players with the same name
    stats = removeDuplicated(players, stats)
    stats.Year = stats.Year.astype(int)
    stats.year_count = stats.year_count.astype(int)

    print("preparing data")
    # transform stats to a dictionary composed of df's for each stat
    # the stats are re-calculated to get one stat for each year
    metricsPerGameColNames = ["PTS", "AST", "TOV", "TRB", "STL", "BLK", "3P"]
    metricsPerGameDict = getMetricsPerGameDict(stats, metricsPerGameColNames)

    metricsPerCentColNames = ["FG", "FT"]
    metricsPerCentDict = getMetricsPerCentDict(stats, metricsPerCentColNames)

    metricsWeightedColNames = ["PER"]
    metricsWeightedDict = getMetricsWeightedDict(stats,
                                                 metricsWeightedColNames)

    allMetricsDict = {
        **metricsPerGameDict,
        **metricsPerCentDict,
        **metricsWeightedDict
    }
    allPivotedTableDict = getPivotedTableDict(allMetricsDict)
    allMetrics = list(allMetricsDict.keys())

    # this matrix will be used to mask the table
    df_year = pd.pivot_table(stats,
                             values="Year",
                             index="Player",
                             columns="year_count")

    # ---- experiment setup ----
    pred_year = 2016
    expSetup = ["sliding", "SVD", "all", "pinv", False]
    threshold = 0.97

    metrics1 = ["PTS_G", "PER_w", "TRB_G", "3P_G"]
    metrics2 = ["FG%", "FT%"]
    metrics3 = ["BLK_G", "AST_G", "TOV_G", "STL_G"]

    metrics_list = [metrics1, metrics2, metrics3]

    playerName = "Ryan Anderson"

    target = Target(playerName, allPivotedTableDict, df_year)
    donor = Donor(allPivotedTableDict, df_year)

    weights_list = getWeitghts(target,
                               donor,
                               metrics_list,
                               expSetup,
                               method="var")

    mrsc = mRSC(donor, target, probObservation=1)

    # One row of axes per metric group, one column per metric in the group.
    # The grid must be at least len(metrics_list) x (largest group size).
    fig, axs = plt.subplots(3, 5)
    pred_frames = []
    true_frames = []
    for i, metrics in enumerate(metrics_list):
        mrsc.fit_threshold(metrics,
                           weights_list[i],
                           pred_year,
                           pred_length=1,
                           threshold=threshold,
                           setup=expSetup)
        pred = mrsc.predict()
        true = mrsc.getTrue()
        pred.columns = [playerName]
        true.columns = [playerName]
        pred_frames.append(pred)
        true_frames.append(true)

        for j, metric in enumerate(metrics):
            # NOTE(review): this reads `mrsc.total_index` while the slice
            # below reads `mrsc.model.total_index` -- confirm both refer to
            # the same value.
            true_trajectory = target.data[metric].dropna(
                axis='columns').iloc[:, :mrsc.total_index]

            # Take the j-th block of `total_index` donor columns and combine
            # it with the fitted coefficients (presumably the donor columns
            # are laid out metric by metric -- verify against the model).
            block_start = j * mrsc.model.total_index
            block_stop = (j + 1) * mrsc.model.total_index
            pred_val = np.dot(
                mrsc.model.donor_data.iloc[:, block_start:block_stop].T,
                mrsc.model.beta).T
            pred_trajectory = pd.DataFrame(pred_val,
                                           columns=true_trajectory.columns,
                                           index=true_trajectory.index)

            # Put a marker only on the last point of the predicted line.
            markers_on = [true_trajectory.shape[1] - 1]

            axs[i, j].plot(true_trajectory.T,
                           marker='o',
                           color='red',
                           label='true')
            axs[i, j].plot(pred_trajectory.T,
                           marker='o',
                           markevery=markers_on,
                           color='blue',
                           label='prediction')
            axs[i, j].set_title(playerName + ": " + metric)

    player_pred = pd.concat(pred_frames, axis=0)
    player_true = pd.concat(true_frames, axis=0)

    for ax in axs.flat:
        ax.set(xlabel='years played in NBA')
    plt.subplots_adjust(wspace=0.5, hspace=0.5)
    plt.show()

    # ---- results ----
    # Entries whose true value is 0 become NaN and are skipped by mean().
    # Divide by the magnitude of the true value so that a negative ground
    # truth cannot produce a negative "absolute" percentage error.
    mask = (player_true != 0)
    mape = np.abs(player_pred - player_true) / player_true[mask].abs()
    print("*** MAPE ***")
    print(mape.mean(axis=1))
    print("MAPE for all: ", mape.mean().mean())

    rmse = utils.rmse_2d(player_true, player_pred)
    print()
    print("*** RMSE ***")
    print(rmse)
    print("RMSE for all: ", rmse.mean())