Exemple #1
0
def save_lineup_data_for_season(year, season_type="regular_season"):
    """Parse every stored game of a season into possessions and upsert them.

    Each document of ``db.games[season_type][<year string>]`` is run through
    ``get_team_info`` and ``get_lineups_and_possession_info``; every possession
    returned is written to ``db.possessions[season_type][<year string>]``,
    keyed by game id and event number so that re-running the function
    replaces documents instead of duplicating them.

    Args:
        year: Season identifier; handed to
            ``common_utils.construct_year_string`` and to the parser as-is.
        season_type: Name of the sub-collection to read games from and write
            possessions to. Defaults to ``"regular_season"``.
    """
    year_string = common_utils.construct_year_string(year)
    games_collection = db.games[season_type][year_string]
    possessions_collection = db.possessions[season_type][year_string]

    print('creating index')
    # The unique compound index backs the upsert filter used below.
    possessions_collection.create_index(
        [('possession_metadata.event_num', pymongo.ASCENDING),
         ('possession_metadata.gid', pymongo.ASCENDING)],
        unique=True)
    print('index created (or already existed)')

    # Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0.
    num_games = games_collection.count_documents({})
    prev_time = time.perf_counter()
    for index, game in enumerate(games_collection.find()):
        game_index = game["game_index"]
        print("processing {}...".format(game_index))
        # Use the requested season type; this was previously hard-coded to
        # "playoffs" regardless of the argument.
        team_info = get_team_info(year_string, game_index, season_type)
        data = get_lineups_and_possession_info(game_index, game["pbp"],
                                               team_info, year)
        for possession in data:
            metadata = possession['possession_metadata']
            # replace_one(..., upsert=True) is the supported equivalent of the
            # removed Collection.update(filter, document, True).
            possessions_collection.replace_one(
                {
                    'possession_metadata.gid': metadata['gid'],
                    'possession_metadata.event_num': metadata['event_num']
                }, possession, upsert=True)
        if index % 10 == 0:
            curr_time = time.perf_counter()
            print("time elapsed: {}".format(curr_time - prev_time))
            prev_time = curr_time
            print("Processed {} / {} games for {} season".format(
                index + 1, num_games, year_string))
Exemple #2
0
def deal_with_traded_players(year, season_type):
    """Fill in combined RAPM values on the "TOT" rows of traded players.

    For every ``db.players`` document of the season whose team is ``"TOT"``,
    the player's per-team documents (same name stub and season, team other
    than ``"TOT"``) are combined into a possession-weighted average of their
    offensive and defensive RAPM, and the result is written back onto the
    ``"TOT"`` document.

    Players listed in the module-level ``weird_players`` are skipped, as are
    players whose per-team rows add up to zero possessions (nothing to
    average; this previously raised ``ZeroDivisionError``).

    Args:
        year: Season; converted with ``int`` for the database filters.
        season_type: Suffix/prefix used to build the field names, e.g.
            ``"<season_type>_possessions"`` and ``"orapm_<season_type>"``.
    """
    year_as_int = int(year)
    # The field names depend only on the season type, so build them once.
    possessions_key = "{}_possessions".format(season_type)
    orapm_key = "orapm_{}".format(season_type)
    drapm_key = "drapm_{}".format(season_type)
    rapm_key = "rapm_{}".format(season_type)

    traded_players = db.players.find(
        filter={
            "player_index.team": "TOT",
            "player_index.season": year_as_int
        }
    )
    for player in traded_players:
        print(player["player"])
        if player["player"] in weird_players:
            continue
        name_stub = player["player_index"]["name_stub"]
        each_team = db.players.find(
            filter={
                "player_index.name_stub": name_stub,
                "player_index.season": year_as_int,
                "player_index.team": {"$ne": "TOT"}
            }
        )

        player_update_data = {
            "total_possessions": 0,
            "orapm_weighted_sum": 0,
            "drapm_weighted_sum": 0
        }
        for player_data in each_team:
            stint_possessions = player_data[possessions_key]
            print("\t", player_data["player"], player_data["team_id"], stint_possessions, player_data[orapm_key])
            player_update_data["total_possessions"] += stint_possessions
            # technically possession weights here should be divided by 2 but after the weighted average it won't matter
            player_update_data["orapm_weighted_sum"] += stint_possessions * player_data[orapm_key]
            player_update_data["drapm_weighted_sum"] += stint_possessions * player_data[drapm_key]

        total_possessions = player_update_data["total_possessions"]
        if total_possessions == 0:
            # No per-team rows (or no recorded possessions): a weighted
            # average is undefined, so leave the "TOT" row untouched.
            print("\tno {} to combine, skipping".format(possessions_key))
            continue

        player_update_data["orapm_combined"] = player_update_data["orapm_weighted_sum"] / total_possessions
        player_update_data["drapm_combined"] = player_update_data["drapm_weighted_sum"] / total_possessions
        player_update_data["rapm_combined"] = player_update_data["orapm_combined"] + player_update_data["drapm_combined"]

        pprint(player_update_data)

        db.players.update_one(
            filter={
                "player_index.name_stub": name_stub,
                "player_index.team": "TOT",
                "player_index.season": year_as_int
            },
            update={
                "$set": {
                    possessions_key: total_possessions,
                    orapm_key: player_update_data["orapm_combined"],
                    drapm_key: player_update_data["drapm_combined"],
                    rapm_key: player_update_data["rapm_combined"]
                }
            }
        )
Exemple #3
0
def store_games_data(year, season_type):
    """Cache each game's home/away team codes on the season document.

    Reads every game from ``db.games[season_type][<year string>]``, translates
    the team codes through ``common_utils.team_codes_to_bball_ref_codes`` when
    a translation exists, and stores the resulting ``game_index -> {home,
    away}`` mapping under ``"<season_type>_games_data"`` on the matching
    ``db.seasons`` document (created if absent).

    Args:
        year: Season identifier passed to ``construct_year_string``.
        season_type: Games sub-collection name; also prefixes the stored key.
    """

    def translate(code):
        # Codes without an entry in the translation table pass through as-is.
        table = common_utils.team_codes_to_bball_ref_codes
        return table[code] if code in table else code

    season_label = common_utils.construct_year_string(year)
    print("storing games data...")

    matchups = {}
    for game in db.games[season_type][season_label].find():
        home = translate(game["home"])
        away = translate(game["away"])
        matchups[game["game_index"]] = {"home": home, "away": away}
    print("stored games data")

    # One season document per year string.
    db.seasons.create_index([('year_string', pymongo.ASCENDING)], unique=True)

    field_name = "{}_games_data".format(season_type)
    db.seasons.update_one(
        filter={"year_string": season_label},
        update={"$set": {field_name: matchups}},
        upsert=True
    )
Exemple #4
0
def _bball_ref_team_code(team_code, season):
    """Return the team code under which player records are looked up."""
    # Washington is looked up as "WSB" for seasons before 1998.
    if team_code == "WAS" and season < 1998:
        return "WSB"
    return team_code


def store_player_and_possession_data_for_matrix(year, season_type):
    """Assign matrix indexes to players and record their possession counts.

    Walks the stored possessions of the season (capped by the module-level
    ``limit``), and for every ``"<player>_<team>"`` string in the home and
    away lineups:

    * the first time a player is seen and ``common_utils.player_exists``
      confirms it, the player gets the next index and a possession count of 1;
    * every later sighting increments the possession count;
    * players that cannot be confirmed are reported once. For playoffs they
      are additionally appended to the season's stored ``player_info`` with
      the next free index, unless they are already present there.

    Afterwards each confirmed player's count is written to
    ``"<season_type>_possessions"`` on the matching ``db.players`` document,
    and for the regular season the collected ``player_info`` mapping is
    stored on the ``db.seasons`` document.

    Args:
        year: Season; used via ``construct_year_string`` and ``int``.
        season_type: ``"regular_season"`` or ``"playoffs"``; selects the
            possessions sub-collection and the behaviours described above.
    """
    player_info = {}
    year_string = common_utils.construct_year_string(year)
    year_as_int = int(year)
    possessions_collection = db.possessions[season_type][year_string]
    possessions = possessions_collection.find().limit(limit)
    # Total number of stored possessions (the removed Cursor.count() ignored
    # the cursor limit as well); only used for progress output.
    num_possessions = possessions_collection.count_documents({})
    season_doc = db.seasons.find_one({"year_string": year_string})
    games_data = season_doc["{}_games_data".format(season_type)]

    # player info should already be calculated, this is to get index
    registered_players = {}
    if season_type == "playoffs":
        registered_players = season_doc["player_info"]

    # Players that could not be confirmed; tracked so each is looked up,
    # reported and (for playoffs) registered only once per run.
    unresolved = set()

    for count, possession in enumerate(possessions, start=1):
        if count % 20000 == 0:
            print("Poss count {}/{}".format(count, num_possessions))
        lineups_with_team_names = convert_lineups_to_player_team_strings(possession, games_data)
        for lineup_type in ("home_lineup", "away_lineup"):
            for player_team in lineups_with_team_names[lineup_type]:
                player_name, team_code = player_team.split("_")
                if player_name == "None":
                    continue

                # player was already seen in a lineup
                if player_team in player_info:
                    player_info[player_team]["possessions"] += 1
                    continue
                if player_team in unresolved:
                    continue

                stub_name = common_utils.player_to_stub_name(player_name)
                team_code = _bball_ref_team_code(team_code, year_as_int)

                # found the player in bball ref database
                if common_utils.player_exists(stub_name, team_code, year_as_int):
                    player_info[player_team] = {
                        "index": len(player_info),
                        # This possession is the player's first one.
                        "possessions": 1,
                        "stub_name": stub_name
                    }
                    continue

                unresolved.add(player_team)
                print("{}_{}_{} not found.".format(stub_name, team_code, year_as_int))
                if season_type != "playoffs":
                    continue

                print("\n{}_{}_{} played in playoffs and not in regular season.".format(stub_name, team_code, year_as_int))
                print("[{}][{}]: {}".format(
                    possession["possession_metadata"]["gid"],
                    possession["possession_metadata"]["event_num"],
                    possession["possession_metadata"]["message"]
                ))
                print("\tHome", lineups_with_team_names["home_lineup"])
                print("\tAway", lineups_with_team_names["away_lineup"])
                # Keep an index that was handed out earlier (e.g. by a
                # previous run) instead of assigning a new one.
                if player_team in registered_players:
                    continue
                new_entry = {
                    "index": len(registered_players),
                    "possessions": 0,
                    "stub_name": stub_name
                }
                db.seasons.find_one_and_update(
                    {"year_string": year_string},
                    {"$set": {"player_info.{}".format(player_team): new_entry}}
                )
                registered_players[player_team] = new_entry

    possessions_field = "{}_possessions".format(season_type)
    for player_team, info in player_info.items():
        _, team_code = player_team.split("_")
        db.players.update_one(
            filter={
                "player_index.name_stub": info["stub_name"],
                # Same renamed code that the existence check above used.
                "player_index.team": _bball_ref_team_code(team_code, year_as_int),
                "player_index.season": year_as_int
            },
            update={
                "$set": {
                    possessions_field: info["possessions"]
                }
            },
            upsert=True
        )

    if season_type == "regular_season":
        db.seasons.find_one_and_update(
            {"year_string": year_string},
            {"$set": {"player_info": player_info}}
        )
Exemple #5
0
def write_rapm_json(year, season_type):
    """Dump the season's RAPM table to ``<year string>-<season_type>-rapm.json``.

    Players of the season are read from ``db.players`` sorted by
    ``rapm_<season_type>`` in descending order. Each output row is
    ``[rank, player, team_id, possessions, orapm, drapm, rapm]`` with the three
    ratings rounded to four decimals. Players whose ratings are all missing or
    sum to zero are skipped, as are players lacking any of the row fields;
    ranks are consecutive over the rows that are actually written.

    Args:
        year: Season; converted with ``int``.
        season_type: Used to build the field names and the output file name.
    """
    year_as_int = int(year)
    year_string = common_utils.construct_year_string(year_as_int)
    rating_keys = (
        "orapm_{}".format(season_type),
        "drapm_{}".format(season_type),
        "rapm_{}".format(season_type),
    )
    # Insertion order of this dict defines the column order of each row.
    projection = {
        "player": 1,
        "team_id": 1,
        "{}_possessions".format(season_type): 1,
    }
    for key in rating_keys:
        projection[key] = 1

    players = db.players.find(
        filter={
            "player_index.season": year_as_int
        },
        projection=projection,
        sort=[('rapm_{}'.format(season_type), -1)]
    )

    rows = []
    for player in players:
        rapm_sum = sum(player[key] for key in rating_keys if key in player)
        if rapm_sum == 0:
            continue

        # Rank only advances for rows that are written, so the output has no
        # gaps where incomplete players were dropped.
        row = [len(rows) + 1]
        try:
            for key in projection:
                value = player[key]
                row.append(round(value, 4) if key in rating_keys else value)
        except (KeyError, TypeError):
            # about a dozen players total that need to be investigated as
            # to why they're edge cases, one thing seems to be Washington Bullets
            continue
        rows.append(row)

    rapm_json = {
        "data": rows
    }
    with open("{}-{}-rapm.json".format(year_string, season_type), "w") as jsonfile:
        json.dump(rapm_json, jsonfile)
Exemple #6
0
def calculate_rapm(year, X, Y, metric="rapm", season_type="regular_season"):
    """Fit a linear model on the possession matrix and store player ratings.

    ``X`` is expected to be the horizontal stack of an offense and a defense
    block with one column per player in each block (assumed to come from
    ``build_matrix`` — confirm against the caller), so the first half of the
    fitted coefficients are offensive ratings and the second half defensive
    ones. A player's column is the ``"index"`` stored in the season's
    ``player_info``; defensive coefficients are negated so that higher is
    better. Both ratings and their sum are written to the player's
    ``db.players`` document as ``o<metric>_<season_type>``,
    ``d<metric>_<season_type>`` and ``<metric>_<season_type>``.

    Finally the top 50 players by the metric with more than 1000
    ``<season_type>_possessions`` are printed.

    Args:
        year: Season; used via ``construct_year_string`` and ``int``.
        X: Feature matrix passed to the estimator's ``fit``.
        Y: Target vector passed to the estimator's ``fit``.
        metric: One of ``"apm"``, ``"rapm"``, ``"rapm_enet_cv"``,
            ``"rapm_bayes_ridge"`` or ``"rapm_cv"``; selects the estimator
            and the scaling multiplier.
        season_type: Suffix of the stored field names.

    Returns:
        The fitted estimator.

    Raises:
        RuntimeError: If ``metric`` is not one of the supported names.
    """
    year_as_int = int(year)
    year_string = common_utils.construct_year_string(year)
    print('fitting model...')
    timestamp = time.perf_counter()
    multiplier = 1
    if metric == "apm":
        clf = linear_model.LinearRegression()
        multiplier = 10
    elif metric == "rapm":
        clf = linear_model.Ridge(alpha=2900)
        multiplier = 100
    elif metric == "rapm_enet_cv":
        clf = linear_model.ElasticNetCV(l1_ratio=.9, cv=3)
    elif metric == "rapm_bayes_ridge":
        clf = linear_model.BayesianRidge()
    elif metric == "rapm_cv":
        clf = linear_model.RidgeCV(alphas=list(range(2600, 3000, 50)), cv=5)
        multiplier = 100
    else:
        raise RuntimeError("{} not recognized".format(metric))

    clf.fit(X, Y)

    if metric == "rapm_enet_cv" or metric == "rapm_cv":
        print("alpha chosen: {}".format(clf.alpha_))
    if metric == "rapm_enet_cv":
        print("l1_ratio chosen: {}".format(clf.l1_ratio_))

    print("time took {} seconds".format(time.perf_counter() - timestamp))
    all_players_dict = db.seasons.find_one({"year_string": year_string})["player_info"]

    print("coefficients", clf.coef_.shape)
    print("num of players", len(all_players_dict))

    coefficients = clf.coef_ * multiplier
    # Offense columns come first, defense columns second.
    num_columns = len(coefficients) // 2
    rapm = []
    for player_team, info in all_players_dict.items():
        # Use the stored column index rather than dict iteration order, which
        # is not guaranteed to match the order the matrix was built in.
        column = info["index"]
        if column >= num_columns:
            print("no coefficient for {} (index {}), skipping".format(player_team, column))
            continue
        rapm.append((player_team, coefficients[column], -coefficients[num_columns + column]))

    print(all_players_dict)
    print(len(rapm))

    offense_field = "o" + metric + "_" + season_type
    defense_field = "d" + metric + "_" + season_type
    metric_field = metric + "_" + season_type

    for player, opm, dpm in rapm:
        player_name, team_code = player.split("_")
        stub_name = common_utils.player_to_stub_name(player_name)

        print("updating {}_{}_{}".format(stub_name, team_code, year_as_int))
        db.players.update_one(
            {
                "player_index.name_stub": stub_name,
                "player_index.team": team_code,
                "player_index.season": year_as_int
            },
            {
                "$set": {
                    offense_field: opm,
                    defense_field: dpm,
                    metric_field: opm + dpm
                }
            }
        )

    print("Sorted & filtered {} for {}".format(metric, year_string))
    poss_threshold = 1000
    sorted_opm = db.players.find(
        filter={
            "player_index.season": year_as_int,
            # Possession counts are stored per season type.
            "{}_possessions".format(season_type): {"$gt": poss_threshold}
        },
        projection={"player": 1, metric_field: 1},
        sort=[(metric_field, -1)]
    )
    rankings_to_print = 50
    for player in sorted_opm:
        if "player" in player and metric_field in player:
            print("{:20} {:>10}".format(player["player"], player[metric_field]))
            rankings_to_print -= 1
            if rankings_to_print == 0:
                break
    return clf
Exemple #7
0
def _mark_lineup(matrix, row_num, lineup, player_keys, value):
    """Write ``value`` into ``matrix[row_num, col]`` for each player of ``lineup``."""
    for player in lineup:
        # Entries containing "None" are placeholders (presumably unfilled
        # lineup slots) and have no column.
        if "None" not in player:
            matrix[row_num, player_keys[player]] = value


def build_matrix(year, season_type="regular_season", prior="indicator"):
    """Build and pickle the sparse design matrix and target vector of a season.

    One row is created per stored possession (capped by the module-level
    ``limit``). ``X`` is the horizontal stack of an offense block and a
    defense block, each with one column per player in the season's
    ``player_info`` (column = the player's stored ``"index"``); a cell is set
    for every player on the floor on that side of the ball. ``Y`` holds each
    possession's ``scoring_margin_update``.

    Both are pickled under ``./matrices/`` and returned.

    Args:
        year: Season identifier passed to ``construct_year_string``.
        season_type: Possessions sub-collection and games-data key to use.
        prior: Cell weighting scheme. Only ``"indicator"`` (cells set to 1)
            is implemented.

    Returns:
        Tuple ``(X, Y)``: a CSR sparse matrix and a numpy array.

    Raises:
        ValueError: If ``prior`` is not ``"indicator"``. Previously an
            unknown prior silently produced an all-zero matrix.
    """
    if prior != "indicator":
        raise ValueError("unsupported prior: {!r}".format(prior))
    prior_multiplier = 1
    year_string = common_utils.construct_year_string(year)

    print('getting games data for {}'.format(year_string))
    timestamp = time.perf_counter()
    games_data = db.seasons.find_one({"year_string": year_string})["{}_games_data".format(season_type)]
    print("time took {} seconds".format(time.perf_counter() - timestamp))

    print('getting all players list')
    timestamp = time.perf_counter()
    all_players_dict = db.seasons.find_one({"year_string": year_string})["player_info"]
    print("time took {} seconds".format(time.perf_counter() - timestamp))

    print("sample of players from {}".format(year_string))
    for count, player in enumerate(all_players_dict, start=1):
        print(player)
        if count > 10:
            break
    print('got all players from {}'.format(year_string))

    y_scores = []
    timestamp = time.perf_counter()
    print('building matrix')
    possessions_collection = db.possessions[season_type][year_string]
    possessions = possessions_collection.find().limit(limit)
    # Total stored possessions; like the removed Cursor.count(), this does
    # not take the cursor limit into account (see the trim below).
    num_possessions = possessions_collection.count_documents({})
    prev_poss_timestamp = time.perf_counter()

    num_players = len(all_players_dict)
    print("num players: {}".format(num_players))
    player_keys = {
        player_team: info["index"]
        for player_team, info in all_players_dict.items()
    }
    offense_matrix = scipy.sparse.dok_matrix((num_possessions, num_players), dtype=float)
    defense_matrix = scipy.sparse.dok_matrix((num_possessions, num_players), dtype=float)

    for row_num, possession in enumerate(possessions):
        if row_num % 20000 == 0:
            print("{} / {} possessions added".format(row_num, num_possessions))
            print("\t{} seconds elapsed".format(time.perf_counter() - prev_poss_timestamp))
            prev_poss_timestamp = time.perf_counter()

        lineups_with_team_names = convert_lineups_to_player_team_strings(possession, games_data)

        try:
            if possession['home_team_is_on_offense']:
                home_matrix, away_matrix = offense_matrix, defense_matrix
            else:
                home_matrix, away_matrix = defense_matrix, offense_matrix
            _mark_lineup(home_matrix, row_num, lineups_with_team_names['home_lineup'],
                         player_keys, 1 * prior_multiplier)
            _mark_lineup(away_matrix, row_num, lineups_with_team_names['away_lineup'],
                         player_keys, 1 * prior_multiplier)
        except (KeyError, IndexError) as e:
            # Best effort: a player without a column (or a column outside the
            # matrix) leaves the rest of this row unfilled.
            print(e)

        y_scores.append(possession['scoring_margin_update'])

    print('matrix built')
    print("time took {} seconds".format(time.perf_counter() - timestamp))

    print('combining...')
    timestamp = time.perf_counter()

    X = scipy.sparse.hstack((offense_matrix, defense_matrix)).tocsr()
    Y = numpy.array(y_scores)
    # When the cursor limit cut iteration short, drop the unfilled rows so
    # that X and Y describe the same possessions.
    if len(y_scores) < X.shape[0]:
        X = X[:len(y_scores)]

    print("time took {} seconds".format(time.perf_counter() - timestamp))

    print('pickling...')
    timestamp = time.perf_counter()
    with open("./matrices/{}-X-{}.indicator.pickle".format(year_string, season_type), "wb") as picklefile:
        pickle.dump(X, picklefile)
    with open("./matrices/{}-Y-{}.pickle".format(year_string, season_type), "wb") as picklefile:
        pickle.dump(Y, picklefile)
    print("time took {} seconds".format(time.perf_counter() - timestamp))
    return X, Y
Exemple #8
0
import pickle

from pprint import pprint

import calculate_rapm
import evaluate_metrics
import parse_pbp
import common_utils

def main():
    """Print the R-squared evaluation for the 2017 and 2018 seasons."""
    # NOTE: the earlier pipeline stages are currently disabled in this
    # script. In order, they were: parse_pbp.save_lineup_data_for_season,
    # calculate_rapm.store_games_data,
    # calculate_rapm.store_player_and_possession_data_for_matrix,
    # calculate_rapm.build_matrix, loading the pickled X/Y matrices from
    # "matrices/", calculate_rapm.calculate_rapm for the "apm", "rapm",
    # "rapm_enet_cv" and "rapm_bayes_ridge" metrics, and
    # evaluate_metrics.calculate_weighted_average.
    for season in (2017, 2018):
        # Only the disabled stages consume the year string.
        year_string = common_utils.construct_year_string(season)
        r_squared = evaluate_metrics.calculate_r_squared_values(season)
        pprint(r_squared)


if __name__ == "__main__":
    main()
Exemple #9
0
def _resolve_prior_season_stub(stub_name, season):
    """Return ``(stub_name, found)`` for a player's name stub in ``season``.

    If the stub is not found as given, its last two characters are dropped
    when ``player_minus_jr_exists`` reports a match, the result is translated
    through ``common_utils.nba_com_stubs_to_bball_ref_stubs`` when listed
    there, and existence is checked again.
    """
    if player_exists(stub_name, season):
        return stub_name, True
    if player_minus_jr_exists(stub_name, season):
        stub_name = stub_name[:-2]
        print("substracted suffix, found {}".format(stub_name))
    if stub_name in common_utils.nba_com_stubs_to_bball_ref_stubs:
        stub_name = common_utils.nba_com_stubs_to_bball_ref_stubs[stub_name]
    return stub_name, player_exists(stub_name, season)


def _prior_season_metric_value(player, metric, season, team_code, default):
    """Return the player's possession-weighted ``metric`` for ``season``.

    Falls back to ``default`` unless more than 750 possessions with a usable
    metric value are found.
    """
    original_stub = common_utils.player_to_stub_name(player["player"])
    stub_name, found = _resolve_prior_season_stub(original_stub, season)
    if found:
        query = {
            "player_index.season": season,
            "player_index.name_stub": stub_name,
            "player_index.team": team_code
        }
    else:
        # if player still doesn't exist, then idk
        print("{}_{} not found in scraped bball_ref players page".format(
            stub_name, season))
        # Fall back to the unrenamed stub without a team restriction.
        query = {
            "player_index.season": season,
            "player_index.name_stub": original_stub
        }

    weighted_sum = 0
    possession_count = 0
    # calculate metric value (done in this way because to combine entries for players who were traded)
    for prev_player_season in db.players.find(query):
        if "possessions" not in prev_player_season:
            continue
        row_possessions = prev_player_season["possessions"]
        try:
            contribution = float(prev_player_season[metric]) * row_possessions
        except (KeyError, TypeError, ValueError):
            print("error trying to convert to float ", metric,
                  prev_player_season)
            continue
        # Only rows with a usable metric value count towards the weight, so
        # numerator and denominator always cover the same rows.
        weighted_sum += contribution
        possession_count += row_possessions

    if possession_count > 750:
        return weighted_sum / possession_count
    return default


def calculate_weighted_average(year):
    """Store roster-weighted prior-season metric averages on each team.

    For every team of ``year`` and for each of the four seasons
    ``year, year - 1, year - 2, year - 3``, every metric in the table below is
    averaged over the team's current players, weighting each player's value
    from the evaluated season by the player's current ``possessions``.
    Players without enough data in the evaluated season contribute the
    metric's default value. Results are written to ``db.teams`` under
    ``"Y-<year_step>_<metric>"``. Seasons before 1997 are skipped.

    Players lacking a ``possessions`` or ``player`` field are collected in
    ``players_without_possessions`` (not defined in this function; assumed to
    be a module-level set — TODO confirm) and printed at the end.

    Args:
        year: Season to evaluate; used as given in the ``db.teams`` queries
            and converted with ``int`` elsewhere.
    """
    year_as_int = int(year)
    metrics_averages = {
        "per": 15.0,
        "bpm": 0.0,
        "ws_per_48": 0.100,
        "rapm": 0.0,
        "apm": 0.0
    }

    if year_as_int >= 2014:
        metrics_averages["rpm"] = 0.0

    teams = db.teams.find({"season": year})
    for team in teams:
        team_code = team["team_code"]
        # get the players to loop through
        players = list(db.players.find({
            "player_index.season": year_as_int,
            "player_index.team": team_code
        }))
        if not players and team_code in common_utils.team_renames:
            players = list(db.players.find({
                "player_index.season": year_as_int,
                "player_index.team": common_utils.team_renames[team_code]
            }))

        if not players:
            print("could not find {}_{}".format(year, team_code))
            continue

        for year_step in range(4):
            timestamp = time.perf_counter()
            year_to_evaluate = year_as_int - year_step

            for metric in metrics_averages:
                print("{} for {} {}\n".format(metric, year_to_evaluate,
                                              team_code))
                if year_to_evaluate < 1997:
                    print("no data available for prior to 1997")
                    continue

                total_possessions = 0
                weighted_average = 0
                for player in players:
                    # original measure was 250 minutes
                    if "possessions" in player and "player" in player:
                        metric_value = _prior_season_metric_value(
                            player, metric, year_to_evaluate, team_code,
                            metrics_averages[metric])
                        player_possessions = player["possessions"]
                        try:
                            weighted_average += player_possessions * metric_value
                            total_possessions += player_possessions
                        except TypeError:
                            print(player)
                    else:
                        player_name = player.get("player") or "unknown_player_name"
                        # Separate name: the team being processed must not be
                        # overwritten by an individual player's team id.
                        player_team = player.get("team_id") or "unknown_team"
                        player_key = "{}_{}_{}".format(player_name, year,
                                                       player_team)
                        players_without_possessions.add(player_key)
                    if total_possessions == 0:
                        print(players)

                if total_possessions > 0:
                    weighted_average = weighted_average / total_possessions
                    print("{}_{} has a weighted average of {} {}\n".format(
                        team_code, year_to_evaluate, weighted_average, metric))

                    db.teams.update_one(
                        {
                            "team_index.season": year,
                            "team_index.team_id": team_code
                        }, {
                            "$set": {
                                "Y-{}_{}".format(year_step, metric):
                                weighted_average
                            }
                        })
            print("time took {} seconds for one year step".format(
                time.perf_counter() - timestamp))
    pprint(players_without_possessions)