def save_lineup_data_for_season(year, season_type="regular_season"):
    """Derive possessions for every stored game of a season and upsert them.

    A unique compound index on (event_num, gid) is ensured on the season's
    possessions collection, then every document returned by
    ``db.games[season_type][year_string].find()`` is converted into
    possession records via ``get_lineups_and_possession_info``. Each record
    is written keyed by (gid, event_num), so re-running the function
    replaces existing records instead of duplicating them.

    Args:
        year: Season identifier accepted by
            ``common_utils.construct_year_string``.
        season_type: Sub-collection name used for games and possessions
            (defaults to ``"regular_season"``).
    """
    year_string = common_utils.construct_year_string(year)
    possessions_collection = db.possessions[season_type][year_string]

    print('creating index')
    possessions_collection.create_index(
        [('possession_metadata.event_num', pymongo.ASCENDING),
         ('possession_metadata.gid', pymongo.ASCENDING)],
        unique=True)
    print('index created (or already existed)')

    games = db.games[season_type][year_string].find()
    num_games = games.count()
    prev_time = time.perf_counter()
    for index, game in enumerate(games):
        game_id = game["game_index"]
        print("processing {}...".format(game_id))
        # This call used to pass a hard-coded "playoffs", ignoring the
        # season_type argument.
        # NOTE(review): assumes get_team_info's third parameter is the
        # season type -- confirm against its definition.
        team_info = get_team_info(year_string, game_id, season_type)
        data = get_lineups_and_possession_info(
            game_id, game["pbp"], team_info, year)
        for possession in data:
            metadata = possession['possession_metadata']
            # replace_one(..., upsert=True) is the non-deprecated equivalent
            # of the legacy Collection.update(spec, document, True).
            possessions_collection.replace_one(
                {
                    'possession_metadata.gid': metadata['gid'],
                    'possession_metadata.event_num': metadata['event_num'],
                },
                possession,
                upsert=True)
        # Report timing and progress once every ten games.
        if index % 10 == 0:
            curr_time = time.perf_counter()
            print("time elapsed: {}".format(curr_time - prev_time))
            prev_time = curr_time
            print("Processed {} / {} games for {} season".format(
                index + 1, num_games, year_string))
def deal_with_traded_players(year, season_type):
    """Fill in the combined ("TOT") row of players with several team rows.

    For every ``db.players`` document of the season whose team is ``"TOT"``,
    the player's per-team documents are fetched and their offensive and
    defensive RAPM values are averaged, weighted by possessions. The result
    is written back to the ``"TOT"`` document under the
    ``<season_type>``-suffixed keys. Players listed in ``weird_players`` are
    skipped.

    Args:
        year: Season year; converted with ``int``.
        season_type: Suffix/prefix used to build the possession and RAPM
            field names (e.g. ``"regular_season"``).
    """
    year_as_int = int(year)

    # Field names depend only on season_type, so build them once.
    possessions_key = "{}_possessions".format(season_type)
    orapm_key = "orapm_{}".format(season_type)
    drapm_key = "drapm_{}".format(season_type)
    rapm_key = "rapm_{}".format(season_type)

    traded_players = db.players.find(
        filter={
            "player_index.team": "TOT",
            "player_index.season": year_as_int
        }
    )
    for player in traded_players:
        print(player["player"])
        if player["player"] in weird_players:
            continue

        name_stub = player["player_index"]["name_stub"]
        each_team = db.players.find(
            filter={
                "player_index.name_stub": name_stub,
                "player_index.season": year_as_int,
                "player_index.team": {"$ne": "TOT"}
            }
        )

        player_update_data = {
            "total_possessions": 0,
            "orapm_weighted_sum": 0,
            "drapm_weighted_sum": 0
        }
        for player_data in each_team:
            stint_possessions = player_data.get(possessions_key)
            stint_orapm = player_data.get(orapm_key)
            stint_drapm = player_data.get(drapm_key)
            if None in (stint_possessions, stint_orapm, stint_drapm):
                # A stint without possessions or RAPM values cannot
                # contribute to the weighted average; skip it instead of
                # raising KeyError.
                print("\t{} {}: missing {} data, stint skipped".format(
                    player_data.get("player"), player_data.get("team_id"),
                    season_type))
                continue
            print("\t", player_data.get("player"), player_data.get("team_id"),
                  stint_possessions, stint_orapm)
            player_update_data["total_possessions"] += stint_possessions
            # technically possession weights here should be divided by 2
            # but after the weighted average it won't matter
            player_update_data["orapm_weighted_sum"] += stint_possessions * stint_orapm
            player_update_data["drapm_weighted_sum"] += stint_possessions * stint_drapm

        total_possessions = player_update_data["total_possessions"]
        if total_possessions == 0:
            # No usable stint: the weighted average is undefined, so leave
            # the TOT row untouched rather than dividing by zero.
            print("\tno possessions found for {}, TOT row left unchanged".format(
                player["player"]))
            continue

        player_update_data["orapm_combined"] = (
            player_update_data["orapm_weighted_sum"] / total_possessions)
        player_update_data["drapm_combined"] = (
            player_update_data["drapm_weighted_sum"] / total_possessions)
        player_update_data["rapm_combined"] = (
            player_update_data["orapm_combined"]
            + player_update_data["drapm_combined"])
        pprint(player_update_data)

        db.players.update_one(
            filter={
                "player_index.name_stub": name_stub,
                "player_index.team": "TOT",
                "player_index.season": year_as_int
            },
            update={
                "$set": {
                    possessions_key: total_possessions,
                    orapm_key: player_update_data["orapm_combined"],
                    drapm_key: player_update_data["drapm_combined"],
                    rapm_key: player_update_data["rapm_combined"]
                }
            }
        )
def store_games_data(year, season_type):
    """Store a game-id -> {home, away} team-code map on the season document.

    Team codes read from the season's games collection are translated through
    ``common_utils.team_codes_to_bball_ref_codes`` when they appear in it and
    kept unchanged otherwise. The map is upserted into ``db.seasons`` under
    ``"<season_type>_games_data"``, after ensuring a unique index on
    ``year_string``.

    Args:
        year: Season identifier accepted by
            ``common_utils.construct_year_string``.
        season_type: Games sub-collection name and prefix of the stored field.
    """
    season_label = common_utils.construct_year_string(year)
    print("storing games data...")

    matchups = {}
    for record in db.games[season_type][season_label].find():
        home, away = record["home"], record["away"]
        code_map = common_utils.team_codes_to_bball_ref_codes
        matchups[record["game_index"]] = {
            "home": code_map[home] if home in code_map else home,
            "away": code_map[away] if away in code_map else away
        }
    print("stored games data")

    # One season document per year_string.
    index_spec = [('year_string', pymongo.ASCENDING)]
    db.seasons.create_index(index_spec, unique=True)

    field_name = "{}_games_data".format(season_type)
    db.seasons.update_one(
        filter={"year_string": season_label},
        update={"$set": {field_name: matchups}},
        upsert=True
    )
def store_player_and_possession_data_for_matrix(year, season_type):
    """Assign matrix indices to players and count their possessions.

    Walks the season's possessions (capped by the module-level ``limit``),
    resolves every ``"<name>_<team>"`` lineup entry against
    ``common_utils.player_exists`` and, for resolved players, records an
    index, a possession count and the stub name. Afterwards:

    * each resolved player's count is written to ``db.players`` under
      ``"<season_type>_possessions"``;
    * for ``"regular_season"`` the whole mapping is stored as
      ``player_info`` on the season document;
    * for ``"playoffs"`` players that cannot be resolved are appended to the
      already stored ``player_info`` with the next free index.

    Args:
        year: Season year; converted with ``int`` and passed to
            ``common_utils.construct_year_string``.
        season_type: ``"regular_season"`` or ``"playoffs"``.
    """
    year_string = common_utils.construct_year_string(year)
    year_as_int = int(year)

    player_info = {}
    # Lineup entries that could not be resolved. Remembering them prevents
    # re-querying and re-printing on every possession and, for playoffs,
    # re-registering the same player with a new index each time.
    unresolved_players = set()
    player_index = 0

    possessions = db.possessions[season_type][year_string].find().limit(limit)
    num_possessions = possessions.count()
    games_data = db.seasons.find_one(
        {"year_string": year_string})["{}_games_data".format(season_type)]

    # player info should already be calculated, this is to get index
    if season_type == "playoffs":
        num_players = len(db.seasons.find_one(
            {"year_string": year_string})["player_info"])

    for count, possession in enumerate(possessions, start=1):
        if count % 20000 == 0:
            print("Poss count {}/{}".format(count, num_possessions))
        lineups_with_team_names = convert_lineups_to_player_team_strings(
            possession, games_data)

        for lineup_type in ("home_lineup", "away_lineup"):
            for player_team in lineups_with_team_names[lineup_type]:
                player_name, team_code = player_team.split("_")
                if player_name == "None":
                    continue

                # player was already seen in a lineup
                if player_team in player_info:
                    player_info[player_team]["possessions"] += 1
                    continue
                if player_team in unresolved_players:
                    continue

                stub_name = common_utils.player_to_stub_name(player_name)
                if team_code == "WAS" and year_as_int < 1998:
                    team_code = "WSB"

                # found the player in bball ref database
                if common_utils.player_exists(stub_name, team_code, year_as_int):
                    player_info[player_team] = {
                        "index": player_index,
                        # The possession in which the player is first seen
                        # counts too (this used to start at 0).
                        "possessions": 1,
                        "stub_name": stub_name
                    }
                    player_index += 1
                    continue

                unresolved_players.add(player_team)
                print("{}_{}_{} not found.".format(stub_name, team_code, year_as_int))
                if season_type != "playoffs":
                    continue

                print("\n{}_{}_{} played in playoffs and not in regular season.".format(
                    stub_name, team_code, year_as_int))
                print("[{}][{}]: {}".format(
                    possession["possession_metadata"]["gid"],
                    possession["possession_metadata"]["event_num"],
                    possession["possession_metadata"]["message"]
                ))
                print("\tHome", lineups_with_team_names["home_lineup"])
                print("\tAway", lineups_with_team_names["away_lineup"])
                db.seasons.find_one_and_update(
                    {"year_string": year_string},
                    {
                        "$set": {
                            "player_info.{}".format(player_team): {
                                "index": num_players,
                                "possessions": 0,
                                "stub_name": stub_name
                            }
                        }
                    }
                )
                num_players += 1

    possessions_key = "{}_possessions".format(season_type)
    for player_team, info in player_info.items():
        player_name, team_code = player_team.split("_")
        # Apply the same rename used for the lookup above; without it the
        # upsert targets a "WAS" document the lookup never matched and
        # inserts a stray player document.
        if team_code == "WAS" and year_as_int < 1998:
            team_code = "WSB"
        db.players.update_one(
            filter={
                "player_index.name_stub": info["stub_name"],
                "player_index.team": team_code,
                "player_index.season": year_as_int
            },
            update={
                "$set": {possessions_key: info["possessions"]}
            },
            upsert=True
        )

    if season_type == "regular_season":
        db.seasons.find_one_and_update(
            {"year_string": year_string},
            {"$set": {"player_info": player_info}}
        )
def write_rapm_json(year, season_type):
    """Dump the season's RAPM table to ``<year_string>-<season_type>-rapm.json``.

    Players of the season are read from ``db.players`` sorted by descending
    ``rapm_<season_type>``. Each exported row is
    ``[rank, player, team_id, possessions, orapm, drapm, rapm]`` with the
    three RAPM values rounded to 4 decimals. Players whose RAPM fields sum
    to 0 (or are all absent) and players missing any projected field are
    left out; ranks are consecutive over the rows actually written.

    Args:
        year: Season year; converted with ``int``.
        season_type: Suffix/prefix of the possession and RAPM field names.
    """
    year_as_int = int(year)
    year_string = common_utils.construct_year_string(year_as_int)

    rapm_keys = (
        "orapm_{}".format(season_type),
        "drapm_{}".format(season_type),
        "rapm_{}".format(season_type),
    )
    # Insertion order of this dict defines the column order of each row.
    projection = {
        "player": 1,
        "team_id": 1,
        "{}_possessions".format(season_type): 1,
        rapm_keys[0]: 1,
        rapm_keys[1]: 1,
        rapm_keys[2]: 1
    }
    players = db.players.find(
        filter={"player_index.season": year_as_int},
        projection=projection,
        sort=[('rapm_{}'.format(season_type), -1)]
    )

    rapm_json = {"data": []}
    rank = 1
    for player in players:
        rapm_sum = sum(player[key] for key in rapm_keys if key in player)
        if rapm_sum == 0:
            continue

        try:
            row = [rank] + [
                round(player[key], 4) if "rapm" in key else player[key]
                for key in projection
            ]
        except (KeyError, TypeError):
            # about a dozen players total that need to be investigated as
            # to why they're edge cases, one thing seems to be Washington
            # Bullets. Incomplete rows are dropped.
            continue

        rapm_json["data"].append(row)
        # Only rows that were written consume a rank, so ranks stay
        # consecutive (the counter used to advance for dropped rows too).
        rank += 1

    with open("{}-{}-rapm.json".format(year_string, season_type), "w") as jsonfile:
        json.dump(rapm_json, jsonfile)
def calculate_rapm(year, X, Y, metric="rapm", season_type="regular_season"):
    """Fit a linear model on the possession matrix and store player ratings.

    The model chosen by ``metric`` is fitted on ``X``/``Y``. ``X`` is
    expected to be laid out as an offense half followed by a defense half of
    equal width, with a player's column given by the ``index`` stored in the
    season's ``player_info``. For every player the offensive coefficient and
    the negated defensive coefficient (both scaled by a per-metric
    multiplier) and their sum are written to ``db.players`` under
    ``o<metric>_<season_type>``, ``d<metric>_<season_type>`` and
    ``<metric>_<season_type>``. Finally up to 50 top-rated players are
    printed.

    Args:
        year: Season year; converted with ``int``.
        X: Design matrix passed to ``clf.fit``.
        Y: Target vector passed to ``clf.fit``.
        metric: One of ``"apm"``, ``"rapm"``, ``"rapm_enet_cv"``,
            ``"rapm_bayes_ridge"``, ``"rapm_cv"``.
        season_type: Suffix of the stored field names.

    Returns:
        The fitted estimator.

    Raises:
        RuntimeError: If ``metric`` is not recognized.
    """
    year_as_int = int(year)
    year_string = common_utils.construct_year_string(year)

    print('fitting model...')
    timestamp = time.perf_counter()
    multiplier = 1
    if metric == "apm":
        clf = linear_model.LinearRegression()
        multiplier = 10
    elif metric == "rapm":
        clf = linear_model.Ridge(alpha=2900)
        multiplier = 100
    elif metric == "rapm_enet_cv":
        clf = linear_model.ElasticNetCV(l1_ratio=.9, cv=3)
    elif metric == "rapm_bayes_ridge":
        clf = linear_model.BayesianRidge()
    elif metric == "rapm_cv":
        clf = linear_model.RidgeCV(alphas=list(range(2600, 3000, 50)), cv=5)
        multiplier = 100
    else:
        raise RuntimeError("{} not recognized".format(metric))
    clf.fit(X, Y)
    if metric == "rapm_enet_cv" or metric == "rapm_cv":
        print("alpha chosen: {}".format(clf.alpha_))
    if metric == "rapm_enet_cv":
        print("l1_ratio chosen: {}".format(clf.l1_ratio_))
    print("time took {} seconds".format(time.perf_counter() - timestamp))

    all_players_dict = db.seasons.find_one({"year_string": year_string})["player_info"]
    print("coefficients", clf.coef_.shape)
    print("num of players", len(all_players_dict))
    print(all_players_dict)

    # The first half of the coefficients is offense, the second half defense.
    # The half-width is taken from the fitted model so the defensive offset
    # stays right even if player_info has grown since the matrix was built.
    players_in_model = clf.coef_.shape[-1] // 2

    offense_key = "o" + metric + "_" + season_type
    defense_key = "d" + metric + "_" + season_type
    total_key = metric + "_" + season_type

    for player, info in all_players_dict.items():
        # Use the stored column index (the one the matrix builder uses)
        # rather than relying on the dict's iteration order.
        column = info["index"]
        if column >= players_in_model:
            print("{} has no column in the fitted model, skipped".format(player))
            continue
        opm = clf.coef_[column] * multiplier
        dpm = clf.coef_[players_in_model + column] * -1 * multiplier

        player_name, team_code = player.split("_")
        stub_name = common_utils.player_to_stub_name(player_name)
        # Same rename the player-resolution step applies before looking a
        # player up, so the update hits the existing player document.
        if team_code == "WAS" and year_as_int < 1998:
            team_code = "WSB"
        print("updating {}_{}_{}".format(stub_name, team_code, year_as_int))
        db.players.update_one(
            {
                "player_index.name_stub": stub_name,
                "player_index.team": team_code,
                "player_index.season": year_as_int
            },
            {
                "$set": {
                    offense_key: opm,
                    defense_key: dpm,
                    total_key: opm + dpm
                }
            }
        )

    print("Sorted & filtered {} for {}".format(metric, year_string))
    poss_threshold = 1000
    # NOTE(review): this filters on a plain "possessions" field, not on
    # "<season_type>_possessions" -- confirm which field is intended.
    sorted_opm = db.players.find(
        filter={"player_index.season": year_as_int,
                "possessions": {"$gt": poss_threshold}},
        projection={"player": 1, total_key: 1},
        sort=[(total_key, -1)]
    )
    rankings_to_print = 50
    for player in sorted_opm:
        # Documents without a rating cannot be formatted; skip them instead
        # of raising KeyError after all the work above.
        if "player" in player and total_key in player:
            print("{:20} {:>10}".format(player["player"], player[total_key]))
            rankings_to_print += -1
        if rankings_to_print == 0:
            break
    return clf
def _mark_lineup(matrix, row_num, lineup, player_keys, value):
    """Write ``value`` into ``matrix[row_num, col]`` for each player of ``lineup``."""
    for player in lineup:
        # Entries containing "None" are placeholders, not players.
        if "None" not in player:
            matrix[row_num, player_keys[player]] = value


def build_matrix(year, season_type="regular_season", prior="indicator"):
    """Build and pickle the sparse lineup matrix X and target vector Y.

    One row is created per possession. X is the horizontal stack of an
    offense block and a defense block, each ``num_players`` wide, where a
    player's column is the ``index`` stored in the season's ``player_info``.
    Y holds each possession's ``scoring_margin_update``. Both are pickled
    under ``./matrices/`` and returned.

    Args:
        year: Season identifier accepted by
            ``common_utils.construct_year_string``.
        season_type: Possessions sub-collection and games-data prefix.
        prior: Only ``"indicator"`` is supported (cells are set to 1).

    Returns:
        Tuple ``(X, Y)``: a CSR sparse matrix and a numpy array.

    Raises:
        ValueError: If ``prior`` is not ``"indicator"``. Previously this left
            the cell value undefined and silently produced an all-zero matrix.
    """
    if prior != "indicator":
        raise ValueError("unsupported prior: {}".format(prior))
    prior_multiplier = 1

    year_string = common_utils.construct_year_string(year)

    print('getting games data for {}'.format(year_string))
    timestamp = time.perf_counter()
    games_data = db.seasons.find_one(
        {"year_string": year_string})["{}_games_data".format(season_type)]
    print("time took {} seconds".format(time.perf_counter() - timestamp))

    print('getting all players list')
    timestamp = time.perf_counter()
    all_players_dict = db.seasons.find_one({"year_string": year_string})["player_info"]
    print("time took {} seconds".format(time.perf_counter() - timestamp))

    print("sample of players from {}".format(year_string))
    for count, player in enumerate(all_players_dict, start=1):
        print(player)
        if count > 10:
            break
    print('got all players from {}'.format(year_string))

    y_scores = []
    timestamp = time.perf_counter()
    print('building matrix')
    possessions = db.possessions[season_type][year_string].find().limit(limit)
    # count() ignores limit() unless asked not to; the row count of X must
    # match the number of possessions actually iterated (the length of Y).
    num_possessions = possessions.count(with_limit_and_skip=True)
    prev_poss_timestamp = time.perf_counter()

    num_players = len(all_players_dict)
    print("num players: {}".format(num_players))
    player_keys = {
        player_team_string: info["index"]
        for player_team_string, info in all_players_dict.items()
    }

    offense_matrix = scipy.sparse.dok_matrix(
        (num_possessions, num_players), numpy.dtype(float))
    defense_matrix = scipy.sparse.dok_matrix(
        (num_possessions, num_players), numpy.dtype(float))

    for row_num, possession in enumerate(possessions):
        if row_num % 20000 == 0:
            print("{} / {} possessions added".format(row_num, num_possessions))
            print("\t{} seconds elapsed".format(
                time.perf_counter() - prev_poss_timestamp))
            prev_poss_timestamp = time.perf_counter()

        lineups_with_team_names = convert_lineups_to_player_team_strings(
            possession, games_data)
        try:
            if possession['home_team_is_on_offense']:
                home_matrix, away_matrix = offense_matrix, defense_matrix
            else:
                home_matrix, away_matrix = defense_matrix, offense_matrix
            _mark_lineup(home_matrix, row_num,
                         lineups_with_team_names['home_lineup'],
                         player_keys, prior_multiplier)
            _mark_lineup(away_matrix, row_num,
                         lineups_with_team_names['away_lineup'],
                         player_keys, prior_multiplier)
        except (KeyError, IndexError) as e:
            # Best effort: an unknown player or column leaves the row
            # partially filled; report it and keep going.
            print("row {}: {!r}".format(row_num, e))
        y_scores.append(possession['scoring_margin_update'])

    print('matrix built')
    print("time took {} seconds".format(time.perf_counter() - timestamp))

    print('combining...')
    timestamp = time.perf_counter()
    X = scipy.sparse.hstack((offense_matrix, defense_matrix)).tocsr()
    Y = numpy.array(y_scores)
    print("time took {} seconds".format(time.perf_counter() - timestamp))

    print('pickling...')
    timestamp = time.perf_counter()
    with open("./matrices/{}-X-{}.indicator.pickle".format(year_string, season_type), "wb") as picklefile:
        pickle.dump(X, picklefile)
    with open("./matrices/{}-Y-{}.pickle".format(year_string, season_type), "wb") as picklefile:
        pickle.dump(Y, picklefile)
    print("time took {} seconds".format(time.perf_counter() - timestamp))
    return X, Y
import pickle
from pprint import pprint

import calculate_rapm
import evaluate_metrics
import parse_pbp
import common_utils

# Driver script: for each year in range(2017, 2019) (i.e. 2017 and 2018)
# print the result of evaluate_metrics.calculate_r_squared_values.
if __name__ == "__main__":
    for year in range(2017, 2019):
        # Only referenced by the commented-out pickle paths below.
        year_string = common_utils.construct_year_string(year)

        # The commented-out block below appears to be the earlier pipeline
        # stages (parse play-by-play, store games/players, build matrices,
        # fit the models), kept for reference.
        # NOTE(review): these calls may not match the current function
        # signatures or pickle file names -- verify before re-enabling.
        # #parse_pbp.save_lineup_data_for_season(str(year))
        # calculate_rapm.store_games_data(year)
        # calculate_rapm.store_player_and_possession_data_for_matrix(year)
        # X, Y = calculate_rapm.build_matrix(year)
        # X_loaded = ""
        # Y_loaded = ""
        # with open("matrices/{}-X-indicator.pickle".format(year_string), "rb") as picklefile:
        #     X_loaded = pickle.load(picklefile)
        # with open("matrices/{}-Y.pickle".format(year_string), "rb") as picklefile:
        #     Y_loaded = pickle.load(picklefile)
        # calculate_rapm.calculate_rapm(year, X_loaded, Y_loaded, "apm")
        # calculate_rapm.calculate_rapm(year, X_loaded, Y_loaded, "rapm")
        # calculate_rapm.calculate_rapm(year, X_loaded, Y_loaded, "rapm_enet_cv")
        # calculate_rapm.calculate_rapm(year, X_loaded.toarray(), Y_loaded, "rapm_bayes_ridge")
        # evaluate_metrics.calculate_weighted_average(year)
        pprint(evaluate_metrics.calculate_r_squared_values(year))
def calculate_weighted_average(year):
    """Store possession-weighted team averages of player metrics.

    For every team of ``year`` and for each of the four seasons
    ``year``, ``year - 1``, ``year - 2``, ``year - 3``, every metric in
    ``metrics_averages`` is averaged over the team's current players,
    weighted by each player's current ``possessions``. A player's metric
    value is taken from that evaluated season (possession-weighted over the
    player's rows there) when those rows total more than 750 possessions,
    and otherwise falls back to the metric's default in
    ``metrics_averages``. Results are written to ``db.teams`` as
    ``"Y-<year_step>_<metric>"``. Seasons before 1997 are skipped.

    Players lacking ``possessions`` or ``player`` are recorded in the
    module-level ``players_without_possessions`` collection, which is
    printed at the end.

    Args:
        year: Season year. Used as given in the ``db.teams`` queries and
            converted with ``int`` for arithmetic and player queries.
    """
    year_as_int = int(year)
    metrics_averages = {
        "per": 15.0,
        "bpm": 0.0,
        "ws_per_48": 0.100,
        "rapm": 0.0,
        "apm": 0.0
    }
    if year_as_int >= 2014:
        metrics_averages["rpm"] = 0.0

    teams = db.teams.find({"season": year})
    for team in teams:
        team_code = team["team_code"]

        # get the players to loop through
        players = list(db.players.find({
            "player_index.season": year_as_int,
            "player_index.team": team_code
        }))
        if not players and team_code in common_utils.team_renames:
            players = list(db.players.find({
                "player_index.season": year_as_int,
                "player_index.team": common_utils.team_renames[team_code]
            }))
        if not players:
            # This message used to reference year_to_evaluate, which is not
            # assigned yet at this point.
            print("could not find {}_{}".format(year_as_int, team_code))
            continue

        for year_step in range(4):
            timestamp = time.perf_counter()
            year_to_evaluate = year_as_int - year_step

            for metric in metrics_averages:
                total_possessions = 0
                weighted_average = 0
                print("{} for {} {}\n".format(metric, year_to_evaluate, team_code))
                if year_to_evaluate < 1997:
                    print("no data available for prior to 1997")
                    continue

                for player in players:
                    if not ("possessions" in player and "player" in player):
                        player_name = player["player"] if player.get(
                            "player") else "unknown_player_name"
                        # Separate name: assigning to team_code here used to
                        # corrupt the later lookups and db.teams update filter.
                        missing_team_code = player["team_id"] if player.get(
                            "team_id") else "unknown_team"
                        player_key = "{}_{}_{}".format(
                            player_name, year, missing_team_code)
                        players_without_possessions.add(player_key)
                        continue

                    # original measure was 250 minutes
                    stub_name = common_utils.player_to_stub_name(player["player"])
                    prev_season_player_lookup = db.players.find({
                        "player_index.season": year_to_evaluate,
                        "player_index.name_stub": stub_name
                    })
                    if not player_exists(stub_name, year_to_evaluate):
                        if player_minus_jr_exists(stub_name, year_to_evaluate):
                            stub_name = stub_name[:len(stub_name) - 2]
                            print("substracted suffix, found {}".format(stub_name))
                        if stub_name in common_utils.nba_com_stubs_to_bball_ref_stubs:
                            stub_name = common_utils.nba_com_stubs_to_bball_ref_stubs[
                                stub_name]
                        # if player still doesn't exist, then idk
                        if not player_exists(stub_name, year_to_evaluate):
                            print("{}_{} not found in scraped bball_ref players page"
                                  .format(stub_name, year_to_evaluate))
                        else:
                            prev_season_player_lookup = db.players.find({
                                "player_index.season": year_to_evaluate,
                                "player_index.name_stub": stub_name,
                                "player_index.team": team_code
                            })

                    metric_value = 0
                    prev_poss_count = 0
                    # calculate metric value (done in this way because to
                    # combine entries for players who were traded)
                    for prev_player_season in prev_season_player_lookup:
                        if "possessions" not in prev_player_season:
                            # Rows without possessions carry no weight.
                            # (Overwriting the running sum here, as before,
                            # discarded what had been accumulated.)
                            continue
                        try:
                            prev_possessions = prev_player_season["possessions"]
                            weighted_value = float(
                                prev_player_season[metric]) * prev_possessions
                        except (KeyError, TypeError, ValueError):
                            print("error trying to convert to float ",
                                  metric, prev_player_season)
                        else:
                            # Update both accumulators together so a row
                            # without a usable metric adds no possessions.
                            prev_poss_count += prev_possessions
                            metric_value += weighted_value

                    if prev_poss_count > 750:
                        metric_value = metric_value / prev_poss_count
                    else:
                        metric_value = metrics_averages[metric]

                    try:
                        player_weighted = player["possessions"] * metric_value
                        total_possessions += player["possessions"]
                        weighted_average += player_weighted
                    except TypeError:
                        print(player)

                if total_possessions == 0:
                    print(players)
                    continue
                weighted_average = weighted_average / total_possessions
                print("{}_{} has a weighted average of {} {}\n".format(
                    team_code, year_to_evaluate, weighted_average, metric))
                db.teams.update_one(
                    {
                        "team_index.season": year,
                        "team_index.team_id": team_code
                    },
                    {
                        "$set": {
                            "Y-{}_{}".format(year_step, metric): weighted_average
                        }
                    })

            print("time took {} seconds for one year step".format(
                time.perf_counter() - timestamp))
    pprint(players_without_possessions)