def add_rpm_to_player_table(year):
    """Copy RPM values from the season's RPM CSV onto player documents.

    Reads ``<year_string>/player_rpms_<year>_rpms.csv`` and, for every data
    row, sets ``orpm``/``drpm``/``rpm`` (columns 5, 6 and 7) on the
    ``db.players`` document matching the player's stub name, team and season.
    Column 0 is the player name and column 2 holds one or more team codes
    separated by ``/``; one update is issued per team code.

    Args:
        year: Season year; must be convertible with ``int``. Seasons before
            2014 are skipped with a message because no RPM file is expected.

    Returns:
        None.
    """
    if int(year) < 2014:
        print("No RPM data available prior to 2013-14 season")
        return

    year_string = construct_year_string(year)
    rpm_filename = "{}/player_rpms_{}_rpms.csv".format(year_string, year)

    # The csv module asks for newline="" so that newlines embedded in quoted
    # fields are handled by the reader instead of the text layer.
    with open(rpm_filename, "r", newline="") as csvfile:
        rows = csv.reader(csvfile)
        next(rows, None)  # skip header row
        for row in rows:
            # csv.reader yields [] for blank lines; rows without a name are
            # skipped just like before.
            if not row or not row[0]:
                continue

            source_name = row[0]
            rpm_update = {
                "orpm": float(row[5]),
                "drpm": float(row[6]),
                "rpm": float(row[7])
            }

            for code in row[2].split("/"):
                if code in common_utils.team_codes_to_bball_ref_codes:
                    code = common_utils.team_codes_to_bball_ref_codes[code]

                # Always resolve from the name as written in the CSV. The
                # previous code overwrote the name inside this loop, so the
                # second team of a traded player was resolved from an
                # already-converted name.
                player_name = common_utils.nba_com_player_name_to_bball_ref_player_name(
                    source_name, code, year)
                player_stub_name = common_utils.player_to_stub_name(
                    player_name)

                db.players.update_one(
                    {
                        'player_index.name_stub': player_stub_name,
                        'player_index.team': code,
                        'player_index.season': year
                    },
                    {'$set': rpm_update},
                    False)
                print("Added RPM for {}".format(player_name))
def import_players_into_mongo(year):
    """Load the season's advanced-stats JSON into ``db.players``.

    Ensures a unique compound index on
    ``(player_index.name_stub, player_index.season, player_index.team)``,
    then reads ``<year_string>/players_advanced_<year>.json``. The first JSON
    entry is the header list (its second element is dropped and a "season"
    column is appended); each remaining entry is one player row. Every row is
    upserted under its stub name, stripped ``team_id`` and ``year``.

    Cell values are stored as ``float`` when convertible and verbatim
    otherwise.

    Args:
        year: Season year, used in the file name, appended to each row and
            stored as ``player_index.season``.
    """
    year_string = construct_year_string(year)

    print('creating index')
    db.players.create_index([('player_index.name_stub', pymongo.ASCENDING),
                             ('player_index.season', pymongo.ASCENDING),
                             ('player_index.team', pymongo.ASCENDING)],
                            unique=True)
    pprint(db.players.index_information())
    print('index created (or already existed)')

    with open("{}/players_advanced_{}.json".format(year_string, year),
              "r") as playerfile:
        player_json = json.load(playerfile)

    headers = player_json.pop(0)
    headers.pop(1)
    headers.append(("Season", "season", None))

    for player_row in player_json:
        player_row.append(year)
        mongo_row = {}
        # Index-based pairing is kept on purpose: a row longer than the
        # header list still fails loudly instead of being truncated.
        for column, value in enumerate(player_row):
            field_name = headers[column][1]
            try:
                mongo_row[field_name] = float(value)
            except (TypeError, ValueError):
                # ValueError: non-numeric text. TypeError: JSON null or
                # another non-scalar, which used to abort the whole import.
                mongo_row[field_name] = value

        mongo_row["player"] = mongo_row["player"].replace("*", "")
        stub_name = common_utils.player_to_stub_name(mongo_row["player"])
        print("import {}_{}_{}".format(stub_name, mongo_row["team_id"], year))
        db.players.update_one(
            {
                'player_index.name_stub': stub_name,
                'player_index.team': mongo_row["team_id"].strip(),
                'player_index.season': year
            },
            {'$set': mongo_row},
            True)
def _bball_ref_team_code_for_year(team_code, year_as_int):
    """Map "WAS" to "WSB" for seasons before 1998; return other codes as-is."""
    if team_code == "WAS" and year_as_int < 1998:
        return "WSB"
    return team_code


def store_player_and_possession_data_for_matrix(year, season_type):
    """Index the players seen in a season's possessions and store their counts.

    Walks every possession in ``db.possessions[season_type][year_string]``
    (capped by the module-level ``limit``), converts each lineup entry to a
    ``"<player>_<team>"`` key and builds ``player_info`` mapping that key to
    ``{"index", "possessions", "stub_name"}``. Afterwards:

    * each known player's ``"<season_type>_possessions"`` field is upserted
      into ``db.players``;
    * for ``season_type == "regular_season"`` the whole ``player_info`` dict
      is written to the season document in ``db.seasons``;
    * for ``season_type == "playoffs"`` a player that cannot be found is
      appended to the season document's existing ``player_info`` with the
      next free index.

    Args:
        year: Season year; must be convertible with ``int``.
        season_type: Used as a collection key and field prefix; the code
            branches on the values "regular_season" and "playoffs".
    """
    player_info = {}
    # Keys that failed the lookup once. Without this, every later possession
    # re-logged the player and, in playoffs, re-registered them while bumping
    # num_players again, so their index changed on each appearance.
    unresolved = set()

    year_string = common_utils.construct_year_string(year)
    year_as_int = int(year)
    possessions = db.possessions[season_type][year_string].find().limit(limit)
    count = 0
    player_index = 0
    # NOTE(review): Cursor.count() was deprecated in PyMongo 3.7 and removed
    # in 4.0; kept because this value only feeds the progress message.
    num_possessions = possessions.count()
    games_data = db.seasons.find_one(
        {"year_string": year_string})["{}_games_data".format(season_type)]

    # player info should already be calculated, this is to get index
    if season_type == "playoffs":
        num_players = len(db.seasons.find_one({
            "year_string": year_string
        })["player_info"])

    for possession in possessions:
        count += 1
        if count % 20000 == 0:
            print("Poss count {}/{}".format(count, num_possessions))

        lineups_with_team_names = convert_lineups_to_player_team_strings(
            possession, games_data)

        for lineup_type in ("home_lineup", "away_lineup"):
            for player_team in lineups_with_team_names[lineup_type]:
                player_name, team_code = player_team.split("_")
                if player_name == "None":
                    continue

                # player was already seen in a lineup
                if player_team in player_info:
                    player_info[player_team]["possessions"] += 1
                    continue
                if player_team in unresolved:
                    continue

                stub_name = common_utils.player_to_stub_name(player_name)
                team_code = _bball_ref_team_code_for_year(
                    team_code, year_as_int)

                # found the player in bball ref database
                if common_utils.player_exists(stub_name, team_code,
                                              year_as_int):
                    # NOTE(review): the first sighting stores 0, so the final
                    # count is appearances minus one -- confirm intended.
                    player_info[player_team] = {
                        "index": player_index,
                        "possessions": 0,
                        "stub_name": stub_name
                    }
                    player_index += 1
                    continue

                unresolved.add(player_team)
                print("{}_{}_{} not found.".format(stub_name, team_code,
                                                   year_as_int))
                if season_type == "playoffs":
                    print("\n{}_{}_{} played in playoffs and not in regular season.".format(
                        stub_name, team_code, year_as_int))
                    print("[{}][{}]: {}".format(
                        possession["possession_metadata"]["gid"],
                        possession["possession_metadata"]["event_num"],
                        possession["possession_metadata"]["message"]
                    ))
                    print("\tHome", lineups_with_team_names["home_lineup"])
                    print("\tAway", lineups_with_team_names["away_lineup"])

                    db.seasons.find_one_and_update(
                        {"year_string": year_string},
                        {
                            "$set": {
                                "player_info.{}".format(player_team): {
                                    "index": num_players,
                                    "possessions": 0,
                                    "stub_name": stub_name
                                }
                            }
                        }
                    )
                    num_players += 1

    for player_team, info in player_info.items():
        _, team_code = player_team.split("_")
        # Use the same code the existence check used. Filtering on the raw
        # "WAS" for a pre-1998 season matched nothing and, with upsert=True,
        # created a stray document.
        team_code = _bball_ref_team_code_for_year(team_code, year_as_int)
        db.players.update_one(
            filter={
                "player_index.name_stub": info["stub_name"],
                "player_index.team": team_code,
                "player_index.season": year_as_int
            },
            update={
                "$set": {
                    "{}_possessions".format(season_type): info["possessions"]
                }
            },
            upsert=True
        )

    if season_type == "regular_season":
        db.seasons.find_one_and_update(
            {"year_string": year_string},
            {"$set": {"player_info": player_info}}
        )
def calculate_rapm(year, X, Y, metric="rapm", season_type="regular_season"):
    """Fit a linear model on (X, Y) and store per-player coefficients.

    The estimator is selected by ``metric`` ("apm", "rapm", "rapm_enet_cv",
    "rapm_bayes_ridge" or "rapm_cv"). Coefficients are paired, in iteration
    order, with the keys of the season document's ``player_info``: the first
    block of coefficients becomes the offensive value, the block starting at
    ``len(player_info)`` (negated) the defensive value, both scaled by a
    per-metric multiplier. Each player's ``o<metric>_<season_type>``,
    ``d<metric>_<season_type>`` and ``<metric>_<season_type>`` fields are
    then set in ``db.players`` and up to 50 ranked players are printed.

    Args:
        year: Season year; must be convertible with ``int``.
        X: Design matrix handed to ``clf.fit``.
        Y: Target values handed to ``clf.fit``.
        metric: Name of the model variant; also used to build field names.
        season_type: Suffix for the stored field names.

    Returns:
        The fitted estimator.

    Raises:
        RuntimeError: If ``metric`` is not one of the recognised names.
    """
    season = int(year)
    year_string = common_utils.construct_year_string(year)

    print('fitting model...')
    started_at = time.perf_counter()

    # metric name -> (estimator factory, scale applied to coefficients)
    model_table = {
        "apm": (lambda: linear_model.LinearRegression(), 10),
        "rapm": (lambda: linear_model.Ridge(alpha=2900), 100),
        "rapm_enet_cv": (
            lambda: linear_model.ElasticNetCV(l1_ratio=.9, cv=3), 1),
        "rapm_bayes_ridge": (lambda: linear_model.BayesianRidge(), 1),
        "rapm_cv": (
            lambda: linear_model.RidgeCV(
                alphas=list(range(2600, 3000, 50)), cv=5), 100),
    }
    if metric not in model_table:
        raise RuntimeError("{} not recognized".format(metric))
    build_model, multiplier = model_table[metric]
    clf = build_model()

    clf.fit(X, Y)

    if metric in ("rapm_enet_cv", "rapm_cv"):
        print("alpha chosen: {}".format(clf.alpha_))
    if metric == "rapm_enet_cv":
        print("l1_ratio chosen: {}".format(clf.l1_ratio_))
    print("time took {} seconds".format(time.perf_counter() - started_at))

    all_players_dict = db.seasons.find_one(
        {"year_string": year_string})["player_info"]
    print("coefficients", clf.coef_.shape)
    print("num of players", len(all_players_dict))

    offense = clf.coef_ * multiplier
    defense = clf.coef_[len(all_players_dict):] * -1 * multiplier
    rapm = list(zip(all_players_dict, offense, defense))
    print(all_players_dict)
    print(len(rapm))

    combined_field = metric + "_" + season_type
    for player_key, opm, dpm in rapm:
        player_name, team_code = player_key.split("_")
        stub_name = common_utils.player_to_stub_name(player_name)
        print ("updating {}_{}_{}".format(stub_name, team_code, season))
        db.players.update_one(
            {
                "player_index.name_stub": stub_name,
                "player_index.team": team_code,
                "player_index.season": season
            },
            {
                "$set": {
                    "o" + metric + "_" + season_type: opm,
                    "d" + metric + "_" + season_type: dpm,
                    combined_field: opm + dpm
                }
            }
        )

    print("Sorted & filtered {} for {}".format(metric, year_string))
    poss_threshold = 1000
    sorted_opm = db.players.find(
        filter={"player_index.season": season,
                "possessions": {"$gt": poss_threshold}},
        projection={"player": 1, combined_field: 1},
        sort=[(combined_field, -1)]
    )

    remaining = 50
    for ranked in sorted_opm:
        if "player" not in ranked:
            continue
        print("{:20} {:>10}".format(ranked["player"], ranked[combined_field]))
        remaining -= 1
        if remaining == 0:
            break

    return clf
def get_and_parse_event_list_for_lineups(event_list, team_info, year):
    """
    Take rowSet of events from API call and return a list of equal size with
    the 5 man lineup on the floor for each team for each event

    Lineups are reset whenever the period changes. A player is added to his
    team's lineup when he is first mentioned in an event (while the lineup
    holds fewer than 5 names) or when he is substituted in; a newly
    discovered player is also back-filled into every event recorded since the
    start of the current period. The nba.com player ids seen along the way
    are written to ``db.players`` as ``nba_com_id``.

    If a player name cannot be converted (the converter raises
    ``RuntimeError``), the problem is printed and the name from the event is
    used unchanged for that event.

    Args:
        event_list: Sequence of event rows, indexed through ``event_field``.
        team_info: Mapping with "home_team" and "away_team" entries; their
            values are the keys of each returned lineup dict.
        year: Season year; must be convertible with ``int``.

    Returns:
        One dict per event holding a set of player names per team plus
        "event_num".

    Example output format:
    [
        {
            "Lakers": ["Lonzo Ball", "Brandon Ingram", "Corey Brewer", "Julius Randle", "Brook Lopez"],
            "Warriors": ["Stephen Curry", "Kevin Durant", "Klay Thompson", "Draymond Green", "Andre Iguodala"],
            "event_num": 2
        },
        ...
    ]
    """
    player_ids = {}
    year_as_int = int(year)
    home_team = team_info["home_team"]
    away_team = team_info["away_team"]
    lineups = {home_team: set(), away_team: set()}
    event_with_lineups_list = []
    current_quarter = -1
    quarter_start_index = 0

    for index, event in enumerate(event_list):
        # reset lineups when quarter changes
        if current_quarter != event[event_field["PERIOD"]]:
            current_quarter = event[event_field["PERIOD"]]
            lineups[home_team] = set()
            lineups[away_team] = set()
            quarter_start_index = index

        is_substitution = (
            event[event_field["EVENTMSGTYPE"]] == event_type["SUB"])

        # this piece of code is for a single empty event toward the end of this game:
        # http://stats.nba.com/game/0020000883/playbyplay/ that causes problems otherwise
        if is_substitution and get_message(event) is None:
            pass
        # substitution event
        # player 1 is being substituted out
        # player 2 is coming in
        elif is_substitution:
            team = common_utils.convert_nba_espn_team_codes_to_bball_ref(
                event[event_field["PLAYER1_TEAM_ABBREVIATION"]], year_as_int)

            # Seed with the raw names so a failed conversion can no longer
            # leave these unbound, or silently reuse the names from an
            # earlier substitution.
            player_to_sub_out = event[event_field["PLAYER1_NAME"]]
            player_to_sub_in = event[event_field["PLAYER2_NAME"]]
            try:
                player_to_sub_out = common_utils.nba_com_player_name_to_bball_ref_player_name(
                    player_to_sub_out, team, year_as_int)
            except RuntimeError:
                print_event(event)
            try:
                player_to_sub_in = common_utils.nba_com_player_name_to_bball_ref_player_name(
                    player_to_sub_in, team, year_as_int)
            except RuntimeError:
                print_event(event)

            if (player_to_sub_out, team, year_as_int) not in player_ids:
                player_ids[(player_to_sub_out, team, year_as_int)] = event[
                    event_field["PLAYER1_ID"]]
            if (player_to_sub_in, team, year_as_int) not in player_ids:
                player_ids[(player_to_sub_in, team, year_as_int)] = event[
                    event_field["PLAYER2_ID"]]

            if player_to_sub_out not in lineups[team]:
                # backfill: he must have been on the floor since the period
                # started even though no earlier event mentioned him
                for e in event_with_lineups_list[quarter_start_index:]:
                    e[team].add(player_to_sub_out)

            lineups[team].discard(player_to_sub_out)
            lineups[team].add(player_to_sub_in)

        event_with_lineups_list.append({
            home_team: lineups[home_team].copy(),
            away_team: lineups[away_team].copy(),
            "event_num": event[event_field["EVENTNUM"]],
        })

        # player1, player2, player3
        if not is_substitution:
            name_columns = [
                event_field['PLAYER1_NAME'], event_field['PLAYER2_NAME'],
                event_field['PLAYER3_NAME']
            ]
            for p_index in name_columns:
                # team code is always an offset of 4 from player name
                if not (event[p_index] and event[p_index + 4]):
                    continue

                team_code = common_utils.convert_nba_espn_team_codes_to_bball_ref(
                    event[p_index + 4], year_as_int)
                # Seed with the raw name; previously a failed conversion fell
                # through with the name of whichever player was handled last.
                player_name = event[p_index]
                try:
                    player_name = common_utils.nba_com_player_name_to_bball_ref_player_name(
                        player_name, team_code, year_as_int)
                except RuntimeError as e:
                    print(e)

                if (player_name, team_code, year_as_int) not in player_ids:
                    # player id is offset -1 from player name
                    player_ids[(player_name, team_code,
                                year_as_int)] = event[p_index - 1]

                if (len(lineups[team_code]) < 5
                        and player_name not in lineups[team_code]):
                    lineups[team_code].add(player_name)
                    # backfill (includes the snapshot appended just above)
                    for e in event_with_lineups_list[quarter_start_index:]:
                        e[team_code].add(player_name)

    for (t_player_name, t_team_code, t_year), nba_com_id in player_ids.items():
        stub_name = common_utils.player_to_stub_name(t_player_name)
        db.players.update_one(
            {
                "player_index.name_stub": stub_name,
                "player_index.team": t_team_code,
                "player_index.season": t_year
            },
            {"$set": {"nba_com_id": nba_com_id}})

    return event_with_lineups_list
def calculate_weighted_average(year):
    """Store possession-weighted roster averages of player metrics per team.

    For every team document with ``season == year`` the current roster is
    loaded from ``db.players`` (retrying under ``common_utils.team_renames``
    when nothing is found). For each of the four seasons ``year``,
    ``year - 1`` ... ``year - 3`` and each metric ("per", "bpm", "ws_per_48",
    "rapm", "apm", plus "rpm" from 2014 on), every roster player's value in
    that earlier season is looked up, weighted by the player's *current*
    possessions and averaged. The result is written to the team document as
    ``"Y-<year_step>_<metric>"``.

    A player's earlier-season value is the possession-weighted mean over all
    of his documents for that season; when those documents total 750
    possessions or fewer, the fixed default for the metric is used instead.
    Seasons before 1997 are skipped.

    Args:
        year: Season year; must be convertible with ``int``. It is passed
            unchanged to the ``db.teams`` queries.
    """
    year_as_int = int(year)
    metrics_averages = {
        "per": 15.0,
        "bpm": 0.0,
        "ws_per_48": 0.100,
        "rapm": 0.0,
        "apm": 0.0
    }
    if year_as_int >= 2014:
        metrics_averages["rpm"] = 0.0

    teams = db.teams.find({"season": year})
    for team in teams:
        team_code = team["team_code"]

        # get the players to loop through
        players = list(db.players.find({
            "player_index.season": year_as_int,
            "player_index.team": team_code
        }))
        if not players and team_code in common_utils.team_renames:
            players = list(db.players.find({
                "player_index.season": year_as_int,
                "player_index.team": common_utils.team_renames[team_code]
            }))
        if not players:
            # Previously formatted year_to_evaluate, which is not assigned
            # yet at this point (or is left over from the previous team).
            print("could not find {}_{}".format(
                year, team["team_index"]["team_id"]))
            continue

        for year_step in range(4):
            timestamp = time.perf_counter()
            # int arithmetic so a string year works here as it does above
            year_to_evaluate = year_as_int - year_step

            for metric in metrics_averages:
                total_possessions = 0
                weighted_average = 0
                print("{} for {} {}\n".format(metric, year_to_evaluate,
                                              team_code))
                if year_to_evaluate < 1997:
                    print("no data available for prior to 1997")
                    continue

                for player in players:
                    # original measure was 250 minutes
                    if "possessions" in player and "player" in player:
                        stub_name = common_utils.player_to_stub_name(
                            player["player"])
                        prev_season_player_lookup = db.players.find({
                            "player_index.season": year_to_evaluate,
                            "player_index.name_stub": stub_name
                        })
                        if not player_exists(stub_name, year_to_evaluate):
                            if player_minus_jr_exists(stub_name,
                                                      year_to_evaluate):
                                stub_name = stub_name[:len(stub_name) - 2]
                                print("substracted suffix, found {}".format(
                                    stub_name))
                            if stub_name in common_utils.nba_com_stubs_to_bball_ref_stubs:
                                stub_name = common_utils.nba_com_stubs_to_bball_ref_stubs[
                                    stub_name]
                            # if player still doesn't exist, then idk
                            if not player_exists(stub_name, year_to_evaluate):
                                print(
                                    "{}_{} not found in scraped bball_ref players page"
                                    .format(stub_name, year_to_evaluate))
                            else:
                                prev_season_player_lookup = db.players.find({
                                    "player_index.season": year_to_evaluate,
                                    "player_index.name_stub": stub_name,
                                    "player_index.team": team_code
                                })

                        metric_value = 0
                        prev_poss_count = 0
                        # calculate metric value (done in this way because to combine entries for players who were traded)
                        for prev_player_season in prev_season_player_lookup:
                            # Entries without possessions carry no weight.
                            # They used to overwrite the running sum with the
                            # default value.
                            if "possessions" not in prev_player_season:
                                continue
                            prev_possessions = prev_player_season["possessions"]
                            try:
                                contribution = float(
                                    prev_player_season[metric]
                                ) * prev_possessions
                            except (KeyError, TypeError, ValueError):
                                print("error trying to convert to float ",
                                      metric, prev_player_season)
                                continue
                            # Count possessions only once the value is known
                            # to be usable, so a missing metric cannot dilute
                            # the mean toward zero.
                            prev_poss_count += prev_possessions
                            metric_value += contribution

                        if prev_poss_count > 750:
                            metric_value = metric_value / prev_poss_count
                        else:
                            metric_value = metrics_averages[metric]

                        try:
                            total_possessions += player["possessions"]
                            weighted_average += player[
                                "possessions"] * metric_value
                        except TypeError:
                            print(player)
                    else:
                        player_name = player.get(
                            "player") or "unknown_player_name"
                        # Separate local: assigning to team_code here used to
                        # clobber the team being processed for all later
                        # queries and the final update.
                        missing_team = player.get("team_id") or "unknown_team"
                        player_key = "{}_{}_{}".format(player_name, year,
                                                       missing_team)
                        # NOTE(review): players_without_possessions is not
                        # defined in this function; presumably a module-level
                        # set -- confirm it exists.
                        players_without_possessions.add(player_key)

                if total_possessions == 0:
                    print(players)
                    continue

                weighted_average = weighted_average / total_possessions
                print("{}_{} has a weighted average of {} {}\n".format(
                    team_code, year_to_evaluate, weighted_average, metric))
                db.teams.update_one(
                    {
                        "team_index.season": year,
                        "team_index.team_id": team_code
                    },
                    {
                        "$set": {
                            "Y-{}_{}".format(year_step, metric):
                            weighted_average
                        }
                    })

            print("time took {} seconds for one year step".format(
                time.perf_counter() - timestamp))

    pprint(players_without_possessions)