def get_raw_data(): sql_cols = [ f"oa_{prefix}_{m}" for m in OUTPUT_METRIC_DICT.keys() for prefix in ["tm", "opp"] ] tm1_sql_cols = [f"{x} as tm1_{x}" for x in sql_cols] tm2_sql_cols = [f"{x} as tm2_{x}" for x in sql_cols] q = f""" select g.date ,g.season ,g.game_key ,g.tm_teamid ,g.opp_teamid ,g.tm_pts ,g.opp_pts ,g.tm_win ,v.tm_closeline ,v.tm_ml ,td1.{', td1.'.join(tm1_sql_cols)} ,td2.{', td2.'.join(tm2_sql_cols)} from games g left join teamdates td1 on g.date = td1.date and g.tm_teamid = td1.teamid left join teamdates td2 on g.date = td2.date and g.opp_teamid = td2.teamid left join calendar c on c.date = g.date left join vegas_lines v on g.season = v.season and g.date = v.date and g.game_key = v.game_key and g.tm_teamid = v.tm_teamid where c.day_num >= 60 order by g.date, g.game_key, g.tm_teamid -- limit 1000 """ raw_td = readSql(q) raw_td.dropna( how="any", subset=[col for col in raw_td.columns if col[:3] in ["tm1", "tm2"]], inplace=True, ) return raw_td
def load_calendar(): "Relies on games to be loaded into database" q = """ select season ,min(date) as min ,max(date) as max from games group by season """ seasonGameDates = readSql(q) season_list: list[int] = [] dates: list[str] = [] day_nums: list[int] = [] for seas, series in seasonGameDates.iterrows(): season_dates = list( pd.date_range(series["min"], series["max"], freq="D").strftime("%Y-%m-%d") ) dates += season_dates season_list += [seas] * len(season_dates) day_nums += [x + 1 for x in range(len(season_dates))] cal = pd.DataFrame({"season": season_list, "date": dates, "day_num": day_nums}) # drop old, make table, add indexe, load data executeSql("drop table if exists calendar") q = f""" create table calendar ( season integer not null, date TEXT not null, day_num integer not null, primary key (season asc, date asc) ) """ executeSql(q) for p in get_unique_permutations(cal.columns): executeSql(f"CREATE INDEX calendar_{'_'.join(p)} ON calendar ({', '.join(p)})") dfToTable(cal, "calendar", "ncaa.db", ifExists="append")
def load_games(): "Depends on teams" # Read on data seasons = pd.read_csv("./data/MSeasons.csv") games = pd.read_csv("./data/MRegularSeasonDetailedResults.csv") # Normalize col names games.columns = [x.lower() for x in games.columns] seasons.columns = [x.lower() for x in seasons.columns] # Get dame dates seasons["dayzero"] = pd.to_datetime(seasons["dayzero"]) games = games.merge(seasons[["season", "dayzero"]], on="season") games["date"] = games.apply( lambda row: (row["dayzero"] + timedelta(days=row["daynum"])).strftime( "%Y-%m-%d" ), axis=1, ) del games["dayzero"], games["daynum"] # Rename columns rename_map = {"w": "tm", "l": "opp"} rename = {"numot": "num_ot", "wscore": "tm_pts", "lscore": "opp_pts"} for col in [ "teamid", "loc", "fgm", "fga", "fgm3", "fga3", "ftm", "fta", "or", "dr", "ast", "to", "stl", "blk", "pf", ]: for prefix in ["w", "l"]: rename[f"{prefix}{col}"] = f"{rename_map[prefix]}_{col}" games.rename(columns=rename, inplace=True) games["opp_loc"] = np.where( games["tm_loc"] == "N", "N", np.where(games["tm_loc"] == "A", "H", "A") ) # Add game key teams = readSql("select * from teams") teams_dict = {} for idx, row in teams.iterrows(): teams_dict[row["teamid"]] = row["teamname"] games["game_key"] = games.apply( lambda x: f"{teams_dict[x['tm_teamid']]}>{teams_dict[x['opp_teamid']]}", axis=1 ) # Add additional columns for prefix in ["tm", "opp"]: games[f"{prefix}_fga2"] = games[f"{prefix}_fga"] - games[f"{prefix}_fga3"] games[f"{prefix}_fgm2"] = games[f"{prefix}_fgm"] - games[f"{prefix}_fgm3"] games[f"{prefix}_tr"] = games[f"{prefix}_or"] + games[f"{prefix}_dr"] games[f"{prefix}_mins"] = games[f"{prefix}_mins"] = 40 + games["num_ot"] * 5 games[f"{prefix}_game"] = 1 games[f"{prefix}_win"] = 1 * ( games[f"{prefix}_pts"] > games[f"{OTHERPREFIXMAP[prefix]}_pts"] ) games[f"{prefix}_loss"] = games[f"{prefix}_game"] - games[f"{prefix}_win"] # games[f"{prefix}_rsgame"] = 1 games[f"{prefix}_poss"] = ( ( games["tm_fga"] + 0.4 * games["tm_fta"] - 1.07 * (games["tm_or"] / (games["tm_or"] + games["opp_dr"])) * (games["tm_fga"] - games["tm_fgm"]) + games["tm_to"] ) + ( games["opp_fga"] + 0.4 * games["opp_fta"] - 1.07 * (games["opp_or"] / (games["opp_or"] + games["tm_dr"])) * (games["opp_fga"] - games["opp_fgm"]) + games["opp_to"] ) ) * 0.5 games[f"{prefix}_margin"] = ( games[f"{prefix}_pts"] - games[f"{OTHERPREFIXMAP[prefix]}_pts"] ) games[f"{prefix}_availor"] = ( games[f"{prefix}_or"] + games[f"{OTHERPREFIXMAP[prefix]}_dr"] ) del games["num_ot"] # Duplicate and rename def rename_col(col): if fnmatch(col, "tm_*"): return f"opp_{col[3:]}" elif fnmatch(col, "opp_*"): return f"tm_{col[4:]}" else: return col dup_games = games.copy() dup_games.columns = [rename_col(col) for col in dup_games.columns] games = pd.concat([games, dup_games], ignore_index=True) games = games.sort_values(by=["date", "game_key"]).reset_index(drop=True) # Reorder columns for easier analysis first_cols = [ "date", "season", "game_key", "tm_teamid", "opp_teamid", "tm_loc", "opp_loc", "tm_pts", "opp_pts", ] remainder_cols = sorted([x for x in games.columns if x not in first_cols]) games = games[first_cols + remainder_cols] # TODO add postseason # Drop old, set up new, add indexes, load data executeSql("drop table if exists games") q = f""" create table games ( season integer not null, date TEXT not null, game_key TEXT not null, tm_teamid integer not null, opp_teamid integer not null, tm_loc string not null, opp_loc string not null, tm_pts integer not null, opp_pts integer not null, {' real not null, '.join(remainder_cols)} real not null, primary key (season, date asc, game_key asc, tm_teamid asc) ) """ executeSql(q) perms = get_unique_permutations( ["date", "season", "game_key", "tm_teamid", "opp_teamid"] ) log.info(f"Creating {len(perms)} indexes on games") for p in perms: executeSql(f"CREATE INDEX games_{'_'.join(p)} on games ({', '.join(p)})") dfToTable( games, table="games", db="ncaa.db", ifExists="append", )
def single_season_vegas_load(season: int): df = pd.read_excel( f"https://www.sportsbookreviewsonline.com/scoresoddsarchives/ncaabasketball/ncaa%20basketball%20{season - 1}-{str(season)[-2:]}.xlsx" ) df.columns = [x.lower() for x in df.columns] # Drop bad dates - date must be >100 pre_drop_len = len(df) bad_dates = df["date"] < 100 log.debug(f"Dropping {sum(bad_dates) * 2:,.0f} bad dated records") idx_to_drop = [] for idx, _ in df.loc[bad_dates].iterrows(): idx_to_drop.append(idx) if idx % 2 == 1: # If odd, drop even number prior idx_to_drop.append(idx - 1) else: # If even, drop next number idx_to_drop.append(idx + 1) df = df.drop(idx_to_drop).reset_index(drop=True) assert len(df) == pre_drop_len - sum(bad_dates) * 2, "missing a date or two" # Parse date from integer date column df[["mo", "dy"]] = df["date"].apply(lambda x: [str(x)[:-2], str(x)[-2:]]).to_list() df["year"] = np.where(df["mo"].astype(int) < 7, str(season), str(season - 1)) df["date"] = pd.to_datetime( df["year"] + "-" + df["mo"] + "-" + df["dy"] ).dt.strftime("%Y-%m-%d") df.drop(inplace=True, labels=["year", "mo", "dy"], axis=1) # Self-join to get all game data in one row df = pd.merge( df.iloc[[x for x in df.index if x % 2 == 0]].reset_index(drop=True), df.iloc[[x for x in df.index if x % 2 == 1]].reset_index(drop=True), left_index=True, right_index=True, suffixes=["_tm", "_opp"], ) preLen = len(df) df = df.loc[df["date_tm"] == df["date_opp"]].reset_index(drop=True) df = df.loc[np.abs((df["rot_tm"] - df["rot_opp"])) == 1].reset_index(drop=True) if len(df) != preLen: log.debug(f"Dropped {preLen-len(df)} records due to date mismatch") assert (df["date_tm"] == df["date_opp"]).all(), "Mismatch dates" assert np.abs((df["rot_tm"] - df["rot_opp"])).max() == 1, "Mismatch rots" # remove unused rows, rename some things df.drop( ["date_opp", "rot_tm", "rot_opp"] + [f"{x}_{y}" for y in ["tm", "opp"] for x in ["1st", "2nd", "2h"]], axis=1, inplace=True, ) df.rename(columns={"date_tm": "date"}, inplace=True) # Get team IDs ts = pd.read_csv("data/MTeamSpellings.csv", sep=",", encoding="cp1252") ts.columns = [x.lower() for x in ts.columns] ts["teamnamespelling_nospace"] = ts["teamnamespelling"].str.replace(" ", "") ts = ts.groupby(["teamid", "teamnamespelling_nospace"]).size().reset_index() for col in ["team_tm", "team_opp"]: df[col] = df[col].str.lower() df[col] = df[col].str.replace(".", "", regex=False) df[col] = df[col].str.replace("\xa0", "", regex=False) df = ( pd.merge( df, ts[["teamnamespelling_nospace", "teamid"]], left_on=["team_tm"], right_on=["teamnamespelling_nospace"], how="left", ) .rename(columns={"teamid": "tm_teamid"}) .drop(["teamnamespelling_nospace"], axis=1) .merge( ts[["teamnamespelling_nospace", "teamid"]], left_on=["team_opp"], right_on=["teamnamespelling_nospace"], how="left", ) .rename(columns={"teamid": "opp_teamid"}) .drop(["teamnamespelling_nospace"], axis=1) ) # TODO handle this sorted( list( set( list(df.loc[pd.isna(df["tm_teamid"])]["team_tm"].unique()) + list(df.loc[pd.isna(df["opp_teamid"])]["team_opp"].unique()) ) ) ) # Drop missing teams preLen = len(df) df = df.loc[(~pd.isna(df["tm_teamid"])) & (~pd.isna(df["opp_teamid"]))].reset_index( drop=True ) log.debug( f"Dropped {preLen - len(df)} records due to missing names {(preLen - len(df)) / preLen:.2%}" ) # Figure out open and close for col in ["open_tm", "close_tm", "open_opp", "close_opp"]: df[col] = np.where(df[col].str.lower().isin(["pk", "p"]), 0, df[col]) # Drop the NL values (no line) # preLen = len(df) # df = df.loc[ # ~(df[["open_tm", "close_tm", "open_opp", "close_opp"]] == "NL").any(axis=1) # ].reset_index(drop=True) # print( # f"Dropped {preLen - len(df)} records due to NL vals {(preLen - len(df)) / preLen:.2%}" # ) def get_line(r): "tm openline, tm_closeline, game_open_ou, game_close_ou, tm_ml, opp_ml" def _get_val_type(val: str | float) -> Literal["line", "ou", "NL"]: if isinstance(val, str): if val.strip() == "NL": return "NL" # print(r) return "NL" # raise ValueError(val) if 0 <= val <= 70: return "line" if val > 70: return "ou" return "NL" if r["ml_tm"] == "NL": tm_ml = None else: tm_ml = r["ml_tm"] if r["ml_opp"] == "NL": opp_ml = None else: opp_ml = r["ml_opp"] types = ( (_get_val_type(r["open_tm"]), _get_val_type(r["open_opp"])), (_get_val_type(r["close_tm"]), _get_val_type(r["close_opp"])), ) # OU is on the side the favorite is # Open open, close close, tm-opp match types[0]: case ("ou", "line"): tm_openline = r["open_opp"] game_openou = r["open_tm"] case ("ou", "NL"): tm_openline = None game_openou = r["open_tm"] case ("NL", "line"): tm_openline = r["open_opp"] game_openou = None case ("line", "ou"): tm_openline = -r["open_tm"] game_openou = r["open_opp"] case ("NL", "ou"): tm_openline = None game_openou = r["open_opp"] case ("line", "NL"): tm_openline = -r["open_tm"] game_openou = None case ("NL", "NL"): tm_openline = None game_openou = None case ("line", "line"): tm_openline = None game_openou = None case _: print(r) raise ValueError(types[0]) match types[1]: case ("ou", "line"): tm_closeline = r["close_opp"] game_closeou = r["close_tm"] case ("ou", "NL"): tm_closeline = None game_closeou = r["close_tm"] case ("NL", "line"): tm_closeline = r["close_opp"] game_closeou = None case ("line", "ou"): tm_closeline = -r["close_tm"] game_closeou = r["close_opp"] case ("NL", "ou"): tm_closeline = None game_closeou = r["close_opp"] case ("line", "NL"): tm_closeline = -r["close_tm"] game_closeou = None case ("NL", "NL"): tm_closeline = None game_closeou = None case ("ou", "ou"): tm_closeline = None game_closeou = None case ("line", "line"): tm_closeline = None game_closeou = None case _: print(r) raise ValueError(types[1]) # assert max(np.abs(tm_openline), np.abs(tm_closeline)) < min( # game_closeou, game_openou # ) return (tm_openline, tm_closeline, game_openou, game_closeou, tm_ml, opp_ml) df[ [ "tm_openline", "tm_closeline", "game_openou", "game_closeou", "tm_ml", "opp_ml", ] ] = df.apply(lambda x: get_line(x), axis=1).to_list() for col in [ "tm_openline", "tm_closeline", "game_openou", "game_closeou", "tm_ml", "opp_ml", ]: df[col] = df[col].astype(float, errors="ignore") # Bring together to games games = readSql(f"select * from games where season = {season}") df = pd.merge( df, games[["date", "tm_teamid", "opp_teamid", "season", "game_key"]], how="left", on=["date", "tm_teamid", "opp_teamid"], ) preLen = len(df) df = df.loc[~pd.isna(df["game_key"])].reset_index(drop=True) log.debug( f"Dropped {preLen - len(df)} records due to no matching game {(preLen - len(df)) / preLen:.2%}" ) # Duplicate for database dup = df.copy() dup.rename( columns={ "tm_ml": "opp_ml", "opp_ml": "tm_ml", "tm_teamid": "opp_teamid", "opp_teamid": "tm_teamid", }, inplace=True, ) dup["tm_openline"] *= -1 dup["tm_closeline"] *= -1 df = pd.concat([df, dup], ignore_index=True) df = df.sort_values(by=["date", "game_key", "tm_teamid"]).reset_index(drop=True) df = df[ [ "season", "date", "game_key", "tm_teamid", "opp_teamid", "tm_openline", "tm_closeline", "tm_ml", "opp_ml", "game_openou", "game_closeou", ] ] return df
def get_teamdates(season: int): # Get games for given season, cross join games to all dates to q = f""" with seasonDates as ( select season ,min(date) as minDate ,max(date) as maxDate from games where season = {season} ) ,dates as ( select distinct s.season ,c.date from calendar c left join seasonDates s where c.date between minDate and maxDate ) ,teamList as ( select distinct season, tm_teamid from games where season = {season} ) ,allDatesAllTeams as ( select dates.season ,dates.date ,teamList.tm_teamid from dates cross join teamList on teamList.season = dates.season ) select adat.* ,t.teamname ,{gb_sql} from allDatesAllTeams adat left join games g on adat.season = g.season and adat.tm_teamid = g.tm_teamid and adat.date > g.date left join teams t on adat.tm_teamid = t.teamid group by adat.season, adat.date, adat.tm_teamid order by adat.season, adat.tm_teamid, adat.date """ df = readSql(q, db="ncaa.db") # Get all games for filtering q = f""" select t.teamname ,g.* from games g left join teams t on g.tm_teamid = t.teamid where season = {season} """ games = readSql(q) log.info("Executed sql") # Get predate opponent stats total_rows = len(df) # Function to get sum of opponent state prior to the give row's date def opp_stats(row: pd.Series): "Removes the team's games from the data, and duplicates by number of games v opp" if (row.name + 1) % 10000 == 0: log.info(f"Progress in apply: {(row.name + 1) / total_rows:.2%}") # Get list of all opponents (includes duplicate values) opp_list = games.loc[(games["date"] < row["date"]) & ( games["tm_teamid"] == row["tm_teamid"])]["opp_teamid"].to_list() # Create DF of opponents-counts combinations, merge into all games by said opps opp_df = pd.DataFrame(Counter(opp_list).items(), columns=["tm_teamid", "n"]) opp_predate_games = games.loc[ (games["date"] < row["date"]) & (games["tm_teamid"].isin(opp_list)) & (games["opp_teamid"] != row["tm_teamid"])].merge(opp_df, how="left", on="tm_teamid") # Duplicate games per number of times they were played, get stats opp_predate_stats = opp_predate_games.loc[ opp_predate_games.index.repeat( opp_predate_games["n"])][metric_cols].sum() return opp_predate_stats df[[f"opp_{x}" for x in metric_cols]] = df.apply(lambda row: opp_stats(row), axis=1) # Add output metrics to df # Pre-create columns for performance warning reasons output_metric_cols = [ f"{t}{p}_{m}" for m in OUTPUT_METRIC_DICT.keys() for p in ["tm", "opp"] for t in ["", "oa_", "rnk_", "rnk_oa_"] ] df = pd.concat([df, pd.DataFrame(columns=output_metric_cols)], axis=1) output_metrics = [] for metric, metric_details in OUTPUT_METRIC_DICT.items(): log.debug(metric) for prefix in ["tm", "opp"]: output_metrics += [ f"{prefix}_{metric}", f"oa_{prefix}_{metric}", f"rnk_{prefix}_{metric}", f"rnk_oa_{prefix}_{metric}", ] # Ascending metrics get flipped for the opponent's acumen at them calc_ascending = metric_details["ascending"] if prefix == "opp": calc_ascending = not calc_ascending # Get metric, OA metric df[f"{prefix}_{metric}"] = ( df[f"{prefix}_{metric_details['num_col']}"] / df[f"{prefix}_{metric_details['den_col']}"]) df[f"oa_{prefix}_{metric}"] = df[f"{prefix}_{metric}"] - ( df[f"opp_{OTHERPREFIXMAP[prefix]}_{metric_details['num_col']}"] / df[f"opp_{OTHERPREFIXMAP[prefix]}_{metric_details['den_col']}"] ) # Round to 6 decimals df[f"{prefix}_{metric}"] = np.round(df[f"{prefix}_{metric}"], 6) df[f"oa_{prefix}_{metric}"] = np.round(df[f"oa_{prefix}_{metric}"], 6) # Rank df[f"rnk_{prefix}_{metric}"] = df.groupby( ["date"])[f"{prefix}_{metric}"].rank(ascending=calc_ascending, method="min") df[f"rnk_oa_{prefix}_{metric}"] = df.groupby([ "date" ])[f"oa_{prefix}_{metric}"].rank(ascending=calc_ascending, method="min") # Trim to final columns out_first_cols = [ "teamid", "game", "win", "loss", "poss", "mins", ] renames = {} for col in out_first_cols: renames[f"tm_{col}"] = col df.rename(columns=renames, inplace=True) out = df[["season", "date"] + out_first_cols + output_metric_cols] return out
out = df[["season", "date"] + out_first_cols + output_metric_cols] return out # %% if __name__ == "__main__": # Drop old table q = "drop table if exists teamdates" executeSql(q, "ncaa.db") # Create table # Loop through seasons in games for season in readSql("select distinct season from games")["season"]: log.info(f"{season} season") df = get_teamdates(season=season) dfToTable( df, table="teamdates", db="ncaa.db", ifExists="append", indexCols=["season", "date", "teamid"], ) for p in get_unique_permutations(["season", "date", "teamid"]): executeSql( f"CREATE INDEX teamdates_{'_'.join(p)} on teamdates ({', '.join(p)})" )
all_years = sorted(td["season"].unique().tolist()) for test_year in all_years: print(f"Testing {test_year} season") # Train-test split train_mask = td["season"] != test_year test_mask = td["season"] == test_year X_train = x_scale_pca[train_mask] X_test = x_scale_pca[test_mask] X_train_incl_line = x_scale_pca_incl_line[train_mask] X_test_incl_line = x_scale_pca_incl_line[test_mask] for id, model in reg_models.items(): already_done = ( readSql("Select distinct season, id from predictions") .to_numpy() .tolist() ) if [test_year, id] in already_done: print(f"Skipping {id} for {test_year}") continue # Set up output td_test = td.loc[test_mask][[x for x in td.columns if x not in X_cols]] td_test["id"] = id td_test["config"] = str(model.get_params()) # Get margin prediction y = td["tm_margin"] y_train = y[train_mask] y_test = y[test_mask] model.fit(X_train, y_train)
# %% Imports import numpy as np from helpers import readSql import plotly.express as px # %% Get and visualize regression models q = """ select * from predictions p where substr(id, 3, 1) == 'r' """ regs = readSql(q) # %% Select a config q = "select distinct config from predictions" cfg = readSql(q)["config"].to_list() print("Please select a config option...") for idx, val in enumerate(cfg): print(f"{idx} | {val}") selected = int(input("Enter a num")) sel_cfg = cfg[selected] # %% q = f""" select p.* ,v.tm_closeline ,v.tm_ml ,v.opp_ml from predictions p