def load_teams():
    """Load MTeams.csv into the `teams` table, replacing any prior contents.

    No dependencies — this is the first loader to run.
    """
    teams = pd.read_csv("./data/MTeams.csv")
    # Normalize column names to lowercase to match the rest of the schema
    teams.columns = [x.lower() for x in teams.columns]

    # Load teams into db (keyword arg for consistency with other dfToTable calls)
    dfToTable(teams, "teams", "ncaa.db", ifExists="replace")

    # Index every column permutation for fast ad-hoc lookups
    for p in get_unique_permutations(["teamid", "teamname"]):
        executeSql(f"CREATE INDEX teams_{'_'.join(p)} ON teams ({', '.join(p)})")
def load_calendar():
    """Build the `calendar` table: one row per (season, date) with a 1-based day_num.

    Relies on games to be loaded into database.
    """
    q = """
    select
        season
        ,min(date) as min
        ,max(date) as max
    from games
    group by season
    """
    seasonGameDates = readSql(q)

    season_list: list[int] = []
    dates: list[str] = []
    day_nums: list[int] = []
    # BUG FIX: iterrows() yields (index, row); with a default RangeIndex the
    # first element is 0..n-1, NOT the season. Read the season from the row.
    # (Assumes readSql does not set `season` as the index — TODO confirm.)
    for _, series in seasonGameDates.iterrows():
        season_dates = list(
            pd.date_range(series["min"], series["max"], freq="D").strftime("%Y-%m-%d")
        )
        dates += season_dates
        season_list += [series["season"]] * len(season_dates)
        # day_num is 1-based within each season
        day_nums += [x + 1 for x in range(len(season_dates))]
    cal = pd.DataFrame({"season": season_list, "date": dates, "day_num": day_nums})

    # drop old, make table, add indexes, load data
    executeSql("drop table if exists calendar")
    q = """
    create table calendar (
        season integer not null,
        date TEXT not null,
        day_num integer not null,
        primary key (season asc, date asc)
    )
    """
    executeSql(q)
    for p in get_unique_permutations(cal.columns):
        executeSql(f"CREATE INDEX calendar_{'_'.join(p)} ON calendar ({', '.join(p)})")
    dfToTable(cal, "calendar", "ncaa.db", ifExists="append")
def load_vegas(seasons=range(2008, 2023)):
    """Drop/recreate the `vegas_lines` table, index it, and load it season by season.

    Args:
        seasons: iterable of season years to load. Defaults to 2008-2022,
            matching the previously hard-coded range (backward compatible).
    """
    executeSql("drop table if exists vegas_lines")
    # Line/ML/OU columns are nullable — not every game has every market
    q = """
    create table vegas_lines (
        season integer not null,
        date TEXT not null,
        game_key TEXT not null,
        tm_teamid integer not null,
        opp_teamid integer not null,
        tm_openline real,
        tm_closeline real,
        tm_ml real,
        opp_ml real,
        game_openou real,
        game_closeou real,
        primary key (season, date asc, game_key asc, tm_teamid asc)
    )
    """
    executeSql(q)

    perms = get_unique_permutations(
        ["date", "season", "game_key", "tm_teamid", "opp_teamid"]
    )
    log.info(f"Creating {len(perms)} indexes on vegas_lines")
    for p in perms:
        executeSql(f"CREATE INDEX vegas_{'_'.join(p)} on vegas_lines ({', '.join(p)})")

    # Load one season at a time so a failure mid-run keeps earlier seasons
    for season in seasons:
        log.info(season)
        df = single_season_vegas_load(season)
        dfToTable(
            df,
            table="vegas_lines",
            db="ncaa.db",
            ifExists="append",
        )
def prompt_remake_predictions_table():
    """Interactively drop and recreate the `predictions` table with its indexes.

    Prompts on stdin; anything other than 'y' (case-insensitive) aborts.
    """
    if input("Drop/remake old table? y/(n): ").lower() != "y":
        return

    print("Dropping old table")
    executeSql("drop table if exists predictions")
    # FIX: columns were declared `string`, which SQLite maps to NUMERIC
    # affinity (numeric-looking values get coerced). Use TEXT, consistent
    # with the games/calendar/vegas_lines tables.
    q = """
    create table predictions (
        id TEXT not null,
        config TEXT not null,
        season integer not null,
        date TEXT not null,
        game_key TEXT not null,
        tm_teamid integer not null,
        opp_teamid integer not null,
        tm_closeline real not null,
        tm_ml real not null,
        tm_pts integer not null,
        opp_pts integer not null,
        tm_margin integer not null,
        tm_margin_ats integer not null,
        tm_win integer not null,
        tm_win_ats integer not null,
        pred_win_prob real not null,
        pred_win_ats_prob real not null,
        pred_margin real not null,
        pred_margin_ats real not null,
        primary key (id, config, date asc, game_key asc, tm_teamid asc)
    )
    """
    executeSql(q)
    for p in get_unique_permutations(["id", "config", "date", "game_key", "tm_teamid"]):
        executeSql(
            f"CREATE INDEX predictions_{'_'.join(p)} on predictions ({', '.join(p)})"
        )
def load_games():
    """Load regular-season detailed results into the `games` table.

    Depends on teams (reads the `teams` table for the game_key names).
    Each game is loaded twice — once from each team's perspective — with
    tm_/opp_ column prefixes swapped on the duplicate.
    """
    # Read in data
    seasons = pd.read_csv("./data/MSeasons.csv")
    games = pd.read_csv("./data/MRegularSeasonDetailedResults.csv")

    # Normalize col names
    games.columns = [x.lower() for x in games.columns]
    seasons.columns = [x.lower() for x in seasons.columns]

    # Get game dates: daynum is an offset from each season's dayzero
    seasons["dayzero"] = pd.to_datetime(seasons["dayzero"])
    games = games.merge(seasons[["season", "dayzero"]], on="season")
    games["date"] = games.apply(
        lambda row: (row["dayzero"] + timedelta(days=row["daynum"])).strftime(
            "%Y-%m-%d"
        ),
        axis=1,
    )
    del games["dayzero"], games["daynum"]

    # Rename winner/loser (w/l) columns to tm_/opp_
    rename_map = {"w": "tm", "l": "opp"}
    rename = {"numot": "num_ot", "wscore": "tm_pts", "lscore": "opp_pts"}
    for col in [
        "teamid",
        "loc",
        "fgm",
        "fga",
        "fgm3",
        "fga3",
        "ftm",
        "fta",
        "or",
        "dr",
        "ast",
        "to",
        "stl",
        "blk",
        "pf",
    ]:
        for prefix in ["w", "l"]:
            rename[f"{prefix}{col}"] = f"{rename_map[prefix]}_{col}"
    games.rename(columns=rename, inplace=True)
    # Opponent location mirrors the team's: N stays N, A<->H
    games["opp_loc"] = np.where(
        games["tm_loc"] == "N", "N", np.where(games["tm_loc"] == "A", "H", "A")
    )

    # Add game key ("WinnerName>LoserName" from the teams table)
    teams = readSql("select * from teams")
    teams_dict = {}
    for idx, row in teams.iterrows():
        teams_dict[row["teamid"]] = row["teamname"]
    games["game_key"] = games.apply(
        lambda x: f"{teams_dict[x['tm_teamid']]}>{teams_dict[x['opp_teamid']]}", axis=1
    )

    # Add additional columns
    for prefix in ["tm", "opp"]:
        games[f"{prefix}_fga2"] = games[f"{prefix}_fga"] - games[f"{prefix}_fga3"]
        games[f"{prefix}_fgm2"] = games[f"{prefix}_fgm"] - games[f"{prefix}_fgm3"]
        games[f"{prefix}_tr"] = games[f"{prefix}_or"] + games[f"{prefix}_dr"]
        # FIX: original had a duplicated assignment target here
        # (games[...] = games[...] = 40 + ...); single assignment, same value
        games[f"{prefix}_mins"] = 40 + games["num_ot"] * 5
        games[f"{prefix}_game"] = 1
        games[f"{prefix}_win"] = 1 * (
            games[f"{prefix}_pts"] > games[f"{OTHERPREFIXMAP[prefix]}_pts"]
        )
        games[f"{prefix}_loss"] = games[f"{prefix}_game"] - games[f"{prefix}_win"]
        # games[f"{prefix}_rsgame"] = 1
        # Possessions estimate is symmetric between the two teams, so both
        # prefixes intentionally receive the same averaged value
        games[f"{prefix}_poss"] = (
            (
                games["tm_fga"]
                + 0.4 * games["tm_fta"]
                - 1.07
                * (games["tm_or"] / (games["tm_or"] + games["opp_dr"]))
                * (games["tm_fga"] - games["tm_fgm"])
                + games["tm_to"]
            )
            + (
                games["opp_fga"]
                + 0.4 * games["opp_fta"]
                - 1.07
                * (games["opp_or"] / (games["opp_or"] + games["tm_dr"]))
                * (games["opp_fga"] - games["opp_fgm"])
                + games["opp_to"]
            )
        ) * 0.5
        games[f"{prefix}_margin"] = (
            games[f"{prefix}_pts"] - games[f"{OTHERPREFIXMAP[prefix]}_pts"]
        )
        games[f"{prefix}_availor"] = (
            games[f"{prefix}_or"] + games[f"{OTHERPREFIXMAP[prefix]}_dr"]
        )
    del games["num_ot"]

    # Duplicate and rename: swap tm_/opp_ prefixes so every game appears
    # once from each team's perspective
    def rename_col(col):
        if fnmatch(col, "tm_*"):
            return f"opp_{col[3:]}"
        elif fnmatch(col, "opp_*"):
            return f"tm_{col[4:]}"
        else:
            return col

    dup_games = games.copy()
    dup_games.columns = [rename_col(col) for col in dup_games.columns]
    games = pd.concat([games, dup_games], ignore_index=True)
    games = games.sort_values(by=["date", "game_key"]).reset_index(drop=True)

    # Reorder columns for easier analysis
    first_cols = [
        "date",
        "season",
        "game_key",
        "tm_teamid",
        "opp_teamid",
        "tm_loc",
        "opp_loc",
        "tm_pts",
        "opp_pts",
    ]
    remainder_cols = sorted([x for x in games.columns if x not in first_cols])
    games = games[first_cols + remainder_cols]

    # TODO add postseason

    # Drop old, set up new, add indexes, load data
    executeSql("drop table if exists games")
    q = f"""
    create table games (
        season integer not null,
        date TEXT not null,
        game_key TEXT not null,
        tm_teamid integer not null,
        opp_teamid integer not null,
        tm_loc string not null,
        opp_loc string not null,
        tm_pts integer not null,
        opp_pts integer not null,
        {' real not null, '.join(remainder_cols)} real not null,
        primary key (season, date asc, game_key asc, tm_teamid asc)
    )
    """
    executeSql(q)
    perms = get_unique_permutations(
        ["date", "season", "game_key", "tm_teamid", "opp_teamid"]
    )
    log.info(f"Creating {len(perms)} indexes on games")
    for p in perms:
        executeSql(f"CREATE INDEX games_{'_'.join(p)} on games ({', '.join(p)})")
    dfToTable(
        games,
        table="games",
        db="ncaa.db",
        ifExists="append",
    )
    # NOTE(review): this is the tail of a function whose `def` begins outside
    # this view (it builds `renames` from `out_first_cols` and returns `out`;
    # presumably get_teamdates — confirm against the full file).
    # Strip the tm_ prefix from the leading output columns.
    renames = {}
    for col in out_first_cols:
        renames[f"tm_{col}"] = col
    df.rename(columns=renames, inplace=True)
    out = df[["season", "date"] + out_first_cols + output_metric_cols]
    return out


# %%
if __name__ == "__main__":
    # Drop old table
    q = "drop table if exists teamdates"
    executeSql(q, "ncaa.db")

    # Create table
    # Loop through seasons in games, appending one season of team-date rows
    # at a time (a mid-run failure keeps earlier seasons loaded)
    for season in readSql("select distinct season from games")["season"]:
        log.info(f"{season} season")
        df = get_teamdates(season=season)
        dfToTable(
            df,
            table="teamdates",
            db="ncaa.db",
            ifExists="append",
            indexCols=["season", "date", "teamid"],
        )