def calc_features(regular_season_data, season, team1, team2, tourney_seeds=None, wp_data=None):
    """Build the feature vector for a ``team1`` vs ``team2`` matchup in one season.

    The games returned by ``get_relevant_data`` are walked once.  For every game
    that involves ``team1`` or ``team2`` one row of "effective" stats is collected
    for each side: the team's own numbers when it played, otherwise the numbers of
    the opponent that the *other* team faced in that game.

    Parameters
    ----------
    regular_season_data
        Passed straight to ``get_relevant_data``.  Each returned record is indexed
        with integer arrays, so records are presumably numpy rows (TODO confirm).
        Column 2 is read as the winner's id and column 4 as the loser's id.
    season, team1, team2
        Season and team identifiers used to filter the games and the seeds.
    tourney_seeds : 2-D array or None
        Optional seed table.  Column 0 is compared to ``season``, column 2 to the
        team ids, and column 1 holds a seed string whose characters ``[1:3]`` are
        the numeric seed.
    wp_data
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    numpy.ndarray
        1-D array made of, in order:

        * the difference (team1 minus team2) of the averaged effective stats,
        * the probability that team1 outscores team2 under a Gaussian model,
        * team1's and team2's winning percentages (``nan`` for a team with no
          games in the filtered data),
        * team1's and team2's seeds, only when ``tourney_seeds`` is given
          (17 for an unseeded team).

    Notes
    -----
    Each team's winning percentage counts every game in which that team appears,
    including head-to-head games between ``team1`` and ``team2``.
    """
    relevant_data = get_relevant_data(regular_season_data, season=season, team1=team1, team2=team2)

    # Winner's numbers live in column 3 and columns 8..20, the loser's in column 5
    # and columns 21..33.  The first selected column is the score, so the score
    # ends up in column 0 of the effective stats.
    columns_with_winning_team_data = np.concatenate((np.array([3]), np.arange(8, 21)))
    columns_with_losing_team_data = np.concatenate((np.array([5]), np.arange(21, 34)))

    effective_team1_stats = []
    effective_team2_stats = []
    # Floats so the percentages below use true division on any interpreter.
    team1_wins = 0.0
    team1_loses = 0.0
    team2_wins = 0.0
    team2_loses = 0.0

    for record in relevant_data:
        winner = record[2]
        loser = record[4]
        winning_stats = record[columns_with_winning_team_data]
        losing_stats = record[columns_with_losing_team_data]

        # Tally each team's record independently of the stat assignment below,
        # so a head-to-head game counts for both teams and not only for team1.
        if winner == team1:
            team1_wins += 1.0
        elif loser == team1:
            team1_loses += 1.0
        if winner == team2:
            team2_wins += 1.0
        elif loser == team2:
            team2_loses += 1.0

        # Decide which side of the box score stands in for each team.
        if winner == team1:
            # team1 was the winner
            team1_side, team2_side = winning_stats, losing_stats
        elif loser == team1:
            # team1 was the loser
            team1_side, team2_side = losing_stats, winning_stats
        elif winner == team2:
            # team2 was the winner
            team1_side, team2_side = losing_stats, winning_stats
        elif loser == team2:
            # team2 was the loser
            team1_side, team2_side = winning_stats, losing_stats
        else:
            # game involves neither team
            continue
        effective_team1_stats.append(team1_side)
        effective_team2_stats.append(team2_side)

    effective_team1_stats = np.array(effective_team1_stats).astype(float)
    effective_team2_stats = np.array(effective_team2_stats).astype(float)

    # features are the average over the difference of effective stats
    features = np.average(effective_team1_stats, axis=0) - np.average(effective_team2_stats, axis=0)

    # Add the probability of team1 outscoring team2, assuming Gaussian stats.
    # Scores are in column 0 of the effective stats.
    team1_scores = effective_team1_stats[:, 0]
    team2_scores = effective_team2_stats[:, 0]
    mean = np.mean(team1_scores) - np.mean(team2_scores)
    covariance_matrix = np.cov(team1_scores, team2_scores)
    # Variance of the score difference: var1 + var2 - 2*cov.
    variance = covariance_matrix[0, 0] + covariance_matrix[1, 1] - 2.0*covariance_matrix[1, 0]
    # P(difference > 0) for a normal variable with this mean and variance.
    outscore_probability = 0.5*(sps.erfc(-mean/np.sqrt(2.0*variance)))
    features = np.concatenate((features, np.array([outscore_probability])))

    # Add team winning percentages to the feature list.
    # TODO: weight home wins and away wins differently.
    team1_games = team1_loses + team1_wins
    team2_games = team2_loses + team2_wins
    # A team without any games has no defined percentage; report nan rather
    # than raising ZeroDivisionError.
    team1_win_pct = team1_wins/team1_games if team1_games else np.nan
    team2_win_pct = team2_wins/team2_games if team2_games else np.nan
    features = np.concatenate((features, np.array([team1_win_pct, team2_win_pct])))

    # add team seeds to feature list
    if tourney_seeds is not None:
        keep_seeds = np.where(tourney_seeds[:, 0] == season)[0]
        tourney_seeds = tourney_seeds[keep_seeds]
        # get team1's seed, if it exists
        seed1 = tourney_seeds[np.where(tourney_seeds[:, 2] == team1)[0], 1]
        # get team2's seed, if it exists
        seed2 = tourney_seeds[np.where(tourney_seeds[:, 2] == team2)[0], 1]
        # If unseeded, set the seed to C17 (C is a placeholder for the
        # conference letter and is stripped just below).
        seed1 = (seed1[0] if len(seed1) == 1 else 'C17')
        seed2 = (seed2[0] if len(seed2) == 1 else 'C17')
        # take out conference and convert to int
        seed1 = int(seed1[1:3])
        seed2 = int(seed2[1:3])
        features = np.concatenate((features, np.array([seed1, seed2])))

    return features
def calc_new_features(regular_season_data, season, team1, team2, tuning_params):
    """Compute model-based matchup features for ``team1`` vs ``team2``.

    Per-game model parameters (from ``model_params_from_game_data``) are collected
    for both sides of every relevant game, then averaged with weights that favour
    games whose participants have an ERPI close to ``team1``'s and ``team2``'s own
    ERPI.  The weighted parameters feed ``scoring_distribution`` to obtain score
    distributions for regulation and for an overtime period.

    Parameters
    ----------
    regular_season_data
        Passed straight to ``get_relevant_data``.  In each returned record,
        column 2 is read as the winner's id and column 4 as the loser's id.
    season, team1, team2
        Season and team identifiers.
    tuning_params : mapping
        Must provide ``'std'``, the width of the Gaussian kernel applied to the
        ERPI distances when weighting games.

    Returns
    -------
    list
        ``[feature, feature_erpi_ratio, mean_score1, mean_score2]`` where

        * ``feature`` combines the regulation and overtime score distributions
          (presumably the probability that team1 beats team2, assuming
          ``scoring_distribution`` returns a probability mass function indexed
          by score -- TODO confirm),
        * ``feature_erpi_ratio`` is ``team1_erpi / team2_erpi``,
        * ``mean_score1`` / ``mean_score2`` are the index-weighted sums of the
          regulation distributions.
    """
    relevant_data = get_relevant_data(regular_season_data, season=season, team1=team1, team2=team2)
    per_game_params = model_params_from_game_data(relevant_data)

    # Columns 0..4 of the per-game parameters belong to the winner, 5..9 to the loser.
    columns_with_winning_team_data = np.arange(0, 5)
    columns_with_losing_team_data = np.arange(5, 10)

    team1_erpi = RelativePowerIndexLookupTable.lookup(season, team1, 'erpi')
    team2_erpi = RelativePowerIndexLookupTable.lookup(season, team2, 'erpi')

    effective_team1_stats = []
    effective_team2_stats = []
    effective_team1_erpi = []
    effective_team2_erpi = []

    for i, record in enumerate(relevant_data):
        winner = record[2]
        loser = record[4]
        winning_team_erpi = RelativePowerIndexLookupTable.lookup(season, winner, 'erpi')
        losing_team_erpi = RelativePowerIndexLookupTable.lookup(season, loser, 'erpi')
        winning_stats = per_game_params[i, columns_with_winning_team_data]
        losing_stats = per_game_params[i, columns_with_losing_team_data]

        # Decide which side of the game stands in for each team.  When only one
        # of the two teams played, the other team is represented by the opponent.
        if winner == team1:
            # team1 was the winner
            team1_takes_winner_side = True
        elif loser == team1:
            # team1 was the loser
            team1_takes_winner_side = False
        elif winner == team2:
            # team2 was the winner
            team1_takes_winner_side = False
        elif loser == team2:
            # team2 was the loser
            team1_takes_winner_side = True
        else:
            # game involves neither team
            continue

        if team1_takes_winner_side:
            effective_team1_stats.append(winning_stats)
            effective_team2_stats.append(losing_stats)
            effective_team1_erpi.append(winning_team_erpi)
            effective_team2_erpi.append(losing_team_erpi)
        else:
            effective_team1_stats.append(losing_stats)
            effective_team2_stats.append(winning_stats)
            effective_team1_erpi.append(losing_team_erpi)
            effective_team2_erpi.append(winning_team_erpi)

    effective_team1_erpi = np.array(effective_team1_erpi)
    effective_team2_erpi = np.array(effective_team2_erpi)

    # Gaussian kernel on the ERPI distance between each game's sides and the
    # actual teams, normalised so the weights sum to one.
    weights = np.exp(-0.5*(
        (effective_team1_erpi - team1_erpi)**2 + (effective_team2_erpi - team2_erpi)**2
    )/tuning_params['std']**2)
    weights /= np.sum(weights)

    effective_team1_stats = np.array(effective_team1_stats)
    effective_team2_stats = np.array(effective_team2_stats)

    # Weighted average of the per-game parameters: entries 0..4 describe team1,
    # entries 5..9 describe team2.
    weighted_params = np.concatenate(
        (np.sum(weights[:, np.newaxis]*effective_team1_stats, axis=0),
         np.sum(weights[:, np.newaxis]*effective_team2_stats, axis=0)))

    # Entry 0 of each team's parameters is used as a possession count; both
    # teams share the mean.  Overtime gets one eighth of that.
    mean_possessions = 0.5*(weighted_params[0] + weighted_params[5])
    possessions = int(mean_possessions)
    ot_possessions = int(mean_possessions/8.0)

    team1_dist = scoring_distribution(weighted_params[1:5], possessions)
    team1_ot_dist = scoring_distribution(weighted_params[1:5], ot_possessions)
    team2_dist = scoring_distribution(weighted_params[6:10], possessions)
    team2_ot_dist = scoring_distribution(weighted_params[6:10], ot_possessions)

    # sum_k P(team2 < k) * P(team1 = k): team1 ahead at the end of the period.
    regulation_win = np.sum(np.cumsum(team2_dist)[:-1]*team1_dist[1:])
    overtime_win = np.sum(np.cumsum(team2_ot_dist)[:-1]*team1_ot_dist[1:])
    # sum_k P(team1 = k) * P(team2 = k): the period ends level.
    regulation_tie = np.sum(team2_dist*team1_dist)
    overtime_tie = np.sum(team2_ot_dist*team1_ot_dist)

    # Dividing by (1 - overtime_tie) sums the geometric series over repeated
    # overtime periods that end level again.
    feature = regulation_win + regulation_tie/(1.0 - overtime_tie)*overtime_win

    feature_erpi_ratio = team1_erpi/team2_erpi
    mean_score1 = np.sum(np.arange(len(team1_dist))*team1_dist)
    mean_score2 = np.sum(np.arange(len(team2_dist))*team2_dist)

    return [feature, feature_erpi_ratio, mean_score1, mean_score2]