def make_output_2015(): tourney_seeds = read_csv_as_np_array('../Data/tourney_seeds_2015.csv', header=False) neutralized_season_data = neutralize_season_data( '../Data/regular_season_detailed_results_combined.csv', '../Data/tourney_detailed_results.csv', '../Data/neutralized_season_data_combined.csv' ) tuning_params = dict(std=0.05) output = [['id', 'pred', 'score1', 'score2']] for season in ['2015']: tourney_teams = np.sort(tourney_seeds[np.where(tourney_seeds[:, 0] == season)[0]][:, 2]) while len(tourney_teams) > 1: team1 = tourney_teams[0] tourney_teams= np.delete(tourney_teams, 0) for team2 in tourney_teams: features = calc_new_features( neutralized_season_data, season, team1, team2, tuning_params ) prob = features[0] score1 = np.round(features[2]).astype(int) score2 = np.round(features[3]).astype(int) game_id = [season+'_'+team1+'_'+team2, str(prob), str(score1), str(score2)] output.append(game_id) print game_id output_file = open('../Data/out_2015_with_scores.csv', 'wb') csv.writer(output_file).writerows(output)
class SeedLookupTable(object):
    """Maps (season, team id) -> integer tournament seed.

    The seed table is loaded once at class-definition time; the lookup
    dict is built lazily on first use.
    NOTE(review): this path is './Data/...' while most of the file uses
    '../Data/...' -- confirm the intended working directory.
    """
    _table = read_csv_as_np_array('./Data/tourney_seeds_combined.csv', header=True)
    _table_header = _table[0]
    _table = _table[1:]
    # Lazily-built nested dict: {season: {team: seed}}.
    _dict = None
    # Column name -> column index for the seeds table.
    _header_legend = dict(
        (category, i) for i, category in enumerate(_table_header))

    @classmethod
    def _initialize_dict(cls):
        """Build the nested {season: {team: seed}} lookup dict from _table."""
        cls._dict = dict()
        for line in cls._table:
            season = line[cls._header_legend['season']]
            # Idiom fix: 'not in' instead of 'not x in y'.
            if season not in cls._dict:
                cls._dict[season] = dict()
            team = line[cls._header_legend['team']]
            # Characters 1:3 of the seed string are taken as the numeric
            # seed -- presumably format is region letter + two digits
            # (e.g. 'W01'); TODO confirm against the seeds file.
            cls._dict[season][team] = int(
                line[cls._header_legend['seed']][1:3])

    @classmethod
    def lookup(cls, season, team):
        """Return the integer seed for team in season (KeyError if absent)."""
        if cls._dict is None:
            cls._initialize_dict()
        return cls._dict[season][team]
def make_output(): tourney_seeds = read_csv_as_np_array('../Data/tourney_seeds.csv', header=False) neutralized_season_data = neutralize_season_data( '../Data/regular_season_detailed_results.csv', '../Data/tourney_detailed_results.csv', '../Data/neutralized_season_data.csv' ) tuning_params = dict(std=0.05) output = [['id', 'pred']] for season in ['2011', '2012', '2013', '2014']: tourney_teams = np.sort(tourney_seeds[np.where(tourney_seeds[:, 0] == season)[0]][:, 2]) while len(tourney_teams) > 1: team1 = tourney_teams[0] tourney_teams= np.delete(tourney_teams, 0) for team2 in tourney_teams: prob = calc_new_features( neutralized_season_data, season, team1, team2, tuning_params )[0] game_id = [season+'_'+team1+'_'+team2, str(prob)] output.append(game_id) print game_id output_file = open('../Data/out.csv', 'wb') csv.writer(output_file).writerows(output)
def pick_winners():
    """Translate 2015 matchup predictions into named winner/loser rows.

    Reads ../Data/out_2015_with_scores.csv and, for each game, orders
    the two teams so the favored one comes first, writing
    [winner, loser, prob, winner_score, loser_score] rows to
    ../Data/game_predictions_with_scores.csv.
    """
    data = read_csv_as_np_array('../Data/out_2015_with_scores.csv', header=False)
    # BUG FIX: the header previously listed only ['winner', 'loser'] while
    # every data row has five columns; header now matches the row layout.
    output = [['winner', 'loser', 'prob', 'winner_score', 'loser_score']]
    for line in data:
        # Game id format is season_team1_team2.
        team1, team2 = line[0].split('_')[1:3]
        prob = float(line[1])
        score1, score2 = line[2:4]
        if prob >= 0.5:
            result = [TeamName.lookup(team1), TeamName.lookup(team2),
                      prob, score1, score2]
        else:
            # team2 is favored: swap the order and report its win probability.
            result = [TeamName.lookup(team2), TeamName.lookup(team1),
                      1.0 - prob, score2, score1]
        output.append(result)
    # BUG FIX: the file handle was previously opened and never closed;
    # the context manager guarantees flush + close.
    with open('../Data/game_predictions_with_scores.csv', 'wb') as output_file:
        csv.writer(output_file).writerows(output)
def make_output_seeds(): tourney_seeds = read_csv_as_np_array('../Data/tourney_seeds_2015.csv', header=False) output = [['id', 'pred']] for season in ['2015']: tourney_teams = np.sort(tourney_seeds[np.where(tourney_seeds[:, 0] == season)[0]][:, 2]) while len(tourney_teams) > 1: team1 = tourney_teams[0] tourney_teams= np.delete(tourney_teams, 0) for team2 in tourney_teams: seed1 = SeedLookupTable.lookup('2015', team1) seed2 = SeedLookupTable.lookup('2015', team2) prob = 0.5 + 0.03*(seed2 - seed1) game_id = [season+'_'+team1+'_'+team2, str(prob)] output.append(game_id) print game_id output_file = open('../Data/out_2015_seed_benchmark.csv', 'wb') csv.writer(output_file).writerows(output)
class TeamName(object):
    """Lazily-built lookup from team id to team name (../Data/teams.csv)."""
    _table = read_csv_as_np_array('../Data/teams.csv', header=True)
    _table_header = _table[0]
    _table = _table[1:]
    # id -> name map; populated on first lookup.
    _dict = None
    # Column name -> column index for the teams table.
    _header_legend = dict(
        (category, i) for i, category in enumerate(_table_header))

    @classmethod
    def _initialize_dict(cls):
        """Populate the id->name map from the cached table."""
        # First column is the team id, second is the display name.
        cls._dict = dict((row[0], row[1]) for row in cls._table)

    @classmethod
    def lookup(cls, team_id):
        """Return the display name for team_id (KeyError if unknown)."""
        if cls._dict is None:
            cls._initialize_dict()
        return cls._dict[team_id]
class Seed2DDistribution(object):
    """Historical head-to-head win/loss tallies keyed by the two seeds.

    Built from pre-2011 tournament results only, so 2011+ games remain
    available as out-of-sample tests.
    NOTE(review): this path is './Data/...' while most of the file uses
    '../Data/...' -- confirm the intended working directory.
    """
    _data = read_csv_as_np_array('./Data/tourney_compact_results.csv', header=True)
    _header = _data[0]
    _data = _data[1:]
    # Column name -> column index for the results table.
    _header_legend = dict((category, i) for i, category in enumerate(_header))
    # drop data from 2011 or later
    _data_drop_index = np.min(
        np.where(_data[:, _header_legend['season']] == '2011'))
    _data = _data[:_data_drop_index]
    # Lazily-built nested dict:
    # greater_seed_str -> lesser_seed_str -> [greater-seed wins, lesser-seed wins].
    _dict = None

    @classmethod
    def _initialize_dict(cls):
        """Tally wins per (greater seed, lesser seed) pair from _data."""
        cls._dict = dict()
        for line in cls._data:
            season = line[cls._header_legend['season']]
            w_team = line[cls._header_legend['wteam']]
            l_team = line[cls._header_legend['lteam']]
            w_seed = SeedLookupTable.lookup(season, w_team)
            l_seed = SeedLookupTable.lookup(season, l_team)
            # Canonical key order: numerically greater seed first.
            greater_seed = w_seed if w_seed >= l_seed else l_seed
            lesser_seed = w_seed if w_seed <= l_seed else l_seed
            greater_seed_str = "%02d" % greater_seed
            lesser_seed_str = "%02d" % lesser_seed
            if not greater_seed_str in cls._dict:
                cls._dict[greater_seed_str] = dict()
            if not lesser_seed_str in cls._dict[greater_seed_str]:
                cls._dict[greater_seed_str][lesser_seed_str] = [0, 0]
            if w_seed == l_seed:
                # Equal seeds: split the credit evenly between both slots.
                cls._dict[greater_seed_str][lesser_seed_str][0] += 0.5
                cls._dict[greater_seed_str][lesser_seed_str][1] += 0.5
            elif w_seed > l_seed:
                cls._dict[greater_seed_str][lesser_seed_str][0] += 1
            else:
                cls._dict[greater_seed_str][lesser_seed_str][1] += 1

    @classmethod
    def lookup(cls, seed1, seed2):
        """Return [wins by seed1, wins by seed2] in historical meetings.

        Unseen pairings are inserted into the cache as [0, 0] (this
        mutates the class-level dict).
        """
        if cls._dict is None:
            cls._initialize_dict()
        if seed1 >= seed2:
            g_str = "%02d" % seed1
            l_str = "%02d" % seed2
        else:
            g_str = "%02d" % seed2
            l_str = "%02d" % seed1
        if not g_str in cls._dict:
            cls._dict[g_str] = dict()
        if not l_str in cls._dict[g_str]:
            cls._dict[g_str][l_str] = [0, 0]
        result = cls._dict[g_str][l_str]
        if seed2 > seed1:
            # Stored order is [greater-seed wins, lesser-seed wins]; swap
            # so the returned pair matches the caller's argument order.
            result = [result[1], result[0]]
        return result
def compare_with_history():
    """Compare my submitted 2011-2014 predictions against actual results.

    Builds a calibration plot (binned empirical win rate vs predicted
    probability) and overlapping histograms of predictions for games
    that were won vs lost, saving both figures to disk.

    CLEANUP: removed dead code -- the convolution smoothing (window,
    test0, test1) fed only a commented-out plot, and per-histogram bin
    edges were computed then immediately overwritten by linspace(0, 1, 10).
    """
    my_prediction = read_csv_as_np_array('../Data/submission_2015_03_14.csv', header=False)
    results = read_csv_as_np_array('../Data/tourney_compact_results.csv', header=False)
    my_dict = dict()
    results_dict = dict()
    # Predicted probability keyed by game id.
    for i in xrange(len(my_prediction)):
        key = my_prediction[i, 0]
        my_dict[key] = my_prediction[i, 1].astype(float)
    for i in xrange(len(results)):
        year = results[i, 0]
        if int(year) >= 2011:
            team1 = results[i, 2]
            team2 = results[i, 4]
            # Game ids list the smaller team id first; res is 1 when that
            # first-listed team is the winner (column 2).
            if int(team1) < int(team2):
                results_key = '_'.join([year, team1, team2])
                res = 1
            else:
                results_key = '_'.join([year, team2, team1])
                res = 0
            results_dict[results_key] = [my_dict[results_key], res]
    data = []
    game_id_list = []
    for key in sorted(results_dict.keys()):
        game_id_list.append(key)
        data.append(results_dict[key])
    data = np.array(data)
    # Empirical win rate around each bin midpoint, add-one smoothed.
    # NOTE(review): the window is midpoint +/- 0.025 (width 0.05) while
    # midpoints are spaced 0.1 apart, so half the predictions fall in no
    # bin -- confirm intent.
    bins = np.arange(0.05, 1.05, 0.1)
    p = []
    for midpoint in bins:
        ind = np.where(np.logical_and(data[:, 0] >= (midpoint - 0.025),
                                      data[:, 0] < (midpoint + 0.025)))[0]
        p.append((np.sum(data[ind, 1]) + 1.0) / (len(ind) + 1.0))
    p = np.array(p)
    # Sort rows by predicted probability (plot output is unaffected).
    ind_sort = np.argsort(data[:, 0])
    data = data[ind_sort]
    plt.scatter(data[:, 0], data[:, 1])
    plt.plot(bins, p)
    plt.ylabel('Result')
    plt.xlabel('My Prediction')
    plt.title("Result vs. My Prediction - 2011-2014")
    plt.savefig('../outputs/comparison_history_vs_my.png')
    plt.close()
    # Histograms of predictions for games won (res=1) vs lost (res=0).
    bins = np.linspace(0, 1, 10)
    x = data[np.flatnonzero(data[:, 1]), 0]
    plt.hist(x, bins, normed=True, alpha=0.5)
    x = data[np.flatnonzero(1 - data[:, 1]), 0]
    plt.hist(x, bins, normed=True, alpha=0.5)
    plt.savefig('./comparison_history_vs_my_histogram.png')
    plt.close()
def compare_with_net_prophet(): my_prediction = read_csv_as_np_array('../Data/out_2015.csv', header=False) np_prediction = read_csv_as_np_array('../Data/net_prophet_kaggle_submission_public.csv', header=False) sb_prediction = read_csv_as_np_array('../Data/out_2015_seed_benchmark.csv', header=False) if len(my_prediction) != len(np_prediction): raise RuntimeError('something wrong') my_dict = dict() np_dict = dict() sb_dict = dict() for i in xrange(len(my_prediction)): key = my_prediction[i, 0] my_dict[key] = my_prediction[i, 1].astype(float) key = np_prediction[i, 0] np_dict[key] = np_prediction[i, 1].astype(float) key = sb_prediction[i, 0] sb_dict[key] = sb_prediction[i, 1].astype(float) data = [] game_id_list = [] for key in sorted(my_dict.keys()): game_id_list.append(key) data.append([my_dict[key], np_dict[key], sb_dict[key]]) data = np.array(data) plt.scatter(data[:, 0], data[:, 1]) plt.ylabel('Net Prophet Prediction') plt.xlabel('My Prediction') plt.title("Net Prophet's Prediction vs. My Prediction - 2015") plt.savefig('./comparison_np_vs_my.png') plt.close() plt.scatter(data[:, 2], data[:, 0]) plt.xlabel('Seed Benchmark Prediction') plt.xlabel('My Prediction') plt.title("My Prediction vs. Seed Benchmark Prediction - 2015") plt.savefig('./comparison_my_vs_sb.png') plt.close() plt.scatter(data[:, 2], data[:, 1]) plt.ylabel('Net Prophet Prediction') plt.xlabel('Seed Benchmark Prediction') plt.title("Net Prophet's Prediction vs. 
Seed Benchmark Prediction - 2015") plt.savefig('./comparison_np_vs_sb.png') plt.close() print 'E[log_loss | Net Prophet Model correct]:' print ' ----- My Model: ', calc_log_loss(data[:, 0], data[:, 1]) print ' ----- Seed Benchmark Model: ', calc_log_loss(data[:, 2], data[:, 1]) print '\nE[log_loss | My Model correct]:' print ' ----- Net Prophet Model: ', calc_log_loss(data[:, 1], data[:, 0]) print ' ----- Seed Benchmark Model: ', calc_log_loss(data[:, 2], data[:, 0]) print '\nE[log_loss | Seed Benchmark correct]:' print ' ----- My Model: ', calc_log_loss(data[:, 0], data[:, 2]) print ' ----- Net Prophet Model: ', calc_log_loss(data[:, 1], data[:, 2]) # for weight in np.linspace(0, 1, 21): # print weight, calc_log_loss(data[:, 2], weight*data[:, 0] + (1-weight)*data[:, 1]) merged_prediction = 0.5*data[:, 0] + 0.5*data[:, 1] output = [['id', 'pred']] for i, game_id in enumerate(game_id_list): output.append([game_id, merged_prediction[i]]) output_file = open('../Data/out_2015_merged.csv', 'wb') csv.writer(output_file).writerows(output)
def main_2015_03_12():
    """Backtest calc_new_features on historical tournament games.

    Randomizes winner/loser order (tournament data always lists the
    winner first), computes features for every game, and prints a
    per-season log-loss plus a min/max-rescaled baseline log-loss.
    """
    tourney_data = read_csv_as_np_array('../Data/tourney_detailed_results.csv', header=False)
    neutralized_season_data = neutralize_season_data(
        '../Data/regular_season_detailed_results.csv',
        '../Data/tourney_detailed_results.csv',
        '../Data/neutralized_season_data.csv'
    )
    # test on 2011-2014 tourney results
    # NOTE(review): computed but never used below -- dead code?
    test_data_min_index = np.min(np.where(tourney_data[:, 0] == '2014')[0])
    train_data = tourney_data
    for std in [0.05]:  # np.linspace(0.01, 0.1, 10):
        tuning_params = dict(std=std)
        #scramble the results to eliminate fact that team1 is always winner in tourney data
        results = np.random.randint(0, 2, len(train_data))
        features = []
        output = [['id', 'pred']]
        for i in xrange(len(train_data)):
            season = train_data[i, 0]
            if results[i] == 1:
                # Label 1: keep the actual winner (column 2) as team1.
                team1 = train_data[i, 2]
                team2 = train_data[i, 4]
            else:
                # Label 0: swap so team1 is the loser.
                team1 = train_data[i, 4]
                team2 = train_data[i, 2]
            features.append(
                calc_new_features(
                    neutralized_season_data, season, team1, team2, tuning_params
                )
            )
            if int(season) >= 2011:
                # Game ids presumably list the smaller team id first; the
                # probability is flipped when team order is swapped.
                if int(team1) < int(team2):
                    game_id = [season+'_'+team1+'_'+team2, str(features[-1][0])]
                else:
                    game_id = [season+'_'+team2+'_'+team1, str(1.0-features[-1][0])]
                output.append(game_id)
        # output_file = open('../Data/out.csv', 'wb')
        # csv.writer(output_file).writerows(output)
        features = np.array(features)
        for season in np.unique(train_data[:, 0]):  # ['2011', '2012', '2013', '2014']:
            idx = np.where(train_data[:, 0] == season)[0]
            # Log-loss of the model's team1-win probability against the
            # randomized labels.
            log_loss = -np.mean(results[idx]*np.log(features[idx, 0])
                                + (1-results[idx])*np.log(1.0 - features[idx, 0]))
            # Baseline: same predictions min/max-rescaled into [0.01, 0.99].
            junk = 0.98*(features[idx, 0]-features[idx, 0].min())/(features[idx, 0].max() - features[idx, 0].min()) + 0.01
            junk_log_loss = -np.mean(results[idx]*np.log(junk)
                                     + (1-results[idx])*np.log(1.0 - junk))
            print std, season, log_loss, junk_log_loss
def main_2015_03_08():
    """Exploratory modeling: logistic regression and a seed-difference rule.

    Fits logistic regressions on pre-2011 tournament games and evaluates
    log-loss on 2011+ games, then compares against the simple
    0.5 + 0.03 * seed_diff heuristic and shows a scatter plot.
    """
    standardized_season_data = standardize_season_data(
        '../Data/regular_season_detailed_results.csv',
        '../Data/tourney_detailed_results.csv',
        '../Data/standardized_season_data.csv'
    )
    tourney_data = read_csv_as_np_array('../Data/tourney_detailed_results.csv', header=False)
    tourney_seeds = read_csv_as_np_array('../Data/tourney_seeds.csv', header=False)
    #win_loss_data = get_win_loss_data()
    # test on 2011-2014 tourney results
    test_data_min_index = np.min(np.where(tourney_data[:, 0] == '2011')[0])
    # NOTE(review): train_data is the FULL dataset; the commented slice
    # (itself apparently mis-indexed) suggests a train/test split was
    # intended but never applied here -- the split happens via row
    # indexing with test_data_min_index below. test_data is unused.
    train_data = tourney_data  # tourney_data[:, test_data_min_index]
    test_data = tourney_data[test_data_min_index:]
    #scramble the results to eliminate fact that team1 is always winner in tourney data
    results = np.random.randint(0, 2, len(train_data))
    features = []
    for i in xrange(len(train_data)):
        if results[i] == 1:
            # Label 1: keep the actual winner (column 2) as team1.
            team1 = train_data[i, 2]
            team2 = train_data[i, 4]
        else:
            # Label 0: swap so team1 is the loser.
            team1 = train_data[i, 4]
            team2 = train_data[i, 2]
        features.append(
            calc_features(
                standardized_season_data,
                train_data[i, 0],
                team1,
                team2,
                tourney_seeds=tourney_seeds
            )
        )
    #features = np.array(features)
    # try just using gaussian win prob and team seeds
    # NOTE(review): keeps only feature columns -5, -2, -1 -- presumably
    # the gaussian win probability and the two seeds; confirm against
    # calc_features' output layout.
    features = np.array(features)[:, np.array([-5, -2, -1])]
    # logistic regression
    lr_model = LogisticRegression()
    lr_model.fit(features[:test_data_min_index], results[:test_data_min_index])
    logistic_probability = lr_model.predict_proba(features[test_data_min_index:])[:, 1]
    log_loss = -np.mean(results[test_data_min_index:]*np.log(logistic_probability)
                        + (1-results[test_data_min_index:])*np.log(1.0 - logistic_probability))
    print lr_model.coef_, lr_model.intercept_
    print log_loss
    # try just the seed difference as a feature in a logistic regression
    seed_diff = features[:, -1] - features[:, -2]
    lr_model = LogisticRegression()
    lr_model.fit(seed_diff[:test_data_min_index, np.newaxis], results[:test_data_min_index])
    logistic_probability = lr_model.predict_proba(seed_diff[test_data_min_index:, np.newaxis])[:, 1]
    log_loss = -np.mean(results[test_data_min_index:]*np.log(logistic_probability)
                        + (1-results[test_data_min_index:])*np.log(1.0 - logistic_probability))
    print lr_model.coef_, lr_model.intercept_
    print log_loss
    # Heuristic benchmark: 0.03 probability per seed of difference.
    quick_prob = 0.5+0.03*(features[test_data_min_index:, -1] - features[test_data_min_index:, -2])
    print -np.mean(results[test_data_min_index:]*np.log(quick_prob)
                   + (1-results[test_data_min_index:])*np.log(1.0 - quick_prob))
    # Heuristic vs fitted model (red y=x line for reference).
    plt.scatter(quick_prob, logistic_probability)
    plt.scatter(quick_prob, quick_prob, color='red')
    #plt.scatter(logistic_probability, results[test_data_min_index:])
    plt.show()
    # print 'about to do gmm'
    # gmm_model = mixture.GMM(n_components=2)
    # gmm_model.fit(features[:test_data_min_index])
    # gmm_probs = gmm_model.predict_proba(features[test_data_min_index:])
    #
    # log_loss = -np.mean(
    #     results[test_data_min_index:, np.newaxis]*np.log(gmm_probs)
    #     + (1-results[test_data_min_index:, np.newaxis])*np.log(1.0 - gmm_probs),
    #     axis=0
    # )
    #
    # print log_loss
    print 'done'