def main():
    """Build the per-player yearly last-match lookup and pickle it to disk.

    Reads every row of ``ATPYearlyLastMatchTb`` and writes
    ``player_last_match.pkl`` containing a dict of the form
    ``{player_id: {year: (serial_num, matches_played)}}``.

    Every player returned by ``tennis_config.players_played_atp_since_2002()``
    gets an entry (possibly an empty inner dict); rows for any other player
    are ignored.
    """
    adapter = sql_adapter.SqlAdapter()
    connection = adapter.get_connection()

    # these are the only players involved in professional tennis tournaments since 2002
    players_cache = tennis_config.players_played_atp_since_2002()

    # maps player id to another dict that maps year to the last match involving
    # that player; pre-populated so every known player has an inner dict
    player_dict = {pid: {} for pid in players_cache}

    query = "select * from ATPYearlyLastMatchTb"
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        results = cursor.fetchall()
    finally:
        # release the cursor even when the query fails
        cursor.close()

    for res in results:
        # NOTE(review): assumes the column order is
        # (player id, year, serial number, matches played) - confirm against the table schema
        pid = res[0]

        # skip over players who don't play professional tennis tournaments;
        # player_dict has exactly the ids of players_cache, with O(1) lookup
        if pid not in player_dict:
            continue

        year = res[1]
        serial_num = res[2]
        matches_played = res[3]
        player_dict[pid][year] = (serial_num, matches_played)

    # the context manager guarantees the file is flushed and closed
    with open("player_last_match.pkl", "wb") as out_file:
        pickle.dump(player_dict, out_file, pickle.HIGHEST_PROTOCOL)
def extract_feats(start_date, end_date, feat_name_to_col_num=None):
    """Build a design matrix with one row per eligible (player, year) pair.

    arguments:
        start_date is a datetime representing where to begin the data set
            (only its year is used)
        end_date is a datetime representing where to end the data set
            (only its year is used; the end year is included)
        feat_name_to_col_num is a dictionary mapping feature names to
            column numbers; it should only be provided when extracting
            features from test data, so that the columns of the test matrix
            align correctly. It is passed straight to make_design_mat.

    returns:
        a sparse design matrix, a dict mapping features to column-numbers,
        a vector of target values, and a list of "<player_id>-<year>" ids
        in order of their rows in the design matrix
    """
    start_year = int(start_date.year)
    end_year = int(end_date.year)

    # players ranked at or beyond this are skipped
    rank_cutoff = 200
    # a player needs more than this many matches in the reference year
    min_matches_exclusive = 5
    # the backwards search for a previous season stops after 1976
    earliest_year_exclusive = 1975

    feature_dicts = []
    targets = []
    ids = []

    # NOTE(review): the returned matches are never used in this function; the
    # call is kept only in case get_matches() has side effects - confirm and drop.
    get_matches()
    all_players = get_players()

    # Both lookups are the same for every player and year, so fetch them once
    # instead of once per player / once per (player, year) pair.
    atp_players = tennis_config.players_played_atp_since_2002()
    pid_to_inner_dict = tennis_config.pid_to_last_match_played_year_x()

    for player in all_players:
        # players who average a rank below 200 rarely partake in professional tournaments
        # so we don't include these low probability events in training our model
        if player["rank"] >= rank_cutoff:
            continue

        # only consider players who have played professional tennis tournaments since 2002
        if player["player_id"] not in atp_players:
            continue

        # every unique datum is the historical metadata associated with a (player, year) tuple
        for year in range(start_year, end_year + 1):
            year_to_matchid_dict = pid_to_inner_dict[player["player_id"]]

            last_match_prev_yr = None
            no_matches_prev_yr = None

            # try to find the last match played by this player prior to the given year
            for yr in range(year - 1, earliest_year_exclusive, -1):
                if yr in year_to_matchid_dict:
                    last_match_prev_yr, no_matches_prev_yr = year_to_matchid_dict[yr][:2]
                    break

            # continue if unable to find any previous matches for this player
            if last_match_prev_yr is None:
                continue

            # too few matches to use in training or make a prediction
            if no_matches_prev_yr <= min_matches_exclusive:
                continue

            # find last match in current year; None if player did not play this year
            if year in year_to_matchid_dict:
                last_match_curr_yr = year_to_matchid_dict[year][0]
            else:
                last_match_curr_yr = None

            feature_dicts.append(tennis_feats(player, last_match_prev_yr))

            # the main statistic - no of QF+ finishes in professional tournaments
            target = calculate_strong_finishes(player, last_match_curr_yr)
            targets.append(target)

            unique_id = str(player["player_id"]) + "-" + str(year)
            ids.append(unique_id)

    X, name_to_col = make_design_mat(feature_dicts, feat_name_to_col_num)
    return X, name_to_col, np.array(targets), ids