コード例 #1
0
def main():
    """Build and pickle the per-player, per-year last-match lookup table.

    Reads every row of ``ATPYearlyLastMatchTb``, keeps the rows whose player
    id is in ``tennis_config.players_played_atp_since_2002()``, and writes a
    dictionary of the form
    ``{player_id: {year: (serial_num, matches_played)}}`` to
    ``player_last_match.pkl`` (relative to the current working directory).

    Every player in the cache gets an entry; the inner dictionary stays empty
    when no table row refers to that player.
    """
    adapter = sql_adapter.SqlAdapter()
    connection = adapter.get_connection()

    # these are the only players involved in professional tennis tournaments since 2002
    players_cache = tennis_config.players_played_atp_since_2002()

    # maps player id -> {year: (serial_num, matches_played)}; pre-populated so
    # that every cached player has an (initially empty) inner dictionary
    player_dict = {pid: {} for pid in players_cache}

    cursor = connection.cursor()
    try:
        cursor.execute("select * from ATPYearlyLastMatchTb")
        results = cursor.fetchall()
    finally:
        # release the cursor even if the query or the fetch fails
        cursor.close()

    for res in results:
        # columns are read positionally: id, year, serial number, match count
        pid = res[0]

        # skip over players who don't play professional tennis tournaments;
        # player_dict has exactly the cached player ids as keys, so this is
        # the same filter as testing against players_cache, as a hash lookup
        if pid not in player_dict:
            continue

        year = res[1]
        serial_num = res[2]
        matches_played = res[3]
        player_dict[pid][year] = (serial_num, matches_played)

    # the context manager guarantees the file is flushed and closed
    with open("player_last_match.pkl", "wb") as out_file:
        pickle.dump(player_dict, out_file, pickle.HIGHEST_PROTOCOL)
コード例 #2
0
def extract_feats(start_date, end_date, feat_name_to_col_num=None):
    """
    Build one training/test row per eligible (player, year) pair.

    arguments:
      start_date is a datetime representing where to begin the data set
      end_date is a datetime representing where to end the data set
      (only the .year of each date is used, and both years are inclusive)
      feat_name_to_col_num is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that
      the columns of the test matrix align correctly.

    returns:
      a sparse design matrix, a dict mapping features to column-numbers,
      a vector of target values, and a list of "<player_id>-<year>" ids in order of their
      rows in the design matrix
    """
    # players ranked at or beyond this value are excluded
    rank_cutoff = 200
    # exclusive lower bound of the backwards search for an earlier season
    earliest_year = 1975
    # a previous season with this many matches or fewer is too thin to use
    min_matches_exclusive = 5

    start_year = int(start_date.year)
    end_year = int(end_date.year)

    feature_dicts = []
    targets = []
    ids = []

    # NOTE(review): the result of get_matches() is not used in this function;
    # the call is retained in case it has side effects - confirm, then remove.
    get_matches()
    all_players = get_players()

    # Both lookups are loop-invariant, so fetch them once instead of once per
    # player / once per (player, year). The set gives constant-time membership.
    eligible_pids = set(tennis_config.players_played_atp_since_2002())
    pid_to_inner_dict = tennis_config.pid_to_last_match_played_year_x()

    for player in all_players:
        # players who average a rank below 200 rarely partake in professional tournaments
        # so we don't include these low probability events in training our model
        if player["rank"] >= rank_cutoff:
            continue

        # only consider players who have played professional tennis tournaments since 2002
        pid = player["player_id"]
        if pid not in eligible_pids:
            continue

        # maps year -> (last match id, number of matches played that year)
        year_to_matchid_dict = pid_to_inner_dict[pid]

        # every unique datum is the historical metadata associated with a (player, year) tuple
        for year in range(start_year, end_year + 1):
            last_match_prev_yr = None
            num_matches_prev_yr = None

            # try to find the last match played by this player prior to the given year,
            # walking backwards from year - 1 and stopping just before earliest_year
            for yr in range(year - 1, earliest_year, -1):
                if yr in year_to_matchid_dict:
                    last_match_prev_yr, num_matches_prev_yr = year_to_matchid_dict[yr][:2]
                    break

            # continue if unable to find any previous matches for this player
            if last_match_prev_yr is None:
                continue

            # too few matches to use in training or make a prediction
            if num_matches_prev_yr <= min_matches_exclusive:
                continue

            # last match in the current year, or None if the player did not play this year
            if year in year_to_matchid_dict:
                last_match_curr_yr = year_to_matchid_dict[year][0]
            else:
                last_match_curr_yr = None

            # features only look at history up to the end of the previous season
            feature_dicts.append(tennis_feats(player, last_match_prev_yr))

            # the main statistic - no of QF+ finishes in professional tournaments
            targets.append(calculate_strong_finishes(player, last_match_curr_yr))

            ids.append(str(pid) + "-" + str(year))

    X, name_to_col = make_design_mat(feature_dicts, feat_name_to_col_num)
    return X, name_to_col, np.array(targets), ids