Ejemplo n.º 1
0
 def get_info(self, home_id, away_id, date, season):
     """Given the data and home/away team id's, get model features"""
     h_manager = get_manager(team_id=home_id, date=date)
     a_manager = get_manager(team_id=away_id, date=date)
     # Check that data was retrieved (catch the error sooner to speed up debugging)
     assert len(h_manager) > 0, 'No data returned for home manager'
     assert len(a_manager) > 0, 'No data returned for away manager'
     # Get the max date from the database
     conn, cursor = connect_to_db()
     max_date = run_query(cursor, 'select max(date) from main_fixtures')
     max_date = pd.to_datetime(max_date.iloc[0, 0])
     # set the fixture_id to be 1 higher than the max fixture_id for that season
     max_fixture = run_query(
         cursor,
         "select max(fixture_id) id from main_fixtures "
         "where date = '{}'".format(str(max_date)))
     max_fixture = max_fixture.iloc[0, 0]
     info_dict = {
         "date": date,
         "home_id": home_id,
         "home_team": fetch_name(home_id),
         "away_id": away_id,
         "away_team": fetch_name(away_id),
         "fixture_id": max_fixture,
         "home_manager_start": h_manager.loc[0, "start_date"],
         "away_manager_start": a_manager.loc[0, "start_date"],
         "season": season
     }
     output = pd.DataFrame()
     output = output.append(pd.DataFrame(info_dict, index=[0]))
     conn.close()
     return output
Ejemplo n.º 2
0
def find_closest_match(team_name):
    # Hardcoded some of the matches because it was too difficult to match accurately
    if team_name == 'Man City':
        output = {
            "team_name": 'Manchester City',
            "team_id": fetch_id("Manchester City")
        }
        return output
    elif team_name == 'Spurs':
        output = {"team_name": 'Tottenham', "team_id": fetch_id("Tottenham")}
        return output
    else:
        conn, cursor = connect_to_db()
        df = run_query(cursor, "Select team_name, team_id from team_ids")
        df['l_dist'] = df['team_name'].apply(
            lambda x: Levenshtein.ratio(x, team_name))
        max_similarity = max(df['l_dist'])
        closest_match = df.loc[df['l_dist'] == max_similarity,
                               ['team_name', 'team_id']].reset_index(drop=True)
        output = {
            "team_name": closest_match.loc[0, 'team_name'],
            "team_id": closest_match.loc[0, 'team_id']
        }
        conn.close()
        return output
async def get_fpl_teams():
    """Download upcoming fixture data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn, cursor = connect_to_db()
    df_teams = pd.DataFrame()
    i=0
    # Get all of the team IDs from the fpl_fixtures table
    teams = run_query(cursor, 'select distinct team_h from fpl_fixtures')
    n_teams = len(teams)
    assert n_teams == 20, 'The number of returned teams should be 20, ' \
                          'but is actually {}'.format(n_teams)
    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        team_data = await fpl.get_teams(return_json=True)

    for team in team_data:
        df_teams = df_teams.append(pd.DataFrame(team, index=[i]))
        i += 1
    # Find the closest matching team names to the ones in our data
    matches = list(df_teams['name'].apply(lambda x: find_closest_match(x)))
    df_matches = pd.DataFrame()
    for match in matches:
        df_matches = df_matches.append(pd.DataFrame(match, index=[0]))
    df_teams = pd.concat([df_teams, df_matches.reset_index(drop=True)], axis=1)
    # Upload the data to a table in the database
    run_query(cursor, 'DROP TABLE IF EXISTS fpl_teams', return_data=False)
    df_teams.to_sql('fpl_teams', conn)
    conn.close()
Ejemplo n.º 4
0
def get_feature_data(min_training_data_date='2013-08-01'):
    conn, cursor = connect_to_db()
    df = run_query(cursor, """select t1.*, m_h.manager home_manager,
     m_h.start_date home_manager_start, 
     m_a.manager away_manager, m_a.start_date away_manager_start 
     from main_fixtures t1 
     left join managers m_h 
     on t1.home_id = m_h.team_id 
     and (t1.date between m_h.start_date and date(m_h.end_date, '+1 day') 
     or t1.date > m_h.start_date and m_h.end_date is NULL) 
     left join managers m_a 
     on t1.away_id = m_a.team_id 
     and (t1.date between m_a.start_date and date(m_a.end_date, '+1 day') 
     or t1.date > m_a.start_date and m_a.end_date is NULL) 
     where t1.date > '{}'""".format(min_training_data_date))
    df=get_manager_features(df)
    df2 = run_query(cursor, "select * from team_fixtures where date > '{}'".format(
        min_training_data_date))
    conn.close()
    df2['date'] = pd.to_datetime(df2['date'])
    df2 = pd.merge(
        df2,
        df[['date', 'season', 'fixture_id', 'home_manager_age', 'away_manager_age',
            'home_manager_new', 'away_manager_new']],
        on=['date', 'season', 'fixture_id'],
        how="left")
    return df2
Ejemplo n.º 5
0
def get_defaut_collection():
    server = server_carrier.get_server()
    uri = server.config.get('MONGODB_URI')
    db_conn = connect_to_db(DatabaseTypes.MONGODB, uri,
                            DatabaseNames.python_test_db.value)
    member_coll = db_conn.get_collection(Collections.member.value)
    return member_coll
Ejemplo n.º 6
0
async def get_player_data():
    """Download all data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn, cursor = connect_to_db()
    summary_df = pd.DataFrame()

    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        players = await fpl.get_players()
        i = 0
        for player in players:
            id = player.id
            i += 1
            print("Loading data for player {} of {}".format(i, len(players)))
            player_summary = await fpl.get_player_summary(id)
            player_history = pd.DataFrame(player_summary.history)
            player_history['web_name'] = player.web_name
            player_history['id'] = player.id
            player_history['team'] = player.team
            player_history['team_code'] = player.team_code
            summary_df = summary_df.append(player_history)

    # Upload the data to a table in the database
    run_query(cursor, 'DROP TABLE IF EXISTS player_data', return_data=False)
    summary_df.to_sql('player_data', conn)
Ejemplo n.º 7
0
def fetch_name(team_id):
    """Get name from team ID"""
    conn, cursor = connect_to_db()
    cursor.execute("Select team_name from team_ids where team_id == ?",
                   [team_id])
    output = cursor.fetchone()[0]
    conn.close()
    return output
Ejemplo n.º 8
0
def fetch_id(team_name):
    """Get team ID from name"""
    conn, cursor = connect_to_db()
    cursor.execute(
        "Select team_id from team_ids where team_name = '{}' or alternate_name = '{}'"
        .format(team_name, team_name))
    output = cursor.fetchone()[0]
    conn.close()
    return output
Ejemplo n.º 9
0
def get_upcoming_games(n=10):
    """Get the next n upcoming games in the Premier League"""
    conn, cursor = connect_to_db()
    query = """select kickoff_time, t2.team_id home_id, t2.team_name home_name, 
    t3.team_id away_id, t3.team_name away_name
    from fpl_fixtures t1 left join fpl_teams t2 on t1.team_h = t2.id left 
    join fpl_teams t3 on t1.team_a = t3.id where started = 0 order by 
    kickoff_time limit {}""".format(n)
    df = run_query(cursor, query)
    return df
def get_teams_from_wiki():
    # Connect to database
    conn, cursor = connect_to_db()

    website_url = requests.get(
        'https://en.wikipedia.org/wiki/List_of_Premier_League_clubs').text

    soup = BeautifulSoup(website_url)
    # print(soup.prettify())

    My_table = soup.find('div', {'class': 'timeline-wrapper'})

    links = My_table.findAll('area')

    teams = []

    for link in links:
        team = link.get('alt').encode('UTF-8').decode()
        # Convert &26 to &
        team = team.replace('%26', '&')
        # Remove ' A.F.C' and ' F.C'
        team = team.replace('A.F.C', '')
        team = team.replace('F.C', '')
        # Remove any '.'s
        team = team.replace('.', '')
        # Remove white space from the start and end
        team = team.lstrip().rstrip()
        teams.append(team)

    df = pd.DataFrame()
    df['team_name'] = teams
    df = df.sort_values('team_name')
    df['team_id'] = np.arange(len(df)) + 1

    # Load the names into the database
    run_query(cursor, 'drop table if exists team_ids', return_data=False)
    run_query(
        cursor,
        'create table team_ids (team_name TEXT, team_id INTEGER, alternate_name)',
        return_data=False)

    for row in df.iterrows():
        params = [row[1]['team_name'], row[1]['team_id']]
        params.append(fetch_alternative_name(row[1]['team_name']))
        run_query(
            cursor,
            'insert into team_ids(team_name, team_id, alternate_name) values(?, ?, ?)',
            params=params,
            return_data=False)

    conn.commit()
    conn.close()
async def get_fpl_fixtures():
    """Download upcoming fixture data from the Fantasy Premier League website"""
    # Connect to the sqlite3 DB
    conn, cursor = connect_to_db()
    fixture_list = pd.DataFrame()

    async with aiohttp.ClientSession() as session:
        fpl = FPL(session)
        fixtures = await fpl.get_fixtures(return_json=True)
    for fixture in fixtures:
        del fixture['stats']
        fixture_list = fixture_list.append(pd.DataFrame(fixture, index=[0]))

    # Upload the data to a table in the database
    run_query(cursor, 'DROP TABLE IF EXISTS fpl_fixtures', return_data=False)
    fixture_list.to_sql('fpl_fixtures', conn)
Ejemplo n.º 12
0
def get_manager(team_id, date):
    """Find the sitting manager for a given team_id and date"""
    conn, cursor = connect_to_db()
    # Get the latest date in the db
    max_date = run_query(cursor, "select max(end_date) date from managers").loc[0, 'date']
    # If we are predicting past our data
    if date > max_date:
        # Take the latest manager for the team
        query = """select * from managers where end_date = '{}' and 
        team_id = {}""".format(max_date, team_id)
    else:
        query = """select * from managers where team_id = {} 
                    and '{}' between start_date and end_date""".format(team_id, date)
    df = run_query(cursor, query)
    rows = len(df)
    conn.close()
    if rows != 1:
        logger.warning("get_manager: Expected 1 row but got {}. Is the manager "
                       "info up to date?".format(rows))
    return df
Ejemplo n.º 13
0
 def train_model(self, X, y):
     """Train a model on 90% of the data and predict 10% using KFold validation,
     such that a prediction is made for all data"""
     logger.info("Training model.")
     kf = KFold(n_splits=10)
     model_predictions = pd.DataFrame()
     for train_index, test_index in kf.split(X):
         xgb_model = xgb.XGBClassifier().fit(
             X=np.array(X.iloc[train_index, :][self.model_features]),
             y=np.array(y.iloc[train_index]))
         predictions = xgb_model.predict(np.array(X.iloc[test_index, :][self.model_features]))
         actuals = y.iloc[test_index]
         model_predictions = model_predictions.append(
             pd.concat([
                 X.iloc[test_index, :],
                 pd.DataFrame(predictions, columns=['pred'], index=X.iloc[test_index, :].index),
                 actuals], axis=1))
     # Assess the model performance using the first performance metric
     main_performance_metric = self.performance_metrics[0].__name__
     performance = self.performance_metrics[0](actuals, predictions)
     # If the model performs better than the previous model, save it
     # ToDo: Returning 0 when there is no performance score only works
     #  for performance scores where higher is better
     if performance > self.performance.get(main_performance_metric, 0):
         self.trained_model = xgb_model
         for metric in self.performance_metrics:
             metric_name = metric.__name__
             self.performance[metric_name] = metric(actuals, predictions)
     # Upload the predictions to the model_predictions table
     conn, cursor = connect_to_db()
     # Add model ID so we can compare model performances
     model_predictions['model_id'] = self.model_id
     # Add profit made if we bet on the game
     model_predictions['profit'] = model_predictions.apply(lambda x: get_profit(x), axis=1)
     run_query(cursor, "drop table if exists historic_predictions",
               return_data=False)
     if (not self.test_mode ) or self.upload_historic_predictions:
         model_predictions.to_sql(
             'historic_predictions', con=conn, if_exists='append')
     conn.close()
Ejemplo n.º 14
0
 def get_historic_predictions(self):
     conn, cursor = connect_to_db()
     df = run_query(
         cursor, "select * from historic_predictions where "
                 "model_id = '{}'".format(self.model_id))
     return df
Ejemplo n.º 15
0
import numpy as np
from sklearn.preprocessing import QuantileTransformer
from keras import regularizers
from keras.activations import tanh
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import LSTM, Masking
from keras.optimizers import adam
from keras.callbacks import History
import tensorflow as tf
import talos as ta
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Connect to database
conn, cursor = connect_to_db()

# Get all fixtures after game week 8, excluding the last game week
df = run_query(
    cursor,
    "select t1.*, m_h.manager home_manager, m_h.start home_manager_start, "
    "m_a.manager away_manager, m_a.start away_manager_start "
    "from main_fixtures t1 "
    "left join managers m_h "
    "on t1.home_id = m_h.team_id "
    "and (t1.date between m_h.start and date(m_h.end, '+1 day') or t1.date > m_h.start and m_h.end is NULL) "
    "left join managers m_a "
    "on t1.away_id = m_a.team_id "
    "and (t1.date between m_a.start and date(m_a.end, '+1 day') or t1.date > m_a.start and m_a.end is NULL) "
    "where t1.date > '2013-08-01'")
Ejemplo n.º 16
0
 def get_training_data(self):
     conn, cursor = connect_to_db()
     # Get all fixtures after game week 8, excluding the last game week
     df = run_query(cursor, self.training_data_query)
     return df
Ejemplo n.º 17
0
def combine_team_poisson_probabilities():

    # Connect to database
    conn, cursor = connect_to_db()

    # Extract data, join the poisson_team_odds table on itself to combine the home and away teams.
    query = '''
    select
        -- IDENTIFIERS
        fixture_id, date, season, home_team, away_team,
        -- SCORE PROBABILITIES
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_0_a + ga_0_h) p0_0,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_0_a + ga_0_h) p1_0,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_0_a + ga_0_h) p2_0,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_0_a + ga_0_h) p3_0,
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_1_a + ga_1_h) p0_1,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_1_a + ga_1_h) p1_1,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_1_a + ga_1_h) p2_1,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_1_a + ga_1_h) p3_1,
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_2_a + ga_2_h) p0_2,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_2_a + ga_2_h) p1_2,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_2_a + ga_2_h) p2_2,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_2_a + ga_2_h) p3_2,
        0.5 * (gf_0_h + ga_0_a) * 0.5 * (gf_3_a + ga_3_h) p0_3,
        0.5 * (gf_1_h + ga_1_a) * 0.5 * (gf_3_a + ga_3_h) p1_3,
        0.5 * (gf_2_h + ga_2_a) * 0.5 * (gf_3_a + ga_3_h) p2_3,
        0.5 * (gf_3_h + ga_3_a) * 0.5 * (gf_3_a + ga_3_h) p3_3
    from
        (
        select 
            -- IDENTIFIERS
            t1.fixture_id, t1.date, t1.season, t1.team_name home_team, t2.team_name away_team,
            -- HOME TEAM PROBABILITIES
            t1.gf_prob_0 gf_0_h, t1.gf_prob_1 gf_1_h, t1.gf_prob_2 gf_2_h, t1.gf_prob_3 gf_3_h, 
            t1.ga_prob_0 ga_0_h, t1.ga_prob_1 ga_1_h, t1.ga_prob_2 ga_2_h, t1.ga_prob_3 ga_3_h, 
            -- AWAY TEAM PROBABILITIES
            t2.gf_prob_0 gf_0_a, t2.gf_prob_1 gf_1_a, t2.gf_prob_2 gf_2_a, t2.gf_prob_3 gf_3_a, 
            t2.ga_prob_0 ga_0_a, t2.ga_prob_1 ga_1_a, t2.ga_prob_2 ga_2_a, t2.ga_prob_3 ga_3_a
        from 
            (select * from poisson_team_odds where is_home = 1) t1 -- home_teams
        left join 
            (select * from poisson_team_odds where is_home = 0) t2 -- away_teams
        on t1.fixture_id = t2.fixture_id 
        and t1.season = t2.season 
        and t1.team_name is not t2.team_name
        )'''

    df = run_query(cursor, query)

    df['prob_sum'] = df.select_dtypes('float64').apply(lambda x: sum(x), axis=1)

    # Round floats to 2 decimal places
    float_columns = df.select_dtypes(include='float64').columns
    df[float_columns] = round(df.select_dtypes(include='float64'), 4)

    # Create the DB table to store data
    cursor.execute("DROP TABLE IF EXISTS poisson_match_probabilities")
    cursor.execute("""CREATE TABLE poisson_match_probabilities (fixture_id INT, date DATE, season TEXT, 
    home_team TEXT, away_team TEXT, p0_0 FLOAT, p1_0 FLOAT, p2_0 FLOAT, p3_0 FLOAT, p0_1 FLOAT, p1_1 FLOAT, 
    p2_1 FLOAT, p3_1 FLOAT, p0_2 FLOAT, p1_2 FLOAT, p2_2 FLOAT, p3_2 FLOAT, p0_3 FLOAT, p1_3 FLOAT, p2_3 FLOAT, 
    p3_3 FLOAT, prob_sum FLOAT)""")

    # Load data into the DB
    for row in df.iterrows():
        params = [row[1][0], str(row[1][1]), str(row[1][2]), str(row[1][3]), row[1][4], row[1][5], row[1][6],
                  row[1][7], row[1][8], row[1][9], row[1][10], row[1][11], row[1][12], row[1][13], row[1][14],
                  row[1][15], row[1][16], row[1][17], row[1][18], row[1][19], row[1][20], row[1][21]]
        cursor.execute('insert into poisson_match_probabilities VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', params)

    conn.commit()
Ejemplo n.º 18
0
def create_team_poisson_probabilities():

    # Connect to database
    conn, cursor = connect_to_db()

    # Extract data
    query = 'select fixture_id, team_name, date, season, is_home, goals_for, goals_against from team_fixtures'
    df = run_query(cursor, query)

    # Sort the data by season, team_name and date
    df = df.sort_values(['season', 'team_name', 'date'])

    # Calculate the moving average
    # NOTE: The moving average value uses the previous games data, not the goals_for and goals_against on that row.
    df['goals_for_mavg'] = df.groupby(['team_name', 'season'])['goals_for'].\
        transform(lambda x: x.rolling(8, 8).mean()).shift(1)
    df['goals_against_mavg'] = df.groupby(['team_name', 'season'])['goals_against'].\
        transform(lambda x: x.rolling(8, 8).mean()).shift(1)

    # Calculate the moving standard deviation
    #df['goals_for_sdavg'] = df.groupby(['team_name', 'season'])['goals_for'].\
    #    transform(lambda x: x.rolling(8, 8).std()).shift(1)
    #df['goals_against_sdavg'] = df.groupby(['team_name', 'season'])['goals_against'].\
    #    transform(lambda x: x.rolling(8, 8).std()).shift(1)

    # Get goals_for probabilities
    df['gf_prob_0'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(0, x))
    df['gf_prob_1'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(1, x))
    df['gf_prob_2'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(2, x))
    df['gf_prob_3'] = df['goals_for_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(3, x))
    df['gf_prob_other'] = df.apply(lambda x: 1-(x['gf_prob_0'] + x['gf_prob_1'] +
                                                x['gf_prob_2'] + x['gf_prob_3']), axis=1)

    # Get goals_against probabilities
    df['ga_prob_0'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(0, x))
    df['ga_prob_1'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(1, x))
    df['ga_prob_2'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(2, x))
    df['ga_prob_3'] = df['goals_against_mavg'].apply(lambda x: scipy.stats.distributions.poisson.pmf(3, x))
    df['ga_prob_other'] = df.apply(lambda x: 1 - (x['ga_prob_0'] + x['ga_prob_1'] +
                                                  x['ga_prob_2'] + x['ga_prob_3']), axis=1)

    # ToDo: Get the data also in key-value pair, e.g type: goals_for/goals_against, amount:, value: for easier plotting
    
    # Create the DB table to store data
    cursor.execute("DROP TABLE IF EXISTS poisson_team_odds")
    cursor.execute("""CREATE TABLE poisson_team_odds (fixture_id INT, team_name TEXT, date DATE, season TEXT, 
    is_home INT, goals_for FLOAT, goals_against FLOAT, goals_for_mavg FLOAT, goals_against_mavg FLOAT, gf_prob_0 FLOAT, 
    gf_prob_1 FLOAT, gf_prob_2 FLOAT, gf_prob_3 FLOAT, ga_prob_0 FLOAT, ga_prob_1 FLOAT, ga_prob_2 FLOAT, 
    ga_prob_3 FLOAT)""")

    # Remove Nans (the first 8 games for each team)
    df = df.dropna()

    # Round floats to 2 decimal places
    float_columns = df.select_dtypes(include='float64').columns
    df[float_columns] = round(df.select_dtypes(include='float64'), 4)

    # Load data into the DB
    for row in df.iterrows():
        params = [row[1][0], str(row[1][1]), str(row[1][2]), str(row[1][3]), row[1][4], row[1][5], row[1][6],
                  row[1][7], row[1][8], row[1][9], row[1][10], row[1][11], row[1][12], row[1][13], row[1][14],
                  row[1][15], row[1][16]]
        cursor.execute('''insert into poisson_team_odds VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)''', params)

    conn.commit()