def scrape_savant():
    """Scrape the Baseball Savant custom hitter leaderboard CSV, archive it
    under a dated filename, refresh the "latest" symlink, and load the stats
    into the tracking.savant Postgres table.

    Side effects: Selenium browser session, filesystem moves, and a full
    table replace in the database.  Returns nothing.
    """
    from datetime import date
    import os
    import shutil
    import pandas as pd
    from general import selenium_utilities
    from general import postgres
    from munging import player_names

    driver = selenium_utilities.start_driver()
    draft_url = "https://baseballsavant.mlb.com/leaderboard/custom?year=2021&type=batter&filter=&sort=4&sortDir=desc&min=10&selections=b_total_pa,xba,xslg,woba,xwoba,xobp,xiso,wobacon,xwobacon,exit_velocity_avg,launch_angle_avg,barrel_batted_rate,hard_hit_percent,sprint_speed,&chart=false&x=xba&y=xba&r=no&chartType=beeswarm"
    driver.get(draft_url)
    print('Arrived at ' + driver.current_url)

    # Click the CSV download button; the browser drops the file in Downloads.
    # NOTE(review): there is no explicit wait for the download to complete
    # before the move below — TODO confirm timing is reliable.
    input_dl = driver.find_element_by_id('btnCSV')
    input_dl.click()

    basepath = "/Users/andrewfelton/Documents/bb/bb-2021"
    dl_file = "/Users/andrewfelton/Downloads/docker/stats.csv"

    # Archive the download under a dated filename.  shutil.move replaces the
    # previous `os.popen('mv ...')`: no shell parsing of unquoted paths, and
    # it raises on failure instead of failing silently.
    today = date.today().strftime("%Y%m%d")
    new_file = basepath + "/data/savant/hitter_stats_" + today + ".csv"
    shutil.move(dl_file, new_file)

    # Refresh the "latest" soft link (equivalent of `ln -sf`).
    ln_file = basepath + "/data/savant/hitter_stats.csv"
    if os.path.lexists(ln_file):
        os.remove(ln_file)
    os.symlink(new_file, ln_file)

    driver.close()
    print("Finished scraping " + ln_file)

    savant = pd.read_csv(ln_file)
    savant.insert(0, 'asof_date', date.today().strftime('%Y-%m-%d'))

    # Merge in the player names and FG IDs
    savant.rename(columns={'player_id': 'mlb_id'}, inplace=True)
    savant['mlb_id'] = savant['mlb_id'].apply(str)
    names = player_names.get_player_names()
    savant = savant.merge(right=names[['mlb_id', 'fg_id']],
                          how='left',
                          on='mlb_id')

    # Keep only the asof date, FG id, and the Statcast metrics of interest.
    savant = savant[[
        'asof_date', 'fg_id', 'b_total_pa', 'xba', 'xslg', 'woba', 'xwoba',
        'xobp', 'xiso', 'wobacon', 'xwobacon', 'exit_velocity_avg',
        'launch_angle_avg', 'barrel_batted_rate', 'hard_hit_percent',
        'sprint_speed'
    ]]

    schema = 'tracking'
    tablename = 'savant'
    bbdb = postgres.connect_to_bbdb()
    savant.to_sql(tablename, bbdb, schema=schema, if_exists='replace')
# Example #2 (score: 0)
def create_combined_valuations(league):
    """Build combined hitter + pitcher valuations for a league and publish
    them to the "BB 2021 InSeason" Google Sheet.

    league: a league object exposing at least .league_name ('SoS' or 'Legacy');
    also passed through to the sibling create_combined_*_valuations helpers.
    """
    import pandas as pd
    import gspread
    import gspread_dataframe as gsdf
    import gspread_formatting as gsfmt

    from general import postgres
    from munging import player_names
    from general import gs

    assert league.league_name in ['SoS', 'Legacy']
    #league = 'SoS'
    #league = 'Legacy'

    bbdb = postgres.connect_to_bbdb()
    names = player_names.get_player_names()
    gc = gspread.service_account(filename='./bb-2021-2b810d2e3d25.json')
    #bb2021 = gc.open("BB 2021 " + league.league_name)
    bb2021 = gc.open("BB 2021 InSeason")

    # Valuations come from sibling helpers defined elsewhere in this module.
    combined_hitters = create_combined_hitter_valuations(league)
    combined_pitchers = create_combined_pitcher_valuations(league)

    # Clear and rewrite the per-league hitter projections worksheet.
    hitter_projections = bb2021.worksheet('Hitter Projections - ' +
                                          league.league_name)
    bb2021.values_clear(hitter_projections.title + "!A:Z")
    gsdf.set_with_dataframe(hitter_projections, combined_hitters)
    # NOTE(review): bare attribute access — `.update` is never called, so this
    # line is a no-op.  Presumably `.update()` (or nothing) was intended.
    hitter_projections.update
    # NOTE(review): `format_gs` is not imported in this function; it must be a
    # module-level import — verify.  Also the kwargs here (league=league)
    # differ from the pitcher call below (league=league.league_name) — confirm
    # which form format_gs_all expects.
    format_gs.format_gs_all(league=league, ls=league, type='hitting')

    # Clear and rewrite the per-league pitcher projections worksheet.
    pitcher_projections = bb2021.worksheet('Pitcher Projections - ' +
                                           league.league_name)
    bb2021.values_clear(pitcher_projections.title + "!A:Z")
    gsdf.set_with_dataframe(pitcher_projections, combined_pitchers)
    # NOTE(review): no-op attribute access, same as above.
    pitcher_projections.update
    format_gs.format_gs_all(league=league.league_name,
                            ls=league,
                            type='pitching')

    # Stack hitters and pitchers into one sheet, sorted by dollar value.
    combined = pd.concat([
        combined_hitters[['name', 'fg_id', 'type', 'zar', 'value']],
        combined_pitchers[['name', 'fg_id', 'type', 'zar', 'value']]
    ])
    combined = combined.sort_values(by='value', ascending=False)

    gs_combined = bb2021.worksheet('Combined Z')
    gsdf.set_with_dataframe(gs_combined, combined)
    # NOTE(review): no-op attribute access, same as above.
    gs_combined.update

    # One-decimal number format on the zar/value columns (D:E).
    gsfmt.format_cell_range(
        gs_combined, 'D:E',
        gsfmt.CellFormat(
            numberFormat=gsfmt.NumberFormat(type='NUMBER', pattern='0.0')))
def create_actuals_pitchers(ls, year=2021):
    """Pull actual pitching stats for `year` from the bbref tracking tables,
    clean them, and return a DataFrame keyed by fg_id.

    ls: league-settings object exposing .pitching_stats (list of stat columns).
    year: season to query (int).
    Returns a de-duplicated DataFrame: fg_id, name, team, ip, plus ls.pitching_stats.
    """
    import pandas as pd

    from general import utilities
    from general import postgres
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()
    # Join standard pitching with starter (GS/QS) and reliever (SV/Hold)
    # splits on (bbref_id, year, Tm).
    query = (
        'SELECT pit_std.year, pit_std.bbref_id, pit_std."Tm" as team, pit_std."IP" as ip, pit_start."GS" as gs, pit_start."QS" as qs, pit_std."SO" as so, pit_std."ERA" as era, pit_std."WHIP" as whip, pit_relief."SV" as sv, pit_relief."Hold" as hld FROM '
        + '(SELECT * FROM tracking.bbref_pitching_standard) as pit_std ' +
        'LEFT JOIN (SELECT * FROM tracking.bbref_pitching_starter) as pit_start ON pit_std.bbref_id=pit_start.bbref_id AND pit_std.year=pit_start.year AND pit_std."Tm"=pit_start."Tm" '
        +
        'LEFT JOIN (SELECT * FROM tracking.bbref_pitching_reliever) as pit_relief ON pit_std.bbref_id=pit_relief.bbref_id AND pit_std.year=pit_relief.year AND pit_std."Tm"=pit_relief."Tm" '
        + 'WHERE pit_std.year=' + str(year))
    df = pd.read_sql_query(query, bbdb)
    # Convert bbref innings notation (X.1 / X.2 = thirds of an inning) to
    # decimal.  Literal suffix replacement — assumes IP strings only ever end
    # in .0/.1/.2 as bbref emits them; TODO confirm no other '.1'/'.2'
    # substrings can occur.
    df['ip'] = df['ip'].str.replace('.1', '.33', regex=False)
    df['ip'] = df['ip'].str.replace('.2', '.67', regex=False)
    # NULLs (e.g. relievers with no starter split) become 0.
    df = df.fillna(value={
        'era': 0,
        'whip': 0,
        'gs': 0,
        'qs': 0,
        'sv': 0,
        'hld': 0
    })
    # Blank/whitespace-only strings also become 0 before the numeric casts.
    for c in ['gs', 'qs', 'so', 'sv', 'hld']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(int)
    for c in ['ip', 'era', 'whip']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(float)
    # Combined saves-plus-holds category.
    df['svhld'] = df['sv'] + df['hld']
    # Drop rows with no bbref id (e.g. league-total rows).
    df = df[(df['bbref_id'].notnull()) & (df['bbref_id'] != u'')]

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_pitchers = df.merge(names[['bbref_id', 'fg_id', 'name']],
                                 on='bbref_id',
                                 how='left')
    output_stats = utilities.flatten([['fg_id', 'name', 'team', 'ip'],
                                      [ls.pitching_stats]])
    combined_pitchers = combined_pitchers[output_stats]
    combined_pitchers.drop_duplicates(inplace=True)

    return combined_pitchers
def create_actuals_hitters(ls, year=2021):
    """Pull actual batting stats for `year` from the bbref tables, clean them,
    and return a DataFrame keyed by fg_id.

    ls: league-settings object exposing .hitting_stats (list of stat columns).
    year: season to query (int).
    Returns a de-duplicated DataFrame: fg_id, bbref_id, name, team, pa, plus
    ls.hitting_stats.
    """
    import pandas as pd
    from general import utilities
    from general import postgres
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()

    # In-season (2021) actuals live in the 'tracking' schema; prior seasons
    # are archived in 'reference'.  (Renamed from `tablename` — this value is
    # a schema name, not a table name.)
    if year == 2021:
        schema = 'tracking'
    else:
        schema = 'reference'

    query = (
        'SELECT year, bbref_id, bat."Tm" as team, bat."PA" as pa, ' +
        'bat."HR" as hr, bat."R" as r, bat."RBI" as rbi, bat."SB" as sb, bat."OBP" as obp, bat."OPS" as ops '
        + 'FROM ' + schema + '.bbref_batting_standard bat WHERE year=' +
        str(year))
    df = pd.read_sql_query(query, bbdb)
    # NULLs become 0 before the numeric casts below.
    df = df.fillna(value={
        'obp': 0,
        'ops': 0,
        'pa': 0,
        'r': 0,
        'rbi': 0,
        'sb': 0
    })
    # Blank/whitespace-only strings also become 0, then cast to numerics.
    for c in ['pa', 'r', 'rbi', 'hr', 'sb']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(int)
    for c in ['obp', 'ops']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(float)
    # Drop rows with no bbref id (e.g. league-total rows).
    df = df[(df['bbref_id'].notnull()) & (df['bbref_id'] != u'')]

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_hitters = df.merge(names[['bbref_id', 'fg_id', 'name']],
                                on='bbref_id',
                                how='left')
    output_stats = utilities.flatten(
        [['fg_id', 'bbref_id', 'name', 'team', 'pa'], [ls.hitting_stats]])
    combined_hitters = combined_hitters[output_stats]
    combined_hitters.drop_duplicates(inplace=True)
    return combined_hitters
# Example #5 (score: 0)
def get_fg_id_from_ff_id(ff_id):
    """Find the master player-name record that best matches a Fleaflicker player.

    Fetches the Fleaflicker record for `ff_id`, fuzzy-matches its name
    against the master player-name list, and returns the best-matching
    row as a dict.
    """
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process

    from scraping import scrape_ff
    from munging import player_names

    # Pull the Fleaflicker record for this player id.
    ff_player = scrape_ff.get_ff_player_info(ff_id)

    # Fuzzy-match the FF name against the master name list; keep the top hit.
    all_names = player_names.get_player_names()
    top_hit = process.extract(ff_player['name'],
                              all_names['name'],
                              limit=1,
                              scorer=fuzz.token_sort_ratio)

    # Each hit is (matched name, score, row label in the names frame).
    _matched_name, _score, match_row = top_hit[0]
    return all_names.loc[match_row].to_dict()
def create_last30_hitters(ls):
    """Pull last-30-day batting stats from tracking.batters_last30, clean
    them, and return a DataFrame keyed by fg_id.

    ls: league-settings object exposing .hitting_stats (list of stat columns).
    Returns a de-duplicated DataFrame: fg_id, name, team, pa, plus
    ls.hitting_stats.
    """
    import pandas as pd
    from general import utilities
    from general import postgres
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()

    # OPS is derived in SQL since the source table stores obp/slg separately.
    query = (
        'SELECT bat.fg_id, bat.team, bat.pa, ' +
        'bat.hr, bat.r, bat.rbi, bat.sb, bat.obp, bat.obp+bat.slg as ops ' +
        'FROM tracking.batters_last30 AS bat')
    df = pd.read_sql_query(query, bbdb)
    # NULLs become 0 before the numeric casts below.
    df = df.fillna(value={
        'obp': 0,
        'ops': 0,
        'pa': 0,
        'r': 0,
        'rbi': 0,
        'sb': 0
    })
    # Blank/whitespace-only strings also become 0, then cast to numerics.
    for c in ['pa', 'r', 'rbi', 'hr', 'sb']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(int)
    for c in ['obp', 'ops']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(float)
    #df = df[(df['fg_id'].notnull()) & (df['fg_id']!=u'')]

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_hitters = df.merge(names[['fg_id', 'name']],
                                on='fg_id',
                                how='left')
    output_stats = utilities.flatten([['fg_id', 'name', 'team', 'pa'],
                                      [ls.hitting_stats]])
    combined_hitters = combined_hitters[output_stats]
    combined_hitters.drop_duplicates(inplace=True)
    return combined_hitters
def scrape_razz(mytype, url, logged_in_driver=False, merge_ownership=True):
    """Scrape a Razzball stats table, merge in FG IDs (and optionally
    fantasy-team ownership), archive the result as CSV, and load it into the
    proj schema.

    mytype: table/type name on Razzball; also used as the destination table.
    url: Razzball page containing the #neorazzstatstable table.
    logged_in_driver: an already-logged-in Selenium driver, or False to
        create one and log in here.
    merge_ownership: when True, merge SoS/Legacy roster ownership columns.
    Returns the scraped DataFrame.
    """
    from bs4 import BeautifulSoup
    import pandas as pd
    from datetime import datetime
    from datetime import date
    import os
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from general import selenium_utilities
    from general import postgres
    from munging import player_names
    from munging import rosters

    print('Going to scrape '+mytype+' from '+url)

    if not logged_in_driver:
        driver = selenium_utilities.start_driver()
        waiter = WebDriverWait(driver, 10)

        # Get the home page
        driver.get("https://razzball.com/")
        #expected condition
        waiter.until(EC.presence_of_element_located((By.ID, 'sitemenu')))
        #JavaScript Executor to stop page load
        driver.execute_script("window.stop();")

        # Check if already logged in
        try:
            user_info = driver.find_element_by_id('wp-admin-bar-user-info')
        except NoSuchElementException:
            # If not logged in, then log in
            driver.get("https://razzball.com/wp-login.php?redirect_to=https%3A%2F%2Frazzball.com")
            input_login = driver.find_element_by_id('user_login')
            input_login.send_keys('*****@*****.**')
            input_pw = driver.find_element_by_id('user_pass')
            # SECURITY(review): hard-coded credentials checked into source —
            # move to an environment variable or secret store.
            input_pw.send_keys('36Pm4jKml7')
            input_submit = driver.find_element_by_id('wp-submit')
            input_submit.click()
    else:
        # BUGFIX: previously `driver` was never bound when a logged-in driver
        # was supplied, causing a NameError at driver.get(url) below.
        driver = logged_in_driver

    # Go to the projections page
    driver.get(url)
    print("Arrived at "+url)

    # Copy the csv window into BS
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('table', id='neorazzstatstable')
    # Close it down
    driver.close()

    streamers = []

    colnames = []
    # Get the list of column names, normalizing symbols to identifier-safe text.
    ths = table.findAll('th')
    for th in ths:
        colname = th.text.lower()
        colname = colname.replace('#', 'rank').replace('$','value').replace('!','').replace('%','pct_')
        colnames.append(colname)
    # Insert Razz ID before Name (the id is parsed from the name cell's link).
    colnames.insert(colnames.index('name'), 'razz_id')

    # Loop through all the rows and append them to the list
    trows = table.findAll('tr')
    for trow in trows:
        streamer = []
        tds = trow.findAll('td')
        append = True

        # Skip sorter/header utility rows.
        if [] == list(set(trow['class']) &
                      set(['class=sorter-head', 'tablesorter-headerRow', 'tablesorter-ignoreRow'])):
            for loc, td in enumerate(tds):
                # loc+1 compensates for the synthetic razz_id column inserted
                # before 'name' in colnames.
                if (loc+1)==colnames.index('name'):
                    player_url = td.find('a')['href']
                    player_id = player_url.split('/')[4]
                    player_id = '660271' if (str(player_id) == '6602710') else player_id  # Manual correction for Ohtani
                    streamer.append(player_id)  # Razz ID
                    player_name = td.find('a').text
                    streamer.append(player_name)  # player name
                elif ('date' in colnames) and (loc+1)==colnames.index('date'):
                    # Razzball dates omit the year; assume 2021.
                    date_str =td.text + '/2021'
                    streamdate = datetime.strptime(date_str, '%m/%d/%Y')
                    streamer.append(streamdate) # Date
                else:
                    # Numeric cells become floats; everything else stays text.
                    try:
                        value = float(td.text)
                    except ValueError:
                        value = td.text
                    streamer.append(value)

            # Some times there are entries with missing values -- do not include those in the dataframe
            for var in ['pa', 'ip']:
                if var in colnames and (
                        str(streamer[colnames.index(var)])=='' or
                        str(streamer[colnames.index(var)])=='None'
                ):
                    append = False
            if streamer[colnames.index('razz_id')]==1.0:
                append = False
            if 'value' in colnames:
                if str(streamer[colnames.index('value')])=='':
                    append = False
            if append:
                streamers.append(streamer)
    df_streamers = pd.DataFrame(streamers, columns=colnames)
    # Belt-and-suspenders: drop any razz_id==1.0 rows that slipped through.
    df_streamers.drop(df_streamers[df_streamers['razz_id']==1.0].index, inplace=True)

    # fix date
    #date_str = tds[4].text + '/2021'
    #streamdate = datetime.strptime(date_str, '%m/%d/%Y')

    # Razz IDs are MLBAM ids; merge FG ids via mlb_id, falling back to the
    # razz_id itself when no FG id is known.
    names = player_names.get_player_names()
    df_streamers = df_streamers.merge(right=names[['mlb_id', 'fg_id']], how='left', left_on='razz_id', right_on='mlb_id')
    df_streamers['fg_id'] = df_streamers.apply(lambda row: row['fg_id'] if str(row['fg_id'])!='nan' else row['razz_id'], axis=1)

    if merge_ownership:
        ff_rosters = rosters.get_ff_ownership().rename(columns={"Team": "SoS_Team"})
        legacy_rosters = rosters.get_legacy_ownership().rename(columns={"Team": "Legacy_Team"})
        df_streamers = df_streamers.merge(
            right=ff_rosters[['SoS_Team', 'fg_id']], how='left', on='fg_id').merge(
                right=legacy_rosters[['Legacy_Team', 'fg_id']], how='left', on='fg_id')

    # Save on computer as .csv file
    mysystem = 'razz'
    today = date.today().strftime("%Y%m%d")
    basename = "/Users/andrewfelton/Documents/bb/bb-2021/data/" + mysystem + '/' + mysystem + "_" + mytype
    new_file = basename + "_" + today + ".csv"
    df_streamers.to_csv(new_file)

    # create the soft link
    ln_file = basename + ".csv"
    command_ln = os.popen('ln -sf ' + new_file + ' ' + ln_file)
    # NOTE(review): this prints the popen object, not the command output.
    print(command_ln)

    # Upload to the database
    tablename = mytype
    bbdb = postgres.connect_to_bbdb()

    # Truncate-then-append rather than if_exists='replace' so the table's
    # schema/permissions are preserved.
    query_tables = "SELECT * FROM pg_catalog.pg_tables WHERE schemaname='proj';"
    tables_list_result = bbdb.execute(query_tables)
    tables_list = []
    for table in tables_list_result:
        tables_list.append(table[1])

    if (tablename in tables_list):
        command = 'TRUNCATE TABLE proj."'+tablename+'"'
        bbdb.execute(command)
    df_streamers.to_sql(tablename, bbdb, schema='proj', if_exists='append', index=False)

    return df_streamers
# Example #8 (score: 0)
def scrape_yahoo_player_pool():
    """Scrape the Yahoo fantasy player pool (hitters and pitchers), derive a
    normalized eligibility string, merge FG ids, and replace the
    reference.player_pool_yahoo table.

    Returns the player-pool DataFrame.
    """
    # This loops through the FF player pages and saves the player name, id, and eligibility to the database
    from bs4 import BeautifulSoup
    import pandas as pd
    import requests
    import time
    import unidecode

    from munging import player_names
    from general import postgres


    # EXTRACT — page through the hitter and pitcher lists, 25 players at a time.
    pitcher_base_url = 'https://baseball.fantasysports.yahoo.com/b1/26574/players?status=ALL&pos=P&cut_type=33&stat1=S_S_2021&myteam=0&sort=R_PO&sdir=1&count='
    hitter_base_url  = 'https://baseball.fantasysports.yahoo.com/b1/26574/players?status=ALL&pos=B&cut_type=33&stat1=S_S_2021&myteam=0&sort=R_PO&sdir=1&count='

    players = []
    for baseurl in [hitter_base_url, pitcher_base_url]:
        for i in range(0, 401, 25):
            url = baseurl + str(i)
            page = requests.get(url)
            print('Got '+url)
            time.sleep(1)  # be polite to Yahoo
            soup = BeautifulSoup(page.text, 'html.parser')
            table = soup.find('div', {'id':'players-table'}).find('table')

            for trow in table.find('tbody').find_all('tr'):
                player_div = trow.find('div', {'class':'ysf-player-name'})
                player_name = unidecode.unidecode(player_div.find('a').text)
                player_url = player_div.find('a')['href']
                player_id = player_url.split('/')[-1].split('-')[-1]
                # The Fz-xxs span holds "TEAM - POS1,POS2".
                player_team_elig = player_div.find('span', {'class':'Fz-xxs'}).text.split('-')
                player_team = player_team_elig[0].strip()
                player_elig = player_team_elig[1].strip()
                players.append([player_id, player_name, player_url, player_team, player_elig])

    df_players = pd.DataFrame(players, columns=['yahoo_id', 'yahoo_name', 'yahoo_url', 'yahoo_team', 'yahoo_elig'])

    # TRANSFORM
    def combine_eligibilities(row):
        """Map Yahoo's comma-separated position list to this league's
        eligibility string (adds MI/CI/IF composites)."""
        yahoo_elig_list = row['yahoo_elig'].split(',')
        eligibilities = []

        # Utility/DH-only.  BUGFIX: was `yahoo_elig_list == 'Util'`, which
        # compares a list to a string and is always False; test membership.
        if 'Util' in yahoo_elig_list:
            eligibilities.append('UT')

        # Infielders
        for pos in ['C', '1B', '2B', 'SS', '3B']:
            if pos in yahoo_elig_list:
                eligibilities.append(pos)
        if '2B' in eligibilities or 'SS' in eligibilities:
            eligibilities.append('MI')
        if '1B' in eligibilities or '3B' in eligibilities:
            eligibilities.append('CI')
        if 'MI' in eligibilities or 'CI' in eligibilities:
            eligibilities.append('IF')

        # Outfielders collapse to a single OF eligibility.
        for pos in ['OF', 'RF', 'LF', 'CF']:
            if pos in yahoo_elig_list and 'OF' not in eligibilities:
                eligibilities.append('OF')

        # Pitchers
        for pos in ['SP', 'RP']:
            if pos in yahoo_elig_list:
                eligibilities.append(pos)

        # Concatenate into a string; anyone with no mapped position is UT.
        elig = ' '.join(eligibilities).strip()
        if elig == '':
            elig = 'UT'
        return elig

    df_players['elig'] = df_players.apply(lambda row: combine_eligibilities(row), axis=1)

    names = player_names.get_player_names()
    df_players = df_players.merge(right=names[['yahoo_id', 'fg_id']], how='left', on='yahoo_id')

    # LOAD
    bbdb = postgres.connect_to_bbdb()
    df_players.to_sql('player_pool_yahoo', con=bbdb, schema='reference', if_exists='replace', chunksize=1000, method='multi', index=False)
    print('Uploaded Yahoo player pool')

    return df_players
# Example #9 (score: 0)
def scrape_yahoo_roster(league_num='26574'):
    """Scrape every team's starting roster from a Yahoo league, merge FG ids,
    archive a dated CSV, and replace the rosters.legacy table.

    league_num: Yahoo league number (string), default the Legacy league.
    Side effects: CSV file write, DB table replace, and a Google Sheets
    update when any players are missing FG ids.
    """
    print('\n--------------------------\nScraping Yahoo rosters:\n')

    from datetime import date
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import gspread
    import gspread_dataframe as gsdf

    from general import postgres
    from general import selenium_utilities
    from munging import player_names

    league_url = 'https://baseball.fantasysports.yahoo.com/b1/' + league_num + '/startingrosters'
    print('Scraping from '+league_url)
    page = requests.get(league_url)
    bs_rosters = BeautifulSoup(page.text, 'html.parser')
    main_div = bs_rosters.find('div', id='yspmaincontent')
    # Each team's roster sits in its own half-width grid cell.
    tables = main_div.find_all('div', {'class':'Grid-u-1-2 Pend-xl'})

    rosters = []
    for table in tables:
        owner_id = table.find('p').find('a')['href'].split('/')[-1]
        owner = table.find('p').find('a').text
        player_rows = table.find('table').find('tbody').find_all('tr')
        for player_row in player_rows:
            tds = player_row.find_all('td')
            td_pos = tds[0]
            pos = td_pos.text
            td_player = tds[1]
            info_player = td_player.find('div', {'class':'ysf-player-name'})
            if info_player.find('div', {'class':'emptyplayer'}) is not None:
                # BUGFIX: this row previously had only 4 elements
                # ([owner, pos, 'empty', 'empty']) while the DataFrame below
                # expects 5 columns; include owner_id so the row aligns.
                rosters.append([owner_id, owner, pos, 'empty', 'empty'])
            else:
                player = info_player.find('a')
                playerid = str(player['href'].split('/')[-1])
                playername = player.text
                rosters.append([owner_id, owner, pos, playerid, playername])

    rosters = pd.DataFrame(rosters, columns=['owner_id', 'Team', 'pos', 'yahoo_id', 'name'])
    #player_names.put_missing_in_GS(id_list=rosters[rosters['yahoo_id']!='empty'], type='yahoo_id')


    # Merge FG ids via the master name list.
    names = player_names.get_player_names()
    rosters = rosters.merge(
        names[['yahoo_id', 'fg_id', 'name']],
        on='yahoo_id',
        how='left'
    )
    today = date.today().strftime("%Y%m%d")
    rosters['date'] = today
    rosters = rosters[['date', 'owner_id', 'Team', 'pos', 'fg_id', 'yahoo_id']]

    # Report any rostered players we cannot map to an FG id and try to
    # back-fill their ids from Yahoo, then push the updated list to GS.
    missing_fg_id = rosters[rosters['fg_id'].isna()]
    if len(missing_fg_id)>0:
        for player in missing_fg_id.values.tolist():
            if player[3] != 'empty': # Don't flag if it's just an empty position slot
                print('\nMissing info on:')
                print(player)
                yahoo_match = player_names.find_other_ids_w_yahoo(player[5])
        player_names.push_player_names_to_gs()
        print('Updated Google Sheets')



    # Archive a dated CSV snapshot of the rosters.
    today = date.today().strftime("%Y%m%d")
    basename = "/Users/andrewfelton/Documents/bb/bb-2021/data/yahoo/rosters"
    new_file = basename + "_" + today + ".csv"
    rosters.to_csv(new_file)

    bbdb = postgres.connect_to_bbdb()
    rosters.to_sql('legacy', con=bbdb, schema='rosters', if_exists='replace', index=False)
    print('Uploaded to database')
# Example #10 (score: 0)
def rosters(league, upload_to_db=True):
    """Scrape all team rosters from a Fleaflicker league, merge FG ids,
    archive a dated CSV, and (optionally) replace the rosters.sos table.

    league: league object exposing .league_name, .league_platform
        ('fleaflicker'), and .league_num.
    upload_to_db: when True, also write to Postgres and refresh the
        player-name Google Sheet.
    Returns the roster DataFrame (Team, Player, ff_id, fg_id).
    """
    import sys
    import datetime
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from general import classes
    from munging import player_names
    from general import postgres

    print(
        '\n--------------------------\nScraping Fleaflicker rosters for league:'
        + league.league_name + '\n')
    assert (league.league_platform == 'fleaflicker')
    league_num = league.league_num

    roster_url = 'https://www.fleaflicker.com/mlb/leagues/' + league_num + '/teams'
    page = requests.get(roster_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    main_div = soup.find('div', id='body-center-main')
    tables = main_div.find_all('table')

    today = datetime.date.today()
    str_today = str(today)

    teams = []

    # Walk the rows: a league-name span starts a new team; player-text links
    # are the players on the most recently seen team.
    for t in tables:
        trows = t.find_all('tr')
        for tr in trows:
            if (tr.find("span", {"class": "league-name"
                                 })):  # Found the span with the team name
                team_name = tr.find("span", {"class": "league-name"}).text
                teams.append(classes.FantasyTeam(team_name))
                current_team = teams[-1]
            elif tr.find('a', {"class": "player-text"}):
                player_data = tr.find('a', {"class": "player-text"})
                player_name = player_data.text
                player_url = 'https://www.fleaflicker.com' + player_data['href']
                # The FF id is the trailing number of the player URL slug.
                player_ff_id = player_data['href'].split('/')[-1].split(
                    '-')[-1]
                current_team.add_player(player_name, player_ff_id)

    df_export = pd.DataFrame(columns=['Team', 'Player', 'ff_id'])
    for team in teams:
        df_export = pd.concat([df_export, team.to_dataframe()])
    df_export.reset_index(drop=True, inplace=True)

    names = player_names.get_player_names()
    df_export = df_export.merge(right=names[['ff_id', 'fg_id']],
                                how='left',
                                on='ff_id')

    # Go through the Fleaflicker players that don't have matching FG IDs
    missing_fg_id = df_export[df_export['fg_id'].isna()]
    if len(missing_fg_id) > 0:
        # BUGFIX: corrected 'Miising' typo in the message.
        print('Missing fg_id for ' + str(len(missing_fg_id.values)) +
              ' player(s):')
        for player in missing_fg_id.values.tolist():
            print('\nMissing info on:')
            print(player)
            ff_match = player_names.find_other_ids_w_ff(player[2])

    file_rosters = '/Users/andrewfelton/Documents/bb/bb-2021/data/rosters/rosters_' + league_num + '_' + str_today + '.csv'
    df_export.to_csv(file_rosters, index=False)
    print('Saved rosters to ' + file_rosters)

    if upload_to_db:
        bbdb = postgres.connect_to_bbdb()
        df_export.to_sql('sos',
                         con=bbdb,
                         schema='rosters',
                         if_exists='replace',
                         index=False)
        print('Uploaded to database')

        player_names.push_player_names_to_gs()
        print('Updated Google Sheets')

    return df_export
# Example #11 (score: 0)
def scrape_ff_player_pool():
    """Scrape the Fleaflicker player pool (hitters, pitchers, relievers),
    derive a normalized eligibility string, merge FG ids, and replace the
    reference.player_pool_ff table.

    Returns the player-pool DataFrame.
    """
    # This loops through the FF player pages and saves the player name, id, and eligibility to the database
    from bs4 import BeautifulSoup
    import pandas as pd
    import requests
    import time
    import unidecode

    import sys
    sys.path.append('python/munging')
    import player_names
    sys.path.append('python/general')
    import postgres

    # EXTRACT — page through three list views, 20 players at a time.
    pitcher_base_url = 'https://www.fleaflicker.com/mlb/leagues/23172/players?season=2021&statType=1&sortMode=1&position=1536&isFreeAgent=false&tableSortDirection=DESC&tableSortName=pv7&tableOffset='
    hitter_base_url = 'https://www.fleaflicker.com/mlb/leagues/23172/players?season=2021&statType=1&sortMode=1&position=511&isFreeAgent=false&tableSortDirection=DESC&tableSortName=pv7&tableOffset='
    rp_base_url = 'https://www.fleaflicker.com/mlb/leagues/23172/players?season=2021&statType=1&sortMode=1&position=1536&isFreeAgent=false&tableSortName=st25&tableSortDirection=DESC&tableOffset='

    players = []
    for baseurl in [hitter_base_url, pitcher_base_url, rp_base_url]:
        # The reliever view is shallower than the main lists.
        count_top = 601
        if baseurl in [rp_base_url]:
            count_top = 201

        for i in range(0, count_top, 20):
            url = baseurl + str(i)
            page = requests.get(url)
            print('Got ' + url)
            time.sleep(1)  # be polite to Fleaflicker
            soup = BeautifulSoup(page.text, 'html.parser')
            table = soup.find('div', {'id': 'body-center-main'}).find('table')

            # Rows follow the thead as siblings; take at most 20 per page.
            count = 0
            trow = table.find('thead').next_sibling
            while trow is not None and count < 20:
                player_data = trow.find('div', {'class': 'player'})
                player_name = unidecode.unidecode(
                    player_data.find('a', {
                        'class': 'player-text'
                    }).text)
                # The FF id is the trailing number of the player URL slug.
                player_id = player_data.find(
                    'a', {'class': 'player-text'
                          })['href'].split('/')[-1].split('-')[-1]
                player_url = 'https://www.fleaflicker.com' + player_data.find(
                    'a')['href']
                player_elig = player_data.find('span', {
                    'class': 'position'
                }).text
                player_team = player_data.find('span', {
                    'class': 'player-team'
                }).text
                players.append([
                    player_id, player_name, player_url, player_team,
                    player_elig
                ])
                trow = trow.next_sibling
                count = count + 1

    df_players = pd.DataFrame(
        players, columns=['ff_id', 'ff_name', 'ff_url', 'ff_team', 'ff_elig'])
    # Players appear in multiple list views; keep one row each.
    df_players.drop_duplicates(subset=['ff_id'],
                               inplace=True,
                               ignore_index=True)

    # TRANSFORM
    def combine_eligibilities(row):
        """Map FF's slash-separated position list to this league's
        eligibility string (adds MI/CI/IF composites)."""
        ff_elig_list = row['ff_elig'].split('/')
        eligibilities = []

        # Infielders
        for pos in ['C', '1B', '2B', 'SS', '3B']:
            if pos in ff_elig_list:
                eligibilities.append(pos)
        if '2B' in eligibilities or 'SS' in eligibilities:
            eligibilities.append('MI')
        if '1B' in eligibilities or '3B' in eligibilities:
            eligibilities.append('CI')
        if 'MI' in eligibilities or 'CI' in eligibilities:
            eligibilities.append('IF')

        # Outfielders collapse to a single OF eligibility.
        for pos in ['OF', 'RF', 'LF', 'CF']:
            if pos in ff_elig_list and 'OF' not in eligibilities:
                eligibilities.append('OF')

        # Pitchers - just use the same as FF.
        # BUGFIX: was testing `eligibilities`, which can never contain
        # SP/RP/P at this point (only infield/OF entries were appended),
        # so pitchers always fell through to 'UT'; test the source list.
        if 'SP' in ff_elig_list or 'RP' in ff_elig_list or 'P' in ff_elig_list:
            eligibilities = ff_elig_list

        # Concatenate into a string; anyone with no mapped position is UT.
        elig = ' '.join(eligibilities).strip()
        if elig == '':
            elig = 'UT'
        return elig

    df_players['elig'] = df_players.apply(
        lambda row: combine_eligibilities(row), axis=1)

    names = player_names.get_player_names()
    df_players = df_players.merge(right=names[['ff_id', 'fg_id']],
                                  how='left',
                                  on='ff_id')

    # LOAD
    bbdb = postgres.connect_to_bbdb()
    df_players.to_sql('player_pool_ff',
                      con=bbdb,
                      schema='reference',
                      if_exists='replace',
                      chunksize=1000,
                      method='multi',
                      index=False)
    print('Uploaded FleaFlicker player pool')

    return df_players
# Example #12 (score: 0)
def find_other_ids_w_yahoo(yahoo_id):
    """Interactively link a Yahoo player id to a FanGraphs id.

    Looks ``yahoo_id`` up in reference.player_pool_yahoo, then tries an exact
    name match against reference.player_names (merging the yahoo_id into the
    existing row) or, failing that, against the raw FG projection tables
    (appending a brand-new name row).  Prompts the user before any write and
    pushes the updated name table to Google Sheets afterwards.

    Returns False when the yahoo_id is not in the Yahoo player pool;
    otherwise returns None.
    """
    #yahoo_id = 11702
    import sys
    sys.path.append('python/general')
    import postgres
    sys.path.append('python/munging')
    import player_names
    import pandas as pd

    bbdb = postgres.connect_to_bbdb()
    # NOTE(review): SQL is built by string concatenation.  yahoo_id is
    # coerced with str() so the injection surface is small, but a
    # parameterized query would be safer if callers ever pass untrusted
    # input.
    yahoo_sql = 'SELECT yahoo_id, yahoo_name, yahoo_team, yahoo_elig, fg_id FROM REFERENCE.player_pool_yahoo WHERE yahoo_id=\'' + str(
        yahoo_id) + '\''
    yahoo_info = pd.read_sql_query(yahoo_sql, con=bbdb)

    if len(yahoo_info) == 0:
        print(
            'This yahoo_id is not in the Yahoo player pool.  Please rerun the player pool generator'
        )
        return False
    else:
        yahoo_name = yahoo_info['yahoo_name'].to_list()[0]
        print('Here is the Yahoo player pool info available on ' + yahoo_name +
              ':')
        print(yahoo_info)

    names = player_names.get_player_names()

    # If it's already in the list of player names:
    if yahoo_name in names['name'].to_list():
        matches = names[names['name'] == yahoo_name]
        if len(matches) == 1:
            print('Found a match!')
            print('FG info:')
            fg_id = matches['fg_id'].to_list()[0]
            fg_info = get_fg_info(fg_id)
            print(fg_info)
            perform_merge = input(
                'Do you want to merge in the Yahoo ID into the existing match?'
            )
            # BUG FIX: previously any non-empty answer (including 'n') was
            # truthy and triggered the merge; require an explicit 'Y' for
            # consistency with the append prompt below.
            if perform_merge == 'Y':
                sql_update = 'UPDATE reference.player_names SET yahoo_id = \''+\
                    str(yahoo_id)+\
                    '\' WHERE fg_id=\''+\
                    str(fg_id)+'\''
                print(sql_update)
                bbdb.execute(sql_update)
                player_names.push_player_names_to_gs()
            else:
                print('OK, won\'t update')
        elif len(matches) > 1:
            print(
                'There is more than one match.  Please manually update.  List of matches:'
            )
            print(matches)
    else:
        # If it's not already in the list of player names, see if there is a match in the raw FG data
        yahoo_sql = \
            'SELECT name, fg_id FROM '+\
            '(SELECT "Name" as name, playerid as fg_id from proj.fg_dc_batters_raw '+\
            'UNION '+\
            'SELECT "Name" as name, playerid as fg_id from proj.fg_dc_pitchers_raw '+\
            ') fg_raw_proj_union '+\
            'WHERE fg_id NOT IN (SELECT fg_id FROM reference.player_names) ORDER BY name'
        yahoo_info = pd.read_sql_query(yahoo_sql, con=bbdb)
        if yahoo_name in yahoo_info['name'].to_list():
            matches = yahoo_info[yahoo_info['name'] == yahoo_name]
            if len(matches) == 1:
                print('Found a match!')
                print('FG info:')
                fg_id = matches['fg_id'].to_list()[0]
                fg_info = get_fg_info(fg_id)
                print(fg_info)
                perform_append = input(
                    'Do you want to append this to the list of player names?')
                if perform_append == 'Y':
                    sql_append_new_name = \
                        'INSERT INTO reference.player_names (name, fg_id, yahoo_id) '+\
                        'VALUES ('+\
                        '\'' + fg_info[1] + '\', \'' + str(fg_info[0]) + '\', \'' + str(yahoo_id) + '\'' +\
                        ')'
                    print(sql_append_new_name)
                    bbdb.execute(sql_append_new_name)
                    player_names.push_player_names_to_gs()
                else:
                    print('OK, won\'t update')
            else:
                # BUG FIX: this message was a bare string expression (a
                # no-op); it was clearly meant to be printed.
                print('Cannot find an exact name match in the FG projections')
# Example #13
# 0
def scrape_pod():
    """Load POD projections from the local Excel workbook and replace the
    raw tables in Postgres (proj.pod_batters_raw / proj.pod_pitchers_raw).

    Reads the 'Hitter Projections' and 'Pitcher Projections' sheets,
    normalizes the identifier columns to strings, flags any fg_id missing
    from the names table via put_missing_in_GS, then truncates and reloads
    each raw table.
    """
    import pandas as pd
    from general import postgres
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()

    # -------------------------------
    # EXTRACT
    # -------------------------------
    pod_file_path = '/Users/andrewfelton/Documents/bb/2021/data/pod/pod_projections.xlsx'
    pod_hitters = pd.read_excel(
        pod_file_path,
        sheet_name='Hitter Projections',
        names=[
            'Name', 'fg_id', 'Age', 'Lg', 'TM', 'Position', 'Lineup Spot', 'G',
            'AB', 'PA', 'HITS', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB',
            'IBB', 'SO', 'HBP', 'SF', 'SB', 'CS', 'AVG', 'OBP', 'SLG', 'OPS',
            'ISO', 'wOBA', 'BB_PCT', 'K_PCT', 'BABIP', 'GB_PCT', 'LD_PCT',
            'FB_PCT', 'HR_FB'
        ])

    # -------------------------------
    # TRANSFORM
    # -------------------------------
    # Identifier/descriptor columns must be strings for downstream joins
    for c in ['Name', 'fg_id', 'Age', 'Lg', 'TM', 'Position', 'Lineup Spot']:
        pod_hitters[c] = pod_hitters[c].astype(str)

    # Check to confirm that all the fg_id are in the names table
    put_missing_in_GS(pod_hitters[['fg_id']], 'fg_id')

    # -------------------------------
    # LOAD
    # -------------------------------

    command = 'TRUNCATE TABLE proj.pod_batters_raw'
    bbdb.execute(command)
    pod_hitters.to_sql('pod_batters_raw',
                       bbdb,
                       schema='proj',
                       if_exists='append')
    print('Uploaded pod_batters to the database')

    # FIX: the sheet was previously read twice back-to-back (the first
    # result was discarded); read it once with the explicit column names.
    pod_pitchers = pd.read_excel(
        pod_file_path,
        sheet_name='Pitcher Projections',
        names=[
            'Name', 'fg_id', 'Age', 'Lg', 'TM', 'Role', 'GS', 'IP', 'IP S',
            'W', 'L', 'QS', 'SV', 'ERA', 'H', 'R', 'ER', 'HR', 'SO', 'BB',
            'HBP', 'K_9', 'BB_9', 'HR_9', 'K_PCT', 'BB_PCT', 'BAA', 'WHIP',
            'BABIP', 'LOB_PCT', 'GB_PCT', 'LD_PCT', 'FB_PCT', 'HR_FB'
        ])
    for c in ['Name', 'fg_id']:
        pod_pitchers[c] = pod_pitchers[c].astype(str)

    # Check to confirm that all the fg_id are in the names table
    put_missing_in_GS(pod_pitchers[['fg_id']], 'fg_id')

    command = 'TRUNCATE TABLE proj.pod_pitchers_raw'
    bbdb.execute(command)
    pod_pitchers.to_sql('pod_pitchers_raw',
                        bbdb,
                        schema='proj',
                        if_exists='append')
    print('Uploaded pod_pitchers to the database')
def create_combined_pitchers(ls):
    """Build a weighted blend of pitcher projections (razz + fg_dc).

    Parameters
    ----------
    ls : league-settings object exposing ``pitching_counting_stats`` and
        ``pitching_stats`` (lists of stat column names).

    Returns
    -------
    pandas.DataFrame with fg_id, name, team, ip and the league's pitching
    stats, one row per pitcher.
    """
    import pandas as pd
    from munging import player_names
    from general import postgres, utilities

    bbdb = postgres.connect_to_bbdb()

    query = (
        'SELECT \'razz\' as source, fg_id, ip, qs, era, whip, k as so, sv, hld '
        + 'FROM proj.razz_pitchers')
    df_razz = pd.read_sql_query(query, bbdb)
    df_razz['svhld'] = (df_razz['sv'] + df_razz['hld'])

    query = (
        'SELECT \'fg_dc\' as source, fg_id, ip, qs, era, whip, so, sv, hld ' +
        'FROM proj.fg_dc_pitchers ')
    df_fg_dc = pd.read_sql_query(query, bbdb)
    df_fg_dc['qs'] = df_fg_dc['qs'].replace({0: None})
    # BUG FIX: svhld was computed from df_razz's columns (copy/paste slip),
    # silently misaligning the fg_dc rows.  Use df_fg_dc's own sv/hld.
    df_fg_dc['svhld'] = (df_fg_dc['sv'] + df_fg_dc['hld'])
    df = pd.concat([df_razz, df_fg_dc])

    df_ip = df[['source', 'fg_id', 'ip']]

    query_teams = 'SELECT playerid as fg_id, fg_dc_pitchers_raw."Team" as team FROM proj.fg_dc_pitchers_raw'
    df_teams = pd.read_sql_query(query_teams, bbdb)

    # Treat missing counting stats as zero so weighted sums don't drop rows.
    # (Vectorized fillna replaces the previous row-wise apply.)
    for var in ls.pitching_counting_stats:
        df[var] = df[var].fillna(0)

    # Per-system blend weights for the rate/counting stats
    weights = {
        'system': ['fg_dc', 'thebat', 'thebatx', 'pod', 'razz'],
        'sys_weight': [1, 1, 1.2, 0, .01]
    }
    weights = pd.DataFrame(weights)
    df = df.merge(right=weights,
                  how='left',
                  left_on='source',
                  right_on='system')

    # Separate weights intended for innings-pitched
    weights_ip = {
        'system': ['fg_dc', 'thebat', 'thebatx', 'pod', 'razz'],
        'sys_weight': [.25, 0, 0, 0, .01]
    }
    weights_ip = pd.DataFrame(weights_ip)
    df_ip = df_ip.merge(right=weights_ip,
                        how='left',
                        left_on='source',
                        right_on='system')
    # NOTE(review): df_ip's weights are never used below — the loop blends
    # 'ip' from df with the general weights; df_ip only supplies the fg_id
    # universe.  Confirm whether 'ip' was meant to use weights_ip.

    def weighted_average(df, data_col, weight_col, by_col):
        """Weighted mean of data_col grouped by by_col, ignoring nulls."""
        df['_data_times_weight'] = df[data_col] * df[weight_col]
        df['_weight_where_notnull'] = df[weight_col] * pd.notnull(df[data_col])
        g = df.groupby(by_col)
        result = g['_data_times_weight'].sum(
        ) / g['_weight_where_notnull'].sum()
        del df['_data_times_weight'], df['_weight_where_notnull']
        result = pd.DataFrame(result, columns=[data_col])
        return result

    # fg_dc quality starts are unreliable; exclude them from the blend
    df.loc[df['source'] == 'fg_dc', 'qs'] = None

    combined_pitchers = pd.DataFrame(df_ip['fg_id'].unique(),
                                     columns=['fg_id'])
    statlist = list(set(utilities.flatten([['ip'], ls.pitching_stats])))
    for stat in statlist:  # do this list(set(*)) to get unique values b/c ip may be in there twice
        t = weighted_average(df, stat, 'sys_weight', 'fg_id')
        combined_pitchers = combined_pitchers.merge(t, on='fg_id')

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_pitchers = combined_pitchers.merge(names[['fg_id', 'name']],
                                                on='fg_id',
                                                how='left')
    combined_pitchers = combined_pitchers.merge(df_teams,
                                                on='fg_id',
                                                how='left')
    output_stats = ['fg_id', 'name', 'team', 'ip']
    for stat in ls.pitching_stats:
        if (stat in output_stats) is False:
            output_stats.append(stat)
    combined_pitchers = combined_pitchers[output_stats]

    return combined_pitchers
def create_combined_hitters(ls, pa=0):
    """Build a weighted blend of hitter projections (fg_dc/thebat/thebatx/pod).

    Counting stats are blended on a per-PA basis and then scaled back up by
    the blended (fg_dc-weighted) plate appearances; rate stats are blended
    directly.

    Parameters
    ----------
    ls : league-settings object exposing ``hitting_counting_stats``,
        ``hitting_rate_stats`` and ``hitting_stats``.
    pa : int, optional
        When > 0, overrides every player's projected PA with this value
        (useful for rate-equalized comparisons).

    Returns
    -------
    pandas.DataFrame with fg_id, name, team, pa and the league's hitting
    stats, one row per hitter.
    """
    import pandas as pd
    from general import utilities
    from general import postgres
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()
    query = (
        'SELECT proj.* FROM (' +
        'SELECT \'fg_dc\' as source, fg_id, pa, hr_pa, r_pa, rbi_pa, sb_pa, obp, ops '
        + 'FROM proj.fg_dc_batters ' + 'UNION ' +
        'SELECT \'thebat\' as source, fg_id, pa, hr_pa, r_pa, rbi_pa, sb_pa, obp, ops '
        + 'FROM proj.thebat_batters ' + 'UNION ' +
        'SELECT \'thebatx\' as source, fg_id, pa, hr_pa, r_pa, rbi_pa, sb_pa, obp, ops '
        + 'FROM proj.thebatx_batters ' + 'UNION ' +
        'SELECT \'pod\' as source, fg_id, pa, hr_pa, r_pa, rbi_pa, sb_pa, obp, ops '
        + 'FROM proj.pod_batters ' + ') AS proj')
    df = pd.read_sql_query(query, bbdb)

    # PA universe comes from fg_dc only (pod deliberately commented out)
    query_pa = (
        'SELECT proj.* FROM (' + 'SELECT \'fg_dc\' as source, fg_id, pa ' +
        'FROM proj.fg_dc_batters ' +
        #'UNION ' +
        #'SELECT \'pod\' as source, fg_id, pa ' +
        #'FROM proj.pod_batters ' +
        ') AS proj')
    df_pa = pd.read_sql_query(query_pa, bbdb)
    # Manual id remap: FG minor-league placeholder id -> real fg_id
    df_pa.loc[df_pa['fg_id'] == 'sa3011918', 'fg_id'] = '27506'

    query_teams = 'SELECT playerid as fg_id, fg_dc_batters_raw."Team" as team FROM proj.fg_dc_batters_raw'
    df_teams = pd.read_sql_query(query_teams, bbdb)
    df_teams.loc[df_teams['fg_id'] == 'sa3011918', 'fg_id'] = '27506'

    # Per-system blend weights for the stat columns
    weights = {
        'system': ['fg_dc', 'thebat', 'thebatx', 'pod'],
        'sys_weight': [1, 1, 1.2, .6]
    }
    weights = pd.DataFrame(weights)
    df = df.merge(right=weights,
                  how='left',
                  left_on='source',
                  right_on='system')

    # PA itself is taken entirely from fg_dc
    weights_pa = {'system': ['fg_dc', 'pod'], 'sys_weight': [1, 0]}
    weights_pa = pd.DataFrame(weights_pa)
    df_pa = df_pa.merge(right=weights_pa,
                        how='left',
                        left_on='source',
                        right_on='system')

    def weighted_average(df, data_col, weight_col, by_col):
        """Weighted mean of data_col grouped by by_col, ignoring nulls."""
        df['_data_times_weight'] = df[data_col] * df[weight_col]
        df['_weight_where_notnull'] = df[weight_col] * pd.notnull(df[data_col])
        g = df.groupby(by_col)
        result = g['_data_times_weight'].sum(
        ) / g['_weight_where_notnull'].sum()
        del df['_data_times_weight'], df['_weight_where_notnull']
        result = pd.DataFrame(result, columns=[data_col])
        return result

    combined_hitters = pd.DataFrame(df_pa['fg_id'].unique(), columns=['fg_id'])
    for stat in ['pa']:
        t = weighted_average(df_pa, stat, 'sys_weight', 'fg_id')
        combined_hitters = combined_hitters.merge(t, on='fg_id')
        if (pa > 0):
            combined_hitters['pa'] = pa

    # Counting stats are stored per-PA in the source tables
    stats_pa = []
    for stat in ls.hitting_counting_stats:
        stats_pa.append(stat + '_pa')

    for stat in utilities.flatten([stats_pa, ls.hitting_rate_stats]):
        t = weighted_average(df, stat, 'sys_weight', 'fg_id')
        combined_hitters = combined_hitters.merge(t, on='fg_id')

    # Scale the per-PA blends back up to full-season counting stats
    for stat in ls.hitting_counting_stats:
        stat_pa = stat + '_pa'
        combined_hitters[
            stat] = combined_hitters[stat_pa] * combined_hitters['pa']
        combined_hitters = combined_hitters.drop(columns=[stat_pa])

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_hitters = combined_hitters.merge(names[['fg_id', 'name']],
                                              on='fg_id',
                                              how='left')
    combined_hitters = combined_hitters.merge(df_teams, on='fg_id', how='left')
    # BUG FIX: ls.hitting_stats was wrapped in an extra list, which (with a
    # single-level flatten, as used in create_combined_pitchers) left a
    # nested list inside the column selector.  Pass the list directly.
    output_stats = utilities.flatten([['fg_id', 'name', 'team', 'pa'],
                                      ls.hitting_stats])
    combined_hitters = combined_hitters[output_stats]

    return combined_hitters