def populate_yahoo():
    """Interactively backfill missing yahoo_ids in reference.player_names.

    Finds rows with an empty yahoo_id, looks for an exact name match in the
    scraped Yahoo player pool, shows both sides to the operator, and — on a
    'Y' confirmation — writes the matched yahoo_id back to the database and
    republishes the names sheet.

    NOTE(review): relies on module-level `pd`, `postgres`, and `player_names`
    bindings (no local imports here, unlike sibling functions) — confirm the
    top-of-file imports.
    """
    bbdb = postgres.connect_to_bbdb()
    # Players that still lack a Yahoo ID (stored as empty string, not NULL).
    sql = 'SELECT fg_id, name FROM reference.player_names WHERE yahoo_id = \'\''
    missing_yahoo = pd.read_sql_query(sql, con=bbdb)
    # Full Yahoo pool to match against.
    sql = 'SELECT yahoo_id, yahoo_name FROM reference.player_pool_yahoo'
    yahoo_names = pd.read_sql_query(sql, con=bbdb)
    #yahoo_search = missing_yahoo.loc[1,:].to_list()
    for i in range(len(missing_yahoo)):
        yahoo_search = missing_yahoo.loc[i, :].to_list()
        fg_id = yahoo_search[0]
        fg_name = yahoo_search[1]
        print('Looking for ' + fg_name)
        # Exact, case-sensitive name match only; first hit wins below.
        if fg_name in yahoo_names['yahoo_name'].to_list():
            print('Think we found a match for ' + fg_name)
            yahoo_id = yahoo_names[yahoo_names['yahoo_name'] ==
                                   fg_name]['yahoo_id'].values[0]
            print('FG info:')
            print(player_names.get_fg_info(fg_id))
            print('Yahoo info:')
            print(player_names.get_yahoo_info(yahoo_id))
            update = input('Go ahead and update player names (Y/N): ')
            if update == 'Y':
                # SQL built by concatenation; both ids come from our own DB.
                sql_update = 'UPDATE reference.player_names SET yahoo_id = \''+\
                    str(yahoo_id)+\
                    '\' WHERE fg_id=\''+\
                    str(fg_id)+'\''
                print(sql_update)
                bbdb.execute(sql_update)
                player_names.push_player_names_to_gs()
def get_legacy_ownership():
    """Return the full Legacy-league roster table as a DataFrame."""
    import pandas as pd
    from general import postgres

    connection = postgres.connect_to_bbdb()
    return pd.read_sql('SELECT * FROM rosters.legacy', connection)
def update_relievers_last14():
    """Refresh the 'Relievers - Last 14' tab of the 'BB 2021 InSeason' sheet.

    Pulls 14-day reliever stats joined with FF eligibility and both leagues'
    ownership, ordered by WPA, then clears and rewrites the worksheet.
    """
    import pandas as pd
    import gspread
    import gspread_dataframe as gsdf
    from general import postgres
    from general import gs
    bbdb = postgres.connect_to_bbdb()
    # Eligibility filter keeps pitchers (or players with no FF eligibility row).
    relievers_last14 = pd.read_sql(''' SELECT r.name, r.team, ff_elig.ff_elig, r.g, r.ip, r.sv, r.hld, r.gmli, r.wpa, r.era, r.kwera, r.xfip, r.siera, r.xera, r.csw_pct, r.k_pct, r.bb_pct, r.swstr_pct, r.vfa, r.babip, r.lob_pct, r.hr_fb, r.asof_date, r.fg_id, rosters_sos."Team" as sos_team, rosters_legacy."Team" as legacy_team FROM tracking.relievers_last14 r LEFT JOIN reference.player_pool_ff ff_elig ON r.fg_id=ff_elig.fg_id LEFT JOIN rosters.sos rosters_sos ON r.fg_id=rosters_sos.fg_id LEFT JOIN rosters.legacy rosters_legacy ON r.fg_id=rosters_legacy.fg_id WHERE ff_elig IN ('P', 'RP', 'SP') OR ff_elig IS NULL ORDER BY r.wpa DESC ''', con=bbdb)
    gc = gspread.service_account(filename='../bb-2021-2b810d2e3d25.json')
    bb2021 = gc.open("BB 2021 InSeason")
    sheettitle = "Relievers - Last 14"
    # Clear first so stale rows from a longer previous upload don't linger.
    bb2021.values_clear(sheettitle + "!A:Z")
    gsdf.set_with_dataframe(bb2021.worksheet(sheettitle), relievers_last14)
    gs.format_gsheet(bb2021.worksheet(sheettitle))
def scrape_savant():
    """Download the Baseball Savant expected-stats leaderboard CSV, archive
    it by date, merge in fg_ids, and replace tracking.savant.

    Side effects: moves the browser download into the project data dir,
    refreshes a 'latest' symlink, and rewrites the tracking.savant table.
    """
    from datetime import date
    import os
    import pandas as pd
    from general import selenium_utilities
    from general import postgres
    from munging import player_names
    driver = selenium_utilities.start_driver()
    # Custom leaderboard URL: 2021 batters, min 10 PA, selected xstats columns.
    draft_url = "https://baseballsavant.mlb.com/leaderboard/custom?year=2021&type=batter&filter=&sort=4&sortDir=desc&min=10&selections=b_total_pa,xba,xslg,woba,xwoba,xobp,xiso,wobacon,xwobacon,exit_velocity_avg,launch_angle_avg,barrel_batted_rate,hard_hit_percent,sprint_speed,&chart=false&x=xba&y=xba&r=no&chartType=beeswarm"
    driver.get(draft_url)
    print('Arrived at ' + driver.current_url)
    # NOTE(review): find_element_by_id is the legacy Selenium 3 API.
    input_dl = driver.find_element_by_id('btnCSV')
    input_dl.click()
    basepath = "/Users/andrewfelton/Documents/bb/bb-2021"
    dl_file = "/Users/andrewfelton/Downloads/docker/stats.csv"
    today = date.today().strftime("%Y%m%d")
    new_file = basepath + "/data/savant/hitter_stats_" + today + ".csv"
    # Archive today's download; shell `mv` via os.popen (assumes the download
    # finished -- no wait/poll here; TODO confirm timing is reliable).
    stream_command = os.popen('mv ' + dl_file + ' ' + new_file)
    mv_file = stream_command.read()
    # create the soft link
    ln_file = basepath + "/data/savant/hitter_stats.csv"
    command_ln = os.popen('ln -sf ' + new_file + ' ' + ln_file)
    driver.close()
    print("Finished scraping " + ln_file)
    savant = pd.read_csv(ln_file)
    savant.insert(0, 'asof_date', date.today().strftime('%Y-%m-%d'))
    # Merge in the player names and FG IDs
    savant.rename(columns={'player_id': 'mlb_id'}, inplace=True)
    savant['mlb_id'] = savant['mlb_id'].apply(str)
    names = player_names.get_player_names()
    savant = savant.merge(right=names[['mlb_id', 'fg_id']],
                          how='left',
                          on='mlb_id')
    #fg_ids = savant[['fg_id']].astype(str).values
    #put_missing_in_GS(id_list=pd.DataFrame(fg_ids, columns=['fg_id']), type='fg_id')
    savant = savant[[
        'asof_date', 'fg_id', 'b_total_pa', 'xba', 'xslg', 'woba', 'xwoba',
        'xobp', 'xiso', 'wobacon', 'xwobacon', 'exit_velocity_avg',
        'launch_angle_avg', 'barrel_batted_rate', 'hard_hit_percent',
        'sprint_speed'
    ]]
    schema = 'tracking'
    tablename = 'savant'
    bbdb = postgres.connect_to_bbdb()
    savant.to_sql(tablename, bbdb, schema=schema, if_exists='replace')
def scrape_daily_roster(scoring_period, team_id):
    """Fetch one team's roster for one scoring period from the Fleaflicker
    API and store it in sos_2021.rosters.

    Existing rows for the same (scoring_period, team_id) are deleted first,
    so re-running is idempotent.
    """
    import requests
    import pandas as pd
    import datetime
    import json
    from general import postgres
    bbdb = postgres.connect_to_bbdb()
    # Clear any earlier upload for this period/team (values are internal ints).
    sql_delete_rosters_this_period = ''' DELETE FROM sos_2021.rosters WHERE scoring_period = '{}' AND team_id = '{}' '''.format(scoring_period, team_id)
    bbdb.execute(sql_delete_rosters_this_period)
    league_id = '23172'
    query = {
        'sport': 'MLB',
        'league_id': league_id,
        'team_id': team_id,
        'season': '2021',
        'scoring_period': scoring_period
    }
    response = requests.get('https://www.fleaflicker.com/api/FetchRoster',
                            params=query)
    response_json = response.json()
    roster = []
    for group in response_json['groups']:
        for slot in group['slots']:
            player = [slot['position']['label']]  # The position
            # Need to check if there is a player or if the slot is empty
            if 'leaguePlayer' in slot.keys():
                player.append(str(slot['leaguePlayer']['proPlayer']['id']))
                player.append(slot['leaguePlayer']['proPlayer']['nameFull'])
            else:
                # Empty slots are stored with literal 'NULL' strings.
                player.append('NULL')
                player.append('NULL')
            #print(player)
            roster.append(player)
    #print(roster)
    roster = pd.DataFrame(roster, columns=['position', 'ff_id', 'ff_name'])
    roster['league_id'] = query['league_id']
    roster['team_id'] = query['team_id']
    roster['scoring_period'] = query['scoring_period']
    # The API reports the lineup-period start in epoch milliseconds.
    epochtime = int(response_json['lineupPeriod']['low']['startEpochMilli'])
    scoring_date = datetime.datetime.fromtimestamp(epochtime / 1000.0).date()
    roster['scoring_date'] = scoring_date
    roster.to_sql(name='rosters', con=bbdb, schema='sos_2021',
                  if_exists='append')
    print('Uploaded roster for team_id ' + str(query['team_id']) +
          ' for date ' + str(scoring_date))
def get_eligibilities(league):
    """Return fg_id -> position-eligibility rows from reference.player_pool_ff.

    The `league` argument is accepted for interface compatibility; the query
    does not vary by league.
    """
    import sys
    sys.path.append('python/general')
    from general import postgres
    import pandas as pd

    connection = postgres.connect_to_bbdb()
    eligibilities = pd.read_sql_query(
        'SELECT fg_id, elig FROM reference.player_pool_ff ', connection)
    return eligibilities
def get_player_names():
    """Load reference.player_names with id columns normalized to strings.

    IDs frequently round-trip through floats (e.g. '12345.0'); each id column
    is cast to str and any trailing '.0' is stripped.

    Returns:
        pandas.DataFrame: one row per player; id columns as plain strings.
    """
    import pandas as pd
    from general import postgres
    bbdb = postgres.connect_to_bbdb()
    names = pd.read_sql_query(sql='SELECT * FROM REFERENCE.PLAYER_NAMES',
                              con=bbdb)
    for col in ['otto_id', 'yahoo_id', 'bp_id', 'espn_id', 'ff_id', 'fg_id']:
        # BUGFIX: anchor the pattern so only a *trailing* '.0' is removed;
        # the unanchored '\.0' would also delete '.0' occurring mid-string.
        names[col] = names[col].astype('str').replace(r'\.0$', '', regex=True)
    return names
def create_combined_valuations(league):
    """Build combined hitter/pitcher valuations for `league` and publish them
    to the 'BB 2021 InSeason' sheet (projection tabs plus 'Combined Z').

    Args:
        league: league-settings object; only 'SoS' and 'Legacy' supported.
    """
    import pandas as pd
    import gspread
    import gspread_dataframe as gsdf
    import gspread_formatting as gsfmt
    from general import postgres
    from munging import player_names
    from general import gs
    assert league.league_name in ['SoS', 'Legacy']
    bbdb = postgres.connect_to_bbdb()
    names = player_names.get_player_names()  # NOTE(review): unused here
    gc = gspread.service_account(filename='./bb-2021-2b810d2e3d25.json')
    bb2021 = gc.open("BB 2021 InSeason")
    combined_hitters = create_combined_hitter_valuations(league)
    combined_pitchers = create_combined_pitcher_valuations(league)
    hitter_projections = bb2021.worksheet('Hitter Projections - ' +
                                          league.league_name)
    bb2021.values_clear(hitter_projections.title + "!A:Z")
    gsdf.set_with_dataframe(hitter_projections, combined_hitters)
    # BUGFIX: removed bare `hitter_projections.update` -- an attribute access
    # that was never called, hence a no-op (set_with_dataframe already writes).
    # NOTE(review): `format_gs` is not imported anywhere in this function and
    # the kwargs differ between the two calls -- confirm against the `gs`
    # helper used elsewhere in this file.
    format_gs.format_gs_all(league=league, ls=league, type='hitting')
    pitcher_projections = bb2021.worksheet('Pitcher Projections - ' +
                                           league.league_name)
    bb2021.values_clear(pitcher_projections.title + "!A:Z")
    gsdf.set_with_dataframe(pitcher_projections, combined_pitchers)
    # BUGFIX: removed bare `pitcher_projections.update` no-op (as above).
    format_gs.format_gs_all(league=league.league_name, ls=league,
                            type='pitching')
    combined = pd.concat([
        combined_hitters[['name', 'fg_id', 'type', 'zar', 'value']],
        combined_pitchers[['name', 'fg_id', 'type', 'zar', 'value']]
    ])
    combined = combined.sort_values(by='value', ascending=False)
    gs_combined = bb2021.worksheet('Combined Z')
    gsdf.set_with_dataframe(gs_combined, combined)
    # BUGFIX: removed bare `gs_combined.update` no-op (as above).
    gsfmt.format_cell_range(
        gs_combined, 'D:E',
        gsfmt.CellFormat(
            numberFormat=gsfmt.NumberFormat(type='NUMBER', pattern='0.0')))
def append_new_fg_to_names():
    """List FG depth-chart batters whose fg_id is missing from
    reference.player_names, ordered by projected PA.

    NOTE(review): this function only loads the candidates and prints a
    prompt; the actual match/insert step appears unfinished.
    """
    import sys
    sys.path.append('python/general')
    import postgres
    sys.path.append('python/munging')
    import player_names
    import pandas as pd
    bbdb = postgres.connect_to_bbdb()
    # 'playerrid' is a (misspelled) alias inside the subquery; harmless since
    # the alias is never referenced.
    ff_sql = 'SELECT name, playerid as fg_id, pa FROM proj.fg_dc_batters_raw WHERE playerid NOT IN (SELECT fg_id AS playerrid FROM reference.player_names) ORDER BY pa DESC'
    ff_info = pd.read_sql_query(ff_sql, con=bbdb)
    print('Find matches for this player:')
def inseason_standings_sos():
    """Refresh the 'Standings' tab of the 'BB 2021 InSeason' sheet from
    tracking.standings_sos.

    NOTE(review): depends on module-level `gspread`, `postgres`, `pd`, and
    `gsdf` bindings (this function has no local imports, unlike its
    siblings) -- confirm the top-of-file imports.
    """
    gc = gspread.service_account(filename='./bb-2021-2b810d2e3d25.json')
    bb2021 = gc.open("BB 2021 InSeason")
    bbdb = postgres.connect_to_bbdb()
    # Update standings
    ff_standings = pd.read_sql_query('SELECT * FROM tracking.standings_sos',
                                     con=bbdb,
                                     parse_dates=['date'])
    sheettitle = "Standings"
    # Clear first so stale rows don't survive a shorter upload.
    bb2021.values_clear(sheettitle + "!A:Z")
    gsdf.set_with_dataframe(bb2021.worksheet(sheettitle), ff_standings)
def push_player_names_to_gs():
    """Mirror reference.player_names into the 'Player Names' tab of the
    'BB 2021 Name Matching' sheet and refresh its basic filter."""
    import pandas as pd
    import gspread
    import gspread_dataframe as gsdf
    from general import postgres

    connection = postgres.connect_to_bbdb()
    all_names = pd.read_sql(
        'SELECT * FROM reference.player_names ORDER BY name', con=connection)

    client = gspread.service_account(filename='./bb-2021-2b810d2e3d25.json')
    worksheet = client.open("BB 2021 Name Matching").worksheet('Player Names')
    gsdf.set_with_dataframe(worksheet, all_names)
    # Re-create the filter so newly appended rows are covered by it.
    worksheet.clear_basic_filter()
    worksheet.set_basic_filter()
def create_actuals_hitters(ls, year=2021):
    """Pull hitter actuals from the bbref batting tables, clean the stat
    columns, merge in player names, and return the league's hitting stats.

    Args:
        ls: league-settings object providing `hitting_stats`.
        year: season to pull; 2021 reads the live `tracking` schema,
            other years read `reference`.

    Returns:
        pandas.DataFrame with fg_id/bbref_id/name/team/pa plus ls.hitting_stats.
    """
    import pandas as pd
    from general import utilities
    from general import postgres
    from general import classes
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()

    # Current season lives in `tracking`; history lives in `reference`.
    # (The original local was named `tablename` but it is really a schema.)
    schema = 'tracking' if year == 2021 else 'reference'
    query = (
        'SELECT year, bbref_id, bat."Tm" as team, bat."PA" as pa, '
        'bat."HR" as hr, bat."R" as r, bat."RBI" as rbi, bat."SB" as sb, bat."OBP" as obp, bat."OPS" as ops '
        'FROM ' + schema + '.bbref_batting_standard bat WHERE year=' + str(year))
    df = pd.read_sql_query(query, bbdb)

    # Missing or blank stats count as zero before the numeric casts.
    df = df.fillna(value={'obp': 0, 'ops': 0, 'pa': 0, 'r': 0, 'rbi': 0, 'sb': 0})
    int_columns = ['pa', 'r', 'rbi', 'hr', 'sb']
    float_columns = ['obp', 'ops']
    for column in int_columns + float_columns:
        df[column] = df[column].replace(r'^\s*$', 0, regex=True)
    for column in int_columns:
        df[column] = df[column].astype(int)
    for column in float_columns:
        df[column] = df[column].astype(float)

    # Drop rows with no bbref id -- they cannot be matched to a player.
    df = df[(df['bbref_id'].notnull()) & (df['bbref_id'] != u'')]

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_hitters = df.merge(names[['bbref_id', 'fg_id', 'name']],
                                on='bbref_id', how='left')
    wanted_columns = utilities.flatten(
        [['fg_id', 'bbref_id', 'name', 'team', 'pa'], [ls.hitting_stats]])
    combined_hitters = combined_hitters[wanted_columns]
    combined_hitters.drop_duplicates(inplace=True)
    return combined_hitters
def create_actuals_pitchers(ls, year=2021):
    """Pull pitcher actuals (standard + starter + reliever splits) from the
    bbref tracking tables, clean the stat columns, merge names, and return
    the league's pitching stats.

    Args:
        ls: league-settings object providing `pitching_stats`.
        year: season filter applied to all three bbref tables.
    """
    import pandas as pd
    from general import utilities
    from general import postgres
    from munging import player_names
    bbdb = postgres.connect_to_bbdb()
    # Join the starter (GS/QS) and reliever (SV/Hold) tables onto the
    # standard table by (bbref_id, year, team).
    query = (
        'SELECT pit_std.year, pit_std.bbref_id, pit_std."Tm" as team, pit_std."IP" as ip, pit_start."GS" as gs, pit_start."QS" as qs, pit_std."SO" as so, pit_std."ERA" as era, pit_std."WHIP" as whip, pit_relief."SV" as sv, pit_relief."Hold" as hld FROM '
        + '(SELECT * FROM tracking.bbref_pitching_standard) as pit_std ' +
        'LEFT JOIN (SELECT * FROM tracking.bbref_pitching_starter) as pit_start ON pit_std.bbref_id=pit_start.bbref_id AND pit_std.year=pit_start.year AND pit_std."Tm"=pit_start."Tm" '
        + 'LEFT JOIN (SELECT * FROM tracking.bbref_pitching_reliever) as pit_relief ON pit_std.bbref_id=pit_relief.bbref_id AND pit_std.year=pit_relief.year AND pit_std."Tm"=pit_relief."Tm" '
        + 'WHERE pit_std.year=' + str(year))
    df = pd.read_sql_query(query, bbdb)
    # bbref reports innings in thirds (.1/.2); convert to decimal fractions.
    # NOTE(review): these are literal substring replaces -- assumes IP
    # strings only ever end in .0/.1/.2 (bbref convention); confirm.
    df['ip'] = df['ip'].str.replace('.1', '.33', regex=False)
    df['ip'] = df['ip'].str.replace('.2', '.67', regex=False)
    # Missing or blank stats count as zero before the numeric casts.
    df = df.fillna(value={
        'era': 0,
        'whip': 0,
        'gs': 0,
        'qs': 0,
        'sv': 0,
        'hld': 0
    })
    for c in ['gs', 'qs', 'so', 'sv', 'hld']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(int)
    for c in ['ip', 'era', 'whip']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(float)
    # Derived category: combined saves + holds.
    df['svhld'] = df['sv'] + df['hld']
    df = df[(df['bbref_id'].notnull()) & (df['bbref_id'] != u'')]
    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_pitchers = df.merge(names[['bbref_id', 'fg_id', 'name']],
                                 on='bbref_id',
                                 how='left')
    output_stats = utilities.flatten([['fg_id', 'name', 'team', 'ip'],
                                      [ls.pitching_stats]])
    combined_pitchers = combined_pitchers[output_stats]
    combined_pitchers.drop_duplicates(inplace=True)
    return combined_pitchers
def scrape_sfb_names():
    """Download the SmartFantasyBaseball player-id map CSV and reload it
    into reference.player_names_sfb.

    NOTE(review): a second, more complete `scrape_sfb_names` is defined
    later in this file and overrides this one at import time -- confirm
    which version is intended and delete the other.
    NOTE(review): relies on module-level `selenium_utilities`, `pd`, `os`,
    and `postgres` bindings -- confirm the top-of-file imports.
    """
    # Download the latest .csv file from smartfantasybaseball.com
    driver = selenium_utilities.start_driver()
    driver.get("https://www.smartfantasybaseball.com/PLAYERIDMAPCSV")
    filename = '/Users/andrewfelton/Downloads/docker/SFBB Player ID Map - PLAYERIDMAP.csv'
    player_names = pd.read_csv(filename)
    os.remove(filename)
    tablename = 'player_names_sfb'
    bbdb = postgres.connect_to_bbdb()
    command = 'TRUNCATE TABLE reference.' + tablename
    bbdb.execute(command)
    player_names.to_sql(tablename, bbdb, schema='reference',
                        if_exists='append')
    # BUGFIX: str(Series) produced the Series *repr* and stored that single
    # string in every row; cast element-wise instead.
    player_names['yahoo_id'] = player_names['yahoo_id'].astype(str)
def scrape_sfb_names():
    """Download the SmartFantasyBaseball player-id map and reload it into
    reference.player_names_sfb with snake_case column names.

    Returns:
        pandas.DataFrame: the renamed id-map table that was uploaded.
    """
    import time
    import os
    import sys
    import pandas as pd
    sys.path.append('python/general')
    import selenium_utilities
    import postgres

    # --- extract: fetch the CSV via the browser's download dir -----------
    driver = selenium_utilities.start_driver()
    driver.get("https://www.smartfantasybaseball.com/PLAYERIDMAPCSV")
    time.sleep(2)  # give the download a moment to land on disk
    filename = '/Users/andrewfelton/Downloads/docker/SFBB Player ID Map - PLAYERIDMAP.csv'
    sfb_names = pd.read_csv(filename, dtype=str)
    os.remove(filename)

    # --- transform: normalize headers, e.g. 'IDFANGRAPHS' -> 'fg_id' -----
    def _normalize(header):
        name = header.lower()
        if name[-2:] == 'id':
            name = name[:-2] + '_id'
        if name[:2] == 'id':
            name = name[2:] + '_id'
        if name[-4:] == 'name':
            name = name[:-4] + '_name'
        return name.replace('fangraphs', 'fg')

    colmap = {original: _normalize(original)
              for original in list(sfb_names.columns.values)}
    sfb_names = sfb_names.rename(mapper=colmap, axis=1)

    # --- load: truncate-and-append so the table is fully refreshed -------
    tablename = 'player_names_sfb'
    bbdb = postgres.connect_to_bbdb()
    command = 'TRUNCATE TABLE reference.' + tablename
    bbdb.execute(command)
    sfb_names.to_sql(tablename, bbdb, schema='reference', if_exists='append')
    return sfb_names
def scrape_standings(league):
    """Scrape the Fleaflicker standings page for `league` and replace
    tracking.standings_sos with today's category totals.

    Args:
        league: league object with `league_platform` ('fleaflicker' only)
            and `league_num`.
    """
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import datetime
    from general import postgres
    assert (league.league_platform == 'fleaflicker')
    league_num = league.league_num
    roster_url = 'https://www.fleaflicker.com/mlb/leagues/' + league_num
    page = requests.get(roster_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    main_div = soup.find('div', id='body-center-main')
    tables = main_div.find('table')
    trows = tables.find_all('tr')
    standings = []
    # Skip the first two rows (headers -- TODO confirm against live page).
    for trow in trows[2:]:
        standing = []
        tds = trow.find_all('td')
        standing.append(tds[0].text)  # team name
        # Cells 3..14 carry the 12 category values, in the column order below.
        for i in range(3, 15):
            standing.append(tds[i].find('span').text)
        standings.append(standing)
    df_standings = pd.DataFrame(standings,
                                columns=[
                                    'team', 'hr', 'r', 'rbi', 'sb', 'obp',
                                    'ops', 'so', 'sv', 'hld', 'era', 'whip',
                                    'qs'
                                ])
    today = datetime.date.today()
    df_standings['date'] = today
    str_today = str(today)
    bbdb = postgres.connect_to_bbdb()
    df_standings.to_sql(name='standings_sos',
                        con=bbdb,
                        schema='tracking',
                        index=False,
                        if_exists='replace')
def post_sos_d2_drafts(draftnums):
    """Aggregate min/avg pick per player across the given mock drafts and
    post the result to the 'D2 drafts' tab of the 'BB 2021 SoS' sheet.

    Args:
        draftnums: iterable of draft-number strings; each must correspond
            to an existing drafts.cm_mock_<num> table.

    NOTE(review): relies on module-level `postgres`, `pd`, `gspread`, and
    `gsdf` bindings -- confirm the top-of-file imports.
    """
    # Calc avg. and min. values for D2 draft
    bbdb = postgres.connect_to_bbdb()
    query = 'SELECT fg_id, MIN(draft."Pick"::DOUBLE PRECISION) as min_pick, AVG(draft."Pick"::DOUBLE PRECISION) as avg_pick FROM ('
    select_queries = []
    for draftnum in draftnums:
        select_queries.append('SELECT fg_id, cm_mock_' + draftnum +
                              '."Pick" FROM drafts.cm_mock_' + draftnum)
    query = query + ' UNION '.join(select_queries) + ') AS draft GROUP BY fg_id'
    df = pd.read_sql_query(query, bbdb)
    gc = gspread.service_account(filename='./bb-2021-2b810d2e3d25.json')
    bb2021 = gc.open("BB 2021 SoS")
    sheettitle = "D2 drafts"
    bb2021.values_clear(sheettitle + "!A:Z")
    gsdf.set_with_dataframe(bb2021.worksheet(sheettitle), df)
    # BUGFIX: removed `combined = bb2021.worksheet('Combined')` followed by a
    # bare `combined.update` -- the attribute was never called, so the pair
    # did nothing except an extra API fetch.
    print('Updated combined spreadsheet')
def create_last30_hitters(ls):
    """Pull 30-day hitter stats from tracking.batters_last30, clean the
    stat columns, merge names, and return the league's hitting stats.

    Args:
        ls: league-settings object providing `hitting_stats`.
    """
    import pandas as pd
    from general import utilities
    from general import postgres
    from general import classes
    from munging import player_names
    bbdb = postgres.connect_to_bbdb()
    # OPS is not stored directly; derive it as OBP + SLG in SQL.
    query = (
        'SELECT bat.fg_id, bat.team, bat.pa, ' +
        'bat.hr, bat.r, bat.rbi, bat.sb, bat.obp, bat.obp+bat.slg as ops ' +
        'FROM tracking.batters_last30 AS bat')
    df = pd.read_sql_query(query, bbdb)
    # Missing or blank stats count as zero before the numeric casts.
    df = df.fillna(value={
        'obp': 0,
        'ops': 0,
        'pa': 0,
        'r': 0,
        'rbi': 0,
        'sb': 0
    })
    for c in ['pa', 'r', 'rbi', 'hr', 'sb']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(int)
    for c in ['obp', 'ops']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(float)
    #df = df[(df['fg_id'].notnull()) & (df['fg_id']!=u'')]
    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_hitters = df.merge(names[['fg_id', 'name']],
                                on='fg_id',
                                how='left')
    output_stats = utilities.flatten([['fg_id', 'name', 'team', 'pa'],
                                      [ls.hitting_stats]])
    combined_hitters = combined_hitters[output_stats]
    combined_hitters.drop_duplicates(inplace=True)
    return combined_hitters
def pull_player_names_from_gs():
    """Pull the manually curated 'Player Names' sheet tab and replace
    reference.player_names with it.

    ID columns are normalized to plain strings: a trailing '.0' from float
    coercion is stripped, and whole-cell 'nan'/'NaN' placeholders become
    empty strings.
    """
    import pandas as pd
    import gspread
    import gspread_dataframe as gsdf
    from general import postgres
    bbdb = postgres.connect_to_bbdb()
    gc = gspread.service_account(filename='./bb-2021-2b810d2e3d25.json')
    sh = gc.open("BB 2021 Name Matching").worksheet('Player Names')
    player_names = gsdf.get_as_dataframe(sh)
    player_names.sort_values(by='name', inplace=True)
    # convert any numeric ids into text
    for col in [
            'fg_id', 'fg_minor_id', 'bbref_id', 'otto_id', 'yahoo_id',
            'bp_id', 'espn_id', 'ff_id', 'mlb_id'
    ]:
        if col in player_names.columns:
            # BUGFIX: anchor the patterns. The old unanchored '\.0' / 'nan'
            # patterns could also rewrite matching substrings *inside* an id.
            player_names[col] = player_names[col].astype('str').replace(
                r'\.0$', '', regex=True)
            player_names[col] = player_names[col].astype('str').replace(
                r'^nan$', '', regex=True)
            player_names[col] = player_names[col].astype('str').replace(
                r'^NaN$', '', regex=True)
    # (dropped a no-op round-trip that assigned the columns list back to
    # player_names.columns unchanged)
    command = 'TRUNCATE TABLE reference.player_names'
    bbdb.execute(command)
    player_names.to_sql('player_names',
                        bbdb,
                        schema='reference',
                        if_exists='append',
                        chunksize=1000,
                        method='multi',
                        index=False)
def scrape_ottoneu_player_pool():
    """Scrape the Ottoneu average-values CSV, archive it by date, and
    refresh reference.player_pool_ottoneu with today's snapshot.

    Returns:
        pandas.DataFrame: the scraped pool (all columns, ids as strings).
    """
    import os
    import time  # BUGFIX: was `from datetime import time` -- datetime.time
    #              has no sleep(), so time.sleep(2) below raised.
    from datetime import datetime
    from datetime import date
    import pandas as pd
    from selenium.webdriver.common.action_chains import ActionChains
    from general import selenium_utilities
    from munging import player_names
    from player_names import put_missing_in_GS  # NOTE(review): unused; import path looks suspect -- confirm
    from general import postgres
    driver = selenium_utilities.start_driver(headless=False)
    url = 'http://ottoneu.fangraphs.com/averageValues'
    driver.get(url)
    time.sleep(2)
    print('Arrived at ' + driver.current_url)
    button_csv = driver.find_element_by_xpath(
        '/html/body/main/header/div[2]/a[1]')
    button_csv.click()
    time.sleep(3)  # allow the download to finish
    basepath = "/Users/andrewfelton/Documents/bb/bb-2021"
    dl_file = "/Users/andrewfelton/Downloads/docker/ottoneu_average_values.csv"
    today = date.today().strftime("%Y%m%d")
    new_file = basepath + "/data/ottoneu/ottoneu_average_values_" + today + ".csv"
    stream_command = os.popen('mv ' + dl_file + ' ' + new_file)
    mv_file = stream_command.read()
    # create the soft link
    ln_file = basepath + "/data/ottoneu/ottoneu_average_values.csv"
    command_ln = os.popen('ln -sf ' + new_file + ' ' + ln_file)
    driver.close()
    print("Finished scraping " + ln_file)
    ottoneu_player_pool = pd.read_csv(ln_file)
    ottoneu_player_pool.insert(0, 'asof_date',
                               date.today().strftime('%Y-%m-%d'))
    ottoneu_player_pool.rename(columns={
        'Name': 'name',
        'OttoneuID': 'otto_id',
        'FG MajorLeagueID': 'fg_id',
        'FG MinorLeagueID': 'fg_minor_id'
    }, inplace=True)
    for idtype in ['otto_id', 'fg_id', 'fg_minor_id']:
        ottoneu_player_pool[[idtype]] = ottoneu_player_pool[[idtype]].astype(str)
    tablename = "player_pool_ottoneu"
    bbdb = postgres.connect_to_bbdb()
    query_tables = "SELECT * FROM pg_catalog.pg_tables WHERE schemaname='reference';"
    tables_list_result = bbdb.execute(query_tables)
    tables_list = []
    for table in tables_list_result:
        tables_list.append(table[1])  # pg_tables row: (schemaname, tablename, ...)
    if (tablename in tables_list):
        # BUGFIX: the table lives in `reference` (see to_sql below); the old
        # code truncated 'tracking.<tablename>' and so never cleared it,
        # letting the truncate-then-append refresh accumulate duplicates.
        command = 'TRUNCATE TABLE reference.' + tablename
        bbdb.execute(command)
    ottoneu_player_pool[[
        'asof_date', 'name', 'otto_id', 'fg_id', 'fg_minor_id'
    ]].to_sql(tablename,
              bbdb,
              schema='reference',
              if_exists='append',
              index=False)
    return ottoneu_player_pool
def update_inseason_valuations(league_sos, league_legacy):
    """Build in-season hitter and pitcher valuation tables for both leagues,
    merge ownership and extra pitching metrics, and publish them to the
    'Proj - Hitters' / 'Proj - Pitchers' tabs of 'BB 2021 InSeason'.

    Args:
        league_sos: SoS league-settings object (stat lists used below).
        league_legacy: Legacy league-settings object.
    """
    import pandas as pd
    import gspread
    import gspread_dataframe as gsdf
    from general import gs
    from general import utilities
    from general import postgres
    # Value each player under both leagues' settings; suffix the value
    # columns so the two can live side by side after the merge.
    sos_hitters = create_combined_hitter_valuations(league=league_sos) \
        .rename(columns={'zar': 'zar_sos', 'value': 'value_sos', 'value_600': 'value_600_sos'})
    legacy_hitters = create_combined_hitter_valuations(league=league_legacy) \
        .rename(columns={'zar': 'zar_legacy', 'value': 'value_legacy', 'value_600': 'value_600_legacy'})
    legacy_extra_columns = list(
        set(legacy_hitters.columns).difference(sos_hitters.columns))
    legacy_extra_columns = utilities.flatten(['fg_id', legacy_extra_columns])
    # NOTE(review): this hitter `columns` list repeats the SoS counting
    # stats and the Legacy rate stats, and is never applied to
    # combined_hitters below (unlike the pitcher path) -- looks vestigial;
    # confirm before removing.
    columns = [
        'name', 'fg_id', 'type', 'elig', 'pa',
        league_sos.hitting_counting_stats, league_sos.hitting_counting_stats,
        league_legacy.hitting_rate_stats, league_legacy.hitting_rate_stats,
        'value_sos', 'value_600_sos', 'value_legacy', 'value_600_legacy'
    ]
    columns = utilities.flatten(columns)
    combined_hitters = sos_hitters.merge(legacy_hitters[legacy_extra_columns],
                                         on='fg_id')
    combined_hitters.drop_duplicates(subset=['fg_id'], inplace=True)
    # Merge in the ownership
    bbdb = postgres.connect_to_bbdb()
    sos_rosters = pd.read_sql(
        'SELECT fg_id, sos."Team" as sos_team FROM rosters.sos', con=bbdb)
    sos_rosters[['fg_id']] = sos_rosters[['fg_id']].astype(str)
    combined_hitters = combined_hitters.merge(sos_rosters,
                                              how='left',
                                              on='fg_id')
    legacy_rosters = pd.read_sql(
        'SELECT fg_id, legacy."Team" as legacy_team FROM rosters.legacy',
        con=bbdb)
    legacy_rosters[['fg_id']] = legacy_rosters[['fg_id']].astype(str)
    combined_hitters = combined_hitters.merge(legacy_rosters,
                                              how='left',
                                              on='fg_id')
    # Drop a known duplicate row for fg_id 19755 on this Legacy team
    # (player valued as both hitter and pitcher -- TODO confirm).
    combined_hitters.drop(combined_hitters[
        (combined_hitters['fg_id'] == '19755') &
        (combined_hitters['legacy_team'] == 'Harper Wallbanger')].index,
                          inplace=True)
    # Pitchers
    sos_pitchers = create_combined_pitcher_valuations(league=league_sos) \
        .rename(columns={'zar': 'zar_sos', 'value': 'value_sos'})
    legacy_pitchers = create_combined_pitcher_valuations(league=league_legacy) \
        .rename(columns={'zar': 'zar_legacy', 'value': 'value_legacy'})
    legacy_extra_columns = list(
        set(legacy_pitchers.columns).difference(sos_pitchers.columns))
    legacy_extra_columns = utilities.flatten(['fg_id', legacy_extra_columns])
    columns = [
        'name', 'fg_id', 'type', 'ip', league_sos.pitching_counting_stats,
        league_legacy.pitching_counting_stats, league_sos.pitching_rate_stats,
        league_legacy.pitching_rate_stats, 'zar_sos', 'value_sos',
        'zar_legacy', 'value_legacy'
    ]
    columns = utilities.flatten(columns)
    combined_pitchers = sos_pitchers.merge(
        legacy_pitchers[legacy_extra_columns], on='fg_id')
    combined_pitchers = combined_pitchers[columns]
    # Merge in CFIP
    bbdb = postgres.connect_to_bbdb()
    cfip = pd.read_sql('SELECT * FROM hist.bp_pitchers_raw', con=bbdb)
    combined_pitchers = combined_pitchers.merge(cfip[['fg_id', 'DRA', 'cFIP']],
                                                how='left',
                                                on='fg_id')
    # Merge in xxxFIP
    bbdb = postgres.connect_to_bbdb()
    cfip = pd.read_sql('SELECT * FROM tracking.xxxfip WHERE fg_id IS NOT NULL',
                       con=bbdb)
    combined_pitchers = combined_pitchers.merge(cfip[['fg_id', 'xxxFIP']],
                                                how='left',
                                                on='fg_id')
    # Merge in the ownership
    combined_pitchers = combined_pitchers.merge(sos_rosters,
                                                how='left',
                                                on='fg_id')
    combined_pitchers = combined_pitchers.merge(legacy_rosters,
                                                how='left',
                                                on='fg_id')
    combined_pitchers.drop(combined_pitchers[
        (combined_pitchers['fg_id'] == '19755') &
        (combined_pitchers['legacy_team'] == 'Florun\'s Team')].index,
                           inplace=True)
    # Update Google Sheets
    gc = gspread.service_account(filename='./bb-2021-2b810d2e3d25.json')
    sh = gc.open("BB 2021 InSeason").worksheet('Proj - Hitters')
    gsdf.set_with_dataframe(sh, combined_hitters)
    gs.format_gsheet(sheet=sh)
    sh = gc.open("BB 2021 InSeason").worksheet('Proj - Pitchers')
    gsdf.set_with_dataframe(sh, combined_pitchers)
    gs.format_gsheet(sheet=sh)
def find_other_ids_w_yahoo(yahoo_id):
    """Interactively link a Yahoo player id to the master names table.

    Looks the id up in the scraped Yahoo pool, then tries an exact name
    match first against reference.player_names (offer to merge the
    yahoo_id in) and otherwise against the raw FG projections (offer to
    append a brand-new row).

    Args:
        yahoo_id: Yahoo player id (int or str).

    Returns:
        False if the id is not in the Yahoo pool; otherwise None.
    """
    #yahoo_id = 11702
    import sys
    sys.path.append('python/general')
    import postgres
    sys.path.append('python/munging')
    import player_names
    import pandas as pd
    bbdb = postgres.connect_to_bbdb()
    yahoo_sql = 'SELECT yahoo_id, yahoo_name, yahoo_team, yahoo_elig, fg_id FROM REFERENCE.player_pool_yahoo WHERE yahoo_id=\'' + str(
        yahoo_id) + '\''
    yahoo_info = pd.read_sql_query(yahoo_sql, con=bbdb)
    if len(yahoo_info) == 0:
        print(
            'This yahoo_id is not in the Yahoo player pool. Please rerun the player pool generator'
        )
        return False
    else:
        yahoo_name = yahoo_info['yahoo_name'].to_list()[0]
        print('Here is the Yahoo player pool info available on ' +
              yahoo_name + ':')
        print(yahoo_info)
    names = player_names.get_player_names()
    # If it's already in the list of player names:
    if yahoo_name in names['name'].to_list():
        matches = names[names['name'] == yahoo_name]
        if len(matches) == 1:
            print('Found a match!')
            print('FG info:')
            fg_id = matches['fg_id'].to_list()[0]
            fg_info = get_fg_info(fg_id)
            print(fg_info)
            perform_merge = input(
                'Do you want to merge in the Yahoo ID into the existing match? (Y/N): '
            )
            # BUGFIX: the old `if perform_merge:` was true for ANY non-empty
            # answer, including 'N'; require an explicit 'Y' like the other
            # confirmation prompts in this module.
            if perform_merge == 'Y':
                sql_update = 'UPDATE reference.player_names SET yahoo_id = \''+\
                    str(yahoo_id)+\
                    '\' WHERE fg_id=\''+\
                    str(fg_id)+'\''
                print(sql_update)
                bbdb.execute(sql_update)
                player_names.push_player_names_to_gs()
            else:
                print('OK, won\'t update')
        elif len(matches) > 1:
            print(
                'There is more than one match. Please manually update. List of matches:'
            )
            print(matches)
    else:
        # If it's not already in the list of player names, see if there is a
        # match in the raw FG data
        yahoo_sql = \
            'SELECT name, fg_id FROM '+\
            '(SELECT "Name" as name, playerid as fg_id from proj.fg_dc_batters_raw '+\
            'UNION '+\
            'SELECT "Name" as name, playerid as fg_id from proj.fg_dc_pitchers_raw '+\
            ') fg_raw_proj_union '+\
            'WHERE fg_id NOT IN (SELECT fg_id FROM reference.player_names) ORDER BY name'
        yahoo_info = pd.read_sql_query(yahoo_sql, con=bbdb)
        if yahoo_name in yahoo_info['name'].to_list():
            matches = yahoo_info[yahoo_info['name'] == yahoo_name]
            if len(matches) == 1:
                print('Found a match!')
                print('FG info:')
                fg_id = matches['fg_id'].to_list()[0]
                fg_info = get_fg_info(fg_id)
                print(fg_info)
                perform_append = input(
                    'Do you want to append this to the list of player names?')
                if perform_append == 'Y':
                    sql_append_new_name = \
                        'INSERT INTO reference.player_names (name, fg_id, yahoo_id) '+\
                        'VALUES ('+\
                        '\'' + fg_info[1] + '\', \'' + str(fg_info[0]) + '\', \'' + str(yahoo_id) + '\'' +\
                        ')'
                    print(sql_append_new_name)
                    bbdb.execute(sql_append_new_name)
                    player_names.push_player_names_to_gs()
                else:
                    print('OK, won\'t update')
        else:
            # BUGFIX: this message was a bare string expression (a no-op);
            # it is now actually printed.
            print('Cannot find an exact name match in the FG projections')
def scrape_yahoo_player_pool():
    """Scrape the Yahoo league player pool (hitters and pitchers), derive a
    combined eligibility string per player, attach fg_ids, and replace
    reference.player_pool_yahoo.

    Returns:
        pandas.DataFrame: the uploaded pool.
    """
    # This loops through the FF player pages and saves the player name, id, and eligibility to the database
    from bs4 import BeautifulSoup
    import pandas as pd
    import requests
    import time
    import unidecode
    from munging import player_names
    from general import postgres
    # EXTRACT
    pitcher_base_url = 'https://baseball.fantasysports.yahoo.com/b1/26574/players?status=ALL&pos=P&cut_type=33&stat1=S_S_2021&myteam=0&sort=R_PO&sdir=1&count='
    hitter_base_url = 'https://baseball.fantasysports.yahoo.com/b1/26574/players?status=ALL&pos=B&cut_type=33&stat1=S_S_2021&myteam=0&sort=R_PO&sdir=1&count='
    players = []
    for baseurl in [hitter_base_url, pitcher_base_url]:
        for i in range(0, 401, 25):  # Yahoo pages 25 players at a time
            url = baseurl + str(i)
            page = requests.get(url)
            print('Got '+url)
            time.sleep(1)  # be polite to Yahoo
            soup = BeautifulSoup(page.text, 'html.parser')
            table = soup.find('div', {'id':'players-table'}).find('table')
            for trow in table.find('tbody').find_all('tr'):
                player_div = trow.find('div', {'class':'ysf-player-name'})
                player_name = unidecode.unidecode(player_div.find('a').text)
                player_url = player_div.find('a')['href']
                player_id = player_url.split('/')[-1].split('-')[-1]
                # The span reads like 'TEAM - POS1,POS2'.
                player_team_elig = player_div.find('span', {'class':'Fz-xxs'}).text.split('-')
                player_team = player_team_elig[0].strip()
                player_elig = player_team_elig[1].strip()
                players.append([player_id, player_name, player_url, player_team, player_elig])
    df_players = pd.DataFrame(players, columns=['yahoo_id', 'yahoo_name', 'yahoo_url', 'yahoo_team', 'yahoo_elig'])

    # TRANSFORM
    def combine_eligibilities(row):
        """Translate Yahoo's position list into this league's slot string."""
        # BUGFIX: strip each token so entries like ' SS' still match below.
        yahoo_elig_list = [e.strip() for e in row['yahoo_elig'].split(',')]
        eligibilities = []
        # Utility/DH-only.
        # BUGFIX: the old code compared the *list* to the string 'Util',
        # which could never be True; compare against a one-element list.
        if yahoo_elig_list == ['Util']:
            eligibilities.append('UT')
        # Infielders
        for pos in ['C', '1B', '2B', 'SS', '3B']:
            if pos in yahoo_elig_list:
                eligibilities.append(pos)
        if '2B' in eligibilities or 'SS' in eligibilities:
            eligibilities.append('MI')
        if '1B' in eligibilities or '3B' in eligibilities:
            eligibilities.append('CI')
        if 'MI' in eligibilities or 'CI' in eligibilities:
            eligibilities.append('IF')
        # Outfielders collapse to a single OF slot.
        for pos in ['OF', 'RF', 'LF', 'CF']:
            if pos in yahoo_elig_list and 'OF' not in eligibilities:
                eligibilities.append('OF')
        # Pitchers
        for pos in ['SP', 'RP']:
            if pos in yahoo_elig_list:
                eligibilities.append(pos)
        # Concatenate into a string; anyone with no slots falls back to UT.
        elig = ' '.join(eligibilities).strip()
        if elig == '':
            elig = 'UT'
        return elig

    df_players['elig'] = df_players.apply(
        lambda row: combine_eligibilities(row), axis=1)
    names = player_names.get_player_names()
    df_players = df_players.merge(right=names[['yahoo_id', 'fg_id']],
                                  how='left',
                                  on='yahoo_id')

    # LOAD
    bbdb = postgres.connect_to_bbdb()
    df_players.to_sql('player_pool_yahoo',
                      con=bbdb,
                      schema='reference',
                      if_exists='replace',
                      chunksize=1000,
                      method='multi',
                      index=False)
    print('Uploaded Yahoo player pool')
    return df_players
def scrape_yahoo_roster(league_num='26574'):
    """Scrape the starting rosters for every team in a Yahoo league and load
    them into rosters.legacy.

    Args:
        league_num: Yahoo league number used to build the rosters URL.

    Side effects: writes a dated CSV snapshot, replaces rosters.legacy, and
    pushes the names sheet to Google Sheets when fg_ids are missing.
    """
    print('\n--------------------------\nScraping Yahoo rosters:\n')
    from datetime import date
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from general import postgres
    from munging import player_names

    league_url = 'https://baseball.fantasysports.yahoo.com/b1/' + league_num + '/startingrosters'
    print('Scraping from ' + league_url)
    page = requests.get(league_url)
    bs_rosters = BeautifulSoup(page.text, 'html.parser')
    main_div = bs_rosters.find('div', id='yspmaincontent')
    tables = main_div.find_all('div', {'class': 'Grid-u-1-2 Pend-xl'})
    rosters = []
    for table in tables:
        # Each table block is one fantasy team; the header <p><a> holds the owner.
        owner_id = table.find('p').find('a')['href'].split('/')[-1]
        owner = table.find('p').find('a').text
        player_rows = table.find('table').find('tbody').find_all('tr')
        for player_row in player_rows:
            tds = player_row.find_all('td')
            pos = tds[0].text
            info_player = tds[1].find('div', {'class': 'ysf-player-name'})
            if info_player.find('div', {'class': 'emptyplayer'}) is not None:
                # FIX: an empty slot must produce the same five fields as a
                # filled slot (the original appended only four values, which
                # misaligns the five-column DataFrame below).
                rosters.append([owner_id, owner, pos, 'empty', 'empty'])
            else:
                player = info_player.find('a')
                playerid = str(player['href'].split('/')[-1])
                playername = player.text
                rosters.append([owner_id, owner, pos, playerid, playername])
    rosters = pd.DataFrame(rosters, columns=['owner_id', 'Team', 'pos', 'yahoo_id', 'name'])

    names = player_names.get_player_names()
    rosters = rosters.merge(names[['yahoo_id', 'fg_id', 'name']], on='yahoo_id', how='left')
    today = date.today().strftime("%Y%m%d")
    rosters['date'] = today
    rosters = rosters[['date', 'owner_id', 'Team', 'pos', 'fg_id', 'yahoo_id']]

    # Flag rostered players we could not match to a FanGraphs id.
    missing_fg_id = rosters[rosters['fg_id'].isna()]
    if len(missing_fg_id) > 0:
        for player in missing_fg_id.values.tolist():
            # FIX: yahoo_id is column 5; the original tested column 3 (pos),
            # which is never 'empty', so empty slots were flagged too.
            if player[5] != 'empty':  # Don't flag if it's just an empty position slot
                print('\nMissing info on:')
                print(player)
                player_names.find_other_ids_w_yahoo(player[5])
        player_names.push_player_names_to_gs()
        print('Updated Google Sheets')

    # Save a dated CSV snapshot.
    today = date.today().strftime("%Y%m%d")
    basename = "/Users/andrewfelton/Documents/bb/bb-2021/data/yahoo/rosters"
    new_file = basename + "_" + today + ".csv"
    rosters.to_csv(new_file)

    bbdb = postgres.connect_to_bbdb()
    rosters.to_sql('legacy', con=bbdb, schema='rosters', if_exists='replace', index=False)
    print('Uploaded to database')
def update_ff_rosters(league_id='23172'):
    """Refresh the sos_2021 roster tables from the Fleaflicker API.

    Pulls the league's team list, the calendar of scoring periods, and then
    scrapes daily rosters (via scrape_daily_roster) for every scoring period
    not yet present in sos_2021.rosters.

    Args:
        league_id: Fleaflicker league id; the default preserves the previous
            hard-coded behavior.
    """
    import pandas as pd
    import requests
    import json
    import datetime
    import time
    from general import postgres

    bbdb = postgres.connect_to_bbdb()

    # Get the list of teams
    query = {
        'sport': 'MLB',
        'league_id': league_id,
        'season': '2021',
        'scoring_period': '1'
    }
    response = requests.get('https://www.fleaflicker.com/api/FetchLeagueRosters', params=query)
    response_json = response.json()
    teams = []
    for roster in response_json['rosters']:
        print(roster['team']['name'])
        teams.append([roster['team']['name'], roster['team']['id']])
    teams = pd.DataFrame(teams, columns=['team_name', 'team_id'])
    teams['league_id'] = query['league_id']
    teams.to_sql(name='teams', con=bbdb, schema='sos_2021', if_exists='replace')

    # Get the list of eligible scoring periods from one team
    teams = pd.read_sql('SELECT * FROM sos_2021.teams LIMIT 1', con=bbdb)
    league_id = teams.loc[0, 'league_id']
    team_id = teams.loc[0, 'team_id']
    query = {
        'sport': 'MLB',
        'league_id': league_id,
        'team_id': team_id,
        'season': '2021',
        'scoring_period': '1'
    }
    response = requests.get('https://www.fleaflicker.com/api/FetchRoster', params=query)
    response_json = response.json()
    # Debugging snapshot of the raw API response.
    with open('fleaflicker_api_test.json', 'w') as f:
        json.dump(response_json, f, indent=4)

    # Populate the table of all scoring dates
    scoring_dates = []
    for scoring_day in response_json['eligibleLineupPeriods']:
        scoring_period = scoring_day['low']['ordinal']
        # Fleaflicker reports epoch milliseconds; convert to a plain date.
        epochmilli = int(scoring_day['low']['startEpochMilli'])
        scoring_date = datetime.datetime.fromtimestamp(epochmilli / 1000.0).date()
        scoring_dates.append([scoring_period, scoring_date])
    scoring_dates = pd.DataFrame(scoring_dates, columns=['scoring_period', 'scoring_date'])
    scoring_dates.to_sql(name='scoring_dates', con=bbdb, schema='sos_2021', if_exists='replace')

    # Get the list of scoring dates that need to be updated.
    # FIX: the template has one placeholder; the original passed `today` twice.
    today = datetime.date.today()
    missing_scoring_periods = pd.read_sql('''
        SELECT scoring_period::integer, scoring_date
        FROM sos_2021.scoring_dates
        WHERE (scoring_date <= '{}')
        AND (scoring_period >= (
            SELECT DISTINCT MAX(scoring_period::integer) AS scoring_period
            FROM sos_2021.rosters
        ))
        '''.format(today), con=bbdb)

    # Loop through all the teams and get the rosters
    teams = pd.read_sql('SELECT * FROM sos_2021.teams', con=bbdb)
    league_id = teams.loc[0, 'league_id']
    team_ids = teams['team_id'].to_list()
    for scoring_period in missing_scoring_periods['scoring_period'].to_list():
        for team_id in team_ids:
            scrape_daily_roster(scoring_period, team_id)
            time.sleep(.5)  # throttle API calls
def rosters(league, upload_to_db=True):
    """Scrape Fleaflicker rosters for a league and (optionally) load them
    into rosters.sos.

    Args:
        league: a league object (general.classes); must be a fleaflicker league.
        upload_to_db: when True, replace the rosters.sos table.

    Returns:
        pd.DataFrame with columns Team, Player, ff_id, fg_id.
    """
    import datetime
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from general import classes
    from munging import player_names
    from general import postgres

    print('\n--------------------------\nScraping Fleaflicker rosters for league:'
          + league.league_name + '\n')
    assert (league.league_platform == 'fleaflicker')
    league_num = league.league_num
    roster_url = 'https://www.fleaflicker.com/mlb/leagues/' + league_num + '/teams'
    page = requests.get(roster_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    main_div = soup.find('div', id='body-center-main')
    tables = main_div.find_all('table')
    today = datetime.date.today()
    str_today = str(today)
    teams = []
    for t in tables:
        trows = t.find_all('tr')
        for tr in trows:
            if (tr.find("span", {"class": "league-name"})):
                # Found the span with the team name: start a new team.
                team_name = tr.find("span", {"class": "league-name"}).text
                teams.append(classes.FantasyTeam(team_name))
                current_team = teams[-1]
            elif tr.find('a', {"class": "player-text"}):
                # Player row belonging to the most recent team header.
                player_data = tr.find('a', {"class": "player-text"})
                player_name = player_data.text
                # The Fleaflicker id is the trailing number of the player URL.
                player_ff_id = player_data['href'].split('/')[-1].split('-')[-1]
                current_team.add_player(player_name, player_ff_id)
    df_export = pd.DataFrame(columns=['Team', 'Player', 'ff_id'])
    for team in teams:
        df_export = pd.concat([df_export, team.to_dataframe()])
    df_export.reset_index(drop=True, inplace=True)
    names = player_names.get_player_names()
    df_export = df_export.merge(right=names[['ff_id', 'fg_id']], how='left', on='ff_id')

    # Go through the Fleaflicker players that don't have matching FG IDs
    missing_fg_id = df_export[df_export['fg_id'].isna()]
    if len(missing_fg_id) > 0:
        # FIX: corrected the 'Miising' typo in the log message.
        print('Missing fg_id for ' + str(len(missing_fg_id.values)) + ' player(s):')
        for player in missing_fg_id.values.tolist():
            print('\nMissing info on:')
            print(player)
            player_names.find_other_ids_w_ff(player[2])
    file_rosters = ('/Users/andrewfelton/Documents/bb/bb-2021/data/rosters/rosters_'
                    + league_num + '_' + str_today + '.csv')
    df_export.to_csv(file_rosters, index=False)
    print('Saved rosters to ' + file_rosters)
    if upload_to_db:
        bbdb = postgres.connect_to_bbdb()
        df_export.to_sql('sos', con=bbdb, schema='rosters', if_exists='replace', index=False)
        print('Uploaded to database')
    player_names.push_player_names_to_gs()
    print('Updated Google Sheets')
    return df_export
def scrape_fg_projections(type, system, mytype, mysystem):
    """Download a FanGraphs projections CSV and load it into
    proj.<mysystem>_<mytype>_raw (truncate-and-append).

    Args:
        type: FanGraphs 'stats' query value (stat-type code).
        system: FanGraphs 'type' query value (projection-system code).
        mytype: local stat-type label used in file and table names.
        mysystem: local system label used in file and table names.

    Note: parameter 'type' shadows the builtin but is kept for interface
    compatibility with existing callers.
    """
    import os
    from datetime import date
    import time
    import pandas as pd
    from general import selenium_utilities
    from general import postgres
    from scraping import scrape_fg_projections

    # Log in and navigate to the projections page.
    driver = selenium_utilities.start_driver(headless=True)
    driver = scrape_fg_projections.fg_login(driver)
    fg_proj_url_base = 'https://www.fangraphs.com/projections.aspx?pos=all'
    fg_proj_url_type = 'stats=' + type
    fg_proj_url_system = 'type=' + system
    fg_proj_url = fg_proj_url_base + '&' + fg_proj_url_type + '&' + fg_proj_url_system
    driver.get(fg_proj_url)
    time.sleep(2)

    # Click the CSV export button and give the download time to finish.
    btn_dl_projections = driver.find_element_by_id('ProjectionBoard1_cmdCSV')
    btn_dl_projections.click()
    time.sleep(3)
    fg_account_name = driver.find_element_by_id('linkAccount').text
    print('Account name is: ' + fg_account_name)

    # Move the downloaded file into the data directory with a dated name.
    basepath = "/Users/andrewfelton/Documents/bb/bb-2021"
    dl_file = "/Users/andrewfelton/Downloads/docker/FanGraphs\ Leaderboard.csv"
    today = date.today().strftime("%Y%m%d")
    new_file = basepath + "/data/fangraphs/" + mysystem + "_" + mytype + "_" + today + ".csv"
    stream_command = os.popen('mv ' + dl_file + ' ' + new_file)
    stream_command.read()  # reading waits for the mv to complete
    # Create the soft link to the undated name.
    # FIX: read the pipe so the link exists before pd.read_csv below
    # (the original never waited on the ln command).
    ln_file = basepath + "/data/fangraphs/" + mysystem + "_" + mytype + ".csv"
    os.popen('ln -sf ' + new_file + ' ' + ln_file).read()
    driver.close()
    print("Finished scraping " + ln_file)

    proj = pd.read_csv(ln_file)
    proj.insert(0, 'asof_date', date.today().strftime('%Y-%m-%d'))

    # Truncate-and-append so repeated runs replace the table contents.
    tablename = mysystem + "_" + mytype + "_raw"
    bbdb = postgres.connect_to_bbdb()
    query_tables = "SELECT * FROM pg_catalog.pg_tables WHERE schemaname='proj';"
    tables_list_result = bbdb.execute(query_tables)
    tables_list = []
    for table in tables_list_result:
        tables_list.append(table[1])  # tablename is the second column of pg_tables
    if (tablename in tables_list):
        command = 'TRUNCATE TABLE proj.' + tablename
        bbdb.execute(command)
    proj.to_sql(tablename, bbdb, schema='proj', if_exists='append', index=False)
def scrape_fg_leaderboard(fg_leaderboard_url, scrapedate, folder, filename, schema, table, driver=None):
    """Download a FanGraphs leaderboard CSV, normalize its columns, and load
    it into <schema>.<table>.

    Args:
        fg_leaderboard_url: full URL of the leaderboard to export.
        scrapedate: date string stamped into the file name and asof_date column.
        folder: subdirectory of data/ where the CSV is archived.
        filename: base name for the archived CSV.
        schema: target schema ('hist' deletes only rows for scrapedate;
            'tracking' truncates the whole table).
        table: target table name.
        driver: optional already-logged-in selenium driver; when None a
            throwaway driver is created and closed here.

    Returns:
        pd.DataFrame of the scraped leaderboard.
    """
    import os
    import time
    import pandas as pd
    from selenium.webdriver.common.action_chains import ActionChains
    import yaml
    from scraping import scrape_fg_projections
    from general import selenium_utilities
    from general import postgres

    bbdb = postgres.connect_to_bbdb()

    # Reuse the caller's driver if given; otherwise create (and later close) one.
    driver_keepalive = True
    if driver == None:
        driver_keepalive = False
        driver = selenium_utilities.start_driver(headless=False)
        driver = scrape_fg_projections.fg_login(driver)
    driver.get(fg_leaderboard_url)
    time.sleep(1)
    print('Arrived at ' + driver.current_url)
    btn_dl_projections = driver.find_element_by_id('LeaderBoard1_cmdCSV')
    # Scroll the export button into view before clicking.
    actions = ActionChains(driver)
    actions.move_to_element(btn_dl_projections).perform()
    driver.execute_script("window.scrollBy(0, 200);")
    btn_dl_projections.click()
    time.sleep(3)
    if not driver_keepalive:
        driver.close()
        driver.quit()

    # Archive the download under data/<folder>/<filename>_<scrapedate>.csv.
    basepath = "/Users/andrewfelton/Documents/bb/bb-2021"
    dl_file = "/Users/andrewfelton/Downloads/docker/FanGraphs\ Leaderboard.csv"
    # FIX: the template previously did not reference the 'filename' argument,
    # so every archive got the same literal name.
    new_file = "{basepath}/data/{folder}/{filename}_{scrapedate}.csv".format(
        basepath=basepath, folder=folder, filename=filename, scrapedate=scrapedate)
    stream_command = 'mv {dl_file} {new_file}'.format(dl_file=dl_file, new_file=new_file)
    mv_file_exec = os.popen(stream_command)
    print(mv_file_exec.read())  # reading also waits for the mv to finish
    print("Finished scraping " + new_file)

    # TRANSFORM
    # Read the CSV file, convert to dataframe, remap the column headers
    stream = open(
        '/Users/andrewfelton/Documents/bb/bb-2021/python/scraping/field_name_mapping.yml',
        'r')
    mapper = yaml.load(stream, yaml.CLoader)
    fgfile = pd.read_csv(new_file)
    fgfile.insert(0, 'asof_date', scrapedate)
    for col in fgfile.columns:
        fgfile.rename(columns={col: col.lower()}, inplace=True)
    fgfile.rename(columns=mapper['fg'], inplace=True)
    fgfile[['fg_id']] = fgfile[['fg_id']].astype(str)

    # Check if the table already exists and if so, clear out the information
    # from the scrapedate.
    # FIX: query the *target* schema (the original hard-coded 'tracking', so
    # 'hist' tables were never found), and compare against the tablename
    # column of the pg_tables row (the original compared whole row tuples,
    # which never matched, so stale rows were never cleared).
    query_tables = "SELECT * FROM pg_catalog.pg_tables WHERE schemaname='{schema}';".format(schema=schema)
    tables_list = [t[1] for t in bbdb.execute(query_tables)]
    if (table in tables_list):
        if schema == 'hist':
            command = "DELETE FROM {schema}.{table} WHERE asof_date='{scrapedate}';".format(
                schema=schema, table=table, scrapedate=scrapedate)
        elif schema == 'tracking':
            command = 'TRUNCATE TABLE tracking.{table}'.format(table=table)
        bbdb.execute(command)

    # Load to the database
    fgfile.to_sql(name=table, con=bbdb, schema=schema, if_exists='append')
    return fgfile
def scrape_razz(mytype, url, logged_in_driver=False, merge_ownership=True):
    """Scrape a Razzball stats/projections table and load it into proj.<mytype>.

    Logs into razzball.com with selenium (unless logged_in_driver indicates a
    session already exists), parses the 'neorazzstatstable' HTML table into a
    DataFrame, joins fg_id via mlb_id, optionally merges league ownership, saves
    a dated CSV plus an undated symlink, and truncate-appends the proj table.

    Args:
        mytype: label used for the CSV base name and the target table name.
        url: Razzball page containing the stats table.
        logged_in_driver: skip the login flow when truthy.
            NOTE(review): when truthy, `driver` is never assigned before
            `driver.get(url)` below — looks like a latent NameError path;
            confirm how callers use this flag.
        merge_ownership: when True, merge SoS and Legacy ownership columns.

    Returns:
        pd.DataFrame of the scraped rows.
    """
    from bs4 import BeautifulSoup
    import pandas as pd
    from datetime import datetime
    from datetime import date
    import os
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from general import selenium_utilities
    from general import postgres
    from munging import player_names
    from munging import rosters
    print('Going to scrape '+mytype+' from '+url)
    if not logged_in_driver:
        driver = selenium_utilities.start_driver()
        waiter = WebDriverWait(driver, 10)
        # Get the home page
        driver.get("https://razzball.com/")
        # Wait for the site menu so the page is interactive before we stop it.
        waiter.until(EC.presence_of_element_located((By.ID, 'sitemenu')))
        # JavaScript Executor to stop page load
        driver.execute_script("window.stop();")
        # Check if already logged in (the WP admin bar only shows when logged in).
        try:
            user_info = driver.find_element_by_id('wp-admin-bar-user-info')
        except NoSuchElementException:
            # If not logged in, then log in via the WordPress login form.
            # HACK(security): credentials are hard-coded here; they should be
            # moved to an environment variable or secrets store.
            driver.get("https://razzball.com/wp-login.php?redirect_to=https%3A%2F%2Frazzball.com")
            input_login = driver.find_element_by_id('user_login')
            input_login.send_keys('*****@*****.**')
            input_pw = driver.find_element_by_id('user_pass')
            input_pw.send_keys('36Pm4jKml7')
            input_submit = driver.find_element_by_id('wp-submit')
            input_submit.click()
    # Go to the projections page
    driver.get(url)
    print("Arrived at "+url)
    # Copy the csv window into BS
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('table', id='neorazzstatstable')
    # Close it down
    driver.close()
    streamers = []
    colnames = []
    # Get the list of column names, sanitizing symbols into identifiers.
    ths = table.findAll('th')
    for th in ths:
        colname = th.text.lower()
        colname = colname.replace('#', 'rank').replace('$','value').replace('!','').replace('%','pct_')
        colnames.append(colname)
    # Insert Razz ID before Name (the id is parsed from the name cell's link,
    # so each name <td> yields two output columns: razz_id then name).
    colnames.insert(colnames.index('name'), 'razz_id')
    # Loop through all the rows and append them to the list
    trows = table.findAll('tr')
    for trow in trows:
        streamer = []
        tds = trow.findAll('td')
        append = True
        # Skip sorter/header rows; only data rows lack these classes.
        if [] == list(set(trow['class']) & set(['class=sorter-head', 'tablesorter-headerRow', 'tablesorter-ignoreRow'])):
            for loc, td in enumerate(tds):
                # loc+1 compensates for the razz_id column inserted above,
                # which has no <td> of its own.
                if (loc+1)==colnames.index('name'):
                    player_url = td.find('a')['href']
                    player_id = player_url.split('/')[4]
                    player_id = '660271' if (str(player_id) == '6602710') else player_id # Manual correction for Ohtani
                    streamer.append(player_id) # Razz ID
                    player_name = td.find('a').text
                    streamer.append(player_name) # player name
                elif ('date' in colnames) and (loc+1)==colnames.index('date'):
                    # Razzball shows m/d only; assume the 2021 season.
                    date_str =td.text + '/2021'
                    streamdate = datetime.strptime(date_str, '%m/%d/%Y')
                    streamer.append(streamdate) # Date
                else:
                    # Numeric cells become floats; everything else stays text.
                    try:
                        value = float(td.text)
                    except ValueError:
                        value = td.text
                    streamer.append(value)
            # Some times there are entries with missing values -- do not include those in the dataframe
            for var in ['pa', 'ip']:
                if var in colnames and ( str(streamer[colnames.index(var)])=='' or str(streamer[colnames.index(var)])=='None' ):
                    append = False
            # razz_id of 1.0 appears to mark a non-player row; drop it.
            if streamer[colnames.index('razz_id')]==1.0:
                append = False
            if 'value' in colnames:
                if str(streamer[colnames.index('value')])=='':
                    append = False
            if append:
                streamers.append(streamer)
    df_streamers = pd.DataFrame(streamers, columns=colnames)
    # Belt-and-braces: drop any razz_id==1.0 rows that slipped through.
    df_streamers.drop(df_streamers[df_streamers['razz_id']==1.0].index, inplace=True)
    # fix date
    #date_str = tds[4].text + '/2021'
    #streamdate = datetime.strptime(date_str, '%m/%d/%Y')
    # Razz ids are MLBAM ids; join fg_id through the names table, falling
    # back to the razz_id itself when no match is found.
    names = player_names.get_player_names()
    df_streamers = df_streamers.merge(right=names[['mlb_id', 'fg_id']], how='left', left_on='razz_id', right_on='mlb_id')
    df_streamers['fg_id'] = df_streamers.apply(lambda row: row['fg_id'] if str(row['fg_id'])!='nan' else row['razz_id'], axis=1)
    if merge_ownership:
        # Attach which SoS / Legacy league team (if any) owns each player.
        ff_rosters = rosters.get_ff_ownership().rename(columns={"Team": "SoS_Team"})
        legacy_rosters = rosters.get_legacy_ownership().rename(columns={"Team": "Legacy_Team"})
        df_streamers = df_streamers.merge(
            right=ff_rosters[['SoS_Team', 'fg_id']], how='left', on='fg_id').merge(
            right=legacy_rosters[['Legacy_Team', 'fg_id']], how='left', on='fg_id')
    # Save on computer as .csv file
    mysystem = 'razz'
    today = date.today().strftime("%Y%m%d")
    basename = "/Users/andrewfelton/Documents/bb/bb-2021/data/" + mysystem + '/' + mysystem + "_" + mytype
    new_file = basename + "_" + today + ".csv"
    df_streamers.to_csv(new_file)
    # create the soft link
    ln_file = basename + ".csv"
    command_ln = os.popen('ln -sf ' + new_file + ' ' + ln_file)
    # NOTE(review): this prints the pipe object, not the command output, and
    # the pipe is never read (the ln runs asynchronously).
    print(command_ln)
    # Upload to the database: truncate if the table exists, then append.
    tablename = mytype
    bbdb = postgres.connect_to_bbdb()
    query_tables = "SELECT * FROM pg_catalog.pg_tables WHERE schemaname='proj';"
    tables_list_result = bbdb.execute(query_tables)
    tables_list = []
    for table in tables_list_result:
        tables_list.append(table[1])  # tablename is the second column of pg_tables
    if (tablename in tables_list):
        command = 'TRUNCATE TABLE proj."'+tablename+'"'
        bbdb.execute(command)
    df_streamers.to_sql(tablename, bbdb, schema='proj', if_exists='append', index=False)
    return df_streamers
def scrape_ff_player_pool():
    """Scrape the Fleaflicker player pool (hitters, pitchers, and the
    RP-sorted list) and load it into reference.player_pool_ff.

    Returns:
        pd.DataFrame: one row per distinct Fleaflicker player.
    """
    from bs4 import BeautifulSoup
    import pandas as pd
    import requests
    import time
    import unidecode
    import sys
    sys.path.append('python/munging')
    import player_names
    sys.path.append('python/general')
    import postgres

    # EXTRACT
    pitcher_base_url = 'https://www.fleaflicker.com/mlb/leagues/23172/players?season=2021&statType=1&sortMode=1&position=1536&isFreeAgent=false&tableSortDirection=DESC&tableSortName=pv7&tableOffset='
    hitter_base_url = 'https://www.fleaflicker.com/mlb/leagues/23172/players?season=2021&statType=1&sortMode=1&position=511&isFreeAgent=false&tableSortDirection=DESC&tableSortName=pv7&tableOffset='
    rp_base_url = 'https://www.fleaflicker.com/mlb/leagues/23172/players?season=2021&statType=1&sortMode=1&position=1536&isFreeAgent=false&tableSortName=st25&tableSortDirection=DESC&tableOffset='
    players = []
    for baseurl in [hitter_base_url, pitcher_base_url, rp_base_url]:
        # Fleaflicker pages 20 players at a time; the RP list is shorter.
        count_top = 601
        if baseurl in [rp_base_url]:
            count_top = 201
        for i in range(0, count_top, 20):
            url = baseurl + str(i)
            page = requests.get(url)
            print('Got ' + url)
            time.sleep(1)  # throttle requests to the server
            soup = BeautifulSoup(page.text, 'html.parser')
            table = soup.find('div', {'id': 'body-center-main'}).find('table')
            # Walk the sibling rows after <thead>, at most one page of 20.
            count = 0
            trow = table.find('thead').next_sibling
            while trow is not None and count < 20:
                player_data = trow.find('div', {'class': 'player'})
                player_name = unidecode.unidecode(
                    player_data.find('a', {'class': 'player-text'}).text)
                # The Fleaflicker id is the trailing number of the player URL.
                player_id = player_data.find(
                    'a', {'class': 'player-text'})['href'].split('/')[-1].split('-')[-1]
                player_url = 'https://www.fleaflicker.com' + player_data.find('a')['href']
                player_elig = player_data.find('span', {'class': 'position'}).text
                player_team = player_data.find('span', {'class': 'player-team'}).text
                players.append([player_id, player_name, player_url, player_team, player_elig])
                trow = trow.next_sibling
                count = count + 1
    df_players = pd.DataFrame(
        players, columns=['ff_id', 'ff_name', 'ff_url', 'ff_team', 'ff_elig'])
    # The RP-sorted list overlaps the pitcher list, so de-dupe on ff_id.
    df_players.drop_duplicates(subset=['ff_id'], inplace=True, ignore_index=True)

    # TRANSFORM
    def combine_eligibilities(row):
        """Collapse Fleaflicker's slash-separated eligibility string into the
        normalized position string used across the project."""
        ff_elig_list = row['ff_elig'].split('/')
        eligibilities = []
        # Infielders, plus derived MI/CI/IF groupings.
        for pos in ['C', '1B', '2B', 'SS', '3B']:
            if pos in ff_elig_list:
                eligibilities.append(pos)
        if '2B' in eligibilities or 'SS' in eligibilities:
            eligibilities.append('MI')
        if '1B' in eligibilities or '3B' in eligibilities:
            eligibilities.append('CI')
        if 'MI' in eligibilities or 'CI' in eligibilities:
            eligibilities.append('IF')
        # Outfielders: any OF slot collapses to a single 'OF'.
        for pos in ['OF', 'RF', 'LF', 'CF']:
            if pos in ff_elig_list and 'OF' not in eligibilities:
                eligibilities.append('OF')
        # Pitchers - just use the same as FF.
        # FIX: test the raw FF eligibility list; the original tested
        # 'eligibilities', which never contains SP/RP/P (only infield/OF
        # codes are appended above), so every pitcher was mislabeled 'UT'.
        if 'SP' in ff_elig_list or 'RP' in ff_elig_list or 'P' in ff_elig_list:
            eligibilities = ff_elig_list
        # Concatenate into a string; fall back to utility when nothing matched.
        elig = ' '.join(eligibilities).strip()
        if elig == '':
            elig = 'UT'
        return elig

    df_players['elig'] = df_players.apply(
        lambda row: combine_eligibilities(row), axis=1)
    names = player_names.get_player_names()
    df_players = df_players.merge(right=names[['ff_id', 'fg_id']], how='left', on='ff_id')

    # LOAD
    bbdb = postgres.connect_to_bbdb()
    df_players.to_sql('player_pool_ff', con=bbdb, schema='reference',
                      if_exists='replace', chunksize=1000, method='multi', index=False)
    print('Uploaded FleaFlicker player pool')
    return df_players