def scrape_savant():
    from datetime import date
    import os

    import pandas as pd

    from general import selenium_utilities
    from general import postgres
    from munging import player_names

    driver = selenium_utilities.start_driver()
    draft_url = ("https://baseballsavant.mlb.com/leaderboard/custom?year=2021&type=batter"
                 "&filter=&sort=4&sortDir=desc&min=10"
                 "&selections=b_total_pa,xba,xslg,woba,xwoba,xobp,xiso,wobacon,xwobacon,"
                 "exit_velocity_avg,launch_angle_avg,barrel_batted_rate,hard_hit_percent,sprint_speed,"
                 "&chart=false&x=xba&y=xba&r=no&chartType=beeswarm")
    driver.get(draft_url)
    print('Arrived at ' + driver.current_url)

    # Click the CSV download button
    input_dl = driver.find_element_by_id('btnCSV')
    input_dl.click()

    # Move the downloaded file into the data directory, stamped with today's date
    basepath = "/Users/andrewfelton/Documents/bb/bb-2021"
    dl_file = "/Users/andrewfelton/Downloads/docker/stats.csv"
    today = date.today().strftime("%Y%m%d")
    new_file = basepath + "/data/savant/hitter_stats_" + today + ".csv"
    stream_command = os.popen('mv ' + dl_file + ' ' + new_file)
    mv_file = stream_command.read()

    # create the soft link to the latest file
    ln_file = basepath + "/data/savant/hitter_stats.csv"
    command_ln = os.popen('ln -sf ' + new_file + ' ' + ln_file)

    driver.close()
    print("Finished scraping " + ln_file)

    savant = pd.read_csv(ln_file)
    savant.insert(0, 'asof_date', date.today().strftime('%Y-%m-%d'))

    # Merge in the player names and FG IDs
    savant.rename(columns={'player_id': 'mlb_id'}, inplace=True)
    savant['mlb_id'] = savant['mlb_id'].apply(str)
    names = player_names.get_player_names()
    savant = savant.merge(right=names[['mlb_id', 'fg_id']], how='left', on='mlb_id')
    #fg_ids = savant[['fg_id']].astype(str).values
    #put_missing_in_GS(id_list=pd.DataFrame(fg_ids, columns=['fg_id']), type='fg_id')

    savant = savant[[
        'asof_date', 'fg_id', 'b_total_pa', 'xba', 'xslg', 'woba', 'xwoba',
        'xobp', 'xiso', 'wobacon', 'xwobacon', 'exit_velocity_avg',
        'launch_angle_avg', 'barrel_batted_rate', 'hard_hit_percent',
        'sprint_speed'
    ]]

    # Replace the tracking.savant table with the latest scrape
    schema = 'tracking'
    tablename = 'savant'
    bbdb = postgres.connect_to_bbdb()
    savant.to_sql(tablename, bbdb, schema=schema, if_exists='replace')
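
# A minimal alternative sketch (an assumption, not part of the pipeline): the
# `mv` and `ln -sf` shell calls in scrape_savant() could use the standard
# library instead, which avoids spawning a shell and raises on failure.
def archive_download(dl_file, new_file, ln_file):
    """Move a downloaded CSV into place and repoint the 'latest' symlink."""
    import os
    import shutil
    shutil.move(dl_file, new_file)   # same effect as `mv dl_file new_file`
    if os.path.lexists(ln_file):     # `ln -sf` semantics: replace any old link
        os.remove(ln_file)
    os.symlink(new_file, ln_file)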
def create_combined_valuations(league):
    import pandas as pd
    import gspread
    import gspread_dataframe as gsdf
    import gspread_formatting as gsfmt

    from general import postgres
    from general import gs
    from general import format_gs  # assumed location of format_gs_all
    from munging import player_names

    assert league.league_name in ['SoS', 'Legacy']

    bbdb = postgres.connect_to_bbdb()
    names = player_names.get_player_names()

    gc = gspread.service_account(filename='./bb-2021-2b810d2e3d25.json')
    #bb2021 = gc.open("BB 2021 " + league.league_name)
    bb2021 = gc.open("BB 2021 InSeason")

    combined_hitters = create_combined_hitter_valuations(league)
    combined_pitchers = create_combined_pitcher_valuations(league)

    # Write the hitter projections
    hitter_projections = bb2021.worksheet('Hitter Projections - ' + league.league_name)
    bb2021.values_clear(hitter_projections.title + "!A:Z")
    gsdf.set_with_dataframe(hitter_projections, combined_hitters)
    format_gs.format_gs_all(league=league, ls=league, type='hitting')

    # Write the pitcher projections
    pitcher_projections = bb2021.worksheet('Pitcher Projections - ' + league.league_name)
    bb2021.values_clear(pitcher_projections.title + "!A:Z")
    gsdf.set_with_dataframe(pitcher_projections, combined_pitchers)
    format_gs.format_gs_all(league=league.league_name, ls=league, type='pitching')

    # Combined hitter + pitcher values, sorted by dollar value
    combined = pd.concat([
        combined_hitters[['name', 'fg_id', 'type', 'zar', 'value']],
        combined_pitchers[['name', 'fg_id', 'type', 'zar', 'value']]
    ])
    combined = combined.sort_values(by='value', ascending=False)

    gs_combined = bb2021.worksheet('Combined Z')
    gsdf.set_with_dataframe(gs_combined, combined)
    gsfmt.format_cell_range(
        gs_combined, 'D:E',
        gsfmt.CellFormat(
            numberFormat=gsfmt.NumberFormat(type='NUMBER', pattern='0.0')))
def create_actuals_pitchers(ls, year=2021):
    import pandas as pd

    from general import utilities
    from general import postgres
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()

    query = (
        'SELECT pit_std.year, pit_std.bbref_id, pit_std."Tm" as team, '
        'pit_std."IP" as ip, pit_start."GS" as gs, pit_start."QS" as qs, '
        'pit_std."SO" as so, pit_std."ERA" as era, pit_std."WHIP" as whip, '
        'pit_relief."SV" as sv, pit_relief."Hold" as hld '
        'FROM (SELECT * FROM tracking.bbref_pitching_standard) as pit_std '
        'LEFT JOIN (SELECT * FROM tracking.bbref_pitching_starter) as pit_start '
        'ON pit_std.bbref_id=pit_start.bbref_id AND pit_std.year=pit_start.year AND pit_std."Tm"=pit_start."Tm" '
        'LEFT JOIN (SELECT * FROM tracking.bbref_pitching_reliever) as pit_relief '
        'ON pit_std.bbref_id=pit_relief.bbref_id AND pit_std.year=pit_relief.year AND pit_std."Tm"=pit_relief."Tm" '
        'WHERE pit_std.year=' + str(year))
    df = pd.read_sql_query(query, bbdb)

    # bbref reports partial innings as .1/.2 (thirds); convert to decimal
    df['ip'] = df['ip'].str.replace('.1', '.33', regex=False)
    df['ip'] = df['ip'].str.replace('.2', '.67', regex=False)

    df = df.fillna(value={'era': 0, 'whip': 0, 'gs': 0, 'qs': 0, 'sv': 0, 'hld': 0})
    for c in ['gs', 'qs', 'so', 'sv', 'hld']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)  # blank strings -> 0
        df[c] = df[c].astype(int)
    for c in ['ip', 'era', 'whip']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(float)
    df['svhld'] = df['sv'] + df['hld']
    df = df[(df['bbref_id'].notnull()) & (df['bbref_id'] != u'')]

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_pitchers = df.merge(names[['bbref_id', 'fg_id', 'name']],
                                 on='bbref_id', how='left')
    output_stats = utilities.flatten([['fg_id', 'name', 'team', 'ip'],
                                      [ls.pitching_stats]])
    combined_pitchers = combined_pitchers[output_stats]
    combined_pitchers.drop_duplicates(inplace=True)
    return combined_pitchers
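
# A minimal sketch (an alternative, not the original code): converting bbref
# innings-pitched strings arithmetically rather than by literal string
# replacement, making the .1/.2 = thirds-of-an-inning convention explicit.
def ip_to_decimal(ip_str):
    """Convert bbref IP such as '123.1' (one third of an inning) to a decimal."""
    if ip_str is None or str(ip_str).strip() == '':
        return 0.0
    whole, _, frac = str(ip_str).partition('.')
    return int(whole) + {'': 0.0, '0': 0.0, '1': 1 / 3, '2': 2 / 3}[frac]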
def create_actuals_hitters(ls, year=2021):
    import pandas as pd

    from general import utilities
    from general import postgres
    from general import classes
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()

    # In-season actuals live in the tracking schema; past seasons in reference
    if year == 2021:
        schema = 'tracking'
    else:
        schema = 'reference'
    query = (
        'SELECT year, bbref_id, bat."Tm" as team, bat."PA" as pa, '
        'bat."HR" as hr, bat."R" as r, bat."RBI" as rbi, bat."SB" as sb, '
        'bat."OBP" as obp, bat."OPS" as ops '
        'FROM ' + schema + '.bbref_batting_standard bat WHERE year=' + str(year))
    df = pd.read_sql_query(query, bbdb)

    df = df.fillna(value={'obp': 0, 'ops': 0, 'pa': 0, 'r': 0, 'rbi': 0, 'sb': 0})
    for c in ['pa', 'r', 'rbi', 'hr', 'sb']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)  # blank strings -> 0
        df[c] = df[c].astype(int)
    for c in ['obp', 'ops']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(float)
    df = df[(df['bbref_id'].notnull()) & (df['bbref_id'] != u'')]

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_hitters = df.merge(names[['bbref_id', 'fg_id', 'name']],
                                on='bbref_id', how='left')
    output_stats = utilities.flatten(
        [['fg_id', 'bbref_id', 'name', 'team', 'pa'], [ls.hitting_stats]])
    combined_hitters = combined_hitters[output_stats]
    combined_hitters.drop_duplicates(inplace=True)
    return combined_hitters
def get_fg_id_from_ff_id(ff_id):
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process

    from scraping import scrape_ff
    from munging import player_names

    # Get FF info on a player
    player = scrape_ff.get_ff_player_info(ff_id)

    # Find the best fuzzy match for the name in the player-names list;
    # extract() on a Series returns (name, score, index) triples
    names = player_names.get_player_names()
    bestmatch = process.extract(player['name'], names['name'], limit=1,
                                scorer=fuzz.token_sort_ratio)
    name, fuzzratio, row = bestmatch[0]
    player_rostered = names.loc[row]
    return player_rostered.to_dict()
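
# Hypothetical usage sketch (the Fleaflicker ID below is made up; the returned
# dict carries whatever ID columns player_names.get_player_names() exposes):
# matched = get_fg_id_from_ff_id('11111')
# print(matched['fg_id'], matched['name'])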
def create_last30_hitters(ls):
    import pandas as pd

    from general import utilities
    from general import postgres
    from general import classes
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()
    query = (
        'SELECT bat.fg_id, bat.team, bat.pa, '
        'bat.hr, bat.r, bat.rbi, bat.sb, bat.obp, bat.obp+bat.slg as ops '
        'FROM tracking.batters_last30 AS bat')
    df = pd.read_sql_query(query, bbdb)

    df = df.fillna(value={'obp': 0, 'ops': 0, 'pa': 0, 'r': 0, 'rbi': 0, 'sb': 0})
    for c in ['pa', 'r', 'rbi', 'hr', 'sb']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(int)
    for c in ['obp', 'ops']:
        df[c] = df[c].replace(r'^\s*$', 0, regex=True)
        df[c] = df[c].astype(float)
    #df = df[(df['fg_id'].notnull()) & (df['fg_id'] != u'')]

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_hitters = df.merge(names[['fg_id', 'name']], on='fg_id', how='left')
    output_stats = utilities.flatten([['fg_id', 'name', 'team', 'pa'],
                                      [ls.hitting_stats]])
    combined_hitters = combined_hitters[output_stats]
    combined_hitters.drop_duplicates(inplace=True)
    return combined_hitters
def scrape_razz(mytype, url, logged_in_driver=False, merge_ownership=True):
    from datetime import datetime
    from datetime import date
    import os

    from bs4 import BeautifulSoup
    import pandas as pd
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By

    from general import selenium_utilities
    from general import postgres
    from munging import player_names
    from munging import rosters

    print('Going to scrape ' + mytype + ' from ' + url)

    if not logged_in_driver:
        driver = selenium_utilities.start_driver()
        waiter = WebDriverWait(driver, 10)

        # Get the home page; stop the page load once the site menu is present
        driver.get("https://razzball.com/")
        waiter.until(EC.presence_of_element_located((By.ID, 'sitemenu')))
        driver.execute_script("window.stop();")

        # Check if already logged in; if not, log in
        try:
            user_info = driver.find_element_by_id('wp-admin-bar-user-info')
        except NoSuchElementException:
            driver.get("https://razzball.com/wp-login.php?redirect_to=https%3A%2F%2Frazzball.com")
            input_login = driver.find_element_by_id('user_login')
            input_login.send_keys('*****@*****.**')
            input_pw = driver.find_element_by_id('user_pass')
            input_pw.send_keys('36Pm4jKml7')
            input_submit = driver.find_element_by_id('wp-submit')
            input_submit.click()
    else:
        driver = logged_in_driver  # reuse an already-logged-in driver

    # Go to the projections page
    driver.get(url)
    print("Arrived at " + url)

    # Copy the page into BeautifulSoup, then close the driver
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('table', id='neorazzstatstable')
    driver.close()

    streamers = []
    colnames = []

    # Get the list of column names
    ths = table.findAll('th')
    for th in ths:
        colname = th.text.lower()
        colname = (colname.replace('#', 'rank').replace('$', 'value')
                   .replace('!', '').replace('%', 'pct_'))
        colnames.append(colname)
    # Insert Razz ID before Name
    colnames.insert(colnames.index('name'), 'razz_id')

    # Loop through all the rows and append them to the list
    trows = table.findAll('tr')
    for trow in trows:
        streamer = []
        tds = trow.findAll('td')
        append = True
        # Skip sorter/header rows
        if not set(trow['class']) & {'class=sorter-head', 'tablesorter-headerRow', 'tablesorter-ignoreRow'}:
            for loc, td in enumerate(tds):
                if (loc + 1) == colnames.index('name'):
                    player_url = td.find('a')['href']
                    player_id = player_url.split('/')[4]
                    # Manual correction for Ohtani
                    player_id = '660271' if (str(player_id) == '6602710') else player_id
                    streamer.append(player_id)  # Razz ID
                    player_name = td.find('a').text
                    streamer.append(player_name)  # player name
                elif ('date' in colnames) and (loc + 1) == colnames.index('date'):
                    date_str = td.text + '/2021'
                    streamdate = datetime.strptime(date_str, '%m/%d/%Y')
                    streamer.append(streamdate)  # date
                else:
                    try:
                        value = float(td.text)
                    except ValueError:
                        value = td.text
                    streamer.append(value)

            # Sometimes there are entries with missing values -- do not
            # include those in the dataframe
            for var in ['pa', 'ip']:
                if var in colnames and str(streamer[colnames.index(var)]) in ('', 'None'):
                    append = False
            if streamer[colnames.index('razz_id')] == 1.0:
                append = False
            if 'value' in colnames and str(streamer[colnames.index('value')]) == '':
                append = False
            if append:
                streamers.append(streamer)

    df_streamers = pd.DataFrame(streamers, columns=colnames)
    df_streamers.drop(df_streamers[df_streamers['razz_id'] == 1.0].index, inplace=True)

    # Merge in FG IDs; fall back to the Razz ID when there is no match
    names = player_names.get_player_names()
    df_streamers = df_streamers.merge(right=names[['mlb_id', 'fg_id']], how='left',
                                      left_on='razz_id', right_on='mlb_id')
    df_streamers['fg_id'] = df_streamers.apply(
        lambda row: row['fg_id'] if str(row['fg_id']) != 'nan' else row['razz_id'],
        axis=1)

    if merge_ownership:
        ff_rosters = rosters.get_ff_ownership().rename(columns={"Team": "SoS_Team"})
        legacy_rosters = rosters.get_legacy_ownership().rename(columns={"Team": "Legacy_Team"})
        df_streamers = df_streamers.merge(
            right=ff_rosters[['SoS_Team', 'fg_id']], how='left', on='fg_id').merge(
            right=legacy_rosters[['Legacy_Team', 'fg_id']], how='left', on='fg_id')

    # Save on computer as .csv file
    mysystem = 'razz'
    today = date.today().strftime("%Y%m%d")
    basename = "/Users/andrewfelton/Documents/bb/bb-2021/data/" + mysystem + '/' + mysystem + "_" + mytype
    new_file = basename + "_" + today + ".csv"
    df_streamers.to_csv(new_file)

    # create the soft link
    ln_file = basename + ".csv"
    command_ln = os.popen('ln -sf ' + new_file + ' ' + ln_file)
    print(command_ln)

    # Upload to the database: truncate + append if the table exists, else create it
    tablename = mytype
    bbdb = postgres.connect_to_bbdb()
    query_tables = "SELECT * FROM pg_catalog.pg_tables WHERE schemaname='proj';"
    tables_list_result = bbdb.execute(query_tables)
    tables_list = [table[1] for table in tables_list_result]
    if tablename in tables_list:
        bbdb.execute('TRUNCATE TABLE proj."' + tablename + '"')
    df_streamers.to_sql(tablename, bbdb, schema='proj', if_exists='append', index=False)

    return df_streamers
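
# Hypothetical usage sketch (the table name and URL are placeholders, not
# real endpoints):
# df = scrape_razz(mytype='razz_pitchers',
#                  url='https://razzball.com/<projection-page>',
#                  merge_ownership=False)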
def scrape_yahoo_player_pool():
    # Loop through the Yahoo player pages and save the player name, id,
    # team, and eligibility to the database
    import time

    from bs4 import BeautifulSoup
    import pandas as pd
    import requests
    import unidecode

    from munging import player_names
    from general import postgres

    # EXTRACT
    pitcher_base_url = 'https://baseball.fantasysports.yahoo.com/b1/26574/players?status=ALL&pos=P&cut_type=33&stat1=S_S_2021&myteam=0&sort=R_PO&sdir=1&count='
    hitter_base_url = 'https://baseball.fantasysports.yahoo.com/b1/26574/players?status=ALL&pos=B&cut_type=33&stat1=S_S_2021&myteam=0&sort=R_PO&sdir=1&count='

    players = []
    for baseurl in [hitter_base_url, pitcher_base_url]:
        for i in range(0, 401, 25):  # pages are offset in blocks of 25
            url = baseurl + str(i)
            page = requests.get(url)
            print('Got ' + url)
            time.sleep(1)
            soup = BeautifulSoup(page.text, 'html.parser')
            table = soup.find('div', {'id': 'players-table'}).find('table')
            for trow in table.find('tbody').find_all('tr'):
                player_div = trow.find('div', {'class': 'ysf-player-name'})
                player_name = unidecode.unidecode(player_div.find('a').text)
                player_url = player_div.find('a')['href']
                player_id = player_url.split('/')[-1].split('-')[-1]
                player_team_elig = player_div.find('span', {'class': 'Fz-xxs'}).text.split('-')
                player_team = player_team_elig[0].strip()
                player_elig = player_team_elig[1].strip()
                players.append([player_id, player_name, player_url, player_team, player_elig])
    df_players = pd.DataFrame(
        players,
        columns=['yahoo_id', 'yahoo_name', 'yahoo_url', 'yahoo_team', 'yahoo_elig'])

    # TRANSFORM
    def combine_eligibilities(row):
        yahoo_elig_list = row['yahoo_elig'].split(',')
        eligibilities = []
        # Utility/DH-only
        if yahoo_elig_list == ['Util']:
            eligibilities.append('UT')
        # Infielders
        for pos in ['C', '1B', '2B', 'SS', '3B']:
            if pos in yahoo_elig_list:
                eligibilities.append(pos)
        if '2B' in eligibilities or 'SS' in eligibilities:
            eligibilities.append('MI')
        if '1B' in eligibilities or '3B' in eligibilities:
            eligibilities.append('CI')
        if 'MI' in eligibilities or 'CI' in eligibilities:
            eligibilities.append('IF')
        # Outfielders all collapse to OF
        for pos in ['OF', 'RF', 'LF', 'CF']:
            if pos in yahoo_elig_list and 'OF' not in eligibilities:
                eligibilities.append('OF')
        # Pitchers
        for pos in ['SP', 'RP']:
            if pos in yahoo_elig_list:
                eligibilities.append(pos)
        # Concatenate into a string and return
        elig = ' '.join(eligibilities).strip()
        if elig == '':
            elig = 'UT'
        return elig

    df_players['elig'] = df_players.apply(lambda row: combine_eligibilities(row), axis=1)

    names = player_names.get_player_names()
    df_players = df_players.merge(right=names[['yahoo_id', 'fg_id']], how='left', on='yahoo_id')

    # LOAD
    bbdb = postgres.connect_to_bbdb()
    df_players.to_sql('player_pool_yahoo', con=bbdb, schema='reference',
                      if_exists='replace', chunksize=1000, method='multi', index=False)
    print('Uploaded Yahoo player pool')
    return df_players
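
# Worked illustration of combine_eligibilities (hypothetical inputs):
#   yahoo_elig='2B,SS' -> '2B SS MI IF'  (middle infielders also earn MI and IF)
#   yahoo_elig='LF,CF' -> 'OF'           (all outfield spots collapse to OF)
#   yahoo_elig='Util'  -> 'UT'           (utility/DH-only)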
def scrape_yahoo_roster(league_num='26574'):
    print('\n--------------------------\nScraping Yahoo rosters:\n')
    from datetime import date

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import gspread
    import gspread_dataframe as gsdf

    from general import postgres
    from general import selenium_utilities
    from munging import player_names

    league_url = 'https://baseball.fantasysports.yahoo.com/b1/' + league_num + '/startingrosters'
    print('Scraping from ' + league_url)
    page = requests.get(league_url)
    bs_rosters = BeautifulSoup(page.text, 'html.parser')
    main_div = bs_rosters.find('div', id='yspmaincontent')
    tables = main_div.find_all('div', {'class': 'Grid-u-1-2 Pend-xl'})

    rosters = []
    for table in tables:
        owner_id = table.find('p').find('a')['href'].split('/')[-1]
        owner = table.find('p').find('a').text
        player_rows = table.find('table').find('tbody').find_all('tr')
        for player_row in player_rows:
            tds = player_row.find_all('td')
            pos = tds[0].text
            info_player = tds[1].find('div', {'class': 'ysf-player-name'})
            if info_player.find('div', {'class': 'emptyplayer'}) is not None:
                rosters.append([owner_id, owner, pos, 'empty', 'empty'])
            else:
                player = info_player.find('a')
                playerid = str(player['href'].split('/')[-1])
                playername = player.text
                rosters.append([owner_id, owner, pos, playerid, playername])
    rosters = pd.DataFrame(rosters, columns=['owner_id', 'Team', 'pos', 'yahoo_id', 'name'])

    #player_names.put_missing_in_GS(id_list=rosters[rosters['yahoo_id'] != 'empty'], type='yahoo_id')
    names = player_names.get_player_names()
    rosters = rosters.merge(names[['yahoo_id', 'fg_id', 'name']], on='yahoo_id', how='left')
    today = date.today().strftime("%Y%m%d")
    rosters['date'] = today
    rosters = rosters[['date', 'owner_id', 'Team', 'pos', 'fg_id', 'yahoo_id']]

    # Flag rostered players whose Yahoo ID has no FG match yet
    missing_fg_id = rosters[rosters['fg_id'].isna()]
    if len(missing_fg_id) > 0:
        for player in missing_fg_id.values.tolist():
            if player[5] != 'empty':  # Don't flag if it's just an empty position slot
                print('\nMissing info on:')
                print(player)
                yahoo_match = player_names.find_other_ids_w_yahoo(player[5])
        player_names.push_player_names_to_gs()
        print('Updated Google Sheets')

    today = date.today().strftime("%Y%m%d")
    basename = "/Users/andrewfelton/Documents/bb/bb-2021/data/yahoo/rosters"
    new_file = basename + "_" + today + ".csv"
    rosters.to_csv(new_file)

    bbdb = postgres.connect_to_bbdb()
    rosters.to_sql('legacy', con=bbdb, schema='rosters', if_exists='replace', index=False)
    print('Uploaded to database')
def rosters(league, upload_to_db=True):
    import datetime

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd

    from general import classes
    from general import postgres
    from munging import player_names

    print('\n--------------------------\nScraping Fleaflicker rosters for league: '
          + league.league_name + '\n')
    assert league.league_platform == 'fleaflicker'

    league_num = league.league_num
    roster_url = 'https://www.fleaflicker.com/mlb/leagues/' + league_num + '/teams'
    page = requests.get(roster_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    main_div = soup.find('div', id='body-center-main')
    tables = main_div.find_all('table')

    today = datetime.date.today()
    str_today = str(today)

    # Walk the roster tables: a league-name span starts a new team, and
    # player-text links are that team's players
    teams = []
    for t in tables:
        trows = t.find_all('tr')
        for tr in trows:
            if tr.find("span", {"class": "league-name"}):
                team_name = tr.find("span", {"class": "league-name"}).text
                teams.append(classes.FantasyTeam(team_name))
                current_team = teams[-1]
            elif tr.find('a', {"class": "player-text"}):
                player_data = tr.find('a', {"class": "player-text"})
                player_name = player_data.text
                player_url = 'https://www.fleaflicker.com' + player_data['href']
                player_ff_id = player_data['href'].split('/')[-1].split('-')[-1]
                current_team.add_player(player_name, player_ff_id)

    df_export = pd.DataFrame(columns=['Team', 'Player', 'ff_id'])
    for team in teams:
        df_export = pd.concat([df_export, team.to_dataframe()])
    df_export.reset_index(drop=True, inplace=True)

    names = player_names.get_player_names()
    df_export = df_export.merge(right=names[['ff_id', 'fg_id']], how='left', on='ff_id')

    # Go through the Fleaflicker players that don't have matching FG IDs
    missing_fg_id = df_export[df_export['fg_id'].isna()]
    if len(missing_fg_id) > 0:
        print('Missing fg_id for ' + str(len(missing_fg_id.values)) + ' player(s):')
        for player in missing_fg_id.values.tolist():
            print('\nMissing info on:')
            print(player)
            ff_match = player_names.find_other_ids_w_ff(player[2])

    file_rosters = ('/Users/andrewfelton/Documents/bb/bb-2021/data/rosters/rosters_'
                    + league_num + '_' + str_today + '.csv')
    df_export.to_csv(file_rosters, index=False)
    print('Saved rosters to ' + file_rosters)

    if upload_to_db:
        bbdb = postgres.connect_to_bbdb()
        df_export.to_sql('sos', con=bbdb, schema='rosters', if_exists='replace', index=False)
        print('Uploaded to database')

    player_names.push_player_names_to_gs()
    print('Updated Google Sheets')
    return df_export
def scrape_ff_player_pool():
    # Loop through the FF player pages and save the player name, id, team,
    # and eligibility to the database
    import time

    from bs4 import BeautifulSoup
    import pandas as pd
    import requests
    import unidecode

    import sys
    sys.path.append('python/munging')
    import player_names
    sys.path.append('python/general')
    import postgres

    # EXTRACT
    pitcher_base_url = 'https://www.fleaflicker.com/mlb/leagues/23172/players?season=2021&statType=1&sortMode=1&position=1536&isFreeAgent=false&tableSortDirection=DESC&tableSortName=pv7&tableOffset='
    hitter_base_url = 'https://www.fleaflicker.com/mlb/leagues/23172/players?season=2021&statType=1&sortMode=1&position=511&isFreeAgent=false&tableSortDirection=DESC&tableSortName=pv7&tableOffset='
    rp_base_url = 'https://www.fleaflicker.com/mlb/leagues/23172/players?season=2021&statType=1&sortMode=1&position=1536&isFreeAgent=false&tableSortName=st25&tableSortDirection=DESC&tableOffset='

    players = []
    for baseurl in [hitter_base_url, pitcher_base_url, rp_base_url]:
        count_top = 601
        if baseurl in [rp_base_url]:
            count_top = 201
        for i in range(0, count_top, 20):  # pages are offset in blocks of 20
            url = baseurl + str(i)
            page = requests.get(url)
            print('Got ' + url)
            time.sleep(1)
            soup = BeautifulSoup(page.text, 'html.parser')
            table = soup.find('div', {'id': 'body-center-main'}).find('table')
            count = 0
            trow = table.find('thead').next_sibling
            while trow is not None and count < 20:
                player_data = trow.find('div', {'class': 'player'})
                player_name = unidecode.unidecode(
                    player_data.find('a', {'class': 'player-text'}).text)
                player_id = player_data.find(
                    'a', {'class': 'player-text'})['href'].split('/')[-1].split('-')[-1]
                player_url = 'https://www.fleaflicker.com' + player_data.find('a')['href']
                player_elig = player_data.find('span', {'class': 'position'}).text
                player_team = player_data.find('span', {'class': 'player-team'}).text
                players.append([player_id, player_name, player_url, player_team, player_elig])
                trow = trow.next_sibling
                count = count + 1
    df_players = pd.DataFrame(
        players, columns=['ff_id', 'ff_name', 'ff_url', 'ff_team', 'ff_elig'])
    df_players.drop_duplicates(subset=['ff_id'], inplace=True, ignore_index=True)

    # TRANSFORM
    def combine_eligibilities(row):
        ff_elig_list = row['ff_elig'].split('/')
        eligibilities = []
        # Infielders
        for pos in ['C', '1B', '2B', 'SS', '3B']:
            if pos in ff_elig_list:
                eligibilities.append(pos)
        if '2B' in eligibilities or 'SS' in eligibilities:
            eligibilities.append('MI')
        if '1B' in eligibilities or '3B' in eligibilities:
            eligibilities.append('CI')
        if 'MI' in eligibilities or 'CI' in eligibilities:
            eligibilities.append('IF')
        # Outfielders all collapse to OF
        for pos in ['OF', 'RF', 'LF', 'CF']:
            if pos in ff_elig_list and 'OF' not in eligibilities:
                eligibilities.append('OF')
        # Pitchers - just use the same as FF
        if 'SP' in ff_elig_list or 'RP' in ff_elig_list or 'P' in ff_elig_list:
            eligibilities = ff_elig_list
        # Concatenate into a string and return
        elig = ' '.join(eligibilities).strip()
        if elig == '':
            elig = 'UT'
        return elig

    df_players['elig'] = df_players.apply(
        lambda row: combine_eligibilities(row), axis=1)

    names = player_names.get_player_names()
    df_players = df_players.merge(right=names[['ff_id', 'fg_id']], how='left', on='ff_id')

    # LOAD
    bbdb = postgres.connect_to_bbdb()
    df_players.to_sql('player_pool_ff', con=bbdb, schema='reference',
                      if_exists='replace', chunksize=1000, method='multi', index=False)
    print('Uploaded FleaFlicker player pool')
    return df_players
def find_other_ids_w_yahoo(yahoo_id):
    import sys
    sys.path.append('python/general')
    import postgres
    sys.path.append('python/munging')
    import player_names
    import pandas as pd

    bbdb = postgres.connect_to_bbdb()
    yahoo_sql = ('SELECT yahoo_id, yahoo_name, yahoo_team, yahoo_elig, fg_id '
                 'FROM reference.player_pool_yahoo WHERE yahoo_id=\'' + str(yahoo_id) + '\'')
    yahoo_info = pd.read_sql_query(yahoo_sql, con=bbdb)
    if len(yahoo_info) == 0:
        print('This yahoo_id is not in the Yahoo player pool. '
              'Please rerun the player pool generator')
        return False

    yahoo_name = yahoo_info['yahoo_name'].to_list()[0]
    print('Here is the Yahoo player pool info available on ' + yahoo_name + ':')
    print(yahoo_info)

    names = player_names.get_player_names()
    # If the name is already in the list of player names:
    if yahoo_name in names['name'].to_list():
        matches = names[names['name'] == yahoo_name]
        if len(matches) == 1:
            print('Found a match!')
            print('FG info:')
            fg_id = matches['fg_id'].to_list()[0]
            fg_info = get_fg_info(fg_id)
            print(fg_info)
            perform_merge = input('Do you want to merge in the Yahoo ID into the existing match? (Y/N) ')
            if perform_merge == 'Y':
                sql_update = ('UPDATE reference.player_names SET yahoo_id = \''
                              + str(yahoo_id) + '\' WHERE fg_id=\'' + str(fg_id) + '\'')
                print(sql_update)
                bbdb.execute(sql_update)
                player_names.push_player_names_to_gs()
            else:
                print('OK, won\'t update')
        elif len(matches) > 1:
            print('There is more than one match. Please manually update. List of matches:')
            print(matches)
    else:
        # If it's not already in the list of player names, see if there is a
        # match in the raw FG projections
        yahoo_sql = (
            'SELECT name, fg_id FROM '
            '(SELECT "Name" as name, playerid as fg_id from proj.fg_dc_batters_raw '
            'UNION '
            'SELECT "Name" as name, playerid as fg_id from proj.fg_dc_pitchers_raw '
            ') fg_raw_proj_union '
            'WHERE fg_id NOT IN (SELECT fg_id FROM reference.player_names) ORDER BY name')
        yahoo_info = pd.read_sql_query(yahoo_sql, con=bbdb)
        if yahoo_name in yahoo_info['name'].to_list():
            matches = yahoo_info[yahoo_info['name'] == yahoo_name]
            if len(matches) == 1:
                print('Found a match!')
                print('FG info:')
                fg_id = matches['fg_id'].to_list()[0]
                fg_info = get_fg_info(fg_id)
                print(fg_info)
                perform_append = input('Do you want to append this to the list of player names? (Y/N) ')
                if perform_append == 'Y':
                    sql_append_new_name = (
                        'INSERT INTO reference.player_names (name, fg_id, yahoo_id) '
                        'VALUES (\'' + fg_info[1] + '\', \'' + str(fg_info[0])
                        + '\', \'' + str(yahoo_id) + '\')')
                    print(sql_append_new_name)
                    bbdb.execute(sql_append_new_name)
                    player_names.push_player_names_to_gs()
                else:
                    print('OK, won\'t update')
        else:
            print('Cannot find an exact name match in the FG projections')
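
# A minimal sketch (assuming bbdb is a SQLAlchemy engine, as connect_to_bbdb
# suggests): the string-built UPDATE above could be parameterized instead,
# which sidesteps quoting and escaping bugs in names and IDs.
def update_yahoo_id(bbdb, fg_id, yahoo_id):
    """Attach a Yahoo ID to an existing player_names row."""
    from sqlalchemy import text
    bbdb.execute(
        text('UPDATE reference.player_names SET yahoo_id = :yid WHERE fg_id = :fid'),
        {'yid': str(yahoo_id), 'fid': str(fg_id)})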
def scrape_pod():
    import pandas as pd

    from general import postgres
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()
    names = player_names.get_player_names()

    # -------------------------------
    # EXTRACT
    # -------------------------------
    pod_file_path = '/Users/andrewfelton/Documents/bb/2021/data/pod/pod_projections.xlsx'
    pod_hitters = pd.read_excel(
        pod_file_path,
        sheet_name='Hitter Projections',
        names=[
            'Name', 'fg_id', 'Age', 'Lg', 'TM', 'Position', 'Lineup Spot',
            'G', 'AB', 'PA', 'HITS', '1B', '2B', '3B', 'HR', 'R', 'RBI',
            'BB', 'IBB', 'SO', 'HBP', 'SF', 'SB', 'CS', 'AVG', 'OBP', 'SLG',
            'OPS', 'ISO', 'wOBA', 'BB_PCT', 'K_PCT', 'BABIP', 'GB_PCT',
            'LD_PCT', 'FB_PCT', 'HR_FB'
        ])

    # -------------------------------
    # TRANSFORM
    # -------------------------------
    for c in ['Name', 'fg_id', 'Age', 'Lg', 'TM', 'Position', 'Lineup Spot']:
        pod_hitters[c] = pod_hitters[c].astype(str)
    # Check to confirm that all the fg_id are in the names table
    # (put_missing_in_GS assumed to live in munging.player_names)
    player_names.put_missing_in_GS(pod_hitters[['fg_id']], 'fg_id')

    # -------------------------------
    # LOAD
    # -------------------------------
    bbdb.execute('TRUNCATE TABLE proj.pod_batters_raw')
    pod_hitters.to_sql('pod_batters_raw', bbdb, schema='proj', if_exists='append')
    print('Uploaded pod_batters to the database')

    pod_pitchers = pd.read_excel(
        pod_file_path,
        sheet_name='Pitcher Projections',
        names=[
            'Name', 'fg_id', 'Age', 'Lg', 'TM', 'Role', 'GS', 'IP', 'IP S',
            'W', 'L', 'QS', 'SV', 'ERA', 'H', 'R', 'ER', 'HR', 'SO', 'BB',
            'HBP', 'K_9', 'BB_9', 'HR_9', 'K_PCT', 'BB_PCT', 'BAA', 'WHIP',
            'BABIP', 'LOB_PCT', 'GB_PCT', 'LD_PCT', 'FB_PCT', 'HR_FB'
        ])
    for c in ['Name', 'fg_id']:
        pod_pitchers[c] = pod_pitchers[c].astype(str)
    # Check to confirm that all the fg_id are in the names table
    player_names.put_missing_in_GS(pod_pitchers[['fg_id']], 'fg_id')

    bbdb.execute('TRUNCATE TABLE proj.pod_pitchers_raw')
    pod_pitchers.to_sql('pod_pitchers_raw', bbdb, schema='proj', if_exists='append')
    print('Uploaded pod_pitchers to the database')
def create_combined_pitchers(ls):
    import pandas as pd

    from munging import player_names
    from general import postgres, utilities

    bbdb = postgres.connect_to_bbdb()

    query = ('SELECT \'razz\' as source, fg_id, ip, qs, era, whip, k as so, sv, hld '
             'FROM proj.razz_pitchers')
    df_razz = pd.read_sql_query(query, bbdb)
    df_razz['svhld'] = df_razz['sv'] + df_razz['hld']

    query = ('SELECT \'fg_dc\' as source, fg_id, ip, qs, era, whip, so, sv, hld '
             'FROM proj.fg_dc_pitchers ')
    df_fg_dc = pd.read_sql_query(query, bbdb)
    df_fg_dc['qs'] = df_fg_dc['qs'].replace({0: None})
    df_fg_dc['svhld'] = df_fg_dc['sv'] + df_fg_dc['hld']

    df = pd.concat([df_razz, df_fg_dc])
    df_ip = df[['source', 'fg_id', 'ip']]

    query_teams = ('SELECT playerid as fg_id, fg_dc_pitchers_raw."Team" as team '
                   'FROM proj.fg_dc_pitchers_raw')
    df_teams = pd.read_sql_query(query_teams, bbdb)

    # Treat missing counting stats as zero
    for var in ls.pitching_counting_stats:
        df[var] = df.apply(lambda row: 0 if pd.isna(row[var]) else row[var], axis=1)

    # System weights for the overall stat blend ...
    weights = pd.DataFrame({
        'system': ['fg_dc', 'thebat', 'thebatx', 'pod', 'razz'],
        'sys_weight': [1, 1, 1.2, 0, .01]
    })
    df = df.merge(right=weights, how='left', left_on='source', right_on='system')

    # ... and separate weights for projected IP
    weights_ip = pd.DataFrame({
        'system': ['fg_dc', 'thebat', 'thebatx', 'pod', 'razz'],
        'sys_weight': [.25, 0, 0, 0, .01]
    })
    df_ip = df_ip.merge(right=weights_ip, how='left', left_on='source', right_on='system')

    def weighted_average(df, data_col, weight_col, by_col):
        # Weighted mean per group; null data points contribute no weight
        df['_data_times_weight'] = df[data_col] * df[weight_col]
        df['_weight_where_notnull'] = df[weight_col] * pd.notnull(df[data_col])
        g = df.groupby(by_col)
        result = g['_data_times_weight'].sum() / g['_weight_where_notnull'].sum()
        del df['_data_times_weight'], df['_weight_where_notnull']
        return pd.DataFrame(result, columns=[data_col])

    # fg_dc does not project QS; exclude it from the QS blend
    df.loc[df['source'] == 'fg_dc', 'qs'] = None

    combined_pitchers = pd.DataFrame(df_ip['fg_id'].unique(), columns=['fg_id'])
    # list(set(...)) to get unique values b/c ip may be in there twice
    statlist = list(set(utilities.flatten([['ip'], ls.pitching_stats])))
    for stat in statlist:
        t = weighted_average(df, stat, 'sys_weight', 'fg_id')
        combined_pitchers = combined_pitchers.merge(t, on='fg_id')

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_pitchers = combined_pitchers.merge(names[['fg_id', 'name']],
                                                on='fg_id', how='left')
    combined_pitchers = combined_pitchers.merge(df_teams, on='fg_id', how='left')
    output_stats = ['fg_id', 'name', 'team', 'ip']
    for stat in ls.pitching_stats:
        if stat not in output_stats:
            output_stats.append(stat)
    combined_pitchers = combined_pitchers[output_stats]
    return combined_pitchers
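
# Toy illustration (made-up numbers) of the blend weighted_average performs
# inside create_combined_pitchers: two systems project the same pitcher, and
# the system weights set the mix, e.g. (3.00*1 + 4.00*3) / (1 + 3) = 3.75.
def _demo_weighted_average():
    import pandas as pd
    toy = pd.DataFrame({'fg_id': ['x1', 'x1'],
                        'era': [3.00, 4.00],
                        'sys_weight': [1.0, 3.0]})
    blended = ((toy['era'] * toy['sys_weight']).groupby(toy['fg_id']).sum()
               / toy['sys_weight'].groupby(toy['fg_id']).sum())
    print(blended)  # fg_id x1 -> 3.75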
def create_combined_hitters(ls, pa=0):
    import pandas as pd

    from general import utilities
    from general import postgres
    from munging import player_names

    bbdb = postgres.connect_to_bbdb()

    # Per-PA rates and rate stats from each projection system
    query = (
        'SELECT proj.* FROM ('
        'SELECT \'fg_dc\' as source, fg_id, pa, hr_pa, r_pa, rbi_pa, sb_pa, obp, ops '
        'FROM proj.fg_dc_batters '
        'UNION '
        'SELECT \'thebat\' as source, fg_id, pa, hr_pa, r_pa, rbi_pa, sb_pa, obp, ops '
        'FROM proj.thebat_batters '
        'UNION '
        'SELECT \'thebatx\' as source, fg_id, pa, hr_pa, r_pa, rbi_pa, sb_pa, obp, ops '
        'FROM proj.thebatx_batters '
        'UNION '
        'SELECT \'pod\' as source, fg_id, pa, hr_pa, r_pa, rbi_pa, sb_pa, obp, ops '
        'FROM proj.pod_batters '
        ') AS proj')
    df = pd.read_sql_query(query, bbdb)

    # Projected PA comes from fg_dc only
    query_pa = (
        'SELECT proj.* FROM ('
        'SELECT \'fg_dc\' as source, fg_id, pa '
        'FROM proj.fg_dc_batters '
        #'UNION '
        #'SELECT \'pod\' as source, fg_id, pa '
        #'FROM proj.pod_batters '
        ') AS proj')
    df_pa = pd.read_sql_query(query_pa, bbdb)
    df_pa.loc[df_pa['fg_id'] == 'sa3011918', 'fg_id'] = '27506'

    query_teams = ('SELECT playerid as fg_id, fg_dc_batters_raw."Team" as team '
                   'FROM proj.fg_dc_batters_raw')
    df_teams = pd.read_sql_query(query_teams, bbdb)
    df_teams.loc[df_teams['fg_id'] == 'sa3011918', 'fg_id'] = '27506'

    # System weights for the stat blend ...
    weights = pd.DataFrame({
        'system': ['fg_dc', 'thebat', 'thebatx', 'pod'],
        'sys_weight': [1, 1, 1.2, .6]
    })
    df = df.merge(right=weights, how='left', left_on='source', right_on='system')

    # ... and for the PA blend
    weights_pa = pd.DataFrame({'system': ['fg_dc', 'pod'], 'sys_weight': [1, 0]})
    df_pa = df_pa.merge(right=weights_pa, how='left', left_on='source', right_on='system')

    def weighted_average(df, data_col, weight_col, by_col):
        # Weighted mean per group; null data points contribute no weight
        df['_data_times_weight'] = df[data_col] * df[weight_col]
        df['_weight_where_notnull'] = df[weight_col] * pd.notnull(df[data_col])
        g = df.groupby(by_col)
        result = g['_data_times_weight'].sum() / g['_weight_where_notnull'].sum()
        del df['_data_times_weight'], df['_weight_where_notnull']
        return pd.DataFrame(result, columns=[data_col])

    combined_hitters = pd.DataFrame(df_pa['fg_id'].unique(), columns=['fg_id'])
    for stat in ['pa']:
        t = weighted_average(df_pa, stat, 'sys_weight', 'fg_id')
        combined_hitters = combined_hitters.merge(t, on='fg_id')
    if pa > 0:
        combined_hitters['pa'] = pa  # caller-specified PA overrides the blend

    # Blend the per-PA rates, then scale counting stats back up by PA
    stats_pa = [stat + '_pa' for stat in ls.hitting_counting_stats]
    for stat in utilities.flatten([stats_pa, ls.hitting_rate_stats]):
        t = weighted_average(df, stat, 'sys_weight', 'fg_id')
        combined_hitters = combined_hitters.merge(t, on='fg_id')
    for stat in ls.hitting_counting_stats:
        stat_pa = stat + '_pa'
        combined_hitters[stat] = combined_hitters[stat_pa] * combined_hitters['pa']
        combined_hitters = combined_hitters.drop(columns=[stat_pa])

    # merge in the names and reorder
    names = player_names.get_player_names()
    combined_hitters = combined_hitters.merge(names[['fg_id', 'name']],
                                              on='fg_id', how='left')
    combined_hitters = combined_hitters.merge(df_teams, on='fg_id', how='left')
    output_stats = utilities.flatten([['fg_id', 'name', 'team', 'pa'],
                                      [ls.hitting_stats]])
    combined_hitters = combined_hitters[output_stats]
    return combined_hitters
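
# Worked example (made-up numbers) of the per-PA recombination above: a
# blended hr_pa of 0.05 and a blended 600 PA give hr = 0.05 * 600 = 30.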