def rushing(self, player_link, year, **kwargs):
    """Scrape one season of a player's rushing gamelog into a DataFrame.

    The raw gamelog table layout varies between players; each known layout
    is identified by the md5 of its top-level column names and routed to a
    matching ``RushHash`` parser method.
    """
    # Swap the trailing '.htm' of the player url for the per-season
    # gamelog suffix.
    log_url = player_link[:-4] + '/gamelog/%s/' % year
    html = Loader.Loader().load_page(log_url).content.decode()

    # Bio details scraped from the page header; merged into the table below.
    gen = PlayerParser.PlayerParser().parse_general_info(html)

    # The first table on the page is the gamelog.
    df = pd.read_html(html)[0]

    # Fingerprint the column layout so we can pick the right parser.
    which_cols = hashlib.md5(
        json.dumps(list(df.columns.levels[0])).encode()).hexdigest()

    # Each known layout hash maps to a RushHash method named 'md5' + hash;
    # dict dispatch beats a long if/else chain.
    layouts = [
        'c3695be2dd2fa9307301dccf047b4e86',
        '7f97f3885d50fcf9b92797810856a89f',
        'aa321161d6f3f5230259dbc4ae67299a',
        '9c11c15180efbf7aec4300fc190cd3a5',
        'ad9a12e06546e3019128fec57cdc9d0e',
        '00f83a7c4b3e891e3c448db700cc9ada',
        '5980508dab2f61013bd07809c5ca0e41',
        'c35b37a5f0f696bfd1576753faffe81c',
        'aed81e3e77b9842532b5efa73458a259',
        '7d21a9a4ab9adde626d633fbd62db5c0',
        '91138c3c08c339b71b8323e2bac3aac7',
        'ddcb0610869ff21799f008209ac6d229',
    ]
    dispatch = {h: getattr(Rushhash.RushHash(), 'md5' + h) for h in layouts}
    df = dispatch[which_cols](df)

    # Shared per-season cleanup (dates, home/away, etc.).
    df = self.common(df, year)

    # Attach the scraped bio columns to every row.
    df.loc[:, 'Name'] = gen['name']
    df.loc[:, 'Pos'] = gen['position']
    for col, key in (('Throws', 'throws'), ('Height', 'height'),
                     ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                     ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                     ('College', 'college')):
        df[col] = gen[key]

    # Fix the column ordering: bio first, then the stat groups.
    rh = Rushhash.RushHash()
    ordered = (['Name', 'Pos', 'Height', 'Weight', 'DOB_mo', 'DOB_day',
                'DOB_yr', 'College'] + rh.base[1:] + ['PF', 'PA'] +
               rh.receiving + rh.rushing + rh.kick_rt + rh.punt_rt +
               rh.scoring2p + rh.scoring)
    return df[ordered]
def player_links(self, start, *args):
    """Collect absolute player-page urls for one season or a range.

    Args:
        start: first (or only) season year; int or numeric string.
        *args: optionally one end year; when ``args[0] > start`` every
            season from ``start`` through ``args[0]`` inclusive is scraped.

    Returns:
        List of absolute player urls. De-duplicated (first-seen order
        kept) when a range of years is scraped.
    """
    def _scrape_year(year):
        # One positions page per season; self.url is a %-template
        # expecting the year.
        content = Loader.Loader().load_page(self.url % year).text
        # Pull the relative player-page hrefs out of the roster table.
        players = re.compile(
            'data-append-csv=".*?" data-stat="player" csk=".*?" ><a href="(\/players\/[a-zA-Z]\/.*?.htm)"'
        ).findall(content)
        # The hrefs are relative; prefix the site root.
        return ['https://www.pro-football-reference.com' + p
                for p in players]

    # BUG FIX: the original tested `int(start < int(args[0]))`, casting the
    # *boolean* instead of `start` — a string `start` raised TypeError and
    # `end + 1` crashed on a string end year. Cast both operands instead.
    if args and int(start) < int(args[0]):
        links = []
        for year in range(int(start), int(args[0]) + 1):
            links.extend(_scrape_year(year))
        # The same player appears in several seasons; drop duplicates
        # while preserving first-seen order.
        links = list(dict.fromkeys(links))
    else:
        links = _scrape_year(start)
    return links
def defense(self, player_link, year, **kwargs):
    """Scrape one season of a defensive player's gamelog into a DataFrame.

    Table layouts vary by player; the md5 of the top-level column names
    selects the matching ``DefHash`` parser method.
    """
    # Swap the trailing '.htm' for the per-season gamelog suffix.
    log_url = player_link[:-4] + '/gamelog/%s/' % year
    html = Loader.Loader().load_page(log_url).content.decode()

    # Bio details scraped from the page header; merged into the table below.
    gen = PlayerParser.PlayerParser().parse_general_info(html)

    # The first table on the page is the gamelog.
    df = pd.read_html(html)[0]

    # Fingerprint the column layout so we can pick the right parser.
    which_cols = hashlib.md5(
        json.dumps(list(df.columns.levels[0])).encode()).hexdigest()

    # Dict dispatch of layout hash -> DefHash parser ('md5' + hash).
    layouts = [
        '0c329a15f241e5c132d0d5c7612032c0',
        '58ffdd172c2358c5e5ab2e0a1994252a',
        '141f3f6945aa9495c6580650649f4b8f',
        '109394668745222b0ccbd92bfd0ac4c1',
        '60dfaf4e946c4ae3d47c6d8b430c92a4',
        'fa476dd5c907f86452c016e54b3fe0f8',
    ]
    dispatch = {h: getattr(Defhash.DefHash(), 'md5' + h) for h in layouts}
    df = dispatch[which_cols](df)

    # Shared per-season cleanup.
    df = self.common(df, year)

    # Attach the scraped bio columns to every row.
    df.loc[:, 'Name'] = gen['name']
    df.loc[:, 'Pos'] = gen['position']
    for col, key in (('Throws', 'throws'), ('Height', 'height'),
                     ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                     ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                     ('College', 'college')):
        df[col] = gen[key]

    # Fix the column ordering: bio first, then the stat groups.
    dh = Defhash.DefHash()
    ordered = (['Name', 'Pos', 'Height', 'Weight', 'DOB_mo', 'DOB_day',
                'DOB_yr', 'College'] + dh.base[1:] + ['PF', 'PA'] +
               dh.punt_rt + dh.kick_rt + dh.scoring + dh.rush_sk +
               dh.def_int)
    return df[ordered]
def kicking(self, player_link, year, **kwargs):
    """Scrape one season of a kicker's gamelog into a DataFrame.

    Table layouts vary by player; the md5 of the top-level column names
    selects the matching ``KickHash`` parser method.
    """
    # Swap the trailing '.htm' for the per-season gamelog suffix.
    log_url = player_link[:-4] + '/gamelog/%s/' % year
    html = Loader.Loader().load_page(log_url).content.decode()

    # Bio details scraped from the page header; merged into the table below.
    gen = PlayerParser.PlayerParser().parse_general_info(html)

    # The first table on the page is the gamelog.
    df = pd.read_html(html)[0]

    # Fingerprint the column layout so we can pick the right parser.
    which_cols = hashlib.md5(
        json.dumps(list(df.columns.levels[0])).encode()).hexdigest()

    # Dict dispatch of layout hash -> KickHash parser ('md5' + hash).
    layouts = [
        '080683052961d92b5efd07588e614700',
        'c0fe30e42184e7a59c00c04dc917bb87',
        '7ad30bf95e287937864b02dca25801bf',
    ]
    dispatch = {h: getattr(Kickhash.KickHash(), 'md5' + h) for h in layouts}
    df = dispatch[which_cols](df)

    # Shared per-season cleanup.
    df = self.common(df, year)

    # Attach the scraped bio columns to every row.
    df.loc[:, 'Name'] = gen['name']
    df.loc[:, 'Pos'] = gen['position']
    for col, key in (('Throws', 'throws'), ('Height', 'height'),
                     ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                     ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                     ('College', 'college')):
        df[col] = gen[key]

    # Fix the column ordering: bio first, then scoring stats.
    kh = Kickhash.KickHash()
    ordered = (['Name', 'Pos', 'Height', 'Weight', 'DOB_mo', 'DOB_day',
                'DOB_yr', 'College'] + kh.base[1:] + ['PF', 'PA'] +
               kh.scoring)
    return df[ordered]
def passing(self, player_link, year, **kwargs):
    """Scrape one season of a quarterback's gamelog into a DataFrame."""
    # Swap the trailing '.htm' for the per-season gamelog suffix.
    log_url = '%s/gamelog/%s/' % (player_link[:-4], year)
    html = Loader.Loader().load_page(log_url).content.decode()

    # Bio details scraped from the page header.
    gen = PlayerParser.PlayerParser().parse_general_info(html)

    # The first table on the page is the gamelog.
    df = pd.read_html(html)[0]

    # Flatten the two-row header down to its second level.
    df.columns = df.columns.droplevel()

    # Column 5 is the blank home/away marker; give it a real name.
    df = df.rename(columns={df.columns[5]: "Home"})

    # pd.read_html names filler columns 'Unnamed: ...'; drop them all.
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

    # Shared per-season cleanup.
    df = self.common(df, year)

    # Attach the scraped bio columns to every row.
    df.loc[:, 'Name'] = gen['name']
    df.loc[:, 'Pos'] = gen['position']
    for col, key in (('Throws', 'throws'), ('Height', 'height'),
                     ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                     ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                     ('College', 'college')):
        df[col] = gen[key]
    return df
def receiving(self, player_link, year, **kwargs):
    """Scrape one season of a receiver's gamelog into a DataFrame.

    Table layouts vary by player; the md5 of the top-level column names
    selects the matching ``RecHash`` parser method.
    """
    # Swap the trailing '.htm' for the per-season gamelog suffix.
    log_url = player_link[:-4] + '/gamelog/%s/' % year
    html = Loader.Loader().load_page(log_url).content.decode()

    # Bio details scraped from the page header; merged into the table below.
    gen = PlayerParser.PlayerParser().parse_general_info(html)

    # The first table on the page is the gamelog.
    df = pd.read_html(html)[0]

    # Fingerprint the column layout so we can pick the right parser.
    which_cols = hashlib.md5(
        json.dumps(list(df.columns.levels[0])).encode()).hexdigest()

    # Dict dispatch of layout hash -> RecHash parser ('md5' + hash).
    layouts = [
        'b3c4237d9a10de8cfaad61852cb552c4',
        'bcb96297b50fb2120f475e8e05fbabcd',
        '4560c290b45e942c16cc6d7811345fce',
        '4c82a489ec5b2c943e78c9018dcbbca1',
        'e8ffc7202223bb253e92da83b76e9944',
        '50fcceaa170b1a1e501e3f40548e403d',
        'e160e714b29305ecfecf513cbf84b80f',
        '111e8480632f73642d7e20acbdbe6b16',
        'adc05c5af0f88775d3605d02c831c0ed',
        'bfbf86ae0485a0a70692ae04124449b9',
        '6b4698269dd34a823cf6b233c6165614',
        '7f97f3885d50fcf9b92797810856a89f',
        'aa321161d6f3f5230259dbc4ae67299a',
        '1193d47266d4acdcf1b6fca165121100',
        '52589e869a13d76c6d0dbf066cab536f',
        'd522b9357244c20714a3b21f8f404918',
    ]
    dispatch = {h: getattr(Rechash.RecHash(), 'md5' + h) for h in layouts}
    df = dispatch[which_cols](df)

    # Shared per-season cleanup.
    df = self.common(df, year)

    # Attach the scraped bio columns to every row.
    df.loc[:, 'Name'] = gen['name']
    df.loc[:, 'Pos'] = gen['position']
    for col, key in (('Throws', 'throws'), ('Height', 'height'),
                     ('Weight', 'weight'), ('DOB_mo', 'bday_mo'),
                     ('DOB_day', 'bday_day'), ('DOB_yr', 'bday_yr'),
                     ('College', 'college')):
        df[col] = gen[key]

    # Fix the column ordering: bio first, then the stat groups.
    rh = Rechash.RecHash()
    ordered = (['Name', 'Pos', 'Height', 'Weight', 'DOB_mo', 'DOB_day',
                'DOB_yr', 'College'] + rh.base[1:] + ['PF', 'PA'] +
               rh.receiving + rh.rushing + rh.kick_rt + rh.punt_rt +
               rh.scoring2p + rh.scoring)
    return df[ordered]
def receiving(self, url=None, **kwargs):
    """Parse a WR/TE career-stats page into a tidy per-season DataFrame.

    Args:
        url: player-page url to fetch. When omitted, raw html may be
            supplied via ``kwargs['html']`` (used by tests).

    Returns:
        pd.DataFrame of per-season receiving/rushing stats; RB/FB pages
        are delegated to ``self.rushing``; anything else yields an empty
        DataFrame.
    """
    # We generally pass in a url; for testing, html can be injected.
    if url:
        html = Loader.Loader().load_page(url).text
    else:
        html = kwargs.get('html')

    # General bio info scraped from the page header.
    general_stats = self.parse_general_info(html)

    # WR and TE pages share a table layout; RB/FB pages are shaped for
    # the rushing parser, everything else is skipped.
    parseable_positions = ['TE', 'WR']
    if not any(x in general_stats['position'] for x in parseable_positions):
        if any(x in general_stats['position'] for x in ['RB', 'FB']):
            print(url, " is a ", general_stats['position'],
                  " calling rushing method instead")
            df = self.rushing(url)
        else:
            print(
                url,
                " is not a receiver we can parse so we're skipping this player"
            )
            return pd.DataFrame()
    else:
        df = pd.read_html(html)[0]
        # Only the first 27 columns belong to the stats table.
        df = df.iloc[:, :27]
        # Flat names replacing the original multi-row header.
        cols = [
            'Year', 'Age', 'Tm', 'Pos', 'No', 'G', 'GS', 'Tgt', 'Rec',
            'Rec_Yds', 'Y/R', 'Rec_TD', 'Rec_Lng', 'R/G', 'Rec_Y/G',
            'Ctch%', 'Rush', 'Rush_Yds', 'Rush_TD', 'Rush_Lng', 'Y/A',
            'Rush_Y/G', 'A/G', 'YScm', 'RRTD', 'Fmb', 'AV'
        ]
        try:
            df.columns = cols
        except ValueError:
            print('Column mismatch, check url: ', url,
                  'skipping and returning blank DF')
            return pd.DataFrame()

        # Drop the career-totals row (its Age cell isn't numeric).
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
        df = df[~df['Age'].isna()]

        # Strip the probowl/all-pro markers ('+', '*') from the year.
        # FIX: regex=False — modern pandas no longer treats single-char
        # patterns as literals, so '+'/'*' would be invalid regexes.
        df['Year'] = df['Year'].str.replace('+', '', regex=False)
        df['Year'] = df['Year'].str.replace('*', '', regex=False)

        # Multi-team seasons add per-team rows without a year; drop them.
        df['Year'] = df['Year'].astype(str)
        df = df[df.Year != 'nan']
        df['Year'] = pd.to_numeric(df['Year'])

        # Ctch% arrives as e.g. '62.5%'; strip the sign and coerce to
        # float (blanks become NaN via errors='coerce'). The original
        # fillna('') after astype(str) was a no-op and is removed.
        df['Ctch%'] = df['Ctch%'].astype(str)
        df['Ctch%'] = df['Ctch%'].str.replace('%', '', regex=False)
        df['Ctch%'] = pd.to_numeric(df['Ctch%'], errors='coerce')

        # Uppercase some qualitatives.
        df['Tm'] = df['Tm'].str.upper()

        # Insert general scraped info from the player page.
        df['Name'] = general_stats['name']
        df['Throws'] = general_stats['throws']
        df['Height'] = general_stats['height']
        df['Weight'] = general_stats['weight']
        df['DOB_mo'] = general_stats['bday_mo']
        df['DOB_day'] = general_stats['bday_day']
        df['DOB_yr'] = general_stats['bday_yr']
        df['College'] = general_stats['college']

        # Position info isn't always present in every row; backfill from
        # the page header when any cell is missing.
        if df['Pos'].isnull().values.any():
            df['Pos'] = general_stats['position']
        df['Pos'] = df['Pos'].str.upper()

        # Rearrange the columns: bio first, then stats.
        df = df[[
            'Name', 'Year', 'Age', 'Throws', 'Height', 'Weight', 'DOB_mo',
            'DOB_day', 'DOB_yr', 'College', 'Tm', 'Pos', 'No', 'G', 'GS',
            'Tgt', 'Rec', 'Rec_Yds', 'Y/R', 'Rec_TD', 'Rec_Lng', 'R/G',
            'Rec_Y/G', 'Ctch%', 'Rush', 'Rush_Yds', 'Rush_TD', 'Rush_Lng',
            'Y/A', 'Rush_Y/G', 'A/G', 'YScm', 'RRTD', 'Fmb', 'AV'
        ]]
    return df
def kicking(self, url=None, **kwargs):
    """Parse a K/P career-stats page into a tidy per-season DataFrame.

    Args:
        url: player-page url to fetch. When omitted, raw html may be
            supplied via ``kwargs['html']`` (used by tests).

    Returns:
        pd.DataFrame of per-season kicking/punting stats, or an empty
        DataFrame when the position can't be parsed or the table layout
        doesn't match.
    """
    # We generally pass in a url; for testing, html can be injected.
    if url:
        html = Loader.Loader().load_page(url).text
    else:
        html = kwargs.get('html')

    # General bio info scraped from the page header.
    general_stats = self.parse_general_info(html)

    # Only kickers and punters share this table layout.
    parseable_positions = ['K', 'P']
    if not any(x in general_stats['position'] for x in parseable_positions):
        print(url, " is not a kicker we can parse so we're skipping this player")
        return pd.DataFrame()
    else:
        df = pd.read_html(html)[0]
        # Sometimes there are unneeded trailing columns.
        df = df.iloc[:, :30]
        # Flat names replacing the original multi-row header.
        # NOTE(review): 'Lng' appears twice (FG long and punt long) — the
        # duplicate label is kept to preserve the existing output schema.
        cols = [
            'Year', 'Age', 'Tm', 'Pos', 'No.', 'G', 'GS', '0-19FGA',
            '0-19FGM', '20-29FGA', '20-29FGM', '30-39FGA', '30-39FGM',
            '40-49FGA', '40-49FGM', '50+FGA', '50+FGM', 'scr_FGA',
            'scr_FGM', 'Lng', 'scr_FG%', 'scr_XPA', 'scr_XPM', 'scr_XP%',
            'Pnt', 'Yds', 'Lng', 'Blck', 'Y/P', 'AV'
        ]
        try:
            df.columns = cols
        except ValueError:
            # FIX: previously this only printed and fell through, after
            # which every column access below crashed on the unrenamed
            # table. Bail out with a blank frame like the sibling parsers.
            print('Column mismatch, check url: ', url)
            return pd.DataFrame()

        # Drop the career-totals row (its Age cell isn't numeric).
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
        df = df[~df['Age'].isna()]

        # Strip the probowl/all-pro markers ('+', '*') from the year.
        # FIX: regex=False — modern pandas no longer treats single-char
        # patterns as literals, so '+'/'*' would be invalid regexes.
        df['Year'] = df['Year'].astype(str)
        df['Year'] = df['Year'].str.replace('+', '', regex=False)
        df['Year'] = df['Year'].str.replace('*', '', regex=False)

        # Multi-team seasons add per-team rows without a year; drop them.
        df = df[df.Year != 'nan']
        df['Year'] = pd.to_numeric(df['Year'])

        # Uppercase some qualitatives.
        df['Tm'] = df['Tm'].str.upper()

        # Insert general scraped info from the player page.
        df['Name'] = general_stats['name']
        df['Throws'] = general_stats['throws']
        df['Height'] = general_stats['height']
        df['Weight'] = general_stats['weight']
        df['DOB_mo'] = general_stats['bday_mo']
        df['DOB_day'] = general_stats['bday_day']
        df['DOB_yr'] = general_stats['bday_yr']
        df['College'] = general_stats['college']

        # Position info isn't always present in every row; backfill from
        # the page header when any cell is missing.
        if df['Pos'].isnull().values.any():
            df['Pos'] = general_stats['position']

        # Rearrange the columns: bio first, then stats.
        df = df[[
            'Name', 'Year', 'Age', 'Throws', 'Height', 'Weight', 'DOB_mo',
            'DOB_day', 'DOB_yr', 'College', 'Tm', 'Pos', 'No.', 'G', 'GS',
            '0-19FGA', '0-19FGM', '20-29FGA', '20-29FGM', '30-39FGA',
            '30-39FGM', '40-49FGA', '40-49FGM', '50+FGA', '50+FGM',
            'scr_FGA', 'scr_FGM', 'Lng', 'scr_FG%', 'scr_XPA', 'scr_XPM',
            'scr_XP%', 'Pnt', 'Yds', 'Lng', 'Blck', 'Y/P', 'AV'
        ]]
        return df
def defense(self, url=None, **kwargs):
    """Parse a defensive player's career-stats page into a DataFrame.

    Args:
        url: player-page url to fetch. When omitted, raw html may be
            supplied via ``kwargs['html']`` (used by tests).

    Returns:
        pd.DataFrame of per-season defensive stats, or an empty DataFrame
        when the table layout doesn't match the expected 22 columns.
    """
    # We generally pass in a url; for testing, html can be injected.
    if url:
        html = Loader.Loader().load_page(url).text
    else:
        html = kwargs.get('html')

    # General bio info scraped from the page header.
    general_stats = self.parse_general_info(html)

    df = pd.read_html(html)[0]
    # Only the first 22 columns belong to the defense table.
    df = df.iloc[:, :22]
    # Flat names replacing the original multi-row header.
    cols = [
        'Year', 'Age', 'Tm', 'Pos', 'No.', 'G', 'GS', 'Int', 'Yds', 'TD',
        'Lng', 'PD', 'FF', 'Fmb', 'FR', 'Fmb_Yds', 'Fmb_TD', 'Sk', 'Tkl',
        'Ast', 'Sfty', 'AV'
    ]
    try:
        df.columns = cols
    except ValueError:
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and unrelated bugs; only a length mismatch raises ValueError here.
        print('Column mismatch, check url: ', url,
              'skipping and returning blank DF')
        return pd.DataFrame()

    # Drop the career-totals row (its Age cell isn't numeric).
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
    df = df[~df['Age'].isna()]

    # Multi-team seasons add per-team rows without a year; drop them.
    # FIX: cast to str first (as the sibling parsers do) so NaN years
    # actually become 'nan' and get filtered out.
    df['Year'] = df['Year'].astype(str)
    df = df[df.Year != 'nan']

    # Strip the probowl/all-pro markers ('+', '*') from the year.
    # FIX: regex=False — modern pandas no longer treats single-char
    # patterns as literals, so '+'/'*' would be invalid regexes.
    df['Year'] = df['Year'].str.replace('+', '', regex=False)
    df['Year'] = df['Year'].str.replace('*', '', regex=False)
    df['Year'] = pd.to_numeric(df['Year'])

    # Uppercase some qualitatives.
    df['Tm'] = df['Tm'].str.upper()

    # Insert general scraped info from the player page.
    df['Name'] = general_stats['name']
    df['Throws'] = general_stats['throws']
    df['Height'] = general_stats['height']
    df['Weight'] = general_stats['weight']
    df['DOB_mo'] = general_stats['bday_mo']
    df['DOB_day'] = general_stats['bday_day']
    df['DOB_yr'] = general_stats['bday_yr']
    df['College'] = general_stats['college']

    # Position info isn't always present in every row; backfill from the
    # page header when any cell is missing.
    if df['Pos'].isnull().values.any():
        df['Pos'] = general_stats['position']
    df['Pos'] = df['Pos'].str.upper()

    # Rearrange the columns: bio first, then stats.
    df = df[[
        'Name', 'Year', 'Age', 'Throws', 'Height', 'Weight', 'DOB_mo',
        'DOB_day', 'DOB_yr', 'College', 'Tm', 'Pos', 'No.', 'G', 'GS',
        'Int', 'Yds', 'TD', 'Lng', 'PD', 'FF', 'Fmb', 'FR', 'Fmb_Yds',
        'Fmb_TD', 'Sk', 'Tkl', 'Ast', 'Sfty', 'AV'
    ]]
    return df
def passing(self, url=None, **kwargs):
    """Parse a QB career-stats page; merge in rushing/receiving stats.

    The passing table is parsed directly; the rushing-and-receiving table
    is only present inside an HTML comment (rendered client-side by JS),
    so it is extracted from the page comments and left-merged in.

    Args:
        url: player-page url to fetch. When omitted, raw html may be
            supplied via kwargs['html'] (used by tests).

    Returns:
        pd.DataFrame of per-season passing stats (plus rushing/receiving
        columns when found), or an empty DataFrame for non-QB pages.
    """
    # We generally pass in a url and then load the page; for testing the
    # function allow html to be passed in.
    if url:
        response = Loader.Loader().load_page(url)
        html = response.text
    else:
        for k, v in kwargs.items():
            if k == 'html':
                html = v
    # Scrape general bio stats from the page header.
    general_stats = self.parse_general_info(html)
    # Ensure we're only parsing QB's.
    parseablePositions = ['QB']
    if not any(x in general_stats['position'] for x in parseablePositions):
        print(
            url,
            " is not a quarterback we can parse so we're skipping this player"
        )
        return pd.DataFrame()
    else:
        # Load the stats table into a pandas DataFrame. Using 'df' as the
        # variable name to signify it's a pd.DataFrame.
        df = pd.read_html(html)[0]
        # Remove the career-totals row: its Age cell is non-numeric, so
        # coercing to numeric and dropping NaN filters it out.
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
        df = df[~df['Age'].isna()]
        # Remove special characters that are sometimes added to the year
        # to indicate probowl, all-pro etc.
        # NOTE(review): single-char patterns with the pandas default
        # regex=True rely on deprecated behavior — confirm pandas version.
        df['Year'] = df['Year'].str.replace('+', '')
        df['Year'] = df['Year'].str.replace('*', '')
        # Some players have multiple rows without a year if they played on
        # more than one team in that year; drop those.
        df = df[df.Year != 'nan']
        df['Year'] = pd.to_numeric(df['Year'])
        # NOTE(review): no errors='coerce' here — a non-numeric GS cell
        # would raise; confirm against real pages.
        df['GS'] = pd.to_numeric(df['GS'])
        # Insert general scraped info from the player page.
        df['Pos'] = general_stats['position']
        df['Name'] = general_stats['name']
        df['Throws'] = general_stats['throws']
        df['Height'] = general_stats['height']
        df['Weight'] = general_stats['weight']
        df['DOB_mo'] = general_stats['bday_mo']
        df['DOB_day'] = general_stats['bday_day']
        df['DOB_yr'] = general_stats['bday_yr']
        df['College'] = general_stats['college']
        # Uppercase some qualitatives.
        df['Tm'] = df['Tm'].str.upper()
        df['Pos'] = df['Pos'].str.upper()
        # Parse out rushing and receiving information and append it to the
        # passing info.
        soup = BeautifulSoup(html, 'lxml')
        # Flat names for the commented-out rushing-and-receiving table.
        rush_cols = [
            'Year', 'Age', 'Tm', 'Pos', 'No.', 'G', 'GS', 'Rush',
            'Rush_Yds', 'Rush_TD', 'Rush_Lng', 'Rush_Y/A', 'Rush_Y/G',
            'A/G', 'Tgt', 'Rec', 'Rec_Yds', 'Y/R', 'Rec_TD', 'Rec_Lng',
            'R/G', 'Rec_Y/G', 'Ctch%', 'YScm', 'RRTD', 'Fmb'
        ]
        # Track whether we actually found rushing info.
        found = False
        # Rushing info for QBs is commented out unless JS is enabled, so
        # search the HTML comments for the table.
        for comment in soup.findAll(
                text=lambda text: isinstance(text, Comment)):
            if 'id="div_rushing_and_receiving">' in comment:
                new_html = comment
                rush_df = pd.read_html(new_html)[0]
                # Only the first 26 columns belong to the table.
                rush_df = rush_df.iloc[:, :26]
                try:
                    rush_df.columns = rush_cols
                except ValueError:
                    print('Column mismatch, check url: ', url)
                # Munge the columns the same way as the passing table:
                # remove the career-totals row...
                rush_df['Age'] = pd.to_numeric(rush_df['Age'],
                                               errors='coerce')
                rush_df = rush_df[~rush_df['Age'].isna()]
                # ...strip probowl/all-pro markers from the year...
                rush_df['Year'] = rush_df['Year'].str.replace('+', '')
                rush_df['Year'] = rush_df['Year'].str.replace('*', '')
                # ...and drop the year-less multi-team rows.
                rush_df = rush_df[rush_df.Year != 'nan']
                rush_df['Year'] = pd.to_numeric(rush_df['Year'])
                # Position info isn't always contained in every row, so
                # backfill it from the page header.
                rush_df['Pos'] = general_stats['position']
                # Uppercase some qualitatives.
                rush_df['Tm'] = rush_df['Tm'].str.upper()
                rush_df['Pos'] = rush_df['Pos'].str.upper()
                # Record that we have the rushing info we're looking for.
                found = True
        # If we didn't get any rushing info, create an empty frame so the
        # merge below still works (left join keeps all passing rows).
        if not found:
            rush_df = pd.DataFrame(columns=rush_cols)
        # Merge the two DataFrames on their overlapping identity columns.
        combined_df = pd.merge(
            df,
            rush_df,
            on=['Year', 'Age', 'Tm', 'Pos', 'No.', 'G', 'GS'],
            how='left')
        return combined_df
def defense_req():
    """Fetch and return the response for the defense fixture page."""
    # Progress marker for long scrape runs.
    print('loaded defense html')
    loader = Loader.Loader()
    return loader.load_page(urls['defense'])
def kicking_req():
    """Fetch and return the response for the kicking fixture page."""
    # Progress marker for long scrape runs.
    print('loaded kicking html')
    loader = Loader.Loader()
    return loader.load_page(urls['kicking'])
def rushing_req():
    """Fetch and return the response for the rushing fixture page."""
    # Progress marker for long scrape runs.
    print('loaded rushing html')
    loader = Loader.Loader()
    return loader.load_page(urls['rushing'])
def passing_req():
    """Fetch and return the response for the passing fixture page."""
    # Progress marker for long scrape runs.
    print('loaded passing html')
    loader = Loader.Loader()
    return loader.load_page(urls['passing'])