# Import modules and libraries
import scraperfunctions
import scrapersettings
from bs4 import BeautifulSoup

if (scrapersettings.map_players == 1) or (scrapersettings.summary_players == 1) or (scrapersettings.summary_teams == 1):
    print "Generating player mappings and/or summary data for players and/or teams"
    # Grab data
    # Parse our mappings file to get our list of teams
    team_mapping = scraperfunctions.get_team_mappings()
    # Parse the stats table
    player_list = [] # Create an empty list for storing all of our players
    team_stats_total = []
    for value, team in enumerate(team_mapping): # For each team in our dictionary
        if scrapersettings.debugmode == 1:
            print "Processing team " + str(team) + " (" + str(value+1) + " of " + str(len(team_mapping)) + ")"
        roster_url = str(scrapersettings.domain_base) + "/team/" + team + "/stats/" + str(scrapersettings.year_index)
        team_name = team_mapping[team][0]
        try:
            roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the stats page for each team
        except:
            print "Error getting data. Moving on to next team."
            continue
        roster_page_data_soup = BeautifulSoup(roster_page_data, "html.parser")
        stat_grid = roster_page_data_soup.select('#stat_grid')
        # Get player data
        for rowno, row in enumerate(stat_grid[0].find('tbody').findAll('tr')):
            tds = row.findAll('td')
            player_id = tds[1].find('a').get('href').split('=')[-1]
            jersey = str(tds[0].get_text().encode('utf-8').strip())
            name = str(tds[1].find('a').get_text().encode('utf-8').strip())
            year = str(tds[2].get_text().encode('utf-8').strip())
            pos = str(tds[3].get_text().encode('utf-8').strip())
            height = str(tds[4].get_text().encode('utf-8').strip())
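# Note: scraperfunctions.grabber() is called throughout these scripts but its source is not
# shown in this section. The sketch below is a hypothetical minimal version, assuming it is a
# thin HTTP GET wrapper that takes a URL, optional query parameters, and a header dict and
# returns the page HTML; the real helper in scraperfunctions.py may retry, throttle, or
# differ in signature.
import urllib
import urllib2

def grabber(url, params, http_header):
    """Fetch a page and return its raw HTML, sending the configured headers."""
    if params:
        url = url + ("&" if "?" in url else "?") + urllib.urlencode(params)
    request = urllib2.Request(url, headers=http_header)
    return urllib2.urlopen(request).read()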
# Import modules and libraries
import scraperfunctions
import scrapersettings
import re
from bs4 import BeautifulSoup

if (scrapersettings.map_teams == 1):
    print "Generating team mappings"
    # Create the file headings
    team_mappingfile_w = open(scrapersettings.team_mappingfile, "w")
    team_mappingfile_w.writelines("team_id\tteam_name\tteam_url\n")
    # Grab data
    # Download the page with the list of teams
    teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header) # Get data from main page
    teamlist_data_soup = BeautifulSoup(teamlist_data, "html.parser") # Soupify that data
    extractTeamID = scraperfunctions.get_regex_extractTeamID()
    # Create a mapping for teams
    for link in teamlist_data_soup.find_all('a'): # For each hyperlink on the page
        linkMatch = extractTeamID.match(link.get('href')) # Check whether the hyperlink points to a team page
        if linkMatch: # If it does, parse onward
            team_id = linkMatch.group(1) # Get the team ID from the URL
            team_name = str(link.get_text()) # Get the text associated with the hyperlink
            team_url = str(scrapersettings.domain_base + link.get('href')) # Get the URL and append the base domain
            team_mappingfile_w.writelines(str(team_id) + "\t" + str(team_name) + "\t" + str(team_url) + "\n") # Add a line to our TSV file for archival purposes
    print "Successfully generated team mappings"
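# Note: scraperfunctions.get_team_mappings() (used by the scripts that follow) is not shown
# in this section. The sketch below is a hypothetical minimal version, assuming the
# tab-separated layout written above (team_id, team_name, team_url) and the access pattern
# used later (team_mapping[team][0] for the team name, team_mapping[team][1] for the URL).
import csv
import scrapersettings

def get_team_mappings():
    """Read the team mapping TSV back into a dict of {team_id: (team_name, team_url)}."""
    team_mapping = {}
    with open(scrapersettings.team_mappingfile, "r") as mapping_file:
        reader = csv.reader(mapping_file, delimiter="\t")
        next(reader) # Skip the header row
        for row in reader:
            if len(row) >= 3:
                team_mapping[row[0]] = (row[1], row[2])
    return team_mapping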
# Import modules and libraries
import scraperfunctions
import scrapersettings
from bs4 import BeautifulSoup

if (scrapersettings.map_teams == 1):
    print "Generating team mappings"
    # Create the file headings
    team_mappingfile_w = open(scrapersettings.team_mappingfile, "w")
    team_mappingfile_w.writelines("team_id\tteam_name\tteam_url\n")
    # Grab data
    # Download the page with the list of teams
    teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header) # Get data from main page
    teamlist_data_soup = BeautifulSoup(teamlist_data) # Soupify that data
    # Create a mapping for teams
    for link in teamlist_data_soup.find_all('a'): # For each hyperlink on the page
        if "team/index/" + str(scrapersettings.year_index) + "?org_id=" in link.get('href'): # If the hyperlink contains this string (limiting it only to team pages)
            team_id = str(link.get('href').split("team/index/" + str(scrapersettings.year_index) + "?org_id=")[1]) # Get the team ID from the URL
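# Quick illustration of the href split above, using a made-up link and year index;
# the real year_index value comes from scrapersettings.
year_index = 15580 # Hypothetical year index
href = "/team/index/" + str(year_index) + "?org_id=721" # Hypothetical team link
team_id = str(href.split("team/index/" + str(year_index) + "?org_id=")[1])
print team_id # Prints "721"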
# Import modules and libraries
import scraperfunctions
import scrapersettings
import re
from bs4 import BeautifulSoup

summary_team_data_w = open(scrapersettings.summary_team_data, "w")
summary_team_data_w.writelines("team_id\tteam_name\tteam_games\tNet_Rush_Yds\tPass_Yds\tTot_Off\tPlays\tYds_Per_Play\tFirst_Downs_By_Penalty\tPenalties\tPenalties_Per_Game\tPenalties_Yds_Per_Game\tTot_Off_Yards_Per_Game\topp_team_games\topp_Net_Rush_Yds\topp_Pass_Yds\topp_Tot_Off\topp_Plays\topp_Yds_Per_Play\topp_First_Downs_By_Penalty\topp_Penalties\topp_Penalties_Per_Game\topp_Penalties_Yds_Per_Game\topp_Tot_Off_Yards_Per_Game\n")
team_mapping = scraperfunctions.get_team_mappings()
team_stats_total = []
for value, team in enumerate(team_mapping):
    print "Processing team " + str(team) + " (" + str(value + 1) + " of " + str(len(team_mapping)) + ")"
    team_name = team_mapping[team][0]
    url = str(scrapersettings.domain_base) + "/team/" + team + "/stats?id=" + str(scrapersettings.year_index)
    team_mainpage_data = scraperfunctions.grabber(url, scrapersettings.params, scrapersettings.http_header)
    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data)
    # Get the correct URL for the Total Offense stat category ID
    # The Total Offense table includes defensive stats too, because your opponents' totals are
    # the sum of what your defense has allowed per game against you
    link = team_mainpage_data_soup.findAll("a", href=True, text='Total Offense')
    href_url = [x.get("href") for x in link]
    regexp = re.compile("year_stat_category_id=(.*)$")
    cat_id = regexp.search(href_url[0]).group(1)
    # Build the new URL pointing at the Total Offense category
    new_url = url + "&year_stat_category_id=" + cat_id
    team_tot_data = scraperfunctions.grabber(new_url, scrapersettings.params, scrapersettings.http_header)
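# Quick illustration of the category-ID extraction above, using a made-up href; the real
# links on the team stats page may carry different IDs or extra query parameters.
import re

sample_href = "/team/721/stats?id=10440&year_stat_category_id=10700" # Hypothetical link
regexp = re.compile("year_stat_category_id=(.*)$")
cat_id = regexp.search(sample_href).group(1)
print cat_id # Prints "10700"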
# Import modules and libraries
import scraperfunctions
import scrapersettings
from bs4 import BeautifulSoup

# Create the file headings
schedule_mappingfile_w = open(scrapersettings.schedule_mappingfile, "w")
schedule_mappingfile_w.writelines("game_id\thome_team_id\taway_team_id\tdate\tneutral_site\tgame_link\n")
# Grab data
# Parse our mappings file to get our list of teams
team_mapping = scraperfunctions.get_team_mappings()
extractTeamID = scraperfunctions.get_regex_extractTeamID()
# Create the schedule
schedule_list = [] # Create an empty list for storing all of our games
for value, team in enumerate(team_mapping): # For each team in our dictionary
    if scrapersettings.debugmode == 1:
        print "Processing team " + str(team) + " (" + str(value+1) + " of " + str(len(team_mapping)) + ")"
    try:
        team_mainpage_data = scraperfunctions.grabber(team_mapping[team][1], scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team
    except:
        print "Error getting data. Moving on to next team."
        continue
    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data, "html.parser") # Soupify that page
    gamelinks = [] # Create a blank list for this team's games
    for link in team_mainpage_data_soup.find_all('a'): # Locate all links in the document
        if "game/index/" in link.get('href'): # If they contain a URL segment suggesting it is a game...
            game_link = str(scrapersettings.domain_base + link.get('href')).split("?")[0] # Strip out any URL variables since we don't need them
            try:
                opponent_id = str(extractTeamID.match(link.find_previous("td").find_previous("td").find("a").get('href')).group(1))
            except:
                opponent_id = 0
            opponent_text = link.find_previous("td").find_previous("td").get_text().encode('utf-8').strip()
            if "@" in opponent_text: # Checks whether the game is home or away; at a neutral site this distinction may not be accurate (but neutral sites are flagged). Assumes all games against non-D-I/III competition are at home.
                home_team = opponent_id
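# Note: scraperfunctions.get_game_mappings() (used by the game-parsing script below) is not
# shown in this section. The sketch below is a hypothetical minimal version, assuming the
# tab-separated layout written above (game_id, home_team_id, away_team_id, date,
# neutral_site, game_link) and the indexing used later (game_mapping[game][3] for the
# neutral-site flag, game_mapping[game][4] for the game link).
import csv
import scrapersettings

def get_game_mappings():
    """Read the schedule TSV back into {game_id: (home_id, away_id, date, neutral, link)}."""
    game_mapping = {}
    with open(scrapersettings.schedule_mappingfile, "r") as schedule_file:
        reader = csv.reader(schedule_file, delimiter="\t")
        next(reader) # Skip the header row
        for row in reader:
            if len(row) >= 6:
                game_mapping[row[0]] = tuple(row[1:6])
    return game_mapping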
# Import modules and libraries
import scraperfunctions
import scrapersettings
import re
from bs4 import BeautifulSoup

# Parse our schedule file to get a list of games
game_mapping = scraperfunctions.get_game_mappings()
# Parse the stats tables
team_stats_total = [] # Create an empty list for storing the team stats
alphanum = re.compile(r'[^\w\s:]+')
for value, game in enumerate(game_mapping): # For each game in our dictionary
    if scrapersettings.debugmode == 1:
        print "Processing game " + str(game) + " (" + str(value + 1) + " of " + str(len(game_mapping)) + ")"
    game_url = game_mapping[game][4]
    try:
        game_page_data = scraperfunctions.grabber(game_url, scrapersettings.params, scrapersettings.http_header) # Grab the box score page for each game
    except:
        print "Error getting data. Moving on to next game."
        continue
    game_page_data_soup = BeautifulSoup(game_page_data, "html.parser")
    neutral = game_mapping[game][3]
    tables = game_page_data_soup.findAll('table', class_='mytable')
    headertable = tables[0]
    awaystats = tables[1]
    homestats = tables[2]
    # Get participants
    away_team_header = headertable.findAll('tr')[1]
    tds = away_team_header.findAll('td')
    try: