forked from numericOverflow/ncaab-stats-scraper
/
create_team_mappings.py
38 lines (32 loc) · 1.77 KB
/
create_team_mappings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/python
##############################################################
# Program name: NCAA Basketball Stats Scraper (Team Mappings Module)
# Version: 1.0
# By: Rodrigo Zamith
# License: MPL 2.0 (see LICENSE file in root folder)
# Additional thanks:
##############################################################
# Import modules and libraries
import scraperfunctions
import scrapersettings
import re
from bs4 import BeautifulSoup
# Generate the team mapping file (team_id, team_name, team_url as TSV) when
# enabled in scrapersettings. The team list page is downloaded, every hyperlink
# is tested against the team-ID pattern, and each match becomes one row.
if scrapersettings.map_teams == 1:
    # Single-argument parenthesised print emits the same text on Python 2
    # (print statement) and Python 3 (print function).
    print("Generating team mappings")

    # The context manager guarantees the file is flushed and closed even if
    # the download or the parsing below raises.
    with open(scrapersettings.team_mappingfile, "w") as team_mappingfile_w:
        # Create the file headings
        team_mappingfile_w.write("team_id\tteam_name\tteam_url\n")

        # Grab data
        # Download the page with the list of teams
        teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header)  # Get data from main page
        teamlist_data_soup = BeautifulSoup(teamlist_data, "html.parser")  # Soupify that data
        # Presumably a compiled regex whose first group captures the team ID;
        # only .match() and .group(1) are relied on here.
        extractTeamID = scraperfunctions.get_regex_extractTeamID()

        # Create a mapping for teams
        for link in teamlist_data_soup.find_all('a'):  # For each hyperlink on the page
            href = link.get('href')
            if href is None:
                # Anchors without an href (e.g. named anchors) cannot be team
                # links; passing None to the regex would raise TypeError.
                continue

            linkMatch = extractTeamID.match(href)  # Does the hyperlink look like a team URL?
            if not linkMatch:
                continue

            team_id = linkMatch.group(1)  # Get the team ID from the URL
            team_name = str(link.get_text())  # Get the text associated with the hyperlink
            team_url = str(scrapersettings.domain_base + href)  # Get the URL and append the base domain
            # Add a line to our TSV file for archival purposes
            team_mappingfile_w.write(str(team_id) + "\t" + str(team_name) + "\t" + str(team_url) + "\n")

    # Reported only after the file has been closed, so the data is on disk.
    print("Successfully generated team mappings")