# Import modules and libraries
import scraperfunctions
import scrapersettings
from bs4 import BeautifulSoup

if (scrapersettings.map_players == 1) or (scrapersettings.summary_players == 1) or (scrapersettings.summary_teams == 1):
    print "Generating player mappings and/or summary data for players and/or teams"
    # Grab data
    # Parse our mappings file to get our list of teams
    team_mapping = scraperfunctions.get_team_mappings()
    # Parse the stats table
    player_list = [] # Create an empty list for storing all of our players
    team_stats_total = []
    for value, team in enumerate(team_mapping): # For each team in our dictionary
        if scrapersettings.debugmode == 1:
            print "Processing team " + str(team) + " (" + str(value+1) + " of " + str(len(team_mapping)) + ")"
        roster_url = str(scrapersettings.domain_base) + "/team/" + team + "/stats/" + str(scrapersettings.year_index)
        team_name = team_mapping[team][0]
        try:
            roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the stats page for each team
        except:
            print "Error getting data. Moving on to next team."
            continue
        roster_page_data_soup = BeautifulSoup(roster_page_data, "html.parser")
        stat_grid = roster_page_data_soup.select('#stat_grid')
        # Get player data
        for rowno, row in enumerate(stat_grid[0].find('tbody').findAll('tr')):
            tds = row.findAll('td')
            player_id = tds[1].find('a').get('href').split('=')[-1]
            jersey = str(tds[0].get_text().encode('utf-8').strip())
            name = str(tds[1].find('a').get_text().encode('utf-8').strip())
            year = str(tds[2].get_text().encode('utf-8').strip())
            pos = str(tds[3].get_text().encode('utf-8').strip())
            height = str(tds[4].get_text().encode('utf-8').strip())
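# Note: scraperfunctions.grabber() is called throughout these scripts but its source is not
# shown in this section. The sketch below is a hypothetical minimal version, assuming it is a
# thin HTTP GET wrapper that takes a URL, optional query parameters, and a header dict and
# returns the page HTML; the real helper in scraperfunctions.py may retry, throttle, or
# differ in signature.
import urllib
import urllib2

def grabber(url, params, http_header):
    """Fetch a page and return its raw HTML, sending the configured headers."""
    if params:
        url = url + ("&" if "?" in url else "?") + urllib.urlencode(params)
    request = urllib2.Request(url, headers=http_header)
    return urllib2.urlopen(request).read()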
# Import modules and libraries
import scraperfunctions
import scrapersettings
import re
from bs4 import BeautifulSoup

if (scrapersettings.map_teams == 1):
    print "Generating team mappings"
    # Create the file headings
    team_mappingfile_w = open(scrapersettings.team_mappingfile, "w")
    team_mappingfile_w.writelines("team_id\tteam_name\tteam_url\n")
    # Grab data
    # Download the page with the list of teams
    teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header) # Get data from main page
    teamlist_data_soup = BeautifulSoup(teamlist_data, "html.parser") # Soupify that data
    extractTeamID = scraperfunctions.get_regex_extractTeamID()
    # Create a mapping for teams
    for link in teamlist_data_soup.find_all('a'): # For each hyperlink on the page
        linkMatch = extractTeamID.match(link.get('href')) # Check whether the hyperlink points to a team page
        if linkMatch: # If it does, parse onward
            team_id = linkMatch.group(1) # Get the team ID from the URL
            team_name = str(link.get_text()) # Get the text associated with the hyperlink
            team_url = str(scrapersettings.domain_base + link.get('href')) # Get the URL and append the base domain
            team_mappingfile_w.writelines(str(team_id) + "\t" + str(team_name) + "\t" + str(team_url) + "\n") # Add a line to our TSV file for archival purposes
    print "Successfully generated team mappings"
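# Note: scraperfunctions.get_team_mappings() (used by the scripts that follow) is not shown
# in this section. The sketch below is a hypothetical minimal version, assuming the
# tab-separated layout written above (team_id, team_name, team_url) and the access pattern
# used later (team_mapping[team][0] for the team name, team_mapping[team][1] for the URL).
import csv
import scrapersettings

def get_team_mappings():
    """Read the team mapping TSV back into a dict of {team_id: (team_name, team_url)}."""
    team_mapping = {}
    with open(scrapersettings.team_mappingfile, "r") as mapping_file:
        reader = csv.reader(mapping_file, delimiter="\t")
        next(reader) # Skip the header row
        for row in reader:
            if len(row) >= 3:
                team_mapping[row[0]] = (row[1], row[2])
    return team_mapping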
# Import modules and libraries
import scraperfunctions
import scrapersettings
from bs4 import BeautifulSoup

if (scrapersettings.map_teams == 1):
    print "Generating team mappings"
    # Create the file headings
    team_mappingfile_w = open(scrapersettings.team_mappingfile, "w")
    team_mappingfile_w.writelines("team_id\tteam_name\tteam_url\n")
    # Grab data
    # Download the page with the list of teams
    teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header) # Get data from main page
    teamlist_data_soup = BeautifulSoup(teamlist_data) # Soupify that data
    # Create a mapping for teams
    for link in teamlist_data_soup.find_all('a'): # For each hyperlink on the page
        if "team/index/" + str(scrapersettings.year_index) + "?org_id=" in link.get('href'): # If the hyperlink contains this string (limiting it only to team pages)
            team_id = str(link.get('href').split("team/index/" + str(scrapersettings.year_index) + "?org_id=")[1]) # Get the team ID from the URL
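# Quick illustration of the href split above, using a made-up link and year index;
# the real year_index value comes from scrapersettings.
year_index = 15580 # Hypothetical year index
href = "/team/index/" + str(year_index) + "?org_id=721" # Hypothetical team link
team_id = str(href.split("team/index/" + str(year_index) + "?org_id=")[1])
print team_id # Prints "721"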
# Import modules and libraries
import scraperfunctions
import scrapersettings
import re
from bs4 import BeautifulSoup

summary_team_data_w = open(scrapersettings.summary_team_data, "w")
summary_team_data_w.writelines("team_id\tteam_name\tteam_games\tNet_Rush_Yds\tPass_Yds\tTot_Off\tPlays\tYds_Per_Play\tFirst_Downs_By_Penalty\tPenalties\tPenalties_Per_Game\tPenalties_Yds_Per_Game\tTot_Off_Yards_Per_Game\topp_team_games\topp_Net_Rush_Yds\topp_Pass_Yds\topp_Tot_Off\topp_Plays\topp_Yds_Per_Play\topp_First_Downs_By_Penalty\topp_Penalties\topp_Penalties_Per_Game\topp_Penalties_Yds_Per_Game\topp_Tot_Off_Yards_Per_Game\n")
team_mapping = scraperfunctions.get_team_mappings()
team_stats_total = []
for value, team in enumerate(team_mapping):
    print "Processing team " + str(team) + " (" + str(value + 1) + " of " + str(len(team_mapping)) + ")"
    team_name = team_mapping[team][0]
    url = str(scrapersettings.domain_base) + "/team/" + team + "/stats?id=" + str(scrapersettings.year_index)
    team_mainpage_data = scraperfunctions.grabber(url, scrapersettings.params, scrapersettings.http_header)
    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data)
    # Get the correct URL for the Total Offense stat category ID
    # The Total Offense table includes defensive stats too, because your opponents' totals are
    # the sum of what your defense has allowed per game against you
    link = team_mainpage_data_soup.findAll("a", href=True, text='Total Offense')
    href_url = [x.get("href") for x in link]
    regexp = re.compile("year_stat_category_id=(.*)$")
    cat_id = regexp.search(href_url[0]).group(1)
    # Build the new URL pointing at the Total Offense category
    new_url = url + "&year_stat_category_id=" + cat_id
    team_tot_data = scraperfunctions.grabber(new_url, scrapersettings.params, scrapersettings.http_header)
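# Quick illustration of the category-ID extraction above, using a made-up href; the real
# links on the team stats page may carry different IDs or extra query parameters.
import re

sample_href = "/team/721/stats?id=10440&year_stat_category_id=10700" # Hypothetical link
regexp = re.compile("year_stat_category_id=(.*)$")
cat_id = regexp.search(sample_href).group(1)
print cat_id # Prints "10700"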
# Import modules and libraries
import scraperfunctions
import scrapersettings
from bs4 import BeautifulSoup

# Create the file headings
schedule_mappingfile_w = open(scrapersettings.schedule_mappingfile, "w")
schedule_mappingfile_w.writelines("game_id\thome_team_id\taway_team_id\tdate\tneutral_site\tgame_link\n")
# Grab data
# Parse our mappings file to get our list of teams
team_mapping = scraperfunctions.get_team_mappings()
extractTeamID = scraperfunctions.get_regex_extractTeamID()
# Create the schedule
schedule_list = [] # Create an empty list for storing all of our games
for value, team in enumerate(team_mapping): # For each team in our dictionary
    if scrapersettings.debugmode == 1:
        print "Processing team " + str(team) + " (" + str(value+1) + " of " + str(len(team_mapping)) + ")"
    try:
        team_mainpage_data = scraperfunctions.grabber(team_mapping[team][1], scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team
    except:
        print "Error getting data. Moving on to next team."
        continue
    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data, "html.parser") # Soupify that page
    gamelinks = [] # Create a blank list for this team's games
    for link in team_mainpage_data_soup.find_all('a'): # Locate all links in the document
        if "game/index/" in link.get('href'): # If they contain a URL segment suggesting it is a game...
            game_link = str(scrapersettings.domain_base + link.get('href')).split("?")[0] # Strip out any URL variables since we don't need them
            try:
                opponent_id = str(extractTeamID.match(link.find_previous("td").find_previous("td").find("a").get('href')).group(1))
            except:
                opponent_id = 0
            opponent_text = link.find_previous("td").find_previous("td").get_text().encode('utf-8').strip()
            if "@" in opponent_text: # Checks whether the game is home or away; at a neutral site this distinction may not be accurate (but neutral sites are flagged). Assumes all games against non-D-I/III competition are at home.
                home_team = opponent_id
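# Note: scraperfunctions.get_game_mappings() (used by the game-parsing script below) is not
# shown in this section. The sketch below is a hypothetical minimal version, assuming the
# tab-separated layout written above (game_id, home_team_id, away_team_id, date,
# neutral_site, game_link) and the indexing used later (game_mapping[game][3] for the
# neutral-site flag, game_mapping[game][4] for the game link).
import csv
import scrapersettings

def get_game_mappings():
    """Read the schedule TSV back into {game_id: (home_id, away_id, date, neutral, link)}."""
    game_mapping = {}
    with open(scrapersettings.schedule_mappingfile, "r") as schedule_file:
        reader = csv.reader(schedule_file, delimiter="\t")
        next(reader) # Skip the header row
        for row in reader:
            if len(row) >= 6:
                game_mapping[row[0]] = tuple(row[1:6])
    return game_mapping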
# Import modules and libraries
import scraperfunctions
import scrapersettings
import re
from bs4 import BeautifulSoup

# Parse our schedule file to get a list of games
game_mapping = scraperfunctions.get_game_mappings()
# Parse the stats tables
team_stats_total = [] # Create an empty list for storing the team stats
alphanum = re.compile(r'[^\w\s:]+')
for value, game in enumerate(game_mapping): # For each game in our dictionary
    if scrapersettings.debugmode == 1:
        print "Processing game " + str(game) + " (" + str(value + 1) + " of " + str(len(game_mapping)) + ")"
    game_url = game_mapping[game][4]
    try:
        game_page_data = scraperfunctions.grabber(game_url, scrapersettings.params, scrapersettings.http_header) # Grab the box score page for each game
    except:
        print "Error getting data. Moving on to next game."
        continue
    game_page_data_soup = BeautifulSoup(game_page_data, "html.parser")
    neutral = game_mapping[game][3]
    tables = game_page_data_soup.findAll('table', class_='mytable')
    headertable = tables[0]
    awaystats = tables[1]
    homestats = tables[2]
    # Get participants
    away_team_header = headertable.findAll('tr')[1]
    tds = away_team_header.findAll('td')
    try: