def parseWeek(year, week):
    """
    Parses a specific week on http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1
    which contains a csv of the FanDuel player prices.
    Stores this info in the fanduel_prices collection.
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_RotoFDStats/')

    startTime = datetime.now()
    logger.debug('Starting %d', year)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_fanduel_prices = db['fanduel_prices']

    if col_fanduel_prices.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html.parser', user_agent=get_user_agent(logger), timeout=10)
    url = "http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1".format(week, year)
    browser = open_or_follow_link(logger, browser, 'open', url)

    docs = []
    try:
        data = browser.find('pre').text
        lines = data.split('\n')
        header = lines[0].split(';')
        lines = lines[1:]
        for line in lines:
            if not line:
                continue
            doc = {}
            for index, each in enumerate(line.split(';')):
                doc[cleanKey(header[index])] = convertToNumber(each)
            docs.append(doc)
    except:
        logger.exception("Parse fail: %s", url)

    try:
        logger.debug('Bulk Creating docs')
        if docs:
            col_fanduel_prices.insert_many(docs)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(str(year) + '_' + str(week))
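#The other scrapers in this repo expose a run(wait) driver that fans work out over a
#multiprocessing Pool sized from get_proxy_count(). No such driver appears in this excerpt
#for the FanDuel price scraper, so the sketch below is an assumption of how parseWeek might
#be dispatched; the year/week ranges and pool sizing are placeholders, not the project's code.
def run(wait):
    logger = makeLogger('main', r'./logs_RotoFDStats/')
    time.sleep(wait)
    pool = Pool(processes=int(get_proxy_count() / 2.5))
    for year in range(2011, 2016):      #assumed year range
        for week in range(1, 18):       #assumed 17-week regular season
            pool.apply_async(parseWeek, (year, week,))
    pool.close()
    pool.join()
    closeLogger('main')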
def run(wait):
    """Starts the scraping process.
    Opens a teamstats page and gathers all the form inputs,
    then sends these inputs to parseSeason, which opens a new page for every possible option in the form.
    If you get an error at the start with role.find_all, just try again; nfl.com returns weird pages sometimes.
    """
    logger = makeLogger('main', r'./logs_nflteamStat/')

    startTime = datetime.now()
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count() / 2.5))

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG"
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    role = browser.find(id="role")
    roles = role.find_all("option")
    offensiveCategory = browser.find(id="offensive-category")
    offensiveCategories = offensiveCategory.find_all("option")
    defensiveCategory = browser.find(id="defensive-category")
    defensiveCategories = defensiveCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for role in roles:
        availableCategories = None
        if role.text == "Offense":
            availableCategories = offensiveCategories
        elif role.text == "Defense":
            availableCategories = defensiveCategories
        else:
            print "unknown role"
            continue

        for category in availableCategories:
            if category.text == "Category...":
                continue
            for season in seasons:
                if season.text == "Season..." or convertToNumber(removeNewLine(season.text)) < 1960:
                    continue
                #parseSeason(role, category, season, seasonTypes)
                pool.apply_async(parseSeason, (role, category, season, seasonTypes,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))
    closeLogger('main')
def run(wait):
    """Collects (name, href) tuples from the pro-football-reference A-Z player index pages,
    then parses each player with parsePlayer via a multiprocessing Pool.
    """
    logger = makeLogger('main', r'./logs_pfrPlayerStats/')

    startTime = datetime.now()
    logger.debug('start time: ' + str(startTime))

    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)

    player_tuples = []
    for letter in list(string.ascii_uppercase):
        wait = random.uniform(.5, 1.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Opening players %s', letter)
        browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/players/{}/".format(letter))
        players = browser.find(id="div_players")
        for player in players.find_all('p'):
            player = player.find('a')
            player_tuples.append((player.text, player['href']))

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    logger.debug('Processing %d Players', len(player_tuples))
    for player_tuple in player_tuples:
        #parsePlayer(player_tuple[0], player_tuple[1])
        pool.apply_async(parsePlayer, (player_tuple[0], player_tuple[1],))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(logger)
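#open_or_follow_link is used by every scraper in these excerpts but its implementation is not
#shown. The sketch below only captures what the call sites imply: open a url (or follow a link)
#with retries, rebuilding the browser with a fresh user agent on failure, and return the browser.
#This is an assumption, not the repo's actual helper.
def open_or_follow_link(logger, browser, action, target, retries=3):
    for attempt in range(retries):
        try:
            if action == 'open':
                browser.open(target)
            else:
                browser.follow_link(target)
            return browser
        except Exception:
            logger.exception('Request failed (attempt %d), retrying', attempt + 1)
            time.sleep(random.uniform(2, 5))
            #parser choice here is an assumption; callers construct their own browsers
            browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    return browser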
def parseTeam(team_url, team_name):
    """
    Parses a team's page and returns a list of year urls.
    There is some data on this page that would be useful to scrape in the future.
    """
    logger = makeLogger(cleanKey(team_name), r'./logs_pfrTeamStats/')

    startTime = datetime.now()
    logger.debug('Starting %s', team_name)

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', team_url)

    table = browser.find(id='team_index').find('tbody')
    year_columns = table.find_all('th')
    year_url_tups = []
    for index, year_column in enumerate(year_columns):
        logger.debug('Row %d of %d', index, len(year_columns))
        try:
            year_link = year_column.find('a')
            if year_link:
                year_url = 'http://www.pro-football-reference.com' + year_link['href']
                year = convertToNumber(year_link.text)
                if not isinstance(year, int):
                    continue
                year_url_tups.append((team_name, year_url, year))
        except:
            logger.exception(year_column)

    logger.debug('parseTeam time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(logger)
    return year_url_tups
def parsePlayer(playerUrl):
    """
    Starting point for parsing all pages for a given player.
    Collects all links for a player and calls the appropriate parsers for each link.
    """
    startTime = datetime.now()
    playerId = re.search('.*?id=(.*)\?*', playerUrl).group(1)
    logger = makeLogger(playerId, r'./logs_nflPlayerStats/')

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)

    wait = random.uniform(2, 4)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening %s', playerUrl)
    browser = open_or_follow_link(logger, browser, 'open', playerUrl)

    #gets the actual playerUrl, the original value gets redirected
    playerUrl_api = playerUrl
    playerUrl = browser.url

    try:
        #parse player bio
        playerBio = browser.find(class_="player-info")
        player_profile_id = parsePlayerBio(logger, playerBio, playerUrl, playerUrl_api)
        if not player_profile_id:
            logger.debug('New player profile not made, skipping rest of tabs')
            return

        #Gets the links for each category tab, i.e. Profile, career stats, game logs ...
        tabNames = [tabName['href'] for tabName in browser.find(id="player-profile-tabs").find_all('a')]
        for tabName in tabNames:
            if tabName == 'profile':
                continue

            playerUrl = getPlayerTabUrl(playerUrl, tabName)

            wait = random.uniform(1.5, 3.5)
            logger.debug('Waiting %f', wait)
            time.sleep(wait)

            logger.debug('Opening %s', playerUrl)
            browser = open_or_follow_link(logger, browser, 'open', playerUrl)

            if tabName == 'careerstats':
                #parse careerstats
                careerStats = browser.find(id="player-stats-wrapper")
                careerStats = careerStats.find_all("table")
                parseCareerStats(logger, careerStats, player_profile_id)
            elif tabName == 'gamelogs':
                #Get the list of years
                gameLogYears = browser.find(id="criteria")
                gameLogYears = gameLogYears.find_all("option")
                yearsList = []
                for year in gameLogYears:
                    year = year.text.strip()
                    if year:
                        yearsList.append(year)

                #parse the first year of gameLogs since its already loaded
                gameLogs = browser.find(id="player-stats-wrapper")
                gameLogs = gameLogs.find_all("table")
                parseGameLogs(logger, gameLogs, yearsList[0], player_profile_id)

                #Parse the rest of the years
                for year in yearsList[1:]:
                    playerUrl = getPlayerTabUrl(playerUrl, tabName) + '?season=' + year
                    wait = random.uniform(1.5, 3.5)
                    logger.debug('Waiting %f', wait)
                    time.sleep(wait)
                    logger.debug('Opening %s', playerUrl)
                    browser = open_or_follow_link(logger, browser, 'open', playerUrl)
                    gameLogs = browser.find(id="player-stats-wrapper")
                    gameLogs = gameLogs.find_all("table")
                    parseGameLogs(logger, gameLogs, year, player_profile_id)
            elif tabName == 'gamesplits':
                #Get the list of years
                years = browser.find(id="criteria")
                years = years.find_all("option")
                yearsList = []
                for year in years:
                    year = year.text.strip()
                    if year:
                        yearsList.append(year)

                #parse the first year of gamesplits since its already loaded
                gameSplits = browser.find(id="player-stats-wrapper")
                parseSplits(logger, gameSplits, yearsList[0], 'game', player_profile_id)

                #Parse the rest of the years
                for year in yearsList[1:]:
                    playerUrl = getPlayerTabUrl(playerUrl, tabName) + '?season=' + year
                    wait = random.uniform(1.5, 3.5)
                    logger.debug('Waiting %f', wait)
                    time.sleep(wait)
                    logger.debug('Opening %s', playerUrl)
                    browser = open_or_follow_link(logger, browser, 'open', playerUrl)
                    gameSplits = browser.find(id="player-stats-wrapper")
                    parseSplits(logger, gameSplits, year, 'game', player_profile_id)
            elif tabName == 'situationalstats':
                #Get the list of years
                years = browser.find(id="criteria")
                years = years.find_all("option")
                yearsList = []
                for year in years:
                    year = year.text.strip()
                    if year:
                        yearsList.append(year)

                #parse the first year of situationalstats since its already loaded
                situationalStats = browser.find(id="player-stats-wrapper")
                parseSplits(logger, situationalStats, yearsList[0], 'situational', player_profile_id)

                #Parse the rest of the years
                for year in yearsList[1:]:
                    playerUrl = getPlayerTabUrl(playerUrl, tabName) + '?season=' + year
                    wait = random.uniform(1.5, 3.5)
                    logger.debug('Waiting %f', wait)
                    time.sleep(wait)
                    logger.debug('Opening %s', playerUrl)
                    browser = open_or_follow_link(logger, browser, 'open', playerUrl)
                    situationalStats = browser.find(id="player-stats-wrapper")
                    parseSplits(logger, situationalStats, year, 'situational', player_profile_id)
            elif tabName == 'draft':
                draft = browser.find(id="player-stats-wrapper")
                parseDraft(logger, draft, player_profile_id)
            elif tabName == 'combine':
                combine = browser.find(id="player-stats-wrapper")
                parseCombine(logger, combine, player_profile_id)
    except:
        logger.exception('Failed parsing player')

    logger.debug('parsePlayer time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(playerId)
def parseSeason(role, category, season, seasonTypes):
    """Parses every seasonType in a season at http://www.nfl.com/stats/categorystats
    for a given role/category/season. Doesn't follow any links.
    Some years don't have any info but still return a page; these are logged with Exception('No teams found %s' % url).
    All data is stored in team_stats.
    """
    logger = makeLogger(role.text + '_' + category.text + '_' + season.text, r'./logs_nflteamStat/')

    startTime = datetime.now()
    logger.debug('Starting %s %s %s', role.text, category.text, season.text)

    teamStat_list = []
    for seasonType in seasonTypes:
        if seasonType.text == "Season Type...":
            continue

        team_stats_query = {
            'year': convertToNumber(removeNewLine(season.text)),
            'seasonType': removeNewLine(seasonType.text),
            'role': removeNewLine(role.text),
            'category': removeNewLine(category.text)
        }
        if col_team_stats.find(team_stats_query).count():
            logger.debug('Already parsed %s', team_stats_query)
            continue

        wait = random.uniform(1.5, 3.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Starting: %s', team_stats_query)
        url = 'http://www.nfl.com/stats/categorystats?' + 'archive=true&conference=null' + '&role=' + role['value']
        try:
            if role.text == "Offense":
                categoryUrl = '&offensiveStatisticCategory=' + category['value'] + '&defensiveStatisticCategory=null'
            elif role.text == "Defense":
                categoryUrl = '&offensiveStatisticCategory=null&defensiveStatisticCategory=' + category['value']
            else:
                raise Exception('Unsupported role: %s', role.text)

            url += categoryUrl
            url += '&season=' + season['value'] + '&seasonType=' + seasonType['value'] + '&tabSeq=2&qualified=false&Submit=Go'

            logger.debug('Opening: %s', url)
            browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

            tries = 0
            # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
            while not result:
                if tries > 10:
                    raise Exception('No teams found %s' % url)
                elif tries > 0:
                    time.sleep(random.uniform(5, 7))
                tries += 1
                logger.debug('No result-tries: %d', tries)
                browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
                browser = open_or_follow_link(logger, browser, 'open', url)
                result = browser.find(id="result")

            tbodies = result.find_all("tbody")
            if len(tbodies) != 2:
                raise Exception("error parsing result")

            tableKey = tbodies[0]
            tableKeyRows = tableKey.find_all("tr")
            topTableKeys = []
            if len(tableKeyRows) == 1:
                tableKey = tableKey.find_all("th")
            elif len(tableKeyRows) == 2:
                topTableColumns = tableKeyRows[0].find_all("th")
                for topTableColumn in topTableColumns:
                    for _ in range(int(topTableColumn['colspan'])):
                        topTableKeys.append(topTableColumn.text)
                tableKey = tableKeyRows[1].find_all("th")
            else:
                raise Exception('Too many header rows found')

            tableItems = tbodies[1]
            tableItems = tableItems.find_all("td")

            tableColumn = 0
            teamStatDict = {}
            for tableIndex, tableItem in enumerate(tableItems):
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    teamStatDict['team'] = removeNewLine(tableItem.text)
                    teamStatDict['year'] = int(removeNewLine(season.text))
                    teamStatDict['seasonType'] = removeNewLine(seasonType.text)
                    teamStatDict['role'] = removeNewLine(role.text)
                    teamStatDict['category'] = removeNewLine(category.text)
                    tableColumn += 1
                    continue

                if topTableKeys and topTableKeys[tableColumn]:
                    key = topTableKeys[tableColumn] + '_' + tableKey[tableColumn].text
                else:
                    key = tableKey[tableColumn].text
                key = cleanKey(removeNewLine(key))
                value = convertToNumber(removeNewLine(tableItem.text))
                teamStatDict[key] = value

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    teamStat_list.append(teamStatDict)
                    teamStatDict = {}
                    tableColumn = 0
        except:
            logger.exception('row fail')

    try:
        if teamStat_list:
            logger.debug('Bulk Creating teamStat_list')
            col_team_stats.insert_many(teamStat_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseSeason time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(role.text + '_' + category.text)
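#cleanKey, convertToNumber, and removeNewLine are imported from elsewhere in the repo and are
#not part of this excerpt. The sketch below only shows what the call sites above imply they do;
#these are assumptions, not the repo's actual implementations.
import re

def removeNewLine(text):
    #assumed behavior: collapse newlines and trim whitespace from scraped cell text
    return text.replace('\n', ' ').strip()

def cleanKey(key):
    #assumed behavior: normalize a header string into a MongoDB-safe field name
    return re.sub(r'[^0-9a-zA-Z_]', '', key.strip().lower().replace(' ', '_'))

def convertToNumber(value):
    #assumed behavior: return an int or float when the string parses as one, else the original string
    try:
        return int(value)
    except ValueError:
        try:
            return float(value)
        except ValueError:
            return value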
def parseYear(team_name, year_url, year):
    """
    Parses the games table (id='games') on a team's year page (year_url) for the given team and year.
    Stores one document per regular-season week in nfl_data.team_stats_weekly.
    """
    logger = makeLogger(cleanKey(team_name) + '_' + str(year), r'./logs_pfrTeamStats/')

    startTime = datetime.now()
    logger.debug('Starting %d', year)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_team_stats_weekly = db['team_stats_weekly']

    #need to fix this to actually detect duplicate
    # if col_team_stats_weekly.find({'year': year}).count():
    #     logger.debug('Already parsed %s', year)
    #     closeLogger(logger)
    #     return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', year_url)

    table = browser.find(id='games')
    rows = table.find_all('tr')
    header = [cleanKey(each.attrs['data-stat']) for each in rows[0].find_all('th')]
    rows = rows[1:]

    row_dicts = []
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            week_number = convertToNumber(row.find('th').text)
            row_values = [convertToNumber(value.text) for value in row.find_all('td')]
            row_values.insert(0, week_number)
            row_dict = dict(zip(header, row_values))
            row_dict['year'] = year
            row_dict['team_name'] = team_name
            row_dict['year_url'] = year_url

            if row_dict['game_date'].lower() == 'playoffs':
                continue

            row_dicts.append(row_dict)
        except:
            logger.exception(row)

    logger.debug('team_stats_weekly.insert_many')
    col_team_stats_weekly.insert_many(row_dicts)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(logger)
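#The commented-out check in parseYear above is flagged as not actually detecting duplicates
#(it only looks at the year, while the collection holds every team's weeks for that year).
#One possible fix, sketched as a suggestion rather than the project's code, is to key the
#check on both team and year, e.g. via a small helper like this:
def already_parsed(col_team_stats_weekly, team_name, year):
    #assumed approach: a (team, year) pair counts as parsed if any weekly doc exists for it
    return bool(col_team_stats_weekly.find({'team_name': team_name, 'year': year}).count())

#Usage inside parseYear would then be:
#    if already_parsed(col_team_stats_weekly, team_name, year):
#        logger.debug('Already parsed %s %s', team_name, year)
#        closeLogger(logger)
#        return None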
def run(wait):
    """
    First collects a set of playerUrls to parse using parsePlayerNames.
    Then parses each player. Both tasks use multiprocessing.
    """
    logger = makeLogger('main', r'./logs_nflPlayerStats/')

    startTime = datetime.now()
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    results = []

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    startingUrl = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=PASSING&qualified=true&season=2015&seasonType=PRE'
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)

    statisticCategory = browser.find(id="statistic-category")
    statisticCategories = statisticCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    for statisticCategory in statisticCategories:
        if statisticCategory.text == 'Category...':
            continue
        for season in seasons:
            if season.text == 'Season...':
                continue
            for seasonType in seasonTypes:
                if seasonType.text == 'Season Type...':
                    continue
                results.append(pool.apply_async(parsePlayerNames, (statisticCategory['value'], season['value'], seasonType['value'],)))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    playerUrl_set = set()
    for result in results:
        try:
            result_set = result.get()
            if result_set:
                playerUrl_set = playerUrl_set.union(result_set)
        except:
            logger.exception('Error in parsePlayerNames worker')

    with open('../playerUrl_set.json', 'w') as playerUrl_json:
        playerUrl_json.write(json.dumps(list(playerUrl_set)))

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    logger.debug('Starting to parse %d players', len(playerUrl_set))
    for playerUrl in playerUrl_set:
        if col_player_profiles.find({'player_url': playerUrl}).count():
            logger.debug('Skipping ' + playerUrl)
            continue
        #parsePlayer(playerUrl)
        pool.apply_async(parsePlayer, (playerUrl,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))
    closeLogger('main')
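#Several names used throughout these scripts (col_player_profiles, col_team_stats, col_schedule,
#col_weather_info, col_stadium_info, and the standard-library / third-party imports) live at
#module scope and are not part of these excerpts. The preamble below is a sketch of the assumed
#setup; the helper module name 'helpers' and the exact collection names are assumptions.
import json
import random
import re
import string
import time
from datetime import datetime
from multiprocessing import Pool

from pymongo import MongoClient
from robobrowser import RoboBrowser

#repo-local helpers; the module name is a placeholder
from helpers import (makeLogger, closeLogger, cleanKey, convertToNumber, removeNewLine,
                     get_user_agent, get_proxy_count, open_or_follow_link)

#module-level MongoDB collections referenced by the nfl.com scrapers (assumed setup)
client = MongoClient('localhost', 27017)
db = client['nfl_data']
col_team_stats = db['team_stats']
col_player_profiles = db['player_profiles']
col_schedule = db['schedule']
col_weather_info = db['weather_info']
col_stadium_info = db['stadium_info']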
def parseYear(year):
    """
    Parses the schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm
    Follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info).
    Stores schedule info in nfl_data.schedule.
    Stores game_info in nfl_data.game_info with schedule ids.
    """
    logger = makeLogger(year, r'./logs_pfrSchedule/')

    startTime = datetime.now()
    logger.debug('Starting %d', year)

    schedule_list = []
    gameInfo_list = []

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_schedule = db['schedule']
    col_game_info = db['game_info']
    col_failed_game_info = db['failed_game_info']

    if col_schedule.find({'year': year}).count():
        logger.debug('Already parsed %s', year)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/years/{}/games.htm".format(year))

    table = browser.find(id='games')
    rows = table.find_all('tr')
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            schedule_dict = {}
            gameInfo_dict = {}
            columns = row.find_all('td')
            if columns:
                schedule_dict['week'] = convertToNumber(columns[0].text)
                schedule_dict['day'] = columns[1].text
                schedule_dict['date'] = columns[2].text
                schedule_dict['year'] = convertToNumber(year)
                homeIndicator = columns[5].text
                if homeIndicator == '@':
                    schedule_dict['homeTeam'] = columns[6].text
                    schedule_dict['awayTeam'] = columns[4].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[8].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[7].text)
                else:
                    schedule_dict['homeTeam'] = columns[4].text
                    schedule_dict['awayTeam'] = columns[6].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[7].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[8].text)

                gameInfo_dict['week'] = convertToNumber(columns[0].text)
                gameInfo_dict['year'] = convertToNumber(year)

                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)

                logger.debug('Following link')
                url = columns[3].find('a')
                if url:
                    url = 'http://www.pro-football-reference.com' + url['href']
                    failed_game_info = True
                    browser = open_or_follow_link(logger, browser, 'open', url)
                    game_info = browser.find(id="game_info")
                    if game_info:
                        for each in game_info.find_all('tr'):
                            pair = each.find_all('td')
                            if pair:
                                failed_game_info = False
                                key = pair[0].text
                                value = convertToNumber(pair[1].text)
                                gameInfo_dict[cleanKey(key)] = convertToNumber(value)
                    if failed_game_info:
                        failed_dict = schedule_dict
                        failed_dict['row'] = index
                        failed_dict['href'] = url  #url is already the absolute boxscore url here
                        col_failed_game_info.insert(failed_dict)
                        gameInfo_dict['FAIL'] = True

                schedule_list.append(schedule_dict)
                gameInfo_list.append(gameInfo_dict)
        except:
            logger.exception(row)

    logger.debug('nfl_schedule.insert_many')
    schedule_ids = col_schedule.insert_many(schedule_list).inserted_ids

    logger.debug('mapping nfl_schedule.id to gameInfo_list')
    for index, schedule_id in enumerate(schedule_ids):
        if len(gameInfo_list[index].keys()) <= 2:
            logger.debug('Empty game_info: %s', schedule_id)
        gameInfo_list[index]['schedule_id'] = schedule_id

    logger.debug('game_info.insert_many')
    col_game_info.insert_many(gameInfo_list)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(year)
def run(wait):
    """Starts the scraping process.
    Gathers team urls from the active-teams index, collects each team's year urls with parseTeam,
    then parses every (team, year) page with parseYear. Both stages use a multiprocessing Pool.
    """
    logger = makeLogger('main', r'./logs_pfrTeamStats/')

    startTime = datetime.now()
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/teams/")

    table_body = browser.find(id='teams_active').find('tbody')
    rows = table_body.find_all('tr')
    team_url_tups = []
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            team_link = row.find('th').find('a')
            if team_link:
                team_url = 'http://www.pro-football-reference.com' + team_link['href']
                team_name = team_link.text
                team_url_tups.append((team_url, team_name))
        except:
            logger.exception(row)

    pool = Pool(processes=int(get_proxy_count() / 2.5))
    results = []
    for team_url, team_name in team_url_tups:
        #print parseTeam(team_url, team_name)
        results.append(pool.apply_async(parseTeam, (team_url, team_name,)))
    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    year_url_tups = []
    for result in results:
        year_url_tup = result.get()
        if year_url_tup:
            year_url_tups += year_url_tup

    logger.debug('Done gathering %d year urls', len(year_url_tups))

    pool = Pool(processes=int(get_proxy_count() / 2))
    logger.debug('Shuffling year_urls')
    random.shuffle(year_url_tups)
    logger.debug('Starting to parse year_urls')
    for team_name, year_url, year in year_url_tups:
        #parseYear(team_name, year_url, year)
        pool.apply_async(parseYear, (team_name, year_url, year,))
    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now() - startTime))
    closeLogger('main')
def parsePlayer(player_name, player_url):
    """Parses a pro-football-reference player's bio and game log tables into the pfr_player_* collections."""
    player_url = "http://www.pro-football-reference.com" + player_url
    logger = makeLogger(player_name, r'./logs_pfrPlayerStats/')

    startTime = datetime.now()
    logger.debug('start time: ' + str(startTime))

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_pfr_player_bio = db['pfr_player_bio']
    col_pfr_player_game_stats = db['pfr_player_game_stats']
    col_pfr_player_career_stats = db['pfr_player_career_stats']

    if col_pfr_player_bio.find({'player_url': player_url}).count():
        logger.debug('Player already parsed ' + player_url)
        return

    wait = random.uniform(1.5, 3)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening player page %s', player_url)
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', player_url + '/gamelog')

    logger.debug('Parsing player meta')
    meta_div = browser.find(id="meta")
    meta_items = None
    for div in meta_div.find_all('div'):
        try:
            if div['itemtype'] == 'http://schema.org/Person':
                meta_items = div.find_all('p')
        except KeyError:
            pass

    player_bio = {'player_url': player_url, 'player_name': player_name}
    for meta_item in meta_items:
        physical_stat_row = False
        item_spans = meta_item.find_all('span')
        for item_span in item_spans:
            try:
                if item_span['itemprop'] == 'height':
                    physical_stat_row = True
                    player_bio['height'] = item_span.text
                elif item_span['itemprop'] == 'weight':
                    physical_stat_row = True
                    player_bio['weight'] = item_span.text
            except KeyError:
                pass
        if physical_stat_row:
            continue

        key_values = re.findall('([^:]+):([^:]+)(?: |$)', meta_item.text)
        for key, value in key_values:
            player_bio[cleanKey(key.replace(u'\xa0', u' '))] = value.strip().replace(u'\xa0', u' ')

    try:
        logger.debug('Creating player bio')
        player_bio_id = col_pfr_player_bio.insert(player_bio)
    except:
        logger.exception('insert error')
        return

    try:
        regular_season_div = browser.find(id='all_stats')
        regular_season_table = regular_season_div.find(class_="table_outer_container").find(id="div_stats")
    except AttributeError:
        logger.debug('No game logs, exiting player')
        return

    career_total_dicts = []
    try:
        game_stat_dicts, career_total_dict = parseTable(logger, player_bio_id, regular_season_table, 'regular season')
        career_total_dicts.append(career_total_dict)
    except:
        logger.exception('parseTable error. Deleting user bio and exiting')
        col_pfr_player_bio.remove({'player_url': player_url})
        return

    playoff_table = browser.find(id="stats_playoffs")
    if not playoff_table:
        logger.debug('No playoff game logs')
    else:
        try:
            temp_game_dicts, career_total_dict = parseTable(logger, player_bio_id, playoff_table, 'playoffs')
            game_stat_dicts += temp_game_dicts
            career_total_dicts.append(career_total_dict)
        except:
            logger.exception('parseTable error. Deleting user bio and exiting')
            col_pfr_player_bio.remove({'player_url': player_url})
            return

    try:
        logger.debug('Bulk Creating game_stat_dicts')
        if game_stat_dicts:
            col_pfr_player_game_stats.insert_many(game_stat_dicts)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    try:
        logger.debug('Bulk Creating career_total_dicts')
        if career_total_dicts:
            col_pfr_player_career_stats.insert_many(career_total_dicts)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parsePlayer time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(logger)
def parsePlayerNames(statisticCategory, season, seasonType):
    """
    Collects a set of player urls from the player stats by category tab.
    This parses just 1 of the options given in the form.
    Used with pool.apply_async.
    """
    startTime = datetime.now()
    logName = statisticCategory + '_' + season + '_' + seasonType
    logger = makeLogger(logName, r'./logs_nflPlayerStats/')
    logger.debug('Starting parsePlayerNames')

    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    playerUrl_set = set()
    loadNextPage = True
    pageNumber = 1
    while loadNextPage:
        logger.debug('Page %d', pageNumber)
        url = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=' + statisticCategory + '&qualified=true&season=' + season + '&seasonType=' + seasonType + '&d-447263-p=' + str(pageNumber)
        browser = open_or_follow_link(logger, browser, 'open', url)
        pageNumber += 1
        linkNavigation = browser.find(class_='linkNavigation')
        if not linkNavigation or pageNumber > len(linkNavigation.find_all('a')):
            loadNextPage = False

        result = browser.find(id="result")

        tries = 0
        # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
        while not result:
            if tries > 3:
                raise Exception('No teams found %s' % url)
            elif tries > 0:
                time.sleep(random.uniform(5, 7))
            tries += 1
            logger.debug('No result-tries: %d', tries)
            browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

        tbodies = result.find_all("tbody")
        if len(tbodies) != 2:
            raise Exception("error parsing result")

        tableKey = tbodies[0]
        tableKey = tableKey.find_all("th")

        tableItems = tbodies[1]
        tableItems = tableItems.find_all("td")

        tableColumn = 0
        for tableIndex, tableItem in enumerate(tableItems):
            try:
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    playerUrl_set.add('http://www.nfl.com' + tableItem.find('a')['href'])

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    tableColumn = 0
            except:
                logger.exception('failed parsing row %d of %d', tableIndex, len(tableItems))

    logger.debug('parsePlayerNames time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(logName)
    return playerUrl_set
def parseWeek(year, week):
    """Parses a specific week at http://nflweather.com/week/{}/Week-{}
    Follows all detail links, which is where most of the data is scraped.
    Scrapes weather and stadium info for each game in the week, and stores them in their respective collections.
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/')

    startTime = datetime.now()
    logger.debug('Starting %d %d', year, week)

    weather_list = []
    stadium_list = []

    if col_weather_info.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://nflweather.com/week/{}/Week-{}".format(year, week))

    data = browser.find(class_="footable")
    rows = data.find_all('tr')
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        weatherInfo = {'year': year, 'week': week}
        stadiumInfo = {'year': year, 'week': week}
        try:
            columns = row.find_all('td')
            if columns:
                weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt']
                weatherInfo['weatherText'] = columns[9].text.strip()
                weatherInfo['shortWind'] = columns[10].text
                details = columns[12]
                detailsLink = 'http://nflweather.com' + details.find('a')['href']

                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)

                logger.debug('Following link')
                browser = open_or_follow_link(logger, browser, 'open', detailsLink)
                gameTime = browser.find('strong').text.split('-')[0].split(':', 1)[1].strip()
                awayTeam = browser.find_all(class_='g-away')[1].find('a').text.replace(u'\xa0', u' ').strip()
                homeTeam = browser.find_all(class_='g-home')[1].find('a').text.replace(u'\xa0', u' ').strip()
                spans = browser.find_all(class_='span5')
                if len(spans) != 2:
                    raise Exception('too many spans')

                weatherItems = spans[0].find_all('p')
                stadiumItems = spans[1].find_all('p')
                index = spans[0].text.find('Temperature:')
                weatherCondition = spans[0].text[:index].strip()
                for each in weatherItems:
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        weatherInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                for index, each in enumerate(stadiumItems):
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        if split[0] == 'Surface':
                            stadiumInfo['stadium'] = stadiumItems[index - 1].text.strip()
                        stadiumInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                #find nfl_schedule, update gameTime, hopefully get result as id, insert id into both info dicts, append to _list
                schedule_query = {'year': year, 'week': week, 'homeTeam': homeTeam, 'awayTeam': awayTeam}
                schedule_doc = col_schedule.find(schedule_query)
                if schedule_doc.count() != 1:
                    error_docs = str(schedule_query) + ' | ' + str(weatherInfo) + ' | ' + str(stadiumInfo)
                    raise Exception("nfl_schedule doc not found " + error_docs)
                result = col_schedule.update_one(schedule_query, {'$set': {'dateTime': gameTime}})
                schedule_id = schedule_doc[0]['_id']
                weatherInfo['schedule_id'] = schedule_id
                stadiumInfo['schedule_id'] = schedule_id
                weather_list.append(weatherInfo)
                stadium_list.append(stadiumInfo)
        except:
            logger.exception(row)

    try:
        logger.debug('Bulk Creating weather_list')
        col_weather_info.insert_many(weather_list)
        logger.debug('Bulk Creating stadium_list')
        col_stadium_info.insert_many(stadium_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(str(year) + '_' + str(week))
def parsePlayer(player_name, player_url): player_url = "http://www.pro-football-reference.com" + player_url logger = makeLogger(player_name, r'./logs_pfrPlayerStats/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) client = MongoClient('localhost', 27017) db = client['nfl_data'] col_pfr_player_bio = db['pfr_player_bio'] col_pfr_player_game_stats = db['pfr_player_game_stats'] col_pfr_player_career_stats = db['pfr_player_career_stats'] if col_pfr_player_bio.find({'player_url': player_url}).count(): logger.debug('Player already parsed ' + player_url) return wait = random.uniform(1.5, 3) logger.debug('Waiting %f', wait) time.sleep(wait) logger.debug('Opening player page %s', player_url) browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) browser = open_or_follow_link(logger, browser, 'open', player_url + '/gamelog') logger.debug('Parsing player meta') meta_div = browser.find(id="meta") meta_items = None for div in meta_div.find_all('div'): try: if div['itemtype'] == 'http://schema.org/Person': meta_items = div.find_all('p') except KeyError: pass player_bio = {'player_url': player_url, 'player_name': player_name} for meta_item in meta_items: physical_stat_row = False item_spans = meta_item.find_all('span') for item_span in item_spans: try: if item_span['itemprop'] == 'height': physical_stat_row = True player_bio['height'] = item_span.text elif item_span['itemprop'] == 'weight': physical_stat_row = True player_bio['weight'] = item_span.text except KeyError: pass if physical_stat_row: continue key_values = re.findall('([^:]+):([^:]+)(?: |$)', meta_item.text) for key, value in key_values: player_bio[cleanKey(key.replace(u'\xa0', u' '))] = value.strip().replace( u'\xa0', u' ') try: logger.debug('Creating player bio') player_bio_id = col_pfr_player_bio.insert(player_bio) except: logger.exception('insert error') return try: regular_season_div = browser.find(id='all_stats') regular_season_table = regular_season_div.find( class_="table_outer_container").find(id="div_stats") except AttributeError: logger.debug('No game logs, exiting player') return career_total_dicts = [] try: game_stat_dicts, career_total_dict = parseTable( logger, player_bio_id, regular_season_table, 'regular season') career_total_dicts.append(career_total_dict) except: logger.exception('parseTable error. Deleting user bio and exiting') col_pfr_player_bio.remove({'player_url': player_url}) return playoff_table = browser.find(id="stats_playoffs") if not playoff_table: logger.debug('No playoff game logs') else: try: temp_game_dicts, career_total_dict = parseTable( logger, player_bio_id, playoff_table, 'playoffs') game_stat_dicts += temp_game_dicts career_total_dicts.append(career_total_dict) except: logger.exception('parseTable error. Deleting user bio and exiting') col_pfr_player_bio.remove({'player_url': player_url}) return try: logger.debug('Bulk Creating game_stat_dicts') if game_stat_dicts: col_pfr_player_game_stats.insert_many(game_stat_dicts) else: logger.debug('Nothing to insert') except: logger.exception('insert_many error') try: logger.debug('Bulk Creating career_total_dicts') if career_total_dict: col_pfr_player_career_stats.insert_many(career_total_dicts) else: logger.debug('Nothing to insert') except: logger.exception('insert_many error') logger.debug('parsePlayer time elapsed: ' + str(datetime.now() - startTime)) closeLogger(logger)
def parseYear(team_name, year_url, year):
    """Parses the "games" table at year_url on http://www.pro-football-reference.com for a given team and year.

    Each row is converted to a document keyed by the table's data-stat headers
    and stored in nfl_data.team_stats_weekly.
    """
    logger = makeLogger(cleanKey(team_name) + "_" + str(year), r"./logs_pfrTeamStats/")

    startTime = datetime.now()
    logger.debug("Starting %d", year)

    client = MongoClient("localhost", 27017)
    db = client["nfl_data"]
    col_team_stats_weekly = db["team_stats_weekly"]

    # need to fix this to actually detect duplicates
    # if col_team_stats_weekly.find({'year': year}).count():
    #     logger.debug('Already parsed %s', year)
    #     closeLogger(logger)
    #     return None

    wait = random.uniform(1.5, 3.5)
    logger.debug("Waiting %f", wait)
    time.sleep(wait)

    logger.debug("Opening main page")
    browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, "open", year_url)

    table = browser.find(id="games")
    rows = table.find_all("tr")
    header = [cleanKey(each.attrs["data-stat"]) for each in rows[0].find_all("th")]
    rows = rows[1:]
    row_dicts = []
    for index, row in enumerate(rows):
        logger.debug("Row %d of %d", index, len(rows))
        try:
            week_number = convertToNumber(row.find("th").text)
            row_values = [convertToNumber(value.text) for value in row.find_all("td")]
            row_values.insert(0, week_number)
            row_dict = dict(zip(header, row_values))
            row_dict["year"] = year
            row_dict["team_name"] = team_name
            row_dict["year_url"] = year_url

            if row_dict["game_date"].lower() == "playoffs":
                continue

            row_dicts.append(row_dict)
        except:
            logger.exception(row)

    logger.debug("team_stats_weekly.insert_many")
    if row_dicts:
        col_team_stats_weekly.insert_many(row_dicts)

    logger.debug("parseYear time elapsed: " + str(datetime.now() - startTime))

    closeLogger(logger)
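The commented-out guard in parseYear notes that duplicate detection still needs fixing: checking only 'year' would skip every remaining team once a single team's season had been stored. A guard scoped to both team and year would avoid that. The snippet below is a sketch of that intended fix, written in the same pymongo style the file already uses; it is an assumption, not the author's code.

from pymongo import MongoClient

def already_parsed(col_team_stats_weekly, team_name, year):
    # Hypothetical duplicate guard scoped to team AND year, sketching the fix
    # hinted at by the TODO comment in parseYear.
    return col_team_stats_weekly.find({'year': year, 'team_name': team_name}).count() > 0

# Example usage mirroring parseYear's setup:
# col = MongoClient('localhost', 27017)['nfl_data']['team_stats_weekly']
# if already_parsed(col, team_name, year):
#     logger.debug('Already parsed %s %d', team_name, year)
#     closeLogger(logger)
#     return None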
def parseSeason(role, category, season, seasonTypes):
    """Parses every seasonType in a season at http://www.nfl.com/stats/categorystats for a given role/category/season.

    Doesn't follow any links.
    Some years don't have any info but still return a page; these are logged with Exception('No teams found %s' % url).
    All data is stored in team_stats.
    """
    logger = makeLogger(role.text + '_' + category.text + '_' + season.text, r'./logs_nflteamStat/')

    startTime = datetime.now()
    logger.debug('Starting %s %s %s', role.text, category.text, season.text)

    teamStat_list = []
    for seasonType in seasonTypes:
        if seasonType.text == "Season Type...":
            continue

        team_stats_query = {
            'year': convertToNumber(removeNewLine(season.text)),
            'seasonType': removeNewLine(seasonType.text),
            'role': removeNewLine(role.text),
            'category': removeNewLine(category.text)
        }
        if col_team_stats.find(team_stats_query).count():
            logger.debug('Already parsed %s', team_stats_query)
            continue

        wait = random.uniform(1.5, 3.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Starting: %s', team_stats_query)
        url = 'http://www.nfl.com/stats/categorystats?' + 'archive=true&conference=null' + '&role=' + role['value']
        try:
            if role.text == "Offense":
                categoryUrl = '&offensiveStatisticCategory=' + category['value'] + '&defensiveStatisticCategory=null'
            elif role.text == "Defense":
                categoryUrl = '&offensiveStatisticCategory=null&defensiveStatisticCategory=' + category['value']
            else:
                raise Exception('Unsupported role: %s' % role.text)

            url += categoryUrl
            url += '&season=' + season['value'] + '&seasonType=' + seasonType['value'] + '&tabSeq=2&qualified=false&Submit=Go'

            logger.debug('Opening: %s', url)
            browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

            tries = 0
            # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
            while not result:
                if tries > 10:
                    raise Exception('No teams found %s' % url)
                elif tries > 0:
                    time.sleep(random.uniform(5, 7))
                tries += 1
                logger.debug('No result-tries: %d', tries)
                browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
                browser = open_or_follow_link(logger, browser, 'open', url)
                result = browser.find(id="result")

            tbodies = result.find_all("tbody")
            if len(tbodies) != 2:
                raise Exception("error parsing result")

            tableKey = tbodies[0]
            tableKeyRows = tableKey.find_all("tr")
            topTableKeys = []
            if len(tableKeyRows) == 1:
                tableKey = tableKey.find_all("th")
            elif len(tableKeyRows) == 2:
                topTableColumns = tableKeyRows[0].find_all("th")
                for topTableColumn in topTableColumns:
                    for _ in range(int(topTableColumn['colspan'])):
                        topTableKeys.append(topTableColumn.text)
                tableKey = tableKeyRows[1].find_all("th")
            else:
                raise Exception('Too many header rows found')

            tableItems = tbodies[1]
            tableItems = tableItems.find_all("td")

            tableColumn = 0
            teamStatDict = {}
            for tableIndex, tableItem in enumerate(tableItems):
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    teamStatDict['team'] = removeNewLine(tableItem.text)
                    teamStatDict['year'] = int(removeNewLine(season.text))
                    teamStatDict['seasonType'] = removeNewLine(seasonType.text)
                    teamStatDict['role'] = removeNewLine(role.text)
                    teamStatDict['category'] = removeNewLine(category.text)
                    tableColumn += 1
                    continue

                if topTableKeys and topTableKeys[tableColumn]:
                    key = topTableKeys[tableColumn] + '_' + tableKey[tableColumn].text
                else:
                    key = tableKey[tableColumn].text
                key = cleanKey(removeNewLine(key))
                value = convertToNumber(removeNewLine(tableItem.text))
                teamStatDict[key] = value

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    teamStat_list.append(teamStatDict)
                    teamStatDict = {}
                    tableColumn = 0
        except:
            logger.exception('row fail')

    try:
        if teamStat_list:
            logger.debug('Bulk Creating teamStat_list')
            col_team_stats.insert_many(teamStat_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseSeason time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(role.text + '_' + category.text + '_' + season.text)
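The two-row header case in parseSeason flattens a grouped header by repeating each top-level label colspan times, so each data column ends up with a compound key such as rushing_att. The standalone sketch below illustrates that flattening with hypothetical labels (nfl.com's real headers vary by category) and uses a simple lowercase join in place of the project's cleanKey helper.

def flatten_header(top_row, bottom_row):
    # top_row: (label, colspan) pairs from the grouped header row
    # bottom_row: one label per actual column from the second header row
    top_keys = []
    for label, colspan in top_row:
        top_keys.extend([label] * colspan)
    return [(top + '_' + bottom).lower() for top, bottom in zip(top_keys, bottom_row)]

# Hypothetical example:
print(flatten_header([('Rushing', 3), ('Passing', 2)], ['Att', 'Yds', 'TD', 'Att', 'Yds']))
# -> ['rushing_att', 'rushing_yds', 'rushing_td', 'passing_att', 'passing_yds']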