# Standard-library and third-party imports used by these scrapers. The project helpers
# (makeLogger, closeLogger, cleanKey, convertToNumber, removeNewLine, get_user_agent,
# open_or_follow_link) and the module-level collections (col_schedule, col_weather_info,
# col_stadium_info, col_team_stats, col_player_career_stats, col_player_splits,
# col_player_game_logs) are defined elsewhere in the repo. Functions that share a name
# (parseWeek, parseYear) belong to different scraper modules, as the log directories show.
import random
import re
import time
from datetime import datetime

from pymongo import MongoClient
from robobrowser import RoboBrowser


def parseWeek(year, week):
    """
    Parses a specific week at
    http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1
    which contains a semicolon-separated CSV of the FanDuel player prices.
    Stores this info in the fanduel_prices collection.
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_RotoFDStats/')

    startTime = datetime.now()
    logger.debug('Starting %d', year)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_fanduel_prices = db['fanduel_prices']

    if col_fanduel_prices.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html.parser', user_agent=get_user_agent(logger), timeout=10)
    url = "http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1".format(week, year)
    browser = open_or_follow_link(logger, browser, 'open', url)

    docs = []
    try:
        data = browser.find('pre').text
        lines = data.split('\n')
        header = lines[0].split(';')
        lines = lines[1:]
        for line in lines:
            if not line:
                continue
            doc = {}
            for index, each in enumerate(line.split(';')):
                doc[cleanKey(header[index])] = convertToNumber(each)
            docs.append(doc)
    except:
        logger.exception("Parse fail: %s", url)

    try:
        logger.debug('Bulk Creating docs')
        col_fanduel_prices.insert_many(docs)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(str(year) + '_' + str(week))
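# Hedged driver sketch (not in the original): loop parseWeek over a span of seasons.
# The year and week ranges are assumptions; rotoguru's fyday.pl also serves playoff
# weeks past 17 and earlier seasons.
def parseAllFanduelWeeks(start_year=2011, end_year=2015):
    for year in range(start_year, end_year + 1):
        for week in range(1, 18):
            parseWeek(year, week)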
def parseTable(logger, pfr_player_bio_id, table, season_type):
    game_stat_dicts = []
    career_total_dict = {'pfr_player_bio_id': pfr_player_bio_id, 'season_type': season_type}

    logger.debug('Parsing ' + season_type)
    games = table.find('tbody').find_all('tr')
    for game in games:
        game_stats = {}
        # skips over class='thead' rows since they only have th's
        values = game.find_all('td')
        for column_index, value_object in enumerate(values):
            key = cleanKey(value_object['data-stat'])
            if value_object.text == '':
                value = 0
            elif key == 'game_date':
                value = datetime.strptime(value_object.text, '%Y-%m-%d')
            else:
                value = convertToNumber(value_object.text)
            # key 'scoring' not accurate when a kicker didn't attempt any FG but had XP
            game_stats[key] = value
        if not game_stats:
            continue
        game_stats['pfr_player_bio_id'] = pfr_player_bio_id
        game_stats['season_type'] = season_type
        game_stat_dicts.append(game_stats)

    logger.debug('Parsing totals ' + season_type)
    career_totals = table.find('tfoot').find('tr').find_all('td')
    for career_total in career_totals:
        key = cleanKey(career_total['data-stat'])
        value = convertToNumber(career_total.text)
        if not value:
            continue
        career_total_dict[key] = value

    return game_stat_dicts, career_total_dict
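# The helpers convertToNumber and cleanKey are used throughout but defined elsewhere in
# the repo. A minimal sketch of the behaviour this code appears to rely on (assumption;
# the real implementations may differ):
def convertToNumber(value):
    # return an int or float when the string parses as one, otherwise the original value
    try:
        return int(value)
    except (TypeError, ValueError):
        try:
            return float(value)
        except (TypeError, ValueError):
            return value


def cleanKey(key):
    # normalise a scraped header into a Mongo-friendly key (assumed behaviour)
    return key.strip().lower().replace(' ', '_').replace('.', '')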
def parseTeam(team_url, team_name):
    """
    Parses a team's page and returns a list of year urls.
    There is some data on this page that would be useful to scrape in the future.
    """
    logger = makeLogger(cleanKey(team_name), r'./logs_pfrTeamStats/')

    startTime = datetime.now()
    logger.debug('Starting %s', team_name)

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', team_url)

    table = browser.find(id='team_index').find('tbody')
    year_columns = table.find_all('th')

    year_url_tups = []
    for index, year_column in enumerate(year_columns):
        logger.debug('Row %d of %d', index, len(year_columns))
        try:
            year_link = year_column.find('a')
            if year_link:
                year_url = 'http://www.pro-football-reference.com' + year_link['href']
                year = convertToNumber(year_link.text)
                if not isinstance(year, int):
                    continue
                year_url_tups.append((team_name, year_url, year))
        except:
            logger.exception(year_column)

    logger.debug('parseTeam time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(logger)

    return year_url_tups
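# Hedged wiring sketch (not in the original): parseTeam returns (team_name, year_url, year)
# tuples matching the three-argument parseYear below (the team_stats_weekly scraper, assumed
# to live in the same module). team_url_tups is a placeholder for a scraped team list.
def parseAllTeamYears(team_url_tups):
    for team_url, team_name in team_url_tups:
        for name, year_url, year in parseTeam(team_url, team_name):
            parseYear(name, year_url, year)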
def parseCareerStats(logger, careerStats, player_profile_id):
    """
    Parses career stats page for given player.
    Stores each row in its own doc.
    """
    startTime = datetime.now()
    logger.debug('Starting careerStats')

    careerStats_list = []
    for tableNumber, careerStat in enumerate(careerStats):
        logger.debug('Table %d of %d', tableNumber, len(careerStats))
        try:
            tableName = careerStat.find("div").text.strip()
            tableKey = careerStat.find_all(class_="player-table-key")[-1]
            tableKey = tableKey.find_all('td')
            tableItems = careerStat.find("tbody").find_all("td")
            rowDict = {'category': tableName, 'player_profile_id': player_profile_id}
            rowYear = None
            tableColumn = 0
        except:
            logger.exception('failed parsing table')
            continue

        for index, item in enumerate(tableItems, start=1):
            try:
                if 'class' in item.attrs:
                    if item.attrs['class'][0] == 'border-td':
                        continue
                    if item.attrs['class'][0] == 'player-totals':
                        break
                if tableColumn == 0:
                    logger.debug('Row %d of %d', index, len(tableItems))
                    rowDict['year'] = convertToNumber(item.text.strip())
                    tableColumn += 1
                    continue
                rowDict[cleanKey(tableKey[tableColumn].text)] = convertToNumber(item.text.strip())
                tableColumn += 1
                if tableColumn >= len(tableKey):
                    careerStats_list.append(rowDict)
                    rowDict = {'category': tableName, 'player_profile_id': player_profile_id}
                    tableColumn = 0
            except:
                logger.exception('failed parsing row %d of %s', index, tableName)
                while tableColumn < len(tableKey):
                    tableColumn += 1
                rowDict = {'category': tableName, 'player_profile_id': player_profile_id}

    try:
        logger.debug('Bulk Creating careerStats_list')
        if careerStats_list:
            col_player_career_stats.insert_many(careerStats_list)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseCareerStats time elapsed: ' + str(datetime.now() - startTime))
def parseYear(team_name, year_url, year):
    """
    Parses the 'games' table on a team's year page (year_url) and stores one doc
    per game row in nfl_data.team_stats_weekly, keyed by team_name and year.
    """
    logger = makeLogger(cleanKey(team_name) + '_' + str(year), r'./logs_pfrTeamStats/')

    startTime = datetime.now()
    logger.debug('Starting %d', year)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_team_stats_weekly = db['team_stats_weekly']

    # need to fix this to actually detect duplicates
    # if col_team_stats_weekly.find({'year': year}).count():
    #     logger.debug('Already parsed %s', year)
    #     closeLogger(logger)
    #     return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', year_url)

    table = browser.find(id='games')
    rows = table.find_all('tr')
    header = [cleanKey(each.attrs['data-stat']) for each in rows[0].find_all('th')]
    rows = rows[1:]

    row_dicts = []
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            week_number = convertToNumber(row.find('th').text)
            row_values = [convertToNumber(value.text) for value in row.find_all('td')]
            row_values.insert(0, week_number)
            row_dict = dict(zip(header, row_values))
            row_dict['year'] = year
            row_dict['team_name'] = team_name
            row_dict['year_url'] = year_url
            if row_dict['game_date'].lower() == 'playoffs':
                continue
            row_dicts.append(row_dict)
        except:
            logger.exception(row)

    logger.debug('team_stats_weekly.insert_many')
    col_team_stats_weekly.insert_many(row_dicts)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(logger)
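# Hedged sketch for the "need to fix this to actually detect duplicates" TODO above:
# checking team_name together with year (instead of year alone) would make the
# commented-out early return safe to re-enable. Not part of the original code.
def alreadyParsedTeamYear(col_team_stats_weekly, team_name, year):
    return col_team_stats_weekly.find({'team_name': team_name, 'year': year}).count() > 0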
def parseYear(year):
    """
    Parses the schedule for a specific year on
    http://www.pro-football-reference.com/years/{YEAR}/games.htm
    Follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info).
    Stores schedule info in nfl_data.schedule.
    Stores game_info in nfl_data.game_info with schedule ids.
    """
    logger = makeLogger(year, r'./logs_pfrSchedule/')

    startTime = datetime.now()
    logger.debug('Starting %d', year)

    schedule_list = []
    gameInfo_list = []

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_schedule = db['schedule']
    col_game_info = db['game_info']
    col_failed_game_info = db['failed_game_info']

    if col_schedule.find({'year': year}).count():
        logger.debug('Already parsed %s', year)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/years/{}/games.htm".format(year))

    table = browser.find(id='games')
    rows = table.find_all('tr')
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            schedule_dict = {}
            gameInfo_dict = {}
            columns = row.find_all('td')
            if columns:
                schedule_dict['week'] = convertToNumber(columns[0].text)
                schedule_dict['day'] = columns[1].text
                schedule_dict['date'] = columns[2].text
                schedule_dict['year'] = convertToNumber(year)
                homeIndicator = columns[5].text
                if homeIndicator == '@':
                    schedule_dict['homeTeam'] = columns[6].text
                    schedule_dict['awayTeam'] = columns[4].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[8].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[7].text)
                else:
                    schedule_dict['homeTeam'] = columns[4].text
                    schedule_dict['awayTeam'] = columns[6].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[7].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[8].text)

                gameInfo_dict['week'] = convertToNumber(columns[0].text)
                gameInfo_dict['year'] = convertToNumber(year)

                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)

                logger.debug('Following link')
                # keep the anchor tag so its relative href is still available for failed_dict
                boxscore_link = columns[3].find('a')
                if boxscore_link:
                    url = 'http://www.pro-football-reference.com' + boxscore_link['href']
                    failed_game_info = True
                    browser = open_or_follow_link(logger, browser, 'open', url)
                    game_info = browser.find(id="game_info")
                    if game_info:
                        for each in game_info.find_all('tr'):
                            pair = each.find_all('td')
                            if pair:
                                failed_game_info = False
                                key = pair[0].text
                                value = convertToNumber(pair[1].text)
                                gameInfo_dict[cleanKey(key)] = convertToNumber(value)
                    if failed_game_info:
                        failed_dict = schedule_dict
                        failed_dict['row'] = index
                        failed_dict['href'] = boxscore_link['href']
                        col_failed_game_info.insert(failed_dict)
                        gameInfo_dict['FAIL'] = True

                schedule_list.append(schedule_dict)
                gameInfo_list.append(gameInfo_dict)
        except:
            logger.exception(row)

    logger.debug('nfl_schedule.insert_many')
    schedule_ids = col_schedule.insert_many(schedule_list).inserted_ids

    logger.debug('mapping nfl_schedule.id to gameInfo_list')
    for index, schedule_id in enumerate(schedule_ids):
        if len(gameInfo_list[index].keys()) <= 2:
            logger.debug('Empty game_info: %s', schedule_id)
        gameInfo_list[index]['schedule_id'] = schedule_id

    logger.debug('game_info.insert_many')
    col_game_info.insert_many(gameInfo_list)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(year)
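# Hedged sketch (not in the original): the one-argument parseYear opens its own
# MongoClient and logger, so several seasons can be fetched in parallel with a small
# process pool. Pool size and the year range are assumptions.
from multiprocessing import Pool

def parseScheduleYears(years=tuple(range(2010, 2016)), processes=4):
    pool = Pool(processes=processes)
    try:
        pool.map(parseYear, years)
    finally:
        pool.close()
        pool.join()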
def parseWeek(year, week):
    """
    Parses a specific week at http://nflweather.com/week/{}/Week-{}
    Follows all detail links, which is where most of the data is scraped.
    Scrapes weather and stadium info per week, and stores them in their
    respective collections.
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/')

    startTime = datetime.now()
    logger.debug('Starting %d %d', year, week)

    weather_list = []
    stadium_list = []

    if col_weather_info.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://nflweather.com/week/{}/Week-{}".format(year, week))

    data = browser.find(class_="footable")
    rows = data.find_all('tr')
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        weatherInfo = {'year': year, 'week': week}
        stadiumInfo = {'year': year, 'week': week}
        try:
            columns = row.find_all('td')
            if columns:
                weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt']
                weatherInfo['weatherText'] = columns[9].text.strip()
                weatherInfo['shortWind'] = columns[10].text
                details = columns[12]
                detailsLink = 'http://nflweather.com' + details.find('a')['href']

                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)

                logger.debug('Following link')
                browser = open_or_follow_link(logger, browser, 'open', detailsLink)
                gameTime = browser.find('strong').text.split('-')[0].split(':', 1)[1].strip()
                # strip non-breaking spaces from the team names
                awayTeam = browser.find_all(class_='g-away')[1].find('a').text.replace(u'\xa0', u' ').strip()
                homeTeam = browser.find_all(class_='g-home')[1].find('a').text.replace(u'\xa0', u' ').strip()
                spans = browser.find_all(class_='span5')
                if len(spans) != 2:
                    raise Exception('too many spans')
                weatherItems = spans[0].find_all('p')
                stadiumItems = spans[1].find_all('p')
                index = spans[0].text.find('Temperature:')
                weatherCondition = spans[0].text[:index].strip()

                for each in weatherItems:
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        weatherInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                for index, each in enumerate(stadiumItems):
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        if split[0] == 'Surface':
                            stadiumInfo['stadium'] = stadiumItems[index - 1].text.strip()
                        stadiumInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                # find nfl_schedule doc, update gameTime, use its id in both info dicts, append to the lists
                schedule_query = {'year': year, 'week': week, 'homeTeam': homeTeam, 'awayTeam': awayTeam}
                schedule_doc = col_schedule.find(schedule_query)
                if schedule_doc.count() != 1:
                    error_docs = str(schedule_query) + ' | ' + str(weatherInfo) + ' | ' + str(stadiumInfo)
                    raise Exception("nfl_schedule doc not found " + error_docs)
                result = col_schedule.update_one(schedule_query, {'$set': {'dateTime': gameTime}})
                schedule_id = schedule_doc[0]['_id']
                weatherInfo['schedule_id'] = schedule_id
                stadiumInfo['schedule_id'] = schedule_id
                weather_list.append(weatherInfo)
                stadium_list.append(stadiumInfo)
        except:
            logger.exception(row)

    try:
        logger.debug('Bulk Creating weather_list')
        col_weather_info.insert_many(weather_list)
        logger.debug('Bulk Creating stadium_list')
        col_stadium_info.insert_many(stadium_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(str(year) + '_' + str(week))
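# Hedged usage sketch (not in the original). The schedule lookup above means the
# pro-football-reference schedule scraper has to run for a season before its weeks can
# be weather-scraped; this assumes the nflweather parseWeek defined above and
# regular-season weeks numbered 1-17 on nflweather.com.
def scrapeSeasonWeather(year, weeks=range(1, 18)):
    for week in weeks:
        parseWeek(year, week)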
def parsePlayer(player_name, player_url):
    player_url = "http://www.pro-football-reference.com" + player_url
    logger = makeLogger(player_name, r'./logs_pfrPlayerStats/')

    startTime = datetime.now()
    logger.debug('start time: ' + str(startTime))

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_pfr_player_bio = db['pfr_player_bio']
    col_pfr_player_game_stats = db['pfr_player_game_stats']
    col_pfr_player_career_stats = db['pfr_player_career_stats']

    if col_pfr_player_bio.find({'player_url': player_url}).count():
        logger.debug('Player already parsed ' + player_url)
        return

    wait = random.uniform(1.5, 3)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening player page %s', player_url)
    browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', player_url + '/gamelog')

    logger.debug('Parsing player meta')
    meta_div = browser.find(id="meta")
    meta_items = None
    for div in meta_div.find_all('div'):
        try:
            if div['itemtype'] == 'http://schema.org/Person':
                meta_items = div.find_all('p')
        except KeyError:
            pass

    player_bio = {'player_url': player_url, 'player_name': player_name}
    for meta_item in meta_items:
        physical_stat_row = False
        item_spans = meta_item.find_all('span')
        for item_span in item_spans:
            try:
                if item_span['itemprop'] == 'height':
                    physical_stat_row = True
                    player_bio['height'] = item_span.text
                elif item_span['itemprop'] == 'weight':
                    physical_stat_row = True
                    player_bio['weight'] = item_span.text
            except KeyError:
                pass
        if physical_stat_row:
            continue
        key_values = re.findall('([^:]+):([^:]+)(?: |$)', meta_item.text)
        for key, value in key_values:
            player_bio[cleanKey(key.replace(u'\xa0', u' '))] = value.strip().replace(u'\xa0', u' ')

    try:
        logger.debug('Creating player bio')
        player_bio_id = col_pfr_player_bio.insert(player_bio)
    except:
        logger.exception('insert error')
        return

    try:
        regular_season_div = browser.find(id='all_stats')
        regular_season_table = regular_season_div.find(class_="table_outer_container").find(id="div_stats")
    except AttributeError:
        logger.debug('No game logs, exiting player')
        return

    career_total_dicts = []
    try:
        game_stat_dicts, career_total_dict = parseTable(logger, player_bio_id, regular_season_table, 'regular season')
        career_total_dicts.append(career_total_dict)
    except:
        logger.exception('parseTable error. Deleting user bio and exiting')
        col_pfr_player_bio.remove({'player_url': player_url})
        return

    playoff_table = browser.find(id="stats_playoffs")
    if not playoff_table:
        logger.debug('No playoff game logs')
    else:
        try:
            temp_game_dicts, career_total_dict = parseTable(logger, player_bio_id, playoff_table, 'playoffs')
            game_stat_dicts += temp_game_dicts
            career_total_dicts.append(career_total_dict)
        except:
            logger.exception('parseTable error. Deleting user bio and exiting')
            col_pfr_player_bio.remove({'player_url': player_url})
            return

    try:
        logger.debug('Bulk Creating game_stat_dicts')
        if game_stat_dicts:
            col_pfr_player_game_stats.insert_many(game_stat_dicts)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    try:
        logger.debug('Bulk Creating career_total_dicts')
        if career_total_dicts:
            col_pfr_player_career_stats.insert_many(career_total_dicts)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parsePlayer time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(logger)
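# Hedged hardening sketch (not in the original): the "already parsed" check above is a
# read-then-write race if players are scraped in parallel. A unique index on player_url
# enforces the same guarantee at the database level.
from pymongo import ASCENDING

def ensurePlayerBioIndex():
    col = MongoClient('localhost', 27017)['nfl_data']['pfr_player_bio']
    col.create_index([('player_url', ASCENDING)], unique=True)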
def parseSplits(logger, splits, year, splitType, player_profile_id):
    """
    splitType: game or situational
    Parses 1 year of game splits or situational stats for given player.
    Stores each row in its own doc.
    """
    startTime = datetime.now()
    logger.debug('Starting %s splits', splitType)

    try:
        tabs = splits.find(class_="player-tabs")
        tabs = tabs.find_all('li')
    except:
        logger.exception('failed parsing player tabs')
        return

    splits_list = []
    for index, tab in enumerate(tabs):
        logger.debug('tab %d of %d', index, len(tabs))
        try:
            currentTabText = tab.text.strip()
            currentTab = splits.find(id='game_split_tabs_' + str(index))
            tables = currentTab.find_all('table')
        except:
            logger.exception('failed parsing player tables for tab %d of %d', index, len(tabs))
            continue

        for tableIndex, table in enumerate(tables):
            logger.debug('table %d of %d', tableIndex, len(tables))
            try:
                tableKey = table.find(class_="player-table-key")
                tableKey = tableKey.find_all('td')
                tableName = tableKey[0].text.strip()
                tableItems = table.find('tbody').find_all('td')
                rowDict = {'currentTabText': currentTabText, 'category': tableName,
                           'player_profile_id': player_profile_id, 'year': int(year), 'splitType': splitType}
                tableColumn = 0
            except:
                logger.exception('failed parsing player table %d of %d', tableIndex, len(tables))
                continue

            for rowIndex, item in enumerate(tableItems):
                try:
                    if 'class' in item.attrs:
                        if item.attrs['class'][0] == 'border-td':
                            continue
                    if tableColumn == 0:
                        logger.debug('Row %d of %d', rowIndex, len(tableItems))
                        rowName = item.text.strip()
                        rowDict['rowName'] = rowName
                        tableColumn += 1
                        continue
                    rowDict[cleanKey(tableKey[tableColumn].text)] = convertToNumber(item.text.strip())
                    tableColumn += 1
                    if tableColumn >= len(tableKey):
                        splits_list.append(rowDict)
                        tableColumn = 0
                        rowDict = {'currentTabText': currentTabText, 'category': tableName,
                                   'player_profile_id': player_profile_id, 'year': int(year), 'splitType': splitType}
                except:
                    logger.exception('failed parsing row %d of %s', rowIndex, tableName)
                    while tableColumn < len(tableKey):
                        tableColumn += 1
                    rowDict = {'currentTabText': currentTabText, 'category': tableName,
                               'player_profile_id': player_profile_id, 'year': int(year), 'splitType': splitType}

    try:
        logger.debug('Bulk Creating splits_list')
        if splits_list:
            col_player_splits.insert_many(splits_list)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseSplits time elapsed: ' + str(datetime.now() - startTime))
def parseGameLogs(logger, gameLogs, year, player_profile_id):
    """
    Parses 1 year of game logs for given player.
    Stores each row in its own doc.
    """
    startTime = datetime.now()
    logger.debug('Starting gameLogs')

    gameLogs_list = []
    # messy because of bye weeks: 1 less column present on those rows
    for tableNumber, gameLog in enumerate(gameLogs):
        logger.debug('Table %d of %d', tableNumber, len(gameLogs))
        try:
            topTableColumns = gameLog.find(class_="player-table-header").find_all('td')
            topTableKey = []
            if len(topTableColumns) > 1:
                for index, topTableColumn in enumerate(topTableColumns):
                    for _ in range(int(topTableColumn['colspan'])):
                        if index == 0:
                            topTableKey.append('')
                        else:
                            topTableKey.append(topTableColumn.text)
            tableName = topTableColumns[0].text.strip()
            tableKey = gameLog.find(class_="player-table-key")
            tableKey = tableKey.find_all('td')
            if topTableKey:
                for index, key in enumerate(tableKey):
                    if topTableKey[index]:
                        tableKey[index] = cleanKey(topTableKey[index] + '_' + key.text)
                    else:
                        tableKey[index] = cleanKey(key.text)
            tableItems = gameLog.find("tbody").find_all("td")
            rowDict = {'category': tableName, 'player_profile_id': player_profile_id, 'year': int(year)}
            tableColumn = 0
            byeWeek = False
            columnsSkip = 0
            rowWeek = None
        except:
            logger.exception('failed parsing table')
            continue

        for index, item in enumerate(tableItems):
            try:
                if byeWeek:
                    if columnsSkip >= len(tableKey) - 3:
                        byeWeek = False
                        columnsSkip = 0
                        tableColumn = 0
                    else:
                        columnsSkip += 1
                    continue
                # skip borders
                if 'class' in item.attrs:
                    if item.attrs['class'][0] == 'border-td':
                        continue
                # detect Total row and break
                if 'colspan' in item.attrs:
                    if item.attrs['colspan'] == "3":
                        if 'class' in tableItems[index + 1].attrs:
                            if tableItems[index + 1].attrs["class"][0] == "player-totals":
                                break
                if tableColumn == 0:
                    logger.debug('Row %d of %d', index, len(tableItems))
                    rowDict['week'] = convertToNumber(item.text.strip())
                    tableColumn += 1
                    continue
                if tableColumn == 1:
                    if item.text.strip() == "Bye":
                        byeWeek = True
                        gameDate = "Bye"
                        tableColumn += 1
                        # store Nones for the remaining columns of a bye week
                        while tableColumn < len(tableKey):
                            rowDict[tableKey[tableColumn]] = None
                            tableColumn += 1
                if not byeWeek:
                    if tableColumn == 2:
                        opp = None
                        linksFound = len(item.find_all('a'))
                        if linksFound == 2:
                            opp = item.find_all('a')[1].text.strip()
                        elif linksFound == 1:
                            opp = item.find_all('a')[0].text.strip()
                        else:
                            opp = item.text.strip()
                        rowDict[tableKey[tableColumn]] = opp.replace('\t', '').replace('\n', '')
                        tableColumn += 1
                        continue
                    if tableColumn == 3:
                        outCome = item.find("span")
                        if not outCome:
                            outCome = 'T'
                        else:
                            outCome = outCome.text.strip()
                        score = None
                        linksFound = len(item.find_all("a"))
                        if linksFound == 1:
                            score = item.find("a").text.strip()
                        elif linksFound == 0:
                            score = re.findall('[0-9]+-[0-9]+', item.text)[0]
                        result = outCome + score
                        rowDict[tableKey[tableColumn]] = result
                        tableColumn += 1
                        continue
                    rowDict[tableKey[tableColumn]] = convertToNumber(item.text.strip())
                    tableColumn += 1
                    if tableColumn >= len(tableKey):
                        gameLogs_list.append(rowDict)
                        rowDict = {'category': tableName, 'player_profile_id': player_profile_id, 'year': int(year)}
                        tableColumn = 0
                        byeWeek = False
            except:
                logger.exception('failed parsing row %d of %s. Skipping the row', index, tableName)
                while tableColumn < len(tableKey):
                    tableColumn += 1
                rowDict = {'category': tableName, 'player_profile_id': player_profile_id, 'year': int(year)}

    try:
        logger.debug('Bulk Creating gameLogs_list')
        if gameLogs_list:
            col_player_game_logs.insert_many(gameLogs_list)
        else:
            logger.debug('Nothing to insert')
    except:
        logger.exception('insert_many error')

    logger.debug('parseGameLogs time elapsed: ' + str(datetime.now() - startTime))
def parseSeason(role, category, season, seasonTypes):
    """
    Parses every seasonType in a season at http://www.nfl.com/stats/categorystats
    for a given role/category/season. Doesn't follow any links.
    Some years don't have any info but still return a page; these are logged with
    Exception('No teams found %s' % url).
    All data is stored in team_stats.
    """
    logger = makeLogger(role.text + '_' + category.text + '_' + season.text, r'./logs_nflteamStat/')

    startTime = datetime.now()
    logger.debug('Starting %s %s %s', role.text, category.text, season.text)

    teamStat_list = []
    for seasonType in seasonTypes:
        if seasonType.text == "Season Type...":
            continue

        team_stats_query = {
            'year': convertToNumber(removeNewLine(season.text)),
            'seasonType': removeNewLine(seasonType.text),
            'role': removeNewLine(role.text),
            'category': removeNewLine(category.text)
        }
        if col_team_stats.find(team_stats_query).count():
            logger.debug('Already parsed %s', team_stats_query)
            continue

        wait = random.uniform(1.5, 3.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Starting: %s', team_stats_query)
        url = 'http://www.nfl.com/stats/categorystats?' + 'archive=true&conference=null' + '&role=' + role['value']
        try:
            if role.text == "Offense":
                categoryUrl = '&offensiveStatisticCategory=' + category['value'] + '&defensiveStatisticCategory=null'
            elif role.text == "Defense":
                categoryUrl = '&offensiveStatisticCategory=null&defensiveStatisticCategory=' + category['value']
            else:
                raise Exception('Unsupported role: %s' % role.text)

            url += categoryUrl
            url += '&season=' + season['value'] + '&seasonType=' + seasonType['value'] + '&tabSeq=2&qualified=false&Submit=Go'

            logger.debug('Opening: %s', url)
            browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

            tries = 0
            # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
            while not result:
                if tries > 10:
                    raise Exception('No teams found %s' % url)
                elif tries > 0:
                    time.sleep(random.uniform(5, 7))
                tries += 1
                logger.debug('No result-tries: %d', tries)
                browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
                browser = open_or_follow_link(logger, browser, 'open', url)
                result = browser.find(id="result")

            tbodies = result.find_all("tbody")
            if len(tbodies) != 2:
                raise Exception("error parsing result")

            tableKey = tbodies[0]
            tableKeyRows = tableKey.find_all("tr")
            topTableKeys = []
            if len(tableKeyRows) == 1:
                tableKey = tableKey.find_all("th")
            elif len(tableKeyRows) == 2:
                topTableColumns = tableKeyRows[0].find_all("th")
                for topTableColumn in topTableColumns:
                    for _ in range(int(topTableColumn['colspan'])):
                        topTableKeys.append(topTableColumn.text)
                tableKey = tableKeyRows[1].find_all("th")
            else:
                raise Exception('Too many header rows found')

            tableItems = tbodies[1]
            tableItems = tableItems.find_all("td")

            tableColumn = 0
            teamStatDict = {}
            for tableIndex, tableItem in enumerate(tableItems):
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    teamStatDict['team'] = removeNewLine(tableItem.text)
                    teamStatDict['year'] = int(removeNewLine(season.text))
                    teamStatDict['seasonType'] = removeNewLine(seasonType.text)
                    teamStatDict['role'] = removeNewLine(role.text)
                    teamStatDict['category'] = removeNewLine(category.text)
                    tableColumn += 1
                    continue

                if topTableKeys and topTableKeys[tableColumn]:
                    key = topTableKeys[tableColumn] + '_' + tableKey[tableColumn].text
                else:
                    key = tableKey[tableColumn].text
                key = cleanKey(removeNewLine(key))
                value = convertToNumber(removeNewLine(tableItem.text))
                teamStatDict[key] = value

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    teamStat_list.append(teamStatDict)
                    teamStatDict = {}
                    tableColumn = 0
        except:
            logger.exception('row fail')

    try:
        if teamStat_list:
            logger.debug('Bulk Creating teamStat_list')
            col_team_stats.insert_many(teamStat_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseSeason time elapsed: ' + str(datetime.now() - startTime))
    closeLogger(role.text + '_' + category.text + '_' + season.text)
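# Hedged sketch (not in the original): the "already parsed" check above queries
# col_team_stats on year/seasonType/role/category on every pass, so a compound index
# on those fields keeps the check cheap as the collection grows. Field names come
# from team_stats_query; the index itself is an assumption.
from pymongo import ASCENDING

def ensureTeamStatsIndex(col_team_stats):
    col_team_stats.create_index(
        [('year', ASCENDING), ('seasonType', ASCENDING), ('role', ASCENDING), ('category', ASCENDING)]
    )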