def run(wait): """Starts the scrapping proccess. creates a process per year between minyear and maxyear """ logger = makeLogger('main', r'./logs_pfrSchedule/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) minyear = 1960 maxyear = 2015 pool = Pool(processes=int(get_proxy_count() / 2)) for i in range(maxyear - minyear + 1): year = minyear + i #parseYear(year) pool.apply_async(parseYear, (year, )) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now() - startTime)) closeLogger('main')
def run(wait=0): """Starts the scrapping proccess. creates a process per week per year given in pages """ logger = makeLogger('main', r'./logs_RotoFDStats/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) logger.debug('starting') pool = Pool(processes=int(get_proxy_count()/2)) pages = [(2011, 17), (2012, 17), (2013, 17), (2014, 17), (2015, 17)] for year, maxWeek in pages: for week in range(1, maxWeek+1): #parseWeek(year, week) pool.apply_async(parseWeek,(year, week,)) pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now()-startTime )) closeLogger('main')
def run(wait): """Starts the scrapping proccess. creates a process for each week per year given in pages """ logger = makeLogger('main', r'./logs_nflWeather/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) pool = Pool(processes=int(get_proxy_count()/2.5)) #nflweather.com goes back to 2009, 2010 seems to be missing on the site. pages = [(2009, 17), (2011, 17), (2012, 17), (2013, 17), (2014, 17), (2015, 17)] headers = [] dataList = [] for year, maxWeek in pages: for week in range(1, maxWeek+1): #parseWeek(year, week) pool.apply_async(parseWeek, (year, week,)) pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now()-startTime )) closeLogger('main')
def run(wait): """Starts the scrapping proccess. creates a process per year between minyear and maxyear """ logger = makeLogger('main', r'./logs_pfrSchedule/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) minyear = 1960 maxyear = 2015 pool = Pool(processes=int(get_proxy_count()/2)) for i in range(maxyear-minyear+1): year = minyear + i #parseYear(year) pool.apply_async(parseYear, (year,)) pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now()-startTime )) closeLogger('main')
def run(wait=0): """Starts the scrapping proccess. creates a process per week per year given in pages """ logger = makeLogger('main', r'./logs_RotoFDStats/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) logger.debug('starting') pool = Pool(processes=int(get_proxy_count() / 2)) pages = [(2011, 17), (2012, 17), (2013, 17), (2014, 17), (2015, 17)] for year, maxWeek in pages: for week in range(1, maxWeek + 1): #parseWeek(year, week) pool.apply_async(parseWeek, ( year, week, )) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now() - startTime)) closeLogger('main')
def run(wait): """Starts the scrapping proccess. Opens a teamstats page and gathers all the form inputs Then sends these inputs to parseSeason which opens a new page for every possible option in the form If you get an error at the start, with role.find_all, just try again, nfl.com returns weird pages sometimes """ logger = makeLogger('main', r'./logs_nflteamStat/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) pool = Pool(processes=int(get_proxy_count()/2.5)) #html5lib parser required for broken html on gameSplits browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG" browser = open_or_follow_link(logger, browser, 'open', startingUrl) role = browser.find(id="role") roles = role.find_all("option") offensiveCategory = browser.find(id="offensive-category") offensiveCategories = offensiveCategory.find_all("option") defensiveCategory = browser.find(id="defensive-category") defensiveCategories = defensiveCategory.find_all("option") season = browser.find(id="season-dropdown") seasons = season.find_all("option") seasonType = browser.find(id="season-type") seasonTypes = seasonType.find_all("option") for role in roles: availableCategories = None if role.text == "Offense": availableCategories = offensiveCategories elif role.text == "Defense": availableCategories = defensiveCategories else: print "unknown role" for category in availableCategories: if category.text == "Category...": continue for season in seasons: if season.text == "Season..." or convertToNumber(removeNewLine(season.text)) < 1960: continue #parseSeason(role, category, season, seasonTypes) pool.apply_async(parseSeason, (role, category, season, seasonTypes,)) pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now()-startTime )) closeLogger('main')
def run(wait): """ """ logger = makeLogger('main', r'./logs_pfrPlayerStats/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) player_tuples = [] for letter in list(string.ascii_uppercase): wait = random.uniform(.5, 1.5) logger.debug('Waiting %f', wait) time.sleep(wait) logger.debug('Opening players %s', letter) browser = open_or_follow_link( logger, browser, 'open', "http://www.pro-football-reference.com/players/{}/".format(letter)) players = browser.find(id="div_players") for player in players.find_all('p'): player = player.find('a') player_tuples.append((player.text, player['href'])) pool = Pool(processes=int(get_proxy_count() / 2.5)) logger.debug('Processing %d Players', len(player_tuples)) for player_tuple in player_tuples: #parsePlayer(player_tuple[0], player_tuple[1]) pool.apply_async(parsePlayer, ( player_tuple[0], player_tuple[1], )) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time elapsed: ' + str(datetime.now() - startTime)) closeLogger(logger)
def run(wait): """ """ logger = makeLogger('main', r'./logs_pfrPlayerStats/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) player_tuples = [] for letter in list(string.ascii_uppercase): wait = random.uniform(.5,1.5) logger.debug('Waiting %f', wait) time.sleep(wait) logger.debug('Opening players %s', letter) browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/players/{}/".format(letter)) players = browser.find(id="div_players") for player in players.find_all('p'): player = player.find('a') player_tuples.append((player.text, player['href'])) pool = Pool(processes=int(get_proxy_count()/2.5)) logger.debug('Processing %d Players', len(player_tuples)) for player_tuple in player_tuples: #parsePlayer(player_tuple[0], player_tuple[1]) pool.apply_async(parsePlayer, (player_tuple[0], player_tuple[1],)) pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time elapsed: ' + str(datetime.now() - startTime)) closeLogger(logger)
def run(wait): """Starts the scrapping proccess. creates a process for each week per year given in pages """ logger = makeLogger('main', r'./logs_nflWeather/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) pool = Pool(processes=int(get_proxy_count() / 2.5)) #nflweather.com goes back to 2009, 2010 seems to be missing on the site. pages = [(2009, 17), (2011, 17), (2012, 17), (2013, 17), (2014, 17), (2015, 17)] headers = [] dataList = [] for year, maxWeek in pages: for week in range(1, maxWeek + 1): #parseWeek(year, week) pool.apply_async(parseWeek, ( year, week, )) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now() - startTime)) closeLogger('main')
def run(wait): """Starts the scrapping proccess. Opens a teamstats page and gathers all the form inputs Then sends these inputs to parseSeason which opens a new page for every possible option in the form If you get an error at the start, with role.find_all, just try again, nfl.com returns weird pages sometimes """ logger = makeLogger('main', r'./logs_nflteamStat/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) pool = Pool(processes=int(get_proxy_count() / 2.5)) #html5lib parser required for broken html on gameSplits browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG" browser = open_or_follow_link(logger, browser, 'open', startingUrl) role = browser.find(id="role") roles = role.find_all("option") offensiveCategory = browser.find(id="offensive-category") offensiveCategories = offensiveCategory.find_all("option") defensiveCategory = browser.find(id="defensive-category") defensiveCategories = defensiveCategory.find_all("option") season = browser.find(id="season-dropdown") seasons = season.find_all("option") seasonType = browser.find(id="season-type") seasonTypes = seasonType.find_all("option") for role in roles: availableCategories = None if role.text == "Offense": availableCategories = offensiveCategories elif role.text == "Defense": availableCategories = defensiveCategories else: print "unknown role" for category in availableCategories: if category.text == "Category...": continue for season in seasons: if season.text == "Season..." or convertToNumber( removeNewLine(season.text)) < 1960: continue #parseSeason(role, category, season, seasonTypes) pool.apply_async(parseSeason, ( role, category, season, seasonTypes, )) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now() - startTime)) closeLogger('main')
def run(wait): """Starts the scrapping proccess. creates a process per year between minyear and maxyear """ logger = makeLogger('main', r'./logs_pfrTeamStats/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) logger.debug('Opening main page') browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) browser = open_or_follow_link( logger, browser, 'open', "http://www.pro-football-reference.com/teams/") table_body = browser.find(id='teams_active').find('tbody') rows = table_body.find_all('tr') team_url_tups = [] for index, row in enumerate(rows): logger.debug('Row %d of %d', index, len(rows)) try: team_link = row.find('th').find('a') if team_link: team_url = 'http://www.pro-football-reference.com' + team_link[ 'href'] team_name = team_link.text team_url_tups.append((team_url, team_name)) except: logger.exception(row) pool = Pool(processes=int(get_proxy_count() / 2.5)) results = [] for team_url, team_name in team_url_tups: #print parseTeam(team_url, team_name) results.append(pool.apply_async(parseTeam, ( team_url, team_name, ))) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). year_url_tups = [] for result in results: year_url_tup = result.get() if year_url_tup: year_url_tups += (year_url_tup) logger.debug('Done gathering %d year urls', len(year_url_tups)) pool = Pool(processes=int(get_proxy_count() / 2)) logger.debug('Shuffling year_urls') random.shuffle(year_url_tups) logger.debug('Starting to parse year_urls') for team_name, year_url, year in year_url_tups: #parseYear(team_name, year_url, year) pool.apply_async(parseYear, ( team_name, year_url, year, )) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now() - startTime)) closeLogger('main')
def run(wait): """ First collects a set of playerUrls to parse using, parsePlayerNames. Then parses each player. Both tasks use multiprocessing """ logger = makeLogger('main', r'./logs_nflPlayerStats/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) pool = Pool(processes=int(get_proxy_count()/2.5)) results = [] #html5lib parser required for broken html on gameSplits browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) startingUrl = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=PASSING&qualified=true&season=2015&seasonType=PRE' browser = open_or_follow_link(logger, browser, 'open', startingUrl) statisticCategory = browser.find(id="statistic-category") statisticCategories = statisticCategory.find_all("option") season = browser.find(id="season-dropdown") seasons = season.find_all("option") seasonType = browser.find(id="season-type") seasonTypes = seasonType.find_all("option") for statisticCategory in statisticCategories: if statisticCategory.text == 'Category...': continue for season in seasons: if season.text == 'Season...': continue for seasonType in seasonTypes: if seasonType.text == 'Season Type...': continue results.append(pool.apply_async(parsePlayerNames, (statisticCategory['value'], season['value'], seasonType['value'],))) pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join(). playerUrl_set = set() for result in results: try: result_set = result.get() if result_set: playerUrl_set = playerUrl_set.union(result_set) except: logger.exception('Error in parsePlayerNames worker') with open('../playerUrl_set.json', 'w') as playerUrl_json: playerUrl_json.write(json.dumps(list(playerUrl_set))) pool = Pool(processes=int(get_proxy_count()/2.5)) logger.debug('Starting to parse %d players', len(playerUrl_set)) for playerUrl in playerUrl_set: if col_player_profiles.find({'player_url': playerUrl}).count(): logger.debug('Skipping ' + playerUrl) continue #parsePlayer(playerUrl) pool.apply_async(parsePlayer, (playerUrl,)) pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now()-startTime )) closeLogger('main')
def run(wait): """Starts the scrapping proccess. creates a process per year between minyear and maxyear """ logger = makeLogger("main", r"./logs_pfrTeamStats/") startTime = datetime.now() logger.debug("start time: " + str(startTime)) logger.debug("waiting %d seconds", wait) time.sleep(wait) logger.debug("Opening main page") browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10) browser = open_or_follow_link(logger, browser, "open", "http://www.pro-football-reference.com/teams/") table_body = browser.find(id="teams_active").find("tbody") rows = table_body.find_all("tr") team_url_tups = [] for index, row in enumerate(rows): logger.debug("Row %d of %d", index, len(rows)) try: team_link = row.find("th").find("a") if team_link: team_url = "http://www.pro-football-reference.com" + team_link["href"] team_name = team_link.text team_url_tups.append((team_url, team_name)) except: logger.exception(row) pool = Pool(processes=int(get_proxy_count() / 2.5)) results = [] for team_url, team_name in team_url_tups: # print parseTeam(team_url, team_name) results.append(pool.apply_async(parseTeam, (team_url, team_name))) pool.close() # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join() # Wait for the worker processes to exit. One must call close() or terminate() before using join(). year_url_tups = [] for result in results: year_url_tup = result.get() if year_url_tup: year_url_tups += year_url_tup logger.debug("Done gathering %d year urls", len(year_url_tups)) pool = Pool(processes=int(get_proxy_count() / 2)) logger.debug("Shuffling year_urls") random.shuffle(year_url_tups) logger.debug("Starting to parse year_urls") for team_name, year_url, year in year_url_tups: # parseYear(team_name, year_url, year) pool.apply_async(parseYear, (team_name, year_url, year)) pool.close() # Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join() # Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug("run time: " + str(datetime.now() - startTime)) closeLogger("main")
def run(wait): """ First collects a set of playerUrls to parse using, parsePlayerNames. Then parses each player. Both tasks use multiprocessing """ logger = makeLogger('main', r'./logs_nflPlayerStats/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) pool = Pool(processes=int(get_proxy_count() / 2.5)) results = [] #html5lib parser required for broken html on gameSplits browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) startingUrl = 'http://www.nfl.com/stats/categorystats?tabSeq=0&statisticCategory=PASSING&qualified=true&season=2015&seasonType=PRE' browser = open_or_follow_link(logger, browser, 'open', startingUrl) statisticCategory = browser.find(id="statistic-category") statisticCategories = statisticCategory.find_all("option") season = browser.find(id="season-dropdown") seasons = season.find_all("option") seasonType = browser.find(id="season-type") seasonTypes = seasonType.find_all("option") for statisticCategory in statisticCategories: if statisticCategory.text == 'Category...': continue for season in seasons: if season.text == 'Season...': continue for seasonType in seasonTypes: if seasonType.text == 'Season Type...': continue results.append( pool.apply_async(parsePlayerNames, ( statisticCategory['value'], season['value'], seasonType['value'], ))) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). playerUrl_set = set() for result in results: try: result_set = result.get() if result_set: playerUrl_set = playerUrl_set.union(result_set) except: logger.exception('Error in parsePlayerNames worker') with open('../playerUrl_set.json', 'w') as playerUrl_json: playerUrl_json.write(json.dumps(list(playerUrl_set))) pool = Pool(processes=int(get_proxy_count() / 2.5)) logger.debug('Starting to parse %d players', len(playerUrl_set)) for playerUrl in playerUrl_set: if col_player_profiles.find({'player_url': playerUrl}).count(): logger.debug('Skipping ' + playerUrl) continue #parsePlayer(playerUrl) pool.apply_async(parsePlayer, (playerUrl, )) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now() - startTime)) closeLogger('main')