import argparse

# Parallel/delayed follow the standard joblib calling pattern used below.
from joblib import Parallel, delayed

# Crawler, Scraper, data, logger, wait_on_page_load, DIR_PATH,
# OUTPUT_DIRECTORY_PATH and get_target_sports_from_file are assumed to be
# defined elsewhere in this module/project.


def scrape_games_for_season(this_season):
    global wait_on_page_load
    # First pass: a crawler collects every pagination link for the season.
    logger.info('Season "%s" - getting all pagination links', this_season.name)
    crawler = Crawler(wait_on_page_load=wait_on_page_load)
    logger.info('Season "%s" - started this crawler', this_season.name)
    crawler.fill_in_season_pagination_links(this_season)
    crawler.close_browser()
    logger.info('Season "%s" - closed this crawler', this_season.name)
    # Second pass: a scraper visits those links and fills in the game data.
    logger.info('Season "%s" - populating all game data via pagination links',
                this_season.name)
    scraper = Scraper(wait_on_page_load=wait_on_page_load)
    logger.info('Season "%s" - started this scraper', this_season.name)
    scraper.populate_games_into_season(this_season)
    scraper.close_browser()
    logger.info('Season "%s" - closed this scraper', this_season.name)
    return this_season
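# For reference, get_target_sports_from_file() is expected to return a list of
# dicts with the keys main() reads below ('collection_name', 'root_url',
# 'outcomes'). The entry sketched here is illustrative only - the values and
# the exact type of 'outcomes' are assumptions, not taken from the source:
#
#   {
#       "collection_name": "some-league-name",          # hypothetical value
#       "root_url": "https://www.oddsportal.com/...",   # hypothetical value
#       "outcomes": ...                                 # project-specific format
#   }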
def main(sport=None):
    global logger, data, wait_on_page_load
    # Instantiate the argument parser
    parser = argparse.ArgumentParser(description='oddsporter v1.0')
    # Declare all our acceptable arguments below...
    parallel_cpus_desc = ('Number of parallel CPUs for processing '
                          '(default -1 for max available)')
    parser.add_argument('--number-of-cpus', type=int, nargs='?',
                        help=parallel_cpus_desc)
    parser.add_argument(
        '--wait-time-on-page-load', type=int, nargs='?',
        help='How many seconds to wait on page load (default 3)')
    # Then grab them from the command line input
    # START parsing command line arguments and logging what's happening
    args = parser.parse_args()
    max_parallel_cpus = args.number_of_cpus
    if max_parallel_cpus is None:
        logger.info('Did not receive argument --number-of-cpus so will use '
                    'maximum available to crawl and scrape')
        max_parallel_cpus = -1
    else:
        logger.info('Received argument --number-of-cpus so will use %s',
                    max_parallel_cpus)
    if args.wait_time_on_page_load is not None:
        wait_on_page_load = args.wait_time_on_page_load
        logger.info('Received argument --wait-time-on-page-load so will wait '
                    '%s seconds', wait_on_page_load)
    else:
        logger.info('Did not receive argument --wait-time-on-page-load so '
                    'will use default 3 seconds')
    # END parsing command line arguments and logging what's happening
    logger.info('About to load "target sports"')
    target_sports = get_target_sports_from_file()
    if len(target_sports) < 1:
        raise RuntimeError('config/sports.json file appears empty - cannot proceed')
    logger.info('Now prompting user for which sport/league to scrape')
    print('Please input the corresponding number of which sport/league to scrape')
    print('\t[0] all sports *buggy*')
    for i, target_sport_obj in enumerate(target_sports):
        print('\t[' + str(i + 1) + '] ' + target_sport_obj['collection_name'])
    if sport:
        sport_to_do = str(sport)
    else:
        sport_to_do = input('Selection: ')
    if not sport_to_do.isdigit():
        raise RuntimeError('Invalid selection, please re-run and try again')
    sport_to_do = int(sport_to_do)
    logger.info('Starting scrape of OddsPortal.com')
    logger.info('Loaded configuration for %d sports\' results to scrape',
                len(target_sports))
    if sport_to_do == 0:
        logger.info('Will attempt to scrape all sports')
    else:
        logger.info('Only scraping one sport though')
    ran_once = False
    for i, target_sport_obj in enumerate(target_sports):
        if (i + 1) != sport_to_do and sport_to_do != 0:
            continue
        ran_once = True
        c_name = target_sport_obj['collection_name']
        logger.info('Starting data collection "%s"', c_name)
        data.start_new_data_collection(target_sport_obj)
        main_league_results_url = target_sport_obj['root_url']
        # Create a fresh crawler per sport: its browser is closed once the
        # season links are collected, so a single instance created outside
        # this loop would be unusable on the second iteration.
        crawler = Crawler(wait_on_page_load=wait_on_page_load,
                          full_scraper_path=DIR_PATH)
        logger.info('Crawler for season links has been initialized')
        working_seasons = crawler.get_seasons_for_league(main_league_results_url)
        crawler.close_browser()
        logger.info('Crawler for season links has been shut down')
        # Make sure the possible-outcomes field is set, because the parallel
        # processor needs to know it. Iterate over the seasons directly so the
        # outer loop's index `i` is not clobbered.
        for season in working_seasons:
            season.possible_outcomes = target_sport_obj['outcomes']
        # Use parallel processing to scrape games for each season of this
        # league's history
        working_seasons_w_games = Parallel(n_jobs=max_parallel_cpus)(
            delayed(scrape_games_for_season)(this_season)
            for this_season in working_seasons)
        data[c_name].league.seasons = working_seasons_w_games
    if ran_once:
        logger.info('Saving output now')
        data.set_output_directory(OUTPUT_DIRECTORY_PATH)
        data.save_all_collections_to_json()
    else:
        logger.warning('Did not run - invalid command line input for sport')
    logger.info('Ending scrape of OddsPortal.com')
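
# Hypothetical entry point - a sketch assuming `logger`, `data` and
# `wait_on_page_load` are initialized at module level elsewhere in this file;
# not part of the original source:
if __name__ == '__main__':
    main()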