def test_country_scrapers_returns_only_counties_default():
    """Check the class names returned by an unfiltered scraper lookup.

    ``Bra`` and ``Pak`` must be among the names returned by
    ``Runner.country_scrapers()``, while ``CountryScraper`` and ``Runner``
    must not be.
    """
    scraper_names = {cls.__name__ for cls in Runner().country_scrapers()}

    for expected in ('Bra', 'Pak'):
        assert expected in scraper_names
    for excluded in ('CountryScraper', 'Runner'):
        assert excluded not in scraper_names
def process_data(country, cache_dir):
    """Instantiate a runner and scrape the data for a single country.

    This is equivalent to the command line call::

        covid-world-scraper --cache-dir=$PWD/covid-cache bra

    Args:
        country: country identifier; passed to the runner as the only
            entry of its ``filter`` list.
        cache_dir: directory handed to the runner as ``cache_dir``.
    """
    # No alert manager (e.g., could be slack)
    runner = Runner(alert_manager=None)

    # Make sure geckodriver is on the path by putting the working directory
    # first. Use the platform's separator, and skip the update when the
    # working directory already leads PATH so that repeated calls (one per
    # country) do not keep growing the variable.
    cwd = os.getcwd()
    current = os.environ.get("PATH", "")
    entries = current.split(os.pathsep) if current else []
    if not entries or entries[0] != cwd:
        os.environ["PATH"] = os.pathsep.join([cwd, *entries])

    print(f"Processing {country}")
    runner.run(cache_dir=cache_dir, headless_status=True, filter=[country])
def test_run_is_called_on_country_scrapers():
    """Every scraper returned by ``Runner.run`` has had ``run`` called once.

    ``Runner.country_scrapers`` is patched so that only two mock scraper
    classes are available to the runner.
    """
    fake_classes = [Mock(name='Bra'), Mock(name='Pak')]
    target = 'covid_world_scraper.runner.Runner.country_scrapers'

    with patch(target) as patched_lookup:
        patched_lookup.return_value = fake_classes
        executed = Runner().run()
        for scraper in executed:
            scraper.run.assert_called_once()
def cli(countries, all, alert, cache_dir, list_scrapers, log_file, headless):
    """Scrape data for one or more countries."""
    # NOTE(review): the docstring above is kept short and unchanged because
    # it presumably doubles as the command's help text -- confirm against
    # the decorators on this function.
    #
    # Parameters, as used below:
    #   countries     -- forwarded to ``runner.run`` as ``filter``.
    #   all           -- accepted but not referenced in this function body.
    #   alert         -- when truthy, try to configure Slack alerts.
    #   cache_dir     -- created if missing, then forwarded to the runner.
    #   list_scrapers -- when truthy, only print the available scrapers.
    #   log_file      -- file that ``logging.basicConfig`` appends to.
    #   headless      -- forwarded to ``runner.run`` as ``headless_status``.

    # Ensure cache directory exists
    Path(cache_dir).mkdir(parents=True, exist_ok=True)

    # Log to the file in append mode and mirror INFO+ records to the console.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)-12s - %(message)s',
        datefmt='%m-%d %H:%M',
        filename=log_file,
        filemode='a',
    )
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter('%(name)-12s - %(message)s'))
    logging.getLogger('').addHandler(console)
    logger = logging.getLogger(__name__)

    # Set up alert manager
    alert_manager = None
    if alert:
        try:
            api_key = os.environ['COVID_WORLD_SLACK_API_KEY']
            channel = os.environ['COVID_WORLD_SLACK_CHANNEL']
            alert_msg = "Slack alerts will be sent to #{}.".format(channel)
            alert_manager = SlackAlertManager(api_key, channel)
        except KeyError:
            # A missing environment variable disables alerts rather than
            # aborting the scrape.
            alert_msg = (
                "WARNING - Slack alerts will not be sent.\n"
                "Please ensure you've configured the below environment variables:\n"
                "COVID_WORLD_SLACK_API_KEY=YOUR_API_KEY\n"
                "COVID_WORLD_SLACK_CHANNEL=channel-name\n\n"
            )
        finally:
            logger.warning(alert_msg)

    runner = Runner(alert_manager=alert_manager)
    if list_scrapers:
        click.echo('Available country scrapers:')
        for country in runner.list_countries():
            click.echo('- {}'.format(country))
    else:
        kwargs = {
            'cache_dir': cache_dir,
            'headless_status': headless,
            'filter': countries,
        }
        try:
            runner.run(**kwargs)
            if alert and alert_manager:
                runner.send_alerts()
        except Exception as e:
            # Top-level boundary: log the failure instead of crashing.
            # format_exception (unlike format_tb) also includes the exception
            # type and message, so the log says what went wrong and not only
            # where.
            traceback_str = ''.join(
                traceback.format_exception(type(e), e, e.__traceback__)
            )
            logger.error(
                "ERROR: A fatal error occurred while running scrapers or sending alerts!!!"
            )
            logger.error(traceback_str)
def test_send_alerts():
    """Alerts are posted only on request: one success and one error message.

    Two fake scrapers are used; the second raises a generic error when it
    is called.
    """
    scrapers_target = 'covid_world_scraper.runner.Runner.country_scrapers'
    post_target = 'covid_world_scraper.alerts.WebClient.chat_postMessage'

    with patch(scrapers_target) as patched_lookup, \
            patch(post_target) as slack_post:
        patched_lookup.return_value = [
            Mock(name='Bra', country_code='BRA'),
            Mock(name='Pak', side_effect=Exception('Woe is me')),
        ]

        # Runner configured with an alert manager instance
        runner = Runner(
            alert_manager=SlackAlertManager('APIKEY', 'some-channel')
        )

        # Running only collects messages; nothing is posted yet
        runner.run()
        slack_post.assert_not_called()

        # Posting happens once alerts are explicitly requested
        runner.send_alerts()
        slack_post.assert_called()
        assert slack_post.call_count == 2

        # Each recorded call is (args, kwargs); the text is a keyword arg
        posted_texts = [call[1]['text'] for call in slack_post.call_args_list]
        success_text, error_text = posted_texts
        assert '1 scraper(s) ran successfully' in success_text
        assert 'Woe is me' in error_text
def list_countries():
    """Return the leading token of every country label the runner reports.

    Each entry of ``Runner.list_countries()`` is cut at its first space and
    only the part before it is kept (a label such as ``'PAK (Pakistan)'``
    yields ``'PAK'``). The result is intended to be mapped onto the process
    data function (to run in parallel if possible).
    """
    labels = Runner(alert_manager=None).list_countries()
    return [label.partition(' ')[0] for label in labels]
#!/usr/bin/env python3

# This is a test for running a data scrape, separate from a workflow. We want
# to make sure this works in its simplest form before adding it to a workflow.
# We are using an application to process covid data via:
# https://github.com/biglocalnews/covid-world-scraper

import os

from covid_world_scraper import Runner

# No alert manager (e.g., could be slack)
runner = Runner(alert_manager=None)

# runner.list_countries()
# ['BRA (Brazil)',
#  'DEU (Germany)',
#  'IND (India)',
#  'KOR (South Korea)',
#  'NGA (Nigeria)',
#  'PAK (Pakistan)',
#  'ZAF (South Africa)']

# Set a custom cache directory (created if it does not exist yet) and run
# headless.
cache_dir = os.path.join(os.getcwd(), 'covid-cache')
os.makedirs(cache_dir, exist_ok=True)

# Testing a single run! The working directory is put first on PATH, joined
# with the platform's own separator.
os.environ["PATH"] = os.pathsep.join([os.getcwd(), os.environ["PATH"]])
runner.run(cache_dir=cache_dir, headless_status=True, filter=["BRA"])
def test_list_countries():
    """``Runner.list_countries()`` includes the ``'PAK (Pakistan)'`` entry."""
    available = Runner().list_countries()
    assert 'PAK (Pakistan)' in available
def test_country_scrapers_with_incorrect_country_name():
    """Filtering on an unknown country name raises ``CountryScraperError``."""
    # Build the runner outside the ``raises`` block so that only the lookup
    # under test can satisfy the expected exception.
    runner = Runner()
    with pytest.raises(CountryScraperError):
        # Consume the result in case the scrapers are produced lazily.
        list(runner.country_scrapers(filter=['Foo']))
def test_country_scrapers_filter():
    """Filtering on ``'Bra'`` returns that scraper and leaves ``Pak`` out."""
    selected = Runner().country_scrapers(filter=['Bra'])
    selected_names = [cls.__name__ for cls in selected]

    assert 'Bra' in selected_names
    assert 'Pak' not in selected_names
def test_country_codes():
    """``Runner.country_codes`` has keys for at least ``BRA`` and ``PAK``."""
    required_codes = {'BRA', 'PAK'}
    known_codes = set(Runner().country_codes.keys())
    assert required_codes <= known_codes