def test_write(tmpdir):
    from civic_scraper.base.cache import Cache

    cache = Cache(tmpdir)
    content = "<h1>some content</h1>"
    file_path = "html/search_results_page.html"
    outfile = cache.write(file_path, content)
    scrape_dir = tmpdir.join("html")
    files = [f.basename for f in scrape_dir.listdir()]
    assert "search_results_page.html" in files
    actual_contents = file_contents(outfile)
    assert actual_contents == content
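# NOTE: file_contents is a small shared test helper (e.g. in conftest.py) that
# these tests rely on but that is not shown here. A minimal sketch of what it
# is assumed to do:
def file_contents(path):
    """Return the text contents of the file at the given path."""
    with open(path, "r") as infile:
        return infile.read()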
def test_scrape_cache_false_default(tmpdir):
    "Scrape should not cache search results pages by default"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-03"
    end_date = "2020-05-06"
    cp.scrape(start_date, end_date)
    actual_files = [f.basename for f in tmpdir.listdir()]
    assert actual_files == []
def test_scrape_download_default(tmpdir):
    "Scraper should not download file assets by default"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    cp.scrape(start_date, end_date)
    target_dir = tmpdir.join("assets")
    assert not target_dir.exists()
def test_scrape_current_day_by_default(today_local_str, tmpdir):
    "Scrape should assume the current day by default"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    cp.scrape(download=True)
    target_dir = tmpdir.join("assets")
    actual_files = {f.basename for f in target_dir.listdir()}
    expected = {
        "civicplus_nc-nashcounty_05052020-382_minutes.pdf",
        "civicplus_nc-nashcounty_05052020-382_agenda.pdf",
    }
    assert actual_files == expected
def test_scrape_download_filter_type(tmpdir):
    "Downloads should be filterable by type"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    # asset_list names the asset types to filter out, so excluding
    # minutes leaves only the agenda to download
    cp.scrape(
        start_date,
        end_date,
        download=True,
        asset_list=["minutes"],
    )
    target_dir = tmpdir.join("assets")
    actual_files = [f.basename for f in target_dir.listdir()]
    expected = ["civicplus_nc-nashcounty_05052020-382_agenda.pdf"]
    assert actual_files == expected
def test_scrape_download_true(tmpdir):
    "Setting download=True should download file assets"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    cp.scrape(
        start_date,
        end_date,
        download=True,
    )
    target_dir = tmpdir.join("assets")
    actual_files = {f.basename for f in target_dir.listdir()}
    expected = {
        "civicplus_nc-nashcounty_05052020-382_minutes.pdf",
        "civicplus_nc-nashcounty_05052020-382_agenda.pdf",
    }
    assert actual_files == expected
def test_scrape_download_filter_both(tmpdir):
    "Downloads should be filterable by type and file size"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    # Below, minutes (~0.028 MB) is filtered out because it exceeds the
    # 0.019 MB file_size cutoff, *and* agenda (~0.018 MB) is filtered out
    # because its type appears in asset_list
    cp.scrape(
        start_date,
        end_date,
        download=True,
        asset_list=["agenda"],
        file_size=0.019,
    )
    target_dir = tmpdir.join("assets")
    actual_files = [f.basename for f in target_dir.listdir()]
    assert actual_files == []
def test_scrape_download_filter_size(tmpdir):
    "Downloads should be filterable by size in MB"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    # Byte sizes of the two files for May 5, 2020:
    # - Minutes/_05052020-382 = 28998 bytes
    # - Agenda/_05052020-382  = 19536 bytes (0.0186309814453125 MB)
    cp.scrape(
        start_date,
        end_date,
        download=True,
        file_size=0.0186309814453125,
    )
    target_dir = tmpdir.join("assets")
    actual_files = [f.basename for f in target_dir.listdir()]
    expected = ["civicplus_nc-nashcounty_05052020-382_agenda.pdf"]
    assert actual_files == expected
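# The MB value in the comment above corresponds to binary megabytes
# (1 MB = 1024 * 1024 bytes). A quick check of the conversion the size
# cutoff relies on:
assert 19536 / (1024 * 1024) == 0.0186309814453125  # agenda sits exactly at the cutoff
assert 28998 / (1024 * 1024) > 0.0186309814453125   # minutes exceeds it and is skipped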
def test_scrape_cache_true(tmpdir):
    "Setting cache to True should trigger caching of search results page"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-03"
    end_date = "2020-05-06"
    cp.scrape(
        start_date,
        end_date,
        cache=True,
    )
    artifacts_path = tmpdir.join("artifacts")
    actual_files = [f.basename for f in artifacts_path.listdir()]
    expected = [
        (
            "http__nc-nashcounty.civicplus.com__AgendaCenter__Search__QUERY"
            "term=&CIDs=all&startDate=05%2F03%2F2020"
            "&endDate=05%2F06%2F2020&dateRange=&dateSelector="
        )
    ]
    assert actual_files == expected
    # Spot check contents
    inpath = artifacts_path.join(expected[0])
    contents = file_contents(inpath)
    assert "Board of Commissioners" in contents
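# The expected filename above is the search-results URL with its separators
# flattened. A rough sketch of the transformation it implies (not necessarily
# the library's actual implementation):
def url_to_cache_name(url):
    """Turn a search-results URL into a flat, filesystem-safe name."""
    return url.replace("://", "__").replace("/", "__").replace("?", "QUERY")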
def test_default_cache_dir(monkeypatch):
    from unittest.mock import patch

    target = "civic_scraper.utils.expanduser"
    with patch(target) as mock_method:
        mock_method.return_value = "/Users/you"
        cache = Cache()
        assert cache.path == "/Users/you/.civic-scraper"
def test_custom_cache_path(tmpdir):
    from civic_scraper.base.cache import Cache

    cache = Cache(tmpdir)
    assert tmpdir == cache.path
def scrape(
    self,
    start_date,
    end_date,
    site_urls=[],
    cache=False,
    download=False,
):
    """Scrape file metadata and assets for a list of agency sites.

    For a given scraper, scrapes file artifact metadata and downloads file
    artifacts. Automatically generates a metadata CSV of file assets.

    If requested, caches intermediate file artifacts such as HTML from
    scraped pages and downloads file assets such as agendas and minutes
    (caching and downloading are optional and are off by default).

    Args:
        start_date (str): Start date of scrape (YYYY-MM-DD)
        end_date (str): End date of scrape (YYYY-MM-DD)
        site_urls (list): List of site URLs
        cache (bool): Optionally cache intermediate file artifacts such as
            HTML (default: False)
        download (bool): Optionally download file assets such as agendas
            (default: False)

    Outputs:
        Metadata CSV listing file assets for given sites and params.

    Returns:
        AssetCollection instance
    """
    asset_collection = AssetCollection()
    cache_obj = Cache(self.cache_path)
    logger.info(
        f"Scraping {len(site_urls)} site(s) from {start_date} to {end_date}..."
    )
    for url in site_urls:
        SiteClass = self._get_site_class(url)
        kwargs = {}
        if cache:
            kwargs["cache"] = cache_obj
        site = SiteClass(url, **kwargs)
        logger.info(f"\t{url}")
        _collection = site.scrape(
            start_date,
            end_date,
            cache=cache,
        )
        asset_collection.extend(_collection)
    metadata_file = asset_collection.to_csv(cache_obj.metadata_files_path)
    logger.info(f"Wrote asset metadata CSV: {metadata_file}")
    if download:
        download_counter = 0
        logger.info(
            f"Downloading {len(asset_collection)} file asset(s) to {cache_obj.assets_path}..."
        )
        for asset in asset_collection:
            # TODO: Add error-handling here
            logger.info(f"\t{asset.url}")
            asset.download(cache_obj.assets_path)
            download_counter += 1
    return asset_collection
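# A minimal usage sketch for the scrape() method above. The Runner class name,
# its import path, and the cache_path constructor argument are assumptions
# here, not confirmed by this section:
from civic_scraper.runner import Runner

runner = Runner(cache_path="/tmp/civic-scraper")
assets = runner.scrape(
    start_date="2020-05-03",
    end_date="2020-05-06",
    site_urls=["http://nc-nashcounty.civicplus.com/AgendaCenter"],
    cache=True,
    download=True,
)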
def test_env_configured_default(monkeypatch):
    "CIVIC_SCRAPER_DIR env var should configure cache"
    monkeypatch.setenv("CIVIC_SCRAPER_DIR", "/tmp/civic-scraper")
    cache = Cache()
    assert cache.path == "/tmp/civic-scraper"
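# A sketch of the cache-path resolution the two tests above imply (not the
# library's actual source): CIVIC_SCRAPER_DIR wins when set, otherwise the
# default is ~/.civic-scraper.
import os
from os.path import expanduser, join

def default_cache_path():
    """Resolve the default cache directory the way the tests above expect."""
    return os.environ.get("CIVIC_SCRAPER_DIR", join(expanduser("~"), ".civic-scraper"))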