Example #1
def test_write(tmpdir):
    from civic_scraper.base.cache import Cache

    cache = Cache(tmpdir)
    content = "<h1>some content</h1>"
    file_path = "html/search_results_page.html"
    outfile = cache.write(file_path, content)
    scrape_dir = tmpdir.join("html")
    files = [f.basename for f in scrape_dir.listdir()]
    assert "search_results_page.html" in files
    actual_contents = file_contents(outfile)
    assert actual_contents == content

def test_scrape_cache_false_default(tmpdir):
    "Scrape should not cache search results pages by default"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-03"
    end_date = "2020-05-06"
    cp.scrape(start_date, end_date)
    actual_files = [f.basename for f in tmpdir.listdir()]
    assert actual_files == []

def test_scrape_download_default(tmpdir):
    "Scraper should not download file assets by default"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    cp.scrape(
        start_date,
        end_date,
    )
    target_dir = tmpdir.join("assets")
    assert not target_dir.exists()

def test_scrape_current_day_by_default(today_local_str, tmpdir):
    "Scrape should assume current day by default"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    cp.scrape(download=True)
    target_dir = tmpdir.join("assets")
    actual_files = set([f.basename for f in target_dir.listdir()])
    expected = set([
        'civicplus_nc-nashcounty_05052020-382_minutes.pdf',
        'civicplus_nc-nashcounty_05052020-382_agenda.pdf'
    ])
    assert actual_files == expected

def test_scrape_download_filter_type(tmpdir):
    "Downloads should be filterable by type"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    cp.scrape(
        start_date,
        end_date,
        download=True,
        asset_list=["minutes"],
    )
    target_dir = tmpdir.join("assets")
    actual_files = [f.basename for f in target_dir.listdir()]
    expected = ["civicplus_nc-nashcounty_05052020-382_agenda.pdf"]
    assert actual_files == expected

def test_scrape_download_true(tmpdir):
    "Setting download=True should download file assets"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    cp.scrape(
        start_date,
        end_date,
        download=True,
    )
    target_dir = tmpdir.join("assets")
    actual_files = set([f.basename for f in target_dir.listdir()])
    expected = set([
        "civicplus_nc-nashcounty_05052020-382_minutes.pdf",
        "civicplus_nc-nashcounty_05052020-382_agenda.pdf",
    ])
    assert actual_files == expected

def test_scrape_download_filter_both(tmpdir):
    "Downloads should be filterable by type and file size"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    # Below, the minutes file (~0.0277 MB) is filtered out because it exceeds
    # the 0.019 MB file_size limit, *and* the agenda (~0.0186 MB) is filtered
    # out because of asset_list, so nothing gets downloaded.
    cp.scrape(
        start_date,
        end_date,
        download=True,
        asset_list=["agenda"],
        file_size=0.019,
    )
    target_dir = tmpdir.join("assets")
    actual_files = [f.basename for f in target_dir.listdir()]
    assert actual_files == []

def test_scrape_download_filter_size(tmpdir):
    "Downloads should be filterable by size in MB"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-05"
    end_date = "2020-05-05"
    # Byte sizes of the two files for May 5, 2020:
    # - Minutes/_05052020-382 = 28998 bytes (~0.0277 MB)
    # - Agenda/_05052020-382 = 19536 bytes = 0.0186309814453125 MB
    # The agenda sits exactly at the file_size limit below, so only it is
    # downloaded (see the sanity check after this example).
    cp.scrape(
        start_date,
        end_date,
        download=True,
        file_size=0.0186309814453125,
    )
    target_dir = tmpdir.join("assets")
    actual_files = [f.basename for f in target_dir.listdir()]
    expected = ["civicplus_nc-nashcounty_05052020-382_agenda.pdf"]
    assert actual_files == expected

def test_scrape_cache_true(tmpdir):
    "Setting cache to True should trigger caching of search results page"
    url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    cp = CivicPlusSite(url, cache=Cache(tmpdir))
    start_date = "2020-05-03"
    end_date = "2020-05-06"
    cp.scrape(
        start_date,
        end_date,
        cache=True,
    )
    artifacts_path = tmpdir.join("artifacts")
    actual_files = [f.basename for f in artifacts_path.listdir()]
    expected = [
        ("http__nc-nashcounty.civicplus.com__AgendaCenter__Search__QUERY"
         "term=&CIDs=all&startDate=05%2F03%2F2020"
         "&endDate=05%2F06%2F2020&dateRange=&dateSelector=")
    ]
    assert actual_files == expected
    # Spot check contents
    inpath = artifacts_path.join(expected[0])
    contents = file_contents(inpath)
    assert "Board of Commissioners" in contents
Example #10
from unittest.mock import patch

from civic_scraper.base.cache import Cache


def test_default_cache_dir(monkeypatch):
    target = "civic_scraper.utils.expanduser"
    with patch(target) as mock_method:
        mock_method.return_value = "/Users/you"
        cache = Cache()
        assert cache.path == "/Users/you/.civic-scraper"
Example #11
def test_custom_cache_path(tmpdir):
    from civic_scraper.base.cache import Cache

    cache = Cache(tmpdir)
    assert tmpdir == cache.path
Example #12
    def scrape(
        self,
        start_date,
        end_date,
        site_urls=[],
        cache=False,
        download=False,
    ):
        """Scrape file metadata and assets for a list of agency sites.

        For a given scraper, scrapes file artifact metadata and
        downloads file artifacts. Automatically generates a metadata
        CSV of file assets.

        If requested, caches intermediate file artifacts such as HTML
        from scraped pages and downloads file assets such as agendas and
        minutes (caching and downloading are optional and off by default).

        Args:

            start_date (str): Start date of scrape (YYYY-MM-DD)
            end_date (str): End date of scrape (YYYY-MM-DD)
            site_urls (list): List of site URLs
            cache (bool): Optionally cache intermediate file artifacts such as HTML
                (default: False)
            download (bool): Optionally download file assets such as agendas (default: False)

        Outputs:
            Metadata CSV listing file assets for given sites and params.

        Returns:
            AssetCollection instance
        """
        asset_collection = AssetCollection()
        cache_obj = Cache(self.cache_path)
        logger.info(
            f"Scraping {len(site_urls)} site(s) from {start_date} to {end_date}..."
        )
        for url in site_urls:
            SiteClass = self._get_site_class(url)
            kwargs = {}
            if cache:
                kwargs["cache"] = cache_obj
            site = SiteClass(url, **kwargs)
            logger.info(f"\t{url}")
            _collection = site.scrape(
                start_date,
                end_date,
                cache=cache,
            )
            asset_collection.extend(_collection)
        metadata_file = asset_collection.to_csv(cache_obj.metadata_files_path)
        logger.info(f"Wrote asset metadata CSV: {metadata_file}")
        if download:
            download_counter = 0
            logger.info(
                f"Downloading {len(asset_collection)} file asset(s) to {cache_obj.assets_path}..."
            )
            for asset in asset_collection:
                # TODO: Add error-handling here
                logger.info(f"\t{asset.url}")
                asset.download(cache_obj.assets_path)
                download_counter += 1
        return asset_collection
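
A minimal usage sketch for the scrape() method above. The snippet shows only the method body, so the runner class name, its import path, and the cache-path constructor argument used here are assumptions for illustration, not confirmed by the source:

# Hypothetical driver code; "Runner", its import path, and the cache_path
# constructor argument are assumptions, not shown in the snippet above.
from civic_scraper.runner import Runner  # assumed import path

runner = Runner("/tmp/civic-scraper")  # assumed: cache path for metadata, HTML, and assets
assets = runner.scrape(
    start_date="2020-05-03",
    end_date="2020-05-06",
    site_urls=["http://nc-nashcounty.civicplus.com/AgendaCenter"],
    cache=True,      # keep intermediate HTML artifacts
    download=True,   # download agendas/minutes into the cache's assets dir
)
print(f"{len(assets)} asset(s) scraped")  # AssetCollection supports len(), per the logging in scrape()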

def test_env_configured_default(monkeypatch):
    "CIVIC_SCRAPER_DIR env var should configure cache"
    from civic_scraper.base.cache import Cache

    monkeypatch.setenv("CIVIC_SCRAPER_DIR", "/tmp/civic-scraper")
    cache = Cache()
    assert cache.path == "/tmp/civic-scraper"
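
Taken together, test_default_cache_dir and test_env_configured_default pin down where the cache lives. A minimal sketch of the resolution order they imply (not the library's actual implementation), assuming CIVIC_SCRAPER_DIR takes precedence over the home-directory fallback:

import os
from os.path import expanduser, join


def default_cache_path():
    # Sketch only: prefer CIVIC_SCRAPER_DIR when it is set, otherwise fall
    # back to ~/.civic-scraper, matching the expectations in the two tests.
    env_dir = os.environ.get("CIVIC_SCRAPER_DIR")
    if env_dir:
        return env_dir
    return join(expanduser("~"), ".civic-scraper")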