def test_append_to_csv(tmp_path):
    """Test the behavior of AssetCollection.to_csv() when appending=True.

    Scrapes a known site/date range, writes the collection with
    appending=True, and checks the resulting file against a known-good
    MD5 fingerprint.
    """
    # Create the temporary directory and temporary file
    directory = tmp_path / "sub"
    directory.mkdir()
    temp_csv = directory / "temp_csv_appending.csv"
    # Make AssetCollection
    site_url = "http://ks-liberal.civicplus.com/AgendaCenter"
    end_date = "2020-08-26"
    start_date = "2020-08-24"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(end_date=end_date, start_date=start_date)
    # Call to_csv
    asset_collection.to_csv(target_path=temp_csv, appending=True)
    # Check that the content of the csv is what we'd expect
    hash_md5 = hashlib.md5()
    with open(temp_csv, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b""):
            hash_md5.update(chunk)
    assert hash_md5.digest() == b'\xf9\x84oT\x00[8\xb9wG\x15\x90\xae\x7f\x9e\xb3'
    # Check that the csv was actually created. (The original asserted
    # os.path.exists(tmp_path), which is vacuous — pytest guarantees
    # tmp_path exists — so check the file itself.)
    assert os.path.exists(temp_csv)
def test_download_asset_list(tmp_path):
    """
    Test behavior of CivicPlus.download with optional asset_list parameter.

    Downloads only 'minutes' assets for a known site/date range and
    compares the first downloaded file against a known-good MD5 digest.
    """
    # Make AssetCollection
    site_url = "https://ca-napacounty.civicplus.com/AgendaCenter"
    start_date = "2020-05-31"
    end_date = "2020-06-02"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(start_date=start_date, end_date=end_date)
    # Set parameters
    directory = tmp_path / "sub"
    directory.mkdir()
    asset_list = ['minutes']
    # Download the assets
    asset_collection.download(target_dir=directory, asset_list=asset_list)
    # Check that files we expect to download get downloaded
    assert len(asset_collection) == 2
    expected_asset_hash = b';eJ\xfe\x8ef\xd7\x8d\x8f\xd5\x16h9c\xfa\xac'
    actual_asset_hashes = []
    for asset in os.listdir(directory):
        # Skip macOS Finder metadata files that may appear in the dir
        if asset != ".DS_Store":
            # pathlib join instead of '{}/{}'.format(...)
            full_path = directory / asset
            hash_md5 = hashlib.md5()
            with open(full_path, 'rb') as file:
                for chunk in iter(lambda: file.read(4096), b""):
                    hash_md5.update(chunk)
            actual_asset_hashes.append(hash_md5.digest())
    actual_asset_hash = actual_asset_hashes[0]
    assert expected_asset_hash == actual_asset_hash
def test_download_file_size(tmp_path):
    """
    Test behavior of CivicPlus.download with optional file_size parameter.

    Downloads assets capped at 0.05 (units defined by the download API)
    and compares the first downloaded file against a known-good MD5 digest.
    """
    # Make AssetCollection
    site_url = "https://ca-napacounty.civicplus.com/AgendaCenter"
    start_date = "2020-08-24"
    end_date = "2020-09-03"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(start_date=start_date, end_date=end_date)
    # Set parameters
    directory = tmp_path / "sub"
    directory.mkdir()
    file_size = 0.05
    # Download the assets
    asset_collection.download(target_dir=directory, file_size=file_size)
    # Check that files we expect to download get downloaded
    assert len(asset_collection) == 6
    expected_asset_hash = b'\xe9\x7f0\x1a\xd3s\xeb\x98\x0c\x94gb\xae\xd4\xb5\x91'
    actual_asset_hashes = []
    for asset in os.listdir(directory):
        # Skip macOS Finder metadata files that may appear in the dir
        if asset != ".DS_Store":
            # pathlib join instead of '{}/{}'.format(...)
            full_path = directory / asset
            hash_md5 = hashlib.md5()
            with open(full_path, 'rb') as file:
                for chunk in iter(lambda: file.read(4096), b""):
                    hash_md5.update(chunk)
            actual_asset_hashes.append(hash_md5.digest())
    actual_asset_hash = actual_asset_hashes[0]
    assert expected_asset_hash == actual_asset_hash
def test_download_defaults(tmp_path):
    """
    Test default behavior of CivicPlus.download.

    Downloads all assets for a known site/date range and hashes each
    downloaded file.
    """
    # Make AssetCollection
    site_url = "https://ca-napacounty.civicplus.com/AgendaCenter"
    start_date = "2020-08-24"
    end_date = "2020-09-03"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(start_date=start_date, end_date=end_date)
    # Set parameters
    directory = tmp_path / "sub"
    directory.mkdir()
    # Download the assets
    asset_collection.download(target_dir=directory)
    # Check that files we expect to download get downloaded
    assert len(asset_collection) == 6
    # TODO: expected_asset_hashes is empty, so the membership loop below
    # never asserts anything — populate it with known-good digests to make
    # this check meaningful.
    expected_asset_hashes = []
    actual_asset_hashes = []
    for asset in os.listdir(directory):
        full_path = directory / asset
        hash_md5 = hashlib.md5()
        with open(full_path, 'rb') as file:
            for chunk in iter(lambda: file.read(4096), b""):
                hash_md5.update(chunk)
        actual_asset_hashes.append(hash_md5.digest())
    # Renamed from 'hash', which shadowed the builtin; dropped the
    # leftover debug print of the hash list.
    for expected_hash in expected_asset_hashes:
        assert expected_hash in actual_asset_hashes
def test_scrape_parameters_2():
    """
    Test behavior of CivicPlus.scrape when there are no responsive assets
    to scrape.
    """
    url = "http://nm-lascruces.civicplus.com/AgendaCenter"
    site = CivicPlusSite(url)
    collection = site.scrape(start_date="2020-08-29", end_date="2020-08-29")
    # A single-day window with no published meetings should yield nothing
    assert len(collection.assets) == 0
def test_asset_place_state():
    """
    Test that on sites that redirect, we get the correct state and place.
    """
    url = "http://wi-columbus.civicplus.com/AgendaCenter"
    site = CivicPlusSite(url)
    scraped = site.scrape(start_date="2020-10-01", end_date="2020-10-09").assets
    # Spot-check the third asset's location metadata
    third = scraped[2]
    assert third.state_or_province == "wi"
    assert third.place == "columbus"
def test_committee_match2():
    """
    Another test that the scraper correctly matches documents to the
    committee that published them.
    """
    url = "http://wi-columbus.civicplus.com/AgendaCenter"
    site = CivicPlusSite(url)
    scraped = site.scrape(start_date="2020-10-01", end_date="2020-10-09").assets
    # Spot-check the second asset's name and committee attribution
    second = scraped[1]
    assert second.asset_name == "October 06, 2020, Council agenda - 10/06/20 - reg - Cancelled. Agenda"
    assert second.committee_name == "City Council 2020"
def test_committee_match():
    """
    Test that the scraper correctly matches documents to the committee
    that published them.
    """
    url = "http://wa-bremerton.civicplus.com/AgendaCenter"
    site = CivicPlusSite(url)
    scraped = site.scrape(start_date="2020-09-20", end_date="2020-10-02").assets
    # Spot-check the first asset's name and committee attribution
    first = scraped[0]
    assert first.asset_name == "September 22, 2020, Parks & Recreation Commission Regular Meeting Documents (PDF). Agenda"
    assert first.committee_name == "Parks and Recreation Commission 2020"
def test_scrape_parameters_1():
    """
    Test behavior of CivicPlus.scrape with lots of parameters.

    Exercises file_size and asset_list filters together and checks asset
    count, attribute values, date range, content lengths, and asset types.
    """
    site_url = "http://fl-zephyrhills.civicplus.com/AgendaCenter"
    start_date = "2020-08-01"
    end_date = "2020-08-31"
    file_size = 20
    asset_list = ['agenda_packet']
    expected_meeting_dates = [
        datetime.date(2020, 8, day) for day in range(1, 32)
    ]
    cp = CivicPlusSite(site_url)
    assets = cp.scrape(start_date=start_date,
                       end_date=end_date,
                       file_size=file_size,
                       asset_list=asset_list).assets
    # Check asset count
    assert len(assets) == 15
    # Spot check asset attributes.
    # Start simple, and add more attribute
    # checks as needed to cover expected
    # edge cases (or better yet, put those
    # checks in a separate test).
    first = assets[0]
    assert first.url == 'http://fl-zephyrhills.civicplus.com/AgendaCenter/ViewFile/Agenda/_08172020-360?html=true'
    assert first.asset_name == 'August 17, 2020, Airport Advisory Regular Meeting. HTML'
    assert first.committee_name == 'Airport Authority 2020'
    assert first.place == 'zephyrhills'
    assert first.state_or_province == 'fl'
    assert first.asset_type == 'agenda'
    assert first.meeting_date == datetime.date(2020, 8, 17)
    # Identity check ('is None'), not '== None' (PEP 8)
    assert first.meeting_time is None
    assert first.meeting_id == 'civicplus_fl_zephyrhills_08172020-360'
    assert first.scraped_by == 'civicplus.py_1.0.0'
    assert first.content_type == 'text/html'
    assert first.content_length == '2487'
    # Check that assets are in the correct date range
    for asset in assets:
        assert asset.meeting_date in expected_meeting_dates
    # Check that assets have the correct size
    expected_content_lengths = [
        '2487', '54515', '54517', '3181', '1266122', '1266093', '4889',
        '1301997', '1301956', '4117', '1303606', '1303584', '3052',
        '144582', '144610'
    ]
    actual_content_lengths = [asset.content_length for asset in assets]
    assert expected_content_lengths == actual_content_lengths
    # Check range of asset types
    for asset in assets:
        assert asset.asset_type == 'agenda'
def test_to_csv_defaults(tmp_path):
    """Test the default behavior of AssetCollection.to_csv().

    Scrapes a known site/date range, writes the collection to a csv with
    default arguments, and checks the file against a known-good MD5
    fingerprint.
    """
    # (Removed a large block of commented-out mocker scaffolding that was
    # never used — dead code.)
    # Create the temporary directory and temporary file
    directory = tmp_path / "sub"
    directory.mkdir()
    temp_csv = directory / "temp_csv_default.csv"
    # Make AssetCollection
    site_url = "http://ks-liberal.civicplus.com/AgendaCenter"
    end_date = "2020-08-26"
    start_date = "2020-08-24"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(end_date=end_date, start_date=start_date)
    # Call to_csv
    asset_collection.to_csv(target_path=temp_csv)
    # Check that the content of the csv is what we'd expect
    hash_md5 = hashlib.md5()
    with open(temp_csv, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b""):
            hash_md5.update(chunk)
    assert hash_md5.digest() == b'\xf9\x84oT\x00[8\xb9wG\x15\x90\xae\x7f\x9e\xb3'
    # Check that exactly one entry (the sub directory) was created under
    # tmp_path, and that the csv file itself exists. (The original asserted
    # os.path.exists(tmp_path), which is vacuous — pytest guarantees
    # tmp_path exists.)
    assert len(list(tmp_path.iterdir())) == 1
    assert os.path.exists(temp_csv)
def test_scrape_defaults():
    """
    Test default behavior of CivicPlus.scrape.

    Checks asset count, first-asset attributes, date-range membership,
    and the sequence of asset types for a known site/date range.
    """
    site_url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    start_date = "2020-05-03"
    end_date = "2020-05-06"
    expected_meeting_dates = [
        datetime.date(2020, 5, day) for day in range(3, 7)
    ]
    cp = CivicPlusSite(site_url)
    assets = cp.scrape(start_date, end_date).assets
    # Check asset count
    assert len(assets) == 4
    # Spot check asset attributes.
    # Start simple, and add more attribute
    # checks as needed to cover expected
    # edge cases (or better yet, put those
    # checks in a separate test).
    first = assets[0]
    assert first.url == 'http://nc-nashcounty.civicplus.com/AgendaCenter/ViewFile/Agenda/_05052020-382'
    assert first.committee_name == 'Board of Commissioners 2020'
    assert first.asset_name == 'May 05, 2020, May 5, 2020 Recessed Meeting/Budget Work Session Agenda. Agenda'
    assert first.place == 'nashcounty'
    assert first.state_or_province == 'nc'
    assert first.asset_type == 'agenda'
    assert first.meeting_date == datetime.date(2020, 5, 5)
    # Identity check ('is None'), not '== None' (PEP 8)
    assert first.meeting_time is None
    assert first.meeting_id == 'civicplus_nc_nashcounty_05052020-382'
    assert first.scraped_by == 'civicplus.py_1.0.0'
    assert first.content_type == 'application/pdf'
    assert first.content_length == '19536'
    # Check that assets are in the correct date range
    for asset in assets:
        assert asset.meeting_date in expected_meeting_dates
    # Check range of asset types
    expected_asset_types = ['agenda', 'minutes', 'agenda', 'minutes']
    actual_asset_types = [asset.asset_type for asset in assets]
    assert expected_asset_types == actual_asset_types
def test_append_to_csv_existing(tmp_path):
    """Test to_csv() behavior when writing a second collection.

    NOTE(review): the original docstring says this covers appending=True
    onto an existing file, but neither to_csv call below passes
    appending=True, and the two collections are written to different
    files — TODO confirm intent and either pass appending=True to the
    second call or rename the test.
    """
    # Create the first temporary directory and temporary file
    directory = tmp_path / "sub"
    directory.mkdir()
    temp_csv_1 = directory / "temp.csv"
    # Create the second temporary file
    temp_csv_2 = directory / "temp_csv.csv"
    # Make the first AssetCollection
    site_url = "http://ks-liberal.civicplus.com/AgendaCenter"
    start_date = "2020-07-01"
    end_date = "2020-07-30"
    cp = CivicPlusSite(site_url)
    asset_collection_1 = cp.scrape(start_date=start_date, end_date=end_date)
    # Make the second AssetCollection
    site_url = "http://ks-liberal.civicplus.com/AgendaCenter"
    start_date = "2020-08-01"
    end_date = "2020-09-03"
    cp = CivicPlusSite(site_url)
    asset_collection_2 = cp.scrape(start_date=start_date, end_date=end_date)
    # Call to_csv
    asset_collection_1.to_csv(target_path=temp_csv_1)
    asset_collection_2.to_csv(target_path=temp_csv_2)
    # Check that the content of the csv is what we'd expect
    hash_md5 = hashlib.md5()
    with open(temp_csv_2, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b""):
            hash_md5.update(chunk)
    # (Removed a stray, no-op 'hash_md5.digest()' expression statement
    # that followed this assert.)
    assert hash_md5.digest() == b'c\x14SWL\x85\x7f@\x9a\xd4\xf8\xdcL\x8fK\xa2'
    # Check that the file exists
    assert os.path.exists(temp_csv_2)