def test_append_to_csv(tmp_path):
    """Test the behavior of AssetCollection.to_csv() when appending=True.

    Scrapes a known site/date range, writes the collection with
    appending=True, and checks the resulting file against a known-good
    MD5 fingerprint.
    """
    # Create the temporary directory and temporary file
    directory = tmp_path / "sub"
    directory.mkdir()
    temp_csv = directory / "temp_csv_appending.csv"
    # Make AssetCollection
    site_url = "http://ks-liberal.civicplus.com/AgendaCenter"
    end_date = "2020-08-26"
    start_date = "2020-08-24"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(end_date=end_date, start_date=start_date)
    # Call to_csv
    asset_collection.to_csv(target_path=temp_csv, appending=True)
    # Check that the content of the csv is what we'd expect
    hash_md5 = hashlib.md5()
    with open(temp_csv, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b""):
            hash_md5.update(chunk)
    assert hash_md5.digest() == b'\xf9\x84oT\x00[8\xb9wG\x15\x90\xae\x7f\x9e\xb3'
    # Check that the csv was actually created. (The original asserted
    # os.path.exists(tmp_path), which is vacuous — pytest guarantees
    # tmp_path exists — so check the file itself.)
    assert os.path.exists(temp_csv)
def test_download_asset_list(tmp_path):
    """
    Test behavior of CivicPlus.download with optional asset_list parameter.

    Downloads only 'minutes' assets for a known site/date range and
    compares the first downloaded file against a known-good MD5 digest.
    """
    # Make AssetCollection
    site_url = "https://ca-napacounty.civicplus.com/AgendaCenter"
    start_date = "2020-05-31"
    end_date = "2020-06-02"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(start_date=start_date, end_date=end_date)
    # Set parameters
    directory = tmp_path / "sub"
    directory.mkdir()
    asset_list = ['minutes']
    # Download the assets
    asset_collection.download(target_dir=directory, asset_list=asset_list)
    # Check that files we expect to download get downloaded
    assert len(asset_collection) == 2
    expected_asset_hash = b';eJ\xfe\x8ef\xd7\x8d\x8f\xd5\x16h9c\xfa\xac'
    actual_asset_hashes = []
    for asset in os.listdir(directory):
        # Skip macOS Finder metadata files that may appear in the dir
        if asset != ".DS_Store":
            # pathlib join instead of '{}/{}'.format(...)
            full_path = directory / asset
            hash_md5 = hashlib.md5()
            with open(full_path, 'rb') as file:
                for chunk in iter(lambda: file.read(4096), b""):
                    hash_md5.update(chunk)
            actual_asset_hashes.append(hash_md5.digest())
    actual_asset_hash = actual_asset_hashes[0]
    assert expected_asset_hash == actual_asset_hash
def test_download_file_size(tmp_path):
    """
    Test behavior of CivicPlus.download with optional file_size parameter.

    Downloads assets capped at 0.05 (units defined by the download API)
    and compares the first downloaded file against a known-good MD5 digest.
    """
    # Make AssetCollection
    site_url = "https://ca-napacounty.civicplus.com/AgendaCenter"
    start_date = "2020-08-24"
    end_date = "2020-09-03"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(start_date=start_date, end_date=end_date)
    # Set parameters
    directory = tmp_path / "sub"
    directory.mkdir()
    file_size = 0.05
    # Download the assets
    asset_collection.download(target_dir=directory, file_size=file_size)
    # Check that files we expect to download get downloaded
    assert len(asset_collection) == 6
    expected_asset_hash = b'\xe9\x7f0\x1a\xd3s\xeb\x98\x0c\x94gb\xae\xd4\xb5\x91'
    actual_asset_hashes = []
    for asset in os.listdir(directory):
        # Skip macOS Finder metadata files that may appear in the dir
        if asset != ".DS_Store":
            # pathlib join instead of '{}/{}'.format(...)
            full_path = directory / asset
            hash_md5 = hashlib.md5()
            with open(full_path, 'rb') as file:
                for chunk in iter(lambda: file.read(4096), b""):
                    hash_md5.update(chunk)
            actual_asset_hashes.append(hash_md5.digest())
    actual_asset_hash = actual_asset_hashes[0]
    assert expected_asset_hash == actual_asset_hash
def test_download_defaults(tmp_path):
    """
    Test default behavior of CivicPlus.download.

    Downloads all assets for a known site/date range and hashes each
    downloaded file.
    """
    # Make AssetCollection
    site_url = "https://ca-napacounty.civicplus.com/AgendaCenter"
    start_date = "2020-08-24"
    end_date = "2020-09-03"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(start_date=start_date, end_date=end_date)
    # Set parameters
    directory = tmp_path / "sub"
    directory.mkdir()
    # Download the assets
    asset_collection.download(target_dir=directory)
    # Check that files we expect to download get downloaded
    assert len(asset_collection) == 6
    # TODO: expected_asset_hashes is empty, so the membership loop below
    # never asserts anything — populate it with known-good digests to make
    # this check meaningful.
    expected_asset_hashes = []
    actual_asset_hashes = []
    for asset in os.listdir(directory):
        full_path = directory / asset
        hash_md5 = hashlib.md5()
        with open(full_path, 'rb') as file:
            for chunk in iter(lambda: file.read(4096), b""):
                hash_md5.update(chunk)
        actual_asset_hashes.append(hash_md5.digest())
    # Renamed from 'hash', which shadowed the builtin; dropped the
    # leftover debug print of the hash list.
    for expected_hash in expected_asset_hashes:
        assert expected_hash in actual_asset_hashes
def test_scrape_parameters_2():
    """
    Test behavior of CivicPlus.scrape when there are no responsive assets
    to scrape.
    """
    url = "http://nm-lascruces.civicplus.com/AgendaCenter"
    site = CivicPlusSite(url)
    collection = site.scrape(start_date="2020-08-29", end_date="2020-08-29")
    # A single-day window with no published meetings should yield nothing
    assert len(collection.assets) == 0
def test_asset_place_state():
    """
    Test that on sites that redirect, we get the correct state and place.
    """
    url = "http://wi-columbus.civicplus.com/AgendaCenter"
    site = CivicPlusSite(url)
    scraped = site.scrape(start_date="2020-10-01", end_date="2020-10-09").assets
    # Spot-check the third asset's location metadata
    third = scraped[2]
    assert third.state_or_province == "wi"
    assert third.place == "columbus"
def test_committee_match2():
    """
    Another test that the scraper correctly matches documents to the
    committee that published them.
    """
    url = "http://wi-columbus.civicplus.com/AgendaCenter"
    site = CivicPlusSite(url)
    scraped = site.scrape(start_date="2020-10-01", end_date="2020-10-09").assets
    # Spot-check the second asset's name and committee attribution
    second = scraped[1]
    assert second.asset_name == "October 06, 2020, Council agenda - 10/06/20 - reg - Cancelled. Agenda"
    assert second.committee_name == "City Council 2020"
def test_committee_match():
    """
    Test that the scraper correctly matches documents to the committee
    that published them.
    """
    url = "http://wa-bremerton.civicplus.com/AgendaCenter"
    site = CivicPlusSite(url)
    scraped = site.scrape(start_date="2020-09-20", end_date="2020-10-02").assets
    # Spot-check the first asset's name and committee attribution
    first = scraped[0]
    assert first.asset_name == "September 22, 2020, Parks & Recreation Commission Regular Meeting Documents (PDF). Agenda"
    assert first.committee_name == "Parks and Recreation Commission 2020"
def test_scrape_parameters_1():
    """
    Test behavior of CivicPlus.scrape with lots of parameters.

    Exercises file_size and asset_list filters together and checks asset
    count, attribute values, date range, content lengths, and asset types.
    """
    site_url = "http://fl-zephyrhills.civicplus.com/AgendaCenter"
    start_date = "2020-08-01"
    end_date = "2020-08-31"
    file_size = 20
    asset_list = ['agenda_packet']
    expected_meeting_dates = [
        datetime.date(2020, 8, day) for day in range(1, 32)
    ]
    cp = CivicPlusSite(site_url)
    assets = cp.scrape(start_date=start_date,
                       end_date=end_date,
                       file_size=file_size,
                       asset_list=asset_list).assets
    # Check asset count
    assert len(assets) == 15
    # Spot check asset attributes.
    # Start simple, and add more attribute
    # checks as needed to cover expected
    # edge cases (or better yet, put those
    # checks in a separate test).
    first = assets[0]
    assert first.url == 'http://fl-zephyrhills.civicplus.com/AgendaCenter/ViewFile/Agenda/_08172020-360?html=true'
    assert first.asset_name == 'August 17, 2020, Airport Advisory Regular Meeting. HTML'
    assert first.committee_name == 'Airport Authority 2020'
    assert first.place == 'zephyrhills'
    assert first.state_or_province == 'fl'
    assert first.asset_type == 'agenda'
    assert first.meeting_date == datetime.date(2020, 8, 17)
    # Identity check ('is None'), not '== None' (PEP 8)
    assert first.meeting_time is None
    assert first.meeting_id == 'civicplus_fl_zephyrhills_08172020-360'
    assert first.scraped_by == 'civicplus.py_1.0.0'
    assert first.content_type == 'text/html'
    assert first.content_length == '2487'
    # Check that assets are in the correct date range
    for asset in assets:
        assert asset.meeting_date in expected_meeting_dates
    # Check that assets have the correct size
    expected_content_lengths = [
        '2487', '54515', '54517', '3181', '1266122', '1266093', '4889',
        '1301997', '1301956', '4117', '1303606', '1303584', '3052',
        '144582', '144610'
    ]
    actual_content_lengths = [asset.content_length for asset in assets]
    assert expected_content_lengths == actual_content_lengths
    # Check range of asset types
    for asset in assets:
        assert asset.asset_type == 'agenda'
def test_to_csv_defaults(tmp_path):
    """Test the default behavior of AssetCollection.to_csv().

    Scrapes a known site/date range, writes the collection to a csv with
    default arguments, and checks the file against a known-good MD5
    fingerprint.
    """
    # (Removed a large block of commented-out mocker scaffolding that was
    # never used — dead code.)
    # Create the temporary directory and temporary file
    directory = tmp_path / "sub"
    directory.mkdir()
    temp_csv = directory / "temp_csv_default.csv"
    # Make AssetCollection
    site_url = "http://ks-liberal.civicplus.com/AgendaCenter"
    end_date = "2020-08-26"
    start_date = "2020-08-24"
    cp = CivicPlusSite(site_url)
    asset_collection = cp.scrape(end_date=end_date, start_date=start_date)
    # Call to_csv
    asset_collection.to_csv(target_path=temp_csv)
    # Check that the content of the csv is what we'd expect
    hash_md5 = hashlib.md5()
    with open(temp_csv, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b""):
            hash_md5.update(chunk)
    assert hash_md5.digest() == b'\xf9\x84oT\x00[8\xb9wG\x15\x90\xae\x7f\x9e\xb3'
    # Check that exactly one entry (the sub directory) was created under
    # tmp_path, and that the csv file itself exists. (The original asserted
    # os.path.exists(tmp_path), which is vacuous — pytest guarantees
    # tmp_path exists.)
    assert len(list(tmp_path.iterdir())) == 1
    assert os.path.exists(temp_csv)
def test_scrape_defaults():
    """
    Test default behavior of CivicPlus.scrape.

    Checks asset count, first-asset attributes, date-range membership,
    and the sequence of asset types for a known site/date range.
    """
    site_url = "http://nc-nashcounty.civicplus.com/AgendaCenter"
    start_date = "2020-05-03"
    end_date = "2020-05-06"
    expected_meeting_dates = [
        datetime.date(2020, 5, day) for day in range(3, 7)
    ]
    cp = CivicPlusSite(site_url)
    assets = cp.scrape(start_date, end_date).assets
    # Check asset count
    assert len(assets) == 4
    # Spot check asset attributes.
    # Start simple, and add more attribute
    # checks as needed to cover expected
    # edge cases (or better yet, put those
    # checks in a separate test).
    first = assets[0]
    assert first.url == 'http://nc-nashcounty.civicplus.com/AgendaCenter/ViewFile/Agenda/_05052020-382'
    assert first.committee_name == 'Board of Commissioners 2020'
    assert first.asset_name == 'May 05, 2020, May 5, 2020 Recessed Meeting/Budget Work Session Agenda. Agenda'
    assert first.place == 'nashcounty'
    assert first.state_or_province == 'nc'
    assert first.asset_type == 'agenda'
    assert first.meeting_date == datetime.date(2020, 5, 5)
    # Identity check ('is None'), not '== None' (PEP 8)
    assert first.meeting_time is None
    assert first.meeting_id == 'civicplus_nc_nashcounty_05052020-382'
    assert first.scraped_by == 'civicplus.py_1.0.0'
    assert first.content_type == 'application/pdf'
    assert first.content_length == '19536'
    # Check that assets are in the correct date range
    for asset in assets:
        assert asset.meeting_date in expected_meeting_dates
    # Check range of asset types
    expected_asset_types = ['agenda', 'minutes', 'agenda', 'minutes']
    actual_asset_types = [asset.asset_type for asset in assets]
    assert expected_asset_types == actual_asset_types
def test_append_to_csv_existing(tmp_path):
    """Test to_csv() behavior when writing a second collection.

    NOTE(review): the original docstring says this covers appending=True
    onto an existing file, but neither to_csv call below passes
    appending=True, and the two collections are written to different
    files — TODO confirm intent and either pass appending=True to the
    second call or rename the test.
    """
    # Create the first temporary directory and temporary file
    directory = tmp_path / "sub"
    directory.mkdir()
    temp_csv_1 = directory / "temp.csv"
    # Create the second temporary file
    temp_csv_2 = directory / "temp_csv.csv"
    # Make the first AssetCollection
    site_url = "http://ks-liberal.civicplus.com/AgendaCenter"
    start_date = "2020-07-01"
    end_date = "2020-07-30"
    cp = CivicPlusSite(site_url)
    asset_collection_1 = cp.scrape(start_date=start_date, end_date=end_date)
    # Make the second AssetCollection
    site_url = "http://ks-liberal.civicplus.com/AgendaCenter"
    start_date = "2020-08-01"
    end_date = "2020-09-03"
    cp = CivicPlusSite(site_url)
    asset_collection_2 = cp.scrape(start_date=start_date, end_date=end_date)
    # Call to_csv
    asset_collection_1.to_csv(target_path=temp_csv_1)
    asset_collection_2.to_csv(target_path=temp_csv_2)
    # Check that the content of the csv is what we'd expect
    hash_md5 = hashlib.md5()
    with open(temp_csv_2, 'rb') as file:
        for chunk in iter(lambda: file.read(4096), b""):
            hash_md5.update(chunk)
    # (Removed a stray, no-op 'hash_md5.digest()' expression statement
    # that followed this assert.)
    assert hash_md5.digest() == b'c\x14SWL\x85\x7f@\x9a\xd4\xf8\xdcL\x8fK\xa2'
    # Check that the file exists
    assert os.path.exists(temp_csv_2)