def testGetAllUrls(self, mockget: Mock) -> None:
    """The GA scraper should find one report url per year page it crawls."""
    landing_html = fixtures.as_string("aggregate/regions/ga", "reports_landing.html")
    year_2015_html = fixtures.as_string("aggregate/regions/ga", "reports_year_2015.html")
    year_2019_html = fixtures.as_string("aggregate/regions/ga", "reports_year_2019.html")

    def _fake_get(url: str) -> Mock:
        # Serve the year page matching the requested node, falling back to
        # the landing page for every other url.
        page = Mock()
        if "node/5617" in url:
            page.text = year_2019_html
        elif "node/4036" in url:
            page.text = year_2015_html
        else:
            page.text = landing_html
        return page

    mockget.side_effect = _fake_get
    expected_urls = {
        "https://www.dca.ga.gov/sites/default/files/jail_report_jan19.pdf",
        "https://www.dca.ga.gov/sites/default/files/mar15_jail_report.pdf",
    }
    self.assertEqual(expected_urls, ga_aggregate_site_scraper.get_urls_to_download())
def testGetAllUrls(self, mockget: Mock) -> None:
    """The MA scraper fetches the landing page, then a year page, and should
    extract the single report url from them."""
    landing_response = Mock()
    landing_response.text = fixtures.as_string("aggregate/regions/ma", "reports_landing.html")
    year_response = Mock()
    year_response.text = fixtures.as_string("aggregate/regions/ma", "year_page.html")
    # Responses are consumed in call order: landing first, year page second.
    mockget.side_effect = [landing_response, year_response]

    expected_urls = {"https://www.mass.gov/doc/weekly-inmate-count-12252017/download"}
    self.assertEqual(expected_urls, ma_aggregate_site_scraper.get_urls_to_download())
def extract(self, html_filename, yaml_filename):
    """Run the HtmlDataExtractor over an HTML fixture using the given yaml
    mapping and return the populated result."""
    mapping_path = os.path.join(
        os.path.dirname(__file__), '../testdata/data_extractor/yaml', yaml_filename)
    page = html.fromstring(
        fixtures.as_string('testdata/data_extractor/html', html_filename))
    return HtmlDataExtractor(mapping_path).extract_and_populate_data(page)
def extract(self, html_filename: str, yaml_filename: str) -> IngestInfo:
    """Run the HtmlDataExtractor over an HTML fixture using the given yaml
    mapping and return the populated IngestInfo."""
    mapping_path = os.path.join(
        os.path.dirname(__file__), "../testdata/data_extractor/yaml", yaml_filename)
    page = html.fromstring(
        fixtures.as_string("testdata/data_extractor/html", html_filename))
    return HtmlDataExtractor(mapping_path).extract_and_populate_data(page)
def test_parse_file_headers_only_iterator_input(self):
    """A header-only CSV passed as a line iterator (rather than a string)
    should produce a non-None but empty IngestInfo."""
    extractor = _instantiate_extractor('header_cols_only_csv.yaml')
    csv_text = fixtures.as_string('testdata/data_extractor/csv',
                                  'header_cols_only.csv')
    ingest_info = extractor.extract_and_populate_data(iter(csv_text.splitlines()))
    self.assertIsNotNone(ingest_info)
    self.assertFalse(ingest_info)
def test_parse_file_empty(self):
    """Tests that we don't crash on a completely empty CSV and return an
    empty IngestInfoObject"""
    extractor = _instantiate_extractor('header_cols_only_csv.yaml')
    empty_csv = fixtures.as_string('testdata/data_extractor/csv', 'empty.csv')
    result = extractor.extract_and_populate_data(empty_csv)
    # Must be an (empty) object, never None.
    self.assertIsNotNone(result)
    self.assertFalse(result)
def test_parse_file_headers_only(self) -> None:
    """Tests that we don't crash on a CSV with only a header row and return
    an empty IngestInfoObject.
    """
    extractor = _instantiate_extractor("header_cols_only_csv.yaml")
    header_only_csv = fixtures.as_string("testdata/data_extractor/csv",
                                         "header_cols_only.csv")
    result = extractor.extract_and_populate_data(header_only_csv)
    # Must be an (empty) object, never None.
    self.assertIsNotNone(result)
    self.assertFalse(result)
def testGetAllUrls(self, mockget: Mock) -> None:
    """The HI scraper should extract the single report url from the page."""
    response = Mock()
    response.text = fixtures.as_string("aggregate/regions/hi", "report.html")
    mockget.return_value = response

    expected_urls = {
        "https://dps.hawaii.gov/wp-content/uploads/2019/01/"
        "Pop-Reports-EOM-2018-12-31.pdf"
    }
    self.assertEqual(expected_urls, hi_aggregate_site_scraper.get_urls_to_download())
def testGetAllUrls(self, mockget: Mock) -> None:
    """The FL scraper should extract both report urls from the page."""
    response = Mock()
    response.text = fixtures.as_string("aggregate/regions/fl", "reports.html")
    mockget.return_value = response

    expected_urls = {
        "http://www.dc.state.fl.us/pub/jails/2019/2019_06 June FCDF.pdf",
        "http://www.dc.state.fl.us/pub/jails/2016/jails-2016-03.pdf",
    }
    self.assertEqual(expected_urls, fl_aggregate_site_scraper.get_urls_to_download())
def testGetAllUrls(self, mockget: Mock) -> None:
    """The KY scraper should extract the single weekly-jail report url."""
    response = Mock()
    response.text = fixtures.as_string("aggregate/regions/ky", "report.html")
    mockget.return_value = response

    expected_urls = {
        "https://corrections.ky.gov/About/researchandstats/Documents/"
        "Weekly Jail/2018/08-18-18.pdf"
    }
    self.assertEqual(expected_urls, ky_aggregate_site_scraper.get_urls_to_download())
def testGetAllUrls(self, mockget: Mock) -> None:
    """The TN scraper should extract both the jail and the female-jail
    report urls from the page."""
    response = Mock()
    response.text = fixtures.as_string("aggregate/regions/tn", "reports.html")
    mockget.return_value = response

    expected_urls = {
        "https://www.tn.gov/content/dam/tn/correction/documents/"
        "JailAugust2018.pdf",
        "https://www.tn.gov/content/dam/tn/correction/documents/"
        "JailFemaleOctober2017.pdf",
    }
    self.assertEqual(expected_urls, tn_aggregate_site_scraper.get_urls_to_download())
def testGetAllUrls(self, mock_date: Mock, mockget: Mock) -> None:
    """The TX scraper should extract the dated abbreviated population
    reports plus the current-report url, relative to a pinned 'today'."""
    response = Mock()
    response.text = fixtures.as_string("aggregate/regions/tx", "reports.html")
    mockget.return_value = response
    # Pin the date so the set of expected reports is deterministic.
    mock_date.today.return_value = _TODAY

    expected_urls = {
        "https://www.tcjs.state.tx.us/docs/AbbreviatedPopReports/"
        "Abbreviated Pop Rpt June 2020.pdf",
        "https://www.tcjs.state.tx.us/docs/AbbreviatedPopReports/"
        "Abbreviated Pop Rpt Jan 2021.pdf",
        "https://www.tcjs.state.tx.us/wp-content/uploads/2021/04/AbbreRptCurrent.pdf",
    }
    self.assertEqual(expected_urls, tx_aggregate_site_scraper.get_urls_to_download())
# but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # ============================================================================= """Tests for ga_aggregate_ingest.py.""" from unittest import TestCase from mock import patch, Mock import requests from recidiviz.ingest.aggregate.regions.ga import ga_aggregate_site_scraper from recidiviz.tests.ingest import fixtures REPORTS_LANDING_HTML = fixtures.as_string('aggregate/regions/ga', 'reports_landing.html') REPORTS_YEAR_2015 = fixtures.as_string('aggregate/regions/ga', 'reports_year_2015.html') REPORTS_YEAR_2019 = fixtures.as_string('aggregate/regions/ga', 'reports_year_2019.html') class TestGaAggregateSiteScraper(TestCase): """Test that ga_aggregate_site_scraper correctly scrapes urls.""" @patch.object(requests, 'get') def testGetAllUrls(self, mockget): def _MockGet(url): response = Mock() if 'node/5617' in url: response.text = REPORTS_YEAR_2019 elif 'node/4036' in url:
# but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # ============================================================================= """Tests for tn_aggregate_ingest.py.""" from unittest import TestCase from mock import patch, Mock import requests from recidiviz.ingest.aggregate.regions.tn import tn_aggregate_site_scraper from recidiviz.tests.ingest import fixtures REPORTS_HTML = fixtures.as_string("aggregate/regions/tn", "reports.html") class TestTnAggregateSiteScraper(TestCase): """Test that tn_aggregate_site_scraper correctly scrapes urls.""" @patch.object(requests, "get") def testGetAllUrls(self, mockget): mockresponse = Mock() mockget.return_value = mockresponse mockresponse.text = REPORTS_HTML url1 = ( "https://www.tn.gov/content/dam/tn/correction/documents/" "JailAugust2018.pdf" ) url2 = (
from mock import patch, Mock from recidiviz.common.ingest_metadata import IngestMetadata from recidiviz.ingest.direct.regions.us_tx_brazos.us_tx_brazos_controller \ import UsTxBrazosController from recidiviz.ingest.models.ingest_info import Arrest, Bond, Booking, Charge, \ Hold, Person, IngestInfo from recidiviz.tests.ingest import fixtures from recidiviz.tests.utils.individual_ingest_test import IndividualIngestTest from recidiviz.tests.ingest.direct.direct_ingest_util import \ build_controller_for_tests, ingest_args_for_fixture_file from recidiviz.utils import regions FIXTURE_PATH_PREFIX = 'direct/regions/us_tx_brazos' _ROSTER_PATH_CONTENTS = fixtures.as_string(FIXTURE_PATH_PREFIX, 'daily_data.csv') _FAKE_START_TIME = datetime.datetime(year=2019, month=1, day=2) @patch('recidiviz.utils.metadata.project_id', Mock(return_value='recidiviz-staging')) class UsTxBrazosControllerTest(IndividualIngestTest, TestCase): """Test Brazos direct ingest. """ def testParse(self): controller = build_controller_for_tests(UsTxBrazosController, FIXTURE_PATH_PREFIX, run_async=False) args = ingest_args_for_fixture_file(controller, 'daily_data.csv')
import tempfile from flask import Flask from mock import patch, Mock, call import requests import gcsfs import pytz from recidiviz.cloud_functions.cloud_function_utils import GCSFS_NO_CACHING from recidiviz.ingest.aggregate import scrape_aggregate_reports from recidiviz.ingest.aggregate.regions.ca import ca_aggregate_site_scraper from recidiviz.ingest.aggregate.regions.ny import ny_aggregate_site_scraper from recidiviz.ingest.aggregate.regions.tx import tx_aggregate_site_scraper from recidiviz.tests.ingest import fixtures from recidiviz.utils import metadata REPORTS_HTML = fixtures.as_string('aggregate/regions/tx', 'reports.html') APP_ID = "recidiviz-scraper-aggregate-report-test" app = Flask(__name__) app.register_blueprint( scrape_aggregate_reports.scrape_aggregate_reports_blueprint) app.config['TESTING'] = True SERVER_MODIFIED_TIME = datetime.datetime( year=2019, month=1, day=1, tzinfo=pytz.UTC) EXISTING_TEST_URL = 'http://test.com/url_test/Existing.pdf' EXISTING_TEST_URL2 = 'http://test.com/url_test/Existing2.pdf' EXISTING_TEST_URL_CA = 'http://test.com' CA_POST_DATA = {'year': 1996, 'testing': '1'} NONEXISTING_TEST_URL = 'url_test/nonexisting.pdf'
def _get_content_as_csv(content_filename: str) -> csv.DictReader:
    """Load a CSV fixture file and return it wrapped in a DictReader."""
    fixture_text = fixtures.as_string('testdata/data_extractor/csv',
                                      content_filename)
    lines = fixture_text.splitlines()
    return csv.DictReader(lines)