def test_scrape_data_and_more_no_persist_second_time_persist(
        self, mock_get_more, mock_fetch, mock_populate, mock_write):
    """First scrape defers persistence and forwards the ingest_info on the
    queued task; re-running that queued task with persist=True writes once."""
    start_time = datetime.datetime.now()

    # The follow-up task produced by get_more_tasks is a plain SCRAPE_DATA task.
    populate_task = Task.evolve(
        TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
    mock_fetch.return_value = (TEST_HTML, {})
    mock_get_more.return_value = [populate_task]
    # First pass: populate_data asks NOT to persist yet.
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=False,
    )

    initial_task = Task.evolve(
        TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
    initial_request = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=initial_task,
        scraper_start_time=start_time,
    )

    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(initial_request)

    # Should send the ii since we chose not to persist.
    expected_tasks = [
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=populate_task,
            scraper_start_time=start_time,
            ingest_info=self.ii,
        )
    ]
    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 0)
    mock_get_more.assert_called_once_with(TEST_HTML, initial_task)
    self.assertCountEqual(expected_tasks, scraper.tasks)

    # Second pass: now populate_data asks to persist, so a write happens.
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )
    second_request = scraper.tasks[0]
    scraper._generic_scrape(second_request)

    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 2)
    self.assertEqual(mock_write.call_count, 1)

    expected_metadata = IngestMetadata(
        scraper.region.region_code,
        scraper.region.jurisdiction_id,
        start_time,
        scraper.get_enum_overrides(),
    )
    expected_proto = convert_ingest_info_to_proto(self.ii)
    mock_write.assert_called_once_with(expected_proto, expected_metadata)
def test_scrape_data_no_more_tasks_batch(
    self,
    mock_get_more: Mock,
    mock_fetch: Mock,
    mock_populate: Mock,
    mock_write: Mock,
    mock_batch_write: Mock,
) -> None:
    """With BATCH_WRITES left enabled, a terminal SCRAPE_DATA task routes its
    ingest_info through batch persistence rather than a direct write."""
    start_time = datetime.datetime.now()
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )

    data_task = Task.evolve(TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA)
    scraper = FakeScraper("test")
    scraper._generic_scrape(
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=data_task,
            scraper_start_time=start_time,
        )
    )

    self.assertEqual(mock_get_more.call_count, 0)
    self.assertEqual(mock_populate.call_count, 1)
    # No direct DB write; everything goes to the batch queue.
    self.assertEqual(mock_write.call_count, 0)
    mock_batch_write.assert_called_once_with(
        ingest_info=self.ii,
        task=data_task,
        scrape_key=ScrapeKey("test", constants.ScrapeType.BACKGROUND),
    )
    self.assertEqual(len(scraper.tasks), 0)
def test_get_more_and_updates_cookies(
    self, mock_get_more: Mock, mock_fetch: Mock
) -> None:
    """Cookies returned by the fetch are merged into each queued next task."""
    start_time = datetime.datetime.now()
    mock_fetch.return_value = (TEST_HTML, {1: 1})
    mock_get_more.return_value = [TEST_TASK]

    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )
    )

    # The queued task should carry the response cookies.
    task_with_cookies = Task.evolve(TEST_TASK, cookies={1: 1})
    expected_tasks = [
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=task_with_cookies,
            scraper_start_time=start_time,
        )
    ]
    self.assertCountEqual(expected_tasks, scraper.tasks)
def test_scrape_data_no_more_tasks(self, mock_get_more, mock_fetch,
                                   mock_populate, mock_write):
    """A terminal SCRAPE_DATA task with batch writes disabled persists the
    converted proto directly and enqueues nothing further."""
    start_time = datetime.datetime.now()
    mock_fetch.return_value = (TEST_HTML, {})
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )

    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=Task.evolve(
                TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA),
            scraper_start_time=start_time,
        )
    )

    expected_metadata = IngestMetadata(
        scraper.region.region_code,
        scraper.region.jurisdiction_id,
        start_time,
        scraper.get_enum_overrides(),
    )
    expected_proto = convert_ingest_info_to_proto(self.ii)

    self.assertEqual(mock_get_more.call_count, 0)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 1)
    mock_write.assert_called_once_with(expected_proto, expected_metadata)
    self.assertEqual(len(scraper.tasks), 0)
def test_scrape_data_and_more_no_persist(
        self, mock_get_more, mock_fetch, mock_populate, mock_write):
    """When populate_data declines to persist, the ingest_info rides along on
    the follow-up queue request instead of being written."""
    start_time = datetime.datetime.now()
    mock_fetch.return_value = (TEST_HTML, {})
    mock_get_more.return_value = [TEST_TASK]
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=False,
    )

    combined_task = Task.evolve(
        TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE)
    scraper = FakeScraper('test')
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=combined_task,
            scraper_start_time=start_time,
        )
    )

    # Should send the ii since we chose not to persist.
    expected_tasks = [
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
            ingest_info=self.ii,
        )
    ]
    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 0)
    mock_get_more.assert_called_once_with(TEST_HTML, combined_task)
    self.assertCountEqual(expected_tasks, scraper.tasks)
def test_fetch_sends_all_args(self, mock_get_more, mock_fetch):
    """Every request field on the task is forwarded to _fetch_content."""
    start_time = datetime.datetime.now()
    mock_fetch.return_value = (TEST_HTML, None)
    mock_get_more.return_value = [TEST_TASK]

    full_task = Task.evolve(
        TEST_TASK,
        headers='TEST_HEADERS',
        cookies='TEST_COOKIES',
        params='TEST_PARAMS',
        post_data='TEST_POST',
        json='TEST_JSON',
    )

    scraper = FakeScraper('test')
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=full_task,
            scraper_start_time=start_time,
        )
    )

    mock_fetch.assert_called_once_with(
        full_task.endpoint,
        full_task.response_type,
        headers=full_task.headers,
        cookies=full_task.cookies,
        params=full_task.params,
        post_data=full_task.post_data,
        json_data=full_task.json,
    )
    expected_tasks = [
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )
    ]
    self.assertCountEqual(expected_tasks, scraper.tasks)
def test_scrape_data_and_more_yes_persist(
    self,
    mock_get_more: Mock,
    mock_fetch: Mock,
    mock_populate: Mock,
    mock_write: Mock,
) -> None:
    """A SCRAPE_DATA_AND_MORE task that persists writes immediately and still
    enqueues follow-up tasks — without attaching the ingest_info to them."""
    start_time = datetime.datetime.now()
    mock_fetch.return_value = (TEST_HTML, {})
    mock_get_more.return_value = [TEST_TASK]
    mock_populate.return_value = ScrapedData(
        ingest_info=self.ii,
        persist=True,
    )

    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=Task.evolve(
                TEST_TASK, task_type=constants.TaskType.SCRAPE_DATA_AND_MORE),
            scraper_start_time=start_time,
        )
    )

    # Should send the ii since we chose not to persist.
    expected_tasks = [
        QueueRequest(
            scrape_type=constants.ScrapeType.BACKGROUND,
            next_task=TEST_TASK,
            scraper_start_time=start_time,
        )
    ]
    expected_metadata = IngestMetadata(
        region=scraper.region.region_code,
        jurisdiction_id=scraper.region.jurisdiction_id,
        ingest_time=start_time,
        enum_overrides=scraper.get_enum_overrides(),
        system_level=SystemLevel.COUNTY,
        database_key=SQLAlchemyDatabaseKey.for_schema(SchemaType.JAILS),
    )
    expected_proto = convert_ingest_info_to_proto(self.ii)

    self.assertEqual(mock_get_more.call_count, 1)
    self.assertEqual(mock_populate.call_count, 1)
    self.assertEqual(mock_write.call_count, 1)
    mock_write.assert_called_once_with(expected_proto, expected_metadata)
    self.assertCountEqual(expected_tasks, scraper.tasks)
def test_content_no_fetch(self, mock_get_more: Mock, mock_fetch: Mock) -> None:
    """A task that already carries content must be scraped without fetching."""
    start_time = datetime.datetime.now()
    preloaded_task = Task.evolve(TEST_TASK, content=TEST_HTML)
    mock_get_more.return_value = [preloaded_task]

    request = QueueRequest(
        scrape_type=constants.ScrapeType.BACKGROUND,
        next_task=preloaded_task,
        scraper_start_time=start_time,
    )
    scraper = FakeScraper("test")
    scraper.BATCH_WRITES = False
    scraper._generic_scrape(request)

    # No network call happened; the same request was simply re-enqueued.
    self.assertEqual(mock_fetch.call_count, 0)
    self.assertCountEqual([request], scraper.tasks)
def _generic_scrape(self, request: QueueRequest):
    """ General handler for all scrape tasks.

    This function is a generic entry point into all types of scrapes.
    It decides what to call based on params.

    Args:
        params: dict of parameters passed from the last scrape session.

    The flow is: (1) obtain content (either pre-supplied on the task or
    fetched from the endpoint), (2) optionally scrape data from it,
    (3) optionally enqueue follow-up tasks, (4) persist scraped data
    either via batch persistence or a direct database write.

    Raises:
        ScraperFetchError: if fetching the endpoint content fails.
        ScraperPopulateDataError: if populate_data fails.
        ScraperGetMoreTasksError: if get_more_tasks fails.
    """
    try:
        task = request.next_task

        # Here we handle a special case where we weren't really sure
        # we were going to get data when we submitted a task, but then
        # we ended up with data, so no more requests are required,
        # just the content we already have.
        # TODO(#680): remove this
        if task.content is not None:
            content = self._parse_html_content(task.content)
            cookies = None
        else:
            post_data = task.post_data

            # Let the child transform the post_data if it wants before
            # sending the requests.  This hook is in here in case the
            # child did something like compress the post_data before
            # it put it on the queue.
            self.transform_post_data(post_data)

            # We always fetch some content before doing anything.
            # Note that we use get here for the post_data to return a
            # default value of None if this scraper doesn't set it.
            try:
                content, cookies = self._fetch_content(
                    task.endpoint, task.response_type, headers=task.headers,
                    cookies=task.cookies, params=task.params,
                    post_data=post_data, json_data=task.json)
            except Exception as e:
                raise ScraperFetchError(str(e)) from e

        scraped_data = None
        if self.should_scrape_data(task.task_type):
            # If we want to scrape data, we should either create an
            # ingest_info object or get the one that already exists.
            logging.info("Scraping data for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)
            try:
                # Reuse the ingest_info carried over from a prior task
                # (the deferred-persist case) when one is present.
                scraped_data = self.populate_data(
                    content, task, request.ingest_info or IngestInfo())
            except Exception as e:
                raise ScraperPopulateDataError(str(e)) from e

        if self.should_get_more_tasks(task.task_type):
            logging.info("Getting more tasks for [%s] and endpoint: [%s]",
                         self.region.region_code, task.endpoint)

            # Only send along ingest info if it will not be persisted now.
            ingest_info_to_send = None
            if scraped_data is not None and not scraped_data.persist:
                ingest_info_to_send = scraped_data.ingest_info

            try:
                # pylint: disable=assignment-from-no-return
                next_tasks = self.get_more_tasks(content, task)
            except Exception as e:
                raise ScraperGetMoreTasksError(str(e)) from e
            for next_task in next_tasks:
                # Include cookies received from response, if any
                if cookies:
                    cookies.update(next_task.cookies)
                    next_task = Task.evolve(next_task, cookies=cookies)
                self.add_task('_generic_scrape', QueueRequest(
                    scrape_type=request.scrape_type,
                    scraper_start_time=request.scraper_start_time,
                    next_task=next_task,
                    ingest_info=ingest_info_to_send,
                ))

        if scraped_data is not None and scraped_data.persist:
            if scraped_data.ingest_info:
                logging.info("Logging at most 4 people (were %d):",
                             len(scraped_data.ingest_info.people))
                loop_count = min(len(scraped_data.ingest_info.people),
                                 constants.MAX_PEOPLE_TO_LOG)
                for i in range(loop_count):
                    logging.info("[%s]",
                                 str(scraped_data.ingest_info.people[i]))
                logging.info("Last seen time of person being set as: [%s]",
                             request.scraper_start_time)
                metadata = IngestMetadata(self.region.region_code,
                                          self.region.jurisdiction_id,
                                          request.scraper_start_time,
                                          self.get_enum_overrides())
                if self.BATCH_WRITES:
                    # Batch mode: enqueue the raw ingest_info for a later
                    # consolidated write.
                    logging.info(
                        "Queuing ingest_info ([%d] people) to "
                        "batch_persistence for [%s]",
                        len(scraped_data.ingest_info.people),
                        self.region.region_code)
                    scrape_key = ScrapeKey(self.region.region_code,
                                           request.scrape_type)
                    batch_persistence.write(
                        ingest_info=scraped_data.ingest_info,
                        scrape_key=scrape_key,
                        task=task,
                    )
                else:
                    # Direct mode: convert to proto and write immediately.
                    logging.info(
                        "Writing ingest_info ([%d] people) to the database"
                        " for [%s]",
                        len(scraped_data.ingest_info.people),
                        self.region.region_code)
                    persistence.write(
                        ingest_utils.convert_ingest_info_to_proto(
                            scraped_data.ingest_info),
                        metadata)
            for sc in scraped_data.single_counts:
                if not sc.date:
                    # Default a missing count date to the current scrape
                    # session's start date.
                    scrape_key = ScrapeKey(self.region.region_code,
                                           constants.ScrapeType.BACKGROUND)
                    session = sessions.get_current_session(scrape_key)
                    if session:
                        sc = attr.evolve(sc, date=session.start.date())
                single_count.store_single_count(
                    sc, self.region.jurisdiction_id)
    except Exception as e:
        # In batch mode, record the failure so the batch can account for
        # this task before re-raising for normal retry handling.
        if self.BATCH_WRITES:
            scrape_key = ScrapeKey(self.region.region_code,
                                   request.scrape_type)
            batch_persistence.write_error(
                error=str(e),
                trace_id=get_trace_id_from_flask(),
                task=task,
                scrape_key=scrape_key,
            )
        raise e