def test_persist_to_db_different_regions(self, mock_write, _mock_region,
                                             mock_session_return):
        """Persisting one region's batch must not touch another region's data.

        Writes one ingest info per region under separate scrape keys, persists
        each region in turn, and checks that (a) the persisted proto matches
        what was written, (b) the persisted region's Datastore batch is
        emptied, and (c) the other region's batch is intact until its own
        persist call.
        """
        scrape_key1 = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        scrape_key2 = ScrapeKey(REGIONS[1], constants.ScrapeType.BACKGROUND)

        ii = ingest_info.IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        ii2 = ingest_info.IngestInfo()
        ii2.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME2).create_booking(booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        t2 = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        # The mocked session factory hands out this session for region 1.
        mock_session_1 = mock_session_return.return_value = create_mock_session(
        )

        batch_persistence.write(ii, scrape_key1, t)
        expected_proto = serialization.convert_ingest_info_to_proto(ii)
        batch_persistence.persist_to_database(scrape_key1.region_code,
                                              mock_session_1.start)

        # First positional argument of the mocked write is the persisted proto.
        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        # We expect the region that we persisted to have no more ingest infos.
        ingest_infos_1 = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], mock_session_1.start)
        self.assertEqual(len(ingest_infos_1), 0)

        # Swap the session factory to a fresh session before region 2's write.
        mock_session_2 = mock_session_return.return_value = create_mock_session(
        )

        batch_persistence.write(ii2, scrape_key2, t2)
        # Region 2's batch still exists: persisting region 1 did not delete it.
        ingest_infos_2 = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[1], mock_session_2.start)
        self.assertEqual(len(ingest_infos_2), 1)

        expected_proto = serialization.convert_ingest_info_to_proto(ii2)
        batch_persistence.persist_to_database(scrape_key2.region_code,
                                              mock_session_2.start)

        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        # Exactly one write call per region persisted.
        self.assertEqual(mock_write.call_count, 2)
    def test_batch_delete_ingest_infos_for_region(self):
        """Deleting a region's batch leaves other regions' entities intact."""
        serialized_task = json.dumps(
            Task(
                task_type=constants.TaskType.SCRAPE_DATA,
                endpoint=TEST_ENDPOINT,
                response_type=constants.ResponseType.TEXT,
            ).to_serializable(),
            sort_keys=True,
        )
        task_hash = hash(serialized_task)
        start_time = datetime.now()

        # Three entities for the region that will be deleted...
        for entity_id in ("1", "2", "3"):
            datastore_ingest_info.write_ingest_info(
                region="us_state_county",
                session_start_time=start_time,
                ingest_info=sample_ingest_info(entity_id),
                task_hash=task_hash,
            )
        # ...and one in a different region that must survive the delete.
        unrelated = datastore_ingest_info.write_ingest_info(
            region="unrelated_us_state_county",
            session_start_time=start_time,
            ingest_info=sample_ingest_info("n/a"),
            task_hash=task_hash,
        )

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")

        remaining = datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time)
        assert remaining == []

        survivors = datastore_ingest_info.batch_get_ingest_infos_for_region(
            "unrelated_us_state_county", start_time)
        assert survivors == [unrelated]

        # Clean up the unrelated region too so later tests start empty.
        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "unrelated_us_state_county")
    def test_persist_to_db(self, mock_write, _mock_region,
                           mock_session_return):
        """A written ingest info is persisted and then removed from Datastore."""
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)

        info = ingest_info.IngestInfo()
        person = info.create_person(person_id=TEST_ID, full_name=TEST_NAME)
        person.create_booking(booking_id=TEST_ID)

        task = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        batch_persistence.write(info, scrape_key, task)

        # Snapshot the expected proto before persisting.
        expected = serialization.convert_ingest_info_to_proto(info)

        batch_persistence.persist_to_database(scrape_key.region_code,
                                              mock_session.start)

        # The mocked write received the converted proto as its first argument.
        written_proto = mock_write.call_args[0][0]
        self.assertEqual(written_proto, expected)

        # After we persist, there should no longer be ingest infos on Datastore
        leftovers = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], mock_session.start)
        self.assertEqual(len(leftovers), 0)
    def test_write_errors(self):
        """An error written for a region comes back from a batch read."""
        task = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )
        task_hash = hash(json.dumps(task.to_serializable(), sort_keys=True))

        start_time = datetime.now()
        written = datastore_ingest_info.write_error(
            region="us_state_county",
            session_start_time=start_time,
            error="error string",
            trace_id="trace",
            task_hash=task_hash,
        )

        fetched = datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time)
        assert fetched == [written]

        # Clean up so subsequent tests see an empty region.
        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")
    def test_batch_delete_over_500_ingest_infos_for_region(self):
        """Batch delete handles more entities than one Datastore call allows."""
        task = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )
        task_hash = hash(json.dumps(task.to_serializable(), sort_keys=True))
        start_time = datetime.now()

        # The Datastore limit for entity writes in one call is 500. Confirm
        # that batch delete is properly handled when more than 500 entities
        # exist for the same region.
        for index in range(600):
            datastore_ingest_info.write_ingest_info(
                region="us_state_county",
                session_start_time=start_time,
                ingest_info=sample_ingest_info(str(index)),
                task_hash=task_hash,
            )

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            "us_state_county")

        remaining = datastore_ingest_info.batch_get_ingest_infos_for_region(
            "us_state_county", start_time)
        assert remaining == []
# Example #6
    def test_batch_delete_ingest_infos_for_region(self):
        """Region-scoped batch delete removes only that region's entities."""
        task_hash = hash(
            json.dumps(
                Task(
                    task_type=constants.TaskType.SCRAPE_DATA,
                    endpoint=TEST_ENDPOINT,
                    response_type=constants.ResponseType.TEXT).to_serializable(),
                sort_keys=True))
        start_time = datetime.now()

        # Populate the target region with three entities.
        for suffix in ('1', '2', '3'):
            datastore_ingest_info.write_ingest_info(
                region='us_state_county',
                session_start_time=start_time,
                ingest_info=sample_ingest_info(suffix),
                task_hash=task_hash)
        # An entity in a different region must be untouched by the delete.
        untouched = datastore_ingest_info.write_ingest_info(
            region='unrelated_us_state_county',
            session_start_time=start_time,
            ingest_info=sample_ingest_info('n/a'),
            task_hash=task_hash)

        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            'us_state_county')

        assert datastore_ingest_info.batch_get_ingest_infos_for_region(
            'us_state_county', start_time) == []

        remaining = datastore_ingest_info.batch_get_ingest_infos_for_region(
            'unrelated_us_state_county', start_time)
        assert remaining == [untouched]

        # Clean up the unrelated region as well.
        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            'unrelated_us_state_county')
# Example #7
def _get_batch_ingest_info_list(
        region_code: str,
        session_start_time: datetime.datetime) -> List[BatchIngestInfoData]:
    """Fetches every batched message stored in Datastore for one scrape.

    Args:
        region_code (str): The region code of the scraper.
        session_start_time (datetime): The start time of the scraper.

    Returns:
        A list of BatchIngestInfoData.
    """
    messages = datastore_ingest_info.batch_get_ingest_infos_for_region(
        region_code, session_start_time)
    return messages
# Example #8
    def test_write_single_ingest_info(self):
        """A single written ingest info round-trips through a batch read."""
        task_hash = hash(
            json.dumps(
                Task(
                    task_type=constants.TaskType.SCRAPE_DATA,
                    endpoint=TEST_ENDPOINT,
                    response_type=constants.ResponseType.TEXT).to_serializable(),
                sort_keys=True))

        start_time = datetime.now()
        # Local renamed from `ingest_info` to avoid shadowing the module.
        written = datastore_ingest_info.write_ingest_info(
            region='us_state_county',
            session_start_time=start_time,
            ingest_info=sample_ingest_info('1'),
            task_hash=task_hash)

        fetched = datastore_ingest_info.batch_get_ingest_infos_for_region(
            'us_state_county', start_time)
        assert fetched == [written]

        # Clean up so the region is empty for subsequent tests.
        datastore_ingest_info.batch_delete_ingest_infos_for_region(
            'us_state_county')
    def test_persist_to_db_same_task_one_fail_one_pass(self, mock_write,
                                                       _mock_region,
                                                       mock_session_return):
        """A failed attempt at a task is forgiven by a later success.

        Writes an ingest info and an error for two Task objects built with
        identical fields, then checks that persist still succeeds and the
        successful ingest info is what gets written.
        """
        mock_session = mock_session_return.return_value = create_mock_session()
        scrape_key = ScrapeKey(REGIONS[0], constants.ScrapeType.BACKGROUND)
        # Mocked database write reports success.
        mock_write.return_value = True

        ii = ingest_info.IngestInfo()
        ii.create_person(
            person_id=TEST_ID,
            full_name=TEST_NAME).create_booking(booking_id=TEST_ID)

        t = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        # Because the tasks are the same, we expect that to be counted as a
        # pass.
        t2 = Task(
            task_type=constants.TaskType.SCRAPE_DATA,
            endpoint=TEST_ENDPOINT,
            response_type=constants.ResponseType.TEXT,
        )

        # Success for t, then an error for the identical t2.
        batch_persistence.write(ii, scrape_key, t)
        batch_persistence.write_error(TEST_ERROR, TEST_TRACE, t2, scrape_key)

        expected_proto = serialization.convert_ingest_info_to_proto(ii)

        # Persist must report success despite the recorded error.
        self.assertTrue(
            batch_persistence.persist_to_database(scrape_key.region_code,
                                                  mock_session.start))

        # First positional argument of the mocked write is the persisted proto.
        result_proto = mock_write.call_args[0][0]
        self.assertEqual(result_proto, expected_proto)

        # Persisting clears the region's batch from Datastore.
        ingest_infos = datastore_ingest_info.batch_get_ingest_infos_for_region(
            REGIONS[0], mock_session.start)
        self.assertEqual(len(ingest_infos), 0)