Example #1
    def test_raw_data_import(self, mock_supported, mock_region,
                             mock_environment):
        mock_supported.return_value = ['us_xx']

        region_code = 'us_xx'

        mock_environment.return_value = 'staging'
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment='staging',
                                               ingestor=mock_controller)

        import_args = GcsfsRawDataBQImportArgs(
            raw_data_file_path=GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(
                    'bucket/raw_data_path.csv',
                    file_type=GcsfsDirectIngestFileType.RAW_DATA)))
        request_args = {
            'region': region_code,
        }
        body = {
            'cloud_task_args': import_args.to_serializable(),
            'args_type': 'GcsfsRawDataBQImportArgs',
        }
        body_encoded = json.dumps(body).encode()

        headers = {'X-Appengine-Cron': 'test-cron'}

        response = self.client.post('/raw_data_import',
                                    query_string=request_args,
                                    headers=headers,
                                    data=body_encoded)
        self.assertEqual(200, response.status_code)
        mock_controller.do_raw_data_import.assert_called_with(import_args)
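
# A plausible sketch, not shown in these examples, of the /raw_data_import
# handler exercised by the test above. The blueprint `bp`, the region lookup,
# and the get_ingestor() accessor are illustrative assumptions; the body
# format matches json_to_cloud_task_args, which appears later in this section.
from http import HTTPStatus

from flask import request


@bp.route('/raw_data_import', methods=['POST'])
def _raw_data_import():
    region_code = request.args['region']
    json_data = json.loads(request.get_data(as_text=True))
    data_import_args = json_to_cloud_task_args(json_data)
    # Assumed helpers: region_for_code() and get_ingestor() stand in for
    # whatever region/controller lookup the real endpoint performs.
    controller = region_for_code(region_code).get_ingestor()
    controller.do_raw_data_import(data_import_args)
    return '', HTTPStatus.OK
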
    def test_create_direct_ingest_raw_data_import_task(
            self, mock_client: mock.MagicMock,
            mock_uuid: mock.MagicMock) -> None:
        # Arrange
        raw_data_path = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(
                "bucket/raw_data_path.csv",
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
            ))
        import_args = GcsfsRawDataBQImportArgs(
            raw_data_file_path=raw_data_path)
        body = {
            "cloud_task_args": import_args.to_serializable(),
            "args_type": "GcsfsRawDataBQImportArgs",
        }
        body_encoded = json.dumps(body).encode()
        uuid = "random-uuid"
        mock_uuid.uuid4.return_value = uuid
        date = "2019-07-20"
        queue_path = f"{DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2}-path"

        task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + "/{}-{}-{}".format(
            _REGION.region_code, date, uuid)
        url_params = {
            "region": _REGION.region_code,
            "file_path": raw_data_path.abs_path(),
        }
        task = tasks_v2.types.task_pb2.Task(
            name=task_name,
            app_engine_http_request={
                "http_method": "POST",
                "relative_uri":
                f"/direct/raw_data_import?{urlencode(url_params)}",
                "body": body_encoded,
            },
        )

        mock_client.return_value.task_path.return_value = task_name
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        DirectIngestCloudTaskManagerImpl(
        ).create_direct_ingest_raw_data_import_task(
            _REGION, DirectIngestInstance.PRIMARY, import_args)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            self.mock_project_id, QUEUES_REGION,
            DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)

    def _schedule_raw_data_import_tasks(self) -> bool:
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            return False

        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(
            self.region)

        did_schedule = False
        tasks_to_schedule = [
            GcsfsRawDataBQImportArgs(path) for path in
            self.raw_file_import_manager.get_unprocessed_raw_files_to_import()
        ]
        for task_args in tasks_to_schedule:
            # If the file path has not actually been discovered by the
            # metadata manager yet, it was likely just added. A subsequent
            # call to handle_files will register it and trigger another call
            # to this function, at which point we can schedule the
            # appropriate job.
            discovered = self.file_metadata_manager.has_file_been_discovered(
                task_args.raw_data_file_path)
            if discovered and not queue_info.has_task_already_scheduled(
                    task_args):
                self.cloud_task_manager.create_direct_ingest_raw_data_import_task(
                    self.region, task_args)
                did_schedule = True

        return queue_info.has_raw_data_import_jobs_queued() or did_schedule
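
    # A minimal sketch, assuming queue task names embed the args'
    # task_id_tag (as the task-id construction in the tests suggests):
    # has_task_already_scheduled, used above, could then be a substring
    # check. The task_names attribute on the queue info is an assumption.
    def has_task_already_scheduled(self, task_args: CloudTaskArgs) -> bool:
        return any(task_args.task_id_tag() in task_name
                   for task_name in self.task_names)
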
    def create_direct_ingest_raw_data_import_task(
        self,
        region: Region,
        ingest_instance: DirectIngestInstance,
        data_import_args: GcsfsRawDataBQImportArgs,
    ) -> None:
        task_id = _build_task_id(
            region.region_code,
            ingest_instance,
            task_id_tag=data_import_args.task_id_tag(),
            prefix_only=False,
        )

        params = {
            "region": region.region_code.lower(),
            "file_path": data_import_args.raw_data_file_path.abs_path(),
        }
        relative_uri = f"/direct/raw_data_import?{urlencode(params)}"

        body = self._get_body_from_args(data_import_args)

        self._get_bq_import_export_queue_manager(region,
                                                 ingest_instance).create_task(
                                                     task_id=task_id,
                                                     relative_uri=relative_uri,
                                                     body=body,
                                                 )
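
# A hypothetical sketch of _build_task_id, which the method above calls but
# which is not shown in these examples. The tests suggest full task ids end
# in "{date}-{uuid}"; the instance segment and exact layout are assumptions.
import datetime
import uuid


def _build_task_id(region_code: str,
                   ingest_instance: DirectIngestInstance,
                   task_id_tag: Optional[str],
                   prefix_only: bool) -> str:
    parts = [region_code, ingest_instance.value.lower()]
    if task_id_tag:
        parts.append(task_id_tag)
    if not prefix_only:
        # Full (non-prefix) ids get a date and uuid suffix, matching the
        # "{region_code}-{date}-{uuid}" pattern asserted in the tests above.
        parts.append(str(datetime.date.today()))
        parts.append(str(uuid.uuid4()))
    return "-".join(parts)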

Example #5
    def test_create_direct_ingest_raw_data_import_task(self, mock_client,
                                                       mock_uuid):
        # Arrange
        project_id = 'recidiviz-456'
        import_args = GcsfsRawDataBQImportArgs(
            raw_data_file_path=GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(
                    'bucket/raw_data_path.csv',
                    file_type=GcsfsDirectIngestFileType.RAW_DATA)))
        body = {
            'cloud_task_args': import_args.to_serializable(),
            'args_type': 'GcsfsRawDataBQImportArgs'
        }
        body_encoded = json.dumps(body).encode()
        uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = uuid
        date = '2019-07-20'
        queue_path = _REGION.shared_queue + '-path'

        task_name = DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2 + '/{}-{}-{}'.format(
            _REGION.region_code, date, uuid)
        task = tasks_v2.types.task_pb2.Task(
            name=task_name,
            app_engine_http_request={
                'http_method': 'POST',
                'relative_uri':
                f'/direct/raw_data_import?region={_REGION.region_code}',
                'body': body_encoded
            })

        mock_client.return_value.task_path.return_value = task_name
        mock_client.return_value.queue_path.return_value = queue_path

        # Act
        DirectIngestCloudTaskManagerImpl(
            project_id=project_id).create_direct_ingest_raw_data_import_task(
                _REGION, import_args)

        # Assert
        mock_client.return_value.queue_path.assert_called_with(
            project_id, QUEUES_REGION, DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2)
        mock_client.return_value.create_task.assert_called_with(
            parent=queue_path, task=task)

    def test_raw_data_import(
        self,
        mock_supported: mock.MagicMock,
        mock_region: mock.MagicMock,
        mock_environment: mock.MagicMock,
    ) -> None:
        mock_supported.return_value = ["us_xx"]

        region_code = "us_xx"

        mock_environment.return_value = "staging"
        mock_controller = create_autospec(GcsfsDirectIngestController)
        mock_region.return_value = fake_region(region_code=region_code,
                                               environment="staging",
                                               ingestor=mock_controller)

        import_args = GcsfsRawDataBQImportArgs(
            raw_data_file_path=GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path(
                    "bucket/raw_data_path.csv",
                    file_type=GcsfsDirectIngestFileType.RAW_DATA,
                )))
        request_args = {
            "region": region_code,
        }
        body = {
            "cloud_task_args": import_args.to_serializable(),
            "args_type": "GcsfsRawDataBQImportArgs",
        }
        body_encoded = json.dumps(body).encode()

        headers = {"X-Appengine-Cron": "test-cron"}

        response = self.client.post(
            "/raw_data_import",
            query_string=request_args,
            headers=headers,
            data=body_encoded,
        )
        self.assertEqual(200, response.status_code)
        mock_controller.do_raw_data_import.assert_called_with(import_args)

def json_to_cloud_task_args(json_data: dict) -> Optional[CloudTaskArgs]:
    if "cloud_task_args" in json_data and "args_type" in json_data:
        args_type = json_data["args_type"]
        cloud_task_args_dict = json_data["cloud_task_args"]
        if args_type == GcsfsIngestArgs.__name__:
            return GcsfsIngestArgs.from_serializable(cloud_task_args_dict)
        if args_type == GcsfsRawDataBQImportArgs.__name__:
            return GcsfsRawDataBQImportArgs.from_serializable(
                cloud_task_args_dict)
        if args_type == GcsfsIngestViewExportArgs.__name__:
            return GcsfsIngestViewExportArgs.from_serializable(
                cloud_task_args_dict)
        logging.error("Unexpected args_type in json_data: %s", args_type)
    return None
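
# A usage sketch for the dispatcher above, assuming `import_args` is a
# GcsfsRawDataBQImportArgs as constructed in the tests earlier: serializing
# on the producer side and parsing on the handler side round-trips cleanly.
body = {
    "cloud_task_args": import_args.to_serializable(),
    "args_type": type(import_args).__name__,
}
recovered = json_to_cloud_task_args(json.loads(json.dumps(body)))
assert isinstance(recovered, GcsfsRawDataBQImportArgs)
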
    def _schedule_raw_data_import_tasks(self) -> bool:
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            return False

        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(self.region)

        did_schedule = False
        tasks_to_schedule = [GcsfsRawDataBQImportArgs(path)
                             for path in self.raw_file_import_manager.get_unprocessed_raw_files_to_import()]
        for task_args in tasks_to_schedule:
            if not queue_info.has_task_already_scheduled(task_args):
                self.cloud_task_manager.create_direct_ingest_raw_data_import_task(self.region, task_args)
                did_schedule = True

        return queue_info.has_raw_data_import_jobs_queued() or did_schedule

    def create_direct_ingest_raw_data_import_task(
            self, region: Region, data_import_args: GcsfsRawDataBQImportArgs):
        task_id = _build_task_id(region.region_code,
                                 task_id_tag=data_import_args.task_id_tag(),
                                 prefix_only=False)
        relative_uri = f'/direct/raw_data_import?region={region.region_code}'

        body = self._get_body_from_args(data_import_args)

        self.cloud_task_client.create_task(
            task_id=task_id,
            queue_name=DIRECT_INGEST_BQ_IMPORT_EXPORT_QUEUE_V2,
            relative_uri=relative_uri,
            body=body,
        )

def json_to_cloud_task_args(json_data: dict):
    if 'cloud_task_args' in json_data and 'args_type' in json_data:
        args_type = json_data['args_type']
        cloud_task_args_dict = json_data['cloud_task_args']
        if args_type == IngestArgs.__name__:
            return IngestArgs.from_serializable(cloud_task_args_dict)
        if args_type == GcsfsIngestArgs.__name__:
            return GcsfsIngestArgs.from_serializable(cloud_task_args_dict)
        if args_type == GcsfsRawDataBQImportArgs.__name__:
            return GcsfsRawDataBQImportArgs.from_serializable(
                cloud_task_args_dict)
        if args_type == GcsfsIngestViewExportArgs.__name__:
            return GcsfsIngestViewExportArgs.from_serializable(
                cloud_task_args_dict)
        logging.error('Unexpected args_type in json_data: %s', args_type)
    return None

Example #11
    def create_direct_ingest_raw_data_import_task(
            self, region: Region,
            data_import_args: GcsfsRawDataBQImportArgs) -> None:
        task_id = _build_task_id(
            region.region_code,
            task_id_tag=data_import_args.task_id_tag(),
            prefix_only=False,
        )
        relative_uri = f"/direct/raw_data_import?region={region.region_code}"

        body = self._get_body_from_args(data_import_args)

        self.bq_import_export_cloud_task_queue_manager.create_task(
            task_id=task_id,
            relative_uri=relative_uri,
            body=body,
        )

Example #12
    def _schedule_raw_data_import_tasks(self) -> bool:
        """Schedules all pending ingest view export tasks for launched ingest view tags,
        if they have not been scheduled. If tasks are scheduled or are still running,
        returns True. Otherwise, if it's safe to proceed with next steps of ingest,
        returns False."""
        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(
            self.region, self.ingest_instance)

        did_schedule = False
        tasks_to_schedule = [
            GcsfsRawDataBQImportArgs(path) for path in
            self.raw_file_import_manager.get_unprocessed_raw_files_to_import()
        ]
        for task_args in tasks_to_schedule:
            # If the file path has not actually been discovered by the
            # metadata manager yet, it was likely just added. A subsequent
            # call to handle_files will register it and trigger another call
            # to this function, at which point we can schedule the
            # appropriate job.
            discovered = self.file_metadata_manager.has_raw_file_been_discovered(
                task_args.raw_data_file_path)
            # If the file path has been processed but is still in the GCS
            # bucket, that is likely due to either a manual move or an
            # accidental duplicate upload. In either case, we trust the
            # database as the source of truth.
            processed = self.file_metadata_manager.has_raw_file_been_processed(
                task_args.raw_data_file_path)
            if processed:
                logging.warning(
                    "File [%s] is already marked as processed. Skipping file processing.",
                    task_args.raw_data_file_path,
                )
            if (discovered and not processed
                    and not queue_info.has_task_already_scheduled(task_args)):
                self.cloud_task_manager.create_direct_ingest_raw_data_import_task(
                    self.region, self.ingest_instance, task_args)
                did_schedule = True

        return queue_info.has_raw_data_import_jobs_queued() or did_schedule
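
# The gating above reduces to one predicate per file; a minimal restatement
# for clarity, with no new behavior beyond the loop body shown above:
def _should_schedule_import(discovered: bool, processed: bool,
                            already_scheduled: bool) -> bool:
    return discovered and not processed and not already_scheduled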