Example #1
0
    def test_is_task_queued_has_tasks(self):
        """A queued task is found when looked up with args that were built
        independently of the args used to name the queued task."""
        # Arrange
        queued_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        queued_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(queued_path))
        queued_task_id = _build_task_id(
            _REGION.region_code, queued_args.task_id_tag())

        task_names = [
            'projects/path/to/random_task',
            f'projects/path/to/{queued_task_id}',
        ]
        info = ProcessIngestJobCloudTaskQueueInfo(
            queue_name='queue_name', task_names=task_names)

        # Build a second path/args pair for the same file so the lookup does
        # not reuse the objects that produced the queued task name.
        lookup_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        lookup_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(lookup_path))

        # Act
        is_queued = info.is_task_queued(_REGION, lookup_args)

        # Assert
        self.assertTrue(is_queued)
    def test_info_single_task(self) -> None:
        """A single PRIMARY task is reported as queued, shows up under the
        PRIMARY instance and does not show up under SECONDARY."""
        # Arrange
        queued_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=self.ingest_view_file_path,
        )
        queued_task_id = _build_task_id(
            _REGION.region_code,
            DirectIngestInstance.PRIMARY,
            queued_args.task_id_tag(),
        )
        task_names = [
            "projects/path/to/random_task",
            f"projects/path/to/{queued_task_id}",
        ]
        info = ProcessIngestJobCloudTaskQueueInfo(
            queue_name="queue_name", task_names=task_names)

        # Separate args object for the same file, created after the queue
        # info was assembled.
        lookup_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=self.ingest_view_file_path,
        )

        # Act
        is_queued = info.is_task_queued(_REGION, lookup_args)

        # Assert
        self.assertTrue(is_queued)
        region_code = _REGION.region_code
        self.assertTrue(
            info.tasks_for_instance(region_code,
                                    DirectIngestInstance.PRIMARY))
        self.assertFalse(
            info.tasks_for_instance(region_code,
                                    DirectIngestInstance.SECONDARY))
    def test_create_direct_ingest_process_job_task_secondary(
            self, mock_client: mock.MagicMock,
            mock_uuid: mock.MagicMock) -> None:
        """Creating a process-job task for the SECONDARY instance looks up
        the expected queue path and submits the expected task."""
        # Arrange
        normalized_path = to_normalized_unprocessed_file_path(
            "bucket/ingest_view_name.csv",
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
        )
        ingest_args = GcsfsIngestArgs(
            datetime.datetime(year=2019, month=7, day=20),
            file_path=GcsfsFilePath.from_absolute_path(normalized_path),
        )
        expected_body = json.dumps({
            "cloud_task_args": ingest_args.to_serializable(),
            "args_type": "GcsfsIngestArgs",
        }).encode()

        fake_uuid = "random-uuid"
        mock_uuid.uuid4.return_value = fake_uuid
        date_str = "2019-07-20"
        expected_queue_path = "us-xx-process-queue-path"
        expected_queue_name = "direct-ingest-state-us-xx-process-job-queue"

        expected_task_name = (
            f"{DIRECT_INGEST_STATE_PROCESS_JOB_QUEUE_V2}/"
            f"{_REGION.region_code}-{date_str}-{fake_uuid}")
        query_string = urlencode({
            "region": _REGION.region_code,
            "file_path": normalized_path,
        })
        expected_task = tasks_v2.types.task_pb2.Task(
            name=expected_task_name,
            app_engine_http_request={
                "http_method": "POST",
                "relative_uri": "/direct/process_job?" + query_string,
                "body": expected_body,
            },
        )

        # The patched client class hands back this instance when constructed.
        client = mock_client.return_value
        client.task_path.return_value = expected_task_name
        client.queue_path.return_value = expected_queue_path

        # Act
        task_manager = DirectIngestCloudTaskManagerImpl()
        task_manager.create_direct_ingest_process_job_task(
            _REGION, DirectIngestInstance.SECONDARY, ingest_args)

        # Assert
        client.queue_path.assert_called_with(
            self.mock_project_id, QUEUES_REGION, expected_queue_name)
        client.create_task.assert_called_with(
            parent=expected_queue_path, task=expected_task)
    def create_direct_ingest_process_job_task(
        self,
        region: Region,
        ingest_instance: DirectIngestInstance,
        ingest_args: GcsfsIngestArgs,
    ) -> None:
        """Creates a process-job task for the given ingest args.

        The task targets `/direct/process_job`, carrying the lower-cased
        region code and the absolute file path as query parameters, and is
        created through the process-job queue manager selected for
        `region` and `ingest_instance`.
        """
        # Full task id (not just the prefix) for the new task.
        new_task_id = _build_task_id(
            region.region_code,
            ingest_instance,
            ingest_args.task_id_tag(),
            prefix_only=False,
        )

        query_string = urlencode({
            "region": region.region_code.lower(),
            "file_path": ingest_args.file_path.abs_path(),
        })
        task_uri = f"/direct/process_job?{query_string}"
        task_body = self._get_body_from_args(ingest_args)

        queue_manager = self._get_process_job_queue_manager(
            region, ingest_instance)
        queue_manager.create_task(
            task_id=new_task_id,
            relative_uri=task_uri,
            body=task_body,
        )
Example #5
0
def ingest_args_for_fixture_file(controller: GcsfsDirectIngestController,
                                 filename: str) -> GcsfsIngestArgs:
    """Builds ingest args for `filename` inside the controller's ingest
    directory, using the normalized unprocessed form of its path and the
    current time as the ingest time."""
    fixture_path = os.path.join(controller.ingest_directory_path, filename)
    normalized_path = to_normalized_unprocessed_file_path(fixture_path)
    ingest_args = GcsfsIngestArgs(
        ingest_time=datetime.datetime.now(),
        file_path=normalized_path,
    )
    return ingest_args
    def test_create_direct_ingest_process_job_task_gcsfs_args(
            self, mock_client: MagicMock, mock_uuid: MagicMock,
            mock_datetime: MagicMock) -> None:
        """Creating a process-job task from GcsfsIngestArgs looks up the
        region's shared queue and submits the expected task."""
        # Arrange
        normalized_path = to_normalized_unprocessed_file_path(
            "bucket/file_path.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
        ingest_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime(year=2019, month=7, day=20),
            file_path=GcsfsFilePath.from_absolute_path(normalized_path),
        )
        expected_body = json.dumps({
            "cloud_task_args": ingest_args.to_serializable(),
            "args_type": "GcsfsIngestArgs",
        }).encode()

        # Pin the uuid and the date reported by the patched modules.
        fake_uuid = "random-uuid"
        mock_uuid.uuid4.return_value = fake_uuid
        date_str = "2019-07-20"
        mock_datetime.date.today.return_value = date_str
        expected_queue_path = "{}-path".format(_REGION.shared_queue)

        task_suffix = f"/{_REGION.region_code}-{date_str}-{fake_uuid}"
        expected_task_name = _REGION.get_queue_name() + task_suffix
        expected_task = tasks_v2.types.task_pb2.Task(
            name=expected_task_name,
            app_engine_http_request={
                "http_method": "POST",
                "relative_uri":
                "/direct/process_job?region={}".format(_REGION.region_code),
                "body": expected_body,
            },
        )

        client = mock_client.return_value
        client.task_path.return_value = expected_task_name
        client.queue_path.return_value = expected_queue_path

        # Act
        task_manager = DirectIngestCloudTaskManagerImpl()
        task_manager.create_direct_ingest_process_job_task(
            _REGION, ingest_args)

        # Assert
        client.queue_path.assert_called_with(
            self.mock_project_id, QUEUES_REGION, _REGION.shared_queue)
        client.create_task.assert_called_with(
            parent=expected_queue_path, task=expected_task)
Example #7
0
    def test_create_direct_ingest_process_job_task_gcsfs_args(
            self, mock_client, mock_uuid, mock_datetime):
        """Creating a process-job task from GcsfsIngestArgs with an explicit
        project id looks up the shared queue and submits the expected task."""
        # Arrange
        project_id = 'recidiviz-456'
        normalized_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        ingest_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime(year=2019, month=7, day=20),
            file_path=GcsfsFilePath.from_absolute_path(normalized_path))
        expected_body = json.dumps({
            'cloud_task_args': ingest_args.to_serializable(),
            'args_type': 'GcsfsIngestArgs',
        }).encode()

        # Pin the uuid and the date reported by the patched modules.
        fake_uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = fake_uuid
        date_str = '2019-07-20'
        mock_datetime.date.today.return_value = date_str
        expected_queue_path = _REGION.shared_queue + '-path'

        task_suffix = f'/{_REGION.region_code}-{date_str}-{fake_uuid}'
        expected_task_name = _REGION.shared_queue + task_suffix
        request = {
            'http_method': 'POST',
            'relative_uri':
            '/direct/process_job?region={}'.format(_REGION.region_code),
            'body': expected_body,
        }
        expected_task = tasks_v2.types.task_pb2.Task(
            name=expected_task_name,
            app_engine_http_request=request)

        client = mock_client.return_value
        client.task_path.return_value = expected_task_name
        client.queue_path.return_value = expected_queue_path

        # Act
        task_manager = DirectIngestCloudTaskManagerImpl(project_id=project_id)
        task_manager.create_direct_ingest_process_job_task(
            _REGION, ingest_args)

        # Assert
        client.queue_path.assert_called_with(
            project_id, QUEUES_REGION, _REGION.shared_queue)
        client.create_task.assert_called_with(
            parent=expected_queue_path, task=expected_task)
Example #8
0
 def json_to_ingest_args(json_data):
     """Rebuilds an ingest-args object from its JSON dictionary form.

     Returns None when either the 'ingest_args' or 'args_type' key is
     missing, or when 'args_type' names a class this function does not
     handle (in which case an error is also logged).
     """
     if 'ingest_args' not in json_data or 'args_type' not in json_data:
         return None

     type_name = json_data['args_type']
     serialized_args = json_data['ingest_args']

     # Candidates are tried in this order; the first class whose name
     # matches does the deserialization.
     for args_cls in (IngestArgs, GcsfsIngestArgs):
         if type_name == args_cls.__name__:
             return args_cls.from_serializable(serialized_args)

     logging.error('Unexpected args_type in json_data: %s', type_name)
     return None
Example #9
0
def ingest_args_for_fixture_file(
        controller: GcsfsDirectIngestController,
        filename: str,
        should_normalize: bool = True) -> GcsfsIngestArgs:
    """Builds ingest args, stamped with the current time, for a fixture file.

    Raises:
        ValueError: if the resolved fixture path is not a GcsfsFilePath.
    """
    fixture_path = path_for_fixture_file(
        controller, filename, should_normalize)
    if isinstance(fixture_path, GcsfsFilePath):
        return GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=fixture_path,
        )
    raise ValueError(f'Unexpected type [{fixture_path}]')
    def test_is_task_queued_has_tasks(self):
        """A queued task is found when looked up with args that were rebuilt
        for the same file name."""
        # Arrange
        file_path = to_normalized_unprocessed_file_path('file_path.csv')
        gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                     file_path=file_path)
        full_task_name = _build_task_id(_REGION.region_code,
                                        gcsfs_args.task_id_tag())
        info = CloudTaskQueueInfo(
            queue_name='queue_name',
            task_names=[
                # Plain literal: nothing is interpolated into this name, so
                # it does not need the f-string prefix.
                'projects/path/to/random_task',
                f'projects/path/to/{full_task_name}',
            ])

        # Rebuild the path and args so the lookup does not reuse the objects
        # that produced the queued task name.
        file_path = to_normalized_unprocessed_file_path('file_path.csv')
        gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                     file_path=file_path)

        # Act
        gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

        # Assert
        self.assertTrue(gcsfs_args_queued)
    def test_create_direct_ingest_process_job_task_gcsfs_args(
            self, mock_client, mock_uuid, mock_datetime):
        """Creating a process-job task from GcsfsIngestArgs looks up the
        region's shared queue and submits the expected task."""
        # Arrange
        ingest_args = GcsfsIngestArgs(
            datetime.datetime(year=2019, month=7, day=20),
            file_path=to_normalized_unprocessed_file_path('file_path.csv'))
        expected_body = json.dumps({
            'ingest_args': ingest_args.to_serializable(),
            'args_type': 'GcsfsIngestArgs',
        }).encode()

        # Pin the uuid and the date reported by the patched modules.
        fake_uuid = 'random-uuid'
        mock_uuid.uuid4.return_value = fake_uuid
        date_str = '2019-07-20'
        mock_datetime.date.today.return_value = date_str
        expected_queue_path = _REGION.shared_queue + '-path'

        task_suffix = f'/{_REGION.region_code}-{date_str}-{fake_uuid}'
        expected_task_name = _REGION.shared_queue + task_suffix
        request = {
            'relative_uri':
            '/direct/process_job?region={}'.format(_REGION.region_code),
            'body': expected_body,
        }
        expected_task = tasks.types.Task(
            name=expected_task_name,
            app_engine_http_request=request)

        client = mock_client.return_value
        client.task_path.return_value = expected_task_name
        client.queue_path.return_value = expected_queue_path

        # Act
        task_manager = DirectIngestCloudTaskManagerImpl()
        task_manager.create_direct_ingest_process_job_task(
            _REGION, ingest_args)

        # Assert
        client.queue_path.assert_called_with(
            metadata.project_id(), metadata.region(), _REGION.shared_queue)
        client.create_task.assert_called_with(
            expected_queue_path, expected_task)
 def json_to_cloud_task_args(json_data: dict) -> Optional[CloudTaskArgs]:
     """Rebuilds a cloud-task args object from its JSON dictionary form.

     Returns None when either the "cloud_task_args" or "args_type" key is
     missing, or when "args_type" names a class this function does not
     handle (in which case an error is also logged).
     """
     if "cloud_task_args" not in json_data or "args_type" not in json_data:
         return None

     type_name = json_data["args_type"]
     serialized_args = json_data["cloud_task_args"]

     # Candidates are tried in this order; the first class whose name
     # matches does the deserialization.
     supported_arg_classes = (
         GcsfsIngestArgs,
         GcsfsRawDataBQImportArgs,
         GcsfsIngestViewExportArgs,
     )
     for args_cls in supported_arg_classes:
         if type_name == args_cls.__name__:
             return args_cls.from_serializable(serialized_args)

     logging.error("Unexpected args_type in json_data: %s", type_name)
     return None
    def test_info_tasks_both_instances(self) -> None:
        """With one task queued per ingest instance, the args are reported
        as queued and every instance reports tasks."""
        # Arrange
        queued_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=self.ingest_view_file_path,
        )

        # One unrelated task, followed by one task per ingest instance.
        task_names = ["projects/path/to/random_task"]
        for ingest_instance in DirectIngestInstance:
            instance_task_id = _build_task_id(
                _REGION.region_code,
                ingest_instance,
                queued_args.task_id_tag(),
            )
            task_names.append(f"projects/path/to/{instance_task_id}")

        info = ProcessIngestJobCloudTaskQueueInfo(
            queue_name="queue_name", task_names=task_names)

        # Separate args object for the same file, created after the queue
        # info was assembled.
        lookup_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=self.ingest_view_file_path,
        )

        # Act
        is_queued = info.is_task_queued(_REGION, lookup_args)

        # Assert
        self.assertTrue(is_queued)
        region_code = _REGION.region_code
        for ingest_instance in DirectIngestInstance:
            self.assertTrue(
                info.tasks_for_instance(region_code, ingest_instance))
    def test_parse_args(self):
        """Args rebuilt from their attrs dict, via a class looked up by
        module and class name, equal the original args."""
        original_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path='/foo/bar',
        )

        # Capture everything needed to rebuild the object: where its class
        # lives, what it is called, and its field values.
        class_name = original_args.__class__.__name__
        module_name = original_args.__module__
        field_values = attr.asdict(original_args)

        # Resolve the class again from the names alone and re-instantiate it.
        rebuilt_cls = getattr(importlib.import_module(module_name),
                              class_name)
        rebuilt_args = rebuilt_cls(**field_values)

        self.assertEqual(original_args, rebuilt_args)
    def is_task_queued(self, region: Region,
                       ingest_args: GcsfsIngestArgs) -> bool:
        """Reports whether a task matching |ingest_args| is in this queue.

        The ingest instance is derived from the bucket of the args' file
        path, and tasks are matched on a task id prefix built from the
        region code, that instance and the args' task id tag.
        """
        file_bucket = ingest_args.file_path.bucket_path
        prefix = _build_task_id(
            region.region_code,
            DirectIngestInstance.for_ingest_bucket(file_bucket),
            ingest_args.task_id_tag(),
            prefix_only=True,
        )

        # Only the first match matters; None stands in for "no match".
        first_match = next(self._tasks_for_prefix(prefix), None)
        return bool(first_match)
    def get_next_job_args(
            self,
            date_str: Optional[str] = None) -> Optional[GcsfsIngestArgs]:
        """Builds the args for the next job to run, or returns None when
        there is no valid unprocessed file to pick up.

        Args:
            date_str: (string) When provided, it is passed along to the
                lookup of the next valid unprocessed file path so that only
                files for that date are considered.
        """
        candidate_path = self._get_next_valid_unprocessed_file_path(date_str)
        if candidate_path:
            return GcsfsIngestArgs(
                ingest_time=datetime.datetime.utcnow(),
                file_path=candidate_path,
            )
        return None
 def json_to_cloud_task_args(json_data: dict):
     """Rebuilds a cloud-task args object from its JSON dictionary form.

     Returns None when either the 'cloud_task_args' or 'args_type' key is
     missing, or when 'args_type' names a class this function does not
     handle (in which case an error is also logged).
     """
     if 'cloud_task_args' not in json_data or 'args_type' not in json_data:
         return None

     type_name = json_data['args_type']
     serialized_args = json_data['cloud_task_args']

     # Candidates are tried in this order; the first class whose name
     # matches does the deserialization.
     supported_arg_classes = (
         IngestArgs,
         GcsfsIngestArgs,
         GcsfsRawDataBQImportArgs,
         GcsfsIngestViewExportArgs,
     )
     for args_cls in supported_arg_classes:
         if type_name == args_cls.__name__:
             return args_cls.from_serializable(serialized_args)

     logging.error('Unexpected args_type in json_data: %s', type_name)
     return None
def ingest_args_for_fixture_file(
    controller: BaseDirectIngestController,
    filename: str,
    should_normalize: bool = True,
) -> GcsfsIngestArgs:
    """Builds ingest args, stamped with the current time, for an ingest
    view fixture file.

    Raises:
        ValueError: if the resolved fixture path is not a GcsfsFilePath.
    """
    fixture_path = path_for_fixture_file(
        controller,
        filename,
        should_normalize,
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    )
    if isinstance(fixture_path, GcsfsFilePath):
        return GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=fixture_path,
        )
    raise ValueError(f"Unexpected type [{fixture_path}]")
Example #19
0
    def test_serialize_gcsfs_ingest_args(self):
        """A datetime and a GcsfsIngestArgs object both come back equal
        after a serialization round trip."""
        now = datetime.datetime.now()

        str_now = datetime_to_serializable(now)
        now_converted = serializable_to_datetime(str_now)

        # assertEqual rather than assertTrue: assertTrue(now, now_converted)
        # treats the second argument as the failure message and passes for
        # any truthy `now`, so the two values were never compared.
        self.assertEqual(now, now_converted)

        args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path='foo/bar.csv',
        )

        # Round-trip the args through a JSON-encoded byte string.
        args_dict = attr_to_json_dict(args)
        serialized = json.dumps(args_dict).encode()
        args_dict = json.loads(serialized)
        result_args = attr_from_json_dict(args_dict)
        self.assertEqual(args, result_args)
    def test_is_task_queued_no_tasks(self):
        """With no task names in the queue info, neither basic nor GCSFS
        ingest args are reported as queued."""
        # Arrange
        empty_queue = CloudTaskQueueInfo(queue_name='queue_name',
                                         task_names=[])

        normalized_path = to_normalized_unprocessed_file_path('file_path.csv')
        basic_args = IngestArgs(ingest_time=datetime.datetime.now())
        gcsfs_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=normalized_path)

        # Act
        basic_queued = empty_queue.is_task_queued(_REGION, basic_args)
        gcsfs_queued = empty_queue.is_task_queued(_REGION, gcsfs_args)

        # Assert
        for queued in (basic_queued, gcsfs_queued):
            self.assertFalse(queued)

        # Asking a second time gives the same answer.
        self.assertFalse(empty_queue.is_task_queued(_REGION, gcsfs_args))
    def create_direct_ingest_process_job_task(
        self,
        region: Region,
        ingest_instance: DirectIngestInstance,
        ingest_args: GcsfsIngestArgs,
    ) -> None:
        """Records a process job task in `process_job_tasks` without
        running it.

        The `region` argument is not consulted here; the region code used
        in the task id comes from the controller.

        Raises:
            ValueError: if no controller has been set.
        """
        controller = self.controller
        if not controller:
            raise ValueError(
                "Controller is null - did you call set_controller()?")

        new_task_id = _build_task_id(
            controller.region.region_code,
            ingest_instance,
            ingest_args.task_id_tag(),
        )
        # Each entry pairs the full task name with the args it was built from.
        queued_entry = (f"projects/path/to/{new_task_id}", ingest_args)
        self.process_job_tasks.append(queued_entry)
    def test_info_no_tasks(self) -> None:
        """With no task names in the queue info, nothing is reported as
        queued and no ingest instance reports tasks."""
        # Arrange
        empty_queue = ProcessIngestJobCloudTaskQueueInfo(
            queue_name="queue_name", task_names=[])
        lookup_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=self.ingest_view_file_path,
        )

        # Act
        is_queued = empty_queue.is_task_queued(_REGION, lookup_args)

        # Assert
        self.assertFalse(is_queued)

        # Asking a second time gives the same answer.
        self.assertFalse(empty_queue.is_task_queued(_REGION, lookup_args))
        region_code = _REGION.region_code
        for ingest_instance in DirectIngestInstance:
            self.assertFalse(
                empty_queue.tasks_for_instance(region_code, ingest_instance))
Example #23
0
    def test_is_task_queued_no_tasks(self):
        """With no task names in the queue info, neither basic nor GCSFS
        ingest args are reported as queued."""
        # Arrange
        empty_queue = ProcessIngestJobCloudTaskQueueInfo(
            queue_name='queue_name', task_names=[])

        normalized_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        basic_args = IngestArgs(ingest_time=datetime.datetime.now())
        gcsfs_args = GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(normalized_path))

        # Act
        basic_queued = empty_queue.is_task_queued(_REGION, basic_args)
        gcsfs_queued = empty_queue.is_task_queued(_REGION, gcsfs_args)

        # Assert
        for queued in (basic_queued, gcsfs_queued):
            self.assertFalse(queued)

        # Asking a second time gives the same answer.
        self.assertFalse(empty_queue.is_task_queued(_REGION, gcsfs_args))