def test_is_task_queued_has_tasks(self):
    """Checks that is_task_queued() is truthy when the queue info lists a
    task whose name was built from the args' task id tag."""

    def _new_ingest_view_args():
        # Every call normalizes the path again and stamps a fresh ingest_time.
        normalized_path = to_normalized_unprocessed_file_path(
            'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
        return GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(normalized_path))

    # Arrange
    queued_args = _new_ingest_view_args()
    queued_task_id = _build_task_id(
        _REGION.region_code, queued_args.task_id_tag())
    queue_info = ProcessIngestJobCloudTaskQueueInfo(
        queue_name='queue_name',
        task_names=[
            'projects/path/to/random_task',
            f'projects/path/to/{queued_task_id}',
        ])
    # The lookup is done with a second, independently constructed args object.
    lookup_args = _new_ingest_view_args()

    # Act
    is_queued = queue_info.is_task_queued(_REGION, lookup_args)

    # Assert
    self.assertTrue(is_queued)
def test_info_single_task(self) -> None:
    """Checks queue info holding one PRIMARY-instance task for the region:
    the task is reported as queued, PRIMARY has tasks and SECONDARY has none.
    """

    def _new_args():
        # Fresh args object (and fresh ingest_time) for the same file path.
        return GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=self.ingest_view_file_path,
        )

    # Arrange
    queued_args = _new_args()
    primary_task_id = _build_task_id(
        _REGION.region_code,
        DirectIngestInstance.PRIMARY,
        queued_args.task_id_tag(),
    )
    queue_info = ProcessIngestJobCloudTaskQueueInfo(
        queue_name="queue_name",
        task_names=[
            "projects/path/to/random_task",
            f"projects/path/to/{primary_task_id}",
        ],
    )
    # The lookup uses a second, separately built args object.
    lookup_args = _new_args()

    # Act
    is_queued = queue_info.is_task_queued(_REGION, lookup_args)

    # Assert
    self.assertTrue(is_queued)
    primary_tasks = queue_info.tasks_for_instance(
        _REGION.region_code, DirectIngestInstance.PRIMARY)
    self.assertTrue(primary_tasks)
    secondary_tasks = queue_info.tasks_for_instance(
        _REGION.region_code, DirectIngestInstance.SECONDARY)
    self.assertFalse(secondary_tasks)
def test_create_direct_ingest_process_job_task_secondary(
    self, mock_client: mock.MagicMock, mock_uuid: mock.MagicMock
) -> None:
    """Checks the queue path lookup and the Task passed to create_task when a
    process job task is created for the SECONDARY ingest instance."""
    # Arrange
    normalized_path = to_normalized_unprocessed_file_path(
        "bucket/ingest_view_name.csv",
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    )
    ingest_args = GcsfsIngestArgs(
        datetime.datetime(year=2019, month=7, day=20),
        file_path=GcsfsFilePath.from_absolute_path(normalized_path),
    )
    expected_body = json.dumps(
        {
            "cloud_task_args": ingest_args.to_serializable(),
            "args_type": "GcsfsIngestArgs",
        }
    ).encode()

    fake_uuid = "random-uuid"
    mock_uuid.uuid4.return_value = fake_uuid
    date_str = "2019-07-20"
    queue_path = "us-xx-process-queue-path"
    queue_name = "direct-ingest-state-us-xx-process-job-queue"
    task_name = (
        f"{DIRECT_INGEST_STATE_PROCESS_JOB_QUEUE_V2}/"
        f"{_REGION.region_code}-{date_str}-{fake_uuid}"
    )
    query = urlencode(
        {"region": _REGION.region_code, "file_path": normalized_path})
    expected_task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": f"/direct/process_job?{query}",
            "body": expected_body,
        },
    )

    client = mock_client.return_value
    client.task_path.return_value = task_name
    client.queue_path.return_value = queue_path

    # Act
    task_manager = DirectIngestCloudTaskManagerImpl()
    task_manager.create_direct_ingest_process_job_task(
        _REGION, DirectIngestInstance.SECONDARY, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id,
        QUEUES_REGION,
        queue_name,
    )
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=expected_task)
def create_direct_ingest_process_job_task(
    self,
    region: Region,
    ingest_instance: DirectIngestInstance,
    ingest_args: GcsfsIngestArgs,
) -> None:
    """Creates a task on the process-job queue for the given ingest args.

    The task targets /direct/process_job, with the lower-cased region code
    and the absolute file path sent as query parameters.

    Args:
        region: Region whose region code is used in the task id and URL.
        ingest_instance: Instance used for the task id and to pick the
            queue manager.
        ingest_args: Args supplying the task id tag, the file path and the
            task body.
    """
    full_task_id = _build_task_id(
        region.region_code,
        ingest_instance,
        ingest_args.task_id_tag(),
        prefix_only=False,
    )
    query_string = urlencode(
        {
            "region": region.region_code.lower(),
            "file_path": ingest_args.file_path.abs_path(),
        }
    )
    task_body = self._get_body_from_args(ingest_args)

    queue_manager = self._get_process_job_queue_manager(
        region, ingest_instance)
    queue_manager.create_task(
        task_id=full_task_id,
        relative_uri=f"/direct/process_job?{query_string}",
        body=task_body,
    )
def ingest_args_for_fixture_file(
        controller: GcsfsDirectIngestController,
        filename: str) -> GcsfsIngestArgs:
    """Builds GcsfsIngestArgs for a fixture file.

    The filename is joined onto the controller's ingest directory path and
    the result is passed through to_normalized_unprocessed_file_path().
    The ingest_time is the current local time.
    """
    normalized_path = to_normalized_unprocessed_file_path(
        os.path.join(controller.ingest_directory_path, filename))
    ingest_time = datetime.datetime.now()
    return GcsfsIngestArgs(ingest_time=ingest_time, file_path=normalized_path)
def test_create_direct_ingest_process_job_task_gcsfs_args(
    self,
    mock_client: MagicMock,
    mock_uuid: MagicMock,
    mock_datetime: MagicMock,
) -> None:
    """Checks the queue path lookup and the Task passed to create_task when a
    process job task is created from GcsfsIngestArgs."""
    # Arrange
    normalized_path = to_normalized_unprocessed_file_path(
        "bucket/file_path.csv", GcsfsDirectIngestFileType.INGEST_VIEW)
    ingest_args = GcsfsIngestArgs(
        ingest_time=datetime.datetime(year=2019, month=7, day=20),
        file_path=GcsfsFilePath.from_absolute_path(normalized_path),
    )
    expected_body = json.dumps(
        {
            "cloud_task_args": ingest_args.to_serializable(),
            "args_type": "GcsfsIngestArgs",
        }
    ).encode()

    fake_uuid = "random-uuid"
    mock_uuid.uuid4.return_value = fake_uuid
    date_str = "2019-07-20"
    mock_datetime.date.today.return_value = date_str

    queue_path = f"{_REGION.shared_queue}-path"
    task_name = (
        _REGION.get_queue_name()
        + f"/{_REGION.region_code}-{date_str}-{fake_uuid}"
    )
    expected_task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": f"/direct/process_job?region={_REGION.region_code}",
            "body": expected_body,
        },
    )

    client = mock_client.return_value
    client.task_path.return_value = task_name
    client.queue_path.return_value = queue_path

    # Act
    task_manager = DirectIngestCloudTaskManagerImpl()
    task_manager.create_direct_ingest_process_job_task(_REGION, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, _REGION.shared_queue)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=expected_task)
def test_create_direct_ingest_process_job_task_gcsfs_args(
        self, mock_client, mock_uuid, mock_datetime):
    """Checks the queue path lookup and the Task passed to create_task when a
    manager built with an explicit project id creates a process job task."""
    # Arrange
    project_id = 'recidiviz-456'
    normalized_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    ingest_args = GcsfsIngestArgs(
        ingest_time=datetime.datetime(year=2019, month=7, day=20),
        file_path=GcsfsFilePath.from_absolute_path(normalized_path))
    expected_body = json.dumps({
        'cloud_task_args': ingest_args.to_serializable(),
        'args_type': 'GcsfsIngestArgs',
    }).encode()

    fake_uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = fake_uuid
    date_str = '2019-07-20'
    mock_datetime.date.today.return_value = date_str

    queue_path = _REGION.shared_queue + '-path'
    task_name = (
        _REGION.shared_queue
        + f'/{_REGION.region_code}-{date_str}-{fake_uuid}')
    expected_task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            'http_method': 'POST',
            'relative_uri':
                f'/direct/process_job?region={_REGION.region_code}',
            'body': expected_body,
        })

    client = mock_client.return_value
    client.task_path.return_value = task_name
    client.queue_path.return_value = queue_path

    # Act
    task_manager = DirectIngestCloudTaskManagerImpl(project_id=project_id)
    task_manager.create_direct_ingest_process_job_task(_REGION, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        project_id, QUEUES_REGION, _REGION.shared_queue)
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=expected_task)
def json_to_ingest_args(json_data):
    """Rebuilds an ingest args object from its JSON form.

    Expects both an 'ingest_args' and an 'args_type' entry in json_data.
    Returns None when either entry is missing, or (after logging an error)
    when 'args_type' names neither IngestArgs nor GcsfsIngestArgs.
    """
    if 'ingest_args' not in json_data or 'args_type' not in json_data:
        return None

    type_name = json_data['args_type']
    serialized_args = json_data['ingest_args']

    # Try each known args class in turn, matching on its class name.
    for args_cls in (IngestArgs, GcsfsIngestArgs):
        if type_name == args_cls.__name__:
            return args_cls.from_serializable(serialized_args)

    logging.error('Unexpected args_type in json_data: %s', type_name)
    return None
def ingest_args_for_fixture_file(
        controller: GcsfsDirectIngestController,
        filename: str,
        should_normalize: bool = True) -> GcsfsIngestArgs:
    """Builds GcsfsIngestArgs for a fixture file.

    The path comes from path_for_fixture_file(); the ingest_time is the
    current local time.

    Raises:
        ValueError: if path_for_fixture_file() returns something other than
            a GcsfsFilePath.
    """
    fixture_path = path_for_fixture_file(controller, filename, should_normalize)
    if isinstance(fixture_path, GcsfsFilePath):
        return GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=fixture_path,
        )
    # Note: the message interpolates the returned value, not its type.
    raise ValueError(f'Unexpected type [{fixture_path}]')
def test_is_task_queued_has_tasks(self):
    """Checks that is_task_queued() is truthy when the queue info lists a
    task whose name was built from the args' task id tag."""
    # Arrange
    file_path = to_normalized_unprocessed_file_path('file_path.csv')
    gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                 file_path=file_path)

    full_task_name = \
        _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
    info = CloudTaskQueueInfo(
        queue_name='queue_name',
        task_names=[
            # Plain literal: the original carried an f-prefix with no
            # placeholders, which had no effect.
            'projects/path/to/random_task',
            f'projects/path/to/{full_task_name}',
        ])

    # Rebuild the path and args so the lookup uses a separately constructed
    # object with its own ingest_time.
    file_path = to_normalized_unprocessed_file_path('file_path.csv')
    gcsfs_args = GcsfsIngestArgs(ingest_time=datetime.datetime.now(),
                                 file_path=file_path)

    # Act
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertTrue(gcsfs_args_queued)
def test_create_direct_ingest_process_job_task_gcsfs_args(
        self, mock_client, mock_uuid, mock_datetime):
    """Checks the queue path lookup and the Task passed positionally to
    create_task when a process job task is created from GcsfsIngestArgs."""
    # Arrange
    ingest_time = datetime.datetime(year=2019, month=7, day=20)
    normalized_path = to_normalized_unprocessed_file_path('file_path.csv')
    ingest_args = GcsfsIngestArgs(ingest_time, file_path=normalized_path)
    expected_body = json.dumps({
        'ingest_args': ingest_args.to_serializable(),
        'args_type': 'GcsfsIngestArgs',
    }).encode()

    fake_uuid = 'random-uuid'
    mock_uuid.uuid4.return_value = fake_uuid
    date_str = '2019-07-20'
    mock_datetime.date.today.return_value = date_str

    queue_path = _REGION.shared_queue + '-path'
    task_name = (
        _REGION.shared_queue
        + f'/{_REGION.region_code}-{date_str}-{fake_uuid}')
    expected_task = tasks.types.Task(
        name=task_name,
        app_engine_http_request={
            'relative_uri':
                f'/direct/process_job?region={_REGION.region_code}',
            'body': expected_body,
        })

    client = mock_client.return_value
    client.task_path.return_value = task_name
    client.queue_path.return_value = queue_path

    # Act
    task_manager = DirectIngestCloudTaskManagerImpl()
    task_manager.create_direct_ingest_process_job_task(_REGION, ingest_args)

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        metadata.project_id(), metadata.region(), _REGION.shared_queue)
    mock_client.return_value.create_task.assert_called_with(
        queue_path, expected_task)
def json_to_cloud_task_args(json_data: dict) -> Optional[CloudTaskArgs]:
    """Rebuilds a cloud task args object from its JSON form.

    Expects both a "cloud_task_args" and an "args_type" entry in json_data.
    Returns None when either entry is missing, or (after logging an error)
    when "args_type" does not name one of the supported args classes.
    """
    if "cloud_task_args" not in json_data or "args_type" not in json_data:
        return None

    type_name = json_data["args_type"]
    serialized_args = json_data["cloud_task_args"]

    # Try each supported args class in turn, matching on its class name.
    supported_classes = (
        GcsfsIngestArgs,
        GcsfsRawDataBQImportArgs,
        GcsfsIngestViewExportArgs,
    )
    for args_cls in supported_classes:
        if type_name == args_cls.__name__:
            return args_cls.from_serializable(serialized_args)

    logging.error("Unexpected args_type in json_data: %s", type_name)
    return None
def test_info_tasks_both_instances(self) -> None:
    """Checks queue info holding one task per DirectIngestInstance: the task
    is reported as queued and every instance reports tasks."""

    def _new_args():
        # Fresh args object (and fresh ingest_time) for the same file path.
        return GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=self.ingest_view_file_path,
        )

    # Arrange
    queued_args = _new_args()
    task_ids = []
    for instance in DirectIngestInstance:
        task_ids.append(
            _build_task_id(
                _REGION.region_code,
                instance,
                queued_args.task_id_tag(),
            )
        )

    task_names = ["projects/path/to/random_task"]
    task_names.extend(f"projects/path/to/{task_id}" for task_id in task_ids)
    queue_info = ProcessIngestJobCloudTaskQueueInfo(
        queue_name="queue_name",
        task_names=task_names,
    )
    # The lookup uses a second, separately built args object.
    lookup_args = _new_args()

    # Act
    is_queued = queue_info.is_task_queued(_REGION, lookup_args)

    # Assert
    self.assertTrue(is_queued)
    for instance in DirectIngestInstance:
        instance_tasks = queue_info.tasks_for_instance(
            _REGION.region_code, instance)
        self.assertTrue(instance_tasks)
def test_parse_args(self):
    """Checks that args can be rebuilt from their attr.asdict() form using a
    class looked up dynamically by module and class name."""
    original_args = GcsfsIngestArgs(
        ingest_time=datetime.datetime.now(),
        file_path='/foo/bar',
    )

    # Capture what is needed to locate the class and rebuild the object.
    class_name = original_args.__class__.__name__
    module_name = original_args.__module__
    field_values = attr.asdict(original_args)

    # Resolve the class by name and construct a new instance from the dict.
    resolved_class = getattr(importlib.import_module(module_name), class_name)
    rebuilt_args = resolved_class(**field_values)

    self.assertEqual(original_args, rebuilt_args)
def is_task_queued(self, region: Region, ingest_args: GcsfsIngestArgs) -> bool:
    """Reports whether a task matching the given ingest args is in the queue.

    A task id prefix is built from the region code, the ingest instance
    derived from the args' bucket path, and the args' task id tag. The result
    is the truthiness of the first item yielded by _tasks_for_prefix() for
    that prefix, or False when nothing is yielded.
    """
    ingest_instance = DirectIngestInstance.for_ingest_bucket(
        ingest_args.file_path.bucket_path)
    prefix = _build_task_id(
        region.region_code,
        ingest_instance,
        ingest_args.task_id_tag(),
        prefix_only=True,
    )
    # Only the first match is consumed from the iterator.
    first_match = next(self._tasks_for_prefix(prefix), None)
    return bool(first_match)
def get_next_job_args(
        self,
        date_str: Optional[str] = None) -> Optional[GcsfsIngestArgs]:
    """Builds args for the next job to run, if there is a file to process.

    Args:
        date_str: Passed through to
            _get_next_valid_unprocessed_file_path(); None by default.

    Returns:
        GcsfsIngestArgs for the next file path, with ingest_time set to the
        current UTC time (naive datetime), or None when no path is returned.
    """
    candidate_path = self._get_next_valid_unprocessed_file_path(date_str)
    if candidate_path:
        return GcsfsIngestArgs(
            ingest_time=datetime.datetime.utcnow(),
            file_path=candidate_path)
    return None
def json_to_cloud_task_args(json_data: dict):
    """Rebuilds a cloud task args object from its JSON form.

    Expects both a 'cloud_task_args' and an 'args_type' entry in json_data.
    Returns None when either entry is missing, or (after logging an error)
    when 'args_type' does not name one of the supported args classes.
    """
    if 'cloud_task_args' not in json_data or 'args_type' not in json_data:
        return None

    type_name = json_data['args_type']
    serialized_args = json_data['cloud_task_args']

    # Try each supported args class in turn, matching on its class name.
    supported_classes = (
        IngestArgs,
        GcsfsIngestArgs,
        GcsfsRawDataBQImportArgs,
        GcsfsIngestViewExportArgs,
    )
    for args_cls in supported_classes:
        if type_name == args_cls.__name__:
            return args_cls.from_serializable(serialized_args)

    logging.error('Unexpected args_type in json_data: %s', type_name)
    return None
def ingest_args_for_fixture_file(
    controller: BaseDirectIngestController,
    filename: str,
    should_normalize: bool = True,
) -> GcsfsIngestArgs:
    """Builds GcsfsIngestArgs for an ingest view fixture file.

    The path comes from path_for_fixture_file() with the INGEST_VIEW file
    type; the ingest_time is the current local time.

    Raises:
        ValueError: if path_for_fixture_file() returns something other than
            a GcsfsFilePath.
    """
    fixture_path = path_for_fixture_file(
        controller,
        filename,
        should_normalize,
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    )
    if isinstance(fixture_path, GcsfsFilePath):
        return GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=fixture_path,
        )
    # Note: the message interpolates the returned value, not its type.
    raise ValueError(f"Unexpected type [{fixture_path}]")
def test_serialize_gcsfs_ingest_args(self):
    """Checks two round trips: a datetime through its serializable string
    form, and GcsfsIngestArgs through a JSON-encoded attr dict."""
    now = datetime.datetime.now()
    str_now = datetime_to_serializable(now)
    now_converted = serializable_to_datetime(str_now)
    # assertTrue(now, now_converted) treated the second argument as the
    # failure message and passed for any truthy `now`; compare the values so
    # the datetime round trip is actually verified.
    self.assertEqual(now, now_converted)

    args = GcsfsIngestArgs(
        ingest_time=datetime.datetime.now(),
        file_path='foo/bar.csv',
    )

    # Serialize to JSON bytes and back before rebuilding the args object.
    args_dict = attr_to_json_dict(args)
    serialized = json.dumps(args_dict).encode()
    args_dict = json.loads(serialized)
    result_args = attr_from_json_dict(args_dict)
    self.assertEqual(args, result_args)
def test_is_task_queued_no_tasks(self):
    """Checks that is_task_queued() is falsy for both IngestArgs and
    GcsfsIngestArgs when the queue info has no task names."""
    # Arrange
    empty_queue_info = CloudTaskQueueInfo(
        queue_name='queue_name', task_names=[])

    normalized_path = to_normalized_unprocessed_file_path('file_path.csv')
    basic_args = IngestArgs(ingest_time=datetime.datetime.now())
    file_args = GcsfsIngestArgs(
        ingest_time=datetime.datetime.now(),
        file_path=normalized_path)

    # Act
    basic_queued = empty_queue_info.is_task_queued(_REGION, basic_args)
    file_queued = empty_queue_info.is_task_queued(_REGION, file_args)

    # Assert
    self.assertFalse(basic_queued)
    self.assertFalse(file_queued)
    # A repeated lookup with the same args gives the same answer.
    queued_again = empty_queue_info.is_task_queued(_REGION, file_args)
    self.assertFalse(queued_again)
def create_direct_ingest_process_job_task(
    self,
    region: Region,
    ingest_instance: DirectIngestInstance,
    ingest_args: GcsfsIngestArgs,
) -> None:
    """Records a process job task without running it.

    Appends a (task name, ingest args) pair to self.process_job_tasks. The
    task id is built from the controller's region code, not from the
    `region` argument, which this method does not read.

    Raises:
        ValueError: if no controller has been set.
    """
    controller = self.controller
    if not controller:
        raise ValueError(
            "Controller is null - did you call set_controller()?")

    task_id = _build_task_id(
        controller.region.region_code,
        ingest_instance,
        ingest_args.task_id_tag(),
    )
    queued_entry = (f"projects/path/to/{task_id}", ingest_args)
    self.process_job_tasks.append(queued_entry)
def test_info_no_tasks(self) -> None:
    """Checks that queue info with no task names reports nothing queued and
    no tasks for any DirectIngestInstance."""
    # Arrange
    empty_queue_info = ProcessIngestJobCloudTaskQueueInfo(
        queue_name="queue_name", task_names=[])
    lookup_args = GcsfsIngestArgs(
        ingest_time=datetime.datetime.now(),
        file_path=self.ingest_view_file_path,
    )

    # Act
    is_queued = empty_queue_info.is_task_queued(_REGION, lookup_args)

    # Assert
    self.assertFalse(is_queued)
    # A repeated lookup with the same args gives the same answer.
    queued_again = empty_queue_info.is_task_queued(_REGION, lookup_args)
    self.assertFalse(queued_again)
    for ingest_instance in DirectIngestInstance:
        instance_tasks = empty_queue_info.tasks_for_instance(
            _REGION.region_code, ingest_instance)
        self.assertFalse(instance_tasks)
def test_is_task_queued_no_tasks(self):
    """Checks that is_task_queued() is falsy for both IngestArgs and
    GcsfsIngestArgs when the queue info has no task names."""
    # Arrange
    empty_queue_info = ProcessIngestJobCloudTaskQueueInfo(
        queue_name='queue_name', task_names=[])

    normalized_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    basic_args = IngestArgs(ingest_time=datetime.datetime.now())
    file_args = GcsfsIngestArgs(
        ingest_time=datetime.datetime.now(),
        file_path=GcsfsFilePath.from_absolute_path(normalized_path))

    # Act
    basic_queued = empty_queue_info.is_task_queued(_REGION, basic_args)
    file_queued = empty_queue_info.is_task_queued(_REGION, file_args)

    # Assert
    self.assertFalse(basic_queued)
    self.assertFalse(file_queued)
    # A repeated lookup with the same args gives the same answer.
    queued_again = empty_queue_info.is_task_queued(_REGION, file_args)
    self.assertFalse(queued_again)