def test_serde_idempotence(state_obj):
    """
    Round-trips a Checkpoint through its aspect representation and checks that
    the reconstructed object compares equal to the one that was serialized.
    """
    # Step 1: build the checkpoint that will be serialized.
    original = Checkpoint(
        job_name=test_job_name,
        pipeline_name=test_pipeline_name,
        platform_instance_id=test_platform_instance_id,
        run_id=test_run_id,
        config=test_source_config,
        state=state_obj,
    )

    # Step 2: serialize it into the aspect form; a None result fails the test.
    aspect = original.to_checkpoint_aspect(
        # fmt: off
        max_allowed_state_size=2**20
        # fmt: on
    )
    assert aspect is not None

    # Step 3: deserialize the aspect and compare against the starting object.
    round_tripped = Checkpoint.create_from_checkpoint_aspect(
        job_name=test_job_name,
        checkpoint_aspect=aspect,
        state_class=type(state_obj),
        config_class=MySQLConfig,
    )
    assert original == round_tripped
def test_create_from_checkpoint_aspect(state_obj):
    """
    Tests the Checkpoint class API 'create_from_checkpoint_aspect' with the
    state_obj parameter as the state.

    A raw checkpoint aspect is assembled by hand from state_obj and the
    module-level test constants, converted into a Checkpoint, and compared
    against a Checkpoint constructed directly from the same values.
    """
    # Imported here so the timezone-aware timestamp below does not depend on
    # the module-level import block.
    from datetime import timezone

    # 1. Construct the raw aspect object with the state
    checkpoint_state = IngestionCheckpointStateClass(
        formatVersion=state_obj.version,
        serde=state_obj.serde,
        payload=state_obj.to_bytes(),
    )
    checkpoint_aspect = DatahubIngestionCheckpointClass(
        # Use an aware UTC datetime: `datetime.utcnow()` is naive, and
        # `.timestamp()` interprets naive values as *local* time, which skews
        # the epoch value by the host's UTC offset.
        timestampMillis=int(datetime.now(tz=timezone.utc).timestamp() * 1000),
        pipelineName=test_pipeline_name,
        platformInstanceId=test_platform_instance_id,
        config=test_source_config.json(),
        state=checkpoint_state,
        runId=test_run_id,
    )

    # 2. Create the checkpoint from the raw checkpoint aspect and validate.
    checkpoint_obj = Checkpoint.create_from_checkpoint_aspect(
        job_name=test_job_name,
        checkpoint_aspect=checkpoint_aspect,
        state_class=type(state_obj),
        config_class=MySQLConfig,
    )
    expected_checkpoint_obj = Checkpoint(
        job_name=test_job_name,
        pipeline_name=test_pipeline_name,
        platform_instance_id=test_platform_instance_id,
        run_id=test_run_id,
        config=test_source_config,
        state=state_obj,
    )
    assert checkpoint_obj == expected_checkpoint_obj
def create_checkpoint(self, job_id: JobId) -> Optional[Checkpoint]:
    """
    Build a new checkpoint for the given job, carrying a default-constructed
    BaseSQLAlchemyCheckpointState.

    Returns None when job_id is not the default ingestion job id.
    """
    assert self.ctx.pipeline_name is not None

    # Only the default ingestion job gets a checkpoint from this method.
    is_default_job = job_id == self.get_default_ingestion_job_id()
    if not is_default_job:
        return None

    return Checkpoint(
        job_name=job_id,
        pipeline_name=self.ctx.pipeline_name,
        platform_instance_id=self.get_platform_instance_id(),
        run_id=self.ctx.run_id,
        config=self.config,
        state=BaseSQLAlchemyCheckpointState(),
    )
def create_checkpoint(self, job_id: JobId) -> Optional[Checkpoint]:
    """
    Build a new checkpoint for the given job, carrying a
    BaseUsageCheckpointState whose begin/end values are derived from the
    configured start_time and end_time.

    Returns None when job_id is not the default ingestion job id.
    """
    assert self.ctx.pipeline_name

    # Only the default ingestion job gets a checkpoint from this method.
    is_default_job = job_id == self.get_default_ingestion_job_id()
    if not is_default_job:
        return None

    def _as_millis(moment) -> int:
        # Scale the value of `.timestamp()` by 1000 and truncate to an int.
        return int(moment.timestamp() * 1000)

    return Checkpoint(
        job_name=job_id,
        pipeline_name=self.ctx.pipeline_name,
        platform_instance_id=self.get_platform_instance_id(),
        run_id=self.ctx.run_id,
        config=self.config,
        state=BaseUsageCheckpointState(
            begin_timestamp_millis=_as_millis(self.config.start_time),
            end_timestamp_millis=_as_millis(self.config.end_time),
        ),
    )
def _get_last_checkpoint(
    self, job_id: JobId, checkpoint_state_class: Type[CheckpointStateBase]
) -> Optional[Checkpoint]:
    """
    Template-method implementation that looks up the last checkpoint for a job.

    Returns None when stateful ingestion is not configured. Otherwise the
    latest checkpoint aspect is requested from the checkpointing state
    provider and handed to Checkpoint.create_from_checkpoint_aspect, whose
    result is returned as-is.
    """
    if not self.is_stateful_ingestion_configured():
        return None

    # Ask the state provider for the most recent checkpoint aspect of this job.
    state_provider = self.ingestion_checkpointing_state_provider
    latest_aspect = state_provider.get_latest_checkpoint(  # type: ignore
        pipeline_name=self.ctx.pipeline_name,  # type: ignore
        platform_instance_id=self.get_platform_instance_id(),
        job_name=job_id,
    )

    # Turn the raw aspect into a first-class Checkpoint object.
    return Checkpoint.create_from_checkpoint_aspect(
        job_name=job_id,
        checkpoint_aspect=latest_aspect,
        config_class=self.source_config_type,
        state_class=checkpoint_state_class,
    )
def test_provider(self):
    """
    Commits two job checkpoints through the provider, reads the last state
    back, and checks that each retrieved checkpoint equals what was committed.
    """
    # 1. Build one checkpoint per job, each with its own kind of state.
    # First job: BaseSQLAlchemyCheckpointState.
    sql_state = BaseSQLAlchemyCheckpointState()
    sql_checkpoint = Checkpoint(
        job_name=self.job_names[0],
        pipeline_name=self.pipeline_name,
        platform_instance_id=self.platform_instance_id,
        run_id=self.run_id,
        config=MySQLConfig(),
        state=sql_state,
    )
    # Second job: BaseUsageCheckpointState.
    usage_state = BaseUsageCheckpointState(
        begin_timestamp_millis=10, end_timestamp_millis=100
    )
    usage_checkpoint = Checkpoint(
        job_name=self.job_names[1],
        pipeline_name=self.pipeline_name,
        platform_instance_id=self.platform_instance_id,
        run_id=self.run_id,
        config=MySQLConfig(),
        state=usage_state,
    )
    # (job name, state object, checkpoint) triples, in commit order.
    expected_jobs = [
        (self.job_names[0], sql_state, sql_checkpoint),
        (self.job_names[1], usage_state, usage_checkpoint),
    ]

    # 2. Populate the provider's state_to_commit.
    # NOTE: state_to_commit accepts only the aspect version of the checkpoint.
    self.provider.state_to_commit = {
        job_name: checkpoint.to_checkpoint_aspect(
            # fmt: off
            max_allowed_state_size=2**20
            # fmt: on
        )
        for job_name, _state, checkpoint in expected_jobs
    }

    # 3. Commit.
    # NOTE: Because of the monkey-patching, this commits the state to the
    # in-memory self.mcps_emitted.
    self.provider.commit()
    self.assertTrue(self.provider.committed)

    # 4. Fetch the last committed state; it must match what was just committed.
    # NOTE: Because of the monkey-patching, this reads from the in-memory
    # self.mcps_emitted.
    last_state: Optional[CheckpointJobStatesMap] = self.provider.get_last_state(
        self.job_state_key
    )
    assert last_state is not None
    self.assertEqual(len(last_state), 2)

    # 5. Rebuild each job's checkpoint from the retrieved aspect and compare
    # it with the checkpoint that was originally committed.
    for job_name, state, checkpoint in expected_jobs:
        self.assertIsNotNone(last_state[job_name])
        restored = Checkpoint.create_from_checkpoint_aspect(
            job_name=job_name,
            checkpoint_aspect=last_state[job_name],
            state_class=type(state),
            config_class=type(checkpoint.config),
        )
        self.assertEqual(restored, checkpoint)