def test_with_incremental_substream(self, _):
    """Verify substream records never set currently_syncing.

    currently_syncing is pointless for substreams: resumption always
    goes through the parent stream, so we can't pick up mid-substream
    and therefore never mark one as in flight.
    """
    mock_substream = MagicMock(
        is_valid_incremental=True,
        replication_key="modified_at",
    )
    result = handle_record(
        "foo",
        record={"bar": "biz", "modified_at": "2020-01-01"},
        stream_def=mock_substream,
        stream_version=1,
        state={"foo": "bar"},
    )
    expected = {
        # Bookmark is still advanced off the replication key...
        "bookmarks": {"foo": {"modified_at": "2020-01-01"}},
        "foo": "bar",
        # ...but note: no "currently_syncing" entry.
    }
    self.assertDictEqual(result, expected)
def test_with_incremental_stream(self):
    """Verify an incremental stream sets currently_syncing and its bookmark."""
    mock_stream = MagicMock(
        is_valid_incremental=True,
        replication_key="modified_at",
    )
    result = handle_record(
        "foo",
        record={"bar": "biz", "modified_at": "2020-01-01"},
        stream_def=mock_stream,
        stream_version=1,
        state={"foo": "bar"},
    )
    expected = {
        "currently_syncing": "foo",
        "bookmarks": {"foo": {"modified_at": "2020-01-01"}},
        "foo": "bar",
    }
    self.assertDictEqual(result, expected)
def test_with_full_table_stream(self):
    """Verify a full-table stream sets currently_syncing but writes no bookmark."""
    mock_stream = MagicMock(is_valid_incremental=False, replication_key=None)
    result = handle_record(
        "foo",
        record={"bar": "biz"},
        stream_def=mock_stream,
        stream_version=1234,
        state={"foo": "bar"},
    )
    expected = {"currently_syncing": "foo", "foo": "bar"}
    self.assertDictEqual(result, expected)
def test_with_missing_replication_key(self):
    """Verify bookmarks are untouched when the record lacks the replication key.

    The stream is incremental ("modified_at"), but the record has no
    such field, so only currently_syncing should be written.
    """
    mock_stream = MagicMock(
        is_valid_incremental=True,
        replication_key="modified_at",
    )
    result = handle_record(
        "foo",
        record={"bar": "biz"},  # deliberately missing "modified_at"
        stream_def=mock_stream,
        stream_version=1,
        state={"foo": "bar"},
    )
    expected = {
        "currently_syncing": "foo",
        "foo": "bar",
    }
    self.assertDictEqual(result, expected)
def process_stream(
    stream_def: Union[Stream, ResponseSubstream, EndpointSubstream],
    stream_version: Optional[int],
    state: Dict[str, Any],
    json_message: Dict[str, Any],
    filter_datetime: "datetime",
) -> None:
    """Process one consumed message for a stream and emit updated state.

    Derives the tap stream id from the message's "object" field
    (underscored, then pluralized), filters the record against
    ``filter_datetime``, runs ``handle_record`` for the record (and, for
    ``Stream`` definitions, for each selected ``ResponseSubstream``'s
    sub-records plus the transformed parent records), then writes the
    resulting state via ``write_state``.

    Args:
        stream_def: Stream/substream definition driving selection,
            transformation and substream handling.
        stream_version: version forwarded to ``handle_record``.
        state: current tap state; threaded through ``handle_record`` calls.
        json_message: consumed message; must contain "object" and "record".
        filter_datetime: cutoff used by ``filter_record`` and sub-record sync.

    Returns:
        None. State is emitted as a side effect via ``write_state``.
    """
    LOGGER.info("Message: %s", json.dumps(json_message))
    stream_id = pluralize(underscore(json_message["object"]))
    record = json_message["record"]
    # Filter based off of the message timestamp or
    # the replication key?
    if filter_record(
        record,
        DataContext(tap_stream_id=stream_id, stream=stream_def, filter_datetime=filter_datetime),
    ):
        # Record is filtered out entirely — note that state is NOT
        # written in this early-return path.
        return None
    # State/bookmark bookkeeping happens even before the selection
    # check below — only the per-substream/transform work is gated
    # on is_selected.
    state = handle_record(stream_id, record, stream_def, stream_version, state)
    # Make sure stream is selected for record to print
    if stream_def.is_selected:
        if isinstance(stream_def, Stream):
            for substream in stream_def.substreams:
                # Can't handle EndpointSubstream's like this -
                # I'm assuming the producer is pushing the data
                # in a similar way to the API?
                if not substream.is_selected:
                    continue
                if not isinstance(substream, ResponseSubstream):
                    continue
                # .sync_sub_records performs transformations, so not necessary
                # to invoke ourselves here
                for tap_substream_id, sub_record in stream_def.sync_sub_records(
                        substream, record, filter_datetime):
                    # NOTE(review): the PARENT stream_def is passed here, not
                    # `substream`. If handle_record distinguishes substreams by
                    # the stream_def type (the substream unit test suggests it
                    # skips currently_syncing for them), this may be wrong —
                    # confirm against handle_record's implementation.
                    state = handle_record(tap_substream_id, sub_record, stream_def,
                                          stream_version, state)
            with stream_def.transformer_class() as transformer:
                # NOTE(review): `record` (the loop variable) shadows the
                # incoming message record from this point on.
                for record in transformer.transform(
                    record,
                    stream_def.schema_dict,
                    context=DataContext(
                        stream=stream_def,
                        filter_datetime=filter_datetime,
                        tap_stream_id=stream_id,
                    ),
                    metadata=stream_def.mapped_metadata,
                ):
                    state = handle_record(stream_id, record, stream_def, stream_version, state)
        elif isinstance(stream_def, EndpointSubstream):
            # This assumes the data being consumed is akin to
            # the API. As in - /customer/<id>/notes is separated
            # into its own individual message
            context = DataContext(
                tap_stream_id=stream_def.tap_stream_id,
                stream=stream_def,
                filter_datetime=filter_datetime,
            )
            with stream_def.transformer_class() as transformer:
                records = transformer.transform(
                    record,
                    stream_def.schema_dict,
                    context=context,
                    metadata=stream_def.mapped_metadata,
                )
                # NOTE(review): records are keyed under `stream_id` (derived
                # from the message "object") rather than
                # stream_def.tap_stream_id used in the context above —
                # confirm this asymmetry is intentional.
                for record in records:
                    state = handle_record(stream_id, record, stream_def, stream_version, state)
    # Flush whatever state accumulated (also runs for unselected streams,
    # since handle_record above already ran).
    write_state(state)
    return None