def _read_full_refresh(
    self,
    logger: logging.Logger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    internal_config: InternalConfig,
) -> Iterator[AirbyteMessage]:
    slices = stream_instance.stream_slices(sync_mode=SyncMode.full_refresh, cursor_field=configured_stream.cursor_field)
    logger.debug(f"Processing stream slices for {configured_stream.stream.name}", extra={"stream_slices": slices})
    total_records_counter = 0
    for _slice in slices:
        logger.debug("Processing stream slice", extra={"slice": _slice})
        records = stream_instance.read_records(
            stream_slice=_slice,
            sync_mode=SyncMode.full_refresh,
            cursor_field=configured_stream.cursor_field,
        )
        for record in records:
            yield self._as_airbyte_record(configured_stream.stream.name, record)
            total_records_counter += 1
            if self._limit_reached(internal_config, total_records_counter):
                return
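# Note (hedged): _limit_reached above is assumed to compare the running record count against
# the optional "_limit" entry of the connector config (InternalConfig.limit in airbyte_cdk),
# so a test read can stop early instead of pulling the entire stream.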
def _read_incremental(stream_instance: Stream, stream_state: MutableMapping[str, Any]):
    res = []
    slices = stream_instance.stream_slices(sync_mode=SyncMode.incremental, stream_state=stream_state)
    for _slice in slices:
        records = stream_instance.read_records(sync_mode=SyncMode.incremental, stream_slice=_slice, stream_state=stream_state)
        for record in records:
            res.append(record)
    return res, stream_instance.state
def read_full_refresh(stream_instance: Stream):
    records = []
    slices = stream_instance.stream_slices(sync_mode=SyncMode.full_refresh)
    for _slice in slices:
        records.extend(list(stream_instance.read_records(stream_slice=_slice, sync_mode=SyncMode.full_refresh)))
    return records
def read_full_refresh(stream_instance: Stream):
    res = []
    schema = stream_instance.get_json_schema()
    slices = stream_instance.stream_slices(sync_mode=SyncMode.full_refresh)
    for _slice in slices:
        records = stream_instance.read_records(stream_slice=_slice, sync_mode=SyncMode.full_refresh)
        for record in records:
            stream_instance.transformer.transform(record, schema)
            res.append(record)
    return res
def read_incremental(stream_instance: Stream, stream_state: MutableMapping[str, Any]):
    res = []
    if stream_state and "state" in dir(stream_instance):
        stream_instance.state = stream_state
    slices = stream_instance.stream_slices(sync_mode=SyncMode.incremental, stream_state=stream_state)
    for _slice in slices:
        records = stream_instance.read_records(sync_mode=SyncMode.incremental, stream_slice=_slice, stream_state=stream_state)
        for record in records:
            stream_state = stream_instance.get_updated_state(stream_state, record)
            res.append(record)
    return res, stream_state
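# A minimal sketch of a stream the standalone helpers above could be driven with.
# Hedged: ExampleIncrementalStream, its field names, and its records are illustrative and
# not part of this listing; only the airbyte_cdk classes it subclasses are real.
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams import Stream


class ExampleIncrementalStream(Stream):
    primary_key = "id"
    cursor_field = "updated_at"
    state_checkpoint_interval = 100  # checkpoint state every 100 records within a slice

    def get_json_schema(self) -> Mapping[str, Any]:
        # Inline schema so nothing tries to load a .json sidecar file.
        return {"type": "object", "properties": {"id": {"type": "integer"}, "updated_at": {"type": "string"}}}

    def stream_slices(
        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        # A single slice covering everything; a real stream might slice by day or by parent id.
        return [None]

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        # Hypothetical records; a real stream would query an API here and honor stream_state.
        yield {"id": 1, "updated_at": "2021-01-01T00:00:00Z"}
        yield {"id": 2, "updated_at": "2021-01-02T00:00:00Z"}

    def get_updated_state(
        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
    ) -> MutableMapping[str, Any]:
        # Keep the highest cursor value seen so far.
        latest = latest_record.get(self.cursor_field, "")
        current = (current_stream_state or {}).get(self.cursor_field, "")
        return {self.cursor_field: max(latest, current)}


# With such a stream, the helpers above could be exercised like this:
#     records = read_full_refresh(ExampleIncrementalStream())
#     records, state = read_incremental(ExampleIncrementalStream(), {})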
def _read_full_refresh(
    self, stream_instance: Stream, configured_stream: ConfiguredAirbyteStream, internal_config: InternalConfig
) -> Iterator[AirbyteMessage]:
    slices = stream_instance.stream_slices(sync_mode=SyncMode.full_refresh, cursor_field=configured_stream.cursor_field)
    total_records_counter = 0
    for _slice in slices:
        records = stream_instance.read_records(
            stream_slice=_slice, sync_mode=SyncMode.full_refresh, cursor_field=configured_stream.cursor_field
        )
        for record in records:
            yield self._as_airbyte_record(configured_stream.stream.name, record)
            total_records_counter += 1
            if self._limit_reached(internal_config, total_records_counter):
                return
def _read_full_refresh(
    self, stream_instance: Stream, configured_stream: ConfiguredAirbyteStream
) -> Iterator[AirbyteMessage]:
    slices = stream_instance.stream_slices(sync_mode=SyncMode.full_refresh, cursor_field=configured_stream.cursor_field)
    for _slice in slices:
        records = stream_instance.read_records(
            stream_slice=_slice, sync_mode=SyncMode.full_refresh, cursor_field=configured_stream.cursor_field
        )
        for record in records:
            yield self._as_airbyte_record(configured_stream.stream.name, record)
def _read_stream(
    self,
    logger: AirbyteLogger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    connector_state: MutableMapping[str, Any],
    internal_config: InternalConfig,
) -> Iterator[AirbyteMessage]:
    if internal_config.page_size and isinstance(stream_instance, HttpStream):
        logger.info(f"Setting page size for {stream_instance.name} to {internal_config.page_size}")
        stream_instance.page_size = internal_config.page_size

    use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
    if use_incremental:
        record_iterator = self._read_incremental(logger, stream_instance, configured_stream, connector_state, internal_config)
    else:
        record_iterator = self._read_full_refresh(stream_instance, configured_stream, internal_config)

    record_counter = 0
    stream_name = configured_stream.stream.name
    logger.info(f"Syncing stream: {stream_name}")
    for record in record_iterator:
        if record.type == MessageType.RECORD:
            record_counter += 1
        yield record

    logger.info(f"Read {record_counter} records from {stream_name} stream")
def _read_incremental(
    self,
    logger: AirbyteLogger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    connector_state: MutableMapping[str, Any],
    internal_config: InternalConfig,
) -> Iterator[AirbyteMessage]:
    stream_name = configured_stream.stream.name
    stream_state = connector_state.get(stream_name, {})
    if stream_state:
        logger.info(f"Setting state of {stream_name} stream to {stream_state}")

    checkpoint_interval = stream_instance.state_checkpoint_interval
    slices = stream_instance.stream_slices(
        cursor_field=configured_stream.cursor_field, sync_mode=SyncMode.incremental, stream_state=stream_state
    )
    total_records_counter = 0
    for _slice in slices:
        records = stream_instance.read_records(
            sync_mode=SyncMode.incremental,
            stream_slice=_slice,
            stream_state=stream_state,
            cursor_field=configured_stream.cursor_field or None,
        )
        for record_counter, record_data in enumerate(records, start=1):
            yield self._as_airbyte_record(stream_name, record_data)
            stream_state = stream_instance.get_updated_state(stream_state, record_data)
            if checkpoint_interval and record_counter % checkpoint_interval == 0:
                yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)

            total_records_counter += 1
            # This functionality should ideally live outside of this method,
            # but since state is managed inside this method, we keep track of it here.
            if self._limit_reached(internal_config, total_records_counter):
                # Break from the record loop so we can save state and exit _read_incremental.
                break

        yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)
        if self._limit_reached(internal_config, total_records_counter):
            return
def test_wrapped_primary_key_various_argument(test_input, expected):
    """
    Should always wrap primary key into list of lists.
    """
    wrapped = Stream._wrapped_primary_key(test_input)
    assert wrapped == expected
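# Hedged illustration of (test_input, expected) pairs this test is typically parametrized
# with, assuming the standard "wrap into list of lists" behavior the docstring describes
# (the concrete cases below are not taken from the source above):
#     None                -> None
#     "id"                -> [["id"]]
#     ["id", "name"]      -> [["id"], ["name"]]
#     [["id"], ["name"]]  -> [["id"], ["name"]]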
def _read_incremental(
    self,
    logger: AirbyteLogger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    connector_state: MutableMapping[str, Any],
) -> Iterator[AirbyteMessage]:
    stream_name = configured_stream.stream.name
    stream_state = connector_state.get(stream_name, {})
    if stream_state:
        # Log the per-stream state itself (stream_state is already scoped to this stream).
        logger.info(f"Setting state of {stream_name} stream to {stream_state}")

    checkpoint_interval = stream_instance.state_checkpoint_interval
    slices = stream_instance.stream_slices(
        cursor_field=configured_stream.cursor_field, sync_mode=SyncMode.incremental, stream_state=stream_state
    )
    for _slice in slices:
        record_counter = 0
        records = stream_instance.read_records(
            sync_mode=SyncMode.incremental,
            stream_slice=_slice,
            stream_state=stream_state,
            cursor_field=configured_stream.cursor_field or None,
        )
        for record_data in records:
            record_counter += 1
            yield self._as_airbyte_record(stream_name, record_data)
            stream_state = stream_instance.get_updated_state(stream_state, record_data)
            if checkpoint_interval and record_counter % checkpoint_interval == 0:
                yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)

        yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)
def _read_stream(
    self,
    logger: logging.Logger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    connector_state: MutableMapping[str, Any],
    internal_config: InternalConfig,
) -> Iterator[AirbyteMessage]:
    self._apply_log_level_to_stream_logger(logger, stream_instance)
    if internal_config.page_size and isinstance(stream_instance, HttpStream):
        logger.info(f"Setting page size for {stream_instance.name} to {internal_config.page_size}")
        stream_instance.page_size = internal_config.page_size
    logger.debug(
        f"Syncing stream: {configured_stream.stream.name}",
        extra={
            "sync_mode": configured_stream.sync_mode,
            "primary_key": configured_stream.primary_key,
            "cursor_field": configured_stream.cursor_field,
        },
    )
    use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
    if use_incremental:
        record_iterator = self._read_incremental(
            logger,
            stream_instance,
            configured_stream,
            connector_state,
            internal_config,
        )
    else:
        record_iterator = self._read_full_refresh(logger, stream_instance, configured_stream, internal_config)

    record_counter = 0
    stream_name = configured_stream.stream.name
    logger.info(f"Syncing stream: {stream_name}")
    for record in record_iterator:
        if record.type == MessageType.RECORD:
            record_counter += 1
        yield record

    logger.info(f"Read {record_counter} records from {stream_name} stream")
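# Note on the method above: only AirbyteMessages of type RECORD are counted toward
# record_counter, so STATE messages emitted by _read_incremental pass through to the
# caller uncounted.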
def _read_incremental(
    self,
    logger: logging.Logger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    connector_state: MutableMapping[str, Any],
    internal_config: InternalConfig,
) -> Iterator[AirbyteMessage]:
    """
    This method is overridden to checkpoint the latest actual state, because stream state
    is refreshed after reading each batch of records (if need_chunk is True) or after
    reading all records in the stream.
    """
    yield from super()._read_incremental(
        logger=logger,
        stream_instance=stream_instance,
        configured_stream=configured_stream,
        connector_state=connector_state,
        internal_config=internal_config,
    )
    stream_state = stream_instance.get_updated_state(current_stream_state={}, latest_record={})
    yield self._checkpoint_state(stream_instance, stream_state, connector_state)
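# Design note (hedged, inferred from the override above): the extra _checkpoint_state call
# assumes the stream tracks its real cursor internally, so get_updated_state({}, {}) returns
# the latest persisted state even though both arguments are empty.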
def _configured_stream(stream: Stream, sync_mode: SyncMode):
    return ConfiguredAirbyteStream(
        stream=stream.as_airbyte_stream(),
        sync_mode=sync_mode,
        destination_sync_mode=DestinationSyncMode.overwrite,
    )
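# Usage sketch for the helper above (hedged: ExampleIncrementalStream is the illustrative
# stream sketched earlier in this listing, not part of the original helpers):
from airbyte_cdk.models import ConfiguredAirbyteCatalog, SyncMode

catalog = ConfiguredAirbyteCatalog(
    streams=[_configured_stream(ExampleIncrementalStream(), SyncMode.incremental)]
)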