def _read_incremental(
    self,
    logger: AirbyteLogger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    connector_state: MutableMapping[str, Any],
) -> Iterator[AirbyteMessage]:
    stream_name = configured_stream.stream.name
    stream_state = connector_state.get(stream_name, {})
    if stream_state:
        logger.info(f"Setting state of {stream_name} stream to {stream_state}")

    checkpoint_interval = stream_instance.state_checkpoint_interval
    slices = stream_instance.stream_slices(
        cursor_field=configured_stream.cursor_field,
        sync_mode=SyncMode.incremental,
        stream_state=stream_state,
    )
    for stream_slice in slices:
        record_counter = 0
        records = stream_instance.read_records(
            sync_mode=SyncMode.incremental,
            stream_slice=stream_slice,
            stream_state=stream_state,
            cursor_field=configured_stream.cursor_field or None,
        )
        for record_data in records:
            record_counter += 1
            yield self._as_airbyte_record(stream_name, record_data)
            stream_state = stream_instance.get_updated_state(stream_state, record_data)
            if checkpoint_interval and record_counter % checkpoint_interval == 0:
                yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)

        yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)
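# For reference, a minimal sketch of the Stream hooks that _read_incremental relies on
# (state_checkpoint_interval, stream_slices, read_records, get_updated_state). The class
# name, cursor field, and toy records below are illustrative assumptions, not part of the
# source; a real stream would page through an API in read_records.
from typing import Any, Iterable, Mapping, MutableMapping

from airbyte_cdk.sources.streams import Stream


class TicketsExample(Stream):
    primary_key = "id"
    cursor_field = "updated_at"
    state_checkpoint_interval = 100  # emit a STATE message every 100 records

    def get_updated_state(
        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
    ) -> Mapping[str, Any]:
        # Keep the maximum cursor value seen so far.
        latest = latest_record.get(self.cursor_field, "")
        current = current_stream_state.get(self.cursor_field, "")
        return {self.cursor_field: max(latest, current)}

    def read_records(self, sync_mode, cursor_field=None, stream_slice=None, stream_state=None) -> Iterable[Mapping[str, Any]]:
        # Toy in-memory data; only records newer than the stored cursor are yielded.
        records = [
            {"id": 1, "updated_at": "2021-01-01"},
            {"id": 2, "updated_at": "2021-02-01"},
        ]
        cutoff = (stream_state or {}).get(self.cursor_field, "")
        yield from (r for r in records if r[self.cursor_field] > cutoff)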
def read(
    self,
    logger: AirbyteLogger,
    config: Mapping[str, Any],
    catalog: ConfiguredAirbyteCatalog,
    state: MutableMapping[str, Any] = None,
) -> Iterator[AirbyteMessage]:
    connector_state = copy.deepcopy(state or {})
    logger.info(f"Starting syncing {self.name}")
    # TODO assert all streams exist in the connector
    # get the streams once in case the connector needs to make any queries to generate them
    stream_instances = {s.name: s for s in self.streams(config)}
    for configured_stream in catalog.streams:
        try:
            stream_instance = stream_instances[configured_stream.stream.name]
            yield from self._read_stream(
                logger=logger,
                stream_instance=stream_instance,
                configured_stream=configured_stream,
                connector_state=connector_state,
            )
        except Exception as e:
            logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
            raise e

    logger.info(f"Finished syncing {self.name}")
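# A hypothetical consumer of the generator read() returns: records are handed to the
# destination and every STATE message is persisted as a checkpoint, which is what makes
# the incremental resumption above work. source, logger, config, catalog, persist(),
# and write() are all assumed names for this sketch, not CDK functions.
state = {}
for message in source.read(logger, config, catalog, state):
    if message.type == MessageType.STATE:
        persist(message.state.data)  # save the checkpoint durably
    elif message.type == MessageType.RECORD:
        write(message.record)        # hand the record to the destination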
def _read_stream(
    self,
    logger: AirbyteLogger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    connector_state: MutableMapping[str, Any],
) -> Iterator[AirbyteMessage]:
    use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
    if use_incremental:
        record_iterator = self._read_incremental(logger, stream_instance, configured_stream, connector_state)
    else:
        record_iterator = self._read_full_refresh(stream_instance, configured_stream)

    record_counter = 0
    stream_name = configured_stream.stream.name
    logger.info(f"Syncing stream: {stream_name}")
    for record in record_iterator:
        if record.type == MessageType.RECORD:
            record_counter += 1
        yield record

    logger.info(f"Read {record_counter} records from {stream_name} stream")
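# Illustrative only: a configured catalog entry that would send _read_stream down the
# incremental path (sync_mode == SyncMode.incremental). The stream name, cursor field,
# and empty schema are placeholder values, not from the source.
from airbyte_cdk.models import (
    AirbyteStream,
    ConfiguredAirbyteStream,
    DestinationSyncMode,
    SyncMode,
)

configured_stream = ConfiguredAirbyteStream(
    stream=AirbyteStream(
        name="tickets",
        json_schema={},
        supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
    ),
    sync_mode=SyncMode.incremental,
    destination_sync_mode=DestinationSyncMode.append,
    cursor_field=["updated_at"],
)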
def streams(self, config: Mapping[str, Any]) -> List[Stream]:
    authenticator = TokenAuthenticator(config["api_token"])
    default_start_date = pendulum.now().subtract(days=14)  # TODO make this configurable
    threads_lookback_window = {"days": 7}  # TODO make this configurable
    streams = [
        Channels(authenticator=authenticator),
        ChannelMembers(authenticator=authenticator),
        ChannelMessages(authenticator=authenticator, default_start_date=default_start_date),
        Threads(authenticator=authenticator, default_start_date=default_start_date, lookback_window=threads_lookback_window),
        Users(authenticator=authenticator),
    ]

    # To sync data from channels, the bot backed by this token needs to join all those channels. This operation is idempotent.
    # TODO make joining configurable. Also make joining archived and private channels configurable.
    logger = AirbyteLogger()
    logger.info("joining Slack channels")
    join_channels_stream = JoinChannelsStream(authenticator=authenticator)
    for stream_slice in join_channels_stream.stream_slices():
        for message in join_channels_stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice):
            logger.info(message["message"])

    return streams
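# A hypothetical invocation of streams(): the "api_token" key matches the config lookup
# above, and SourceSlack is an assumed name for the enclosing source class. Note that
# calling streams() also triggers the channel-joining side effect shown above.
config = {"api_token": "xoxb-example-token"}
source = SourceSlack()
for stream in source.streams(config):
    print(stream.name)  # Channels, ChannelMembers, ChannelMessages, Threads, Users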
def _read_stream(
    self,
    logger: AirbyteLogger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    state: MutableMapping[str, Any],
) -> Iterator[AirbyteMessage]:
    stream_name = configured_stream.stream.name
    use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental

    stream_state = {}
    if use_incremental and state.get(stream_name):
        logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
        stream_state = state.get(stream_name)

    logger.info(f"Syncing stream: {stream_name}")
    record_counter = 0
    for record in stream_instance.read_stream(configured_stream=configured_stream, stream_state=copy.deepcopy(stream_state)):
        now_millis = int(datetime.now().timestamp() * 1000)
        message = AirbyteRecordMessage(stream=stream_name, data=record, emitted_at=now_millis)
        yield AirbyteMessage(type=MessageType.RECORD, record=message)
        record_counter += 1

        if use_incremental:
            stream_state = stream_instance.get_updated_state(stream_state, record)
            if record_counter % stream_instance.state_checkpoint_interval == 0:
                state[stream_name] = stream_state
                yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))

    if use_incremental and stream_state:
        state[stream_name] = stream_state
        # output the state object only together with other stream states
        yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))
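# Checkpoint cadence illustration for the loop above: with state_checkpoint_interval
# set to 100 and 250 records read, STATE messages are emitted after records 100 and
# 200, plus one final STATE after the loop if the stream state is non-empty.
interval, total = 100, 250
mid_run_checkpoints = [i for i in range(1, total + 1) if i % interval == 0]
print(mid_run_checkpoints)  # [100, 200]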
def read(
    self,
    logger: AirbyteLogger,
    config: Mapping[str, Any],
    catalog: ConfiguredAirbyteCatalog,
    state: MutableMapping[str, Any] = None,
) -> Iterator[AirbyteMessage]:
    """Implements the Read operation from the Airbyte Specification.
    See https://docs.airbyte.io/architecture/airbyte-specification.
    """
    connector_state = copy.deepcopy(state or {})
    logger.info(f"Starting syncing {self.name}")
    # TODO assert all streams exist in the connector
    # get the streams once in case the connector needs to make any queries to generate them
    stream_instances = {s.name: s for s in self.streams(config)}
    with create_timer(self.name) as timer:
        for configured_stream in catalog.streams:
            try:
                stream_instance = stream_instances[configured_stream.stream.name]
                timer.start_event(configured_stream.stream.name)
                yield from self._read_stream(
                    logger=logger,
                    stream_instance=stream_instance,
                    configured_stream=configured_stream,
                    connector_state=connector_state,
                )
                timer.end_event()
            except Exception as e:
                logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
                raise e
            finally:
                logger.info(f"Finished syncing {configured_stream.stream.name}")
                logger.info(timer.report())

    logger.info(f"Finished syncing {self.name}")
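# create_timer is referenced above but not shown. A minimal sketch of a compatible
# implementation follows, assuming a simple per-event stopwatch; the CDK's actual
# timer may differ in detail.
import time
from contextlib import contextmanager


class EventTimer:
    """Tracks elapsed wall-clock time per named event."""

    def __init__(self, name):
        self.name = name
        self.events = {}      # event name -> elapsed seconds
        self._current = None  # (event name, start timestamp)

    def start_event(self, name):
        self._current = (name, time.perf_counter())

    def end_event(self):
        name, started = self._current
        self.events[name] = time.perf_counter() - started
        self._current = None

    def report(self):
        lines = [f"Syncing stream {name} took {secs:.3f} s" for name, secs in self.events.items()]
        return f"Timing report for {self.name}:\n" + "\n".join(lines)


@contextmanager
def create_timer(name):
    # Yield a fresh timer for the duration of the with-block.
    yield EventTimer(name)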