Example #1
    def _read_incremental(
        self,
        logger: AirbyteLogger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        connector_state: MutableMapping[str, Any],
    ) -> Iterator[AirbyteMessage]:
        stream_name = configured_stream.stream.name
        stream_state = connector_state.get(stream_name, {})
        if stream_state:
            logger.info(f"Setting state of {stream_name} stream to {stream_state.get(stream_name)}")

        checkpoint_interval = stream_instance.state_checkpoint_interval
        slices = stream_instance.stream_slices(
            cursor_field=configured_stream.cursor_field, sync_mode=SyncMode.incremental, stream_state=stream_state
        )
        for _slice in slices:
            record_counter = 0
            records = stream_instance.read_records(
                sync_mode=SyncMode.incremental,
                stream_slice=_slice,
                stream_state=stream_state,
                cursor_field=configured_stream.cursor_field or None,
            )
            for record_data in records:
                record_counter += 1
                yield self._as_airbyte_record(stream_name, record_data)
                stream_state = stream_instance.get_updated_state(stream_state, record_data)
                if checkpoint_interval and record_counter % checkpoint_interval == 0:
                    yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)

            yield self._checkpoint_state(stream_name, stream_state, connector_state, logger)
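
    # A hedged sketch of the two helpers _read_incremental relies on but which are not shown here.
    # It is an assumption modeled on the inline logic in Example #5 (RECORD messages stamped with a
    # millisecond emitted_at, STATE messages carrying the whole connector state); the real
    # implementations may differ. Uses datetime and the Airbyte message models as imported there.
    def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]) -> AirbyteMessage:
        # Wrap raw record data in a RECORD message stamped with the current time in milliseconds.
        now_millis = int(datetime.now().timestamp() * 1000)
        message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis)
        return AirbyteMessage(type=MessageType.RECORD, record=message)

    def _checkpoint_state(
        self,
        stream_name: str,
        stream_state: Mapping[str, Any],
        connector_state: MutableMapping[str, Any],
        logger: AirbyteLogger,
    ) -> AirbyteMessage:
        # Persist this stream's state into the connector-wide state and emit it as a STATE message.
        logger.info(f"Setting state of {stream_name} stream to {stream_state}")
        connector_state[stream_name] = stream_state
        return AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=connector_state))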
Example #2
    def _read_stream(
        self,
        logger: AirbyteLogger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        connector_state: MutableMapping[str, Any],
    ) -> Iterator[AirbyteMessage]:

        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
        if use_incremental:
            record_iterator = self._read_incremental(logger, stream_instance,
                                                     configured_stream,
                                                     connector_state)
        else:
            record_iterator = self._read_full_refresh(stream_instance,
                                                      configured_stream)

        record_counter = 0
        stream_name = configured_stream.stream.name
        logger.info(f"Syncing stream: {stream_name} ")
        for record in record_iterator:
            if record.type == MessageType.RECORD:
                record_counter += 1
            yield record

        logger.info(f"Read {record_counter} records from {stream_name} stream")
Example #3
    def read(
            self,
            logger: AirbyteLogger,
            config: Mapping[str, Any],
            catalog: ConfiguredAirbyteCatalog,
            state: MutableMapping[str, Any] = None) -> Iterator[AirbyteMessage]:

        connector_state = copy.deepcopy(state or {})
        logger.info(f"Starting syncing {self.name}")
        # TODO assert all streams exist in the connector
        # get the streams once in case the connector needs to make any queries to generate them
        stream_instances = {s.name: s for s in self.streams(config)}
        for configured_stream in catalog.streams:
            try:
                stream_instance = stream_instances[
                    configured_stream.stream.name]
                yield from self._read_stream(
                    logger=logger,
                    stream_instance=stream_instance,
                    configured_stream=configured_stream,
                    connector_state=connector_state)
            except Exception as e:
                logger.exception(
                    f"Encountered an exception while reading stream {self.name}"
                )
                raise e

        logger.info(f"Finished syncing {self.name}")
Example #4
    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        authenticator = TokenAuthenticator(config["api_token"])
        default_start_date = pendulum.now().subtract(
            days=14)  # TODO make this configurable
        threads_lookback_window = {"days": 7}  # TODO make this configurable

        streams = [
            Channels(authenticator=authenticator),
            ChannelMembers(authenticator=authenticator),
            ChannelMessages(authenticator=authenticator,
                            default_start_date=default_start_date),
            Threads(authenticator=authenticator,
                    default_start_date=default_start_date,
                    lookback_window=threads_lookback_window),
            Users(authenticator=authenticator),
        ]

        # To sync data from channels, the bot backed by this token needs to join all those channels. This operation is idempotent.
        # TODO make joining configurable. Also make joining archived and private channels configurable
        logger = AirbyteLogger()
        logger.info("joining Slack channels")
        join_channels_stream = JoinChannelsStream(authenticator=authenticator)
        for stream_slice in join_channels_stream.stream_slices():
            for message in join_channels_stream.read_records(
                    sync_mode=SyncMode.full_refresh,
                    stream_slice=stream_slice):
                logger.info(message["message"])

        return streams
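
# A hedged usage sketch for the method above. The only key it reads from config is "api_token"; the
# SourceSlack class name and the token value are placeholders/assumptions. Note that calling
# streams() here also joins Slack channels, so it performs real API calls.
if __name__ == "__main__":
    source = SourceSlack()
    for stream in source.streams({"api_token": "<slack-bot-token>"}):
        print(stream.name, stream.supports_incremental)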
Example #5
    def _read_stream(
            self, logger: AirbyteLogger, stream_instance: Stream,
            configured_stream: ConfiguredAirbyteStream,
            state: MutableMapping[str, Any]) -> Iterator[AirbyteMessage]:
        stream_name = configured_stream.stream.name
        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental

        stream_state = {}
        if use_incremental and state.get(stream_name):
            logger.info(
                f"Set state of {stream_name} stream to {state.get(stream_name)}"
            )
            stream_state = state.get(stream_name)

        logger.info(f"Syncing stream: {stream_name} ")
        record_counter = 0
        for record in stream_instance.read_stream(
                configured_stream=configured_stream,
                stream_state=copy.deepcopy(stream_state)):
            now_millis = int(datetime.now().timestamp() * 1000)
            message = AirbyteRecordMessage(stream=stream_name,
                                           data=record,
                                           emitted_at=now_millis)
            yield AirbyteMessage(type=MessageType.RECORD, record=message)

            record_counter += 1
            if use_incremental:
                stream_state = stream_instance.get_updated_state(
                    stream_state, record)
                checkpoint_interval = stream_instance.state_checkpoint_interval
                if checkpoint_interval and record_counter % checkpoint_interval == 0:
                    state[stream_name] = stream_state
                    yield AirbyteMessage(type=MessageType.STATE,
                                         state=AirbyteStateMessage(data=state))

        if use_incremental and stream_state:
            state[stream_name] = stream_state
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE,
                                 state=AirbyteStateMessage(data=state))
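
    # A hedged illustration of the checkpoint emitted above. Assuming a stream named
    # "channel_messages" whose cursor field is "ts" (names are illustrative), each STATE message
    # carries the full connector state dict, e.g.:
    #
    #     AirbyteMessage(
    #         type=MessageType.STATE,
    #         state=AirbyteStateMessage(data={"channel_messages": {"ts": "1614556800.000000"}}),
    #     )
    #
    # A destination persists the latest STATE it has committed, so a failed or cancelled sync can
    # resume from the last checkpoint instead of re-reading the whole stream.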
Example #6
    def read(
            self,
            logger: AirbyteLogger,
            config: Mapping[str, Any],
            catalog: ConfiguredAirbyteCatalog,
            state: MutableMapping[str, Any] = None) -> Iterator[AirbyteMessage]:
        """Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.io/architecture/airbyte-specification."""
        connector_state = copy.deepcopy(state or {})
        logger.info(f"Starting syncing {self.name}")
        # TODO assert all streams exist in the connector
        # get the streams once in case the connector needs to make any queries to generate them
        stream_instances = {s.name: s for s in self.streams(config)}
        with create_timer(self.name) as timer:
            for configured_stream in catalog.streams:
                try:
                    stream_instance = stream_instances[
                        configured_stream.stream.name]
                    timer.start_event(configured_stream.stream.name)
                    yield from self._read_stream(
                        logger=logger,
                        stream_instance=stream_instance,
                        configured_stream=configured_stream,
                        connector_state=connector_state,
                    )
                    timer.end_event()
                except Exception as e:
                    logger.exception(
                        f"Encountered an exception while reading stream {self.name}"
                    )
                    raise e
                finally:
                    logger.info(f"Finished syncing {self.name}")
                    logger.info(timer.report())

        logger.info(f"Finished syncing {self.name}")
Example #7
class Stream(ABC):
    """
    Base abstract class for an Airbyte Stream. Makes no assumption of the Stream's underlying transport protocol.
    """

    # Use self.logger in subclasses to log any messages
    logger = AirbyteLogger()  # TODO use native "logging" loggers with custom handlers

    @property
    def name(self) -> str:
        """
        :return: Stream name. By default this is the implementing class name, but it can be overridden as needed.
        """
        return casing.camel_to_snake(self.__class__.__name__)

    @abstractmethod
    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """
        This method should be overridden by subclasses to read records based on the inputs
        """

    def get_json_schema(self) -> Mapping[str, Any]:
        """
        :return: A dict of the JSON schema representing this stream.

        The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property.
        Override as needed.
        """
        # TODO show an example of using pydantic to define the JSON schema, or reading an OpenAPI spec
        return ResourceSchemaLoader(package_name_from_class(
            self.__class__)).get_schema(self.name)

    def as_airbyte_stream(self) -> AirbyteStream:
        stream = AirbyteStream(name=self.name,
                               json_schema=dict(self.get_json_schema()),
                               supported_sync_modes=[SyncMode.full_refresh])

        if self.supports_incremental:
            stream.source_defined_cursor = self.source_defined_cursor
            stream.supported_sync_modes.append(SyncMode.incremental)
            stream.default_cursor_field = self._wrapped_cursor_field()

        return stream

    @property
    def supports_incremental(self) -> bool:
        """
        :return: True if this stream supports incrementally reading data
        """
        return len(self._wrapped_cursor_field()) > 0

    def _wrapped_cursor_field(self) -> List[str]:
        return [self.cursor_field] if isinstance(self.cursor_field, str) else self.cursor_field

    @property
    def cursor_field(self) -> Union[str, List[str]]:
        """
        Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field.
        :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor.
        """
        return []

    @property
    def source_defined_cursor(self) -> bool:
        """
        Return False if the cursor can be configured by the user.
        """
        return True

    def stream_slices(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        """
        Override to define the slices for this stream. See the stream slicing section of the docs for more information.

        :param sync_mode: the sync mode being used for this read
        :param cursor_field: the cursor field to slice on, if any
        :param stream_state: the stream's most recent state, used to avoid re-reading already-synced data
        :return: an iterable of slice descriptors; the default single [None] slice reads the whole stream
        """
        return [None]

    @property
    def state_checkpoint_interval(self) -> Optional[int]:
        """
        Decides how often to checkpoint state (i.e: emit a STATE message). E.g: if this returns a value of 100, then state is persisted after reading
        100 records, then 200, 300, etc.. A good default value is 1000 although your mileage may vary depending on the underlying data source.

        Checkpointing a stream avoids re-reading records in the case a sync is failed or cancelled.

        return None if state should not be checkpointed e.g: because records returned from the underlying data source are not returned in
        ascending order with respect to the cursor field. This can happen if the source does not support reading records in ascending order of
        created_at date (or whatever the cursor is). In those cases, state must only be saved once the full stream has been read.
        """
        return None

    def get_updated_state(self, current_stream_state: MutableMapping[str, Any],
                          latest_record: Mapping[str, Any]) -> MutableMapping[str, Any]:
        """
        Override to extract state from the latest record. Needed to implement incremental sync.

        Inspects the latest record extracted from the data source and the current state object and return an updated state object.

        For example: if the state object is based on created_at timestamp, and the current state is {'created_at': 10}, and the latest_record is
        {'name': 'octavia', 'created_at': 20 } then this method would return {'created_at': 20} to indicate state should be updated to this object.

        :param current_stream_state: The stream's current state object
        :param latest_record: The latest record extracted from the stream
        :return: An updated state object
        """
        return {}
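
# A hedged sketch of a minimal concrete Stream, to show how the abstract hooks above fit together.
# The Employees class, its field names, and its fake record are illustrative assumptions, not part of the CDK.
class Employees(Stream):
    # A plain class attribute shadows the base-class property and enables supports_incremental.
    cursor_field = "updated_at"

    @property
    def state_checkpoint_interval(self) -> Optional[int]:
        return 1000  # emit a STATE message every 1000 records

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        # A real connector would page through an API here, resuming from stream_state.
        start = (stream_state or {}).get(self.cursor_field, "1970-01-01T00:00:00Z")
        yield {"id": 1, "updated_at": start}

    def get_updated_state(
        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
    ) -> MutableMapping[str, Any]:
        # Keep the greatest cursor value seen so far.
        latest = latest_record.get(self.cursor_field, "")
        current = (current_stream_state or {}).get(self.cursor_field, "")
        return {self.cursor_field: max(latest, current)}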
Example #8

import sys
import time

import backoff
from base_python.cdk.streams.exceptions import DefaultBackoffException, UserDefinedBackoffException
from base_python.logger import AirbyteLogger
from requests import codes, exceptions

TRANSIENT_EXCEPTIONS = (DefaultBackoffException, exceptions.ConnectTimeout,
                        exceptions.ReadTimeout, exceptions.ConnectionError)

# TODO inject singleton logger?
logger = AirbyteLogger()


def default_backoff_handler(max_tries: int, factor: int, **kwargs):
    def log_retry_attempt(details):
        _, exc, _ = sys.exc_info()
        logger.info(str(exc))
        logger.info(
            f"Caught retryable error after {details['tries']} tries. Waiting {details['wait']} seconds then retrying..."
        )

    def should_give_up(exc):
        # If a non-rate-limiting related 4XX error makes it this far, it means it was unexpected and probably consistent, so we shouldn't back off
        give_up = exc.response is not None and exc.response.status_code != codes.too_many_requests and 400 <= exc.response.status_code < 500
        if give_up:
            logger.info(