Ejemplo n.º 1
0
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """
        Yield one RECORD message per row of every full-refresh stream in the catalog.

        Streams configured for incremental sync are skipped with a warning.

        :param logger: logger used for progress, warning and error messages
        :param config: connector configuration, passed through to ``Reader``
        :param catalog: configured catalog whose streams are processed in order
        :param state: accepted for interface compatibility; not read by this method
        :return: generator of AirbyteMessage objects of type RECORD
        :raises Exception: any error raised while reading is logged and re-raised unchanged
        """
        # Bound before the try so both the incremental warning and the error
        # handler can always reference it, even if the failure happens before
        # the first stream is reached.
        stream_name = None
        try:
            for configured_stream in catalog.streams:
                stream_name = configured_stream.stream.name

                if configured_stream.sync_mode == SyncMode.incremental:
                    logger.warn(
                        f"Incremental sync is not supported by stream {stream_name}"
                    )
                    continue
                if configured_stream.sync_mode != SyncMode.full_refresh:
                    continue

                reader = Reader(logger, config)
                table_client = reader.get_table_client(stream_name)
                logger.info(f"Reading data from stream '{stream_name}'")

                for row in reader.read(table_client, None):
                    # Timestamp property is in metadata object
                    # row.metadata.timestamp
                    row["additionalProperties"] = True
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=stream_name,
                            data=row,
                            # Scale before truncating so the millisecond part
                            # of the timestamp is kept.
                            emitted_at=int(datetime.now().timestamp() * 1000),
                        ),
                    )

        except Exception as err:
            reason = f"Failed to read data of {stream_name}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            # Bare raise re-raises the active exception with its original traceback.
            raise
Ejemplo n.º 2
0
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Build a catalog with one stream per grid sheet of the configured spreadsheet.

        A sheet whose header row cannot be read is left out of the catalog: the
        "exactly one row" error is logged as a warning (empty sheet), any other
        error is logged as an error.

        :param logger: logger used for progress, warning and error messages
        :param config: connector configuration; ``config["spreadsheet_id"]`` is required
        :return: AirbyteCatalog holding the discovered streams
        :raises Exception: when the spreadsheet request fails with an HttpError
        """
        sheets_client = GoogleSheetsClient(self.get_credentials(config))
        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
        empty_sheet_prefix = "Expected data for exactly one row for sheet"

        try:
            logger.info(f"Running discovery on sheet {spreadsheet_id}")
            raw_metadata = sheets_client.get(spreadsheetId=spreadsheet_id,
                                             includeGridData=False)
            metadata = Spreadsheet.parse_obj(raw_metadata)

            discovered = []
            for sheet_name in Helpers.get_grid_sheets(metadata):
                try:
                    first_row = Helpers.get_first_row(
                        sheets_client, spreadsheet_id, sheet_name)
                    discovered.append(
                        Helpers.headers_to_airbyte_stream(
                            logger, sheet_name, first_row))
                except Exception as sheet_err:
                    detail = str(sheet_err)
                    if detail.startswith(empty_sheet_prefix):
                        logger.warn(f"Skip empty sheet: {sheet_name}")
                    else:
                        logger.error(detail)
            return AirbyteCatalog(streams=discovered)

        except errors.HttpError as http_err:
            # A 404 gets a friendlier explanation than the raw error text.
            if http_err.resp.status == status_codes.NOT_FOUND:
                reason = "Requested spreadsheet was not found."
            else:
                reason = str(http_err)
            raise Exception(f"Could not run discovery: {reason}")
Ejemplo n.º 3
0
    def check_connection(
            self, logger: AirbyteLogger,
            config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
        """
        Verify that the configuration can list files and that its path pattern is usable.

        The first file produced by the stream's file iterator is matched against
        ``config["path_pattern"]``; only that one file is examined. Producing no
        files at all is still treated as a successful connection (with a warning).

        :param logger: an instance of AirbyteLogger to use
        :param config: the user-provided configuration; it is expanded into keyword
                       arguments for ``self.stream_class``
        :return: ``(True, None)`` on success, or ``(False, error)`` where ``error`` is the
                 exception raised while listing files or matching the pattern
        """
        saw_file = False
        try:
            for file_info in self.stream_class(**config).filepath_iterator():
                # TODO: will need to split config.get("path_pattern") up by stream once supporting multiple streams
                # Only checks that matching does not raise; the result is ignored.
                globmatch(file_info.key,
                          config.get("path_pattern"),
                          flags=GLOBSTAR | SPLIT)
                # One file is enough to prove both access and pattern validity.
                saw_file = True
                break
        except Exception as error:
            logger.error(format_exc())
            return False, error

        if not saw_file:
            logger.warn("Found 0 files (but connection is valid).")
        return True, None
Ejemplo n.º 4
0
    def headers_to_airbyte_stream(
            logger: AirbyteLogger, sheet_name: str,
            header_row_values: List[str]) -> AirbyteStream:
        """
        Turn a sheet's header row into an AirbyteStream named after the sheet.

        Header selection is delegated to ``Helpers.get_valid_headers_and_duplicates``;
        duplicates it reports are logged and not included in the schema.
        NOTE(review): headers are presumably read contiguously up to the first empty
        cell (e.g. "One | Two | | Three" -> ["One", "Two"]) -- confirm in that helper.

        :param logger: logger used to warn about duplicate headers
        :param sheet_name: name of the sheet, used as the stream name
        :param header_row_values: cell values of the sheet's first row
        :return: a full-refresh-only AirbyteStream whose fields are all typed as string
        """
        valid_headers, duplicates = Helpers.get_valid_headers_and_duplicates(
            header_row_values)
        if duplicates:
            logger.warn(
                f"Duplicate headers found in {sheet_name}. Ignoring them :{duplicates}"
            )

        # For simplicity, the type of every cell is a string
        properties = {}
        for header in valid_headers:
            properties[header] = {"type": "string"}

        schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": properties,
        }

        return AirbyteStream(name=sheet_name,
                             json_schema=schema,
                             supported_sync_modes=["full_refresh"])
Ejemplo n.º 5
0
    def get_records(self, catalog: ConfiguredAirbyteCatalog,
                    logger: AirbyteLogger, state: Dict[str, any]):
        """
        Yield the records of every recognised stream, each followed by a state message.

        A stream in incremental mode that already has a saved cursor is replicated
        from that cursor up to the transaction id fetched at the start of this call;
        every other stream is replicated in full. After each stream the cursor in
        ``state`` is advanced and the updated state is yielded.

        :param catalog: configured catalog whose streams are processed in order
        :param logger: logger used to warn about unrecognised streams
        :param state: per-stream state keyed by stream name; mutated in place
        :return: generator of record messages interleaved with state messages
        """
        cursor_field = self.CDC_LSN
        txid = self._client.txid()

        for configured_stream in catalog.streams:
            stream = configured_stream.stream

            if stream.name not in self.STREAMS:
                logger.warn(
                    f"Stream '{stream.name}' is not recognized in this source")
                continue

            # Make sure the stream has a state entry: a plain dict without one
            # (first sync) would otherwise raise KeyError on both the cursor
            # lookup and the state update below.
            if state.get(stream.name) is None:
                state[stream.name] = {}
            stream_state = state[stream.name]

            if configured_stream.sync_mode == SyncMode.incremental and cursor_field in stream_state:
                records = self._client.replicate(
                    stream.name,
                    txidfra=stream_state[cursor_field],
                    txidtil=txid)
            else:
                records = self._client.replicate(stream.name)

            for record in records:
                yield self._record(stream=stream.name,
                                   data=self._format_columns(record))

            # Set new state
            # Add one cause SDK is inclusive
            state[stream.name][cursor_field] = txid + 1
            yield self._state(state)
Ejemplo n.º 6
0
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """
        Yield a RECORD message for every record of each recognised stream in the catalog.

        Streams whose name is not listed in the client's ``ENTITIES`` are skipped
        with a warning.

        :param logger: logger used for progress and warning messages
        :param config: connector configuration used to build the client
        :param catalog: configured catalog whose streams are processed in order
        :param state: accepted for interface compatibility; not read by this method
        :return: generator of AirbyteMessage objects of type RECORD
        """
        client = self._client(config)

        logger.info("Starting syncing recurly")
        for configured_stream in catalog.streams:
            # TODO handle incremental syncs
            stream = configured_stream.stream
            if stream.name not in client.ENTITIES:
                # Log the name that was actually looked up, not the whole
                # stream object.
                logger.warn(
                    f"Stream '{stream.name}' not found in the recognized entities")
                continue
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing recurly")
Ejemplo n.º 7
0
    def check(self, logger: AirbyteLogger,
              config: json) -> AirbyteConnectionStatus:
        """
        Check that the configured spreadsheet is reachable and has no duplicate headers.

        Steps, in order:
            - build a client from the configured credentials;
            - request the first row of the spreadsheet to prove access;
            - read the header row of every grid sheet and collect duplicate headers.

        Sheets reporting the "exactly one row" error are treated as empty and skipped.

        :param logger: logger used for warning and error messages
        :param config: connector configuration; ``config["spreadsheet_id"]`` is required
        :return: SUCCEEDED when every step passes, otherwise FAILED with an explanatory message
        """

        def failed(message):
            # Single place that builds a FAILED status.
            return AirbyteConnectionStatus(status=Status.FAILED,
                                           message=message)

        # Check involves verifying that the specified spreadsheet is reachable with our credentials.
        try:
            sheets_client = GoogleSheetsClient(self.get_credentials(config))
        except Exception as cred_err:
            return failed(
                f"Please use valid credentials json file. Error: {cred_err}")

        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])

        try:
            # Attempt to get first row of sheet
            sheets_client.get(spreadsheetId=spreadsheet_id,
                              includeGridData=False,
                              ranges="1:1")
        except errors.HttpError as http_err:
            reason = str(http_err)
            # Give a clearer message if it's a common error like 404.
            if http_err.resp.status == status_codes.NOT_FOUND:
                reason = "Requested spreadsheet was not found."
            logger.error(f"Formatted error: {reason}")
            return failed(
                f"Unable to connect with the provided credentials to spreadsheet. Error: {reason}"
            )

        # Check for duplicate headers
        metadata = Spreadsheet.parse_obj(
            sheets_client.get(spreadsheetId=spreadsheet_id,
                              includeGridData=False))

        duplicates_by_sheet = {}
        for sheet_name in Helpers.get_grid_sheets(metadata):
            try:
                first_row = Helpers.get_first_row(sheets_client,
                                                  spreadsheet_id, sheet_name)
                _, sheet_duplicates = Helpers.get_valid_headers_and_duplicates(
                    first_row)
                if sheet_duplicates:
                    duplicates_by_sheet[sheet_name] = sheet_duplicates
            except Exception as sheet_err:
                detail = str(sheet_err)
                if not detail.startswith(
                        "Expected data for exactly one row for sheet"):
                    logger.error(detail)
                    return failed(
                        f"Unable to read the schema of sheet {sheet_name}. Error: {detail}"
                    )
                logger.warn(f"Skip empty sheet: {sheet_name}")

        if not duplicates_by_sheet:
            return AirbyteConnectionStatus(status=Status.SUCCEEDED)

        listing = ", ".join(
            f"[sheet:{name}, headers:{headers}]"
            for name, headers in duplicates_by_sheet.items())
        return failed(
            "The following duplicate headers were found in the following sheets. Please fix them to continue: "
            + listing)
Ejemplo n.º 8
0
    def check_config(self, logger: AirbyteLogger, config_path: str,
                     config: json) -> AirbyteConnectionStatus:
        """
        Tests if the input configuration can be used to successfully connect to the integration
            e.g: if a provided Stripe API token can be used to connect to the Stripe API.

        The check succeeds as soon as at least one of the probed reports can be downloaded;
        it fails only when every report fails, in which case the message lists each error.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config_path: Path to the file containing the configuration json config
        :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file

        :return: AirbyteConnectionStatus indicating a Success or Failure
        """
        try:
            # If an app on the appstore does not support subscriptions or sales, it cannot pull the relevant reports.
            # However, the way the Appstore API expresses this is not via clear error messages. Instead it expresses it by throwing an unrelated
            # error, in this case "invalid vendor ID". There is no way to distinguish if this error is due to invalid credentials or due to
            # the account not supporting this kind of report. So to "check connection" we see if any of the reports can be pulled and if so
            # return success. If no reports can be pulled we display the exception messages generated for all reports and return failure.
            api_fields_to_test = {
                "subscription_event_report": {
                    "reportType": "SUBSCRIPTION_EVENT",
                    "frequency": "DAILY",
                    "reportSubType": "SUMMARY",
                    "version": "1_2",
                },
                "subscriber_report": {
                    "reportType": "SUBSCRIBER",
                    "frequency": "DAILY",
                    "reportSubType": "DETAILED",
                    "version": "1_2",
                },
                "subscription_report": {
                    "reportType": "SUBSCRIPTION",
                    "frequency": "DAILY",
                    "reportSubType": "SUMMARY",
                    "version": "1_2",
                },
                "sales_report": {
                    "reportType": "SALES",
                    "frequency": "DAILY",
                    "reportSubType": "SUMMARY",
                    "version": "1_0",
                },
            }

            api = Api(config["key_id"], config["key_file"],
                      config["issuer_id"])

            # The probed date is the same for every report, so compute it once.
            # NOTE(review): two days back is presumably chosen so the report
            # already exists -- confirm against the reporting API.
            report_date = (date.today() -
                           timedelta(days=2)).strftime("%Y-%m-%d")

            stream_to_error = {}
            for stream, params in api_fields_to_test.items():
                report_filters = {
                    "reportDate": report_date,
                    "vendorNumber": f"{config['vendor']}",
                    **params,
                }
                try:
                    rep_tsv = api.download_sales_and_trends_reports(
                        filters=report_filters)
                    # A dict here means the API answered with JSON rather than
                    # the report payload; treat it as a failure for this stream.
                    if isinstance(rep_tsv, dict):
                        raise Exception(
                            "An exception occurred: Received a JSON response instead of"
                            f" the report: {str(rep_tsv)}")
                except Exception as e:
                    logger.warn(f"Unable to download {stream}: {e}")
                    stream_to_error[stream] = e

            # All streams have failed. Compare counts (not a count against a
            # keys view) and iterate items() to get (stream, error) pairs.
            if len(stream_to_error) == len(api_fields_to_test):
                message = "\n".join(
                    f"Unable to access {stream} due to error: {error}"
                    for stream, error in stream_to_error.items())
                return AirbyteConnectionStatus(status=Status.FAILED,
                                               message=message)

            return AirbyteConnectionStatus(status=Status.SUCCEEDED)
        except Exception as e:
            logger.warn(e)
            return AirbyteConnectionStatus(
                status=Status.FAILED,
                message=f"An exception occurred: {str(e)}")