コード例 #1
0
ファイル: google_sheets_source.py プロジェクト: Mu-L/airbyte
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """Build a catalog with one stream per readable grid sheet.

        The header (first) row of every grid sheet is converted into a stream.
        A failure on a single sheet never aborts discovery: an empty sheet is
        logged as a warning, any other per-sheet error is logged as an error,
        and the loop moves on to the next sheet.

        :param logger: logger used for progress and per-sheet diagnostics
        :param config: connector configuration; ``config["spreadsheet_id"]`` is read
        :return: catalog containing the streams that could be built
        :raises Exception: when an ``HttpError`` escapes the discovery steps
            (e.g. the spreadsheet metadata request); the original error is
            attached as the cause
        """
        client = GoogleSheetsClient(self.get_credentials(config))
        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])
        try:
            logger.info(f"Running discovery on sheet {spreadsheet_id}")
            spreadsheet_metadata = Spreadsheet.parse_obj(
                client.get(spreadsheetId=spreadsheet_id,
                           includeGridData=False))
            grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
            streams = []
            for sheet_name in grid_sheets:
                try:
                    header_row_data = Helpers.get_first_row(
                        client, spreadsheet_id, sheet_name)
                    stream = Helpers.headers_to_airbyte_stream(
                        logger, sheet_name, header_row_data)
                    streams.append(stream)
                except Exception as err:
                    # The empty-sheet case is recognised by its message prefix.
                    if str(err).startswith(
                            "Expected data for exactly one row for sheet"):
                        logger.warn(f"Skip empty sheet: {sheet_name}")
                    else:
                        logger.error(str(err))
            return AirbyteCatalog(streams=streams)

        except errors.HttpError as err:
            reason = str(err)
            # Give a clearer message for the common "not found" case.
            if err.resp.status == status_codes.NOT_FOUND:
                reason = "Requested spreadsheet was not found."
            # Chain explicitly so the HTTP error is recorded as the direct cause.
            raise Exception(f"Could not run discovery: {reason}") from err
コード例 #2
0
ファイル: source.py プロジェクト: yevhenii-ldv/airbyte
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
        """Yield one RECORD message per row of every full-refresh stream.

        Streams configured for incremental sync are not read; a warning is
        logged for each of them instead.

        :param logger: logger used for progress, warnings and errors
        :param config: connector configuration handed to ``Reader``
        :param catalog: configured catalog whose streams are iterated
        :param state: not used by this implementation
        :return: generator of AirbyteMessage records
        :raises Exception: any error raised while reading is logged with its
            traceback and then re-raised unchanged
        """
        # Bound before the try so the error handler can always format its
        # message, even if the failure happens before the first stream is seen.
        stream_name = None
        try:
            for configured_stream in catalog.streams:
                # Resolve the name for every stream, not only full-refresh
                # ones, so the incremental warning names the right stream.
                stream_name = configured_stream.stream.name
                if configured_stream.sync_mode == SyncMode.full_refresh:
                    reader = Reader(logger, config)
                    table_client = reader.get_table_client(stream_name)
                    logger.info(f"Reading data from stream '{stream_name}'")

                    for row in reader.read(table_client, None):
                        # Timestamp property is in metadata object
                        # row.metadata.timestamp
                        row["additionalProperties"] = True
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=stream_name,
                                data=row,
                                # Scale to milliseconds before truncating so
                                # sub-second precision is kept.
                                emitted_at=int(
                                    datetime.now().timestamp() * 1000),
                            ),
                        )
                elif configured_stream.sync_mode == SyncMode.incremental:
                    logger.warn(
                        f"Incremental sync is not supported by stream {stream_name}"
                    )

        except Exception as err:
            reason = f"Failed to read data of {stream_name}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            # Bare raise re-raises the active exception as-is.
            raise
コード例 #3
0
    def check_connection(
            self, logger: AirbyteLogger,
            config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
        """
        Validate the configuration by listing files and exercising the path pattern.

        The first file produced by the stream's file iterator is matched against
        the configured "path_pattern"; one file is enough to prove both that the
        credentials work and that the pattern can be evaluated.

        :param logger: an instance of AirbyteLogger to use
        :param config: The user-provided configuration as specified by the source's spec.
        :return: A tuple of (boolean, error). (True, None) when the check passes,
        including the case where no files were found (a warning is logged then).
        (False, exception) when anything raised during the check; the exception
        is what gets shown to the user.
        """
        saw_file = False
        try:
            file_source = self.stream_class(**config)
            for candidate in file_source.filepath_iterator():
                # TODO: will need to split config.get("path_pattern") up by stream once supporting multiple streams
                # only verifying that matching on the pattern doesn't raise
                pattern = config.get("path_pattern")
                globmatch(candidate.key, pattern, flags=GLOBSTAR | SPLIT)
                # the first file is sufficient, stop iterating
                saw_file = True
                break
        except Exception as failure:
            logger.error(format_exc())
            return False, failure

        if not saw_file:
            logger.warn("Found 0 files (but connection is valid).")
        return True, None
コード例 #4
0
ファイル: source.py プロジェクト: yevhenii-ldv/airbyte
 def check_config(self, logger: AirbyteLogger, config_path: str,
                  config: json) -> AirbyteConnectionStatus:
     """Report whether a connection can be made with the given configuration.

     Calls ``self.try_connect``. An exception of type ``self.api_error`` is
     logged and turned into a FAILED status carrying the error text; any
     other exception propagates to the caller.

     :param logger: logger used to record the connection failure
     :param config_path: not used by this implementation
     :param config: configuration passed through to ``try_connect``
     :return: SUCCEEDED status, or FAILED status with a message
     """
     try:
         self.try_connect(logger, config)
     except self.api_error as connect_error:
         logger.error(
             f"Exception while connecting to {self.tap_name}: {connect_error}")
         # this message is the one that should be surfaced in the UI
         return AirbyteConnectionStatus(
             status=Status.FAILED,
             message=f"Unable to connect to {self.tap_name} with the provided credentials. Error: {connect_error}",
         )
     else:
         return AirbyteConnectionStatus(status=Status.SUCCEEDED)
コード例 #5
0
ファイル: source.py プロジェクト: Mu-L/airbyte
 def check_connection(self, logger: AirbyteLogger,
                      config: Mapping[str, Any]) -> Tuple[bool, Any]:
     """
     Check the connection by reading the first record of the Orders stream.

     :param logger: logger used to report the "no data" case
     :param config: raw user configuration; parsed into AmazonSellerPartnerConfig
     :return: (True, None) when a record could be read. (False, message) when
         the Orders stream yielded no records, where message explains how to
         fix it. (False, exception) for any other failure.
     """
     try:
         parsed_config = AmazonSellerPartnerConfig.parse_obj(
             config)  # FIXME: this will not be needed after we fix CDK
         stream_kwargs = self._get_stream_kwargs(parsed_config)
         orders_stream = Orders(**stream_kwargs)
         next(orders_stream.read_records(sync_mode=SyncMode.full_refresh))
         return True, None
     except StopIteration:
         # A bare StopIteration stringifies to "", so hand the caller the
         # explanatory message instead of the empty exception.
         no_data_message = "Could not check connection without data for Orders stream. Please change value for replication start date field."
         logger.error(no_data_message)
         return False, no_data_message
     except Exception as e:
         return False, e
コード例 #6
0
ファイル: google_sheets_source.py プロジェクト: Mu-L/airbyte
    def check(self, logger: AirbyteLogger,
              config: json) -> AirbyteConnectionStatus:
        """Verify the spreadsheet is reachable and its sheets have usable headers.

        Steps, in order:
            1. Build a client from the configured credentials.
            2. Request the first row and the spreadsheet metadata.
            3. Read the header row of every grid sheet and collect duplicates.

        :param logger: logger used for warnings and errors
        :param config: connector configuration; ``config["spreadsheet_id"]`` is read
        :return: SUCCEEDED when every step passes; otherwise FAILED with a
            message describing the credentials problem, the HTTP error, the
            unreadable sheet, or the duplicate headers found
        """
        # Check involves verifying that the specified spreadsheet is reachable with our credentials.
        try:
            client = GoogleSheetsClient(self.get_credentials(config))
        except Exception as e:
            return AirbyteConnectionStatus(
                status=Status.FAILED,
                message=f"Please use valid credentials json file. Error: {e}")

        spreadsheet_id = Helpers.get_spreadsheet_id(config["spreadsheet_id"])

        try:
            # Attempt to get first row of sheet
            client.get(spreadsheetId=spreadsheet_id,
                       includeGridData=False,
                       ranges="1:1")
            # Fetched under the same handler so an HTTP failure on this second
            # request is reported as a FAILED status instead of escaping check().
            spreadsheet_metadata = Spreadsheet.parse_obj(
                client.get(spreadsheetId=spreadsheet_id,
                           includeGridData=False))
        except errors.HttpError as err:
            reason = str(err)
            # Give a clearer message if it's a common error like 404.
            if err.resp.status == status_codes.NOT_FOUND:
                reason = "Requested spreadsheet was not found."
            logger.error(f"Formatted error: {reason}")
            return AirbyteConnectionStatus(
                status=Status.FAILED,
                message=
                f"Unable to connect with the provided credentials to spreadsheet. Error: {reason}"
            )

        # Check for duplicate headers
        grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)

        duplicate_headers_in_sheet = {}
        for sheet_name in grid_sheets:
            try:
                header_row_data = Helpers.get_first_row(
                    client, spreadsheet_id, sheet_name)
                _, duplicate_headers = Helpers.get_valid_headers_and_duplicates(
                    header_row_data)
                if duplicate_headers:
                    duplicate_headers_in_sheet[sheet_name] = duplicate_headers
            except Exception as err:
                # The empty-sheet case is recognised by its message prefix;
                # empty sheets are skipped rather than failing the check.
                if str(err).startswith(
                        "Expected data for exactly one row for sheet"):
                    logger.warn(f"Skip empty sheet: {sheet_name}")
                else:
                    logger.error(str(err))
                    return AirbyteConnectionStatus(
                        status=Status.FAILED,
                        message=
                        f"Unable to read the schema of sheet {sheet_name}. Error: {err}"
                    )
        if duplicate_headers_in_sheet:
            duplicate_headers_error_message = ", ".join(
                f"[sheet:{sheet_name}, headers:{duplicate_sheet_headers}]"
                for sheet_name, duplicate_sheet_headers in
                duplicate_headers_in_sheet.items())
            return AirbyteConnectionStatus(
                status=Status.FAILED,
                message=
                "The following duplicate headers were found in the following sheets. Please fix them to continue: "
                + duplicate_headers_error_message,
            )

        return AirbyteConnectionStatus(status=Status.SUCCEEDED)