Example #1
def coerce_catalog_as_full_refresh(
            catalog: AirbyteCatalog) -> AirbyteCatalog:
        """
        Updates the sync mode on all streams in this catalog to be full refresh
        """
        # Deep-copy so the original catalog's streams are left untouched
        coerced_catalog = catalog.copy(deep=True)
        for stream in coerced_catalog.streams:
            stream.source_defined_cursor = False
            stream.supported_sync_modes = [SyncMode.full_refresh]
            stream.default_cursor_field = None

        # remove nulls
        return AirbyteCatalog.parse_raw(
            coerced_catalog.json(exclude_unset=True, exclude_none=True))
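
A minimal usage sketch of the helper above. The import path, stream name, and fields are illustrative assumptions, not taken from the snippet:

from airbyte_protocol import AirbyteCatalog, AirbyteStream, SyncMode

# Hypothetical catalog with one incremental-capable stream.
catalog = AirbyteCatalog(streams=[
    AirbyteStream(
        name="users",
        json_schema={"type": "object", "properties": {"id": {"type": "string"}}},
        supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
        source_defined_cursor=True,
        default_cursor_field=["updated_at"],
    )
])

coerced = coerce_catalog_as_full_refresh(catalog)
assert coerced.streams[0].supported_sync_modes == [SyncMode.full_refresh]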
Example #2
    def test_parse_sheet_and_column_names_from_catalog(self):
        sheet1 = "soccer_team"
        sheet1_columns = frozenset(
            ["arsenal", "chelsea", "manutd", "liverpool"])
        sheet1_schema = {
            "properties": {c: {"type": "string"} for c in sheet1_columns}
        }

        sheet2 = "basketball_teams"
        sheet2_columns = frozenset(["gsw", "lakers"])
        sheet2_schema = {
            "properties": {c: {"type": "string"} for c in sheet2_columns}
        }

        catalog = AirbyteCatalog(streams=[
            AirbyteStream(name=sheet1, json_schema=sheet1_schema),
            AirbyteStream(name=sheet2, json_schema=sheet2_schema)
        ])

        actual = Helpers.parse_sheet_and_column_names_from_catalog(catalog)

        expected = {sheet1: sheet1_columns, sheet2: sheet2_columns}
        self.assertEqual(actual, expected)
Example #3
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        client = GoogleSheetsClient(self.get_credentials(config))
        spreadsheet_id = config["spreadsheet_id"]
        try:
            logger.info(f"Running discovery on sheet {spreadsheet_id}")
            spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
            grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
            streams = []
            for sheet_name in grid_sheets:
                try:
                    header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
                    stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
                    streams.append(stream)
                except Exception as err:
                    if str(err).startswith("Expected data for exactly one row for sheet"):
                        logger.warn(f"Skip empty sheet: {sheet_name}")
                    else:
                        logger.error(str(err))
            return AirbyteCatalog(streams=streams)

        except errors.HttpError as err:
            reason = str(err)
            if err.resp.status == status_codes.NOT_FOUND:
                reason = "Requested spreadsheet was not found."
            raise Exception(f"Could not run discovery: {reason}")
Example #4
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source; its content is specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        streams = []

        stream_name = "TableName"  # Example
        json_schema = {  # Example
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {"columnName": {"type": "string"}},
        }

        # Not Implemented

        streams.append(AirbyteStream(name=stream_name,
                                     json_schema=json_schema))
        return AirbyteCatalog(streams=streams)
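
Per the Airbyte specification, the catalog returned by discover is ultimately printed to stdout wrapped in an AirbyteMessage of type CATALOG. A hedged sketch of that wrapping, assuming the generated protocol models are importable:

from airbyte_protocol import AirbyteCatalog, AirbyteMessage, Type

def emit_catalog(catalog: AirbyteCatalog) -> None:
    # Wrap the catalog in a protocol message and emit it as a single JSON line.
    message = AirbyteMessage(type=Type.CATALOG, catalog=catalog)
    print(message.json(exclude_unset=True))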
Example #5
    def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
        """

        :param logger:
        :param config_container:
        :param catalog_path:
        :param state_path:
        :return:
        """
        config = config_container.rendered_config
        storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
        url = SourceFile.get_simple_url(config["url"])
        logger.info(f"Reading ({storage}{url}, {catalog_path}, {state_path})...")
        catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
        selection = SourceFile.parse_catalog(catalog)
        try:
            df_list = SourceFile.load_dataframes(config, logger)
            for df in df_list:
                # pandas indexing expects a list-like of labels, not a set
                columns = list(selection.intersection(set(df.columns)))
                for data in df[columns].to_dict(orient="records"):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        # emitted_at is in milliseconds; multiply before truncating
                        record=AirbyteRecordMessage(stream=url, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                    )
        except Exception as err:
            reason = f"Failed to read data of {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
Example #6
    def discover(self, logger, config_container) -> AirbyteCatalog:
        """

        :param logger:
        :param config_container:
        :return:
        """
        config = config_container.rendered_config
        storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
        url = SourceFile.get_simple_url(config["url"])
        logger.info(f"Discovering schema of {storage}{url}...")
        streams = []
        try:
            # TODO handle discovery of directories of multiple files instead
            # Don't skip data when discovering in order to infer column types
            df_list = SourceFile.load_dataframes(config, logger, skip_data=False)
            fields = {}
            for df in df_list:
                for col in df.columns:
                    fields[col] = SourceFile.convert_dtype(df[col].dtype)
            json_schema = {
                "$schema": "http://json-schema.org/draft-07/schema#",
                "type": "object",
                "properties": {field: {"type": fields[field]} for field in fields},
            }
            streams.append(AirbyteStream(name=url, json_schema=json_schema))
        except Exception as err:
            reason = f"Failed to discover schemas of {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
        return AirbyteCatalog(streams=streams)
Example #7
def catalog_fixture(
        configured_catalog: ConfiguredAirbyteCatalog
) -> Optional[AirbyteCatalog]:
    if configured_catalog:
        return AirbyteCatalog(
            streams=[stream.stream for stream in configured_catalog.streams])
    return None
Example #8
    def singer_catalog_to_airbyte_catalog(
            singer_catalog: Dict[str, Any]) -> AirbyteCatalog:
        airbyte_streams = []
        for stream in singer_catalog.get("streams"):
            name = stream.get("stream")
            schema = stream.get("schema")
            airbyte_stream = AirbyteStream(name=name, json_schema=schema)
            metadatas = stream.get("metadata", [])
            stream_metadata = get_stream_level_metadata(metadatas)
            if stream_metadata:
                # TODO unclear from the singer spec what behavior should be if there are no valid replication keys, but forced-replication-method is INCREMENTAL.
                #  For now requiring replication keys for a stream to be considered incremental.
                replication_keys = stream_metadata.get(
                    "valid-replication-keys", [])
                if len(replication_keys) > 0:
                    airbyte_stream.source_defined_cursor = True
                    airbyte_stream.supported_sync_modes = [
                        SyncMode.full_refresh, SyncMode.incremental
                    ]
                    # TODO if there are multiple replication keys, allow configuring which one is used. For now we deterministically take the first
                    airbyte_stream.default_cursor_field = [
                        sorted(replication_keys)[0]
                    ]

            airbyte_streams += [airbyte_stream]
        return AirbyteCatalog(streams=airbyte_streams)
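
A sketch of the conversion applied to a minimal Singer catalog dict. It assumes get_stream_level_metadata selects the metadata entry with an empty breadcrumb, per the Singer spec; the stream and replication key are illustrative:

singer_catalog = {
    "streams": [{
        "stream": "orders",
        "schema": {"type": "object", "properties": {"updated_at": {"type": "string"}}},
        # Stream-level metadata is the entry whose breadcrumb is the empty list.
        "metadata": [{"breadcrumb": [], "metadata": {"valid-replication-keys": ["updated_at"]}}],
    }]
}

airbyte_catalog = singer_catalog_to_airbyte_catalog(singer_catalog)
stream = airbyte_catalog.streams[0]
assert stream.source_defined_cursor is True
assert stream.default_cursor_field == ["updated_at"]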
Example #9
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        client = Helpers.get_authenticated_sheets_client(
            json.loads(config["credentials_json"]))
        spreadsheet_id = config["spreadsheet_id"]
        try:
            logger.info(f"Running discovery on sheet {spreadsheet_id}")
            spreadsheet_metadata = Spreadsheet.parse_obj(
                client.get(spreadsheetId=spreadsheet_id,
                           includeGridData=False).execute())
            sheet_names = [
                sheet.properties.title for sheet in spreadsheet_metadata.sheets
            ]
            streams = []
            for sheet_name in sheet_names:
                header_row_data = Helpers.get_first_row(
                    client, spreadsheet_id, sheet_name)
                stream = Helpers.headers_to_airbyte_stream(
                    sheet_name, header_row_data)
                streams.append(stream)
            return AirbyteCatalog(streams=streams)

        except errors.HttpError as err:
            reason = str(err)
            if err.resp.status == 404:
                reason = "Requested spreadsheet was not found."
            raise Exception(f"Could not run discovery: {reason}")
Example #10
 def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
     stream_name = DATASET_ITEMS_STREAM_NAME
     json_schema = {
         "$schema": "http://json-schema.org/draft-07/schema#",
         "type": "object",
     }
     return AirbyteCatalog(
         streams=[AirbyteStream(name=stream_name, json_schema=json_schema)])
Example #11
 def discover(self, logger: AirbyteLogger,
              config: Mapping[str, Any]) -> AirbyteCatalog:
     """Implements the Discover operation from the Airbyte Specification. See https://docs.airbyte.io/architecture/airbyte-specification."""
     streams = [
         stream.as_airbyte_stream()
         for stream in self.streams(config=config)
     ]
     return AirbyteCatalog(streams=streams)
Example #12
 def singer_catalog_to_airbyte_catalog(
         singer_catalog: Dict[str, Any]) -> AirbyteCatalog:
     airbyte_streams = []
     for stream in singer_catalog.get("streams"):
         name = stream.get("stream")
         schema = stream.get("schema")
         airbyte_streams += [AirbyteStream(name=name, json_schema=schema)]
     return AirbyteCatalog(streams=airbyte_streams)
Example #13
 def discover(self, logger: AirbyteLogger,
              config: Mapping[str, Any]) -> AirbyteCatalog:
     """Discover streams"""
     streams = [
         stream.as_airbyte_stream()
         for stream in self.streams(config=config)
     ]
     return AirbyteCatalog(streams=streams)
Example #14
    def read(self,
             logger: AirbyteLogger,
             config_container,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        config = config_container.rendered_config
        client = Helpers.get_authenticated_sheets_client(
            json.loads(config["credentials_json"]))

        catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))

        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(
            catalog)
        spreadsheet_id = config["spreadsheet_id"]

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
        # a blank row, emit the row batch
        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
            client, spreadsheet_id, sheet_to_column_name)
        for sheet in sheet_to_column_index_to_name.keys():
            logger.info(f"Syncing sheet {sheet}")
            column_index_to_name = sheet_to_column_index_to_name[sheet]
            row_cursor = 2  # we start syncing past the header row
            encountered_blank_row = False
            while not encountered_blank_row:
                # named row_range to avoid shadowing the builtin range()
                row_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
                logger.info(f"Fetching range {row_range}")
                row_batch = SpreadsheetValues.parse_obj(
                    client.values().batchGet(spreadsheetId=spreadsheet_id,
                                             ranges=row_range,
                                             majorDimension="ROWS").execute())
                row_cursor += ROW_BATCH_SIZE + 1
                # there should always be one range since we requested only one
                value_ranges = row_batch.valueRanges[0]

                if not value_ranges.values:
                    break

                row_values = value_ranges.values
                if len(row_values) == 0:
                    break

                for row in row_values:
                    if Helpers.is_row_empty(row):
                        encountered_blank_row = True
                        break
                    elif Helpers.row_contains_relevant_data(
                            row, column_index_to_name.keys()):
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=Helpers.row_data_to_record_message(
                                sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
Example #15
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "additionalProperties": True,
            "type": "object",
            # todo (cgardens) - remove data column. added to handle UI bug where streams without fields cannot be selected.
            # issue: https://github.com/airbytehq/airbyte/issues/1104
            "properties": {"data": {"type": "object"}},
        }

        # The JSON body will be returned as the "data" stream. We can't know its schema ahead of time, so we assume it's an object (i.e. valid JSON).
        return AirbyteCatalog(streams=[AirbyteStream(name=SourceHttpRequest.STREAM_NAME, json_schema=json_schema)])
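
For illustration, a record emitted for this stream would nest the whole response body under the "data" property declared above (payload hypothetical):

record_payload = {"data": {"status": "ok", "count": 42}}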
Example #16
    def discover(self, logger: AirbyteLogger,
                 config_container) -> AirbyteCatalog:
        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "additionalProperties": True,
            "type": "object",
            "properties": {},
        }

        # The JSON body will be returned as the "data" stream. We can't know its schema ahead of time, so we assume it's an object (i.e. valid JSON).
        return AirbyteCatalog(streams=[
            AirbyteStream(name=SourceRestApi.STREAM_NAME,
                          json_schema=json_schema)
        ])
Example #17
    def discover(self, logger: AirbyteLogger,
                 config: Mapping) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration. For example, given valid credentials to a
        Remote CSV File, returns an Airbyte catalog where each csv file is a stream, and each column is a field.
        """
        client = self._get_client(config)
        name = client.stream_name

        logger.info(
            f"Discovering schema of {name} at {client.reader.full_url}...")
        try:
            streams = list(client.streams)
        except Exception as err:
            reason = f"Failed to discover schemas of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
        return AirbyteCatalog(streams=streams)
Example #18
    def get_catalogs(logger, shell_command, singer_transform=(lambda catalog: catalog), airbyte_transform=(lambda catalog: catalog)) -> Catalogs:
        completed_process = subprocess.run(shell_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                           universal_newlines=True)

        for line in completed_process.stderr.splitlines():
            logger.log_by_prefix(line, "ERROR")

        airbyte_streams = []
        singer_catalog = singer_transform(json.loads(completed_process.stdout))

        for stream in singer_catalog.get("streams"):
            name = stream.get("stream")
            schema = stream.get("schema")
            airbyte_streams += [AirbyteStream(name=name, json_schema=schema)]

        airbyte_catalog = airbyte_transform(AirbyteCatalog(streams=airbyte_streams))

        return Catalogs(singer_catalog=singer_catalog, airbyte_catalog=airbyte_catalog)
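
A hedged invocation sketch: the tap command and config path are placeholders, and the two transforms default to identity hooks for patching either catalog before use:

catalogs = get_catalogs(
    logger,
    shell_command="tap-exchangeratesapi --config config.json --discover",  # placeholder tap
    singer_transform=lambda catalog: catalog,   # optionally patch the raw Singer catalog
    airbyte_transform=lambda catalog: catalog,  # optionally patch the converted Airbyte catalog
)
singer_side = catalogs.singer_catalog
airbyte_side = catalogs.airbyte_catalog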
Example #19
    def singer_catalog_to_airbyte_catalog(singer_catalog: Dict[str, Any], sync_mode_overrides: Dict[str, SyncModeInfo]) -> AirbyteCatalog:
        """
        :param singer_catalog:
        :param sync_mode_overrides: A dict from stream name to the sync modes it should use. Each stream in this dict must exist in the Singer catalog,
        but not every stream in the catalog needs to exist in this dict.
        :return: Airbyte Catalog
        """
        airbyte_streams = []
        for stream in singer_catalog.get("streams"):
            name = stream.get("stream")
            schema = stream.get("schema")
            airbyte_stream = AirbyteStream(name=name, json_schema=schema)
            if name in sync_mode_overrides:
                override_sync_modes(airbyte_stream, sync_mode_overrides[name])
            else:
                set_sync_modes_from_metadata(airbyte_stream, stream.get("metadata", []))

            airbyte_streams += [airbyte_stream]
        return AirbyteCatalog(streams=airbyte_streams)
Example #20
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        access_token = config["access_token"]
        spreadsheet_id = config["spreadsheet_id"]
        streams = []

        smartsheet_client = smartsheet.Smartsheet(access_token)
        try:
            sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
            sheet = json.loads(str(sheet))  # make it subscriptable
            sheet_json_schema = get_json_schema(sheet)

            logger.info(
                f"Running discovery on sheet: {sheet['name']} with ID {spreadsheet_id}"
            )

            stream = AirbyteStream(name=sheet["name"],
                                   json_schema=sheet_json_schema)
            streams.append(stream)

        except Exception as e:
            raise Exception(f"Could not run discovery: {str(e)}") from e

        return AirbyteCatalog(streams=streams)
Example #21
 def discover(self, logger, config: json) -> AirbyteCatalog:
     """
     Returns an AirbyteCatalog representing the available streams and fields in this integration. For example, given valid credentials to a
     Remote CSV File, returns an Airbyte catalog where each csv file is a stream, and each column is a field.
     """
     storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
     url = SourceFile.get_simple_url(config["url"])
     name = SourceFile.get_stream_name(config)
     logger.info(f"Discovering schema of {name} at {storage}{url}...")
     streams = []
     try:
         # TODO handle discovery of directories of multiple files instead
         if "format" in config and config["format"] == "json":
             schema = SourceFile.load_nested_json_schema(config, logger)
             json_schema = {
                 "$schema": "http://json-schema.org/draft-07/schema#",
                 "type": "object",
                 "properties": schema,
             }
         else:
             # Don't skip data when discovering in order to infer column types
             df_list = SourceFile.load_dataframes(config, logger, skip_data=False)
             fields = {}
             for df in df_list:
                 for col in df.columns:
                     fields[col] = SourceFile.convert_dtype(df[col].dtype)
             json_schema = {
                 "$schema": "http://json-schema.org/draft-07/schema#",
                 "type": "object",
                 "properties": {field: {"type": fields[field]} for field in fields},
             }
         streams.append(AirbyteStream(name=name, json_schema=json_schema))
     except Exception as err:
         reason = f"Failed to discover schemas of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
         logger.error(reason)
         raise err
     return AirbyteCatalog(streams=streams)
Example #22
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source; its content is specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        streams = []

        # GET SPEC TO GRAB DESCRIPTIONS OF FIELDS
        spec = self.spec(logger).connectionSpecification
        defs = spec['definitions']

        def get_spec_def_obj(name):
            return defs[name]

        def get_spec_def_desc(name):
            return defs[name]['description']

        def get_spec_def_type(name):
            return defs[name]['type']

        def get_spec_def_prop(spec_def_name, def_prop_name):
            return defs[spec_def_name][def_prop_name]

        # ADD SCHEMA FOR StreamGetSiteMetaData
        stream_name = StreamGetSiteMetaData
        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "or_site_id": get_spec_def_obj('or_site_id'),
                "site_id": get_spec_def_obj('site_id'),
                "location":{"desription":"describes site location","type":"string"},
                "owner":{"desription":"DEPRECATED","type":"string"},
                "system_id":{"description":"identifies the input system for which the site belongs.", "type":"integer"},
                "client_id":{"description":"identifies the client that owns the input system for which the site belongs.","type":"string"},
                "latitude_dec":{"description":"latitude of site in decimal form","type":"number"},
                "longitude_dec":{"description":"longitude of site in decimal form","type":"number"},
                "elevation":{"description":"elevation of site","type":"number"},
            },
        }
        streams.append(AirbyteStream(name=stream_name,
                                     supported_sync_modes=["full_refresh"],  # incremental not needed for site metadata; small dataset
                                     source_defined_cursor=False,  # small dataset; no source-defined cursor needed
                                     json_schema=json_schema))

        # ADD SCHEMA FOR StreamGetSensorMetaData
        stream_name = StreamGetSensorMetaData
        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "site_id": get_spec_def_obj('site_id'),
                "sensor_id": get_spec_def_obj('sensor_id'),
                "or_site_id": get_spec_def_obj('or_site_id'),
                "or_sensor_id":get_spec_def_obj('or_sensor_id'),
                "location":{"description":"site name","type":"string"},
                "description":{"description":"sensor name", "type":"string"},
                "sensor_class":get_spec_def_obj('class'),
                "sensor_type":{"description":"source type of data","type":"string"},
                "units":get_spec_def_obj('units'),
                "translate":{"description":"text translation enabled", "type":"boolean"}, 
                "precision":{"description":"number of decimals displayed for Reading/Finished value in user interface", "type":"integer"},
                "last_time":{"description":"last data time; see GetSensorData A5","type":"string"},
                "last_value":{"description":"last Reading/Finished; see GetSensorData A8", "type":"number"},
                "last_time_received":{"description":"last data time; see GetSensorData A5", "type":"string"},
                "last_value_received":{"description":"last Reading/Finished value; see GetSensorData A8", "type":"number"},
                "last_raw_value":{"description":"last raw value; see GetSensorData A6", "type":"number"},
                "last_raw_value_received":{"description":"last raw value received; see GetSensorData A6","type":"number"},
                "change_time":{"description":"time of last change to sensor metadata","type":"string"},
                "normal":{"description":"is sensor in normal mode (not timed out)?", "type":"integer"}, # boolean?
                "active":{"description":"is sensor active (not in maintenance mode/out of service)?", "type":"integer"}, #boolean?
                "valid":{"description":"*may* indicate if last value is valid. unknown", "type":"integer"}, #boolean?
                "change_rate":{"description":"DEPRECATED/UNUSED", "type":"number"},
                "time_min_consec_zeros":{"description":"DEPRECATED/UNUSED", "type":"integer"},
                "validation":{"description":"validation protocol for finished value", "type":"string"},
                "value_max":{"description":"validation parameter: maximum value", "type":"number"},
                "value_min":{"description":"validation parameter: minimum value", "type":"number"},
                "delta_pos":{"description":"validation parameter: positive delta", "type":"number"},
                "delta_neg":{"description":"validation parameter: negative delta", "type":"number"},
                "rate_pos":{"description":"DEPRECATED", "type":"integer"},
                "rate_neg":{"description":"DEPRECATED", "type":"integer"},
                "time_max":{"description":"validation parameter: maximum time", "type":"integer"},
                "time_min":{"description":"validation parameter: minimum time", "type":"integer"},
                "slope":{"description":"used in data conversion; multiplicative value", "type":"number"},
                "offset":{"description":"used in data conversion; additive value", "type":"number"},
                "reference":{"description":"used in data conversion; additive value", "type":"number"},
                "utc_offset":{"description":"the numeric offset (in hours) from Universal Coordinated Time", "type":"integer"},
                "using_dst":{"description":"DEPRECATED", "type":"boolean"},
                "conversion":{"description":"conversion protocol for raw to finished value", "type":"string"},
                "usage":{"description":"DEPRECATED/UNUSED", "type":"string"},
                "protocol":{"description":"DEPRECATED/UNUSED", "type":"integer"}  

            }
        } 
        streams.append(AirbyteStream(name=stream_name,
                                     supported_sync_modes=["full_refresh"],  # incremental not needed; small dataset
                                     source_defined_cursor=False,
                                     json_schema=json_schema))

        # ADD STREAM FOR StreamGetSensorData
        stream_name = StreamGetSensorData
        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "site_id":get_spec_def_obj('site_id'),
                "sensor_id":get_spec_def_obj('sensor_id'),
                "or_site_id":get_spec_def_obj('or_site_id'),
                "or_sensor_id":get_spec_def_obj('or_sensor_id'),
                "sensor_class":get_spec_def_obj('class'),
                "data_time": {
                    "type": get_spec_def_type('onerain_datetime'),
                    "description":"date/time data was captured",
                    "pattern":get_spec_def_prop('onerain_datetime','pattern')
                },
                "data_value": {
                    "type":"number",
                    "description":"finished data value with precision (conversion) applied",
                 
                },
                "data_quality": get_spec_def_obj('data_quality'),
                "raw_value": {
                    "type":"number",
                    "description":"this is the value supplied by the source system. It is the value before any conversion or validation is applied.",
                },
                "units": get_spec_def_obj('units')
    
                
            }
        }

        streams.append(AirbyteStream(name=stream_name, 
                                     supported_sync_modes=["full_refresh","incremental"],
                                     source_defined_cursor=True,
                                     json_schema=json_schema))

        return AirbyteCatalog(streams=streams)
Example #23
 def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
     client = self._get_client(config)
     return AirbyteCatalog(streams=client.get_streams())
Example #24
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source; its content is specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        streams = []

        stream_name = "coda_connector"  # Example
        json_schema = {
            "$schema": "http://json-schema.org/draft-04/schema#",
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "items": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "id": {"type": "string"},
                                "type": {"type": "string"},
                                "href": {"type": "string"},
                                "name": {"type": "string"},
                                "index": {"type": "number"},
                                "createdAt": {"type": "string"},
                                "updatedAt": {"type": "string"},
                                "browserLink": {"type": "string"},
                                "values": {
                                    "type": "object",
                                    "properties": {
                                        "dataset_id": {"type": "string"},
                                        "table_name": {"type": "string"},
                                        "client": {"type": "string"},
                                        "client_default_project": {"type": "string"},
                                        "project_id": {"type": "string"},
                                        "table_ref": {"type": "string"},
                                        "description": {"type": "string"},
                                    },
                                },
                            },
                            "required": [
                                "id",
                                "type",
                                "href",
                                "name",
                                "index",
                                "createdAt",
                                "updatedAt",
                                "browserLink",
                                "values",
                            ],
                        },
                    },
                    "href": {"type": "string"},
                    "nextSyncToken": {"type": "string"},
                },
            },
        }

        # Not Implemented

        streams.append(AirbyteStream(name=stream_name, json_schema=json_schema))
        return AirbyteCatalog(streams=streams)
Example #25
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """Discover streams"""
        client = self._get_client(config)

        return AirbyteCatalog(streams=[stream for stream in client.streams])
Example #26
 def get_catalog(self) -> AirbyteCatalog:
     raw_spec = pkgutil.get_data(
         self.__class__.__module__.split(".")[0], "catalog.json")
     return AirbyteCatalog.parse_obj(json.loads(raw_spec))
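
For reference, a minimal shape the packaged catalog.json could take (stream name and schema are illustrative); parse_obj accepts the equivalent dict:

import json

catalog_dict = json.loads("""
{
  "streams": [
    {
      "name": "exchange_rates",
      "json_schema": {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {"date": {"type": "string"}}
      }
    }
  ]
}
""")
catalog = AirbyteCatalog.parse_obj(catalog_dict)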
Example #27
    def discover(self, logger: AirbyteLogger, config_container: ConfigContainer) -> AirbyteCatalog:
        client = self._client(config_container)

        return AirbyteCatalog(streams=client.get_streams())
Example #28
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source; its content is specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        pressure_gwl_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                               'type': 'object',
                               'properties': {'OBJECTID': {'type': 'number'},
                                              'PointID': {'type': 'string'},
                                              'WellID': {'type': 'string'},
                                              'GlobalID': {'type': 'string'},
                                              'DateTimeMeasured': {'type': 'string',
                                                                   'format': 'date-time'},
                                              'DepthToWaterBGS': {'type': 'number'},
                                              'WaterHead': {'type': 'number'},
                                              'WaterHeadAdjusted': {'type': 'number'},
                                              'DataSource': {'type': 'string'},
                                              'MeasuringAgency': {'type': 'string'},
                                              'MeasurementMethod': {'type': 'string'},
                                              }
                               }
        acoustic_gwl_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                               'type': 'object',
                               'properties': {'OBJECTID': {'type': 'number'},
                                              'PointID': {'type': 'string'},
                                              'WellID': {'type': 'string'},
                                              'GlobalID': {'type': 'string'},
                                              'DateTimeMeasured': {'type': 'string',
                                                                   'format': 'date-time'},
                                              'DepthToWaterBGS': {'type': 'number'},
                                              'DataSource': {'type': 'string'},
                                              'MeasuringAgency': {'type': 'string'},
                                              'MeasurementMethod': {'type': 'string'},
                                              'import_uuid': {'type': 'string'}
                                              }
                               }
        manual_gwl_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                             'type': 'object',
                             'properties': {'OBJECTID': {'type': 'number'},
                                            'PointID': {'type': 'string'},
                                            'WellID': {'type': 'string'},
                                            'GlobalID': {'type': 'string'},
                                            'DateTimeMeasured': {'type': 'string',
                                                                 'format': 'date-time'},
                                            'DepthToWaterBGS': {'type': 'number'},
                                            'DepthToWater': {'type': 'number'},
                                            'SiteNotes': {'type': 'string'},
                                            'DataSource': {'type': 'string'},
                                            'MeasuringAgency': {'type': 'string'},
                                            'MeasurementMethod': {'type': 'string'},
                                            'LevelStatus': {'type': 'string'},
                                            'DataQuality': {'type': 'string'},
                                            'MPHeight': {'type': 'number'},
                                            }
                             }

        site_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                       'type': 'object',
                       'properties': {'OBJECTID': {'type': 'number'},
                                      'PointID': {'type': 'string'},
                                      'OSEWellID': {'type': 'string'},
                                      'WellID': {'type': 'string'},
                                      'OSEWelltagID': {'type': 'string'},
                                      'HoleDepth': {'type': 'number'},
                                      'WellDepth': {'type': 'number'},
                                      'DepthSource': {'type': 'string'},
                                      'CompletionDate': {'type': 'string'},
                                      'CompletionSource': {'type': 'string'},
                                      'MeasuringPoint': {'type': 'string'},
                                      'MPHeight': {'type': 'number'},
                                      'CasingDiameter': {'type': 'number'},
                                      'CasingDepth': {'type': 'number'},
                                      'CasingDescription': {'type': 'string'},
                                      'DrillerName': {'type': 'string'},
                                      'ConstructionMethod': {'type': 'string'},
                                      'ConstructionNotes': {'type': 'string'},
                                      'AquiferType': {'type': 'string'},
                                      'AqClass': {'type': 'string'},
                                      'FormationZone': {'type': 'string'},
                                      'StaticWater': {'type': 'number'},
                                      'WaterNotes': {'type': 'string'},
                                      'Status': {'type': 'string'},
                                      'StatusDescription': {'type': 'string'},
                                      'CurrentUse': {'type': 'string'},
                                      'CurrentUseDescription': {'type': 'string'},
                                      'StatusUserNotes': {'type': 'string'},
                                      'MonitoringStatus': {'type': 'string'},
                                      'OpenWellLoggerOK': {'type': 'string'},
                                      'MonitorOK': {'type': 'string'},
                                      'SampleOK': {'type': 'string'},
                                      'DataSource': {'type': 'string'},
                                      'Notes': {'type': 'string'},
                                      'MonitorGroup': {'type': 'number'},
                                      'WellPdf': {'type': 'string'},
                                      'MonitorStatusReason': {'type': 'string'},
                                      'HydrographInterp': {'type': 'string'},
                                      'PrimaryUseSite_USGS': {'type': 'string'},
                                      'PrimaryUseWater_USGS': {'type': 'string'},
                                      'DateCreated': {'type': 'string'},
                                      'SiteNames': {'type': 'string'},
                                      'SiteID': {'type': 'string'},
                                      'AlternateSiteID': {'type': 'string'},
                                      'AlternateSiteID2': {'type': 'string'},
                                      'SiteDate': {'type': 'string'},
                                      'DataReliability': {'type': 'string'},
                                      'Confidential': {'type': 'boolean'},
                                      'SiteType': {'type': 'string'},
                                      'WL_Continuous': {'type': 'boolean'},
                                      'WL_Intermittent': {'type': 'boolean'},
                                      'WaterQuality': {'type': 'boolean'},
                                      'WaterFlow': {'type': 'boolean'},
                                      'Hydraulic': {'type': 'boolean'},
                                      'Subsurface': {'type': 'boolean'},
                                      'WellorSpgNoData': {'type': 'boolean'},
                                      'SubsurfaceType': {'type': 'string'},
                                      'Easting': {'type': 'number'},
                                      'Northing': {'type': 'number'},
                                      'UTMDatum': {'type': 'string'},
                                      'CoordinateNotes': {'type': 'string'},
                                      'Altitude': {'type': 'number'},
                                      'AltitudeAccuracy': {'type': 'string'},
                                      'AltitudeMethod': {'type': 'string'},
                                      'AltDatum': {'type': 'string'},
                                      'Latitude': {'type': 'number'},
                                      'Longitude': {'type': 'number'},
                                      'LatLonDatum': {'type': 'string'},
                                      'CoordinateAccuracy': {'type': 'string'},
                                      'CoordinateMethod': {'type': 'string'},
                                      'Township': {'type': 'number'},
                                      'TownshipDirection': {'type': 'string'},
                                      'Range': {'type': 'string'},
                                      'RangeDirection': {'type': 'string'},
                                      'SectionQuarters': {'type': 'number'},
                                      'SPX': {'type': 'string'},
                                      'SPY': {'type': 'string'},
                                      'QuadName': {'type': 'string'},
                                      'County': {'type': 'string'},
                                      'State': {'type': 'string'},
                                      'LocationNotes': {'type': 'string'},
                                      'WLReportDeliver': {'type': 'string'},
                                      'ChemistryReportDeliver': {'type': 'string'},
                                      'WLReportNote': {'type': 'string'},
                                      'ChemistryReportNote': {'type': 'string'},
                                      'X_NAD83_Zone12': {'type': 'number'},
                                      'Y_NAD83_Zone12': {'type': 'number'},
                                      'projectname': {'type': 'string'},
                                      'USGSProjectID': {'type': 'string'},
                                      'LatitudeDD': {'type': 'string'},
                                      'LongitudeDD': {'type': 'string'},
                                      'PublicRelease': {'type': 'boolean'},
                                      }}

        screens_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                          'type': 'object',
                          'properties': {'WellID': {'type': 'string'},
                                         'WDBID': {'type': 'number'},
                                         'PointID': {'type': 'string'},
                                         'counter': {'type': 'number'},
                                         'ScreenTop': {'type': 'number'},
                                         'ScreenBottom': {'type': 'number'},
                                         'ScreenDescription': {'type': 'string'},
                                         'OBJECTID': {'type': 'number'},
                                         'GlobalID': {'type': 'string'},
                                         }}

        streams = [
            AirbyteStream(name='ManualGWL',
                          supported_sync_modes=["full_refresh", ],
                          source_defined_cursor=True,
                          json_schema=manual_gwl_schema),
            AirbyteStream(name='PressureGWL',
                          supported_sync_modes=["full_refresh", ],
                          source_defined_cursor=True,
                          json_schema=pressure_gwl_schema),
            # AirbyteStream(name='Manual',
            #               supported_sync_modes=["full_refresh", "incremental"],
            #               source_defined_cursor=True,
            #               json_schema=gwl_schema),
            # AirbyteStream(name='Pressure',
            #               supported_sync_modes=["full_refresh", "incremental"],
            #               source_defined_cursor=True,
            #               json_schema=gwl_schema),
            AirbyteStream(name='AcousticGWL',
                          supported_sync_modes=["full_refresh", ],
                          source_defined_cursor=True,
                          json_schema=acoustic_gwl_schema
                          ),
            AirbyteStream(name='WellScreens',
                          supported_sync_modes=["full_refresh", ],
                          source_defined_cursor=True,
                          json_schema=screens_schema),
            AirbyteStream(name='SiteMetaData',
                          supported_sync_modes=['full_refresh', ],
                          source_defined_cursor=True,
                          json_schema=site_schema)]

        return AirbyteCatalog(streams=streams)
Example #29
 def discover(self, logger, config_container) -> AirbyteCatalog:
     logger.info(
         f'Discovering ({config_container.rendered_config_path})...')
     return AirbyteCatalog.from_json(
         pkgutil.get_data(__name__, 'catalog.json'))
Example #30
    def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
        """
        Returns an AirbyteCatalog representing the available streams and fields in this integration.
        For example, given valid credentials to a Postgres database,
        returns an Airbyte catalog where each postgres table is a stream, and each table column is a field.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source; its content is specified in
        the properties of the spec.json file

        :return: AirbyteCatalog is an object describing a list of all available streams in this source.
            A stream is an AirbyteStream object that includes:
            - its stream name (or table name in the case of Postgres)
            - json_schema providing the specifications of expected schema for this stream (a list of columns described
            by their names and types)
        """
        streams = []

        # GET SPEC TO GRAB DESCRIPTIONS OF FIELDS
        spec = self.spec(logger).connectionSpecification
        defs = spec['definitions']

        def get_spec_def_obj(name):
            return defs[name]

        def get_spec_def_desc(name):
            return defs[name]['description']

        def get_spec_def_type(name):
            return defs[name]['type']

        def get_spec_def_prop(spec_def_name, def_prop_name):
            return defs[spec_def_name][def_prop_name]

        # ADD SCHEMA FOR StreamGetSiteMetaData
        stream_name = StreamGetSiteMetaData
        json_schema = {  # Example
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "or_site_id": get_spec_def_obj('or_site_id'),
                "site_id": get_spec_def_obj('site_id'),
                "location":{"desription":"descriptive site location","type":"string"},
                "owner":{"desription":"site owner","type":"string"},
                "system_id":{"description":"system id?", "type":"number"},
                "client_id":{"description":"???","type":"string"},
                "latitude_dec":{"description":"decimal latitude","type":"number"},
                "longitude_dec":{"description":"decimal longitude","type":"number"},
                "elevation":{"description":"site elevation (in units of ???)","type":"number"},
            },
        }
        streams.append(AirbyteStream(name=stream_name,
                                     json_schema=json_schema))

        # ADD SCHEMA FOR StreamGetSensorMetaData
        stream_name = StreamGetSensorMetaData
        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "site_id": get_spec_def_obj('site_id'),
                "sensor_id": get_spec_def_obj('sensor_id'),
                "or_site_id": get_spec_def_obj('or_site_id'),
                "or_sensor_id": get_spec_def_obj('or_sensor_id'),
                "location": {
                    "description": "",
                    "type": "string"
                },
                "description": {
                    "description": "",
                    "type": "string"
                },
                "sensor_class": get_spec_def_obj('class'),
                "sensor_type": {
                    "description": "Sensor type",
                    "type": "string"
                },
                "units": get_spec_def_obj('units'),
                "translate": {
                    "description": "",
                    "type": "boolean"
                },
                "precision": {
                    "description": "",
                    "type": "integer"
                },
                "last_time": {
                    "description": "",
                    "type": "string"
                },
                "last_value": {
                    "description": "",
                    "type": "number"
                },
                "last_time_received": {
                    "description": "",
                    "type": "string"
                },
                "last_value_received": {
                    "description": "",
                    "type": "number"
                },
                "last_raw_value": {
                    "description": "",
                    "type": "number"
                },
                "last_raw_value_received": {
                    "description": "",
                    "type": "number"
                },
                "change_time": {
                    "description": "",
                    "type": "string"
                },
                "normal": {
                    "description": "",
                    "type": "integer"
                },  # boolean?
                "active": {
                    "description": "",
                    "type": "integer"
                },  #boolean?
                "valid": {
                    "description": "",
                    "type": "integer"
                },  #boolean?
                "change_rate": {
                    "description": "",
                    "type": "number"
                },
                "time_min_consec_zeros": {
                    "description": "",
                    "type": "integer"
                },
                "validation": {
                    "description": "",
                    "type": "string"
                },
                "value_max": {
                    "description": "",
                    "type": "number"
                },
                "value_min": {
                    "description": "",
                    "type": "number"
                },
                "delta_pos": {
                    "description": "",
                    "type": "number"
                },
                "delta_neg": {
                    "description": "",
                    "type": "number"
                },
                "rate_pos": {
                    "description": "",
                    "type": "number"
                },
                "rate_neg": {
                    "description": "",
                    "type": "number"
                },
                "time_max": {
                    "description": "",
                    "type": "integer"
                },
                "time_min": {
                    "description": "",
                    "type": "integer"
                },
                "slope": {
                    "description": "",
                    "type": "number"
                },
                "offset": {
                    "description": "",
                    "type": "number"
                },
                "reference": {
                    "description": "",
                    "type": "number"
                },
                "utc_offset": {
                    "description": "",
                    "type": "integer"
                },
                "using_dst": {
                    "description": "",
                    "type": "boolean"
                },
                "conversion": {
                    "description": "",
                    "type": "string"
                },
                "usage": {
                    "description": "",
                    "type": "string"
                },
                "protocol": {
                    "description": "",
                    "type": "integer"
                }
            }
        }
        streams.append(AirbyteStream(name=stream_name,
                                     json_schema=json_schema))

        # ADD STREAM FOR StreamGetSensorData
        stream_name = StreamGetSensorData
        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "site_id": get_spec_def_obj('site_id'),
                "sensor_id": get_spec_def_obj('sensor_id'),
                "or_site_id": get_spec_def_obj('or_site_id'),
                "or_sensor_id": get_spec_def_obj('or_sensor_id'),
                "sensor_class": get_spec_def_obj('class'),
                "data_time": {
                    "type": get_spec_def_type('onerain_datetime'),
                    "description": "date/time data was captured",
                    "pattern": get_spec_def_prop('onerain_datetime', 'pattern')
                },
                "data_value": {
                    "type": "number",
                    "description": "data value",
                },
                "raw_value": {
                    "type": "number",
                    "description": "raw data value",
                },
                "units": get_spec_def_obj('units')
            }
        }

        streams.append(AirbyteStream(name=stream_name,
                                     json_schema=json_schema))

        return AirbyteCatalog(streams=streams)