def coerce_catalog_as_full_refresh(catalog: AirbyteCatalog) -> AirbyteCatalog:
    """
    Return a copy of ``catalog`` in which every stream only supports full refresh.

    Every stream of the returned catalog has ``source_defined_cursor`` set to
    False, ``supported_sync_modes`` set to ``[SyncMode.full_refresh]`` and
    ``default_cursor_field`` cleared. The catalog passed in is left untouched.

    :param catalog: catalog to coerce
    :return: a new AirbyteCatalog, re-parsed from JSON with unset and None fields dropped
    """
    # A deep copy is required here: a shallow copy shares its stream objects
    # with the input, so editing them would silently rewrite the caller's catalog.
    coerced_catalog = catalog.copy(deep=True)
    for stream in coerced_catalog.streams:
        stream.source_defined_cursor = False
        stream.supported_sync_modes = [SyncMode.full_refresh]
        stream.default_cursor_field = None

    # remove nulls
    return AirbyteCatalog.parse_raw(coerced_catalog.json(exclude_unset=True, exclude_none=True))
def test_parse_sheet_and_column_names_from_catalog(self):
    """The helper must map every stream name of a catalog to the set of its column names."""
    columns_by_sheet = {
        "soccer_team": frozenset(["arsenal", "chelsea", "manutd", "liverpool"]),
        "basketball_teams": frozenset(["gsw", "lakers"]),
    }

    # One stream per sheet; every column is declared as a plain string property.
    streams = []
    for sheet_name, column_names in columns_by_sheet.items():
        schema = {"properties": {column: {"type": "string"} for column in column_names}}
        streams.append(AirbyteStream(name=sheet_name, json_schema=schema))

    actual = Helpers.parse_sheet_and_column_names_from_catalog(AirbyteCatalog(streams=streams))

    self.assertEqual(actual, columns_by_sheet)
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Build a catalog with one stream per grid sheet of the configured spreadsheet.

    Sheets are handled on a best-effort basis: a failure while reading or
    converting the header row of one sheet is logged and that sheet is left
    out, so the remaining sheets are still discovered.

    :param logger: logger that receives progress, warning and error messages
    :param config: connector configuration; ``config["spreadsheet_id"]`` is read here
    :return: AirbyteCatalog holding the streams of all sheets that could be processed
    :raises Exception: when the Sheets API answers with an HTTP error
    """
    client = GoogleSheetsClient(self.get_credentials(config))
    spreadsheet_id = config["spreadsheet_id"]
    try:
        logger.info(f"Running discovery on sheet {spreadsheet_id}")
        spreadsheet_metadata = Spreadsheet.parse_obj(client.get(spreadsheetId=spreadsheet_id, includeGridData=False))
        grid_sheets = Helpers.get_grid_sheets(spreadsheet_metadata)
        streams = []
        for sheet_name in grid_sheets:
            try:
                header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
                stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_row_data)
                streams.append(stream)
            except Exception as err:
                # Deliberately broad: one unreadable sheet must not abort discovery.
                if str(err).startswith("Expected data for exactly one row for sheet"):
                    logger.warn(f"Skip empty sheet: {sheet_name}")
                else:
                    logger.error(str(err))
        return AirbyteCatalog(streams=streams)
    except errors.HttpError as err:
        reason = str(err)
        if err.resp.status == status_codes.NOT_FOUND:
            reason = "Requested spreadsheet was not found."
        # Chain explicitly so the original HTTP error stays attached as the cause.
        raise Exception(f"Could not run discovery: {reason}") from err
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Returns an AirbyteCatalog representing the available streams and fields in this integration.

    This is still the template implementation: it always reports one example
    stream called "TableName" with a single string column "columnName".

    :param logger: Logging object to display debug/info/error to the logs
        (logs will not be accessible via airbyte UI if they are not passed to this logger)
    :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file
    :return: AirbyteCatalog listing the available streams; each AirbyteStream carries its
        stream name and a json_schema describing its columns (names and types)
    """
    # Example
    example_schema = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {"columnName": {"type": "string"}},
    }

    # Not Implemented
    example_stream = AirbyteStream(name="TableName", json_schema=example_schema)
    return AirbyteCatalog(streams=[example_stream])
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """
    Yield one RECORD message per row of the configured file.

    Only the columns selected in the catalog that also exist in the loaded
    data are emitted, in the column order of the loaded dataframe.

    :param logger: logger that receives progress and error messages
    :param config_container: object whose ``rendered_config`` holds the source configuration
        (``storage`` and ``url`` are read here)
    :param catalog_path: path of the catalog file listing the selected columns
    :param state_path: only reported in the log line; not otherwise used by this method
    :return: generator of AirbyteMessage records whose stream name is the simplified url
    :raises Exception: any error raised while loading or emitting data is logged and re-raised
    """
    config = config_container.rendered_config
    storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    logger.info(f"Reading ({storage}{url}, {catalog_path}, {state_path})...")
    catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
    selection = SourceFile.parse_catalog(catalog)
    try:
        df_list = SourceFile.load_dataframes(config, logger)
        for df in df_list:
            # pandas does not accept a set as an indexer (TypeError since pandas 2.0),
            # so select with a list. Walking df.columns also keeps the record keys in
            # the dataframe's column order instead of arbitrary set order.
            selected = selection.intersection(set(df.columns))
            columns = [column for column in dict.fromkeys(df.columns) if column in selected]
            for data in df[columns].to_dict(orient="records"):
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=url, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                )
    except Exception as err:
        reason = f"Failed to read data of {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        raise
def discover(self, logger, config_container) -> AirbyteCatalog:
    """
    Infer the schema of the configured file and expose it as a single stream.

    :param logger: logger that receives progress and error messages
    :param config_container: object whose ``rendered_config`` holds the source configuration
        (``storage`` and ``url`` are read here)
    :return: AirbyteCatalog with one stream, named after the simplified url, whose
        properties map each column to its converted dtype
    """
    config = config_container.rendered_config
    storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    logger.info(f"Discovering schema of {storage}{url}...")
    discovered = []
    try:
        # TODO handle discovery of directories of multiple files instead
        # Don't skip data when discovering in order to infer column types
        dataframes = SourceFile.load_dataframes(config, logger, skip_data=False)

        # Columns of all frames are merged; a later frame overrides an earlier one.
        column_types = {}
        for frame in dataframes:
            for column in frame.columns:
                column_types[column] = SourceFile.convert_dtype(frame[column].dtype)

        properties = {column: {"type": dtype} for column, dtype in column_types.items()}
        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": properties,
        }
        discovered.append(AirbyteStream(name=url, json_schema=json_schema))
    except Exception as err:
        reason = f"Failed to discover schemas of {storage}{url}: {err!r}\n{traceback.format_exc()}"
        logger.error(reason)
        raise err
    return AirbyteCatalog(streams=discovered)
def catalog_fixture(configured_catalog: ConfiguredAirbyteCatalog) -> Optional[AirbyteCatalog]:
    """
    Derive a plain catalog from a configured one.

    :param configured_catalog: configured catalog, may be falsy (e.g. None)
    :return: AirbyteCatalog made of the ``stream`` of every configured stream,
        or None when no configured catalog was given
    """
    if not configured_catalog:
        return None

    plain_streams = [configured.stream for configured in configured_catalog.streams]
    return AirbyteCatalog(streams=plain_streams)
def singer_catalog_to_airbyte_catalog(singer_catalog: Dict[str, any]) -> AirbyteCatalog:
    """
    Convert a Singer catalog into an Airbyte catalog.

    A stream is marked as incremental-capable only when its stream-level
    metadata lists at least one entry under "valid-replication-keys".

    :param singer_catalog: Singer catalog dict; its "streams" entry is iterated
    :return: AirbyteCatalog with one AirbyteStream per Singer stream
    """
    converted = []
    for singer_stream in singer_catalog.get("streams"):
        airbyte_stream = AirbyteStream(name=singer_stream.get("stream"), json_schema=singer_stream.get("schema"))
        # Register the stream first; the sync-mode fields are filled in below when applicable.
        converted.append(airbyte_stream)

        stream_metadata = get_stream_level_metadata(singer_stream.get("metadata", []))
        if not stream_metadata:
            continue

        # TODO unclear from the singer spec what behavior should be if there are no valid replication keys,
        # but forced-replication-method is INCREMENTAL.
        # For now requiring replication keys for a stream to be considered incremental.
        replication_keys = stream_metadata.get("valid-replication-keys", [])
        if len(replication_keys) == 0:
            continue

        airbyte_stream.source_defined_cursor = True
        airbyte_stream.supported_sync_modes = [SyncMode.full_refresh, SyncMode.incremental]
        # TODO if there are multiple replication keys, allow configuring which one is used.
        # For now we deterministically take the first (smallest in sort order).
        airbyte_stream.default_cursor_field = [min(replication_keys)]

    return AirbyteCatalog(streams=converted)
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Build a catalog with one stream per sheet of the configured spreadsheet.

    :param logger: logger that receives progress messages
    :param config: connector configuration; ``credentials_json`` (a JSON string)
        and ``spreadsheet_id`` are read here
    :return: AirbyteCatalog with one stream per sheet, derived from each sheet's first row
    :raises Exception: when the Sheets API answers with an HTTP error
    """
    client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))
    spreadsheet_id = config["spreadsheet_id"]
    try:
        logger.info(f"Running discovery on sheet {spreadsheet_id}")
        spreadsheet_metadata = Spreadsheet.parse_obj(
            client.get(spreadsheetId=spreadsheet_id, includeGridData=False).execute()
        )
        sheet_names = [sheet.properties.title for sheet in spreadsheet_metadata.sheets]
        streams = []
        for sheet_name in sheet_names:
            header_row_data = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
            stream = Helpers.headers_to_airbyte_stream(sheet_name, header_row_data)
            streams.append(stream)
        return AirbyteCatalog(streams=streams)
    except errors.HttpError as err:
        reason = str(err)
        if err.resp.status == 404:
            reason = "Requested spreadsheet was not found."
        # Chain explicitly so the original HTTP error stays attached as the cause.
        raise Exception(f"Could not run discovery: {reason}") from err
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Return the fixed catalog of this source: a single stream named
    DATASET_ITEMS_STREAM_NAME whose schema is an unconstrained JSON object.

    :param logger: unused here
    :param config: unused here
    :return: AirbyteCatalog containing that one stream
    """
    items_stream = AirbyteStream(
        name=DATASET_ITEMS_STREAM_NAME,
        json_schema={
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
        },
    )
    return AirbyteCatalog(streams=[items_stream])
def discover(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteCatalog:
    """
    Implements the Discover operation from the Airbyte Specification.
    See https://docs.airbyte.io/architecture/airbyte-specification.

    :param logger: unused here
    :param config: source configuration, forwarded to ``self.streams``
    :return: AirbyteCatalog with the Airbyte representation of every stream of this source
    """
    airbyte_streams = []
    for source_stream in self.streams(config=config):
        airbyte_streams.append(source_stream.as_airbyte_stream())
    return AirbyteCatalog(streams=airbyte_streams)
def singer_catalog_to_airbyte_catalog(singer_catalog: Dict[str, any]) -> AirbyteCatalog:
    """
    Convert a Singer catalog into an Airbyte catalog.

    :param singer_catalog: Singer catalog dict; its "streams" entry is iterated
    :return: AirbyteCatalog with one AirbyteStream (name and json schema) per Singer stream
    """
    converted = [
        AirbyteStream(name=singer_stream.get("stream"), json_schema=singer_stream.get("schema"))
        for singer_stream in singer_catalog.get("streams")
    ]
    return AirbyteCatalog(streams=converted)
def discover(self, logger: AirbyteLogger, config: Mapping[str, Any]) -> AirbyteCatalog:
    """
    Discover streams.

    :param logger: unused here
    :param config: source configuration, forwarded to ``self.streams``
    :return: AirbyteCatalog with the Airbyte representation of every stream of this source
    """
    discovered = []
    for source_stream in self.streams(config=config):
        discovered.append(source_stream.as_airbyte_stream())
    return AirbyteCatalog(streams=discovered)
def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """
    Yield the rows of every selected sheet as RECORD messages.

    Rows are fetched in batches starting below the header row. Reading of a
    sheet stops at the first empty batch or the first blank row.

    :param logger: logger that receives progress messages
    :param config_container: object whose ``rendered_config`` holds ``credentials_json``
        (a JSON string) and ``spreadsheet_id``
    :param catalog_path: path of the catalog file naming the sheets and columns to sync
    :param state: not used by this method
    :return: generator of AirbyteMessage records
    """
    config = config_container.rendered_config
    client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))
    catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
    sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
    spreadsheet_id = config["spreadsheet_id"]

    logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
    # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
    # a blank row, emit the row batch
    sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
        client, spreadsheet_id, sheet_to_column_name
    )
    for sheet, column_index_to_name in sheet_to_column_index_to_name.items():
        logger.info(f"Syncing sheet {sheet}")
        row_cursor = 2  # we start syncing past the header row
        encountered_blank_row = False
        while not encountered_blank_row:
            batch_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
            logger.info(f"Fetching range {batch_range}")
            response = client.values().batchGet(
                spreadsheetId=spreadsheet_id, ranges=batch_range, majorDimension="ROWS"
            ).execute()
            row_batch = SpreadsheetValues.parse_obj(response)
            # The requested range is inclusive on both ends, hence the extra row.
            row_cursor += ROW_BATCH_SIZE + 1

            # there should always be one range since we requested only one
            row_values = row_batch.valueRanges[0].values
            if not row_values:
                # Missing or empty batch: nothing left in this sheet.
                break

            for row in row_values:
                if Helpers.is_row_empty(row):
                    encountered_blank_row = True
                    break
                if Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name),
                    )
    logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Return the fixed catalog of this source: one stream named
    ``SourceHttpRequest.STREAM_NAME`` with a permissive object schema.

    :param logger: unused here
    :param config: unused here
    :return: AirbyteCatalog containing that one stream
    """
    # json body will be returned as the "data" stream". we can't know its schema ahead of time,
    # so we assume it's object (i.e. valid json).
    # todo (cgardens) - remove data column. added to handle UI bug where streams without fields cannot be selected.
    # issue: https://github.com/airbytehq/airbyte/issues/1104
    properties = {"data": {"type": "object"}}
    response_stream = AirbyteStream(
        name=SourceHttpRequest.STREAM_NAME,
        json_schema={
            "$schema": "http://json-schema.org/draft-07/schema#",
            "additionalProperties": True,
            "type": "object",
            "properties": properties,
        },
    )
    return AirbyteCatalog(streams=[response_stream])
def discover(self, logger: AirbyteLogger, config_container) -> AirbyteCatalog:
    """
    Return the fixed catalog of this source: one stream named
    ``SourceRestApi.STREAM_NAME`` with a permissive object schema and no declared properties.

    :param logger: unused here
    :param config_container: unused here
    :return: AirbyteCatalog containing that one stream
    """
    # json body will be returned as the "data" stream". we can't know its schema ahead of time,
    # so we assume it's object (i.e. valid json).
    response_stream = AirbyteStream(
        name=SourceRestApi.STREAM_NAME,
        json_schema={
            "$schema": "http://json-schema.org/draft-07/schema#",
            "additionalProperties": True,
            "type": "object",
            "properties": {},
        },
    )
    return AirbyteCatalog(streams=[response_stream])
def discover(self, logger: AirbyteLogger, config: Mapping) -> AirbyteCatalog:
    """
    Returns an AirbyteCatalog representing the available streams and fields in this integration.
    For example, given valid credentials to a Remote CSV File, returns an Airbyte catalog
    where each csv file is a stream, and each column is a field.

    :param logger: logger that receives progress and error messages
    :param config: source configuration used to build the client
    :return: AirbyteCatalog with every stream reported by the client
    """
    client = self._get_client(config)
    name = client.stream_name

    logger.info(f"Discovering schema of {name} at {client.reader.full_url}...")
    try:
        discovered = [*client.streams]
    except Exception as err:
        # Log the full traceback before handing the error back to the caller.
        reason = f"Failed to discover schemas of {name} at {client.reader.full_url}: {err!r}\n{traceback.format_exc()}"
        logger.error(reason)
        raise err
    else:
        return AirbyteCatalog(streams=discovered)
def get_catalogs(logger, shell_command, singer_transform=(lambda catalog: catalog), airbyte_transform=(lambda catalog: catalog)) -> Catalogs:
    """
    Run a discovery command and return both the Singer and the Airbyte catalog.

    :param logger: logger; every stderr line of the command is passed to ``log_by_prefix`` with "ERROR"
    :param shell_command: command line, executed through the shell (``shell=True``)
    :param singer_transform: applied to the JSON-decoded stdout of the command
    :param airbyte_transform: applied to the AirbyteCatalog built from the Singer streams
    :return: Catalogs holding the transformed Singer catalog and the transformed Airbyte catalog
    """
    # NOTE(review): shell_command goes through the shell unescaped - callers must only pass trusted input.
    completed_process = subprocess.run(
        shell_command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    for stderr_line in completed_process.stderr.splitlines():
        logger.log_by_prefix(stderr_line, "ERROR")

    singer_catalog = singer_transform(json.loads(completed_process.stdout))
    converted = [
        AirbyteStream(name=singer_stream.get("stream"), json_schema=singer_stream.get("schema"))
        for singer_stream in singer_catalog.get("streams")
    ]
    airbyte_catalog = airbyte_transform(AirbyteCatalog(streams=converted))

    return Catalogs(singer_catalog=singer_catalog, airbyte_catalog=airbyte_catalog)
def singer_catalog_to_airbyte_catalog(singer_catalog: Dict[str, any], sync_mode_overrides: Dict[str, SyncModeInfo]) -> AirbyteCatalog:
    """
    Convert a Singer catalog into an Airbyte catalog, honouring per-stream sync mode overrides.

    :param singer_catalog: Singer catalog dict; its "streams" entry is iterated
    :param sync_mode_overrides: A dict from stream name to the sync modes it should use. Each stream in this dict
        must exist in the Singer catalog, but not every stream in the catalog should exist in this
    :return: Airbyte Catalog
    """
    converted = []
    for singer_stream in singer_catalog.get("streams"):
        stream_name = singer_stream.get("stream")
        airbyte_stream = AirbyteStream(name=stream_name, json_schema=singer_stream.get("schema"))

        if stream_name not in sync_mode_overrides:
            # No override configured: derive the sync modes from the Singer metadata.
            set_sync_modes_from_metadata(airbyte_stream, singer_stream.get("metadata", []))
        else:
            override_sync_modes(airbyte_stream, sync_mode_overrides[stream_name])

        converted.append(airbyte_stream)

    return AirbyteCatalog(streams=converted)
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Build a catalog with a single stream for the configured sheet.

    :param logger: logger that receives progress messages
    :param config: connector configuration; ``access_token`` and ``spreadsheet_id`` are read here
    :return: AirbyteCatalog with one stream named after the sheet
    :raises Exception: wraps any error raised while fetching or converting the sheet
    """
    access_token = config["access_token"]
    spreadsheet_id = config["spreadsheet_id"]
    streams = []

    smartsheet_client = smartsheet.Smartsheet(access_token)
    try:
        sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
        sheet = json.loads(str(sheet))  # make it subscriptable
        sheet_json_schema = get_json_schema(sheet)

        logger.info(f"Running discovery on sheet: {sheet['name']} with {spreadsheet_id}")

        stream = AirbyteStream(name=sheet["name"], json_schema=sheet_json_schema)
        streams.append(stream)
    except Exception as e:
        # Chain explicitly so the underlying error stays attached as the cause.
        raise Exception(f"Could not run discovery: {str(e)}") from e

    return AirbyteCatalog(streams=streams)
def discover(self, logger, config: json) -> AirbyteCatalog:
    """
    Returns an AirbyteCatalog representing the available streams and fields in this integration.
    For example, given valid credentials to a Remote CSV File, returns an Airbyte catalog
    where each csv file is a stream, and each column is a field.

    :param logger: logger that receives progress and error messages
    :param config: source configuration; ``provider.storage``, ``url`` and the optional ``format`` are read here
    :return: AirbyteCatalog with a single stream describing the file
    """
    storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    name = SourceFile.get_stream_name(config)
    logger.info(f"Discovering schema of {name} at {storage}{url}...")
    discovered = []
    try:
        # TODO handle discovery of directories of multiple files instead
        is_json = "format" in config and config["format"] == "json"
        if is_json:
            properties = SourceFile.load_nested_json_schema(config, logger)
        else:
            # Don't skip data when discovering in order to infer column types
            dataframes = SourceFile.load_dataframes(config, logger, skip_data=False)
            # Columns of all frames are merged; a later frame overrides an earlier one.
            column_types = {}
            for frame in dataframes:
                for column in frame.columns:
                    column_types[column] = SourceFile.convert_dtype(frame[column].dtype)
            properties = {column: {"type": dtype} for column, dtype in column_types.items()}

        json_schema = {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": properties,
        }
        discovered.append(AirbyteStream(name=name, json_schema=json_schema))
    except Exception as err:
        reason = f"Failed to discover schemas of {name} at {storage}{url}: {err!r}\n{traceback.format_exc()}"
        logger.error(reason)
        raise err
    return AirbyteCatalog(streams=discovered)
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Returns an AirbyteCatalog representing the available streams and fields in this integration.

    Three streams are declared, named by the constants StreamGetSiteMetaData,
    StreamGetSensorMetaData and StreamGetSensorData. The two metadata streams
    are full-refresh only; the sensor data stream also supports incremental
    sync with a source-defined cursor. Field definitions shared with the
    connector spec are taken from the ``definitions`` section of the spec.

    :param logger: Logging object to display debug/info/error to the logs
        (logs will not be accessible via airbyte UI if they are not passed to this logger)
    :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file
    :return: AirbyteCatalog is an object describing a list of all available streams in this source.
        A stream is an AirbyteStream object that includes:
        - its stream name
        - json_schema providing the specifications of expected schema for this stream (a list of columns described
          by their names and types)
    """
    streams = []

    # GET SPEC TO GRAB DESCRIPTIONS OF FIELDS
    spec = self.spec(logger).connectionSpecification
    defs = spec['definitions']

    def get_spec_def_obj(name):
        # Whole definition object for ``name`` from the spec.
        return defs[name]

    def get_spec_def_type(name):
        # Only the "type" of the definition ``name``.
        return defs[name]['type']

    def get_spec_def_prop(spec_def_name, def_prop_name):
        # A single property of the definition ``spec_def_name``.
        return defs[spec_def_name][def_prop_name]

    # ADD SCHEMA FOR StreamGetSiteMetaData
    stream_name = StreamGetSiteMetaData
    json_schema = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "or_site_id": get_spec_def_obj('or_site_id'),
            "site_id": get_spec_def_obj('site_id'),
            # "description" was misspelled "desription" on the next two fields,
            # so their descriptions never showed up as schema annotations.
            "location": {"description": "describes site location", "type": "string"},
            "owner": {"description": "DEPRECATED", "type": "string"},
            "system_id": {"description": "identifies the input system for which the site belongs.", "type": "integer"},
            "client_id": {"description": "identifies the client that owns the input system for which the site belongs.", "type": "string"},
            "latitude_dec": {"description": "latitude of site in decimal form", "type": "number"},
            "longitude_dec": {"description": "longitude of site in decimal form", "type": "number"},
            "elevation": {"description": "elevation of site", "type": "number"},
        },
    }
    streams.append(AirbyteStream(name=stream_name,
                                 supported_sync_modes=["full_refresh"],  # don't need incremental for site metadata. small dataset
                                 source_defined_cursor=False,  # small dataset don't need
                                 json_schema=json_schema))

    # ADD SCHEMA FOR StreamGetSensorMetaData
    stream_name = StreamGetSensorMetaData
    json_schema = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "site_id": get_spec_def_obj('site_id'),
            "sensor_id": get_spec_def_obj('sensor_id'),
            "or_site_id": get_spec_def_obj('or_site_id'),
            "or_sensor_id": get_spec_def_obj('or_sensor_id'),
            "location": {"description": "site name", "type": "string"},
            "description": {"description": "sensor name", "type": "string"},
            "sensor_class": get_spec_def_obj('class'),
            "sensor_type": {"description": "source type of data", "type": "string"},
            "units": get_spec_def_obj('units'),
            "translate": {"description": "text translation enabled", "type": "boolean"},
            "precision": {"description": "number of decimals displayed for Reading/Finished value in user interface", "type": "integer"},
            "last_time": {"description": "last data time; see GetSensorData A5", "type": "string"},
            "last_value": {"description": "last Reading/Finished; see GetSensorData A8", "type": "number"},
            "last_time_received": {"description": "last data time; see GetSensorData A5", "type": "string"},
            "last_value_received": {"description": "last Reading/Finished value; see GetSensorData A8", "type": "number"},
            "last_raw_value": {"description": "last raw value; see GetSensorData A6", "type": "number"},
            "last_raw_value_received": {"description": "last raw value received; see GetSensorData A6", "type": "number"},
            "change_time": {"description": "time of last change to sensor metadata", "type": "string"},
            "normal": {"description": "is sensor in normal mode (not timed out)?", "type": "integer"},  # boolean?
            "active": {"description": "is sensor active (not in maintenance mode/out of service)?", "type": "integer"},  # boolean?
            "valid": {"description": "*may* indicate if last value is valid. unknown", "type": "integer"},  # boolean?
            "change_rate": {"description": "DEPRECATED/UNUSED", "type": "number"},
            "time_min_consec_zeros": {"description": "DEPRECATED/UNUSED", "type": "integer"},
            "validation": {"description": "validation protocol for finished value", "type": "string"},
            "value_max": {"description": "validation parameter: maximum value", "type": "number"},
            "value_min": {"description": "validation parameter: minimum value", "type": "number"},
            "delta_pos": {"description": "validation parameter: positive delta", "type": "number"},
            "delta_neg": {"description": "validation parameter: negative delta", "type": "number"},
            "rate_pos": {"description": "DEPRECATED", "type": "integer"},
            "rate_neg": {"description": "DEPRECATED", "type": "integer"},
            "time_max": {"description": "validation parameter: maximum time", "type": "integer"},
            "time_min": {"description": "validation parameter: minimum time", "type": "integer"},
            "slope": {"description": "used in data conversion; multiplicative value", "type": "number"},
            "offset": {"description": "used in data conversion; additive value", "type": "number"},
            "reference": {"description": "used in data conversion; additive value", "type": "number"},
            "utc_offset": {"description": "the numeric offset (in hours) from Universal Coordinated Time", "type": "integer"},
            "using_dst": {"description": "DEPRECATED", "type": "boolean"},
            "conversion": {"description": "conversion protocol for raw to finished value", "type": "string"},
            "usage": {"description": "DEPRECATED/UNUSED", "type": "string"},
            "protocol": {"description": "DEPRECATED/UNUSED", "type": "integer"},
        },
    }
    streams.append(AirbyteStream(name=stream_name,
                                 supported_sync_modes=["full_refresh"],  # don't need incremental. small dataset
                                 source_defined_cursor=False,
                                 json_schema=json_schema))

    # ADD STREAM FOR StreamGetSensorData
    stream_name = StreamGetSensorData
    json_schema = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            "site_id": get_spec_def_obj('site_id'),
            "sensor_id": get_spec_def_obj('sensor_id'),
            "or_site_id": get_spec_def_obj('or_site_id'),
            "or_sensor_id": get_spec_def_obj('or_sensor_id'),
            "sensor_class": get_spec_def_obj('class'),
            "data_time": {
                "type": get_spec_def_type('onerain_datetime'),
                "description": "date/time data was captured",
                "pattern": get_spec_def_prop('onerain_datetime', 'pattern'),
            },
            "data_value": {
                "type": "number",
                "description": "finished data value with precision (conversion) applied",
            },
            "data_quality": get_spec_def_obj('data_quality'),
            "raw_value": {
                "type": "number",
                "description": "this is the value supplied by the source system. It is the value before any conversion or validation is applied.",
            },
            "units": get_spec_def_obj('units'),
        },
    }
    streams.append(AirbyteStream(name=stream_name,
                                 supported_sync_modes=["full_refresh", "incremental"],
                                 source_defined_cursor=True,
                                 json_schema=json_schema))

    return AirbyteCatalog(streams=streams)
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Return the catalog of streams reported by the API client.

    :param logger: unused here
    :param config: source configuration used to build the client
    :return: AirbyteCatalog wrapping the result of ``client.get_streams()``
    """
    available_streams = self._get_client(config).get_streams()
    return AirbyteCatalog(streams=available_streams)
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Returns an AirbyteCatalog representing the available streams and fields in this integration.

    The catalog is fixed: a single stream named "coda_connector" whose schema
    is an array of objects, each holding an "items" array of row objects plus
    "href" and "nextSyncToken" strings.

    :param logger: Logging object to display debug/info/error to the logs
        (logs will not be accessible via airbyte UI if they are not passed to this logger)
    :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file
    :return: AirbyteCatalog is an object describing a list of all available streams in this source.
        A stream is an AirbyteStream object that includes its stream name and a json_schema
        describing the expected records
    """
    # Every entry under "values" is a plain string field.
    values_properties = {
        field: {"type": "string"}
        for field in (
            "dataset_id",
            "table_name",
            "client",
            "client_default_project",
            "project_id",
            "table_ref",
            "description",
        )
    }

    # Schema of one element of the "items" array.
    row_schema = {
        "type": "object",
        "properties": {
            "id": {"type": "string"},
            "type": {"type": "string"},
            "href": {"type": "string"},
            "name": {"type": "string"},
            "index": {"type": "number"},
            "createdAt": {"type": "string"},
            "updatedAt": {"type": "string"},
            "browserLink": {"type": "string"},
            "values": {"type": "object", "properties": values_properties},
        },
        "required": [
            "id",
            "type",
            "href",
            "name",
            "index",
            "createdAt",
            "updatedAt",
            "browserLink",
            "values",
        ],
    }

    # Example
    json_schema = {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "items": {"type": "array", "items": row_schema},
                "href": {"type": "string"},
                "nextSyncToken": {"type": "string"},
            },
        },
    }

    # Not Implemented
    coda_stream = AirbyteStream(name="coda_connector", json_schema=json_schema)
    return AirbyteCatalog(streams=[coda_stream])
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Discover streams.

    :param logger: unused here
    :param config: source configuration used to build the client
    :return: AirbyteCatalog listing every stream exposed by ``client.streams``
    """
    available_streams = list(self._get_client(config).streams)
    return AirbyteCatalog(streams=available_streams)
def get_catalog(self) -> AirbyteCatalog:
    """
    Load the catalog from the "catalog.json" resource of this class's top-level package.

    :return: AirbyteCatalog parsed from the JSON content of that resource
    """
    # The resource is looked up in the first component of the module path.
    top_level_package = self.__class__.__module__.partition(".")[0]
    raw_catalog = pkgutil.get_data(top_level_package, "catalog.json")
    catalog_dict = json.loads(raw_catalog)
    return AirbyteCatalog.parse_obj(catalog_dict)
def discover(self, logger: AirbyteLogger, config_container: ConfigContainer) -> AirbyteCatalog:
    """
    Return the catalog of streams reported by the API client.

    :param logger: unused here
    :param config_container: configuration container used to build the client
    :return: AirbyteCatalog wrapping the result of ``client.get_streams()``
    """
    available_streams = self._client(config_container).get_streams()
    return AirbyteCatalog(streams=available_streams)
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Returns an AirbyteCatalog representing the available streams and fields in this integration.

    The catalog is hard-coded. Five streams are declared - ManualGWL,
    PressureGWL, AcousticGWL, WellScreens and SiteMetaData - each with a fixed
    draft-07 JSON schema and full-refresh as its only supported sync mode.
    Neither ``logger`` nor ``config`` is read here.

    :param logger: Logging object to display debug/info/error to the logs
        (logs will not be accessible via airbyte UI if they are not passed to this logger)
    :param config: Json object containing the configuration of this source, content of this json is as specified in
        the properties of the spec.json file
    :return: AirbyteCatalog is an object describing a list of all available streams in this source.
        A stream is an AirbyteStream object that includes:
        - its stream name
        - json_schema providing the specifications of expected schema for this stream (a list of columns described
          by their names and types)
    """
    # Schema of the PressureGWL stream.
    pressure_gwl_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                           'type': 'object',
                           'properties': {'OBJECTID': {'type': 'number'},
                                          'PointID': {'type': 'string'},
                                          'WellID': {'type': 'string'},
                                          'GlobalID': {'type': 'string'},
                                          'DateTimeMeasured': {'type': 'string', 'format': 'date-time'},
                                          'DepthToWaterBGS': {'type': 'number'},
                                          'WaterHead': {'type': 'number'},
                                          'WaterHeadAdjusted': {'type': 'number'},
                                          'DataSource': {'type': 'string'},
                                          'MeasuringAgency': {'type': 'string'},
                                          'MeasurementMethod': {'type': 'string'},
                                          }
                           }

    # Schema of the AcousticGWL stream.
    acoustic_gwl_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                           'type': 'object',
                           'properties': {'OBJECTID': {'type': 'number'},
                                          'PointID': {'type': 'string'},
                                          'WellID': {'type': 'string'},
                                          'GlobalID': {'type': 'string'},
                                          'DateTimeMeasured': {'type': 'string', 'format': 'date-time'},
                                          'DepthToWaterBGS': {'type': 'number'},
                                          'DataSource': {'type': 'string'},
                                          'MeasuringAgency': {'type': 'string'},
                                          'MeasurementMethod': {'type': 'string'},
                                          'import_uuid': {'type': 'string'}
                                          }
                           }

    # Schema of the ManualGWL stream.
    manual_gwl_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                         'type': 'object',
                         'properties': {'OBJECTID': {'type': 'number'},
                                        'PointID': {'type': 'string'},
                                        'WellID': {'type': 'string'},
                                        'GlobalID': {'type': 'string'},
                                        'DateTimeMeasured': {'type': 'string', 'format': 'date-time'},
                                        'DepthToWaterBGS': {'type': 'number'},
                                        'DepthToWater': {'type': 'number'},
                                        'SiteNotes': {'type': 'string'},
                                        'DataSource': {'type': 'string'},
                                        'MeasuringAgency': {'type': 'string'},
                                        'MeasurementMethod': {'type': 'string'},
                                        'LevelStatus': {'type': 'string'},
                                        'DataQuality': {'type': 'string'},
                                        'MPHeight': {'type': 'number'},
                                        }
                         }

    # Schema of the SiteMetaData stream.
    site_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                   'type': 'object',
                   'properties': {'OBJECTID': {'type': 'number'},
                                  'PointID': {'type': 'string'},
                                  'OSEWellID': {'type': 'string'},
                                  'WellID': {'type': 'string'},
                                  'OSEWelltagID': {'type': 'string'},
                                  'HoleDepth': {'type': 'number'},
                                  'WellDepth': {'type': 'number'},
                                  'DepthSource': {'type': 'string'},
                                  'CompletionDate': {'type': 'string'},
                                  'CompletionSource': {'type': 'string'},
                                  'MeasuringPoint': {'type': 'string'},
                                  'MPHeight': {'type': 'number'},
                                  'CasingDiameter': {'type': 'number'},
                                  'CasingDepth': {'type': 'number'},
                                  'CasingDescription': {'type': 'string'},
                                  'DrillerName': {'type': 'string'},
                                  'ConstructionMethod': {'type': 'string'},
                                  'ConstructionNotes': {'type': 'string'},
                                  'AquiferType': {'type': 'string'},
                                  'AqClass': {'type': 'string'},
                                  'FormationZone': {'type': 'string'},
                                  'StaticWater': {'type': 'number'},
                                  'WaterNotes': {'type': 'string'},
                                  'Status': {'type': 'string'},
                                  'StatusDescription': {'type': 'string'},
                                  'CurrentUse': {'type': 'string'},
                                  'CurrentUseDescription': {'type': 'string'},
                                  'StatusUserNotes': {'type': 'string'},
                                  'MonitoringStatus': {'type': 'string'},
                                  'OpenWellLoggerOK': {'type': 'string'},
                                  'MonitorOK': {'type': 'string'},
                                  'SampleOK': {'type': 'string'},
                                  'DataSource': {'type': 'string'},
                                  'Notes': {'type': 'string'},
                                  'MonitorGroup': {'type': 'number'},
                                  'WellPdf': {'type': 'string'},
                                  'MonitorStatusReason': {'type': 'string'},
                                  'HydrographInterp': {'type': 'string'},
                                  'PrimaryUseSite_USGS': {'type': 'string'},
                                  'PrimaryUseWater_USGS': {'type': 'string'},
                                  'DateCreated': {'type': 'string'},
                                  'SiteNames': {'type': 'string'},
                                  'SiteID': {'type': 'string'},
                                  'AlternateSiteID': {'type': 'string'},
                                  'AlternateSiteID2': {'type': 'string'},
                                  'SiteDate': {'type': 'string'},
                                  'DataReliability': {'type': 'string'},
                                  'Confidential': {'type': 'boolean'},
                                  'SiteType': {'type': 'string'},
                                  'WL_Continuous': {'type': 'boolean'},
                                  'WL_Intermittent': {'type': 'boolean'},
                                  'WaterQuality': {'type': 'boolean'},
                                  'WaterFlow': {'type': 'boolean'},
                                  'Hydraulic': {'type': 'boolean'},
                                  'Subsurface': {'type': 'boolean'},
                                  'WellorSpgNoData': {'type': 'boolean'},
                                  'SubsurfaceType': {'type': 'string'},
                                  'Easting': {'type': 'number'},
                                  'Northing': {'type': 'number'},
                                  'UTMDatum': {'type': 'string'},
                                  'CoordinateNotes': {'type': 'string'},
                                  'Altitude': {'type': 'number'},
                                  'AltitudeAccuracy': {'type': 'string'},
                                  'AltitudeMethod': {'type': 'string'},
                                  'AltDatum': {'type': 'string'},
                                  'Latitude': {'type': 'number'},
                                  'Longitude': {'type': 'number'},
                                  'LatLonDatum': {'type': 'string'},
                                  'CoordinateAccuracy': {'type': 'string'},
                                  'CoordinateMethod': {'type': 'string'},
                                  'Township': {'type': 'number'},
                                  'TownshipDirection': {'type': 'string'},
                                  'Range': {'type': 'string'},
                                  'RangeDirection': {'type': 'string'},
                                  'SectionQuarters': {'type': 'number'},
                                  'SPX': {'type': 'string'},
                                  'SPY': {'type': 'string'},
                                  'QuadName': {'type': 'string'},
                                  'County': {'type': 'string'},
                                  'State': {'type': 'string'},
                                  'LocationNotes': {'type': 'string'},
                                  'WLReportDeliver': {'type': 'string'},
                                  'ChemistryReportDeliver': {'type': 'string'},
                                  'WLReportNote': {'type': 'string'},
                                  'ChemistryReportNote': {'type': 'string'},
                                  'X_NAD83_Zone12': {'type': 'number'},
                                  'Y_NAD83_Zone12': {'type': 'number'},
                                  'projectname': {'type': 'string'},
                                  'USGSProjectID': {'type': 'string'},
                                  'LatitudeDD': {'type': 'string'},
                                  'LongitudeDD': {'type': 'string'},
                                  'PublicRelease': {'type': 'boolean'},
                                  }}

    # Schema of the WellScreens stream.
    screens_schema = {'$schema': 'http://json-schema.org/draft-07/schema#',
                      'type': 'object',
                      'properties': {'WellID': {'type': 'string'},
                                     'WDBID': {'type': 'number'},
                                     'PointID': {'type': 'string'},
                                     'counter': {'type': 'number'},
                                     'ScreenTop': {'type': 'number'},
                                     'ScreenBottom': {'type': 'number'},
                                     'ScreenDescription': {'type': 'string'},
                                     'OBJECTID': {'type': 'number'},
                                     'GlobalID': {'type': 'string'},
                                     }}

    # NOTE(review): every stream sets source_defined_cursor=True while only
    # full_refresh is listed as a supported sync mode - confirm this is intended.
    streams = [
        AirbyteStream(name='ManualGWL',
                      supported_sync_modes=["full_refresh", ],
                      source_defined_cursor=True,
                      json_schema=manual_gwl_schema),
        AirbyteStream(name='PressureGWL',
                      supported_sync_modes=["full_refresh", ],
                      source_defined_cursor=True,
                      json_schema=pressure_gwl_schema),
        # Disabled stream definitions, kept from the original for reference:
        # AirbyteStream(name='Manual',
        #               supported_sync_modes=["full_refresh", "incremental"],
        #               source_defined_cursor=True,
        #               json_schema=gwl_schema),
        # AirbyteStream(name='Pressure',
        #               supported_sync_modes=["full_refresh", "incremental"],
        #               source_defined_cursor=True,
        #               json_schema=gwl_schema),
        AirbyteStream(name='AcousticGWL',
                      supported_sync_modes=["full_refresh", ],
                      source_defined_cursor=True,
                      json_schema=acoustic_gwl_schema
                      ),
        AirbyteStream(name='WellScreens',
                      supported_sync_modes=["full_refresh", ],
                      source_defined_cursor=True,
                      json_schema=screens_schema),
        AirbyteStream(name='SiteMetaData',
                      supported_sync_modes=['full_refresh', ],
                      source_defined_cursor=True,
                      json_schema=site_schema)]

    return AirbyteCatalog(streams=streams)
def discover(self, logger, config_container) -> AirbyteCatalog:
    """Build the catalog from the ``catalog.json`` resource of this module's package.

    :param logger: logger used to report that discovery has started.
    :param config_container: configuration wrapper; only its
        ``rendered_config_path`` attribute is read, for the log message.
    :return: the AirbyteCatalog parsed from the raw ``catalog.json`` payload.
    """
    start_message = f'Discovering ({config_container.rendered_config_path})...'
    logger.info(start_message)

    # The catalog is static: it is read as package data rather than
    # derived from the configuration.
    raw_catalog = pkgutil.get_data(__name__, 'catalog.json')
    return AirbyteCatalog.from_json(raw_catalog)
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """
    Returns an AirbyteCatalog describing the streams exposed by this source.

    Three streams are declared: site metadata, sensor metadata and sensor
    data. Field definitions that also appear in the connector spec are reused
    from the ``definitions`` section of ``connectionSpecification`` so that
    the catalog and the spec stay in sync.

    :param logger: Logging object; it is passed to ``self.spec`` to obtain
        the connector specification.
    :param config: Json object containing the configuration of this source.
        It is not consulted here; the catalog is static.
    :return: AirbyteCatalog listing one AirbyteStream per stream, each with
        its stream name and a json_schema describing its fields.
    :raises KeyError: if a definition referenced below is missing from the
        spec (assuming ``definitions`` is a plain dict).
    """
    # Grab the shared field definitions from the spec.
    defs = self.spec(logger).connectionSpecification['definitions']

    def get_spec_def_obj(name):
        """Return the whole spec definition object for ``name``."""
        return defs[name]

    def get_spec_def_prop(spec_def_name, def_prop_name):
        """Return a single property of the spec definition ``spec_def_name``."""
        return defs[spec_def_name][def_prop_name]

    def undocumented(json_type):
        """Return a field schema of ``json_type`` with an empty description."""
        return {"description": "", "type": json_type}

    schema_uri = "http://json-schema.org/draft-07/schema#"
    streams = []

    # Schema for StreamGetSiteMetaData
    site_metadata_schema = {
        "$schema": schema_uri,
        "type": "object",
        "properties": {
            "or_site_id": get_spec_def_obj('or_site_id'),
            "site_id": get_spec_def_obj('site_id'),
            "location": {"description": "descriptive site location", "type": "string"},
            "owner": {"description": "site owner", "type": "string"},
            "system_id": {"description": "system id?", "type": "number"},
            "client_id": {"description": "???", "type": "string"},
            "latitude_dec": {"description": "decimal latitude", "type": "number"},
            "longitude_dec": {"description": "decimal longitude", "type": "number"},
            "elevation": {"description": "site elevation (in units of ???)", "type": "number"},
        },
    }
    streams.append(AirbyteStream(name=StreamGetSiteMetaData, json_schema=site_metadata_schema))

    # Schema for StreamGetSensorMetaData
    sensor_metadata_schema = {
        "$schema": schema_uri,
        "type": "object",
        "properties": {
            "site_id": get_spec_def_obj('site_id'),
            "sensor_id": get_spec_def_obj('sensor_id'),
            "or_site_id": get_spec_def_obj('or_site_id'),
            "or_sensor_id": get_spec_def_obj('or_sensor_id'),
            "location": undocumented("string"),
            "description": undocumented("string"),
            "sensor_class": get_spec_def_obj('class'),
            "sensor_type": {"description": "Sensor type", "type": "string"},
            "units": get_spec_def_obj('units'),
            "translate": undocumented("boolean"),
            "precision": undocumented("integer"),
            "last_time": undocumented("string"),
            "last_value": undocumented("number"),
            "last_time_received": undocumented("string"),
            "last_value_received": undocumented("number"),
            "last_raw_value": undocumented("number"),
            "last_raw_value_received": undocumented("number"),
            "change_time": undocumented("string"),
            "normal": undocumented("integer"),  # boolean?
            "active": undocumented("integer"),  # boolean?
            "valid": undocumented("integer"),  # boolean?
            "change_rate": undocumented("number"),
            "time_min_consec_zeros": undocumented("integer"),
            "validation": undocumented("string"),
            "value_max": undocumented("number"),
            "value_min": undocumented("number"),
            "delta_pos": undocumented("number"),
            "delta_neg": undocumented("number"),
            "rate_pos": undocumented("number"),
            "rate_neg": undocumented("number"),
            "time_max": undocumented("integer"),
            "time_min": undocumented("integer"),
            "slope": undocumented("number"),
            "offset": undocumented("number"),
            "reference": undocumented("number"),
            "utc_offset": undocumented("integer"),
            "using_dst": undocumented("boolean"),
            "conversion": undocumented("string"),
            "usage": undocumented("string"),
            "protocol": undocumented("integer"),
        },
    }
    streams.append(AirbyteStream(name=StreamGetSensorMetaData, json_schema=sensor_metadata_schema))

    # Schema for StreamGetSensorData
    sensor_data_schema = {
        "$schema": schema_uri,
        "type": "object",
        "properties": {
            "site_id": get_spec_def_obj('site_id'),
            "sensor_id": get_spec_def_obj('sensor_id'),
            "or_site_id": get_spec_def_obj('or_site_id'),
            "or_sensor_id": get_spec_def_obj('or_sensor_id'),
            "sensor_class": get_spec_def_obj('class'),
            "data_time": {
                "type": get_spec_def_prop('onerain_datetime', 'type'),
                "description": "date/time data was captured",
                "pattern": get_spec_def_prop('onerain_datetime', 'pattern'),
            },
            "data_value": {"type": "number", "description": "data value"},
            "raw_value": {"type": "number", "description": "raw data value"},
            "units": get_spec_def_obj('units'),
        },
    }
    # This schema used to be built and then dropped; register it so the
    # sensor data stream actually appears in the discovered catalog.
    streams.append(AirbyteStream(name=StreamGetSensorData, json_schema=sensor_data_schema))

    return AirbyteCatalog(streams=streams)