def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """Run the configured query and emit every returned item as a RECORD message.

    :param logger: logger used for progress and error reporting
    :param config: connector configuration; the keys "url", "username", "access_token" and "query" are read
    :param catalog: configured catalog (not consulted; every record is emitted on DATASET_ITEMS_STREAM_NAME)
    :param state: state from previous runs (not consulted)
    :return: generator of AirbyteMessage objects of type RECORD
    :raises Exception: whatever the query or the iteration raised, re-raised after being logged
    """
    logger.info("read called")
    url = config["url"]
    username = config["username"]
    key = config["access_token"]

    client = WSClient(url)
    # The return value of do_login was never used, so it is no longer bound to a local.
    client.do_login(username, key, withpassword=False)

    query = config["query"]
    logger.info(query)

    try:
        # The query runs inside the try so that a failing query is logged with the
        # same context as a failure while iterating its results.
        data = client.do_query(query)
        for single_dict in data:
            yield AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(
                    stream=DATASET_ITEMS_STREAM_NAME,
                    data=single_dict,
                    emitted_at=int(datetime.now().timestamp()) * 1000,
                ),
            )
    except Exception:
        reason = f"Failed to read data of {DATASET_ITEMS_STREAM_NAME} at {url}"
        logger.error(reason)
        raise
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """Build a catalog with one stream per sheet of the configured spreadsheet.

    :param logger: logger used for progress reporting
    :param config: connector configuration; "credentials_json" and "spreadsheet_id" are read
    :return: AirbyteCatalog whose streams are derived from each sheet's first row
    :raises Exception: "Could not run discovery: ..." when the API call fails with an HttpError
    """
    credentials = json.loads(config["credentials_json"])
    client = Helpers.get_authenticated_sheets_client(credentials)
    spreadsheet_id = config["spreadsheet_id"]
    try:
        logger.info(f"Running discovery on sheet {spreadsheet_id}")
        response = client.get(spreadsheetId=spreadsheet_id, includeGridData=False).execute()
        metadata = Spreadsheet.parse_obj(response)
        titles = [sheet.properties.title for sheet in metadata.sheets]
        # One stream per sheet, built from that sheet's header row.
        streams = [
            Helpers.headers_to_airbyte_stream(title, Helpers.get_first_row(client, spreadsheet_id, title))
            for title in titles
        ]
        return AirbyteCatalog(streams=streams)
    except errors.HttpError as err:
        if err.resp.status == 404:
            reason = "Requested spreadsheet was not found."
        else:
            reason = str(err)
        raise Exception(f"Could not run discovery: {reason}")
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Page through each recognized stream of the configured catalog and emit its items as RECORD messages.

    Streams whose name is not one of SiteMetaData, WellScreens, ManualGWL, PressureGWL or
    AcousticGWL are skipped. For each recognized stream the endpoint is fetched repeatedly;
    when a cursor is stored under the stream name in ``state`` it is appended as
    ``?objectid=<cursor>``. Paging stops when a fetch returns nothing.

    :param logger: logger used for debug/info output
    :param config: connector configuration, passed to the per-stream URL builders
    :param catalog: configured catalog whose streams are iterated
    :param state: mutable mapping of stream name to the last seen OBJECTID; updated in place
    :return: generator of AirbyteMessage objects of type RECORD
    """
    # Stream name -> function building that stream's endpoint URL from the config.
    url_builders = {
        'SiteMetaData': sitemetadata_url,
        'WellScreens': screens_url,
        'ManualGWL': manual_water_levels_url,
        'PressureGWL': pressure_water_levels_url,
        'AcousticGWL': acoustic_water_levels_url,
    }

    for stream in catalog.streams:
        name = stream.stream.name
        logger.debug(f'****** mode {stream.sync_mode} state={state}')

        build_url = url_builders.get(name)
        if build_url is None:
            continue
        url = build_url(config)

        while True:
            # .get() instead of indexing: a plain dict without a saved cursor for this
            # stream must start from the beginning rather than raise KeyError.
            objectid = state.get(name)
            curl = f'{url}?objectid={objectid}' if objectid else url

            logger.info(f'fetching url={curl}')
            jobj = get_json(logger, curl)
            if not jobj:
                break

            # The cursor is advanced before this page's records are yielded.
            state[name] = jobj[-1]['OBJECTID']
            for di in jobj:
                di['import_uuid'] = str(uuid.uuid4())
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=name, data=di, emitted_at=int(datetime.now().timestamp()) * 1000
                    ),
                )
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """Build a catalog with one stream per grid sheet of the configured spreadsheet.

    A sheet whose header lookup fails is left out of the catalog: the failure is
    logged as a warning when the error text marks an empty sheet, otherwise as an error.

    :param logger: logger used for progress, warnings and errors
    :param config: connector configuration; "spreadsheet_id" is read and the whole
        config is handed to ``self.get_credentials``
    :return: AirbyteCatalog with the streams that could be built
    :raises Exception: "Could not run discovery: ..." when fetching the spreadsheet
        metadata fails with an HttpError
    """
    credentials = self.get_credentials(config)
    client = GoogleSheetsClient(credentials)
    spreadsheet_id = config["spreadsheet_id"]
    empty_sheet_prefix = "Expected data for exactly one row for sheet"
    try:
        logger.info(f"Running discovery on sheet {spreadsheet_id}")
        raw_metadata = client.get(spreadsheetId=spreadsheet_id, includeGridData=False)
        grid_sheets = Helpers.get_grid_sheets(Spreadsheet.parse_obj(raw_metadata))

        streams = []
        for sheet_name in grid_sheets:
            try:
                first_row = Helpers.get_first_row(client, spreadsheet_id, sheet_name)
                streams.append(Helpers.headers_to_airbyte_stream(logger, sheet_name, first_row))
            except Exception as err:
                text = str(err)
                if not text.startswith(empty_sheet_prefix):
                    logger.error(text)
                    continue
                logger.warn(f"Skip empty sheet: {sheet_name}")
        return AirbyteCatalog(streams=streams)
    except errors.HttpError as err:
        if err.resp.status == status_codes.NOT_FOUND:
            reason = "Requested spreadsheet was not found."
        else:
            reason = str(err)
        raise Exception(f"Could not run discovery: {reason}")
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """Emit the messages produced by ``self._read_record`` for every configured stream.

    :param logger: logger used for start/finish messages
    :param config: connector configuration, passed to ``self._client``
    :param catalog: configured catalog whose streams are read in order
    :param state: state object forwarded unchanged to ``self._read_record``
    :return: generator of the messages yielded by ``self._read_record``
    """
    mailchimp_client = self._client(config)

    logger.info("Starting syncing mailchimp")
    for selected in catalog.streams:
        yield from self._read_record(client=mailchimp_client, configured_stream=selected, state=state)
    logger.info("Finished syncing mailchimp")
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """Read every selected sheet in row batches and emit non-empty, relevant rows as RECORD messages.

    :param logger: logger used for progress reporting
    :param config: connector configuration; "credentials_json" and "spreadsheet_id" are read
    :param catalog: configured catalog from which sheet and column names are parsed
    :param state: state from previous runs (not consulted)
    :return: generator of AirbyteMessage objects of type RECORD
    """
    client = GoogleSheetsClient(json.loads(config["credentials_json"]))

    sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
    spreadsheet_id = config["spreadsheet_id"]

    logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
    sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
        client, spreadsheet_id, sheet_to_column_name
    )
    sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
    logger.info(f"Row counts: {sheet_row_counts}")

    for sheet, column_index_to_name in sheet_to_column_index_to_name.items():
        logger.info(f"Syncing sheet {sheet}")
        row_count = sheet_row_counts[sheet]
        row_cursor = 2  # we start syncing past the header row

        # The first row of each requested interval must exist when the request is sent.
        # If the last row of the interval lies beyond the sheet that is fine: only the
        # real rows come back, and the next loop check ends the iteration.
        while row_cursor <= row_count:
            # Named cell_range (not `range`) so the builtin is not shadowed.
            cell_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
            logger.info(f"Fetching range {cell_range}")
            row_batch = SpreadsheetValues.parse_obj(
                client.get_values(spreadsheetId=spreadsheet_id, ranges=cell_range, majorDimension="ROWS")
            )
            # The requested interval is inclusive on both ends, hence the +1.
            row_cursor += ROW_BATCH_SIZE + 1

            # there should always be one range since we requested only one
            row_values = row_batch.valueRanges[0].values
            if not row_values:  # covers both a missing and an empty batch
                break

            for row in row_values:
                if Helpers.is_row_empty(row):
                    continue
                if Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name),
                    )

    logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state_path: str = None) -> Generator[AirbyteMessage, None, None]:
    """Read every stream of the catalog file and emit the records produced by ``self._read_record``.

    :param logger: logger used for progress reporting
    :param config_container: configuration container, passed to ``self._client``
    :param catalog_path: path handed to ``self.read_config`` to load the configured catalog
    :param state_path: optional path of a JSON state file; when falsy the sync starts with empty state
    :return: generator of whatever ``self._read_record`` yields
    """
    client = self._client(config_container)

    if state_path:
        logger.info("Starting sync with provided state file")
        # Context manager so the file handle is closed deterministically.
        with open(state_path, "r") as state_file:
            state_obj = json.load(state_file)
    else:
        logger.info("No state provided, starting fresh sync")
        state_obj = {}

    # Streams without saved state resolve to an empty dict.
    state = defaultdict(dict, state_obj)
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

    logger.info("Starting syncing mailchimp")
    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        for record in self._read_record(client=client, stream=stream.name, state=state):
            yield record
    logger.info("Finished syncing mailchimp")
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """Emit RECORD messages for every configured stream the client knows about.

    Streams whose name is not in ``client.ENTITY_MAP`` are skipped silently. A
    ``requests`` failure while reading one stream is logged and the sync moves on
    to the next stream.

    :param logger: logger used for progress and error reporting
    :param config: connector configuration, passed to ``self._get_client``
    :param catalog: configured catalog whose streams are read in order
    :param state: state from previous runs (not consulted)
    :return: generator of AirbyteMessage objects of type RECORD
    """
    client = self._get_client(config)

    logger.info(f"Starting syncing {self.__class__.__name__}")
    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        if stream.name not in client.ENTITY_MAP:
            continue
        try:
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)
        except requests.exceptions.RequestException as err:
            # Keep the best-effort behaviour, but log the cause so the failure can be diagnosed.
            logger.error(f"Get {stream.name} error: {err}")
    logger.info(f"Finished syncing {self.__class__.__name__}")
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """Emit RECORD messages for every configured stream the client recognizes.

    :param logger: logger used for progress messages and warnings
    :param config: connector configuration, passed to ``self._client``
    :param catalog: configured catalog whose streams are read in order
    :param state: state from previous runs (not consulted; incremental sync is a TODO)
    :return: generator of AirbyteMessage objects of type RECORD
    """
    client = self._client(config)

    logger.info("Starting syncing sendgrid")
    for configured_stream in catalog.streams:
        # TODO handle incremental syncs
        stream = configured_stream.stream
        if stream.name not in client.ENTITY_MAP:
            # Log the stream's name rather than the whole stream object.
            logger.warn(f"Stream '{stream.name}' not found in the recognized entities")
            continue
        for record in self._read_record(client=client, stream=stream.name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info("Finished syncing sendgrid")
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """Load the configured catalog from *catalog_path* and emit a RECORD message per record read.

    :param logger: logger used for start/finish messages
    :param config_container: configuration container, passed to ``self._client``
    :param catalog_path: path handed to ``self.read_config`` to load the configured catalog
    :param state: accepted but not consulted
    :return: generator of AirbyteMessage objects of type RECORD
    """
    api = self._client(config_container)
    raw_catalog = self.read_config(catalog_path)
    configured = ConfiguredAirbyteCatalog.parse_obj(raw_catalog)

    logger.info("Starting syncing mailchimp")
    for entry in configured.streams:
        selected = entry.stream
        for record in self._read_record(client=api, stream=selected.name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info("Finished syncing mailchimp")
def check_config(self, logger: AirbyteLogger, config_path: str, config: json) -> AirbyteConnectionStatus:
    """
    Tests if the input configuration can be used to successfully connect to the integration,
    by requesting the installs report for the configured app for the day two days ago.

    :param logger: Logging object to display debug/info/error to the logs
    :param config_path: Path to the file containing the configuration json config (not used here)
    :param config: Json object containing the configuration of this source; "api_token" and
        "app_id" are read
    :return: AirbyteConnectionStatus indicating a Success or Failure; any exception is
        converted into a FAILED status rather than propagated
    """
    try:
        # date has no time component, so the %H:%M part always renders as 00:00.
        test_date = (date.today() - timedelta(days=2)).strftime("%Y-%m-%d %H:%M")
        params = {
            "from": test_date,
            "to": test_date,
            "api_token": config["api_token"],
        }
        base_url = "https://hq.appsflyer.com"
        test_endpoint = "/export/{}/installs_report/v5".format(config["app_id"])
        url = base_url + test_endpoint

        # The token travels in the query params, so it is not part of the logged URL.
        logger.info("GET {}".format(url))
        resp = requests.get(url, params=params)

        if resp.status_code == 200:
            return AirbyteConnectionStatus(status=Status.SUCCEEDED)

        # Previously an f-string containing "{0}"/"{1}" was followed by .format(), which
        # rendered the literal digits 0 and 1 instead of the status code and the content.
        return AirbyteConnectionStatus(
            status=Status.FAILED,
            message=f"An exception occurred: Status Code: {resp.status_code}, content: {resp.content}",
        )
    except Exception as e:
        return AirbyteConnectionStatus(status=Status.FAILED, message=f"An exception occurred: {str(e)}")
def discover(self, logger: AirbyteLogger, config: Mapping) -> AirbyteCatalog:
    """
    Return an AirbyteCatalog holding the streams exposed by the client built from *config*.

    :param logger: logger used for progress and error reporting
    :param config: connector configuration, passed to ``self._get_client``
    :return: AirbyteCatalog containing ``list(client.streams)``
    :raises Exception: whatever materializing ``client.streams`` raised, after it has been logged
    """
    client = self._get_client(config)
    name = client.stream_name

    logger.info(f"Discovering schema of {name} at {client.reader.full_url}...")
    try:
        discovered = list(client.streams)
    except Exception as err:
        # Log the full traceback before handing the error back to the caller.
        logger.error(
            f"Failed to discover schemas of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
        )
        raise err
    else:
        return AirbyteCatalog(streams=discovered)
def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """Read every selected sheet in row batches until a blank row, emitting relevant rows as RECORD messages.

    :param logger: logger used for progress reporting
    :param config_container: object whose ``rendered_config`` provides "credentials_json"
        and "spreadsheet_id"
    :param catalog_path: path handed to ``self.read_config`` to load the catalog
    :param state: accepted but not consulted
    :return: generator of AirbyteMessage objects of type RECORD
    """
    config = config_container.rendered_config
    client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))

    catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))

    sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
    spreadsheet_id = config["spreadsheet_id"]

    logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
    # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
    # a blank row, emit the row batch
    sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
        client, spreadsheet_id, sheet_to_column_name
    )
    for sheet, column_index_to_name in sheet_to_column_index_to_name.items():
        logger.info(f"Syncing sheet {sheet}")
        row_cursor = 2  # we start syncing past the header row
        encountered_blank_row = False
        while not encountered_blank_row:
            # Named cell_range (not `range`) so the builtin is not shadowed.
            cell_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
            logger.info(f"Fetching range {cell_range}")
            row_batch = SpreadsheetValues.parse_obj(
                client.values()
                .batchGet(spreadsheetId=spreadsheet_id, ranges=cell_range, majorDimension="ROWS")
                .execute()
            )
            # The requested interval is inclusive on both ends, hence the +1.
            row_cursor += ROW_BATCH_SIZE + 1

            # there should always be one range since we requested only one
            row_values = row_batch.valueRanges[0].values
            if not row_values:  # covers both a missing and an empty batch
                break

            for row in row_values:
                if Helpers.is_row_empty(row):
                    # The first blank row ends the sheet.
                    encountered_blank_row = True
                    break
                if Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name),
                    )
    logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """Load the configured catalog from *catalog_path* and emit RECORD messages for recognized streams.

    :param logger: logger used for progress messages and warnings
    :param config_container: configuration container, passed to ``self._client``
    :param catalog_path: path handed to ``self.read_config`` to load the configured catalog
    :param state: accepted but not consulted (incremental sync is a TODO)
    :return: generator of AirbyteMessage objects of type RECORD
    """
    client = self._client(config_container)

    # Despite the helper's name, this loads the catalog file.
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

    logger.info("Starting syncing recurly")
    for configured_stream in catalog.streams:
        # TODO handle incremental syncs
        stream = configured_stream.stream
        if stream.name not in client.ENTITIES:
            # Log the stream's name rather than the whole stream object.
            logger.warn(f"Stream '{stream.name}' not found in the recognized entities")
            continue
        for record in self._read_record(client=client, stream=stream.name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info("Finished syncing recurly")
def read(
    self, logger: AirbyteLogger, config: Mapping, catalog: ConfiguredAirbyteCatalog, state_path: Mapping[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """Yield one RECORD message per row returned by the client for the selected fields.

    :param logger: logger used for progress and error reporting
    :param config: connector configuration, passed to ``self._get_client``
    :param catalog: configured catalog, passed to ``self.selected_fields``
    :param state_path: accepted but not consulted
    :return: generator of AirbyteMessage objects of type RECORD
    :raises Exception: whatever reading raised, after it has been logged with a traceback
    """
    client = self._get_client(config)
    fields = self.selected_fields(catalog)
    name = client.stream_name

    logger.info(f"Reading {name} ({client.reader.full_url})...")
    try:
        for row in client.read(fields=fields):
            emitted_at = int(datetime.now().timestamp()) * 1000
            yield AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(stream=name, data=row, emitted_at=emitted_at),
            )
    except Exception as err:
        logger.error(
            f"Failed to read data of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
        )
        raise err
def discover(self, logger: AirbyteLogger, config: json) -> AirbyteCatalog:
    """Return a catalog containing a single stream describing the configured sheet.

    :param logger: logger used for progress reporting
    :param config: connector configuration; "access_token" and "spreadsheet_id" are read
    :return: AirbyteCatalog with one stream named after the sheet
    :raises Exception: "Could not run discovery: ..." wrapping any failure while
        fetching or describing the sheet
    """
    access_token = config["access_token"]
    spreadsheet_id = config["spreadsheet_id"]
    smartsheet_client = smartsheet.Smartsheet(access_token)

    try:
        # Round-trip through JSON to make the SDK object subscriptable.
        sheet = json.loads(str(smartsheet_client.Sheets.get_sheet(spreadsheet_id)))
        schema = get_json_schema(sheet)
        logger.info(f"Running discovery on sheet: {sheet['name']} with {spreadsheet_id}")
        streams = [AirbyteStream(name=sheet["name"], json_schema=schema)]
    except Exception as e:
        raise Exception(f"Could not run discovery: {str(e)}")

    return AirbyteCatalog(streams=streams)
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """Emit every row of the configured sheet as a RECORD message for each configured stream.

    Column names come from the stream's JSON schema "properties" and are paired
    positionally with each row's cell values. A stream whose "properties" is neither
    a list nor a dict is logged and skipped.

    :param logger: logger used for progress and error reporting
    :param config: connector configuration; "access_token" and "spreadsheet_id" are read
    :param catalog: configured catalog whose streams are read in order
    :param state: state from previous runs (not consulted)
    :return: generator of AirbyteMessage objects of type RECORD
    :raises Exception: whatever fetching or iterating the sheet raised, after it has been logged
    """
    access_token = config["access_token"]
    spreadsheet_id = config["spreadsheet_id"]
    smartsheet_client = smartsheet.Smartsheet(access_token)

    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        properties = stream.json_schema["properties"]
        if isinstance(properties, list):
            columns = tuple(key for dct in properties for key in dct.keys())
        elif isinstance(properties, dict):
            columns = tuple(properties.keys())
        else:
            logger.error("Could not read properties from the JSONschema in this stream")
            # Without column names the rows cannot be labelled. Falling through would
            # leave `columns` unbound or, worse, reuse the previous stream's columns.
            continue

        name = stream.name

        try:
            sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
            sheet = json.loads(str(sheet))  # make it subscriptable
            logger.info(f"Starting syncing spreadsheet {sheet['name']}")
            logger.info(f"Row count: {sheet['totalRowCount']}")

            for row in sheet["rows"]:
                values = tuple(i["value"] for i in row["cells"])
                try:
                    data = dict(zip(columns, values))

                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000
                        ),
                    )
                except Exception as e:
                    logger.error(f"Unable to encode row into an AirbyteMessage with the following error: {e}")

        except Exception as e:
            logger.error(f"Could not read smartsheet: {name}")
            raise e
    logger.info(f"Finished syncing spreadsheet with ID: {spreadsheet_id}")
def check_config(self, logger: AirbyteLogger, config_path: str, config: json) -> AirbyteConnectionStatus:
    """Check the Salesforce credentials by attempting an OAuth2 refresh-token login.

    :param logger: logger used for progress reporting
    :param config_path: path of the configuration file (not used here)
    :param config: connector configuration; "is_sandbox", "client_id", "client_secret"
        and "refresh_token" are read
    :return: AirbyteConnectionStatus; SUCCEEDED on HTTP 200, otherwise FAILED with the
        response text or error message. Exceptions are never propagated.
    """
    try:
        # pulled from tap-salesforce singer impl
        # https://github.com/singer-io/tap-salesforce/blob/master/tap_salesforce/salesforce/__init__.py#L295-L327
        if config["is_sandbox"]:
            login_url = "https://test.salesforce.com/services/oauth2/token"
        else:
            login_url = "https://login.salesforce.com/services/oauth2/token"

        login_body = {
            "grant_type": "refresh_token",
            "client_id": config["client_id"],
            "client_secret": config["client_secret"],
            "refresh_token": config["refresh_token"],
        }

        logger.info("Attempting login via OAuth2")

        r = None
        try:
            # The request body holds the client secret and refresh token, so only the
            # URL is logged.
            logger.info(f"Making POST request to {login_url}")
            headers = {"Content-Type": "application/x-www-form-urlencoded"}
            r = requests.post(login_url, headers=headers, data=login_body)
            if r.status_code == 200:
                logger.info("OAuth2 login successful")
                return AirbyteConnectionStatus(status=Status.SUCCEEDED)
            return AirbyteConnectionStatus(
                status=Status.FAILED, message="Response from Salesforce: {}".format(r.text)
            )
        except Exception as e:
            error_message = str(e)
            # Fall back to the response attached to the exception, if there is one.
            if r is None and hasattr(e, "response") and e.response is not None:  # pylint:disable=no-member
                r = e.response  # pylint:disable=no-member
            # NB: requests.models.Response is always falsy here. It is false if status code >= 400
            if isinstance(r, requests.models.Response):
                error_message = error_message + ", Response from Salesforce: {}".format(r.text)
            return AirbyteConnectionStatus(status=Status.FAILED, message=error_message)
    except Exception as e:
        return AirbyteConnectionStatus(status=Status.FAILED, message=f"{str(e)}")
def _write_config(self, token):
    """Log that credentials were refreshed; *token* is accepted but not used."""
    AirbyteLogger().info("Credentials Refreshed")
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """
    Fetch site metadata, sensor metadata and sensor data, in that order, and emit each
    returned row as a RECORD message on the corresponding stream.

    A failure while fetching or converting one stream is logged with its traceback and
    the next stream is attempted.

    :param logger: Logging object to display debug/info/error to the logs
    :param config: Json object containing the configuration of this source; the data API url,
        the system key and optional per-stream query parameters are read from it
    :param catalog: The configured catalog (not consulted; the three streams are hard-coded)
    :param state: State from previous runs (not consulted)

    :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
    """

    def get_request_url(stream, config):
        # Build "<data api url>?method=<stream>&<query string>" from the system key plus
        # any parameters configured under the stream's own name.
        query_params = dict()
        data_api_url = config[ConfigPropDataApiUrl]
        query_params[ConfigPropSystemKey] = config[ConfigPropSystemKey]
        if stream in config:
            for stream_prop in config[stream]:
                query_params[stream_prop] = config[stream][stream_prop]
        return f'{data_api_url}?method={stream}&{urlencode(query_params)}'

    def assert_onerain_response(response_object, expect_http_code):
        # Validate the HTTP status and XML envelope, then return the list of rows
        # (an empty list when the document holds no indexable rows).
        if response_object.status_code != expect_http_code:
            raise ValueError(
                f'unexpected HTTP status {response_object.status_code}, expected {expect_http_code}'
            )

        # Parse the response that was passed in, not whatever the enclosing scope's
        # `r` happens to be bound to.
        doc = xmltodict.parse(response_object.text)
        if 'onerain' not in doc:
            raise ValueError("response has no 'onerain' element")
        if 'error' in doc['onerain']:
            raise ValueError(doc['onerain']['error'])

        try:
            rows = doc['onerain']['response']['general']['row']
            # Probe the first element: anything not indexable by position (or empty)
            # is treated as "no records", as before.
            rows[0]
        except Exception as e:
            logger.debug(f'no records: {e}')
            return []
        return rows

    def site_metadata_record(row):
        # Convert one site metadata row into its typed record.
        return {
            'or_site_id': int(row['or_site_id']),
            'site_id': row['site_id'],
            'location': row['location'],
            'owner': row['owner'],
            'system_id': int(row['system_id']),
            'client_id': row['client_id'],
            'latitude_dec': float(row['latitude_dec']),
            'longitude_dec': float(row['longitude_dec']),
            'elevation': int(row['elevation']),
        }

    def sensor_metadata_record(row):
        # Convert one sensor metadata row into its typed record ('change_time' is not emitted).
        return {
            'site_id': row['site_id'],
            'sensor_id': int(row['sensor_id']),
            'or_site_id': int(row['or_site_id']),
            'or_sensor_id': int(row['or_sensor_id']),
            'location': row['location'],
            'description': row['description'],
            'sensor_class': int(row['sensor_class']),
            'sensor_type': row['sensor_type'],
            'units': row['units'],
            'translate': str_to_bool(row['translate']),
            'precision': int(row['precision']),
            'last_time': row['last_time'],
            'last_value': row['last_value'],
            'last_time_received': row['last_time_received'],
            'last_value_received': float(row['last_value_received']),
            'last_raw_value': float(row['last_raw_value']),
            'last_raw_value_received': float(row['last_raw_value_received']),
            'normal': int(row['normal']),
            'active': int(row['active']),
            'valid': int(row['valid']),
            'change_rate': float(row['change_rate']),
            'time_min_consec_zeros': int(row['time_min_consec_zeros']),
            'validation': row['validation'],
            'value_max': float(row['value_max']),
            'value_min': float(row['value_min']),
            'delta_pos': float(row['delta_pos']),
            'delta_neg': float(row['delta_neg']),
            'time_max': int(row['time_max']),
            'time_min': int(row['time_min']),
            'slope': float(row['slope']),
            'offset': float(row['offset']),
            'reference': float(row['reference']),
            'utc_offset': int(row['utc_offset']),
            'using_dst': str_to_bool(row['using_dst']),
            'conversion': row['conversion'],
            'usage': row['usage'],
            'protocol': int(row['protocol']),
        }

    def sensor_data_record(row):
        # Convert one sensor data row into its typed record.
        return {
            'site_id': row['site_id'],
            'sensor_id': row['sensor_id'],
            'or_site_id': int(row['or_site_id']),
            'or_sensor_id': int(row['or_sensor_id']),
            'sensor_class': int(row['sensor_class']),
            'data_time': row['data_time'],
            'data_value': float(row['data_value']),
            'raw_value': float(row['raw_value']),
            'units': row['units'],
        }

    stream_plan = (
        (StreamGetSiteMetaData, site_metadata_record),
        (StreamGetSensorMetaData, sensor_metadata_record),
        (StreamGetSensorData, sensor_data_record),
    )

    for stream_name, build_record in stream_plan:
        req_url = get_request_url(stream_name, config)
        logger.info(f'requesting {req_url}')

        try:
            r = requests.get(req_url)
            results = assert_onerain_response(r, 200)
            for row in results:
                data = build_record(row)
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=stream_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000
                    ),
                )
        except Exception:
            # Best effort per stream: log the traceback and carry on with the next one.
            logger.error(f'failed to process stream {stream_name}: {traceback.format_exc()}')
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Fetch the data of every stream in the configured catalog and emit each item as a RECORD message.

    :param logger: Logging object to display debug/info/error to the logs
    :param config: Json object containing the configuration of this source; passed to the
        request-url builder and the per-stream fetch functions
    :param catalog: The configured catalog whose streams are read in order
    :param state: State from previous runs; passed to the per-stream fetch functions
    :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
    :raises NotImplementedError: when the catalog contains a stream this source does not handle
    """
    # iterate configured streams and fetch their data
    for stream in catalog.streams:
        stream_name = stream.stream.name
        is_incremental = stream.sync_mode == SyncMode.incremental
        logger.info(f"incremental state for stream {stream_name}: {is_incremental}: stream.sync_mode = '{stream.sync_mode}', SyncMode.incremental = '{SyncMode.incremental}'")

        req_url = get_request_url(stream_name, config)

        if stream_name == StreamGetSiteMetaData:
            data = get_site_metadata(req_url, logger, state, config, stream_name, is_incremental)
        elif stream_name == StreamGetSensorMetaData:
            data = get_sensor_metadata(req_url, logger, state, config, stream_name, is_incremental)
        elif stream_name == StreamGetSensorData:
            # Unlike the other two, this fetcher is not handed the request URL.
            data = get_sensor_data(logger, state, config, stream_name, is_incremental)
        else:
            # The message used to reference an undefined name, so this path raised
            # NameError instead of the intended NotImplementedError.
            raise NotImplementedError(f"read(): don't handle stream {stream_name} found in catalog")

        result_count = 0
        for d in data:
            result_count += 1
            yield AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(
                    stream=stream_name, data=d, emitted_at=int(datetime.now().timestamp()) * 1000
                ),
            )

        if result_count < 1:
            logger.debug(f'no new data for {stream_name}: state={state.get(stream_name)}')