def read(self, logger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state.

    :param logger: logger used for progress and error reporting
    :param config: connector configuration; ``provider.storage`` and ``url`` are read here and
        ``format == "json"`` switches to the nested-JSON loader
    :param catalog: configured catalog, handed to ``SourceFile.parse_catalog`` to obtain the selected columns
    :param state: not used by this implementation
    :return: generator yielding one RECORD message per JSON object or per DataFrame row
    :raises Exception: whatever loading or emitting raised, after it has been logged
    """
    storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    name = SourceFile.get_stream_name(config)
    logger.info(f"Reading {name} ({storage}{url})...")
    selection = SourceFile.parse_catalog(catalog)
    try:
        if config.get("format") == "json":
            for data in SourceFile.load_nested_json(config, logger):
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                )
        else:
            for df in SourceFile.load_dataframes(config, logger):
                if len(selection) > 0:
                    # Select with an ordered list rather than a set: pandas no longer accepts
                    # a set as an indexer, and a list keeps the file's column order stable.
                    columns = [column for column in df.columns if column in selection]
                else:
                    columns = df.columns
                df = df.replace(np.nan, "NaN", regex=True)
                for data in df[columns].to_dict(orient="records"):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                    )
    except Exception as err:
        reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        # Bare raise keeps the original traceback intact.
        raise
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """
    Read the configured file and yield one RECORD message per JSON object or per DataFrame row.

    :param logger: logger used for progress and error reporting
    :param config_container: object whose ``rendered_config`` holds the connector configuration
    :param catalog_path: path of the configured catalog, loaded through ``self.read_config``
    :param state_path: only echoed in the log line; no state is read or written here
    :return: generator of RECORD AirbyteMessages
    :raises Exception: whatever loading or emitting raised, after it has been logged
    """
    config = config_container.rendered_config
    storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    name = SourceFile.get_stream_name(config)
    logger.info(f"Reading {name} ({storage}{url}, {catalog_path}, {state_path})...")
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    selection = SourceFile.parse_catalog(catalog)
    try:
        if config.get("format") == "json":
            for data in SourceFile.load_nested_json(config, logger):
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                )
        else:
            for df in SourceFile.load_dataframes(config, logger):
                if len(selection) > 0:
                    # Select with an ordered list rather than a set: pandas no longer accepts
                    # a set as an indexer, and a list keeps the file's column order stable.
                    columns = [column for column in df.columns if column in selection]
                else:
                    columns = df.columns
                df = df.replace(np.nan, "NaN", regex=True)
                for data in df[columns].to_dict(orient="records"):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                    )
    except Exception as err:
        reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        # Bare raise keeps the original traceback intact.
        raise
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """
    Run the configured query through the web-service client and yield each result as a RECORD message.

    :param logger: logger used for progress and error reporting
    :param config: connector configuration; reads ``url``, ``username``, ``access_token`` and ``query``
    :param catalog: not used by this implementation
    :param state: not used by this implementation
    :return: generator of RECORD AirbyteMessages on the ``DATASET_ITEMS_STREAM_NAME`` stream
    :raises Exception: whatever login, query or emission raised, after it has been logged
    """
    logger.info("read called")
    url = config["url"]
    username = config["username"]
    key = config["access_token"]
    try:
        # Login and query sit inside the guarded region so that their failures are reported
        # through the same "Failed to read data" log line as iteration failures.
        client = WSClient(url)
        # The return value was never used; presumably the call authenticates the client session.
        client.do_login(username, key, withpassword=False)
        query = config["query"]
        logger.info(query)
        for single_dict in client.do_query(query):
            yield AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(
                    stream=DATASET_ITEMS_STREAM_NAME,
                    data=single_dict,
                    emitted_at=int(datetime.now().timestamp()) * 1000,
                ),
            )
    except Exception:
        reason = f"Failed to read data of {DATASET_ITEMS_STREAM_NAME} at {url}"
        logger.error(reason)
        # Bare raise keeps the original traceback intact.
        raise
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """
    Yield the AirbyteMessages produced by reading this source.

    This is still the template placeholder: it ignores every argument and emits a single
    hard-coded example record.

    :param logger: logging object for debug/info/error output (unused here)
    :param config: source configuration as described by spec.json (unused here)
    :param catalog: configured catalog selecting streams and fields (unused here)
    :param state: checkpoint state from previous runs (unused here)
    :return: a generator producing AirbyteRecordMessages wrapped in AirbyteMessage objects
    """
    # Not Implemented
    example_record = AirbyteRecordMessage(
        stream="TableName",  # Example
        data={"columnName": "Hello World"},  # Example
        emitted_at=int(datetime.now().timestamp()) * 1000,
    )
    yield AirbyteMessage(type=Type.RECORD, record=example_record)
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state.

    Each supported stream is fetched page by page: the ``OBJECTID`` of the last row of a page is
    stored in ``state`` under the stream name and sent as the ``objectid`` query parameter of the
    next request, until a request returns nothing. Streams with unknown names are skipped.

    :param logger: Logging object to display debug/info/error to the logs
    :param config: Json object containing the configuration of this source, passed to the URL builders
    :param catalog: The configured catalog; one pass is made per entry in ``catalog.streams``
    :param state: Mapping of stream name to the last seen ``OBJECTID``; it is updated in place
    :return: A generator that produces AirbyteRecordMessages wrapped in AirbyteMessage objects
    """
    # Stream name -> function that builds the endpoint URL for that stream.
    url_builders = {
        'SiteMetaData': sitemetadata_url,
        'WellScreens': screens_url,
        'ManualGWL': manual_water_levels_url,
        'PressureGWL': pressure_water_levels_url,
        'AcousticGWL': acoustic_water_levels_url,
    }
    # NOTE(review): the cursor is only written into ``state``; no STATE message is emitted
    # from this function. Confirm that the caller persists the mutated mapping.
    for stream in catalog.streams:
        name = stream.stream.name
        logger.debug(f'****** mode {stream.sync_mode} state={state}')

        build_url = url_builders.get(name)
        if build_url is None:
            continue
        url = build_url(config)

        while True:
            # .get() so that a first run with no saved cursor starts from the beginning
            # instead of failing with KeyError.
            objectid = state.get(name)
            if objectid:
                curl = f'{url}?objectid={objectid}'
            else:
                curl = url

            logger.info(f'fetching url={curl}')
            jobj = get_json(logger, curl)
            if not jobj:
                break

            state[name] = jobj[-1]['OBJECTID']
            for di in jobj:
                di['import_uuid'] = str(uuid.uuid4())
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=name, data=di, emitted_at=int(datetime.now().timestamp()) * 1000))
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """
    Load the configured file into DataFrames and yield one RECORD message per row.

    Only columns present in both the catalog selection and the DataFrame are emitted, and the
    simplified URL is used as the stream name.

    :param logger: logger used for progress and error reporting
    :param config_container: object whose ``rendered_config`` holds the connector configuration
    :param catalog_path: path of the catalog, loaded through ``self.read_config``
    :param state_path: only echoed in the log line; no state is read or written here
    :return: generator of RECORD AirbyteMessages
    :raises Exception: whatever loading or emitting raised, after it has been logged
    """
    config = config_container.rendered_config
    storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    logger.info(f"Reading ({storage}{url}, {catalog_path}, {state_path})...")
    catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
    selection = SourceFile.parse_catalog(catalog)
    try:
        for df in SourceFile.load_dataframes(config, logger):
            # Select with an ordered list rather than a set: pandas no longer accepts
            # a set as an indexer, and a list keeps the file's column order stable.
            columns = [column for column in df.columns if column in selection]
            for data in df[columns].to_dict(orient="records"):
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=url, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                )
    except Exception as err:
        reason = f"Failed to read data of {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        # Bare raise keeps the original traceback intact.
        raise
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Sync every configured stream through the client, emitting records and, for stateful streams, state.

    :param logger: logger used for progress reporting
    :param config: connector configuration, passed to ``self._get_client``
    :param catalog: configured catalog; its streams are synced in order
    :param state: previously saved state keyed by stream name
    :return: generator of RECORD messages, with a STATE message after each stream that has state
    """
    client = self._get_client(config)
    source_name = self.__class__.__name__

    logger.info(f"Starting syncing {source_name}")

    # Work on a copy so the caller's mapping is never modified.
    total_state = dict(state)
    for configured_stream in catalog.streams:
        stream_name = configured_stream.stream.name

        if client.stream_has_state(stream_name) and state.get(stream_name):
            logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
            client.set_stream_state(stream_name, state.get(stream_name))

        logger.info(f"Syncing {stream_name} stream")
        for record in client.read_stream(configured_stream.stream):
            yield AirbyteMessage(
                type=MessageType.RECORD,
                record=AirbyteRecordMessage(
                    stream=stream_name,
                    data=record,
                    emitted_at=int(datetime.now().timestamp()) * 1000,
                ),
            )

        if not client.stream_has_state(stream_name):
            continue
        total_state[stream_name] = client.get_stream_state(stream_name)
        # output state object only together with other stream states
        yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=total_state))

    logger.info(f"Finished syncing {source_name}")
def _read_record(client: Client, stream: str):
    """
    Yield an AirbyteRecordMessage for every record of ``stream``.

    The reader is looked up in ``client.ENTITY_MAP``. A ``ForbiddenError`` raised at any point
    ends the generator silently.

    :param client: client exposing ``ENTITY_MAP``, a mapping of stream name to reader callable
    :param stream: name of the stream to read
    :return: generator of AirbyteRecordMessage
    """
    try:
        yield from (
            AirbyteRecordMessage(stream=stream, data=entity, emitted_at=int(datetime.now().timestamp()) * 1000)
            for entity in client.ENTITY_MAP[stream]()
        )
    except ForbiddenError:
        # Deliberately best-effort: a forbidden stream simply produces no further records.
        return
def expected_records_fixture(inputs, base_path) -> List[AirbyteRecordMessage]:
    """
    Load the expected records referenced by ``inputs.expect_records``.

    :param inputs: test inputs object carrying an ``expect_records`` attribute
    :param base_path: directory that ``expect_records.path`` is resolved against with ``/``
    :return: one AirbyteRecordMessage per line of the file, or an empty list when
        ``expect_records`` is falsy
    """
    expect_records = inputs.expect_records
    if not expect_records:
        return []

    records_file = str(base_path / expect_records.path)
    parsed = []
    with open(records_file) as handle:
        for raw_line in handle:
            parsed.append(AirbyteRecordMessage.parse_raw(raw_line))
    return parsed
def _read_record(self, client: Client, stream: str):
    """
    Yield an AirbyteRecordMessage for every item of every batch returned for ``stream``.

    :param client: client exposing ``ENTITY_MAP``, a mapping of stream name to reader callable
    :param stream: name of the stream to read
    :return: generator of AirbyteRecordMessage
    """
    # Each element produced by the reader is itself iterable; flatten it one item at a time.
    yield from (
        AirbyteRecordMessage(stream=stream, data=entry, emitted_at=int(datetime.now().timestamp()) * 1000)
        for batch in client.ENTITY_MAP[stream]()
        for entry in batch
    )
def _read_stream(self, logger: AirbyteLogger, client: BaseClient, configured_stream: ConfiguredAirbyteStream, state: MutableMapping[str, Any]):
    """
    Sync one configured stream, yielding its records and, in incremental mode, a trailing STATE message.

    :param logger: logger used for progress reporting
    :param client: client that reads the stream and tracks its state
    :param configured_stream: stream to sync together with its sync mode
    :param state: shared state mapping keyed by stream name; updated in place
    :return: generator of AirbyteMessage
    """
    stream_name = configured_stream.stream.name
    # Incremental only when it was requested AND the client keeps state for this stream.
    incremental = configured_stream.sync_mode == SyncMode.incremental and client.stream_has_state(stream_name)

    if incremental and state.get(stream_name):
        logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
        client.set_stream_state(stream_name, state.get(stream_name))

    logger.info(f"Syncing {stream_name} stream")
    for item in client.read_stream(configured_stream.stream):
        yield AirbyteMessage(
            type=MessageType.RECORD,
            record=AirbyteRecordMessage(
                stream=stream_name,
                data=item,
                emitted_at=int(datetime.now().timestamp()) * 1000,
            ),
        )

    if incremental and client.get_stream_state(stream_name):
        state[stream_name] = client.get_stream_state(stream_name)
        # output state object only together with other stream states
        yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """
    Emit one fixed demo record followed by one fixed demo state message.

    :param logger: logger that receives the "Reading ..." line
    :param config_container: object whose ``rendered_config_path`` is logged
    :param catalog_path: only logged
    :param state_path: only logged
    :return: generator yielding a RECORD message and then a STATE message
    """
    logger.info(f"Reading ({config_container.rendered_config_path}, {catalog_path}, {state_path})...")

    yield AirbyteMessage(
        type="RECORD",
        record=AirbyteRecordMessage(stream="love_airbyte", data={"love": True}, emitted_at=int(time.time() * 1000)),
    )
    yield AirbyteMessage(
        type="STATE",
        state=AirbyteStateMessage(data={"love_cursor": "next_version"}),
    )
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state.

    For every configured stream the records returned by ``self._get_records`` are emitted as
    RECORD messages. Afterwards the returned record id is stored in ``state`` under the stream
    name and the whole state is printed to stdout as a JSON STATE message.

    :param logger: Logging object to display debug/info/error to the logs
    :param config: Json object containing the configuration of this source
    :param catalog: The configured catalog; one pass is made per entry in ``catalog.streams``
    :param state: Mapping of stream name to the last record id; read in incremental mode, updated in place
    :return: A generator that produces AirbyteRecordMessages wrapped in AirbyteMessage objects
    """
    for stream in catalog.streams:
        key = stream.stream.name

        # The saved cursor is only honoured when the stream is synced incrementally.
        prid = state.get(key) if stream.sync_mode == SyncMode.incremental else None

        ret = self._get_records(logger, config, prid)
        if ret is None:
            continue

        _header, rid, records = ret
        if not records:
            continue

        for data in records:
            for k, v in data.items():
                # Only strings are candidates for conversion. Anything else (None, numbers, ...)
                # has no isdigit() and is passed through unchanged. All-digit strings are
                # intentionally left as they are.
                if not isinstance(v, str) or v.isdigit():
                    continue
                try:
                    data[k] = float(v)
                except ValueError:
                    # Not numeric: keep the original text.
                    pass

            record = AirbyteRecordMessage(
                stream=key, data=data, emitted_at=int(datetime.now().timestamp()) * 1000)
            yield AirbyteMessage(type=Type.RECORD, record=record)

        # NOTE(review): the state update is placed after the record loop, and only when records
        # were returned; the original nesting was not recoverable from the file, so confirm.
        state[key] = rid
        output_message = {
            "type": "STATE",
            "state": {
                "data": state
            }
        }
        print(json.dumps(output_message))
def test_row_data_to_record_message(self):
    """Only the cells whose index appears in the column mapping end up in the record data."""
    sheet_name = "my_sheet"
    row = ["v1", "v2", "v3", "v4"]
    # Indexes 1 and 2 are absent from the mapping, so "v2" and "v3" must be dropped.
    index_to_column = {0: "c1", 3: "c4"}
    expected = AirbyteRecordMessage(stream=sheet_name, data={"c1": "v1", "c4": "v4"}, emitted_at=1)

    actual = Helpers.row_data_to_record_message(sheet_name, row, index_to_column)

    # emitted_at is time-dependent, so only stream and data are compared.
    self.assertEqual(expected.stream, actual.stream)
    self.assertEqual(expected.data, actual.data)
def _read_record(self, client: Client, stream: str):
    """
    Yield an AirbyteRecordMessage for every record of ``stream``.

    :param client: client exposing the ``lists`` and ``campaigns`` readers
    :param stream: ``"Lists"`` or ``"Campaigns"``; any other name raises ``KeyError``
    :return: generator of AirbyteRecordMessage
    """
    readers = {
        "Lists": client.lists,
        "Campaigns": client.campaigns,
    }
    fetch = readers[stream]

    yield from (
        AirbyteRecordMessage(stream=stream, data=entity, emitted_at=int(datetime.now().timestamp()) * 1000)
        for entity in fetch()
    )
def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """
    Perform the configured request and return its JSON body as a single RECORD message.

    This is a plain function returning a generator, so the request is made as soon as ``read``
    is called rather than on first iteration.

    :param logger: unused
    :param config_container: object whose ``rendered_config`` is passed to ``self._make_request``
    :param catalog_path: unused
    :param state: unused
    :return: generator yielding exactly one AirbyteMessage
    :raises Exception: when the response status code is not 200
    """
    response = self._make_request(config_container.rendered_config)
    if response.status_code != 200:
        raise Exception(f"Request failed. {response.text}")

    # need to eagerly fetch the json.
    record = AirbyteRecordMessage(
        stream=SourceRestApi.STREAM_NAME,
        data=response.json(),
        emitted_at=int(datetime.now().timestamp()) * 1000,
    )
    messages = [AirbyteMessage(type=Type.RECORD, record=record)]
    return (item for item in messages)
def row_data_to_record_message(sheet_name: str, cell_values: List[str], column_index_to_name: Dict[int, str]) -> AirbyteRecordMessage:
    """
    Build a record from one sheet row, keeping only the mapped, non-blank cells.

    :param sheet_name: name used as the record's stream
    :param cell_values: the row's cell values, addressed by position
    :param column_index_to_name: cell index -> column name for the columns to keep
    :return: AirbyteRecordMessage whose data maps column name to cell value
    """
    row_length = len(cell_values)
    # Indexes are visited in ascending order; those beyond the end of the row are ignored
    # and cells holding only whitespace are left out.
    data = {
        column_name: cell_values[index]
        for index, column_name in sorted(column_index_to_name.items())
        if index < row_length and cell_values[index].strip() != ""
    }
    emitted_at = int(datetime.now().timestamp()) * 1000
    return AirbyteRecordMessage(stream=sheet_name, data=data, emitted_at=emitted_at)
def read(
    logger, shell_command, is_message=(lambda x: True), transform=(lambda x: x)
) -> Generator[AirbyteMessage, None, None]:
    """
    Run ``shell_command`` and translate its stdout lines into AirbyteMessages.

    stdout and stderr are multiplexed with a selector. stderr lines are logged as ERROR.
    stdout lines that ``to_json`` cannot parse, or that ``is_message`` rejects, are logged as
    INFO. Accepted lines become STATE or RECORD messages, while SCHEMA and ACTIVATE_VERSION
    lines are dropped.

    :param logger: logger providing ``log_by_prefix(line, level)``
    :param shell_command: command line executed through the shell
    :param is_message: predicate applied to the parsed JSON of each stdout line
    :param transform: applied to the parsed JSON and again to the resulting AirbyteMessage
    :return: generator of AirbyteMessage
    """
    # NOTE(review): shell=True hands shell_command to the shell verbatim; it must never be
    # assembled from untrusted input.
    with subprocess.Popen(
        shell_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
    ) as p, selectors.DefaultSelector() as sel:
        # The selector is a context manager, so its resources are released on exit.
        sel.register(p.stdout, selectors.EVENT_READ)
        sel.register(p.stderr, selectors.EVENT_READ)

        # Keep reading until BOTH pipes have reached EOF. Stopping at the first EOF would drop
        # lines still pending on the other pipe.
        while sel.get_map():
            for key, _ in sel.select():
                stream = key.fileobj
                line = stream.readline()
                if not line:
                    sel.unregister(stream)
                    continue

                if stream is p.stderr:
                    logger.log_by_prefix(line, "ERROR")
                    continue

                out_json = to_json(line)
                if out_json is None or not is_message(out_json):
                    logger.log_by_prefix(line, "INFO")
                    continue

                transformed_json = transform(out_json)
                if transformed_json is None:
                    continue

                message_type = transformed_json.get("type")
                if message_type in ("SCHEMA", "ACTIVATE_VERSION"):
                    continue

                if message_type == "STATE":
                    out_record = AirbyteStateMessage(data=transformed_json["value"])
                    out_message = AirbyteMessage(type=Type.STATE, state=out_record)
                else:
                    # todo: check that messages match the discovered schema
                    out_record = AirbyteRecordMessage(
                        stream=transformed_json["stream"],
                        data=transformed_json["record"],
                        emitted_at=int(datetime.now().timestamp()) * 1000,
                    )
                    out_message = AirbyteMessage(type=Type.RECORD, record=out_record)

                # NOTE(review): transform is applied a second time, now to the AirbyteMessage;
                # kept as in the original - confirm this is intended.
                yield transform(out_message)
def read_stream(self, stream: AirbyteStream) -> Generator[AirbyteRecordMessage, None, None]:
    """
    Yield records from stream

    :param stream: stream whose ``name`` selects the reader in ``self._stream_methods``
    :return: generator of AirbyteRecordMessage
    :raises ValueError: when no reader is registered for the stream name
    """
    stream_name = stream.name
    reader = self._stream_methods.get(stream_name)
    if not reader:
        raise ValueError(f"Client does not know how to read stream `{stream.name}`")

    for payload in reader():
        yield AirbyteRecordMessage(
            stream=stream.name,
            data=payload,
            emitted_at=int(datetime.now().timestamp()) * 1000,
        )
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """
    Read the configured Smartsheet sheet and yield one RECORD message per row for every configured stream.

    Column names come from the stream's JSON schema ``properties`` and are paired positionally
    with the cells of each row.

    :param logger: logger used for progress and error reporting
    :param config: connector configuration; reads ``access_token`` and ``spreadsheet_id``
    :param catalog: configured catalog; one pass over the sheet is made per stream
    :param state: not used by this implementation
    :return: generator of RECORD AirbyteMessages
    :raises Exception: when the sheet cannot be fetched or parsed, after it has been logged
    """
    access_token = config["access_token"]
    spreadsheet_id = config["spreadsheet_id"]
    smartsheet_client = smartsheet.Smartsheet(access_token)

    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        name = stream.name
        properties = stream.json_schema["properties"]
        if isinstance(properties, list):
            columns = tuple(key for dct in properties for key in dct.keys())
        elif isinstance(properties, dict):
            columns = tuple(properties)
        else:
            logger.error("Could not read properties from the JSONschema in this stream")
            # Without column names nothing sensible can be emitted. Skip the stream rather
            # than fall through with `columns` unbound or left over from a previous stream.
            continue

        try:
            sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
            sheet = json.loads(str(sheet))  # make it subscriptable
            logger.info(f"Starting syncing spreadsheet {sheet['name']}")
            logger.info(f"Row count: {sheet['totalRowCount']}")

            for row in sheet["rows"]:
                try:
                    # A cell without a "value" key maps to None instead of aborting the sync
                    # (assumes the API omits the key for empty cells - TODO confirm).
                    values = tuple(cell.get("value") for cell in row["cells"])
                    data = dict(zip(columns, values))
                    message = AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
                    )
                except Exception as e:
                    logger.error(f"Unable to encode row into an AirbyteMessage with the following error: {e}")
                    continue
                # Yield outside the per-row guard so that only encoding errors are swallowed.
                yield message
        except Exception:
            logger.error(f"Could not read smartsheet: {name}")
            raise
    logger.info(f"Finished syncing spreadsheet with ID: {spreadsheet_id}")
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """
    Emit one fixed demo record followed by one fixed demo state message.

    :param logger: logger that receives the "Reading ..." line
    :param config_container: object whose ``rendered_config_path`` is logged
    :param catalog_path: only logged
    :param state_path: only logged
    :return: generator yielding a RECORD message and then a STATE message
    """
    config_path = config_container.rendered_config_path
    logger.info(f'Reading ({config_path}, {catalog_path}, {state_path})...')

    demo_record = AirbyteRecordMessage(
        stream='love_airbyte',
        data={'love': True},
        emitted_at=int(time.time() * 1000),
    )
    yield AirbyteMessage(type='RECORD', record=demo_record)

    demo_state = AirbyteStateMessage(data={'love_cursor': 'next_version'})
    yield AirbyteMessage(type='STATE', state=demo_state)
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Perform the configured HTTP request and return its JSON body as a single RECORD message.

    This is a plain function returning a generator, so the request is made as soon as ``read``
    is called rather than on first iteration.

    :param logger: unused
    :param config: connector configuration, passed to ``self._make_request``
    :param catalog: unused
    :param state: unused
    :return: generator yielding exactly one AirbyteMessage
    :raises Exception: when the response status code is not 200
    """
    response = self._make_request(config)
    if response.status_code != 200:
        raise Exception(f"Request failed. {response.text}")

    # need to eagerly fetch the json.
    record = AirbyteRecordMessage(
        stream=SourceHttpRequest.STREAM_NAME,
        data=response.json(),
        emitted_at=int(datetime.now().timestamp()) * 1000,
    )
    messages = [AirbyteMessage(type=Type.RECORD, record=record)]
    return (item for item in messages)
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state.

    The Coda docs endpoint is called once through ``self._api_call`` and its result is emitted
    as a single RECORD message on the ``CodaRows`` stream.

    :param logger: Logging object to display debug/info/error to the logs (unused here)
    :param config: Json object containing the configuration of this source; ``api_key`` is read
    :param catalog: The configured catalog (unused here)
    :param state: Checkpoint state from previous runs (unused here)
    :return: A generator that produces AirbyteRecordMessages wrapped in AirbyteMessage objects
    """
    coda_token = config["api_key"]
    headers = {'Authorization': f'Bearer {coda_token}'}
    docs_uri = 'https://coda.io/apis/v1/docs'
    # NOTE(review): an ``{'isOwner': True}`` query filter used to be built here but was never
    # sent with the request; confirm whether ``_api_call`` should receive it.

    stream_name = "CodaRows"  # Example
    data = self._api_call(docs_uri, coda_token, headers)

    yield AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
    )


# ********************** END - Implementing read connection *************************

# from airbyte-integrations/connectors/source-<source-name>
# python main_dev.py spec
# python main_dev.py check --config secrets/config.json
# python main_dev.py discover --config secrets/config.json
# python main_dev.py read --config secrets/config.json --catalog sample_files/configured_catalog.json
# python main_dev.py read --config secrets/config.json --catalog source_code_connector/schema/configured_catalog.json
def read(
    shell_command, is_message=(lambda x: True), transform=(lambda x: x)
) -> Generator[AirbyteMessage, None, None]:
    """
    Run ``shell_command`` and translate its stdout lines into AirbyteMessages.

    stderr lines are logged as ERROR. stdout lines that ``to_json`` cannot parse, or that
    ``is_message`` rejects, are logged as INFO. Accepted lines become STATE or RECORD messages,
    while SCHEMA lines are dropped.

    The two pipes are multiplexed with a selector. Pairing them with ``zip`` read them in
    lock-step: it blocked on stderr after every stdout line and ended as soon as either pipe
    was exhausted, silently discarding whatever was left on the other one.

    :param shell_command: command line executed through the shell
    :param is_message: predicate applied to the parsed JSON of each stdout line
    :param transform: applied to the parsed JSON and again to the resulting AirbyteMessage
    :return: generator of AirbyteMessage
    """
    # Imported locally so this function does not depend on a module-level import being present.
    import selectors

    # NOTE(review): shell=True hands shell_command to the shell verbatim; it must never be
    # assembled from untrusted input.
    with subprocess.Popen(
        shell_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True
    ) as p, selectors.DefaultSelector() as sel:
        sel.register(p.stdout, selectors.EVENT_READ)
        sel.register(p.stderr, selectors.EVENT_READ)

        # Each pipe is unregistered on EOF; the loop ends once both are drained.
        while sel.get_map():
            for key, _ in sel.select():
                stream = key.fileobj
                line = stream.readline()
                if not line:
                    sel.unregister(stream)
                    continue

                if stream is p.stderr:
                    log_line(line, "ERROR")
                    continue

                out_json = to_json(line)
                if out_json is None or not is_message(out_json):
                    log_line(line, "INFO")
                    continue

                transformed_json = transform(out_json)
                if transformed_json is None:
                    continue

                message_type = transformed_json.get('type')
                if message_type == "SCHEMA":
                    continue

                if message_type == "STATE":
                    out_record = AirbyteStateMessage(data=transformed_json["value"])
                    out_message = AirbyteMessage(type="STATE", state=out_record)
                else:
                    # todo: remove type from record
                    # todo: handle stream designation
                    # todo: check that messages match the discovered schema
                    out_record = AirbyteRecordMessage(
                        stream=transformed_json["stream"],
                        data=transformed_json["record"],
                        emitted_at=int(datetime.now().timestamp()) * 1000)
                    out_message = AirbyteMessage(type="RECORD", record=out_record)

                # NOTE(review): transform is applied a second time, now to the AirbyteMessage;
                # kept as in the original - confirm this is intended.
                yield transform(out_message)
def _airbyte_message_from_json(transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
    """
    Convert one parsed output line into an AirbyteMessage.

    :param transformed_json: parsed (and already transformed) JSON object, or None
    :return: a STATE message for ``type == "STATE"``, None for a missing input or for
        ``SCHEMA`` / ``ACTIVATE_VERSION`` lines, and a RECORD message for everything else
    """
    if transformed_json is None:
        return None

    message_type = transformed_json.get("type")
    if message_type in ("SCHEMA", "ACTIVATE_VERSION"):
        return None

    if message_type == "STATE":
        state_payload = AirbyteStateMessage(data=transformed_json["value"])
        return AirbyteMessage(type=Type.STATE, state=state_payload)

    # todo: check that messages match the discovered schema
    record_payload = AirbyteRecordMessage(
        stream=transformed_json["stream"],
        data=transformed_json["record"],
        emitted_at=int(datetime.now().timestamp()) * 1000,
    )
    return AirbyteMessage(type=Type.RECORD, record=record_payload)
def read(
    self, logger: AirbyteLogger, config: Mapping, catalog: ConfiguredAirbyteCatalog, state_path: Mapping[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state.

    :param logger: logger used for progress and error reporting
    :param config: connector configuration, passed to ``self._get_client``
    :param catalog: configured catalog from which the selected fields are derived
    :param state_path: not used by this implementation
    :return: generator yielding one RECORD message per row returned by the client
    :raises Exception: whatever reading raised, after it has been logged with its traceback
    """
    client = self._get_client(config)
    fields = self.selected_fields(catalog)
    name = client.stream_name

    logger.info(f"Reading {name} ({client.reader.full_url})...")
    try:
        for row in client.read(fields=fields):
            yield AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(
                    stream=name,
                    data=row,
                    emitted_at=int(datetime.now().timestamp()) * 1000,
                ),
            )
    except Exception as err:
        logger.error(f"Failed to read data of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}")
        raise err
def _read_stream(
    self,
    logger: AirbyteLogger,
    stream_instance: Stream,
    configured_stream: ConfiguredAirbyteStream,
    state: MutableMapping[str, Any],
) -> Iterator[AirbyteMessage]:
    """
    Sync one stream, yielding its records and, in incremental mode, periodic and final STATE messages.

    :param logger: logger used for progress reporting
    :param stream_instance: stream implementation that reads records and computes updated state
    :param configured_stream: stream to sync together with its sync mode
    :param state: shared state mapping keyed by stream name; updated in place
    :return: iterator of AirbyteMessage
    """
    stream_name = configured_stream.stream.name
    # Incremental only when it was requested AND the stream supports it.
    incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental

    stream_state = {}
    if incremental and state.get(stream_name):
        logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
        stream_state = state.get(stream_name)

    logger.info(f"Syncing stream: {stream_name} ")

    # The stream gets its own copy so the state tracked here is never shared with it.
    records = stream_instance.read_stream(configured_stream=configured_stream, stream_state=copy.deepcopy(stream_state))
    for record_counter, record in enumerate(records, start=1):
        yield AirbyteMessage(
            type=MessageType.RECORD,
            record=AirbyteRecordMessage(
                stream=stream_name,
                data=record,
                emitted_at=int(datetime.now().timestamp()) * 1000,
            ),
        )

        if not incremental:
            continue
        stream_state = stream_instance.get_updated_state(stream_state, record)
        # Checkpoint every `state_checkpoint_interval` records.
        if record_counter % stream_instance.state_checkpoint_interval == 0:
            state[stream_name] = stream_state
            yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))

    if incremental and stream_state:
        state[stream_name] = stream_state
        # output state object only together with other stream states
        yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))
def _read_record(self, client: Client, stream: str):
    """
    Yield an AirbyteRecordMessage for every entity the client returns for ``stream``.

    :param client: client exposing ``get_entities(stream)``
    :param stream: name of the stream to read
    :return: generator of AirbyteRecordMessage
    """
    yield from (
        AirbyteRecordMessage(stream=stream, data=entity, emitted_at=int(datetime.now().timestamp()) * 1000)
        for entity in client.get_entities(stream)
    )
def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
    """
    Wrap ``data`` in a RECORD AirbyteMessage for ``stream_name``, stamped with the current time.

    :param stream_name: name of the stream the record belongs to
    :param data: record payload
    :return: AirbyteMessage of type RECORD
    """
    return AirbyteMessage(
        type=MessageType.RECORD,
        record=AirbyteRecordMessage(
            stream=stream_name,
            data=data,
            emitted_at=int(datetime.now().timestamp()) * 1000,
        ),
    )
def read(
    self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]
) -> Generator[AirbyteMessage, None, None]:
    """
    Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state.

    Every configured stream is dispatched to its fetch helper (site metadata, sensor metadata or
    sensor data) and each returned item is emitted as a RECORD message.

    :param logger: Logging object to display debug/info/error to the logs
    :param config: Json object containing the configuration of this source
    :param catalog: The configured catalog; one pass is made per entry in ``catalog.streams``
    :param state: Checkpoint state from previous runs, handed on to the fetch helpers
    :return: A generator that produces AirbyteRecordMessages wrapped in AirbyteMessage objects
    :raises NotImplementedError: when the catalog contains a stream this source does not handle
    """
    # iterate configured streams and fetch their data
    for stream in catalog.streams:
        stream_name = stream.stream.name
        is_incremental = stream.sync_mode == SyncMode.incremental
        logger.info(f"incremental state for stream {stream_name}: {is_incremental}: stream.sync_mode = '{stream.sync_mode}', SyncMode.incremental = '{SyncMode.incremental}'")

        req_url = get_request_url(stream_name, config)
        if stream_name == StreamGetSiteMetaData:
            data = get_site_metadata(req_url, logger, state, config, stream_name, is_incremental)
        elif stream_name == StreamGetSensorMetaData:
            data = get_sensor_metadata(req_url, logger, state, config, stream_name, is_incremental)
        elif stream_name == StreamGetSensorData:
            data = get_sensor_data(logger, state, config, stream_name, is_incremental)
        else:
            # The message used to interpolate an undefined name, which raised NameError
            # instead of this error.
            raise NotImplementedError(f"read(): don't handle stream {stream_name} found in catalog")

        result_count = 0
        for d in data:
            result_count += 1
            yield AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(stream=stream_name, data=d, emitted_at=int(datetime.now().timestamp()) * 1000),
            )

        if result_count < 1:
            logger.debug(f'no new data for {stream_name}: state={state.get(stream_name)}')
    # NOTE(review): the request-URL lookups that ran before and after this loop were removed
    # because their results were never used; this presumes get_request_url has no side effects.