def _read_stream(self, logger: AirbyteLogger, client: BaseClient, configured_stream: ConfiguredAirbyteStream, state: MutableMapping[str, Any]):
    """Read one configured stream and yield its RECORD messages, followed by a
    STATE message when incremental sync is in use.

    :param logger: logger used for progress/state messages
    :param client: client exposing read_stream and stream-state accessors
    :param configured_stream: stream definition plus its configured sync mode
    :param state: mutable mapping of stream name -> saved stream state; updated
        in place so all streams' states are emitted together
    """
    stream_name = configured_stream.stream.name
    use_incremental = configured_stream.sync_mode == SyncMode.incremental and client.stream_has_state(stream_name)

    # Seed the client with previously saved state so it can resume.
    saved_state = state.get(stream_name)
    if use_incremental and saved_state:
        logger.info(f"Set state of {stream_name} stream to {saved_state}")
        client.set_stream_state(stream_name, saved_state)

    logger.info(f"Syncing {stream_name} stream")
    for record in client.read_stream(configured_stream.stream):
        # emitted_at is milliseconds since epoch: multiply BEFORE truncating so
        # sub-second precision is kept (the original int(ts) * 1000 rounded
        # every record down to whole seconds).
        now = int(datetime.now().timestamp() * 1000)
        message = AirbyteRecordMessage(stream=stream_name, data=record, emitted_at=now)
        yield AirbyteMessage(type=MessageType.RECORD, record=message)

    # Output the state object only together with the other stream states.
    new_state = client.get_stream_state(stream_name) if use_incremental else None
    if new_state:
        state[stream_name] = new_state
        yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=state))
def run(self, parsed_args: argparse.Namespace) -> Iterable[str]:
    """Dispatch the parsed CLI command (spec/check/discover/read) against
    ``self.source`` and yield every resulting Airbyte message as a JSON line.

    :param parsed_args: argparse namespace; must carry ``command`` and, for
        config-based commands, ``config`` (plus ``catalog``/``state`` for read)
    :raises Exception: on a missing or unknown command
    """
    cmd = parsed_args.command
    if not cmd:
        raise Exception("No command passed")
    # Optional --debug flag raises log verbosity for this run only.
    if hasattr(parsed_args, "debug") and parsed_args.debug:
        self.logger.setLevel(logging.DEBUG)
        self.logger.debug("Debug logs enabled")
    else:
        self.logger.setLevel(logging.INFO)
    # todo: add try catch for exceptions with different exit codes
    source_spec: ConnectorSpecification = self.source.spec(self.logger)
    with tempfile.TemporaryDirectory() as temp_dir:
        if cmd == "spec":
            # `spec` needs no user config; emit the connector specification.
            message = AirbyteMessage(type=Type.SPEC, spec=source_spec)
            yield message.json(exclude_unset=True)
        else:
            raw_config = self.source.read_config(parsed_args.config)
            config = self.source.configure(raw_config, temp_dir)
            # Now that we have the config, use it to collect the airbyte
            # secrets that must be filtered from logging to avoid leaks.
            config_secrets = get_secrets(
                source_spec.connectionSpecification, config)
            update_secrets(config_secrets)
            # Remove internal flags from config before validating so
            # jsonschema's additionalProperties flag wont fail the validation
            connector_config, _ = split_config(config)
            if self.source.check_config_against_spec or cmd == "check":
                check_config_against_spec_or_exit(connector_config,
                                                  source_spec)
            if cmd == "check":
                check_result = self.source.check(self.logger, config)
                if check_result.status == Status.SUCCEEDED:
                    self.logger.info("Check succeeded")
                else:
                    self.logger.error("Check failed")
                output_message = AirbyteMessage(
                    type=Type.CONNECTION_STATUS,
                    connectionStatus=check_result).json(exclude_unset=True)
                yield output_message
            elif cmd == "discover":
                catalog = self.source.discover(self.logger, config)
                yield AirbyteMessage(
                    type=Type.CATALOG,
                    catalog=catalog).json(exclude_unset=True)
            elif cmd == "read":
                config_catalog = self.source.read_catalog(parsed_args.catalog)
                state = self.source.read_state(parsed_args.state)
                # read() is a generator of AirbyteMessage; serialize each one.
                generator = self.source.read(self.logger, config,
                                             config_catalog, state)
                for message in generator:
                    yield message.json(exclude_unset=True)
            else:
                raise Exception("Unexpected command " + cmd)
def read(
    self,
    logger: logging.Logger,
    config: Mapping[str, Any],
    catalog: ConfiguredAirbyteCatalog,
    state: MutableMapping[str, Any] = None
) -> Generator[AirbyteMessage, None, None]:
    """Run the configured report and yield RECORD messages for rows at or
    after the saved cursor, followed by a single STATE message.

    :param logger: logging object to display debug/info/error to the logs
    :param config: connector configuration; must contain "report_name"
    :param catalog: configured catalog (the report itself drives what is read)
    :param state: mapping of report name -> {cursor_field: last_value} saved
        from a previous run; may be None on the very first sync
    :return: a generator of AirbyteMessage (RECORDs, then one STATE)
    """
    # Bug fix: `state` defaults to None and was previously passed straight to
    # state.get(...), crashing with AttributeError on a first sync with no
    # saved state. Normalize to an empty mapping.
    state = state or {}
    report_name = config.get("report_name")
    response = self._run_report(config)
    rows = Client.response_to_list(response)
    # Empty-string cursor sorts before any real value, so missing state means
    # "emit everything".
    last_cursor_value = state.get(report_name, {}).get(DEFAULT_CURSOR_FIELD, "")
    for row in rows:
        # `<=` (not `<`) deliberately re-emits rows sharing the saved cursor
        # value — at-least-once semantics for coarse cursors.
        if last_cursor_value <= row[DEFAULT_CURSOR_FIELD]:
            yield AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(
                    stream=report_name,
                    data=row,
                    # millis since epoch: multiply before truncating so
                    # sub-second precision is preserved
                    emitted_at=int(datetime.now().timestamp() * 1000)),
            )
            last_cursor_value = row[DEFAULT_CURSOR_FIELD]
    yield AirbyteMessage(
        type=Type.STATE,
        state=AirbyteStateMessage(
            data={report_name: {DEFAULT_CURSOR_FIELD: last_cursor_value}}))
def run(self, parsed_args: argparse.Namespace) -> Iterable[str]:
    """Dispatch the parsed CLI command (spec/check/discover/read) against
    ``self.source`` and yield every resulting Airbyte message as a JSON line.

    :param parsed_args: argparse namespace; must carry ``command`` and, for
        config-based commands, ``config`` (plus ``catalog``/``state`` for read)
    :raises Exception: on a missing or unknown command
    """
    cmd = parsed_args.command
    if not cmd:
        raise Exception("No command passed")
    # todo: add try catch for exceptions with different exit codes
    source_spec = self.source.spec(self.logger)
    with tempfile.TemporaryDirectory() as temp_dir:
        if cmd == "spec":
            # `spec` needs no user config; emit the connector specification.
            message = AirbyteMessage(type=Type.SPEC, spec=source_spec)
            yield message.json(exclude_unset=True)
        else:
            raw_config = self.source.read_config(parsed_args.config)
            config = self.source.configure(raw_config, temp_dir)
            # Remove internal flags from config before validating so
            # jsonschema's additionalProperties flag wont fail the validation
            config, internal_config = split_config(config)
            if self.source.check_config_against_spec or cmd == "check":
                check_config_against_spec_or_exit(config, source_spec,
                                                  self.logger)
            # Put internal flags back to config dict
            config.update(internal_config.dict())
            if cmd == "check":
                check_result = self.source.check(self.logger, config)
                if check_result.status == Status.SUCCEEDED:
                    self.logger.info("Check succeeded")
                else:
                    self.logger.error("Check failed")
                output_message = AirbyteMessage(
                    type=Type.CONNECTION_STATUS,
                    connectionStatus=check_result).json(exclude_unset=True)
                yield output_message
            elif cmd == "discover":
                catalog = self.source.discover(self.logger, config)
                yield AirbyteMessage(
                    type=Type.CATALOG,
                    catalog=catalog).json(exclude_unset=True)
            elif cmd == "read":
                config_catalog = self.source.read_catalog(parsed_args.catalog)
                state = self.source.read_state(parsed_args.state)
                # read() is a generator of AirbyteMessage; serialize each one.
                generator = self.source.read(self.logger, config,
                                             config_catalog, state)
                for message in generator:
                    yield message.json(exclude_unset=True)
            else:
                raise Exception("Unexpected command " + cmd)
def test_source_streams():
    """Discover the faker source's catalog, pin the users schema, and check
    every stream schema is a valid Draft-7 JSON Schema."""
    source = SourceFaker()
    discovered = source.discover(None, {"count": 1})
    catalog_dict = AirbyteMessage(type=Type.CATALOG, catalog=discovered).dict(exclude_unset=True)
    stream_schemas = [entry["json_schema"] for entry in catalog_dict["catalog"]["streams"]]

    assert len(stream_schemas) == 3
    expected_user_properties = {
        "id": {"type": "number"},
        "created_at": {"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"},
        "updated_at": {"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"},
        "job": {"type": "string"},
        "company": {"type": "string"},
        "ssn": {"type": "string"},
        "residence": {"type": "string"},
        "current_location": {"type": "array"},
        "blood_group": {"type": "string"},
        "website": {"type": "array"},
        "username": {"type": "string"},
        "name": {"type": "string"},
        "sex": {"type": "string"},
        "address": {"type": "string"},
        "mail": {"type": "string"},
    }
    assert stream_schemas[0]["properties"] == expected_user_properties
    for stream_schema in stream_schemas:
        jsonschema.Draft7Validator.check_schema(stream_schema)
def test_configure_catalog():
    """configure_catalog() should read a CATALOG message from stdin and write a
    matching configured catalog to integration_tests/configured_catalog.json."""
    source_stream = AirbyteStream(name="stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={})
    discovered = AirbyteCatalog(streams=[source_stream])
    # Feed the discovered catalog to configure_catalog via stdin.
    sys.stdin = io.StringIO(AirbyteMessage(type=Type.CATALOG, catalog=discovered).json())

    expected = json.loads(
        ConfiguredAirbyteCatalog(streams=[
            ConfiguredAirbyteStream(
                stream=source_stream,
                sync_mode=SyncMode.full_refresh,
                destination_sync_mode=DestinationSyncMode.append)
        ]).json()
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        configure_catalog()
        assert os.path.exists("integration_tests/configured_catalog.json")
        with open("integration_tests/configured_catalog.json") as f:
            written = json.loads(f.read())
        assert written == expected
def test_infer_schemas():
    """infer_schemas() should derive schemas/stream.json from a RECORD message
    supplied on stdin."""
    expected_schema = {
        "$schema": "http://json-schema.org/schema#",
        "properties": {
            "a": {
                "type": "integer"
            },
            "b": {
                "type": "string"
            }
        },
        "type": "object",
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        # One record with an int and a str field drives the inference.
        message_json = AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(stream="stream", data={"a": 1, "b": "test"}, emitted_at=111),
        ).json()
        sys.stdin = io.StringIO(message_json)
        infer_schemas()
        assert os.path.exists("schemas/stream.json")
        with open("schemas/stream.json") as f:
            inferred = json.loads(f.read())
        assert inferred == expected_schema
def airbyte_message_from_data(raw_data: List[Any], columns: List[str], table_name: str) -> Optional[AirbyteMessage]:
    """
    Wrap data into an AirbyteMessage.

    :param raw_data: Raw data row returned from a fetch query. Each item in
        the list represents a row of data. Example: [10, "Oranges"]
    :param columns: List of column names. Example: ["Quantity", "Fruit"]
    :param table_name: Name of the table the data was fetched from
    :return: AirbyteMessage containing parsed data, or None when every value
        in the row is empty
    """
    formatted = format_fetch_result(raw_data)
    # Pair columns with values, dropping empty (None) values in one pass.
    data = {column: value for column, value in zip(columns, formatted) if value is not None}
    if not data:
        return None
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=table_name,
            data=data,
            # millis since epoch: multiply before truncating so sub-second
            # precision is preserved (int(ts) * 1000 rounded to whole seconds)
            emitted_at=int(datetime.now().timestamp() * 1000),
        ),
    )
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """Template read(): yields one hard-coded example record.

    :param logger: logging object to display debug/info/error to the logs
        (logs are not visible in the Airbyte UI unless routed through it)
    :param config: JSON object with this source's configuration, as described
        by the properties of spec.json
    :param catalog: ConfiguredAirbyteCatalog — the discovered catalog after
        UI-side configuration (stream/column selection, renames, etc.)
    :param state: checkpoint state saved by a previous run, used to resume
        replication instead of re-reading everything
    :return: a generator producing AirbyteMessage objects wrapping
        AirbyteRecordMessage payloads
    """
    # Not Implemented — example payload only.
    stream_name = "TableName"  # Example
    data = {"columnName": "Hello World"}  # Example
    example_record = AirbyteRecordMessage(
        stream=stream_name,
        data=data,
        emitted_at=int(datetime.now().timestamp()) * 1000,
    )
    yield AirbyteMessage(type=Type.RECORD, record=example_record)
def test_read(schema, record, should_fail):
    """_TestBasicRead.test_read must reject records whose stream schema shares
    no fields with the emitted data (and accept them otherwise)."""
    configured_catalog = ConfiguredAirbyteCatalog(streams=[
        ConfiguredAirbyteStream(
            stream=AirbyteStream.parse_obj({
                "name": "test_stream",
                "json_schema": schema
            }),
            sync_mode="full_refresh",
            destination_sync_mode="overwrite",
        )
    ])
    input_config = BasicReadTestConfig()
    # The docker runner is mocked to return a single canned record.
    runner = MagicMock()
    runner.call_read.return_value = [
        AirbyteMessage(type=Type.RECORD,
                       record=AirbyteRecordMessage(stream="test_stream", data=record, emitted_at=111))
    ]
    tester = _TestBasicRead()
    if not should_fail:
        tester.test_read(None, configured_catalog, input_config, [], runner, MagicMock())
        return
    with pytest.raises(
            AssertionError,
            match="stream should have some fields mentioned by json schema"):
        tester.test_read(None, configured_catalog, input_config, [], runner, MagicMock())
def _record() -> AirbyteMessage:
    """Fixture: one RECORD message on TEST_STREAM/TEST_NAMESPACE carrying
    TEST_MESSAGE."""
    payload = AirbyteRecordMessage(
        stream=TEST_STREAM,
        data=TEST_MESSAGE,
        emitted_at=0,
        namespace=TEST_NAMESPACE,
    )
    return AirbyteMessage(type=Type.RECORD, record=payload)
def read(self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, any]) -> Generator[AirbyteMessage, None, None]:
    """Read every full_refresh stream in the catalog from its table and yield
    the rows as RECORD messages; incremental streams only get a warning.

    :param logger: logging object to display debug/info/error to the logs
    :param config: JSON object with this source's configuration
    :param catalog: configured catalog of streams to sync
    :param state: previous checkpoint state (unused — incremental unsupported)
    :return: generator of AirbyteMessage objects
    :raises Exception: re-raises any read failure after logging it
    """
    # Bug fix: stream_name was only assigned inside the full_refresh branch,
    # so the incremental warning — and the except clause below — could hit a
    # NameError (first stream incremental / failure before assignment) or log
    # a stale name. Define it up front and set it once per stream.
    stream_name = "<unknown>"
    try:
        for configured_stream in catalog.streams:
            stream_name = configured_stream.stream.name
            if configured_stream.sync_mode == SyncMode.full_refresh:
                reader = Reader(logger, config)
                table_client = reader.get_table_client(stream_name)
                logger.info(f"Reading data from stream '{stream_name}'")
                for row in reader.read(table_client, None):
                    # Timestamp property is in metadata object:
                    # row.metadata.timestamp
                    row["additionalProperties"] = True
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=stream_name,
                            data=row,
                            # millis since epoch: multiply before truncating
                            emitted_at=int(datetime.now().timestamp() * 1000)),
                    )
            if configured_stream.sync_mode == SyncMode.incremental:
                logger.warn(
                    f"Incremental sync is not supported by stream {stream_name}"
                )
    except Exception as err:
        reason = f"Failed to read data of {stream_name}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        raise err
def format(self, record: logging.LogRecord) -> str:
    """Serialize a log record as a single-line Airbyte LOG message (JSON).

    :param record: the stdlib log record to wrap
    :return: JSON string of an AirbyteMessage with type LOG
    """
    text = super().format(record)
    # Map the stdlib numeric level onto the Airbyte level name, INFO fallback.
    level = self.level_mapping.get(record.levelno, "INFO")
    wrapped = AirbyteMessage(type="LOG", log=AirbyteLogMessage(level=level, message=text))
    return wrapped.json(exclude_unset=True)
def _checkpoint_state(self, stream, stream_state, connector_state):
    """Record the given stream's latest state into the connector-wide state
    mapping and wrap the whole mapping in a STATE message.

    :param stream: stream object; may expose its own ``.state`` attribute
    :param stream_state: fallback state for streams without a ``.state``
    :param connector_state: mutable mapping of stream name -> state (updated)
    """
    # Streams that manage their own state expose `.state`; otherwise use the
    # state the caller captured (same effect as the original try/except
    # AttributeError around the attribute access).
    connector_state[stream.name] = getattr(stream, "state", stream_state)
    return AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=connector_state))
def _wrap_message(
        submessage: Union[AirbyteConnectionStatus, ConnectorSpecification,
                          AirbyteRecordMessage, AirbyteCatalog]
) -> str:
    """Wrap a protocol sub-message in the AirbyteMessage envelope and
    serialize it to JSON.

    :raises Exception: when the sub-message type is not recognized
    """
    # Ordered (class, envelope type, envelope field) dispatch table; order
    # matches the original isinstance chain.
    wrappers = (
        (AirbyteConnectionStatus, Type.CONNECTION_STATUS, "connectionStatus"),
        (ConnectorSpecification, Type.SPEC, "spec"),
        (AirbyteCatalog, Type.CATALOG, "catalog"),
        (AirbyteRecordMessage, Type.RECORD, "record"),
    )
    for message_cls, message_type, field in wrappers:
        if isinstance(submessage, message_cls):
            envelope = AirbyteMessage(**{"type": message_type, field: submessage})
            return envelope.json(exclude_unset=True)
    raise Exception(f"Unknown message type: {submessage}")
def _record(stream: str, str_value: str, int_value: int) -> AirbyteMessage:
    """Build a RECORD message with one string column and one int column."""
    payload = {"str_col": str_value, "int_col": int_value}
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream=stream, data=payload, emitted_at=0),
    )
def _wrapped(
    msg: Union[AirbyteRecordMessage, AirbyteStateMessage, AirbyteCatalog,
               ConnectorSpecification, AirbyteConnectionStatus]
) -> AirbyteMessage:
    """Wrap a protocol sub-message in the AirbyteMessage envelope.

    :raises Exception: when the sub-message type is not recognized
    """
    # Ordered (class, envelope type, envelope field) dispatch table; order
    # matches the original isinstance chain.
    envelope_fields = (
        (AirbyteRecordMessage, Type.RECORD, "record"),
        (AirbyteStateMessage, Type.STATE, "state"),
        (AirbyteCatalog, Type.CATALOG, "catalog"),
        (AirbyteConnectionStatus, Type.CONNECTION_STATUS, "connectionStatus"),
        (ConnectorSpecification, Type.SPEC, "spec"),
    )
    for message_cls, message_type, field in envelope_fields:
        if isinstance(msg, message_cls):
            return AirbyteMessage(**{"type": message_type, field: msg})
    raise Exception(f"Invalid Airbyte Message: {msg}")
def test_uncaught_exception_handler():
    """Spawn a subprocess that installs the CDK's uncaught-exception handler
    and then raises a non-exception (``raise 1``), and verify that exactly one
    LOG and one TRACE Airbyte message are printed to stdout (nothing on
    stderr)."""
    # -c script: init the airbyte logger, hook the handler, then trigger a
    # TypeError by raising a value that is not a BaseException.
    cmd = "from airbyte_cdk.logger import init_logger; from airbyte_cdk.exception_handler import init_uncaught_exception_handler; logger = init_logger('airbyte'); init_uncaught_exception_handler(logger); raise 1"
    exception_message = "exceptions must derive from BaseException"
    exception_trace = ("Traceback (most recent call last):\n"
                       ' File "<string>", line 1, in <module>\n'
                       "TypeError: exceptions must derive from BaseException")
    # The handler must emit a FATAL log carrying the message plus traceback…
    expected_log_message = AirbyteMessage(
        type="LOG",
        log=AirbyteLogMessage(
            level="FATAL",
            message=f"{exception_message}\n{exception_trace}"))
    # …and a TRACE/ERROR message with a generic user-facing text and the real
    # details in internal_message/stack_trace.
    expected_trace_message = AirbyteMessage(
        type="TRACE",
        trace=AirbyteTraceMessage(
            type="ERROR",
            emitted_at=0.0,
            error=AirbyteErrorTraceMessage(
                failure_type="system_error",
                message=
                "Something went wrong in the connector. See the logs for more details.",
                internal_message=exception_message,
                stack_trace=f"{exception_trace}\n",
            ),
        ),
    )
    # The subprocess exits non-zero, so check_output raises.
    with pytest.raises(subprocess.CalledProcessError) as err:
        subprocess.check_output([sys.executable, "-c", cmd],
                                stderr=subprocess.STDOUT)
    assert not err.value.stderr, "nothing on the stderr"
    stdout_lines = err.value.output.decode("utf-8").strip().split("\n")
    assert len(stdout_lines) == 2
    log_output, trace_output = stdout_lines
    out_log_message = AirbyteMessage.parse_obj(json.loads(log_output))
    assert out_log_message == expected_log_message, "Log message should be emitted in expected form"
    out_trace_message = AirbyteMessage.parse_obj(json.loads(trace_output))
    # emitted_at is wall-clock time; normalize it before comparing.
    assert out_trace_message.trace.emitted_at > 0
    out_trace_message.trace.emitted_at = 0.0  # set a specific emitted_at value for testing
    assert out_trace_message == expected_trace_message, "Trace message should be emitted in expected form"
def _record(stream: str, data: Dict[str, Any], seller_id: str) -> AirbyteMessage:
    """Wrap `data` as a RECORD message, tagging it with `seller_id` when one
    is given (mutates the caller's dict, as before)."""
    if seller_id:
        data["seller_id"] = seller_id
    emitted = int(datetime.now().timestamp()) * 1000
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream=stream, data=data, emitted_at=emitted),
    )
def record_message_from_record(record_: Dict) -> List[AirbyteMessage]:
    """Wrap one raw record dict into a single-element list of RECORD
    messages on the test_stream stream."""
    message = AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream="test_stream", data=record_, emitted_at=111),
    )
    return [message]
def retrieve_all_records(client):
    """Fetch every document from every collection (ordered by int_col
    ascending) and wrap each as an Airbyte RECORD message."""
    messages = []
    for collection in client.collections():
        ordered_docs = collection.order_by(
            "int_col", direction=firestore.Query.ASCENDING).stream()
        for doc in ordered_docs:
            messages.append(
                AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(stream=collection.id,
                                                data=doc.to_dict(),
                                                emitted_at=0)))
    return messages
def _airbyte_message_from_json(
        transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
    """Convert a singer-style message dict into an AirbyteMessage.

    Returns None for inputs with no Airbyte equivalent (missing, SCHEMA,
    ACTIVATE_VERSION); STATE maps to a STATE message, everything else to a
    RECORD message.
    """
    if transformed_json is None:
        return None
    message_type = transformed_json.get("type")
    # SCHEMA and ACTIVATE_VERSION messages have no Airbyte counterpart.
    if message_type in ("SCHEMA", "ACTIVATE_VERSION"):
        return None
    if message_type == "STATE":
        return AirbyteMessage(
            type=Type.STATE,
            state=AirbyteStateMessage(data=transformed_json["value"]))
    # todo: check that messages match the discovered schema
    record = AirbyteRecordMessage(
        stream=transformed_json["stream"],
        data=transformed_json["record"],
        emitted_at=int(datetime.now().timestamp()) * 1000,
    )
    return AirbyteMessage(type=Type.RECORD, record=record)
def test_discover(test_config):
    """Every schema in the discovered Braintree catalog must be a valid
    Draft-7 JSON Schema."""
    discovered = SourceBraintree().discover(None, test_config)
    catalog_dict = AirbyteMessage(type=Type.CATALOG, catalog=discovered).dict(exclude_unset=True)
    for stream in catalog_dict["catalog"]["streams"]:
        jsonschema.Draft7Validator.check_schema(stream["json_schema"])
def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
    """Normalize `data` against the stream's schema and wrap it as an Airbyte
    RECORD message."""
    transformer, schema = self._get_stream_transformer_and_schema(stream_name)
    # Transform object fields according to config; by default this is a no-op
    # unless configured. See docs/connector-development/cdk-python/schemas.md
    # for details.
    transformer.transform(data, schema)
    record = AirbyteRecordMessage(
        stream=stream_name,
        data=data,
        emitted_at=int(datetime.now().timestamp() * 1000),
    )
    return AirbyteMessage(type=MessageType.RECORD, record=record)
def run(self, parsed_args: argparse.Namespace) -> Iterable[str]:
    """Dispatch the parsed CLI command (spec/check/discover/read) against
    ``self.source`` and yield every resulting Airbyte message as a JSON line.

    NOTE(review): this variant uses a module-level ``logger`` rather than
    ``self.logger`` — confirm it is defined in this module.

    :raises Exception: on a missing or unknown command
    """
    cmd = parsed_args.command
    if not cmd:
        raise Exception("No command passed")
    # todo: add try catch for exceptions with different exit codes
    with tempfile.TemporaryDirectory() as temp_dir:
        if cmd == "spec":
            # `spec` needs no user config; emit the connector specification.
            message = AirbyteMessage(type=Type.SPEC,
                                     spec=self.source.spec(logger))
            yield message.json(exclude_unset=True)
        else:
            raw_config = self.source.read_config(parsed_args.config)
            config = self.source.configure(raw_config, temp_dir)
            if cmd == "check":
                check_result = self.source.check(logger, config)
                if check_result.status == Status.SUCCEEDED:
                    logger.info("Check succeeded")
                else:
                    logger.error("Check failed")
                output_message = AirbyteMessage(
                    type=Type.CONNECTION_STATUS,
                    connectionStatus=check_result).json(exclude_unset=True)
                yield output_message
            elif cmd == "discover":
                catalog = self.source.discover(logger, config)
                yield AirbyteMessage(
                    type=Type.CATALOG,
                    catalog=catalog).json(exclude_unset=True)
            elif cmd == "read":
                config_catalog = self.source.read_catalog(parsed_args.catalog)
                state = self.source.read_state(parsed_args.state)
                # read() is a generator of AirbyteMessage; serialize each one.
                generator = self.source.read(logger, config, config_catalog,
                                             state)
                for message in generator:
                    yield message.json(exclude_unset=True)
            else:
                raise Exception("Unexpected command " + cmd)
def test_discover(config):
    """Every schema in the discovered Amazon Ads catalog must be a valid
    Draft-4 JSON Schema."""
    setup_responses()
    discovered = SourceAmazonAds().discover(None, config)
    catalog_dict = AirbyteMessage(type=Type.CATALOG, catalog=discovered).dict(exclude_unset=True)
    for stream in catalog_dict["catalog"]["streams"]:
        Draft4Validator.check_schema(stream["json_schema"])
def records_fixture():
    """Fixture: a single RECORD message with top-level and nested timestamp
    fields."""
    payload = {
        "id": 1,
        "ts_created": "2015-11-01T22:03:11",
        "nested": {"ts_updated": "2015-05-01"},
    }
    record = AirbyteRecordMessage(stream="my_stream", data=payload, emitted_at=0)
    return [AirbyteMessage(type=Type.RECORD, record=record)]
def test_discover_v2(test_config_v2):
    """Every schema in the discovered Chargebee (v2 config) catalog must be a
    valid Draft-7 JSON Schema."""
    logger_mock = MagicMock()
    discovered = SourceChargebee().discover(logger_mock, test_config_v2)
    catalog_dict = AirbyteMessage(type=Type.CATALOG, catalog=discovered).dict(exclude_unset=True)
    for stream in catalog_dict["catalog"]["streams"]:
        Draft7Validator.check_schema(stream["json_schema"])
def airbyte_message2() -> AirbyteMessage:
    """Fixture: a RECORD message for table2 with two sample columns."""
    payload = {
        "key1": "value2",
        "key2": 3
    }
    record = AirbyteRecordMessage(
        stream="table2",
        data=payload,
        emitted_at=int(datetime.now().timestamp()) * 1000,
    )
    return AirbyteMessage(type=Type.RECORD, record=record)
def airbyte_message1(test_table_name: str):
    """Fixture: a RECORD message for the given test table with two sample
    columns."""
    payload = {
        "key1": "value1",
        "key2": 2
    }
    record = AirbyteRecordMessage(
        stream=test_table_name,
        data=payload,
        emitted_at=int(datetime.now().timestamp()) * 1000,
    )
    return AirbyteMessage(type=Type.RECORD, record=record)