def test_incremental_sync(config, configured_catalog):
    today = pendulum.now().date()
    start_date = today.subtract(months=1)
    config["start_date"] = start_date.to_date_string()

    google_ads_client = SourceGoogleAds()
    records = list(google_ads_client.read(AirbyteLogger(), config, ConfiguredAirbyteCatalog.parse_obj(configured_catalog)))
    latest_state = None
    for record in records[::-1]:
        if record and record.type == Type.STATE:
            latest_state = record.state.data["ad_group_ad_report"][config["customer_id"]]["segments.date"]
            break

    for message in records:
        if not message or message.type != Type.RECORD:
            continue
        cursor_value = message.record.data["segments.date"]
        assert cursor_value <= latest_state
        assert cursor_value >= start_date.subtract(days=GAP_DAYS).to_date_string()

    # next sync
    records = list(
        google_ads_client.read(
            AirbyteLogger(),
            config,
            ConfiguredAirbyteCatalog.parse_obj(configured_catalog),
            {"ad_group_ad_report": {config["customer_id"]: {"segments.date": latest_state}}},
        )
    )

    for record in records:
        if record.type == Type.RECORD:
            assert record.record.data["segments.date"] >= pendulum.parse(latest_state).subtract(days=GAP_DAYS).to_date_string()
        if record.type == Type.STATE:
            assert record.state.data["ad_group_ad_report"][config["customer_id"]]["segments.date"] >= latest_state
def test_configure_catalog():
    stream = AirbyteStream(name="stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={})
    catalog = AirbyteCatalog(streams=[stream])
    catalog_message = AirbyteMessage(type=Type.CATALOG, catalog=catalog)
    sys.stdin = io.StringIO(catalog_message.json())

    expected_configured_catalog = ConfiguredAirbyteCatalog(
        streams=[
            ConfiguredAirbyteStream(
                stream=stream,
                sync_mode=SyncMode.full_refresh,
                destination_sync_mode=DestinationSyncMode.append,
            )
        ]
    )
    expected_configured_catalog_json = json.loads(expected_configured_catalog.json())

    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        configure_catalog()
        assert os.path.exists("integration_tests/configured_catalog.json")

        with open("integration_tests/configured_catalog.json") as f:
            configured_catalog_json = json.loads(f.read())
        assert configured_catalog_json == expected_configured_catalog_json
def test_incremental_sync(config):
    google_ads_client = SourceGoogleAds()
    state = "2021-05-24"
    records = google_ads_client.read(
        AirbyteLogger(), config, ConfiguredAirbyteCatalog.parse_obj(SAMPLE_CATALOG), {"ad_group_ad_report": {"segments.date": state}}
    )
    current_state = pendulum.parse(state).subtract(days=14).to_date_string()
    for record in records:
        if record and record.type == Type.STATE:
            current_state = record.state.data["ad_group_ad_report"]["segments.date"]
        if record and record.type == Type.RECORD:
            assert record.record.data["segments.date"] >= current_state

    # Next sync
    state = "2021-06-04"
    records = google_ads_client.read(
        AirbyteLogger(), config, ConfiguredAirbyteCatalog.parse_obj(SAMPLE_CATALOG), {"ad_group_ad_report": {"segments.date": state}}
    )
    current_state = pendulum.parse(state).subtract(days=14).to_date_string()
    for record in records:
        if record and record.type == Type.STATE:
            current_state = record.state.data["ad_group_ad_report"]["segments.date"]
        if record and record.type == Type.RECORD:
            assert record.record.data["segments.date"] >= current_state

    # Abnormal state: a cursor far in the future should echo the state back and produce no records
    state = "2029-06-04"
    records = google_ads_client.read(
        AirbyteLogger(), config, ConfiguredAirbyteCatalog.parse_obj(SAMPLE_CATALOG), {"ad_group_ad_report": {"segments.date": state}}
    )
    no_records = True
    for record in records:
        if record and record.type == Type.STATE:
            assert record.state.data["ad_group_ad_report"]["segments.date"] == state
        if record and record.type == Type.RECORD:
            no_records = False
    assert no_records
def test_read():
    # Create User
    user = create_user_with_all_permissions()
    # Create Queue
    queue_name = "amazon-sqs-mock-queue"
    queue_region = "eu-west-1"
    client = boto3.client(
        "sqs",
        aws_access_key_id=user["AccessKeyId"],
        aws_secret_access_key=user["SecretAccessKey"],
        region_name=queue_region,
    )
    queue_url = client.create_queue(QueueName=queue_name)["QueueUrl"]
    # Create config
    config = create_config(queue_url, user["AccessKeyId"], user["SecretAccessKey"], queue_region, False)
    # Create ConfiguredAirbyteCatalog
    catalog = ConfiguredAirbyteCatalog(streams=get_catalog()["streams"])
    # Create AirbyteLogger
    logger = AirbyteLogger()
    # Create State (empty; this full refresh read ignores it)
    state = {}
    # Create Source
    source = SourceAmazonSqs()
    # Send test message
    test_message = "UNIT_TEST_MESSAGE"
    client.send_message(QueueUrl=queue_url, MessageBody=test_message)

    # Run read: every record should come from our queue and carry the test body
    for message in source.read(logger, config, catalog, state):
        record = message.record
        stream = record.stream
        assert stream == queue_name
        data = record.data
        data_body = data["body"]
        assert data_body == test_message
def test_read_special_types_no_state(mock_connection, config, stream2, logger):
    source = SourceFirebolt()
    c_stream = ConfiguredAirbyteStream(
        sync_mode=SyncMode.full_refresh,
        destination_sync_mode=DestinationSyncMode.overwrite,
        stream=stream2,
    )
    catalog = ConfiguredAirbyteCatalog(streams=[c_stream])
    mock_connection().__enter__().cursor().__enter__().fetchall().__iter__.return_value = iter(
        [
            [
                [
                    datetime.fromisoformat("2019-01-01 20:12:02"),
                    datetime.fromisoformat("2019-02-01 20:12:02"),
                ],
                Decimal("1231232.123459999990457054844258706536"),
            ],
        ]
    )

    message1 = next(source.read(logger, config, catalog, {}))
    assert message1.record.stream == stream2.name
    assert message1.record.data == {
        "col3": ["2019-01-01T20:12:02", "2019-02-01T20:12:02"],
        "col4": "1231232.123459999990457054844258706536",
    }
def test_sql_write_overwrite(
    mock_connection: MagicMock,
    mock_writer: MagicMock,
    mock_s3_writer: MagicMock,
    config: Dict[str, str],
    configured_stream1: ConfiguredAirbyteStream,
    configured_stream2: ConfiguredAirbyteStream,
    airbyte_message1: AirbyteMessage,
    airbyte_message2: AirbyteMessage,
    airbyte_state_message: AirbyteMessage,
):
    # Overwrite triggers a delete
    configured_stream1.destination_sync_mode = DestinationSyncMode.overwrite
    catalog = ConfiguredAirbyteCatalog(streams=[configured_stream1, configured_stream2])

    destination = DestinationFirebolt()
    result = destination.write(config, catalog, [airbyte_message1, airbyte_state_message, airbyte_message2])

    mock_s3_writer.assert_not_called()
    assert list(result) == [airbyte_state_message]
    mock_writer.return_value.delete_table.assert_called_once_with("table1")
    # create_raw_table should have been invoked once per stream
    assert mock_writer.return_value.create_raw_table.mock_calls == [
        call(mock_connection, "table1"),
        call(mock_connection, "table2"),
    ]
def test_valid_incremental_read_with_checkpoint_interval(mocker, logger):
    """Tests that an incremental read which specifies a checkpoint interval outputs
    a STATE message after reading N records within a stream"""
    stream_output = [{"k1": "v1"}, {"k2": "v2"}]
    s1 = MockStream([({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], name="s1")
    s2 = MockStream([({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], name="s2")
    state = {"cursor": "value"}
    mocker.patch.object(MockStream, "get_updated_state", return_value=state)
    mocker.patch.object(MockStream, "supports_incremental", return_value=True)
    mocker.patch.object(MockStream, "get_json_schema", return_value={})
    # Tell the source to output one state message per record
    mocker.patch.object(MockStream, "state_checkpoint_interval", new_callable=mocker.PropertyMock, return_value=1)

    src = MockSource(streams=[s1, s2])
    catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(s1, SyncMode.incremental), _configured_stream(s2, SyncMode.incremental)])

    expected = [
        _as_record("s1", stream_output[0]),
        _state({"s1": state}),
        _as_record("s1", stream_output[1]),
        _state({"s1": state}),
        _state({"s1": state}),
        _as_record("s2", stream_output[0]),
        _state({"s1": state, "s2": state}),
        _as_record("s2", stream_output[1]),
        _state({"s1": state, "s2": state}),
        _state({"s1": state, "s2": state}),
    ]
    messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict))))

    assert expected == messages
def test_with_purchases():
    source = SourceFaker()
    logger = None
    config = {"count": 1000, "records_per_sync": 1000}
    catalog = ConfiguredAirbyteCatalog(
        streams=[
            {"stream": {"name": "Users", "json_schema": {}}, "sync_mode": "full_refresh", "destination_sync_mode": "overwrite"},
            {"stream": {"name": "Products", "json_schema": {}}, "sync_mode": "full_refresh", "destination_sync_mode": "overwrite"},
            {"stream": {"name": "Purchases", "json_schema": {}}, "sync_mode": "full_refresh", "destination_sync_mode": "overwrite"},
        ]
    )
    state = {}
    iterator = source.read(logger, config, catalog, state)

    record_rows_count = 0
    state_rows_count = 0
    latest_state = {}
    for row in iterator:
        if row.type is Type.RECORD:
            record_rows_count = record_rows_count + 1
        if row.type is Type.STATE:
            state_rows_count = state_rows_count + 1
            latest_state = row

    assert record_rows_count > 1000 + 100  # should be greater than 1000 users plus 100 products
    assert state_rows_count > 10 + 1 + 1  # should be greater than 1000/100 checkpoints, plus one final state, plus one state for the products
    assert latest_state.state.data["Users"] == {"cursor": 1000, "seed": None}
    assert latest_state.state.data["Products"] == {"product_count": 100}
    assert latest_state.state.data["Purchases"]["purchases_count"] > 0
def test_config_skip_test():
    docker_runner_mock = MagicMock()
    docker_runner_mock.call_read.return_value = []
    t = _TestIncremental()
    with patch.object(pytest, "skip", return_value=None):
        t.test_read_sequential_slices(
            inputs=IncrementalConfig(skip_comprehensive_incremental_tests=True),
            connector_config=MagicMock(),
            configured_catalog_for_incremental=ConfiguredAirbyteCatalog(
                streams=[
                    ConfiguredAirbyteStream(
                        stream=AirbyteStream(
                            name="test_stream",
                            json_schema={"type": "object", "properties": {"date": {"type": "date"}}},
                            supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
                        ),
                        sync_mode=SyncMode.incremental,
                        destination_sync_mode=DestinationSyncMode.overwrite,
                        cursor_field=["date"],
                    )
                ]
            ),
            cursor_paths={},
            docker_runner=docker_runner_mock,
        )

    # If the skip did not take effect and the test actually executed, call_read
    # would have been invoked and this assertion would fail
    docker_runner_mock.call_read.assert_not_called()
def test_valid_full_refresh_read_no_slices(mocker):
    """Tests that running a full refresh sync on streams which don't specify slices produces the expected AirbyteMessages"""
    stream_output = [{"k1": "v1"}, {"k2": "v2"}]
    s1 = MockStream([({"sync_mode": SyncMode.full_refresh}, stream_output)], name="s1")
    s2 = MockStream([({"sync_mode": SyncMode.full_refresh}, stream_output)], name="s2")

    mocker.patch.object(MockStream, "get_json_schema", return_value={})

    src = MockSource(streams=[s1, s2])
    catalog = ConfiguredAirbyteCatalog(
        streams=[_configured_stream(s1, SyncMode.full_refresh), _configured_stream(s2, SyncMode.full_refresh)]
    )

    expected = _as_records("s1", stream_output) + _as_records("s2", stream_output)
    messages = _fix_emitted_at(list(src.read(logger, {}, catalog)))

    assert expected == messages
def test_airbyte_trace_message_on_failure(self, connector_config, inputs: BasicReadTestConfig, docker_runner: ConnectorRunner):
    if not inputs.expect_trace_message_on_failure:
        pytest.skip("Skipping `test_airbyte_trace_message_on_failure` because `inputs.expect_trace_message_on_failure=False`")
        return

    invalid_configured_catalog = ConfiguredAirbyteCatalog(
        streams=[
            # create ConfiguredAirbyteStream without validation
            ConfiguredAirbyteStream.construct(
                stream=AirbyteStream(
                    name="__AIRBYTE__stream_that_does_not_exist",
                    json_schema={"type": "object", "properties": {"f1": {"type": "string"}}},
                    supported_sync_modes=[SyncMode.full_refresh],
                ),
                sync_mode="INVALID",
                destination_sync_mode="INVALID",
            )
        ]
    )

    output = docker_runner.call_read(connector_config, invalid_configured_catalog, raise_container_error=False)
    trace_messages = filter_output(output, Type.TRACE)
    error_trace_messages = list(filter(lambda m: m.trace.type == TraceType.ERROR, trace_messages))

    assert len(error_trace_messages) >= 1, "Connector should emit at least one error trace message"
def test_valid_full_refresh_read_with_slices(mocker):
    """Tests that running a full refresh sync on streams which use slices produces the expected AirbyteMessages"""
    slices = [{"1": "1"}, {"2": "2"}]
    # When attempting to sync a slice, just output that slice as a record
    s1 = MockStream([({"sync_mode": SyncMode.full_refresh, "stream_slice": s}, [s]) for s in slices], name="s1")
    s2 = MockStream([({"sync_mode": SyncMode.full_refresh, "stream_slice": s}, [s]) for s in slices], name="s2")

    mocker.patch.object(MockStream, "get_json_schema", return_value={})
    mocker.patch.object(MockStream, "stream_slices", return_value=slices)

    src = MockSource(streams=[s1, s2])
    catalog = ConfiguredAirbyteCatalog(
        streams=[_configured_stream(s1, SyncMode.full_refresh), _configured_stream(s2, SyncMode.full_refresh)]
    )

    expected = [*_as_records("s1", slices), *_as_records("s2", slices)]
    messages = _fix_emitted_at(list(src.read(logger, {}, catalog)))

    assert expected == messages
def test_sql_write_append(
    mock_connection: MagicMock,
    mock_writer: MagicMock,
    config: Dict[str, str],
    configured_stream1: ConfiguredAirbyteStream,
    configured_stream2: ConfiguredAirbyteStream,
    airbyte_message1: AirbyteMessage,
    airbyte_message2: AirbyteMessage,
    airbyte_state_message: AirbyteMessage,
) -> None:
    catalog = ConfiguredAirbyteCatalog(streams=[configured_stream1, configured_stream2])

    destination = DestinationFirebolt()
    result = destination.write(config, catalog, [airbyte_message1, airbyte_state_message, airbyte_message2])

    assert list(result) == [airbyte_state_message]
    mock_writer.return_value.delete_table.assert_not_called()
    # create_raw_table should have been invoked once per stream
    assert mock_writer.return_value.create_raw_table.mock_calls == [
        call(mock_connection, "table1"),
        call(mock_connection, "table2"),
    ]
    assert len(mock_writer.return_value.queue_write_data.mock_calls) == 2
    mock_writer.return_value.flush.assert_called_once()
def slice_catalog(catalog: ConfiguredAirbyteCatalog, streams: Set[str]) -> ConfiguredAirbyteCatalog:
    sliced_catalog = ConfiguredAirbyteCatalog(streams=[])
    for stream in catalog.streams:
        if stream.stream.name in streams:
            sliced_catalog.streams.append(stream)
    return sliced_catalog
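# A minimal usage sketch for slice_catalog above (not part of the original suite).
# It assumes the airbyte_cdk.models imports already used in this file; the stream
# names "users" and "orders" are hypothetical.
def test_slice_catalog_keeps_only_named_streams():
    def _stream(name: str) -> ConfiguredAirbyteStream:
        return ConfiguredAirbyteStream(
            stream=AirbyteStream(name=name, json_schema={}, supported_sync_modes=[SyncMode.full_refresh]),
            sync_mode=SyncMode.full_refresh,
            destination_sync_mode=DestinationSyncMode.overwrite,
        )

    catalog = ConfiguredAirbyteCatalog(streams=[_stream("users"), _stream("orders")])
    sliced = slice_catalog(catalog, {"users"})
    # Only the requested stream survives; the input catalog itself is not mutated
    assert [s.stream.name for s in sliced.streams] == ["users"]
    assert len(catalog.streams) == 2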
def configured_catalog_fixture() -> ConfiguredAirbyteCatalog:
    stream_schema = {
        "type": "object",
        "properties": {
            "string_col": {"type": "string"},
            "int_col": {"type": "integer"},
        },
    }
    append_stream = ConfiguredAirbyteStream(
        stream=AirbyteStream(name="append_stream", json_schema=stream_schema),
        sync_mode=SyncMode.incremental,
        destination_sync_mode=DestinationSyncMode.append,
    )
    overwrite_stream = ConfiguredAirbyteStream(
        stream=AirbyteStream(name="overwrite_stream", json_schema=stream_schema),
        sync_mode=SyncMode.incremental,
        destination_sync_mode=DestinationSyncMode.overwrite,
    )
    return ConfiguredAirbyteCatalog(streams=[append_stream, overwrite_stream])
def test_streams_outputs_records(self, catalog_path, config):
    configured_catalog = ConfiguredAirbyteCatalog.parse_file(catalog_path)
    records, states = self._read_records(config, configured_catalog)

    assert records, "should have some records returned"
    if configured_catalog.streams[0].sync_mode == SyncMode.incremental:
        assert states, "should have some states returned"
def test_transform_for_tickets_stream(config, input_data, expected_data):
    """Checks the transformer when records arrive with invalid field data types"""
    test_catalog = ConfiguredAirbyteCatalog(
        streams=[
            ConfiguredAirbyteStream(
                stream=AirbyteStream(name="tickets", json_schema={}),
                sync_mode=SyncMode.full_refresh,
                destination_sync_mode=DestinationSyncMode.overwrite,
            )
        ]
    )

    with requests_mock.Mocker() as ticket_mock:
        ticket_mock.get(
            f"https://{config['subdomain']}.zendesk.com/api/v2/incremental/tickets.json",
            status_code=200,
            json={"tickets": [input_data], "end_time": "2021-07-22T06:55:55Z", "end_of_stream": True},
        )

        source = SourceZendeskSupport()
        records = source.read(MagicMock(), config, test_catalog, None)
        for record in records:
            assert record.record.data == expected_data
def test_read(schema, record, should_fail):
    catalog = ConfiguredAirbyteCatalog(
        streams=[
            ConfiguredAirbyteStream(
                stream=AirbyteStream.parse_obj({"name": "test_stream", "json_schema": schema}),
                sync_mode="full_refresh",
                destination_sync_mode="overwrite",
            )
        ]
    )
    input_config = BasicReadTestConfig()
    docker_runner_mock = MagicMock()
    docker_runner_mock.call_read.return_value = [
        AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream="test_stream", data=record, emitted_at=111))
    ]

    t = _TestBasicRead()
    if should_fail:
        with pytest.raises(AssertionError, match="stream should have some fields mentioned by json schema"):
            t.test_read(None, catalog, input_config, [], docker_runner_mock, MagicMock())
    else:
        t.test_read(None, catalog, input_config, [], docker_runner_mock, MagicMock())
def catalog_fixture(record_schema) -> ConfiguredAirbyteCatalog:
    stream = ConfiguredAirbyteStream(
        stream=AirbyteStream(name="my_stream", json_schema=record_schema),
        sync_mode=SyncMode.full_refresh,
        destination_sync_mode=DestinationSyncMode.append,
    )
    return ConfiguredAirbyteCatalog(streams=[stream])
def _run_write(
    self, config: Mapping[str, Any], configured_catalog_path: str, input_stream: io.TextIOWrapper
) -> Iterable[AirbyteMessage]:
    catalog = ConfiguredAirbyteCatalog.parse_file(configured_catalog_path)
    input_messages = self._parse_input_stream(input_stream)
    self.logger.info("Begin writing to the destination...")
    yield from self.write(config=config, configured_catalog=catalog, input_messages=input_messages)
    self.logger.info("Writing complete.")
def slice_catalog(catalog: ConfiguredAirbyteCatalog, predicate: Callable[[str], bool]) -> ConfiguredAirbyteCatalog:
    sliced_catalog = ConfiguredAirbyteCatalog(streams=[])
    for stream in catalog.streams:
        if predicate(stream.stream.name):
            sliced_catalog.streams.append(stream)
    return sliced_catalog
def catalog(request):
    return ConfiguredAirbyteCatalog(
        streams=[
            ConfiguredAirbyteStream(
                stream=AirbyteStream(name=request.param, json_schema={}),
                sync_mode="full_refresh",
                destination_sync_mode="append",
            )
        ]
    )
def configured_catalog_from_client(client: BaseClient) -> ConfiguredAirbyteCatalog:
    """Helper to generate configured catalog for testing"""
    catalog = ConfiguredAirbyteCatalog(streams=[ConfiguredAirbyteStream(stream=stream) for stream in client.streams])
    return catalog
def configured_catalog_fixture(configured_catalog_path, discovered_catalog) -> Optional[ConfiguredAirbyteCatalog]:
    if configured_catalog_path:
        catalog = ConfiguredAirbyteCatalog.parse_file(configured_catalog_path)
        for configured_stream in catalog.streams:
            configured_stream.stream = discovered_catalog.get(configured_stream.stream.name, configured_stream.stream)
        return catalog
    return None
def configured_catalog_fixture(configured_catalog_path, catalog_schemas) -> Optional[ConfiguredAirbyteCatalog]:
    if configured_catalog_path:
        catalog = ConfiguredAirbyteCatalog.parse_file(configured_catalog_path)
        for configured_stream in catalog.streams:
            configured_stream.stream.json_schema = catalog_schemas.get(configured_stream.stream.name, {})
        return catalog
    return None
def full_refresh_only_catalog(configured_catalog: ConfiguredAirbyteCatalog) -> ConfiguredAirbyteCatalog:
    """Transform provided catalog to catalog with all streams configured to use Full Refresh sync (when possible)"""
    streams = []
    for stream in configured_catalog.streams:
        if SyncMode.full_refresh in stream.stream.supported_sync_modes:
            stream.sync_mode = SyncMode.full_refresh
            streams.append(stream)

    configured_catalog.streams = streams
    return configured_catalog
def test_read(config):
    source = SourceInstagram()
    catalog = ConfiguredAirbyteCatalog(
        streams=[
            ConfiguredAirbyteStream(
                stream=AirbyteStream(name="users", json_schema={}),
                sync_mode=SyncMode.full_refresh,
                destination_sync_mode=DestinationSyncMode.overwrite,
            )
        ]
    )
    assert source.read(logger, config, catalog)
def test_read(config_token):
    source = SourcePipedrive()
    catalog = ConfiguredAirbyteCatalog(
        streams=[
            ConfiguredAirbyteStream(
                stream=AirbyteStream(name="deals", json_schema={}),
                sync_mode=SyncMode.full_refresh,
                destination_sync_mode=DestinationSyncMode.overwrite,
            )
        ]
    )
    assert source.read(logger, config_token, catalog)
def configured_catalog_fixture(configured_catalog_path, discovered_catalog) -> ConfiguredAirbyteCatalog:
    """Take ConfiguredAirbyteCatalog from discover command by default"""
    if configured_catalog_path:
        catalog = ConfiguredAirbyteCatalog.parse_file(configured_catalog_path)
        for configured_stream in catalog.streams:
            configured_stream.stream = discovered_catalog.get(configured_stream.stream.name, configured_stream.stream)
        return catalog

    streams = [
        ConfiguredAirbyteStream(
            stream=stream,
            sync_mode=stream.supported_sync_modes[0],
            destination_sync_mode=DestinationSyncMode.append,
            cursor_field=stream.default_cursor_field,
            primary_key=stream.source_defined_primary_key,
        )
        for _, stream in discovered_catalog.items()
    ]
    return ConfiguredAirbyteCatalog(streams=streams)
def incremental_only_catalog(configured_catalog: ConfiguredAirbyteCatalog) -> ConfiguredAirbyteCatalog:
    """Transform provided catalog to catalog with all streams configured to use Incremental sync (when possible)"""
    streams = []
    for stream in configured_catalog.streams:
        if SyncMode.incremental in stream.stream.supported_sync_modes:
            stream.sync_mode = SyncMode.incremental
            streams.append(stream)

    configured_catalog.streams = streams
    return configured_catalog
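# A hedged usage sketch (not part of the original suite) for incremental_only_catalog
# and its full_refresh_only_catalog counterpart above: streams that do not support the
# requested sync mode are dropped, and the remaining ones are switched over in place.
# The stream names below are hypothetical.
def test_incremental_only_catalog_drops_unsupported_streams():
    both = ConfiguredAirbyteStream(
        stream=AirbyteStream(
            name="supports_both", json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental]
        ),
        sync_mode=SyncMode.full_refresh,
        destination_sync_mode=DestinationSyncMode.append,
    )
    fr_only = ConfiguredAirbyteStream(
        stream=AirbyteStream(name="full_refresh_only", json_schema={}, supported_sync_modes=[SyncMode.full_refresh]),
        sync_mode=SyncMode.full_refresh,
        destination_sync_mode=DestinationSyncMode.append,
    )

    catalog = incremental_only_catalog(ConfiguredAirbyteCatalog(streams=[both, fr_only]))
    # Only the stream that supports incremental survives, and its sync_mode is flipped
    assert [s.stream.name for s in catalog.streams] == ["supports_both"]
    assert catalog.streams[0].sync_mode == SyncMode.incremental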