def test_master_schema(
    self, capsys, user_schema, min_datetime, ordered_file_infos, file_schemas, expected_schema, log_expected, error_expected
):
    """Master schema inference: merge per-file schemas, honour a user-provided schema,
    log on datatype mismatches, and raise on unrecoverable conflicts."""
    file_format_parser_mock = MagicMock(return_value=MagicMock(get_inferred_schema=MagicMock(side_effect=file_schemas)))
    with patch.object(IncrementalFileStreamS3, "fileformatparser_class", file_format_parser_mock):
        with patch.object(IncrementalFileStreamS3, "get_time_ordered_file_infos", MagicMock(return_value=ordered_file_infos)):
            stream_instance = IncrementalFileStreamS3(
                dataset="dummy", provider={}, format={"filetype": "csv"}, schema=user_schema, path_pattern="**/prefix*.csv"
            )
            if error_expected:
                with pytest.raises(RuntimeError):
                    stream_instance._get_master_schema(min_datetime=min_datetime)
            else:
                assert stream_instance._get_master_schema(min_datetime=min_datetime) == expected_schema
                if log_expected:
                    captured = capsys.readouterr()
                    assert "Detected mismatched datatype" in captured.out
def test_fileformatparser_map(self):
    stream_instance = IncrementalFileStreamS3(
        dataset="dummy", provider={"bucket": "test-test"}, format={}, path_pattern="**/prefix*.csv"
    )
    assert stream_instance.fileformatparser_map
def test_get_json_schema(self):
    # The user-provided schema is a JSON string; it must declare the columns
    # asserted below, otherwise they could not appear in the output schema.
    stream_instance = IncrementalFileStreamS3(
        dataset="dummy",
        provider={},
        format={"filetype": "csv"},
        schema='{"column_A": "string", "column_B": "integer", "column_C": "boolean"}',
        path_pattern="**/prefix*.csv",
    )
    assert stream_instance.get_json_schema() == {
        "properties": {
            "_ab_additional_properties": {"type": "object"},
            "_ab_source_file_last_modified": {"format": "date-time", "type": "string"},
            "_ab_source_file_url": {"type": "string"},
            "column_A": {"type": ["null", "string"]},
            "column_B": {"type": ["null", "integer"]},
            "column_C": {"type": ["null", "boolean"]},
        },
        "type": "object",
    }
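# The @pytest.mark.parametrize table for test_fileformatparser_class is not
# shown in this section; the cases below are an illustrative reconstruction
# (assumed values). "csv" is a filetype exercised throughout these tests and
# should resolve to a parser class, while an unrecognised filetype should raise.
@pytest.mark.parametrize(
    "file_type, error_expected",
    [
        ("csv", False),  # assumed: a supported filetype resolves to a parser class
        ("unsupported_filetype", True),  # assumed: an unknown filetype raises RuntimeError
    ],
)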
def test_fileformatparser_class(self, file_type, error_expected):
    """A known filetype resolves to a parser class; an unknown one raises RuntimeError."""
    stream_instance = IncrementalFileStreamS3(
        dataset="dummy", provider={}, format={"filetype": file_type}, schema={}, path_pattern="**/prefix*.csv"
    )
    if error_expected:
        with pytest.raises(RuntimeError):
            _ = stream_instance.fileformatparser_class
    else:
        assert stream_instance.fileformatparser_class
def test_get_schema_map(self):
    # Same JSON-string schema as in test_get_json_schema; the map flattens it
    # together with the internal _ab_* bookkeeping fields.
    stream_instance = IncrementalFileStreamS3(
        dataset="dummy",
        provider={},
        format={"filetype": "csv"},
        schema='{"column_A": "string", "column_B": "integer", "column_C": "boolean"}',
        path_pattern="**/prefix*.csv",
    )
    assert stream_instance._get_schema_map() == {
        "_ab_additional_properties": "object",
        "_ab_source_file_last_modified": "string",
        "_ab_source_file_url": "string",
        "column_A": "string",
        "column_B": "integer",
        "column_C": "boolean",
    }
def test_incremental_read(self):
    """An incremental read over an empty (mocked) bucket should yield no records."""
    stream_instance = IncrementalFileStreamS3(
        dataset="dummy", provider={"bucket": "test-test"}, format={}, path_pattern="**/prefix*.csv"
    )
    stream_instance._list_bucket = MagicMock()
    records = []
    slices = stream_instance.stream_slices(sync_mode=SyncMode.incremental)
    for stream_slice in slices:
        records.extend(list(stream_instance.read_records(stream_slice=stream_slice, sync_mode=SyncMode.incremental)))
    assert not records
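# The @pytest.mark.parametrize table for test_filepath_iterator is not shown
# in this section; the single case below is an illustrative reconstruction
# (assumed values). The dict mimics the shape of a boto3 list_objects_v2
# response; one empty page should make the iterator yield nothing.
@pytest.mark.parametrize(
    "bucket, path_prefix, list_v2_objects, expected_file_info",
    [
        ("test-test", "", [{"Contents": [], "KeyCount": 0, "IsTruncated": False}], []),
    ],
)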
def test_filepath_iterator(self, bucket, path_prefix, list_v2_objects, expected_file_info):
    provider = {"aws_access_key_id": "key_id", "aws_secret_access_key": "access_key"}
    s3_client_mock = MagicMock(return_value=MagicMock(list_objects_v2=MagicMock(side_effect=list_v2_objects)))
    with patch("source_s3.stream.make_s3_client", s3_client_mock):
        stream_instance = IncrementalFileStreamS3(
            dataset="dummy",
            provider={"bucket": bucket, "path_prefix": path_prefix, **provider},
            format={},
            path_pattern="**/prefix*.png",
        )
        expected_info = iter(expected_file_info)
        for file_info in stream_instance.filepath_iterator():
            assert file_info == next(expected_info)
def test_read(self):
    """A full-refresh read over an empty (mocked) bucket should yield no records,
    even when a stream_state is supplied."""
    stream_instance = IncrementalFileStreamS3(
        dataset="dummy", provider={"bucket": "test-test"}, format={}, path_pattern="**/prefix*.csv"
    )
    stream_instance._list_bucket = MagicMock()
    records = []
    slices = stream_instance.stream_slices(sync_mode=SyncMode.full_refresh)
    for stream_slice in slices:
        records.extend(
            list(
                stream_instance.read_records(
                    stream_slice=stream_slice,
                    sync_mode=SyncMode.full_refresh,
                    stream_state={"_ab_source_file_last_modified": "1999-01-01T00:00:00+0000"},
                )
            )
        )
    assert not records