def test_json_load_repeated_schema():
    """A JSON datum with REPEATED fields renders arrays (with typed nulls) in the literal."""
    xformer = JsonDataLiteralTransformer().use_datetime_like_cast()
    datum = PackageFileLoader("tests/ut/bq_test_kit/data_literal_transformers/resources/"
                              "repeated_schema_datum.json")
    schema = PackageFileLoader(
        "tests/ut/bq_test_kit/data_literal_transformers/resources/repeated_schema.json")
    query = xformer.load(datum, schema)
    expected = (
        "(select ['str with quote \\'', cast(null as STRING), "
        "'and backslash \\\\'] as f_string, [from_base64('YW55'), "
        "from_base64('c3RyaW5n'), cast(null as BYTES)] as f_bytes, [cast(null as INT64), "
        "cast(1 as INT64), cast(2 as INT64)] as f_int, [cast(1.5 as FLOAT64), cast(null as FLOAT64), "
        "cast(2.6 as FLOAT64)] as f_float, "
        "[true, false, cast(null as BOOLEAN)] as f_bool, "
        "[cast(null as TIMESTAMP), cast('2020-11-26 17:09:03.967259 UTC' as TIMESTAMP), "
        "cast('2019-11-26 17:09:03.967259 UTC' as TIMESTAMP)] as f_timestamp, "
        "[cast('2020-11-26' as DATE), cast(null as DATE), cast('2019-11-26' as DATE)] as f_date, "
        "[cast('11:09:03' as TIME), cast('10:09:03' as TIME), cast(null as TIME)] as f_time, "
        "[cast(null as DATETIME), cast('2020-11-26 17:09:03' as DATETIME), "
        "cast('2019-11-26 17:09:03' as DATETIME)] as f_datetime, [cast(1.6 as NUMERIC), "
        "cast(null as NUMERIC), cast(1.7 as NUMERIC)] as f_numeric, "
        "[struct(cast(null as ARRAY<STRING>) as f_string, [from_base64('dG9fYnl0ZXM=')] as f_bytes), "
        "struct(['a_string'] as f_string, cast(null as ARRAY<BYTES>) as f_bytes), "
        "cast(null as STRUCT<f_string ARRAY<STRING>, f_bytes ARRAY<BYTES>>)] as f_nested)"
    )
    assert query == expected
def test_technical_column_with_datum_literal(bqtk: BQTestKit):
    """With data literals, the technical-column table name must be interpolated:
    a bare reference fails with BadRequest, the jinja-templated one succeeds."""
    complex_schema = PackageFileLoader("tests/it/bq_test_kit/bq_dsl/resources/"
                                       "technical_column_schema.json")
    complex_datum = PackageFileLoader(
        "tests/it/bq_test_kit/bq_dsl/resources/technical_column_schema_datum.json")
    transformer = JsonDataLiteralTransformer(json_format=JsonFormat.JSON_ARRAY)
    datum_mapping = {"technical_column_table": (complex_datum, complex_schema)}
    # Referencing the table without interpolation is rejected by BigQuery.
    with pytest.raises(BadRequest):
        bqtk.query_template(from_="select * \nfrom technical_column_table;\n") \
            .with_datum(datum_mapping) \
            .as_data_literals() \
            .loaded_with(transformer) \
            .add_interpolator(JinjaInterpolator()) \
            .run()
    result = bqtk.query_template(from_="select count(*) \nfrom {{technical_column_table}};\n") \
        .with_datum(datum_mapping) \
        .as_data_literals() \
        .loaded_with(transformer) \
        .add_interpolator(JinjaInterpolator()) \
        .run()
    assert len(result.rows) == 1
def test_json_empty_array_schema():
    """Empty JSON arrays are rendered as cast([] as ARRAY<...>) for every repeated field."""
    expected = (
        "(select struct(cast(null as STRING) as f_string, "
        "cast([] as ARRAY<STRUCT<f_string STRING, f_bytes BYTES, f_int INT64, f_float FLOAT64, "
        "f_bool BOOLEAN, f_timestamp TIMESTAMP, f_date DATE, f_time TIME, f_datetime DATETIME, "
        "f_numeric NUMERIC, f_geography GEOGRAPHY>>) as f_repeated_struct, "
        "cast([] as ARRAY<STRING>) as f_string_repeated, cast([] as ARRAY<BYTES>) as f_bytes_repeated, "
        "cast([] as ARRAY<INT64>) as f_int_repeated, "
        "cast([] as ARRAY<FLOAT64>) as f_float_repeated, cast([] as ARRAY<BOOLEAN>) as f_bool_repeated, "
        "cast([] as ARRAY<TIMESTAMP>) as f_timestamp_repeated, cast([] as ARRAY<DATE>) as f_date_repeated, "
        "cast([] as ARRAY<TIME>) as f_time_repeated, cast([] as ARRAY<DATETIME>) as f_datetime_repeated, "
        "cast([] as ARRAY<NUMERIC>) as f_numeric_repeated, "
        "cast([] as ARRAY<GEOGRAPHY>) as f_geography_repeated) as f_struct, "
        "cast([] as ARRAY<STRUCT<f_string STRING, f_bytes BYTES, f_int INT64, f_float FLOAT64, "
        "f_bool BOOLEAN, f_timestamp TIMESTAMP, f_date DATE, f_time TIME, f_datetime DATETIME, "
        "f_numeric NUMERIC, f_geography GEOGRAPHY>>) as f_repeated_struct, "
        "cast([] as ARRAY<STRING>) as f_string_repeated, "
        "cast([] as ARRAY<BYTES>) as f_bytes_repeated, cast([] as ARRAY<INT64>) as f_int_repeated, "
        "cast([] as ARRAY<FLOAT64>) as f_float_repeated, "
        "cast([] as ARRAY<BOOLEAN>) as f_bool_repeated, "
        "cast([] as ARRAY<TIMESTAMP>) as f_timestamp_repeated, "
        "cast([] as ARRAY<DATE>) as f_date_repeated, "
        "cast([] as ARRAY<TIME>) as f_time_repeated, "
        "cast([] as ARRAY<DATETIME>) as f_datetime_repeated, "
        "cast([] as ARRAY<NUMERIC>) as f_numeric_repeated, cast([] as ARRAY<GEOGRAPHY>) "
        "as f_geography_repeated)")
    query = JsonDataLiteralTransformer(json_format=JsonFormat.JSON_ARRAY).load(
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/empty_array_schema_datum.json"),
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/empty_array_schema.json"))
    assert query == expected
def test_json_empty_array_schema(bqtk: BQTestKit):
    """Running the empty-array literal against BigQuery yields the schema it was built from."""
    schema = PackageFileLoader(
        "tests/it/bq_test_kit/data_literal_transformers/resources/empty_array_schema.json")
    datum = PackageFileLoader(
        "tests/it/bq_test_kit/data_literal_transformers/resources/empty_array_schema_datum.json")
    query = JsonDataLiteralTransformer(json_format=JsonFormat.JSON_ARRAY).load(datum, schema)
    result = bqtk.query_template(from_=query).run()
    assert result.schema == SchemaMixin().to_schema_field_list(schema)
def test_invalid_json_load():
    """A file that is not valid JSON raises RowParsingException on load."""
    transformer = JsonDataLiteralTransformer()
    bad_datum = PackageFileLoader(
        "tests/ut/bq_test_kit/data_literal_transformers/resources/invalid_json_datum.txt")
    schema = PackageFileLoader(
        "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema.json")
    with pytest.raises(RowParsingException):
        transformer.load(bad_datum, schema)
def test_json_complex_schema(bqtk: BQTestKit):
    """Executing the complex-schema literal in BigQuery reproduces the source schema."""
    complex_schema_pfl = PackageFileLoader(
        "tests/it/bq_test_kit/data_literal_transformers/resources/"
        "complex_schema.json")
    complex_schema = SchemaMixin().to_schema_field_list(complex_schema_pfl)
    query = JsonDataLiteralTransformer().load(
        PackageFileLoader(
            "tests/it/bq_test_kit/data_literal_transformers/resources/complex_schema_datum.json"),
        complex_schema_pfl)
    result = bqtk.query_template(from_=query).run()
    assert result.schema == complex_schema
def test_invalid_instance():
    """A non-string, non-loader datum (here an int) raises InvalidInstanceException
    for both JSON_LINES (default) and JSON_ARRAY formats."""
    transformer = JsonDataLiteralTransformer()
    schema = PackageFileLoader(
        "tests/ut/bq_test_kit/data_literal_transformers/resources/geography_schema.json")
    with pytest.raises(InvalidInstanceException):
        transformer.load(123, schema)
    with pytest.raises(InvalidInstanceException):
        transformer.with_json_format(JsonFormat.JSON_ARRAY).load(123, schema)
def test_complex_temp_tables(bqtk: BQTestKit):
    """Temp tables built from a complex JSON datum expose the datum's row and schema."""
    complex_schema = PackageFileLoader("tests/it/bq_test_kit/bq_dsl/resources/"
                                       "complex_schema.json")
    complex_datum = PackageFileLoader(
        "tests/it/bq_test_kit/bq_dsl/resources/complex_schema_datum.json")
    result = bqtk.query_template(from_="select * from complex_table") \
        .with_temp_tables((JsonDataLiteralTransformer(), {
            "complex_table": (complex_datum, complex_schema)
        })) \
        .run()
    assert len(result.rows) == 1
    assert result.schema == SchemaMixin().to_schema_field_list(complex_schema)
def test_json_empty_complex_schema():
    """None, "[]" (JSON_ARRAY) and [] all produce the same typed-null `limit 0` literal."""
    expected = (
        "(select * from (select cast(null as STRING) as f_string, cast(null as BYTES) as f_bytes, "
        "cast(null as INT64) as f_int, cast(null as FLOAT64) as f_float, cast(null as BOOLEAN) as f_bool,"
        " cast(null as TIMESTAMP) as f_timestamp, cast(null as DATE) as f_date, "
        "cast(null as TIME) as f_time, cast(null as DATETIME) as f_datetime, "
        "cast(null as NUMERIC) as f_numeric, cast(null as GEOGRAPHY) as f_geography, "
        "cast(null as STRUCT<f_string STRING, f_bytes BYTES, f_int INT64, f_float FLOAT64, "
        "f_bool BOOLEAN, f_timestamp TIMESTAMP, f_date DATE, f_time TIME, f_datetime DATETIME, "
        "f_numeric NUMERIC, f_geography GEOGRAPHY, f_repeated_struct ARRAY<STRUCT<f_string STRING, "
        "f_bytes BYTES, f_int INT64, f_float FLOAT64, f_bool BOOLEAN, f_timestamp TIMESTAMP, "
        "f_date DATE, f_time TIME, f_datetime DATETIME, f_numeric NUMERIC, f_geography GEOGRAPHY>>, "
        "f_string_repeated ARRAY<STRING>, f_bytes_repeated ARRAY<BYTES>, f_int_repeated ARRAY<INT64>, "
        "f_float_repeated ARRAY<FLOAT64>, f_bool_repeated ARRAY<BOOLEAN>, "
        "f_timestamp_repeated ARRAY<TIMESTAMP>, f_date_repeated ARRAY<DATE>, "
        "f_time_repeated ARRAY<TIME>, f_datetime_repeated ARRAY<DATETIME>, "
        "f_numeric_repeated ARRAY<NUMERIC>, f_geography_repeated ARRAY<GEOGRAPHY>>) as f_struct, "
        "cast(null as ARRAY<STRUCT<f_string STRING, f_bytes BYTES, f_int INT64, f_float FLOAT64, "
        "f_bool BOOLEAN, f_timestamp TIMESTAMP, f_date DATE, f_time TIME, f_datetime DATETIME, "
        "f_numeric NUMERIC, f_geography GEOGRAPHY>>) as f_repeated_struct, "
        "cast(null as ARRAY<STRING>) as f_string_repeated, "
        "cast(null as ARRAY<BYTES>) as f_bytes_repeated, cast(null as ARRAY<INT64>) as f_int_repeated, "
        "cast(null as ARRAY<FLOAT64>) as f_float_repeated, "
        "cast(null as ARRAY<BOOLEAN>) as f_bool_repeated, "
        "cast(null as ARRAY<TIMESTAMP>) as f_timestamp_repeated, "
        "cast(null as ARRAY<DATE>) as f_date_repeated, "
        "cast(null as ARRAY<TIME>) as f_time_repeated, "
        "cast(null as ARRAY<DATETIME>) as f_datetime_repeated, "
        "cast(null as ARRAY<NUMERIC>) as f_numeric_repeated, cast(null as ARRAY<GEOGRAPHY>) "
        "as f_geography_repeated) limit 0)")
    schema = PackageFileLoader(
        "tests/ut/bq_test_kit/data_literal_transformers/resources/complex_schema.json")
    # All three "empty" datum spellings must collapse to the same literal.
    assert JsonDataLiteralTransformer().load(None, schema) == expected
    assert JsonDataLiteralTransformer(json_format=JsonFormat.JSON_ARRAY).load("[]", schema) == expected
    assert JsonDataLiteralTransformer().load([], schema) == expected
def test_json_load_nested_schema():
    """Nested RECORD fields are rendered as struct(...) expressions in the literal."""
    xformer = JsonDataLiteralTransformer().use_datetime_like_cast()
    query = xformer.load(
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/nested_schema_datum.json"),
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/nested_schema.json"))
    expected = (
        "(select struct('str with quote \\' and backslash \\\\' as f_string, "
        "from_base64('YW55') as f_bytes, cast(1 as INT64) as f_int, cast(1.5 as FLOAT64) as f_float, "
        "true as f_bool, "
        "cast('2020-11-26 17:09:03.967259 UTC' as TIMESTAMP) as f_timestamp, "
        "cast('2020-11-26' as DATE) as f_date, cast('11:09:03' as TIME) as f_time, "
        "cast('2020-11-26 17:09:03' as DATETIME) as f_datetime, cast(1.6 as NUMERIC) as f_numeric, "
        "cast(null as STRUCT<f_string STRING>) as f_nested2) as f_nested)")
    assert query == expected
def test_csv_skip_line_load_simple_schema():
    """Skipping the only data row of a one-row CSV yields the empty `limit 0` literal."""
    xformer = DsvDataLiteralTransformer().use_datetime_like_cast().skip_leading_rows(1)
    query = xformer.load(
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema_datum.csv"),
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema.json"))
    expected = (
        "(select * from (select cast(null as STRING) as f_string, cast(null as BYTES) as f_bytes, "
        "cast(null as INT64) as f_int, cast(null as FLOAT64) as f_float, cast(null as BOOLEAN) as f_bool,"
        " cast(null as TIMESTAMP) as f_timestamp, cast(null as DATE) as f_date, "
        "cast(null as TIME) as f_time, cast(null as DATETIME) as f_datetime, "
        "cast(null as NUMERIC) as f_numeric, cast(null as GEOGRAPHY) as f_geography, "
        "cast(null as STRUCT<f_datetime DATETIME>) as f_struct) limit 0)")
    assert query == expected
def test_json_error_if_exists_in_partition(bqtk: BQTestKit):
    """error_if_exists() succeeds on fresh partitions and raises on a re-load."""
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        with ds.table("table_bar", schema=json_schema).partition_by(IngestionTime()).isolate() as t:
            pfl = PackageFileLoader(
                "tests/it/bq_test_kit/bq_dsl/bq_resources/data_loaders/resources/dummy_data.json")
            t.json_loader(from_=pfl).to_partition("20201111").error_if_exists().load()
            t.json_loader(from_=pfl).to_partition("20201112").error_if_exists().load()
            # Loading into an already-populated partition must fail.
            with pytest.raises(Exception):
                t.json_loader(from_=pfl).to_partition("20201112").error_if_exists().load()
def test_csv_error_if_exists_in_partition(bqtk: BQTestKit):
    """CSV loader's error_if_exists() raises when a partition already holds data."""
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        schema = [SchemaField("f1", field_type="STRING"),
                  SchemaField("f2", field_type="INT64")]
        with ds.table("table_bar", schema=schema).partition_by(IngestionTime()).isolate() as t:
            pfl = PackageFileLoader(
                "tests/it/bq_test_kit/bq_dsl/bq_resources/data_loaders/resources/dummy_data.csv")
            t.dsv_loader(from_=pfl).to_partition("20201111").error_if_exists().load()
            t.dsv_loader(from_=pfl).to_partition("20201112").error_if_exists().load()
            # Second load into the same partition must fail.
            with pytest.raises(Exception):
                t.dsv_loader(from_=pfl).to_partition("20201112").error_if_exists().load()
def test_csv_invalid_load():
    """A non-string, non-loader datum raises InvalidInstanceException for the DSV transformer."""
    xformer = DsvDataLiteralTransformer().use_datetime_like_cast().skip_leading_rows(1)
    schema = PackageFileLoader(
        "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema.json")
    with pytest.raises(InvalidInstanceException):
        xformer.load(123, schema)
def test_tsv_load_simple_schema():
    """Tab-delimited datum with custom quote (#) and escape (~) characters renders correctly."""
    xformer = DsvDataLiteralTransformer().use_datetime_like_cast() \
        .with_quote_character("#").with_field_delimiter("\t").with_escape_character("~")
    query = xformer.load(
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema_datum.tsv"),
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema.json"))
    expected = (
        "(select 'str with quote \\' and backslash \\\\~' as f_string, "
        "from_base64('YW55') as f_bytes, cast(1 as INT64) as f_int, cast(1.5 as FLOAT64) as f_float, "
        "true as f_bool, "
        "cast('2020-11-26 17:09:03.967259 UTC' as TIMESTAMP) as f_timestamp, "
        "cast('2020-11-26' as DATE) as f_date, cast('11:09:03' as TIME) as f_time, "
        "cast('2020-11-26T17:09:03' as DATETIME) as f_datetime, cast(1.6 as NUMERIC) as f_numeric, "
        "ST_GEOGPOINT(-122.35, 47.62) as f_geography, "
        "cast(null as STRUCT<f_datetime DATETIME>) as f_struct)")
    assert query == expected
def test_json_load_in_partition(bqtk: BQTestKit):
    """Loading into an ingestion-time partition stamps rows with that partition time."""
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        with ds.table("table_bar", schema=json_schema).partition_by(IngestionTime()).isolate() as t:
            pfl = PackageFileLoader(
                "tests/it/bq_test_kit/bq_dsl/bq_resources/data_loaders/resources/dummy_data.json")
            t.json_loader(from_=pfl).to_partition("20201112").load()
            job: QueryJob = t._bq_client.query(
                f"select max(_partitiontime) as pt from `{t.fqdn()}`")
            rows = list(job.result())
            assert len(rows) == 1
            assert rows[0].pt == datetime(2020, 11, 12, 00, 00, tzinfo=timezone.utc)
def test_json_load_simple_schema_with_string_cast_to_bytes():
    """use_string_cast_to_bytes() renders BYTES as cast('...' as BYTES) instead of from_base64."""
    xformer = JsonDataLiteralTransformer().use_string_cast_to_bytes().use_datetime_like_cast()
    query = xformer.load(
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/"
            "simple_schema_datum_with_string_cast_to_bytes.json"),
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema.json"))
    expected = (
        "(select 'str with quote \\' and backslash \\\\' as f_string, "
        "cast('any' as BYTES) as f_bytes, cast(1 as INT64) as f_int, cast(1.5 as FLOAT64) as f_float, "
        "true as f_bool, "
        "cast('2020-11-26 17:09:03.967259 UTC' as TIMESTAMP) as f_timestamp, "
        "cast('2020-11-26' as DATE) as f_date, cast('11:09:03' as TIME) as f_time, "
        "cast('2020-11-26 17:09:03' as DATETIME) as f_datetime, cast(1.6 as NUMERIC) as f_numeric, "
        "ST_GEOGPOINT(-122.35, 47.62) as f_geography, "
        "cast(null as STRUCT<f_datetime DATETIME>) as f_struct)")
    assert query == expected
def test_json_truncate_in_partition(bqtk: BQTestKit):
    """overwrite() truncates the target partition: two loads + one overwrite leaves
    only the last file's rows (2)."""
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        with ds.table("table_bar", schema=json_schema).partition_by(IngestionTime()).isolate() as t:
            pfl = PackageFileLoader(
                "tests/it/bq_test_kit/bq_dsl/bq_resources/data_loaders/resources/dummy_data.json")
            t.json_loader(from_=pfl).to_partition("20201112").load()
            t.json_loader(from_=pfl).to_partition("20201112").overwrite().load()
            job: QueryJob = t._bq_client.query(f"select count(*) as nb from `{t.fqdn()}`")
            rows = list(job.result())
            assert rows[0]["nb"] == 2
def test_required_schema():
    """Every missing REQUIRED field is reported, including nested and repeated paths."""
    transformer = JsonDataLiteralTransformer()
    with pytest.raises(DataLiteralTransformException) as exception:
        transformer.load(
            PackageFileLoader(
                "tests/ut/bq_test_kit/data_literal_transformers/resources/"
                "required_schema_datum.json"),
            PackageFileLoader(
                "tests/ut/bq_test_kit/data_literal_transformers/resources/required_schema.json"))
    expected_message = (
        "Exception happened in line 1 with the following errors :\n"
        "\t.f_string is required,\n"
        "\t.f_bytes is required,\n"
        "\t.f_int is required,\n"
        "\t.f_float is required,\n"
        "\t.f_bool is required,\n"
        "\t.f_timestamp is required,\n"
        "\t.f_date is required,\n"
        "\t.f_time is required,\n"
        "\t.f_datetime is required,\n"
        "\t.f_numeric is required,\n"
        "\t.f_geography is required,\n"
        "\t.f_record is required,\n"
        "\t.f_repeated_record[0].f_string is required,\n"
        "\t.f_repeated_record[0].f_int is required,\n"
        "\t.f_repeated_record[0].f_bool is required,\n"
        "\t.f_repeated_record[0].f_date is required,\n"
        "\t.f_repeated_record[0].f_datetime is required,\n"
        "\t.f_repeated_record[0].f_geography is required,\n"
        "\t.f_repeated_record[0].f_record is required,\n"
        "\t.f_repeated_record[0].f_repeated_record[0].f_nested_string is required,\n"
        "\t.f_repeated_record[1].f_string is required,\n"
        "\t.f_repeated_record[1].f_int is required,\n"
        "\t.f_repeated_record[1].f_bool is required,\n"
        "\t.f_repeated_record[1].f_date is required,\n"
        "\t.f_repeated_record[1].f_datetime is required,\n"
        "\t.f_repeated_record[1].f_geography is required,\n"
        "\t.f_repeated_record[1].f_record is required,\n"
        "\t.f_repeated_record[1].f_repeated_record[0].f_nested_string is required")
    assert str(exception.value) == expected_message
def test_geography_point():
    """GEOGRAPHY values not matching POINT(x y) are reported per offending line."""
    transformer = JsonDataLiteralTransformer()
    with pytest.raises(DataLiteralTransformException) as exception:
        transformer.load(
            PackageFileLoader(
                "tests/ut/bq_test_kit/data_literal_transformers/resources/"
                "geography_schema_datum.json"),
            PackageFileLoader(
                "tests/ut/bq_test_kit/data_literal_transformers/resources/geography_schema.json"))
    expected_message = (
        "Exception happened in line 6 with the following errors :\n"
        "\t.f_geography is a GEOGRAPHY type. It is expected to match POINT(x y) "
        "where x and y are FLOAT64. Instead get NotPoInT( -122.35 47.62 ). "
        "POINT is case insensitive.\n\n"
        "Exception happened in line 7 with the following errors :\n"
        "\t.f_geography is a GEOGRAPHY type. It is expected to match POINT(x y) "
        "where x and y are FLOAT64. Instead get NotPoInT( -122.35 47.62 ). "
        "POINT is case insensitive.")
    assert str(exception.value) == expected_message
def test_technical_column_with_datum_temp_table(bqtk: BQTestKit):
    """With temp tables (default), both the bare table name and the interpolated one
    resolve, and the result schema matches the temp schema (technical columns kept)."""
    complex_schema = PackageFileLoader("tests/it/bq_test_kit/bq_dsl/resources/"
                                       "technical_column_schema.json")
    temp_complex_schema = PackageFileLoader(
        "tests/it/bq_test_kit/bq_dsl/resources/"
        "temp_technical_column_schema.json")
    complex_datum = PackageFileLoader(
        "tests/it/bq_test_kit/bq_dsl/resources/technical_column_schema_datum.json")
    transformer = JsonDataLiteralTransformer(json_format=JsonFormat.JSON_ARRAY)
    datum_mapping = {"technical_column_table": (complex_datum, complex_schema)}
    result = bqtk.query_template(from_="select * \nfrom technical_column_table;\n") \
        .with_datum(datum_mapping) \
        .loaded_with(transformer) \
        .add_interpolator(JinjaInterpolator()) \
        .run()
    assert len(result.rows) == 1
    assert result.schema == SchemaMixin().to_schema_field_list(temp_complex_schema)
    result = bqtk.query_template(from_="select count(*) \nfrom {{technical_column_table}};\n") \
        .with_datum(datum_mapping) \
        .loaded_with(transformer) \
        .add_interpolator(JinjaInterpolator()) \
        .run()
    assert len(result.rows) == 1
    # Both spellings must reference the same temp table, so the union duplicates the row.
    result = bqtk.query_template(from_="""
        select * from technical_column_table
        UNION ALL
        select * from {{technical_column_table}}
        """) \
        .with_datum(datum_mapping) \
        .loaded_with(transformer) \
        .add_interpolator(JinjaInterpolator()) \
        .run()
    assert len(result.rows) == 2
    assert result.rows[0] == result.rows[1]
def test_csv_append_in_partition(bqtk: BQTestKit):
    """Default and explicit append() both add rows: three 2-row loads give 6 rows."""
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        schema = [SchemaField("f1", field_type="STRING"),
                  SchemaField("f2", field_type="INT64")]
        with ds.table("table_bar", schema=schema).partition_by(IngestionTime()).isolate() as t:
            pfl = PackageFileLoader(
                "tests/it/bq_test_kit/bq_dsl/bq_resources/data_loaders/resources/dummy_data.csv")
            t.dsv_loader(from_=pfl).to_partition("20201112").load()
            t.dsv_loader(from_=pfl).to_partition("20201112").load()
            t.dsv_loader(from_=pfl).to_partition("20201112").append().load()
            job: QueryJob = t._bq_client.query(f"select count(*) as nb from `{t.fqdn()}`")
            rows = list(job.result())
            assert rows[0]["nb"] == 6
def test_json_load_with_extra_column():
    """Unknown keys raise per-line errors unless ignore_unknown_values() is enabled,
    in which case the extra columns are simply dropped from the literal."""
    transformer = JsonDataLiteralTransformer()
    datum = PackageFileLoader(
        "tests/ut/bq_test_kit/data_literal_transformers/resources/"
        "simple_schema_with_extra_datum.json")
    schema = PackageFileLoader(
        "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema.json")
    with pytest.raises(DataLiteralTransformException) as exception:
        transformer.load(datum, schema)
    assert str(exception.value) == (
        "Exception happened in line 1 with the following errors :\n"
        "\tKey f_string_extra @ . not in schema\n\n"
        "Exception happened in line 2 with the following errors :\n"
        "\tKey f_string_extra2 @ . not in schema")
    query = transformer.ignore_unknown_values().use_datetime_like_cast().load(datum, schema)
    assert query == (
        "(select 'str with quote \\' and backslash \\\\' as f_string, "
        "from_base64('YW55') as f_bytes, cast(1 as INT64) as f_int, cast(1.5 as FLOAT64) as f_float, "
        "true as f_bool, "
        "cast('2020-11-26 17:09:03.967259 UTC' as TIMESTAMP) as f_timestamp, "
        "cast('2020-11-26' as DATE) as f_date, cast('11:09:03' as TIME) as f_time, "
        "cast('2020-11-26 17:09:03' as DATETIME) as f_datetime, cast(1.6 as NUMERIC) as f_numeric, "
        "ST_GEOGPOINT(-122.35, 47.62) as f_geography, "
        "cast(null as STRUCT<f_datetime DATETIME>) as f_struct"
        "\nunion all\n"
        "select 'other string' as f_string, "
        "from_base64('YW55') as f_bytes, cast(2 as INT64) as f_int, cast(2.5 as FLOAT64) as f_float, "
        "true as f_bool, "
        "cast('2020-11-26 17:09:03.967259 UTC' as TIMESTAMP) as f_timestamp, "
        "cast('2020-11-26' as DATE) as f_date, cast('11:09:03' as TIME) as f_time, "
        "cast('2020-11-26 17:09:03' as DATETIME) as f_datetime, cast(2.6 as NUMERIC) as f_numeric, "
        "ST_GEOGPOINT(-122.50, -40.6) as f_geography, "
        "cast(null as STRUCT<f_datetime DATETIME>) as f_struct)")
def test_dsv_load_struct():
    """A flat DSV value cannot populate a RECORD field — the transformer rejects it."""
    xformer = DsvDataLiteralTransformer()
    row = ('"other string","YW55",2,2.5,true,"2020-11-26 17:09:03.967259 UTC",'
           '"2020-11-26","11:09:03","2020-11-26 17:09:03",2.6,"POINT(-122.50 -40.6)",invalid struct')
    with pytest.raises(DataLiteralTransformException) as exception:
        xformer.load(
            row,
            PackageFileLoader(
                "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema.json"))
    assert str(exception.value) == (
        "Exception happened in line 1 with the following errors :\n"
        "\t.f_struct is not a dictionary while schema is of type RECORD")
def test_json_loader():
    """Table.json_loader builds a JsonDataLoader from a package file resource."""
    conf = BQTestKitConfig({DEFAULT_LOCATION: "EU"})
    project = Project("test_project", bq_client=None, bqtk_config=conf)
    ds = Dataset("dataset_foo", project=project, bq_client=None, bqtk_config=conf)
    table = Table("table_bar", from_dataset=ds, bq_client=None, bqtk_config=conf)
    pfl = PackageFileLoader(
        "tests/ut/bq_test_kit/resource_loaders/resources/package_file_test_resource.txt")
    loader = table.json_loader(from_=pfl)
    assert isinstance(loader, JsonDataLoader)
def test_json_load_as_simple_schema():
    """load_as maps each named input to its own data literal ("{}" yields typed nulls)."""
    xformer = JsonDataLiteralTransformer().use_datetime_like_cast()
    simple_schema = PackageFileLoader(
        "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema.json")
    query = xformer.load_as({
        "input_1": (PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema_datum.json"
        ), simple_schema),
        "input_2": ("{}", simple_schema)
    })
    assert query == {
        "input_1":
            ("(select 'str with quote \\' and backslash \\\\' as f_string, "
             "from_base64('YW55') as f_bytes, cast(1 as INT64) as f_int, cast(1.5 as FLOAT64) as f_float, "
             "true as f_bool, "
             "cast('2020-11-26 17:09:03.967259 UTC' as TIMESTAMP) as f_timestamp, "
             "cast('2020-11-26' as DATE) as f_date, cast('11:09:03' as TIME) as f_time, "
             "cast('2020-11-26T17:09:03' as DATETIME) as f_datetime, cast(1.6 as NUMERIC) as f_numeric, "
             "ST_GEOGPOINT(-122.35, 47.62) as f_geography, "
             "cast(null as STRUCT<f_datetime DATETIME>) as f_struct)"),
        "input_2":
            ("(select cast(null as STRING) as f_string, "
             "cast(null as BYTES) as f_bytes, cast(null as INT64) as f_int, cast(null as FLOAT64) as f_float, "
             "cast(null as BOOLEAN) as f_bool, "
             "cast(null as TIMESTAMP) as f_timestamp, "
             "cast(null as DATE) as f_date, cast(null as TIME) as f_time, "
             "cast(null as DATETIME) as f_datetime, cast(null as NUMERIC) as f_numeric, "
             "cast(null as GEOGRAPHY) as f_geography, "
             "cast(null as STRUCT<f_datetime DATETIME>) as f_struct)")
    }
def test_csv_load(bqtk: BQTestKit):
    """Plain CSV load inserts both rows with the expected column values."""
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        schema = [SchemaField("f1", field_type="STRING"),
                  SchemaField("f2", field_type="INT64")]
        with ds.table("table_bar", schema=schema).isolate() as t:
            pfl = PackageFileLoader(
                "tests/it/bq_test_kit/bq_dsl/bq_resources/data_loaders/resources/dummy_data.csv")
            t.dsv_loader(from_=pfl).load()
            job: QueryJob = t._bq_client.query(f"select count(*) as nb from `{t.fqdn()}`")
            rows = list(job.result())
            assert len(rows) == 1
            assert rows[0].nb == 2
            job: QueryJob = t._bq_client.query(f"select * from `{t.fqdn()}` order by f1")
            rows_as_dict = [dict(row.items()) for row in job.result()]
            assert rows_as_dict == [{"f1": "value1", "f2": 2},
                                    {"f1": "value3", "f2": 4}]
def test_json_array_load_simple_schema(json_input):
    """JSON_ARRAY input renders datetime-like fields with typed literals (no cast)."""
    xformer = JsonDataLiteralTransformer(json_format=JsonFormat.JSON_ARRAY)
    query = xformer.load(
        json_input,
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/simple_schema.json"),
    )
    expected = (
        "(select 'str with quote \\' and backslash \\\\' as f_string, "
        "from_base64('YW55') as f_bytes, cast(1 as INT64) as f_int, cast(1.5 as FLOAT64) as f_float, "
        "true as f_bool, "
        "timestamp '2020-11-26 17:09:03.967259 UTC' as f_timestamp, "
        "date '2020-11-26' as f_date, time '11:09:03' as f_time, "
        "datetime '2020-11-26T17:09:03' as f_datetime, cast(1.6 as NUMERIC) as f_numeric, "
        "ST_GEOGPOINT(-122.35, 47.62) as f_geography, "
        "cast(null as STRUCT<f_datetime DATETIME>) as f_struct)")
    assert query == expected
def test_json_load(bqtk: BQTestKit):
    """JSON loader inserts nested/repeated rows faithfully (structs and arrays intact)."""
    with bqtk.project("it").dataset("dataset_foo").isolate() as ds:
        with ds.table("table_bar", schema=json_schema).isolate() as t:
            pfl = PackageFileLoader(
                "tests/it/bq_test_kit/bq_dsl/bq_resources/data_loaders/resources/dummy_data.json")
            t.json_loader(from_=pfl).load()
            job: QueryJob = t._bq_client.query(f"select count(*) as nb from `{t.fqdn()}`")
            rows = list(job.result())
            assert len(rows) == 1
            assert rows[0].nb == 2
            job: QueryJob = t._bq_client.query(f"select * from `{t.fqdn()}` order by f1")
            rows_as_dict = [dict(row.items()) for row in job.result()]
            assert rows_as_dict == [
                {
                    "f1": "value1",
                    "struct_f2": {"f2_1": 1},
                    "array_f3": [
                        {"f3_1": datetime(2020, 10, 21, 10, 0)},
                        {"f3_1": datetime(2020, 10, 21, 11, 0)}
                    ]
                },
                {
                    "f1": "value2",
                    "struct_f2": {"f2_1": 2},
                    "array_f3": [
                        {"f3_1": datetime(2020, 10, 21, 12, 0)},
                        {"f3_1": datetime(2020, 10, 21, 13, 0)}
                    ]
                }
            ]
def test_json_complex_schema():
    """Three complex-schema rows render as a literal joined by `union all`:
    a fully populated row, an all-null row, and a row of arrays-of-null."""
    transformer = JsonDataLiteralTransformer()
    query = transformer.load(
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/"
            "complex_schema_datum.json"),
        PackageFileLoader(
            "tests/ut/bq_test_kit/data_literal_transformers/resources/complex_schema.json"))
    expected = (
        "(select 'f_string' as f_string, from_base64('YW55') as f_bytes, cast(1 as INT64) as f_int, "
        "cast(1.1 as FLOAT64) as f_float, true as f_bool, "
        "timestamp '2020-11-26 17:09:03.967259 UTC' as f_timestamp, date '2020-11-26' as f_date, "
        "time '17:09:03.967259' as f_time, datetime '2020-11-26 17:09:03.967259' as f_datetime, "
        "cast(1.2 as NUMERIC) as f_numeric, ST_GEOGPOINT(1, 2) as f_geography, "
        "struct('f_string' as f_string, from_base64('YW55') as f_bytes, cast(2 as INT64) as f_int, "
        "cast(2.1 as FLOAT64) as f_float, true as f_bool, "
        "timestamp '2019-11-26 17:09:03.967259 UTC' as f_timestamp, date '2019-11-26' as f_date, "
        "time '16:09:03.967259' as f_time, datetime '2019-11-26 17:09:03.967259' as f_datetime, "
        "cast(2.2 as NUMERIC) as f_numeric, ST_GEOGPOINT(1, 2) as f_geography, "
        "[struct('f_string' as f_string, from_base64('YW55') as f_bytes, "
        "cast(1 as INT64) as f_int, cast(1.1 as FLOAT64) as f_float, true as f_bool, "
        "timestamp '2020-11-26 17:09:03.967259 UTC' as f_timestamp, date '2020-11-26' as f_date, "
        "time '17:09:03.967259' as f_time, datetime '2020-11-26 17:09:03.967259' as f_datetime, "
        "cast(1.2 as NUMERIC) as f_numeric, ST_GEOGPOINT(1, 2) as f_geography)] as f_repeated_struct, "
        "['f_string'] as f_string_repeated, [from_base64('YW55')] as f_bytes_repeated, "
        "[cast(1 as INT64)] as f_int_repeated, [cast(1.1 as FLOAT64)] as f_float_repeated, "
        "[true] as f_bool_repeated, [timestamp '2020-11-26 17:09:03.967259 UTC'] as f_timestamp_repeated,"
        " [date '2020-11-26'] as f_date_repeated, [time '17:09:03.967259'] as f_time_repeated, "
        "[datetime '2020-11-26 17:09:03.967259'] as f_datetime_repeated, "
        "[cast(1.2 as NUMERIC)] as f_numeric_repeated, [ST_GEOGPOINT(1, 2)] as f_geography_repeated) "
        "as f_struct, [struct('f_string' as f_string, from_base64('YW55') as f_bytes, "
        "cast(1 as INT64) as f_int, cast(1.1 as FLOAT64) as f_float, true as f_bool, "
        "timestamp '2020-11-26 17:09:03.967259 UTC' as f_timestamp, date '2020-11-26' as f_date, "
        "time '17:09:03.967259' as f_time, datetime '2020-11-26 17:09:03.967259' as f_datetime, "
        "cast(1.2 as NUMERIC) as f_numeric, ST_GEOGPOINT(1, 2) as f_geography)] as f_repeated_struct,"
        " ['f_string'] as f_string_repeated, [from_base64('YW55')] as f_bytes_repeated, "
        "[cast(1 as INT64)] as f_int_repeated, [cast(1.1 as FLOAT64)] as f_float_repeated, "
        "[true] as f_bool_repeated, [timestamp '2020-11-26 17:09:03.967259 UTC'] as f_timestamp_repeated,"
        " [date '2020-11-26'] as f_date_repeated, [time '17:09:03.967259'] as f_time_repeated, "
        "[datetime '2020-11-26 17:09:03.967259'] as f_datetime_repeated, "
        "[cast(1.2 as NUMERIC)] as f_numeric_repeated, [ST_GEOGPOINT(1, 2)] as f_geography_repeated"
        "\nunion all\n"
        "select cast(null as STRING) as f_string, cast(null as BYTES) as f_bytes, "
        "cast(null as INT64) as f_int, cast(null as FLOAT64) as f_float, cast(null as BOOLEAN) as f_bool,"
        " cast(null as TIMESTAMP) as f_timestamp, cast(null as DATE) as f_date, "
        "cast(null as TIME) as f_time, cast(null as DATETIME) as f_datetime, "
        "cast(null as NUMERIC) as f_numeric, cast(null as GEOGRAPHY) as f_geography, "
        "cast(null as STRUCT<f_string STRING, f_bytes BYTES, f_int INT64, f_float FLOAT64, "
        "f_bool BOOLEAN, f_timestamp TIMESTAMP, f_date DATE, f_time TIME, f_datetime DATETIME, "
        "f_numeric NUMERIC, f_geography GEOGRAPHY, f_repeated_struct ARRAY<STRUCT<f_string STRING, "
        "f_bytes BYTES, f_int INT64, f_float FLOAT64, f_bool BOOLEAN, f_timestamp TIMESTAMP, "
        "f_date DATE, f_time TIME, f_datetime DATETIME, f_numeric NUMERIC, f_geography GEOGRAPHY>>, "
        "f_string_repeated ARRAY<STRING>, f_bytes_repeated ARRAY<BYTES>, f_int_repeated ARRAY<INT64>, "
        "f_float_repeated ARRAY<FLOAT64>, f_bool_repeated ARRAY<BOOLEAN>, "
        "f_timestamp_repeated ARRAY<TIMESTAMP>, f_date_repeated ARRAY<DATE>, "
        "f_time_repeated ARRAY<TIME>, f_datetime_repeated ARRAY<DATETIME>, "
        "f_numeric_repeated ARRAY<NUMERIC>, f_geography_repeated ARRAY<GEOGRAPHY>>) as f_struct, "
        "cast(null as ARRAY<STRUCT<f_string STRING, f_bytes BYTES, f_int INT64, f_float FLOAT64, "
        "f_bool BOOLEAN, f_timestamp TIMESTAMP, f_date DATE, f_time TIME, f_datetime DATETIME, "
        "f_numeric NUMERIC, f_geography GEOGRAPHY>>) as f_repeated_struct, "
        "cast(null as ARRAY<STRING>) as f_string_repeated, "
        "cast(null as ARRAY<BYTES>) as f_bytes_repeated, cast(null as ARRAY<INT64>) as f_int_repeated, "
        "cast(null as ARRAY<FLOAT64>) as f_float_repeated, "
        "cast(null as ARRAY<BOOLEAN>) as f_bool_repeated, "
        "cast(null as ARRAY<TIMESTAMP>) as f_timestamp_repeated, "
        "cast(null as ARRAY<DATE>) as f_date_repeated, "
        "cast(null as ARRAY<TIME>) as f_time_repeated, "
        "cast(null as ARRAY<DATETIME>) as f_datetime_repeated, "
        "cast(null as ARRAY<NUMERIC>) as f_numeric_repeated, cast(null as ARRAY<GEOGRAPHY>) "
        "as f_geography_repeated"
        "\nunion all\n"
        "select cast(null as STRING) as f_string, cast(null as BYTES) as f_bytes, "
        "cast(null as INT64) as f_int, cast(null as FLOAT64) as f_float, cast(null as BOOLEAN) as f_bool,"
        " cast(null as TIMESTAMP) as f_timestamp, cast(null as DATE) as f_date, "
        "cast(null as TIME) as f_time, cast(null as DATETIME) as f_datetime, "
        "cast(null as NUMERIC) as f_numeric, cast(null as GEOGRAPHY) as f_geography, "
        "cast(null as STRUCT<f_string STRING, f_bytes BYTES, f_int INT64, f_float FLOAT64, "
        "f_bool BOOLEAN, f_timestamp TIMESTAMP, f_date DATE, f_time TIME, f_datetime DATETIME, "
        "f_numeric NUMERIC, f_geography GEOGRAPHY, f_repeated_struct ARRAY<STRUCT<f_string STRING, "
        "f_bytes BYTES, f_int INT64, f_float FLOAT64, f_bool BOOLEAN, f_timestamp TIMESTAMP, f_date DATE,"
        " f_time TIME, f_datetime DATETIME, f_numeric NUMERIC, f_geography GEOGRAPHY>>, "
        "f_string_repeated ARRAY<STRING>, f_bytes_repeated ARRAY<BYTES>, f_int_repeated ARRAY<INT64>, "
        "f_float_repeated ARRAY<FLOAT64>, f_bool_repeated ARRAY<BOOLEAN>, "
        "f_timestamp_repeated ARRAY<TIMESTAMP>, f_date_repeated ARRAY<DATE>, "
        "f_time_repeated ARRAY<TIME>, f_datetime_repeated ARRAY<DATETIME>, "
        "f_numeric_repeated ARRAY<NUMERIC>, f_geography_repeated ARRAY<GEOGRAPHY>>) as f_struct, "
        "[cast(null as STRUCT<f_string STRING, f_bytes BYTES, f_int INT64, f_float FLOAT64, "
        "f_bool BOOLEAN, f_timestamp TIMESTAMP, f_date DATE, f_time TIME, f_datetime DATETIME, "
        "f_numeric NUMERIC, f_geography GEOGRAPHY>)] as f_repeated_struct, "
        "[cast(null as STRING)] as f_string_repeated, [cast(null as BYTES)] as f_bytes_repeated, "
        "[cast(null as INT64)] as f_int_repeated, [cast(null as FLOAT64)] as f_float_repeated, "
        "[cast(null as BOOLEAN)] as f_bool_repeated, [cast(null as TIMESTAMP)] as f_timestamp_repeated, "
        "[cast(null as DATE)] as f_date_repeated, [cast(null as TIME)] as f_time_repeated, "
        "[cast(null as DATETIME)] as f_datetime_repeated, [cast(null as NUMERIC)] as f_numeric_repeated, "
        "[cast(null as GEOGRAPHY)] as f_geography_repeated)")
    assert query == expected