def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test):
    """Schema detection succeeds for a frame that includes a date column.

    The detected schema must contain INTEGER, STRING and DATE fields and no
    "could not determine" warning may be emitted.
    """
    rows = [
        {"id": 10, "status": u"FOO", "created_at": datetime.date(2019, 5, 10)},
        {"id": 20, "status": u"BAR", "created_at": datetime.date(2018, 9, 12)},
    ]
    dataframe = pandas.DataFrame(data=rows)

    with warnings.catch_warnings(record=True) as warned:
        detected_schema = module_under_test.dataframe_to_bq_schema(
            dataframe, bq_schema=[]
        )

    expected_schema = (
        schema.SchemaField("id", "INTEGER", mode="NULLABLE"),
        schema.SchemaField("status", "STRING", mode="NULLABLE"),
        schema.SchemaField("created_at", "DATE", mode="NULLABLE"),
    )
    # Compare independently of the order in which the fields were detected.
    field_name = operator.attrgetter("name")
    detected_sorted = sorted(detected_schema, key=field_name)
    expected_sorted = sorted(expected_schema, key=field_name)
    assert detected_sorted == expected_sorted

    # there should be no relevant warnings
    unwanted_warnings = [
        caught for caught in warned if "could not determine" in str(caught).lower()
    ]
    assert not unwanted_warnings
def test_dataframe_to_parquet_dict_sequence_schema(module_under_test):
    """A schema given as plain dicts reaches dataframe_to_arrow as SchemaFields."""
    dataframe = pandas.DataFrame(
        {"field01": [u"hello", u"world"], "field02": [True, False]}
    )
    dict_schema = [
        {"name": "field01", "type": "STRING", "mode": "REQUIRED"},
        {"name": "field02", "type": "BOOL", "mode": "NULLABLE"},
    ]

    # Neither the arrow conversion nor the parquet writer should really run.
    patch_write_table = mock.patch.object(
        module_under_test.pyarrow.parquet, "write_table", autospec=True
    )
    patch_to_arrow = mock.patch.object(
        module_under_test, "dataframe_to_arrow", autospec=True
    )

    with patch_write_table, patch_to_arrow as fake_to_arrow:
        module_under_test.dataframe_to_parquet(dataframe, dict_schema, None)

    # The schema is the second positional argument of dataframe_to_arrow().
    schema_arg = fake_to_arrow.call_args.args[1]
    assert schema_arg == [
        schema.SchemaField("field01", "STRING", mode="REQUIRED"),
        schema.SchemaField("field02", "BOOL", mode="NULLABLE"),
    ]
def test_dataframe_to_arrow_w_unknown_type(module_under_test):
    """An unrecognized BigQuery type triggers one warning but keeps the column."""
    column_names = ("field00", "field01", "field02", "field03")
    bq_schema = (
        schema.SchemaField("field00", "UNKNOWN_TYPE"),
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
    )
    dataframe = pandas.DataFrame(
        {
            "field00": ["whoami", "whatami"],
            "field01": ["hello", "world"],
            "field02": [b"abd", b"efg"],
            "field03": [1, 2],
        }
    )

    with warnings.catch_warnings(record=True) as warned:
        arrow_table = module_under_test.dataframe_to_arrow(dataframe, bq_schema)
    arrow_schema = arrow_table.schema

    # Exactly one warning, and it names the field with the unknown type.
    assert len(warned) == 1
    assert "field00" in str(warned[0])

    # All columns are present, in schema order.
    assert len(arrow_schema) == len(bq_schema)
    for position, expected_name in enumerate(column_names):
        assert arrow_schema[position].name == expected_name
def test_dataframe_to_bq_schema_dict_sequence(module_under_test):
    """Dict-style schema overrides are honored; other columns are autodetected."""
    columns = collections.OrderedDict()
    columns["str_column"] = [u"hello", u"world"]
    columns["int_column"] = [42, 8]
    columns["bool_column"] = [True, False]
    dataframe = pandas.DataFrame(columns)

    # Only two of the three columns are overridden; "int_column" is left out.
    dict_schema = [
        {"name": "str_column", "type": "STRING", "mode": "NULLABLE"},
        {"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"},
    ]

    returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, dict_schema)

    assert returned_schema == (
        schema.SchemaField("str_column", "STRING", "NULLABLE"),
        schema.SchemaField("int_column", "INTEGER", "NULLABLE"),
        schema.SchemaField("bool_column", "BOOL", "REQUIRED"),
    )
def test_augment_schema_type_detection_fails(module_under_test):
    """augment_schema() returns None and warns when types stay undetectable.

    Two columns hold dicts with inconsistent keys and value types, and their
    schema fields have no type. The test expects a single "could not
    determine" warning that mentions pyarrow and names both columns.
    """
    import re  # local import: only needed for the whole-word check below

    dataframe = pandas.DataFrame(
        data=[
            {
                "status": u"FOO",
                "struct_field": {"one": 1},
                "struct_field_2": {"foo": u"123"},
            },
            {
                "status": u"BAR",
                "struct_field": {"two": u"111"},
                "struct_field_2": {"bar": 27},
            },
        ]
    )
    current_schema = [
        schema.SchemaField("status", field_type="STRING", mode="NULLABLE"),
        schema.SchemaField("struct_field", field_type=None, mode="NULLABLE"),
        schema.SchemaField("struct_field_2", field_type=None, mode="NULLABLE"),
    ]

    with warnings.catch_warnings(record=True) as warned:
        augmented_schema = module_under_test.augment_schema(dataframe, current_schema)

    assert augmented_schema is None

    expected_warnings = [
        warning for warning in warned if "could not determine" in str(warning)
    ]
    assert len(expected_warnings) == 1
    warning_msg = str(expected_warnings[0])
    assert "pyarrow" in warning_msg.lower()
    # "struct_field" is a substring of "struct_field_2", so a plain ``in``
    # check would pass even if only the second column were reported. Match
    # the first name as a whole word instead ("_" counts as a word character,
    # so ``\b`` does not match inside "struct_field_2").
    assert re.search(r"\bstruct_field\b", warning_msg)
    assert "struct_field_2" in warning_msg
def test_download_arrow_tabledata_list_known_field_type(module_under_test):
    """Known field types yield typed arrow columns without a schema warning."""
    int_values = [1, 10, 100]
    str_values = ["2.2", "22.22", "222.222"]

    fake_page = api_core.page_iterator.Page(
        parent=mock.Mock(),
        items=[{"page_data": "foo"}],
        item_to_value=api_core.page_iterator._item_to_value_identity,
    )
    fake_page._columns = [int_values, str_values]

    bq_schema = [
        schema.SchemaField("population_size", "INTEGER"),
        schema.SchemaField("non_alien_field", "STRING"),
    ]
    results_gen = module_under_test.download_arrow_tabledata_list(
        [fake_page], bq_schema
    )

    with warnings.catch_warnings(record=True) as warned:
        result = next(results_gen)

    unwanted_warnings = [
        caught
        for caught in warned
        if "please pass schema= explicitly" in str(caught).lower()
    ]
    assert not unwanted_warnings

    assert len(result.columns) == 2

    first_column = result.columns[0]
    assert type(first_column) is pyarrow.lib.Int64Array
    assert list(first_column) == [1, 10, 100]

    second_column = result.columns[1]
    assert type(second_column) is pyarrow.lib.StringArray
    assert list(second_column) == ["2.2", "22.22", "222.222"]
def test_bq_to_arrow_schema_w_unknown_type(module_under_test):
    """bq_to_arrow_schema() returns None when one field type is unrecognized."""
    known_fields = (
        schema.SchemaField("field1", "STRING"),
        schema.SchemaField("field2", "INTEGER"),
    )
    # Don't know what to convert UNKNOWN_TYPE to, let type inference work,
    # instead.
    unknown_field = schema.SchemaField("field3", "UNKNOWN_TYPE")

    result = module_under_test.bq_to_arrow_schema(known_fields + (unknown_field,))

    assert result is None
def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
    """A RECORD with an unrecognized subfield type maps to no arrow type."""
    known_subfields = (
        schema.SchemaField("field1", "STRING"),
        schema.SchemaField("field2", "INTEGER"),
    )
    # Don't know what to convert UNKNOWN_TYPE to, let type inference work,
    # instead.
    unknown_subfield = schema.SchemaField("field3", "UNKNOWN_TYPE")
    record = schema.SchemaField(
        "ignored_name",
        "RECORD",
        mode="NULLABLE",
        fields=known_subfields + (unknown_subfield,),
    )

    result = module_under_test.bq_to_arrow_data_type(record)

    assert result is None
def test_bq_to_arrow_schema_w_unknown_type(module_under_test):
    """An unrecognized field type yields None plus one warning naming the field."""
    known_fields = (
        schema.SchemaField("field1", "STRING"),
        schema.SchemaField("field2", "INTEGER"),
    )
    # Don't know what to convert UNKNOWN_TYPE to, let type inference work,
    # instead.
    unknown_field = schema.SchemaField("field3", "UNKNOWN_TYPE")
    all_fields = known_fields + (unknown_field,)

    with warnings.catch_warnings(record=True) as warned:
        result = module_under_test.bq_to_arrow_schema(all_fields)

    assert result is None

    assert len(warned) == 1
    assert "field3" in str(warned[0])
def test_to_api_repr_base(self):
    """ExternalConfig.to_api_repr() serializes the base properties and schema."""
    config = external_config.ExternalConfig("")
    config.source_uris = self.SOURCE_URIS
    config.max_bad_records = 17
    config.autodetect = True
    config.ignore_unknown_values = False
    config.compression = "compression"
    config.schema = [schema.SchemaField("full_name", "STRING", mode="REQUIRED")]

    serialized_field = {
        "name": "full_name",
        "type": "STRING",
        "mode": "REQUIRED",
        "description": None,
    }
    expected_resource = {
        "sourceFormat": "",
        "sourceUris": self.SOURCE_URIS,
        "maxBadRecords": 17,
        "autodetect": True,
        "ignoreUnknownValues": False,
        "compression": "compression",
        "schema": {"fields": [serialized_field]},
    }

    self.assertEqual(config.to_api_repr(), expected_resource)
def test_from_api_repr_base(self):
    """from_api_repr()/to_api_repr() round-trip with and without a schema."""
    # Case 1: the base resource carries no schema.
    base_resource = copy.deepcopy(self.BASE_RESOURCE)

    config = external_config.ExternalConfig.from_api_repr(base_resource)

    self._verify_base(config)
    self.assertEqual(config.schema, [])
    self.assertIsNone(config.options)
    self.assertEqual(config.to_api_repr(), self.BASE_RESOURCE)

    # Case 2: the same resource extended with a one-field schema.
    schema_resource = {
        "fields": [
            {
                "name": "full_name",
                "type": "STRING",
                "mode": "REQUIRED",
                "description": None,
            }
        ]
    }
    resource_with_schema = _copy_and_update(
        self.BASE_RESOURCE, {"schema": schema_resource}
    )

    config = external_config.ExternalConfig.from_api_repr(resource_with_schema)

    self._verify_base(config)
    self.assertEqual(
        config.schema, [schema.SchemaField("full_name", "STRING", mode="REQUIRED")]
    )
    self.assertIsNone(config.options)
    self.assertEqual(config.to_api_repr(), resource_with_schema)
def test_bq_to_arrow_array_w_pandas_timestamp(module_under_test, bq_type, rows):
    """pandas.Timestamp values survive a pandas -> arrow -> pandas round trip."""
    timestamps = pandas.Series([pandas.Timestamp(row) for row in rows])
    field = schema.SchemaField("field_name", bq_type)

    converted = module_under_test.bq_to_arrow_array(timestamps, field)

    assert timestamps.equals(converted.to_pandas())
def test_from_api_repr_base(self):
    """from_api_repr()/to_api_repr() round-trip with and without a schema."""
    # First pass: the base resource has no schema at all.
    plain_resource = copy.deepcopy(self.BASE_RESOURCE)

    config = external_config.ExternalConfig.from_api_repr(plain_resource)

    self._verify_base(config)
    self.assertEqual(config.schema, [])
    self.assertIsNone(config.options)
    self.assertEqual(config.to_api_repr(), self.BASE_RESOURCE)

    # Second pass: add a schema with a single REQUIRED STRING field.
    full_name_field = {
        'name': 'full_name',
        'type': 'STRING',
        'mode': 'REQUIRED',
        'description': None,
    }
    schema_resource = _copy_and_update(
        self.BASE_RESOURCE, {'schema': {'fields': [full_name_field]}})

    config = external_config.ExternalConfig.from_api_repr(schema_resource)

    self._verify_base(config)
    expected_fields = [
        schema.SchemaField('full_name', 'STRING', mode='REQUIRED'),
    ]
    self.assertEqual(config.schema, expected_fields)
    self.assertIsNone(config.options)
    self.assertEqual(config.to_api_repr(), schema_resource)
def test_bq_to_arrow_array_w_nullable_scalars(module_under_test, bq_type, rows):
    """Scalar rows convert to an arrow array that lists back to the same rows."""
    field = schema.SchemaField("field_name", bq_type)
    column = pandas.Series(rows, dtype="object")

    converted = module_under_test.bq_to_arrow_array(column, field)

    assert rows == converted.to_pylist()
def test_bq_to_arrow_array_w_arrays(module_under_test):
    """REPEATED INTEGER rows, including an empty list, round-trip unchanged."""
    repeated_field = schema.SchemaField("field_name", "INTEGER", mode="REPEATED")
    rows = [[1, 2, 3], [], [4, 5, 6]]
    column = pandas.Series(rows, dtype="object")

    converted = module_under_test.bq_to_arrow_array(column, repeated_field)

    assert rows == converted.to_pylist()
def test_to_api_repr_base(self):
    """ExternalConfig.to_api_repr() serializes the base properties and schema."""
    config = external_config.ExternalConfig('')
    config.source_uris = self.SOURCE_URIS
    config.max_bad_records = 17
    config.autodetect = True
    config.ignore_unknown_values = False
    config.compression = 'compression'
    config.schema = [
        schema.SchemaField('full_name', 'STRING', mode='REQUIRED'),
    ]

    full_name_field = {
        'name': 'full_name',
        'type': 'STRING',
        'mode': 'REQUIRED',
        'description': None,
    }
    expected_resource = {
        'sourceFormat': '',
        'sourceUris': self.SOURCE_URIS,
        'maxBadRecords': 17,
        'autodetect': True,
        'ignoreUnknownValues': False,
        'compression': 'compression',
        'schema': {'fields': [full_name_field]},
    }

    self.assertEqual(config.to_api_repr(), expected_resource)
def test_dataframe_to_parquet_w_missing_columns(module_under_test, monkeypatch):
    """A schema field that is absent from the dataframe raises ValueError."""
    empty_frame = pandas.DataFrame()
    bq_schema = (schema.SchemaField("not_found", "STRING"),)

    with pytest.raises(ValueError) as exc_context:
        module_under_test.dataframe_to_parquet(empty_frame, bq_schema, None)

    error_text = str(exc_context.value)
    assert "columns in schema must match" in error_text
def test_dataframe_to_parquet_w_extra_fields(module_under_test, monkeypatch):
    """Schema fields without a dataframe column raise a ValueError naming them."""
    empty_frame = pandas.DataFrame()
    bq_schema = (schema.SchemaField("not_in_df", "STRING"),)

    with pytest.raises(ValueError) as exc_context:
        module_under_test.dataframe_to_parquet(empty_frame, bq_schema, None)

    error_text = str(exc_context.value)
    assert "bq_schema contains fields not present in dataframe" in error_text
    assert "not_in_df" in error_text
def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test):
    """An unrecognized RECORD subfield yields None and one warning naming it."""
    known_subfields = (
        schema.SchemaField("field1", "STRING"),
        schema.SchemaField("field2", "INTEGER"),
    )
    # Don't know what to convert UNKNOWN_TYPE to, let type inference work,
    # instead.
    unknown_subfield = schema.SchemaField("field3", "UNKNOWN_TYPE")
    record = schema.SchemaField(
        "ignored_name",
        "RECORD",
        mode="NULLABLE",
        fields=known_subfields + (unknown_subfield,),
    )

    with warnings.catch_warnings(record=True) as warned:
        result = module_under_test.bq_to_arrow_data_type(record)

    assert result is None

    assert len(warned) == 1
    assert "field3" in str(warned[0])
def test_bq_to_arrow_array_w_structs(module_under_test, bq_type):
    """Struct rows, including a None row, round-trip through bq_to_arrow_array."""
    subfields = (
        schema.SchemaField("int_col", "INTEGER"),
        schema.SchemaField("string_col", "STRING"),
    )
    struct_field = schema.SchemaField("field_name", bq_type, fields=subfields)

    rows = [
        {"int_col": 123, "string_col": "abc"},
        None,
        {"int_col": 456, "string_col": "def"},
    ]
    column = pandas.Series(rows, dtype="object")

    converted = module_under_test.bq_to_arrow_array(column, struct_field)

    assert rows == converted.to_pylist()
def augment_schema(dataframe, current_bq_schema):
    """Fill in still-unknown field types by letting pyarrow inspect the data.

    This function requires ``pyarrow`` to run. Fields whose type is already
    set are passed through unchanged. For every other field the matching
    dataframe column is converted to an arrow array and its arrow type is
    looked up in ``ARROW_SCALAR_IDS_TO_BQ``; list-typed columns are reported
    with mode ``REPEATED`` and the type of their values.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame for which some of the field types are still unknown.
        current_bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
            A BigQuery schema for ``dataframe``. The types of some or all of
            the fields may be ``None``.

    Returns:
        Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]:
            A new list of fields with the detected types filled in, or
            ``None`` (after issuing a warning that names the columns) if the
            type of at least one field could not be determined.
    """
    # pytype: disable=attribute-error
    resolved_fields = []
    unresolved_names = []

    for field in current_bq_schema:
        if field.field_type is not None:
            # Type already known, nothing to detect.
            resolved_fields.append(field)
            continue

        arrow_array = pyarrow.array(dataframe[field.name])
        arrow_type = arrow_array.type

        if pyarrow.types.is_list(arrow_type):
            # `pyarrow.ListType`: the BigQuery type is that of the list items.
            detected_mode = "REPEATED"
            type_id = arrow_array.values.type.id
        else:
            detected_mode = field.mode
            type_id = arrow_type.id

        detected_type = ARROW_SCALAR_IDS_TO_BQ.get(type_id)
        if detected_type is None:
            unresolved_names.append(field.name)
            continue

        resolved_fields.append(
            schema.SchemaField(
                name=field.name,
                field_type=detected_type,
                mode=detected_mode,
                description=field.description,
                fields=field.fields,
            )
        )

    if not unresolved_names:
        return resolved_fields

    warnings.warn(
        "Pyarrow could not determine the type of columns: {}.".format(
            ", ".join(unresolved_names)
        )
    )
    return None
def dataframe_to_bq_schema(dataframe, bq_schema):
    """Build a BigQuery schema for the columns and indexes of a DataFrame.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame for which the client determines the BigQuery schema.
        bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
            A BigQuery schema. Use this argument to override the autodetected
            type for some or all of the DataFrame columns.

    Returns:
        Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]:
            The automatically determined schema. Returns None if the type of
            any column cannot be determined.

    Raises:
        ValueError:
            If ``bq_schema`` contains a struct (record) field, or a field
            that does not correspond to any column of ``dataframe``.
    """
    bq_schema_index = {}
    if bq_schema:
        for field in bq_schema:
            if field.field_type in schema._STRUCT_TYPES:
                raise ValueError(
                    "Uploading dataframes with struct (record) column types "
                    "is not supported. See: "
                    "https://github.com/googleapis/google-cloud-python/issues/8191"
                )
        bq_schema_index = {field.name: field for field in bq_schema}

    # Names of the user-supplied fields not yet matched to a column.
    bq_schema_unused = set(bq_schema_index)

    detected_fields = []
    for column, dtype in list_columns_and_indexes(dataframe):
        override = bq_schema_index.get(column)
        if override:
            # Use provided type from schema, if present.
            bq_schema_unused.discard(override.name)
            detected_fields.append(override)
        else:
            # Otherwise, try to automatically determine the type based on
            # the pandas dtype.
            bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
            if not bq_type:
                warnings.warn(
                    u"Unable to determine type of column '{}'.".format(column)
                )
                return None
            detected_fields.append(schema.SchemaField(column, bq_type))

    # Catch any schema mismatch. The developer explicitly asked to serialize a
    # column, but it was not found.
    if bq_schema_unused:
        raise ValueError(
            u"bq_schema contains fields not present in dataframe: {}".format(
                bq_schema_unused
            )
        )

    return tuple(detected_fields)
def test_upload_time_and_datetime_56(bigquery_client, dataset_id):
    """Upload datetime/time columns with and without an explicit schema.

    Without a schema the ``dt`` values come back timezone-aware in UTC;
    with an explicit DATETIME/TIME schema they come back naive.
    """
    naive_dt = datetime.datetime(2020, 1, 8, 8, 0, 0)
    aware_dt = datetime.datetime(
        2020,
        1,
        8,
        8,
        0,
        0,
        tzinfo=datetime.timezone(datetime.timedelta(hours=-7)),
    )
    time_value = datetime.time(0, 0, 10, 100001)
    df = pandas.DataFrame({"dt": [naive_dt, aware_dt], "t": [time_value, None]})

    # First upload: no explicit schema.
    table = f"{dataset_id}.test_upload_time_and_datetime"
    bigquery_client.load_table_from_dataframe(df, table).result()
    data = [list(row) for row in bigquery_client.list_rows(table)]
    utc = datetime.timezone.utc
    assert data == [
        [
            datetime.datetime(2020, 1, 8, 8, 0, tzinfo=utc),
            datetime.time(0, 0, 10, 100001),
        ],
        [datetime.datetime(2020, 1, 8, 15, 0, tzinfo=utc), None],
    ]

    from google.cloud.bigquery import job, schema

    # Second upload: explicit DATETIME / TIME column types.
    table = f"{dataset_id}.test_upload_time_and_datetime_dt"
    explicit_fields = [
        schema.SchemaField("dt", "DATETIME"),
        schema.SchemaField("t", "TIME"),
    ]
    config = job.LoadJobConfig(schema=explicit_fields)
    bigquery_client.load_table_from_dataframe(df, table, job_config=config).result()
    data = [list(row) for row in bigquery_client.list_rows(table)]
    assert data == [
        [datetime.datetime(2020, 1, 8, 8, 0), datetime.time(0, 0, 10, 100001)],
        [datetime.datetime(2020, 1, 8, 15, 0), None],
    ]
def test_bq_to_arrow_array_w_special_floats(module_under_test):
    """Infinities, NaN and None in a FLOAT64 column survive the conversion."""
    neg_inf, pos_inf = float("-inf"), float("inf")
    rows = [neg_inf, float("nan"), pos_inf, None]
    column = pandas.Series(rows, dtype="object")
    float_field = schema.SchemaField("field_name", "FLOAT64")

    converted = module_under_test.bq_to_arrow_array(column, float_field)
    values = converted.to_pylist()

    assert len(rows) == len(values)
    assert values[0] == float("-inf")
    assert values[1] != values[1]  # NaN doesn't equal itself.
    assert values[2] == float("inf")
    assert values[3] is None
def test_bq_to_arrow_array_w_special_floats(module_under_test):
    """Infinities survive the conversion; NaN and None both come back as None."""
    rows = [float("-inf"), float("nan"), float("inf"), None]
    column = pandas.Series(rows, dtype="object")
    float_field = schema.SchemaField("field_name", "FLOAT64")

    converted = module_under_test.bq_to_arrow_array(column, float_field)
    values = converted.to_pylist()

    assert len(rows) == len(values)
    assert values[0] == float("-inf")
    # Since we are converting from pandas, NaN is treated as NULL in pyarrow
    # due to pandas conventions.
    # https://arrow.apache.org/docs/python/data.html#none-values-and-nan-handling
    assert values[1] is None
    assert values[2] == float("inf")
    assert values[3] is None
def test_dataframe_to_parquet_compression_method(module_under_test):
    """The parquet_compression argument is forwarded to pyarrow's write_table."""
    dataframe = pandas.DataFrame({"field00": ["foo", "bar"]})
    bq_schema = (schema.SchemaField("field00", "STRING"),)

    patch_write_table = mock.patch.object(
        module_under_test.pyarrow.parquet, "write_table", autospec=True
    )
    with patch_write_table as fake_write_table:
        module_under_test.dataframe_to_parquet(
            dataframe, bq_schema, None, parquet_compression="ZSTD"
        )

    # call_args is None if write_table() was never called.
    recorded_call = fake_write_table.call_args
    assert recorded_call is not None
    assert recorded_call.kwargs.get("compression") == "ZSTD"
def bq_to_arrow_data_type(field):
    """Return the Arrow data type, corresponding to a given BigQuery column.

    Args:
        field (google.cloud.bigquery.schema.SchemaField):
            The BigQuery column definition. A ``REPEATED`` mode maps to an
            Arrow list of the element type; a struct type is delegated to
            ``bq_to_arrow_struct_data_type``; any other type is looked up in
            ``BQ_TO_ARROW_SCALARS``.

    Returns:
        The matching Arrow data type, or None if default Arrow type
        inspection should be used (the field type is missing or not
        recognized, or the element type of a repeated field could not be
        determined).
    """
    if field.mode is not None and field.mode.upper() == "REPEATED":
        # Resolve the element type from a copy of the field without the
        # REPEATED mode. The subfields must be carried over, otherwise a
        # repeated struct would lose its members.
        inner_type = bq_to_arrow_data_type(
            schema.SchemaField(field.name, field.field_type, fields=field.fields)
        )
        if inner_type:
            return pyarrow.list_(inner_type)
        return None

    # A field whose type has not been determined yet has ``field_type=None``;
    # treat it as an unknown type instead of failing on ``None.upper()``.
    field_type_upper = field.field_type.upper() if field.field_type else ""

    if field_type_upper in STRUCT_TYPES:
        return bq_to_arrow_struct_data_type(field)

    data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper)
    if data_type_constructor is None:
        return None
    return data_type_constructor()
def test_dataframe_to_arrow_w_required_fields(module_under_test):
    """Every REQUIRED BigQuery field becomes a non-nullable arrow field."""
    bq_schema = (
        schema.SchemaField("field01", "STRING", mode="REQUIRED"),
        schema.SchemaField("field02", "BYTES", mode="REQUIRED"),
        schema.SchemaField("field03", "INTEGER", mode="REQUIRED"),
        schema.SchemaField("field04", "INT64", mode="REQUIRED"),
        schema.SchemaField("field05", "FLOAT", mode="REQUIRED"),
        schema.SchemaField("field06", "FLOAT64", mode="REQUIRED"),
        schema.SchemaField("field07", "NUMERIC", mode="REQUIRED"),
        schema.SchemaField("field08", "BOOLEAN", mode="REQUIRED"),
        schema.SchemaField("field09", "BOOL", mode="REQUIRED"),
        schema.SchemaField("field10", "TIMESTAMP", mode="REQUIRED"),
        schema.SchemaField("field11", "DATE", mode="REQUIRED"),
        schema.SchemaField("field12", "TIME", mode="REQUIRED"),
        schema.SchemaField("field13", "DATETIME", mode="REQUIRED"),
        schema.SchemaField("field14", "GEOGRAPHY", mode="REQUIRED"),
    )

    # One two-row column per schema field.
    timestamps = [
        datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=pytz.utc),
        datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=pytz.utc),
    ]
    naive_datetimes = [
        datetime.datetime(1970, 1, 1, 0, 0, 0),
        datetime.datetime(2012, 12, 21, 9, 7, 42),
    ]
    geographies = [
        "POINT(30 10)",
        "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
    ]
    dataframe = pandas.DataFrame(
        {
            "field01": ["hello", "world"],
            "field02": [b"abd", b"efg"],
            "field03": [1, 2],
            "field04": [3, 4],
            "field05": [1.25, 9.75],
            "field06": [-1.75, -3.5],
            "field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")],
            "field08": [True, False],
            "field09": [False, True],
            "field10": timestamps,
            "field11": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)],
            "field12": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)],
            "field13": naive_datetimes,
            "field14": geographies,
        }
    )

    arrow_schema = module_under_test.dataframe_to_arrow(dataframe, bq_schema).schema

    assert len(arrow_schema) == len(bq_schema)
    assert all(not arrow_field.nullable for arrow_field in arrow_schema)
def dataframe_to_bq_schema(dataframe, bq_schema):
    """Build a BigQuery schema for the columns and indexes of a DataFrame.

    Columns named in ``bq_schema`` use the supplied field. The type of every
    other column is looked up from its pandas dtype and, when that is not
    enough, detected with pyarrow if it is available.

    Args:
        dataframe (pandas.DataFrame):
            DataFrame for which the client determines the BigQuery schema.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A BigQuery schema. Use this argument to override the autodetected
            type for some or all of the DataFrame columns.

    Returns:
        Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]:
            The automatically determined schema. Returns None if the type of
            any column cannot be determined.

    Raises:
        ValueError:
            If ``bq_schema`` contains a field that does not correspond to any
            column of ``dataframe``.
    """
    overrides = {}
    if bq_schema:
        overrides = {
            field.name: field for field in schema._to_schema_fields(bq_schema)
        }
    # Names of the user-supplied fields not yet matched to a column.
    unused_overrides = set(overrides)

    fields_out = []
    unknown_type_fields = []

    for column, dtype in list_columns_and_indexes(dataframe):
        bq_field = overrides.get(column)
        if bq_field:
            # Use provided type from schema, if present.
            unused_overrides.discard(bq_field.name)
            fields_out.append(bq_field)
        else:
            # Otherwise, try to automatically determine the type based on
            # the pandas dtype. A failed lookup leaves the field type unset.
            bq_field = schema.SchemaField(column, _PANDAS_DTYPE_TO_BQ.get(dtype.name))
            fields_out.append(bq_field)
            if bq_field.field_type is None:
                unknown_type_fields.append(bq_field)

    # Catch any schema mismatch. The developer explicitly asked to serialize a
    # column, but it was not found.
    if unused_overrides:
        raise ValueError(
            u"bq_schema contains fields not present in dataframe: {}".format(
                unused_overrides
            )
        )

    # If schema detection was not successful for all columns, also try with
    # pyarrow, if available.
    if unknown_type_fields:
        if not pyarrow:
            unknown_names = ", ".join(field.name for field in unknown_type_fields)
            warnings.warn(
                u"Could not determine the type of columns: {}".format(unknown_names)
            )
            return None  # We cannot detect the schema in full.

        # The augment_schema() helper itself will also issue unknown type
        # warnings if detection still fails for any of the fields.
        fields_out = augment_schema(dataframe, fields_out)

    if not fields_out:
        return None
    return tuple(fields_out)
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    """A REPEATED struct field maps to an arrow list of the matching struct."""
    subfields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    repeated_struct = schema.SchemaField(
        "ignored_name", bq_type, mode="REPEATED", fields=subfields
    )

    actual = module_under_test.bq_to_arrow_data_type(repeated_struct)

    # Expected arrow type of each list element, one member per subfield.
    expected_members = (
        pyarrow.field("field01", pyarrow.string()),
        pyarrow.field("field02", pyarrow.binary()),
        pyarrow.field("field03", pyarrow.int64()),
        pyarrow.field("field04", pyarrow.int64()),
        pyarrow.field("field05", pyarrow.float64()),
        pyarrow.field("field06", pyarrow.float64()),
        pyarrow.field("field07", module_under_test.pyarrow_numeric()),
        pyarrow.field("field08", pyarrow.bool_()),
        pyarrow.field("field09", pyarrow.bool_()),
        pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
        pyarrow.field("field11", pyarrow.date32()),
        pyarrow.field("field12", module_under_test.pyarrow_time()),
        pyarrow.field("field13", module_under_test.pyarrow_datetime()),
        pyarrow.field("field14", pyarrow.string()),
    )
    expected_value_type = pyarrow.struct(expected_members)

    assert pyarrow.types.is_list(actual)
    element_type = actual.value_type
    assert pyarrow.types.is_struct(element_type)
    assert element_type.num_children == len(subfields)
    assert element_type.equals(expected_value_type)