def test_set_col_type_category_from_types(col_input: Any, expected_cat: str):
    meta = Metadata(columns=col_input)
    meta.set_col_type_category_from_types()
    assert meta.columns[0]["type_category"] == expected_cat
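
# Illustrative parametrize cases for the test above. The original parametrize
# list was not recoverable, so these input/expected pairs are assumptions
# based on the type_category values used elsewhere in this file (integer,
# string, boolean, etc.), not the author's actual cases:
_example_type_category_cases = [
    ([{"name": "a", "type": "int64"}], "integer"),
    ([{"name": "a", "type": "string"}], "string"),
    ([{"name": "a", "type": "bool_"}], "boolean"),
]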
def test_unpack_complex_data_type(data_type, expected):
    meta = Metadata()
    assert _unpack_complex_data_type(data_type) == expected
    assert meta.unpack_complex_data_type(data_type) == expected
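
# A sketch of plausible cases for test_unpack_complex_data_type. These assume
# simple type names unpack to themselves and complex types (list/struct)
# unpack into nested dicts -- the exact return shape is an assumption, not
# taken from the original parametrize list:
_example_unpack_cases = [
    ("int64", "int64"),
    ("list<int64>", {"list": "int64"}),
    ("struct<a:int64, b:string>", {"struct": {"a": "int64", "b": "string"}}),
]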
def test_set_col_types_from_type_category():
    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [
            {"name": "test_null", "type_category": "null"},
            {"name": "test_integer", "type_category": "integer"},
            {"name": "test_float", "type_category": "float"},
            {"name": "test_string", "type_category": "string"},
            {"name": "test_timestamp", "type_category": "timestamp"},
            {"name": "test_binary", "type_category": "binary"},
            {"name": "test_boolean", "type_category": "boolean"},
            {"name": "test_list", "type_category": "list"},
            {"name": "test_struct", "type_category": "struct"},
        ],
    }

    meta = Metadata.from_dict(test_dict)
    with pytest.warns(UserWarning):
        meta.set_col_types_from_type_category()

    for c in meta.columns:
        default_type_cat = c["name"].replace("test_", "")
        expected_type = meta.default_type_category_lookup.get(default_type_cat)
        assert c["type"] == expected_type

    new_dict = {
        "null": "null",
        "integer": "uint8",
        "float": "decimal128(2,5)",
        "string": "large_string",
        "timestamp": "timestamp(us)",
        "binary": "large_binary",
        "boolean": "bool_",
        "list": "large_list<null>",
        "struct": "map_<null>",
    }
    meta2 = Metadata.from_dict(test_dict)
    meta2.set_col_types_from_type_category(
        lambda x: new_dict.get(x["type_category"])
    )
    for c in meta2.columns:
        default_type_cat = c["name"].replace("test_", "")
        assert c["type"] == new_dict.get(default_type_cat)
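
# For reference: default_type_category_lookup (used above) maps each
# type_category to a default concrete type. The real mapping lives on the
# Metadata class; the values sketched here are assumptions for illustration
# only, not the library's actual defaults:
_example_default_lookup = {
    "integer": "int64",
    "float": "float64",
    "string": "string",
    "boolean": "bool_",
}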
def test_columns_pass(col_input: Any):
    Metadata(columns=col_input)
def test_columns_default():
    metadata = Metadata()
    assert metadata.columns == []
def test_data_override_merge(m1, m2, data, expected_name):
    assert Metadata.merge(m1, m2, data_override=data).name == expected_name
def test_columns_validation_error(col_input: Any):
    metadata = Metadata()
    with pytest.raises(ValidationError):
        metadata.columns = col_input
def test_cols_merge(m1, m2, expected_cols):
    # Sort each column dict's items so the comparison is order-independent
    # (sorting raw dict_items views relies on set-like subset comparison,
    # which is not a total order)
    assert sorted(sorted(x.items()) for x in Metadata.merge(m1, m2).columns) == sorted(
        sorted(x.items()) for x in expected_cols
    )
def test_params_merge(m1, m2, expected_partitions):
    assert Metadata.merge(m1, m2).partitions == expected_partitions
def test_merge_error_raised(m1, m2):
    with pytest.raises(ValueError):
        Metadata.merge(m1, m2, mismatch="error")
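
# A minimal, self-contained sketch of the merge behaviour the tests above
# exercise, assuming Metadata.from_dict / Metadata.merge as used in this
# file (the names and columns here are illustrative only):
def _example_merge_usage():
    a = Metadata.from_dict(
        {"name": "a", "columns": [{"name": "c1", "type": "int64"}]}
    )
    b = Metadata.from_dict(
        {"name": "a", "columns": [{"name": "c2", "type": "string"}]}
    )
    # mismatch="error" (tested above) would instead raise ValueError if the
    # two schemas disagreed on a shared column
    merged = Metadata.merge(a, b)
    return [c["name"] for c in merged.columns]  # expected: ["c1", "c2"]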
def test_inferred_input_fails(fake_input):
    with pytest.raises(TypeError):
        Metadata.from_infer(fake_input)
def test_inferred_input_passes(monkeypatch, patch_out, fake_input):
    monkeypatch.setattr(Metadata, patch_out, lambda x: True)
    assert Metadata.from_infer(fake_input)
merge_meta_test = Metadata.from_dict({
    "name": "merge_test",
    "columns": [
        {"name": "c1", "type": "int64"},
        {"name": "c2", "type": "string"},
    ],
    "partitions": ["c1"],
})

merge_meta_diff_col_type = Metadata.from_dict({
    "name": "merge_test",
    "columns": [
        {"name": "c3", "type": "string"},
        {
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns a dataframe
    """
    meta_col_names = [
        c["name"]
        for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read date/timestamp
    # columns in as strings. Validators will still treat these columns as
    # dates, but will run validation against strings, expecting values to
    # match a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:
            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(
                    newlines_in_values=True, column_names=meta_col_names
                )

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )
            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}."
            )

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
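
# Example invocation of _parse_data_to_pandas. This is a sketch only: the
# path, metadata and table_params below are illustrative, though the
# table_params keys match those read by the function above ("expect-header",
# "headers-ignore-case", "row-limit", "only-test-cols-in-metadata"):
def _example_parse_usage():
    metadata = {
        "name": "example",
        "file_format": "csv",
        "columns": [
            {"name": "id", "type": "int64"},
            {"name": "created", "type": "timestamp(ms)"},  # forced to string on read
        ],
    }
    table_params = {"expect-header": True, "headers-ignore-case": True}
    return _parse_data_to_pandas("data/example.csv", table_params, metadata)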