def test_merge_error_not_raised():
    m1 = Metadata.from_dict(
        {
            "name": "merge_test",
            "columns": [
                {"name": "c1", "type": "string"},
                {"name": "c2", "type": "int64"},
            ],
            "partitions": ["c2"],
        }
    )
    m2 = Metadata.from_dict(
        {
            "name": "merge_test",
            "columns": [
                {"name": "c2", "type": "int64"},
                {"name": "c3", "type": "string"},
            ],
            "partitions": ["c2"],
        }
    )
    expected = Metadata.from_dict(
        {
            "name": "merge_test",
            "columns": [
                {"name": "c1", "type": "string"},
                {"name": "c2", "type": "int64"},
                {"name": "c3", "type": "string"},
            ],
            "partitions": ["c2"],
        }
    )
    merged = Metadata.merge(m1, m2)
    # Sort each column dict's items so the comparison is deterministic
    # (raw dict.items() views only support set-style comparison)
    assert (
        sorted(sorted(x.items()) for x in merged.columns)
        == sorted(sorted(x.items()) for x in expected.columns)
        and merged.name == expected.name
        and sorted(merged.partitions) == sorted(expected.partitions)
    )
def test_preservation_of_underlying_metadata():
    # Test that additional attributes are preserved on a round trip
    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [{"name": "test", "type": "null"}],
        "primary_key": ["test"],
        "partitions": ["test"],
        "additional-attr": "test",
    }
    meta = Metadata.from_dict(test_dict)
    out_dict = meta.to_dict()
    for k, v in test_dict.items():
        assert out_dict[k] == v

    # Make sure the data is copied and not just a pointer
    assert id(test_dict) != id(meta._data)
    test_dict["columns"] = [{"name": "new_test", "type": "bool_"}]
    assert test_dict["columns"] != meta.columns

    # Assert Metadata instances do not share state
    m1 = Metadata()
    m2 = Metadata()
    assert m1.columns == m2.columns
    m1.columns.append({"name": "new_test", "type": "bool_"})
    assert m1.columns != m2.columns
def test_generate_from_meta(spec_name, serde_name, expected_file_name):
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": spec_name,
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                },
                {"name": "my_double", "type": "float64"},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
        }
    )

    gc = GlueConverter()
    if spec_name == "csv":
        gc.options.set_csv_serde(serde_name)
    if spec_name == "json":
        gc.options.set_json_serde(serde_name)

    opts = GlueConverterOptions(
        default_db_base_path="s3://bucket/", default_db_name="test_db"
    )
    gc_default_opts = GlueConverter(opts)

    table_path = "s3://bucket/test_table"

    # Check explicit arguments and default options produce the same spec
    spec = gc.generate_from_meta(
        md, database_name="test_db", table_location=table_path
    )
    spec_default_opts = gc_default_opts.generate_from_meta(md)
    assert spec == spec_default_opts

    with open(f"tests/data/glue_converter/{expected_file_name}.json") as f:
        expected_spec = json.load(f)
    assert spec == expected_spec
def metadata(self, meta_dict: dict):
    try:
        meta_obj = Metadata.from_dict(meta_dict)
        meta_obj.set_col_type_category_from_types()
        self._metadata = meta_obj.to_dict()
        if "file_format" not in self.metadata:
            raise ValidationError(
                "metadata given must have a file_format property"
            )
    except ValidationError as e:
        error_msg = (
            "Pandas validator requires schemas that conform "
            "to those found in the mojap_metadata package. "
            f"Metadata given failed validation: {str(e)}"
        )
        raise ValidationError(error_msg) from e
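# A minimal sketch of how this setter is exercised. The enclosing validator
# class is not shown in this excerpt, so `PandasValidator` below is an
# assumed (hypothetical) name; any dict that validates against the
# mojap_metadata schemas and carries a "file_format" key should be accepted:
#
#     validator = PandasValidator()  # hypothetical constructor
#     validator.metadata = {
#         "name": "example",
#         "file_format": "csv",
#         "columns": [{"name": "a", "type": "string"}],
#     }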
def test_generate_from_meta():
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": "test-format",
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                    "nullable": False,
                },
                {"name": "my_double", "type": "float64", "nullable": True},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
        }
    )

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)
    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    schema_str1 = (
        "my_int: int64 not null\nmy_double: double\n"
        "my_date: date64[ms]\nmy_decimal: decimal(10, 2)"
    )
    schema_str2 = schema_str1 + "\nmy_timestamp: timestamp[s]"
    assert schema1.to_string() == schema_str1
    assert schema2.to_string() == schema_str2
def test_infer_file_format_from_meta(file_format, expected):
    meta = {
        "name": "test",
        "columns": [{"name": "a", "type": "string"}],
        "file_format": file_format,
    }
    actual = infer_file_format_from_meta(meta)
    assert actual == expected

    metadata = Metadata.from_dict(meta)
    actual = infer_file_format_from_meta(metadata)
    assert actual == expected
def test_from_dict():
    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [{"name": "test", "type": "null"}],
        "primary_key": ["test"],
        "partitions": ["test"],
    }
    meta = Metadata.from_dict(test_dict)
    for k, v in test_dict.items():
        assert getattr(meta, k) == v

    meta.name = "bob"
    assert meta.name == meta._data["name"]
def test_file_reader_works_with_both_meta_types():
    csv_meta = {
        "columns": [
            {"name": "test", "type_category": "string"},
            {"name": "a_column", "type_category": "string"},
        ]
    }
    df_csv1 = reader.csv.read("tests/data/example_data.csv", csv_meta)
    csv_meta = Metadata.from_dict(csv_meta)
    df_csv2 = reader.csv.read("tests/data/example_data.csv", csv_meta)
    assert_frame_equal(df_csv1, df_csv2)

    with pytest.raises(ValidationError):
        m = {"columns": [{"name": "broken", "type": "not-a-type"}]}
        reader.csv.read("tests/data/example_data.jsonl", metadata=m)
def get_meta(ff: str, additional_params: Optional[dict] = None):
    additional_params = {} if not additional_params else additional_params
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": ff,
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                },
                {"name": "my_double", "type": "float64"},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {"name": "my_complex", "type": "large_list<int64>"},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
            **additional_params,
        }
    )
    return md
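# A hedged usage sketch: additional_params lets callers layer extra
# top-level metadata keys onto the base dict via the ** splat above, e.g.
#
#     md = get_meta("parquet", {"description": "a test table"})
#     assert md.description == "a test table"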
def test_to_from_json_yaml(tmpdir, writer):
    path_file = tmpdir.mkdir("test_outputs").join(f"meta.{writer}")

    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [{"name": "test", "type": "null"}],
        "primary_key": ["test"],
        "partitions": ["test"],
    }
    meta = Metadata.from_dict(test_dict)

    # Test writing out and reading back in
    getattr(meta, f"to_{writer}")(str(path_file))
    read_meta = getattr(Metadata, f"from_{writer}")(str(path_file))
    out_dict = read_meta.to_dict()
    for k, v in test_dict.items():
        assert out_dict[k] == v
def cast_pandas_table_to_schema(
    df: pd.DataFrame,
    metadata: Union[Metadata, dict],
    ignore_columns: List = None,
    drop_columns: List = None,
    pd_integer: bool = True,
    pd_string: bool = True,
    pd_boolean: bool = True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    bool_map: Union[Callable, dict] = None,
    num_error_map: dict = None,
):
    """
    Casts the columns of the given dataframe to the types in the given
    metadata. Casting is safest when all column types of the input
    dataframe are strings.

    df: Pandas dataframe
    metadata: Metadata or dict representation of metadata
    ignore_columns: a list of column names not to cast to the metadata.
        These columns are left unchanged.
    drop_columns: removes these columns from the dataframe
    bool_map (Callable, dict, optional): a custom mapping applied to str
        cols before conversion to boolean type,
        e.g. {"Yes": True, "No": False}. If not set, bool values are
        inferred by the _default_str_bool_mapper.
    num_error_map (dict, optional): per-column error behaviour for numeric
        casting, keyed by column name. A column-level "num_errors" value
        in the metadata takes precedence over this map.
    """
    default_num_errors = "raise"
    if ignore_columns is None:
        ignore_columns = []
    if drop_columns is None:
        drop_columns = []

    if isinstance(metadata, Metadata):
        meta = metadata.to_dict()
    elif isinstance(metadata, dict):
        if "columns" not in metadata:
            raise KeyError('metadata missing a "columns" key')
        _ = Metadata.from_dict(metadata)  # Check metadata is valid
        meta = deepcopy(metadata)
    else:
        error_msg = (
            f"Input metadata must be of type Metadata or dict got {type(metadata)}"
        )
        raise ValueError(error_msg)

    df = df.copy()
    all_exclude_cols = ignore_columns + drop_columns
    partitions = meta.get("partitions", [])
    meta_cols_to_convert = [
        c
        for c in meta["columns"]
        if c["name"] not in all_exclude_cols and c["name"] not in partitions
    ]

    for c in meta_cols_to_convert:
        # Fail fast if an expected column is missing from the dataframe
        if c["name"] not in df.columns:
            raise ValueError(f"Column '{c['name']}' not in df")
        else:
            # num_errors comes from either the meta or num_error_map;
            # the meta has precedence
            if c.get("num_errors"):
                num_errors = c.get("num_errors")
            elif isinstance(num_error_map, dict):
                num_errors = num_error_map.get(c["name"], default_num_errors)
            else:
                num_errors = default_num_errors

            df[c["name"]] = cast_pandas_column_to_schema(
                df[c["name"]],
                metacol=c,
                pd_integer=pd_integer,
                pd_string=pd_string,
                pd_boolean=pd_boolean,
                pd_date_type=pd_date_type,
                pd_timestamp_type=pd_timestamp_type,
                num_errors=num_errors,
                bool_map=bool_map,
            )

    final_cols = [
        c["name"]
        for c in meta["columns"]
        if c["name"] not in drop_columns and c["name"] not in partitions
    ]
    df = df[final_cols]
    return df
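# A minimal usage sketch for cast_pandas_table_to_schema, assuming the toy
# data below. The "Yes"/"No" strings are mapped to booleans via the
# optional bool_map argument:
#
#     raw = pd.DataFrame({"a": ["1", "2"], "b": ["Yes", "No"]})
#     meta = {
#         "columns": [
#             {"name": "a", "type": "int64"},
#             {"name": "b", "type": "bool_"},
#         ],
#         "partitions": [],
#     }
#     cast = cast_pandas_table_to_schema(
#         raw, meta, bool_map={"Yes": True, "No": False}
#     )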
def test_generate_from_meta():
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": "test-format",
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                    "nullable": False,
                },
                {"name": "my_double", "type": "float64", "nullable": True},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
        }
    )

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)
    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [
        pa.int64(),
        pa.float64(),
        pa.date64(),
        pa.decimal128(10, 2),
    ]
    assert schema1.names == expected_names
    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))
    assert schema2.names == expected_names
    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"
def test_set_col_types_from_type_category():
    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [
            {"name": "test_null", "type_category": "null"},
            {"name": "test_integer", "type_category": "integer"},
            {"name": "test_float", "type_category": "float"},
            {"name": "test_string", "type_category": "string"},
            {"name": "test_timestamp", "type_category": "timestamp"},
            {"name": "test_binary", "type_category": "binary"},
            {"name": "test_boolean", "type_category": "boolean"},
            {"name": "test_list", "type_category": "list"},
            {"name": "test_struct", "type_category": "struct"},
        ],
    }

    meta = Metadata.from_dict(test_dict)
    with pytest.warns(UserWarning):
        meta.set_col_types_from_type_category()

    for c in meta.columns:
        default_type_cat = c["name"].replace("test_", "")
        expected_type = meta.default_type_category_lookup.get(default_type_cat)
        assert c["type"] == expected_type

    new_dict = {
        "null": "null",
        "integer": "uint8",
        "float": "decimal128(2,5)",
        "string": "large_string",
        "timestamp": "timestamp(us)",
        "binary": "large_binary",
        "boolean": "bool_",
        "list": "large_list<null>",
        "struct": "map_<null>",
    }
    meta2 = Metadata.from_dict(test_dict)
    meta2.set_col_types_from_type_category(
        lambda x: new_dict.get(x["type_category"])
    )
    for c in meta2.columns:
        default_type_cat = c["name"].replace("test_", "")
        assert c["type"] == new_dict.get(default_type_cat)
def test_generate_from_meta():
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": "csv",
            "description": "A test table",
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                    "nullable": False,
                    "minimum": 10,
                },
                {"name": "my_double", "type": "float64", "nullable": True},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {"name": "my_string", "type": "string", "enum": ["cat", "dog"]},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
        }
    )

    expected1 = {
        "$schema": "https://moj-analytical-services.github.io/metadata_schema/table/v1.4.0.json",  # noqa: E501
        "name": "test_table",
        "data_format": "csv",
        "location": "test_table/",
        "description": "A test table",
        "columns": [
            {
                "name": "my_int",
                "type": "long",
                "description": "This is an integer",
                "nullable": False,
                "minimum": 10,
            },
            {
                "name": "my_double",
                "type": "double",
                "nullable": True,
                "description": "",
            },
            {"name": "my_date", "type": "date", "description": ""},
            {"name": "my_decimal", "type": "decimal(10,2)", "description": ""},
            {
                "name": "my_string",
                "type": "character",
                "enum": ["cat", "dog"],
                "description": "",
            },
            {
                "name": "my_timestamp",
                "type": "datetime",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    }

    emc = EtlManagerConverter()
    assert isinstance(emc.options, BaseConverterOptions)

    etl1 = emc.generate_from_meta(md)
    assert isinstance(etl1, TableMeta)
    etl1 = etl1.to_dict()
    assert etl1 == expected1

    expected2 = copy.deepcopy(expected1)
    # Remove additional cols not native to etl_manager
    del expected2["columns"][0]["minimum"]
    etl2 = emc.generate_from_meta(md, include_extra_column_params=False).to_dict()
    assert etl2 == expected2

    # Check table_location works
    expected3 = copy.deepcopy(expected1)
    expected3["location"] = "some/new/tablepath/"
    etl3 = emc.generate_from_meta(md, table_location="some/new/tablepath/").to_dict()
    assert etl3 == expected3

    # Check file_format_mapper
    mapper = {"csv": "csv_quoted_nodate"}.get
    expected4 = copy.deepcopy(expected1)
    expected4["data_format"] = "csv_quoted_nodate"
    etl4 = emc.generate_from_meta(md, file_format_mapper=mapper).to_dict()
    assert etl4 == expected4

    # Check glue_specific
    expected5 = copy.deepcopy(expected1)
    gs = {
        "Parameters": {"skip.header.line.count": "1"},
        "StorageDescriptor": {"Parameters": {"skip.header.line.count": "1"}},
    }
    expected5["glue_specific"] = gs
    etl5 = emc.generate_from_meta(md, glue_specific=gs).to_dict()
    assert etl5 == expected5
        0.0,
        [],
        (),
    ],
)
def test_inferred_input_fails(fake_input):
    with pytest.raises(TypeError):
        Metadata.from_infer(fake_input)


merge_meta_test = Metadata.from_dict(
    {
        "name": "merge_test",
        "columns": [
            {"name": "c1", "type": "int64"},
            {"name": "c2", "type": "string"},
        ],
        "partitions": ["c1"],
    }
)

merge_meta_diff_col_type = Metadata.from_dict(
    {
        "name": "merge_test",
        "columns": [
            {"name": "c3", "type": "string"},
            {
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns a dataframe
    """
    meta_col_names = [
        c["name"]
        for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file types, make the arrow readers read date/timestamp
    # columns in as strings. Validators will still treat these columns as
    # dates but will run validation against the strings, expecting values to
    # match a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)
    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:
            # Safer CSV load with newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(
                    newlines_in_values=True, column_names=meta_col_names
                )

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:
            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )
            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}."
            )

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
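# A hedged usage sketch for _parse_data_to_pandas. The table_params keys
# shown are the ones this function actually inspects; the file path and
# metadata values are illustrative only:
#
#     df = _parse_data_to_pandas(
#         filepath="s3://my-bucket/data/table.csv",   # or a local path
#         table_params={
#             "expect-header": True,               # CSV header handling
#             "row-limit": 1000,                   # sample this many rows
#             "headers-ignore-case": True,         # lower-case column names
#             "only-test-cols-in-metadata": True,  # drop cols not in metadata
#         },
#         metadata={
#             "file_format": "csv",
#             "columns": [{"name": "a", "type": "string"}],
#         },
#     )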