Example #1
def test_merge_error_not_raised():
    m1 = Metadata.from_dict({
        "name":
        "merge_test",
        "columns": [
            {
                "name": "c1",
                "type": "string"
            },
            {
                "name": "c2",
                "type": "int64"
            },
        ],
        "partitions": ["c2"],
    })
    m2 = Metadata.from_dict({
        "name":
        "merge_test",
        "columns": [
            {
                "name": "c2",
                "type": "int64"
            },
            {
                "name": "c3",
                "type": "string"
            },
        ],
        "partitions": ["c2"],
    })
    expected = Metadata.from_dict({
        "name":
        "merge_test",
        "columns": [
            {
                "name": "c1",
                "type": "string"
            },
            {
                "name": "c2",
                "type": "int64"
            },
            {
                "name": "c3",
                "type": "string"
            },
        ],
        "partitions": ["c2"],
    })
    merged = Metadata.merge(m1, m2)
    assert (sorted(x.items() for x in merged.columns) == sorted(
        x.items() for x in expected.columns) and merged.name == expected.name
            and sorted(merged.partitions) == sorted(expected.partitions))
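A minimal non-test sketch of the same merge behaviour. The import path is an assumption (the test above does not show it); everything else only uses calls already demonstrated in Example #1.

from mojap_metadata import Metadata  # assumed import path

m1 = Metadata.from_dict({
    "name": "merge_test",
    "columns": [{"name": "c1", "type": "string"}, {"name": "c2", "type": "int64"}],
    "partitions": ["c2"],
})
m2 = Metadata.from_dict({
    "name": "merge_test",
    "columns": [{"name": "c2", "type": "int64"}, {"name": "c3", "type": "string"}],
    "partitions": ["c2"],
})

# Columns from both inputs are combined; shared columns appear once
merged = Metadata.merge(m1, m2)
print(sorted(c["name"] for c in merged.columns))  # ['c1', 'c2', 'c3']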
Example #2
def test_preservation_of_underlying_metadata():

    # Test if additional data is preserved
    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [{
            "name": "test",
            "type": "null"
        }],
        "primary_key": ["test"],
        "partitions": ["test"],
        "additional-attr": "test",
    }
    meta = Metadata.from_dict(test_dict)
    out_dict = meta.to_dict()
    for k, v in test_dict.items():
        assert out_dict[k] == v

    # make sure data is copied and not just a pointer
    assert id(test_dict) != id(meta._data)

    test_dict["columns"] = [{"name": "new_test", "type": "bool_"}]
    assert test_dict != meta.columns

    # Assert Metadata instances are different
    m1 = Metadata()
    m2 = Metadata()

    assert m1.columns == m2.columns

    m1.columns.append({"name": "new_test", "type": "bool_"})
    assert m1.columns != m2.columns
Example #3
def test_generate_from_meta(spec_name, serde_name, expected_file_name):
    md = Metadata.from_dict({
        "name":
        "test_table",
        "file_format":
        spec_name,
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
            },
            {
                "name": "my_double",
                "type": "float64"
            },
            {
                "name": "my_date",
                "type": "date64"
            },
            {
                "name": "my_decimal",
                "type": "decimal128(10,2)"
            },
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    gc = GlueConverter()
    if spec_name == "csv":
        gc.options.set_csv_serde(serde_name)

    if spec_name == "json":
        gc.options.set_json_serde(serde_name)

    opts = GlueConverterOptions(default_db_base_path="s3://bucket/",
                                default_db_name="test_db")

    gc_default_opts = GlueConverter(opts)

    table_path = "s3://bucket/test_table"

    # DO DICT TEST
    spec = gc.generate_from_meta(md,
                                 database_name="test_db",
                                 table_location=table_path)
    spec_default_opts = gc_default_opts.generate_from_meta(md)
    assert spec == spec_default_opts

    with open(f"tests/data/glue_converter/{expected_file_name}.json") as f:
        expected_spec = json.load(f)

    assert spec == expected_spec
Example #4
    def metadata(self, meta_dict: dict):
        try:
            meta_obj = Metadata.from_dict(meta_dict)
            meta_obj.set_col_type_category_from_types()
            self._metadata = meta_obj.to_dict()

            if "file_format" not in self.metadata:
                raise ValidationError(
                    "metadata given must have a file_format property")
        except ValidationError as e:
            error_msg = ("Pandas validator requires schemas that conform "
                         "to those found in the mojap_metadata package. "
                         f"Metadata given failed validation: {str(e)}")
            raise ValidationError(error_msg)
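The setter above is shown without its surrounding class. A hedged sketch of how it could be wired up with a matching property follows; the class name is hypothetical and the setter body is elided rather than restated.

class PandasValidator:  # hypothetical name, not taken from the source
    def __init__(self, meta_dict: dict):
        self._metadata = None
        self.metadata = meta_dict  # routes through the setter shown above

    @property
    def metadata(self) -> dict:
        # read by the setter via `self.metadata` when checking file_format
        return self._metadata

    @metadata.setter
    def metadata(self, meta_dict: dict):
        ...  # body as in the excerpt above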
Example #5
def test_generate_from_meta():
    md = Metadata.from_dict({
        "name":
        "test_table",
        "file_format":
        "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {
                "name": "my_double",
                "type": "float64",
                "nullable": True
            },
            {
                "name": "my_date",
                "type": "date64"
            },
            {
                "name": "my_decimal",
                "type": "decimal128(10,2)"
            },
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    schema_str1 = ("my_int: int64 not null\nmy_double: double\n"
                   "my_date: date64[ms]\nmy_decimal: decimal(10, 2)")
    schema_str2 = schema_str1 + "\nmy_timestamp: timestamp[s]"
    assert schema1.to_string() == schema_str1
    assert schema2.to_string() == schema_str2
Example #6
def test_infer_file_format_from_meta(file_format, expected):
    meta = {
        "name": "test",
        "columns": [{
            "name": "a",
            "type": "string"
        }],
        "file_format": file_format,
    }
    actual = infer_file_format_from_meta(meta)
    assert actual == expected

    metadata = Metadata.from_dict(meta)
    actual = infer_file_format_from_meta(metadata)
    assert actual == expected
Example #7
def test_from_dict():
    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [{
            "name": "test",
            "type": "null"
        }],
        "primary_key": ["test"],
        "partitions": ["test"],
    }
    meta = Metadata.from_dict(test_dict)

    for k, v in test_dict.items():
        assert getattr(meta, k) == v

    meta.name = "bob"
    assert meta.name == meta._data["name"]
Example #8
def test_file_reader_works_with_both_meta_types():
    csv_meta = {
        "columns": [
            {
                "name": "test",
                "type_category": "string"
            },
            {
                "name": "a_column",
                "type_category": "string"
            },
        ]
    }

    df_csv1 = reader.csv.read("tests/data/example_data.csv", csv_meta)
    csv_meta = Metadata.from_dict(csv_meta)
    df_csv2 = reader.csv.read("tests/data/example_data.csv", csv_meta)
    assert_frame_equal(df_csv1, df_csv2)
    with pytest.raises(ValidationError):
        m = {"columns": [{"name": "broken", "type": "not-a-type"}]}
        reader.csv.read("tests/data/example_data.jsonl", metadata=m)
Example #9
def get_meta(ff: str, additional_params: Optional[dict] = None):
    additional_params = {} if not additional_params else additional_params
    md = Metadata.from_dict({
        "name":
        "test_table",
        "file_format":
        ff,
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
            },
            {
                "name": "my_double",
                "type": "float64"
            },
            {
                "name": "my_date",
                "type": "date64"
            },
            {
                "name": "my_decimal",
                "type": "decimal128(10,2)"
            },
            {
                "name": "my_complex",
                "type": "large_list<int64>"
            },
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
        **additional_params,
    })

    return md
Example #10
def test_to_from_json_yaml(tmpdir, writer):
    path_file = tmpdir.mkdir("test_outputs").join(f"meta.{writer}")

    test_dict = {
        "name": "test",
        "description": "test",
        "file_format": "test",
        "sensitive": False,
        "columns": [{
            "name": "test",
            "type": "null"
        }],
        "primary_key": ["test"],
        "partitions": ["test"],
    }
    meta = Metadata.from_dict(test_dict)

    # test in/out reader
    getattr(meta, f"to_{writer}")(str(path_file))
    read_meta = getattr(Metadata, f"from_{writer}")(str(path_file))
    out_dict = read_meta.to_dict()
    for k, v in test_dict.items():
        assert out_dict[k] == v
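A plain (non-pytest) sketch of the same round trip, assuming the parametrised writer takes values such as "json", so that the methods resolved by to_{writer}/from_{writer} are to_json and from_json:

meta = Metadata.from_dict({
    "name": "test",
    "columns": [{"name": "test", "type": "null"}],
})
meta.to_json("meta.json")                        # write the schema to disk
round_tripped = Metadata.from_json("meta.json")  # read it back
assert round_tripped.to_dict()["name"] == "test"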
Example #11
def cast_pandas_table_to_schema(
    df: pd.DataFrame,
    metadata: Union[Metadata, dict],
    ignore_columns: List = None,
    drop_columns: List = None,
    pd_integer: bool = True,
    pd_string: bool = True,
    pd_boolean: bool = True,
    pd_date_type: str = "datetime_object",
    pd_timestamp_type: str = "datetime_object",
    bool_map: Union[Callable, dict] = None,
    num_error_map: dict = None,
):
    """
    Casts the columns in dataframe provided to the meta data dictionary provided.
    Safest casting occurs when all coltypes of input dataframe are strings.

    df: Pandas dataframe
    meta: Metadata or dict representation of metadata
    ignore_columns: a list of column names to not cast to the meta data dictionary.
        These columns are remained unchanged.
    drop_columns: Removes these columns from the dataframe
    bool_map (Callable, dict, optional): A custom mapping function that is applied
        to str cols to be converted to booleans before conversion to boolean type.
        e.g. {"Yes": True, "No": False}. If not set bool values are inferred by the
        _default_str_bool_mapper.
    """

    default_num_errors = "raise"

    if ignore_columns is None:
        ignore_columns = []

    if drop_columns is None:
        drop_columns = []

    if isinstance(metadata, Metadata):
        meta = metadata.to_dict()
    elif isinstance(metadata, dict):
        if "columns" not in metadata:
            raise KeyError('metadata missing a "columns" key')

        _ = Metadata.from_dict(metadata)  # Check metadata is valid
        meta = deepcopy(metadata)
    else:
        error_msg = ("Input metadata must be of type Metadata "
                     f"or dict got {type(metadata)}")
        raise ValueError(error_msg)
    df = df.copy()

    all_exclude_cols = ignore_columns + drop_columns
    meta_cols_to_convert = [
        c for c in meta["columns"]
        if c["name"] not in all_exclude_cols
        and c["name"] not in meta["partitions"]
    ]

    for c in meta_cols_to_convert:
        # Fail fast if a metadata column is missing from the dataframe
        if c["name"] not in df.columns:
            raise ValueError(f"Column '{c['name']}' not in df")

        else:
            # must get num_errors from either meta or num_error_map. Meta has precedence
            if c.get("num_errors"):
                num_errors = c.get("num_errors")
            elif isinstance(num_error_map, dict):
                num_errors = num_error_map.get(c["name"])
            else:
                num_errors = default_num_errors

            df[c["name"]] = cast_pandas_column_to_schema(
                df[c["name"]],
                metacol=c,
                pd_integer=pd_integer,
                pd_string=pd_string,
                pd_boolean=pd_boolean,
                pd_date_type=pd_date_type,
                pd_timestamp_type=pd_timestamp_type,
                num_errors=num_errors,
                bool_map=bool_map,
            )

    final_cols = [
        c["name"] for c in meta["columns"]
        if c["name"] not in drop_columns and c["name"] not in meta["partitions"]
    ]
    df = df[final_cols]

    return df
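A minimal usage sketch for the function above. The dataframe, metadata, and bool_map values are illustrative, and it assumes cast_pandas_table_to_schema and pandas are importable in the current scope.

import pandas as pd

raw = pd.DataFrame({"my_int": ["1", "2"], "my_bool": ["Yes", "No"]})
meta = {
    "columns": [
        {"name": "my_int", "type": "int64"},
        {"name": "my_bool", "type": "bool_"},
    ],
    "partitions": [],
}

cast_df = cast_pandas_table_to_schema(
    raw,
    meta,
    bool_map={"Yes": True, "No": False},  # custom string-to-boolean mapping
)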
Example #12
def test_generate_from_meta():
    md = Metadata.from_dict({
        "name":
        "test_table",
        "file_format":
        "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {
                "name": "my_double",
                "type": "float64",
                "nullable": True
            },
            {
                "name": "my_date",
                "type": "date64"
            },
            {
                "name": "my_decimal",
                "type": "decimal128(10,2)"
            },
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [
        pa.int64(),
        pa.float64(),
        pa.date64(),
        pa.decimal128(10, 2)
    ]
    assert schema1.names == expected_names

    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))

    assert schema2.names == expected_names

    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"
Example #13
def test_set_col_types_from_type_category():
    test_dict = {
        "name":
        "test",
        "description":
        "test",
        "file_format":
        "test",
        "sensitive":
        False,
        "columns": [
            {
                "name": "test_null",
                "type_category": "null"
            },
            {
                "name": "test_integer",
                "type_category": "integer"
            },
            {
                "name": "test_float",
                "type_category": "float"
            },
            {
                "name": "test_string",
                "type_category": "string"
            },
            {
                "name": "test_timestamp",
                "type_category": "timestamp"
            },
            {
                "name": "test_binary",
                "type_category": "binary"
            },
            {
                "name": "test_boolean",
                "type_category": "boolean"
            },
            {
                "name": "test_list",
                "type_category": "list"
            },
            {
                "name": "test_struct",
                "type_category": "struct"
            },
        ],
    }
    meta = Metadata.from_dict(test_dict)
    with pytest.warns(UserWarning):
        meta.set_col_types_from_type_category()

    for c in meta.columns:
        default_type_cat = c["name"].replace("test_", "")
        expected_type = meta.default_type_category_lookup.get(default_type_cat)
        assert c["type"] == expected_type

    new_dict = {
        "null": "null",
        "integer": "uint8",
        "float": "decimal128(2,5)",
        "string": "large_string",
        "timestamp": "timestamp(us)",
        "binary": "large_binary",
        "boolean": "bool_",
        "list": "large_list<null>",
        "struct": "map_<null>",
    }

    meta2 = Metadata.from_dict(test_dict)
    meta2.set_col_types_from_type_category(
        lambda x: new_dict.get(x["type_category"]))

    for c in meta2.columns:
        default_type_cat = c["name"].replace("test_", "")
        assert c["type"] == new_dict.get(default_type_cat)
Example #14
def test_generate_from_meta():
    md = Metadata.from_dict(
        {
            "name": "test_table",
            "file_format": "csv",
            "description": "A test table",
            "columns": [
                {
                    "name": "my_int",
                    "type": "int64",
                    "description": "This is an integer",
                    "nullable": False,
                    "minimum": 10,
                },
                {"name": "my_double", "type": "float64", "nullable": True},
                {"name": "my_date", "type": "date64"},
                {"name": "my_decimal", "type": "decimal128(10,2)"},
                {"name": "my_string", "type": "string", "enum": ["cat", "dog"]},
                {
                    "name": "my_timestamp",
                    "type": "timestamp(s)",
                    "description": "Partition column",
                },
            ],
            "partitions": ["my_timestamp"],
        }
    )

    expected1 = {
        "$schema": "https://moj-analytical-services.github.io/metadata_schema/table/v1.4.0.json",  # noqa: 401
        "name": "test_table",
        "data_format": "csv",
        "location": "test_table/",
        "description": "A test table",
        "columns": [
            {
                "name": "my_int",
                "type": "long",
                "description": "This is an integer",
                "nullable": False,
                "minimum": 10,
            },
            {
                "name": "my_double",
                "type": "double",
                "nullable": True,
                "description": "",
            },
            {"name": "my_date", "type": "date", "description": ""},
            {"name": "my_decimal", "type": "decimal(10,2)", "description": ""},
            {
                "name": "my_string",
                "type": "character",
                "enum": ["cat", "dog"],
                "description": "",
            },
            {
                "name": "my_timestamp",
                "type": "datetime",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    }
    emc = EtlManagerConverter()
    assert isinstance(emc.options, BaseConverterOptions)

    etl1 = emc.generate_from_meta(md)
    assert isinstance(etl1, TableMeta)

    etl1 = etl1.to_dict()
    assert etl1 == expected1

    expected2 = copy.deepcopy(expected1)

    # Remove additional cols not native to etl_manager
    del expected2["columns"][0]["minimum"]

    etl2 = emc.generate_from_meta(md, include_extra_column_params=False).to_dict()
    assert etl2 == expected2

    # Check table_location works
    expected3 = copy.deepcopy(expected1)
    expected3["location"] = "some/new/tablepath/"
    etl3 = emc.generate_from_meta(md, table_location="some/new/tablepath/").to_dict()
    assert etl3 == expected3

    # Check file_format_mapper
    mapper = {"csv": "csv_quoted_nodate"}.get
    expected4 = copy.deepcopy(expected1)
    expected4["data_format"] = "csv_quoted_nodate"
    etl4 = emc.generate_from_meta(md, file_format_mapper=mapper).to_dict()
    assert etl4 == expected4

    # Check glue_specific
    mapper = {"csv": "csv_quoted_nodate"}.get
    expected5 = copy.deepcopy(expected1)
    gs = {
        "Parameters": {"skip.header.line.count": "1"},
        "StorageDescriptor": {"Parameters": {"skip.header.line.count": "1"}},
    }
    expected5["glue_specific"] = gs
    etl5 = emc.generate_from_meta(md, glue_specific=gs).to_dict()
    assert etl5 == expected5
Example #15
        0.0,
        [],
        (),
    ],
)
def test_inferred_input_fails(fake_input):
    with pytest.raises(TypeError):
        Metadata.from_infer(fake_input)


merge_meta_test = Metadata.from_dict({
    "name":
    "merge_test",
    "columns": [{
        "name": "c1",
        "type": "int64"
    }, {
        "name": "c2",
        "type": "string"
    }],
    "partitions": ["c1"],
})

merge_meta_diff_col_type = Metadata.from_dict({
    "name":
    "merge_test",
    "columns": [
        {
            "name": "c3",
            "type": "string"
        },
        {
Example #16
def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads in the data from the given filepath and returns
    a dataframe
    """

    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file formats, make the arrow readers read date/timestamp
    # columns in as strings. Validators will still treat these as dates but will
    # run the validation against string columns, expecting values to match a
    # timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Set the reader type
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Safer CSV load for newlines_in_values set to True
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
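A hedged sketch of how the helper above might be invoked. The file path is hypothetical and the table_params keys simply mirror the ones the function reads; nothing here is taken from the source beyond the call signature.

metadata = {
    "name": "example",
    "file_format": "csv",
    "columns": [
        {"name": "id", "type": "int64"},
        {"name": "created_at", "type": "timestamp(s)"},
    ],
}
table_params = {
    "expect-header": True,           # header row present, so names come from the file
    "headers-ignore-case": True,     # lower-case column names after reading
    "only-test-cols-in-metadata": False,
}

df = _parse_data_to_pandas("tests/data/example.csv", table_params, metadata)  # hypothetical path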