Esempio n. 1
0
def test_inferred_schema_io():
    """Check that an inferred schema survives a round trip through YAML."""
    frame = pd.DataFrame(
        {
            "column1": [5, 10, 20],
            "column2": [5.0, 1.0, 3.0],
            "column3": ["a", "b", "c"],
        }
    )
    inferred = pa.infer_schema(frame)
    # Serialize to a YAML string, then parse that string back into a schema;
    # the two schema objects must compare equal.
    restored = io.from_yaml(inferred.to_yaml())
    assert inferred == restored
# Two single-row frames sharing the default index. Note that "continuous"
# holds the string "1.0", not a float.
clean_data = pd.DataFrame(dict(continuous=["1.0"], categorical=["A"]))
supplementary_data = pd.DataFrame(dict(discrete=[1]))
# JoinedData is defined outside this excerpt.
# NOTE(review): presumably a schema/model that validates the joined frame --
# confirm against its definition.
JoinedData(clean_data.join(other=supplementary_data))

# %% [markdown] slideshow={"slide_type": "slide"}
# #### Bootstrap and Interoperate
#
# ##### Infer a schema definition from reference data

# %% tags=[]
# Reference data: 100 rows, with the letters of "ABCAB" repeated 20 times
# as the categorical column.
clean_data = pd.DataFrame(
    {"continuous": range(100), "categorical": list("ABCAB" * 20)}
)

# Infer a schema from the reference frame and print it.
schema = pa.infer_schema(clean_data)
print(schema)

# %% [markdown] slideshow={"slide_type": "slide"}
# ##### Write it to, or read it from, a YAML file

# %% tags=[] jupyter={"outputs_hidden": true}
# Serialize the inferred schema and print the resulting YAML.
yaml_schema = schema.to_yaml()
print(yaml_schema)

# %% tags=[] jupyter={"outputs_hidden": true}
# Rebuild a schema from the YAML produced above and print it, completing
# the round trip.
print(schema.from_yaml(yaml_schema))

# %% [markdown] slideshow={"slide_type": "slide"}
# ##### Write it to a Python script for further refinement using `schema.to_script()`
#
# For some datasets, it might make sense to infer a schema from a sample of
# data and go from there:

# %% [markdown] slideshow={"slide_type": "notes"}
# Finally, because writing a schema from scratch can be tedious, you can
# bootstrap one from a sample of data. All you have to do is call the
# `infer_schema` function; you can then write the result out in YAML format
# or as a Python script to further edit and refine.

# %% slideshow={"slide_type": "fragment"}
# Parse the CSV text held in `raw_data` (defined outside this excerpt),
# stripping surrounding whitespace first, and preview the first three rows.
raw_df = pd.read_csv(StringIO(raw_data.strip()))
display(raw_df.head(3))

# %% slideshow={"slide_type": "fragment"}
# Infer a schema from the raw sample.
schema = pa.infer_schema(raw_df)
# The return values of the next two calls are discarded; only the final
# print produces output in this cell.
# NOTE(review): called without arguments these presumably return the YAML /
# script text rather than writing a file -- confirm before relying on it.
schema.to_yaml()
schema.to_script()
print(schema)


# %% [markdown] slideshow={"slide_type": "subslide"}
# ## 🪛🪓🪚 Use Cases
#
# - CI tests for ETL/model training pipeline
# - Alerting for dataset shift
# - Monitoring model quality in production

# %% [markdown] slideshow={"slide_type": "notes"}
# To sum up the practical use cases of statistical typing and pandera in
# particular, you can use it in the context of continuous integration tests