def test_inferred_schema_io():
    """Check that an inferred schema survives a round trip through yaml."""
    frame = pd.DataFrame(
        {
            "column1": [5, 10, 20],
            "column2": [5.0, 1.0, 3.0],
            "column3": ["a", "b", "c"],
        }
    )
    inferred = pa.infer_schema(frame)
    # Serialize to a yaml string, then rebuild a schema from that string.
    round_tripped = io.from_yaml(inferred.to_yaml())
    assert inferred == round_tripped
# Single-row frames; note that "continuous" holds the string "1.0", not a float.
clean_data = pd.DataFrame(
    data={
        "continuous": ["1.0"],
        "categorical": ["A"],
    }
)
supplementary_data = pd.DataFrame(data={"discrete": [1]})
# JoinedData is defined outside this excerpt (presumably a schema that
# validates the joined frame -- confirm). Kept as the trailing expression so
# the notebook displays its result.
JoinedData(clean_data.join(supplementary_data))

# %% [markdown] slideshow={"slide_type": "slide"}
# #### Bootstrap and Interoperate
#
# ##### Infer a schema definition from reference data

# %% tags=[]
# 100 rows: the integers 0..99 next to the 5-letter pattern "ABCAB" repeated
# 20 times and split into single-character labels.
clean_data = pd.DataFrame(
    data={
        "continuous": range(100),
        "categorical": list("ABCAB" * 20),
    }
)

schema = pa.infer_schema(clean_data)
print(schema)

# %% [markdown] slideshow={"slide_type": "slide"}
# ##### Write it to/from a yaml file

# %% tags=[] jupyter={"outputs_hidden": true}
# Called without a target, so the yaml text comes back as the return value.
yaml_schema = schema.to_yaml()
print(yaml_schema)

# %% tags=[] jupyter={"outputs_hidden": true}
# Rebuild a schema from the yaml text produced in the previous cell.
print(schema.from_yaml(yaml_schema))

# %% [markdown] slideshow={"slide_type": "slide"}
# ##### Write it to a python script for further refinement using `schema.to_script()`
#
# For some datasets, it might make sense to infer a schema from a sample of
# data and go from there:

# %% [markdown] slideshow={"slide_type": "notes"}
# Finally, you can even bootstrap a schema from a sample of data because it
# can be tedious to write a schema from scratch. All you have to do is
# call the `infer_schema` function, which you can then write out in yaml format
# or as a python script to further edit and refine.

# %% slideshow={"slide_type": "fragment"}
# raw_data is defined outside this excerpt; it is stripped and parsed as CSV
# text from an in-memory buffer.
raw_df = pd.read_csv(StringIO(raw_data.strip()))
display(raw_df.head(n=3))

# %% slideshow={"slide_type": "fragment"}
schema = pa.infer_schema(raw_df)
# The two calls below only demonstrate the export API: their return values are
# discarded, and the cell's visible output comes from the print that follows.
schema.to_yaml()
schema.to_script()
print(schema)

# %% [markdown] slideshow={"slide_type": "subslide"}
# ## 🪛🪓🪚 Use Cases
#
# - CI tests for ETL/model training pipeline
# - Alerting for dataset shift
# - Monitoring model quality in production

# %% [markdown] slideshow={"slide_type": "notes"}
# To sum up the practical use cases of statistical typing and pandera in
# particular, you can use it in the context of continuous integration tests