def test_set_variable():
    """Check that ``set_variable`` changes are visible through ``config``.

    Exercises three ways of setting a value: a top-level key, a dotted
    nested key, and a nested section passed as a dictionary.
    """

    def pool_size():
        # Look the value up afresh on every call.
        return config["pool_size"].get(int)

    def minify_html():
        # Look the value up afresh on every call.
        return config["html"]["minify_html"].get(bool)

    report = ProfileReport(pool_size=3)

    # State right after construction with the keyword argument.
    assert pool_size() == 3
    assert minify_html()

    # Top-level key.
    report.set_variable("pool_size", 1)
    assert pool_size() == 1

    # Nested key addressed with dotted notation.
    report.set_variable("html.minify_html", False)
    assert not minify_html()

    # Nested key addressed by passing a dictionary for the section.
    report.set_variable("html", {"minify_html": True})
    assert minify_html()
def test_config_shorthands():
    """Check the ``None`` shorthand for the four configuration groups.

    The shorthand is applied twice: first through ``ProfileReport`` keyword
    arguments, then through ``set_variable`` on a report built with no
    arguments. The same set of ``config`` assertions is run after each.
    """

    def assert_shorthands_applied():
        assert config["samples"]["head"].get(int) == 0
        assert config["samples"]["tail"].get(int) == 0
        assert config["duplicates"]["head"].get(int) == 0
        assert not config["correlations"]["spearman"]["calculate"].get(bool)
        assert not config["missing_diagrams"]["bar"].get(bool)

    # Shorthands given as constructor keyword arguments.
    report = ProfileReport(
        samples=None,
        correlations=None,
        missing_diagrams=None,
        duplicates=None,
    )
    assert_shorthands_applied()

    # Shorthands given one at a time through set_variable.
    report = ProfileReport()
    for group in ("samples", "duplicates", "correlations", "missing_diagrams"):
        report.set_variable(group, None)
    assert_shorthands_applied()
"hours-per-week", "native-country", ], ) # Prepare missing values df = df.replace("\\?", np.nan, regex=True) # Initialize the report profile = ProfileReport(df, title="Census Dataset", explorative=True) # show column definition definitions = json.load(open(f"census_column_definition.json")) profile.set_variable( "dataset", { "description": 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0)). Prediction task is to determine whether a person makes over 50K a year.', "copyright_year": "1996", "author": "Ronny Kohavi and Barry Becker", "creator": "Barry Becker", "url": "https://archive.ics.uci.edu/ml/datasets/adult", }, ) profile.set_variable("variables.descriptions", definitions) # Only show the descriptions in the overview profile.set_variable("show_variable_description", False) profile.to_file(Path("./census_report.html"))
# PP only accepts absolute paths, so resolve each entry and then turn it
# into a plain string.
series = series.apply(lambda path: path.absolute())
series = series.apply(str)

df = pd.DataFrame(series)

# Build the profile report
profile = ProfileReport(
    df,
    title="Example showcasing EXIF data (Kaggle 5 Celebrity Faces Dataset)",
    # Switch off the sections that are not the focus of this example
    duplicates=None,
    correlations=None,
    samples=None,
    missing_diagrams=None,
    # Enable files and images (off by default, because the computations are
    # relatively expensive when they are not of interest)
    explorative=True,
)

# The report can also be configured after construction
profile.set_variable(
    "variables.descriptions",
    {
        "files": "The 5 Celebrity Faces Dataset found on Kaggle (dansbecker/5-celebrity-faces-dataset)."
    },
)

# Write the report to disk
profile.to_file("celebrity-faces.html")

# The analysis reveals that quite a few photos contain "hidden" EXIF information.
# This can be interesting as well as troublesome, depending on the situation.
# Build the profile report
profile = ProfileReport(
    df,
    title="Example of summarization of an image dataset (Kaggle Cat and Dog dataset)",
    # These two sections are not needed for this example
    samples=None,
    missing_diagrams=None,
)

# Attach a description to the variable
profile.set_variable(
    "variables.descriptions",
    {
        "files": "Paths linking to the cats and dogs found https://www.kaggle.com/tongpython/cat-and-dog."
    },
)

# Above this number of samples the scatter plots are replaced with hexbin plots.
# The dataset is just over the threshold of 10.000 samples, so raise the limit.
profile.set_variable("plot.scatter_threshold", 25000)

# Enable files and images (off by default, because the computations are
# relatively expensive when they are not of interest)
for _var_type in ("path", "file", "image"):
    profile.set_variable(f"vars.{_var_type}.active", True)

# No exif found, so turn off expensive computation
profile.set_variable("vars.image.exif", False)
# # If a column is not present as specified by the schema, a `SchemaError` is raised. # %% corrupted_data = fatal_encounters.drop("Subject's age", axis="columns") try: clean_columns(corrupted_data) except pa.errors.SchemaError as exc: print(exc) # %% [markdown] slideshow={"slide_type": "slide"} # ### Explore the Data with [`pandas-profiling`](https://github.com/pandas-profiling/pandas-profiling) # %% tags=["hide_input"] profile = ProfileReport(fatal_encounters_clean_columns, minimal=True) profile.set_variable("html.navbar_show", False) profile.to_notebook_iframe() # %% [markdown] slideshow={"slide_type": "slide"} # ### Declare the Training Data Schema # %% slideshow={"slide_type": "skip"} genders = ["female", "male", "transgender", "transexual"] races = [ "african_american_black", "asian_pacific_islander", "european_american_white", "hispanic_latino", "middle_eastern", "native_american_alaskan", "race_unspecified", ] causes_of_death = [ 'asphyxiated_restrained', 'beaten_bludgeoned_with_instrument', 'burned_smoke_inhalation', 'chemical_agent_pepper_spray',