def test_example(get_data_file, test_output_dir):
    """Profile the NASA meteorites data and check the ascending variable order.

    Args:
        get_data_file: callable invoked with a file name and a download URL;
            its return value is handed to ``pd.read_csv``.
        test_output_dir: path-like directory (must support ``/``) that
            receives ``profile.html``.
    """
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    df = pd.read_csv(file_name)

    # Values that cannot be converted to a pandas timestamp are coerced to NaT
    # and are thereby ignored for this analysis.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=(len(df)))

    # Example: Duplicate observations
    # Take an explicit copy so that renaming the rows never touches (or warns
    # about touching) the slice of the original frame.
    duplicates_to_add = df.iloc[0:10].copy()
    duplicates_to_add["name"] += " copy"

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    output_file = test_output_dir / "profile.html"
    profile = ProfileReport(
        df,
        title="NASA Meteorites",
        samples={"head": 5, "tail": 5},
        sort="ascending",
    )
    profile.to_file(output_file=output_file)
    assert output_file.exists(), "Output file does not exist"

    description = profile.get_description()
    assert isinstance(description, dict) and len(description) == 7, "Unexpected result"

    # Compare the full version tuple: checking only the minor component would
    # give the wrong answer for any future major version with a minor below 6.
    if sys.version_info >= (3, 6):
        assert list(description["variables"].keys()) == [
            "boolean",
            "fall",
            "GeoLocation",
            "id",
            "mass (g)",
            "mixed",
            "name",
            "nametype",
            "recclass",
            "reclat",
            "reclat_city",
            "reclong",
            "source",
            "year",
        ], "Ascending sort did not work"
def test_issue353():
    """Smoke test: describing a frame with one categorical column must not fail."""
    column_names = ["a", "b", "c", "d", "e"]
    frame = pd.DataFrame(np.random.rand(100, 5), columns=column_names)

    # make one column categorical: scale the floats, truncate to int, then
    # convert the resulting integers to a category dtype
    scaled_ints = frame["a"].multiply(5).astype("int")
    frame["a"] = scaled_ints.astype("category")

    html_options = {"style": {"full_width": True}}
    report = ProfileReport(
        frame,
        title="Pandas Profiling Report",
        html=html_options,
    )

    # No assertion: the call itself is the check.
    report.get_description()
def test_example(get_data_file, test_output_dir):
    """Profile the NASA meteorites data in minimal mode and check the output.

    Args:
        get_data_file: callable invoked with a file name and a download URL;
            its return value is handed to ``pd.read_csv``.
        test_output_dir: path-like directory (must support ``/``) that
            receives ``profile.html``.
    """
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # For reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)

    # Values that cannot be converted to a pandas timestamp are coerced to NaT
    # and are thereby ignored for this analysis.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=(len(df)))

    # Example: Duplicate observations
    duplicates_to_add = df.iloc[0:10].copy()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    output_file = test_output_dir / "profile.html"
    profile = ProfileReport(
        df,
        title="NASA Meteorites",
        samples={"head": 5, "tail": 5},
        duplicates={"head": 10},
        minimal=True,
    )
    profile.to_file(output_file)
    assert output_file.exists(), "Output file does not exist"

    description = profile.get_description()
    assert isinstance(description, dict) and len(description) == 10, "Unexpected result"

    assert "<span class=badge>12</span>" in profile.to_html()
def test_issue85():
    """Every boolean-like column (with or without NaN) must be typed as boolean."""
    frame = pd.DataFrame(
        {
            "booleans_type": [False, True, True],
            "booleans_type_nan": [False, True, np.nan],
            "integers": [1, 0, 0],
            "integers_nan": [1, 0, np.nan],
            "str_yes_no": ["Y", "N", "Y"],
            "str_yes_no_mixed": ["Y", "n", "y"],
            "str_yes_no_nana": ["Y", "N", np.nan],
            "str_true_false": ["True", "False", "False"],
            "str_true_false_nan": ["True", "False", np.nan],
        }
    )

    profile = ProfileReport(
        frame,
        pool_size=1,
        title="Dataset with <em>Boolean</em> Variables",
        samples={"head": 20},
    )

    variables = profile.get_description()["variables"]
    for _column, stats in variables.items():
        detected_type = stats["type"]
        assert detected_type == Variable.TYPE_BOOL, "Variable should be boolean"
def test_check_date_type_warning():
    """A string column holding ISO dates must raise a TYPE_DATE message."""
    date_strings = ["2018-01-01", "2017-02-01", "2018-04-07"]
    frame = pd.DataFrame(date_strings, columns=["date"])

    messages = ProfileReport(frame).get_description()["messages"]
    has_date_warning = any(
        entry.message_type == MessageType.TYPE_DATE for entry in messages
    )

    assert has_date_warning, "Date warning should be present"
def test_issue51_empty():
    """Columns of empty strings give either no Cramér's V matrix or an all-ones one."""
    frame = pd.DataFrame(
        {
            "test": [""] * 5,
            "blest": [""] * 5,
            "bert": [""] * 5,
        }
    )

    report = ProfileReport(
        frame,
        title="Pandas Profiling Report",
        progress_bar=False,
        explorative=True,
    )
    report.config.vars.num.low_categorical_threshold = 0

    correlations = report.get_description()["correlations"]

    # Nothing to verify when the correlation was not computed at all.
    if "cramers" in correlations:
        cramers_matrix = correlations["cramers"].values
        assert (cramers_matrix == np.ones((3, 3))).all()
def pandas_profiling_test(df):
    """Return the ``"table"`` section of a minimal profile of ``df``.

    The pandas_profiling import is kept local to the function, so the package
    is only loaded when this function is actually called.
    """
    from pandas_profiling import ProfileReport

    return ProfileReport(df, minimal=True).get_description()["table"]
def test_issue377(df):
    """With ``sort=None`` the described variables must keep the column order.

    Args:
        df: the frame to profile, or ``None`` when the dataset could not be
            obtained, in which case the test is skipped.
    """
    if df is None:
        # pytest.skip raises, so no explicit return is needed after it.
        pytest.skip("dataset unavailable")

    original_order = tuple(df.columns.values)

    profile = ProfileReport(df, sort=None, pool_size=1, progress_bar=False)
    new_order = tuple(profile.get_description()["variables"].keys())

    assert original_order == new_order
def test_example_empty():
    """An empty frame gives empty description sections and an 'empty' notice."""
    empty_frame = pd.DataFrame({"A": [], "B": []})
    report = ProfileReport(empty_frame)

    description = report.get_description()
    # Checked in the same order as before: correlations, missing, sample.
    for section in ("correlations", "missing", "sample"):
        assert len(description[section]) == 0

    rendered = report.to_html()
    assert "Dataset is empty" in rendered
def test_custom_sample():
    """A user-supplied sample replaces the real one in the description and HTML."""
    sample_name = "Mock data sample"
    caption = "Disclaimer: this is synthetic data generated based on the format of the data in this table."

    profiled = pd.DataFrame({"test": [1, 2, 3, 4, 5]})

    # In case that a sample of the real data (cars) would disclose sensitive
    # information, we can replace it with mock data. For illustrative purposes,
    # we use data based on cars from a popular game series.
    # NOTE(review): an earlier comment said "length" was left out, but the mock
    # data below does contain a "length" column.
    mock_data = pd.DataFrame(
        {
            "make": ["Blista Kanjo", "Sentinel", "Burrito"],
            "price": [58000, 95000, 65000],
            "mpg": [20, 30, 22],
            "rep78": ["Average", "Excellent", "Fair"],
            "headroom": [2.5, 3.0, 1.5],
            "trunk": [8, 10, 4],
            "weight": [1050, 1600, 2500],
            "length": [165, 170, 180],
            "turn": [40, 50, 32],
            "displacement": [80, 100, 60],
            "gear_ratio": [2.74, 3.51, 2.41],
            "foreign": ["Domestic", "Domestic", "Foreign"],
        }
    )

    custom_sample = {
        "name": sample_name,
        "data": mock_data,
        "caption": caption,
    }
    report = ProfileReport(
        profiled,
        title="Test custom sample",
        sample=custom_sample,
        minimal=True,
    )

    # The description must carry exactly the one custom sample, unchanged.
    samples = report.get_description()["sample"]
    assert len(samples) == 1
    sample = samples[0]
    assert sample.id == "custom"
    assert hash_dataframe(sample.data) == hash_dataframe(mock_data)
    assert sample.name == "Mock data sample"
    assert sample.caption == caption

    # The rendered report must show the sample name, every make and the caption.
    html = report.to_html()
    assert "Mock data sample" in html
    makes = mock_data["make"].values.tolist()
    assert all(make in html for make in makes)
    # The caption is matched without its trailing period here.
    assert (
        "Disclaimer: this is synthetic data generated based on the format of the data in this table"
        in html
    )
def test_issue523():
    """A nullable Int64 column containing pd.NA values can be described."""
    # https://github.com/pandas-dev/pandas/issues/33803
    missing = pd.NA
    values = [
        1871248,
        12522551,
        1489260,
        6657093,
        missing,
        missing,
        missing,
        missing,
        missing,
        1489260,
        missing,
        2468576,
    ]

    frame = pd.DataFrame({"col": values}, dtype=pd.Int64Dtype())
    report = ProfileReport(frame, title="Test Report", progress_bar=False)

    description = report.get_description()
    assert len(description) > 0
def test_interactions_target():
    """Restricting interactions to targets gives n_targets * n_columns entries."""
    n_rows, n_columns, n_targets = 10, 50, 2

    column_names = [f"column_{c}" for c in range(n_columns)]
    random_values = np.random.randint(0, 1000, size=(n_rows, n_columns))
    frame = pd.DataFrame(random_values, columns=column_names)

    # The targets are the first n_targets columns.
    targets = [f"column_{target}" for target in range(n_targets)]

    interaction_options = {"continuous": True, "targets": targets}
    report = ProfileReport(frame, minimal=True, interactions=interaction_options)

    scatter = report.get_description()["scatter"]
    total = 0
    for inner in scatter.values():
        total += len(inner.keys())

    assert total == n_targets * n_columns
def test_modular_description_set(tdf):
    """With the optional sections switched off, the description is still non-empty.

    Args:
        tdf: the frame to profile (presumably supplied by a fixture; confirm
            against the test setup).
    """
    no_samples = {"head": 0, "tail": 0}
    no_missing_diagrams = {
        "matrix": False,
        "bar": False,
        "dendrogram": False,
        "heatmap": False,
    }

    report = ProfileReport(
        tdf,
        title="Modular test",
        duplicates=None,
        samples=no_samples,
        correlations=None,
        interactions=None,
        missing_diagrams=no_missing_diagrams,
        pool_size=1,
    )

    description = report.get_description()
    assert len(description) > 0
def test_issue351():
    """The variable reported as "0" for a mixed str/int column is unsupported."""
    raw = pd.DataFrame(["Jan", 1])
    indexed = raw.set_index(0)

    description = ProfileReport(indexed).get_description()
    detected_type = description["variables"]["0"]["type"]

    assert detected_type == Variable.S_TYPE_UNSUPPORTED
# Build one profiling report per dataset, optionally write it to disk as HTML,
# and print the messages found in its description.
for k, df in datasets.items():
    print("------------", f"TRAINING DATASET ({k.upper()})", "------------", sep="\n")

    profile = ProfileReport(
        df,
        title=f"Passengers Training Data ({k.title()})",
        html={"style": {"full_width": True}},
    )
    print(type(profile))

    if TO_HTML:
        profile_path = os.path.join(REPORTS_DIR, f"passengers_profile_{k}.html")
        profile.to_file(output_file=profile_path)

    desc = profile.get_description()
    # Keys previously observed on desc, per an earlier note (may vary by
    # library version): 'table', 'variables', 'scatter', 'correlations',
    # 'missing', 'messages', 'package'.

    print("------------", "MESSAGES:", "------------", sep="\n")
    for message in desc["messages"]:
        print(message)
def test_issue282():
    """A frame indexed by string labels can be described and yields a dict."""
    index = [
        "BJ110", "BJ126", "BJ163", "BJ054", "BJ017", "LP045", "BJ153", "AD013",
        "NL047", "BJ036", "AD026", "BJ018", "LP044", "LP006", "BO014", "BJ035",
        "BJ155", "TLL003", "BJ073", "BJ068", "BJ049", "TLL061", "NL010", "AD019",
        "LP003", "BJ107", "BJ023", "BJ012", "TLL067", "LP020", "AD031", "BJ172",
        "NL031", "LP032", "AD016", "BJ077", "BJ047", "BJ001", "BJ105", "BJ062",
        "AD022", "BJ106", "BJ102", "BJ022", "BJ010", "TLL007", "AD011", "LP018",
        "TLL004", "TLL030", "BJ005", "AD003", "BJ025", "LP005", "BJ144", "BJ080",
        "TLL062", "BJ166", "LP014", "NL005", "TLL038", "BJ072", "AD032", "BO001",
        "BO024", "BO005", "AD004", "TLL006", "BJ063", "BJ007", "LP007", "BJ159",
        "NL056", "NL059", "BJ115", "NL037", "BJ003", "BJ117", "AD025", "BJ050",
        "LP029", "BJ149", "AD002", "AD010", "BJ160", "BJ147", "BO023", "NL055",
        "NL038", "BO004", "BJ123", "NL051", "NL011",
    ]

    # One constant string column and one constant float column, one row per label.
    df = pd.DataFrame(
        index=index,
        data={"column_1": ["value"] * len(index), "column_2": [1.0] * len(index)},
    )

    report = ProfileReport(df)
    description = report.get_description()

    # isinstance is the idiomatic type check and also accepts dict subclasses.
    assert isinstance(description, dict)
def test_issue351():
    """The variable reported as "0" for a mixed str/int column is "Unsupported"."""
    raw = pd.DataFrame(["Jan", 1])
    indexed = raw.set_index(0)

    report = ProfileReport(indexed, progress_bar=False)
    variables = report.get_description()["variables"]

    assert variables["0"]["type"] == "Unsupported"