def test_urls(get_data_file):
    """Check that the report for a frame with a URL column mentions the URL type.

    The whitelist CSV is obtained through ``get_data_file``, roughly 10% of
    its cells are blanked out, and the explorative report's HTML is searched
    for the two ``URL`` markers.
    """
    file_name = get_data_file(
        "whitelist_urls.csv",
        "https://raw.githubusercontent.com/openeventdata/scraper/master/whitelist_urls.csv",
    )

    df = pd.read_csv(
        file_name, header=None, names=["source", "url", "reach", "language"]
    )

    # Seed first so the same cells are masked on every run.
    np.random.seed(7331)
    # Add ~10% missing values
    df = df.mask(np.random.random(df.shape) < 0.1)

    profile = ProfileReport(
        df,
        title="DataFrame with URL column",
        samples={"head": 0, "tail": 0},
        explorative=True,
    )

    # Render once and reuse the markup for both checks.
    html = profile.to_html()
    assert "<small>URL</small>" in html, "URL not detected"
    assert "<th>URL</th>" in html, "URL not detected"
def test_issue200():
    """A frame indexed by strings should still produce a titled report."""
    frame = pd.DataFrame(
        [0, 1, 2],
        columns=["a"],
        index=["0", "1", "2"],
    )

    # Precondition: the index really is made of strings.
    assert frame.index.dtype == "object", "Index type should be 'object'"

    rendered = ProfileReport(frame, title="String indices").to_html()
    assert (
        "<title>String indices</title>" in rendered
    ), "Profile report should be generated."
def test_load(get_data_file, test_output_dir):
    """Dump a computed profile to disk, load it back and compare JSON output.

    The meteorites data set is enriched with a constant, a boolean, a mixed
    and a correlated column plus ten duplicated rows before profiling.
    """
    csv_path = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # Fixed seed so the random columns below are reproducible.
    np.random.seed(7331)

    df = pd.read_csv(csv_path)
    n_rows = df.shape[0]

    # Years that cannot be converted become NaT (errors="coerce").
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Constant variable
    df["source"] = "NASA"

    # Boolean variable
    df["boolean"] = np.random.choice([True, False], n_rows)

    # Variable mixing base types
    df["mixed"] = np.random.choice([1, "A"], n_rows)

    # Variable highly correlated with "reclat"
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=(len(df)))

    # Duplicate observations: repeat the first ten rows at the end.
    first_ten = pd.DataFrame(df.iloc[0:10].copy())
    df = pd.concat([df, first_ten], ignore_index=True)

    original = ProfileReport(
        df,
        title="NASA Meteorites",
        samples={"head": 5, "tail": 5},
        duplicates={"head": 10},
        minimal=True,
        progress_bar=False,
    )

    dump_path = test_output_dir / "NASA-Meteorites.pp"
    # The JSON is produced before the dump; the HTML only afterwards.
    json_before = original.to_json()
    original.dump(dump_path)
    original.to_html()

    assert dump_path.exists(), "Output file does not exist"

    restored = ProfileReport(df, progress_bar=False).load(dump_path)

    # The description was computed before dumping, so it is restored as well.
    assert isinstance(restored._description_set, dict)
    # The HTML was rendered after dumping, so no report object was stored.
    assert restored._report is None

    # Both profiles should generate the same output.
    json_after = restored.to_json()
    assert json_before == json_after
def test_issue_169_index(issue_169_data):
    """Profile the issue-169 CSV with its first column used as the index.

    The dendrogram and heatmap missing-value diagrams are enabled; the test
    passes when the rendered HTML is a string containing the dataset
    statistics heading.
    """
    df = pd.read_csv(issue_169_data, sep=",", index_col=0)
    report = ProfileReport(
        df,
        missing_diagrams={"dendrogram": True, "heatmap": True},
    )

    html = report.to_html()
    # isinstance is the idiomatic type check (exact type comparison is flagged by linters).
    assert isinstance(html, str)
    assert "<p class=h4>Dataset statistics</p>" in html
def test_issue671():
    """Render the same small numeric frame for every threshold from 0 to 9."""
    frame = pd.DataFrame([0, 5, 22, 32, 65, np.nan], columns=["a"])

    for threshold in range(10):
        settings = {"num": {"low_categorical_threshold": threshold}}
        report = ProfileReport(frame, vars=settings, progress_bar=False)
        assert len(report.to_html()) > 0
def make_report(df):
    """Build a full (non-minimal) report for ``df`` and sanity-check its HTML.

    Args:
        df: the data to profile; passed straight to ``ProfileReport``.

    Raises:
        AssertionError: if the rendered HTML is not a string or lacks the
            "Dataset info" heading.
    """
    report = ProfileReport(
        df,
        minimal=False,
        pool_size=0,
        sort="None",
        title="Dataset with <em>Numeric</em> Categories",
    )
    html = report.to_html()

    # Separate asserts so a failure shows which condition broke;
    # isinstance is the idiomatic type check.
    assert isinstance(html, str)
    assert '<p class="h2">Dataset info</p>' in html
def test_issue147(tmpdir):
    """Profile a parquet file read through the pyarrow engine.

    The sample file is downloaded into ``tmpdir`` and loaded with
    ``pd.read_parquet``; the rendered HTML must contain the "Dataset info"
    heading.
    """
    file_name = Path(str(tmpdir)) / "userdata1.parquet"

    # A timeout keeps the test from hanging indefinitely on a stalled connection.
    data = requests.get(
        "https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata2.parquet",
        timeout=60,
    )
    # Fail here on an HTTP error instead of writing an error page to disk and
    # getting a confusing parquet parsing failure further down.
    data.raise_for_status()
    file_name.write_bytes(data.content)

    df = pd.read_parquet(str(file_name), engine="pyarrow")
    report = ProfileReport(df, title="PyArrow with Pandas Parquet Backend")
    html = report.to_html()

    assert isinstance(html, str)
    assert '<p class="h2">Dataset info</p>' in html
def test_issue_169_column(issue_169_data):
    """Profile the issue-169 CSV with the default index (no index column).

    The dendrogram and heatmap missing-value diagrams are enabled and the
    report is built with a pool size of 1 and no progress bar.
    """
    df = pd.read_csv(issue_169_data, sep=",")
    report = ProfileReport(
        df,
        missing_diagrams={"dendrogram": True, "heatmap": True},
        progress_bar=False,
        pool_size=1,
    )

    html = report.to_html()
    # isinstance is the idiomatic type check.
    assert isinstance(html, str)
    assert "<p class=h4>Dataset statistics</p>" in html
def test_example_empty():
    """An empty two-column frame yields empty sections and an 'empty' notice."""
    empty_frame = pd.DataFrame({"A": [], "B": []})
    report = ProfileReport(empty_frame)

    description = report.get_description()
    # None of these sections should contain anything for an empty data set.
    for section in ("correlations", "missing", "sample"):
        assert len(description[section]) == 0

    rendered = report.to_html()
    assert "Dataset is empty" in rendered
def test_issue147(get_data_file):
    """Profile a parquet file read through the pyarrow engine.

    The sample file is obtained through ``get_data_file``; the rendered HTML
    must be a string containing the dataset statistics heading.
    """
    file_name = get_data_file(
        "userdata1.parquet",
        "https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata2.parquet",
    )

    df = pd.read_parquet(str(file_name), engine="pyarrow")
    report = ProfileReport(df, title="PyArrow with Pandas Parquet Backend")
    html = report.to_html()

    # isinstance is the idiomatic type check.
    assert isinstance(html, str)
    assert "<p class=h4>Dataset statistics</p>" in html
def test_issue545(get_data_file):
    """The pickled sample frame must render to a non-empty explorative report."""
    pickle_path = get_data_file(
        "sample_eda_df.pkl",
        "https://github.com/justinsola/justinsola.github.com/raw/master/files/sample_eda_df.pkl",
    )

    # NOTE(review): this unpickles a file fetched from an external URL;
    # unpickling executes arbitrary code, so the source must be trusted.
    frame = pd.read_pickle(str(pickle_path))

    report = ProfileReport(
        frame,
        title="Sample Profiling Report",
        explorative=True,
        pool_size=1,
    )
    rendered = report.to_html()
    assert len(rendered) > 0
def test_issue353():
    """A frame with one categorical column renders with the full-width style."""
    frame = pd.DataFrame(
        np.random.rand(100, 5),
        columns=["a", "b", "c", "d", "e"],
    )

    # Turn column "a" into a categorical: scale, truncate to int, then cast.
    scaled = frame["a"].multiply(5)
    frame["a"] = scaled.astype("int").astype("category")

    html_options = {"style": {"full_width": True}}
    report = ProfileReport(
        frame,
        title="Pandas Profiling Report",
        html=html_options,
    )
    assert len(report.to_html()) > 0
def test_custom_sample():
    """A user-supplied sample replaces the real data sample in the report.

    Checks both the description (id, data, name, caption of the single
    sample) and the rendered HTML (name, every ``make`` value, caption text).
    """
    df = pd.DataFrame({"test": [1, 2, 3, 4, 5]})

    # When a sample of the real data (cars) would disclose sensitive
    # information it can be replaced by mock data. For illustration, the mock
    # data below is based on cars from a popular game series.
    mock_data = pd.DataFrame(
        {
            "make": ["Blista Kanjo", "Sentinel", "Burrito"],
            "price": [58000, 95000, 65000],
            "mpg": [20, 30, 22],
            "rep78": ["Average", "Excellent", "Fair"],
            "headroom": [2.5, 3.0, 1.5],
            "trunk": [8, 10, 4],
            "weight": [1050, 1600, 2500],
            "length": [165, 170, 180],
            "turn": [40, 50, 32],
            "displacement": [80, 100, 60],
            "gear_ratio": [2.74, 3.51, 2.41],
            "foreign": ["Domestic", "Domestic", "Foreign"],
        }
    )
    # NOTE(review): the original comment here read "Length left out due to
    # correlation with weight", yet mock_data does contain "length" - confirm.

    sample_name = "Mock data sample"
    caption = "Disclaimer: this is synthetic data generated based on the format of the data in this table."

    report = ProfileReport(
        df,
        title="Test custom sample",
        sample={
            "name": sample_name,
            "data": mock_data,
            "caption": caption,
        },
        minimal=True,
    )

    samples = report.get_description()["sample"]
    assert len(samples) == 1

    sample = samples[0]
    assert sample.id == "custom"
    assert hash_dataframe(sample.data) == hash_dataframe(mock_data)
    assert sample.name == "Mock data sample"
    assert sample.caption == caption

    html = report.to_html()
    assert "Mock data sample" in html
    makes = mock_data["make"].values.tolist()
    assert all(make in html for make in makes)
    # The HTML check deliberately omits the caption's trailing period.
    assert (
        "Disclaimer: this is synthetic data generated based on the format of the data in this table"
        in html
    )
def test_issue_120():
    """Profile the issue-120 CSV with Cramér's correlation disabled.

    The file is read directly from its URL, so this test needs network
    access. Categorical composition checks are switched on.
    """
    df = pd.read_csv(
        "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt"
    )

    report = ProfileReport(
        df,
        correlations={"cramers": {"calculate": False}},
        vars={"cat": {"check_composition": True}},
    )

    html = report.to_html()
    # isinstance is the idiomatic type check.
    assert isinstance(html, str)
    assert "<p class=h2>Dataset info</p>" in html
def test_issue100():
    """Profile a frame whose categorical columns hold numeric values.

    Columns ``B`` and ``C`` of a random 1000x4 integer frame are cast to
    ``category`` before the report is rendered.
    """
    df = pd.DataFrame(np.random.randint(0, 1000, size=(1000, 4)), columns=list("ABCD"))
    df[["B", "C"]] = df[["B", "C"]].astype("category")

    report = ProfileReport(
        df,
        pool_size=1,
        title="Dataset with <em>Numeric</em> Categories",
        samples={"head": 20},
    )
    html = report.to_html()

    # Separate asserts so a failure shows which condition broke;
    # isinstance is the idiomatic type check.
    assert isinstance(html, str)
    assert '<p class="h2">Dataset info</p>' in html
def test_decorator():
    """The report title must appear in the HTML rendered for the people CSV."""
    source_url = "https://raw.githubusercontent.com/oncletom/coursera-ml/master/week-1/people-example.csv"
    people = pd.read_csv(source_url)

    # Both the heatmap and dendrogram missing-value diagrams are turned off.
    diagrams = {"heatmap": False, "dendrogram": False}
    report = ProfileReport(
        people,
        title="Coursera Test Report",
        samples={"head": 20},
        missing_diagrams=diagrams,
    )

    rendered = report.to_html()
    assert "Coursera Test Report" in rendered, "Title is not found"
def test_example(get_data_file, test_output_dir):
    """End-to-end example: profile the meteorites data and write it to a file.

    The data set is enriched with a constant, a boolean, a mixed and a
    correlated column plus ten duplicated rows. The test then checks that the
    HTML file is written, that the description is a dict with 10 entries, and
    that the expected badge appears in the rendered HTML.
    """
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # For reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)

    # Years that cannot be converted become NaT (errors="coerce").
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=(len(df)))

    # Example: Duplicate observations
    duplicates_to_add = pd.DataFrame(df.iloc[0:10].copy())
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    output_file = test_output_dir / "profile.html"
    profile = ProfileReport(
        df,
        title="NASA Meteorites",
        samples={"head": 5, "tail": 5},
        duplicates={"head": 10},
        minimal=True,
    )
    profile.to_file(output_file)
    assert output_file.exists(), "Output file does not exist"

    description = profile.get_description()
    assert isinstance(description, dict) and len(description) == 10, "Unexpected result"
    assert "<span class=badge>12</span>" in profile.to_html()
def test_issue51(get_data_file):
    """A frame whose categorical column holds an empty ('') value must render."""
    pickle_path = get_data_file(
        "buggy1.pkl",
        "https://raw.githubusercontent.com/adamrossnelson/HelloWorld/master/sparefiles/buggy1.pkl",
    )

    # NOTE(review): this unpickles a file fetched from an external URL;
    # unpickling executes arbitrary code, so the source must be trusted.
    frame = pd.read_pickle(str(pickle_path))

    report = ProfileReport(
        frame,
        title="Pandas Profiling Report",
        progress_bar=False,
        explorative=True,
    )
    rendered = report.to_html()
    assert (
        "<title>Pandas Profiling Report</title>" in rendered
    ), "Profile report should be generated."
def test_issue51_similar():
    """Columns mixing empty strings, text and ``None`` must still render."""
    columns = {
        "test": ["", "hoi", None],
        "blest": [None, "", "geert"],
        "bert": ["snor", "", None],
    }
    frame = pd.DataFrame(columns)

    report = ProfileReport(
        frame,
        title="Pandas Profiling Report",
        progress_bar=False,
        explorative=True,
    )
    # The threshold is overridden on the config after the report is created.
    report.config.vars.num.low_categorical_threshold = 0
    # FIXME: assert correlation values (report.description_set["correlations"])

    rendered = report.to_html()
    assert (
        "<title>Pandas Profiling Report</title>" in rendered
    ), "Profile report should be generated."
def test_sensitive():
    """With ``sensitive=True`` none of the names may appear in the HTML."""
    names = ["John Doe", "Marco Polo", "Louis Brandeis", "William Douglas"]
    # Four timestamps: now, and one, two and three days earlier.
    dates = pd.to_datetime([datetime.now() - timedelta(days=i) for i in range(4)])

    df = pd.DataFrame(
        {
            "name": names,
            "year": [1965, 1271, 1916, 1975],
            "tf": [True, False, False, True],
            "date": dates,
        }
    )

    rendered = ProfileReport(df, sensitive=True, explorative=True).to_html()

    leaked = [value for value in df["name"].values.tolist() if value in rendered]
    assert not leaked
def test_modular_present(tdf):
    """With every optional module enabled, each section heading is rendered."""
    # Switch on all five correlation measures.
    enabled_correlations = {
        name: {"calculate": True}
        for name in ("pearson", "spearman", "kendall", "phi_k", "cramers")
    }
    # Switch on all four missing-value diagrams.
    enabled_diagrams = {
        name: True for name in ("matrix", "bar", "dendrogram", "heatmap")
    }

    profile = ProfileReport(
        tdf,
        title="Modular test",
        duplicates={"head": 10},
        samples={"head": 10, "tail": 10},
        interactions={"targets": ["mass (g)"], "continuous": True},
        correlations=enabled_correlations,
        missing_diagrams=enabled_diagrams,
        pool_size=1,
    )

    html = profile.to_html()
    expected_headings = (
        "Correlations</h1>",
        "Duplicate rows</h1>",
        "Sample</h1>",
        "Missing values</h1>",
    )
    for heading in expected_headings:
        assert heading in html
def test_issue_120(get_data_file):
    """Profile the issue-120 CSV with Cramér's correlation disabled.

    The file is obtained through ``get_data_file``; categorical composition
    checks are switched on.
    """
    file_name = get_data_file(
        "pandas_profiling_bug.txt",
        "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt",
    )
    df = pd.read_csv(file_name)

    report = ProfileReport(
        df,
        correlations={"cramers": {"calculate": False}},
        vars={"cat": {"check_composition": True}},
    )

    html = report.to_html()
    # isinstance is the idiomatic type check.
    assert isinstance(html, str)
    assert "<p class=h4>Dataset statistics</p>" in html
def test_modular_absent(tdf):
    """With every optional module disabled, no section heading is rendered."""
    profile = ProfileReport(
        tdf,
        title="Modular test",
        duplicates={"head": 0},
        samples={"head": 0, "tail": 0},
        interactions=None,
        correlations=None,
        missing_diagrams=None,
    )

    html = profile.to_html()
    unexpected_headings = (
        "Correlations</h1>",
        "Duplicate rows</h1>",
        "Sample</h1>",
        "Missing values</h1>",
    )
    for heading in unexpected_headings:
        assert heading not in html
def test_issue523():
    """A nullable ``Int64`` column containing ``pd.NA`` values must render.

    Related upstream report: https://github.com/pandas-dev/pandas/issues/33803
    """
    values = [
        1871248,
        12522551,
        1489260,
        6657093,
        pd.NA,
        pd.NA,
        pd.NA,
        pd.NA,
        pd.NA,
        1489260,
        pd.NA,
        2468576,
    ]
    frame = pd.DataFrame({"col": values}, dtype=pd.Int64Dtype())

    rendered = ProfileReport(frame, title="Test Report").to_html()
    assert len(rendered) > 0
def get_profile_results(data):
    """Return the profiling report of a pandas DataFrame as HTML.

    Only the Pearson correlation is calculated; the other four measures are
    switched off.

    Raises:
        TypeError: if ``data`` is not a ``pd.DataFrame``.
    """
    # Reject anything that is not a DataFrame up front.
    if not isinstance(data, pd.DataFrame):
        raise TypeError('This is not a pandas dataframe.')

    correlation_settings = {
        name: {"calculate": name == "pearson"}
        for name in ("pearson", "spearman", "kendall", "phi_k", "cramers")
    }
    report = ProfileReport(
        data,
        title='Snowflake Data Profiler from Hashmap',
        progress_bar=False,
        explorative=True,
        correlations=correlation_settings,
    )

    # NOTE (original author): rendering sometimes fails with matplotlib errors
    # about threads. The only fix so far has been pinning specific library
    # versions in requirements.txt; pyarrow seems to have an impact on this.
    return report.to_html()
def func(df, **kwargs):
    """Render ``df`` to profiling HTML with the progress bar disabled.

    Any extra keyword arguments are forwarded to ``ProfileReport``.
    """
    return ProfileReport(df, progress_bar=False, **kwargs).to_html()
def _to_html(profile_report: ProfileReport) -> str:
    """Wrap the report's HTML in an ``<iframe>`` via its ``srcdoc`` attribute.

    The report markup is HTML-escaped before being placed in the attribute;
    the iframe's style comes from the module-level ``STYLE``.
    """
    escaped_report = html.escape(profile_report.to_html())
    opening = f'<iframe srcdoc="{escaped_report}" style={STYLE} frameborder="0" '
    return opening + "allowfullscreen></iframe>"
def test_issue249():
    """A frame with a mixed string/integer index must render.

    Also checks that the report's configured title is a string.
    """
    df = pd.DataFrame(data=[[1], [2]], index=["foo", 1], columns=["a"])
    report = ProfileReport(df, explorative=True, progress_bar=False)

    # isinstance is the idiomatic type check.
    assert isinstance(report.config.title, str)
    assert len(report.to_html()) > 0
def test_issue664():
    """A single column made of 100 NaN values must still render."""
    all_missing = pd.DataFrame([np.nan] * 100, columns=["a"])
    rendered = ProfileReport(all_missing).to_html()
    assert len(rendered) > 0
def to_html(self, df: pandas.DataFrame) -> str:
    """Render the profiling report of ``df``, titled with ``self._title``.

    Raises:
        AssertionError: if ``df`` is not a ``pandas.DataFrame``.
    """
    assert isinstance(df, pandas.DataFrame)
    report = ProfileReport(df, title=self._title)
    rendered = report.to_html()
    return rendered