def test_example(get_data_file, test_output_dir):
    """Profile an augmented meteorites dataset with ascending variable sort.

    Args:
        get_data_file: Callable taking a file name and a URL and returning
            something ``pd.read_csv`` can read.
        test_output_dir: Path-like directory (supports ``/``) in which the
            HTML report is written.
    """
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    df = pd.read_csv(file_name)
    # Years that cannot be converted to timestamps are coerced to NaT and
    # are therefore ignored for this analysis.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=len(df))

    # Example: Duplicate observations
    # Copy the slice explicitly so the rename below never touches (or warns
    # about touching) the original frame.
    duplicates_to_add = df.iloc[0:10].copy()
    duplicates_to_add["name"] += " copy"

    # DataFrame.append is deprecated/removed in recent pandas; concat is the
    # supported equivalent.
    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    output_file = test_output_dir / "profile.html"
    profile = ProfileReport(
        df,
        title="NASA Meteorites",
        samples={"head": 5, "tail": 5},
        sort="ascending",
    )
    profile.to_file(output_file=output_file)
    assert output_file.exists(), "Output file does not exist"

    description = profile.get_description()
    assert (
        isinstance(description, dict) and len(description) == 7
    ), "Unexpected result"

    # Compare the full version tuple, not just the minor component.
    if sys.version_info >= (3, 6):
        assert list(description["variables"].keys()) == [
            "boolean",
            "fall",
            "GeoLocation",
            "id",
            "mass (g)",
            "mixed",
            "name",
            "nametype",
            "recclass",
            "reclat",
            "reclat_city",
            "reclong",
            "source",
            "year",
        ], "Ascending sort did not work"
def test_issue353():
    """Smoke test: describing a full-width report with a categorical column."""
    column_names = ["a", "b", "c", "d", "e"]
    df = pd.DataFrame(np.random.rand(100, 5), columns=column_names)

    # Convert column "a" to integer codes and then to a categorical dtype.
    scaled = df["a"].multiply(5)
    df["a"] = scaled.astype("int").astype("category")

    html_options = {"style": {"full_width": True}}
    profile = ProfileReport(
        df,
        title="Pandas Profiling Report",
        html=html_options,
    )
    profile.get_description()
def test_example(get_data_file, test_output_dir):
    """Profile an augmented meteorites dataset in minimal mode.

    Args:
        get_data_file: Callable taking a file name and a URL and returning
            something ``pd.read_csv`` can read.
        test_output_dir: Path-like directory (supports ``/``) in which the
            HTML report is written.
    """
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # For reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)

    # Years that cannot be converted to timestamps are coerced to NaT and
    # are therefore ignored for this analysis.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=len(df))

    # Example: Duplicate observations
    duplicates_to_add = df.iloc[0:10].copy()

    # DataFrame.append is deprecated/removed in recent pandas; concat is the
    # supported equivalent.
    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    output_file = test_output_dir / "profile.html"
    profile = ProfileReport(
        df,
        title="NASA Meteorites",
        samples={"head": 5, "tail": 5},
        duplicates={"head": 10},
        minimal=True,
    )
    profile.to_file(output_file)
    assert output_file.exists(), "Output file does not exist"

    description = profile.get_description()
    assert (
        isinstance(description, dict) and len(description) == 10
    ), "Unexpected result"
    assert "<span class=badge>12</span>" in profile.to_html()
def test_issue85():
    """Each column of the boolean-like fixture must be typed as boolean."""
    df = pd.DataFrame(
        {
            "booleans_type": [False, True, True],
            "booleans_type_nan": [False, True, np.nan],
            "integers": [1, 0, 0],
            "integers_nan": [1, 0, np.nan],
            "str_yes_no": ["Y", "N", "Y"],
            "str_yes_no_mixed": ["Y", "n", "y"],
            "str_yes_no_nana": ["Y", "N", np.nan],
            "str_true_false": ["True", "False", "False"],
            "str_true_false_nan": ["True", "False", np.nan],
        }
    )

    report = ProfileReport(
        df,
        pool_size=1,
        title="Dataset with <em>Boolean</em> Variables",
        samples={"head": 20},
    )

    variables = report.get_description()["variables"]
    for name, stats in variables.items():
        assert stats["type"] == Variable.TYPE_BOOL, "Variable should be boolean"
Beispiel #5
0
def test_check_date_type_warning():
    """A column of date-like strings should produce a TYPE_DATE message."""
    date_strings = ["2018-01-01", "2017-02-01", "2018-04-07"]
    df = pd.DataFrame(date_strings, columns=["date"])

    report = ProfileReport(df)
    messages = report.get_description()["messages"]
    has_date_warning = any(
        msg.message_type == MessageType.TYPE_DATE for msg in messages
    )
    assert has_date_warning, "Date warning should be present"
def test_issue51_empty():
    """Columns of empty strings: "cramers" is either absent or all ones."""
    df = pd.DataFrame(
        {column: [""] * 5 for column in ("test", "blest", "bert")}
    )

    report = ProfileReport(
        df,
        title="Pandas Profiling Report",
        progress_bar=False,
        explorative=True,
    )
    report.config.vars.num.low_categorical_threshold = 0

    correlations = report.get_description()["correlations"]
    if "cramers" in correlations:
        # When the matrix is present it must equal a 3x3 matrix of ones.
        cramers = correlations["cramers"].values
        assert (cramers == np.ones((3, 3))).all()
def pandas_profiling_test(df):
    """Return the "table" entry of a minimal profile description of ``df``."""
    from pandas_profiling import ProfileReport

    return ProfileReport(df, minimal=True).get_description()["table"]
def test_issue377(df):
    """With ``sort=None`` the variables must keep the original column order.

    Args:
        df: DataFrame to profile, or ``None`` when the dataset is
            unavailable (the test is then skipped).
    """
    if df is None:
        # pytest.skip raises to abort the test, so no return is needed after it.
        pytest.skip("dataset unavailable")

    original_order = tuple(df.columns.values)

    profile = ProfileReport(df, sort=None, pool_size=1, progress_bar=False)
    new_order = tuple(profile.get_description()["variables"].keys())
    assert original_order == new_order
def test_example_empty():
    """An empty frame gives empty description sections and a notice in HTML."""
    profile = ProfileReport(pd.DataFrame({"A": [], "B": []}))
    description = profile.get_description()

    for section in ("correlations", "missing", "sample"):
        assert len(description[section]) == 0

    assert "Dataset is empty" in profile.to_html()
def test_custom_sample():
    """A user-supplied sample replaces the data sample in description and HTML."""
    df = pd.DataFrame({"test": [1, 2, 3, 4, 5]})

    # When a sample of the real data would disclose sensitive information it
    # can be replaced with mock data. For illustration, the mock rows are
    # based on cars from a popular game series.
    mock_data = pd.DataFrame(
        {
            "make": ["Blista Kanjo", "Sentinel", "Burrito"],
            "price": [58000, 95000, 65000],
            "mpg": [20, 30, 22],
            "rep78": ["Average", "Excellent", "Fair"],
            "headroom": [2.5, 3.0, 1.5],
            "trunk": [8, 10, 4],
            "weight": [1050, 1600, 2500],
            "length": [165, 170, 180],
            "turn": [40, 50, 32],
            "displacement": [80, 100, 60],
            "gear_ratio": [2.74, 3.51, 2.41],
            "foreign": ["Domestic", "Domestic", "Foreign"],
        }
    )
    sample_name = "Mock data sample"
    sample_caption = "Disclaimer: this is synthetic data generated based on the format of the data in this table."

    report = ProfileReport(
        df,
        title="Test custom sample",
        sample={
            "name": sample_name,
            "data": mock_data,
            "caption": sample_caption,
        },
        minimal=True,
    )

    # The description must contain exactly the custom sample.
    samples = report.get_description()["sample"]
    assert len(samples) == 1
    custom = samples[0]
    assert custom.id == "custom"
    assert hash_dataframe(custom.data) == hash_dataframe(mock_data)
    assert custom.name == "Mock data sample"
    assert custom.caption == sample_caption

    # The rendered HTML must show the name, every make and the caption
    # (checked without its trailing period).
    html = report.to_html()
    assert "Mock data sample" in html
    for make in mock_data["make"].values.tolist():
        assert make in html
    assert (
        "Disclaimer: this is synthetic data generated based on the format of the data in this table"
        in html
    )
Beispiel #11
0
def test_issue523():
    """A nullable Int64 column containing ``pd.NA`` yields a non-empty description.

    Related pandas issue: https://github.com/pandas-dev/pandas/issues/33803
    """
    values = (
        [1871248, 12522551, 1489260, 6657093]
        + [pd.NA] * 5
        + [1489260, pd.NA, 2468576]
    )
    df = pd.DataFrame({"col": values}, dtype=pd.Int64Dtype())

    profile_report = ProfileReport(df, title="Test Report", progress_bar=False)
    description = profile_report.get_description()
    assert len(description) > 0
Beispiel #12
0
def test_interactions_target():
    """Scatter entries are limited to the configured interaction targets."""
    n_rows, n_columns, n_targets = 10, 50, 2

    column_names = [f"column_{c}" for c in range(n_columns)]
    values = np.random.randint(0, 1000, size=(n_rows, n_columns))
    df = pd.DataFrame(values, columns=column_names)

    # The targets are the first ``n_targets`` columns.
    targets = column_names[:n_targets]

    profile = ProfileReport(
        df,
        minimal=True,
        interactions={"continuous": True, "targets": targets},
    )

    scatter = profile.get_description()["scatter"]
    total = 0
    for inner in scatter.values():
        total += len(inner)
    assert total == n_targets * n_columns
Beispiel #13
0
def test_modular_description_set(tdf):
    """With optional sections switched off the description is still non-empty."""
    no_samples = {"head": 0, "tail": 0}
    no_missing_diagrams = {
        "matrix": False,
        "bar": False,
        "dendrogram": False,
        "heatmap": False,
    }

    profile = ProfileReport(
        tdf,
        title="Modular test",
        duplicates=None,
        samples=no_samples,
        correlations=None,
        interactions=None,
        missing_diagrams=no_missing_diagrams,
        pool_size=1,
    )

    description = profile.get_description()
    assert len(description) > 0
def test_issue351():
    """Variable "0" of a frame indexed by mixed values is typed as unsupported."""
    frame = pd.DataFrame(["Jan", 1]).set_index(0)

    variables = ProfileReport(frame).get_description()["variables"]
    assert variables["0"]["type"] == Variable.S_TYPE_UNSUPPORTED
Beispiel #15
0
    for k, df in datasets.items():
        print("------------")
        print(f"TRAINING DATASET ({k.upper()})")
        print("------------")

        profile = ProfileReport(df, title=f"Passengers Training Data ({k.title()})", html={"style":{"full_width":True}})
        print(type(profile))
        if TO_HTML:
            profile_path = os.path.join(REPORTS_DIR, f"passengers_profile_{k}.html")
            profile.to_file(output_file=profile_path)
            #webbrowser.open(os.path.abspath(TRAINING_PROFILE_FILEPATH))

        #> RESULTS ...

        desc = profile.get_description()
        #print(desc.keys()) #> ['table', 'variables', 'scatter', 'correlations', 'missing', 'messages', 'package']
        print("------------")
        print("MESSAGES:")
        print("------------")
        for message in desc["messages"]:
            print(message)

        #print("------------")
        #print("VARS:")
        #print("------------")
        #for k,v in desc["variables"].items():
        #    print(k.upper())
        #    print(v.keys())
        #    #pprint(v)
        #    print("---")
Beispiel #16
0
def test_issue282():
    """A frame with a string index and constant columns yields a dict description."""
    index = [
        "BJ110", "BJ126", "BJ163", "BJ054", "BJ017", "LP045", "BJ153", "AD013",
        "NL047", "BJ036", "AD026", "BJ018", "LP044", "LP006", "BO014", "BJ035",
        "BJ155", "TLL003", "BJ073", "BJ068", "BJ049", "TLL061", "NL010", "AD019",
        "LP003", "BJ107", "BJ023", "BJ012", "TLL067", "LP020", "AD031", "BJ172",
        "NL031", "LP032", "AD016", "BJ077", "BJ047", "BJ001", "BJ105", "BJ062",
        "AD022", "BJ106", "BJ102", "BJ022", "BJ010", "TLL007", "AD011", "LP018",
        "TLL004", "TLL030", "BJ005", "AD003", "BJ025", "LP005", "BJ144", "BJ080",
        "TLL062", "BJ166", "LP014", "NL005", "TLL038", "BJ072", "AD032", "BO001",
        "BO024", "BO005", "AD004", "TLL006", "BJ063", "BJ007", "LP007", "BJ159",
        "NL056", "NL059", "BJ115", "NL037", "BJ003", "BJ117", "AD025", "BJ050",
        "LP029", "BJ149", "AD002", "AD010", "BJ160", "BJ147", "BO023", "NL055",
        "NL038", "BO004", "BJ123", "NL051", "NL011",
    ]

    df = pd.DataFrame(
        index=index,
        data={"column_1": ["value"] * len(index), "column_2": [1.0] * len(index)},
    )
    report = ProfileReport(df)
    description = report.get_description()
    # isinstance is the idiomatic type check and also accepts dict subclasses.
    assert isinstance(description, dict)
Beispiel #17
0
def test_issue351():
    """Variable "0" of a frame indexed by mixed values is reported as "Unsupported"."""
    frame = pd.DataFrame(["Jan", 1]).set_index(0)
    report = ProfileReport(frame, progress_bar=False)
    variable = report.get_description()["variables"]["0"]
    assert variable["type"] == "Unsupported"