def test_dataset_schema():
    """Check that ``dataset`` metadata is rendered in the HTML report.

    Profiles the Stata ``auto2`` dataset in minimal mode with a metadata
    dictionary and asserts that the "Dataset" section, every plain metadata
    field, the combined copyright row, the URL row and the "Reproduction"
    section are all present in the generated HTML.
    """
    file_name = cache_file("auto2.dta", "http://www.stata-press.com/data/r15/auto2.dta")
    df = pd.read_stata(file_name)

    metadata = {
        "creator": "Firstname Lastname",
        "author": "Firstname Lastname",
        "description": "This profiling report was generated using a sample of 5% of the original dataset.",
        "copyright_holder": "RandoCorp LLC",
        "copyright_year": "2020",
        "url": "http://www.dataset-sources.com/data/dataset.dat",
    }

    report = df.profile_report(
        title="Dataset schema",
        dataset=metadata,
        minimal=True,
    )

    html = report.to_html()

    assert "<p class=h4>Dataset</p>" in html
    # The copyright_* fields and the URL are checked as dedicated rows below,
    # so only the remaining keys are expected as capitalised table headers.
    for key in metadata:
        if not key.startswith("copyright_") and key != "url":
            assert f"<th>{key.capitalize()}</th>" in html
    # These two checks previously asserted a bare non-empty string, which is
    # always truthy; they must test membership in the rendered HTML.
    assert "<tr><th>Copyright</th><td>(c) RandoCorp LLC 2020</td></tr>" in html
    assert '<tr><th>URL</th><td><a href="http://www.dataset-sources.com/data/dataset.dat">http://www.dataset-sources.com/data/dataset.dat</a></td></tr>' in html
    assert "<p class=h4>Reproduction</p>" in html
Ejemplo n.º 2
0
def test_titanic_default(benchmark):
    """Benchmark ``func`` on the Titanic dataset without keyword overrides."""
    titanic_path = cache_file(
        "titanic.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
    )
    titanic = pd.read_parquet(titanic_path)

    # No extra arguments are bound: func is benchmarked as-is.
    target = partial(func)
    benchmark(target, titanic)
Ejemplo n.º 3
0
def test_titanic_explorative(benchmark):
    """Benchmark ``func`` on the Titanic dataset with ``explorative=True``."""
    titanic_path = cache_file(
        "titanic.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
    )
    titanic = pd.read_parquet(titanic_path)

    # Bind the keyword up front so the benchmark only supplies the data.
    target = partial(func, explorative=True)
    benchmark(target, titanic)
Ejemplo n.º 4
0
def test_rdw_minimal(benchmark):
    """Benchmark ``func`` on the 100k-row RDW sample with ``minimal=True``."""
    rdw_path = cache_file(
        "rdw_sample_100k.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/rdw_sample_100k.parquet",
    )
    rdw = pd.read_parquet(rdw_path)

    # Bind the keyword up front so the benchmark only supplies the data.
    target = partial(func, minimal=True)
    benchmark(target, rdw)
Ejemplo n.º 5
0
def test_titanic_minimal(benchmark):
    """Benchmark ``func`` on the Titanic dataset with ``minimal=True``."""
    titanic_path = cache_file(
        "titanic.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
    )
    titanic = pd.read_parquet(titanic_path)

    # Bind the keyword up front so the benchmark only supplies the data.
    target = partial(func, minimal=True)
    benchmark(target, titanic)
def test_dataset_schema_empty():
    """Check that no "Dataset" section is rendered when ``dataset`` is None.

    The "Reproduction" section must still be present in the HTML output.
    """
    stata_path = cache_file("auto2.dta", "http://www.stata-press.com/data/r15/auto2.dta")
    cars = pd.read_stata(stata_path)

    profile = cars.profile_report(
        title="Dataset schema empty",
        minimal=True,
        dataset=None,
    )
    rendered = profile.to_html()

    assert "<p class=h4>Dataset</p>" not in rendered
    assert "<div class=col-sm-12><p class=h4>Reproduction</p>" in rendered
Ejemplo n.º 7
0
def test_issue416():
    """Check the JSON report for a column holding URL paths.

    Derives a ``path`` column by stripping the host prefix from ``url`` and
    asserts that the JSON output counts exactly one ``PATH`` variable and
    reports ``/`` as the common prefix.
    """
    file_name = cache_file(
        "products.tsv",
        "https://raw.githubusercontent.com/mrichman/clickstream-pandas/master/products.tsv",
    )

    df = pd.read_csv(file_name, sep="\t")
    # regex=False: remove the literal host prefix. The default for `regex`
    # differs between pandas major versions, and when interpreted as a regular
    # expression the dots would match any character.
    df["path"] = df["url"].str.replace("http://www.acme.com", "", regex=False)

    profile = pandas_profiling.ProfileReport(
        df, title="Pandas Profiling Report", html={"style": {"full_width": True}}
    )
    data = profile.to_json()
    assert '"PATH": 1' in data
    assert '"common_prefix": "/",' in data
def test_issue377():
    """Check that the report keeps the DataFrame's column order with ``sort="None"``."""
    csv_path = cache_file(
        "bank-full.csv",
        "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv",
    )

    # UCI Bank Marketing Dataset (semicolon-separated)
    bank = pd.read_csv(csv_path, sep=";")
    expected_columns = tuple(bank.columns.values)

    report = pandas_profiling.ProfileReport(
        bank, sort="None", pool_size=5, progress_bar=False
    )
    described_variables = report.get_description()["variables"]
    reported_columns = tuple(described_variables.keys())

    assert expected_columns == reported_columns
Ejemplo n.º 9
0
def test_issue416():
    """Check the type counts and common prefix for a column of URL paths.

    Derives a ``path`` column by stripping the host prefix from ``url``,
    profiles the frame in explorative mode and asserts that the description
    counts one Categorical, one Path and one URL variable, and that the
    ``path`` variable's common prefix is ``/``.
    """
    file_name = cache_file(
        "products.tsv",
        "https://raw.githubusercontent.com/mrichman/clickstream-pandas/master/products.tsv",
    )

    df = pd.read_csv(file_name, sep="\t")
    # regex=False: remove the literal host prefix. The default for `regex`
    # differs between pandas major versions, and when interpreted as a regular
    # expression the dots would match any character.
    df["path"] = df["url"].str.replace("http://www.acme.com", "", regex=False)

    profile = pandas_profiling.ProfileReport(
        df,
        title="Pandas Profiling Report",
        html={"style": {"full_width": True}},
        explorative=True,
    )
    data = profile.get_description()

    assert data["table"]["types"][Categorical] == 1
    assert data["table"]["types"][Path] == 1
    assert data["table"]["types"][URL] == 1
    assert data["variables"]["path"]["common_prefix"] == "/"
Ejemplo n.º 10
0
#%%
from pathlib import Path

import requests
import numpy as np
import pandas as pd

import pandas_profiling
from pandas_profiling.utils.cache import cache_file

# %%
# Resolve the local path of the Apple mobility trends CSV and load it.
csv_path = cache_file(
    "apple.csv",
    "https://raw.githubusercontent.com/anarinsk/adp-apple_mobility_trend/master/data/applemobilitytrends-2020-04-23.csv",
)
df = pd.read_csv(csv_path)

# %%
# Prefix every column from the fourth onwards with "time_" so that
# wide_to_long can use "time" as the stub name.
value_columns = df.columns[3:]
df = df.rename(columns={name: "time_" + name for name in value_columns})

# Reshape from wide to long, indexed by the three descriptive columns plus the
# suffix taken from the renamed column names, then flatten the index again.
df = pd.wide_to_long(
    df,
    stubnames="time",
    i=["geo_type", "region", "transportation_type"],
    j="date",
    sep="_",
    suffix=".*",
).reset_index()

# Convert the extracted suffix to datetimes (expects YYYY-MM-DD).
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df
Ejemplo n.º 11
0
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the path of "coal_prices.csv" through cache_file; the source is a
    # CSV export from fred.stlouisfed.org for series id PCOALAUUSDM.
    file_name = cache_file(
        "coal_prices.csv",
        r"https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=PCOALAUUSDM&scale=left&cosd=1990-01-01&coed=2020-01-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date=2020-02-12&revision_date=2020-02-12&nd=1990-01-01",
    )

    # Load the CSV, parsing the "DATE" column as datetimes.
    df = pd.read_csv(file_name, parse_dates=["DATE"])

    report = ProfileReport(
        df,
        title="Coal Prices (IMF)",
        config_file=Path("../../src/pandas_profiling/config_dark.yaml"),
        html={
            "style": {
                "logo":
                "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAAArCAYAAADIWo5HAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JQAAgIMAAPn/AACA6QAAdTAAAOpgAAA6mAAAF2+SX8VGAAAABmJLR0QAAAAAAAD5Q7t/AAAACXBIWXMAAAsTAAALEwEAmpwYAAAAB3RJTUUH5AIMFC0AShJOrwAACuVJREFUaN7tmWnMpXV5xn//7dnec867DTMvzsAww2xUsYVCYyEYbVUgBSPSRm1iE/nQJk0a28TYxdgmGhJbSRdJl9TGllZR0ZombZq0SRdRC12cqGDJIINI4J31Xc+z/7d+eA6TpuknGHghmevDOck5eZ7nPte5l+u6/3AJl3AJl3AJLz9+5SO/rO69/zfFqy0u/Uo9aG11/W1bZ7fOPPL8E9954BN/MLe9vaXOnDp/TaL1DWVVduPxeCMfzf39lz//UCWE4Lqbb1Rr61vi2SeedC9nXC/bP/LJP/09uWdu+eC/P/rogaqs31KV5Xud8/HcuXP/BnFvhMz7cCCGcHlZljHL0q4o5v7x4NUHzj/1vadCmuflZZfteqqs6y+/+923n3v00f/Sn/7jz/SvCQLe9YGfkbkz781M+rHnnntut7NuHCOUZYn3Hu8dRhuEECilmE5LlpaX6Pse7z1CwFwxR5Im9frGxmMHDuz/j/F4FDtr733wrx48+6ol4Pbb3ibfdNPN4/88/q3DW+vrD9iu/yEQjMcjyqrCO08IAR88RZ4TQkApSQygtGF7ewshBCYZyAk+4LxHK8ni0kKYzE8+eNWxI39038c/ES5WzOpiEnDNtW849vTJ7//l2dNnfr6clocn4zExBJaWltjenkIMIARJmpAXBc55qqomz3O6riNNUwCccwM5UhJ8QClFCFGUZXV9My33Hzp8aP/eq/Z3z5x8+uyrhoD7/vwP5ZlnVj985vTZn9VSLmqt8TFie0td1zRNQ1NXCCFACKSQeO9BQJKmlFVF13UorUiThPFoPCSoEKRZSt/1tG03cs79WAi+S5Lkn3/6595z/hv/8rVXBwGpTN66ubb+8XJ7OlJKoaSknJZorZFSoJUiAtYHpFJ451lYWEApRdPU9H2HFKCMYXlpiaosiUDTNPR9T9t1+BCIIcY0zeyhq6++9vzZc4889u3HtnecgLvf/x65dvrcR6WQN2utqKqKpmkpioIYA0JInHMoozFGI6WkKHJCjDjnEAKSJEEpjVKaqqpxzuG8J4ZIJOKdwyQJ3nsRvF/J8/yo0ebsgaNHv3niu//tX2zs8mIQ4DprXG9HbdMyLStCCCRJQp7nF+raGENqEsajEVII+q6na1vqukZIiQ8R7wPBe+p6uEfwnjQbrplMxgTvEcC0LFl9flWfPnX67q5v515K7BeFgL7qbnHO39J2LVmWoo0hyzKEgDzPgUjbtvR9j3OeECO9tSilGY1GECGEQIwREBTFHEopjEmQQiKVZmFhkSRJyLKcIh8aaFlWC0uThcmOE9D1/ZEQwh4pFUpr8rwgztJ7c3OLEOMgO7Wmt5aiKAgxYp0lEsnynKIocN7TtA0hREKMhBjY2NgkBM90OsU5T17k5HnO9nSKEGK36+2Re37xF+SOEXD7XXekaaL7ELy3fY/rLQJIjGFxYYEQAzEEjNGkaYoxhr7rSZIErTREmL0wHo1QSiGEQAhB0zQkiaG3lo3NLaSUVGWJsxbvHWVZzq+urn5oe3s6vyNe4Efe8iaTm/QjvevuiTEqgBgjIXhC8Kytr5MYQ4wghKCqK5RUSKVItKbtOrTWlFVFmqQkSULTNOzdu4yzPYkxTMuSqiwRArquwxhDMVegpcQ5J43Sdjw3bl5xAq561w28fnLonVvrG79UltWCVhovPMaYC/OdGFGzkSelpG0b+ghLi4tkeYZSkq7vUVLRti2RyGQywdqe6fY2SimUkoxHI+q6xmiNd
XYohxAQAlZPnbq8izEH2leUgDTN6a3d3tzcdINIaUCIYdwphZRySOUQaJuWNE3IsgzrPF3X0dkeIQRd2yGVQoihEWZZTts2aK3RxqCDJsZABHwIOOfprWU8GrO5tUmWZ8uL86PLgI1XtAec+OLXqKryuTRNN5IkQUo1zOwYEULgrKNtWuJM7xdFPvsu0tke2/c461haWiLPsmFsZhlKCrRUxBgpy3ImgnqIkel0CgjSJKG3lizLuGLfvt1pYg7vSBPMsvTJYjL+rbqp2xAH7S4FGG2QSpJmCc47vPf0bT+4QedJkwQAa3vWN9apqgopJSZJmE63ccFhTELwHpMkaKlwzpNlGXmezRpkzfxkwhX79mVXXXnlNfp1r3vlCfjKQ1/xSsnpZDLv4szlCSmp6oq6qgapKQfl52NACIGUkrquCT4QQqBrW3rbY61la3MT7wNN3eCcI01TnLWEGNB6yLAXLLTWQ2nUdfPd6XQ6uevtb013ZAzu3rXr2N59+0ZplmGMQSuDVoo0TQcRIwXGJCilSAYpi0AQYsBZhxRDCG3bXHCBaZaBiEg1GCatNUoqTDLcx/vBVpdVtfX9Z3/wYSfD76hE2x0h4Nz5839TVuUj4/FkaGp9N0tng9Z6FqzHOYd3fsgCJQdZKwe3F0NADmMN5yzW9jR1c0EhVlVNkqaE4IkhYK0lMYbEJGmeZ7caJe0XPv1A2BECDt3whqerqvrO2vnzxDg0wQjEMDM6sx+plKJ3Fuc9Pnh8CHjvkWKQvkJIvHd03dAcEaC1IkvTIdWbCmM0bdfivaeuG9q+X02z/F+LYi7siBACeP7kSaQQnTaaGBkcYJjVcfBIMUyAth3GtLMW7wQCgdaatm2p6hqIaG1YXJin63u881RVRfCB0Xg8W5UJrHXkeY73gYWFxY3DR4989fc/+bv2xcb/ku3wY8cfiz/+5pvWBbx5e2tr2VorYoyoWf0O8z3OmtZQ18O4hDxLEUKitCKEgBRyVkY9XddjtMaHoVk67+naFpMks/soLr9897LtO7334P5HTp743s70AIADRw4cT7PsU0op0XUdXd8jtWI8Gg27veCxM/3e9z3GaLRWWOtmG6CUNM1wztFbi5QSKQRSKZaXl4elymyTlKXpIJjSlLW19eb551e/1QeaHcsAgIf/6avhjW+8VnVdf6NUakXMPIGQkr7v6a29IG6882ht6LpusNJdj5By2H69cJ0Qg5v0ntFohPeeLEupqxoYRmkxl/9AGvOrpzbX//r4w9940SVwUTIA4OpDhzaOXXPU7L5sF0oqyrIihllvmnVzYwxFUeC9H8ZZCEitcN7hrMUkyWx7NHgHJSUbGxuUVYmdZUZR5Bw8dPDsaH7+12+56cbPPvXtx9uXEvdFOxmKxPUQwp8VRf6+NM/2pbZfcd4LBEgpUFoxHo+YTsshxaVEKTXTAN3svSXEyGhujhACTdOCANdbNu0W4/HY7lnZ88DiwuTrB19/7Esf/dBvvOT1+EU/GLnjzluv955JVdV3ee9vapr2qFZ6vL62xng8wvtA3/VY5/DeDU1wZoaEkLRdy6iYm2VMArOtUl7krKysPB6VeMdDD37h1Kv2bPDv/vYfjgO8485bH+/qbnnvrstWjFQ/UTf1fiHlbUWa7bHWDvu+NGVpcYnNrS0Ew2TQStF13aAagyfPMubnJxw4cPCRw0cP/drJZ58585o4G/y/+Mk7b5sUUr/vyn1XfOzEE0/OlVVZOOvE/PykbZpGN03rQoyn2rbZmyapHY3Hm3mR73XOrY7Go2/OTSb3PfTg5x9+zRyO/n94+0/dtrB3ZWXf2dPnlibj0R3b29tXBsQDWsocgcjz/MTa+bXfbtt247ofvf4vrv3h6w6XdfP1aV+dWDfb9v4P3htf0wT8b7z/ng+M+r7XX/zs5zZf+OzOu9+pluaXD+ej0XN/8qn7Sy7hEi7hEi7h5cX/AJEc/aDrj11yAAAAJXRFWHRkYXRlOmNyZWF0Z
QAyMDIwLTAyLTEyVDIwOjQ1OjAwKzAwOjAwz7DySAAAACV0RVh0ZGF0ZTptb2RpZnkAMjAyMC0wMi0xMlQyMDo0NTowMCswMDowML7tSvQAAAAASUVORK5CYII="
            }
        },
    )
    report.to_file(Path("flatly_report.html"))
Ejemplo n.º 12
0
from pathlib import Path

import numpy as np
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    file_name = cache_file(
        "census_train.csv",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    )

    # Names based on https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
    df = pd.read_csv(
        file_name,
        header=None,
        index_col=False,
        names=[
            "age",
            "workclass",
            "fnlwgt",
            "education",
            "education-num",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital-gain",
import great_expectations as ge
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

# Resolve the path of the Titanic CSV through cache_file and load it.
file_name = cache_file(
    "titanic.csv",
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
)

df = pd.read_csv(file_name)

# Build the profile once; every example below derives its suite from it.
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)

# Example 1
# Obtain expectation suite, this includes profiling the dataset, saving the expectation suite, validating the
# dataframe, and building data docs
suite = profile.to_expectation_suite(suite_name="titanic_expectations")

# Example 2
# Run Great Expectations while specifying the directory with an existing Great Expectations set-up by passing in a
# Data Context
data_context = ge.data_context.DataContext(
    context_root_dir="my_ge_root_directory/")

# Rebinds `suite`: the result of Example 1 is no longer referenced afterwards.
suite = profile.to_expectation_suite(suite_name="titanic_expectations",
                                     data_context=data_context)

# Example 3
# Just build the suite
Ejemplo n.º 14
0
# As featured on this Google Cloud Platform page:
# https://cloud.google.com/solutions/building-a-propensity-model-for-financial-services-on-gcp
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the path of the UCI Bank Marketing Dataset through cache_file.
    csv_path = cache_file(
        "bank-full.csv",
        "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv",
    )

    # The file uses ";" as its field separator.
    bank = pd.read_csv(csv_path, sep=";")

    report = ProfileReport(
        bank,
        title="Profile Report of the UCI Bank Marketing Dataset",
    )
    output_path = Path("uci_bank_marketing_report.html")
    report.to_file(output_path)
Ejemplo n.º 15
0
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the path of the Chicago employees CSV through cache_file.
    csv_path = cache_file(
        "chicago_employees.csv",
        "https://data.cityofchicago.org/api/views/xzkq-xp2w/rows.csv?accessType=DOWNLOAD",
    )
    employees = pd.read_csv(csv_path)

    # Profile with default settings and write the HTML report.
    report = ProfileReport(employees, title="Chicago Employees")
    destination = Path("./chicago_employees_report.html")
    report.to_file(output_file=destination)
Ejemplo n.º 16
0
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the path of the RDW parquet file through cache_file.
    parquet_path = cache_file(
        "rdw.parquet",
        "https://raw.githubusercontent.com/pandas-profiling/pandas-profiling-data/master/data/rdw.parquet",
    )
    rdw = pd.read_parquet(parquet_path)

    # Profile in minimal mode and write the HTML report.
    report = ProfileReport(rdw, title="RDW Dataset", minimal=True)
    report.to_file("rdw.html")
Ejemplo n.º 17
0
from pathlib import Path

import numpy as np
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the path of the NASA meteorite landings CSV through cache_file.
    file_name = cache_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # Set a seed for reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)
    # Note: pandas' nanosecond-resolution timestamps cannot represent dates
    # before 1677-09-21; errors="coerce" turns values that cannot be converted
    # into NaT, so those years are ignored for this analysis.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    # NOTE(review): NumPy converts [1, "A"] to a string array before sampling,
    # so the column holds "1" and "A" (both str) — confirm whether a genuinely
    # mixed int/str column was intended.
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Unhashable
Ejemplo n.º 18
0
 def getter(file_name, url):
     """Copy the cached file into the temporary directory and return the copy's path.

     Args:
         file_name: Name passed to ``cache_file`` and used for the copy.
         url: Source URL passed to ``cache_file``.

     Returns:
         The path of the copy inside ``tmpdir``, as a string.
     """
     cached = cache_file(file_name, url)
     destination = Path(str(tmpdir)) / file_name
     # Copy rather than move: the file returned by cache_file stays in place.
     shutil.copy(str(cached), str(destination))
     return str(destination)
Ejemplo n.º 19
0
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the path of the color names CSV through cache_file.
    csv_path = cache_file(
        "colors.csv",
        "https://github.com/codebrainz/color-names/raw/master/output/colors.csv",
    )

    # Column names are supplied explicitly via `names`.
    column_names = ["Code", "Name", "Hex", "R", "G", "B"]
    colors = pd.read_csv(csv_path, names=column_names)

    ProfileReport(colors, title="Colors").to_file("colors_report.html")
Ejemplo n.º 20
0
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the path of the gzipped review dump through cache_file.
    archive_path = cache_file(
        "reviews_Musical_Instruments_5.json.gz",
        r"http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz",
    )

    # Read as gzip-compressed, line-delimited JSON.
    reviews = pd.read_json(archive_path, compression="gzip", lines=True)

    report = ProfileReport(
        reviews,
        title="Amazon Musical Instrument Review | Profile Report",
    )
    destination = Path("./review_report.html")
    report.to_file(destination)
Ejemplo n.º 21
0
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the path of the Vektis 2017 postcode CSV through cache_file.
    csv_path = cache_file(
        "vektis_postcodes.csv",
        "https://www.vektis.nl/uploads/Docs%20per%20pagina/Open%20Data%20Bestanden/2017/Vektis%20Open%20Databestand%20Zorgverzekeringswet%202017%20-%20postcode3.csv",
    )
    postcodes = pd.read_csv(csv_path, sep=";", low_memory=False)

    # Switch off calculation of these four correlation measures; each entry
    # gets its own settings dict.
    skipped = ("recoded", "kendall", "phi_k", "cramers")
    correlation_settings = {name: {"calculate": False} for name in skipped}
    plot_settings = {"histogram": {"bayesian_blocks_bins": False}}

    profile = ProfileReport(
        postcodes,
        title="Vektis Postal Code 2017",
        correlations=correlation_settings,
        plot=plot_settings,
    )
    profile.to_file("vektis_report.html", True)
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Dataset suggested by @adamrossnelson
    stata_path = cache_file(
        "auto2.dta",
        "http://www.stata-press.com/data/r15/auto2.dta",
    )
    cars = pd.read_stata(stata_path)

    # Length left out due to correlation with weight.
    profile = ProfileReport(
        cars,
        title="1978 Automobile dataset",
        explorative=True,
    )
    destination = Path("stata_auto_report.html")
    profile.to_file(destination)
Ejemplo n.º 23
0
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the path of the URL test list CSV through cache_file.
    csv_path = cache_file(
        "websites.csv",
        "https://raw.githubusercontent.com/berkmancenter/url-lists/master/lists/et.csv",
    )

    # Parse "date_added" as datetimes while loading.
    websites = pd.read_csv(csv_path, parse_dates=["date_added"])

    # Only the "cramers" correlation is switched off for this report.
    correlation_settings = {"cramers": {"calculate": False}}
    report = ProfileReport(
        websites,
        title="Website Inaccessibility Test Lists",
        correlations=correlation_settings,
    )
    destination = Path("./website_inaccessibility_report.html")
    report.to_file(destination)