Example #1
0
def render_generic(summary):
    """Render the report section for a variable of an unsupported type.

    Args:
        summary: mapping with the variable's statistics. The keys read here
            are "varid", "warnings", "varname", "n_missing", "p_missing",
            "memory_size" and "warn_fields".

    Returns:
        A dict with two entries: "top", a grid ``Container`` holding the
        variable info, the missing-values/memory table and an empty HTML
        item; and "bottom", which is always ``None``.
    """
    info = VariableInfo(
        anchor_id=summary["varid"],
        warnings=summary["warnings"],
        var_type="Unsupported",
        var_name=summary["varname"],
    )

    # "alert" is set when the field is listed in summary["warn_fields"].
    # NOTE(review): "fmt" is presumably the name of the formatter applied to
    # "value" at render time — confirm against the Table renderer.
    table = Table([
        {
            "name": "Missing",
            "value": summary["n_missing"],
            "fmt": "fmt",
            "alert": "n_missing" in summary["warn_fields"],
        },
        {
            "name": "Missing (%)",
            "value": summary["p_missing"],
            "fmt": "fmt_percent",
            "alert": "p_missing" in summary["warn_fields"],
        },
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        },
    ])

    return {
        "top": Container([info, table, HTML("")], sequence_type="grid"),
        "bottom": None,
    }
Example #2
0
def render_generic(config: Settings, summary: dict) -> dict:
    """Build the report layout for a variable whose type is unsupported.

    Args:
        config: report settings; not read by this renderer.
        summary: the variable's summary. The keys read here are "varid",
            "alerts", "varname", "description", "n_missing", "p_missing",
            "memory_size" and "alert_fields".

    Returns:
        A dict with "top" (a grid ``Container`` of the variable info, the
        statistics table and an empty HTML item) and "bottom" (always
        ``None``).
    """
    header = VariableInfo(
        anchor_id=summary["varid"],
        alerts=summary["alerts"],
        var_type="Unsupported",
        var_name=summary["varname"],
        description=summary["description"],
    )

    # Values are formatted here; a row is flagged when its field is listed
    # in summary["alert_fields"].
    missing_count_row = dict(
        name="Missing",
        value=fmt(summary["n_missing"]),
        alert="n_missing" in summary["alert_fields"],
    )
    missing_share_row = dict(
        name="Missing (%)",
        value=fmt_percent(summary["p_missing"]),
        alert="p_missing" in summary["alert_fields"],
    )
    memory_row = dict(
        name="Memory size",
        value=fmt_bytesize(summary["memory_size"]),
        alert=False,
    )
    stats = Table([missing_count_row, missing_share_row, memory_row])

    top = Container([header, stats, HTML("")], sequence_type="grid")
    return {"top": top, "bottom": None}
def render_complex(summary):
    """Build the report layout for a complex-valued variable.

    Args:
        summary: the variable's summary. The keys read here are "varid",
            "varname", "warnings", "description", "n_distinct",
            "p_distinct", "n_missing", "p_missing", "memory_size", "mean",
            "min", "max", "n_zeros", "p_zeros" and "scatter_data".

    Returns:
        A dict with "top" (a grid ``Container`` of the variable info, two
        statistics tables and an empty HTML item) and "bottom" (a tabbed
        ``Container`` holding the scatterplot image).
    """
    varid = summary["varid"]
    image_format = config["plot"]["image_format"].get(str)

    # --- Top: variable header plus two statistics tables -----------------
    header = VariableInfo(
        varid,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
        summary["description"],
    )

    # Each spec is (row label, key in ``summary``, formatter name).
    count_specs = (
        ("Distinct", "n_distinct", "fmt"),
        ("Distinct (%)", "p_distinct", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    counts = Table(
        [
            {"name": label, "value": summary[key], "fmt": formatter}
            for label, key, formatter in count_specs
        ]
    )

    value_specs = (
        ("Mean", "mean", "fmt_numeric"),
        ("Minimum", "min", "fmt_numeric"),
        ("Maximum", "max", "fmt_numeric"),
        ("Zeros", "n_zeros", "fmt_numeric"),
        ("Zeros (%)", "p_zeros", "fmt_percent"),
    )
    values = Table(
        [
            {"name": label, "value": summary[key], "fmt": formatter}
            for label, key, formatter in value_specs
        ]
    )

    top = Container([header, counts, values, HTML("")], sequence_type="grid")

    # --- Bottom: a single tab with the scatterplot -----------------------
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Container([scatter], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
Example #4
0
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible correlations section of the report.

    Args:
        summary: report summary; its "correlations" entry maps a correlation
            key (e.g. "pearson") to the data drawn as a correlation matrix.

    Returns:
        A ``Collapse`` wrapping one tab per correlation plus a button that
        toggles the descriptions, or ``None`` when the item list is empty.

    Raises:
        KeyError: if ``summary["correlations"]`` contains a key that is not
            listed in ``key_to_data``.
    """
    # NOTE(review): get_items() is defined outside this block; it is assumed
    # to return a list-like object supporting append() and len() — confirm.
    items = get_items()

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. ")
    spearman_description = """The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic 
    correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than 
    Pearson's <em>r</em>. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 
    0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.<br /><br />To 
    calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank 
    variables of <em>X</em> and <em>Y</em> by the product of their standard deviations. """

    kendall_description = """Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation 
    coefficient (<em>τ</em>) measures ordinal association between two variables. It's value lies between -1 and +1, 
    -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation.
    <br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of 
    concordant and discordant pairs of observations. <em>τ</em> is given by the number of concordant pairs minus the 
    discordant pairs divided by the total number of pairs."""

    # Per correlation key: (vmin handed to the matrix plot, display name,
    # HTML description — empty when there is none).
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", ""),
        "cramers": (0, "Cramér's V (φc)", ""),
        "recoded": (0, "Recoded", ""),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        if description:
            # Pair the diagram with its explanatory text in a grid.
            desc = HTML(
                f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
                anchor_id=f"{key}_html",
                classes="correlation-description",
            )

            tbl = Sequence([diagram, desc],
                           anchor_id=key,
                           name=name,
                           sequence_type="grid")

            items.append(tbl)
        else:
            items.append(diagram)

    # Nothing to show: skip building the tab container altogether.
    if len(items) == 0:
        return None

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )

    return Collapse(name="Correlations",
                    anchor_id="correlations",
                    button=btn,
                    item=corr)
Example #5
0
def get_report_structure(summary: dict) -> Renderable:
    """Assemble the renderable structure of the profile report.

    Args:
      summary: Statistics used for the overview, variables, interactions,
        correlations, missing values, sample and duplicate rows sections.

    Returns:
      A ``Root`` renderable wrapping the report sections and the footer.
    """
    hide_progress = not config["progress_bar"].get(bool)
    with tqdm(total=1,
              desc="Generate report structure",
              disable=hide_progress) as progress:
        messages = summary["messages"]

        # The overview and variables sections are always present.
        overview = Container(
            get_dataset_items(summary, messages),
            sequence_type="tabs",
            name="Overview",
            anchor_id="overview",
        )
        variables = Container(
            render_variables_section(summary),
            sequence_type="accordion",
            name="Variables",
            anchor_id="variables",
        )
        blocks: List[Renderable] = [overview, variables]

        # Every remaining section is added only when it has content.
        interactions = get_scatter_matrix(summary["scatter"])
        if len(interactions) > 0:
            # Tabs for up to ten items, a select layout beyond that.
            layout = "tabs" if len(interactions) <= 10 else "select"
            blocks.append(
                Container(
                    interactions,
                    sequence_type=layout,
                    name="Interactions",
                    anchor_id="interactions",
                ))

        correlations = get_correlation_items(summary)
        if correlations is not None:
            blocks.append(correlations)

        missing = get_missing_items(summary)
        if len(missing) > 0:
            blocks.append(
                Container(
                    missing,
                    sequence_type="tabs",
                    name="Missing values",
                    anchor_id="missing",
                ))

        samples = get_sample_items(summary["sample"])
        if len(samples) > 0:
            blocks.append(
                Container(
                    items=samples,
                    sequence_type="list",
                    name="Sample",
                    anchor_id="sample",
                ))

        duplicates = get_duplicates_items(summary["duplicates"])
        if len(duplicates) > 0:
            blocks.append(
                Container(
                    items=duplicates,
                    sequence_type="list",
                    name="Duplicate rows",
                    anchor_id="duplicate",
                ))

        body = Container(blocks, name="Root", sequence_type="sections")
        progress.update()

    footer = HTML(
        content=
        'Report generated with <a href="https://github.com/pandas-profiling/pandas-profiling">pandas-profiling</a>.'
    )

    return Root("Root", body, footer)
Example #6
0
def get_correlation_items(summary) -> Optional[Renderable]:
    """Create the collapsible correlations section of the report.

    Args:
        summary: report summary; its "correlations" entry maps a correlation
            key (e.g. "pearson") to the data drawn as a correlation matrix.

    Returns:
        A ``Collapse`` wrapping one tab per correlation plus a button that
        toggles the descriptions, or ``None`` when there are no
        correlations to show.

    Raises:
        KeyError: if ``summary["correlations"]`` contains a key that is not
            listed in ``key_to_data``.
    """
    items: List[Renderable] = []

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. ")
    spearman_description = """The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic 
    correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than 
    Pearson's <em>r</em>. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 
    0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.<br /><br />To 
    calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank 
    variables of <em>X</em> and <em>Y</em> by the product of their standard deviations. """

    kendall_description = """Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation 
    coefficient (<em>τ</em>) measures ordinal association between two variables. It's value lies between -1 and +1, 
    -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation.
    <br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of 
    concordant and discordant pairs of observations. <em>τ</em> is given by the number of concordant pairs minus the 
    discordant pairs divided by the total number of pairs."""

    phi_k_description = """Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case
    of a bivariate normal input distribution. There is extensive documentation available <a href='https://phik.readthedocs.io/en/latest/index.html'>here</a>."""

    cramers_description = """Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association.
    The empirical estimators used for Cramér's V have been proved to be biased, even for large samples.
    We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found <a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."""

    # Per correlation key: (vmin handed to the matrix plot, display name,
    # HTML description).
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", phi_k_description),
        "cramers": (0, "Cramér's V (φc)", cramers_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        if description:
            # Pair the diagram with its explanatory text in a grid.
            desc = HTML(
                f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
                anchor_id=f"{key}_html",
                classes="correlation-description",
            )

            tbl = Container([diagram, desc],
                            anchor_id=key,
                            name=name,
                            sequence_type="grid")

            items.append(tbl)
        else:
            items.append(diagram)

    # Nothing to show: skip building the tab container altogether.
    if not items:
        return None

    corr = Container(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )

    return Collapse(name="Correlations",
                    anchor_id="correlations",
                    button=btn,
                    item=corr)
Example #7
0
def render_complex(summary):
    """Build the report layout for a complex-valued variable.

    Args:
        summary: the variable's summary. The keys read here are "varid",
            "varname", "warnings", "n_unique", "p_unique", "n_missing",
            "p_missing", "memory_size", "mean", "min", "max", "n_zeros",
            "p_zeros" and "scatter_data".

    Returns:
        A dict with "top" (a grid ``Sequence`` of the overview, two
        statistics tables and an empty HTML item) and "bottom" (a tabbed
        ``Sequence`` holding the scatterplot image).
    """
    # --- Top: overview plus two statistics tables ------------------------
    overview = Overview(
        summary["varid"],
        summary["varname"],
        "Complex number (&Copf;)",
        summary["warnings"],
    )

    # Each spec is (row label, key in ``summary``, formatter name).
    count_specs = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    counts = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in count_specs
    ])

    value_specs = (
        ("Mean", "mean", "fmt"),
        ("Minimum", "min", "fmt"),
        ("Maximum", "max", "fmt"),
        ("Zeros", "n_zeros", "fmt"),
        ("Zeros (%)", "p_zeros", "fmt_percent"),
    )
    values = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in value_specs
    ])

    top = Sequence([overview, counts, values, HTML("")],
                   sequence_type="grid")

    # --- Bottom: a single tab with the scatterplot -----------------------
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{summary['varid']}scatter",
    )
    bottom = Sequence([scatter], sequence_type="tabs", anchor_id=summary["varid"])

    return {"top": top, "bottom": bottom}