Ejemplo n.º 1
0
def get_report_structure(date_start: datetime, date_end: datetime,
                         sample: dict, summary: dict) -> Renderable:
    """Assemble the top-level report layout from a summary and a sample.

    Args:
      date_start: Forwarded to the overview ``Dataset`` as ``date_start``.
      date_end: Forwarded to the overview ``Dataset`` as ``date_end``.
      sample: A dict containing the samples to print.
      summary: Statistics to use for the overview, variables, correlations
        and missing values. This function itself reads the keys
        ``package``, ``table``, ``messages`` and ``variables``; the section
        helpers it calls receive the whole dict.

    Returns:
      A ``Sequence`` named "Report" holding one child per report section,
      in the order overview, variables, correlations, missing values, sample.
    """
    overview = Dataset(
        package=summary["package"],
        date_start=date_start,
        date_end=date_end,
        values=summary["table"],
        messages=summary["messages"],
        variables=summary["variables"],
        name="Overview",
        anchor_id="overview",
    )
    variables = Sequence(
        render_variables_section(summary),
        sequence_type="accordion",
        name="Variables",
        anchor_id="variables",
    )
    correlations = Sequence(
        get_correlation_items(summary),
        sequence_type="tabs",
        name="Correlations",
        anchor_id="correlations",
    )
    missing = Sequence(
        get_missing_items(summary),
        sequence_type="tabs",
        name="Missing values",
        anchor_id="missing",
    )
    samples = Sequence(
        get_sample_items(sample),
        sequence_type="list",
        name="Sample",
        anchor_id="sample",
    )

    return Sequence(
        [overview, variables, correlations, missing, samples],
        name="Report",
        sequence_type="sections",
    )
Ejemplo n.º 2
0
def get_scatter_matrix(scatter_matrix):
    """Build one tab group of scatter plots per x-column.

    Args:
        scatter_matrix: Mapping of x-column name to a mapping of y-column
            name to the plot handed to ``Image``.

    Returns:
        A list with one "tabs" ``Sequence`` per x-column; each contains an
        ``Image`` for every y-column listed under that x-column.
    """
    image_format = config["plot"]["image_format"].get(str)

    def tabs_for(x_col, y_cols):
        # One image per (x, y) pair, grouped under the x-column's tab set.
        plots = [
            Image(
                splot,
                image_format=image_format,
                alt=f"{x_col} x {y_col}",
                anchor_id=f"interactions_{x_col}_{y_col}",
                name=y_col,
            )
            for y_col, splot in y_cols.items()
        ]
        return Sequence(
            plots,
            sequence_type="tabs",
            name=x_col,
            anchor_id=f"interactions_{x_col}",
        )

    return [tabs_for(x_col, y_cols) for x_col, y_cols in scatter_matrix.items()]
Ejemplo n.º 3
0
def render_generic(summary):
    """Render the fallback view for a variable shown as "Unsupported".

    Args:
        summary: Per-variable summary dict. This function reads the keys
            ``varid``, ``varname``, ``warnings``, ``warn_fields``,
            ``n_missing``, ``p_missing`` and ``memory_size``.

    Returns:
        A dict with three entries: ``"top"`` is a grid ``Sequence`` holding
        the overview, the missing/memory table and an empty ``HTML`` cell;
        ``"bottom"`` is ``None``; ``"ignore"`` is the string ``"ignore"``.
    """
    info = Overview(
        anchor_id=summary["varid"],
        warnings=summary["warnings"],
        var_type="Unsupported",
        var_name=summary["varname"],
    )

    table = Table([
        {
            "name": "Missing",
            "value": summary["n_missing"],
            "fmt": "fmt",
            # Highlight the row when the field is listed in warn_fields.
            "class": "alert" if "n_missing" in summary["warn_fields"] else "",
        },
        {
            "name": "Missing (%)",
            "value": summary["p_missing"],
            "fmt": "fmt_percent",
            "class": "alert" if "p_missing" in summary["warn_fields"] else "",
        },
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
        },
    ])

    return {
        # The empty HTML element fills the third grid cell.
        "top": Sequence([info, table, HTML("")], sequence_type="grid"),
        "bottom": None,
        "ignore": "ignore",
    }
Ejemplo n.º 4
0
def render_path_image(summary):
    """Extend the ``render_path`` output with image-specific elements.

    Args:
        summary: Per-variable summary dict. This function reads ``n``,
            ``varid``, ``scatter_data``, ``image_shape_counts`` and
            ``exif_keys_counts``; ``render_path`` receives the whole dict.

    Returns:
        The dict returned by ``render_path``, with the first "top" item
        relabelled as "Image Path", two ``freqtable_*`` entries added, and
        an "Exif keys" table plus an "Image shape" tab group appended to the
        "bottom" items.
    """
    n_freq_table_max = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_path(summary)

    # Top: relabel the variable type on the first item.
    first_top_item = template_variables["top"].content["items"][0]
    first_top_item.content["var_type"] = "Image Path"

    # Bottom: frequency tables for the image-specific counts.
    for key in ("image_shape", "exif_keys"):
        template_variables["freqtable_{}".format(key)] = freq_table(
            freqtable=summary["{}_counts".format(key)],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    varid = summary["varid"]

    # TODO: add dropdown to switch to specific values
    exif_tab = FrequencyTable(
        template_variables["freqtable_exif_keys"],
        name="Exif keys",
        anchor_id="{varid}exif_frequency".format(varid=varid),
    )
    bottom_items = template_variables["bottom"].content["items"]
    bottom_items.append(exif_tab)

    shape_frequency = FrequencyTable(
        template_variables["freqtable_image_shape"],
        name="Frequency",
        anchor_id="{varid}image_shape_frequency".format(varid=varid),
    )
    shape_scatter = Image(
        scatter_series(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot of image sizes",
        caption="Scatterplot of image sizes",
        name="Scatter",
        anchor_id="{varid}scatter".format(varid=varid),
    )
    shape_tabs = Sequence(
        [shape_frequency, shape_scatter],
        sequence_type="tabs",
        name="Image shape",
        anchor_id="{varid}image_shape".format(varid=varid),
    )
    bottom_items.append(shape_tabs)

    return template_variables
Ejemplo n.º 5
0
def get_report_structure(date_start: datetime, date_end: datetime,
                         sample: dict, summary: dict) -> Renderable:
    """Assemble the top-level report layout from a summary and a sample.

    Args:
      date_start: Forwarded to ``get_dataset_items``.
      date_end: Forwarded to ``get_dataset_items``.
      sample: A dict containing the samples to print.
      summary: Statistics to use for the overview, variables, interactions,
        correlations and missing values. This function itself reads the
        keys ``messages`` and ``scatter``.

    Returns:
      A ``Sequence`` named "Report" containing the items returned by
      ``get_section_items`` followed by the sections added here. The
      correlations section is only included when ``get_correlation_items``
      returns something other than ``None``.
    """
    messages = summary["messages"]
    section_items = get_section_items()

    def add_section(children, sequence_type, name, anchor_id):
        # Wrap the children in a named Sequence and register it as a section.
        section_items.append(
            Sequence(
                children,
                sequence_type=sequence_type,
                name=name,
                anchor_id=anchor_id,
            ))

    add_section(
        get_dataset_items(summary, date_start, date_end, messages),
        "tabs", "Overview", "overview")
    add_section(
        render_variables_section(summary),
        "accordion", "Variables", "variables")
    add_section(
        get_scatter_matrix(summary["scatter"]),
        "tabs", "Interactions", "interactions")

    # The correlation helper returns an already-wrapped element, or None.
    correlations = get_correlation_items(summary)
    if correlations is not None:
        section_items.append(correlations)

    add_section(
        get_missing_items(summary),
        "tabs", "Missing values", "missing")
    add_section(
        get_sample_items(sample),
        "list", "Sample", "sample")

    return Sequence(section_items, name="Report", sequence_type="sections")
Ejemplo n.º 6
0
def get_dataset_overview(summary):
    """Build the dataset overview grid: table statistics and variable types.

    Args:
        summary: Summary dict; only ``summary["table"]`` is read. It must
            provide ``n``, ``n_var``, ``n_cells_missing``,
            ``p_cells_missing``, ``n_duplicates``, ``p_duplicates`` and a
            ``types`` mapping of type name to count.

    Returns:
        A grid ``Sequence`` named "Overview" containing the
        "Table statistics" and "Variable types" tables.
    """
    table_stats = summary["table"]

    # (label, key in summary["table"], formatter name)
    stat_rows = (
        ("Total Number of Records", "n", "fmt_numeric"),
        ("Total Number of Columns", "n_var", "fmt_numeric"),
        ("Missing row cells", "n_cells_missing", "fmt_numeric"),
        ("Missing row cells (%)", "p_cells_missing", "fmt_percent"),
        ("Duplicate rows", "n_duplicates", "fmt_numeric"),
        ("Duplicate rows (%)", "p_duplicates", "fmt_percent"),
    )
    dataset_info = Table(
        [
            {"name": label, "value": table_stats[key], "fmt": fmt}
            for label, key, fmt in stat_rows
        ],
        name="Table statistics",
    )

    type_rows = [
        {"name": type_name, "value": count, "fmt": "fmt_numeric"}
        for type_name, count in table_stats["types"].items()
    ]
    dataset_types = Table(type_rows, name="Variable types")

    return Sequence(
        [dataset_info, dataset_types],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
Ejemplo n.º 7
0
def render_url(summary):
    """Build the "top" and "bottom" report elements for a URL variable.

    Args:
        summary: Per-variable summary dict. This function reads
            ``value_counts``, ``n``, ``varid``, ``varname``, ``warnings``,
            ``n_unique``, ``p_unique``, ``n_missing``, ``p_missing``,
            ``memory_size`` and one ``<part>_counts`` entry for each of
            scheme, netloc, path, query and fragment.

    Returns:
        The dict returned by ``render_common``, extended with one
        ``freqtable_<part>`` entry per URL part, a tabbed ``"bottom"``
        sequence (full table plus one table per URL part) and a grid
        ``"top"`` sequence (overview, statistics table, small frequency
        table).
    """
    n_freq_table_max = config["n_freq_table_max"].get(int)
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)

    # TODO: merge with boolean/categorical
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=n_obs_cat,
    )
    template_variables = render_common(summary)

    # (URL component, tab title)
    url_parts = (
        ("scheme", "Scheme"),
        ("netloc", "Netloc"),
        ("path", "Path"),
        ("query", "Query"),
        ("fragment", "Fragment"),
    )
    for part, _ in url_parts:
        template_variables["freqtable_{}".format(part)] = freq_table(
            freqtable=summary["{}_counts".format(part)],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    varid = summary["varid"]

    # Bottom: the full-value table first, then one tab per URL component.
    tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Full",
            anchor_id="{varid}full_frequency".format(varid=varid),
        )
    ]
    for part, title in url_parts:
        tabs.append(
            FrequencyTable(
                template_variables["freqtable_{}".format(part)],
                name=title,
                anchor_id="{varid}{part}_frequency".format(varid=varid,
                                                           part=part),
            ))
    template_variables["bottom"] = Sequence(tabs, sequence_type="tabs")

    # Top: element composition
    info = Overview(varid, summary["varname"], "URL", summary["warnings"])

    # (label, key in summary, formatter name)
    stat_rows = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    table = Table([
        {"name": label, "value": summary[key], "fmt": fmt}
        for label, key, fmt in stat_rows
    ])

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm],
                                         sequence_type="grid")

    return template_variables
Ejemplo n.º 8
0
def render_categorical(summary):
    """Build the "top" and "bottom" report elements for a categorical variable.

    Args:
        summary: Per-variable summary dict. This function reads
            ``value_counts``, ``count``, ``varid``, ``varname``,
            ``warnings``, ``warn_fields``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing`` and ``memory_size``. When the
            ``check_composition`` setting is enabled it also reads
            ``composition``, ``max_length``, ``mean_length``,
            ``min_length`` and ``length``.

    Returns:
        The dict returned by ``render_common`` with ``"top"`` set to a grid
        (overview, statistics table, small frequency table) and ``"bottom"``
        set to a tabbed sequence (common values, plus composition and length
        tabs when ``check_composition`` is enabled).
    """
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    def alert_row(label, key, fmt):
        # Row whose CSS class is "alert" when the key is in warn_fields.
        return {
            "name": label,
            "value": summary[key],
            "fmt": fmt,
            "class": "alert" if key in summary["warn_fields"] else "",
        }

    def plain_row(label, value, fmt):
        return {"name": label, "value": value, "fmt": fmt}

    # Top
    varid = summary["varid"]
    info = Overview(varid, summary["varname"], "Categorical",
                    summary["warnings"])

    table = Table(
        [
            alert_row("Distinct count", "n_unique", "fmt"),
            alert_row("Unique (%)", "p_unique", "fmt_percent"),
            alert_row("Missing", "n_missing", "fmt"),
            alert_row("Missing (%)", "p_missing", "fmt_percent"),
            plain_row("Memory size", summary["memory_size"], "fmt_bytesize"),
        ]
    )

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id="{varid}common_values".format(varid=varid),
        )
    ]

    if config["vars"]["cat"]["check_composition"].get(bool):
        composition_rows = (
            ("Contains chars", "chars"),
            ("Contains digits", "digits"),
            ("Contains whitespace", "spaces"),
            ("Contains non-words", "non-words"),
        )
        composition = Table(
            [
                plain_row(label, summary["composition"][key], "fmt")
                for label, key in composition_rows
            ],
            name="Composition",
            anchor_id="{varid}composition".format(varid=varid),
        )

        length_rows = (
            ("Max length", "max_length"),
            ("Mean length", "mean_length"),
            ("Min length", "min_length"),
        )
        length_stats = Table(
            [
                plain_row(label, summary[key], "fmt_numeric")
                for label, key in length_rows
            ],
            name="Length",
            anchor_id="{varid}lengthstats".format(varid=varid),
        )

        items.append(
            Sequence(
                [composition, length_stats],
                anchor_id="{varid}tbl".format(varid=varid),
                name="Composition",
                sequence_type="grid",
            )
        )

        histogram_bins = 10

        # NOTE(review): the alt text says "Scatter" although the image is
        # produced by histogram() - confirm whether this is intentional.
        length_histogram = Image(
            histogram(summary["length"], summary, histogram_bins),
            alt="Scatter",
            name="Length",
            anchor_id="{varid}length".format(varid=varid),
        )
        items.append(length_histogram)

    template_variables["bottom"] = Sequence(
        items,
        sequence_type="tabs",
        anchor_id="{varid}bottom".format(varid=varid),
    )

    return template_variables
Ejemplo n.º 9
0
def get_correlation_items(summary) -> Optional[Renderable]:
    """Build the collapsible "Correlations" section, if there is anything to show.

    Args:
        summary: dict whose ``"correlations"`` entry maps a correlation key
            (``pearson``, ``spearman``, ``kendall``, ``phi_k`` or
            ``cramers``) to the data handed to ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` wrapping the tabbed correlation items together with a
        toggle button, or ``None`` when the item collection is empty.

    Raises:
        KeyError: If ``summary["correlations"]`` contains a key other than
            the five listed above.
    """
    items = get_items()

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. ")
    spearman_description = """The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic 
    correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than 
    Pearson's <em>r</em>. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 
    0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.<br /><br />To 
    calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank 
    variables of <em>X</em> and <em>Y</em> by the product of their standard deviations. """

    kendall_description = """Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation 
    coefficient (<em>τ</em>) measures ordinal association between two variables. It's value lies between -1 and +1, 
    -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation.
    <br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of 
    concordant and discordant pairs of observations. <em>τ</em> is given by the number of concordant pairs minus the 
    discordant pairs divided by the total number of pairs."""

    phi_k_description = """Phik (φk) is a new and practical correlation coefficient that works consistently between categorical, ordinal and interval variables, captures non-linear dependency and reverts to the Pearson correlation coefficient in case
    of a bivariate normal input distribution. There is extensive documentation available <a href='https://phik.readthedocs.io/en/latest/index.html'>here</a>."""

    cramers_description = """Cramér's V is an association measure for nominal random variables. The coefficient ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect association.
    The empirical estimators used for Cramér's V have been proved to be biased, even for large samples.
    We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found <a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."""

    # correlation key -> (vmin passed to the plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", phi_k_description),
        "cramers": (0, "Cramér's V (φc)", cramers_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        # Without a description the diagram is shown on its own.
        if not description:
            items.append(diagram)
            continue

        desc = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )
        items.append(
            Sequence([diagram, desc],
                     anchor_id=key,
                     name=name,
                     sequence_type="grid"))

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    if len(items) == 0:
        return None

    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )
    return Collapse(name="Correlations",
                    anchor_id="correlations",
                    button=btn,
                    item=corr)
Ejemplo n.º 10
0
def render_categorical(summary):
    """Build the "top" and "bottom" report elements for a categorical variable.

    Args:
        summary: Per-variable summary dict. This function reads
            ``value_counts``, ``count``, ``varid``, ``varname``,
            ``warnings``, ``warn_fields``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing`` and ``memory_size``. When the
            ``check_composition`` setting is enabled it also reads
            ``max_length``, ``mean_length``, ``min_length``, ``length``,
            ``category_alias_values``, ``script_values`` and
            ``block_alias_values``.

    Returns:
        The dict returned by ``render_common`` with ``"top"`` set to a grid
        (overview, statistics table, small frequency table) and ``"bottom"``
        set to a tabbed sequence (common values, plus "Length" and
        "Characters" tabs when ``check_composition`` is enabled).
    """
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    def alert_row(label, key, fmt):
        # Row whose CSS class is "alert" when the key is in warn_fields.
        return {
            "name": label,
            "value": summary[key],
            "fmt": fmt,
            "class": "alert" if key in summary["warn_fields"] else "",
        }

    # Top
    varid = summary["varid"]
    info = Overview(varid, summary["varname"], "Categorical",
                    summary["warnings"])

    table = Table([
        alert_row("Distinct count", "n_unique", "fmt"),
        alert_row("Unique (%)", "p_unique", "fmt_percent"),
        alert_row("Missing", "n_missing", "fmt"),
        alert_row("Missing (%)", "p_missing", "fmt_percent"),
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
        },
    ])

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm],
                                         sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id="{varid}common_values".format(varid=varid),
        )
    ]

    if config["vars"]["cat"]["check_composition"].get(bool):
        length_rows = (
            ("Max length", "max_length"),
            ("Mean length", "mean_length"),
            ("Min length", "min_length"),
        )
        length_table = Table(
            [
                {"name": label, "value": summary[key], "fmt": "fmt_numeric"}
                for label, key in length_rows
            ],
            name="Length",
            anchor_id="{varid}lengthstats".format(varid=varid),
        )

        histogram_bins = 10

        length_histogram = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            alt="Scatter",
            name="Length",
            anchor_id="{varid}length".format(varid=varid),
        )

        items.append(
            Sequence(
                [length_histogram, length_table],
                anchor_id="{varid}tbl".format(varid=varid),
                name="Length",
                sequence_type="grid",
            ))

        n_freq_table_max = config["n_freq_table_max"].get(int)

        # (key in summary, tab title, anchor template)
        character_tabs = (
            ("category_alias_values", "Categories",
             "{varid}category_long_values"),
            ("script_values", "Scripts", "{varid}script_values"),
            ("block_alias_values", "Blocks", "{varid}block_alias_values"),
        )
        citems = []
        for key, title, anchor_template in character_tabs:
            # Count how often each value occurs and render it as a table.
            counts = pd.Series(summary[key]).value_counts()
            citems.append(
                FrequencyTable(
                    freq_table(freqtable=counts,
                               n=counts.sum(),
                               max_number_to_print=n_freq_table_max),
                    name=title,
                    anchor_id=anchor_template.format(varid=varid),
                ))

        items.append(
            Sequence(
                citems,
                name="Characters",
                sequence_type="tabs",
                anchor_id="{varid}characters".format(varid=varid),
            ))

    template_variables["bottom"] = Sequence(
        items,
        sequence_type="tabs",
        anchor_id="{varid}bottom".format(varid=varid),
    )

    return template_variables
Ejemplo n.º 11
0
def render_count(summary):
    """Build the "top" and "bottom" report elements for a count variable.

    Args:
        summary: Per-variable summary dict. This function reads ``varid``,
            ``varname``, ``warnings``, the counts and percentages for
            unique/missing/zero values, ``memory_size``, ``mean``, ``min``,
            ``max``, ``range``, ``iqr``, the ``quantile_*`` entries, ``std``,
            ``cv``, ``kurt``, ``mad``, ``skew``, ``sum``, ``var``,
            ``histogram_data`` and ``histogram_bins``.
            ``histogram_bins_bayesian_blocks`` is optional.

    Returns:
        The dict returned by ``render_common`` with ``"top"`` set to a grid
        (variable info, two statistics tables, mini histogram) and
        ``"bottom"`` set to a tabbed sequence (histograms, common values,
        extreme values).
    """
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    def flagged_row(label, key, fmt):
        # Row that carries an explicit "alert" flag (always False here).
        return {
            "name": label,
            "value": summary[key],
            "fmt": fmt,
            "alert": False,
        }

    def numeric_row(label, key):
        return {"name": label, "value": summary[key], "fmt": "fmt_numeric"}

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Real number (&Ropf; / &Ropf;<sub>&ge;0</sub>)",
        summary["warnings"],
    )

    table1 = Table([
        flagged_row("Distinct count", "n_unique", "fmt"),
        flagged_row("Unique (%)", "p_unique", "fmt_percent"),
        flagged_row("Missing", "n_missing", "fmt"),
        flagged_row("Missing (%)", "p_missing", "fmt_percent"),
    ])

    table2 = Table([
        flagged_row("Mean", "mean", "fmt"),
        flagged_row("Minimum", "min", "fmt"),
        flagged_row("Maximum", "max", "fmt"),
        flagged_row("Zeros", "n_zeros", "fmt"),
        flagged_row("Zeros (%)", "p_zeros", "fmt_percent"),
        flagged_row("Memory size", "memory_size", "fmt_bytesize"),
    ])

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary,
                       summary["histogram_bins"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence([info, table1, table2, mini_histo],
                                         sequence_type="grid")

    # The two dicts below are built but not yet placed in the output; they
    # are the intended input for the "statistics" section in the TODO below.
    quantile_rows = (
        ("Minimum", "min"),
        ("5-th percentile", "quantile_5"),
        ("Q1", "quantile_25"),
        ("median", "quantile_50"),
        ("Q3", "quantile_75"),
        ("95-th percentile", "quantile_95"),
        ("Maximum", "max"),
        ("Range", "range"),
        ("Interquartile range", "iqr"),
    )
    quantile_statistics = {
        "name": "Quantile statistics",
        "items": [
            flagged_row(label, key, "fmt_numeric")
            for label, key in quantile_rows
        ],
    }

    descriptive_rows = (
        ("Standard deviation", "std"),
        ("Coefficient of variation", "cv"),
        ("Kurtosis", "kurt"),
        ("Mean", "mean"),
        ("MAD", "mad"),
        ("Skewness", "skew"),
        ("Sum", "sum"),
        ("Variance", "var"),
    )
    descriptive_statistics = {
        "name": "Descriptive statistics",
        "items": [numeric_row(label, key) for label, key in descriptive_rows],
    }

    # TODO: Make sections data structure, i.e. a "statistics" table item
    # built from quantile_statistics and descriptive_statistics, and add it
    # as the first entry of the bottom tabs.

    # Bottom
    fixed_bins_histogram = Image(
        histogram(summary["histogram_data"], summary,
                  summary["histogram_bins"]),
        image_format=image_format,
        alt="Histogram",
        caption="<strong>Histogram with fixed size bins</strong> (bins={})"
        .format(summary["histogram_bins"]),
        name="Histogram",
        anchor_id="histogram",
    )
    seqs = [fixed_bins_histogram]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id="common_values",
    )

    smallest_values = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id="firstn",
    )
    largest_values = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id="lastn",
    )
    evs = Sequence(
        [smallest_values, largest_values],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id="extreme_values",
    )

    # A second histogram is added only when variable-size bins were supplied.
    if "histogram_bins_bayesian_blocks" in summary:
        seqs.append(
            Image(
                histogram(
                    summary["histogram_data"],
                    summary,
                    summary["histogram_bins_bayesian_blocks"],
                ),
                image_format=image_format,
                alt="Histogram",
                caption=
                '<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'
                .format(
                    fmt_array(summary["histogram_bins_bayesian_blocks"],
                              threshold=5)),
                name="Dynamic Histogram",
                anchor_id="dynamic_histogram",
            ))

    histogram_tabs = Sequence(seqs,
                              sequence_type="tabs",
                              name="Histogram(s)",
                              anchor_id="histograms")
    template_variables["bottom"] = Sequence(
        [histogram_tabs, fq, evs],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
Ejemplo n.º 12
0
def get_dataset_overview(summary):
    """Build the "Overview" grid with dataset-level statistics.

    Args:
        summary: Mapping whose ``"table"`` entry holds the values read below
            (``n_var``, ``n``, ``n_cells_missing``, ``p_cells_missing``,
            ``n_duplicates``, ``p_duplicates``, ``memory_size``,
            ``record_size``) plus a ``"types"`` mapping that is rendered as
            one row per entry.

    Returns:
        A grid ``Sequence`` containing the "Dataset statistics" table followed
        by the "Variable types" table.
    """
    table_stats = summary["table"]

    # (row label, key in summary["table"], formatter name), in display order.
    statistic_fields = (
        ("Number of variables", "n_var", "fmt_numeric"),
        ("Number of observations", "n", "fmt_numeric"),
        ("Missing cells", "n_cells_missing", "fmt_numeric"),
        ("Missing cells (%)", "p_cells_missing", "fmt_percent"),
        ("Duplicate rows", "n_duplicates", "fmt_numeric"),
        ("Duplicate rows (%)", "p_duplicates", "fmt_percent"),
        ("Total size in memory", "memory_size", "fmt_bytesize"),
        ("Average record size in memory", "record_size", "fmt_bytesize"),
    )
    statistic_rows = [
        {"name": label, "value": table_stats[key], "fmt": formatter}
        for label, key, formatter in statistic_fields
    ]
    dataset_info = Table(statistic_rows, name="Dataset statistics")

    # One row per entry of the "types" mapping, all formatted numerically.
    type_rows = []
    for type_name, count in table_stats["types"].items():
        type_rows.append(
            {"name": type_name, "value": count, "fmt": "fmt_numeric"}
        )
    dataset_types = Table(type_rows, name="Variable types")

    return Sequence(
        [dataset_info, dataset_types],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
Ejemplo n.º 13
0
def get_correlation_items(summary) -> Optional[Renderable]:
    """Build the collapsible "Correlations" section of the report.

    Args:
        summary: Mapping whose ``"correlations"`` entry maps a correlation key
            (one of the keys of ``key_to_data`` below) to the matrix handed to
            ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` wrapping one tab per correlation together with a toggle
        button for the descriptions, or ``None`` when no items were collected.

    Raises:
        KeyError: If ``summary["correlations"]`` contains a key that is not
            listed in ``key_to_data``.
    """
    # NOTE(review): get_items() is defined outside this block; presumably it
    # returns an initially empty list - confirm.
    items = get_items()

    # NOTE(review): "It's value" in the three texts below should read
    # "Its value"; the strings are user-facing output and are left unchanged.
    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. ")
    spearman_description = """The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic 
    correlation between two variables, and is therefore better in catching nonlinear monotonic correlations than 
    Pearson's <em>r</em>. It's value lies between -1 and +1, -1 indicating total negative monotonic correlation, 
    0 indicating no monotonic correlation and 1 indicating total positive monotonic correlation.<br /><br />To 
    calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the covariance of the rank 
    variables of <em>X</em> and <em>Y</em> by the product of their standard deviations. """

    kendall_description = """Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation 
    coefficient (<em>τ</em>) measures ordinal association between two variables. It's value lies between -1 and +1, 
    -1 indicating total negative correlation, 0 indicating no correlation and 1 indicating total positive correlation.
    <br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one determines the number of 
    concordant and discordant pairs of observations. <em>τ</em> is given by the number of concordant pairs minus the 
    discordant pairs divided by the total number of pairs."""

    # correlation key -> (vmin passed to the plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", ""),
        "cramers": (0, "Cramér's V (φc)", ""),
        "recoded": (0, "Recoded", ""),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, matrix in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(matrix, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        # Correlations without an explanatory text are shown as a bare diagram.
        if not description:
            items.append(diagram)
            continue

        explanation = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )
        items.append(
            Sequence(
                [diagram, explanation],
                anchor_id=key,
                name=name,
                sequence_type="grid",
            )
        )

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )

    if len(items) == 0:
        return None

    toggle = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )
    return Collapse(
        name="Correlations",
        anchor_id="correlations",
        button=toggle,
        item=corr,
    )
Ejemplo n.º 14
0
def render_complex(summary):
    """Assemble the "top" and "bottom" renderables for a complex variable.

    Args:
        summary: Mapping with the variable's statistics. The keys read here
            are ``varid``, ``varname``, ``warnings``, ``scatter_data`` and the
            statistic keys listed in the two field tuples below.

    Returns:
        A dict with a ``"top"`` grid (overview, two tables and an empty HTML
        placeholder) and a ``"bottom"`` tab sequence holding the scatter plot.
    """
    image_format = config["plot"]["image_format"].get(str)

    # Top
    overview = Overview(
        summary["varid"],
        summary["varname"],
        "Complex number (&Copf;)",
        summary["warnings"],
    )

    # (row label, key in summary, formatter name), in display order.
    general_fields = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    general_table = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in general_fields
    ])

    numeric_fields = (
        ("Mean", "mean", "fmt"),
        ("Minimum", "min", "fmt"),
        ("Maximum", "max", "fmt"),
        ("Zeros", "n_zeros", "fmt"),
        ("Zeros (%)", "p_zeros", "fmt_percent"),
    )
    numeric_table = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in numeric_fields
    ])

    # The fourth grid cell is an empty HTML element.
    top = Sequence(
        [overview, general_table, numeric_table, HTML("")],
        sequence_type="grid",
    )

    # Bottom
    varid = summary["varid"]
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Sequence([scatter], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
Ejemplo n.º 15
0
def render_date(summary):
    """Assemble the "top" and "bottom" renderables for a date variable.

    Args:
        summary: Mapping with the variable's statistics. The keys read here
            are ``varid``, ``varname``, ``histogram_data``, ``histogram_bins``
            and the statistic keys listed in the two field tuples below.

    Returns:
        A dict with a ``"top"`` grid (overview, two tables, mini histogram)
        and a ``"bottom"`` tab sequence holding the full histogram.
    """
    # TODO: render common?
    # Top: the overview is created with an empty warnings list.
    overview = Overview(summary["varid"], summary["varname"], "Date", [])

    # (row label, key in summary, formatter name), in display order.
    general_fields = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    general_table = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in general_fields
    ])

    range_fields = (
        ("Minimum", "min", "fmt"),
        ("Maximum", "max", "fmt"),
    )
    range_table = Table([
        {"name": label, "value": summary[key], "fmt": formatter}
        for label, key, formatter in range_fields
    ])

    mini_plot = mini_histogram(
        summary["histogram_data"], summary, summary["histogram_bins"]
    )
    # The label is passed positionally, exactly as the Image call expects here.
    mini_histo = Image(mini_plot, "Mini histogram")

    top = Sequence(
        [overview, general_table, range_table, mini_histo],
        sequence_type="grid",
    )

    # Bottom
    varid = summary["varid"]
    full_histogram = Image(
        histogram(
            summary["histogram_data"], summary, summary["histogram_bins"]
        ),
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    bottom = Sequence(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return {"top": top, "bottom": bottom}
Ejemplo n.º 16
0
def render_date(summary):
    """Assemble the "top" and "bottom" renderables for a date variable.

    Args:
        summary: Mapping with the variable's statistics. The keys read here
            are ``varid``, ``varname``, ``warnings``, ``histogram_data``,
            ``histogram_bins`` and the statistic keys listed in the two field
            tuples below.

    Returns:
        A dict with a ``"top"`` grid (variable info, two tables, mini
        histogram) and a ``"bottom"`` tab sequence holding the full histogram.
    """
    varid = summary["varid"]
    # TODO: render common?
    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(varid, summary["varname"], "Date", summary["warnings"])

    # (row label, key in summary, formatter name); no row is flagged as alert.
    general_fields = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    general_table = Table(
        [
            {
                "name": label,
                "value": summary[key],
                "fmt": formatter,
                "alert": False,
            }
            for label, key, formatter in general_fields
        ]
    )

    range_fields = (
        ("Minimum", "min"),
        ("Maximum", "max"),
    )
    range_table = Table(
        [
            {"name": label, "value": summary[key], "fmt": "fmt", "alert": False}
            for label, key in range_fields
        ]
    )

    mini_histo = Image(
        mini_histogram(
            summary["histogram_data"], summary, summary["histogram_bins"]
        ),
        image_format=image_format,
        alt="Mini histogram",
    )

    top = Sequence(
        [info, general_table, range_table, mini_histo],
        sequence_type="grid",
    )

    # Bottom
    full_histogram = Image(
        histogram(
            summary["histogram_data"], summary, summary["histogram_bins"]
        ),
        image_format=image_format,
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    bottom = Sequence(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return {"top": top, "bottom": bottom}
Ejemplo n.º 17
0
def render_real(summary):
    """Assemble the "top" and "bottom" renderables for a real-valued variable.

    Starts from the dict returned by ``render_common(summary)`` and adds a
    ``"top"`` grid (variable info, two tables, mini histogram) and a
    ``"bottom"`` tab sequence (statistics, histogram(s), common values,
    extreme values).

    Args:
        summary: Mapping with the variable's statistics. Besides the statistic
            keys listed in the field tuples below it must provide ``varid``,
            ``varname``, ``warnings``, ``warn_fields`` and ``histogram_data``.
            ``histogram_bins_bayesian_blocks`` is optional and adds a second
            histogram tab when present.

    Returns:
        The ``render_common`` dict extended with ``"top"`` and ``"bottom"``.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # The displayed type name marks variables without negative values.
    name = (
        "Real number (&Ropf;<sub>&ge;0</sub>)"
        if summary["min"] >= 0
        else "Real number (&Ropf;)"
    )

    # Top
    info = VariableInfo(varid, summary["varname"], name, summary["warnings"])

    # (row label, key in summary, formatter name); each row is flagged as an
    # alert when its key is listed in summary["warn_fields"].
    count_fields = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Infinite", "n_infinite", "fmt"),
        ("Infinite (%)", "p_infinite", "fmt_percent"),
    )
    table1 = Table([
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in count_fields
    ])

    # (row label, key, formatter name, whether warn_fields may flag the row).
    value_fields = (
        ("Mean", "mean", "fmt", False),
        ("Minimum", "min", "fmt", False),
        ("Maximum", "max", "fmt", False),
        ("Zeros", "n_zeros", "fmt", True),
        ("Zeros (%)", "p_zeros", "fmt_percent", True),
        ("Memory size", "memory_size", "fmt_bytesize", False),
    )
    table2 = Table([
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": can_warn and key in summary["warn_fields"],
        }
        for label, key, formatter, can_warn in value_fields
    ])

    # Bin count shared by the mini histogram and the fixed-size histogram.
    histogram_bins = 10

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, histogram_bins),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence(
        [info, table1, table2, mini_histo],
        sequence_type="grid",
    )

    # Bottom: statistics tab
    quantile_fields = (
        ("Minimum", "min"),
        ("5-th percentile", "5%"),
        ("Q1", "25%"),
        ("median", "50%"),
        ("Q3", "75%"),
        ("95-th percentile", "95%"),
        ("Maximum", "max"),
        ("Range", "range"),
        ("Interquartile range (IQR)", "iqr"),
    )
    quantile_statistics = Table(
        [
            {"name": label, "value": summary[key], "fmt": "fmt_numeric"}
            for label, key in quantile_fields
        ],
        name="Quantile statistics",
    )

    descriptive_fields = (
        ("Standard deviation", "std"),
        ("Coefficient of variation (CV)", "cv"),
        ("Kurtosis", "kurtosis"),
        ("Mean", "mean"),
        ("Median Absolute Deviation (MAD)", "mad"),
        ("Skewness", "skewness"),
        ("Sum", "sum"),
        ("Variance", "variance"),
    )
    descriptive_rows = []
    for label, key in descriptive_fields:
        row = {"name": label, "value": summary[key], "fmt": "fmt_numeric"}
        if key == "skewness":
            # NOTE(review): this row carries a "class" string while the tables
            # above use a boolean "alert" key - confirm which one the table
            # template actually reads.
            row["class"] = (
                "alert" if "skewness" in summary["warn_fields"] else ""
            )
        descriptive_rows.append(row)
    descriptive_statistics = Table(
        descriptive_rows,
        name="Descriptive statistics",
    )

    statistics = Sequence(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    # Bottom: histogram tab(s)
    fixed_histogram = Image(
        histogram(summary["histogram_data"], summary, histogram_bins),
        image_format=image_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={histogram_bins})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    seqs = [fixed_histogram]

    # Bottom: frequency tables, fed from the render_common output
    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    smallest_values = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id=f"{varid}firstn",
    )
    largest_values = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id=f"{varid}lastn",
    )
    evs = Sequence(
        [smallest_values, largest_values],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    # A second, variable-bin histogram is added only when its bins were
    # supplied in the summary.
    if "histogram_bins_bayesian_blocks" in summary:
        dynamic_caption = '<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'
        seqs.append(
            Image(
                histogram(
                    summary["histogram_data"],
                    summary,
                    summary["histogram_bins_bayesian_blocks"],
                ),
                image_format=image_format,
                alt="Histogram",
                caption=dynamic_caption.format(
                    fmt_array(
                        summary["histogram_bins_bayesian_blocks"],
                        threshold=5,
                    )
                ),
                name="Dynamic Histogram",
                anchor_id=f"{varid}dynamic_histogram",
            )
        )

    histograms = Sequence(
        seqs,
        sequence_type="tabs",
        name="Histogram(s)",
        anchor_id=f"{varid}histograms",
    )

    template_variables["bottom"] = Sequence(
        [statistics, histograms, fq, evs],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
Ejemplo n.º 18
0
def render_boolean(summary):
    """Assemble the "top" and "bottom" renderables for a boolean variable.

    Starts from the dict returned by ``render_common(summary)`` and adds a
    ``"top"`` grid (variable info, statistics table, small frequency table)
    and a ``"bottom"`` tab sequence holding the full frequency table.

    Args:
        summary: Mapping with the variable's statistics. The keys read here
            are ``varid``, ``varname``, ``warnings``, ``warn_fields``,
            ``value_counts``, ``n`` and the statistic keys listed in the
            field tuple below.

    Returns:
        The ``render_common`` dict extended with ``"top"`` and ``"bottom"``.
    """
    # Row limit for the small frequency table, taken from the configuration.
    n_obs_bool = config["vars"]["bool"]["n_obs"].get(int)

    # Prepare variables
    template_variables = render_common(summary)
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=n_obs_bool,
    )

    # Element composition
    varid = summary["varid"]
    info = VariableInfo(
        anchor_id=varid,
        warnings=summary["warnings"],
        var_type="Boolean",
        var_name=summary["varname"],
    )

    # (row label, key, formatter name, whether warn_fields may flag the row).
    stat_fields = (
        ("Distinct count", "n_unique", "fmt", True),
        ("Unique (%)", "p_unique", "fmt_percent", True),
        ("Missing", "n_missing", "fmt", True),
        ("Missing (%)", "p_missing", "fmt_percent", True),
        ("Memory size", "memory_size", "fmt_bytesize", False),
    )
    table = Table([
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": can_warn and key in summary["warn_fields"],
        }
        for label, key, formatter, can_warn in stat_fields
    ])

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    template_variables["top"] = Sequence(
        [info, table, fqm],
        sequence_type="grid",
    )

    full_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Frequency Table",
        anchor_id=f"{varid}frequency_table",
    )

    template_variables["bottom"] = Sequence(
        [full_table],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables