def render_generic(summary):
    """Render the report entry for a variable of an unsupported type.

    Only the statistics read below are shown: missing count, missing
    percentage and memory size.

    Args:
        summary: Per-variable statistics. The keys read here are ``varid``,
            ``varname``, ``warnings``, ``warn_fields``, ``n_missing``,
            ``p_missing`` and ``memory_size``.

    Returns:
        A dict with a ``"top"`` entry (a grid container holding the variable
        info, the statistics table and an empty HTML element) and a
        ``"bottom"`` entry that is always ``None``.
    """
    info = VariableInfo(
        anchor_id=summary["varid"],
        warnings=summary["warnings"],
        var_type="Unsupported",
        var_name=summary["varname"],
    )

    # A row is flagged with "alert" when its statistic is listed in
    # summary["warn_fields"].
    table = Table(
        [
            {
                "name": "Missing",
                "value": summary["n_missing"],
                "fmt": "fmt",
                "alert": "n_missing" in summary["warn_fields"],
            },
            {
                "name": "Missing (%)",
                "value": summary["p_missing"],
                "fmt": "fmt_percent",
                "alert": "p_missing" in summary["warn_fields"],
            },
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
                "alert": False,
            },
        ]
    )

    return {
        "top": Container([info, table, HTML("")], sequence_type="grid"),
        "bottom": None,
    }
def render_generic(config: Settings, summary: dict) -> dict:
    """Render the report entry for a variable of an unsupported type.

    Args:
        config: Report settings. Not referenced by this renderer.
        summary: Per-variable statistics. The keys read here are ``varid``,
            ``varname``, ``alerts``, ``description``, ``alert_fields``,
            ``n_missing``, ``p_missing`` and ``memory_size``.

    Returns:
        A dict with a ``"top"`` entry (a grid container holding the variable
        info, the statistics table and an empty HTML element) and a
        ``"bottom"`` entry that is always ``None``.
    """
    variable_info = VariableInfo(
        anchor_id=summary["varid"],
        alerts=summary["alerts"],
        var_type="Unsupported",
        var_name=summary["varname"],
        description=summary["description"],
    )

    # Values are formatted here; a row is flagged with "alert" when its
    # statistic is listed in summary["alert_fields"].
    missing_count_row = {
        "name": "Missing",
        "value": fmt(summary["n_missing"]),
        "alert": "n_missing" in summary["alert_fields"],
    }
    missing_share_row = {
        "name": "Missing (%)",
        "value": fmt_percent(summary["p_missing"]),
        "alert": "p_missing" in summary["alert_fields"],
    }
    memory_row = {
        "name": "Memory size",
        "value": fmt_bytesize(summary["memory_size"]),
        "alert": False,
    }
    statistics = Table([missing_count_row, missing_share_row, memory_row])

    top = Container([variable_info, statistics, HTML("")], sequence_type="grid")
    return {"top": top, "bottom": None}
def render_complex(summary):
    """Render the report entry for a complex-number variable.

    The image format of the scatter plot is read from the module-level
    ``config`` (``config["plot"]["image_format"]``).

    Args:
        summary: Per-variable statistics. The keys read here are ``varid``,
            ``varname``, ``warnings``, ``description``, ``n_distinct``,
            ``p_distinct``, ``n_missing``, ``p_missing``, ``memory_size``,
            ``mean``, ``min``, ``max``, ``n_zeros``, ``p_zeros`` and
            ``scatter_data``.

    Returns:
        A dict with a ``"top"`` grid container (variable info, two statistics
        tables and an empty HTML element) and a ``"bottom"`` tabs container
        holding the scatter plot image.
    """

    def build_rows(spec):
        # Each spec entry is (label, summary key, formatter name).
        return [
            {"name": label, "value": summary[key], "fmt": formatter}
            for label, key, formatter in spec
        ]

    varid = summary["varid"]
    image_format = config["plot"]["image_format"].get(str)

    # Top
    header = VariableInfo(
        varid,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
        summary["description"],
    )
    counts_table = Table(
        build_rows(
            (
                ("Distinct", "n_distinct", "fmt"),
                ("Distinct (%)", "p_distinct", "fmt_percent"),
                ("Missing", "n_missing", "fmt"),
                ("Missing (%)", "p_missing", "fmt_percent"),
                ("Memory size", "memory_size", "fmt_bytesize"),
            )
        )
    )
    values_table = Table(
        build_rows(
            (
                ("Mean", "mean", "fmt_numeric"),
                ("Minimum", "min", "fmt_numeric"),
                ("Maximum", "max", "fmt_numeric"),
                ("Zeros", "n_zeros", "fmt_numeric"),
                ("Zeros (%)", "p_zeros", "fmt_percent"),
            )
        )
    )
    top = Container(
        [header, counts_table, values_table, HTML("")], sequence_type="grid"
    )

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Container([scatter], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
def get_correlation_items(summary) -> Optional[Renderable]:
    """Build the collapsible "Correlations" section of the report.

    For every entry of ``summary["correlations"]`` a correlation-matrix image
    is rendered. When a non-empty description is registered for that
    coefficient, the image is paired with the description in a grid.

    Args:
        summary: Mapping holding the correlations under the ``"correlations"``
            key, keyed by coefficient name (``"pearson"``, ``"spearman"``,
            ``"kendall"``, ``"phi_k"``, ``"cramers"`` or ``"recoded"``). Each
            value is passed to ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` wrapping the tabbed correlation items together with a
        button that toggles the descriptions, or ``None`` when there are no
        items to show.

    Raises:
        KeyError: If ``summary["correlations"]`` contains a coefficient name
            that is not registered in ``key_to_data``.
    """
    # NOTE(review): get_items() is defined outside this block; it is assumed
    # to return a mutable list (possibly empty) - confirm.
    items = get_items()

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. "
    )

    spearman_description = (
        "The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic "
        "correlation between two variables, and is therefore better in catching nonlinear "
        "monotonic correlations than Pearson's <em>r</em>. It's value lies between -1 and +1, "
        "-1 indicating total negative monotonic correlation, 0 indicating no monotonic "
        "correlation and 1 indicating total positive monotonic correlation.<br /><br />To "
        "calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the "
        "covariance of the rank variables of <em>X</em> and <em>Y</em> by the product of "
        "their standard deviations. "
    )

    kendall_description = (
        "Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation "
        "coefficient (<em>τ</em>) measures ordinal association between two variables. It's "
        "value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating "
        "no correlation and 1 indicating total positive correlation. \n"
        "<br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one "
        "determines the number of concordant and discordant pairs of observations. "
        "<em>τ</em> is given by the number of concordant pairs minus the discordant pairs "
        "divided by the total number of pairs."
    )

    # coefficient key -> (vmin passed to the plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", ""),
        "cramers": (0, "Cramér's V (φc)", ""),
        "recoded": (0, "Recoded", ""),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        # Coefficients without a description are shown as the bare diagram.
        if not description:
            items.append(diagram)
            continue

        desc = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )
        items.append(
            Sequence([diagram, desc], anchor_id=key, name=name, sequence_type="grid")
        )

    # Nothing to show: skip building the tabs and the toggle button.
    if not items:
        return None

    corr = Sequence(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )
    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )
    return Collapse(name="Correlations", anchor_id="correlations", button=btn, item=corr)
def get_report_structure(summary: dict) -> Renderable:
    """Assemble the renderable structure of the profile report.

    The "Overview" and "Variables" sections are always present. The
    "Interactions", correlations, "Missing values", "Sample" and
    "Duplicate rows" sections are appended only when there is content for
    them.

    Args:
        summary: Statistics to use for the overview, variables, interactions,
            correlations, missing values, sample and duplicate rows.

    Returns:
        The root element of the report, holding all sections and the footer.
    """
    progress_enabled = config["progress_bar"].get(bool)

    with tqdm(
        total=1, desc="Generate report structure", disable=not progress_enabled
    ) as progress:
        messages = summary["messages"]

        overview = Container(
            get_dataset_items(summary, messages),
            sequence_type="tabs",
            name="Overview",
            anchor_id="overview",
        )
        variables = Container(
            render_variables_section(summary),
            sequence_type="accordion",
            name="Variables",
            anchor_id="variables",
        )
        section_items: List[Renderable] = [overview, variables]

        scatter_items = get_scatter_matrix(summary["scatter"])
        if len(scatter_items) > 0:
            # The layout switches from "tabs" to "select" above ten items.
            interactions_layout = "tabs" if len(scatter_items) <= 10 else "select"
            interactions = Container(
                scatter_items,
                sequence_type=interactions_layout,
                name="Interactions",
                anchor_id="interactions",
            )
            section_items.append(interactions)

        correlations = get_correlation_items(summary)
        if correlations is not None:
            section_items.append(correlations)

        missing_items = get_missing_items(summary)
        if len(missing_items) > 0:
            missing = Container(
                missing_items,
                sequence_type="tabs",
                name="Missing values",
                anchor_id="missing",
            )
            section_items.append(missing)

        sample_items = get_sample_items(summary["sample"])
        if len(sample_items) > 0:
            sample = Container(
                items=sample_items,
                sequence_type="list",
                name="Sample",
                anchor_id="sample",
            )
            section_items.append(sample)

        duplicate_items = get_duplicates_items(summary["duplicates"])
        if len(duplicate_items) > 0:
            duplicates = Container(
                items=duplicate_items,
                sequence_type="list",
                name="Duplicate rows",
                anchor_id="duplicate",
            )
            section_items.append(duplicates)

        sections = Container(section_items, name="Root", sequence_type="sections")
        progress.update()

    footer = HTML(
        content='Report generated with <a href="https://github.com/pandas-profiling/pandas-profiling">pandas-profiling</a>.'
    )
    return Root("Root", sections, footer)
def get_correlation_items(summary) -> Optional[Renderable]:
    """Build the collapsible "Correlations" section of the report.

    For every entry of ``summary["correlations"]`` a correlation-matrix image
    is rendered. When a non-empty description is registered for that
    coefficient, the image is paired with the description in a grid.

    Args:
        summary: Mapping holding the correlations under the ``"correlations"``
            key, keyed by coefficient name (``"pearson"``, ``"spearman"``,
            ``"kendall"``, ``"phi_k"`` or ``"cramers"``). Each value is passed
            to ``plot.correlation_matrix``.

    Returns:
        A ``Collapse`` wrapping the tabbed correlation items together with a
        button that toggles the descriptions, or ``None`` when there are no
        items to show.

    Raises:
        KeyError: If ``summary["correlations"]`` contains a coefficient name
            that is not registered in ``key_to_data``.
    """
    items: List[Renderable] = []

    pearson_description = (
        "The Pearson's correlation coefficient (<em>r</em>) is a measure of linear correlation "
        "between two variables. It's value lies between -1 and +1, -1 indicating total negative "
        "linear correlation, 0 indicating no linear correlation and 1 indicating total positive "
        "linear correlation. Furthermore, <em>r</em> is invariant under separate changes in location "
        "and scale of the two variables, implying that for a linear function the angle to the "
        "x-axis does not affect <em>r</em>.<br /><br />To calculate <em>r</em> for two "
        "variables <em>X</em> and <em>Y</em>, one divides the covariance of <em>X</em> and "
        "<em>Y</em> by the product of their standard deviations. "
    )

    spearman_description = (
        "The Spearman's rank correlation coefficient (<em>ρ</em>) is a measure of monotonic "
        "correlation between two variables, and is therefore better in catching nonlinear "
        "monotonic correlations than Pearson's <em>r</em>. It's value lies between -1 and +1, "
        "-1 indicating total negative monotonic correlation, 0 indicating no monotonic "
        "correlation and 1 indicating total positive monotonic correlation.<br /><br />To "
        "calculate <em>ρ</em> for two variables <em>X</em> and <em>Y</em>, one divides the "
        "covariance of the rank variables of <em>X</em> and <em>Y</em> by the product of "
        "their standard deviations. "
    )

    kendall_description = (
        "Similarly to Spearman's rank correlation coefficient, the Kendall rank correlation "
        "coefficient (<em>τ</em>) measures ordinal association between two variables. It's "
        "value lies between -1 and +1, -1 indicating total negative correlation, 0 indicating "
        "no correlation and 1 indicating total positive correlation. \n"
        "<br /><br />To calculate <em>τ</em> for two variables <em>X</em> and <em>Y</em>, one "
        "determines the number of concordant and discordant pairs of observations. "
        "<em>τ</em> is given by the number of concordant pairs minus the discordant pairs "
        "divided by the total number of pairs."
    )

    phi_k_description = (
        "Phik (φk) is a new and practical correlation coefficient that works consistently "
        "between categorical, ordinal and interval variables, captures non-linear dependency "
        "and reverts to the Pearson correlation coefficient in case of a bivariate normal "
        "input distribution. There is extensive documentation available "
        "<a href='https://phik.readthedocs.io/en/latest/index.html'>here</a>."
    )

    cramers_description = (
        "Cramér's V is an association measure for nominal random variables. The coefficient "
        "ranges from 0 to 1, with 0 indicating independence and 1 indicating perfect "
        "association. The empirical estimators used for Cramér's V have been proved to be "
        "biased, even for large samples. We use a bias-corrected measure that has been "
        "proposed by Bergsma in 2013 that can be found "
        "<a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."
    )

    # coefficient key -> (vmin passed to the plot, display name, description)
    key_to_data = {
        "pearson": (-1, "Pearson's r", pearson_description),
        "spearman": (-1, "Spearman's ρ", spearman_description),
        "kendall": (-1, "Kendall's τ", kendall_description),
        "phi_k": (0, "Phik (φk)", phi_k_description),
        "cramers": (0, "Cramér's V (φc)", cramers_description),
    }

    image_format = config["plot"]["image_format"].get(str)

    for key, item in summary["correlations"].items():
        vmin, name, description = key_to_data[key]

        diagram = Image(
            plot.correlation_matrix(item, vmin=vmin),
            image_format=image_format,
            alt=name,
            anchor_id=f"{key}_diagram",
            name=name,
            classes="correlation-diagram",
        )

        # Coefficients without a description are shown as the bare diagram.
        if not description:
            items.append(diagram)
            continue

        desc = HTML(
            f'<div style="padding:20px" class="text-muted"><h3>{name}</h3>{description}</div>',
            anchor_id=f"{key}_html",
            classes="correlation-description",
        )
        items.append(
            Container([diagram, desc], anchor_id=key, name=name, sequence_type="grid")
        )

    # Nothing to show: skip building the tabs and the toggle button.
    if not items:
        return None

    corr = Container(
        items,
        sequence_type="tabs",
        name="Correlations Tab",
        anchor_id="correlations_tab",
    )
    btn = ToggleButton(
        "Toggle correlation descriptions",
        anchor_id="toggle-correlation-description",
        name="Toggle correlation descriptions",
    )
    return Collapse(name="Correlations", anchor_id="correlations", button=btn, item=corr)
def render_complex(summary):
    """Render the report entry for a complex-number variable.

    Args:
        summary: Per-variable statistics. The keys read here are ``varid``,
            ``varname``, ``warnings``, ``n_unique``, ``p_unique``,
            ``n_missing``, ``p_missing``, ``memory_size``, ``mean``, ``min``,
            ``max``, ``n_zeros``, ``p_zeros`` and ``scatter_data``.

    Returns:
        A dict with a ``"top"`` grid sequence (overview, two statistics
        tables and an empty HTML element) and a ``"bottom"`` tabs sequence
        holding the scatter plot image.
    """

    def row(label, key, formatter="fmt"):
        # One table row; the value is looked up in the summary by key.
        return {"name": label, "value": summary[key], "fmt": formatter}

    varid = summary["varid"]

    # Top
    overview = Overview(
        varid,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
    )
    counts_table = Table(
        [
            row("Distinct count", "n_unique"),
            row("Unique (%)", "p_unique", "fmt_percent"),
            row("Missing", "n_missing"),
            row("Missing (%)", "p_missing", "fmt_percent"),
            row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )
    values_table = Table(
        [
            row("Mean", "mean"),
            row("Minimum", "min"),
            row("Maximum", "max"),
            row("Zeros", "n_zeros"),
            row("Zeros (%)", "p_zeros", "fmt_percent"),
        ]
    )
    top = Sequence(
        [overview, counts_table, values_table, HTML("")], sequence_type="grid"
    )

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Sequence([scatter], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}