def to_html(self) -> str:
    """Generate and return complete template as lengthy string
    for using with frameworks.

    The already rendered report body (``self.html``) is embedded in the
    ``wrapper.html`` template together with the title, version, asset and
    style settings, and flags telling the template whether correlations,
    missing-value diagrams and a sample are available.

    Returns:
        Profiling report html including wrapper.
    """
    render = templates.template("wrapper.html").render

    context = dict(
        content=self.html,
        title=self.title,
        # Explicit length checks: each flag is true only for non-empty data.
        correlation=len(self.description_set["correlations"]) > 0,
        missing=len(self.description_set["missing"]) > 0,
        sample=len(self.sample) > 0,
        version=__version__,
        offline=self.use_local_assets,
        primary_color=config["style"]["primary_color"].get(str),
        theme=config["style"]["theme"].get(str),
    )
    return render(**context)
def to_html(sample: dict, stats_object: dict) -> str:
    """Generate a HTML report from summary statistics and a given sample.

    Args:
        sample: A dict containing the samples to print.
        stats_object: Statistics to use for the overview, variables,
            correlations and missing values. Must contain at least the keys
            "table", "variables" and "correlations".

    Returns:
        The profile report in HTML format

    Raises:
        TypeError: If ``sample`` or ``stats_object`` is not a dict, or if
            ``stats_object`` lacks one of the required keys.
    """
    if not isinstance(sample, dict):
        raise TypeError("sample must be of type dict")

    if not isinstance(stats_object, dict):
        raise TypeError(
            "stats_object must be of type dict. Did you generate this using the "
            "pandas_profiling.describe() function?"
        )

    # Any required key that is absent makes the statistics unusable.
    absent_keys = {"table", "variables", "correlations"}.difference(stats_object)
    if absent_keys:
        raise TypeError(
            "stats_object badly formatted. Did you generate this using the pandas_profiling.describe() function?"
        )

    # TODO: should be done in the template
    context = dict(
        overview_html=render_overview_html(stats_object),
        rows_html=render_variables_html(stats_object),
        correlations_html=render_correlations_html(stats_object),
        missing_html=render_missing_html(stats_object),
        sample_html=render_sample_html(sample),
        full_width=config["style"]["full_width"].get(bool),
    )
    return templates.template("base.html").render(context)
def render_variables_section(stats_object: dict) -> str:
    """Render the HTML for each of the variables in the DataFrame.

    For every entry of ``stats_object["variables"]`` the derived rendering
    values (alert classes, histograms, frequency tables, detail sections) are
    computed and the row template matching the variable type is rendered.

    Note:
        The per-variable statistics are updated in place: the derived values
        are stored on the same object that is held in
        ``stats_object["variables"]``.

    Args:
        stats_object: The statistics for each variable. The keys "variables",
            "messages" and ``["table"]["n"]`` are read.

    Returns:
        The rendered HTML, where each row represents a variable.
    """
    n_obs_unique = config["n_obs_unique"].get(int)
    n_obs_bool = config["n_obs_bool"].get(int)
    n_extreme_obs = config["n_extreme_obs"].get(int)
    n_freq_table_max = config["n_freq_table_max"].get(int)

    messages = stats_object["messages"]
    variables = stats_object["variables"]

    # Message type -> statistic whose cell is flagged in the row.
    # TODO: rename alert to prevent overlap with bootstrap classes
    alert_targets = (
        (MessageType.SKEWED, "skewness"),
        (MessageType.HIGH_CARDINALITY, "distinct_count"),
        (MessageType.ZEROS, "zeros"),
        (MessageType.MISSING, "missing"),
    )

    # The number of column to use in the display of the frequency table
    # according to the category
    mini_freq_table_nb_col = {Variable.TYPE_CAT: 6, Variable.TYPE_BOOL: 3}

    # Components that get their own frequency table, per composite type.
    counted_parts = {
        Variable.TYPE_URL: ("scheme", "netloc", "path", "query", "fragment"),
        Variable.TYPE_PATH: ("name", "parent", "suffix", "stem"),
    }

    # (section key, display name) in display order, per composite type.
    part_sections = {
        Variable.TYPE_URL: (
            ("full", "Full"),
            ("scheme", "Scheme"),
            ("netloc", "Netloc"),
            ("path", "Path"),
            ("query", "Query"),
            ("fragment", "Fragment"),
        ),
        Variable.TYPE_PATH: (
            ("full", "Full"),
            ("stem", "Stem"),
            ("name", "Name"),
            ("suffix", "Suffix"),
            ("parent", "Parent"),
        ),
    }

    # (section key, display name, template) for numeric variables.
    num_sections = (
        ("statistics", "Statistics", "variables/row_num_statistics.html"),
        ("histogram", "Histogram", "variables/row_num_histogram.html"),
        ("frequency_table", "Common values", "variables/row_num_frequency_table.html"),
        ("extreme_values", "Extreme values", "variables/row_num_extreme_values.html"),
    )

    # Types for which no frequency / extreme value tables are produced.
    types_without_tables = {
        Variable.S_TYPE_UNSUPPORTED,
        Variable.S_TYPE_CORR,
        Variable.S_TYPE_CONST,
        Variable.S_TYPE_RECODED,
    }

    rendered_rows = []

    # TODO: move to for loop in template
    for idx, row in variables.items():
        row_classes = {}
        values = row  # same object: the statistics are extended in place
        values.update({"varname": idx, "varid": hash(idx), "row_classes": row_classes})

        # TODO: obtain from messages (ignore)
        for message in messages:
            if message.column_name != idx:
                continue
            for message_type, target in alert_targets:
                if message.message_type == message_type:
                    row_classes[target] = "alert"
                    break

        var_type = row["type"]

        if var_type in {Variable.TYPE_NUM, Variable.TYPE_DATE}:
            values["histogram"] = histogram(
                row["histogramdata"], row, row["histogram_bins"]
            )
            values["mini_histogram"] = mini_histogram(
                row["histogramdata"], row, row["histogram_bins"]
            )

        if "histogram_bins_bayesian_blocks" in row and var_type == Variable.TYPE_NUM:
            values["histogram_bayesian_blocks"] = histogram(
                row["histogramdata"], row, row["histogram_bins_bayesian_blocks"]
            )

        if var_type in {Variable.TYPE_CAT, Variable.TYPE_BOOL}:
            values["minifreqtable"] = freq_table(
                variables[idx]["value_counts_without_nan"],
                stats_object["table"]["n"],
                "mini_freq_table.html",
                max_number_to_print=n_obs_bool,
                idx=idx,
                nb_col=mini_freq_table_nb_col[var_type],
            )

        # URL and path variables: one frequency table per component.
        for part in counted_parts.get(var_type, ()):
            values["freqtable_{}".format(part)] = freq_table(
                freqtable=variables[idx]["{}_counts".format(part)],
                # TODO: n - missing
                n=stats_object["table"]["n"],
                table_template="freq_table.html",
                idx=idx,
                max_number_to_print=n_freq_table_max,
            )

        if var_type == Variable.S_TYPE_UNIQUE:
            observations = variables[idx]["value_counts_without_nan"].sort_index().index

            values["firstn"] = pd.DataFrame(
                list(observations[0:n_obs_unique]),
                columns=["First {} values".format(n_obs_unique)],
            ).to_html(classes="example_values", index=False)
            values["lastn"] = pd.DataFrame(
                list(observations[-n_obs_unique:]),
                columns=["Last {} values".format(n_obs_unique)],
            ).to_html(classes="example_values", index=False)

        if var_type not in types_without_tables:
            values["freqtable"] = freq_table(
                freqtable=variables[idx]["value_counts_without_nan"],
                n=stats_object["table"]["n"],
                table_template="freq_table.html",
                idx=idx,
                max_number_to_print=n_freq_table_max,
            )

            for key, ascending in (("firstn_expanded", True), ("lastn_expanded", False)):
                values[key] = extreme_obs_table(
                    freqtable=variables[idx]["value_counts_without_nan"],
                    number_to_print=n_extreme_obs,
                    n=stats_object["table"]["n"],
                    ascending=ascending,
                )

        if var_type == Variable.TYPE_NUM:
            # The sections are only attached once all of them are rendered.
            values["sections"] = {
                key: {
                    "name": title,
                    "content": templates.template(template_name).render(values=values),
                }
                for key, title, template_name in num_sections
            }

        if var_type == Variable.TYPE_CAT:
            sections = {
                "frequency_table": {
                    "name": "Common values",
                    "content": templates.template(
                        "variables/row_cat_frequency_table.html"
                    ).render(values=values),
                }
            }
            values["sections"] = sections

            if config["vars"]["cat"]["check_composition"].get(bool):
                sections["composition"] = {
                    "name": "Composition",
                    "content": templates.template(
                        "variables/row_cat_composition.html"
                    ).render(values=values),
                }

        if var_type in part_sections:
            # "full" shows the table of the complete values, every other
            # section the table of the corresponding component.
            values["sections"] = {
                key: {
                    "name": title,
                    "value": values[
                        "freqtable" if key == "full" else "freqtable_{}".format(key)
                    ],
                }
                for key, title in part_sections[var_type]
            }

        rendered_rows.append(
            templates.template(
                "variables/row_{}.html".format(var_type.value.lower())
            ).render(values=values)
        )

    return "".join(rendered_rows)
def freq_table(
    freqtable, n: int, table_template, max_number_to_print: int, idx: int, nb_col=6
) -> str:
    """Render the HTML for a frequency table (value, count).

    At most ``max_number_to_print`` entries of ``freqtable`` are listed. The
    remaining entries are summed into an "Other values" row and the difference
    between ``n`` and the total of ``freqtable`` is shown as "(Missing)". Both
    extra rows are only shown when their count exceeds the count of the first
    entry that was left out (or 0 when nothing was left out).

    Args:
        idx: The variable id. Only ``hash(idx)`` is passed on to the template.
        freqtable: The frequency table. The first entry is used as the largest
            count, so the table is presumably sorted in descending order of
            frequency -- TODO confirm against callers.
        n: The total number of values.
        table_template: The name of the template.
        max_number_to_print: The maximum number of observations to print.
        nb_col: The number of columns in the grid. (Default value = 6)

    Returns:
        The HTML representation of the frequency table.

    Raises:
        ValueError: If the largest of the top, "other" and missing counts is
            zero ("Empty column").
    """
    if max_number_to_print > n:
        max_number_to_print = n

    if max_number_to_print < len(freqtable):
        freq_other = sum(freqtable.iloc[max_number_to_print:])
        min_freq = freqtable.values[max_number_to_print]
    else:
        freq_other = 0
        min_freq = 0

    freq_missing = n - sum(freqtable)

    # A column without any non-missing value yields an empty table; there is
    # no first entry to read then, so its top frequency counts as zero instead
    # of failing with an IndexError.
    top_freq = freqtable.values[0] if len(freqtable) > 0 else 0
    max_freq = max(top_freq, freq_other, freq_missing)

    # TODO: Correctly sort missing and other

    if max_freq == 0:
        raise ValueError("Empty column")

    # Widths are relative to the largest bar, percentages to all values.
    rows = [
        {
            "label": label,
            "width": freq / max_freq,
            "count": freq,
            "percentage": float(freq) / n,
            "extra_class": "",
        }
        for label, freq in freqtable.iloc[0:max_number_to_print].items()
    ]

    if freq_other > min_freq:
        rows.append(
            {
                "label": "Other values ({})".format(
                    str(freqtable.count() - max_number_to_print)
                ),
                "width": freq_other / max_freq,
                "count": freq_other,
                "percentage": float(freq_other) / n,
                "extra_class": "other",
            }
        )

    if freq_missing > min_freq:
        rows.append(
            {
                "label": "(Missing)",
                "width": freq_missing / max_freq,
                "count": freq_missing,
                "percentage": float(freq_missing) / n,
                "extra_class": "missing",
            }
        )

    return templates.template(table_template).render(
        rows=rows, varid=hash(idx), nb_col=nb_col
    )
def render_correlations_html(stats_object: dict) -> str:
    """Render the correlations HTML.

    A correlation matrix plot is created for every known correlation type
    present in ``stats_object["correlations"]``. The first one present (in the
    order Pearson, Spearman, Kendall, Phi k, Cramér's V, Recoded) is marked as
    the active one; if none is present the active key is the empty string.

    Args:
        stats_object: The diagrams to display in the correlation component.

    Returns:
        The rendered HTML of the correlations component of the profile.
    """
    correlations = stats_object["correlations"]

    # (key, display name, extra keyword arguments for the matrix plot)
    known_correlations = (
        ("pearson", "Pearson", {}),
        ("spearman", "Spearman", {}),
        ("kendall", "Kendall", {}),
        ("phi_k", "Phi<sub>k</sub>", {"vmin": 0}),
        ("cramers", "Cramér's V", {"vmin": 0}),
        ("recoded", "Recoded", {"vmin": 0}),
    )

    values = {}
    for key, display_name, plot_options in known_correlations:
        if key not in correlations:
            continue
        values[key] = {
            "matrix": plot.correlation_matrix(correlations[key], **plot_options),
            "name": display_name,
        }

    # Entries were inserted in priority order, so the first key is the active one.
    active = next(iter(values), "")

    return templates.template("correlations.html").render(values=values, active=active)