def get_cluster_items(summary) -> List[Renderable]:
    """Create the list of cluster items.

    For every entry of ``summary["clusters"]`` a tabbed container is built
    with one tab per configured visualisation; each of those tabs holds a 2D
    and a 3D scatter diagram. When the clustermap is enabled in the
    configuration, a clustermap of the numeric columns of the first cluster
    frame is appended at the end.

    Args:
        summary: dict with a ``"clusters"`` mapping of key -> frame-like
            object that has a ``"Cluster"`` column (presumably a pandas
            DataFrame; confirm against the caller).

    Returns:
        List of cluster items to show in the interface. The list is empty
        when there are no clusters.
    """
    items: List[Renderable] = []
    image_format = config["plot"]["image_format"].get(str)
    clusters = summary["clusters"]

    for key, cluster in clusters.items():
        data = cluster.drop(["Cluster"], axis=1)
        # Label -1 is shown as "noise"; every other label keeps its number.
        labels = np.array(
            [
                "noise" if label == -1 else f"cluster {label}"
                for label in cluster["Cluster"]
            ]
        )

        visualisation_items = []
        for name, visualisation in config["clusters"]["visualisations"].get():
            n_components_items = []
            for n_components in (2, 3):
                diagram = Image(
                    scatter_dataset(
                        data,
                        labels,
                        # NOTE(review): eval() executes whatever string the
                        # configuration provides. This is only safe while the
                        # configuration is fully trusted; consider a lookup
                        # table of allowed visualisations instead.
                        visualisation=eval(visualisation),
                        n_components=n_components,
                    ),
                    image_format=image_format,
                    # Was a plain string, which rendered the literal text
                    # "{n_components}D" instead of "2D" / "3D".
                    alt=f"{n_components}D",
                    anchor_id=f"{key}_{name}_{n_components}D_diagram",
                    name=f"{n_components}D",
                    classes=f"{key}-{name}-{n_components}D-diagram",
                )
                n_components_items.append(diagram)

            visualisation_items.append(
                Container(
                    n_components_items,
                    sequence_type="tabs",
                    name=name,
                    anchor_id=f"{key}_{name}_tab",
                )
            )

        items.append(
            Container(
                visualisation_items,
                sequence_type="tabs",
                name=key,
                anchor_id=f"{key}_tab",
            )
        )

    # Clustermap: only touch the first cluster frame when it is needed and
    # when there is one, so an empty mapping no longer raises IndexError.
    if clusters and config["clusters"]["clustermap"].get(bool):
        data = next(iter(clusters.values())).drop(["Cluster"], axis=1)
        numeric_columns = list(data.select_dtypes(include=np.number).columns)
        items.append(
            Image(
                clustermap(data[numeric_columns]),
                image_format=image_format,
                alt="Clustermap",
                anchor_id="clustermap_diagram",
                name="Clustermap",
                classes="clustermap-diagram",
            )
        )

    return items
def render_generic(summary):
    """Render the report section for a variable of unsupported type.

    Args:
        summary: variable summary; the keys ``varid``, ``warnings``,
            ``varname``, ``n_missing``, ``p_missing``, ``memory_size`` and
            ``warn_fields`` are read.

    Returns:
        Dict with a ``"top"`` grid container (variable info, a table with
        missing-value and memory statistics, and an empty HTML element) and
        ``"bottom"`` set to ``None``.
    """
    info = VariableInfo(
        anchor_id=summary["varid"],
        warnings=summary["warnings"],
        var_type="Unsupported",
        var_name=summary["varname"],
    )

    table = Table(
        [
            {
                "name": "Missing",
                "value": summary["n_missing"],
                "fmt": "fmt",
                "alert": "n_missing" in summary["warn_fields"],
            },
            {
                "name": "Missing (%)",
                "value": summary["p_missing"],
                "fmt": "fmt_percent",
                "alert": "p_missing" in summary["warn_fields"],
            },
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
                "alert": False,
            },
        ]
    )

    return {
        "top": Container([info, table, HTML("")], sequence_type="grid"),
        "bottom": None,
    }
def get_dataset_column_definitions(definitions: dict):
    """Build the "Variables" overview section from the variable descriptions.

    Args:
        definitions: the variable descriptions, iterated as
            (column, value) pairs.

    Returns:
        A grid container wrapping a single table with one row per entry of
        ``definitions``.
    """
    rows = []
    for column, value in definitions.items():
        rows.append({"name": column, "value": value, "fmt": "fmt"})

    definition_table = Table(
        rows,
        name="Variable descriptions",
        anchor_id="variable_definition_table",
    )

    return Container(
        [definition_table],
        name="Variables",
        anchor_id="variable_descriptions",
        sequence_type="grid",
    )
def get_dataset_reproduction(summary: dict):
    """Build the "Reproduction" overview section.

    Args:
        summary: dataset summary; ``summary["package"]`` supplies the
            package version and configuration, ``summary["analysis"]`` the
            start, end and duration of the analysis.

    Returns:
        A grid container holding the reproduction table.
    """
    version = summary["package"]["pandas_profiling_version"]
    # The quoted configuration is embedded in the data: URI of the download
    # link below.
    config = quote(summary["package"]["pandas_profiling_config"])
    analysis = summary["analysis"]

    rows = [
        {"name": "Analysis started", "value": analysis["date_start"], "fmt": "fmt"},
        {"name": "Analysis finished", "value": analysis["date_end"], "fmt": "fmt"},
        {"name": "Duration", "value": analysis["duration"], "fmt": "fmt_timespan"},
    ]
    rows.append(
        {
            "name": "Software version",
            "value": f'<a href="https://github.com/pandas-profiling/pandas-profiling">pandas-profiling v{version}</a>',
            "fmt": "raw",
        }
    )
    rows.append(
        {
            "name": "Download configuration",
            "value": f'<a download="config.yaml" href="data:text/plain;charset=utf-8,{config}">config.yaml</a>',
            "fmt": "raw",
        }
    )

    reproduction_table = Table(
        rows,
        name="Reproduction",
        anchor_id="overview_reproduction",
    )

    return Container(
        [reproduction_table],
        name="Reproduction",
        anchor_id="reproduction",
        sequence_type="grid",
    )
def render_generic(config: Settings, summary: dict) -> dict:
    """Render the report section for a variable of unsupported type.

    Args:
        config: report Settings object (not read by this function).
        summary: variable summary; the keys ``varid``, ``alerts``,
            ``varname``, ``description``, ``n_missing``, ``p_missing``,
            ``memory_size`` and ``alert_fields`` are read.

    Returns:
        Dict with a ``"top"`` grid container (variable info, a table with
        formatted missing-value and memory statistics, and an empty HTML
        element) and ``"bottom"`` set to ``None``.
    """
    info = VariableInfo(
        anchor_id=summary["varid"],
        alerts=summary["alerts"],
        var_type="Unsupported",
        var_name=summary["varname"],
        description=summary["description"],
    )

    missing_count = {
        "name": "Missing",
        "value": fmt(summary["n_missing"]),
        "alert": "n_missing" in summary["alert_fields"],
    }
    missing_share = {
        "name": "Missing (%)",
        "value": fmt_percent(summary["p_missing"]),
        "alert": "p_missing" in summary["alert_fields"],
    }
    memory = {
        "name": "Memory size",
        "value": fmt_bytesize(summary["memory_size"]),
        "alert": False,
    }
    table = Table([missing_count, missing_share, memory])

    top = Container([info, table, HTML("")], sequence_type="grid")
    return {"top": top, "bottom": None}
def get_scatter_matrix(scatter_matrix: dict) -> list:
    """Returns the interaction components for the report

    Args:
        scatter_matrix: a nested dict containing the scatter plots
            (x column -> y column -> plot)

    Returns:
        A list of components for the interaction section of the report,
        one container per x column
    """
    image_format = config["plot"]["image_format"].get(str)

    sections = []
    for x_col, y_cols in scatter_matrix.items():
        plots = [
            Image(
                splot,
                image_format=image_format,
                alt=f"{x_col} x {y_col}",
                # Spaces are replaced by underscores in the anchor ids.
                anchor_id=f"interactions_{x_col.replace(' ', '_')}_{y_col.replace(' ', '_')}",
                name=y_col,
            )
            for y_col, splot in y_cols.items()
        ]

        # Up to ten plots are shown as tabs, more than ten as a select.
        if len(plots) <= 10:
            sequence_type = "tabs"
        else:
            sequence_type = "select"

        sections.append(
            Container(
                plots,
                sequence_type=sequence_type,
                name=x_col,
                nested=len(scatter_matrix) > 10,
                anchor_id=f"interactions_{x_col.replace(' ', '_')}",
            )
        )

    return sections
def get_scatter_matrix(config: Settings, scatter_matrix: dict) -> list:
    """Returns the interaction components for the report

    Args:
        config: report Settings object
        scatter_matrix: a nested dict containing the scatter plots
            (x column -> y column -> plot)

    Returns:
        A list of components for the interaction section of the report,
        one container per x column
    """
    sections = []
    for x_col, y_cols in scatter_matrix.items():
        plots = []
        for y_col, splot in y_cols.items():
            widget = ImageWidget(
                splot,
                image_format=config.plot.image_format,
                alt=f"{x_col} x {y_col}",
                anchor_id=f"interactions_{slugify(x_col)}_{slugify(y_col)}",
                name=y_col,
            )
            plots.append(widget)

        # Up to ten plots are shown as tabs, more than ten as a select.
        if len(plots) <= 10:
            sequence_type = "tabs"
        else:
            sequence_type = "select"

        section = Container(
            plots,
            sequence_type=sequence_type,
            name=x_col,
            nested=len(scatter_matrix) > 10,
            anchor_id=f"interactions_{slugify(x_col)}",
        )
        sections.append(section)

    return sections
def render_path_image(summary):
    """Render the report section for a variable holding image paths.

    Starts from the output of ``render_path``, relabels the variable type
    as "Image Path", and appends an Exif-keys frequency table and an
    "Image shape" tab group (frequency table plus scatter plot) to the
    bottom container.

    Args:
        summary: variable summary; reads ``varid``, ``n``,
            ``image_shape_counts``, ``exif_keys_counts`` and
            ``scatter_data``.

    Returns:
        The template variables produced by ``render_path``, extended with
        the ``freqtable_image_shape`` / ``freqtable_exif_keys`` entries and
        the additional bottom items.
    """
    varid = summary["varid"]
    n_freq_table_max = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_path(summary)

    # Top
    template_variables["top"].content["items"][0].content["var_type"] = "Image Path"

    # Bottom
    # Only the key names are needed; the titles of the former mapping were
    # never used.
    for key in ("image_shape", "exif_keys"):
        template_variables[f"freqtable_{key}"] = freq_table(
            freqtable=summary[f"{key}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    # TODO: add dropdown to switch to specific values
    exif_keys = FrequencyTable(
        template_variables["freqtable_exif_keys"],
        name="Exif keys",
        anchor_id=f"{varid}exif_frequency",
    )
    template_variables["bottom"].content["items"].append(exif_keys)

    image_shape_freq = FrequencyTable(
        template_variables["freqtable_image_shape"],
        name="Frequency",
        anchor_id=f"{varid}image_shape_frequency",
    )
    image_shape_scatter = Image(
        scatter_series(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot of image sizes",
        caption="Scatterplot of image sizes",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    image_shape = Container(
        [image_shape_freq, image_shape_scatter],
        sequence_type="tabs",
        name="Image shape",
        anchor_id=f"{varid}image_shape",
    )
    template_variables["bottom"].content["items"].append(image_shape)

    return template_variables
def render_file(summary):
    """Render the report section for a variable holding file paths.

    Starts from the output of ``render_path``, relabels the variable type
    as "File", and appends a "File" tab group to the bottom container. The
    group holds a file-size histogram (when ``file_size`` is in the
    summary) and one frequency table per available file timestamp.

    Args:
        summary: variable summary; reads ``varid``, ``n`` and, when
            present, ``histogram_file_size`` and the
            ``file_created_time`` / ``file_accessed_time`` /
            ``file_modified_time`` entries.

    Returns:
        The template variables produced by ``render_path`` with the extra
        bottom item appended.
    """
    varid = summary["varid"]
    template_variables = render_path(summary)

    # Top
    template_variables["top"].content["items"][0].content["var_type"] = "File"

    n_freq_table_max = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    file_tabs = []

    if "file_size" in summary:
        size_histogram = Image(
            histogram(*summary["histogram_file_size"]),
            image_format=image_format,
            alt="Size",
            caption=f"<strong>Histogram with fixed size bins of file sizes (in bytes)</strong> (bins={len(summary['histogram_file_size'][1]) - 1})",
            name="File size",
            anchor_id=f"{varid}file_size_histogram",
        )
        file_tabs.append(size_histogram)

    timestamp_tabs = (
        ("file_created_time", "Created"),
        ("file_accessed_time", "Accessed"),
        ("file_modified_time", "Modified"),
    )
    for file_date_id, description in timestamp_tabs:
        if file_date_id not in summary:
            continue

        rows = freq_table(
            freqtable=summary[file_date_id].value_counts(),
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )
        file_tabs.append(
            FrequencyTable(
                rows,
                name=description,
                anchor_id=f"{varid}{file_date_id}",
                redact=False,
            )
        )

    file_tab = Container(
        file_tabs,
        name="File",
        sequence_type="tabs",
        anchor_id=f"{varid}file",
    )
    template_variables["bottom"].content["items"].append(file_tab)

    return template_variables
def render_categorical_length(summary, varid, image_format):
    """Build the "Length" tab of a categorical variable.

    Args:
        summary: variable summary; reads ``max_length``, ``median_length``,
            ``mean_length``, ``min_length`` and ``histogram_length``.
        varid: variable id, used as prefix of the anchor ids.
        image_format: image format passed on to the histogram image.

    Returns:
        A grid container holding the length histogram and the table of
        length statistics.
    """
    length_stats = (
        ("Max length", "max_length"),
        ("Median length", "median_length"),
        ("Mean length", "mean_length"),
        ("Min length", "min_length"),
    )
    rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": "fmt_numeric",
            "alert": False,
        }
        for label, key in length_stats
    ]
    length_table = Table(
        rows,
        name="Length",
        anchor_id=f"{varid}lengthstats",
    )

    length_histogram = Image(
        histogram(*summary["histogram_length"]),
        image_format=image_format,
        alt="length histogram",
        name="Length",
        caption="Histogram of lengths of the category",
        anchor_id=f"{varid}length",
    )

    return Container(
        [length_histogram, length_table],
        anchor_id=f"{varid}tbl",
        name="Length",
        sequence_type="grid",
    )
def render_categorical_length(summary, varid, image_format):
    """Build the "Length" tab of a categorical variable (Chinese labels).

    Args:
        summary: variable summary; reads ``max_length``, ``median_length``,
            ``mean_length``, ``min_length`` and ``histogram_length``.
        varid: variable id, used as prefix of the anchor ids.
        image_format: image format passed on to the histogram image.

    Returns:
        A grid container holding the length histogram and the table of
        length statistics.
    """
    length_table = Table(
        [
            {
                # "Max length"
                "name": "最大长度",
                "value": summary["max_length"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                # "Median length"
                "name": "中位长度",
                "value": summary["median_length"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                # "Mean length"
                "name": "平均长度",
                "value": summary["mean_length"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                # "Min length"
                "name": "最小长度",
                "value": summary["min_length"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
        ],
        # "Length"
        name="长度",
        anchor_id=f"{varid}lengthstats",
    )

    length = Image(
        histogram(*summary["histogram_length"]),
        image_format=image_format,
        # The image is a histogram of lengths; the previous alt text
        # "Scatter" described a different kind of plot.
        alt="length histogram",
        name="Length",
        anchor_id=f"{varid}length",
    )

    length_tab = Container(
        [length, length_table],
        anchor_id=f"{varid}tbl",
        # "Length"
        name="长度",
        sequence_type="grid",
    )

    return length_tab
def get_dataset_schema(metadata) -> Optional[Container]:
    """Build the "Dataset" overview section from the dataset metadata.

    Rows are added for the non-empty ``description``, ``creator`` and
    ``author`` entries, for ``url`` (rendered as a raw HTML link) and for a
    non-empty ``copyright_holder`` (with ``copyright_year`` appended when
    present).

    Args:
        metadata: mapping with the optional dataset metadata entries.

    Returns:
        A grid container wrapping the dataset metadata table.
    """

    def is_filled(key):
        # Present and non-empty.
        return key in metadata and len(metadata[key]) > 0

    about_dataset = [
        {"name": key.capitalize(), "value": metadata[key], "fmt": "fmt"}
        for key in ("description", "creator", "author")
        if is_filled(key)
    ]

    if "url" in metadata:
        link = f'<a href="{metadata["url"]}">{metadata["url"]}</a>'
        about_dataset.append({"name": "URL", "value": link, "fmt": "raw"})

    if is_filled("copyright_holder"):
        if "copyright_year" in metadata:
            notice = f"(c) {metadata['copyright_holder']} {metadata['copyright_year']}"
        else:
            notice = f"(c) {metadata['copyright_holder']}"
        about_dataset.append({"name": "Copyright", "value": notice, "fmt": "fmt"})

    dataset_table = Table(about_dataset, name="Dataset", anchor_id="metadata_dataset")
    return Container(
        [dataset_table],
        name="Dataset",
        anchor_id="dataset",
        sequence_type="grid",
    )
def get_scatter_matrix(scatter_matrix: dict) -> list:
    """Returns the interaction components for the report

    Args:
        scatter_matrix: a nested dict containing the scatter plots
            (x column -> y column -> plot)

    Returns:
        A list of components for the interaction section of the report,
        one container per x column
    """
    image_format = config["plot"]["image_format"].get(str)

    # Keep letters, digits and whitespace. Digits used to be stripped as
    # well, so columns such as "col1" and "col2" produced the same anchor id.
    not_alphanum = re.compile(r"[^a-zA-Z0-9\s]")
    whitespace = re.compile(r"\s")

    def clean_name(name):
        # str() keeps non-string column labels (e.g. integers) from raising;
        # every whitespace character, not only " ", becomes an underscore.
        return whitespace.sub("_", not_alphanum.sub("", str(name)))

    titems = []
    for x_col, y_cols in scatter_matrix.items():
        x_anchor = clean_name(x_col)
        items = [
            ImageWidget(
                splot,
                image_format=image_format,
                alt=f"{x_col} x {y_col}",
                anchor_id=f"interactions_{x_anchor}_{clean_name(y_col)}",
                name=y_col,
            )
            for y_col, splot in y_cols.items()
        ]

        titems.append(
            Container(
                items,
                # Up to ten plots are shown as tabs, more than ten as a select.
                sequence_type="tabs" if len(items) <= 10 else "select",
                name=x_col,
                nested=len(scatter_matrix) > 10,
                anchor_id=f"interactions_{x_anchor}",
            )
        )

    return titems
def render_categorical_frequency(summary, varid, image_format):
    """Build the "Overview" tab with uniqueness statistics of a categorical.

    Args:
        summary: variable summary; reads ``n_unique``, ``p_unique``,
            ``warn_fields`` and ``histogram_frequencies``.
        varid: variable id, used as prefix of the anchor ids.
        image_format: image format passed on to the histogram image.

    Returns:
        A grid container holding the frequencies histogram and the
        uniqueness table.
    """
    unique_count_row = {
        "name": "Unique",
        "value": f"{summary['n_unique']} {help('The number of unique values (all values that occur exactly once in the dataset).')}",
        "fmt": "raw",
        "alert": "n_unique" in summary["warn_fields"],
    }
    unique_share_row = {
        "name": "Unique (%)",
        "value": summary["p_unique"],
        "fmt": "fmt_percent",
        "alert": "p_unique" in summary["warn_fields"],
    }
    uniqueness_table = Table(
        [unique_count_row, unique_share_row],
        name="Unique",
        anchor_id=f"{varid}uniquenessstats",
    )

    frequencies_histogram = Image(
        histogram(*summary["histogram_frequencies"]),
        image_format=image_format,
        alt="frequencies histogram",
        name="Frequencies histogram",
        caption="Frequencies of value counts",
        anchor_id=f"{varid}frequencies",
    )

    return Container(
        [frequencies_histogram, uniqueness_table],
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="grid",
    )
def get_scatter_matrix(scatter_matrix):
    """Returns the interaction components for the report

    Args:
        scatter_matrix: a nested dict containing the scatter plots
            (x column -> y column -> plot)

    Returns:
        A list of tab containers, one per x column, each holding one image
        per y column
    """
    image_format = config["plot"]["image_format"].get(str)

    def anchor_part(column):
        # HTML ids must not contain whitespace, so spaces in column names
        # are replaced; str() keeps non-string column labels working.
        return str(column).replace(" ", "_")

    titems = []
    for x_col, y_cols in scatter_matrix.items():
        x_anchor = anchor_part(x_col)
        items = [
            Image(
                splot,
                image_format=image_format,
                alt=f"{x_col} x {y_col}",
                anchor_id=f"interactions_{x_anchor}_{anchor_part(y_col)}",
                name=y_col,
            )
            for y_col, splot in y_cols.items()
        ]

        titems.append(
            Container(
                items,
                sequence_type="tabs",
                name=x_col,
                anchor_id=f"interactions_{x_anchor}",
            )
        )

    return titems
def render_url(summary):
    """Render the report section for a URL variable.

    The bottom part is a tab group with the full frequency table plus one
    frequency table per URL component (scheme, netloc, path, query,
    fragment). The top part shows the variable info, a statistics table
    and a small frequency table.

    Args:
        summary: variable summary; reads ``varid``, ``varname``,
            ``warnings``, ``n``, ``value_counts``, the ``*_counts`` entry of
            every URL component, ``n_unique``, ``p_unique``, ``n_missing``,
            ``p_missing`` and ``memory_size``.

    Returns:
        The template variables from ``render_common``, extended with the
        per-component ``freqtable_*`` entries and the ``top`` / ``bottom``
        containers.
    """
    varid = summary["varid"]
    n_freq_table_max = config["n_freq_table_max"].get(int)
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)

    # TODO: merge with boolean/categorical
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=n_obs_cat,
    )
    template_variables = render_common(summary)

    url_parts = ("scheme", "netloc", "path", "query", "fragment")
    for url_part in url_parts:
        template_variables[f"freqtable_{url_part}"] = freq_table(
            freqtable=summary[f"{url_part}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )

    # "Full" comes first, followed by one tab per URL component; the tab
    # name is the capitalised component name.
    tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Full",
            anchor_id=f"{varid}full_frequency",
        )
    ]
    for url_part in url_parts:
        tabs.append(
            FrequencyTable(
                template_variables[f"freqtable_{url_part}"],
                name=url_part.capitalize(),
                anchor_id=f"{varid}{url_part}_frequency",
            )
        )

    template_variables["bottom"] = Container(
        tabs,
        sequence_type="tabs",
        name="url stats",
        anchor_id=f"{varid}urlstats",
    )

    # Element composition
    info = VariableInfo(
        summary["varid"], summary["varname"], "URL", summary["warnings"]
    )

    statistics = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
        ("Memory size", "memory_size", "fmt_bytesize"),
    )
    table = Table(
        [
            {
                "name": label,
                "value": summary[key],
                "fmt": formatter,
                "alert": False,
            }
            for label, key, formatter in statistics
        ]
    )

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    return template_variables
def render_real(summary):
    """Render the report section for a real-valued variable.

    The top part holds the variable info, two statistics tables and a mini
    histogram. The bottom part is a tab group with quantile/descriptive
    statistics, the histogram(s), the common values and the extreme values.

    Args:
        summary: variable summary with the statistics read below (among
            others ``min``, ``max``, ``mean``, the percentile keys,
            ``histogram_data`` and ``warn_fields``). A dynamic histogram is
            added when ``histogram_bins_bayesian_blocks`` is present.

    Returns:
        The template variables from ``render_common`` with the ``top`` and
        ``bottom`` containers set.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Variables without negative values get the non-negative reals label.
    name = (
        "Real number (ℝ<sub>≥0</sub>)"
        if summary["min"] >= 0
        else "Real number (ℝ)"
    )

    def stat_row(label, key, formatter, check_warning=False):
        # Row for the top tables; alerts only for rows that check warn_fields.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"] if check_warning else False,
        }

    def numeric_row(label, key):
        # Row for the bottom statistics tables (no alert entry).
        return {"name": label, "value": summary[key], "fmt": "fmt_numeric"}

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        name,
        summary["warnings"],
        summary["description"],
    )

    table1 = Table(
        [
            stat_row("Distinct count", "n_unique", "fmt", True),
            stat_row("Unique (%)", "p_unique", "fmt_percent", True),
            stat_row("Missing", "n_missing", "fmt", True),
            stat_row("Missing (%)", "p_missing", "fmt_percent", True),
            stat_row("Infinite", "n_infinite", "fmt", True),
            stat_row("Infinite (%)", "p_infinite", "fmt_percent", True),
        ]
    )

    table2 = Table(
        [
            stat_row("Mean", "mean", "fmt"),
            stat_row("Minimum", "min", "fmt"),
            stat_row("Maximum", "max", "fmt"),
            stat_row("Zeros", "n_zeros", "fmt", True),
            stat_row("Zeros (%)", "p_zeros", "fmt_percent", True),
            stat_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    histogram_bins = 10

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, histogram_bins),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    quantile_statistics = Table(
        [
            numeric_row("Minimum", "min"),
            numeric_row("5-th percentile", "5%"),
            numeric_row("Q1", "25%"),
            numeric_row("median", "50%"),
            numeric_row("Q3", "75%"),
            numeric_row("95-th percentile", "95%"),
            numeric_row("Maximum", "max"),
            numeric_row("Range", "range"),
            numeric_row("Interquartile range (IQR)", "iqr"),
        ],
        name="Quantile statistics",
    )

    # The skewness row carries a "class" entry instead of "alert".
    skewness_row = numeric_row("Skewness", "skewness")
    skewness_row["class"] = "alert" if "skewness" in summary["warn_fields"] else ""

    descriptive_statistics = Table(
        [
            numeric_row("Standard deviation", "std"),
            numeric_row("Coefficient of variation (CV)", "cv"),
            numeric_row("Kurtosis", "kurtosis"),
            numeric_row("Mean", "mean"),
            numeric_row("Median Absolute Deviation (MAD)", "mad"),
            skewness_row,
            numeric_row("Sum", "sum"),
            numeric_row("Variance", "variance"),
        ],
        name="Descriptive statistics",
    )

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    fixed_histogram = Image(
        histogram(summary["histogram_data"], summary, histogram_bins),
        image_format=image_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={histogram_bins})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    seqs = [fixed_histogram]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    smallest_values = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id=f"{varid}firstn",
    )
    largest_values = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id=f"{varid}lastn",
    )
    evs = Container(
        [smallest_values, largest_values],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    if "histogram_bins_bayesian_blocks" in summary:
        dynamic_caption = '<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'
        histo_dyn = Image(
            histogram(
                summary["histogram_data"],
                summary,
                summary["histogram_bins_bayesian_blocks"],
            ),
            image_format=image_format,
            alt="Histogram",
            caption=dynamic_caption.format(
                fmt_array(summary["histogram_bins_bayesian_blocks"], threshold=5)
            ),
            name="Dynamic Histogram",
            anchor_id=f"{varid}dynamic_histogram",
        )
        seqs.append(histo_dyn)

    histograms = Container(
        seqs,
        sequence_type="tabs",
        name="Histogram(s)",
        anchor_id=f"{varid}histograms",
    )
    template_variables["bottom"] = Container(
        [statistics, histograms, fq, evs],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
def render_real(summary):
    """Render the report section for a real-valued variable.

    The top part holds the variable info, two statistics tables and a mini
    histogram. The bottom part is a tab group with quantile/descriptive
    statistics, the histogram, the common values and the extreme values.

    Args:
        summary: variable summary with the statistics read below (among
            others ``min``, ``max``, ``mean``, the percentile keys,
            ``histogram``, ``monotonic`` and ``warn_fields``).

    Returns:
        The template variables from ``render_common`` with the ``top`` and
        ``bottom`` containers set.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Variables without negative values get the non-negative reals label.
    name = (
        "Real number (ℝ<sub>≥0</sub>)"
        if summary["min"] >= 0
        else "Real number (ℝ)"
    )

    def stat_row(label, key, formatter, check_warning=False):
        # Row for the top tables; alerts only for rows that check warn_fields.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"] if check_warning else False,
        }

    def numeric_row(label, key, formatter="fmt_numeric"):
        # Row for the bottom statistics tables (no alert entry).
        return {"name": label, "value": summary[key], "fmt": formatter}

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        name,
        summary["warnings"],
        summary["description"],
    )

    table1 = Table(
        [
            stat_row("Distinct", "n_distinct", "fmt", True),
            stat_row("Distinct (%)", "p_distinct", "fmt_percent", True),
            stat_row("Missing", "n_missing", "fmt", True),
            stat_row("Missing (%)", "p_missing", "fmt_percent", True),
            stat_row("Infinite", "n_infinite", "fmt", True),
            stat_row("Infinite (%)", "p_infinite", "fmt_percent", True),
            stat_row("Mean", "mean", "fmt_numeric"),
        ]
    )

    table2 = Table(
        [
            stat_row("Minimum", "min", "fmt_numeric"),
            stat_row("Maximum", "max", "fmt_numeric"),
            stat_row("Zeros", "n_zeros", "fmt", True),
            stat_row("Zeros (%)", "p_zeros", "fmt_percent", True),
            stat_row("Negative", "n_negative", "fmt"),
            stat_row("Negative (%)", "p_negative", "fmt_percent"),
            stat_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    mini_histo = Image(
        mini_histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    quantile_statistics = Table(
        [
            numeric_row("Minimum", "min"),
            numeric_row("5-th percentile", "5%"),
            numeric_row("Q1", "25%"),
            numeric_row("median", "50%"),
            numeric_row("Q3", "75%"),
            numeric_row("95-th percentile", "95%"),
            numeric_row("Maximum", "max"),
            numeric_row("Range", "range"),
            numeric_row("Interquartile range (IQR)", "iqr"),
        ],
        name="Quantile statistics",
    )

    # The skewness row carries a "class" entry instead of "alert".
    skewness_row = numeric_row("Skewness", "skewness")
    skewness_row["class"] = "alert" if "skewness" in summary["warn_fields"] else ""

    descriptive_statistics = Table(
        [
            numeric_row("Standard deviation", "std"),
            numeric_row("Coefficient of variation (CV)", "cv"),
            numeric_row("Kurtosis", "kurtosis"),
            numeric_row("Mean", "mean"),
            numeric_row("Median Absolute Deviation (MAD)", "mad"),
            skewness_row,
            numeric_row("Sum", "sum"),
            numeric_row("Variance", "variance"),
            numeric_row("Monotonicity", "monotonic", "fmt_monotonic"),
        ],
        name="Descriptive statistics",
    )

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    hist = Image(
        histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    smallest_values = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id=f"{varid}firstn",
        redact=False,
    )
    largest_values = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id=f"{varid}lastn",
        redact=False,
    )
    evs = Container(
        [smallest_values, largest_values],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [statistics, hist, fq, evs],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
def render_image(config: Settings, summary: dict) -> dict:
    """Render the report section for an image variable.

    Starts from the output of ``render_file``, relabels the variable type
    as "Image", and appends an "Image" tab group to the bottom container.
    The group holds the Exif data (when ``exif_keys_counts`` is in the
    summary) followed by the image dimensions.

    Args:
        config: report Settings object.
        summary: variable summary; reads ``varid``, ``n``, the
            min/median/max width, height and area entries,
            ``image_dimensions`` and, when present, ``exif_keys_counts``
            and ``exif_data``.

    Returns:
        The template variables produced by ``render_file`` with the extra
        bottom item appended.
    """
    varid = summary["varid"]
    n_freq_table_max = config.n_freq_table_max
    redact = config.vars.cat.redact

    template_variables = render_file(config, summary)

    # Top
    template_variables["top"].content["items"][0].content["var_type"] = "Image"

    # Bottom
    image_items = []

    # Overview layout: one table each for width, height and area, each
    # listing the min, median and max value. (The original layout note
    # states that all dimension properties are in pixels.)
    def dimension_table(dimension):
        return Table(
            [
                {
                    "name": f"{statistic} {dimension}",
                    "value": fmt_numeric(
                        summary[f"{statistic.lower()}_{dimension}"],
                        precision=config.report.precision,
                    ),
                    "alert": False,
                }
                for statistic in ("Min", "Median", "Max")
            ]
        )

    dimension_overview = Container(
        [dimension_table(dimension) for dimension in ("width", "height", "area")],
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="grid",
    )
    dimension_scatter = Image(
        scatter_series(config, summary["image_dimensions"]),
        image_format=config.plot.image_format,
        alt="Scatter plot of image sizes",
        caption="Scatter plot of image sizes",
        name="Scatter plot",
        anchor_id=f"{varid}image_dimensions_scatter",
    )
    dimension_frequencies = FrequencyTable(
        freq_table(
            freqtable=summary["image_dimensions"].value_counts(),
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        ),
        name="Common values",
        anchor_id=f"{varid}image_dimensions_frequency",
        redact=False,
    )

    image_shape = Container(
        [dimension_overview, dimension_scatter, dimension_frequencies],
        sequence_type="named_list",
        name="Dimensions",
        anchor_id=f"{varid}image_dimensions",
    )

    if "exif_keys_counts" in summary:
        exif_tables = [
            FrequencyTable(
                freq_table(
                    freqtable=pd.Series(summary["exif_keys_counts"]),
                    n=summary["n"],
                    max_number_to_print=n_freq_table_max,
                ),
                name="Exif keys",
                anchor_id=f"{varid}exif_keys",
                redact=redact,
            )
        ]
        # One table per Exif entry; "exif_keys" is skipped here.
        exif_tables.extend(
            FrequencyTable(
                freq_table(
                    freqtable=counts,
                    n=summary["n"],
                    max_number_to_print=n_freq_table_max,
                ),
                name=key,
                anchor_id=f"{varid}_exif_{key}",
                redact=redact,
            )
            for key, counts in summary["exif_data"].items()
            if key != "exif_keys"
        )

        image_items.append(
            Container(
                exif_tables,
                anchor_id=f"{varid}exif_data",
                name="Exif data",
                sequence_type="named_list",
            )
        )

    image_items.append(image_shape)

    image_tab = Container(
        image_items,
        name="Image",
        sequence_type="tabs",
        anchor_id=f"{varid}image",
    )
    template_variables["bottom"].content["items"].append(image_tab)

    return template_variables
def render_categorical(config: Settings, summary: dict) -> dict:
    """Render the report section for a categorical variable.

    The top part holds the variable info, a statistics table and a small
    frequency table. The bottom part is a tab group with an overview, the
    categories and, depending on the configuration, words and characters.

    Args:
        config: report Settings object; the ``vars.cat`` options
            ``length``, ``characters``, ``words`` and ``redact`` decide
            which parts are rendered.
        summary: variable summary with the statistics read below.

    Returns:
        The template variables from ``render_common`` with the ``top`` and
        ``bottom`` containers set.
    """
    varid = summary["varid"]
    max_mini_rows = config.vars.cat.n_obs
    image_format = config.plot.image_format
    show_words = config.vars.cat.words
    show_characters = config.vars.cat.characters
    show_length = config.vars.cat.length

    template_variables = render_common(config, summary)

    def stat_row(label, value, alert_key=None):
        # Rows without an alert key never raise an alert.
        return {
            "name": label,
            "value": value,
            "alert": alert_key in summary["alert_fields"] if alert_key else False,
        }

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Categorical",
        summary["alerts"],
        summary["description"],
    )

    table = Table(
        [
            stat_row("Distinct", fmt(summary["n_distinct"]), "n_distinct"),
            stat_row("Distinct (%)", fmt_percent(summary["p_distinct"]), "p_distinct"),
            stat_row("Missing", fmt(summary["n_missing"]), "n_missing"),
            stat_row("Missing (%)", fmt_percent(summary["p_missing"]), "p_missing"),
            stat_row("Memory size", fmt_bytesize(summary["memory_size"])),
        ]
    )

    fqm = FrequencyTableSmall(
        freq_table(
            freqtable=summary["value_counts_without_nan"],
            n=summary["count"],
            max_number_to_print=max_mini_rows,
        ),
        redact=config.vars.cat.redact,
    )

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    # Bottom
    frequency_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}common_values",
        redact=config.vars.cat.redact,
    )

    unique_stats = render_categorical_frequency(config, summary, varid)

    # Overview: length table, character table, uniqueness and sample rows.
    overview_items = []

    if show_length:
        length_table, length_histo = render_categorical_length(config, summary, varid)
        overview_items.append(length_table)

    if show_characters:
        overview_table_char, unitab = render_categorical_unicode(
            config, summary, varid
        )
        overview_items.append(overview_table_char)

    overview_items.append(unique_stats)

    if not config.vars.cat.redact:
        row_labels = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
        sample_rows = []
        for label, value in zip(row_labels, summary["first_rows"]):
            sample_rows.append({"name": label, "value": fmt(value), "alert": False})
        overview_items.append(Table(sample_rows, name="Sample"))

    # Categories: frequency table, length histogram and optional pie chart.
    string_items: List[Renderable] = [frequency_table]
    if show_length:
        string_items.append(length_histo)

    max_unique = config.plot.pie.max_unique
    if max_unique > 0 and summary["n_distinct"] <= max_unique:
        pie_chart = Image(
            pie_plot(
                config,
                summary["value_counts_without_nan"],
                legend_kws={"loc": "upper right"},
            ),
            image_format=image_format,
            alt="Pie chart",
            name="Pie chart",
            anchor_id=f"{varid}pie_chart",
        )
        string_items.append(pie_chart)

    overview_tab = Container(
        overview_items,
        name="Overview",
        anchor_id=f"{varid}overview",
        sequence_type="batch_grid",
        batch_size=len(overview_items),
        titles=False,
    )
    categories_tab = Container(
        string_items,
        name="Categories",
        anchor_id=f"{varid}string",
        sequence_type="batch_grid",
        batch_size=len(string_items),
    )
    bottom_items = [overview_tab, categories_tab]

    if show_words:
        word_rows = freq_table(
            freqtable=summary["word_counts"],
            n=summary["word_counts"].sum(),
            max_number_to_print=10,
        )
        common_words = FrequencyTable(
            word_rows,
            name="Common words",
            anchor_id=f"{varid}cwo",
            redact=config.vars.cat.redact,
        )
        bottom_items.append(
            Container(
                [common_words],
                name="Words",
                anchor_id=f"{varid}word",
                sequence_type="grid",
            )
        )

    if show_characters:
        bottom_items.append(
            Container(
                [unitab],
                name="Characters",
                anchor_id=f"{varid}characters",
                sequence_type="grid",
            )
        )

    template_variables["bottom"] = Container(
        bottom_items, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
def render_categorical_unicode(
    config: Settings, summary: dict, varid: str
) -> Tuple[Renderable, Renderable]:
    """Build the Unicode character analysis for a categorical variable.

    Args:
        config: Report settings; ``n_freq_table_max`` caps every frequency
            table and ``vars.cat.redact`` controls redaction of the
            per-character tables.
        summary: Summary of the variable. The ``*_counts`` entries are used
            with ``.sum()`` and the ``*_char_counts`` entries with
            ``.items()``.
        varid: Prefix used for every anchor id created here.

    Returns:
        A tuple ``(overview_table, unicode_tabs)``: a table with character
        totals and a tabbed container with the "Characters", "Categories",
        "Scripts" and "Blocks" breakdowns.
    """
    n_freq_table_max = config.n_freq_table_max

    def counts_table(counts, total, **table_kwargs):
        # Every table in this section is capped at the same number of rows.
        return FrequencyTable(
            freq_table(
                freqtable=counts,
                n=total,
                max_number_to_print=n_freq_table_max,
            ),
            **table_kwargs,
        )

    def largest_first(counts_by_name):
        # Order the (name, counts) pairs by descending number of entries.
        return sorted(counts_by_name.items(), key=lambda pair: -len(pair[1]))

    def per_group_grid(tables, title, anchor):
        return Container(
            tables,
            name=title,
            sequence_type="batch_grid",
            anchor_id=anchor,
            batch_size=2,
            subtitles=True,
        )

    # Categories
    category_overview = counts_table(
        summary["category_alias_counts"],
        summary["category_alias_counts"].sum(),
        name="Most occurring categories",
        anchor_id=f"{varid}category_long_values",
        redact=False,
    )
    readable_categories = (
        (alias.replace("_", " "), counts)
        for alias, counts in largest_first(summary["category_alias_char_counts"])
    )
    category_tables = [
        counts_table(
            counts,
            counts.sum(),
            name=f"{alias}",
            anchor_id=f"{varid}category_alias_values_{alias}",
            redact=config.vars.cat.redact,
        )
        for alias, counts in readable_categories
    ]
    category_items = [
        category_overview,
        per_group_grid(
            category_tables,
            "Most frequent character per category",
            f"{varid}categories",
        ),
    ]

    # Scripts
    script_overview = counts_table(
        summary["script_counts"],
        summary["script_counts"].sum(),
        name="Most occurring scripts",
        anchor_id=f"{varid}script_values",
        redact=False,
    )
    script_tables = []
    for script_name, script_counts in largest_first(summary["script_char_counts"]):
        script_tables.append(
            counts_table(
                script_counts,
                script_counts.sum(),
                name=f"{script_name}",
                anchor_id=f"{varid}script_values_{script_name}",
                redact=config.vars.cat.redact,
            )
        )
    script_items = [
        script_overview,
        per_group_grid(
            script_tables,
            "Most frequent character per script",
            f"{varid}scripts",
        ),
    ]

    # Blocks (kept in the iteration order of the summary, not sorted)
    block_overview = counts_table(
        summary["block_alias_counts"],
        summary["block_alias_counts"].sum(),
        name="Most occurring blocks",
        anchor_id=f"{varid}block_alias_values",
        redact=False,
    )
    block_tables = []
    for block_name, block_counts in summary["block_alias_char_counts"].items():
        block_tables.append(
            counts_table(
                block_counts,
                block_counts.sum(),
                name=f"{block_name}",
                anchor_id=f"{varid}block_alias_values_{block_name}",
                redact=config.vars.cat.redact,
            )
        )
    block_items = [
        block_overview,
        per_group_grid(
            block_tables,
            "Most frequent character per block",
            f"{varid}blocks",
        ),
    ]

    # Overview table with totals and links to background reading.
    overview_rows = [
        {
            "name": "Total characters",
            "value": fmt_number(summary["n_characters"]),
            "alert": False,
        },
        {
            "name": "Distinct characters",
            "value": fmt_number(summary["n_characters_distinct"]),
            "alert": False,
        },
        {
            "name": "Distinct categories",
            "value": f"{fmt_number(summary['n_category'])} {help(title='Unicode categories (click for more information)', url='https://en.wikipedia.org/wiki/Unicode_character_property#General_Category')}",
            "alert": False,
        },
        {
            "name": "Distinct scripts",
            "value": f"{fmt_number(summary['n_scripts'])} {help(title='Unicode scripts (click for more information)', url='https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode')}",
            "alert": False,
        },
        {
            "name": "Distinct blocks",
            "value": f"{fmt_number(summary['n_block_alias'])} {help(title='Unicode blocks (click for more information)', url='https://en.wikipedia.org/wiki/Unicode_block')}",
            "alert": False,
        },
    ]
    overview_table = Table(
        overview_rows,
        name="Characters and Unicode",
        caption="The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
    )

    # One tab per breakdown.
    character_table = counts_table(
        summary["character_counts"],
        summary["n_characters"],
        name="Most occurring characters",
        anchor_id=f"{varid}character_frequency",
        redact=config.vars.cat.redact,
    )
    tab_specs = (
        ([character_table], "Characters", "characters"),
        (category_items, "Categories", "categories"),
        (script_items, "Scripts", "scripts"),
        (block_items, "Blocks", "blocks"),
    )
    citems = [
        Container(
            tab_content,
            name=tab_title,
            anchor_id=f"{varid}{tab_slug}",
            sequence_type="named_list",
        )
        for tab_content, tab_title, tab_slug in tab_specs
    ]

    unicode_tabs = Container(
        citems,
        name="Unicode",
        sequence_type="tabs",
        anchor_id=f"{varid}unicode",
    )
    return overview_table, unicode_tabs
def render_categorical(summary):
    """Assemble the report elements for a categorical variable.

    Settings are read from the module-level ``config`` object.

    Args:
        summary: Summary of the variable. Keys read here include "varid",
            "varname", "warnings", "warn_fields", "description",
            "n_unique", "p_unique", "n_missing", "p_missing",
            "memory_size", "value_counts" and "count".

    Returns:
        The dict produced by ``render_common`` with the "top" and "bottom"
        entries set to the containers built here.
    """
    varid = summary["varid"]
    cat_config = config["vars"]["cat"]
    n_obs_cat = cat_config["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)
    redact = cat_config["redact"].get(bool)

    template_variables = render_common(summary)

    def stat_row(label, key, formatter, alertable=True):
        # The table receives the raw value plus the name of its formatter.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": alertable and key in summary["warn_fields"],
        }

    # Top
    header = VariableInfo(
        varid,
        summary["varname"],
        "分类变量",
        summary["warnings"],
        summary["description"],
    )
    statistics = Table(
        [
            stat_row("唯一值计数", "n_unique", "fmt"),
            stat_row("唯一值比例 (%)", "p_unique", "fmt_percent"),
            stat_row("缺失值", "n_missing", "fmt"),
            stat_row("缺失值比例(%)", "p_missing", "fmt_percent"),
            stat_row("内存占用", "memory_size", "fmt_bytesize", alertable=False),
        ]
    )
    short_frequencies = FrequencyTableSmall(
        freq_table(
            freqtable=summary["value_counts"],
            n=summary["count"],
            max_number_to_print=n_obs_cat,
        ),
        redact=redact,
    )
    template_variables["top"] = Container(
        [header, statistics, short_frequencies], sequence_type="grid"
    )

    # Bottom: common values first, then the optional tabs.
    tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="常见值",
            anchor_id=f"{varid}common_values",
            redact=redact,
        )
    ]

    max_unique = config["plot"]["pie"]["max_unique"].get(int)
    if max_unique > 0 and summary["n_unique"] <= max_unique:
        pie = Image(
            pie_plot(summary["value_counts"], legend_kws={"loc": "upper right"}),
            image_format=image_format,
            alt="Chart",
            name="图表",
            anchor_id=f"{varid}pie_chart",
        )
        tabs.append(pie)

    if cat_config["length"].get(bool):
        tabs.append(render_categorical_length(summary, varid, image_format))

    if cat_config["unicode"].get(bool):
        tabs.append(render_categorical_unicode(summary, varid, redact))

    template_variables["bottom"] = Container(
        tabs, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )
    return template_variables
def render_date(summary):
    """Assemble the report elements for a date variable.

    The image format is read from the module-level ``config`` object.

    Args:
        summary: Summary of the variable. Keys read here include "varid",
            "varname", "warnings", "description", the unique/missing/memory
            statistics, "min", "max", "histogram_data" and "histogram_bins".

    Returns:
        A dict with a "top" grid (info, two tables, mini histogram) and a
        "bottom" tab container holding the full histogram.
    """
    varid = summary["varid"]
    # TODO: render common?
    image_format = config["plot"]["image_format"].get(str)

    def plain_row(label, key, formatter):
        # None of the rows of a date variable is ever flagged.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": False,
        }

    # Top
    header = VariableInfo(
        varid,
        summary["varname"],
        "Date",
        summary["warnings"],
        summary["description"],
    )
    counts_table = Table(
        [
            plain_row("Distinct count", "n_unique", "fmt"),
            plain_row("Unique (%)", "p_unique", "fmt_percent"),
            plain_row("Missing", "n_missing", "fmt"),
            plain_row("Missing (%)", "p_missing", "fmt_percent"),
            plain_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )
    range_table = Table(
        [
            plain_row("Minimum", "min", "fmt"),
            plain_row("Maximum", "max", "fmt"),
        ]
    )
    small_histogram = Image(
        mini_histogram(
            summary["histogram_data"], summary, summary["histogram_bins"]
        ),
        image_format=image_format,
        alt="Mini histogram",
    )
    top = Container(
        [header, counts_table, range_table, small_histogram],
        sequence_type="grid",
    )

    # Bottom
    full_histogram = Image(
        histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    bottom = Container(
        [full_histogram],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return {"top": top, "bottom": bottom}
def get_report_structure(summary: dict) -> Renderable:
    """Build the renderable structure of the report from the summary.

    The "Overview" and "Variables" sections are always present; the
    "Interactions", correlation, "Missing values", "Sample" and
    "Duplicate rows" sections are added only when they have content.

    Args:
        summary: Statistics to use for the overview, variables,
            correlations and missing values. The "messages", "scatter",
            "sample" and "duplicates" entries are read directly here.

    Returns:
        The root element of the report, holding the sections and a footer.
    """
    show_progress = config["progress_bar"].get(bool)
    with tqdm(
        total=1, desc="Generate report structure", disable=not show_progress
    ) as progress:
        messages = summary["messages"]

        overview_section = Container(
            get_dataset_items(summary, messages),
            sequence_type="tabs",
            name="Overview",
            anchor_id="overview",
        )
        variables_section = Container(
            render_variables_section(summary),
            sequence_type="accordion",
            name="Variables",
            anchor_id="variables",
        )
        section_items: List[Renderable] = [overview_section, variables_section]

        scatter_items = get_scatter_matrix(summary["scatter"])
        if len(scatter_items) > 0:
            # More than ten interactions are shown through a selector
            # rather than as tabs.
            if len(scatter_items) <= 10:
                interaction_layout = "tabs"
            else:
                interaction_layout = "select"
            section_items.append(
                Container(
                    scatter_items,
                    sequence_type=interaction_layout,
                    name="Interactions",
                    anchor_id="interactions",
                ),
            )

        correlations = get_correlation_items(summary)
        if correlations is not None:
            section_items.append(correlations)

        missing_items = get_missing_items(summary)
        if len(missing_items) > 0:
            missing_section = Container(
                missing_items,
                sequence_type="tabs",
                name="Missing values",
                anchor_id="missing",
            )
            section_items.append(missing_section)

        sample_items = get_sample_items(summary["sample"])
        if len(sample_items) > 0:
            sample_section = Container(
                items=sample_items,
                sequence_type="list",
                name="Sample",
                anchor_id="sample",
            )
            section_items.append(sample_section)

        duplicate_items = get_duplicates_items(summary["duplicates"])
        if len(duplicate_items) > 0:
            duplicates_section = Container(
                items=duplicate_items,
                sequence_type="list",
                name="Duplicate rows",
                anchor_id="duplicate",
            )
            section_items.append(duplicates_section)

        sections = Container(section_items, name="Root", sequence_type="sections")
        progress.update()

    footer = HTML(
        content='Report generated with <a href="https://github.com/pandas-profiling/pandas-profiling">pandas-profiling</a>.'
    )

    return Root("Root", sections, footer)
def render_path(summary):
    """Assemble the report elements for a path variable.

    Starts from the categorical rendering, relabels the variable type as
    "Path" and appends a "Path" tab with an overview table and one frequency
    table per path component.

    Args:
        summary: Summary of the variable. In addition to what
            ``render_categorical`` reads, the ``<part>_counts`` and
            ``n_<part>_unique`` entries for name, parent, suffix, stem and
            anchor, plus "n" and "common_prefix", are used.

    Returns:
        The template variables of ``render_categorical``, extended with the
        ``freqtable_<part>`` entries and the additional "Path" tab.
    """
    varid = summary["varid"]
    n_freq_table_max = config["n_freq_table_max"].get(int)
    redact = config["vars"]["cat"]["redact"].get(bool)

    template_variables = render_categorical(summary)

    # Frequency rows for each component of the path.
    for part in ("name", "parent", "suffix", "stem", "anchor"):
        part_rows = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )
        template_variables[f"freqtable_{part}"] = part_rows

    # Top: the first item of the grid is the variable info; relabel its type.
    variable_info = template_variables["top"].content["items"][0]
    variable_info.content["var_type"] = "Path"

    # Bottom
    overview_spec = (
        ("Common prefix", "common_prefix", "fmt"),
        ("Unique stems", "n_stem_unique", "fmt_numeric"),
        ("Unique names", "n_name_unique", "fmt_numeric"),
        ("Unique extensions", "n_suffix_unique", "fmt_numeric"),
        ("Unique directories", "n_parent_unique", "fmt_numeric"),
        ("Unique anchors", "n_anchor_unique", "fmt_numeric"),
    )
    overview_rows = [
        {"name": label, "value": summary[key], "fmt": formatter, "alert": False}
        for label, key, formatter in overview_spec
    ]
    path_overview_tab = Container(
        [Table(overview_rows)],
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="list",
    )

    # (template variable, tab title, anchor slug) for each frequency tab.
    frequency_spec = (
        ("freq_table_rows", "Full", "full"),
        ("freqtable_stem", "Stem", "stem"),
        ("freqtable_name", "Name", "name"),
        ("freqtable_suffix", "Extension", "suffix"),
        ("freqtable_parent", "Parent", "parent"),
        ("freqtable_anchor", "Anchor", "anchor"),
    )
    path_items = [path_overview_tab]
    for rows_key, title, slug in frequency_spec:
        path_items.append(
            FrequencyTable(
                template_variables[rows_key],
                name=title,
                anchor_id=f"{varid}{slug}_frequency",
                redact=redact,
            )
        )

    path_tab = Container(
        path_items,
        name="Path",
        sequence_type="tabs",
        anchor_id=f"{varid}path",
    )
    template_variables["bottom"].content["items"].append(path_tab)

    return template_variables
def render_categorical_unicode(summary, varid, redact):
    """Build the "Unicode" tab container for a categorical variable.

    The maximum number of rows per frequency table is read from the
    module-level ``config`` object.

    Args:
        summary: Summary of the variable. The ``*_counts`` entries are used
            with ``.sum()`` and the ``*_char_counts`` entries with
            ``.items()``.
        varid: Prefix used for every anchor id created here.
        redact: Redaction flag forwarded to the per-character frequency
            tables; the overview tables per category, script and block are
            never redacted.

    Returns:
        A tabbed container with an overview table and the character,
        category, script and block breakdowns.
    """
    n_freq_table_max = config["n_freq_table_max"].get(int)

    def counts_table(counts, title, anchor, redacted):
        # Frequencies are expressed relative to the sum of the counts.
        return FrequencyTable(
            freq_table(
                freqtable=counts,
                n=counts.sum(),
                max_number_to_print=n_freq_table_max,
            ),
            name=title,
            anchor_id=anchor,
            redact=redacted,
        )

    # Categories: one overview table followed by a table per category.
    category_items = [
        counts_table(
            summary["category_alias_counts"],
            "最常见分类",
            f"{varid}category_long_values",
            False,
        )
    ]
    readable_categories = (
        (alias.replace("_", " "), counts)
        for alias, counts in summary["category_alias_char_counts"].items()
    )
    category_items.extend(
        counts_table(
            counts,
            f"最常见字符 {alias} ",
            f"{varid}category_alias_values_{alias}",
            redact,
        )
        for alias, counts in readable_categories
    )

    # Scripts
    script_items = [
        counts_table(
            summary["script_counts"],
            "最常见值",
            f"{varid}script_values",
            False,
        ),
    ]
    script_items.extend(
        counts_table(
            script_counts,
            f"最常见 {script_name} 字符",
            f"{varid}script_values_{script_name}",
            redact,
        )
        for script_name, script_counts in summary["script_char_counts"].items()
    )

    # Blocks
    block_items = [
        counts_table(
            summary["block_alias_counts"],
            "最常见区段",
            f"{varid}block_alias_values",
            False,
        )
    ]
    block_items.extend(
        counts_table(
            block_counts,
            f"最频繁 {block_name} 字符",
            f"{varid}block_alias_values_{block_name}",
            redact,
        )
        for block_name, block_counts in summary["block_alias_char_counts"].items()
    )

    # Overview table; three of the labels embed a link to background reading.
    overview_spec = (
        ("字符", "n_characters"),
        (
            '类别 (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_character_property#General_Category">?</a>)',
            "n_category",
        ),
        (
            '书写系统 (<a target="_blank" href="https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode">?</a>)',
            "n_scripts",
        ),
        (
            '区段 (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_block">?</a>)',
            "n_block_alias",
        ),
    )
    overview_table = Table(
        [
            {
                "name": label,
                "value": summary[key],
                "fmt": "fmt_numeric",
                "alert": False,
            }
            for label, key in overview_spec
        ],
        name="Unicode属性概述",
        caption="Unicode标准为每个字符提供了唯一的数字编号(code point),可以用来分析文本变量。",
    )
    overview_tab = Container(
        [overview_table],
        anchor_id=f"{varid}character_overview",
        name="概要",
        sequence_type="list",
    )

    characters_tab = Container(
        [
            counts_table(
                summary["character_counts"],
                "最常出现的字符",
                f"{varid}character_frequency",
                redact,
            ),
        ],
        name="字符",
        anchor_id=f"{varid}characters",
        sequence_type="named_list",
    )
    categories_tab = Container(
        category_items,
        name="分类",
        anchor_id=f"{varid}categories",
        sequence_type="named_list",
    )
    scripts_tab = Container(
        script_items,
        name="书写系统",
        anchor_id=f"{varid}scripts",
        sequence_type="named_list",
    )
    blocks_tab = Container(
        block_items,
        name="区段",
        anchor_id=f"{varid}blocks",
        sequence_type="named_list",
    )

    citems = [overview_tab, characters_tab, categories_tab, scripts_tab, blocks_tab]
    return Container(
        citems,
        name="Unicode",
        sequence_type="tabs",
        anchor_id=f"{varid}unicode",
    )
def render_boolean(config: Settings, summary: dict) -> dict:
    """Assemble the report elements for a boolean variable.

    Args:
        config: Report settings; ``vars.bool.n_obs``, ``plot.image_format``
            and ``plot.pie.max_unique`` are read.
        summary: Summary of the variable. Keys read here include "varid",
            "varname", "alerts", "alert_fields", "description", the
            distinct/missing/memory statistics, "value_counts_without_nan"
            and "n".

    Returns:
        The dict produced by ``render_common`` with the "top" and "bottom"
        entries set to the containers built here.
    """
    varid = summary["varid"]
    n_obs_bool = config.vars.bool.n_obs
    image_format = config.plot.image_format

    # Prepare variables
    template_variables = render_common(config, summary)

    def stat_row(label, text, alert_key=None):
        # A row is flagged only when its key is listed among the alert fields.
        flagged = alert_key is not None and alert_key in summary["alert_fields"]
        return {"name": label, "value": text, "alert": flagged}

    # Element composition
    header = VariableInfo(
        anchor_id=varid,
        alerts=summary["alerts"],
        var_type="Boolean",
        var_name=summary["varname"],
        description=summary["description"],
    )
    statistics = Table(
        [
            stat_row("Distinct", fmt(summary["n_distinct"]), "n_distinct"),
            stat_row(
                "Distinct (%)", fmt_percent(summary["p_distinct"]), "p_distinct"
            ),
            stat_row("Missing", fmt(summary["n_missing"]), "n_missing"),
            stat_row("Missing (%)", fmt_percent(summary["p_missing"]), "p_missing"),
            stat_row("Memory size", fmt_bytesize(summary["memory_size"])),
        ]
    )
    short_frequencies = FrequencyTableSmall(
        freq_table(
            freqtable=summary["value_counts_without_nan"],
            n=summary["n"],
            max_number_to_print=n_obs_bool,
        ),
        redact=False,
    )
    template_variables["top"] = Container(
        [header, statistics, short_frequencies], sequence_type="grid"
    )

    common_values = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}frequency_table",
        redact=False,
    )
    items: List[Renderable] = [common_values]

    # The pie chart is only switched off by a non-positive max_unique.
    if config.plot.pie.max_unique > 0:
        chart = Image(
            pie_plot(
                config,
                summary["value_counts_without_nan"],
                legend_kws={"loc": "upper right"},
            ),
            image_format=image_format,
            alt="Chart",
            name="Chart",
            anchor_id=f"{varid}pie_chart",
        )
        items.append(chart)

    template_variables["bottom"] = Container(
        items, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )
    return template_variables
def render_complex(summary):
    """Assemble the report elements for a complex-number variable.

    The image format is read from the module-level ``config`` object.

    Args:
        summary: Summary of the variable. Keys read here include "varid",
            "varname", "warnings", "description", the distinct/missing/
            memory statistics, "mean", "min", "max", "n_zeros", "p_zeros"
            and "scatter_data".

    Returns:
        A dict with a "top" grid (info, two tables, empty placeholder) and a
        "bottom" tab container holding the scatter plot.
    """
    varid = summary["varid"]
    image_format = config["plot"]["image_format"].get(str)

    def rows(spec):
        # Rows of a complex variable carry no "alert" entry.
        return [
            {"name": label, "value": summary[key], "fmt": formatter}
            for label, key, formatter in spec
        ]

    # Top
    header = VariableInfo(
        varid,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
        summary["description"],
    )
    counts_table = Table(
        rows(
            (
                ("Distinct", "n_distinct", "fmt"),
                ("Distinct (%)", "p_distinct", "fmt_percent"),
                ("Missing", "n_missing", "fmt"),
                ("Missing (%)", "p_missing", "fmt_percent"),
                ("Memory size", "memory_size", "fmt_bytesize"),
            )
        )
    )
    values_table = Table(
        rows(
            (
                ("Mean", "mean", "fmt_numeric"),
                ("Minimum", "min", "fmt_numeric"),
                ("Maximum", "max", "fmt_numeric"),
                ("Zeros", "n_zeros", "fmt_numeric"),
                ("Zeros (%)", "p_zeros", "fmt_percent"),
            )
        )
    )
    # Empty element that fills the fourth cell of the grid.
    filler = HTML("")
    top = Container(
        [header, counts_table, values_table, filler], sequence_type="grid"
    )

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Container([scatter], sequence_type="tabs", anchor_id=summary["varid"])

    return {"top": top, "bottom": bottom}
def render_boolean(summary):
    """Assemble the report elements for a boolean variable.

    Settings are read from the module-level ``config`` object.

    Args:
        summary: Summary of the variable. Keys read here include "varid",
            "varname", "warnings", "warn_fields", "description", the
            distinct/missing/memory statistics, "value_counts" and "n".

    Returns:
        The dict produced by ``render_common`` with the "top" and "bottom"
        entries set to the containers built here.
    """
    varid = summary["varid"]
    n_obs_bool = config["vars"]["bool"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    # Prepare variables
    template_variables = render_common(summary)

    def stat_row(label, key, formatter, alertable=True):
        # The table receives the raw value plus the name of its formatter.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": alertable and key in summary["warn_fields"],
        }

    # Element composition
    header = VariableInfo(
        anchor_id=varid,
        warnings=summary["warnings"],
        var_type="Boolean",
        var_name=summary["varname"],
        description=summary["description"],
    )
    statistics = Table(
        [
            stat_row("Distinct", "n_distinct", "fmt"),
            stat_row("Distinct (%)", "p_distinct", "fmt_percent"),
            stat_row("Missing", "n_missing", "fmt"),
            stat_row("Missing (%)", "p_missing", "fmt_percent"),
            stat_row("Memory size", "memory_size", "fmt_bytesize", alertable=False),
        ]
    )
    short_frequencies = FrequencyTableSmall(
        freq_table(
            freqtable=summary["value_counts"],
            n=summary["n"],
            max_number_to_print=n_obs_bool,
        ),
        redact=False,
    )
    template_variables["top"] = Container(
        [header, statistics, short_frequencies], sequence_type="grid"
    )

    tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}frequency_table",
            redact=False,
        )
    ]

    # The pie chart is only switched off by a non-positive max_unique.
    if config["plot"]["pie"]["max_unique"].get(int) > 0:
        chart = Image(
            pie_plot(summary["value_counts"], legend_kws={"loc": "upper right"}),
            image_format=image_format,
            alt="Chart",
            name="Chart",
            anchor_id=f"{varid}pie_chart",
        )
        tabs.append(chart)

    template_variables["bottom"] = Container(
        tabs, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )
    return template_variables
def render_count(config: Settings, summary: dict) -> dict:
    """Assemble the report elements for a count variable.

    Args:
        config: Report settings; ``plot.image_format`` and
            ``report.precision`` are read.
        summary: Summary of the variable. Keys read here include "varid",
            "varname", "warnings", "description", the distinct/missing/
            zeros/memory statistics, "mean", "min", "max" and "histogram".
            "histogram" is unpacked into the plotting calls and its second
            element is used to derive the number of bins.

    Returns:
        The dict produced by ``render_common`` with the "top" and "bottom"
        entries set to the containers built here.
    """
    varid = summary["varid"]
    template_variables = render_common(config, summary)
    image_format = config.plot.image_format
    precision = config.report.precision

    def plain_row(label, text):
        # None of the rows of a count variable is ever flagged.
        return {"name": label, "value": text, "alert": False}

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["warnings"],
        summary["description"],
    )

    table1 = Table(
        [
            plain_row("Distinct", fmt(summary["n_distinct"])),
            plain_row("Distinct (%)", fmt_percent(summary["p_distinct"])),
            plain_row("Missing", fmt(summary["n_missing"])),
            plain_row("Missing (%)", fmt_percent(summary["p_missing"])),
        ]
    )

    table2 = Table(
        [
            plain_row("Mean", fmt_numeric(summary["mean"], precision=precision)),
            plain_row("Minimum", fmt_numeric(summary["min"], precision=precision)),
            plain_row("Maximum", fmt_numeric(summary["max"], precision=precision)),
            plain_row("Zeros", fmt(summary["n_zeros"])),
            plain_row("Zeros (%)", fmt_percent(summary["p_zeros"])),
            plain_row("Memory size", fmt_bytesize(summary["memory_size"])),
        ]
    )

    mini_histo = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom. Every nested anchor id is prefixed with the variable id so that
    # several count variables in one report do not share the same anchors
    # (the tabs would otherwise point at another variable's panes).
    seqs = [
        Image(
            histogram(config, *summary["histogram"]),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            Container(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables