def render_categorical_length(
    config: Settings, summary: dict, varid: str
) -> Tuple[Renderable, Renderable]:
    """Build the "Length" statistics table and histogram of a categorical variable.

    Args:
        config: report settings; ``report.precision`` and ``plot.image_format``
            are read here and the object is forwarded to ``histogram``.
        summary: variable summary; the keys ``max_length``, ``median_length``,
            ``mean_length``, ``min_length`` and ``histogram_length`` are read.
        varid: prefix used for the anchor ids of both renderables.

    Returns:
        A ``(length_table, length_histogram)`` pair.
    """

    def _count_row(label, key):
        # Rows formatted with ``fmt_number``; none of them is flagged as an alert.
        return {"name": label, "value": fmt_number(summary[key]), "alert": False}

    stat_rows = [
        _count_row("Max length", "max_length"),
        _count_row("Median length", "median_length"),
        {
            "name": "Mean length",
            # The mean is the only row formatted with the configured precision.
            "value": fmt_numeric(
                summary["mean_length"], precision=config.report.precision
            ),
            "alert": False,
        },
        _count_row("Min length", "min_length"),
    ]
    length_table = Table(
        stat_rows,
        name="Length",
        anchor_id=f"{varid}lengthstats",
    )

    length_plot = histogram(config, *summary["histogram_length"])
    length_histo = Image(
        length_plot,
        image_format=config.plot.image_format,
        alt="length histogram",
        name="Length",
        caption="Histogram of lengths of the category",
        anchor_id=f"{varid}length",
    )

    return length_table, length_histo
def render_image(config: Settings, summary: dict) -> dict:
    """Extend the file rendering of a variable with an "Image" tab.

    Starts from the template variables returned by ``render_file``, relabels
    the variable type in the top section as "Image" and appends an "Image"
    tab container to the bottom section.

    Args:
        config: report settings (``n_freq_table_max``, ``vars.cat.redact``,
            ``report.precision`` and ``plot.image_format`` are read).
        summary: variable summary; width/height/area statistics,
            ``image_dimensions``, ``n`` and, optionally, ``exif_keys_counts``
            together with ``exif_data`` are read.

    Returns:
        The template variables produced by ``render_file``, updated in place.
    """
    varid = summary["varid"]
    n_freq_table_max = config.n_freq_table_max
    redact = config.vars.cat.redact

    template_variables = render_file(config, summary)

    # Top
    header = template_variables["top"].content["items"][0]
    header.content["var_type"] = "Image"

    # Bottom
    def _dimension_table(dimension):
        # One table per dimension with its Min / Median / Max rows.
        rows = [
            {
                "name": f"{stat.capitalize()} {dimension}",
                "value": fmt_numeric(
                    summary[f"{stat}_{dimension}"],
                    precision=config.report.precision,
                ),
                "alert": False,
            }
            for stat in ("min", "median", "max")
        ]
        return Table(rows)

    # Overview grid layout (one column per table):
    #   Min width      Min height      Min area
    #   Median width   Median height   Median area
    #   Max width      Max height      Max area
    overview = Container(
        [_dimension_table(dimension) for dimension in ("width", "height", "area")],
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="grid",
    )
    scatter = Image(
        scatter_series(config, summary["image_dimensions"]),
        image_format=config.plot.image_format,
        alt="Scatter plot of image sizes",
        caption="Scatter plot of image sizes",
        name="Scatter plot",
        anchor_id=f"{varid}image_dimensions_scatter",
    )
    common_dimensions = FrequencyTable(
        freq_table(
            freqtable=summary["image_dimensions"].value_counts(),
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        ),
        name="Common values",
        anchor_id=f"{varid}image_dimensions_frequency",
        redact=False,
    )
    image_shape = Container(
        [overview, scatter, common_dimensions],
        sequence_type="named_list",
        name="Dimensions",
        anchor_id=f"{varid}image_dimensions",
    )

    tabs = []
    if "exif_keys_counts" in summary:
        exif_tables = [
            FrequencyTable(
                freq_table(
                    freqtable=pd.Series(summary["exif_keys_counts"]),
                    n=summary["n"],
                    max_number_to_print=n_freq_table_max,
                ),
                name="Exif keys",
                anchor_id=f"{varid}exif_keys",
                redact=redact,
            )
        ]
        # One frequency table per exif entry; the "exif_keys" entry is skipped.
        exif_tables.extend(
            FrequencyTable(
                freq_table(
                    freqtable=counts,
                    n=summary["n"],
                    max_number_to_print=n_freq_table_max,
                ),
                name=key,
                anchor_id=f"{varid}_exif_{key}",
                redact=redact,
            )
            for key, counts in summary["exif_data"].items()
            if key != "exif_keys"
        )
        tabs.append(
            Container(
                exif_tables,
                anchor_id=f"{varid}exif_data",
                name="Exif data",
                sequence_type="named_list",
            )
        )

    # The dimensions tab always comes after the (optional) exif tab.
    tabs.append(image_shape)

    image_tab = Container(
        tabs,
        name="Image",
        sequence_type="tabs",
        anchor_id=f"{varid}image",
    )
    template_variables["bottom"].content["items"].append(image_tab)

    return template_variables
def render_complex(config: Settings, summary: dict) -> dict:
    """Build the template variables for a complex-valued variable.

    Args:
        config: report settings (``plot.image_format`` and
            ``report.precision`` are read; the object is forwarded to
            ``scatter_complex``).
        summary: variable summary; identification fields, distinct/missing
            counts, ``memory_size``, ``mean``/``min``/``max``, zero counts and
            ``scatter_data`` are read.

    Returns:
        A dict with a ``"top"`` grid container and a ``"bottom"`` tabs container.
    """
    varid = summary["varid"]
    image_format = config.plot.image_format

    def _numeric(key):
        return fmt_numeric(summary[key], precision=config.report.precision)

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
        summary["description"],
    )

    count_specs = (
        ("Distinct", fmt, "n_distinct"),
        ("Distinct (%)", fmt_percent, "p_distinct"),
        ("Missing", fmt, "n_missing"),
        ("Missing (%)", fmt_percent, "p_missing"),
        ("Memory size", fmt_bytesize, "memory_size"),
    )
    table1 = Table(
        [
            {"name": label, "value": formatter(summary[key])}
            for label, formatter, key in count_specs
        ]
    )

    value_rows = [
        {"name": label, "value": _numeric(key)}
        for label, key in (
            ("Mean", "mean"),
            ("Minimum", "min"),
            ("Maximum", "max"),
            ("Zeros", "n_zeros"),
        )
    ]
    value_rows.append({"name": "Zeros (%)", "value": fmt_percent(summary["p_zeros"])})
    table2 = Table(value_rows)

    # Empty HTML element passed as the fourth item of the top grid.
    placeholder = HTML("")

    top = Container([info, table1, table2, placeholder], sequence_type="grid")

    # Bottom
    scatter = Image(
        scatter_complex(config, summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Container([scatter], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
def test_fmt_numeric(value, precision, expected):
    """Check that ``fmt_numeric(value, precision)`` equals ``expected``.

    NOTE(review): the arguments are presumably supplied by a parametrize
    decorator or fixtures outside this view - confirm.
    """
    formatted = fmt_numeric(value, precision)
    assert formatted == expected
def render_path(config: Settings, summary: dict) -> dict:
    """Extend the categorical rendering of a variable with a "Path" tab.

    Starts from the template variables returned by ``render_categorical``,
    stores one frequency table per path component under
    ``freqtable_<component>``, relabels the variable type in the top section
    as "Path" and appends a "Path" tab container to the bottom section.

    Args:
        config: report settings (``n_freq_table_max``, ``vars.cat.redact`` and
            ``report.precision`` are read).
        summary: variable summary; ``varid``, ``n``, ``common_prefix`` and the
            ``<component>_counts`` / ``n_<component>_unique`` entries for
            name, parent, suffix, stem and anchor are read.

    Returns:
        The template variables produced by ``render_categorical``, updated in
        place.
    """
    varid = summary["varid"]
    n_freq_table_max = config.n_freq_table_max
    redact = config.vars.cat.redact

    template_variables = render_categorical(config, summary)

    for component in ("name", "parent", "suffix", "stem", "anchor"):
        component_table = freq_table(
            freqtable=summary[f"{component}_counts"],
            n=summary["n"],
            max_number_to_print=n_freq_table_max,
        )
        template_variables[f"freqtable_{component}"] = component_table

    # Top
    header = template_variables["top"].content["items"][0]
    header.content["var_type"] = "Path"

    # Bottom
    def _unique_row(label, component):
        return {
            "name": label,
            "value": fmt_numeric(
                summary[f"n_{component}_unique"],
                precision=config.report.precision,
            ),
            "alert": False,
        }

    overview_rows = [
        {
            "name": "Common prefix",
            "value": fmt(summary["common_prefix"]),
            "alert": False,
        },
        _unique_row("Unique stems", "stem"),
        _unique_row("Unique names", "name"),
        _unique_row("Unique extensions", "suffix"),
        _unique_row("Unique directories", "parent"),
        _unique_row("Unique anchors", "anchor"),
    ]
    path_overview_tab = Container(
        [Table(overview_rows)],
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="list",
    )

    # (tab title, template variable holding the rows, anchor id fragment)
    frequency_tabs = (
        ("Full", "freq_table_rows", "full"),
        ("Stem", "freqtable_stem", "stem"),
        ("Name", "freqtable_name", "name"),
        ("Extension", "freqtable_suffix", "suffix"),
        ("Parent", "freqtable_parent", "parent"),
        ("Anchor", "freqtable_anchor", "anchor"),
    )
    path_items = [path_overview_tab]
    path_items.extend(
        FrequencyTable(
            template_variables[rows_key],
            name=title,
            anchor_id=f"{varid}{fragment}_frequency",
            redact=redact,
        )
        for title, rows_key, fragment in frequency_tabs
    )

    path_tab = Container(
        path_items,
        name="Path",
        sequence_type="tabs",
        anchor_id=f"{varid}path",
    )
    template_variables["bottom"].content["items"].append(path_tab)

    return template_variables
def render_count(config: Settings, summary: dict) -> dict:
    """Build the template variables for a count variable.

    Starts from the template variables returned by ``render_common`` and
    replaces its ``"top"`` and ``"bottom"`` entries.

    Anchor ids of the bottom tabs are prefixed with the variable id, as
    ``render_real`` does, so that several count variables rendered into the
    same report do not produce duplicate ids. The extreme-value tabs are
    labelled with ``config.n_extreme_obs`` rather than a fixed "5", again
    matching how ``render_real`` labels the same ``render_common`` data.

    Args:
        config: report settings (``plot.image_format``, ``report.precision``
            and ``n_extreme_obs`` are read; the object is forwarded to the
            plotting helpers).
        summary: variable summary; identification fields, distinct/missing/
            zero counts, ``mean``/``min``/``max``, ``memory_size`` and
            ``histogram`` are read.

    Returns:
        The template variables from ``render_common`` with ``"top"`` (grid
        container) and ``"bottom"`` (tabs container) set.
    """
    template_variables = render_common(config, summary)
    image_format = config.plot.image_format
    varid = summary["varid"]

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["warnings"],
        summary["description"],
    )

    table1 = Table(
        [
            {
                "name": "Distinct",
                "value": fmt(summary["n_distinct"]),
                "alert": False,
            },
            {
                "name": "Distinct (%)",
                "value": fmt_percent(summary["p_distinct"]),
                "alert": False,
            },
            {
                "name": "Missing",
                "value": fmt(summary["n_missing"]),
                "alert": False,
            },
            {
                "name": "Missing (%)",
                "value": fmt_percent(summary["p_missing"]),
                "alert": False,
            },
        ]
    )

    table2 = Table(
        [
            {
                "name": "Mean",
                "value": fmt_numeric(
                    summary["mean"], precision=config.report.precision
                ),
                "alert": False,
            },
            {
                "name": "Minimum",
                "value": fmt_numeric(summary["min"], precision=config.report.precision),
                "alert": False,
            },
            {
                "name": "Maximum",
                "value": fmt_numeric(summary["max"], precision=config.report.precision),
                "alert": False,
            },
            {
                "name": "Zeros",
                "value": fmt(summary["n_zeros"]),
                "alert": False,
            },
            {
                "name": "Zeros (%)",
                "value": fmt_percent(summary["p_zeros"]),
                "alert": False,
            },
            {
                "name": "Memory size",
                "value": fmt_bytesize(summary["memory_size"]),
                "alert": False,
            },
        ]
    )

    mini_histo = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    seqs = [
        Image(
            histogram(config, *summary["histogram"]),
            image_format=image_format,
            alt="Histogram",
            # The second histogram element is used as the list of bin edges,
            # hence "length - 1" bins.
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name=f"Minimum {config.n_extreme_obs} values",
                anchor_id=f"{varid}firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name=f"Maximum {config.n_extreme_obs} values",
                anchor_id=f"{varid}lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            Container(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        # The outer container keeps the bare variable id as its anchor.
        anchor_id=varid,
    )

    return template_variables
def get_dataset_overview(config: Settings, summary: dict) -> Renderable:
    """Build the "Overview" container with dataset statistics and variable types.

    Args:
        config: report settings; ``report.precision`` is used to format the
            per-type counts.
        summary: dataset summary; everything is read from ``summary["table"]``
            (``n_var``, ``n``, missing-cell figures, optional duplicate
            figures, memory sizes and ``types``).

    Returns:
        A grid container holding the "Dataset statistics" table followed by
        the "Variable types" table.
    """
    table_stats = summary["table"]

    table_metrics = [
        {"name": "Number of variables", "value": fmt_number(table_stats["n_var"])},
        {"name": "Number of observations", "value": fmt_number(table_stats["n"])},
        {"name": "Missing cells", "value": fmt_number(table_stats["n_cells_missing"])},
        {
            "name": "Missing cells (%)",
            "value": fmt_percent(table_stats["p_cells_missing"]),
        },
    ]

    # Duplicate figures are only shown when the summary contains them.
    if "n_duplicates" in table_stats:
        table_metrics += [
            {
                "name": "Duplicate rows",
                "value": fmt_number(table_stats["n_duplicates"]),
            },
            {
                "name": "Duplicate rows (%)",
                "value": fmt_percent(table_stats["p_duplicates"]),
            },
        ]

    table_metrics += [
        {
            "name": "Total size in memory",
            "value": fmt_bytesize(table_stats["memory_size"]),
        },
        {
            "name": "Average record size in memory",
            "value": fmt_bytesize(table_stats["record_size"]),
        },
    ]

    dataset_info = Table(table_metrics, name="Dataset statistics")

    type_rows = []
    for type_name, count in table_stats["types"].items():
        type_rows.append(
            {
                "name": str(type_name),
                "value": fmt_numeric(count, precision=config.report.precision),
            }
        )
    dataset_types = Table(type_rows, name="Variable types")

    return Container(
        [dataset_info, dataset_types],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
def render_real(config: Settings, summary: dict) -> dict:
    """Build the template variables for a real-valued variable.

    Starts from the template variables returned by ``render_common`` and
    replaces its ``"top"`` and ``"bottom"`` entries.

    Args:
        config: report settings (``plot.image_format``, ``report.precision``
            and ``n_extreme_obs`` are read; the object is forwarded to the
            plotting helpers).
        summary: variable summary; identification fields, ``alerts``,
            ``alert_fields``, count/percentage figures, quantiles,
            descriptive statistics, ``monotonic`` and ``histogram`` are read.

    Returns:
        The template variables from ``render_common`` with ``"top"`` (grid
        container) and ``"bottom"`` (tabs container) set.
    """
    varid = summary["varid"]
    template_variables = render_common(config, summary)
    image_format = config.plot.image_format

    # The type label depends on whether the minimum is non-negative.
    type_label = (
        "Real number (ℝ<sub>≥0</sub>)" if summary["min"] >= 0 else "Real number (ℝ)"
    )

    def _numeric(key):
        return fmt_numeric(summary[key], precision=config.report.precision)

    def _flagged(field):
        return field in summary["alert_fields"]

    def _row(label, value, alert):
        return {"name": label, "value": value, "alert": alert}

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        type_label,
        summary["alerts"],
        summary["description"],
    )

    # Rows whose alert flag is "is this summary key listed in alert_fields".
    flagged_specs = (
        ("Distinct", fmt, "n_distinct"),
        ("Distinct (%)", fmt_percent, "p_distinct"),
        ("Missing", fmt, "n_missing"),
        ("Missing (%)", fmt_percent, "p_missing"),
        ("Infinite", fmt, "n_infinite"),
        ("Infinite (%)", fmt_percent, "p_infinite"),
    )
    first_rows = [
        _row(label, formatter(summary[key]), _flagged(key))
        for label, formatter, key in flagged_specs
    ]
    first_rows.append(_row("Mean", _numeric("mean"), False))
    table1 = Table(first_rows)

    table2 = Table(
        [
            _row("Minimum", _numeric("min"), False),
            _row("Maximum", _numeric("max"), False),
            _row("Zeros", fmt(summary["n_zeros"]), _flagged("n_zeros")),
            _row("Zeros (%)", fmt_percent(summary["p_zeros"]), _flagged("p_zeros")),
            _row("Negative", fmt(summary["n_negative"]), False),
            _row("Negative (%)", fmt_percent(summary["p_negative"]), False),
            _row("Memory size", fmt_bytesize(summary["memory_size"]), False),
        ]
    )

    mini_histo = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    quantile_specs = (
        ("Minimum", "min"),
        ("5-th percentile", "5%"),
        ("Q1", "25%"),
        ("median", "50%"),
        ("Q3", "75%"),
        ("95-th percentile", "95%"),
        ("Maximum", "max"),
        ("Range", "range"),
        ("Interquartile range (IQR)", "iqr"),
    )
    quantile_statistics = Table(
        [{"name": label, "value": _numeric(key)} for label, key in quantile_specs],
        name="Quantile statistics",
    )

    descriptive_statistics = Table(
        [
            {"name": "Standard deviation", "value": _numeric("std")},
            {"name": "Coefficient of variation (CV)", "value": _numeric("cv")},
            {"name": "Kurtosis", "value": _numeric("kurtosis")},
            {"name": "Mean", "value": _numeric("mean")},
            {"name": "Median Absolute Deviation (MAD)", "value": _numeric("mad")},
            {
                "name": "Skewness",
                "value": _numeric("skewness"),
                # NOTE(review): this row signals its alert through a "class"
                # key, whereas the top tables use a boolean "alert" key -
                # confirm the table renderer honours "class".
                "class": "alert" if _flagged("skewness") else "",
            },
            {"name": "Sum", "value": _numeric("sum")},
            {"name": "Variance", "value": _numeric("variance")},
            {"name": "Monotonicity", "value": fmt_monotonic(summary["monotonic"])},
        ],
        name="Descriptive statistics",
    )

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    histogram_plot = histogram(config, *summary["histogram"])
    # The second histogram element is used as the list of bin edges, hence
    # "length - 1" bins.
    bin_count = len(summary["histogram"][1]) - 1
    hist = Image(
        histogram_plot,
        image_format=image_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={bin_count})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    smallest = FrequencyTable(
        template_variables["firstn_expanded"],
        name=f"Minimum {config.n_extreme_obs} values",
        anchor_id=f"{varid}firstn",
        redact=False,
    )
    largest = FrequencyTable(
        template_variables["lastn_expanded"],
        name=f"Maximum {config.n_extreme_obs} values",
        anchor_id=f"{varid}lastn",
        redact=False,
    )
    evs = Container(
        [smallest, largest],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [statistics, hist, fq, evs],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables