def render_real(summary):
    """Assemble the report components for a real-valued variable.

    Args:
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``warnings``, ``warn_fields``,
            the count/percentage statistics, the quantiles (``"5%"`` through
            ``"95%"``), ``histogram_data`` and, optionally,
            ``histogram_bins_bayesian_blocks``.

    Returns:
        The dict returned by ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` entries added.
    """

    def warned_row(label, field, formatter):
        # Row whose alert flag mirrors membership of ``field`` in warn_fields.
        return {
            "name": label,
            "value": summary[field],
            "fmt": formatter,
            "alert": field in summary["warn_fields"],
        }

    def plain_row(label, field, formatter):
        # Row that is never highlighted.
        return {
            "name": label,
            "value": summary[field],
            "fmt": formatter,
            "alert": False,
        }

    def numeric_row(label, field):
        # Row for the statistics tables, which carry no "alert" key at all.
        return {"name": label, "value": summary[field], "fmt": "fmt_numeric"}

    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    type_label = (
        "Real number (ℝ<sub>≥0</sub>)"
        if summary["min"] >= 0
        else "Real number (ℝ)"
    )

    # ---- Top: info card, two summary tables and a mini histogram ----
    info = VariableInfo(
        summary["varid"], summary["varname"], type_label, summary["warnings"]
    )

    table1 = Table(
        [
            warned_row("Distinct count", "n_unique", "fmt"),
            warned_row("Unique (%)", "p_unique", "fmt_percent"),
            warned_row("Missing", "n_missing", "fmt"),
            warned_row("Missing (%)", "p_missing", "fmt_percent"),
            warned_row("Infinite", "n_infinite", "fmt"),
            warned_row("Infinite (%)", "p_infinite", "fmt_percent"),
        ]
    )

    table2 = Table(
        [
            plain_row("Mean", "mean", "fmt"),
            plain_row("Minimum", "min", "fmt"),
            plain_row("Maximum", "max", "fmt"),
            warned_row("Zeros", "n_zeros", "fmt"),
            warned_row("Zeros (%)", "p_zeros", "fmt_percent"),
            plain_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    histogram_bins = 10

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, histogram_bins),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # ---- Bottom: statistics, histograms, common and extreme values ----
    quantile_spec = (
        ("Minimum", "min"),
        ("5-th percentile", "5%"),
        ("Q1", "25%"),
        ("median", "50%"),
        ("Q3", "75%"),
        ("95-th percentile", "95%"),
        ("Maximum", "max"),
        ("Range", "range"),
        ("Interquartile range (IQR)", "iqr"),
    )
    quantile_statistics = Table(
        [numeric_row(label, field) for label, field in quantile_spec],
        name="Quantile statistics",
    )

    descriptive_statistics = Table(
        [
            numeric_row("Standard deviation", "std"),
            numeric_row("Coefficient of variation (CV)", "cv"),
            numeric_row("Kurtosis", "kurtosis"),
            numeric_row("Mean", "mean"),
            numeric_row("Median Absolute Deviation (MAD)", "mad"),
            # NOTE(review): this row uses a "class" key while every other
            # highlighted row uses a boolean "alert" key -- confirm which one
            # the table template actually reads.
            {
                "name": "Skewness",
                "value": summary["skewness"],
                "fmt": "fmt_numeric",
                "class": "alert" if "skewness" in summary["warn_fields"] else "",
            },
            numeric_row("Sum", "sum"),
            numeric_row("Variance", "variance"),
        ],
        name="Descriptive statistics",
    )

    statistics = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{varid}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    fixed_bins_plot = histogram(summary["histogram_data"], summary, histogram_bins)
    seqs = [
        Image(
            fixed_bins_plot,
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={histogram_bins})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    smallest = FrequencyTable(
        template_variables["firstn_expanded"],
        name="Minimum 5 values",
        anchor_id=f"{varid}firstn",
    )
    largest = FrequencyTable(
        template_variables["lastn_expanded"],
        name="Maximum 5 values",
        anchor_id=f"{varid}lastn",
    )
    evs = Container(
        [smallest, largest],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    # The variable-width histogram is added only when its bin edges exist.
    if "histogram_bins_bayesian_blocks" in summary:
        dynamic_plot = histogram(
            summary["histogram_data"],
            summary,
            summary["histogram_bins_bayesian_blocks"],
        )
        caption_template = '<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'
        seqs.append(
            Image(
                dynamic_plot,
                image_format=image_format,
                alt="Histogram",
                caption=caption_template.format(
                    fmt_array(summary["histogram_bins_bayesian_blocks"], threshold=5)
                ),
                name="Dynamic Histogram",
                anchor_id=f"{varid}dynamic_histogram",
            )
        )

    histogram_tabs = Container(
        seqs,
        sequence_type="tabs",
        name="Histogram(s)",
        anchor_id=f"{varid}histograms",
    )

    template_variables["bottom"] = Container(
        [statistics, histogram_tabs, fq, evs],
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
def render_count(config: Settings, summary: dict) -> dict:
    """Assemble the report components for a count variable.

    Anchor ids of the nested components are prefixed with the variable id so
    that several count variables in one report do not share element ids.

    Args:
        config: Report settings. ``plot.image_format``, ``report.precision``
            and ``n_extreme_obs`` are read here.
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``alerts``, ``description``,
            the count/percentage statistics and ``histogram``.

    Returns:
        The dict returned by ``render_common(config, summary)`` with ``"top"``
        and ``"bottom"`` entries added.
    """
    varid = summary["varid"]
    template_variables = render_common(config, summary)
    image_format = config.plot.image_format
    precision = config.report.precision

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["alerts"],
        summary["description"],
    )

    table1 = Table(
        [
            {
                "name": "Distinct",
                "value": fmt(summary["n_distinct"]),
                "alert": False,
            },
            {
                "name": "Distinct (%)",
                "value": fmt_percent(summary["p_distinct"]),
                "alert": False,
            },
            {
                "name": "Missing",
                "value": fmt(summary["n_missing"]),
                "alert": False,
            },
            {
                "name": "Missing (%)",
                "value": fmt_percent(summary["p_missing"]),
                "alert": False,
            },
        ]
    )

    table2 = Table(
        [
            {
                "name": "Mean",
                "value": fmt_numeric(summary["mean"], precision=precision),
                "alert": False,
            },
            {
                "name": "Minimum",
                "value": fmt_numeric(summary["min"], precision=precision),
                "alert": False,
            },
            {
                "name": "Maximum",
                "value": fmt_numeric(summary["max"], precision=precision),
                "alert": False,
            },
            {
                "name": "Zeros",
                "value": fmt(summary["n_zeros"]),
                "alert": False,
            },
            {
                "name": "Zeros (%)",
                "value": fmt_percent(summary["p_zeros"]),
                "alert": False,
            },
            {
                "name": "Memory size",
                "value": fmt_bytesize(summary["memory_size"]),
                "alert": False,
            },
        ]
    )

    mini_histo = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    seqs = [
        Image(
            histogram(config, *summary["histogram"]),
            image_format=image_format,
            alt="Histogram",
            # The caption reports one bin fewer than the length of
            # summary["histogram"][1].
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name=f"Minimum {config.n_extreme_obs} values",
                anchor_id=f"{varid}firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name=f"Maximum {config.n_extreme_obs} values",
                anchor_id=f"{varid}lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            Container(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        # The outer anchor is the variable id itself, as before.
        anchor_id=varid,
    )

    return template_variables
def render_categorical(summary):
    """Assemble the report components for a categorical variable.

    Args:
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``warnings``, ``warn_fields``,
            ``value_counts``, ``count`` and the count/percentage statistics.
            When the ``vars.cat.check_composition`` setting is enabled, the
            length statistics (``max_length``, ``mean_length``,
            ``min_length``, ``length``) and the character classifications
            (``category_alias_values``, ``script_values``,
            ``block_alias_values``) are read as well.

    Returns:
        The dict returned by ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` entries added.
    """
    varid = summary["varid"]
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top
    # Element composition
    info = VariableInfo(
        summary["varid"], summary["varname"], "Categorical", summary["warnings"]
    )

    table = Table(
        [
            {
                "name": "Distinct count",
                "value": summary["n_unique"],
                "fmt": "fmt",
                "alert": "n_unique" in summary["warn_fields"],
            },
            {
                "name": "Unique (%)",
                "value": summary["p_unique"],
                "fmt": "fmt_percent",
                "alert": "p_unique" in summary["warn_fields"],
            },
            {
                "name": "Missing",
                "value": summary["n_missing"],
                "fmt": "fmt",
                "alert": "n_missing" in summary["warn_fields"],
            },
            {
                "name": "Missing (%)",
                "value": summary["p_missing"],
                "fmt": "fmt_percent",
                "alert": "p_missing" in summary["warn_fields"],
            },
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
                "alert": False,
            },
        ]
    )

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
        )
    ]

    check_compositions = config["vars"]["cat"]["check_composition"].get(bool)
    if check_compositions:
        length_table = Table(
            [
                {
                    "name": "Max length",
                    "value": summary["max_length"],
                    "fmt": "fmt_numeric",
                    "alert": False,
                },
                {
                    "name": "Mean length",
                    "value": summary["mean_length"],
                    "fmt": "fmt_numeric",
                    "alert": False,
                },
                {
                    "name": "Min length",
                    "value": summary["min_length"],
                    "fmt": "fmt_numeric",
                    "alert": False,
                },
            ],
            name="Length",
            anchor_id=f"{varid}lengthstats",
        )

        histogram_bins = 10

        length = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            # The image is a histogram of lengths; the alt text previously
            # read "Scatter".
            alt="Length histogram",
            name="Length",
            anchor_id=f"{varid}length",
        )

        items.append(
            Sequence(
                [length, length_table],
                anchor_id=f"{varid}tbl",
                name="Length",
                sequence_type="grid",
            )
        )

        n_freq_table_max = config["n_freq_table_max"].get(int)

        # One frequency table per character classification:
        # (summary key, tab title, anchor suffix).
        character_specs = (
            ("category_alias_values", "Categories", "category_long_values"),
            ("script_values", "Scripts", "script_values"),
            ("block_alias_values", "Blocks", "block_alias_values"),
        )
        citems = []
        for summary_key, title, anchor_suffix in character_specs:
            vc = pd.Series(summary[summary_key]).value_counts()
            citems.append(
                FrequencyTable(
                    freq_table(
                        freqtable=vc,
                        n=vc.sum(),
                        max_number_to_print=n_freq_table_max,
                    ),
                    name=title,
                    anchor_id=f"{varid}{anchor_suffix}",
                )
            )

        items.append(
            Sequence(
                citems,
                name="Characters",
                sequence_type="tabs",
                anchor_id=f"{varid}characters",
            )
        )

    template_variables["bottom"] = Sequence(
        items, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
def render_categorical(summary):
    """Assemble the report components for a categorical variable.

    The ``vars.cat`` settings ``redact``, ``words``, ``characters`` and
    ``length`` select which optional sections are built.

    Args:
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``warnings``,
            ``description``, ``warn_fields``, ``value_counts``, ``count``,
            ``n_distinct`` and, depending on the settings, ``first_rows`` and
            ``word_counts``.

    Returns:
        The dict returned by ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` entries added.
    """

    def warned_row(label, field, formatter):
        # Row whose alert flag mirrors membership of ``field`` in warn_fields.
        return {
            "name": label,
            "value": summary[field],
            "fmt": formatter,
            "alert": field in summary["warn_fields"],
        }

    varid = summary["varid"]
    cat_config = config["vars"]["cat"]
    n_obs_cat = cat_config["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)
    redact = cat_config["redact"].get(bool)
    show_words = cat_config["words"].get(bool)
    show_characters = cat_config["characters"].get(bool)
    show_length = cat_config["length"].get(bool)

    template_variables = render_common(summary)

    # ---- Top: info card, summary table and a small frequency table ----
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    table = Table(
        [
            warned_row("Distinct", "n_distinct", "fmt"),
            warned_row("Distinct (%)", "p_distinct", "fmt_percent"),
            warned_row("Missing", "n_missing", "fmt"),
            warned_row("Missing (%)", "p_missing", "fmt_percent"),
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
                "alert": False,
            },
        ]
    )

    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=redact)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    # ---- Bottom ----
    frequency_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}common_values",
        redact=redact,
    )

    # The second returned element is not used in this function.
    unique_stats, _value_counts = render_categorical_frequency(
        summary, varid, image_format
    )

    overview_items = []

    if show_length:
        length_table, length_histo = render_categorical_length(
            summary, varid, image_format
        )
        overview_items.append(length_table)

    if show_characters:
        overview_table_char, unitab = render_categorical_unicode(
            summary, varid, redact
        )
        overview_items.append(overview_table_char)

    overview_items.append(unique_stats)

    # Sample rows are shown only when redaction is off.
    if not redact:
        row_labels = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
        sample_rows = []
        for label, value in zip(row_labels, summary["first_rows"]):
            sample_rows.append(
                {"name": label, "value": value, "fmt": "fmt", "alert": False}
            )
        overview_items.append(Table(sample_rows, name="Sample"))

    string_items = [frequency_table]
    if show_length:
        string_items.append(length_histo)

    max_unique = config["plot"]["pie"]["max_unique"].get(int)
    if max_unique > 0 and summary["n_distinct"] <= max_unique:
        pie = pie_plot(summary["value_counts"], legend_kws={"loc": "upper right"})
        string_items.append(
            Image(
                pie,
                image_format=image_format,
                alt="Pie chart",
                name="Pie chart",
                anchor_id=f"{varid}pie_chart",
            )
        )

    overview_tab = Container(
        overview_items,
        name="Overview",
        anchor_id=f"{varid}overview",
        sequence_type="batch_grid",
        batch_size=len(overview_items),
        titles=False,
    )
    categories_tab = Container(
        string_items,
        name="Categories",
        anchor_id=f"{varid}string",
        sequence_type="batch_grid",
        batch_size=len(string_items),
    )
    bottom_items = [overview_tab, categories_tab]

    if show_words:
        woc = freq_table(
            freqtable=summary["word_counts"],
            n=summary["word_counts"].sum(),
            max_number_to_print=10,
        )
        fqwo = FrequencyTable(
            woc,
            name="Common words",
            anchor_id=f"{varid}cwo",
            redact=redact,
        )
        bottom_items.append(
            Container(
                [fqwo],
                name="Words",
                anchor_id=f"{varid}word",
                sequence_type="grid",
            )
        )

    if show_characters:
        bottom_items.append(
            Container(
                [unitab],
                name="Characters",
                anchor_id=f"{varid}characters",
                sequence_type="grid",
            )
        )

    template_variables["bottom"] = Container(
        bottom_items, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
def render_count(summary):
    """Assemble the report components for a count variable.

    Anchor ids of the nested components are prefixed with the variable id so
    that several count variables in one report do not share element ids.

    Args:
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``warnings``, the
            count/percentage statistics, ``histogram_data``,
            ``histogram_bins`` and, optionally,
            ``histogram_bins_bayesian_blocks``.

    Returns:
        The dict returned by ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` entries added.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["warnings"],
    )

    table1 = Table(
        [
            {
                "name": "Distinct count",
                "value": summary["n_unique"],
                "fmt": "fmt",
                "alert": False,
            },
            {
                "name": "Unique (%)",
                "value": summary["p_unique"],
                "fmt": "fmt_percent",
                "alert": False,
            },
            {
                "name": "Missing",
                "value": summary["n_missing"],
                "fmt": "fmt",
                "alert": False,
            },
            {
                "name": "Missing (%)",
                "value": summary["p_missing"],
                "fmt": "fmt_percent",
                "alert": False,
            },
        ]
    )

    table2 = Table(
        [
            {"name": "Mean", "value": summary["mean"], "fmt": "fmt", "alert": False},
            {"name": "Minimum", "value": summary["min"], "fmt": "fmt", "alert": False},
            {"name": "Maximum", "value": summary["max"], "fmt": "fmt", "alert": False},
            {
                "name": "Zeros",
                "value": summary["n_zeros"],
                "fmt": "fmt",
                "alert": False,
            },
            {
                "name": "Zeros (%)",
                "value": summary["p_zeros"],
                "fmt": "fmt_percent",
                "alert": False,
            },
            {
                "name": "Memory size",
                "value": summary["memory_size"],
                "fmt": "fmt_bytesize",
                "alert": False,
            },
        ]
    )

    # TODO: replace with SmallImage...
    mini_histo = Image(
        mini_histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Sequence(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # TODO: Make sections data structure and render a "Statistics" section
    # (quantile and descriptive statistics). The dicts previously built for it
    # were never added to the output, so they are no longer constructed.

    # Bottom
    seqs = [
        Image(
            histogram(summary["histogram_data"], summary, summary["histogram_bins"]),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={summary['histogram_bins']})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
    )

    evs = Sequence(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
            ),
        ],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{varid}extreme_values",
    )

    # The variable-width histogram is added only when its bin edges exist.
    if "histogram_bins_bayesian_blocks" in summary:
        histo_dyn = Image(
            histogram(
                summary["histogram_data"],
                summary,
                summary["histogram_bins_bayesian_blocks"],
            ),
            image_format=image_format,
            alt="Histogram",
            caption='<strong>Histogram with variable size bins</strong> (bins={}, <a href="https://ui.adsabs.harvard.edu/abs/2013ApJ...764..167S/abstract" target="_blank">"bayesian blocks"</a> binning strategy used)'.format(
                fmt_array(summary["histogram_bins_bayesian_blocks"], threshold=5)
            ),
            name="Dynamic Histogram",
            anchor_id=f"{varid}dynamic_histogram",
        )
        seqs.append(histo_dyn)

    template_variables["bottom"] = Sequence(
        [
            Sequence(
                seqs,
                sequence_type="tabs",
                name="Histogram(s)",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        # The outer anchor is the variable id itself, as before.
        anchor_id=varid,
    )

    return template_variables
def get_dataset_overview(summary):
    """Build the "Overview" grid with dataset statistics and variable types.

    Args:
        summary: Mapping with a ``"table"`` entry holding the dataset-level
            statistics read below, including a ``"types"`` mapping of type
            name to count.

    Returns:
        A ``Container`` holding the "Dataset statistics" table followed by the
        "Variable types" table.
    """
    # (row label, key in summary["table"], formatter name)
    statistic_specs = (
        ("Number of variables", "n_var", "fmt_numeric"),
        ("Number of observations", "n", "fmt_numeric"),
        ("Missing cells", "n_cells_missing", "fmt_numeric"),
        ("Missing cells (%)", "p_cells_missing", "fmt_percent"),
        ("Duplicate rows", "n_duplicates", "fmt_numeric"),
        ("Duplicate rows (%)", "p_duplicates", "fmt_percent"),
        ("Total size in memory", "memory_size", "fmt_bytesize"),
        ("Average record size in memory", "record_size", "fmt_bytesize"),
    )

    statistic_rows = []
    for label, key, formatter in statistic_specs:
        statistic_rows.append(
            {"name": label, "value": summary["table"][key], "fmt": formatter}
        )
    dataset_info = Table(statistic_rows, name="Dataset statistics")

    type_rows = []
    for type_name, count in summary["table"]["types"].items():
        type_rows.append({"name": type_name, "value": count, "fmt": "fmt_numeric"})
    dataset_types = Table(type_rows, name="Variable types")

    return Container(
        [dataset_info, dataset_types],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
def render_boolean(summary):
    """Assemble the report components for a boolean variable.

    Args:
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``warnings``, ``warn_fields``,
            ``value_counts``, ``n`` and the count/percentage statistics.

    Returns:
        The dict returned by ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` entries added.
    """

    def warned_row(label, field, formatter):
        # Row whose alert flag mirrors membership of ``field`` in warn_fields.
        return {
            "name": label,
            "value": summary[field],
            "fmt": formatter,
            "alert": field in summary["warn_fields"],
        }

    varid = summary["varid"]
    n_obs_bool = config["vars"]["bool"]["n_obs"].get(int)

    # Prepare variables
    template_variables = render_common(summary)
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=n_obs_bool,
    )

    # Element composition
    info = VariableInfo(
        anchor_id=summary["varid"],
        warnings=summary["warnings"],
        var_type="Boolean",
        var_name=summary["varname"],
    )

    summary_rows = [
        warned_row("Distinct count", "n_unique", "fmt"),
        warned_row("Unique (%)", "p_unique", "fmt_percent"),
        warned_row("Missing", "n_missing", "fmt"),
        warned_row("Missing (%)", "p_missing", "fmt_percent"),
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        },
    ]
    table = Table(summary_rows)

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    full_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Frequency Table",
        anchor_id=f"{varid}frequency_table",
    )

    template_variables["bottom"] = Container(
        [full_table], sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
def render_date(summary):
    """Assemble the report components for a date variable.

    Unlike the sibling renderers visible here, this one starts from an empty
    dict rather than from ``render_common``.

    Args:
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``warnings``, the
            count/percentage statistics, ``min``, ``max``, ``histogram_data``
            and ``histogram_bins``.

    Returns:
        A dict with ``"top"`` and ``"bottom"`` entries.
    """

    def plain_row(label, field, formatter):
        # None of the rows for dates is ever highlighted.
        return {
            "name": label,
            "value": summary[field],
            "fmt": formatter,
            "alert": False,
        }

    # TODO: render common?
    template_variables = {}

    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(
        summary["varid"], summary["varname"], "Date", summary["warnings"]
    )

    table1 = Table(
        [
            plain_row("Distinct count", "n_unique", "fmt"),
            plain_row("Unique (%)", "p_unique", "fmt_percent"),
            plain_row("Missing", "n_missing", "fmt"),
            plain_row("Missing (%)", "p_missing", "fmt_percent"),
            plain_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )

    table2 = Table(
        [
            plain_row("Minimum", "min", "fmt"),
            plain_row("Maximum", "max", "fmt"),
        ]
    )

    mini_plot = mini_histogram(
        summary["histogram_data"], summary, summary["histogram_bins"]
    )
    mini_histo = Image(mini_plot, image_format=image_format, alt="Mini histogram")

    template_variables["top"] = Sequence(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # Bottom
    full_plot = histogram(
        summary["histogram_data"], summary, summary["histogram_bins"]
    )
    histogram_tab = Image(
        full_plot,
        image_format=image_format,
        alt="Histogram",
        caption="Histogram",
        name="Histogram",
        anchor_id=f"{summary['varid']}histogram",
    )

    template_variables["bottom"] = Sequence(
        [histogram_tab],
        sequence_type="tabs",
        anchor_id=summary["varid"],
    )

    return template_variables
def render_boolean(config: Settings, summary: dict) -> dict:
    """Assemble the report components for a boolean variable.

    Args:
        config: Report settings. ``vars.bool.n_obs``, ``plot.image_format``
            and ``plot.cat_freq`` (``show``, ``max_unique``) are read here.
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``alerts``, ``description``,
            ``alert_fields``, ``value_counts_without_nan``, ``n`` and the
            count/percentage statistics.

    Returns:
        The dict returned by ``render_common(config, summary)`` with ``"top"``
        and ``"bottom"`` entries added.
    """

    def alerted_row(label, field, formatter):
        # Row whose alert flag mirrors membership of ``field`` in alert_fields.
        return {
            "name": label,
            "value": formatter(summary[field]),
            "alert": field in summary["alert_fields"],
        }

    varid = summary["varid"]
    n_obs_bool = config.vars.bool.n_obs
    image_format = config.plot.image_format

    # Prepare variables
    template_variables = render_common(config, summary)

    # Element composition
    info = VariableInfo(
        anchor_id=summary["varid"],
        alerts=summary["alerts"],
        var_type="Boolean",
        var_name=summary["varname"],
        description=summary["description"],
    )

    summary_rows = [
        alerted_row("Distinct", "n_distinct", fmt),
        alerted_row("Distinct (%)", "p_distinct", fmt_percent),
        alerted_row("Missing", "n_missing", fmt),
        alerted_row("Missing (%)", "p_missing", fmt_percent),
        {
            "name": "Memory size",
            "value": fmt_bytesize(summary["memory_size"]),
            "alert": False,
        },
    ]
    table = Table(summary_rows)

    mini_rows = freq_table(
        freqtable=summary["value_counts_without_nan"],
        n=summary["n"],
        max_number_to_print=n_obs_bool,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=False)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    items: List[Renderable] = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}frequency_table",
            redact=False,
        )
    ]

    show = config.plot.cat_freq.show
    max_unique = config.plot.cat_freq.max_unique

    # The frequency plot tab is added only when enabled with a positive limit.
    if show and max_unique > 0:
        frequency_plot = cat_frequency_plot(
            config,
            summary["value_counts_without_nan"],
        )
        items.append(
            Image(
                frequency_plot,
                image_format=image_format,
                alt="Category Frequency Plot",
                name="Category Frequency Plot",
                anchor_id=f"{varid}cat_frequency_plot",
            )
        )

    template_variables["bottom"] = Container(
        items, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
def render_categorical(config: Settings, summary: dict) -> dict:
    """Assemble the report components for a categorical variable.

    The ``config.vars.cat`` flags ``redact``, ``words``, ``characters`` and
    ``length`` select which optional sections are built.

    Args:
        config: Report settings. ``vars.cat``, ``plot.image_format`` and
            ``plot.cat_freq`` (``show``, ``max_unique``) are read here.
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``alerts``, ``description``,
            ``alert_fields``, ``value_counts_without_nan``, ``count``,
            ``n_distinct`` and, depending on the settings, ``first_rows`` and
            ``word_counts``.

    Returns:
        The dict returned by ``render_common(config, summary)`` with ``"top"``
        and ``"bottom"`` entries added.
    """

    def alerted_row(label, field, formatter):
        # Row whose alert flag mirrors membership of ``field`` in alert_fields.
        return {
            "name": label,
            "value": formatter(summary[field]),
            "alert": field in summary["alert_fields"],
        }

    varid = summary["varid"]
    n_obs_cat = config.vars.cat.n_obs
    image_format = config.plot.image_format
    show_words = config.vars.cat.words
    show_characters = config.vars.cat.characters
    show_length = config.vars.cat.length

    template_variables = render_common(config, summary)

    # ---- Top: info card, summary table and a small frequency table ----
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Categorical",
        summary["alerts"],
        summary["description"],
    )

    table = Table(
        [
            alerted_row("Distinct", "n_distinct", fmt),
            alerted_row("Distinct (%)", "p_distinct", fmt_percent),
            alerted_row("Missing", "n_missing", fmt),
            alerted_row("Missing (%)", "p_missing", fmt_percent),
            {
                "name": "Memory size",
                "value": fmt_bytesize(summary["memory_size"]),
                "alert": False,
            },
        ]
    )

    mini_rows = freq_table(
        freqtable=summary["value_counts_without_nan"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=config.vars.cat.redact)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    # ---- Bottom ----
    frequency_table = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common Values",
        anchor_id=f"{varid}common_values",
        redact=config.vars.cat.redact,
    )

    unique_stats = render_categorical_frequency(config, summary, varid)

    overview_items = []

    if show_length:
        length_table, length_histo = render_categorical_length(config, summary, varid)
        overview_items.append(length_table)

    if show_characters:
        overview_table_char, unitab = render_categorical_unicode(
            config, summary, varid
        )
        overview_items.append(overview_table_char)

    overview_items.append(unique_stats)

    # Sample rows are shown only when redaction is off.
    if not config.vars.cat.redact:
        row_labels = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
        sample_rows = []
        for label, value in zip(row_labels, summary["first_rows"]):
            sample_rows.append({"name": label, "value": fmt(value), "alert": False})
        overview_items.append(Table(sample_rows, name="Sample"))

    string_items: List[Renderable] = [frequency_table]
    if show_length:
        string_items.append(length_histo)

    show = config.plot.cat_freq.show
    max_unique = config.plot.cat_freq.max_unique

    # The plot is added only when enabled and the variable has at most
    # ``max_unique`` distinct values.
    if show and max_unique > 0 and summary["n_distinct"] <= max_unique:
        frequency_plot = cat_frequency_plot(
            config,
            summary["value_counts_without_nan"],
        )
        string_items.append(
            Image(
                frequency_plot,
                image_format=image_format,
                alt="Category Frequency Plot",
                name="Category Frequency Plot",
                anchor_id=f"{varid}cat_frequency_plot",
            )
        )

    overview_tab = Container(
        overview_items,
        name="Overview",
        anchor_id=f"{varid}overview",
        sequence_type="batch_grid",
        batch_size=len(overview_items),
        titles=False,
    )
    categories_tab = Container(
        string_items,
        name="Categories",
        anchor_id=f"{varid}string",
        sequence_type="batch_grid",
        batch_size=len(string_items),
    )
    bottom_items = [overview_tab, categories_tab]

    if show_words:
        woc = freq_table(
            freqtable=summary["word_counts"],
            n=summary["word_counts"].sum(),
            max_number_to_print=10,
        )
        fqwo = FrequencyTable(
            woc,
            name="Common words",
            anchor_id=f"{varid}cwo",
            redact=config.vars.cat.redact,
        )
        bottom_items.append(
            Container(
                [fqwo],
                name="Words",
                anchor_id=f"{varid}word",
                sequence_type="grid",
            )
        )

    if show_characters:
        bottom_items.append(
            Container(
                [unitab],
                name="Characters",
                anchor_id=f"{varid}characters",
                sequence_type="grid",
            )
        )

    template_variables["bottom"] = Container(
        bottom_items, sequence_type="tabs", anchor_id=f"{varid}bottom"
    )

    return template_variables
def render_complex(config: Settings, summary: dict) -> dict:
    """Assemble the report components for a complex-valued variable.

    This renderer starts from an empty dict rather than from
    ``render_common``.

    Args:
        config: Report settings. ``plot.image_format`` and
            ``report.precision`` are read here.
        summary: Mapping of precomputed statistics for the variable. Keys read
            here include ``varid``, ``varname``, ``alerts``, ``description``,
            the count/percentage statistics and ``scatter_data``.

    Returns:
        A dict with ``"top"`` and ``"bottom"`` entries.
    """

    def row(label, value):
        # Rows here carry only a name and an already formatted value.
        return {"name": label, "value": value}

    def numeric(field):
        # Format a summary value with the precision configured for the report.
        return fmt_numeric(summary[field], precision=config.report.precision)

    varid = summary["varid"]
    template_variables = {}
    image_format = config.plot.image_format

    # Top
    info = VariableInfo(
        summary["varid"],
        summary["varname"],
        "Complex number (ℂ)",
        summary["alerts"],
        summary["description"],
    )

    table1 = Table(
        [
            row("Distinct", fmt(summary["n_distinct"])),
            row("Distinct (%)", fmt_percent(summary["p_distinct"])),
            row("Missing", fmt(summary["n_missing"])),
            row("Missing (%)", fmt_percent(summary["p_missing"])),
            row("Memory size", fmt_bytesize(summary["memory_size"])),
        ]
    )

    table2 = Table(
        [
            row("Mean", numeric("mean")),
            row("Minimum", numeric("min")),
            row("Maximum", numeric("max")),
            row("Zeros", numeric("n_zeros")),
            row("Zeros (%)", fmt_percent(summary["p_zeros"])),
        ]
    )

    # An empty HTML element takes the fourth slot of the grid.
    placeholder = HTML("")

    template_variables["top"] = Container(
        [info, table1, table2, placeholder], sequence_type="grid"
    )

    # Bottom
    scatter_plot = scatter_complex(config, summary["scatter_data"])
    scatter_tab = Image(
        scatter_plot,
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )

    template_variables["bottom"] = Container(
        [scatter_tab], sequence_type="tabs", anchor_id=summary["varid"]
    )

    return template_variables
def render_path(summary):
    """Extend the categorical rendering of a variable with path details.

    Starts from ``render_categorical(summary)``, stores one frequency table
    per path component in the template variables, relabels the variable type
    shown in the top section as ``"Path"``, and appends a "Path" tab to the
    bottom section.

    Args:
        summary: Variable summary. Must provide ``varid``, ``n``,
            ``common_prefix``, and for each of name/parent/suffix/stem/anchor
            a ``<part>_counts`` and ``n_<part>_unique`` entry.

    Returns:
        The template variables produced by ``render_categorical``, mutated
        in place as described above.
    """
    varid = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)

    template_variables = render_categorical(summary)

    for part in ("name", "parent", "suffix", "stem", "anchor"):
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    # Top: the first item of the top container carries the displayed type.
    template_variables["top"].content["items"][0].content["var_type"] = "Path"

    # Bottom
    overview_specs = (
        ("Common prefix", "common_prefix", "fmt"),
        ("Unique stems", "n_stem_unique", "fmt_numeric"),
        ("Unique names", "n_name_unique", "fmt_numeric"),
        ("Unique extensions", "n_suffix_unique", "fmt_numeric"),
        ("Unique directories", "n_parent_unique", "fmt_numeric"),
        ("Unique anchors", "n_anchor_unique", "fmt_numeric"),
    )
    overview_table = Table(
        [
            {"name": label, "value": summary[key], "fmt": formatter, "alert": False}
            for label, key, formatter in overview_specs
        ]
    )
    path_items = [
        Container(
            [overview_table],
            anchor_id=f"{varid}tbl",
            name="Overview",
            sequence_type="list",
        )
    ]

    # (template variable holding the rows, tab title, anchor slug)
    frequency_specs = (
        ("freq_table_rows", "Full", "full"),
        ("freqtable_stem", "Stem", "stem"),
        ("freqtable_name", "Name", "name"),
        ("freqtable_suffix", "Extension", "suffix"),
        ("freqtable_parent", "Parent", "parent"),
        ("freqtable_anchor", "Anchor", "anchor"),
    )
    path_items.extend(
        FrequencyTable(
            template_variables[rows_key],
            name=title,
            anchor_id=f"{varid}{slug}_frequency",
        )
        for rows_key, title, slug in frequency_specs
    )

    template_variables["bottom"].content["items"].append(
        Container(
            path_items,
            name="Path",
            sequence_type="tabs",
            anchor_id=f"{varid}path",
        )
    )

    return template_variables
def render_categorical(summary):
    """Build the report components for a categorical variable.

    The top section is a grid of the variable overview, a statistics table
    and a small frequency table. The bottom section is a set of tabs: the
    common-values table and, when ``vars.cat.check_composition`` is enabled,
    a composition/length grid plus a histogram of ``summary["length"]``.

    Changes from the previous revision: the length histogram's alt text was
    ``"Scatter"`` (a copy-paste error) and is now ``"Length histogram"``; the
    length table and the length image no longer share one local name.

    Args:
        summary: Variable summary. The keys indexed in the body must be
            present; ``composition``, ``*_length`` and ``length`` are only
            read when composition checking is enabled.

    Returns:
        The dict returned by ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` set.
    """
    varid = summary["varid"]
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top
    info = Overview(varid, summary["varname"], "Categorical", summary["warnings"])

    warn_fields = summary["warn_fields"]
    alerting_specs = (
        ("Distinct count", "n_unique", "fmt"),
        ("Unique (%)", "p_unique", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )
    table_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "class": "alert" if key in warn_fields else "",
        }
        for label, key, formatter in alerting_specs
    ]
    # The memory-size row intentionally carries no "class" entry.
    table_rows.append(
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
        }
    )
    table = Table(table_rows)

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
        )
    ]

    check_compositions = config["vars"]["cat"]["check_composition"].get(bool)
    if check_compositions:
        composition_counts = summary["composition"]
        composition = Table(
            [
                {"name": "Contains chars", "value": composition_counts["chars"], "fmt": "fmt"},
                {"name": "Contains digits", "value": composition_counts["digits"], "fmt": "fmt"},
                {"name": "Contains whitespace", "value": composition_counts["spaces"], "fmt": "fmt"},
                {"name": "Contains non-words", "value": composition_counts["non-words"], "fmt": "fmt"},
            ],
            name="Composition",
            anchor_id=f"{varid}composition",
        )

        length_table = Table(
            [
                {"name": "Max length", "value": summary["max_length"], "fmt": "fmt_numeric"},
                {"name": "Mean length", "value": summary["mean_length"], "fmt": "fmt_numeric"},
                {"name": "Min length", "value": summary["min_length"], "fmt": "fmt_numeric"},
            ],
            name="Length",
            anchor_id=f"{varid}lengthstats",
        )

        items.append(
            Sequence(
                [composition, length_table],
                anchor_id=f"{varid}tbl",
                name="Composition",
                sequence_type="grid",
            )
        )

        histogram_bins = 10
        length_histogram = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            # Alt text describes the actual plot (was "Scatter").
            alt="Length histogram",
            name="Length",
            anchor_id=f"{varid}length",
        )
        items.append(length_histogram)

    template_variables["bottom"] = Sequence(
        items,
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
def get_dataset_overview(config: Settings, summary: dict) -> Renderable:
    """Build the dataset-level "Overview" section.

    Args:
        config: Report settings; ``report.precision`` is used when formatting
            the per-type variable counts.
        summary: Dataset summary; everything is read from ``summary["table"]``.

    Returns:
        A grid container with a "Dataset statistics" table and a
        "Variable types" table. The duplicate-row entries are included only
        when ``"n_duplicates"`` is present in ``summary["table"]``.
    """
    stats = summary["table"]

    metric_rows = [
        {"name": "Number of variables", "value": fmt_number(stats["n_var"])},
        {"name": "Number of observations", "value": fmt_number(stats["n"])},
        {"name": "Missing cells", "value": fmt_number(stats["n_cells_missing"])},
        {"name": "Missing cells (%)", "value": fmt_percent(stats["p_cells_missing"])},
    ]

    if "n_duplicates" in stats:
        metric_rows += [
            {"name": "Duplicate rows", "value": fmt_number(stats["n_duplicates"])},
            {"name": "Duplicate rows (%)", "value": fmt_percent(stats["p_duplicates"])},
        ]

    metric_rows += [
        {"name": "Total size in memory", "value": fmt_bytesize(stats["memory_size"])},
        {
            "name": "Average record size in memory",
            "value": fmt_bytesize(stats["record_size"]),
        },
    ]

    type_rows = []
    for type_name, count in stats["types"].items():
        type_rows.append(
            {
                "name": str(type_name),
                "value": fmt_numeric(count, precision=config.report.precision),
            }
        )

    statistics_table = Table(metric_rows, name="Dataset statistics")
    types_table = Table(type_rows, name="Variable types")

    return Container(
        [statistics_table, types_table],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
def render_categorical(summary):
    """Build the report components for a categorical variable.

    The top section is a grid of variable info, a statistics table and a
    small frequency table. The bottom section is a set of tabs: a
    "Frequencies" tab (common values, the categorical frequency rendering
    and, when the distinct count is within ``plot.pie.max_unique``, a pie
    chart), plus optional length and unicode tabs controlled by
    ``vars.cat.length`` and ``vars.cat.unicode``.

    Args:
        summary: Variable summary; the keys indexed in the body must exist.

    Returns:
        The dict returned by ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` set.
    """
    varid = summary["varid"]
    cat_config = config["vars"]["cat"]
    n_obs_cat = cat_config["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)
    redact = cat_config["redact"].get(bool)

    template_variables = render_common(summary)

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    alerting_specs = (
        ("Distinct", "n_distinct", "fmt"),
        ("Distinct (%)", "p_distinct", "fmt_percent"),
        ("Missing", "n_missing", "fmt"),
        ("Missing (%)", "p_missing", "fmt_percent"),
    )
    stat_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in alerting_specs
    ]
    stat_rows.append(
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        }
    )
    table = Table(stat_rows)

    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=redact)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    # Bottom: "Frequencies" tab
    frequency_tabs = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
            redact=redact,
        ),
        render_categorical_frequency(summary, varid, image_format),
    ]

    max_unique = config["plot"]["pie"]["max_unique"].get(int)
    if max_unique > 0 and summary["n_distinct"] <= max_unique:
        pie = Image(
            pie_plot(summary["value_counts"], legend_kws={"loc": "upper right"}),
            image_format=image_format,
            alt="Chart",
            name="Chart",
            anchor_id=f"{varid}pie_chart",
        )
        frequency_tabs.append(pie)

    bottom_tabs = [
        Container(
            frequency_tabs,
            name="Frequencies",
            anchor_id=f"{varid}frequencies",
            sequence_type="tabs",
        )
    ]

    # Optional tabs
    if cat_config["length"].get(bool):
        bottom_tabs.append(render_categorical_length(summary, varid, image_format))

    if cat_config["unicode"].get(bool):
        bottom_tabs.append(render_categorical_unicode(summary, varid, redact))

    template_variables["bottom"] = Container(
        bottom_tabs,
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
def render_categorical_unicode(summary, varid, redact):
    """Build the "Unicode" tab for a categorical variable.

    The tab holds five sub-tabs: an overview table, the character frequency
    table, and one list each for unicode categories, scripts and blocks.
    Each of the last three starts with an overall frequency table followed by
    one per-group character frequency table.

    Args:
        summary: Variable summary providing the ``*_counts`` /
            ``*_char_counts`` entries and the ``n_*`` totals indexed below.
        varid: Prefix used for every anchor id.
        redact: Passed as ``redact`` to the character-level frequency tables;
            the three overall tables always use ``redact=False``.

    Returns:
        A tabs ``Container`` named "Unicode".
    """
    max_rows = config["n_freq_table_max"].get(int)

    def counts_table(counts, title, anchor_suffix, hide):
        # One frequency table whose total is the sum of the counts themselves.
        return FrequencyTable(
            freq_table(
                freqtable=counts,
                n=counts.sum(),
                max_number_to_print=max_rows,
            ),
            name=title,
            anchor_id=f"{varid}{anchor_suffix}",
            redact=hide,
        )

    # Categories: underscores in the alias are shown (and anchored) as spaces.
    category_items = [
        counts_table(
            summary["category_alias_counts"],
            "Most occurring categories",
            "category_long_values",
            False,
        )
    ]
    for raw_alias, alias_counts in summary["category_alias_char_counts"].items():
        alias = raw_alias.replace("_", " ")
        category_items.append(
            counts_table(
                alias_counts,
                f"Most frequent {alias} characters",
                f"category_alias_values_{alias}",
                redact,
            )
        )

    # Scripts
    script_items = [
        counts_table(
            summary["script_counts"],
            "Most occurring scripts",
            "script_values",
            False,
        )
    ]
    for script, script_counts in summary["script_char_counts"].items():
        script_items.append(
            counts_table(
                script_counts,
                f"Most frequent {script} characters",
                f"script_values_{script}",
                redact,
            )
        )

    # Blocks
    block_items = [
        counts_table(
            summary["block_alias_counts"],
            "Most occurring blocks",
            "block_alias_values",
            False,
        )
    ]
    for block, block_counts in summary["block_alias_char_counts"].items():
        block_items.append(
            counts_table(
                block_counts,
                f"Most frequent {block} characters",
                f"block_alias_values_{block}",
                redact,
            )
        )

    overview_specs = (
        ("Unique unicode characters", "n_characters"),
        (
            'Unique unicode categories (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_character_property#General_Category">?</a>)',
            "n_category",
        ),
        (
            'Unique unicode scripts (<a target="_blank" href="https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode">?</a>)',
            "n_scripts",
        ),
        (
            'Unique unicode blocks (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_block">?</a>)',
            "n_block_alias",
        ),
    )
    overview_table = Table(
        [
            {"name": label, "value": summary[key], "fmt": "fmt_numeric", "alert": False}
            for label, key in overview_specs
        ],
        name="Overview of Unicode Properties",
        caption="The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
    )

    tabs = [
        Container(
            [overview_table],
            anchor_id=f"{varid}character_overview",
            name="Overview",
            sequence_type="list",
        ),
        Container(
            [
                counts_table(
                    summary["character_counts"],
                    "Most occurring characters",
                    "character_frequency",
                    redact,
                )
            ],
            name="Characters",
            anchor_id=f"{varid}characters",
            sequence_type="named_list",
        ),
    ]
    for group_items, title, slug in (
        (category_items, "Categories", "categories"),
        (script_items, "Scripts", "scripts"),
        (block_items, "Blocks", "blocks"),
    ):
        tabs.append(
            Container(
                group_items,
                name=title,
                anchor_id=f"{varid}{slug}",
                sequence_type="named_list",
            )
        )

    return Container(
        tabs,
        name="Unicode",
        sequence_type="tabs",
        anchor_id=f"{varid}unicode",
    )
def render_image(summary):
    """Extend the file rendering of a variable with image details.

    Starts from ``render_file(summary)``, relabels the variable type in the
    top section as ``"Image"`` and appends an "Image" tab to the bottom
    section. That tab contains an "Exif data" list (only when
    ``"exif_keys_counts"`` is in the summary) followed by a "Dimensions" list.

    Args:
        summary: Variable summary. Must provide ``varid``, ``n``,
            ``image_dimensions`` and the min/median/max width, height and
            area entries; ``exif_keys_counts`` / ``exif_data`` are optional.

    Returns:
        The template variables produced by ``render_file``, mutated in place.
    """
    varid = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_file(summary)

    # Top
    template_variables["top"].content["items"][0].content["var_type"] = "Image"

    # Bottom
    # Overview layout: one table per dimension (width, height, area), each
    # listing min / median / max. All dimension properties are in pixels.
    dimension_tables = [
        Table(
            [
                {
                    "name": f"{stat} {dimension}",
                    "value": summary[f"{stat.lower()}_{dimension}"],
                    "fmt": "fmt_numeric",
                    "alert": False,
                }
                for stat in ("Min", "Median", "Max")
            ]
        )
        for dimension in ("width", "height", "area")
    ]

    dimensions_overview = Container(
        dimension_tables,
        anchor_id=f"{varid}tbl",
        name="Overview",
        sequence_type="grid",
    )
    dimensions_frequency = FrequencyTable(
        freq_table(
            freqtable=summary["image_dimensions"].value_counts(),
            n=summary["n"],
            max_number_to_print=max_rows,
        ),
        name="Common values",
        anchor_id=f"{varid}image_dimensions_frequency",
    )
    dimensions_scatter = Image(
        scatter_series(summary["image_dimensions"]),
        image_format=image_format,
        alt="Scatter plot of image sizes",
        caption="Scatter plot of image sizes",
        name="Scatter plot",
        anchor_id=f"{varid}image_dimensions_scatter",
    )
    image_shape = Container(
        [dimensions_overview, dimensions_frequency, dimensions_scatter],
        sequence_type="named_list",
        name="Dimensions",
        anchor_id=f"{varid}image_dimensions",
    )

    image_items = []

    if "exif_keys_counts" in summary:
        exif_tables = [
            FrequencyTable(
                freq_table(
                    freqtable=pd.Series(summary["exif_keys_counts"]),
                    n=summary["n"],
                    max_number_to_print=max_rows,
                ),
                name="Exif keys",
                anchor_id=f"{varid}exif_keys",
            )
        ]
        # The "exif_keys" entry is already covered by the table above.
        exif_tables.extend(
            FrequencyTable(
                freq_table(
                    freqtable=counts,
                    n=summary["n"],
                    max_number_to_print=max_rows,
                ),
                name=key,
                anchor_id=f"{varid}_exif_{key}",
            )
            for key, counts in summary["exif_data"].items()
            if key != "exif_keys"
        )
        image_items.append(
            Container(
                exif_tables,
                anchor_id=f"{varid}exif_data",
                name="Exif data",
                sequence_type="named_list",
            )
        )

    image_items.append(image_shape)

    template_variables["bottom"].content["items"].append(
        Container(
            image_items,
            name="Image",
            sequence_type="tabs",
            anchor_id=f"{varid}image",
        )
    )

    return template_variables
def render_count(summary):
    """Build the report components for a count variable.

    The top section is a grid of variable info, two statistics tables and a
    mini histogram. The bottom section is a set of tabs: histogram, common
    values and extreme values.

    Change from the previous revision: every anchor id inside the bottom
    section is now prefixed with the variable id. They used to be fixed
    strings (``"histogram"``, ``"common_values"``, ...), so two count
    variables in one report produced duplicate element ids; ``varid`` was
    computed but never used.

    Args:
        summary: Variable summary; the keys indexed in the body must exist.
            ``summary["histogram"]`` is unpacked positionally into the plot
            helpers and its second element is used to derive the bin count.

    Returns:
        The dict returned by ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` set.
    """
    varid = summary["varid"]
    template_variables = render_common(summary)
    image_format = config["plot"]["image_format"].get(str)

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Real number (ℝ / ℝ<sub>≥0</sub>)",
        summary["warnings"],
        summary["description"],
    )

    table1 = Table(
        [
            {"name": "唯一值计数", "value": summary["n_unique"], "fmt": "fmt", "alert": False},
            {"name": "唯一值 (%)", "value": summary["p_unique"], "fmt": "fmt_percent", "alert": False},
            {"name": "缺失值", "value": summary["n_missing"], "fmt": "fmt", "alert": False},
            {"name": "缺失值比例 (%)", "value": summary["p_missing"], "fmt": "fmt_percent", "alert": False},
        ]
    )

    table2 = Table(
        [
            {"name": "均数", "value": summary["mean"], "fmt": "fmt_numeric", "alert": False},
            {"name": "最小值", "value": summary["min"], "fmt": "fmt_numeric", "alert": False},
            {"name": "最大值", "value": summary["max"], "fmt": "fmt_numeric", "alert": False},
            {"name": "零值", "value": summary["n_zeros"], "fmt": "fmt", "alert": False},
            {"name": "零值 (%)", "value": summary["p_zeros"], "fmt": "fmt_percent", "alert": False},
            {"name": "内存占用", "value": summary["memory_size"], "fmt": "fmt_bytesize", "alert": False},
        ]
    )

    mini_histo = Image(
        mini_histogram(*summary["histogram"]),
        image_format=image_format,
        alt="Mini histogram",
    )

    template_variables["top"] = Container(
        [info, table1, table2, mini_histo], sequence_type="grid"
    )

    # NOTE: the two dicts below are not rendered yet (see the TODO). They are
    # kept so the summary keys they read stay exercised until the
    # "statistics" section is wired in.
    quantile_statistics = {
        "name": "定性分析",
        "items": [
            {"name": "最小值", "value": summary["min"], "fmt": "fmt_numeric", "alert": False},
            {"name": "5-th 百分位", "value": summary["quantile_5"], "fmt": "fmt_numeric", "alert": False},
            {"name": "Q1", "value": summary["quantile_25"], "fmt": "fmt_numeric", "alert": False},
            {"name": "中位数", "value": summary["quantile_50"], "fmt": "fmt_numeric", "alert": False},
            {"name": "Q3", "value": summary["quantile_75"], "fmt": "fmt_numeric", "alert": False},
            {"name": "95-th 百分位", "value": summary["quantile_95"], "fmt": "fmt_numeric", "alert": False},
            {"name": "最大值", "value": summary["max"], "fmt": "fmt_numeric", "alert": False},
            {"name": "区间", "value": summary["range"], "fmt": "fmt_numeric", "alert": False},
            {"name": "四分位距", "value": summary["iqr"], "fmt": "fmt_numeric", "alert": False},
        ],
    }

    descriptive_statistics = {
        "name": "描述性统计",
        "items": [
            {"name": "标准差", "value": summary["std"], "fmt": "fmt_numeric"},
            {"name": "变异系数", "value": summary["cv"], "fmt": "fmt_numeric"},
            {"name": "峰度", "value": summary["kurt"], "fmt": "fmt_numeric"},
            {"name": "均数", "value": summary["mean"], "fmt": "fmt_numeric"},
            {"name": "MAD", "value": summary["mad"], "fmt": "fmt_numeric"},
            {"name": "偏度", "value": summary["skew"], "fmt": "fmt_numeric"},
            {"name": "积", "value": summary["sum"], "fmt": "fmt_numeric"},
            {"name": "方差", "value": summary["var"], "fmt": "fmt_numeric"},
        ],
    }

    # TODO: Make sections data structure and render a "Statistics" section
    # from quantile_statistics and descriptive_statistics.

    # Bottom
    n_bins = len(summary["histogram"][1]) - 1
    seqs = [
        Image(
            histogram(*summary["histogram"]),
            image_format=image_format,
            alt="Histogram",
            caption=f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})",
            name="Histogram",
            anchor_id=f"{varid}histogram",
        )
    ]

    fq = FrequencyTable(
        template_variables["freq_table_rows"],
        name="Common values",
        anchor_id=f"{varid}common_values",
        redact=False,
    )

    evs = Container(
        [
            FrequencyTable(
                template_variables["firstn_expanded"],
                name="Minimum 5 values",
                anchor_id=f"{varid}firstn",
                redact=False,
            ),
            FrequencyTable(
                template_variables["lastn_expanded"],
                name="Maximum 5 values",
                anchor_id=f"{varid}lastn",
                redact=False,
            ),
        ],
        sequence_type="tabs",
        name="极值",
        anchor_id=f"{varid}extreme_values",
    )

    template_variables["bottom"] = Container(
        [
            Container(
                seqs,
                sequence_type="tabs",
                name="直方图",
                anchor_id=f"{varid}histograms",
            ),
            fq,
            evs,
        ],
        sequence_type="tabs",
        anchor_id=varid,
    )

    return template_variables
def render_complex(summary):
    """Assemble the report components for a complex-valued variable.

    Args:
        summary: Variable summary; the keys indexed in the body (``varid``,
            ``varname``, ``warnings``, the count and value statistics, and
            ``scatter_data``) must be present.

    Returns:
        A dict with a ``"top"`` grid container (variable info, two tables and
        an empty HTML placeholder) and a ``"bottom"`` tabs container that
        holds the complex-plane scatter plot.
    """
    varid = summary["varid"]
    image_format = config["plot"]["image_format"].get(str)

    def rows(specs):
        # Each spec is (label, summary key, formatter name).
        return [
            {"name": label, "value": summary[key], "fmt": formatter}
            for label, key, formatter in specs
        ]

    # Top
    header = VariableInfo(
        varid,
        summary["varname"],
        "Complex number (ℂ)",
        summary["warnings"],
    )
    counts_table = Table(
        rows(
            (
                ("Distinct count", "n_unique", "fmt"),
                ("Unique (%)", "p_unique", "fmt_percent"),
                ("Missing", "n_missing", "fmt"),
                ("Missing (%)", "p_missing", "fmt_percent"),
                ("Memory size", "memory_size", "fmt_bytesize"),
            )
        )
    )
    values_table = Table(
        rows(
            (
                ("Mean", "mean", "fmt"),
                ("Minimum", "min", "fmt"),
                ("Maximum", "max", "fmt"),
                ("Zeros", "n_zeros", "fmt"),
                ("Zeros (%)", "p_zeros", "fmt_percent"),
            )
        )
    )

    # The empty HTML element fills the fourth cell of the grid.
    top = Container(
        [header, counts_table, values_table, HTML("")],
        sequence_type="grid",
    )

    # Bottom
    scatter = Image(
        scatter_complex(summary["scatter_data"]),
        image_format=image_format,
        alt="Scatterplot",
        caption="Scatterplot in the complex plane",
        name="Scatter",
        anchor_id=f"{varid}scatter",
    )
    bottom = Container([scatter], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
def render_categorical(summary):
    """Build the report components for a categorical variable.

    The top section is a grid of variable info, a statistics table and a
    small frequency table. The bottom section is a set of tabs: common
    values, an optional "Length" tab (``vars.cat.length``) and an optional
    "Unicode" tab (``vars.cat.unicode``).

    Changes from the previous revision: the length histogram's alt text was
    ``"Scatter"`` (a copy-paste error) and is now ``"Length histogram"``; the
    repeated frequency-table construction in the unicode section goes through
    one local helper.

    Args:
        summary: Variable summary. The keys indexed in the body must be
            present; length and unicode entries are only read when the
            corresponding check is enabled.

    Returns:
        The dict returned by ``render_common(summary)`` with ``"top"`` and
        ``"bottom"`` set.
    """
    varid = summary["varid"]
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    image_format = config["plot"]["image_format"].get(str)

    template_variables = render_common(summary)

    # TODO: merge with boolean
    mini_freq_table_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["count"],
        max_number_to_print=n_obs_cat,
    )

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "Categorical",
        summary["warnings"],
        summary["description"],
    )

    warn_fields = summary["warn_fields"]
    stat_rows = [
        {"name": label, "value": summary[key], "fmt": formatter, "alert": key in warn_fields}
        for label, key, formatter in (
            ("Distinct count", "n_unique", "fmt"),
            ("Unique (%)", "p_unique", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ]
    stat_rows.append(
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        }
    )
    table = Table(stat_rows)

    fqm = FrequencyTableSmall(mini_freq_table_rows)

    # TODO: settings 3,3,6
    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    # Bottom
    items = [
        FrequencyTable(
            template_variables["freq_table_rows"],
            name="Common Values",
            anchor_id=f"{varid}common_values",
        )
    ]

    check_length = config["vars"]["cat"]["length"].get(bool)
    if check_length:
        length_table = Table(
            [
                {"name": label, "value": summary[key], "fmt": "fmt_numeric", "alert": False}
                for label, key in (
                    ("Max length", "max_length"),
                    ("Median length", "median_length"),
                    ("Mean length", "mean_length"),
                    ("Min length", "min_length"),
                )
            ],
            name="Length",
            anchor_id=f"{varid}lengthstats",
        )

        histogram_bins = 10
        length_histogram = Image(
            histogram(summary["length"], summary, histogram_bins),
            image_format=image_format,
            # Alt text describes the actual plot (was "Scatter").
            alt="Length histogram",
            name="Length",
            anchor_id=f"{varid}length",
        )

        items.append(
            Container(
                [length_histogram, length_table],
                anchor_id=f"{varid}tbl",
                name="Length",
                sequence_type="grid",
            )
        )

    check_unicode = config["vars"]["cat"]["unicode"].get(bool)
    if check_unicode:
        n_freq_table_max = config["n_freq_table_max"].get(int)

        def counts_table(counts, title, anchor_suffix):
            # One frequency table whose total is the sum of the counts.
            return FrequencyTable(
                freq_table(
                    freqtable=counts,
                    n=counts.sum(),
                    max_number_to_print=n_freq_table_max,
                ),
                name=title,
                anchor_id=f"{varid}{anchor_suffix}",
            )

        # Categories: underscores in the alias are shown as spaces.
        category_items = [
            counts_table(
                summary["category_alias_counts"],
                "Most occurring categories",
                "category_long_values",
            )
        ]
        for raw_alias, alias_counts in summary["category_alias_char_counts"].items():
            alias = raw_alias.replace("_", " ")
            category_items.append(
                counts_table(
                    alias_counts,
                    f"Most frequent {alias} characters",
                    f"category_alias_values_{alias}",
                )
            )

        # Scripts
        script_items = [
            counts_table(
                summary["script_counts"],
                "Most occurring scripts",
                "script_values",
            )
        ]
        for script_name, script_counts in summary["script_char_counts"].items():
            script_items.append(
                counts_table(
                    script_counts,
                    f"Most frequent {script_name} characters",
                    f"script_values_{script_name}",
                )
            )

        # Blocks
        block_items = [
            counts_table(
                summary["block_alias_counts"],
                "Most occurring blocks",
                "block_alias_values",
            )
        ]
        for block_name, block_counts in summary["block_alias_char_counts"].items():
            block_items.append(
                counts_table(
                    block_counts,
                    f"Most frequent {block_name} characters",
                    f"block_alias_values_{block_name}",
                )
            )

        overview_table = Table(
            [
                {"name": label, "value": summary[key], "fmt": "fmt_numeric", "alert": False}
                for label, key in (
                    ("Unique unicode characters", "n_characters"),
                    (
                        'Unique unicode categories (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_character_property#General_Category">?</a>)',
                        "n_category",
                    ),
                    (
                        'Unique unicode scripts (<a target="_blank" href="https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode">?</a>)',
                        "n_scripts",
                    ),
                    (
                        'Unique unicode blocks (<a target="_blank" href="https://en.wikipedia.org/wiki/Unicode_block">?</a>)',
                        "n_block_alias",
                    ),
                )
            ],
            name="Overview of Unicode Properties",
            caption="The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
        )

        citems = [
            Container(
                [overview_table],
                anchor_id=f"{varid}character_overview",
                name="Overview",
                sequence_type="list",
            ),
            Container(
                [
                    counts_table(
                        summary["character_counts"],
                        "Most occurring characters",
                        "character_frequency",
                    )
                ],
                name="Characters",
                anchor_id=f"{varid}characters",
                sequence_type="named_list",
            ),
            Container(
                category_items,
                name="Categories",
                anchor_id=f"{varid}categories",
                sequence_type="named_list",
            ),
            Container(
                script_items,
                name="Scripts",
                anchor_id=f"{varid}scripts",
                sequence_type="named_list",
            ),
            Container(
                block_items,
                name="Blocks",
                anchor_id=f"{varid}blocks",
                sequence_type="named_list",
            ),
        ]

        items.append(
            Container(
                citems,
                name="Unicode",
                sequence_type="tabs",
                anchor_id=f"{varid}unicode",
            )
        )

    template_variables["bottom"] = Container(
        items,
        sequence_type="tabs",
        anchor_id=f"{varid}bottom",
    )

    return template_variables
def render_url(summary):
    """Build the report components for a URL variable.

    The bottom section is a set of tabs with one frequency table for the full
    value and one per URL component (scheme, netloc, path, query, fragment).
    The top section is a grid of variable info, a statistics table and a
    small frequency table.

    Args:
        summary: Variable summary. Must provide ``varid``, ``n``,
            ``value_counts``, a ``<part>_counts`` entry per URL component and
            the statistics indexed in the body.

    Returns:
        The dict returned by ``render_common(summary)``, extended with the
        per-component ``freqtable_<part>`` entries, ``"bottom"`` and ``"top"``.
    """
    varid = summary["varid"]
    max_rows = config["n_freq_table_max"].get(int)
    n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
    redact = config["vars"]["cat"]["redact"].get(bool)

    template_variables = render_common(summary)

    url_parts = ("scheme", "netloc", "path", "query", "fragment")
    for part in url_parts:
        template_variables[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=max_rows,
        )

    # Bottom: (template variable holding the rows, tab title, anchor slug)
    tab_specs = [("freq_table_rows", "Full", "full")]
    tab_specs.extend(
        (f"freqtable_{part}", part.capitalize(), part) for part in url_parts
    )
    frequency_tabs = [
        FrequencyTable(
            template_variables[rows_key],
            name=title,
            anchor_id=f"{varid}{slug}_frequency",
            redact=redact,
        )
        for rows_key, title, slug in tab_specs
    ]

    template_variables["bottom"] = Container(
        frequency_tabs,
        sequence_type="tabs",
        name="url stats",
        anchor_id=f"{varid}urlstats",
    )

    # Top
    info = VariableInfo(
        varid,
        summary["varname"],
        "URL",
        summary["warnings"],
        summary["description"],
    )

    stat_rows = [
        {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": key in summary["warn_fields"],
        }
        for label, key, formatter in (
            ("Distinct", "n_distinct", "fmt"),
            ("Distinct (%)", "p_distinct", "fmt_percent"),
            ("Missing", "n_missing", "fmt"),
            ("Missing (%)", "p_missing", "fmt_percent"),
        )
    ]
    stat_rows.append(
        {
            "name": "Memory size",
            "value": summary["memory_size"],
            "fmt": "fmt_bytesize",
            "alert": False,
        }
    )
    table = Table(stat_rows)

    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=n_obs_cat,
    )
    fqm = FrequencyTableSmall(mini_rows, redact=redact)

    template_variables["top"] = Container([info, table, fqm], sequence_type="grid")

    return template_variables
def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
    """Assemble the report components for a date variable.

    Args:
        config: Report settings; ``plot.image_format`` is read and the object
            is forwarded to the histogram helpers.
        summary: Variable summary. ``summary["histogram"]`` is indexed at
            positions 0 and 1; the length of element 1 minus one is reported
            as the bin count in the caption.

    Returns:
        A dict with a ``"top"`` grid container (variable info, two tables and
        a mini histogram) and a ``"bottom"`` tabs container holding the full
        histogram.
    """
    varid = summary["varid"]
    image_format = config.plot.image_format
    histogram_data = summary["histogram"]

    # Top
    header = VariableInfo(
        varid,
        summary["varname"],
        "Date",
        summary["alerts"],
        summary["description"],
    )

    counts_table = Table(
        [
            {"name": label, "value": formatter(summary[key]), "alert": False}
            for label, key, formatter in (
                ("Distinct", "n_distinct", fmt),
                ("Distinct (%)", "p_distinct", fmt_percent),
                ("Missing", "n_missing", fmt),
                ("Missing (%)", "p_missing", fmt_percent),
                ("Memory size", "memory_size", fmt_bytesize),
            )
        ]
    )

    range_table = Table(
        [
            {"name": label, "value": fmt(summary[key]), "alert": False}
            for label, key in (("Minimum", "min"), ("Maximum", "max"))
        ]
    )

    mini_histo = Image(
        mini_histogram(config, histogram_data[0], histogram_data[1], date=True),
        image_format=image_format,
        alt="Mini histogram",
    )

    top = Container(
        [header, counts_table, range_table, mini_histo],
        sequence_type="grid",
    )

    # Bottom
    n_bins = len(histogram_data[1]) - 1
    full_histogram = Image(
        histogram(config, histogram_data[0], histogram_data[1], date=True),
        image_format=image_format,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})",
        name="Histogram",
        anchor_id=f"{varid}histogram",
    )
    bottom = Container([full_histogram], sequence_type="tabs", anchor_id=varid)

    return {"top": top, "bottom": bottom}
def render_categorical_unicode(summary, varid, redact):
    """Build the Unicode overview table and the "Unicode" tab group.

    Args:
        summary: Description of the variable. The keys read are
            ``category_alias_counts``, ``category_alias_char_counts``,
            ``script_counts``, ``script_char_counts``, ``block_alias_counts``,
            ``block_alias_char_counts``, ``character_counts``,
            ``n_characters``, ``n_characters_distinct``, ``n_category``,
            ``n_scripts`` and ``n_block_alias``. Every ``*_counts`` value
            must offer ``.sum()``; the ``*_char_counts`` values must offer
            ``.items()``.
        varid: Prefix used for every anchor id generated here.
        redact: Forwarded as ``redact`` to the character-level frequency
            tables; the three "Most occurring ..." overview tables always
            pass ``redact=False``.

    Returns:
        A 2-tuple ``(overview_table, unicode_tabs)``: the "Characters and
        Unicode" table and a tabs container named "Unicode".
    """
    top_n = config["n_freq_table_max"].get(int)

    def ranked_table(counts, title, anchor, hide_values):
        # Frequency table over ``counts``, normalised by its own total.
        rows = freq_table(
            freqtable=counts,
            n=counts.sum(),
            max_number_to_print=top_n,
        )
        return FrequencyTable(
            rows, name=title, anchor_id=anchor, redact=hide_values
        )

    def largest_first(mapping):
        # Entries holding the most distinct values come first.
        return sorted(mapping.items(), key=lambda pair: -len(pair[1]))

    def with_help(count, title, url):
        # Count followed by the rendered help link, separated by one space.
        return f"{count} {help(title=title, url=url)}"

    def batch_grid(children, title, anchor):
        return Container(
            children,
            name=title,
            sequence_type="batch_grid",
            anchor_id=anchor,
            batch_size=3,
        )

    def named_list(children, title, anchor):
        return Container(
            children,
            name=title,
            anchor_id=anchor,
            sequence_type="named_list",
        )

    # NOTE(review): each batch_grid below reuses the anchor id of the
    # named_list that wraps it (e.g. f"{varid}categories") - confirm whether
    # the templates rely on this before changing either id.

    # Categories
    category_overview = ranked_table(
        summary["category_alias_counts"],
        "Most occurring categories",
        f"{varid}category_long_values",
        False,
    )
    # Underscores in alias names are shown as spaces (also inside the anchor).
    readable_categories = [
        (alias.replace("_", " "), counts)
        for alias, counts in largest_first(summary["category_alias_char_counts"])
    ]
    per_category = [
        ranked_table(
            counts,
            f"{alias}",
            f"{varid}category_alias_values_{alias}",
            redact,
        )
        for alias, counts in readable_categories
    ]
    category_items = [
        category_overview,
        batch_grid(
            per_category,
            "Most frequent character per category",
            f"{varid}categories",
        ),
    ]

    # Scripts
    script_overview = ranked_table(
        summary["script_counts"],
        "Most occurring scripts",
        f"{varid}script_values",
        False,
    )
    per_script = [
        ranked_table(
            counts,
            f"{script}",
            f"{varid}script_values_{script}",
            redact,
        )
        for script, counts in largest_first(summary["script_char_counts"])
    ]
    script_items = [
        script_overview,
        batch_grid(
            per_script,
            "Most frequent character per script",
            f"{varid}scripts",
        ),
    ]

    # Blocks (iterated in the mapping's own order, not sorted)
    block_overview = ranked_table(
        summary["block_alias_counts"],
        "Most occurring blocks",
        f"{varid}block_alias_values",
        False,
    )
    per_block = [
        ranked_table(
            counts,
            f"{block}",
            f"{varid}block_alias_values_{block}",
            redact,
        )
        for block, counts in summary["block_alias_char_counts"].items()
    ]
    block_items = [
        block_overview,
        batch_grid(
            per_block,
            "Most frequent character per block",
            f"{varid}blocks",
        ),
    ]

    overview_table = Table(
        [
            {
                "name": "Total characters",
                "value": summary["n_characters"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "Distinct characters",
                "value": summary["n_characters_distinct"],
                "fmt": "fmt_numeric",
                "alert": False,
            },
            {
                "name": "Distinct categories",
                "value": with_help(
                    summary["n_category"],
                    "Unicode categories (click for more information)",
                    "https://en.wikipedia.org/wiki/Unicode_character_property#General_Category",
                ),
                "fmt": "raw",
                "alert": False,
            },
            {
                "name": "Distinct scripts",
                "value": with_help(
                    summary["n_scripts"],
                    "Unicode scripts (click for more information)",
                    "https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode",
                ),
                "fmt": "raw",
                "alert": False,
            },
            {
                "name": "Distinct blocks",
                "value": with_help(
                    summary["n_block_alias"],
                    "Unicode blocks (click for more information)",
                    "https://en.wikipedia.org/wiki/Unicode_block",
                ),
                "fmt": "raw",
                "alert": False,
            },
        ],
        name="Characters and Unicode",
        caption="The Unicode Standard assigns character properties to each code point, which can be used to analyse textual variables. ",
    )

    character_table = ranked_table(
        summary["character_counts"],
        "Most occurring characters",
        f"{varid}character_frequency",
        redact,
    )
    tabs = [
        named_list([character_table], "Characters", f"{varid}characters"),
        named_list(category_items, "Categories", f"{varid}categories"),
        named_list(script_items, "Scripts", f"{varid}scripts"),
        named_list(block_items, "Blocks", f"{varid}blocks"),
    ]

    unicode_tabs = Container(
        tabs,
        name="Unicode",
        sequence_type="tabs",
        anchor_id=f"{varid}unicode",
    )
    return overview_table, unicode_tabs
def render_date(summary):
    """Assemble the report widgets for a date variable (global-config variant).

    The image format is read from the module-level ``config`` object. Table
    values are passed through raw, each with the name of the formatter to
    apply under the ``"fmt"`` key.

    Args:
        summary: Description of the variable. The keys read are ``varid``,
            ``varname``, ``warnings``, ``description``, ``n_unique``,
            ``p_unique``, ``n_missing``, ``p_missing``, ``memory_size``,
            ``min``, ``max`` and ``histogram`` (unpacked as positional
            arguments for the plotting helpers).

    Returns:
        A dict with a ``"top"`` grid container and a ``"bottom"`` tabs
        container holding the full histogram.
    """
    var_id = summary["varid"]
    # TODO: render common?
    img_fmt = config["plot"]["image_format"].get(str)

    def plain_row(label, key, formatter):
        # Every row of a date variable is rendered without an alert marker.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": False,
        }

    # Top
    header = VariableInfo(
        var_id,
        summary["varname"],
        "Date",
        summary["warnings"],
        summary["description"],
    )
    counts_table = Table(
        [
            plain_row("Distinct count", "n_unique", "fmt"),
            plain_row("Unique (%)", "p_unique", "fmt_percent"),
            plain_row("Missing", "n_missing", "fmt"),
            plain_row("Missing (%)", "p_missing", "fmt_percent"),
            plain_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )
    range_table = Table(
        [
            plain_row("Minimum", "min", "fmt"),
            plain_row("Maximum", "max", "fmt"),
        ]
    )

    hist = summary["histogram"]
    thumbnail = Image(
        mini_histogram(*hist, date=True),
        image_format=img_fmt,
        alt="Mini histogram",
    )
    top = Container(
        [header, counts_table, range_table, thumbnail],
        sequence_type="grid",
    )

    # Bottom
    # The caption reports one bin fewer than the number of entries in the
    # second histogram element.
    n_bins = len(hist[1]) - 1
    full_plot = Image(
        histogram(*hist, date=True),
        image_format=img_fmt,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})",
        name="Histogram",
        anchor_id=f"{var_id}histogram",
    )
    bottom = Container([full_plot], sequence_type="tabs", anchor_id=var_id)

    return {"top": top, "bottom": bottom}
def render_real(config: Settings, summary: dict) -> dict:
    """Assemble the report widgets for a real-valued variable.

    Starts from the dict returned by ``render_common(config, summary)`` and
    adds a ``"top"`` grid (variable info, two tables, mini histogram) and a
    ``"bottom"`` tab group (statistics, histogram, common values, extreme
    values).

    Args:
        config: Report settings. Reads ``config.plot.image_format``,
            ``config.report.precision`` and ``config.n_extreme_obs``; the
            object is also forwarded to ``render_common`` and the plotting
            helpers.
        summary: Description of the variable. Besides identification keys
            (``varid``, ``varname``, ``alerts``, ``description``,
            ``alert_fields``) it must hold the counts, percentages, quantiles
            and moments listed in the tables below, plus ``histogram``.

    Returns:
        The ``render_common`` dict extended with ``"top"`` and ``"bottom"``.
    """
    var_id = summary["varid"]
    result = render_common(config, summary)
    img_fmt = config.plot.image_format
    precision = config.report.precision

    # Variables without negative values get the non-negative reals label.
    type_label = (
        "Real number (ℝ<sub>≥0</sub>)"
        if summary["min"] >= 0
        else "Real number (ℝ)"
    )

    def flagged(field):
        # True when an alert was raised for this summary field.
        return field in summary["alert_fields"]

    def numeric(key):
        return fmt_numeric(summary[key], precision=precision)

    def top_row(label, text, alert=False):
        return {"name": label, "value": text, "alert": alert}

    def stat_row(label, key):
        return {"name": label, "value": numeric(key)}

    # Top
    header = VariableInfo(
        var_id,
        summary["varname"],
        type_label,
        summary["alerts"],
        summary["description"],
    )
    left_table = Table(
        [
            top_row("Distinct", fmt(summary["n_distinct"]), flagged("n_distinct")),
            top_row(
                "Distinct (%)",
                fmt_percent(summary["p_distinct"]),
                flagged("p_distinct"),
            ),
            top_row("Missing", fmt(summary["n_missing"]), flagged("n_missing")),
            top_row(
                "Missing (%)",
                fmt_percent(summary["p_missing"]),
                flagged("p_missing"),
            ),
            top_row("Infinite", fmt(summary["n_infinite"]), flagged("n_infinite")),
            top_row(
                "Infinite (%)",
                fmt_percent(summary["p_infinite"]),
                flagged("p_infinite"),
            ),
            top_row("Mean", numeric("mean")),
        ]
    )
    right_table = Table(
        [
            top_row("Minimum", numeric("min")),
            top_row("Maximum", numeric("max")),
            top_row("Zeros", fmt(summary["n_zeros"]), flagged("n_zeros")),
            top_row("Zeros (%)", fmt_percent(summary["p_zeros"]), flagged("p_zeros")),
            top_row("Negative", fmt(summary["n_negative"])),
            top_row("Negative (%)", fmt_percent(summary["p_negative"])),
            top_row("Memory size", fmt_bytesize(summary["memory_size"])),
        ]
    )
    thumbnail = Image(
        mini_histogram(config, *summary["histogram"]),
        image_format=img_fmt,
        alt="Mini histogram",
    )
    result["top"] = Container(
        [header, left_table, right_table, thumbnail],
        sequence_type="grid",
    )

    # Bottom: statistics tab
    quantile_statistics = Table(
        [
            stat_row("Minimum", "min"),
            stat_row("5-th percentile", "5%"),
            stat_row("Q1", "25%"),
            stat_row("median", "50%"),
            stat_row("Q3", "75%"),
            stat_row("95-th percentile", "95%"),
            stat_row("Maximum", "max"),
            stat_row("Range", "range"),
            stat_row("Interquartile range (IQR)", "iqr"),
        ],
        name="Quantile statistics",
    )
    descriptive_statistics = Table(
        [
            stat_row("Standard deviation", "std"),
            stat_row("Coefficient of variation (CV)", "cv"),
            stat_row("Kurtosis", "kurtosis"),
            stat_row("Mean", "mean"),
            stat_row("Median Absolute Deviation (MAD)", "mad"),
            {
                # Skewness carries its alert through a "class" entry rather
                # than the "alert" flag used by the top tables.
                "name": "Skewness",
                "value": numeric("skewness"),
                "class": "alert" if flagged("skewness") else "",
            },
            stat_row("Sum", "sum"),
            stat_row("Variance", "variance"),
            {
                "name": "Monotonicity",
                "value": fmt_monotonic(summary["monotonic"]),
            },
        ],
        name="Descriptive statistics",
    )
    statistics_tab = Container(
        [quantile_statistics, descriptive_statistics],
        anchor_id=f"{var_id}statistics",
        name="Statistics",
        sequence_type="grid",
    )

    # Bottom: histogram tab
    # The caption reports one bin fewer than the number of entries in the
    # second histogram element.
    n_bins = len(summary["histogram"][1]) - 1
    histogram_tab = Image(
        histogram(config, *summary["histogram"]),
        image_format=img_fmt,
        alt="Histogram",
        caption=f"<strong>Histogram with fixed size bins</strong> (bins={n_bins})",
        name="Histogram",
        anchor_id=f"{var_id}histogram",
    )

    # Bottom: frequency tabs, fed by rows that render_common prepared
    common_values_tab = FrequencyTable(
        result["freq_table_rows"],
        name="Common values",
        anchor_id=f"{var_id}common_values",
        redact=False,
    )
    smallest = FrequencyTable(
        result["firstn_expanded"],
        name=f"Minimum {config.n_extreme_obs} values",
        anchor_id=f"{var_id}firstn",
        redact=False,
    )
    largest = FrequencyTable(
        result["lastn_expanded"],
        name=f"Maximum {config.n_extreme_obs} values",
        anchor_id=f"{var_id}lastn",
        redact=False,
    )
    extreme_values_tab = Container(
        [smallest, largest],
        sequence_type="tabs",
        name="Extreme values",
        anchor_id=f"{var_id}extreme_values",
    )

    result["bottom"] = Container(
        [statistics_tab, histogram_tab, common_values_tab, extreme_values_tab],
        sequence_type="tabs",
        anchor_id=f"{var_id}bottom",
    )
    return result
def render_url(summary):
    """Assemble the report widgets for a URL variable.

    Starts from the dict returned by ``render_common(summary)``, stores one
    frequency table per URL component under ``freqtable_<component>``, and
    adds a ``"bottom"`` tab group (full value plus one tab per component) and
    a ``"top"`` grid (variable info, counts table, small frequency table).

    Args:
        summary: Description of the variable. The keys read are ``varid``,
            ``varname``, ``warnings``, ``value_counts``, ``n``, ``n_unique``,
            ``p_unique``, ``n_missing``, ``p_missing``, ``memory_size`` and
            ``<component>_counts`` for scheme, netloc, path, query and
            fragment.

    Returns:
        The extended template-variable dict.
    """
    tab_limit = config["n_freq_table_max"].get(int)
    mini_limit = config["vars"]["cat"]["n_obs"].get(int)

    # TODO: merge with boolean/categorical
    mini_rows = freq_table(
        freqtable=summary["value_counts"],
        n=summary["n"],
        max_number_to_print=mini_limit,
    )
    result = render_common(summary)

    # (tab title, URL component) in display order.
    components = (
        ("Scheme", "scheme"),
        ("Netloc", "netloc"),
        ("Path", "path"),
        ("Query", "query"),
        ("Fragment", "fragment"),
    )
    for _, part in components:
        result[f"freqtable_{part}"] = freq_table(
            freqtable=summary[f"{part}_counts"],
            n=summary["n"],
            max_number_to_print=tab_limit,
        )

    var_id = summary["varid"]
    # The "Full" tab reuses the rows render_common already computed.
    tab_specs = [("Full", "full", "freq_table_rows")]
    tab_specs.extend(
        (title, part, f"freqtable_{part}") for title, part in components
    )
    tabs = [
        FrequencyTable(
            result[rows_key],
            name=title,
            anchor_id=f"{var_id}{slug}_frequency",
        )
        for title, slug, rows_key in tab_specs
    ]
    result["bottom"] = Sequence(
        tabs,
        sequence_type="tabs",
        name="url stats",
        anchor_id=f"{var_id}urlstats",
    )

    # Element composition
    def plain_row(label, key, formatter):
        # None of the URL overview rows is ever flagged as an alert.
        return {
            "name": label,
            "value": summary[key],
            "fmt": formatter,
            "alert": False,
        }

    header = VariableInfo(var_id, summary["varname"], "URL", summary["warnings"])
    counts_table = Table(
        [
            plain_row("Distinct count", "n_unique", "fmt"),
            plain_row("Unique (%)", "p_unique", "fmt_percent"),
            plain_row("Missing", "n_missing", "fmt"),
            plain_row("Missing (%)", "p_missing", "fmt_percent"),
            plain_row("Memory size", "memory_size", "fmt_bytesize"),
        ]
    )
    mini_table = FrequencyTableSmall(mini_rows)
    result["top"] = Sequence(
        [header, counts_table, mini_table],
        sequence_type="grid",
    )
    return result