def render_categorical_length(
    config: Settings, summary: dict, varid: str
) -> Tuple[Renderable, Renderable]:
    """Render the length statistics and length histogram of a categorical variable.

    Args:
        config: Report settings; ``report.precision`` formats the mean length
            and ``plot.image_format`` selects the histogram image format.
        summary: Variable summary providing ``max_length``, ``median_length``,
            ``mean_length``, ``min_length`` and ``histogram_length`` (the
            latter is unpacked as positional arguments to ``histogram``).
        varid: Prefix used to build the anchor ids of both renderables.

    Returns:
        A ``(table, image)`` pair: the length statistics table followed by
        the length histogram image.
    """

    def stat(label, formatted):
        # Every row has the same shape and is never flagged as an alert.
        return {"name": label, "value": formatted, "alert": False}

    rows = [
        stat("Max length", fmt_number(summary["max_length"])),
        stat("Median length", fmt_number(summary["median_length"])),
        stat(
            "Mean length",
            fmt_numeric(summary["mean_length"], precision=config.report.precision),
        ),
        stat("Min length", fmt_number(summary["min_length"])),
    ]
    stats_table = Table(rows, name="Length", anchor_id=f"{varid}lengthstats")

    plot = histogram(config, *summary["histogram_length"])
    histogram_image = Image(
        plot,
        image_format=config.plot.image_format,
        alt="length histogram",
        name="Length",
        caption="Histogram of lengths of the category",
        anchor_id=f"{varid}length",
    )

    return stats_table, histogram_image
def render_categorical_unicode(
    config: Settings, summary: dict, varid: str
) -> Tuple[Renderable, Renderable]:
    """Render the Unicode character analysis of a categorical variable.

    Args:
        config: Report settings; ``n_freq_table_max`` is passed to every
            ``freq_table`` call as ``max_number_to_print`` and
            ``vars.cat.redact`` is forwarded as the ``redact`` flag of the
            character-level frequency tables.
        summary: Variable summary. Reads ``category_alias_counts``,
            ``category_alias_char_counts``, ``script_counts``,
            ``script_char_counts``, ``block_alias_counts``,
            ``block_alias_char_counts``, ``character_counts``,
            ``n_characters``, ``n_characters_distinct``, ``n_category``,
            ``n_scripts`` and ``n_block_alias``.
        varid: Prefix used to build every anchor id created here.

    Returns:
        A pair made of the "Characters and Unicode" overview table and a
        ``tabs`` container holding the Characters, Categories, Scripts and
        Blocks tabs.
    """
    max_rows = config.n_freq_table_max

    def summed_table(counts, title, anchor, redact):
        # Frequency table whose total is the sum of the counts themselves.
        return FrequencyTable(
            freq_table(
                freqtable=counts,
                n=counts.sum(),
                max_number_to_print=max_rows,
            ),
            name=title,
            anchor_id=anchor,
            redact=redact,
        )

    def largest_first(char_counts):
        # Stable sort: groups with more entries come first.
        return sorted(char_counts.items(), key=lambda entry: -len(entry[1]))

    def batch_grid(tables, title, anchor):
        # Per-group tables laid out two per batch, each with a subtitle.
        return Container(
            tables,
            name=title,
            sequence_type="batch_grid",
            anchor_id=anchor,
            batch_size=2,
            subtitles=True,
        )

    def named_list(items, title, anchor):
        # One tab of the final "Unicode" container.
        return Container(
            items,
            name=title,
            anchor_id=anchor,
            sequence_type="named_list",
        )

    def row(label, value):
        return {"name": label, "value": value, "alert": False}

    def count_with_help(key, title, url):
        # Formatted count followed by the help link markup.
        return f"{fmt_number(summary[key])} {help(title=title, url=url)}"

    # --- Categories -------------------------------------------------------
    category_overview = summed_table(
        summary["category_alias_counts"],
        "Most occurring categories",
        f"{varid}category_long_values",
        False,
    )
    # Underscores in the alias are replaced by spaces; the spaced form is used
    # for the display name and for the anchor id alike.
    # NOTE(review): this puts spaces into the anchor id - confirm the templates
    # tolerate that.
    spaced_categories = (
        (alias.replace("_", " "), char_counts)
        for alias, char_counts in largest_first(
            summary["category_alias_char_counts"]
        )
    )
    category_tables = [
        summed_table(
            char_counts,
            f"{alias}",
            f"{varid}category_alias_values_{alias}",
            config.vars.cat.redact,
        )
        for alias, char_counts in spaced_categories
    ]
    # NOTE(review): each grid below shares its anchor id with the tab that
    # wraps it further down (e.g. f"{varid}categories") - confirm intended.
    category_items = [
        category_overview,
        batch_grid(
            category_tables,
            "Most frequent character per category",
            f"{varid}categories",
        ),
    ]

    # --- Scripts ----------------------------------------------------------
    script_overview = summed_table(
        summary["script_counts"],
        "Most occurring scripts",
        f"{varid}script_values",
        False,
    )
    script_tables = []
    for script, char_counts in largest_first(summary["script_char_counts"]):
        script_tables.append(
            summed_table(
                char_counts,
                f"{script}",
                f"{varid}script_values_{script}",
                config.vars.cat.redact,
            )
        )
    script_items = [
        script_overview,
        batch_grid(
            script_tables,
            "Most frequent character per script",
            f"{varid}scripts",
        ),
    ]

    # --- Blocks (kept in the summary's own iteration order) ---------------
    block_overview = summed_table(
        summary["block_alias_counts"],
        "Most occurring blocks",
        f"{varid}block_alias_values",
        False,
    )
    block_tables = []
    for block, char_counts in summary["block_alias_char_counts"].items():
        block_tables.append(
            summed_table(
                char_counts,
                f"{block}",
                f"{varid}block_alias_values_{block}",
                config.vars.cat.redact,
            )
        )
    block_items = [
        block_overview,
        batch_grid(
            block_tables,
            "Most frequent character per block",
            f"{varid}blocks",
        ),
    ]

    # --- Overview table ---------------------------------------------------
    overview_rows = [
        row("Total characters", fmt_number(summary["n_characters"])),
        row("Distinct characters", fmt_number(summary["n_characters_distinct"])),
        row(
            "Distinct categories",
            count_with_help(
                "n_category",
                "Unicode categories (click for more information)",
                "https://en.wikipedia.org/wiki/Unicode_character_property#General_Category",
            ),
        ),
        row(
            "Distinct scripts",
            count_with_help(
                "n_scripts",
                "Unicode scripts (click for more information)",
                "https://en.wikipedia.org/wiki/Script_(Unicode)#List_of_scripts_in_Unicode",
            ),
        ),
        row(
            "Distinct blocks",
            count_with_help(
                "n_block_alias",
                "Unicode blocks (click for more information)",
                "https://en.wikipedia.org/wiki/Unicode_block",
            ),
        ),
    ]
    overview_table = Table(
        overview_rows,
        name="Characters and Unicode",
        # The trailing space is part of the original caption text.
        caption=(
            "The Unicode Standard assigns character properties to each code point, "
            "which can be used to analyse textual variables. "
        ),
    )

    # --- Tabs -------------------------------------------------------------
    # The character table takes its total from the summary instead of summing
    # the counts, so it is built without the helper.
    character_table = FrequencyTable(
        freq_table(
            freqtable=summary["character_counts"],
            n=summary["n_characters"],
            max_number_to_print=max_rows,
        ),
        name="Most occurring characters",
        anchor_id=f"{varid}character_frequency",
        redact=config.vars.cat.redact,
    )
    tabs = [
        named_list([character_table], "Characters", f"{varid}characters"),
        named_list(category_items, "Categories", f"{varid}categories"),
        named_list(script_items, "Scripts", f"{varid}scripts"),
        named_list(block_items, "Blocks", f"{varid}blocks"),
    ]
    unicode_tabs = Container(
        tabs,
        name="Unicode",
        sequence_type="tabs",
        anchor_id=f"{varid}unicode",
    )

    return overview_table, unicode_tabs
def get_dataset_overview(config: Settings, summary: dict) -> Renderable:
    """Render the dataset-level overview: dataset statistics and variable types.

    Args:
        config: Report settings; ``report.precision`` formats the per-type
            variable counts.
        summary: Dataset summary. ``summary["table"]`` must provide ``n_var``,
            ``n``, ``n_cells_missing``, ``p_cells_missing``, ``memory_size``,
            ``record_size`` and ``types`` (a mapping of type to count). When
            ``n_duplicates`` is present, ``p_duplicates`` is read as well and
            two duplicate-row entries are added.

    Returns:
        A ``grid`` container named "Overview" holding the "Dataset statistics"
        table followed by the "Variable types" table.
    """
    stats = summary["table"]

    # (label, formatter, key into ``stats``) in display order.
    metric_specs = [
        ("Number of variables", fmt_number, "n_var"),
        ("Number of observations", fmt_number, "n"),
        ("Missing cells", fmt_number, "n_cells_missing"),
        ("Missing cells (%)", fmt_percent, "p_cells_missing"),
    ]
    # Duplicate statistics are optional and sit between the missing-cell and
    # memory entries when available.
    if "n_duplicates" in stats:
        metric_specs += [
            ("Duplicate rows", fmt_number, "n_duplicates"),
            ("Duplicate rows (%)", fmt_percent, "p_duplicates"),
        ]
    metric_specs += [
        ("Total size in memory", fmt_bytesize, "memory_size"),
        ("Average record size in memory", fmt_bytesize, "record_size"),
    ]

    metric_rows = [
        {"name": label, "value": formatter(stats[key])}
        for label, formatter, key in metric_specs
    ]
    statistics_table = Table(metric_rows, name="Dataset statistics")

    type_rows = []
    for type_name, count in stats["types"].items():
        type_rows.append(
            {
                "name": str(type_name),
                "value": fmt_numeric(count, precision=config.report.precision),
            }
        )
    types_table = Table(type_rows, name="Variable types")

    return Container(
        [statistics_table, types_table],
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )