def sort_fn(
	grouped_examples: typing.List[typing.Tuple[dict]]
) -> typing.List[typing.Tuple[dict]]:
	'''
	Return a new list of example groups ordered by the metrics named in `sort_key`.

	Only the first example of every group contributes to the ordering. `descending`,
	`sort_key` and `metrics` are not defined in this function; they are read from the
	surrounding scope.

	Parameters:
		grouped_examples: list of example tuples to order
	Returns:
		a new sorted list; the input list is left untouched
	'''
	# The fallback is passed to `extract_metric_value` as `missing`; presumably it is
	# what the helper yields for an absent metric (TODO confirm). It is the extreme
	# value that ends up last for the chosen sort direction.
	if descending:
		fallback = float('-inf')
	else:
		fallback = float('inf')

	def first_example_metrics(group: typing.Tuple[dict]) -> tuple:
		head = group[0]
		return tuple(metrics.extract_metric_value(head, name, missing=fallback) for name in sort_key)

	ordered = list(grouped_examples)
	ordered.sort(key=first_example_metrics, reverse=descending)
	return ordered
def filter_fn(examples: typing.Tuple[dict]) -> bool:
	'''
	Decide whether a group of examples should be kept.

	The group is rejected as soon as any of its examples:
		- has an `audio_path` listed in `exclude`;
		- has an `audio_path` absent from a non-empty `include`;
		- has a `duration` outside the half-open interval [duration[0], duration[1]);
		- has a metric from `allowed_metrics_intervals` that is missing (None) or whose
		  value, scaled by 100, is outside the half-open interval [lower, higher).

	`exclude`, `include`, `duration`, `allowed_metrics_intervals` and `metrics` are not
	defined in this function; they are read from the surrounding scope.

	Parameters:
		examples: tuple of examples belonging to one group
	Returns:
		true when every example passes every check (also for an empty tuple), false otherwise
	'''
	for example in examples:
		audio_path = example['audio_path']
		if audio_path in exclude:
			return False
		if len(include) > 0 and audio_path not in include:
			return False

		if 'duration' in example and (example['duration'] >= duration[1] or example['duration'] < duration[0]):
			return False

		for metric_name, (lower, higher) in allowed_metrics_intervals.items():
			metric_value = metrics.extract_metric_value(example, metric_name)
			# Test for a missing metric BEFORE scaling: `None * 100` raises TypeError,
			# which previously made the None check below unreachable.
			if metric_value is None:
				return False
			# Interval bounds are compared against the value multiplied by 100.
			metric_value = metric_value * 100
			if metric_value < lower or metric_value >= higher:
				return False
	return True
def errors(
	input_paths: typing.List[str],
	output_path: typing.Optional[str] = None,
	include_metrics: typing.List[str] = ('cer', 'wer',),
	debug_audio: bool = False,
	filter_fn: typing.Optional[typing.Callable[[typing.Tuple[dict]], bool]] = lambda x: True,
	sort_fn: typing.Optional[typing.Callable[[typing.List[typing.Tuple[dict]]], typing.List[typing.Tuple[dict]]]] = lambda x: x
) -> str:
	'''
	Build an html report that compares examples from several files side by side.

	Examples are grouped by `audio_path`; a group is kept only when it holds exactly
	one example per input file. The report contains the per-file counts of dropped
	examples, the per-file metric averages and one section per remaining group.

	Parameters:
		input_paths: paths to json files with list of analyzed examples
		output_path: path to output html (default: input_path[0]+.html)
		include_metrics: names of metrics shown in the averages and examples tables
		debug_audio: include audio data into html if true
		filter_fn: function to filter tuples of examples grouped by `audio_path`,
				function input: tuple of examples in order same as in `input_paths`
				function output: true to include examples into html, false otherwise
				None keeps every group
		sort_fn: function to sort tuples of examples grouped by `audio_path`,
				function input: list of tuples of examples, each tuple has same order as in `input_paths`
				function output: same list but in sorted order
				None keeps the grouping order
	Returns:
		path of the written html file
	'''
	grouped_examples = collections.defaultdict(list)
	examples_count = {}
	for path in input_paths:
		examples = transcripts.load(path)
		examples_count[path] = len(examples)
		for example in examples:
			grouped_examples[example['audio_path']].append(example)

	# Keep only groups that hold as many examples as there are input files.
	# NOTE(review): position i of a group is later treated as the example from
	# input_paths[i]; that presumably relies on `audio_path` being unique inside
	# each file - confirm with the data producers.
	grouped_examples = [group for group in grouped_examples.values() if len(group) == len(input_paths)]
	not_found_examples_count = {path: count - len(grouped_examples) for path, count in examples_count.items()}

	# The signature allows None for both callbacks, so treat None as "no filtering" / "no sorting".
	if filter_fn is not None:
		grouped_examples = [group for group in grouped_examples if filter_fn(group)]
	filtered_examples_count = {
		path: count - len(grouped_examples) - not_found_examples_count[path]
		for path, count in examples_count.items()
	}
	if sort_fn is not None:
		grouped_examples = sort_fn(grouped_examples)

	style = '''
		.filters_table b.warning {color: red;}
		table.metrics_table {border-collapse:collapse;}
		.metrics_table th {padding: 5px; padding-left: 10px; text-align: left}
		.metrics_table tr {padding: 5px;}
		.metrics_table tr.new_section {border-top: 1px solid black; padding: 5px;}
		.metrics_table td {border-left: 1px dashed black; border-right: 1px dashed black; padding: 5px; padding-left: 10px;}
	'''

	template = '''
		<html>
		<head>
			<meta charset="utf-8">
			<style>
				{style}
			</style>
			<script>
				{scripts}
			</script>
		</head>
		<body>
			<b style="padding: 10px">Filters</b><br><br>
			Dropped (example not found in other files):<br>
			<table class="filters_table">
				{filter_not_found_table}
			</table><br>
			Dropped (filter_fn):
			<table class="filters_table">
				{filter_fn_table}
			</table><br>
			<table class="metrics_table">
				{metrics_table}
			</table>
		</body>
		</html>
	'''

	# Make filter "not found" table
	def fmt_filter_table(filtered_count: dict) -> str:
		# One row per file: file name and dropped count, highlighted when the count is positive.
		filtered_table = []
		for file_path, count in filtered_count.items():
			css_class = 'warning' if count > 0 else ''
			file_name = os.path.basename(file_path)
			filtered_table.append(f'<tr><td>{file_name}</td><td><b class="{css_class}">{count}</b></td></tr>')
		return '\n'.join(filtered_table)

	filter_not_found_table = fmt_filter_table(not_found_examples_count)

	# Make filter "filter_fn" table
	filter_fn_table = fmt_filter_table(filtered_examples_count)

	# Make averages table
	def fmt_averages_table(include_metrics: typing.List[str], averages: dict) -> str:
		# Two header rows (title, metric names), one row per file, then a spacer row.
		header = '<tr><th>Averages</th>' + '<th></th>' * (len(include_metrics) + 2) + '</tr>\n'
		header += '<tr><th></th>' + ''.join(f'<th>{metric_name}</th>' for metric_name in include_metrics) + '<th></th>' * 2 + '</tr>\n'
		content = []
		for i, (file_name, metric_values) in enumerate(averages.items()):
			content_line = f'<td><b>{file_name}</b></td>' + ''.join(
				f'<td>{metric_value:.2%}</td>' for metric_value in metric_values) + '<td></td>' * 2
			# Only the first row opens a new visual section.
			if i == 0:
				content_line = '<tr class="new_section">' + content_line + '</tr>'
			else:
				content_line = '<tr>' + content_line + '</tr>'
			content.append(content_line)
		content = '\n'.join(content)
		footer = '<tr class="new_section" style="height: 30px">' + '<th></th>' * (len(include_metrics) + 3) + '</tr>\n'
		return header + content + footer

	averages = {}
	for i, input_file in enumerate(input_paths):
		file_name = os.path.basename(input_file)
		file_examples = [examples[i] for examples in grouped_examples]
		averages[file_name] = [metrics.nanmean(file_examples, metric_name) for metric_name in include_metrics]
	average_table = fmt_averages_table(include_metrics, averages)

	# Make examples table
	def fmt_examples_table(include_metrics: typing.List[str], table_data: typing.List[dict], debug_audio: bool) -> str:
		# One header row per group (index, audio path, optional audio, reference text)
		# followed by one row per input file.
		header = '<tr><th>Examples</th>' + '<th></th>' * (len(include_metrics) + 2) + '</tr>\n'
		content = []
		for i, examples_data in enumerate(table_data):
			ref = '<pre>' + examples_data['ref'] + '</pre>'
			audio_path = examples_data['audio_path']
			embedded_audio = fmt_audio(audio_path, i) if debug_audio else ''
			examples_header = f'<tr class="new_section"><td colspan="{len(include_metrics)+1}"><b>{i}.</b>{audio_path}</td><td>{embedded_audio}</td><td>ref: <pre>{ref}</pre></td></tr>'
			examples_content = []
			# `j` (not `i`) so the group index above is not shadowed by the row index.
			for j, example_data in enumerate(examples_data['examples']):
				metric_values = [f'{value:.2%}' if value is not None else '-' for value in example_data['metric_values']]
				file_name = example_data['file_name']
				alignment = example_data['alignment']
				hyp = '<pre>' + example_data['hyp'] + '</pre>'
				content_line = (f'<td>{file_name}</td>' + ''.join(map('<td>{}</td>'.format, metric_values)) +
								f'<td>{alignment}</td><td>{hyp}</td>')
				if j == 0:
					examples_content.append('<tr class="new_section">' + content_line + '</tr>')
				else:
					examples_content.append('<tr>' + content_line + '</tr>')
			content.append(examples_header)
			content.extend(examples_content)
		return header + '\n'.join(content)

	table_data = []
	for examples in grouped_examples:
		# Audio path and reference text are taken from the first example of the group.
		examples_data = dict(
			audio_path = examples[0]['audio_path'],
			ref = examples[0]['ref_orig'],
			examples = [])
		for i, input_file in enumerate(input_paths):
			examples_data['examples'].append(dict(
				file_name = os.path.basename(input_file),
				metric_values = [metrics.extract_metric_value(examples[i], metric_name) for metric_name in include_metrics],
				alignment = fmt_alignment(examples[i]['alignment']),
				hyp = examples[i]["hyp"]))
		table_data.append(examples_data)
	examples_data = fmt_examples_table(include_metrics, table_data, debug_audio)

	# make output html
	metrics_table = average_table + examples_data
	report = template.format(style = style,
							scripts = play_script if debug_audio else '',
							filter_not_found_table = filter_not_found_table,
							filter_fn_table = filter_fn_table,
							metrics_table = metrics_table)
	html_path = output_path or (input_paths[0] + '.html')
	# The page declares charset utf-8, so write it as utf-8 regardless of the platform
	# default, and close the file deterministically.
	with open(html_path, 'w', encoding = 'utf-8') as html_file:
		html_file.write(report)
	return html_path