Example #1
0
 def sort_fn(
     grouped_examples: typing.List[typing.Tuple[dict]]
 ) -> typing.List[typing.Tuple[dict]]:
     '''
     Return a new list of example groups ordered by the metrics named in `sort_key`.

     Only the first example of every group takes part in the comparison.
     `descending` and `sort_key` are not defined in this block (presumably
     they come from the enclosing scope).
     '''
     # The placeholder handed to `extract_metric_value` as `missing` is +inf for
     # ascending and -inf for descending order.
     # NOTE(review): presumably this is the value returned when a metric is
     # absent, which would place such groups at the end either way - confirm.
     if descending:
         fallback = float('-inf')
     else:
         fallback = float('inf')

     def first_example_metrics(group):
         leading_example = group[0]
         return tuple(
             metrics.extract_metric_value(leading_example, name, missing=fallback)
             for name in sort_key)

     return sorted(grouped_examples, key=first_example_metrics, reverse=descending)
Example #2
0
	def filter_fn(examples: typing.Tuple[dict]) -> bool:
		'''
		Tell whether a group of examples should be kept.

		`exclude`, `include`, `duration` and `allowed_metrics_intervals` are not
		defined in this block (presumably they come from the enclosing scope).

		The group is rejected as soon as any of its examples:
			- has an `audio_path` contained in `exclude`;
			- has an `audio_path` absent from a non-empty `include`;
			- carries a `duration` outside of [duration[0], duration[1]);
			- has a metric from `allowed_metrics_intervals` that is None or,
			  multiplied by 100, lies outside of [lower, higher).

		Returns true only when every example of the group passes all checks.
		'''
		for example in examples:
			audio_path = example['audio_path']
			if audio_path in exclude:
				return False
			if len(include) > 0 and audio_path not in include:
				return False
			if 'duration' in example and (example['duration'] >= duration[1] or example['duration'] < duration[0]):
				return False
			for metric_name, (lower, higher) in allowed_metrics_intervals.items():
				metric_value = metrics.extract_metric_value(example, metric_name)
				# The None check has to come before scaling: `None * 100` raises TypeError
				if metric_value is None:
					return False
				# Interval bounds are compared against the metric multiplied by 100
				scaled_value = metric_value * 100
				if scaled_value < lower or scaled_value >= higher:
					return False
		# Accept only after the loop, so that every example of the group gets checked
		return True
Example #3
0
def errors(
	input_paths: typing.List[str],
	output_path: typing.Optional[str] = None,
	include_metrics: typing.List[str] = ('cer', 'wer',),
	debug_audio: bool = False,
	filter_fn: typing.Optional[typing.Callable[[typing.Tuple[dict]], bool]] = lambda x: True,
	sort_fn: typing.Optional[typing.Callable[[typing.List[typing.Tuple[dict]]], typing.List[typing.Tuple[dict]]]] = lambda x: x
) -> str:
	'''
	Write an html report comparing the same examples across several analyzed json files.

	Examples are grouped by `audio_path`; only groups that have an example in every
	input file are reported. The report lists how many examples were dropped per file,
	the average of every metric per file and a per-example table.

	Parameters:
		input_paths: paths to json files with list of analyzed examples
		output_path: path to output html (default: input_paths[0]+.html)
		include_metrics: names of metrics shown as columns of averages and examples tables
		debug_audio: include audio data into html if true
		filter_fn: function to filter tuples of examples grouped by `audio_path`,
				   function input: tuple of examples in order same as in `input_paths`
				   function output: true to include examples into html, false otherwise
				   None disables filtering
		sort_fn: function to sort tuples of examples grouped by `audio_path`,
				 function input: list of tuples of examples, each tuple has same order as in `input_paths`
				 function output: same list but in sorted order
				 None keeps the grouping order
	Returns:
		path to the written html file
	'''
	# Group examples by `audio_path`. Files are read in `input_paths` order, so the
	# i-th element of a group is treated below as the example from input_paths[i].
	# NOTE(review): this relies on each `audio_path` occurring at most once per file - confirm.
	grouped_examples = collections.defaultdict(list)
	examples_count = {}
	for path in input_paths:
		examples = transcripts.load(path)
		examples_count[path] = len(examples)
		for example in examples:
			grouped_examples[example['audio_path']].append(example)

	# Keep only groups that have an example for every input file
	grouped_examples = [group for group in grouped_examples.values() if len(group) == len(input_paths)]
	not_found_examples_count = {path: count - len(grouped_examples) for path, count in examples_count.items()}

	# The signature allows None for both callbacks, treat it as "do nothing"
	if filter_fn is not None:
		grouped_examples = list(filter(filter_fn, grouped_examples))
	filtered_examples_count = {path: count - len(grouped_examples) - not_found_examples_count[path] for path, count in examples_count.items()}
	if sort_fn is not None:
		grouped_examples = sort_fn(grouped_examples)

	style = '''
				.filters_table b.warning {color: red;}
		        table.metrics_table {border-collapse:collapse;}
		        .metrics_table th {padding: 5px; padding-left: 10px; text-align: left}
		        .metrics_table tr {padding: 5px;}
		        .metrics_table tr.new_section {border-top: 1px solid black; padding: 5px;}
		        .metrics_table td {border-left: 1px dashed black; border-right: 1px dashed black; padding: 5px; padding-left: 10px;}   
	'''

	template = '''
		<html>
		<head>
		    <meta charset="utf-8">
		    <style>
		        {style}
		    </style>
		    <script>
		        {scripts}
		    </script>
		</head>
		<body>
			<b style="padding: 10px">Filters</b><br><br>
            Dropped (example not found in other files):<br>
            <table class="filters_table">
		        {filter_not_found_table}
		    </table><br>
		    Dropped (filter_fn):
		    <table class="filters_table">
		        {filter_fn_table}
		    </table><br>
		    <table class="metrics_table">
		        {metrics_table}
		    </table>
		</body>
		</html>
	'''

	def fmt_filter_table(filtered_count: dict) -> str:
		'''Render one table row per file with the number of dropped examples, non-zero counts get css class `warning`.'''
		filtered_table = []
		for file_path, count in filtered_count.items():
			css_class = 'warning' if count > 0 else ''
			file_name = os.path.basename(file_path)
			filtered_table.append(f'<tr><td>{file_name}</td><td><b class="{css_class}">{count}</b></td></tr>')
		return '\n'.join(filtered_table)

	# Make filter "not found" table
	filter_not_found_table = fmt_filter_table(not_found_examples_count)

	# Make filter "filter_fn" table
	filter_fn_table = fmt_filter_table(filtered_examples_count)

	# Make averages table
	def fmt_averages_table(include_metrics: typing.List[str], averages: dict) -> str:
		'''Render the "Averages" section: a row of metric names followed by one row of percentages per file.'''
		# Every row is padded with empty cells up to len(include_metrics) + 3 columns
		header = '<tr><th>Averages</th>' + '<th></th>' * (len(include_metrics) + 2) + '</tr>\n'
		header += '<tr><th></th>' + ''.join(f'<th>{metric_name}</th>' for metric_name in include_metrics) + '<th></th>' * 2 + '</tr>\n'
		content = []
		for row_index, (file_name, metric_values) in enumerate(averages.items()):
			content_line = f'<td><b>{file_name}</b></td>' + ''.join(
				f'<td>{metric_value:.2%}</td>' for metric_value in metric_values) + '<td></td>' * 2
			# Only the first row opens a new section (draws the top border)
			row_open = '<tr class="new_section">' if row_index == 0 else '<tr>'
			content.append(row_open + content_line + '</tr>')
		content = '\n'.join(content)
		footer = '<tr class="new_section" style="height: 30px">' + '<th></th>' * (len(include_metrics) + 3) + '</tr>\n'
		return header + content + footer

	averages = {}
	for i, input_file in enumerate(input_paths):
		file_name = os.path.basename(input_file)
		file_examples = [examples[i] for examples in grouped_examples]
		averages[file_name] = [metrics.nanmean(file_examples, metric_name) for metric_name in include_metrics]
	average_table = fmt_averages_table(include_metrics, averages)

	# Make examples table
	def fmt_examples_table(include_metrics: typing.List[str], table_data: typing.List[dict], debug_audio: bool) -> str:
		'''Render the "Examples" section: per group a header row (index, audio path, optional audio, reference) and one row per file.'''
		# NOTE(review): audio path, reference and hypothesis are inserted into html without escaping
		header = '<tr><th>Examples</th>' + '<th></th>' * (len(include_metrics) + 2) + '</tr>\n'
		content = []
		for group_index, examples_data in enumerate(table_data):
			# The reference is wrapped into <pre> once, by the header template below
			ref = examples_data['ref']
			audio_path = examples_data['audio_path']
			embedded_audio = fmt_audio(audio_path, group_index) if debug_audio else ''
			examples_header = f'<tr class="new_section"><td colspan="{len(include_metrics)+1}"><b>{group_index}.</b>{audio_path}</td><td>{embedded_audio}</td><td>ref: <pre>{ref}</pre></td></tr>'
			examples_content = []
			for file_index, example_data in enumerate(examples_data['examples']):
				# A metric that could not be extracted (None) is shown as a dash
				metric_values = [f'{value:.2%}' if value is not None else '-' for value in example_data['metric_values']]
				file_name = example_data['file_name']
				alignment = example_data['alignment']
				hyp = '<pre>' + example_data['hyp'] + '</pre>'
				content_line = (f'<td>{file_name}</td>' + ''.join(map('<td>{}</td>'.format, metric_values)) + f'<td>{alignment}</td><td>{hyp}</td>')
				row_open = '<tr class="new_section">' if file_index == 0 else '<tr>'
				examples_content.append(row_open + content_line + '</tr>')
			content.append(examples_header)
			content.extend(examples_content)
		return header + '\n'.join(content)

	table_data = []
	for examples in grouped_examples:
		# Audio path and reference are taken from the example of the first input file
		examples_data = dict(
			audio_path = examples[0]['audio_path'],
			ref = examples[0]['ref_orig'],
			examples = [])
		for i, input_file in enumerate(input_paths):
			examples_data['examples'].append(dict(
				file_name = os.path.basename(input_file),
				metric_values = [metrics.extract_metric_value(examples[i], metric_name) for metric_name in include_metrics],
				alignment = fmt_alignment(examples[i]['alignment']),
				hyp = examples[i]["hyp"]))
		table_data.append(examples_data)

	examples_table = fmt_examples_table(include_metrics, table_data, debug_audio)

	# make output html
	metrics_table = average_table + examples_table
	report = template.format(style = style,
	                         scripts = play_script if debug_audio else '',
	                         filter_not_found_table = filter_not_found_table,
	                         filter_fn_table = filter_fn_table,
	                         metrics_table = metrics_table)
	html_path = output_path or (input_paths[0] + '.html')
	# The template declares charset utf-8, so write in utf-8 regardless of the locale;
	# the context manager closes the file even if the write fails
	with open(html_path, 'w', encoding = 'utf-8') as html_file:
		html_file.write(report)
	return html_path