def inspect_annotations_by_category(
        dataset_dir: str, category: str, output_dir: str) \
        -> _InspectAnnotationsByCategoryResult:
    """Inspects all annotations of one category and writes a per-result CSV.

    Reads `{category}_dist.json`, checks each page's annotations in a worker
    pool, and returns the error counts aggregated by error type.
    """
    annotation_file_path = os.path.join(
        dataset.make_annotation_dir_path(dataset_dir),
        f'{category}_dist.json')
    annotations_by_page_id = dataset.read_annotations_by_page_id(
        annotation_file_path)
    inspect_annotations_task_args_list = tuple(
        _InspectAnnotationsTaskArgs(
            dataset_dir=dataset_dir,
            category=category,
            page_id=page_id,
            annotations=tuple(annotations))
        for (page_id, annotations) in annotations_by_page_id.items())
    output_inspection_file_path = os.path.join(
        output_dir, _make_file_name(category))
    error_count_by_type: DefaultDict[ErrorType, int] = defaultdict(int)
    with Pool() as pool:
        with open(output_inspection_file_path, 'w') as fout:
            writer = util.csv_writer(fout)
            writer.writerow(_InspectAnnotationsTaskResult._fields)
            # Each task yields the results for one page; flatten them and
            # sort so the output file has a deterministic row order.
            for result in sorted(chain.from_iterable(tqdm(
                    pool.imap_unordered(_inspect_annotations_task,
                                        inspect_annotations_task_args_list,
                                        chunksize=10),
                    total=len(inspect_annotations_task_args_list)))):
                writer.writerow(result)
                error_count_by_type[result.error_type] += 1
    return _InspectAnnotationsByCategoryResult(
        category=category, error_count_by_type=error_count_by_type)


def inspect_pages_by_category(
        dataset_dir: str, category: str, output_dir: str) \
        -> _InspectPagesByCategoryResult:
    """Inspects the HTML/text file pair of every page in one category.

    Returns the error counts aggregated by error type.
    """
    html_dir_path = dataset.make_html_dir_path(dataset_dir, category)
    text_dir_path = dataset.make_text_dir_path(dataset_dir, category)
    inspect_page_task_args_list = []
    # Take the union of both directory listings so that pages missing
    # either their HTML or their text file are still inspected.
    for page_id in frozenset(
            dataset.get_page_id_from_file_path(file_name)
            for file_name in chain(os.listdir(html_dir_path),
                                   os.listdir(text_dir_path))):
        inspect_page_task_args_list.append(
            _InspectPageTaskArgs(
                page_id=page_id,
                html_file_path=os.path.join(
                    html_dir_path, dataset.make_html_file_name(page_id)),
                text_file_path=os.path.join(
                    text_dir_path, dataset.make_text_file_name(page_id))))
    output_page_file_path = os.path.join(output_dir, _make_file_name(category))
    error_count_by_type: DefaultDict[ErrorType, int] = defaultdict(int)
    with Pool() as pool:
        with open(output_page_file_path, 'w') as fout:
            writer = util.csv_writer(fout)
            writer.writerow(_InspectPageTaskResult._fields)
            for result in sorted(chain.from_iterable(tqdm(
                    pool.imap_unordered(_inspect_page_task,
                                        inspect_page_task_args_list,
                                        chunksize=10),
                    total=len(inspect_page_task_args_list)))):
                writer.writerow(result)
                error_count_by_type[result.error_type] += 1
    return _InspectPagesByCategoryResult(
        category=category, error_count_by_type=error_count_by_type)


def inspect_annotations(dataset_dir: str, output_dir: str) -> None:
    """Runs the annotation inspection for every category.

    Writes a summary CSV with one row per category and one column per
    error type.
    """
    util.makedirs(output_dir)
    results = tuple(
        inspect_annotations_by_category(dataset_dir, category, output_dir)
        for category in tqdm(sorted(dataset.ALL_CATEGORIES),
                             total=len(dataset.ALL_CATEGORIES)))
    summary_file_path = os.path.join(output_dir, _make_file_name('summary'))
    with open(summary_file_path, 'w') as fout:
        writer = util.csv_writer(fout)
        header_row: List[Any] = ['category']
        header_row.extend(error_type for error_type in ErrorType)
        writer.writerow(header_row)
        for result in results:
            row: List[Any] = [result.category]
            row.extend(result.error_count_by_type[error_type]
                       for error_type in ErrorType)
            writer.writerow(row)


def make_dataset_catalogs(dataset_dir: str, output_catalog_dir: str) -> None:
    """Builds a per-category catalog for every category plus a summary CSV.

    Each category contributes a header row and a value row; the summary is
    transposed before writing, so every category becomes a pair of columns.
    """
    util.makedirs(output_catalog_dir)
    summary_rows: List[List[Any]] = []
    for category in tqdm(sorted(dataset.ALL_CATEGORIES),
                         total=len(dataset.ALL_CATEGORIES)):
        result = make_category_dataset_catalogs(
            dataset_dir, output_catalog_dir, category)
        header_row = list(_MakeCategoryDatasetCatalogs._fields)
        header_row.extend(sorted(result.num_annotations_by_attribute.keys()))
        summary_rows.append(header_row)
        # The dict-valued field is blanked out and expanded into one
        # column per attribute, matching the extended header above.
        value_dict = result._asdict()
        value_dict['num_annotations_by_attribute'] = None
        value_row = list(value_dict.values())
        value_row.extend(
            result.num_annotations_by_attribute[attribute]
            for attribute in sorted(result.num_annotations_by_attribute))
        summary_rows.append(value_row)
    summary_file_path = os.path.join(
        output_catalog_dir, _make_file_name('summary'))
    with open(summary_file_path, 'w') as fout:
        writer = util.csv_writer(fout)
        for row in _transpose(summary_rows):
            writer.writerow(row)


def make_category_dataset_catalogs(
        dataset_dir: str, output_catalog_dir: str, category: str) \
        -> _MakeCategoryDatasetCatalogs:
    """Catalogs one category: writes a per-page CSV and returns aggregates."""
    answer_annotation_file_path = os.path.join(
        dataset.make_annotation_dir_path(dataset_dir),
        f'{category}_dist.json')
    (attributes, annotation_info_by_page_id) = _get_annotation_info(
        answer_annotation_file_path)
    html_dir_path = dataset.make_html_dir_path(dataset_dir, category)
    text_dir_path = dataset.make_text_dir_path(dataset_dir, category)
    file_info_task_args_list = []
    for file_name in os.listdir(html_dir_path):
        assert file_name.endswith('.html')
        page_id = dataset.get_page_id_from_file_path(file_name)
        file_info_task_args_list.append(_FileInfoTaskArgs(
            page_id=page_id,
            html_file_path=os.path.join(
                html_dir_path, dataset.make_html_file_name(page_id)),
            text_file_path=os.path.join(
                text_dir_path, dataset.make_text_file_name(page_id))))
    with Pool() as pool:
        file_info_by_page_id = {
            result.page_id: result
            for result in tqdm(pool.imap_unordered(_file_info_task,
                                                   file_info_task_args_list,
                                                   chunksize=100),
                               total=len(file_info_task_args_list))}
    catalog_file_path = os.path.join(
        output_catalog_dir, _make_file_name(category))
    with open(catalog_file_path, 'w') as fout:
        writer = util.csv_writer(fout)
        header_row = list(_FileInfoTaskResult._fields)
        header_row.append('num_annotations')
        header_row.extend(attributes)
        writer.writerow(header_row)
        # Running totals for the aggregate statistics returned below.
        num_pages = 0
        total_html_file_size = 0
        total_text_file_size = 0
        num_disambiguation_pages = 0
        total_infobox_count = 0
        num_pages_with_annotation = 0
        num_pages_with_infobox = 0
        total_num_annotations = 0
        total_num_annotations_by_attribute: DefaultDict[str, int] \
            = defaultdict(int)
        for page_id in sorted(file_info_by_page_id.keys()):
            file_info = file_info_by_page_id[page_id]
            row = list(file_info)
            annotation_info = annotation_info_by_page_id.get(page_id)
            if annotation_info:
                row.append(annotation_info.num_annotations)
                row.extend(
                    annotation_info.num_annotations_by_attribute[attribute]
                    for attribute in attributes)
            else:
                # Pages without annotations get empty annotation columns.
                row.append(None)
                row.extend(None for unused in range(len(attributes)))
            writer.writerow(row)
            num_pages += 1
            total_html_file_size += file_info.html_file_size
            total_text_file_size += file_info.text_file_size
            total_infobox_count += file_info.infobox_count
            if annotation_info is not None \
                    and annotation_info.num_annotations > 0:
                num_pages_with_annotation += 1
                total_num_annotations += annotation_info.num_annotations
                _add_values_in_place(
                    total_num_annotations_by_attribute,
                    annotation_info.num_annotations_by_attribute)
            if file_info.is_disambiguation_page:
                num_disambiguation_pages += 1
            if file_info.infobox_count > 0:
                num_pages_with_infobox += 1
    return _MakeCategoryDatasetCatalogs(
        category=category,
        num_pages=num_pages,
        total_html_file_size=total_html_file_size,
        total_text_file_size=total_text_file_size,
        num_disambiguation_pages=num_disambiguation_pages,
        total_infobox_count=total_infobox_count,
        num_pages_with_annotation=num_pages_with_annotation,
        num_pages_with_infobox=num_pages_with_infobox,
        num_attribute_types=len(attributes),
        total_num_annotations=total_num_annotations,
        num_annotations_by_attribute=total_num_annotations_by_attribute)
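

# Example driver (not part of the original module): a minimal sketch of how
# the two entry points above might be wired to a command line. The argument
# names and the 'inspection'/'catalog' output subdirectories are assumptions
# for illustration only, not the project's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Inspect annotations and build dataset catalogs.')
    parser.add_argument('dataset_dir')
    parser.add_argument('output_dir')
    args = parser.parse_args()

    # Hypothetical layout: inspection reports and catalogs land in
    # sibling subdirectories under the chosen output directory.
    inspect_annotations(args.dataset_dir,
                        os.path.join(args.output_dir, 'inspection'))
    make_dataset_catalogs(args.dataset_dir,
                          os.path.join(args.output_dir, 'catalog'))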