Esempio n. 1
0
def inspect_annotations_by_category(
        dataset_dir: str, category: str, output_dir: str) \
        -> _InspectAnnotationsByCategoryResult:
    """Inspect one category's annotations and write the findings as CSV.

    Reads ``<category>_dist.json`` from the dataset's annotation directory,
    runs ``_inspect_annotations_task`` once per page in a process pool, and
    writes the sorted results to a per-category file under ``output_dir``.

    Args:
        dataset_dir: Root directory of the dataset.
        category: Category whose annotations are inspected.
        output_dir: Directory that receives the inspection file. It is not
            created by this function.

    Returns:
        The category together with the number of written result rows per
        error type.
    """
    annotation_dir_path = dataset.make_annotation_dir_path(dataset_dir)
    annotations_by_page_id = dataset.read_annotations_by_page_id(
        os.path.join(annotation_dir_path, f'{category}_dist.json'))

    # One task per page, carrying that page's annotations as a tuple.
    task_args_list = [
        _InspectAnnotationsTaskArgs(
            dataset_dir=dataset_dir,
            category=category,
            page_id=page_id,
            annotations=tuple(page_annotations))
        for page_id, page_annotations in annotations_by_page_id.items()]

    report_file_path = os.path.join(output_dir, _make_file_name(category))
    error_count_by_type: DefaultDict[ErrorType, int] = defaultdict(int)
    with Pool() as pool, open(report_file_path, 'w') as fout:
        writer = util.csv_writer(fout)
        writer.writerow(_InspectAnnotationsTaskResult._fields)
        # Each task returns an iterable of results; the progress bar advances
        # once per finished task (i.e. per page).
        per_page_results = tqdm(
            pool.imap_unordered(
                _inspect_annotations_task, task_args_list, chunksize=10),
            total=len(task_args_list))
        # Workers complete in arbitrary order, so all results are flattened
        # and sorted before anything is written.
        for result in sorted(chain.from_iterable(per_page_results)):
            writer.writerow(result)
            error_count_by_type[result.error_type] += 1
    return _InspectAnnotationsByCategoryResult(
        category=category,
        error_count_by_type=error_count_by_type)
Esempio n. 2
0
def inspect_pages_by_category(
        dataset_dir: str, category: str, output_dir: str) \
        -> _InspectPagesByCategoryResult:
    """Inspect every page of one category and write the findings as CSV.

    Page ids are collected from the file names found in the category's HTML
    and text directories. ``_inspect_page_task`` is run once per page id in a
    process pool and the sorted results are written to a per-category file
    under ``output_dir``.

    Args:
        dataset_dir: Root directory of the dataset.
        category: Category whose pages are inspected.
        output_dir: Directory that receives the inspection file. It is not
            created by this function.

    Returns:
        The category together with the number of written result rows per
        error type.
    """
    html_dir_path = dataset.make_html_dir_path(dataset_dir, category)
    text_dir_path = dataset.make_text_dir_path(dataset_dir, category)

    # A page id is included when a file for it exists in either directory;
    # the set removes ids that appear in both.
    page_ids = frozenset(
        dataset.get_page_id_from_file_path(file_name)
        for file_name in chain(
            os.listdir(html_dir_path), os.listdir(text_dir_path)))
    task_args_list = [
        _InspectPageTaskArgs(
            page_id=page_id,
            html_file_path=os.path.join(
                html_dir_path, dataset.make_html_file_name(page_id)),
            text_file_path=os.path.join(
                text_dir_path, dataset.make_text_file_name(page_id)))
        for page_id in page_ids]

    report_file_path = os.path.join(output_dir, _make_file_name(category))
    error_count_by_type: DefaultDict[ErrorType, int] = defaultdict(int)
    with Pool() as pool, open(report_file_path, 'w') as fout:
        writer = util.csv_writer(fout)
        writer.writerow(_InspectPageTaskResult._fields)
        # Each task returns an iterable of results; the progress bar advances
        # once per finished task (i.e. per page).
        per_page_results = tqdm(
            pool.imap_unordered(
                _inspect_page_task, task_args_list, chunksize=10),
            total=len(task_args_list))
        # Neither the set iteration order nor the worker completion order is
        # fixed, so all results are flattened and sorted before writing.
        for result in sorted(chain.from_iterable(per_page_results)):
            writer.writerow(result)
            error_count_by_type[result.error_type] += 1
    return _InspectPagesByCategoryResult(
        category=category,
        error_count_by_type=error_count_by_type)
Esempio n. 3
0
def inspect_annotations(dataset_dir: str, output_dir: str) -> None:
    """Inspect the annotations of every category and write a summary CSV.

    Runs ``inspect_annotations_by_category`` for each entry of
    ``dataset.ALL_CATEGORIES`` in sorted order, then writes one summary row
    per category holding the count for each ``ErrorType`` member.

    Args:
        dataset_dir: Root directory of the dataset.
        output_dir: Directory for the per-category files and the summary
            file; ``util.makedirs`` is called on it first.
    """
    util.makedirs(output_dir)

    results = []
    for category in tqdm(sorted(dataset.ALL_CATEGORIES),
                         total=len(dataset.ALL_CATEGORIES)):
        results.append(
            inspect_annotations_by_category(dataset_dir, category, output_dir))

    # Enumerate the error types once so the header and every row share the
    # same column order.
    error_types = tuple(ErrorType)
    summary_file_path = os.path.join(output_dir, _make_file_name('summary'))
    with open(summary_file_path, 'w') as fout:
        writer = util.csv_writer(fout)
        header_row: List[Any] = ['category', *error_types]
        writer.writerow(header_row)
        for result in results:
            counts = result.error_count_by_type
            row: List[Any] = [result.category]
            row += [counts[error_type] for error_type in error_types]
            writer.writerow(row)
Esempio n. 4
0
def make_dataset_catalogs(dataset_dir: str, output_catalog_dir: str) -> None:
    """Build the catalog of every category and write a transposed summary.

    Runs ``make_category_dataset_catalogs`` for each entry of
    ``dataset.ALL_CATEGORIES`` in sorted order. Every category contributes
    two rows (field names, then values) to a table that is passed through
    ``_transpose`` before being written to the summary file.

    Args:
        dataset_dir: Root directory of the dataset.
        output_catalog_dir: Directory for the per-category catalogs and the
            summary file; ``util.makedirs`` is called on it first.
    """
    util.makedirs(output_catalog_dir)

    summary_rows: List[List[Any]] = []
    for category in tqdm(sorted(dataset.ALL_CATEGORIES),
                         total=len(dataset.ALL_CATEGORIES)):
        result = make_category_dataset_catalogs(
            dataset_dir, output_catalog_dir, category)
        counts_by_attribute = result.num_annotations_by_attribute
        sorted_attributes = sorted(counts_by_attribute)

        # Name row: the result's field names followed by the attribute names.
        summary_rows.append(
            [*_MakeCategoryDatasetCatalogs._fields, *sorted_attributes])

        # Value row: the per-attribute mapping is expanded into its own
        # trailing cells, so its own cell is blanked out.
        values_by_field = result._asdict()
        values_by_field['num_annotations_by_attribute'] = None
        summary_rows.append(
            list(values_by_field.values())
            + [counts_by_attribute[attribute]
               for attribute in sorted_attributes])

    summary_file_path = os.path.join(
        output_catalog_dir, _make_file_name('summary'))
    with open(summary_file_path, 'w') as fout:
        writer = util.csv_writer(fout)
        for transposed_row in _transpose(summary_rows):
            writer.writerow(transposed_row)
Esempio n. 5
0
def make_category_dataset_catalogs(
        dataset_dir: str, output_catalog_dir: str, category: str) \
        -> _MakeCategoryDatasetCatalogs:
    """Write the per-page catalog CSV of one category and return its totals.

    File information is gathered with ``_file_info_task`` in a process pool
    for every file listed in the category's HTML directory. Each page
    becomes one CSV row (file info, annotation count, per-attribute counts),
    written in sorted page-id order, while category-wide totals are
    accumulated.

    Args:
        dataset_dir: Root directory of the dataset.
        output_catalog_dir: Directory that receives the catalog file. It is
            not created by this function.
        category: Category to catalog.

    Returns:
        The aggregated statistics of the category.

    Raises:
        AssertionError: If the HTML directory holds a file whose name does
            not end with ``.html`` (only when assertions are enabled).
    """
    annotation_dir_path = dataset.make_annotation_dir_path(dataset_dir)
    attributes, annotation_info_by_page_id = _get_annotation_info(
        os.path.join(annotation_dir_path, f'{category}_dist.json'))

    html_dir_path = dataset.make_html_dir_path(dataset_dir, category)
    text_dir_path = dataset.make_text_dir_path(dataset_dir, category)
    file_info_task_args_list = []
    for file_name in os.listdir(html_dir_path):
        assert file_name.endswith('.html')
        page_id = dataset.get_page_id_from_file_path(file_name)
        # Only the HTML directory is listed; the text path is derived from
        # the page id.
        html_file_path = os.path.join(
            html_dir_path, dataset.make_html_file_name(page_id))
        text_file_path = os.path.join(
            text_dir_path, dataset.make_text_file_name(page_id))
        file_info_task_args_list.append(_FileInfoTaskArgs(
            page_id=page_id,
            html_file_path=html_file_path,
            text_file_path=text_file_path))

    with Pool() as pool:
        file_infos = tqdm(
            pool.imap_unordered(
                _file_info_task, file_info_task_args_list, chunksize=100),
            total=len(file_info_task_args_list))
        file_info_by_page_id = {info.page_id: info for info in file_infos}

    # Category-wide accumulators.
    num_pages = 0
    total_html_file_size = 0
    total_text_file_size = 0
    num_disambiguation_pages = 0
    total_infobox_count = 0
    num_pages_with_annotation = 0
    num_pages_with_infobox = 0
    total_num_annotations = 0
    total_num_annotations_by_attribute: DefaultDict[str, int] \
        = defaultdict(int)

    catalog_file_path = os.path.join(
        output_catalog_dir, _make_file_name(category))
    with open(catalog_file_path, 'w') as fout:
        writer = util.csv_writer(fout)
        writer.writerow(
            [*_FileInfoTaskResult._fields, 'num_annotations', *attributes])
        for page_id in sorted(file_info_by_page_id):
            file_info = file_info_by_page_id[page_id]
            annotation_info = annotation_info_by_page_id.get(page_id)

            # Pages without annotation info get empty annotation cells.
            if annotation_info:
                by_attribute = annotation_info.num_annotations_by_attribute
                annotation_cells = [annotation_info.num_annotations]
                annotation_cells += [
                    by_attribute[attribute] for attribute in attributes]
            else:
                annotation_cells = [None] * (1 + len(attributes))
            writer.writerow([*file_info, *annotation_cells])

            num_pages += 1
            total_html_file_size += file_info.html_file_size
            total_text_file_size += file_info.text_file_size
            total_infobox_count += file_info.infobox_count
            if file_info.is_disambiguation_page:
                num_disambiguation_pages += 1
            if file_info.infobox_count > 0:
                num_pages_with_infobox += 1

            # Annotation totals only count pages with at least one
            # annotation.
            has_annotations = (
                annotation_info is not None
                and annotation_info.num_annotations > 0)
            if has_annotations:
                num_pages_with_annotation += 1
                total_num_annotations += annotation_info.num_annotations
                _add_values_in_place(
                    total_num_annotations_by_attribute,
                    annotation_info.num_annotations_by_attribute)

    return _MakeCategoryDatasetCatalogs(
        category=category,
        num_pages=num_pages,
        total_html_file_size=total_html_file_size,
        total_text_file_size=total_text_file_size,
        num_disambiguation_pages=num_disambiguation_pages,
        total_infobox_count=total_infobox_count,
        num_pages_with_annotation=num_pages_with_annotation,
        num_pages_with_infobox=num_pages_with_infobox,
        num_attribute_types=len(attributes),
        total_num_annotations=total_num_annotations,
        num_annotations_by_attribute=total_num_annotations_by_attribute)