Exemple #1
0
def part03():
    """Return whatever ``parse_html_file`` yields for the Part 3 HTML file."""
    part03_path = 'dicom_standard/standard/part03.html'
    return parse_html_file(part03_path)

def get_table_and_tdiv(
        standard: BeautifulSoup) -> Tuple[List[TableDictType], Tag]:
    """Find the table div matching ``TABLE_ID`` and convert it to row dicts.

    Every ``<div class="table">`` in ``standard`` is collected and handed to
    ``pl.find_tdiv_by_id`` together with ``TABLE_ID``. The div it returns is
    converted with ``attribute_table_to_list`` and then with
    ``table_to_dict`` using ``COLUMN_TITLES``.

    Returns:
        A ``(rows, tdiv)`` tuple: the converted rows and the table div they
        were produced from.
    """
    tdiv = pl.find_tdiv_by_id(
        standard.find_all('div', class_='table'),
        TABLE_ID,
    )
    rows = table_to_dict(attribute_table_to_list(tdiv), COLUMN_TITLES)
    return (rows, tdiv)


def generate_ciod_id(name: str) -> str:
    """Derive a CIOD identifier from ``name``.

    The text before the first occurrence of ``'IOD'`` is taken and stripped
    of surrounding whitespace. If that cleaned text is a key of
    ``IOD_ABBREVIATIONS`` the mapped value is returned; otherwise the cleaned
    text itself is returned.
    """
    prefix, _, _ = name.partition('IOD')
    key = prefix.strip()
    return IOD_ABBREVIATIONS.get(key, key)


def table_to_json(table: List[TableDictType],
                  tdiv: Tag) -> List[TableDictType]:
    """Replace each row's ``'ciod'`` value with its generated CIOD id.

    Rows are updated in place (the dicts in ``table`` are modified) and are
    also collected, in order, into a new list that is returned.

    ``tdiv`` is accepted but not used by this function.
    """
    converted = []
    for entry in table:
        ciod_name = entry['ciod']
        entry['ciod'] = generate_ciod_id(ciod_name)
        converted += [entry]
    return converted


if __name__ == '__main__':
    # The first command-line argument is handed to pl.parse_html_file; the
    # resulting table is converted and emitted via pl.write_pretty_json.
    source_path = sys.argv[1]
    table_rows, table_div = get_table_and_tdiv(pl.parse_html_file(source_path))
    pl.write_pretty_json(table_to_json(table_rows, table_div))
Exemple #3
0
        for section in all_sections
    }


def enclosing_section_from_id(id_div):
    """Return the ancestor of ``id_div`` selected by the prefix of its id.

    ``id_div['id']`` is tested, in order, against a list of patterns anchored
    at the start of the string. The first pattern that matches determines how
    many ``.parent`` steps are taken from ``id_div``; when none matches, two
    steps are taken.
    """
    # TODO: put example from the standard here
    # (pattern, number of ``.parent`` hops), checked in this order.
    ancestor_depths = (
        (r'sect.*', 5),
        (r'biblio.*', 3),
        (r'table.*', 2),
        (r'note.*', 6),
    )
    element_id = id_div['id']
    hops = 2  # fallback when no pattern matches
    for pattern, depth in ancestor_depths:
        if re.match(pattern, element_id):
            hops = depth
            break

    ancestor = id_div
    for _ in range(hops):
        ancestor = ancestor.parent
    return ancestor


if __name__ == '__main__':
    # TODO: figure out a way to speed up the parsing; since we only need a
    # small portion of the parse tree, we may be able to use:
    # https://docs.python.org/3/library/html.parser.html to avoid building the
    # full parse tree.

    # Parse every file given on the command line, keyed by its base name.
    standard = {}
    for html_path in sys.argv[1:]:
        page_name = os.path.basename(html_path)
        standard[page_name] = parse_html_file(html_path)

    section_ids = extract_section_ids(standard)

    # Normalize the extracted sections page by page before writing them out.
    sections = {}
    for page, html in section_ids.items():
        sections[page] = normalize_sections(html)
    write_pretty_json(sections)