def part03():
    return parse_html_file('dicom_standard/standard/part03.html')
import sys
from typing import List, Tuple

from bs4 import BeautifulSoup, Tag

# Assumed project-local definitions: the shared parse library `pl`, the table
# helpers `attribute_table_to_list`, `table_to_dict`, and `TableDictType`, and
# the constants `TABLE_ID`, `COLUMN_TITLES`, and `IOD_ABBREVIATIONS` come from
# elsewhere in this project; their exact module paths are not shown here.


def get_table_and_tdiv(standard: BeautifulSoup) -> Tuple[List[TableDictType], Tag]:
    all_tables = standard.find_all('div', class_='table')
    html_table = pl.find_tdiv_by_id(all_tables, TABLE_ID)
    list_table = attribute_table_to_list(html_table)
    table_dict_list = table_to_dict(list_table, COLUMN_TITLES)
    return (table_dict_list, html_table)


def generate_ciod_id(name: str) -> str:
    # Drop everything from "IOD" onward, then map to an abbreviation if one exists.
    cleaned_name = name.split('IOD')[0].strip()
    return IOD_ABBREVIATIONS.get(cleaned_name, cleaned_name)


def table_to_json(table: List[TableDictType], tdiv: Tag) -> List[TableDictType]:
    attributes = []
    for row in table:
        row['ciod'] = generate_ciod_id(row['ciod'])
        attributes.append(row)
    return attributes


if __name__ == '__main__':
    standard = pl.parse_html_file(sys.argv[1])
    table, tdiv = get_table_and_tdiv(standard)
    parsed_table_data = table_to_json(table, tdiv)
    pl.write_pretty_json(parsed_table_data)
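# Example invocation (illustrative; the project's build normally drives this
# script, and the exact script and output names are assumptions):
#
#   python this_script.py dicom_standard/standard/part03.html > ciods.json
#
# The output is a JSON list of row dicts keyed by COLUMN_TITLES, with each
# row's 'ciod' value normalized by generate_ciod_id (e.g. an illustrative name
# like "US Image IOD Modules" would become "US Image", or its
# IOD_ABBREVIATIONS entry if one exists).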
        for section in all_sections
    }


def enclosing_section_from_id(id_div):
    # TODO: put example from the standard here
    # Walk up from the anchor div to its enclosing section element; the number
    # of .parent hops depends on how deeply each id type is nested in the
    # standard's generated HTML.
    if re.match(r'sect.*', id_div['id']):
        return id_div.parent.parent.parent.parent.parent
    elif re.match(r'biblio.*', id_div['id']):
        return id_div.parent.parent.parent
    elif re.match(r'table.*', id_div['id']):
        return id_div.parent.parent
    elif re.match(r'note.*', id_div['id']):
        return id_div.parent.parent.parent.parent.parent.parent
    else:
        return id_div.parent.parent


if __name__ == '__main__':
    # TODO: figure out a way to speed up the parsing; since we only need a
    # small portion of the parse tree, we may be able to use:
    # https://docs.python.org/3/library/html.parser.html to avoid building the
    # full parse tree.
    standard = {os.path.basename(f): parse_html_file(f) for f in sys.argv[1:]}
    section_ids = extract_section_ids(standard)
    sections = {
        page: normalize_sections(html)
        for page, html in section_ids.items()
    }
    write_pretty_json(sections)
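# Example invocation (illustrative; the actual file list and output name are
# assumptions):
#
#   python this_script.py dicom_standard/standard/part03.html dicom_standard/standard/part04.html > sections.json
#
# The output maps each input file's basename to its normalized sections,
# e.g. {"part03.html": {...}, "part04.html": {...}}.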