def test_name_empty_headings(self):
    """
    Checks that a heading with no text content still gets a
    non-empty name
    """
    tree = parse('<h1><img src="" /></h1>')
    first_item = get_structure(tree)[0]
    self.assertTrue(len(first_item.name) > 0)
def test_name_empty_para(self):
    """
    Checks that a paragraph with no text content still gets a
    non-empty name
    """
    tree = parse('<p><img src="" /></p>')
    first_item = get_structure(tree)[0]
    self.assertTrue(len(first_item.name) > 0)
def test_get_parent(self):
    """
    Tests that get_parent returns the <a> element when asked for the
    parent of its child <b2>
    """
    tree = parse("<a><b1></b1><b2></b2></a>")
    child = tree.find(".//b2")
    found_parent = get_parent(tree, child)
    self.assertEqual(found_parent, tree.find(".//a"))
def test_get_index(self):
    """
    Tests that get_index returns the position of a node amongst its
    siblings (<b2> is the second child of <a>, so index 1)
    """
    tree = parse("<a><b1></b1><b2></b2></a>")
    second_child = tree.find(".//b2")
    parent = get_parent(tree, second_child)
    position = get_index(parent, second_child)
    self.assertEqual(1, position)
def extract_structure(content):
    """
    Parses 'content' (with cleaning enabled) and returns the structure
    that get_structure finds in it: the H1, H2, etc headings and other
    block level elements.

    get_structure is called with assert_structure=True.
    """
    # Nothing outside this module calls this function any more. It is
    # kept because the tests written against it are a useful check on
    # the behaviour of get_structure.
    return get_structure(parse(content, clean=True), assert_structure=True)
def format_html(html, styleinfo, return_tree=False, pretty_print=False):
    """
    Formats the XHTML given using a dictionary of style information.

    The dictionary has keys which are the ids of sections, and values
    which are lists of CSS classes or special commands.

    Arguments:

    html -- the XHTML source to format.
    styleinfo -- the style dictionary described above. It is passed
        through _sanitise_styleinfo before being used.
    return_tree -- if true, return a tuple (rendered tree, structure)
        instead of serialised HTML.
    pretty_print -- if true, indent() is applied to the rendered tree
        before it is returned/serialised.

    Returns html_extract(rendered) by default, or the tuple
    (rendered, structure) when return_tree is true.

    NOTE(review): the original comments say BadStructure is raised by
    later steps for block level elements that break the layout; that
    happens in helpers outside this function.
    """
    layout_strategy = get_layout_details_strategy()
    html = layout_strategy.format_pre_parse_hacks(html, styleinfo)
    root = parse(html, clean=True)
    root = layout_strategy.format_post_parse_hacks(root, styleinfo)
    structure = get_structure(root, assert_structure=True)
    structure = layout_strategy.format_structure_hacks(structure, styleinfo)
    sect_ids = [s.sect_id for s in structure]
    styleinfo = _sanitise_styleinfo(styleinfo, sect_ids)

    # Strip existing divs, otherwise we cannot format properly.  If
    # there are other block level elements that mess things up, we
    # raise BadStructure later, but divs have no semantics so can just
    # be removed.
    strip_presentation(root)

    # Apply normal CSS classes.
    for si in structure:
        # Apply css styles
        classes = get_classes_from_presinfo(styleinfo[si.sect_id])
        # Sorted so that the emitted class attribute is deterministic.
        classes.sort()
        if classes:
            si.node.set("class", " ".join(classes))

    # Create layout from row/column commands
    layout = create_layout(root, styleinfo, structure)
    for c in layout.content:
        check_layout(c, structure, layout_strategy)

    # Create new ET tree from layout.  The individual nodes that belong to
    # 'root' are not altered, but just added to a new tree.  This means that
    # the information in 'structure' does not need updating.
    nodes = []
    for content in layout.content:
        nodes.extend(content.as_nodes(layout_strategy))
    rendered = ET.fromstring("<html><body></body></html>")
    # rendered[0] is the <body> element.  Indexing is used instead of
    # Element.getchildren(), which is deprecated and was removed from
    # xml.etree.ElementTree in Python 3.9.
    rendered[0].extend(nodes)

    # Apply hacks
    rendered = layout_strategy.format_post_layout_hacks(
        rendered, structure, styleinfo)

    # Pretty print
    if pretty_print:
        indent(rendered)

    # Remove the temporary IDs we may have added when splitting the HTML
    # into content and presentation.  We don't do this before this point,
    # as the IDs need to be there to identify sections
    for si in structure:
        if 'id' in si.node.attrib:
            del si.node.attrib['id']

    if return_tree:
        return (rendered, structure)
    else:
        return html_extract(rendered)
def test_regression_1(self):
    """
    Regression test for a bug in using existing section ids: the
    heading without an id must get a fresh id ("h1_4") that does not
    clash with the ids already present on the other headings.
    """
    html = (
        '<h1 id="h1_1">heading 1</h1>'
        '<h1>A new heading</h1>'
        '<h1 id="h1_2">heading 2</h1>'
        '<h1 id="h1_3">heading 3</h1>'
    )
    found_ids = [item.sect_id for item in get_structure(parse(html))]
    self.assertEqual(["h1_1", "h1_4", "h1_2", "h1_3"], found_ids)
def test_dont_use_duplicate_existing_sect_id(self):
    """
    When two headings carry the same id, only the first keeps it as
    its sect_id; the second is given a different one.
    """
    tree = parse("<h1 id='h1_10'>Hi</h1><h1 id='h1_10'>There</h1>")
    items = get_structure(tree)
    self.assertEqual(items[0].sect_id, "h1_10")
    self.assertEqual(items[1].sect_id, "h1_1")
def pretty_print(content):
    """
    Parses 'content', runs indent() over the resulting tree and returns
    the output of html_extract for that tree.
    """
    tree = parse(content)
    # indent()'s return value is not used; the tree itself is serialised
    # afterwards.
    indent(tree)
    return html_extract(tree)
def format_html(html, styleinfo, return_tree=False, pretty_print=False):
    """
    Formats the XHTML given using a dictionary of style information.

    The dictionary has keys which are the ids of sections, and values
    which are lists of CSS classes or special commands.

    Arguments:

    html -- the XHTML source to format.
    styleinfo -- the style dictionary described above. It is passed
        through _sanitise_styleinfo before being used.
    return_tree -- if true, return a tuple (rendered tree, structure)
        instead of serialised HTML.
    pretty_print -- if true, indent() is applied to the rendered tree
        before it is returned/serialised.

    Returns html_extract(rendered) by default, or the tuple
    (rendered, structure) when return_tree is true.

    NOTE(review): the original comments say BadStructure is raised by
    later steps for block level elements that break the layout; that
    happens in helpers outside this function.
    """
    layout_strategy = get_layout_details_strategy()
    html = layout_strategy.format_pre_parse_hacks(html, styleinfo)
    root = parse(html, clean=True)
    root = layout_strategy.format_post_parse_hacks(root, styleinfo)
    structure = get_structure(root, assert_structure=True)
    structure = layout_strategy.format_structure_hacks(structure, styleinfo)
    sect_ids = [s.sect_id for s in structure]
    styleinfo = _sanitise_styleinfo(styleinfo, sect_ids)

    # Strip existing divs, otherwise we cannot format properly.  If
    # there are other block level elements that mess things up, we
    # raise BadStructure later, but divs have no semantics so can just
    # be removed.
    strip_presentation(root)

    # Apply normal CSS classes.
    for si in structure:
        # Apply css styles
        classes = get_classes_from_presinfo(styleinfo[si.sect_id])
        # Sorted so that the emitted class attribute is deterministic.
        classes.sort()
        if classes:
            si.node.set("class", " ".join(classes))

    # Create layout from row/column commands
    layout = create_layout(root, styleinfo, structure)
    for c in layout.content:
        check_layout(c, structure, layout_strategy)

    # Create new ET tree from layout.  The individual nodes that belong to
    # 'root' are not altered, but just added to a new tree.  This means that
    # the information in 'structure' does not need updating.
    nodes = []
    for content in layout.content:
        nodes.extend(content.as_nodes(layout_strategy))
    rendered = ET.fromstring("<html><body></body></html>")
    # rendered[0] is the <body> element.  Indexing is used instead of
    # Element.getchildren(), which is deprecated and was removed from
    # xml.etree.ElementTree in Python 3.9.
    rendered[0].extend(nodes)

    # Apply hacks
    rendered = layout_strategy.format_post_layout_hacks(
        rendered, structure, styleinfo)

    # Pretty print
    if pretty_print:
        indent(rendered)

    # Remove the temporary IDs we may have added when splitting the HTML
    # into content and presentation.  We don't do this before this point,
    # as the IDs need to be there to identify sections
    for si in structure:
        if 'id' in si.node.attrib:
            del si.node.attrib['id']

    if return_tree:
        return (rendered, structure)
    else:
        return html_extract(rendered)
def clean_html(html):
    """
    Parses 'html' with cleaning enabled and returns the output of
    html_extract for the resulting tree.
    """
    return html_extract(parse(html, clean=True))
def extract_presentation(html):
    """
    Takes HTML with formatting applied and returns presentation elements (a
    dictionary with keys = section names, values = set of classes/commands)
    and the HTML without formatting (ready to be used in an editor)

    Returns a tuple (pres, out_html).  Besides one entry per section,
    'pres' gets one extra entry per layout command found for a section,
    keyed by the command's prefix followed by the section id.
    """
    # TODO: this function is not brilliantly well defined e.g.  should
    # there be an entry in the dictionary for sections with no
    # formatting?  This does not affect functionality, but it does
    # affect tests.
    layout_strategy = get_layout_details_strategy()
    html = layout_strategy.extract_pre_parse_hacks(html)
    root = parse(html, clean=False)  # it's important we don't clean.
    root = layout_strategy.extract_post_parse_hacks(root)
    structure = get_structure(root)
    structure = layout_strategy.extract_structure_hacks(structure)
    pres = {}
    layout_commands = find_all_layout_nodes(root, layout_strategy)
    for si in structure:
        pres[si.sect_id] = set()

        # Section - extract classes
        for c in get_classes_for_node(si.node):
            pres[si.sect_id].add(PresentationClass(c))
            # NOTE(review): the attribute is removed from inside the loop,
            # so it is only dropped when at least one class was found --
            # confirm this nesting against the original file.
            if 'class' in si.node.attrib:
                del si.node.attrib['class']

        # Add custom ids.  These are only for purpose of editing,
        # and will be removed again at end of format_html
        si.node.set('id', si.sect_id)

        # Now, deal with layout divs for this structure item
        cmd_pairs = layout_commands.get(si.node, [])
        for cmd, div_node in cmd_pairs:
            # Need to create another entry in pres
            pres_name = cmd.prefix + si.sect_id
            cmd_classes = set()
            # Find the classes that correspond to PresentationClass objects
            # and add them.
            node_classes = set(get_classes_for_node(div_node))
            if cmd in (NEWROW, NEWINNERROW):
                filterfunc = layout_strategy.is_row_class
            else:
                filterfunc = layout_strategy.is_column_class
                # Need the classes from the inner column div.  len() and
                # indexing are used instead of Element.getchildren(), which
                # is deprecated and was removed from xml.etree.ElementTree
                # in Python 3.9.
                if len(div_node) > 0 and div_node[0].tag == 'div':
                    node_classes |= set(get_classes_for_node(div_node[0]))
            # Classes that the strategy itself uses for rows/columns are
            # layout artefacts; only the remaining ones are recorded.
            for c in node_classes:
                if not filterfunc(c):
                    cmd_classes.add(PresentationClass(c))

            cmd_classes.add(cmd)  # not strictly necessary, but helps testing
            pres[pres_name] = cmd_classes

    strip_presentation(root)
    out_html = html_extract(root)

    return (pres, out_html)