def _skip_blank_text_nodes(siblings, idx):
    """Advance idx past the whitespace-only text nodes that follow it.

    ``siblings`` is the child node list of the parent element and
    ``idx`` the position of the node to start after.  Returns the
    index of the last consecutive text node skipped, or ``idx``
    unchanged when the next sibling is not a text node.

    Asserts that every skipped text node serializes to nothing but
    whitespace.
    """
    while idx + 1 < len(siblings):
        node = siblings[idx+1]
        if node.nodeType != node.TEXT_NODE:
            break
        buf = StringIO()
        node.writexml(buf)
        assert buf.getvalue().strip() == '', \
            'Subtitle cannot be followed by plain text: %r' % buf.getvalue()
        idx += 1
    return idx


def _link_from_fields(container, link):
    """Return the link named by the fields under container.

    Every element of class ``field`` below ``container`` must be a
    ``link:`` field (asserted); the text of its ``td`` becomes the
    link, prefixed with ``/`` when it does not already start with
    one.  If several fields are present the last one wins.  With no
    fields at all the ``link`` passed in is returned unchanged.
    """
    for field in htmlutil.getElementsByClass(container, 'field'):
        # TODO use classes for locating th and td, just
        # missing getOnlyElementByClass
        th = htmlutil.getOnlyElementByTagName(field, 'th')
        name = htmlutil.getNodeContentsAsText(th)
        assert name == 'link:'
        td = htmlutil.getOnlyElementByTagName(field, 'td')
        link = htmlutil.getNodeContentsAsText(td)
        if not link.startswith('/'):
            link = '/'+link
    return link


def get_sections(dom):
    """Yield one dict per navigational section found in dom.

    Each yielded dict has the keys ``link``, ``title`` and
    ``description``.  ``link`` defaults to ``'/'`` plus the
    lowercased title and can be overridden by a ``link:`` field;
    ``description`` is the text of a paragraph when one is found,
    otherwise ``None``.

    Elements of class ``subtitle`` are yielded first, followed by
    elements of class ``section``.

    Raises AssertionError when the markup does not have the expected
    shape (plain text after a subtitle, a table following a subtitle
    that is not of class ``docinfo``, or a field other than
    ``link:``).
    """
    # special case where a lone header is lifted to be
    # subtitle.. didn't really want that to happen, but
    # it's easier to work around.. who has a single
    # navigational element, anyway?
    for elem in htmlutil.getElementsByClass(dom, 'subtitle'):
        title = htmlutil.getNodeContentsAsText(elem)
        siblings = elem.parentNode.childNodes
        idx = siblings.index(elem)
        description = None
        link = '/'+title.lower()

        # an optional docinfo table right after the subtitle may
        # carry an explicit link
        idx = _skip_blank_text_nodes(siblings, idx)
        if idx + 1 < len(siblings):
            node = siblings[idx+1]
            if (node.nodeType == node.ELEMENT_NODE
                and node.nodeName == 'table'):
                assert node.getAttribute('class') == 'docinfo'
                link = _link_from_fields(node, link)
                # step over the table so the paragraph lookup
                # below starts after it
                idx += 1

        # an optional paragraph after that is the description
        idx = _skip_blank_text_nodes(siblings, idx)
        if idx + 1 < len(siblings):
            node = siblings[idx+1]
            if (node.nodeType == node.ELEMENT_NODE
                and node.nodeName == 'p'):
                description = htmlutil.getNodeContentsAsText(node)

        yield dict(
            link=link,
            title=title,
            description=description,
            )

    for section in htmlutil.getElementsByClass(dom, 'section'):
        h1 = htmlutil.getOnlyElementByTagName(section, 'h1')
        a = htmlutil.getOnlyElementByTagName(h1, 'a')
        title = htmlutil.getNodeContentsAsText(a)
        link = '/'+title.lower()

        for table in htmlutil.getElementsByClass(section, 'field-list'):
            link = _link_from_fields(table, link)

        # the first paragraph anywhere below the section is used as
        # its description
        paragraphs = section.getElementsByTagName('p')
        if paragraphs:
            description = htmlutil.getNodeContentsAsText(paragraphs[0])
        else:
            description = None

        yield dict(
            link=link,
            title=title,
            description=description,
            )
def asDOM(
    text,
    source_path=None,
    template=None,
    flavor=None,
    s5_theme_url=None,
    navigation=None,
    operation=None,
    ):
    """Render reStructuredText into a minidom DOM tree.

    ``text`` is the reStructuredText source and ``source_path`` is
    handed to docutils as the path of that source.

    ``flavor`` selects the docutils writer: ``'html'`` (the default
    when ``None``) or ``'s5'``.  The ``'s5'`` flavor requires
    ``s5_theme_url`` and does not accept a ``template``.

    For the ``'html'`` flavor the ``h1`` carrying class ``title`` is
    removed from the document body, and when ``template`` (an XML
    template string) is given, the body is rendered through it
    together with the document title and ``navigation``.

    ``operation`` is passed through to docutils as the
    ``roast_operation`` setting.

    Returns the resulting DOM tree after ``htmlutil.fixXMLTags`` has
    been applied to it.

    Raises ValueError for an unknown ``flavor``, and AssertionError
    when the ``'s5'`` flavor is combined with a template or lacks a
    theme URL.
    """
    if flavor is None:
        flavor = 'html'

    # Reject unknown flavors up front, before touching any docutils
    # state.  This used to be a string exception, which Python
    # rejects with an unrelated TypeError.
    if flavor not in ('html', 's5'):
        raise ValueError('Unknown RST flavor: %r' % flavor)

    settings = dict(
        input_encoding='utf-8',
        output_encoding='utf-8',
        embed_stylesheet=False,
        stylesheet_path=htmlutil.KLUDGE_KILL_CSS,
        generator=False,

        # TODO file insertion should really be disabled
        # but can't do that now, as that would make the
        # original include directive fail.. also can't
        # just temporarily enable it to kludge, as that
        # would mean the included file sees it as fully
        # enabled.. will have to reimplement include.
        # file_insertion_enabled=0,

        # TODO ponder disabling raw; it allows content creators to
        # attack the site
        # raw_enabled=0,

        _disable_config=1,
        roast_operation=operation,
        )

    if flavor == 's5':
        writer = s5_html.Writer()
        assert template is None
        assert s5_theme_url is not None
        settings.update(dict(
            theme=None,
            theme_url=s5_theme_url,
            current_slide=True,
            ))
    else:
        writer = html4css1.Writer()

    # Docutils stores default `foo` role in global state that persists
    # from one parser to another. Parsing directive "default-role"
    # sets that, usually from s5defs.txt. To avoid infecting all
    # latter runs (`foo` will create <span
    # class="incremental">foo</span> instead of <cite>foo</cite>), we
    # try to contain the damage, and restore the default role to
    # original settings before every run.
    try:
        del roles._roles['']
    except KeyError:
        pass

    html, _publisher = publish_programmatically(
        source_class=io.StringInput,
        source=text,
        source_path=source_path,
        destination_class=io.StringOutput,
        destination=None,
        destination_path=None,
        reader=None,
        reader_name='standalone',
        parser=None,
        parser_name='restructuredtext',
        writer=writer,
        writer_name=None,
        settings=None,
        settings_spec=None,
        settings_overrides=settings,
        config_section=None,
        enable_exit_status=None)

    tree = minidom.parseString(html)
    title = htmlutil.getTitle(tree)
    title = htmlutil.getNodeContentsAsText(title)

    # kill generator meta tag
    htmlutil.killGeneratorMetaTags(tree)

    # kill stylesheet
    htmlutil.killLinkedStylesheet(tree)

    if flavor == 'html':
        body = htmlutil.getOnlyElementByTagName(tree, 'body')

        # narrow down to the document element when there is exactly
        # one of them
        docs = htmlutil.getElementsByClass(body, 'document')
        if len(docs) == 1:
            body = docs[0]

        # remove the headings rst promoted to top level,
        # the template will take care of that
        for h1 in body.getElementsByTagName('h1'):
            if htmlutil.elementHasClass(h1, 'title'):
                h1.parentNode.removeChild(h1)
                break

        if template is not None:
            page = Template(original=body,
                            docFactory=loaders.xmlstr(template),
                            title=title,
                            navigation=navigation,
                            )
            html = flat.flatten(page)
            tree = minidom.parseString(html)

    htmlutil.fixXMLTags(tree)
    return tree