Python getNodeContentsAsText Examples

Programming Language: Python

Namespace/Package Name: roast.htmlutil

Method/Function: getNodeContentsAsText

Examples at hotexamples.com: 2

Python getNodeContentsAsText - 2 examples found. These are the top-rated real-world Python examples of roast.htmlutil.getNodeContentsAsText, extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: explicit_navi.py Project: mchubby/roast

def get_sections(dom):
    """Yield one navigation entry per subtitle and per section found in dom.

    Each entry is a dict with the keys ``link``, ``title`` and
    ``description``.  ``link`` defaults to ``'/'`` plus the lower-cased
    title and is overridden by a ``link:`` field when one is present;
    a leading ``'/'`` is added to the override if it lacks one.
    ``description`` is the text of the first applicable ``<p>`` element,
    or None when there is none.

    Entries for elements of class ``subtitle`` are yielded first, then
    entries for elements of class ``section``.

    Raises AssertionError when the document does not have the expected
    shape (plain text right after a subtitle, a table after a subtitle
    that is not of class ``docinfo``, or a field whose name is not
    ``link:``).  These checks are plain ``assert`` statements, so they
    are not performed when Python runs with ``-O``.
    """
    # special case where a lone header is lifted to be
    # subtitle.. didn't really want that to happen, but
    # it's easier to work around.. who has a single
    # navigational element, anyway?
    for elem in htmlutil.getElementsByClass(dom, 'subtitle'):
        title = htmlutil.getNodeContentsAsText(elem)
        siblings = elem.parentNode.childNodes
        idx = siblings.index(elem)
        description = None
        link = '/' + title.lower()

        # An optional docinfo table directly after the subtitle may
        # carry an explicit link.
        idx = _skip_blank_text_nodes(siblings, idx)
        if idx + 1 < len(siblings):
            node = siblings[idx + 1]
            if (node.nodeType == node.ELEMENT_NODE
                    and node.nodeName == 'table'):
                assert node.getAttribute('class') == 'docinfo'
                link = _link_from_fields(node, link)
                # step over the table so the paragraph lookup below
                # starts after it
                idx += 1

        # The next element, if it is a paragraph, is the description.
        idx = _skip_blank_text_nodes(siblings, idx)
        if idx + 1 < len(siblings):
            node = siblings[idx + 1]
            if (node.nodeType == node.ELEMENT_NODE
                    and node.nodeName == 'p'):
                description = htmlutil.getNodeContentsAsText(node)

        yield dict(
            link=link,
            title=title,
            description=description,
            )

    for section in htmlutil.getElementsByClass(dom, 'section'):
        h1 = htmlutil.getOnlyElementByTagName(section, 'h1')
        a = htmlutil.getOnlyElementByTagName(h1, 'a')
        title = htmlutil.getNodeContentsAsText(a)
        link = '/' + title.lower()

        # With several field lists, the last ``link:`` field wins.
        for table in htmlutil.getElementsByClass(section, 'field-list'):
            link = _link_from_fields(table, link)

        paragraphs = section.getElementsByTagName('p')
        if paragraphs:
            description = htmlutil.getNodeContentsAsText(paragraphs[0])
        else:
            description = None

        yield dict(
            link=link,
            title=title,
            description=description,
            )


def _skip_blank_text_nodes(siblings, idx):
    """Return idx advanced past the text nodes that follow siblings[idx].

    Every skipped text node must serialize to whitespace only;
    otherwise the assert fails with the offending text in its message.
    """
    while idx + 1 < len(siblings):
        node = siblings[idx + 1]
        if node.nodeType != node.TEXT_NODE:
            break
        buf = StringIO()
        node.writexml(buf)
        assert buf.getvalue().strip() == '', \
            'Subtitle cannot be followed by plain text: %r' % buf.getvalue()
        idx += 1
    return idx


def _link_from_fields(container, link):
    """Return the link given by the ``field`` elements inside container.

    ``link`` is returned unchanged when container has no fields.  Each
    field must be named ``link:``; the last one wins, and a leading
    ``'/'`` is added when missing.
    """
    for field in htmlutil.getElementsByClass(container, 'field'):
        # TODO use classes for locating th and td, just
        # missing getOnlyElementByClass
        th = htmlutil.getOnlyElementByTagName(field, 'th')
        name = htmlutil.getNodeContentsAsText(th)
        assert name == 'link:'
        td = htmlutil.getOnlyElementByTagName(field, 'td')
        link = htmlutil.getNodeContentsAsText(td)
        if not link.startswith('/'):
            link = '/' + link
    return link

Example #2

Show file

File: rst.py Project: mchubby/roast

def asDOM(
    text,
    source_path=None,
    template=None,
    flavor=None,
    s5_theme_url=None,
    navigation=None,
    operation=None,
    ):
    """Publish reStructuredText and return the result as a minidom tree.

    Parameters:
      text -- the source handed to the publisher.
      source_path -- passed through to the publisher unchanged.
      template -- optional template source; when given (``'html'``
        flavor only) the document body is rendered through ``Template``
        and the flattened result is re-parsed.  Must be None for the
        ``'s5'`` flavor.
      flavor -- ``'html'`` (also used when None) or ``'s5'``.
      s5_theme_url -- theme URL for the ``'s5'`` flavor; required there.
      navigation -- passed to ``Template`` together with the title.
      operation -- stored in the publisher settings as
        ``roast_operation``.

    Returns the parsed tree after ``htmlutil.fixXMLTags`` has been
    applied to it.

    Raises ValueError for an unknown flavor, and AssertionError when the
    ``'s5'`` flavor is combined with a template or lacks a theme URL.
    """
    if flavor is None:
        flavor = 'html'

    # Reject bad input before doing any work.  This used to be
    # ``raise '<string>'``, which is itself a TypeError on any Python
    # that requires exceptions to derive from BaseException, so the
    # intended message never reached the caller.
    if flavor not in ('html', 's5'):
        raise ValueError('Unknown RST flavor: %r' % (flavor,))

    settings = dict(
        input_encoding='utf-8',
        output_encoding='utf-8',
        embed_stylesheet=False,
        stylesheet_path=htmlutil.KLUDGE_KILL_CSS,
        generator=False,

        # TODO file insertion should really be disabled
        # but can't do that now, as that would make the
        # original include directive fail.. also can't
        # just temporarily enable it to kludge, as that
        # would mean the included file sees it as fully
        # enabled.. will have to reimplement include.

#         file_insertion_enabled=0,

        # TODO ponder disabling raw; it allows content creators to
        # attack the site

#         raw_enabled=0,

        _disable_config=1,
        roast_operation=operation,
        )

    if flavor == 's5':
        writer = s5_html.Writer()
        assert template is None
        assert s5_theme_url is not None
        settings.update(
            theme=None,
            theme_url=s5_theme_url,
            current_slide=True,
            )
    else:
        # flavor == 'html'; anything else was rejected above
        writer = html4css1.Writer()

    # Docutils stores default `foo` role in global state that persists
    # from one parser to another. Parsing directive "default-role"
    # sets that, usually from s5defs.txt. To avoid infecting all
    # latter runs (`foo` will create <span
    # class="incremental">foo</span> instead of <cite>foo</cite>), we
    # try to contain the damage, and restore the default role to
    # original settings before every run.
    try:
        del roles._roles['']
    except KeyError:
        pass

    html, publisher = publish_programmatically(
        source_class=io.StringInput,
        source=text,
        source_path=source_path,
        destination_class=io.StringOutput,
        destination=None,
        destination_path=None,
        reader=None,
        reader_name='standalone',
        parser=None,
        parser_name='restructuredtext',
        writer=writer,
        writer_name=None,
        settings=None,
        settings_spec=None,
        settings_overrides=settings,
        config_section=None,
        enable_exit_status=None)

    tree = minidom.parseString(html)
    title = htmlutil.getTitle(tree)
    title = htmlutil.getNodeContentsAsText(title)

    # kill generator meta tag
    htmlutil.killGeneratorMetaTags(tree)

    # kill stylesheet
    htmlutil.killLinkedStylesheet(tree)

    if flavor == 'html':
        body = htmlutil.getOnlyElementByTagName(tree, 'body')

        # narrow down to the single "document" wrapper when there is
        # exactly one; otherwise keep working on the whole body
        docs = htmlutil.getElementsByClass(body, 'document')
        if len(docs) == 1:
            body = docs[0]

        # remove the headings rst promoted to top level,
        # the template will take care of that
        for h1 in body.getElementsByTagName('h1'):
            if htmlutil.elementHasClass(h1, 'title'):
                h1.parentNode.removeChild(h1)
                break

        if template is not None:
            template = Template(original=body,
                                docFactory=loaders.xmlstr(template),
                                title=title,
                                navigation=navigation,
                                )
            html = flat.flatten(template)
            tree = minidom.parseString(html)

    htmlutil.fixXMLTags(tree)
    return tree