Beispiel #1
0
def thread(data, default=u"Untitled.", id=None):
    """
    Extract the thread id and the <h1> title from a web page.

    The title is *probably* the text of the H1 node which is nearest in
    context to the element carrying the `isso-thread` id. Only <div> and
    <section> elements are considered as that anchor element.

    Lookup order:

    1. No anchor element: return ``(id, default)``.
    2. A ``data-isso-id`` attribute on the anchor overrides `id` (unquoted).
    3. A ``data-title`` attribute on the anchor is returned (unquoted).
    4. Otherwise the anchor's subtree is searched for an H1, then the
       subtree of its parent, then of its grandparent, and so on; the
       stripped text content of the first H1 found is the title.
    5. No H1 anywhere: return ``(id, default)``.

    :param data: HTML markup, handed to ``html5lib.parse``.
    :param default: title to return when none can be determined.
    :param id: thread id to return unless the anchor provides its own.
    :return: ``(id, title)`` tuple.
    """

    document = html5lib.parse(data, treebuilder="dom")

    # Do not rely on <html> being the document's *last* child: nodes such as
    # a comment placed after </html> are attached to the document itself.
    root = document.documentElement
    if root is None:
        return id, default

    # aka getElementById, but limited to div and section tags
    anchor = next(
        (node
         for node in chain(*map(root.getElementsByTagName, ("div", "section")))
         if node.getAttribute("id") == "isso-thread"),
        None)

    if anchor is None:
        return id, default

    def find_h1(node, skip):
        """Return the first H1 below `node` in document order, or None.

        `skip` is the child subtree that was already searched on the way up
        and is therefore not descended into again.
        """
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child is skip:
                continue
            # Keep looking at the following siblings when this subtree has
            # no H1 instead of giving up after the first element child.
            found = find_h1(child, skip)
            if found is not None:
                return found
        return None

    def gettext(node):
        """Yield every text fragment below `node` in document order."""
        for child in node.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            elif child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    if anchor.hasAttribute("data-isso-id"):
        id = unquote(anchor.getAttribute("data-isso-id"))

    if anchor.hasAttribute("data-title"):
        return id, unquote(anchor.getAttribute("data-title"))

    node, came_from = anchor, None
    while node is not None:  # parentNode is None in the very end
        heading = find_h1(node, came_from)
        if heading is not None:
            return id, ''.join(gettext(heading)).strip()
        node, came_from = node.parentNode, node

    return id, default
Beispiel #2
0
def title(data, default=u"Untitled."):
    """
    Extract <h1> title from web page. The title is *probably* the text node,
    which is the nearest H1 node in context to an element with the `isso-thread` id.

    Only <div> and <section> elements are considered as the `isso-thread`
    element. If that element has a ``data-title`` attribute, its unquoted
    value is the title. Otherwise the element's subtree is searched for an
    H1, then the subtree of its parent, and so on up to the document; the
    stripped text of the first H1 found is returned. `default` is returned
    when there is no `isso-thread` element or no H1 at all.

    >>> title("asdf")  # doctest: +IGNORE_UNICODE
    'Untitled.'
    >>> title('''
    ... <html>
    ... <head>
    ...     <title>Foo!</title>
    ... </head>
    ... <body>
    ...     <header>
    ...         <h1>generic website title.</h1>
    ...         <h2>subtile title.</h2>
    ...     </header>
    ...     <article>
    ...         <header>
    ...             <h1>Can you find me?</h1>
    ...         </header>
    ...         <section id="isso-thread">
    ...         </section>
    ...     </article>
    ... </body>
    ... </html>''')  # doctest: +IGNORE_UNICODE
    'Can you find me?'
    >>> title('''
    ... <html>
    ... <body>
    ... <h1>I'm the real title!1
    ... <section data-title="No way%21" id="isso-thread">
    ... ''')  # doctest: +IGNORE_UNICODE
    'No way!'
    """

    document = html5lib.parse(data, treebuilder="dom")

    # Do not rely on <html> being the document's *last* child: nodes such as
    # a comment placed after </html> are attached to the document itself.
    root = document.documentElement
    if root is None:
        return default

    # aka getElementById, but limited to div and section tags
    anchor = next(
        (node
         for node in chain(*map(root.getElementsByTagName, ("div", "section")))
         if node.getAttribute("id") == "isso-thread"),
        None)

    if anchor is None:
        return default

    def find_h1(node, skip):
        """Return the first H1 below `node` in document order, or None.

        `skip` is the child subtree that was already searched on the way up
        and is therefore not descended into again.
        """
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child is skip:
                continue
            # Keep looking at the following siblings when this subtree has
            # no H1 instead of giving up after the first element child.
            found = find_h1(child, skip)
            if found is not None:
                return found
        return None

    def gettext(node):
        """Yield every text fragment below `node` in document order."""
        for child in node.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            elif child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    if anchor.hasAttribute("data-title"):
        return unquote(anchor.getAttribute("data-title"))

    node, came_from = anchor, None
    while node is not None:  # parentNode is None in the very end
        heading = find_h1(node, came_from)
        if heading is not None:
            return ''.join(gettext(heading)).strip()
        node, came_from = node.parentNode, node

    return default
Beispiel #3
0
def title(data, default=u"Untitled."):
    """
    Extract <h1> title from web page. The title is *probably* the text node,
    which is the nearest H1 node in context to an element with the `isso-thread` id.

    Only <div> and <section> elements are considered as the `isso-thread`
    element. Its subtree is searched for an H1, then the subtree of its
    parent, and so on up to the document; the stripped text of the first H1
    found is returned. `default` is returned when there is no `isso-thread`
    element or no H1 at all.

    >>> title("asdf")  # doctest: +IGNORE_UNICODE
    u'Untitled.'
    >>> title('''
    ... <html>
    ... <head>
    ...     <title>Foo!</title>
    ... </head>
    ... <body>
    ...     <header>
    ...         <h1>generic website title.</h1>
    ...         <h2>subtile title.</h2>
    ...     </header>
    ...     <article>
    ...         <header>
    ...             <h1>Can you find me?</h1>
    ...         </header>
    ...         <section id="isso-thread">
    ...         </section>
    ...     </article>
    ... </body>
    ... </html>''')  # doctest: +IGNORE_UNICODE
    u'Can you find me?'
    """

    document = html5lib.parse(data, treebuilder="dom")

    # Do not rely on <html> being the document's *last* child: nodes such as
    # a comment placed after </html> are attached to the document itself.
    root = document.documentElement
    if root is None:
        return default

    # aka getElementById, but limited to div and section tags
    anchor = next(
        (node
         for node in chain(*map(root.getElementsByTagName, ("div", "section")))
         if node.getAttribute("id") == "isso-thread"),
        None)

    if anchor is None:
        return default

    def find_h1(node, skip):
        """Return the first H1 below `node` in document order, or None.

        `skip` is the child subtree that was already searched on the way up
        and is therefore not descended into again.
        """
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child is skip:
                continue
            # Keep looking at the following siblings when this subtree has
            # no H1 instead of giving up after the first element child.
            found = find_h1(child, skip)
            if found is not None:
                return found
        return None

    def gettext(node):
        """Yield every text fragment below `node` in document order."""
        for child in node.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            elif child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    node, came_from = anchor, None
    while node is not None:  # parentNode is None in the very end
        heading = find_h1(node, came_from)
        if heading is not None:
            return ''.join(gettext(heading)).strip()
        node, came_from = node.parentNode, node

    return default