def thread(data, default=u"Untitled.", id=None):
    """
    Extract the thread id and title from a web page.

    The title is *probably* the text of the H1 node which is nearest in
    context to the element with the `isso-thread` id: H1 nodes are looked
    for below that element first, then below each of its ancestors in turn.

    Two attributes on the `isso-thread` element take precedence (both are
    URL-unquoted): `data-isso-id` replaces the `id` argument, and
    `data-title` is returned as title without looking for an H1 at all.

    :param data: HTML markup, handed to ``html5lib.parse``.
    :param default: title returned when no title can be determined.
    :param id: thread id returned unless `data-isso-id` overrides it.
    :return: an ``(id, title)`` tuple.
    """

    html = html5lib.parse(data, treebuilder="dom")

    assert html.lastChild.nodeName == "html"
    html = html.lastChild

    # aka getElementById, but limited to div and section tags
    candidates = [
        node
        for node in chain(*map(html.getElementsByTagName, ("div", "section")))
        if "id" in node.attributes
        and node.attributes["id"].value == "isso-thread"
    ]

    if not candidates:
        return id, default

    el = candidates[0]
    visited = []

    def recurse(node):
        """Return the first H1 below `node`, skipping visited subtrees."""
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                # Only stop when the subtree really contains an H1;
                # otherwise carry on with the remaining siblings instead of
                # giving up after the first element child.
                found = recurse(child)
                if found is not None:
                    return found
        return None

    def gettext(rv):
        """Yield every text fragment below `rv` in document order."""
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    # Both attributes are optional: a missing one keeps the fallback.
    try:
        id = unquote(el.attributes["data-isso-id"].value)
    except (KeyError, AttributeError):
        pass

    try:
        return id, unquote(el.attributes["data-title"].value)
    except (KeyError, AttributeError):
        pass

    while el is not None:  # el.parentNode is None in the very end
        # Mark the subtree as searched so that the next (outer) round does
        # not descend into it again.
        visited.append(el)
        rv = recurse(el)

        if rv is not None:
            return id, ''.join(gettext(rv)).strip()

        el = el.parentNode

    return id, default
def title(data, default=u"Untitled."):
    """
    Extract <h1> title from web page. The title is *probably* the text node,
    which is the nearest H1 node in context to an element with the
    `isso-thread` id. H1 nodes are looked for below that element first, then
    below each of its ancestors in turn. A `data-title` attribute on the
    `isso-thread` element takes precedence and is returned URL-unquoted.

    :param data: HTML markup, handed to ``html5lib.parse``.
    :param default: title returned when no title can be determined.
    :return: the title text.

    >>> title("asdf") # doctest: +IGNORE_UNICODE
    'Untitled.'
    >>> title('''
    ... <html>
    ... <head>
    ...     <title>Foo!</title>
    ... </head>
    ... <body>
    ...     <header>
    ...         <h1>generic website title.</h1>
    ...         <h2>subtile title.</h2>
    ...     </header>
    ...     <article>
    ...         <header>
    ...             <h1>Can you find me?</h1>
    ...         </header>
    ...         <section id="isso-thread">
    ...         </section>
    ...     </article>
    ... </body>
    ... </html>''') # doctest: +IGNORE_UNICODE
    'Can you find me?'
    >>> title('''
    ... <html>
    ... <body>
    ... <h1>I'm the real title!1
    ... <section data-title="No way%21" id="isso-thread">
    ... ''') # doctest: +IGNORE_UNICODE
    'No way!'
    >>> title('''
    ... <html>
    ... <body>
    ...     <h1>generic website title.</h1>
    ...     <article>
    ...         <div class="meta"></div>
    ...         <h1>Not below the first child.</h1>
    ...         <section id="isso-thread"></section>
    ...     </article>
    ... </body>
    ... </html>''') # doctest: +IGNORE_UNICODE
    'Not below the first child.'
    """

    html = html5lib.parse(data, treebuilder="dom")

    assert html.lastChild.nodeName == "html"
    html = html.lastChild

    # aka getElementById, but limited to div and section tags
    candidates = [
        node
        for node in chain(*map(html.getElementsByTagName, ("div", "section")))
        if "id" in node.attributes
        and node.attributes["id"].value == "isso-thread"
    ]

    if not candidates:
        return default

    el = candidates[0]
    visited = []

    def recurse(node):
        """Return the first H1 below `node`, skipping visited subtrees."""
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                # Only stop when the subtree really contains an H1;
                # otherwise carry on with the remaining siblings instead of
                # giving up after the first element child.
                found = recurse(child)
                if found is not None:
                    return found
        return None

    def gettext(rv):
        """Yield every text fragment below `rv` in document order."""
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    # The attribute is optional: without it, fall back to the H1 search.
    try:
        return unquote(el.attributes["data-title"].value)
    except (KeyError, AttributeError):
        pass

    while el is not None:  # el.parentNode is None in the very end
        # Mark the subtree as searched so that the next (outer) round does
        # not descend into it again.
        visited.append(el)
        rv = recurse(el)

        if rv is not None:
            return ''.join(gettext(rv)).strip()

        el = el.parentNode

    return default
def title(data, default=u"Untitled."):
    """
    Extract <h1> title from web page. The title is *probably* the text node,
    which is the nearest H1 node in context to an element with the
    `isso-thread` id. H1 nodes are looked for below that element first, then
    below each of its ancestors in turn.

    :param data: HTML markup, handed to ``html5lib.parse``.
    :param default: title returned when no title can be determined.
    :return: the title text.

    >>> title("asdf") # doctest: +IGNORE_UNICODE
    u'Untitled.'
    >>> title('''
    ... <html>
    ... <head>
    ...     <title>Foo!</title>
    ... </head>
    ... <body>
    ...     <header>
    ...         <h1>generic website title.</h1>
    ...         <h2>subtile title.</h2>
    ...     </header>
    ...     <article>
    ...         <header>
    ...             <h1>Can you find me?</h1>
    ...         </header>
    ...         <section id="isso-thread">
    ...         </section>
    ...     </article>
    ... </body>
    ... </html>''') # doctest: +IGNORE_UNICODE
    u'Can you find me?'
    >>> title('''
    ... <html>
    ... <body>
    ...     <h1>generic website title.</h1>
    ...     <article>
    ...         <div class="meta"></div>
    ...         <h1>Not below the first child.</h1>
    ...         <section id="isso-thread"></section>
    ...     </article>
    ... </body>
    ... </html>''') # doctest: +IGNORE_UNICODE
    u'Not below the first child.'
    """

    html = html5lib.parse(data, treebuilder="dom")

    assert html.lastChild.nodeName == "html"
    html = html.lastChild

    # aka getElementById, but limited to div and section tags
    candidates = [
        node
        for node in chain(*map(html.getElementsByTagName, ("div", "section")))
        if "id" in node.attributes
        and node.attributes["id"].value == "isso-thread"
    ]

    if not candidates:
        return default

    el = candidates[0]
    visited = []

    def recurse(node):
        """Return the first H1 below `node`, skipping visited subtrees."""
        for child in node.childNodes:
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName.upper() == "H1":
                return child
            if child not in visited:
                # Only stop when the subtree really contains an H1;
                # otherwise carry on with the remaining siblings instead of
                # giving up after the first element child.
                found = recurse(child)
                if found is not None:
                    return found
        return None

    def gettext(rv):
        """Yield every text fragment below `rv` in document order."""
        for child in rv.childNodes:
            if child.nodeType == child.TEXT_NODE:
                yield child.nodeValue
            if child.nodeType == child.ELEMENT_NODE:
                for item in gettext(child):
                    yield item

    while el is not None:  # el.parentNode is None in the very end
        # Mark the subtree as searched so that the next (outer) round does
        # not descend into it again.
        visited.append(el)
        rv = recurse(el)

        if rv is not None:
            return ''.join(gettext(rv)).strip()

        el = el.parentNode

    return default