import os
import sys

# Put the directory two levels up at the front of the import path,
# presumably so that an uninstalled checkout of the pattern package is found.
# NOTE: the path is relative, so it resolves against the current working
# directory; run this script from the directory it lives in.
sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import URL, DOM, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The DOM object can be used for this, similar to the Javascript DOM.

# For example:
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
print(dom.body.content.__class__)
for entry in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for anchor in entry.get_elements_by_tagname("a.title")[:1]:  # First <a class="title"> in entry.
        print(plaintext(anchor.content))
        # Use .get() so an anchor without an href prints an empty line
        # instead of raising KeyError (same lookup style as the loop below).
        print(anchor.attributes.get("href", ""))
        print("")

# Some of the links can be relative, for example starting with "../".
# We can get the absolute URL by prepending the base URL.
# However, this might get messy with anchors, trailing slashes and redirected URLs.
# A good way to get absolute URLs is to use the module's abs() function:
from pattern.web import abs  # NOTE: from here on this name shadows the builtin abs().

url = URL("http://nodebox.net")
anchors = DOM(url.download()).by_tag("a")
# The base URL is not modified inside the loop, so resolve it once up front.
# Prefer the redirected location when there is one.
base = url.redirect or url.string
for anchor in anchors:
    href = anchor.attributes.get("href", "")
    link = abs(href, base=base)
    #print(link)
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="entry">
#     <p class="title">
#         <a class="title " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
# print(dom.body.content)
for entry in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for anchor in entry.get_elements_by_tagname("a.title")[:1]:  # First <a class="title"> in entry.
        print(plaintext(anchor.content))
        # Use .get() so an anchor without an href prints an empty line
        # instead of raising KeyError (same lookup style as the loop below).
        print(anchor.attributes.get("href", ""))
        print("")

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URLs.
# A good way to get absolute URLs is to use the module's abs() function:
from pattern.web import abs  # NOTE: from here on this name shadows the builtin abs().

url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attributes.get("href", "")