コード例 #1
0
ファイル: 07-dom.py プロジェクト: yangzilong1986/pattern
import os, sys
sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import URL, DOM, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The DOM object can be used for this, similar to the Javascript DOM.

# For example:
# Download the Reddit "top" page (with cached=True) and parse the HTML into a DOM tree.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
# Show the class of the parsed body content.
# NOTE: print(x) with a single argument behaves the same on Python 2 and 3,
# whereas the "print x" statement form is a SyntaxError on Python 3.
print(dom.body.content.__class__)
for e in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:  # First <a class="title"> in entry.
        print(plaintext(a.content))  # Link text with the markup stripped.
        print(a.attributes["href"])  # Link target.
        print("")  # Blank line between entries.

# Some of the links can be relative, for example starting with "../".
# We could get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URLs.
# A good way to get absolute URLs is to use the module's abs() function:
# NOTE: this import shadows the builtin abs() for the rest of the module.
from pattern.web import abs
url = URL("http://nodebox.net")
# Resolve the href of every <a> element (an empty string when the attribute
# is missing) against url.redirect when it is set, otherwise against url.string.
for link in DOM(url.download()).by_tag("a"):
    link = abs(link.attributes.get("href", ""), base=url.redirect or url.string)
    # Uncomment to show each absolute URL:
    #print link
コード例 #2
0
ファイル: 12-dom.py プロジェクト: sarawutc/pattern
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="entry">
#     <p class="title">
#         <a class="title " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
# Download the Reddit "top" page (with cached=True) and parse the HTML into a DOM tree.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
# Uncomment to inspect the parsed body content:
# print(dom.body.content)
# NOTE: print(x) with a single argument behaves the same on Python 2 and 3,
# whereas the "print x" statement form is a SyntaxError on Python 3.
for e in dom.get_elements_by_tagname("div.entry")[:5]:  # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:  # First <a class="title"> in entry.
        print(plaintext(a.content))  # Link text with the markup stripped.
        print(a.attributes["href"])  # Link target.
        print("")  # Blank line between entries.

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We could get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URLs.
# A good way to get absolute URLs is to use the module's abs() function:
from pattern.web import abs

url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attributes.get("href", "")