Exemple #1
0
    def test_hackernews(self):
        """Fetch the Hacker News front page, parse it with a rule set and print the results.

        This is a live smoke test: it downloads the page with ``url_get``,
        runs ``Parser`` over it using the ``hacker_news`` rules defined
        below, and prints every ``key : value`` pair that ``parse()``
        yields. It makes no assertions, so it only fails if fetching or
        parsing raises.
        """
        # Only the rule constructors actually used below are imported.
        from sasoup.baserules import xpath, xpaths, xpathz, search

        hacker_news = {
            # Intermediate field: the rows of the page's main table.
            # NOTE(review): the meaning of evalx="result" is defined by
            # sasoup and is not visible here -- confirm before changing.
            'fields_rules': {
                'news': xpaths("//body/center/table/tr", evalx="result"),
            },
            'result_rules': {
                'title': xpath("//span[@class='pagetop']/b/a"),
                # Regex rules: capture text beginning with the keyword that
                # sits between a '>' and the next '<'.
                'google': search(r'\>(Google.+?)\<'),
                'html5': search(r'\>(HTML5.+?)\<'),
                'facebook': xpath("//a[contains(text(),'Facebook')]"),
                # The three rules below pair the 'news' field with a
                # relative XPath (presumably evaluated per 'news' row --
                # verify against sasoup's xpathz implementation).
                'titles': xpathz(
                    'news',
                    xpaths(".//td[@class='title']/a")),
                'tsmps': xpathz(
                    'news',
                    xpaths(".//td[@class='subtext']/node()[4]")),
                'points': xpathz(
                    'news',
                    xpaths(".//td[@class='subtext']/span")),
            },
            # NOTE(review): each filter is a (None, expression) pair; how
            # the "result[2]" expression is applied is decided by Parser --
            # confirm there.
            'result_filters': {
                'titles': (None, "result[2]"),
                'tsmps': (None, "result[2]"),
                'points': (None, "result[2]"),
            }
        }

        url = 'https://news.ycombinator.com/'
        html = url_get(url)
        results = dict(Parser(html, hacker_news, 'utf-8').parse())
        for key, result in results.items():
            # Single-argument print(...) emits the same text under Python 2
            # and is also valid syntax under Python 3, unlike the bare
            # print statement.
            print('{} : {}'.format(key, result))
Exemple #2
0
base_fields :       页面基础字段
ajax_rules :        页面 ajax URL 构造规则
result_rules :      结果字段匹配规则
result_filters :    进一步处理匹配结果

每个匹配页面是一个 dict 对象,此对象可以继承
"""
import time

from sasoup.baserules import xpath, xpaths, search, dpath, base, addon, fields, which, ajaxurl, ajax, RespType
from sasoup.baserules import init_rules

# Base rule set for a Taobao item page.
# NOTE(review): how each section is consumed is decided by sasoup's parser,
# which is not visible here; the comments below only describe the literals.
top_item_base = {
    # No parent rule set (the module docstring says rule dicts can inherit).
    "super": None,
    # Page URL template; ``{iid}`` is the placeholder to fill in.
    "url_fmt": "http://item.taobao.com/item.htm?id={iid}",
    # A "siteId=<digit>" regex plus the LineZing div's itemid attribute.
    "page_rules": (
        search(r"siteId=\d"),
        xpath("//div[@id='LineZing']/@itemid"),
    ),
    # Template "B" is keyed on siteId=2, template "C" on siteId 1 or 3-7.
    "template_rules": {
        "B": (search(r"siteId=2"),),
        "C": (search(r"siteId=[134567]"),),
    },
    "fields_rules": {
        "microscope-data": xpath("//meta[@name='microscope-data']/@content"),
        # Captures the quoted value that follows a quoted 'exparams' key.
        "exparams": search(r'\([\'"]exparams[\'"]\,\s*[\'"](.+?)[\'"]'),
        # The attribute value is requested with the JSON response type.
        "itemViewed": xpath(
            "//div[@id='J_itemViewed']/@data-value",
            RespType.JSON,
        ),
    },
    # Basic page fields (per the module docstring).
    "base_fields": {
        "siteId": search(r"siteId=(\d)"),
        "itemId": xpath("//div[@id='LineZing']/@itemid"),
        "shopId": xpath("//div[@id='LineZing']/@shopid"),
        "userId": search(r"userid=(\d+)"),
    },
    "addon_fields": {
        "starts": search(r"starts=(\d+)"),
        "ends": search(r"ends=(\d+)"),
    },
}