Beispiel #1
0
    def test_hackernews(self):
        """Parse the Hacker News front page with a sasoup rule set and print each result.

        The page is obtained through ``url_get`` and parsed by ``Parser``; both
        names must be provided by the enclosing module.
        NOTE(review): ``url_get`` presumably performs a live HTTP request, so
        this test depends on network access and on the current page markup.
        """
        # Import only the rule builders that the rule set below actually uses.
        from sasoup.baserules import xpath, xpaths, xpathz, search

        hacker_news = {
            # Intermediate selection stored under the name 'news' and referenced
            # by the xpathz(...) rules further down.
            'fields_rules': {
                'news': xpaths("//body/center/table/tr", evalx="result"),
            },
            # One entry per key that ends up in the parsed result.
            'result_rules': {
                'title': xpath("//span[@class='pagetop']/b/a"),
                'google': search(r'\>(Google.+?)\<'),
                'html5': search(r'\>(HTML5.+?)\<'),
                'facebook': xpath("//a[contains(text(),'Facebook')]"),
                # The next three rules run a relative XPath against 'news'.
                'titles': xpathz(
                    'news',
                    xpaths(".//td[@class='title']/a")),
                'tsmps': xpathz(
                    'news',
                    xpaths(".//td[@class='subtext']/node()[4]")),
                'points': xpathz(
                    'news',
                    xpaths(".//td[@class='subtext']/span")),
            },
            # NOTE(review): each filter is a (None, "result[2]") pair; presumably
            # the second item is an expression applied to the matched value --
            # confirm against the Parser implementation.
            'result_filters': {
                'titles': (None, "result[2]"),
                'tsmps': (None, "result[2]"),
                'points': (None, "result[2]"),
            }
        }

        url = 'https://news.ycombinator.com/'
        html = url_get(url)
        results = dict(Parser(html, hacker_news, 'utf-8').parse())
        for key, result in results.items():
            # Parenthesised single-argument form: prints the same text under the
            # Python 2 print statement and is a valid call on Python 3.
            print('{} : {}'.format(key, result))
Beispiel #2
0
base_fields :       页面基础字段
ajax_rules :        页面 ajax URL 构造规则
result_rules :      结果字段匹配规则
result_filters :    进一步处理匹配结果

每个匹配页面是一个 dict 对象,此对象可以继承
"""
import time

from sasoup.baserules import xpath, xpaths, search, dpath, base, addon, fields, which, ajaxurl, ajax, RespType
from sasoup.baserules import init_rules

# Base rule set for a Taobao item page (see 'url_fmt').
top_item_base = {
    # NOTE(review): presumably names the parent rule set to inherit from;
    # None here -- confirm against init_rules.
    'super': None,
    'url_fmt': 'http://item.taobao.com/item.htm?id={iid}',

    # Rules listed for the page itself: a siteId marker and the LineZing item id.
    'page_rules': (
        search(r'siteId=\d'),
        xpath("//div[@id='LineZing']/@itemid"),
    ),

    # Template name -> rules; "B" is tied to siteId=2, "C" to siteId 1 or 3-7.
    'template_rules': {
        'B': (search(r'siteId=2'),),
        'C': (search(r'siteId=[134567]'),),
    },

    # Named intermediate values extracted from the page.
    'fields_rules': {
        'microscope-data': xpath(
            "//meta[@name='microscope-data']/@content"
        ),
        'exparams': search(
            r'\([\'"]exparams[\'"]\,\s*[\'"](.+?)[\'"]'
        ),
        'itemViewed': xpath(
            "//div[@id='J_itemViewed']/@data-value",
            RespType.JSON,
        ),
    },

    # Basic page fields: site, item, shop and user identifiers.
    'base_fields': {
        'siteId': search(r'siteId=(\d)'),
        'itemId': xpath("//div[@id='LineZing']/@itemid"),
        'shopId': xpath("//div[@id='LineZing']/@shopid"),
        'userId': search(r'userid=(\d+)'),
    },

    # Additional numeric fields captured from "starts=" / "ends=" in the page.
    'addon_fields': {
        'starts': search(r'starts=(\d+)'),
        'ends': search(r'ends=(\d+)'),
    },
}
Beispiel #3
0
# encoding: utf8
from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which, next, ajaxurl, ajax, RespType
from sasoup.baserules import init_rules

# Rule set for the etao.com front page; passed through init_rules() below.
rules = {
    'url': 'http://www.etao.com',
    # Intermediate selection: every div carrying an id inside the feed list.
    'fields_rules': {
        'feedList': xpaths("//div[@id='J_FeedList']//div[@id]", evalx="result"),
    },
    'result_rules': {
        # Three rules evaluated against 'feedList': the title attribute of the
        # feed link, the <strong> text inside it, and the description text.
        'feed': (
            xpathz('feedList', xpath(".//h3[@class='feed-title']/a/@title", evalx="_strip(result)")),
            xpathz('feedList', xpath(".//h3[@class='feed-title']/a/strong/text()", evalx="_strip(result)")),
            xpathz('feedList', xpath(".//div[@class='feed-desc']/p/text()", evalx="_strip(result)")),
        ),
        # The predicate must close before the "/li" step. The previous form,
        # "[contains(@class,'J_PCMain')/li]", applied a path step to the
        # boolean returned by contains(), which is an XPath type error.
        'cats': next(
            xpaths("//div[contains(@class,'J_PCMain')]/li", evalx="result"),
            xpath(".//h3/a/text()", evalx="result"),
        ),
    },
}

# Rebind the name to whatever init_rules returns for the raw rule dict.
rules = init_rules(rules)
Beispiel #4
0
    electrontics_items: 家电通讯类目特价商品
    digitals_cat:       电脑数码类目列表
    digitals_items:     电脑数码类目特价商品
"""
# Rule set for the JD front page: category list plus the "electronics" and
# "digitals" sections (category lists and listed items).
jdrules = {
    # Intermediate selections; the result rules below reference them by key.
    'fields_rules': {
        'catItems': xpaths(
            "//div[@id='_JD_ALLSORT']/div",
            evalx="result",
        ),
        'cat1Cats': xpaths(
            "//div[@id='electronics']/div[contains(@class,'catalogue')]//ul/li",
            evalx="result",
        ),
        'cat1Items': xpaths(
            "//div[@id='electronics']/div[contains(@class,'plist')]/div[2]//li",
            evalx="result",
        ),
        'cat2Cats': xpaths(
            "//div[@id='digitals']/div[contains(@class,'catalogue')]//ul/li",
            evalx="result",
        ),
        'cat2Items': xpaths(
            "//div[@id='digitals']/div[contains(@class,'plist')]/div[2]//li",
            evalx="result",
        ),
    },
    'result_rules': {
        # Heading links of the all-categories menu.
        'cat': xpathz(
            'catItems', xpaths(".//h3/a", evalx="result.text")
        ),
        # "electronics" section: category links, then item name / price nodes.
        # The key spelling is part of the public result schema; keep it as is.
        'electrontics_cat': xpathz(
            'cat1Cats', xpath(".//a", evalx="result.text")
        ),
        'electrontics_items': xpathz(
            'cat1Items',
            xpaths(
                "./div[@class='p-name']/a|./div[@class='p-price']/span",
                evalx="result.text",
            ),
        ),
        # "digitals" section: same layout, but the price is taken via node().
        'digitals_cat': xpathz(
            'cat2Cats', xpath(".//a", evalx="result.text")
        ),
        'digitals_items': xpathz(
            'cat2Items',
            xpaths(
                "./div[@class='p-name']/a|./div[@class='p-price']/node()",
                evalx="result.text",
            ),
        ),
    },
}