def test_hackernews(self): from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which hacker_news = { 'fields_rules': { 'news': xpaths("//body/center/table/tr", evalx="result"), }, 'result_rules': { 'title': xpath("//span[@class='pagetop']/b/a"), 'google': search(r'\>(Google.+?)\<'), 'html5': search(r'\>(HTML5.+?)\<'), 'facebook': xpath("//a[contains(text(),'Facebook')]"), 'titles': xpathz( 'news', xpaths(".//td[@class='title']/a")), 'tsmps': xpathz( 'news', xpaths(".//td[@class='subtext']/node()[4]")), 'points': xpathz( 'news', xpaths(".//td[@class='subtext']/span")), }, 'result_filters': { 'titles': (None, "result[2]"), 'tsmps': (None, "result[2]"), 'points': (None, "result[2]"), } } url = 'https://news.ycombinator.com/' html = url_get(url) results = dict(Parser(html, hacker_news, 'utf-8').parse()) for key, result in results.items(): print '{} : {}'.format(key, result)
base_fields : 页面基础字段 ajax_rules : 页面 ajax URL 构造规则 result_rules : 结果字段匹配规则 result_filters : 进一步处理匹配结果 每个匹配页面是一个 dict 对象,此对象可以继承 """ import time from sasoup.baserules import xpath, xpaths, search, dpath, base, addon, fields, which, ajaxurl, ajax, RespType from sasoup.baserules import init_rules top_item_base = { "super": None, "url_fmt": "http://item.taobao.com/item.htm?id={iid}", "page_rules": (search(r"siteId=\d"), xpath("//div[@id='LineZing']/@itemid")), "template_rules": {"B": (search(r"siteId=2"),), "C": (search(r"siteId=[134567]"),)}, "fields_rules": { "microscope-data": xpath("//meta[@name='microscope-data']/@content"), "exparams": search(r'\([\'"]exparams[\'"]\,\s*[\'"](.+?)[\'"]'), "itemViewed": xpath("//div[@id='J_itemViewed']/@data-value", RespType.JSON), }, "base_fields": { "siteId": search(r"siteId=(\d)"), "itemId": xpath("//div[@id='LineZing']/@itemid"), "shopId": xpath("//div[@id='LineZing']/@shopid"), "userId": search(r"userid=(\d+)"), }, "addon_fields": {"starts": search(r"starts=(\d+)"), "ends": search(r"ends=(\d+)")}, }
# encoding: utf8
"""Scraping rules for the etao.com front page, built on sasoup."""
from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which, next, ajaxurl, ajax, RespType
from sasoup.baserules import init_rules

rules = {
    'url': 'http://www.etao.com',
    'fields_rules': {
        # Every feed entry on the page; nested rules run against "result".
        'feedList': xpaths("//div[@id='J_FeedList']//div[@id]", evalx="result"),
    },
    'result_rules': {
        # Title attribute, bold title text, and description for each feed entry.
        'feed': (
            xpathz('feedList', xpath(".//h3[@class='feed-title']/a/@title", evalx="_strip(result)")),
            xpathz('feedList', xpath(".//h3[@class='feed-title']/a/strong/text()", evalx="_strip(result)")),
            xpathz('feedList', xpath(".//div[@class='feed-desc']/p/text()", evalx="_strip(result)")),
        ),
        'cats': next(
            # BUG FIX: was "//div[contains(@class,'J_PCMain')/li]", which is
            # invalid XPath — a location step ("/li") cannot follow contains()
            # inside a predicate; the predicate must close before the step,
            # as every other expression in this file does.
            xpaths("//div[contains(@class,'J_PCMain')]/li", evalx="result"),
            xpath(".//h3/a/text()", evalx="result"),
        ),
    },
}

# Normalise/compile the raw rule dict into the form the parser consumes.
rules = init_rules(rules)
electrontics_items: 家电通讯类目特价商品 digitals_cat: 电脑数码类目列表 digitals_items: 电脑数码类目特价商品 """ jdrules = { 'fields_rules': { 'catItems': xpaths("//div[@id='_JD_ALLSORT']/div", evalx="result"), 'cat1Cats': xpaths("//div[@id='electronics']/div[contains(@class,'catalogue')]//ul/li", evalx="result"), 'cat1Items': xpaths("//div[@id='electronics']/div[contains(@class,'plist')]/div[2]//li", evalx="result"), 'cat2Cats': xpaths("//div[@id='digitals']/div[contains(@class,'catalogue')]//ul/li", evalx="result"), 'cat2Items': xpaths("//div[@id='digitals']/div[contains(@class,'plist')]/div[2]//li", evalx="result"), }, 'result_rules': { 'cat': xpathz( 'catItems', xpaths(".//h3/a", evalx="result.text")), 'electrontics_cat': xpathz( 'cat1Cats', xpath(".//a", evalx="result.text")), 'electrontics_items': xpathz( 'cat1Items', xpaths("./div[@class='p-name']/a|./div[@class='p-price']/span", evalx="result.text")), 'digitals_cat': xpathz( 'cat2Cats', xpath(".//a", evalx="result.text")), 'digitals_items': xpathz( 'cat2Items', xpaths("./div[@class='p-name']/a|./div[@class='p-price']/node()", evalx="result.text")), }, }