def test_hackernews(self): from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which hacker_news = { 'fields_rules': { 'news': xpaths("//body/center/table/tr", evalx="result"), }, 'result_rules': { 'title': xpath("//span[@class='pagetop']/b/a"), 'google': search(r'\>(Google.+?)\<'), 'html5': search(r'\>(HTML5.+?)\<'), 'facebook': xpath("//a[contains(text(),'Facebook')]"), 'titles': xpathz( 'news', xpaths(".//td[@class='title']/a")), 'tsmps': xpathz( 'news', xpaths(".//td[@class='subtext']/node()[4]")), 'points': xpathz( 'news', xpaths(".//td[@class='subtext']/span")), }, 'result_filters': { 'titles': (None, "result[2]"), 'tsmps': (None, "result[2]"), 'points': (None, "result[2]"), } } url = 'https://news.ycombinator.com/' html = url_get(url) results = dict(Parser(html, hacker_news, 'utf-8').parse()) for key, result in results.items(): print '{} : {}'.format(key, result)
base_fields : basic fields extracted from the page
ajax_rules : rules for constructing the page's ajax URLs
result_rules : matching rules for result fields
result_filters : further processing of the matched results

Each page to match is one dict object; such an object can be inherited.
"""
import time

from sasoup.baserules import xpath, xpaths, search, dpath, base, addon, fields, which, ajaxurl, ajax, RespType
from sasoup.baserules import init_rules

# Base rule set for a Taobao item page (item.taobao.com).
top_item_base = {
    # No parent rule dict to inherit from.
    "super": None,
    # URL template; {iid} is filled with the numeric item id.
    "url_fmt": "http://item.taobao.com/item.htm?id={iid}",
    # Either match identifies this page type.
    "page_rules": (search(r"siteId=\d"), xpath("//div[@id='LineZing']/@itemid")),
    # Template dispatch by siteId: "B" when siteId=2, "C" for the other digits.
    # NOTE(review): presumably B = Tmall and C = Taobao marketplace — confirm.
    "template_rules": {"B": (search(r"siteId=2"),), "C": (search(r"siteId=[134567]"),)},
    # Raw field extraction rules applied to the fetched page.
    "fields_rules": {
        "microscope-data": xpath("//meta[@name='microscope-data']/@content"),
        # Captures the first argument of the ('exparams', '...') call in page JS.
        "exparams": search(r'\([\'"]exparams[\'"]\,\s*[\'"](.+?)[\'"]'),
        # The @data-value attribute is decoded as JSON (RespType.JSON).
        "itemViewed": xpath("//div[@id='J_itemViewed']/@data-value", RespType.JSON),
    },
    # Identifiers scraped from the page markup / inline scripts.
    "base_fields": {
        "siteId": search(r"siteId=(\d)"),
        "itemId": xpath("//div[@id='LineZing']/@itemid"),
        "shopId": xpath("//div[@id='LineZing']/@shopid"),
        "userId": search(r"userid=(\d+)"),
    },
    # starts/ends look like promotion start/end timestamps — TODO confirm units.
    "addon_fields": {"starts": search(r"starts=(\d+)"), "ends": search(r"ends=(\d+)")},
}