Beispiel #1
0
    def test_hackernews(self):
        """Parse the Hacker News front page and print each extracted result.

        Smoke test: the page is fetched with ``url_get`` and run through
        ``Parser`` with the rule set below.  Nothing is asserted, so the test
        only fails if fetching or parsing raises.
        """
        # Only the rule constructors actually used below are imported.
        from sasoup.baserules import xpath, xpaths, xpathz, search

        hacker_news = {
            'fields_rules': {
                # Rows of the main listing table; the 'news' field is what the
                # xpathz(...) rules below refer to by name.
                'news': xpaths("//body/center/table/tr", evalx="result"),
            },
            'result_rules': {
                'title': xpath("//span[@class='pagetop']/b/a"),
                'google': search(r'\>(Google.+?)\<'),
                'html5': search(r'\>(HTML5.+?)\<'),
                'facebook': xpath("//a[contains(text(),'Facebook')]"),
                # Relative XPaths (".//...") paired with the 'news' field;
                # presumably evaluated once per row -- TODO confirm in sasoup.
                'titles': xpathz(
                    'news',
                    xpaths(".//td[@class='title']/a")),
                'tsmps': xpathz(
                    'news',
                    xpaths(".//td[@class='subtext']/node()[4]")),
                'points': xpathz(
                    'news',
                    xpaths(".//td[@class='subtext']/span")),
            },
            'result_filters': {
                # NOTE(review): the meaning of the (None, "result[2]") pair is
                # defined by sasoup's Parser, not visible here -- confirm.
                'titles': (None, "result[2]"),
                'tsmps': (None, "result[2]"),
                'points': (None, "result[2]"),
            }
        }

        url = 'https://news.ycombinator.com/'
        html = url_get(url)
        results = dict(Parser(html, hacker_news, 'utf-8').parse())
        for key, result in results.items():
            # Single-argument parenthesised form: prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print('{} : {}'.format(key, result))
Beispiel #2
0
 "shopUrl": which(
     xpath("//a[contains(@class, 'seller-name')]/@href"), xpath("//a[contains(@class,'enter-shop')]/@href")
 ),
 "itemImg": xpath("//img[@id='J_ImgBooth']/@data-src"),
 "itemTitle": fields("itemViewed", dpath("['title']")),
 "initPrice": fields("itemViewed", dpath("['price']")),
 "promoInfo": None,
 "postageInfo": None,
 "monthlyTrade": ajax("apiItemInfo", dpath("['quantity']['quanity']")),
 "itemRate": None,
 "bonus": None,
 "favNum": ajax("saveCounts", dpath("['{apiItemCollectsKey}']")),
 "totalSoldOut": None,
 "attrList": xpaths(
     "//ul[@class='attributes-list']/li",
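     # Split each attribute's text on whichever colon comes first: ASCII ':'
     # or full-width u'\uff1a'.  Both partition(...)[0] values are prefixes of
     # the same text, so '<' on them is effectively a length comparison; when
     # neither colon is present the full-width branch is taken and the second
     # element is empty.  [::2] keeps (head, tail) and drops the separator.
     # Regex alternative (disabled):
     # evalx="re.split(u'[:\uff1a]', _strip(result.text))",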
     evalx="result.text.partition(':' if result.text.partition(':')[0] < result.text.partition(u'\uff1a')[0] else u'\uff1a')[::2]",
 ),
 "reviewCount": ajax("saveCounts", dpath("['{apiItemViewsKey}']")),
 "starts": addon("starts"),
 "ends": addon("ends"),
 "userTag": None,
 "cid": None,
 "location": ajax("wholeSibUrl", dpath("['location']")),
 "brand": None,
 "gradeAvg": None,
 "peopleNum": None,
 "periodSoldQuantity": None,
 "rateTotal": None,
 "spuId": None,
 "totalSoldQuantity": None,
Beispiel #3
0
# encoding: utf8
from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which, ajaxurl, ajax, RespType

# Rule set for a live-blog style page.
#   fields_rules: 'items' matches the <div> children of every element whose id
#                 contains 'liveblog-entry-'.
#   result_rules: each rule pairs the 'items' field with an XPath relative to
#                 it (".//p[n]..."); presumably applied per item -- TODO
#                 confirm against sasoup.baserules.xpathz.
appsorules = {
    'fields_rules': {
        'items': xpaths(
            "//div[contains(@id, 'liveblog-entry-')]/div",
            evalx="result",
        ),
    },
    'result_rules': {
        # 1st paragraph: bold text.
        'tsmp': xpathz('items', xpaths(".//p[1]/strong", evalx="result.text")),
        # 2nd paragraph: link text and link target.
        'title': xpathz('items', xpaths(".//p[2]/a/strong", evalx="result.text")),
        'link': xpathz('items', xpaths(".//p[2]/a/@href")),
        # 3rd and 4th paragraphs: plain text.
        'desc': xpathz('items', xpaths(".//p[3]", evalx="result.text")),
        'intro': xpathz('items', xpaths(".//p[4]", evalx="result.text")),
    },
}
Beispiel #4
0
# encoding: utf8
from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which, next, ajaxurl, ajax, RespType
from sasoup.baserules import init_rules

# Rule set for the etao.com front page.
#   fields_rules: 'feedList' matches every <div> carrying an id attribute
#                 below the element with id 'J_FeedList'.
#   result_rules: 'feed' is a tuple of three rules over 'feedList' (title
#                 attribute, highlighted title text, description text);
#                 'cats' chains a list XPath with a relative XPath.
rules = {
    'url': 'http://www.etao.com',
    'fields_rules': {
        'feedList': xpaths("//div[@id='J_FeedList']//div[@id]", evalx="result"),
    },
    'result_rules': {
        'feed': (
            xpathz('feedList', xpath(".//h3[@class='feed-title']/a/@title", evalx="_strip(result)")),
            xpathz('feedList', xpath(".//h3[@class='feed-title']/a/strong/text()", evalx="_strip(result)")),
            xpathz('feedList', xpath(".//div[@class='feed-desc']/p/text()", evalx="_strip(result)")),
        ),
        # NOTE: `next` here is the rule constructor imported from
        # sasoup.baserules, which shadows the builtin of the same name.
        'cats': next(
            # The predicate has to close before the child step: contains()
            # yields a boolean, and a "/li" step placed inside the brackets
            # would be applied to that boolean instead of to the matched
            # <div> nodes, which XPath 1.0 rejects as a non-node-set.
            xpaths("//div[contains(@class,'J_PCMain')]/li", evalx="result"),
            xpath(".//h3/a/text()", evalx="result"),
        ),
    },
}

# Rebind the name to whatever init_rules returns for the raw rule table.
rules = init_rules(rules)
Beispiel #5
0
# encoding: utf8
from sasoup.baserules import xpath, xpaths, xpathz, search, dpath, base, addon, fields, which, ajaxurl, ajax, RespType

"""
jdrules:
    cat:                顶级类目
    electrontics_cat:   家电通讯类目列表
    electrontics_items: 家电通讯类目特价商品
    digitals_cat:       电脑数码类目列表
    digitals_items:     电脑数码类目特价商品
"""
jdrules = {
    'fields_rules': {
        'catItems': xpaths("//div[@id='_JD_ALLSORT']/div", evalx="result"),
        'cat1Cats': xpaths("//div[@id='electronics']/div[contains(@class,'catalogue')]//ul/li", evalx="result"),
        'cat1Items': xpaths("//div[@id='electronics']/div[contains(@class,'plist')]/div[2]//li", evalx="result"),
        'cat2Cats': xpaths("//div[@id='digitals']/div[contains(@class,'catalogue')]//ul/li", evalx="result"),
        'cat2Items': xpaths("//div[@id='digitals']/div[contains(@class,'plist')]/div[2]//li", evalx="result"),
    },
    'result_rules': {
        'cat': xpathz(
            'catItems',
            xpaths(".//h3/a", evalx="result.text")),
        'electrontics_cat': xpathz(
            'cat1Cats',
            xpath(".//a", evalx="result.text")),
        'electrontics_items': xpathz(
            'cat1Items',
            xpaths("./div[@class='p-name']/a|./div[@class='p-price']/span", evalx="result.text")),
        'digitals_cat': xpathz(
            'cat2Cats',