Example #1
0
def crawl(site, maxpage=None):
    """Crawl the listing pages configured for ``site`` and parse each one.

    The site configuration is loaded with ``get_conf(site)``. An ``Exchange``
    built from its ``name``, ``url`` and ``abbr`` entries is upserted, then
    the 'result', 'offer' and 'stock' sections are processed in that order;
    sections that are missing or empty are skipped.

    For each section, every URL in ``index`` (a single URL or a list) is
    fetched and parsed, followed by pages ``2..last_page`` built from the
    section's ``page`` format string.

    Args:
        site: Identifier passed to ``get_conf`` to look up the configuration.
        maxpage: Optional upper bound on the last page number to crawl.
            Each section is additionally capped by its own ``maxpage``
            setting. When ``None``, the section's own ``maxpage`` is used.
            The index URL(s) are always fetched regardless of this value.
    """
    proxy = ybk.config.conf.get('proxy')
    if proxy:
        # NOTE(review): only the 'http' scheme is mapped to the proxy here;
        # confirm whether https traffic is meant to bypass it.
        session.proxies = {'http': proxy}

    conf = get_conf(site)
    ex = Exchange({
        'name': conf['name'],
        'url': conf['url'],
        'abbr': conf['abbr'],
    })
    ex.upsert()
    for type_ in ['result', 'offer', 'stock']:
        tconf = conf.get(type_)
        if not tconf:
            continue
        # Compute the page limit per section in a local variable. Writing it
        # back to ``maxpage`` would let one section's limit cap every
        # section that follows it.
        if maxpage is None:
            last_page = tconf['maxpage']
        else:
            last_page = min(maxpage, tconf['maxpage'])
        index = tconf['index']
        if not isinstance(index, list):
            index = [index]
        for url in index:
            _fetch_and_parse(ex, type_, url, tconf)
        # Page 1 is covered by the index URL(s); numbered pages start at 2.
        for page in range(2, last_page + 1):
            _fetch_and_parse(ex, type_, tconf['page'].format(page=page), tconf)


def _fetch_and_parse(ex, type_, url, tconf):
    """Download ``url``, run it through ``fix_javascript`` and parse it."""
    content = session.get(url, timeout=(5, 10)).content
    content = fix_javascript(url, content)
    parse_index(ex, type_, content, tconf)