def loop(self, url, next, post=None, cb=None, cc=1, deep=2, debug=0, allow_external=False, link_filter=None, start_now=True, **options):
    doneurls = [common.md5(url)]
    domain = common.get_domain(url).lower()

    def page_loaded(doc):
        if doc.req['meta']['deep'] < deep:
            for n in doc.q(next):
                nexturl = n.nodevalue()

                # skip links pointing outside the start domain unless explicitly allowed
                if not allow_external and domain != common.get_domain(nexturl):
                    continue

                if link_filter and not link_filter(url=nexturl):
                    continue

                if common.md5(nexturl) not in doneurls:
                    doneurls.append(common.md5(nexturl))
                    req = Request(url=nexturl, meta=dict(deep=doc.req['meta']['deep'] + 1), use_cache=True, cb=page_loaded, **options)
                    self.downloader.put(req)

        # allow the loop caller to process each loaded page
        if cb:
            cb(doc)

    self.downloader.put(Request(url=url, post=post, meta=dict(deep=1), use_cache=True, cb=page_loaded, **options))
    self.downloader.cc = cc

    if start_now:
        self.downloader.start()
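# Usage sketch (an illustrative assumption, not taken from the original source):
# crawl two levels deep from a start url, following "next" links selected by an
# XPath expression. `s` is assumed to be a Scraper instance created elsewhere,
# and `parse_page` is a hypothetical callback:
#
#   def parse_page(doc):
#       for title in doc.q("//h2[@class='title']"):
#           print title.nodevalue()
#
#   s.loop('http://example.com/articles', next="//a[@rel='next']/@href", deep=2, cb=parse_page)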
def pagin(self, url, next=None, post=None, next_post=None, parse_list=None, detail=None, parse_detail=None, cc=3, max_pages=0, list_pages_first=True, start_now=False, debug=True, verify=None, meta={}, **_options):
    if cc != self.downloader.cc:
        self.downloader.set_cc(cc)

    options = common.combine_dicts(self.config, _options)  # apply scraper-level options

    stats = common.DataObject(page=1)

    def handler(doc):
        page = stats.page
        doc.page = page

        if verify:
            if not verify(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)):
                doc.ok = False
                logger.warn("invalid doc at page {0}".format(page))

        logger.info('page %s', page)

        # download and parse details
        if detail:
            listings = detail(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)) if hasattr(detail, '__call__') else doc.q(detail)

            logger.info('details: %s', len(listings))

            for listing in listings:
                self.downloader.put(Request(url=listing if isinstance(listing, basestring) else listing.nodevalue(), cb=parse_detail, meta=meta, **options), onhold=list_pages_first)

        done = False
        _nexturl = None
        _next_post = None

        if next:
            _nexturl = next(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)) if hasattr(next, '__call__') else (next if next.startswith('http') else doc.x(next))

        if next_post:
            if not next:
                # next is not provided, use the original url
                _nexturl = doc.url

            _next_post = next_post(common.DataObject(doc=doc, page=page, starturl=common.DataItem(url))) if hasattr(next_post, '__call__') else next_post

        if next_post:
            if _next_post:
                done = False
            else:
                done = True
        else:
            if not _nexturl:
                done = True
            else:
                done = False

        # if (next and _nexturl) or (next_post and _next_post):
        if not done:
            # logger.debug('next_post: %s, _nexturl: %s', _next_post, _nexturl)
            stats.page += 1

            if max_pages != 0 and stats.page > max_pages:
                done = True
            else:
                self.downloader.put(Request(_nexturl, _next_post, cb=handler, **options))
        else:
            done = True

        if parse_list:
            parse_list(doc)

    ##### end of the handler function ##################################################

    # start with the initial url
    self.downloader.put(Request(url, post, cb=handler, **options))

    if start_now:
        self.downloader.start()
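# Usage sketch (hedged; the urls, XPaths, and `parse_item` callback below are
# assumptions for illustration, and `s` stands for a Scraper instance):
# paginate a listing via a "next" link, queue each detail url, and parse it
# with a separate callback:
#
#   def parse_item(doc):
#       print doc.x("//h1")
#
#   s.pagin('http://example.com/products?page=1',
#           next="//a[@class='next']/@href",
#           detail="//div[@class='product']//a/@href",
#           parse_detail=parse_item,
#           max_pages=5,
#           start_now=True)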
def load_json(self, url, post=None, **_options):
    options = common.combine_dicts(self.config, _options)

    return self.client.load_json(Request(url=url, post=post, **options))
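# Usage sketch (hedged; the endpoint and the Scraper instance `s` are assumptions):
#
#   data = s.load_json('http://example.com/api/items?page=1')
#   if data:
#       print 'items:', len(data)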