Example #1
	def loop(self, url, next, post=None, cb=None, cc=1, deep=2, debug=0, allow_external=False, link_filter=None, start_now=True, **options):

		#md5 hashes of urls already queued, to avoid re-downloading
		doneurls = [common.md5(url)]

		domain = common.get_domain(url).lower()

		def page_loaded(doc):

			if doc.req['meta']['deep'] < deep:
				for n in doc.q(next):
					nexturl = n.nodevalue()

					#skip links outside the start domain unless explicitly allowed
					if not allow_external and domain != common.get_domain(nexturl):
						continue
					if link_filter and not link_filter(url=nexturl):
						continue

					urlhash = common.md5(nexturl)
					if urlhash not in doneurls:
						doneurls.append(urlhash)
						req = Request(url=nexturl, meta=dict(deep=doc.req['meta']['deep'] + 1), use_cache=True, cb=page_loaded, **options)
						self.downloader.put(req)

			#allow the loop caller to process each loaded page
			if cb:
				cb(doc)
		
		
		self.downloader.put(Request(url=url, post=post, meta=dict(deep=1), use_cache=True, cb=page_loaded, **options))

		self.downloader.cc = cc
		if start_now:
			self.downloader.start()
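
A minimal usage sketch for loop() above; the Scraper class name, the start url, the xpath and the process_page callback are assumptions for illustration, not part of the original code:

#hypothetical caller: crawl a site two levels deep, handing every
#loaded page to process_page
s = Scraper()

def process_page(doc):
    #called once for each page the crawler loads
    print(doc.url)

s.loop(url='http://example.com/',
       next="//a[@class='next']/@href",  #links to follow on each page
       cb=process_page,
       cc=3,     #concurrent connections
       deep=2)   #follow links up to 2 levels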
Example #2
    def pagin(self,
              url,
              next=None,
              post=None,
              next_post=None,
              parse_list=None,
              detail=None,
              parse_detail=None,
              cc=3,
              max_pages=0,
              list_pages_first=True,
              start_now=False,
              debug=True,
              verify=None,
              meta={},
              **_options):

        if cc != self.downloader.cc:
            self.downloader.set_cc(cc)

        #apply scraper-level options
        options = common.combine_dicts(self.config, _options)

        stats = common.DataObject(page=1)

        def handler(doc):
            page = stats.page
            doc.page = page

            if verify:
                if not verify(common.DataObject(
                        starturl=common.DataItem(url), page=page, doc=doc)):
                    doc.ok = False
                    logger.warning("invalid doc at page {0}".format(page))

            logger.info('page %s', page)

            #download and parse details
            if detail:
                #detail may be a callback returning urls, or an xpath
                if callable(detail):
                    listings = detail(common.DataObject(
                        starturl=common.DataItem(url), page=page, doc=doc))
                else:
                    listings = doc.q(detail)

                logger.info('details: %s', len(listings))

                for listing in listings:
                    detail_url = listing if isinstance(
                        listing, basestring) else listing.nodevalue()
                    self.downloader.put(
                        Request(url=detail_url, cb=parse_detail, meta=meta, **options),
                        onhold=list_pages_first)

            done = False

            _nexturl = None
            _next_post = None

            if next:
                #next may be a callback, an absolute url, or an xpath
                if callable(next):
                    _nexturl = next(common.DataObject(
                        starturl=common.DataItem(url), page=page, doc=doc))
                elif next.startswith('http'):
                    _nexturl = next
                else:
                    _nexturl = doc.x(next)

            if next_post:
                if not next:
                    #next is not provided, use the original url
                    _nexturl = doc.url
                if callable(next_post):
                    _next_post = next_post(common.DataObject(
                        doc=doc, page=page, starturl=common.DataItem(url)))
                else:
                    _next_post = next_post

            #stop when the next page (or next post data) cannot be resolved
            if next_post:
                done = not _next_post
            else:
                done = not _nexturl

            if not done:

                stats.page += 1

                if max_pages != 0 and stats.page > max_pages:
                    done = True
                else:
                    #queue the next listing page
                    self.downloader.put(
                        Request(_nexturl, _next_post, cb=handler, **options))

            if parse_list:
                parse_list(doc)

        ##### end of the handler function ##################################################

        #start the initial url
        self.downloader.put(Request(url, post, cb=handler, **options))
        if start_now:
            self.downloader.start()
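
A hedged usage sketch for pagin() above; the Scraper instance, the listing url and the xpath selectors are invented for illustration (start_now=True is passed explicitly because pagin defaults it to False):

#hypothetical caller: paginate a listing and scrape each detail page
s = Scraper()

def parse_detail(doc):
    #extract fields from one detail page
    print(doc.x("//h1/text()"))

s.pagin(url='http://example.com/products',
        next="//a[@rel='next']/@href",          #next listing page
        detail="//div[@class='item']/a/@href",  #links to detail pages
        parse_detail=parse_detail,
        max_pages=10,    #stop after 10 listing pages
        start_now=True)  #start the downloader immediately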
Example #3
    def load_json(self, url, post=None, **_options):
        options = common.combine_dicts(self.config, _options)

        return self.client.load_json(Request(url=url, post=post, **options))
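
A one-line sketch of calling load_json() above, reusing the hypothetical Scraper instance s from the previous sketch; the endpoint url is a placeholder:

#hypothetical caller: fetch a url and decode the response as json
data = s.load_json('http://example.com/api/items?page=1')
print(data)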