Example #1
    def save_link(self,
                  url,
                  dir='images',
                  file_name='auto',
                  format='jpg',
                  prefix='',
                  **_options):
        fn = ''

        if file_name == 'auto':
            # derive the file name from the URL (strip the query string, keep the trailing name.ext)
            fn = common.DataItem(url).rr('\?.*?$').subreg(
                '/([^/\?\$]+\.[a-z]{2,4})$--is')
            if not fn:
                self.logger.warn('failed to parse file_name from url: %s', url)
                return None

        else:
            #file_name is a fixed name
            fn = file_name

        if not common.subreg(fn, '(\.[a-z]{2,5}$)--is'):
            fn += '.' + format
        fn = prefix + fn

        if not os.path.exists(os.path.join(self.dir, dir)):
            os.makedirs(os.path.join(self.dir, dir))

        path = os.path.join(self.dir, dir, fn)

        if os.path.exists(path):
            return fn  #already downloaded
        else:
            #start downloading the file
            options = common.combine_dicts(self.config, _options)

            res = self.client.fetch_data(
                http.Request(url=url, bin=True, **options))

            if res.status.code == 200 and res.data:
                common.put_bin(path, res.data)
                return fn
            else:
                return None
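
For reference, a standalone sketch of the 'auto' file-name rule used above, written against plain re instead of the library's common.DataItem helpers (everything below is an illustrative assumption, not the library's API):

import re


def derive_file_name(url, file_name='auto', format='jpg', prefix=''):
    # mirror save_link: derive the name from the URL, or use the fixed name
    if file_name == 'auto':
        bare = re.sub(r'\?.*$', '', url)                       # drop the query string
        m = re.search(r'/([^/?$]+\.[a-z]{2,4})$', bare, re.I)  # trailing name.ext
        if not m:
            return None                                        # no parsable file name
        fn = m.group(1)
    else:
        fn = file_name
    if not re.search(r'\.[a-z]{2,5}$', fn, re.I):              # no extension yet
        fn += '.' + format
    return prefix + fn


print(derive_file_name('http://example.com/img/cat.JPG?v=2'))           # cat.JPG
print(derive_file_name('http://example.com/page', file_name='photo'))   # photo.jpg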
Example #2
    def download_file(self, url, filename, dir='images', **_options):

        dir_path = self.join_path(dir)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        path = os.path.join(self.dir, dir, filename)

        if os.path.exists(path):
            return filename  #already downloaded
        else:
            #start downloading the file
            options = common.combine_dicts(self.config, _options)

            res = self.client.fetch_data(
                http.Request(url=url, bin=True, **options))

            if res.code == 200 and res.data:
                common.put_bin(path, res.data)
                return filename
            else:
                return None
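
The same download-and-cache pattern, sketched as a self-contained function; urllib.request below is an assumption standing in for the library's http.Request/client pair:

import os
import urllib.request


def download_file(url, filename, dir='images', base_dir='.'):
    dir_path = os.path.join(base_dir, dir)
    os.makedirs(dir_path, exist_ok=True)          # ensure the target directory exists

    path = os.path.join(dir_path, filename)
    if os.path.exists(path):
        return filename                           # already downloaded, reuse the cached file

    with urllib.request.urlopen(url) as res:      # fetch the binary payload
        if res.status == 200:
            data = res.read()
            if data:
                with open(path, 'wb') as f:       # write it out, like common.put_bin
                    f.write(data)
                return filename
    return None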
Example #3
    def pagin(self,
              url,
              next=None,
              post=None,
              next_post=None,
              parse_list=None,
              detail=None,
              parse_detail=None,
              cc=3,
              max_pages=0,
              list_pages_first=True,
              start_now=False,
              debug=True,
              verify=None,
              meta={},
              **_options):

        if cc != self.downloader.cc:
            self.downloader.set_cc(cc)

        options = common.combine_dicts(self.config, _options)

        stats = common.DataObject(page=1)

        #apply scraper-level options

        def handler(doc):
            page = stats.page
            doc.page = page

            if verify:
                if not verify(
                        common.DataObject(starturl=common.DataItem(url),
                                          page=page,
                                          doc=doc)):
                    doc.ok = False
                    logger.warn("invalid doc at page {0}".format(page))

            logger.info('page %s', page)

            #download and parse details
            if detail:

                listings = detail(
                    common.DataObject(
                        starturl=common.DataItem(url), page=page,
                        doc=doc)) if hasattr(detail,
                                             '__call__') else doc.q(detail)

                logger.info('details: %s', len(listings))

                for listing in listings:

                    self.downloader.put(Request(url=listing if isinstance(
                        listing, basestring) else listing.nodevalue(),
                                                cb=parse_detail,
                                                meta=meta,
                                                **options),
                                        onhold=list_pages_first)

            done = False

            _nexturl = None
            _next_post = None

            if next:
                _nexturl = next(
                    common.DataObject(
                        starturl=common.DataItem(url), page=page,
                        doc=doc)) if hasattr(next, '__call__') else (
                            next if next.startswith('http') else doc.x(next))
            if next_post:
                if not next:
                    #next is not provided, use the original url
                    _nexturl = doc.url
                _next_post = next_post(
                    common.DataObject(
                        doc=doc, page=page,
                        starturl=common.DataItem(url))) if hasattr(
                            next_post, '__call__') else next_post

            if next_post:
                if _next_post:
                    done = False
                else:
                    done = True
            else:
                if not _nexturl:
                    done = True
                else:
                    done = False

            #if (next and _nexturl ) or (next_post and _next_post):
            if not done:

                #logger.debug('next_post: %s, _nexturl: %s', _next_post,  _nexturl)

                stats.page += 1

                if max_pages != 0 and stats.page > max_pages:
                    done = True
                else:
                    self.downloader.put(
                        Request(_nexturl, _next_post, cb=handler, **options))
            else:
                done = True

            if parse_list:
                parse_list(doc)

        ##### end of the handler function ##################################################

        #start the initial url
        self.downloader.put(Request(url, post, cb=handler, **options))
        if start_now:
            self.downloader.start()
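
pagin() wires this handler into the downloader; the loop it effectively drives looks roughly like the sketch below. fetch, find_next and parse_list are placeholder assumptions (the real code uses the downloader queue and doc.x()/callables for the next rule):

def paginate(start_url, fetch, find_next, parse_list, max_pages=0):
    page, url = 1, start_url
    while url:
        doc = fetch(url)                    # download the current listing page
        parse_list(doc)                     # let the caller extract rows/links
        next_url = find_next(doc)           # next-page rule -> URL or None
        page += 1
        if not next_url or (max_pages and page > max_pages):
            break                           # stop: no next page, or page budget reached
        url = next_url


# tiny demo with fake pages instead of real HTTP
pages = {'p1': 'p2', 'p2': 'p3', 'p3': None}
paginate('p1',
         fetch=lambda u: u,
         find_next=lambda doc: pages[doc],
         parse_list=lambda doc: print('parsed', doc),
         max_pages=2)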
Example #4
    def load_json(self, url, post=None, **_options):
        options = common.combine_dicts(self.config, _options)

        return self.client.load_json(Request(url=url, post=post, **options))
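
Every method above merges the scraper-wide config with per-call _options via common.combine_dicts; assuming it behaves like a right-biased dict merge, the behavior is roughly:

def combine_dicts(base, override):
    merged = dict(base)       # start from the scraper-wide config
    merged.update(override)   # per-call _options win on conflicts
    return merged


config = {'timeout': 30, 'use_cache': True}
print(combine_dicts(config, {'timeout': 5}))   # {'timeout': 5, 'use_cache': True}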
Example #5
	def normalize(self, scraper):
		""" normalize this req with using the provided scraper's config """
		if self.is_normalized:
			return self

		self.scraper = scraper  

		#copy scraper-wide options if not set yet   
		self.options = common.combine_dicts(scraper.config, self.options)

		req = self

		self.url = common.normalize_url(self.url)
		# self.url = str(self.url)

		accept_error_codes = req.get('accept_error_codes')
		if accept_error_codes is None:
			accept_error_codes = []
			req.set('accept_error_codes', accept_error_codes)

		#default headers
		user_agent = req.get('user_agent', agent.firefox ) #default agent is firefox
		
		if user_agent == 'random':
			user_agent = agent.random_agent()

		headers = {
			"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"User-Agent": user_agent,
			"Accept-Language": "en-us,en;q=0.5",
			"Accept-Encoding": "gzip, deflate",         
			# "Connection": "close" #turn off keep-alive
			"Connection": "keep-alive"
		}

		if req.post:
			headers.update({"Content-Type": "application/x-www-form-urlencoded"})
			
		#apply user-supplied headers (merge with or replace the defaults)
		if req.get('headers'):
			if req.get('merge_headers') is not False:
				#merge user defined headers with default headers
				headers.update(req.get('headers')) 
			else:
				#only use user defined headers
				headers = req.get('headers')    


		req.set('headers', headers)
			
		proxy = req.get('proxy') or scraper.proxy_manager.get_proxy(req.url)
		if proxy and req.get('proxy_url_filter'):
			#check if this url is qualified for using proxy
			if not re.compile(req.get('proxy_url_filter')).findall(req.url):
				#failed
				proxy = ''
				logger.debug('proxy not used for url: %s', req.url)

		req.set('proxy', proxy)
		
		
		#normalise the post
		if req.post and isinstance(req.post, common.MyDict):
			req.post = req.post.dict()
		if req.post and isinstance(req.post, dict):
			req.post = urllib.urlencode(sorted(req.post.items()))

		self.is_normalized = True   
		
		return self 
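
A standalone sketch of the header-building and merging rule normalize() applies. The Firefox User-Agent string and plain-dict handling below are assumptions; the real code goes through req.get/req.set and agent.firefox:

from urllib.parse import urlencode

DEFAULT_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",
    "Accept-Language": "en-us,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}


def build_headers(user_headers=None, merge_headers=True, post=None):
    headers = dict(DEFAULT_HEADERS)
    if post:
        headers["Content-Type"] = "application/x-www-form-urlencoded"
    if user_headers:
        if merge_headers:
            headers.update(user_headers)    # user headers override the defaults
        else:
            headers = dict(user_headers)    # user headers replace the defaults entirely
    return headers


post = {'q': 'books', 'page': 2}
print(build_headers(user_headers={'Referer': 'http://example.com'}, post=post))
print(urlencode(sorted(post.items())))      # 'page=2&q=books' - normalized, order-stable post body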
Example #6
	def normalize(self, scraper):
		""" normalize this req with using the provided scraper's config """
		if self.is_normalized:
			return self

		#copy scraper-wide options if not set yet	
		self.options = common.combine_dicts(scraper.config, self.options)

		req = self

		self.url = common.normalize_url(self.url)
		# self.url = str(self.url)

		accept_error_codes = req.get('accept_error_codes')
		if accept_error_codes is None:
			accept_error_codes = []
			req.set('accept_error_codes', accept_error_codes)

		#default headers
		user_agent = req.get('user_agent', agent.firefox ) #default agent is firefox
		
		if user_agent == 'random':
			user_agent = agent.random_agent()

		headers = {
			"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
			"User-Agent": user_agent,
			"Accept-Language": "en-us,en;q=0.5",
			"Accept-Encoding": "gzip, deflate",			
			# "Connection": "close" #turn off keep-alive
			"Connection": "keep-alive"
		}

		if req.post:
			headers.update({"Content-Type": "application/x-www-form-urlencoded"})
			
		#apply user-supplied headers (merge with or replace the defaults)
		if req.get('headers'):
			if req.get('merge_headers') is not False:
				#merge user defined headers with default headers
				headers.update(req.get('headers')) 
			else:
				#only use user defined headers
				headers = req.get('headers')	


		req.set('headers', headers)
			
		proxy = req.get('proxy') or scraper.proxy_manager.get_proxy(req.url)
		if proxy and req.get('proxy_url_filter'):
			#check if this url is qualified for using proxy
			if not re.compile(req.get('proxy_url_filter')).findall(req.url):
				#failed
				proxy = ''
				logger.debug('proxy not used for url: %s', req.url)

		req.set('proxy', proxy)
		
		
		#normalise the post
		if req.post and isinstance(req.post, common.MyDict):
			req.post = req.post.dict()
		if req.post and isinstance(req.post, dict):
			req.post = urllib.urlencode(sorted(req.post.items()))

		self.is_normalized = True	
		
		return self	
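
The proxy_url_filter step in both versions boils down to a regex gate on the target URL; a minimal sketch, assuming the filter is an ordinary regular expression:

import re


def choose_proxy(url, proxy, proxy_url_filter=None):
    if proxy and proxy_url_filter:
        if not re.search(proxy_url_filter, url):   # URL not covered by the filter
            return ''                              # go direct instead of via the proxy
    return proxy


print(choose_proxy('http://example.com/item/1', 'proxy.local:8080', r'/item/'))  # proxy.local:8080
print(choose_proxy('http://example.com/home', 'proxy.local:8080', r'/item/'))    # ''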