Example #1
	def load_html(self, req):
		""" returns a unicode html object """
		cache = self.scraper.cache
		accept_error_codes = req.get('accept_error_codes')
		if accept_error_codes is None:
			accept_error_codes = []

		if cache and cache.exists(url=req.url, post=req.post, file_name=req.get('file_name')) and req.get('use_cache'):
			return self._read_from_cache(url=req.url, post=req.post, file_name=req.get('file_name'))

		if req.get('use_cache') and req.get('cache_only') and not cache.exists(url=req.url, post=req.post, file_name=req.get('file_name')):
			html = common.DataItem('<html/>')
			html.status = Status()
			return html
			
		res = self.fetch_data(req)

		html = common.DataItem( res.data or '')
		status = res.status

		if (status.code == 200 or status.code in accept_error_codes) and cache and req.get('use_cache'):
			self._write_to_cache(url=req.url, post=req.post, data=html, status=status, file_name=req.get('file_name'))

		html.status = status

		return html	
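A hedged usage sketch of load_html(); the options shown (use_cache, cache_only) are the ones the method reads via req.get() above, and passing them as keyword arguments to http.Request is an assumption based on the other examples:

    # illustrative only -- 'client' is a hypothetical instance of the class that defines load_html
    req = http.Request(url='http://example.com/', use_cache=True, cache_only=False)
    html = client.load_html(req)
    print html.status.code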
Example #2
    def _build_response_data(self, req, response):

        encoding = 'utf8'
        unicode_html = u''

        try:
            unicode_html = response['data'].decode(encoding, 'ignore')
        except Exception as e:
            logger.warn('failed to decode bytes from url: %s, error: %s', req.url, e)

        return_type = req.get('return_type') or 'doc'

        if return_type == 'doc':
            doc = http.Doc(url=req.url, html=unicode_html)
            doc.req = req
            doc.status.code = response['code']
            doc.status.message = response['message']
            return doc
        elif return_type == 'html':
            html = common.DataItem(unicode_html)
            html.req = req
            html.status = common.DataObject()
            html.status.code = response['code']
            html.status.message = response['message']
            return html

        else:
            self.scraper.logger.warn('unsupported return_type: %s',
                                     return_type)
            return None
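For context, a minimal sketch of how this helper might be fed; the response dict keys ('data', 'code', 'message') and the return_type values ('doc' or 'html') come from the method above, while the Request construction and the 'client' owner are assumptions:

    # illustrative only
    req = http.Request(url='http://example.com/', return_type='doc')
    response = {'data': '<html><body>hello</body></html>', 'code': 200, 'message': 'OK'}
    doc = client._build_response_data(req, response)
    print doc.status.code   # 200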
Example #3
    def _read_from_cache(self, url, post, filename=None):
        cache = self.scraper.cache

        cachedata = cache.read(url=url, post=post,
                               filename=filename).split(meta_seperator)

        cachedhtml = None

        if len(cachedata) == 2:
            cachedhtml = cachedata[1]
            meta = json.loads(cachedata[0])
            #reload status
            response = Response(data=cachedhtml,
                                code=meta['response']['code'],
                                final_url=meta['response']['final_url'],
                                message=meta['response'].get('message', ''))
        else:
            #no meta data
            cachedhtml = cachedata[0]
            response = Response(data=cachedhtml,
                                code=200,
                                final_url=None,
                                message=None)

        html = common.DataItem(cachedhtml)
        html.response = response

        return html
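For reference, a hedged sketch of the cache entry layout this reader expects: a JSON meta block, the meta_seperator marker (defined elsewhere in the module), then the raw HTML. The values are illustrative:

    # illustrative cache entry layout
    meta = json.dumps({'response': {'code': 200, 'final_url': 'http://example.com/', 'message': 'OK'}})
    cache_entry = meta + meta_seperator + u'<html><body>cached page</body></html>'
    # _read_from_cache splits on meta_seperator and rebuilds the Response from the meta block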
Example #4
	def __init__(self, url='', html='<html></html>', html_clean=None, status=None):
		logger = logging.getLogger(__name__)
		if html_clean:
			html = html_clean(html)

		Node.__init__(self, html)
		self.url = common.DataItem( url )
		self.status = status or Status(final_url=url)

		#resolve relative urls
		baseurl = self.x("//base/@href").tostring()
		if not baseurl:
			baseurl = self.url
		try:
			for n in self.q('//a[@href and not(contains(@href, "javascript")) and not(starts-with(@href, "#")) and not(contains(@href, "mailto:"))]'):					
				if n.href().trim() == '': continue
				n.set('href', urlparse.urljoin(baseurl, n.get('href').tostring()))

			for n in self.q('//iframe[@src]'):					
				if n.src().trim() == '': continue
				n.set('src', urlparse.urljoin(baseurl, n.src()))

			for n in self.q('//form[@action]'):
				n.set('action', urlparse.urljoin(baseurl, n.get('action').tostring()))	
			for n in self.q('//img[@src]'):					
				n.set('src', urlparse.urljoin(baseurl, n.get('src').tostring()))
		except Exception as e:
			logger.warn('error while initializing the Doc object: %s', self.url)
			logger.exception(e)
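A hedged usage sketch of the constructor above, assuming Doc is importable and that q() and get() behave as they do in the loops:

    # illustrative only
    doc = Doc(url='http://example.com/section/page.html',
              html=u'<html><body><a href="/about">About</a></body></html>')
    for link in doc.q('//a[@href]'):
        print link.get('href').tostring()   # -> http://example.com/about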
Example #5
    def save(self,
             record,
             filename='result.csv',
             max=None,
             keys=[],
             id=None,
             headers=[],
             remove_existing_file=True,
             always_quoted=True):
        #wait while another thread is writing
        while self.writingflag:
            pass
        #hold the flag
        self.writingflag = True

        path = os.path.join(self.dir, filename)
        format = common.DataItem(path).subreg('\.([a-z]{2,5})$--is').lower()

        if not self.outdb.get(path):
            if os.path.exists(path):
                if remove_existing_file:
                    os.remove(path)

            self.outdb.update({
                path:
                common.DataObject(cnt=0, data=[], ids=[], format=format)
            })

        trackingobj = self.outdb.get(path)
        if keys or id:
            id = id or u"".join(
                [unicode(record[record.index(key) + 1]) for key in keys])
            if id in trackingobj.ids:
                self.writingflag = False
                return
            else:
                trackingobj.ids.append(id)

        trackingobj.cnt += 1

        if format == 'csv':
            #for csv format, save to file immediately
            common.save_csv(path, record, always_quoted=always_quoted)
        elif format in ['xls', 'xlsx']:
            #save for later
            trackingobj.data.append(record)
        if max and trackingobj.cnt == max:
            self.flush()  #save output files and quit
            os._exit(1)

        #free the flag
        self.writingflag = False
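A hedged usage sketch of save(); the flat record layout (alternating field name and value) and the keys-based de-duplication are inferred from record.index(key) + 1 above, and 's' is a hypothetical instance of the class that defines save():

    # illustrative only
    record = ['Name', 'Acme Inc', 'Url', 'http://example.com/acme', 'Phone', '555-0100']
    s.save(record, filename='result.csv', keys=['Url'])   # rows repeating the same Url are skipped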
Example #6
    def load_oneproxy(self):
        proxy_file = self.proxy_file

        if proxy_file:
            if not os.path.exists(proxy_file):
                raise Exception('proxy_file not found: {0}'.format(proxy_file))

            proxy_auth_found = False
            proxy_selected = False

            lines = common.read_lines(proxy_file)
            while True:
                line = random.choice(lines)
                if 'proxy_auth' in line:
                    if not proxy_auth_found:
                        self.proxy_auth = common.DataItem(line).rr(
                            'proxy_auth\s*=\s*').trim()
                        proxy_auth_found = True
                else:
                    if not proxy_selected:
                        #support tab and comma separators as well
                        line = line.replace('\t', ':').replace(',', ':')
                        self.proxies.append(line)
                        proxy_selected = True

                if proxy_selected:
                    if proxy_auth_found:
                        break

                    #no auth line picked randomly yet, scan the whole file once
                    for line in lines:
                        if 'proxy_auth' in line:
                            self.proxy_auth = common.DataItem(line).rr(
                                'proxy_auth\s*=\s*').trim()
                            proxy_auth_found = True
                            break

                    #done either way -- avoid looping forever when the file has no proxy_auth line
                    break
        return self
Example #7
	def body_ready(body):
		if req.get('contain') and req.get('contain') not in body:
			result = {
				'success': False,
				'data': body,
				'req': req,
				'code': 200,
				'message': 'not contain text: %s' % req.get('contain')
			}
		else:
			#success
			content_type = response.headers.getRawHeaders('content-type')
			if not content_type:
				logger.warn('no content-type header found: %s', req.url)
				content_type = ''
			else:
				content_type = content_type[0].lower()

			is_binary_data = req.get('bin') or False
			if 'image' in content_type or 'pdf' in content_type:
				is_binary_data = True

			if not is_binary_data:
				charset = common.DataItem(content_type).subreg('charset\s*=\s*([^\s]+)')
				if not charset:
					# logger.warn('no charset parsed from content_type: %s, assumed utf8, url: %s', content_type, req.url)
					charset = 'utf8'

				body = _to_utf8encoded_bytes(body, charset=charset)

			result = {
				'success': True,
				'data': body, #in utf8-encoded bytes
				'req': req,
				'code': 200,
				'message': 'ok'
			}

		try:
			output_deferred.callback(result)

		except Exception as e:
			logger.debug(e.message)
Example #8
    def load_proxies(self):
        proxy_file = self.proxy_file

        if proxy_file:
            if not os.path.exists(proxy_file):
                raise Exception('proxy_file not found: {0}'.format(proxy_file))

            for line in common.read_lines(proxy_file):
                if 'proxy_auth' in line:
                    self.proxy_auth = common.DataItem(line).rr(
                        'proxy_auth\s*=\s*').trim()
                    continue

                #support tab and comma separators as well
                line = line.replace('\t', ':').replace(',', ':')
                self.proxies.append(line)

        return self
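Based on the parsing in load_proxies() and load_oneproxy(), a proxy_file is a plain text file with one proxy per line and an optional proxy_auth line; tab- and comma-separated host/port pairs are normalized to host:port. The entries below are illustrative:

    proxy_auth = username:password
    10.0.0.1:8080
    10.0.0.2,8080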
Example #9
    def save_link(self,
                  url,
                  dir='images',
                  file_name='auto',
                  format='jpg',
                  prefix='',
                  **_options):
        fn = ''

        if file_name == 'auto':
            #special name
            fn = common.DataItem(url).rr('\?.*?$').subreg(
                '/([^/\?\$]+\.[a-z]{2,4})$--is')
            if not fn:
                self.logger.warn('failed to parse file_name from url: %s', url)
                return None

        else:
            #file_name is a fixed name
            fn = file_name

        if not common.subreg(fn, '(\.[a-z]{2,5}$)--is'):
            fn += '.' + format
        fn = prefix + fn

        if not os.path.exists(os.path.join(self.dir, dir)):
            os.makedirs(os.path.join(self.dir, dir))

        path = os.path.join(self.dir, dir, fn)

        if os.path.exists(path):
            return fn  #already downloaded
        else:
            #start downloading the file
            options = common.combine_dicts(self.config, _options)

            res = self.client.fetch_data(
                http.Request(url=url, bin=True, **options))

            if res.status.code == 200 and res.data:
                common.put_bin(path, res.data)
                return fn
            else:
                return None
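A hedged usage sketch of save_link(); 's' is a hypothetical scraper instance that owns this method, and the image URL is illustrative:

    # illustrative only
    fn = s.save_link('http://example.com/media/photo-123.jpg?size=large', dir='images', prefix='item1_')
    # downloads to images/item1_photo-123.jpg under the scraper dir and returns the file name, or None on failure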
Example #10
	def _read_from_cache(self, url, post, file_name=None):
		cache = self.scraper.cache

		cachedata = cache.read(url=url, post=post, file_name=file_name).split(meta_seperator)

		cachedhtml = None
		status = Status(code=200, final_url=None, error=None)
		if len(cachedata) == 2:
			cachedhtml = cachedata[1]
			meta = json.loads(cachedata[0])
			#reload status
			status = Status(code=meta['status']['code'], final_url=meta['status']['final_url'], error=meta['status'].get('error', ''))
		else:
			#no meta data
			cachedhtml = cachedata[0]
		html = common.DataItem(cachedhtml)
		html.status = status

		return html
Example #11
        def handler(doc):
            page = stats.page
            doc.page = page

            if verify:
                if not verify(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc)):
                    doc.ok = False
                    logger.warn("invalid doc at page {0}".format(page))

            logger.info('page %s', page)

            #download and parse details
            if detail:
                if hasattr(detail, '__call__'):
                    listings = detail(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc))
                else:
                    listings = doc.q(detail)

                logger.info('details: %s', len(listings))

                for listing in listings:
                    listing_url = listing if isinstance(listing, basestring) else listing.nodevalue()
                    self.downloader.put(
                        Request(url=listing_url, cb=parse_detail, meta=meta, **options),
                        onhold=list_pages_first)

            done = False

            _nexturl = None
            _next_post = None

            if next:
                if hasattr(next, '__call__'):
                    _nexturl = next(common.DataObject(starturl=common.DataItem(url), page=page, doc=doc))
                else:
                    _nexturl = next if next.startswith('http') else doc.x(next)

            if next_post:
                if not next:
                    #next is not provided, use the original url
                    _nexturl = doc.url

                if hasattr(next_post, '__call__'):
                    _next_post = next_post(common.DataObject(doc=doc, page=page, starturl=common.DataItem(url)))
                else:
                    _next_post = next_post

            if next_post:
                done = not _next_post
            else:
                done = not _nexturl

            #if (next and _nexturl ) or (next_post and _next_post):
            if not done:

                #logger.debug('next_post: %s, _nexturl: %s', _next_post,  _nexturl)

                stats.page += 1

                if max_pages != 0 and stats.page > max_pages:
                    done = True
                else:
                    self.downloader.put(
                        Request(_nexturl, _next_post, cb=handler, **options))
            else:
                done = True

            if parse_list:
                parse_list(doc)
Example #12
	def fetch_data(self, req):
		""" processes a http request specified by the req object and returns a response object """
		req.normalize(self.scraper)

		accept_error_codes = req.get('accept_error_codes')
		if accept_error_codes is None:
			accept_error_codes = []

		time.sleep(self.scraper.config['delay'])

		opener = req.get('opener')
		if not opener:
			opener = create_opener(use_cookie=False) if req.get('use_cookie') is False else self.opener

		headers = req.get('headers')

		if self.scraper.config.get('use_requests') is True:
			#use requests module instead of urllib2
			return self.requests_fetch_data(req, headers)

		proxy = req.get('proxy')
		
		if proxy and req.get('use_proxy') is not False:
			if req.url.lower().startswith('https://'):
				opener.add_handler(urllib2.ProxyHandler({'https' : proxy.full_address }))
			else:
				opener.add_handler(urllib2.ProxyHandler({'http' : proxy.full_address }))
		

		request = urllib2.Request(req.url, req.post, headers)
			
		tries = req.get('retries', 0)	
		
		status_code = 0
		error_message = ''
		final_url = None	

		if self.scraper.config['debug']:		
			self.logger.debug('loading %s', req.url)

		try:
			
			with contextlib.closing(opener.open(request,  timeout= req.get('timeout', self.scraper.config['timeout']))) as res:
				final_url = res.url
				status_code = res.code


				rawdata = res.read()

				self.resp = res
				
				if 'gzip' in res.headers.get('content-encoding','').lower():
					bytes = zlib.decompress(rawdata, 16+zlib.MAX_WBITS)
				elif 'deflate' in res.headers.get('content-encoding','').lower():	
					bytes = zlib.decompressobj(-zlib.MAX_WBITS).decompress(rawdata)	
				else:
					bytes = rawdata

				encoding = req.get('encoding') or common.DataItem(res.headers.get('content-type') or '').subreg('charset\s*=([^;]+)') or 'utf8'
				content_type = res.headers.get('content-type', '').lower()

				#self.logger.debug('content type: %s, encoding: %s', content_type, encoding)
				
				data = ''

				#default is text data
				
				is_binary_data = req.get('bin') or False
				if 'image' in content_type or 'pdf' in content_type:
					is_binary_data = True

				if  not is_binary_data:
					
					data = bytes.decode(encoding, 'ignore')

					#verify data
					#self.logger.debug('contain: %s', req.get('contain'))
					if req.get('contain') and req.get('contain') not in data:
						raise Exception("invalid html, not contain: %s" % req.get('contain'))

					if req.get('not_contain') and req.get('not_contain') in data:
						raise Exception("invalid html, contain negative string: %s" % req.get('not_contain'))
						
					verify = req.get('verify')
					
					if verify and (not verify(data)):
						raise Exception("invalid html")
				else:
					
					#binary content
					data = bytes		

				
				#log proxy and timestamp into the html file if asked
				if req.get('log_time_and_proxy'):
					
					data += '\n<log id="time_proxy"><time>{time}</time><proxy>{proxy}</proxy></log>'.format(time=time.time(), proxy=proxy if (proxy and req.get('use_proxy') is not False) else '')
	

				return Response(data=data, status=Status(code=status_code, final_url=final_url))

		
		except Exception as e:
			if status_code == 0 and hasattr(e, 'code'):
				status_code = e.code

			if hasattr(e, 'reason'):
				error_message = e.reason
			elif hasattr(e, 'line'):
				error_message = 'BadStatusLine: %s' % e.line
			elif hasattr(e, 'message'):
				error_message = e.message

			if not error_message and hasattr(e, 'args'):
				try:
					error_message = u", ".join([unicode(item) for item in e.args]).replace("''", 'unknown')
				except:
					pass

			if tries > 0 and status_code not in accept_error_codes:
				#retry the request once more
				self.logger.debug('data fetching error: %s %s', status_code if status_code != 0 else '', error_message)
				req.update({'retries': tries - 1})
				return self.fetch_data(req)
			else:
				self.logger.warn('data fetching error: %s %s', status_code if status_code != 0 else '', error_message)
				if 'invalid html' in error_message:
					status_code = 0
				return Response(data=None, status=Status(code=status_code, final_url=final_url, error=error_message))
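A hedged sketch of a request carrying the options this method reads via req.get(); whether Request accepts them all as keyword arguments is an assumption based on the other examples, and 'client' is a hypothetical owner of fetch_data:

    # illustrative only
    req = Request(url='http://example.com/listing?page=2',
                  retries=2,
                  timeout=30,
                  contain='class="listing"',      # treat the response as invalid if this marker is missing
                  accept_error_codes=[404])
    res = client.fetch_data(req)
    print res.status.code, len(res.data or '')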
Example #13
File: db.py Project: igizm0/scrapex
    def export_items(self,
                     dest_file,
                     query=None,
                     limit=None,
                     sort=None,
                     fields=None,
                     include_hidden_fields=False,
                     multicol_fields={},
                     exclude_fields=[]):
        """ 
        @query: None means all items
        @fields: None means all fields
        """
        if os.path.exists(dest_file):
            os.remove(dest_file)

        if not fields:
            fields = self._compile_all_fields(include_hidden_fields,
                                              exclude_fields=exclude_fields)

        format = common.DataItem(dest_file).subreg(
            '\.([a-z]{2,5})$--is').lower()

        rows = []

        query = query or {}

        cnt = self.count_items(query)

        print('cnt: {}'.format(cnt))

        cursor = self._db.items.find(query)

        if sort:
            cursor = cursor.sort(sort)

        if limit:
            cursor = cursor.limit(limit)

        for item in cursor:
            res = []

            for field in fields:
                # value = item.get(field) or ''
                value = item.get(field)
                if value is None:
                    value = ''

                if field in multicol_fields:
                    maxcol = multicol_fields[field]

                    if isinstance(value, list):
                        parts = value
                    else:
                        parts = value.split('|')

                    if len(parts) < maxcol:
                        #normalize
                        for i in xrange(maxcol - len(parts)):
                            parts.append('')

                    for i in xrange(maxcol):
                        res.append('{} {}'.format(field, i + 1))
                        res.append(parts[i])

                else:
                    res.append(field)
                    res.append(value)

            if format == 'csv':
                common.save_csv(dest_file, res)
            else:
                rows.append(res)

        if format == 'xls':
            import excellib
            excellib.save_xls(dest_file, rows)
        elif format == 'xlsx':
            import excellib
            excellib.save_xlsx(dest_file, rows)
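A hedged usage sketch of export_items(); 'db' is a hypothetical instance of the class in db.py and the field names are illustrative. multicol_fields spreads a list or '|'-separated field across a fixed number of columns:

    # illustrative only
    db.export_items('output.xlsx',
                    query={'category': 'restaurants'},
                    sort=[('name', 1)],
                    fields=['name', 'address', 'phone', 'images'],
                    multicol_fields={'images': 3})   # emits "images 1" .. "images 3" columns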