def normalize_post_data(data, encoding='utf-8'):
    if isinstance(data, six.text_type):
        return make_str(data, encoding=encoding)
    elif isinstance(data, six.binary_type):
        return data
    else:
        # it calls `normalize_http_values()`
        return make_str(smart_urlencode(data, encoding))
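# Illustrative usage sketch (not from the source): assumes `make_str` and
# `smart_urlencode` behave as used above, i.e. every branch returns bytes.
#   normalize_post_data(u'name=value')      -> b'name=value' (encoded with `encoding`)
#   normalize_post_data(b'name=raw')        -> b'name=raw' (passed through untouched)
#   normalize_post_data({'name': u'value'}) -> url-encoded bytes built by smart_urlencode()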
def save_list(lst, path):
    """
    Save items from list to the file.
    """
    with open(path, 'wb') as out:
        lines = []
        for item in lst:
            if isinstance(item, (six.text_type, six.binary_type)):
                lines.append(make_str(item))
            else:
                lines.append(make_str(json.dumps(item)))
        out.write(b'\n'.join(lines) + b'\n')
def api_info(self):
    #if result and self.cache_reader_service:
    #    result = result and (
    #        not self.cache_reader_service.input_queue.qsize()
    #        and not self.cache_writer_service.input_queue.qsize()
    info = {
        'counters': self.spider.stat.counters,
        'collections': dict((x, len(y)) for (x, y)
                            in self.spider.stat.collections.items()),
        'thread_number': self.spider.thread_number,
        'parser_pool_size': self.spider.parser_pool_size,
        'task_queue': self.spider.task_queue.size(),
        'task_dispatcher_input_queue': (
            self.spider.task_dispatcher.input_queue.qsize()),
        'parser_service_input_queue': (
            self.spider.parser_service.input_queue.qsize()),
        'network_service_active_threads': (
            self.spider.network_service.get_active_threads_number()),
        'cache_reader_input_queue': (
            self.spider.cache_reader_service.input_queue.size()
            if self.spider.cache_reader_service else '--'),
        'cache_writer_input_queue': (
            self.spider.cache_writer_service.input_queue.qsize()
            if self.spider.cache_writer_service else '--'),
    }
    content = make_str(json.dumps(info))
    self.response(content=content)
def api_info(self):
    #if result and self.cache_reader_service:
    #    result = result and (
    #        not self.cache_reader_service.input_queue.qsize()
    #        and not self.cache_writer_service.input_queue.qsize()
    info = {
        'counters': self.spider.stat.counters,
        'collections': dict((x, len(y)) for (x, y)
                            in self.spider.stat.collections.items()),
        'thread_number': self.spider.thread_number,
        'parser_pool_size': self.spider.parser_pool_size,
        'task_queue': self.spider.task_queue.size(),
        'task_dispatcher_input_queue': (
            self.spider.task_dispatcher.input_queue.qsize()
        ),
        'parser_service_input_queue': (
            self.spider.parser_service.input_queue.qsize()
        ),
        'network_service_active_threads': (
            self.spider.network_service.get_active_threads_number()
        ),
        'cache_reader_input_queue': (
            self.spider.cache_reader_service.input_queue.size()
            if self.spider.cache_reader_service else '--'
        ),
        'cache_writer_input_queue': (
            self.spider.cache_writer_service.input_queue.qsize()
            if self.spider.cache_writer_service else '--'
        ),
    }
    content = make_str(json.dumps(info))
    self.response(content=content)
def test_normalize_bytes(self):
    fh, path = mkstemp()
    dumper = MysqlCSVDumper(path)
    dumper.add_row((make_str('фуу'),))
    dumper.close()
    self.assertTrue(u'фуу' in open(path, encoding='utf-8').read())
    os.unlink(path)
def prepare_response(self, grab):
    #if self.body_file:
    #    self.body_file.close()
    response = Response()

    head = ''
    for key, val in self._response.getheaders().items():
        head += '%s: %s\r\n' % (key, val)
    head += '\r\n'
    response.head = make_str(head, encoding='latin', errors='ignore')

    #if self.body_path:
    #    response.body_path = self.body_path
    #else:
    #    response.body = b''.join(self.response_body_chunks)
    if self._request._response_path:
        response.body_path = self._request._response_path
        # Quick dirty hack; actually, response is fully read into memory
        self._request._response_file.write(self._response.read())  #data)
        self._request._response_file.close()
    else:
        if self._request.body_maxsize is not None:
            #if self.response_body_bytes_read > self.config_body_maxsize:
            #    logger.debug('Response body max size limit reached: %s' %
            #                 self.config_body_maxsize)
            response.body = self._response.read(self._request.body_maxsize)
        else:
            response.body = self._response.read()  #data

    # Clear memory
    #self.response_header_chunks = []

    response.code = self._response.status
    #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
    #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
    #response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
    #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
    #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
    #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
    #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

    response.url = (self._response.get_redirect_location()
                    or self._request.url)

    import email.message
    hdr = email.message.Message()
    for key, val in self._response.getheaders().items():
        hdr[key] = val
    response.parse(charset=grab.config['document_charset'], headers=hdr)

    jar = self.extract_cookiejar(self._response, self._request)
    response.cookies = CookieManager(jar)

    # We no longer need the cookies stored in the
    # curl instance, so drop them
    #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
    return response
def api_info(self):
    info = {
        "counters": self.spider.stat.counters,
        "collections": dict((x, len(y)) for (x, y)
                            in self.spider.stat.collections.items()),
        "thread_number": self.spider.thread_number,
        "parser_pool_size": self.spider.parser_pool_size,
    }
    content = make_str(json.dumps(info))
    self.response(content=content)
def normalize_url(url):
    # https://tools.ietf.org/html/rfc3986
    url = make_unicode(url)
    if RE_NOT_SAFE_CHAR.search(url):
        parts = list(urlsplit(url))
        # Scheme
        pass
        # Network location (user:pass@hostname)
        if RE_NON_ALPHA_DIGIT_NETLOC.search(parts[1]):
            parts[1] = parts[1].encode('idna')
        # Path
        parts[2] = quote(make_str(parts[2]), safe=RESERVED_CHARS)
        # Query
        parts[3] = quote(make_str(parts[3]), safe=RESERVED_CHARS)
        # Fragment
        parts[4] = quote(make_str(parts[4]), safe=RESERVED_CHARS)
        return urlunsplit(map(make_unicode, parts))
    return url
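# Illustrative usage sketch (assumes the RE_* regexes flag any non-ASCII or
# otherwise unsafe character, as used above):
#   normalize_url(u'http://пример.com/путь?q=значение')
#       -> netloc IDNA-encoded, path/query/fragment percent-encoded,
#          result returned as a unicode string
#   normalize_url('http://example.com/plain')
#       -> returned unchanged, since it contains no unsafe characters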
def prepare_response(self, grab):
    #if self.body_file:
    #    self.body_file.close()
    response = Response()

    head = ''
    for key, val in self._response.getheaders().items():
        head += '%s: %s\r\n' % (key, val)
    head += '\r\n'
    response.head = make_str(head, encoding='latin', errors='ignore')

    #if self.body_path:
    #    response.body_path = self.body_path
    #else:
    #    response.body = b''.join(self.response_body_chunks)
    if self._request._response_path:
        response.body_path = self._request._response_path
        # Quick dirty hack; actually, response is fully read into memory
        self._request._response_file.write(self._response.read())  #data)
        self._request._response_file.close()
    else:
        if self._request.body_maxsize is not None:
            #if self.response_body_bytes_read > self.config_body_maxsize:
            #    logger.debug('Response body max size limit reached: %s' %
            #                 self.config_body_maxsize)
            response.body = self._response.read(self._request.body_maxsize)
        else:
            response.body = self._response.read()  #data

    # Clear memory
    #self.response_header_chunks = []

    response.code = self._response.status
    #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
    #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
    #response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
    #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
    #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
    #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
    #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

    response.url = self._response.get_redirect_location() or self._request.url

    import email.message
    hdr = email.message.Message()
    for key, val in self._response.getheaders().items():
        hdr[key] = val
    response.parse(charset=grab.config['document_charset'], headers=hdr)

    jar = self.extract_cookiejar(self._response, self._request)
    response.cookies = CookieManager(jar)

    # We no longer need the cookies stored in the
    # curl instance, so drop them
    #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
    return response
def api_info(self):
    info = {
        'counters': self.spider.stat.counters,
        'collections': dict((x, len(y)) for (x, y)
                            in self.spider.stat.collections.items()),
        'thread_number': self.spider.thread_number,
        'parser_pool_size': self.spider.parser_pool_size,
    }
    content = make_str(json.dumps(info))
    self.response(content=content)
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        if req.proxy_type == 'socks5':
            pool = SOCKSProxyManager(proxy_url)  # , proxy_headers=headers)
        else:
            pool = ProxyManager(proxy_url, proxy_headers=headers)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        # The read timeout is not total response time timeout
        # It is the timeout on read of next data chunk from the server
        # Total response timeout is handled by Grab
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        req.op_started = time.time()
        res = pool.urlopen(req_method, req_url,
                           body=req.data, timeout=timeout,
                           retries=retry, headers=req.headers,
                           preload_content=False)
    except exceptions.ReadTimeoutError as ex:
        raise error.GrabTimeoutError('ReadTimeoutError', ex)
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError('ConnectTimeoutError', ex)
    except exceptions.ProtocolError as ex:
        # TODO:
        # the code
        # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
        # fails
        # with error TypeError: 'OSError' object is not subscriptable
        raise error.GrabConnectionError('ProtocolError', ex)

    # WTF?
    self.request_head = b''
    self.request_body = b''
    self.request_log = b''

    self._response = res
def process(item):
    key, value = item

    # Process key
    if isinstance(key, six.text_type):
        key = make_str(key, encoding=charset)

    # Process value
    if ignore_classes and isinstance(value, ignore_classes):
        pass
    elif isinstance(value, six.text_type):
        value = make_str(value, encoding=charset)
    elif value is None:
        value = b''
    elif isinstance(value, (list, tuple)):
        for subval in value:
            for res in process((key, subval)):
                yield res
        return
    else:
        value = make_str(value)

    yield key, value
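# Illustrative sketch (not from the source): `process()` is a generator that
# closes over `charset` and `ignore_classes` from its enclosing function.
# List/tuple values are expanded into one (key, value) pair per element and
# everything else is coerced to bytes, so on Python 3 one would expect:
#   list(process(('tag', ['a', 'b'])))  -> [(b'tag', b'a'), (b'tag', b'b')]
#   list(process(('num', 1)))           -> [(b'num', b'1')]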
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        try:
            pool = ProxyManager(proxy_url, proxy_headers=headers)
        except ProxySchemeUnknown:
            raise GrabMisuseError('Urllib3 transport does '
                                  'not support %s proxies' % req.proxy_type)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        # The read timeout is not total response time timeout
        # It is the timeout on read of next data chunk from the server
        # Total response timeout is handled by Grab
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        req.op_started = time.time()
        res = pool.urlopen(req_method, req_url,
                           body=req.data, timeout=timeout,
                           retries=retry, headers=req.headers,
                           preload_content=False)
    except exceptions.ReadTimeoutError as ex:
        raise error.GrabTimeoutError('Read timeout')
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError('Could not create connection')
    except exceptions.ProtocolError as ex:
        raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

    # WTF?
    self.request_head = b''
    self.request_body = b''
    self.request_log = b''

    self._response = res
def execute_task_handler(self, res, handler):
    """
    Apply `handler` function to the network result.

    If the network result is a failure, submit the task
    to the network task queue again.
    """

    try:
        handler_name = handler.__name__
    except AttributeError:
        handler_name = 'NONE'
    if (res['task'].get('raw') or (
            res['ok'] and self.valid_response_code(res['grab'].response.code,
                                                   res['task']))):
        try:
            with self.stat.log_time('response_handler'):
                with self.stat.log_time('response_handler.%s' % handler_name):
                    result = handler(res['grab'], res['task'])
                    if result is None:
                        pass
                    else:
                        for item in result:
                            self.process_handler_result(item, res['task'])
        except NoDataHandler as ex:
            raise
        except Exception as ex:
            self.process_handler_error(handler_name, ex, res['task'])
        else:
            self.stat.inc('spider:task-%s-ok' % res['task'].name)
    else:
        # Log the error
        if res['ok']:
            msg = res['emsg'] = 'HTTP %s' % res['grab'].response.code
        else:
            msg = res['emsg']
        self.stat.inc('spider:network-error-%s' %
                      make_str(res['emsg'][:20], errors='ignore'))
        logger.error(u'Network error: %s' %
                     make_unicode(msg, errors='ignore'))

        # Try to repeat the same network query
        if self.network_try_limit > 0:
            task = res['task']
            task.refresh_cache = True
            # Should use task.grab_config or backup of grab_config
            task.setup_grab_config(res['grab_config_backup'])
            self.add_task(task)
def execute_task_handler(self, res, handler):
    """
    Apply `handler` function to the network result.

    If the network result is a failure, submit the task
    to the network task queue again.
    """

    try:
        handler_name = handler.__name__
    except AttributeError:
        handler_name = 'NONE'
    if (res['task'].get('raw') or
            (res['ok'] and self.valid_response_code(
                res['grab'].response.code, res['task']))):
        try:
            with self.save_timer('response_handler'):
                with self.save_timer('response_handler.%s' % handler_name):
                    result = handler(res['grab'], res['task'])
                    if result is None:
                        pass
                    else:
                        for item in result:
                            self.process_handler_result(item, res['task'])
        except NoDataHandler as ex:
            raise
        except Exception as ex:
            self.process_handler_error(handler_name, ex, res['task'])
        else:
            self.inc_count('task-%s-ok' % res['task'].name)
    else:
        # Log the error
        if res['ok']:
            msg = res['emsg'] = 'HTTP %s' % res['grab'].response.code
        else:
            msg = res['emsg']
        self.inc_count('network-error-%s' %
                       make_str(res['emsg'][:20], errors='ignore'))
        logger.error(u'Network error: %s' %
                     make_unicode(msg, errors='ignore'))

        # Try to repeat the same network query
        if self.network_try_limit > 0:
            task = res['task']
            task.refresh_cache = True
            # Should use task.grab_config or backup of grab_config
            task.setup_grab_config(res['grab_config_backup'])
            self.add_task(task)
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        try:
            pool = ProxyManager(proxy_url, proxy_headers=headers)
        except ProxySchemeUnknown:
            raise GrabMisuseError('Urllib3 transport does '
                                  'not support %s proxies' % req.proxy_type)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        res = pool.urlopen(req_method, req_url,
                           body=req.data, timeout=timeout,
                           retries=retry, headers=req.headers,
                           preload_content=False)
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError('Could not create connection')
    except exceptions.ProtocolError as ex:
        raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

    # WTF?
    self.request_head = ''
    self.request_body = ''
    self.request_log = ''

    self._response = res
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            auth = "%s@" % req.proxy_userpwd
        else:
            auth = ""
        proxy_url = "%s://%s%s" % (req.proxy_type, auth, req.proxy)
        pool = ProxyManager(proxy_url)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        # req_headers = dict((make_unicode(x), make_unicode(y))
        #                    for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        res = pool.urlopen(
            req_method,
            req_url,
            body=req.data,
            timeout=timeout,
            retries=retry,
            headers=req.headers,
            preload_content=False,
        )
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError("Could not create connection")
    except exceptions.ProtocolError as ex:
        raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

    # WTF?
    self.request_head = ""
    self.request_body = ""
    self.request_log = ""

    self._response = res
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            auth = '%s@' % req.proxy_userpwd
        else:
            auth = ''
        proxy_url = '%s://%s%s' % (req.proxy_type, auth, req.proxy)
        pool = ProxyManager(proxy_url)
    else:
        pool = self.pool
    try:
        retry = Retry(redirect=False, connect=False, read=False)
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        res = pool.urlopen(req_method, req_url,
                           body=req.data, timeout=timeout,
                           retries=retry, headers=req.headers,
                           preload_content=False)
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError('Could not create connection')
    except exceptions.ProtocolError as ex:
        raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

    # WTF?
    self.request_head = ''
    self.request_body = ''
    self.request_log = ''

    self._response = res
def process_config(self, grab):
    """
    Setup curl instance with values from ``self.config``.
    """

    # Copy some config for future usage
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex), grab.config['url']))
    # py3 hack
    if not six.PY3:
        request_url = make_str(request_url)
    self.curl.setopt(pycurl.URL, request_url)

    # Actually, FOLLOWLOCATION should always be 0
    # because redirect logic takes place in Grab.request method
    # BUT in Grab.Spider this method is not invoked
    # So, in Grab.Spider we still rely on Grab internal ability
    # to follow 30X Locations
    self.curl.setopt(pycurl.FOLLOWLOCATION,
                     1 if grab.config['follow_location'] else 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    #self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
    # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)
    self.curl.setopt(pycurl.NOSIGNAL, 1)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

    if grab.config['body_inmemory']:
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    else:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError(
                'Option body_storage_dir is not defined')
        self.setup_body_file(
            grab.config['body_storage_dir'],
            grab.config['body_storage_filename'],
            create_dir=grab.config['body_storage_create_dir'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

    if grab.config['verbose_logging']:
        self.verbose_logging = True

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = generate_user_agent()

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''
    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

    # Ignore SSL errors
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

    # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
    # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

    if grab.request_method in ('POST', 'PUT'):
        if (grab.config['post'] is None and
                grab.config['multipart_post'] is None):
            raise GrabMisuseError('Neither `post` or `multipart_post`'
                                  ' options was specified for the %s'
                                  ' request' % grab.request_method)

    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'],
                          six.string_types):
                raise error.GrabMisuseError(
                    'multipart_post option could not be a string')
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            # py3 hack
            if six.PY3:
                post_items = decode_pairs(post_items,
                                          grab.config['charset'])
            self.curl.setopt(pycurl.HTTPPOST,
                             process_upload_items(post_items))
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            # if six.PY3:
            #     post_data = smart_unicode(post_data,
            #                               grab.config['charset'])
            self.curl.setopt(pycurl.POSTFIELDS, post_data)
        else:
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            # py3 hack
            # if six.PY3:
            #     data = data.encode('utf-8')
            # else:
            raise error.GrabMisuseError(
                'Value of post option could be only '
                'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PUT')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                'Value of post option could be only byte '
                'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    elif grab.request_method == 'OPTIONS':
        data = grab.config['post']
        if data is not None:
            if isinstance(data, six.text_type):
                raise error.GrabMisuseError(
                    'Value of post option could be only byte '
                    'string if PATCH method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'OPTIONS')
    else:
        raise error.GrabMisuseError('Invalid method: %s' %
                                    grab.request_method)

    headers = grab.config['common_headers']
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    # This is required to avoid some problems
    headers.update({'Expect': ''})
    header_tuples = [str('%s: %s' % x) for x in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

    self.process_cookie_options(grab, request_url)

    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        self.curl.setopt(pycurl.PROXY, '')

    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD,
                         str(grab.config['proxy_userpwd']))

    if grab.config['proxy_type']:
        key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
        self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

    if grab.config['encoding']:
        if ('gzip' in grab.config['encoding'] and
                'zlib' not in pycurl.version):
            raise error.GrabMisuseError(
                'You can not use gzip encoding because '
                'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE,
                         grab.config['reject_file_size'])
def process_config(self, grab):
    req = Request(data=None)

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex),
                         make_unicode(grab.config['url'], errors='ignore')))
    req.url = request_url

    method = grab.detect_request_method()
    req.method = make_str(method)

    req.config_body_maxsize = grab.config['body_maxsize']
    req.config_nobody = grab.config['nobody']

    req.timeout = grab.config['timeout']
    req.connect_timeout = grab.config['connect_timeout']

    extra_headers = {}

    # Body processing
    if grab.config['body_inmemory']:
        pass
    else:
        if not grab.config['body_storage_dir']:
            raise GrabMisuseError('Option body_storage_dir is not defined')
        file_, path_ = self.setup_body_file(
            grab.config['body_storage_dir'],
            grab.config['body_storage_filename'],
            create_dir=grab.config['body_storage_create_dir'])
        req.response_file = file_
        req.response_path = path_

    if grab.config['multipart_post'] is not None:
        post_data = grab.config['multipart_post']
        if isinstance(post_data, six.binary_type):
            pass
        elif isinstance(post_data, six.text_type):
            raise GrabMisuseError('Option multipart_post data'
                                  ' does not accept unicode.')
        else:
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            post_items = decode_pairs(post_items, grab.config['charset'])
            post_items = process_upload_items(post_items)
            post_data, content_type = encode_multipart_formdata(post_items)
            extra_headers['Content-Type'] = content_type
        extra_headers['Content-Length'] = len(post_data)
        req.data = post_data
    elif grab.config['post'] is not None:
        post_data = normalize_post_data(grab.config['post'],
                                        grab.config['charset'])
        # py3 hack
        # if six.PY3:
        #     post_data = smart_unicode(post_data,
        #                               grab.config['charset'])
        extra_headers['Content-Length'] = len(post_data)
        req.data = post_data

    if method in ('POST', 'PUT'):
        if (grab.config['post'] is None and
                grab.config['multipart_post'] is None):
            raise GrabMisuseError('Neither `post` or `multipart_post`'
                                  ' options was specified for the %s'
                                  ' request' % method)
    # Proxy
    if grab.config['proxy']:
        req.proxy = grab.config['proxy']

    if grab.config['proxy_userpwd']:
        req.proxy_userpwd = grab.config['proxy_userpwd']

    if grab.config['proxy_type']:
        req.proxy_type = grab.config['proxy_type']
    else:
        req.proxy_type = 'http'

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = generate_user_agent()
    extra_headers['User-Agent'] = grab.config['user_agent']

    # Headers
    headers = extra_headers
    headers.update(grab.config['common_headers'])
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    req.headers = headers

    # Cookies
    self.process_cookie_options(grab, req)

    self._request = req
def request(self): req = self._request if req.proxy: if req.proxy_userpwd: headers = make_headers(proxy_basic_auth=req.proxy_userpwd) else: headers = None proxy_url = '%s://%s' % (req.proxy_type, req.proxy) if req.proxy_type == 'socks5': pool = SOCKSProxyManager( proxy_url, cert_reqs='CERT_REQUIRED', ca_certs=certifi.where() ) # , proxy_headers=headers) else: pool = ProxyManager( proxy_url, proxy_headers=headers, cert_reqs='CERT_REQUIRED', ca_certs=certifi.where() ) else: pool = self.pool try: # Retries can be disabled by passing False: # http://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry # Do not use False because of warning: # Converted retries value: False -> Retry(total=False, # connect=None, read=None, redirect=0, status=None) retry = Retry( total=False, connect=False, read=False, redirect=0, status=None, ) # The read timeout is not total response time timeout # It is the timeout on read of next data chunk from the server # Total response timeout is handled by Grab timeout = Timeout(connect=req.connect_timeout, read=req.timeout) #req_headers = dict((make_unicode(x), make_unicode(y)) # for (x, y) in req.headers.items()) if six.PY3: req_url = make_unicode(req.url) req_method = make_unicode(req.method) else: req_url = make_str(req.url) req_method = req.method req.op_started = time.time() try: res = pool.urlopen(req_method, req_url, body=req.data, timeout=timeout, retries=retry, headers=req.headers, preload_content=False) except UnicodeError as ex: raise error.GrabConnectionError('GrabInvalidUrl', ex) except exceptions.ReadTimeoutError as ex: raise error.GrabTimeoutError('ReadTimeoutError', ex) except exceptions.ConnectTimeoutError as ex: raise error.GrabConnectionError('ConnectTimeoutError', ex) except exceptions.ProtocolError as ex: # TODO: # the code # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1]) # fails # with error TypeError: 'OSError' object is not subscriptable raise error.GrabConnectionError('ProtocolError', ex) except exceptions.SSLError as ex: raise error.GrabConnectionError('SSLError', ex) # WTF? self.request_head = b'' self.request_body = b'' self.request_log = b'' self._response = res
def prepare_response(self, grab): try: #if self.body_file: # self.body_file.close() response = Response() head = '' for key, val in self._response.getheaders().items(): head += '%s: %s\r\n' % (key, val) head += '\r\n' response.head = make_str(head, encoding='latin', errors='ignore') #if self.body_path: # response.body_path = self.body_path #else: # response.body = b''.join(self.response_body_chunks) def read_with_timeout(): if self._request.config_nobody: return b'' maxsize = self._request.config_body_maxsize chunks = [] default_chunk_size = 10000 if maxsize: chunk_size = min(default_chunk_size, maxsize + 1) else: chunk_size = default_chunk_size total_size = 0 while True: chunk = self._response.read(chunk_size) if chunk: total_size += len(chunk) chunks.append(chunk) if maxsize and total_size > maxsize: logger.debug( 'Response body max size limit reached: %s' % maxsize) else: break if self._request.timeout: if time.time( ) - self._request.op_started > self._request.timeout: raise GrabTimeoutError data = b''.join(chunks) if maxsize: data = data[:maxsize] return data if self._request._response_path: response.body_path = self._request._response_path # FIXME: Quick dirty hack, actullay, response is fully read into memory self._request._response_file.write(read_with_timeout()) self._request._response_file.close() else: response.body = read_with_timeout() # Clear memory #self.response_header_chunks = [] response.code = self._response.status #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME) #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME) #response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME) #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD) #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD) #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD) #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP) response.url = self._response.get_redirect_location( ) or self._request.url import email.message hdr = email.message.Message() for key, val in self._response.getheaders().items(): hdr[key] = val response.parse(charset=grab.config['document_charset'], headers=hdr) jar = self.extract_cookiejar() #self._response, self._request) response.cookies = CookieManager(jar) # We do not need anymore cookies stored in the # curl instance so drop them #self.curl.setopt(pycurl.COOKIELIST, 'ALL') return response finally: self._response.release_conn()
def process_config(self, grab): """ Setup curl instance with values from ``self.config``. """ # Copy some config for future usage self.config_nobody = grab.config['nobody'] self.config_body_maxsize = grab.config['body_maxsize'] try: request_url = normalize_url(grab.config['url']) except Exception as ex: raise error.GrabInvalidUrl(u'%s: %s' % (six.text_type(ex), grab.config['url'])) # py3 hack if not six.PY3: request_url = make_str(request_url) self.curl.setopt(pycurl.URL, request_url) # 30* redirects are handled by Grab self.curl.setopt(pycurl.FOLLOWLOCATION, 0) self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit']) self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout']) self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout']) #self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4) # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0) if not grab.config['connection_reuse']: self.curl.setopt(pycurl.FRESH_CONNECT, 1) self.curl.setopt(pycurl.FORBID_REUSE, 1) self.curl.setopt(pycurl.NOSIGNAL, 1) self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor) if grab.config['body_inmemory']: self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor) else: if not grab.config['body_storage_dir']: raise error.GrabMisuseError( 'Option body_storage_dir is not defined') self.setup_body_file( grab.config['body_storage_dir'], grab.config['body_storage_filename'], create_dir=grab.config['body_storage_create_dir']) self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor) if grab.config['verbose_logging']: self.verbose_logging = True # User-Agent if grab.config['user_agent'] is None: if grab.config['user_agent_file'] is not None: with open(grab.config['user_agent_file']) as inf: lines = inf.read().splitlines() grab.config['user_agent'] = random.choice(lines) else: grab.config['user_agent'] = generate_user_agent() # If value is None then set empty string # None is not acceptable because in such case # pycurl will set its default user agent "PycURL/x.xx.x" if not grab.config['user_agent']: grab.config['user_agent'] = '' self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent']) if grab.config['debug']: self.curl.setopt(pycurl.VERBOSE, 1) self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor) # Ignore SSL errors self.curl.setopt(pycurl.SSL_VERIFYPEER, 0) self.curl.setopt(pycurl.SSL_VERIFYHOST, 0) # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3) if grab.request_method in ('POST', 'PUT'): if (grab.config['post'] is None and grab.config['multipart_post'] is None): raise GrabMisuseError('Neither `post` or `multipart_post`' ' options was specified for the %s' ' request' % grab.request_method) if grab.request_method == 'POST': self.curl.setopt(pycurl.POST, 1) if grab.config['multipart_post']: if isinstance(grab.config['multipart_post'], six.string_types): raise error.GrabMisuseError( 'multipart_post option could not be a string') post_items = normalize_http_values( grab.config['multipart_post'], charset=grab.config['charset'], ignore_classes=(UploadFile, UploadContent), ) # py3 hack #if six.PY3: # post_items = decode_pairs(post_items, # grab.config['charset']) self.curl.setopt(pycurl.HTTPPOST, process_upload_items(post_items)) elif grab.config['post']: post_data = normalize_post_data(grab.config['post'], grab.config['charset']) # py3 hack # if six.PY3: # post_data = smart_unicode(post_data, # grab.config['charset']) self.curl.setopt(pycurl.POSTFIELDS, post_data) else: 
self.curl.setopt(pycurl.POSTFIELDS, '') elif grab.request_method == 'PUT': data = grab.config['post'] if isinstance(data, six.text_type): # py3 hack # if six.PY3: # data = data.encode('utf-8') # else: raise error.GrabMisuseError( 'Value of post option could be only ' 'byte string if PUT method is used') self.curl.setopt(pycurl.UPLOAD, 1) self.curl.setopt(pycurl.CUSTOMREQUEST, 'PUT') self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read) self.curl.setopt(pycurl.INFILESIZE, len(data)) elif grab.request_method == 'PATCH': data = grab.config['post'] if isinstance(data, six.text_type): raise error.GrabMisuseError( 'Value of post option could be only byte ' 'string if PATCH method is used') self.curl.setopt(pycurl.UPLOAD, 1) self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH') self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read) self.curl.setopt(pycurl.INFILESIZE, len(data)) elif grab.request_method == 'DELETE': self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE') elif grab.request_method == 'HEAD': self.curl.setopt(pycurl.NOBODY, 1) elif grab.request_method == 'UPLOAD': self.curl.setopt(pycurl.UPLOAD, 1) elif grab.request_method == 'GET': self.curl.setopt(pycurl.HTTPGET, 1) elif grab.request_method == 'OPTIONS': data = grab.config['post'] if data is not None: if isinstance(data, six.text_type): raise error.GrabMisuseError( 'Value of post option could be only byte ' 'string if PATCH method is used') self.curl.setopt(pycurl.UPLOAD, 1) self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read) self.curl.setopt(pycurl.INFILESIZE, len(data)) self.curl.setopt(pycurl.CUSTOMREQUEST, 'OPTIONS') else: raise error.GrabMisuseError('Invalid method: %s' % grab.request_method) headers = grab.config['common_headers'] if grab.config['headers']: headers.update(grab.config['headers']) # This is required to avoid some problems headers.update({'Expect': ''}) header_tuples = [str('%s: %s' % x) for x in headers.items()] self.curl.setopt(pycurl.HTTPHEADER, header_tuples) self.process_cookie_options(grab, request_url) if grab.config['referer']: self.curl.setopt(pycurl.REFERER, str(grab.config['referer'])) if grab.config['proxy']: self.curl.setopt(pycurl.PROXY, str(grab.config['proxy'])) else: self.curl.setopt(pycurl.PROXY, '') if grab.config['proxy_userpwd']: self.curl.setopt(pycurl.PROXYUSERPWD, str(grab.config['proxy_userpwd'])) if grab.config['proxy_type']: key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper() self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key)) if grab.config['encoding']: if ('gzip' in grab.config['encoding'] and 'zlib' not in pycurl.version): raise error.GrabMisuseError( 'You can not use gzip encoding because ' 'pycurl was built without zlib support') self.curl.setopt(pycurl.ENCODING, grab.config['encoding']) if grab.config['userpwd']: self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd'])) if grab.config.get('interface') is not None: self.curl.setopt(pycurl.INTERFACE, grab.config['interface']) if grab.config.get('reject_file_size') is not None: self.curl.setopt(pycurl.MAXFILESIZE, grab.config['reject_file_size'])
def test_make_str_from_str(self):
    self.assertEqual(
        make_str(u'фыва'.encode('utf-8')),
        u'фыва'.encode('utf-8'),
    )
def build_hash(self, url):
    with self.spider.stat.log_time('cache.read.build_hash'):
        utf_url = make_str(url)
        return sha1(utf_url).hexdigest()
def normalize_unicode(value, charset='utf-8'):
    return make_str(value, encoding=charset, errors='ignore')
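# Illustrative note (assumes make_str() encodes like unicode.encode(), as the
# tests below suggest): despite its name, normalize_unicode() returns a byte
# string and silently drops characters that do not fit the target charset.
#   normalize_unicode(u'фыва')                  -> b'\xd1\x84\xd1\x8b\xd0\xb2\xd0\xb0'
#   normalize_unicode(u'фыва', charset='ascii') -> b'' (all characters ignored)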
def get_random_filename(self):
    return md5(make_str(str(time.time()))).hexdigest()[:10]
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False, disable_report=False,
         disable_default_logs=False,
         *args, **kwargs):
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def build_hash(self, url):
    utf_url = make_str(url)
    return sha1(utf_url).hexdigest()
def process_config(self, grab): req = Request(data=None) try: request_url = normalize_url(grab.config["url"]) except Exception as ex: raise error.GrabInvalidUrl(u"%s: %s" % (six.text_type(ex), grab.config["url"])) req.url = request_url method = grab.detect_request_method() req.method = make_str(method) req.body_maxsize = grab.config["body_maxsize"] if grab.config["nobody"]: req.body_maxsize = 0 req.timeout = grab.config["timeout"] req.connect_timeout = grab.config["connect_timeout"] extra_headers = {} # Body processing if grab.config["body_inmemory"]: pass else: if not grab.config["body_storage_dir"]: raise GrabMisuseError("Option body_storage_dir is not defined") file_, path_ = self.setup_body_file( grab.config["body_storage_dir"], grab.config["body_storage_filename"], create_dir=grab.config["body_storage_create_dir"], ) req._response_file = file_ req._response_path = path_ if grab.config["multipart_post"] is not None: post_data = grab.config["multipart_post"] if isinstance(post_data, six.binary_type): pass elif isinstance(post_data, six.text_type): raise GrabMisuseError("Option multipart_post data" " does not accept unicode.") else: post_items = normalize_http_values( grab.config["multipart_post"], charset=grab.config["charset"], ignore_classes=(UploadFile, UploadContent), ) # if six.PY3: post_items = decode_pairs(post_items, grab.config["charset"]) post_items = process_upload_items(post_items) post_data, content_type = encode_multipart_formdata(post_items) extra_headers["Content-Type"] = content_type extra_headers["Content-Length"] = len(post_data) req.data = post_data elif grab.config["post"] is not None: post_data = normalize_post_data(grab.config["post"], grab.config["charset"]) # py3 hack # if six.PY3: # post_data = smart_unicode(post_data, # grab.config['charset']) extra_headers["Content-Length"] = len(post_data) req.data = post_data if method in ("POST", "PUT"): if grab.config["post"] is None and grab.config["multipart_post"] is None: raise GrabMisuseError( "Neither `post` or `multipart_post`" " options was specified for the %s" " request" % method ) # Proxy if grab.config["proxy"]: req.proxy = grab.config["proxy"] if grab.config["proxy_userpwd"]: req.proxy_userpwd = grab.config["proxy_userpwd"] if grab.config["proxy_type"]: req.proxy_type = grab.config["proxy_type"] else: req.proxy_type = "http" # User-Agent if grab.config["user_agent"] is None: if grab.config["user_agent_file"] is not None: with open(grab.config["user_agent_file"]) as inf: lines = inf.read().splitlines() grab.config["user_agent"] = random.choice(lines) else: grab.config["user_agent"] = generate_user_agent() extra_headers["User-Agent"] = grab.config["user_agent"] # Headers headers = extra_headers headers.update(grab.config["common_headers"]) if grab.config["headers"]: headers.update(grab.config["headers"]) req.headers = headers # Cookies self.process_cookie_options(grab, req) self._request = req
def request(self):
    req = self._request

    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        if req.proxy_type == 'socks5':
            pool = SOCKSProxyManager(
                proxy_url, cert_reqs='CERT_REQUIRED',
                ca_certs=certifi.where())  # , proxy_headers=headers)
        else:
            pool = ProxyManager(proxy_url, proxy_headers=headers,
                                cert_reqs='CERT_REQUIRED',
                                ca_certs=certifi.where())
    else:
        pool = self.pool
    with self.wrap_transport_error():
        # Retries can be disabled by passing False:
        # http://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry
        # Do not use False because of warning:
        # Converted retries value: False -> Retry(total=False,
        # connect=None, read=None, redirect=0, status=None)
        retry = Retry(
            total=False,
            connect=False,
            read=False,
            redirect=0,
            status=None,
        )
        # The read timeout is not total response time timeout
        # It is the timeout on read of next data chunk from the server
        # Total response timeout is handled by Grab
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        #req_headers = dict((make_unicode(x), make_unicode(y))
        #                   for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        req.op_started = time.time()
        try:
            res = pool.urlopen(req_method, req_url,
                               body=req.data, timeout=timeout,
                               retries=retry, headers=req.headers,
                               preload_content=False)
        except UnicodeError as ex:
            raise error.GrabConnectionError('GrabInvalidUrl', ex)
        #except exceptions.ReadTimeoutError as ex:
        #    raise error.GrabTimeoutError('ReadTimeoutError', ex)
        #except exceptions.ConnectTimeoutError as ex:
        #    raise error.GrabConnectionError('ConnectTimeoutError', ex)
        #except exceptions.ProtocolError as ex:
        #    # TODO:
        #    # the code
        #    # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
        #    # fails
        #    # with error TypeError: 'OSError' object is not subscriptable
        #    raise error.GrabConnectionError('ProtocolError', ex)
        #except exceptions.SSLError as ex:
        #    raise error.GrabConnectionError('SSLError', ex)

    # WTF?
    self.request_head = b''
    self.request_body = b''
    self.request_log = b''

    self._response = res
def process_config(self, grab): """ Setup curl instance with values from ``self.config``. """ # Copy some config for future usage self.config_nobody = grab.config["nobody"] self.config_body_maxsize = grab.config["body_maxsize"] try: request_url = normalize_url(grab.config["url"]) except Exception as ex: raise error.GrabInvalidUrl(u"%s: %s" % (six.text_type(ex), grab.config["url"])) # py3 hack if not six.PY3: request_url = make_str(request_url) self.curl.setopt(pycurl.URL, request_url) # Actually, FOLLOWLOCATION should always be 0 # because redirect logic takes place in Grab.request method # BUT in Grab.Spider this method is not invoked # So, in Grab.Spider we still rely on Grab internal ability # to follow 30X Locations self.curl.setopt(pycurl.FOLLOWLOCATION, 1 if grab.config["follow_location"] else 0) self.curl.setopt(pycurl.MAXREDIRS, grab.config["redirect_limit"]) self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config["connect_timeout"]) self.curl.setopt(pycurl.TIMEOUT, grab.config["timeout"]) self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4) # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0) if not grab.config["connection_reuse"]: self.curl.setopt(pycurl.FRESH_CONNECT, 1) self.curl.setopt(pycurl.FORBID_REUSE, 1) self.curl.setopt(pycurl.NOSIGNAL, 1) self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor) if grab.config["body_inmemory"]: self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor) else: if not grab.config["body_storage_dir"]: raise error.GrabMisuseError("Option body_storage_dir is not defined") self.setup_body_file( grab.config["body_storage_dir"], grab.config["body_storage_filename"], create_dir=grab.config["body_storage_create_dir"], ) self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor) if grab.config["verbose_logging"]: self.verbose_logging = True # User-Agent if grab.config["user_agent"] is None: if grab.config["user_agent_file"] is not None: with open(grab.config["user_agent_file"]) as inf: lines = inf.read().splitlines() grab.config["user_agent"] = random.choice(lines) else: grab.config["user_agent"] = generate_user_agent() # If value is None then set empty string # None is not acceptable because in such case # pycurl will set its default user agent "PycURL/x.xx.x" if not grab.config["user_agent"]: grab.config["user_agent"] = "" self.curl.setopt(pycurl.USERAGENT, grab.config["user_agent"]) if grab.config["debug"]: self.curl.setopt(pycurl.VERBOSE, 1) self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor) # Ignore SSL errors self.curl.setopt(pycurl.SSL_VERIFYPEER, 0) self.curl.setopt(pycurl.SSL_VERIFYHOST, 0) # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3) if grab.request_method in ("POST", "PUT"): if grab.config["post"] is None and grab.config["multipart_post"] is None: raise GrabMisuseError( "Neither `post` or `multipart_post`" " options was specified for the %s" " request" % grab.request_method ) if grab.request_method == "POST": self.curl.setopt(pycurl.POST, 1) if grab.config["multipart_post"]: if isinstance(grab.config["multipart_post"], six.string_types): raise error.GrabMisuseError("multipart_post option could not be a string") post_items = normalize_http_values( grab.config["multipart_post"], charset=grab.config["charset"], ignore_classes=(UploadFile, UploadContent), ) # py3 hack if six.PY3: post_items = decode_pairs(post_items, grab.config["charset"]) # import pdb; pdb.set_trace() self.curl.setopt(pycurl.HTTPPOST, process_upload_items(post_items)) elif 
grab.config["post"]: post_data = normalize_post_data(grab.config["post"], grab.config["charset"]) # py3 hack # if six.PY3: # post_data = smart_unicode(post_data, # grab.config['charset']) self.curl.setopt(pycurl.POSTFIELDS, post_data) else: self.curl.setopt(pycurl.POSTFIELDS, "") elif grab.request_method == "PUT": data = grab.config["post"] if isinstance(data, six.text_type): # py3 hack # if six.PY3: # data = data.encode('utf-8') # else: raise error.GrabMisuseError("Value of post option could be only " "byte string if PUT method is used") self.curl.setopt(pycurl.UPLOAD, 1) self.curl.setopt(pycurl.CUSTOMREQUEST, "PUT") self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read) self.curl.setopt(pycurl.INFILESIZE, len(data)) elif grab.request_method == "PATCH": data = grab.config["post"] if isinstance(data, six.text_type): raise error.GrabMisuseError("Value of post option could be only byte " "string if PATCH method is used") self.curl.setopt(pycurl.UPLOAD, 1) self.curl.setopt(pycurl.CUSTOMREQUEST, "PATCH") self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read) self.curl.setopt(pycurl.INFILESIZE, len(data)) elif grab.request_method == "DELETE": self.curl.setopt(pycurl.CUSTOMREQUEST, "DELETE") elif grab.request_method == "HEAD": self.curl.setopt(pycurl.NOBODY, 1) elif grab.request_method == "UPLOAD": self.curl.setopt(pycurl.UPLOAD, 1) elif grab.request_method == "GET": self.curl.setopt(pycurl.HTTPGET, 1) elif grab.request_method == "OPTIONS": data = grab.config["post"] if data is not None: if isinstance(data, six.text_type): raise error.GrabMisuseError( "Value of post option could be only byte " "string if PATCH method is used" ) self.curl.setopt(pycurl.UPLOAD, 1) self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read) self.curl.setopt(pycurl.INFILESIZE, len(data)) self.curl.setopt(pycurl.CUSTOMREQUEST, "OPTIONS") else: raise error.GrabMisuseError("Invalid method: %s" % grab.request_method) headers = grab.config["common_headers"] if grab.config["headers"]: headers.update(grab.config["headers"]) # This is required to avoid some problems headers.update({"Expect": ""}) header_tuples = [str("%s: %s" % x) for x in headers.items()] self.curl.setopt(pycurl.HTTPHEADER, header_tuples) self.process_cookie_options(grab, request_url) if grab.config["referer"]: self.curl.setopt(pycurl.REFERER, str(grab.config["referer"])) if grab.config["proxy"]: self.curl.setopt(pycurl.PROXY, str(grab.config["proxy"])) else: self.curl.setopt(pycurl.PROXY, "") if grab.config["proxy_userpwd"]: self.curl.setopt(pycurl.PROXYUSERPWD, str(grab.config["proxy_userpwd"])) if grab.config["proxy_type"]: key = "PROXYTYPE_%s" % grab.config["proxy_type"].upper() self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key)) if grab.config["encoding"]: if "gzip" in grab.config["encoding"] and "zlib" not in pycurl.version: raise error.GrabMisuseError( "You can not use gzip encoding because " "pycurl was built without zlib support" ) self.curl.setopt(pycurl.ENCODING, grab.config["encoding"]) if grab.config["userpwd"]: self.curl.setopt(pycurl.USERPWD, str(grab.config["userpwd"])) if grab.config.get("interface") is not None: self.curl.setopt(pycurl.INTERFACE, grab.config["interface"]) if grab.config.get("reject_file_size") is not None: self.curl.setopt(pycurl.MAXFILESIZE, grab.config["reject_file_size"])
def prepare_response(self, grab):
    # Information about urllib3:
    # On Python 2 urllib3 headers contain the original binary data
    # On Python 3 urllib3 headers are converted to unicode
    # using latin encoding
    try:
        #if self.body_file:
        #    self.body_file.close()
        response = Document()

        head = ''
        for key, val in self._response.getheaders().items():
            if six.PY2:
                key = key.decode('utf-8', errors='ignore')
                val = val.decode('utf-8', errors='ignore')
            if six.PY3:
                key = key.encode('latin').decode('utf-8', errors='ignore')
                val = val.encode('latin').decode('utf-8', errors='ignore')
            head += '%s: %s\r\n' % (key, val)
        head += '\r\n'
        response.head = make_str(head, encoding='utf-8')

        #if self.body_path:
        #    response.body_path = self.body_path
        #else:
        #    response.body = b''.join(self.response_body_chunks)

        def read_with_timeout():
            if self._request.config_nobody:
                return b''
            maxsize = self._request.config_body_maxsize
            chunks = []
            default_chunk_size = 10000
            if maxsize:
                chunk_size = min(default_chunk_size, maxsize + 1)
            else:
                chunk_size = default_chunk_size
            bytes_read = 0
            while True:
                chunk = self._response.read(chunk_size)
                if chunk:
                    bytes_read += len(chunk)
                    chunks.append(chunk)
                    if maxsize and bytes_read > maxsize:
                        # reached limit on bytes to read
                        break
                else:
                    break
                if self._request.timeout:
                    if (time.time() - self._request.op_started
                            > self._request.timeout):
                        raise GrabTimeoutError
            data = b''.join(chunks)
            if maxsize:
                data = data[:maxsize]
            return data

        if self._request.response_path:
            response.body_path = self._request.response_path
            # FIXME: Quick dirty hack; actually, response is fully
            # read into memory
            self._request.response_file.write(read_with_timeout())
            self._request.response_file.close()
        else:
            response.body = read_with_timeout()

        # Clear memory
        #self.response_header_chunks = []

        response.code = self._response.status
        #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
        #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
        #response.name_lookup_time = (self.curl
        #                             .getinfo(pycurl.NAMELOOKUP_TIME))
        #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
        #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
        #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
        #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

        response.url = (self._response.get_redirect_location()
                        or self._request.url)

        import email.message
        hdr = email.message.Message()
        for key, val in self._response.getheaders().items():
            if six.PY2:
                key = key.decode('utf-8', errors='ignore')
                val = val.decode('utf-8', errors='ignore')
            if six.PY3:
                key = key.encode('latin').decode('utf-8', errors='ignore')
                val = val.encode('latin').decode('utf-8', errors='ignore')
            #if key == 'Location':
            #    import pdb; pdb.set_trace()
            hdr[key] = val
        response.parse(charset=grab.config['document_charset'], headers=hdr)

        jar = self.extract_cookiejar()  # self._response, self._request)
        response.cookies = CookieManager(jar)

        # We no longer need the cookies stored in the
        # curl instance, so drop them
        #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return response
    finally:
        self._response.release_conn()
def process_config(self, grab):
    req = Request(data=None)

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex), grab.config['url']))
    req.url = request_url

    method = grab.detect_request_method()
    req.method = make_str(method)

    req.body_maxsize = grab.config['body_maxsize']
    if grab.config['nobody']:
        req.body_maxsize = 0

    req.timeout = grab.config['timeout']
    req.connect_timeout = grab.config['connect_timeout']

    extra_headers = {}

    # Body processing
    if grab.config['body_inmemory']:
        pass
    else:
        if not grab.config['body_storage_dir']:
            raise GrabMisuseError(
                'Option body_storage_dir is not defined')
        file_, path_ = self.setup_body_file(
            grab.config['body_storage_dir'],
            grab.config['body_storage_filename'],
            create_dir=grab.config['body_storage_create_dir'])
        req._response_file = file_
        req._response_path = path_

    if grab.config['multipart_post'] is not None:
        post_data = grab.config['multipart_post']
        if isinstance(post_data, six.binary_type):
            pass
        elif isinstance(post_data, six.text_type):
            raise GrabMisuseError('Option multipart_post data'
                                  ' does not accept unicode.')
        else:
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            #if six.PY3:
            post_items = decode_pairs(post_items, grab.config['charset'])
            post_items = process_upload_items(post_items)
            post_data, content_type = encode_multipart_formdata(post_items)
            extra_headers['Content-Type'] = content_type
        extra_headers['Content-Length'] = len(post_data)
        req.data = post_data
    elif grab.config['post'] is not None:
        post_data = normalize_post_data(grab.config['post'],
                                        grab.config['charset'])
        # py3 hack
        # if six.PY3:
        #     post_data = smart_unicode(post_data,
        #                               grab.config['charset'])
        extra_headers['Content-Length'] = len(post_data)
        req.data = post_data

    if method in ('POST', 'PUT'):
        if (grab.config['post'] is None and
                grab.config['multipart_post'] is None):
            raise GrabMisuseError('Neither `post` or `multipart_post`'
                                  ' options was specified for the %s'
                                  ' request' % method)
    # Proxy
    if grab.config['proxy']:
        req.proxy = grab.config['proxy']

    if grab.config['proxy_userpwd']:
        req.proxy_userpwd = grab.config['proxy_userpwd']

    if grab.config['proxy_type']:
        req.proxy_type = grab.config['proxy_type']
    else:
        req.proxy_type = 'http'

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = generate_user_agent()
    extra_headers['User-Agent'] = grab.config['user_agent']

    # Headers
    headers = extra_headers
    headers.update(grab.config['common_headers'])
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    req.headers = headers

    # Cookies
    self.process_cookie_options(grab, req)

    self._request = req
def main(spider_name, thread_number=None,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False, disable_report=False,
         api_port=None, parser_pool_size=2, grab_log_file=None,
         network_log_file=None, network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
def test_make_str_from_none(self):
    self.assertEqual(
        make_str(None),
        b'None',
    )
def test_make_str_cp1251_from_unicode(self):
    self.assertEqual(
        make_str(u'фыва', encoding='cp1251'),
        u'фыва'.encode('cp1251'),
    )
def build_hash(self, url):
    with self.spider.timer.log_time('cache.read.build_hash'):
        utf_url = make_str(url)
        return sha1(utf_url).hexdigest()
def test_make_str_from_int(self):
    self.assertEqual(
        make_str(1),
        b'1',
    )