def get_netscape_cookie_spec(self, cookie, request_host):
    # FIXME: cookie.domain can no longer be None,
    # so request_host is not needed anymore
    host = make_unicode(cookie.domain) or request_host
    if cookie.get_nonstandard_attr('HttpOnly'):
        host = '#HttpOnly_' + host
    items = [
        host,
        u'TRUE',
        make_unicode(cookie.path),
        u'TRUE' if cookie.secure else u'FALSE',
        make_unicode(str(
            cookie.expires if cookie.expires else YEAR_2030_EPOCH_TIME
        )),
        make_unicode(cookie.name),
        make_unicode(cookie.value),
    ]
    return (u'\t'.join(items)).encode('utf-8')

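# The line produced above follows the Netscape cookies.txt format:
# seven tab-separated fields per cookie (the expires value below is
# a hypothetical epoch timestamp, shown for illustration only):
#
#   domain  subdomain-flag  path  secure-flag  expires  name  value
#   example.com  TRUE  /  FALSE  1893456000  sid  abc123
#
# Lines for HttpOnly cookies are prefixed with '#HttpOnly_', which
# cookie-file parsers such as curl's treat specially rather than as
# a plain comment.
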
def execute_task_handler(self, res, handler):
    """
    Apply the `handler` function to the network result.

    If the network result is a failure, submit the task
    to the network task queue again.
    """
    try:
        handler_name = handler.__name__
    except AttributeError:
        handler_name = 'NONE'
    if (res['task'].get('raw') or (
            res['ok'] and self.valid_response_code(
                res['grab'].response.code, res['task']))):
        try:
            with self.stat.log_time('response_handler'):
                with self.stat.log_time('response_handler.%s'
                                        % handler_name):
                    result = handler(res['grab'], res['task'])
                    if result is not None:
                        for item in result:
                            self.process_handler_result(item, res['task'])
        except NoDataHandler:
            raise
        except Exception as ex:
            self.process_handler_error(handler_name, ex, res['task'])
        else:
            self.stat.inc('spider:task-%s-ok' % res['task'].name)
    else:
        # Log the error
        if res['ok']:
            msg = res['emsg'] = 'HTTP %s' % res['grab'].response.code
        else:
            msg = res['emsg']
        self.stat.inc('spider:network-error-%s'
                      % make_str(res['emsg'][:20], errors='ignore'))
        logger.error(u'Network error: %s'
                     % make_unicode(msg, errors='ignore'))
        # Try to repeat the same network query
        if self.network_try_limit > 0:
            task = res['task']
            task.refresh_cache = True
            # Should use task.grab_config or a backup of grab_config
            task.setup_grab_config(res['grab_config_backup'])
            self.add_task(task)

def execute_task_handler(self, res, handler):
    """
    Apply the `handler` function to the network result.

    If the network result is a failure, submit the task
    to the network task queue again.
    """
    try:
        handler_name = handler.__name__
    except AttributeError:
        handler_name = 'NONE'
    if (res['task'].get('raw') or (
            res['ok'] and self.valid_response_code(
                res['grab'].response.code, res['task']))):
        try:
            with self.save_timer('response_handler'):
                with self.save_timer('response_handler.%s'
                                     % handler_name):
                    result = handler(res['grab'], res['task'])
                    if result is not None:
                        for item in result:
                            self.process_handler_result(item, res['task'])
        except NoDataHandler:
            raise
        except Exception as ex:
            self.process_handler_error(handler_name, ex, res['task'])
        else:
            self.inc_count('task-%s-ok' % res['task'].name)
    else:
        # Log the error
        if res['ok']:
            msg = res['emsg'] = 'HTTP %s' % res['grab'].response.code
        else:
            msg = res['emsg']
        self.inc_count('network-error-%s'
                       % make_str(res['emsg'][:20], errors='ignore'))
        logger.error(u'Network error: %s'
                     % make_unicode(msg, errors='ignore'))
        # Try to repeat the same network query
        if self.network_try_limit > 0:
            task = res['task']
            task.refresh_cache = True
            # Should use task.grab_config or a backup of grab_config
            task.setup_grab_config(res['grab_config_backup'])
            self.add_task(task)

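# For reference: both variants of execute_task_handler above expect
# `res` to be a dict carrying (at least) these keys, as used in the
# code itself: 'task' (the Task object), 'ok' (bool network success
# flag), 'grab' (Grab instance holding the response), 'emsg' (error
# message string) and 'grab_config_backup' (a saved copy of the grab
# config used when the task is resubmitted).
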
def normalize_url(url):
    # https://tools.ietf.org/html/rfc3986
    url = make_unicode(url)
    if RE_NOT_SAFE_CHAR.search(url):
        parts = list(urlsplit(url))
        # Scheme: nothing to do
        # Network location (user:pass@hostname)
        if RE_NON_ALPHA_DIGIT_NETLOC.search(parts[1]):
            parts[1] = parts[1].encode('idna')
        # Path
        parts[2] = quote(make_str(parts[2]), safe=RESERVED_CHARS)
        # Query
        parts[3] = quote(make_str(parts[3]), safe=RESERVED_CHARS)
        # Fragment
        parts[4] = quote(make_str(parts[4]), safe=RESERVED_CHARS)
        return urlunsplit(map(make_unicode, parts))
    return url

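# A quick behavioural sketch (assuming RE_NOT_SAFE_CHAR matches the
# space character and RESERVED_CHARS contains the RFC 3986 reserved
# set, as the code above implies):
#
#   >>> normalize_url(u'http://example.com/a b?q=c d')
#   u'http://example.com/a%20b?q=c%20d'
#
# URLs containing only safe characters are returned unchanged.
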
def find_link(self, href_pattern, make_absolute=True):
    """
    Find a link in the response body whose href value
    matches ``href_pattern``.

    Returns the found url or None.
    """
    if make_absolute:
        self.tree.make_links_absolute(self.response.url)
    if isinstance(href_pattern, six.text_type):
        raise GrabMisuseError('Method `find_link` accepts only '
                              'byte-string argument')
    href_pattern = make_unicode(href_pattern)
    for elem, attr, link, pos in self.tree.iterlinks():
        if elem.tag == 'a' and href_pattern in link:
            return link
    return None

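# Usage sketch (hypothetical page; find_link is exposed on the
# document object of a Grab instance):
#
#   g = Grab()
#   g.go('http://example.com/')
#   url = g.doc.find_link(b'/downloads/')
#   # -> first <a href> containing '/downloads/', absolutized
#   #    against the response url, or None if nothing matches
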
def prepare_response(self, grab):
    if self.body_file:
        self.body_file.close()
    response = Response()

    # py3 hack
    if six.PY3:
        bytes_head = b''.join(self.response_head_chunks)
    else:
        bytes_head = ''.join(self.response_head_chunks)

    response.head = make_unicode(bytes_head, errors='ignore')
    if self.body_path:
        response.body_path = self.body_path
    else:
        response.body = b''.join(self.response_body_chunks)

    # Clear memory
    self.response_head_chunks = []
    self.response_body_chunks = []

    response.code = self.curl.getinfo(pycurl.HTTP_CODE)
    response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
    response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
    response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
    response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
    response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
    response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
    response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)
    response.url = self.curl.getinfo(pycurl.EFFECTIVE_URL)

    response.parse(charset=grab.config['document_charset'])
    response.cookies = CookieManager(self.extract_cookiejar())

    # We no longer need the cookies stored in the curl
    # instance, so drop them
    self.curl.setopt(pycurl.COOKIELIST, 'ALL')
    return response

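# Note on the pycurl timings collected above: libcurl's TOTAL_TIME
# covers the whole transfer and already includes the NAMELOOKUP and
# CONNECT phases, so the individual values are not meant to be
# summed together.
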
def request(self):
    req = self._request
    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        if req.proxy_type == 'socks5':
            pool = SOCKSProxyManager(
                proxy_url, cert_reqs='CERT_REQUIRED',
                ca_certs=certifi.where())  # , proxy_headers=headers)
        else:
            pool = ProxyManager(
                proxy_url, proxy_headers=headers,
                cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    else:
        pool = self.pool
    with self.wrap_transport_error():
        # Retries can be disabled by passing False:
        # http://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry
        # Do not use plain False because of the warning:
        #   Converted retries value: False -> Retry(total=False,
        #   connect=None, read=None, redirect=0, status=None)
        retry = Retry(
            total=False,
            connect=False,
            read=False,
            redirect=0,
            status=None,
        )
        # The read timeout is not a total-response-time timeout;
        # it is the timeout on reading the next data chunk from the
        # server. The total response timeout is handled by Grab.
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        # req_headers = dict((make_unicode(x), make_unicode(y))
        #                    for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        req.op_started = time.time()
        try:
            res = pool.urlopen(req_method, req_url,
                               body=req.data, timeout=timeout,
                               retries=retry, headers=req.headers,
                               preload_content=False)
        except UnicodeError as ex:
            raise error.GrabConnectionError('GrabInvalidUrl', ex)
        # except exceptions.ReadTimeoutError as ex:
        #     raise error.GrabTimeoutError('ReadTimeoutError', ex)
        # except exceptions.ConnectTimeoutError as ex:
        #     raise error.GrabConnectionError('ConnectTimeoutError', ex)
        # except exceptions.ProtocolError as ex:
        #     # TODO: the code
        #     #   raise error.GrabConnectionError(ex.args[1][0],
        #     #                                   ex.args[1][1])
        #     # fails with TypeError: 'OSError' object is not
        #     # subscriptable
        #     raise error.GrabConnectionError('ProtocolError', ex)
        # except exceptions.SSLError as ex:
        #     raise error.GrabConnectionError('SSLError', ex)

    # WTF?
    self.request_head = b''
    self.request_body = b''
    self.request_log = b''

    self._response = res

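# Design note on the Retry object above: urllib3-level retries are
# fully disabled (total/connect/read all False) and redirect=0 also
# stops urllib3 from following redirects, so Grab retains control of
# both the retry policy and redirect handling at a higher level.
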
def process_config(self, grab):
    req = Request(data=None)

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex),
                         make_unicode(grab.config['url'],
                                      errors='ignore')))
    req.url = request_url

    method = grab.detect_request_method()
    req.method = make_str(method)

    req.config_body_maxsize = grab.config['body_maxsize']
    req.config_nobody = grab.config['nobody']
    req.timeout = grab.config['timeout']
    req.connect_timeout = grab.config['connect_timeout']

    extra_headers = {}

    # Body processing
    if grab.config['body_inmemory']:
        pass
    else:
        if not grab.config['body_storage_dir']:
            raise GrabMisuseError('Option body_storage_dir is not defined')
        file_, path_ = self.setup_body_file(
            grab.config['body_storage_dir'],
            grab.config['body_storage_filename'],
            create_dir=grab.config['body_storage_create_dir'])
        req.response_file = file_
        req.response_path = path_

    if grab.config['multipart_post'] is not None:
        post_data = grab.config['multipart_post']
        if isinstance(post_data, six.binary_type):
            pass
        elif isinstance(post_data, six.text_type):
            raise GrabMisuseError('Option multipart_post data'
                                  ' does not accept unicode.')
        else:
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            post_items = decode_pairs(post_items, grab.config['charset'])
            post_items = process_upload_items(post_items)
            post_data, content_type = encode_multipart_formdata(post_items)
            extra_headers['Content-Type'] = content_type
        extra_headers['Content-Length'] = len(post_data)
        req.data = post_data
    elif grab.config['post'] is not None:
        post_data = normalize_post_data(grab.config['post'],
                                        grab.config['charset'])
        # py3 hack
        # if six.PY3:
        #     post_data = smart_unicode(post_data,
        #                               grab.config['charset'])
        extra_headers['Content-Length'] = len(post_data)
        req.data = post_data

    if method in ('POST', 'PUT'):
        if (grab.config['post'] is None and
                grab.config['multipart_post'] is None):
            raise GrabMisuseError('Neither `post` nor `multipart_post`'
                                  ' option was specified for the %s'
                                  ' request' % method)

    # Proxy
    if grab.config['proxy']:
        req.proxy = grab.config['proxy']
    if grab.config['proxy_userpwd']:
        req.proxy_userpwd = grab.config['proxy_userpwd']
    if grab.config['proxy_type']:
        req.proxy_type = grab.config['proxy_type']
    else:
        req.proxy_type = 'http'

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = generate_user_agent()
    extra_headers['User-Agent'] = grab.config['user_agent']

    # Headers
    headers = extra_headers
    headers.update(grab.config['common_headers'])
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    req.headers = headers

    # Cookies
    self.process_cookie_options(grab, req)

    self._request = req

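# Usage sketch showing some of the config options consumed above
# (values are hypothetical; UploadFile is the upload wrapper already
# referenced in the code):
#
#   g = Grab()
#   g.setup(url='http://example.com/upload',
#           multipart_post=[('file', UploadFile('/tmp/data.bin'))],
#           proxy='127.0.0.1:8080', proxy_type='socks5',
#           timeout=30, connect_timeout=5)
#   g.request()
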
def test_make_unicode_from_none(self):
    self.assertEqual(
        make_unicode(None),
        u'None',
    )

def test_make_unicode_from_int(self):
    self.assertEqual(
        make_unicode(1),
        u'1',
    )

def test_make_unicode_from_unicode(self):
    self.assertEqual(
        make_unicode(u'фыва'),
        u'фыва',
    )

def test_make_unicode_from_str(self):
    self.assertEqual(
        make_unicode(u'фыва'.encode('utf-8')),
        u'фыва',
    )

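# The tests above pin down make_unicode()'s contract: bytes are
# decoded (utf-8 by default), unicode passes through unchanged, and
# any other object is converted via the text constructor. A minimal
# sketch consistent with those tests (the shipped helper also takes
# encoding/errors parameters; this is an illustration, not the real
# implementation):

def make_unicode_sketch(value, encoding='utf-8', errors='strict'):
    if isinstance(value, six.binary_type):
        return value.decode(encoding, errors)
    if isinstance(value, six.text_type):
        return value
    return six.text_type(value)
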
def request(self):
    req = self._request
    if req.proxy:
        if req.proxy_userpwd:
            headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
        else:
            headers = None
        proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
        if req.proxy_type == 'socks5':
            pool = SOCKSProxyManager(
                proxy_url, cert_reqs='CERT_REQUIRED',
                ca_certs=certifi.where()
            )  # , proxy_headers=headers)
        else:
            pool = ProxyManager(
                proxy_url, proxy_headers=headers,
                cert_reqs='CERT_REQUIRED', ca_certs=certifi.where()
            )
    else:
        pool = self.pool
    try:
        # Retries can be disabled by passing False:
        # http://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry
        # Do not use plain False because of the warning:
        #   Converted retries value: False -> Retry(total=False,
        #   connect=None, read=None, redirect=0, status=None)
        retry = Retry(
            total=False,
            connect=False,
            read=False,
            redirect=0,
            status=None,
        )
        # The read timeout is not a total-response-time timeout;
        # it is the timeout on reading the next data chunk from the
        # server. The total response timeout is handled by Grab.
        timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
        # req_headers = dict((make_unicode(x), make_unicode(y))
        #                    for (x, y) in req.headers.items())
        if six.PY3:
            req_url = make_unicode(req.url)
            req_method = make_unicode(req.method)
        else:
            req_url = make_str(req.url)
            req_method = req.method
        req.op_started = time.time()
        try:
            res = pool.urlopen(req_method, req_url,
                               body=req.data, timeout=timeout,
                               retries=retry, headers=req.headers,
                               preload_content=False)
        except UnicodeError as ex:
            raise error.GrabConnectionError('GrabInvalidUrl', ex)
    except exceptions.ReadTimeoutError as ex:
        raise error.GrabTimeoutError('ReadTimeoutError', ex)
    except exceptions.ConnectTimeoutError as ex:
        raise error.GrabConnectionError('ConnectTimeoutError', ex)
    except exceptions.ProtocolError as ex:
        # TODO: the code
        #   raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
        # fails with TypeError: 'OSError' object is not subscriptable
        raise error.GrabConnectionError('ProtocolError', ex)
    except exceptions.SSLError as ex:
        raise error.GrabConnectionError('SSLError', ex)

    # WTF?
    self.request_head = b''
    self.request_body = b''
    self.request_log = b''

    self._response = res
