def process_config(self, grab):
    """
    Build a ``Request`` from ``grab.config`` and store it in
    ``self._request``.

    Raises ``error.GrabInvalidUrl`` when the configured URL cannot be
    normalized. Raises ``GrabMisuseError`` when the response body must be
    written to a file but ``body_storage_dir`` is empty, when
    ``multipart_post`` is a unicode string, or when a POST/PUT request has
    neither ``post`` nor ``multipart_post`` set.
    """
    cfg = grab.config
    request = Request(data=None)

    try:
        normalized_url = normalize_url(cfg['url'])
    except Exception as ex:
        details = (six.text_type(ex), cfg['url'])
        raise error.GrabInvalidUrl(u'%s: %s' % details)
    request.url = normalized_url

    method = grab.detect_request_method()
    request.method = make_str(method)

    # The ``nobody`` option forces the body size limit down to zero
    maxsize = cfg['body_maxsize']
    if cfg['nobody']:
        maxsize = 0
    request.body_maxsize = maxsize

    request.timeout = cfg['timeout']
    request.connect_timeout = cfg['connect_timeout']

    # Headers computed here; config headers are merged on top further down
    headers = {}

    # Response body storage: a file is prepared unless the body
    # is kept in memory
    if not cfg['body_inmemory']:
        if not cfg['body_storage_dir']:
            raise GrabMisuseError('Option body_storage_dir is not defined')
        body_file, body_path = self.setup_body_file(
            cfg['body_storage_dir'],
            cfg['body_storage_filename'],
            create_dir=cfg['body_storage_create_dir'])
        request._response_file = body_file
        request._response_path = body_path

    # Request payload: ``multipart_post`` wins over ``post``
    multipart = cfg['multipart_post']
    if multipart is not None:
        payload = multipart
        if isinstance(payload, six.text_type):
            raise GrabMisuseError('Option multipart_post data'
                                  ' does not accept unicode.')
        if not isinstance(payload, six.binary_type):
            # Not a ready-made byte string: encode the items
            items = normalize_http_values(
                multipart,
                charset=cfg['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            items = decode_pairs(items, cfg['charset'])
            items = process_upload_items(items)
            payload, content_type = encode_multipart_formdata(items)
            headers['Content-Type'] = content_type
        headers['Content-Length'] = len(payload)
        request.data = payload
    elif cfg['post'] is not None:
        payload = normalize_post_data(cfg['post'], cfg['charset'])
        headers['Content-Length'] = len(payload)
        request.data = payload

    if (method in ('POST', 'PUT')
            and cfg['post'] is None
            and cfg['multipart_post'] is None):
        raise GrabMisuseError('Neither `post` or `multipart_post`'
                              ' options was specified for the %s'
                              ' request' % method)

    # Proxy
    if cfg['proxy']:
        request.proxy = cfg['proxy']
    if cfg['proxy_userpwd']:
        request.proxy_userpwd = cfg['proxy_userpwd']
    request.proxy_type = cfg['proxy_type'] if cfg['proxy_type'] else 'http'

    # User-Agent: chosen once and written back to the config
    if cfg['user_agent'] is None:
        agent_file = cfg['user_agent_file']
        if agent_file is None:
            cfg['user_agent'] = generate_user_agent()
        else:
            with open(agent_file) as inf:
                candidates = inf.read().splitlines()
            cfg['user_agent'] = random.choice(candidates)
    headers['User-Agent'] = cfg['user_agent']

    # Headers: common headers override computed ones,
    # explicit ``headers`` override both
    headers.update(cfg['common_headers'])
    if cfg['headers']:
        headers.update(cfg['headers'])
    request.headers = headers

    # Cookies
    self.process_cookie_options(grab, request)

    self._request = request
def test_normalize_post_data_non_text(self):
    # Integer values and lists of integers are serialized as repeated
    # ``key=value`` pairs in a byte string.
    pairs = [('bar', 1), ('bar', [3, 4])]
    expected = b'bar=1&bar=3&bar=4'
    self.assertEqual(normalize_post_data(pairs), expected)
def process_config(self, grab):
    """
    Setup curl instance with values from ``grab.config``.

    Reads the request options from ``grab.config`` and
    ``grab.request_method`` and applies them to ``self.curl`` with
    ``setopt`` calls. May write a chosen user agent back into
    ``grab.config['user_agent']``.

    Raises ``error.GrabInvalidUrl`` when the URL cannot be normalized and
    ``GrabMisuseError`` on inconsistent options (missing body storage
    directory, missing POST/PUT data, unicode payloads, unknown request
    method, gzip requested without zlib support).
    """

    # Copy some config for future usage
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex), grab.config['url']))

    # py3 hack
    if not six.PY3:
        request_url = make_str(request_url)

    self.curl.setopt(pycurl.URL, request_url)

    # 30* redirects are handled by Grab
    self.curl.setopt(pycurl.FOLLOWLOCATION, 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)

    self.curl.setopt(pycurl.NOSIGNAL, 1)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

    if grab.config['body_inmemory']:
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    else:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError(
                'Option body_storage_dir is not defined')
        self.setup_body_file(
            grab.config['body_storage_dir'],
            grab.config['body_storage_filename'],
            create_dir=grab.config['body_storage_create_dir'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

    if grab.config['verbose_logging']:
        self.verbose_logging = True

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = generate_user_agent()

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''

    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

    # Ignore SSL errors
    # NOTE(review): this disables certificate and host verification for
    # every request; kept as-is because callers may rely on it.
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

    # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
    # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

    if grab.request_method in ('POST', 'PUT'):
        if (grab.config['post'] is None and
                grab.config['multipart_post'] is None):
            raise GrabMisuseError('Neither `post` or `multipart_post`'
                                  ' options was specified for the %s'
                                  ' request' % grab.request_method)

    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'], six.string_types):
                raise error.GrabMisuseError(
                    'multipart_post option could not be a string')
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            self.curl.setopt(pycurl.HTTPPOST,
                             process_upload_items(post_items))
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            self.curl.setopt(pycurl.POSTFIELDS, post_data)
        else:
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                'Value of post option could be only '
                'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PUT')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                'Value of post option could be only byte '
                'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    elif grab.request_method == 'OPTIONS':
        # The request body is optional for OPTIONS
        data = grab.config['post']
        if data is not None:
            if isinstance(data, six.text_type):
                # The message used to name PATCH here (copy/paste slip)
                raise error.GrabMisuseError(
                    'Value of post option could be only byte '
                    'string if OPTIONS method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'OPTIONS')
    else:
        raise error.GrabMisuseError('Invalid method: %s' %
                                    grab.request_method)

    # Work on a copy: updating grab.config['common_headers'] in place
    # would leak per-request headers (and the forced ``Expect`` entry)
    # into the shared config and therefore into later requests.
    headers = dict(grab.config['common_headers'])
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    # This is required to avoid some problems
    headers.update({'Expect': ''})
    header_tuples = [str('%s: %s' % x) for x in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

    self.process_cookie_options(grab, request_url)

    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        # Explicitly set an empty proxy value when none is configured
        self.curl.setopt(pycurl.PROXY, '')

    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD,
                         str(grab.config['proxy_userpwd']))

    if grab.config['proxy_type']:
        # Map e.g. "socks5" to the pycurl.PROXYTYPE_SOCKS5 constant
        key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
        self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

    if grab.config['encoding']:
        if ('gzip' in grab.config['encoding'] and
                'zlib' not in pycurl.version):
            raise error.GrabMisuseError(
                'You can not use gzip encoding because '
                'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE,
                         grab.config['reject_file_size'])
def test_normalize_post_data_bytes(self):
    # A byte string is expected to come back unchanged.
    raw = u'фыва'.encode('utf-8')
    self.assertEqual(normalize_post_data(raw), raw)
def process_config(self, grab):
    """
    Build a ``Request`` from ``grab.config`` and store it in
    ``self._request``.

    Raises ``error.GrabInvalidUrl`` when the configured URL cannot be
    normalized. Raises ``GrabMisuseError`` when the response body must be
    written to a file but ``body_storage_dir`` is empty, when
    ``multipart_post`` is a unicode string, or when a POST/PUT request has
    neither ``post`` nor ``multipart_post`` set.
    """
    cfg = grab.config
    request = Request(data=None)

    try:
        normalized_url = normalize_url(cfg['url'])
    except Exception as ex:
        # The raw URL is converted with errors='ignore' for the message
        shown_url = make_unicode(cfg['url'], errors='ignore')
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex), shown_url))
    request.url = normalized_url

    method = grab.detect_request_method()
    request.method = make_str(method)

    # Body limits are copied onto the request object
    request.config_body_maxsize = cfg['body_maxsize']
    request.config_nobody = cfg['nobody']

    request.timeout = cfg['timeout']
    request.connect_timeout = cfg['connect_timeout']

    # Headers computed here; config headers are merged on top further down
    headers = {}

    # Response body storage: a file is prepared unless the body
    # is kept in memory
    if not cfg['body_inmemory']:
        if not cfg['body_storage_dir']:
            raise GrabMisuseError('Option body_storage_dir is not defined')
        body_file, body_path = self.setup_body_file(
            cfg['body_storage_dir'],
            cfg['body_storage_filename'],
            create_dir=cfg['body_storage_create_dir'])
        request.response_file = body_file
        request.response_path = body_path

    # Request payload: ``multipart_post`` wins over ``post``
    multipart = cfg['multipart_post']
    if multipart is not None:
        payload = multipart
        if isinstance(payload, six.text_type):
            raise GrabMisuseError('Option multipart_post data'
                                  ' does not accept unicode.')
        if not isinstance(payload, six.binary_type):
            # Not a ready-made byte string: encode the items
            items = normalize_http_values(
                multipart,
                charset=cfg['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            items = decode_pairs(items, cfg['charset'])
            items = process_upload_items(items)
            payload, content_type = encode_multipart_formdata(items)
            headers['Content-Type'] = content_type
        headers['Content-Length'] = len(payload)
        request.data = payload
    elif cfg['post'] is not None:
        payload = normalize_post_data(cfg['post'], cfg['charset'])
        headers['Content-Length'] = len(payload)
        request.data = payload

    if (method in ('POST', 'PUT')
            and cfg['post'] is None
            and cfg['multipart_post'] is None):
        raise GrabMisuseError('Neither `post` or `multipart_post`'
                              ' options was specified for the %s'
                              ' request' % method)

    # Proxy
    if cfg['proxy']:
        request.proxy = cfg['proxy']
    if cfg['proxy_userpwd']:
        request.proxy_userpwd = cfg['proxy_userpwd']
    request.proxy_type = cfg['proxy_type'] if cfg['proxy_type'] else 'http'

    # User-Agent: chosen once and written back to the config
    if cfg['user_agent'] is None:
        agent_file = cfg['user_agent_file']
        if agent_file is None:
            cfg['user_agent'] = generate_user_agent()
        else:
            with open(agent_file) as inf:
                candidates = inf.read().splitlines()
            cfg['user_agent'] = random.choice(candidates)
    headers['User-Agent'] = cfg['user_agent']

    # Headers: common headers override computed ones,
    # explicit ``headers`` override both
    headers.update(cfg['common_headers'])
    if cfg['headers']:
        headers.update(cfg['headers'])
    request.headers = headers

    # Cookies
    self.process_cookie_options(grab, request)

    self._request = request
def process_config(self, grab):
    """
    Setup curl instance with values from ``grab.config``.

    Reads the request options from ``grab.config`` and
    ``grab.request_method`` and applies them to ``self.curl`` with
    ``setopt`` calls. May write a chosen user agent back into
    ``grab.config['user_agent']``.

    Raises ``error.GrabInvalidUrl`` when the URL cannot be normalized and
    ``error.GrabMisuseError`` on inconsistent options (missing body
    storage directory, unicode payloads, unknown request method, gzip
    requested without zlib support).
    """

    # Copy some config for future usage
    self.config_nobody = grab.config['nobody']
    self.config_body_maxsize = grab.config['body_maxsize']

    try:
        request_url = normalize_url(grab.config['url'])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u'%s: %s' % (six.text_type(ex), grab.config['url']))

    # py3 hack
    if not six.PY3:
        request_url = smart_str(request_url)

    self.curl.setopt(pycurl.URL, request_url)

    self.curl.setopt(pycurl.FOLLOWLOCATION,
                     1 if grab.config['follow_location'] else 0)
    self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
    self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
    self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
    self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
    if not grab.config['connection_reuse']:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)

    self.curl.setopt(pycurl.NOSIGNAL, 1)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.head_processor)

    if grab.config['body_inmemory']:
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    else:
        if not grab.config['body_storage_dir']:
            raise error.GrabMisuseError(
                'Option body_storage_dir is not defined')
        self.setup_body_file(grab.config['body_storage_dir'],
                             grab.config['body_storage_filename'])
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

    if grab.config['verbose_logging']:
        self.verbose_logging = True

    # User-Agent
    if grab.config['user_agent'] is None:
        if grab.config['user_agent_file'] is not None:
            with open(grab.config['user_agent_file']) as inf:
                lines = inf.read().splitlines()
            grab.config['user_agent'] = random.choice(lines)
        else:
            grab.config['user_agent'] = random_user_agent()

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config['user_agent']:
        grab.config['user_agent'] = ''

    self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

    if grab.config['debug']:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

    # Ignore SSL errors
    # NOTE(review): this disables certificate and host verification for
    # every request; kept as-is because callers may rely on it.
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

    # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
    # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

    if grab.request_method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
        if grab.config['multipart_post']:
            if isinstance(grab.config['multipart_post'], six.string_types):
                raise error.GrabMisuseError(
                    'multipart_post option could not be a string')
            post_items = normalize_http_values(
                grab.config['multipart_post'],
                charset=grab.config['charset'],
                ignore_classes=(UploadFile, UploadContent),
            )
            # py3 hack
            if six.PY3:
                post_items = decode_pairs(post_items,
                                          grab.config['charset'])
            self.curl.setopt(pycurl.HTTPPOST,
                             process_upload_items(post_items))
        elif grab.config['post']:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            self.curl.setopt(pycurl.POSTFIELDS, post_data)
        else:
            self.curl.setopt(pycurl.POSTFIELDS, '')
    elif grab.request_method == 'PUT':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                'Value of post option could be only '
                'byte string if PUT method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'PATCH':
        data = grab.config['post']
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                'Value of post option could be only byte '
                'string if PATCH method is used')
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == 'DELETE':
        self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
    elif grab.request_method == 'HEAD':
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == 'UPLOAD':
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    else:
        raise error.GrabMisuseError('Invalid method: %s' %
                                    grab.request_method)

    # Work on a copy: updating grab.config['common_headers'] in place
    # would leak per-request headers into the shared config and
    # therefore into later requests.
    headers = dict(grab.config['common_headers'])
    if grab.config['headers']:
        headers.update(grab.config['headers'])
    header_tuples = [str('%s: %s' % x) for x in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

    self.process_cookie_options(grab, request_url)

    if grab.config['referer']:
        self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

    if grab.config['proxy']:
        self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
    else:
        # Explicitly set an empty proxy value when none is configured
        self.curl.setopt(pycurl.PROXY, '')

    if grab.config['proxy_userpwd']:
        self.curl.setopt(pycurl.PROXYUSERPWD,
                         str(grab.config['proxy_userpwd']))

    if grab.config['proxy_type']:
        # Map e.g. "socks5" to the pycurl.PROXYTYPE_SOCKS5 constant
        key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
        self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

    if grab.config['encoding']:
        if ('gzip' in grab.config['encoding'] and
                'zlib' not in pycurl.version):
            raise error.GrabMisuseError(
                'You can not use gzip encoding because '
                'pycurl was built without zlib support')
        self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

    if grab.config['userpwd']:
        self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

    if grab.config.get('interface') is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

    if grab.config.get('reject_file_size') is not None:
        self.curl.setopt(pycurl.MAXFILESIZE,
                         grab.config['reject_file_size'])
def process_config(self, grab):
    """
    Setup curl instance with values from ``grab.config``.

    Reads the request options from ``grab.config`` and
    ``grab.request_method`` and applies them to ``self.curl`` with
    ``setopt`` calls. May write a chosen user agent back into
    ``grab.config["user_agent"]``.

    Raises ``error.GrabInvalidUrl`` when the URL cannot be normalized and
    ``GrabMisuseError`` on inconsistent options (missing body storage
    directory, missing POST/PUT data, unicode payloads, unknown request
    method, gzip requested without zlib support).
    """

    # Copy some config for future usage
    self.config_nobody = grab.config["nobody"]
    self.config_body_maxsize = grab.config["body_maxsize"]

    try:
        request_url = normalize_url(grab.config["url"])
    except Exception as ex:
        raise error.GrabInvalidUrl(
            u"%s: %s" % (six.text_type(ex), grab.config["url"])
        )

    # py3 hack
    if not six.PY3:
        request_url = make_str(request_url)

    self.curl.setopt(pycurl.URL, request_url)

    # Actually, FOLLOWLOCATION should always be 0
    # because redirect logic takes place in Grab.request method
    # BUT in Grab.Spider this method is not invoked
    # So, in Grab.Spider we still rely on Grab internal ability
    # to follow 30X Locations
    self.curl.setopt(
        pycurl.FOLLOWLOCATION, 1 if grab.config["follow_location"] else 0
    )
    self.curl.setopt(pycurl.MAXREDIRS, grab.config["redirect_limit"])
    self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config["connect_timeout"])
    self.curl.setopt(pycurl.TIMEOUT, grab.config["timeout"])
    self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
    if not grab.config["connection_reuse"]:
        self.curl.setopt(pycurl.FRESH_CONNECT, 1)
        self.curl.setopt(pycurl.FORBID_REUSE, 1)

    self.curl.setopt(pycurl.NOSIGNAL, 1)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

    if grab.config["body_inmemory"]:
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
    else:
        if not grab.config["body_storage_dir"]:
            raise error.GrabMisuseError(
                "Option body_storage_dir is not defined"
            )
        self.setup_body_file(
            grab.config["body_storage_dir"],
            grab.config["body_storage_filename"],
            create_dir=grab.config["body_storage_create_dir"],
        )
        self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

    if grab.config["verbose_logging"]:
        self.verbose_logging = True

    # User-Agent
    if grab.config["user_agent"] is None:
        if grab.config["user_agent_file"] is not None:
            with open(grab.config["user_agent_file"]) as inf:
                lines = inf.read().splitlines()
            grab.config["user_agent"] = random.choice(lines)
        else:
            grab.config["user_agent"] = generate_user_agent()

    # If value is None then set empty string
    # None is not acceptable because in such case
    # pycurl will set its default user agent "PycURL/x.xx.x"
    if not grab.config["user_agent"]:
        grab.config["user_agent"] = ""

    self.curl.setopt(pycurl.USERAGENT, grab.config["user_agent"])

    if grab.config["debug"]:
        self.curl.setopt(pycurl.VERBOSE, 1)
        self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

    # Ignore SSL errors
    # NOTE(review): this disables certificate and host verification for
    # every request; kept as-is because callers may rely on it.
    self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

    # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
    # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

    if grab.request_method in ("POST", "PUT"):
        if (
            grab.config["post"] is None
            and grab.config["multipart_post"] is None
        ):
            raise GrabMisuseError(
                "Neither `post` or `multipart_post`"
                " options was specified for the %s"
                " request" % grab.request_method
            )

    if grab.request_method == "POST":
        self.curl.setopt(pycurl.POST, 1)
        if grab.config["multipart_post"]:
            if isinstance(grab.config["multipart_post"], six.string_types):
                raise error.GrabMisuseError(
                    "multipart_post option could not be a string"
                )
            post_items = normalize_http_values(
                grab.config["multipart_post"],
                charset=grab.config["charset"],
                ignore_classes=(UploadFile, UploadContent),
            )
            # py3 hack
            if six.PY3:
                post_items = decode_pairs(post_items, grab.config["charset"])
            self.curl.setopt(
                pycurl.HTTPPOST, process_upload_items(post_items)
            )
        elif grab.config["post"]:
            post_data = normalize_post_data(
                grab.config["post"], grab.config["charset"]
            )
            self.curl.setopt(pycurl.POSTFIELDS, post_data)
        else:
            self.curl.setopt(pycurl.POSTFIELDS, "")
    elif grab.request_method == "PUT":
        data = grab.config["post"]
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                "Value of post option could be only "
                "byte string if PUT method is used"
            )
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, "PUT")
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == "PATCH":
        data = grab.config["post"]
        if isinstance(data, six.text_type):
            raise error.GrabMisuseError(
                "Value of post option could be only byte "
                "string if PATCH method is used"
            )
        self.curl.setopt(pycurl.UPLOAD, 1)
        self.curl.setopt(pycurl.CUSTOMREQUEST, "PATCH")
        self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
        self.curl.setopt(pycurl.INFILESIZE, len(data))
    elif grab.request_method == "DELETE":
        self.curl.setopt(pycurl.CUSTOMREQUEST, "DELETE")
    elif grab.request_method == "HEAD":
        self.curl.setopt(pycurl.NOBODY, 1)
    elif grab.request_method == "UPLOAD":
        self.curl.setopt(pycurl.UPLOAD, 1)
    elif grab.request_method == "GET":
        self.curl.setopt(pycurl.HTTPGET, 1)
    elif grab.request_method == "OPTIONS":
        # The request body is optional for OPTIONS
        data = grab.config["post"]
        if data is not None:
            if isinstance(data, six.text_type):
                # The message used to name PATCH here (copy/paste slip)
                raise error.GrabMisuseError(
                    "Value of post option could be only byte "
                    "string if OPTIONS method is used"
                )
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        self.curl.setopt(pycurl.CUSTOMREQUEST, "OPTIONS")
    else:
        raise error.GrabMisuseError(
            "Invalid method: %s" % grab.request_method
        )

    # Work on a copy: updating grab.config["common_headers"] in place
    # would leak per-request headers (and the forced ``Expect`` entry)
    # into the shared config and therefore into later requests.
    headers = dict(grab.config["common_headers"])
    if grab.config["headers"]:
        headers.update(grab.config["headers"])
    # This is required to avoid some problems
    headers.update({"Expect": ""})
    header_tuples = [str("%s: %s" % x) for x in headers.items()]
    self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

    self.process_cookie_options(grab, request_url)

    if grab.config["referer"]:
        self.curl.setopt(pycurl.REFERER, str(grab.config["referer"]))

    if grab.config["proxy"]:
        self.curl.setopt(pycurl.PROXY, str(grab.config["proxy"]))
    else:
        # Explicitly set an empty proxy value when none is configured
        self.curl.setopt(pycurl.PROXY, "")

    if grab.config["proxy_userpwd"]:
        self.curl.setopt(
            pycurl.PROXYUSERPWD, str(grab.config["proxy_userpwd"])
        )

    if grab.config["proxy_type"]:
        # Map e.g. "socks5" to the pycurl.PROXYTYPE_SOCKS5 constant
        key = "PROXYTYPE_%s" % grab.config["proxy_type"].upper()
        self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

    if grab.config["encoding"]:
        if (
            "gzip" in grab.config["encoding"]
            and "zlib" not in pycurl.version
        ):
            raise error.GrabMisuseError(
                "You can not use gzip encoding because "
                "pycurl was built without zlib support"
            )
        self.curl.setopt(pycurl.ENCODING, grab.config["encoding"])

    if grab.config["userpwd"]:
        self.curl.setopt(pycurl.USERPWD, str(grab.config["userpwd"]))

    if grab.config.get("interface") is not None:
        self.curl.setopt(pycurl.INTERFACE, grab.config["interface"])

    if grab.config.get("reject_file_size") is not None:
        self.curl.setopt(
            pycurl.MAXFILESIZE, grab.config["reject_file_size"]
        )
def process_config(self, grab):
    """
    Build a ``Request`` from ``grab.config`` and store it in
    ``self._request``.

    Raises ``error.GrabInvalidUrl`` when the configured URL cannot be
    normalized. Raises ``GrabMisuseError`` when the response body must be
    written to a file but ``body_storage_dir`` is empty, when
    ``multipart_post`` is a unicode string, or when a POST/PUT request has
    neither ``post`` nor ``multipart_post`` set.
    """
    cfg = grab.config
    request = Request(data=None)

    try:
        normalized_url = normalize_url(cfg["url"])
    except Exception as ex:
        details = (six.text_type(ex), cfg["url"])
        raise error.GrabInvalidUrl(u"%s: %s" % details)
    request.url = normalized_url

    method = grab.detect_request_method()
    request.method = make_str(method)

    # The ``nobody`` option forces the body size limit down to zero
    maxsize = cfg["body_maxsize"]
    if cfg["nobody"]:
        maxsize = 0
    request.body_maxsize = maxsize

    request.timeout = cfg["timeout"]
    request.connect_timeout = cfg["connect_timeout"]

    # Headers computed here; config headers are merged on top further down
    headers = {}

    # Response body storage: a file is prepared unless the body
    # is kept in memory
    if not cfg["body_inmemory"]:
        if not cfg["body_storage_dir"]:
            raise GrabMisuseError("Option body_storage_dir is not defined")
        body_file, body_path = self.setup_body_file(
            cfg["body_storage_dir"],
            cfg["body_storage_filename"],
            create_dir=cfg["body_storage_create_dir"],
        )
        request._response_file = body_file
        request._response_path = body_path

    # Request payload: ``multipart_post`` wins over ``post``
    multipart = cfg["multipart_post"]
    if multipart is not None:
        payload = multipart
        if isinstance(payload, six.text_type):
            raise GrabMisuseError(
                "Option multipart_post data" " does not accept unicode."
            )
        if not isinstance(payload, six.binary_type):
            # Not a ready-made byte string: encode the items
            items = normalize_http_values(
                multipart,
                charset=cfg["charset"],
                ignore_classes=(UploadFile, UploadContent),
            )
            items = decode_pairs(items, cfg["charset"])
            items = process_upload_items(items)
            payload, content_type = encode_multipart_formdata(items)
            headers["Content-Type"] = content_type
        headers["Content-Length"] = len(payload)
        request.data = payload
    elif cfg["post"] is not None:
        payload = normalize_post_data(cfg["post"], cfg["charset"])
        headers["Content-Length"] = len(payload)
        request.data = payload

    if (
        method in ("POST", "PUT")
        and cfg["post"] is None
        and cfg["multipart_post"] is None
    ):
        raise GrabMisuseError(
            "Neither `post` or `multipart_post`"
            " options was specified for the %s"
            " request" % method
        )

    # Proxy
    if cfg["proxy"]:
        request.proxy = cfg["proxy"]
    if cfg["proxy_userpwd"]:
        request.proxy_userpwd = cfg["proxy_userpwd"]
    request.proxy_type = cfg["proxy_type"] if cfg["proxy_type"] else "http"

    # User-Agent: chosen once and written back to the config
    if cfg["user_agent"] is None:
        agent_file = cfg["user_agent_file"]
        if agent_file is None:
            cfg["user_agent"] = generate_user_agent()
        else:
            with open(agent_file) as inf:
                candidates = inf.read().splitlines()
            cfg["user_agent"] = random.choice(candidates)
    headers["User-Agent"] = cfg["user_agent"]

    # Headers: common headers override computed ones,
    # explicit ``headers`` override both
    headers.update(cfg["common_headers"])
    if cfg["headers"]:
        headers.update(cfg["headers"])
    request.headers = headers

    # Cookies
    self.process_cookie_options(grab, request)

    self._request = request