def get(url, cookies='', proxy=None, useragent='', referer='', headers=[],
        redirect_count=0, headers_only=False):
    """GET request

    Return values: see the __request function
    """
    err_counter = 0
    if redirect_count >= MAX_REDIRECTS:
        raise InfiniteRedirection(url)
    # Endless loop for hammer mode
    while True:
        try:
            result = __process_redirect(__request(url,
                                                  request_type='get',
                                                  cookies=cookies,
                                                  proxy=proxy,
                                                  referer=referer,
                                                  useragent=useragent,
                                                  headers=headers,
                                                  redirect_count=redirect_count,
                                                  headers_only=headers_only))
            return result
        except DeadProxy as e:
            err_counter += 1
            if err_counter >= HAMMER_MODE_ATTEMPTS:
                raise DeadProxy(e['proxy'], e['port'])
        except pycurl.error as e:
            err_counter += 1
            if err_counter >= HAMMER_MODE_ATTEMPTS:
                raise pycurl.error(str(e))
def post(url, data, cookies='', proxy=None, useragent='', referer='', headers=[],
         headers_only=False):
    """POST request

    data: dict

    Return values: see the __request function
    """
    err_counter = 0
    # Endless loop for hammer mode
    while True:
        try:
            result = __process_redirect(__request(url,
                                                  request_type='post',
                                                  cookies=cookies,
                                                  referer=referer,
                                                  post_data=data,
                                                  useragent=useragent,
                                                  headers=headers,
                                                  proxy=proxy,
                                                  headers_only=headers_only))
            return result
        except DeadProxy as e:
            err_counter += 1
            if err_counter >= HAMMER_MODE_ATTEMPTS:
                raise DeadProxy(e['proxy'], e['port'])
        except pycurl.error as e:
            err_counter += 1
            if err_counter >= HAMMER_MODE_ATTEMPTS:
                raise pycurl.error(str(e))
def downloadWorker(self):
    # Note: the retry count must be parenthesized, otherwise '%' binds before '-'
    logger.info('Starting download worker (retries=%d)' % (self.retrylimit - 1))
    while True:
        item = self.q.get()
        try:
            debug.log('Downloading %s to %s%s (retries=%d)'
                      % (item['remoteurl'], item['downloaddir'],
                         item['localfname'], self.retrylimit - 1))
            fp = open(item['downloaddir'] + item['localfname'], 'wb')
            c = pycurl.Curl()
            c.setopt(c.URL, item['remoteurl'])
            c.setopt(c.WRITEDATA, fp)
            c.perform()
            if c.getinfo(pycurl.HTTP_CODE) != 200:
                logger.error("FAILED to download %s: %d"
                             % (item['remoteurl'], c.getinfo(pycurl.HTTP_CODE)))
                raise pycurl.error()
            c.close()
            fp.close()
            self.downloadedsegs.append((item['order'], item['localfname']))
        except pycurl.error:
            logger.error('Caught exception while downloading %s' % item['remoteurl'])
            c.close()
            item['retries'] += 1
            if item['retries'] < self.retrylimit:
                logger.info('Retry counter is %d, will try again' % item['retries'])
                self.q.put(item)
            else:
                logger.error('Retry counter exceeded for %s' % item['localfname'])
                self.failedDownloads = True
        finally:
            self.q.task_done()
def test_pycurl_error(self):
    curl = CurlStub(error=pycurl.error(60, "pycurl error"))
    try:
        fetch("http://example.com", curl=curl)
    except PyCurlError as error:
        self.assertEqual(error.error_code, 60)
        self.assertEqual(error.message, "pycurl error")
    else:
        self.fail("PyCurlError not raised")
def progress(download_t, download_d, upload_t, upload_d):
    global count, start_at, timeout_max
    count = count + 1
    if count % 1000 == 0 and download_t > 0:
        r = download_d * 100.0 / download_t
        print "Total %d bytes, have %d bytes so far, %d%s" % (download_t, download_d, r, '%')
    ds = time.time() - start_at
    if timeout_max and ds >= timeout_max:
        raise pycurl.error(-1, u'download timeout. max=%s' % timeout_max)
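# --- Illustrative sketch (not part of the example above): how a progress callback
# like progress() is typically registered.  The URL, output file name, timeout value
# and the count/start_at/timeout_max globals are assumptions for demonstration only.
import time
import pycurl

count, start_at, timeout_max = 0, time.time(), 30  # assumed module-level state used by progress()

fh = open("big-file", "wb")
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://example.com/big-file")   # placeholder URL
c.setopt(pycurl.WRITEFUNCTION, fh.write)
c.setopt(pycurl.NOPROGRESS, 0)                        # callbacks only fire when NOPROGRESS is off
c.setopt(pycurl.PROGRESSFUNCTION, progress)           # the callback defined above
try:
    c.perform()
except pycurl.error as err:
    # An exception raised inside progress() aborts the transfer and
    # surfaces from perform() as a pycurl.error.
    print(err)
finally:
    c.close()
    fh.close()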
def test_mocked_exception_duckduckgo(self):
    """What if curl raises an exception?"""
    # Arrange
    url = "duckduckgo.com/html"
    mock = MockResponse()
    mock.exception = pycurl.error()
    self.backend.responses.add(mock, url)
    # Act, Assert
    self.assertRaises(pycurl.error, self.browser.go, url)
def download_file(file_key, file_vars=None, create_dirs=True):
    """Download a file from the web and save it to disk

    TODO: Remove when files.path() is removed

    Use pycurl (libcurl) to do the actual downloading. Requests might be nicer
    for this, but turned out to be much slower (and in practice unusable for
    bigger files) and also not really supporting ftp-downloads.

    Args:
        file_key (String):   File key that should be downloaded.
        file_vars (Dict):    File variables used to find path from file_key.
        create_dirs (Bool):  Create directories as necessary before downloading file.
    """
    if (not config.where.files.download_missing.bool
            or "url" not in config.files[file_key]
            or not config.files[file_key].url.str):
        return None

    file_path = path(file_key, file_vars=file_vars, download_missing=False)
    if file_path.exists():
        return None

    if create_dirs:
        file_path.parent.mkdir(parents=True, exist_ok=True)

    file_url = url(file_key, file_vars=file_vars)
    file_path = file_path.with_name(file_url.name)
    log.info(f"Download {file_key} from '{file_url}' to '{file_path}'")
    with builtins.open(file_path, mode="wb") as fid:
        c = pycurl.Curl()
        c.setopt(c.URL, file_url)
        c.setopt(c.WRITEDATA, fid)
        try:
            c.perform()
            if not (200 <= c.getinfo(c.HTTP_CODE) <= 299):
                raise pycurl.error()
        except pycurl.error:
            log.error(f"Problem downloading file: {c.getinfo(c.EFFECTIVE_URL)} ({c.getinfo(c.HTTP_CODE)})")
            if file_path.exists():
                # Print first 10 lines to console
                head_of_file = f"Contents of '{file_path}':\n" + "\n".join(file_path.read_text().split("\n")[:10])
                print(console.indent(head_of_file, num_spaces=8))
                file_path.unlink()
            log.warn(f"Try to download '{file_url}' manually and save it at '{file_path}'")
        else:
            log.info(f"Done downloading {file_key}")
        finally:
            c.close()
    return file_path
def test_adapter_translates_from_pycurl_errors(error_code, error_msg, expected_exception):
    request = PreparedRequest()
    request.prepare(url="http://somefakeurl", method="GET", headers={})

    pool = FakePool()
    pool.add_exception(pycurl.error(error_code, error_msg))
    pool_provider = FakePoolProvider()
    pool_provider.add_pool_for_url(request.url, pool)

    adapter = CURLAdapter(pool_provider_factory=lambda *args, **kwargs: pool_provider)
    with pytest.raises(expected_exception):
        adapter.send(request)
def download(request):
    request_url = request.get("url")
    headers = request.get("headers")
    if isinstance(headers, dict):
        headers = [k + ":" + v for k, v in headers.items()]
    proxies = request.get("proxies")
    mothed = request.get("mothed")
    c = pycurl.Curl()
    body = BytesIO()
    c.setopt(pycurl.VERBOSE, True)
    c.setopt(pycurl.HEADER, False)
    c.setopt(pycurl.TIMEOUT, 3)
    c.setopt(pycurl.CONNECTTIMEOUT, 1)
    c.setopt(pycurl.URL, request_url)
    if headers:
        print(headers)
        c.setopt(pycurl.HTTPHEADER, headers)
    c.setopt(pycurl.ENCODING, 'gzip,deflate')
    c.setopt(pycurl.SSL_VERIFYPEER, False)
    c.setopt(pycurl.SSL_VERIFYHOST, False)
    if mothed is None:
        mothed = "get"
    if mothed.lower() == "post":
        c.setopt(pycurl.POST, 1)
        data = request.get("data")
        if data:
            c.setopt(pycurl.POSTFIELDS, data)
    c.setopt(pycurl.WRITEFUNCTION, body.write)
    if proxies:
        proxy, password = convert_proxy_format(proxies)
        c.setopt(pycurl.PROXY, proxy)
        c.setopt(pycurl.PROXYUSERPWD, password)
    try:
        c.perform()
        code = c.getinfo(pycurl.RESPONSE_CODE)
        content = c.getinfo(pycurl.CONTENT_TYPE)
        if code != 200:
            raise pycurl.error(code, "")
    except pycurl.error as err:
        print(repr(err))
        raise err
    finally:
        c.close()
    return body.getvalue().decode("gbk")
def request(self, verb, path, body, headers):
    c = self.curl
    hdrs = [str(h + ": " + v) for h, v in six.iteritems(headers)] if headers else []
    verb = verb.upper()
    if verb == 'GET':
        if self.cleaning_needed:
            c.setopt(pycurl.POST, 0)
            c.unsetopt(pycurl.CUSTOMREQUEST)
            c.setopt(pycurl.NOBODY, 0)
            self.cleaning_needed = False
        if body:
            self.cleaning_needed = True
            c.setopt(pycurl.POST, 0)
            c.setopt(pycurl.CUSTOMREQUEST, verb)
            c.setopt(pycurl.NOBODY, 0)
            c.setopt(pycurl.POSTFIELDS, body or "")
    elif verb == 'POST':
        self.cleaning_needed = True
        c.unsetopt(pycurl.CUSTOMREQUEST)
        c.setopt(pycurl.NOBODY, 0)
        c.setopt(pycurl.POST, 1)
        c.setopt(pycurl.POSTFIELDS, body or "")
        hdrs.append("Expect:")
    elif verb == 'PUT' or verb == "DELETE":
        self.cleaning_needed = True
        c.setopt(pycurl.POST, 0)
        c.setopt(pycurl.CUSTOMREQUEST, verb)
        c.setopt(pycurl.NOBODY, 0)
        c.setopt(pycurl.POSTFIELDS, body or "")
    elif verb == 'HEAD':
        self.cleaning_needed = True
        c.setopt(pycurl.POST, 0)
        c.unsetopt(pycurl.CUSTOMREQUEST)
        c.setopt(pycurl.NOBODY, 1)
    else:
        raise pycurl.error("unsupported verb: " + verb)

    c.setopt(pycurl.URL, str(self.prefix + path))
    c.setopt(pycurl.HTTPHEADER, hdrs)
    self.buf = BytesIO()
    self.response_headers = []
    c.setopt(pycurl.WRITEFUNCTION, self.buf.write)
    c.setopt(pycurl.HEADERFUNCTION, self._header_handler)
    c.perform()
def download(request):
    headers = [
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    ]
    while True:
        c = pycurl.Curl()
        body = StringIO.StringIO()
        c.setopt(pycurl.TIMEOUT, 5)
        #c.setopt(pycurl.CONNECTTIMEOUT, 1)
        c.setopt(pycurl.URL, request['comURL'])
        c.setopt(pycurl.HTTPHEADER, headers)
        c.setopt(pycurl.ENCODING, 'gzip,deflate')
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        c.setopt(pycurl.SSL_VERIFYHOST, 0)
        #c.setopt(pycurl.WRITEHEADER, headers)
        c.setopt(pycurl.WRITEFUNCTION, body.write)
        #c.setopt(pycurl.PROXY, "http://127.0.0.1:8888")  # Fiddler
        #c.setopt(pycurl.PROXYUSERPWD, self.userpwd)
        if 'formdata' in request:
            postfields = urllib.urlencode(request['formdata'])
            c.setopt(pycurl.POST, 1)
            c.setopt(pycurl.POSTFIELDS, postfields)
        if 'postfields' in request:
            c.setopt(pycurl.POST, 1)
            c.setopt(pycurl.POSTFIELDS, request['postfields'])
        try:
            c.perform()
            code = c.getinfo(pycurl.RESPONSE_CODE)
            if code != 200:
                raise pycurl.error(code, "")
            break
        except pycurl.error as err:
            if err[0] in (7, 28, 56):
                continue
            else:
                print('{}, {}, {}'.format(time.strftime('%H:%M:%S'), err[0], err[1]))
                raise err
        finally:
            c.close()
    return body.getvalue()
def execute_multi(self, calls, timeout):
    multi = pycurl.CurlMulti()
    for request in calls:
        multi.add_handle(request._curl)

    while True:
        while True:
            ret, num = multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        if num == 0:
            break
        if multi.select(timeout) < 0:
            raise pycurl.error(pycurl.E_OPERATION_TIMEOUTED)

    failed_calls = []
    for request in calls:
        multi.remove_handle(request._curl)
        request._response_content = request._process_http_request()
        if request.response_code() == 0:
            failed_calls.append(request)
        else:
            if request._response_content:
                request.process()
            error_string = request.error()
            if error_string:
                self._errors.append(error_string)

    multi.close()
    return failed_calls
def _download(self, chunks, resume): if not resume: self.info.clear() self.info.add_chunk('{0}.chunk0'.format( self.path), (0, 0)) # create an initial entry self.chunks = [] # initial chunk that will load complete file (if needed) init = CurlChunk(0, self, None, resume) self.chunks.append(init) self.manager.add_handle(init.get_handle()) last_finish_check = 0 last_time_check = 0 chunks_done = set() # list of curl handles that are finished chunks_created = False done = False # This is a resume, if we were chunked originally assume still can if self.info.get_count() > 1: self.chunk_support = True while True: # need to create chunks # will be set later by first chunk if not chunks_created and self.chunk_support and self.size: self.flags ^= Connection.Resumable # TODO: Recheck... if not resume: self.info.set_size(self.size) self.info.create_chunks(chunks) self.info.save() chunks = self.info.get_count() init.set_range(self.info.get_chunk_range(0)) for i in range(1, chunks): c = CurlChunk( i, self, self.info.get_chunk_range(i), resume) handle = c.get_handle() if handle: self.chunks.append(c) self.manager.add_handle(handle) else: # close immediately self.log.debug('Invalid curl handle -> closed') c.close() chunks_created = True while True: ret, _ = self.manager.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break t = time.time() # reduce these calls # when num_q is 0, the loop is exited while last_finish_check + 0.5 < t: # list of failed curl handles failed = [] # TODO: Rewrite... # save only last exception, we can only raise one anyway exc = Exception() num_q, ok_list, err_list = self.manager.info_read() for c in ok_list: chunk = self.find_chunk(c) # check if the header implies success, # else add it to failed list try: chunk.verify_header() except ResponseException as exc: self.log.debug( 'Chunk {0:d} failed'.format( chunk.id + 1)) self.log.debug(exc, exc_info=True) failed.append(chunk) else: chunks_done.add(c) for c in err_list: curl, errno, msg = c chunk = self.find_chunk(curl) # test if chunk was finished if errno != 23 or '0 !=' not in msg: failed.append(chunk) exc = pycurl.error(errno, msg) self.log.debug( 'Chunk {0:d} failed'.format(chunk.id + 1)) self.log.debug(exc, exc_info=True) continue # check if the header implies success, # else add it to failed list try: chunk.verify_header() except ResponseException as exc: self.log.debug( 'Chunk {0:d} failed'.format( chunk.id + 1)) self.log.debug(exc, exc_info=True) failed.append(chunk) else: chunks_done.add(curl) if not num_q: # no more info to get # check if init is not finished so we reset download # connections # note that other chunks are closed and everything # downloaded with initial connection if failed: if init in failed or init.curl in chunks_done: raise exc self.log.error( 'Download chunks failed, fallback to ' 'single connection | {0}'.format(exc)) # list of chunks to clean and remove to_clean = [x for x in self.chunks if x is not init] for chunk in to_clean: self.close_chunk(chunk) self.chunks.remove(chunk) remove(self.info.get_chunk_name(chunk.id)) # let first chunk load the rest and update the # info file init.reset_range() self.info.clear() self.info.add_chunk('{0}.chunk0'.format( self.path), (0, self.size)) self.info.save() last_finish_check = t if len(chunks_done) >= len(self.chunks): if len(chunks_done) > len(self.chunks): self.log.warning( 'Finished download chunks size incorrect') done = True # all chunks loaded break if done: break # all chunks loaded # calc speed once per second, averaging over 3 seconds if 
last_time_check + 1 < t: len_la = len(self.last_arrived) diff = [c.arrived - (self.last_arrived[i] if len_la > i else 0) for i, c in enumerate(self.chunks)] self.last_speeds[1] = self.last_speeds[0] self.last_speeds[0] = self.speeds self.speeds = [float(a) // (t - last_time_check) for a in diff] self.last_arrived = [c.arrived for c in self.chunks] last_time_check = t if self._abort: raise Abort self.manager.select(1) for chunk in self.chunks: chunk.flush_file() # make sure downloads are written to disk self._copy_chunks()
class GetUrlTestCase(unittest.TestCase): @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(side_effect=['ololo', error()])) def test_if_pycurl_error_second(self): with mock.patch('source.lib.__init__.logger', mock.Mock()) as m_loger: result = source.lib.__init__.get_url('ololo.ru', 42) self.assertEqual(1, m_loger.error.call_count) self.assertEqual('ololo.ru', result[0]) self.assertEqual('ERROR', result[1]) self.assertEqual(None, result[2]) @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(side_effect=['ololo', ValueError()])) def test_if_value_error_second(self): with mock.patch('source.lib.__init__.logger', mock.Mock()) as m_loger: result = source.lib.__init__.get_url('ololo.ru', 42) self.assertEqual(1, m_loger.error.call_count) self.assertEqual('ololo.ru', result[0]) self.assertEqual('ERROR', result[1]) self.assertEqual(None, result[2]) @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(side_effect=[ValueError(), 'ololo'])) def test_if_value_error_first(self): with mock.patch('source.lib.__init__.logger', mock.Mock()) as m_loger: result = source.lib.__init__.get_url('ololo.ru', 42) self.assertEqual(1, m_loger.error.call_count) self.assertEqual('ololo.ru', result[0]) self.assertEqual('ERROR', result[1]) self.assertEqual(None, result[2]) @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(side_effect=[error(), 'ololo'])) def test_if_pycurl_error_first(self): with mock.patch('source.lib.__init__.logger', mock.Mock()) as m_loger: result = source.lib.__init__.get_url('ololo.ru', 42) self.assertEqual(1, m_loger.error.call_count) self.assertEqual('ololo.ru', result[0]) self.assertEqual('ERROR', result[1]) self.assertEqual(None, result[2]) @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', 'ololo.ru'])) def test_if_new_redirect_url_and_match(self): with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())): with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=True))): result = source.lib.__init__.get_url('vk.ru', 42) self.assertEqual(None, result[0]) self.assertEqual(None, result[1]) self.assertEqual('ish', result[2]) @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', 'ololo.ru'])) @mock.patch('source.lib.__init__.check_for_meta', mock.Mock(return_value=None)) @mock.patch('source.lib.__init__.prepare_url', mock.Mock(return_value=None)) def test_redirect_url_and_not(self): with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())): with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=False))): result = source.lib.__init__.get_url('vk.ru', 42) self.assertEqual(None, result[0]) self.assertEqual(source.lib.__init__.REDIRECT_HTTP, result[1]) self.assertEqual('ish', result[2]) @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', None])) @mock.patch('source.lib.__init__.check_for_meta', mock.Mock(return_value='ololo.ru')) @mock.patch('source.lib.__init__.prepare_url', mock.Mock(return_value='vk.com')) def test_not_redirect_url_and_redirect_url_and_not_urlsplit(self): urlsplit = mock.MagicMock() urlsplit.scheme = mock.Mock(return_value='bugaga') with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())): with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=False))): result = source.lib.__init__.get_url('vk.ru', 42) self.assertEqual(source.lib.__init__.REDIRECT_META, result[1]) self.assertEqual('ish', result[2]) 
@mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', None])) @mock.patch('source.lib.__init__.check_for_meta', mock.Mock(return_value=None)) @mock.patch('source.lib.__init__.prepare_url', mock.Mock(return_value='vk.com')) def test_not_redirect_url_and_not_redirect_url_and_not_urlsplit(self): urlsplit = mock.Mock() urlsplit.scheme = 'market' with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())): with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=False))): with (mock.patch('source.lib.__init__.urlsplit', mock.Mock(return_value=urlsplit))): result = source.lib.__init__.get_url('vk.ru', 42) self.assertEqual(None, result[1]) self.assertEqual('ish', result[2]) @mock.patch('source.lib.__init__.make_pycurl_request', mock.Mock(return_value=['ish', 'ololo'])) @mock.patch('source.lib.__init__.prepare_url', mock.Mock(return_value='vk.com')) def test_redirect_url_and_urlsplit(self): urlsplit = mock.Mock() urlsplit.scheme = 'market' with (mock.patch('source.lib.__init__.OK_REDIRECT', mock.Mock())): with (mock.patch('source.lib.__init__.OK_REDIRECT.match', mock.Mock(return_value=False))): with (mock.patch('source.lib.__init__.urlsplit', mock.Mock(return_value=urlsplit))): with (mock.patch('source.lib.__init__.fix_market_url', mock.Mock())) as m_fix: result = source.lib.__init__.get_url('vk.ru', 42) self.assertEqual('ish', result[2]) self.assertEqual(source.lib.__init__.REDIRECT_HTTP, result[1]) self.assertEqual(1, m_fix.call_count)
def setopt(self, method='GET', ua='', cookies=None, proxy=None, url=None,
           verbose=False, headers=None, timeout=120, data=None,
           allow_redirect=False):
    '''
    @proxy protocol://host:port  eg: socks5://127.0.0.1:1080
    '''
    self._curl.reset()
    method = method.upper()
    if method not in ("GET", "POST", "DELETE", "PUT", "OPTIONS", "HEAD"):
        raise pycurl.error("unsupported method: %s" % method)
    if method == "HEAD":
        # a HEAD request must not wait for a response body
        self._curl.setopt(pycurl.NOBODY, True)
    if method in ("POST", "PUT"):
        self._curl.setopt(pycurl.POST, True)
    if method in ("PUT", "DELETE", "OPTIONS"):
        self._curl.setopt(pycurl.CUSTOMREQUEST, method)
    self._curl.setopt(pycurl.NOSIGNAL, True)
    self._curl.setopt(pycurl.URL, url)
    self.setproxy(proxy)
    if verbose:
        self._curl.setopt(pycurl.VERBOSE, True)
    allheaders = []
    self.setcookies(cookies)
    if headers:
        curl_headers = self._header2curlstyle(headers)
        allheaders.extend(curl_headers)
    if ua:
        self.ua = ua
    allheaders.extend(["User-Agent: %s" % self.ua])
    if allheaders:
        self._curl.setopt(pycurl.HTTPHEADER, allheaders)
    if method in ("POST", "PUT"):
        if isinstance(data, str):
            self._curl.setopt(pycurl.POSTFIELDS, data)
        elif hasattr(data, "read"):
            self._curl.setopt(pycurl.UPLOAD, True)
            self._curl.setopt(pycurl.READFUNCTION, data.read)
            data.seek(0, 2)
            filesize = data.tell()
            data.seek(0)
            self._curl.setopt(pycurl.INFILESIZE, filesize)
        elif isinstance(data, dict):
            postfields = self._dict2urlfields(data)
            self._curl.setopt(pycurl.POSTFIELDS, postfields)
    self._curl.setopt(pycurl.TIMEOUT, timeout)
    # if self.cookiefile:
    #     self._curl.setopt(pycurl.COOKIEJAR, self.cookiefile)
    #     self._curl.setopt(pycurl.COOKIEFILE, self.cookiefile)
    self._curl.setopt(pycurl.HEADERFUNCTION, self.headerfunc)
    self._curl.setopt(pycurl.WRITEFUNCTION, self.contentfunc)
    if allow_redirect:
        self._curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self._curl.setopt(pycurl.MAXREDIRS, 5)
def __request(url, request_type, cookies='', post_data={}, proxy=None,
              headers=[], useragent='', referer='', redirect_count=0,
              attempt=1, headers_only=False):
    """Universal function used by the get & post functions

    Returns:
        Headers dict: dict
        Body: string
        Cookies: string; query string: a=1&b=2
        Connect time: float
        Current URL: string
        Redirect URL: string | none
        Redirect count: integer
    """
    # merge the headers passed to the function with the global ones
    all_headers = HEADERS + headers
    all_headers = __headers_to_dict(all_headers, replace_duplicates=True)
    all_headers = ["%s: %s" % (k, v) for k, v in all_headers.items()]

    c = pycurl.Curl()
    got_headers = StringIO()
    body = StringIO()

    if headers_only:
        c.setopt(pycurl.NOBODY, 1)
    else:
        c.setopt(pycurl.WRITEFUNCTION, body.write)

    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.TIMEOUT, TIMEOUT)
    c.setopt(pycurl.HEADERFUNCTION, got_headers.write)

    # If it is 1, libcurl will not use any functions that install signal
    # handlers or any functions that cause signals to be sent to the process.
    # This option is mainly here to allow multi-threaded unix applications to
    # still set/use all timeout options etc, without risking getting signals.
    c.setopt(pycurl.NOSIGNAL, 1)

    # the user agent passed to the function takes priority,
    # the globally configured user agent is the fallback
    if not useragent:
        if USERAGENT:
            useragent = USERAGENT
    if useragent:
        c.setopt(pycurl.USERAGENT, useragent)

    if all_headers:
        c.setopt(pycurl.HTTPHEADER, all_headers)

    # set the referer
    if referer:
        c.setopt(pycurl.REFERER, referer)

    # raise an error if the response code is >= 400
    c.setopt(pycurl.FAILONERROR, 1)

    # IMPORTANT: since cookies are passed as a string instead of being stored
    # in files, FOLLOWLOCATION would not send cookies that were set right
    # before a redirect.  Therefore we do not use it.
    #
    # c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.COOKIE, cookies)

    c.setopt(pycurl.VERBOSE, 1)
    c.setopt(pycurl.DEBUGFUNCTION, __logging)

    # do not verify the SSL certificate; requests become vulnerable to MITM attacks
    c.setopt(pycurl.SSL_VERIFYHOST, 0)
    c.setopt(pycurl.SSL_VERIFYPEER, 0)

    if request_type.lower() == 'post' and post_data:
        c.setopt(pycurl.HTTPPOST, post_data.items())

    # if a proxy was given, use it
    if proxy:
        # CURL proxytype
        if PROXY_TYPE == 'socks5':
            proxy_type = pycurl.PROXYTYPE_SOCKS5
        elif PROXY_TYPE == 'socks4':
            proxy_type = pycurl.PROXYTYPE_SOCKS4
        elif PROXY_TYPE == 'http':
            proxy_type = pycurl.PROXYTYPE_HTTP

        # if the ip and port cannot be separated, report an error
        try:
            proxy_ip, port = proxy.split(':')
            port = int(port)
        except ValueError:
            logging.error("Possibly invalid proxy format: %s", str(proxy))
            raise DeadProxy(proxy, None)

        c.setopt(pycurl.PROXY, proxy_ip)
        c.setopt(pycurl.PROXYPORT, port)
        c.setopt(pycurl.PROXYTYPE, proxy_type)

    # handle exceptions while loading the page
    try:
        c.perform()
    except pycurl.error as err:
        # CURLE_HTTP_RETURNED_ERROR (22)
        # This is returned if CURLOPT_FAILONERROR is set TRUE and the HTTP
        # server returns an error code that is >= 400.
        if err[0] == 22:
            raise WrongCode(c.getinfo(pycurl.RESPONSE_CODE))
        # if a proxy is in use, blame every error except a wrong response code on it
        if proxy:
            raise DeadProxy(proxy_ip, port)
        else:
            raise pycurl.error(str(err))

    # parsed headers dict
    got_headers = __get_headers(got_headers.getvalue())
    result = {'headers': got_headers,
              'body': body.getvalue(),
              'current_proxy': proxy,
              'useragent': useragent,
              'referer': referer,
              'sent_headers': all_headers,
              'cookies': __get_cookies(got_headers['Set-Cookie'], cookies),
              'connect_time': c.getinfo(pycurl.CONNECT_TIME),
              'response_code': c.getinfo(pycurl.RESPONSE_CODE),
              'current_url': c.getinfo(pycurl.EFFECTIVE_URL),
              'redirect_url': c.getinfo(pycurl.REDIRECT_URL),
              'redirect_count': redirect_count,
              'headers_only': headers_only}
    c.close()
    del c
    return result
def _download(self, chunks, resume): if not resume: self.info.clear() self.info.add_chunk("{0}.chunk0".format(self.path), (0, 0)) #: create an initial entry self.chunks = [] # initial chunk that will load complete file (if needed) init = CurlChunk(0, self, None, resume) self.chunks.append(init) self.manager.add_handle(init.get_handle()) last_finish_check = 0 last_time_check = 0 chunks_done = set() #: list of curl handles that are finished chunks_created = False done = False if self.info.get_count( ) > 1: #: This is a resume, if we were chunked originally assume still can self.chunk_support = True while True: # need to create chunks if not chunks_created and self.chunk_support and self.size: #: will be set later by first chunk self.flags ^= Connection.Resumable if not resume: self.info.set_size(self.size) self.info.create_chunks(chunks) self.info.save() chunks = self.info.get_count() init.set_range(self.info.get_chunk_range(0)) for i in range(1, chunks): c = CurlChunk(i, self, self.info.get_chunk_range(i), resume) handle = c.get_handle() if handle: self.chunks.append(c) self.manager.add_handle(handle) else: # close immediately self.pyload.log.debug("Invalid curl handle -> closed") c.close() chunks_created = True while True: ret, num_handles = self.manager.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break t = time() # reduce these calls # when num_q is 0, the loop is exited while last_finish_check + 0.5 < t: # list of failed curl handles failed = [] ex = None #: save only last exception, we can only raise one anyway num_q, ok_list, err_list = self.manager.info_read() for c in ok_list: chunk = self.find_chunk(c) try: #: check if the header implies success, else add it to failed list chunk.verify_header() except ResponseException as e: self.pyload.log.debug("Chunk {0:d} failed: {1}".format( chunk.id + 1, str(e))) failed.append(chunk) ex = e else: chunks_done.add(c) for c in err_list: curl, errno, msg = c chunk = self.find_chunk(curl) # test if chunk was finished if errno != 23 or "0 !=" not in msg: failed.append(chunk) ex = pycurl.error(errno, msg) self.pyload.log.debug("Chunk {0:d} failed: {1}".format( chunk.id + 1, ex)) continue try: #: check if the header implies success, else add it to failed list chunk.verify_header() except ResponseException as e: self.pyload.log.debug("Chunk {0:d} failed: {1}".format( chunk.id + 1, str(e))) failed.append(chunk) ex = e else: chunks_done.add(curl) if not num_q: #: no more info to get # check if init is not finished so we reset download connections # note that other chunks are closed and everything # downloaded with initial connection if failed and init not in failed and init.c not in chunks_done: self.pyload.log.error( _("Download chunks failed, fallback to single connection | {0}" .format(ex))) # list of chunks to clean and remove to_clean = [x for x in self.chunks if x is not init] for chunk in to_clean: self.close_chunk(chunk) self.chunks.remove(chunk) remove( format.path(self.info.get_chunk_name( chunk.id))) # let first chunk load the rest and update the info # file init.reset_range() self.info.clear() self.info.add_chunk("{0}.chunk0".format(self.path), (0, self.size)) self.info.save() elif failed: raise ex last_finish_check = t if len(chunks_done) >= len(self.chunks): if len(chunks_done) > len(self.chunks): self.pyload.log.warning( _("Finished download chunks size incorrect, please report bug" )) done = True #: all chunks loaded break if done: break #: all chunks loaded # calc speed once per second, averaging over 3 seconds if last_time_check + 
1 < t: diff = [ c.arrived - (self.last_arrived[i] if len(self.last_arrived) > i else 0) for i, c in enumerate(self.chunks) ] self.last_speeds[1] = self.last_speeds[0] self.last_speeds[0] = self.speeds self.speeds = [float(a) // (t - last_time_check) for a in diff] self.last_arrived = [c.arrived for c in self.chunks] last_time_check = t if self.do_abort: raise Abort self.manager.select(1) for chunk in self.chunks: chunk.flush_file() #: make sure downloads are written to disk self._copy_chunks()
def _download(self, chunks, resume): if not resume: self.info.clear() self.info.add_chunk(f"{self.filename}.chunk0", (0, 0)) #: create an initial entry) self.chunks = [] # initial chunk that will load complete file (if needed) init = HTTPChunk(0, self, None, resume) self.chunks.append(init) self.m.add_handle(init.get_handle()) last_finish_check = 0 last_time_check = 0 chunks_done = set() #: list of curl handles that are finished chunks_created = False done = False if ( self.info.get_count() > 1 ): #: This is a resume, if we were chunked originally assume still can self.chunk_support = True while True: # need to create chunks if (not chunks_created and self.chunk_support and self.size): #: will be set later by first chunk if not resume: self.info.set_size(self.size) self.info.create_chunks(chunks) self.info.save() chunks = self.info.get_count() init.set_range(self.info.get_chunk_range(0)) for i in range(1, chunks): c = HTTPChunk(i, self, self.info.get_chunk_range(i), resume) handle = c.get_handle() if handle: self.chunks.append(c) self.m.add_handle(handle) else: # close immediately self.log.debug("Invalid curl handle -> closed") c.close() chunks_created = True while True: ret, num_handles = self.m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break t = time.time() # reduce these calls while last_finish_check + 0.5 < t: # list of failed curl handles failed = [] ex = None #: save only last exception, we can only raise one anyway num_q, ok_list, err_list = self.m.info_read() for c in ok_list: chunk = self.find_chunk(c) try: #: check if the header implies success, else add it to failed list chunk.verify_header() except BadHeader as exc: self.log.debug(f"Chunk {chunk.id + 1} failed: {exc}") failed.append(chunk) ex = exc else: self.log.debug( f"Chunk {chunk.id + 1} download finished") chunks_done.add(c) for c in err_list: curl, errno, msg = c chunk = self.find_chunk(curl) # test if chunk was finished if errno != pycurl.E_WRITE_ERROR or not chunk.aborted: failed.append(chunk) ex = pycurl.error(errno, msg) self.log.debug(f"Chunk {chunk.id + 1} failed: {ex}") continue try: #: check if the header implies success, else add it to failed list chunk.verify_header() except BadHeader as exc: self.log.debug(f"Chunk {chunk.id + 1} failed: {exc}") failed.append(chunk) ex = exc else: self.log.debug( f"Chunk {chunk.id + 1} download finished") chunks_done.add(curl) if not num_q: #: no more infos to get # check if init is not finished so we reset download connections # note that other chunks are closed and downloaded with init too if failed and init not in failed and init.c not in chunks_done: self.log.error( f"Download chunks failed, fallback to single connection | {ex}" ) # list of chunks to clean and os.remove to_clean = [x for x in self.chunks if x is not init] for chunk in to_clean: self.close_chunk(chunk) self.chunks.remove(chunk) os.remove(self.info.get_chunk_name(chunk.id)) # let first chunk load the rest and update the info file init.reset_range() self.info.clear() self.info.add_chunk(f"{self.filename}.chunk0", (0, self.size)) self.info.save() elif failed: raise ex or Exception last_finish_check = t if len(chunks_done) >= len(self.chunks): if len(chunks_done) > len(self.chunks): self.log.warning( "Finished download chunks size incorrect, please report bug." 
) done = True #: all chunks loaded break if done: break #: all chunks loaded # calc speed once per second, averaging over 3 seconds if last_time_check + 1 < t: diff = [ c.arrived - (self.last_arrived[i] if len(self.last_arrived) > i else 0) for i, c in enumerate(self.chunks) ] self.last_speeds[1] = self.last_speeds[0] self.last_speeds[0] = self.speeds self.speeds = [float(a) / (t - last_time_check) for a in diff] self.last_arrived = [c.arrived for c in self.chunks] last_time_check = t self.update_progress() if self.abort: raise Abort # time.sleep(0.003) #supress busy waiting - limits dl speed to (1 / x) * # buffersize self.m.select(1) for chunk in self.chunks: chunk.flush_file() #: make sure downloads are written to disk self._copy_chunks()
def download(self, url):
    c = pycurl.Curl()  # instantiate a Curl object
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.USERAGENT, self.userAgent)  # pose as the given browser
    c.setopt(pycurl.CONNECTTIMEOUT, self.connectionTimeout)  # connect timeout
    c.setopt(pycurl.TIMEOUT, self.operationTimeout)  # operation timeout
    c.setopt(pycurl.ENCODING, self.encoding)  # encoding
    c.setopt(pycurl.HTTPHEADER, self.headerParams)  # Accept headers
    c.setopt(pycurl.DNS_CACHE_TIMEOUT, 60)  # keep DNS info for 60 seconds (the default)
    if self.referer:
        c.setopt(pycurl.REFERER, self.referer)  # tell the server which link we came from
    if self.httpAuth:  # auth
        c.setopt(pycurl.HTTPAUTH, self.httpAuthType)  # needed for the Authorization request header
        c.setopt(pycurl.USERPWD, self.httpAuth)
    if self.agency.has_key('ADD') and self.agency.has_key('PWD'):  # proxy
        c.setopt(pycurl.PROXY, self.agency['ADD'])
        c.setopt(pycurl.PROXYUSERPWD, self.agency['PWD'])  # proxy credentials
    if self.COOKIE:  # cookie
        c.setopt(pycurl.COOKIEFILE, self.COOKIE)
        c.setopt(pycurl.COOKIEJAR, self.COOKIE)
    if self.POST:  # post
        if self.DATA:
            Data = self.DATA
        else:
            print 'DATA is necessary when request type is POST'
            return False
        c.setopt(pycurl.POST, 1)
        c.setopt(pycurl.POSTFIELDS, Data)
        self.POST = 0
    # response headers are collected through a callback (StringIO acts as an in-memory file)
    self.headerWrite = StringIO.StringIO()
    if self.isDownload:  # download straight into the target file
        self.b = open(self.isDownload, 'wb')
    else:
        self.b = StringIO.StringIO()  # buffer the body in memory
    # the following three options all rely on callbacks
    c.setopt(pycurl.WRITEFUNCTION, self.contentWriteCallBack)  # write the page content
    c.setopt(pycurl.HEADERFUNCTION, self.headerWriteCallBack)  # write the response headers
    if self.isDownload:  # progress callback for downloads/uploads
        c.setopt(pycurl.NOPROGRESS, self.noProgress)
        c.setopt(pycurl.PROGRESSFUNCTION, self.progressWriteCallBack)  # report progress
    c.setopt(pycurl.FOLLOWLOCATION, 1)  # follow redirects (3xx HTTP codes)
    c.setopt(pycurl.MAXREDIRS, 10)  # at most 10 redirects, to avoid crawler traps
    c.setopt(pycurl.SSL_VERIFYPEER, 0)  # SSL certificate
    c.setopt(pycurl.SSL_VERIFYHOST, 0)  # SSL certificate
    # This option is here to allow multi-threaded unix applications to still
    # set/use all timeout options etc, without risking getting signals.
    c.setopt(pycurl.NOSIGNAL, 1)
    #c.setopt(pycurl.FRESH_CONNECT, 1)  # force a fresh connection instead of a cached one
    #c.setopt(pycurl.FORBID_REUSE, 1)  # close the connection after the transfer, do not reuse it
    #c.setopt(pycurl.SOCKET_TIMEOUT, 9)
    #c.setopt(pycurl.E_OPERATION_TIMEOUTED, 3600)
    # perform the request
    try:
        c.perform()  # run curl
        # detailed timing information below, in milliseconds
        dnsTime = c.getinfo(c.NAMELOOKUP_TIME) * 1000  # DNS lookup time
        connectTimeTemp = c.getinfo(c.CONNECT_TIME) * 1000  # time to connect to the remote server
        preTransferTimeTemp = c.getinfo(c.PRETRANSFER_TIME) * 1000  # time from connect until the transfer is about to start
        startTransferTimeTemp = c.getinfo(c.STARTTRANSFER_TIME) * 1000  # time until the first byte is transferred
        totalTime = c.getinfo(c.TOTAL_TIME) * 1000  # total time of the last request
        connectTime = connectTimeTemp - dnsTime
        transferTime = totalTime - preTransferTimeTemp
        #print 'dnsTime:', dnsTime
        #print 'connectTime:', connectTime
        #print 'transferTime:', transferTime
        #print 'totalTime:', totalTime
        if self.isDownload:
            c.close()
            self.b.close()  # close the file
            self.isDownload = False
            return True
        self.headerContent = self.headerWrite.getvalue()  # read the response headers after the request
        self.httpCode = c.getinfo(c.HTTP_CODE)  # HTTP status code
        # c.getinfo(c.CONTENT_TYPE) returns e.g. "text/html; charset=utf-8"
        contenttype = re.compile('charset=(.*)', re.I | re.S | re.M).findall(c.getinfo(c.CONTENT_TYPE))
        if contenttype:
            webCharset = contenttype[0]
        else:
            webCharset = self.charset
        value = self.b.getvalue()
        if webCharset.lower() != 'utf-8':
            #print 'encoding to utf-8...'
            commonutil = CommonUtil()
            value = commonutil.convertCoding(webCharset, self.charset, value)
        c.close()
        self.b.close()
        if self.httpCode >= 400:
            errorMessages = re.compile('<title>(.*)</title>', re.I | re.S | re.M).findall(value)
            if errorMessages:
                errorMessage = errorMessages[0]
            else:
                errorMessage = 'Http code >=400'
            raise pycurl.error(errorMessage.decode('UTF-8'))  # decode so non-ASCII text is handled correctly
    except pycurl.error, e:
        print sys.exc_info()[0], sys.exc_info()[1]  # print the error info
        return False
def test_translate_curl_exception(error_code, error_msg, expected_exception):
    curl_exception = pycurl.error(error_code, error_msg)

    translated_exception = translate_curl_exception(curl_exception)

    assert translated_exception == expected_exception
def test_handle_response_pycurl_error(self):
    """PycURLGetter allows other errors to propagate."""
    error = pycurl.error(pycurl.E_MULTI_OUT_OF_MEMORY)
    getter = PycURLGetter(FakeCurl(perform_error=error))
    with self.assertRaises(pycurl.error):
        getter.handle_response()
def test_cert_verification_failed(self):
    """Cert verification error raises CertificateVerificationFailed."""
    error = pycurl.error(pycurl.E_SSL_CACERT)
    getter = PycURLGetter(FakeCurl(perform_error=error))
    with self.assertRaises(CertificateVerificationFailed):
        getter.handle_response()
def test_handle_response_connection_error(self):
    """On connection error, handle_response raises CouldNotConnect."""
    error = pycurl.error(pycurl.E_COULDNT_CONNECT)
    getter = PycURLGetter(FakeCurl(perform_error=error))
    with self.assertRaises(CouldNotConnect):
        getter.handle_response()
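# --- Illustrative sketch (not from the examples above): the three tests distinguish
# pycurl errors by their libcurl error code, which pycurl.error exposes as args[0].
# The exception names CouldNotConnect and CertificateVerificationFailed come from the
# tests; the translation function itself is an assumption for demonstration only.
import pycurl


class CouldNotConnect(Exception):
    """Raised when libcurl reports a connection failure."""


class CertificateVerificationFailed(Exception):
    """Raised when the server certificate cannot be verified."""


def perform_or_translate(curl):
    """Run curl.perform() and translate selected pycurl errors.

    Error codes not handled here propagate as the original pycurl.error,
    mirroring the behaviour asserted in test_handle_response_pycurl_error.
    """
    try:
        curl.perform()
    except pycurl.error as exc:
        code = exc.args[0] if exc.args else None
        if code == pycurl.E_COULDNT_CONNECT:
            raise CouldNotConnect(*exc.args)
        if code == pycurl.E_SSL_CACERT:
            raise CertificateVerificationFailed(*exc.args)
        raise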
def download(self, request): request_url = request.get("url") headers = request.get("headers") if isinstance(headers, dict): headers = [k + ":" + v for k, v in headers.items()] proxies = request.get("proxy") mothed = request.get("mothed") encoding = request.get("encoding") c = pycurl.Curl() body = BytesIO() if self.pycurl_config: #default c.setopt(pycurl.FOLLOWLOCATION, 1) c.setopt(pycurl.MAXREDIRS, 5) c.setopt(pycurl.TIMEOUT, 3) c.setopt(pycurl.CONNECTTIMEOUT, 1) c.setopt(pycurl.URL, request_url) if headers: c.setopt(pycurl.HTTPHEADER, headers) c.setopt(pycurl.ENCODING, 'gzip,deflate') c.setopt(pycurl.SSL_VERIFYPEER, False) c.setopt(pycurl.SSL_VERIFYHOST, False) if mothed is None: mothed = "get" if mothed.lower() == "post": c.setopt(pycurl.POST, 1) data = request.get("data") if data: c.setopt(pycurl.POSTFIELDS, urllib.urlencode(data)) c.setopt(pycurl.WRITEFUNCTION, body.write) if self.use_proxy: if proxies: proxy, password = self.convert_proxy_format(proxies) self.log.debug((proxy, password)) c.setopt(pycurl.PROXY, proxy) c.setopt(pycurl.PROXYUSERPWD, password) else: if self.used_proxy: proxy, password = self.convert_proxy_format( self.used_proxy) self.log.debug((proxy, password)) c.setopt(pycurl.PROXY, proxy) c.setopt(pycurl.PROXYUSERPWD, password) #set pycurl_config for k, v in self.pycurl_config.items(): c.setopt(k, v) # set yourself self.overwrite_download_opt(c) else: c.setopt(pycurl.FOLLOWLOCATION, 1) c.setopt(pycurl.MAXREDIRS, 5) c.setopt(pycurl.TIMEOUT, 3) c.setopt(pycurl.CONNECTTIMEOUT, 1) c.setopt(pycurl.URL, request_url) if headers: c.setopt(pycurl.HTTPHEADER, headers) c.setopt(pycurl.ENCODING, 'gzip,deflate') c.setopt(pycurl.SSL_VERIFYPEER, False) c.setopt(pycurl.SSL_VERIFYHOST, False) if mothed is None: mothed = "get" if mothed.lower() == "post": c.setopt(pycurl.POST, 1) data = request.get("data") if data: c.setopt(pycurl.POSTFIELDS, urllib.urlencode(data)) c.setopt(pycurl.WRITEFUNCTION, body.write) if self.use_proxy: if proxies: proxy, password = self.convert_proxy_format(proxies) self.log.debug((proxy, password)) c.setopt(pycurl.PROXY, proxy) c.setopt(pycurl.PROXYUSERPWD, password) else: if self.used_proxy: proxy, password = self.convert_proxy_format( self.used_proxy) self.log.debug((proxy, password)) c.setopt(pycurl.PROXY, proxy) c.setopt(pycurl.PROXYUSERPWD, password) self.overwrite_download_opt(c) try: c.perform() code = c.getinfo(pycurl.HTTP_CODE) if code != 200: raise pycurl.error(code, "") except pycurl.error as err: #if err[0] not in (7,28,56): # self.log.error(err) self.log.exception(err) #raise err return "" finally: c.close() result = body.getvalue() if not encoding: coding = chardet.detect(result)['encoding'] return result.decode(coding)
def perform(_): raise pycurl.error('Test Exception')
def download_file(
    self,
    file_key: str,
    file_vars: Optional[Dict[str, str]] = None,
    file_path: Optional[pathlib.Path] = None,
    create_dirs: bool = True,
    **path_args: Any,
) -> Optional[pathlib.Path]:
    """Download a file from the web and save it to disk

    Use pycurl (libcurl) to do the actual downloading. Requests might be nicer
    for this, but turned out to be much slower (and in practice unusable for
    bigger files) and also not really supporting ftp-downloads.

    Args:
        file_key:     File key that should be downloaded.
        file_vars:    File variables used to find path from file_key.
        file_path:    Path where file will be saved, default is to read from configuration.
        create_dirs:  Create directories as necessary before downloading file.
        path_args:    Arguments passed on to .path() to find file_path.

    Returns:
        Path to downloaded file, None if no file was downloaded.
    """
    # Do not download anything if download_missing class variable is False
    if not self.download_missing:
        return None

    # Do not download anything if url is not given in configuration
    if "url" not in self[file_key] or not self[file_key].url.str:
        return None

    # Get file_path from configuration if it's not given explicitly
    file_url = self.url(file_key, file_vars=file_vars, **path_args)
    is_zipped = self.is_path_zipped(file_url)
    path_args.update(is_zipped=is_zipped)
    if file_path is None:
        file_path = self.path(file_key, file_vars=file_vars, download_missing=False, **path_args)
        file_path = file_path.with_name(file_url.name)

    if create_dirs:
        file_path.parent.mkdir(parents=True, exist_ok=True)

    log.info(f"Download {file_key} from '{file_url}' to '{file_path}'")
    with builtins.open(file_path, mode="wb") as fid:
        c = pycurl.Curl()
        c.setopt(c.URL, file_url)
        c.setopt(c.WRITEDATA, fid)
        try:
            c.perform()
            if not (200 <= c.getinfo(c.HTTP_CODE) <= 299):
                raise pycurl.error()
        except pycurl.error:
            log.error(f"Problem downloading file: {c.getinfo(c.EFFECTIVE_URL)} ({c.getinfo(c.HTTP_CODE)})")
            if file_path.exists():
                # Print first 10 lines to console
                head_of_file = f"Contents of '{file_path}':\n" + "\n".join(file_path.read_text().split("\n")[:10])
                log.info(console.indent(head_of_file, num_spaces=8))
                file_path.unlink()
            log.warn(f"Try to download '{file_url}' manually and save it at '{file_path}'")
        else:
            log.info(f"Done downloading {file_key}")
        finally:
            c.close()
    return file_path
def mock_curl_not_http_error(mock_curl):
    mock_curl.perform.side_effect = pycurl.error()
    return mock_curl
def _check_curl_errors(self):
    for f in self.curlmulti.info_read()[2]:
        raise pycurl.error(*f[1:])
def _download(self, chunks, resume): if not resume: self.info.clear() self.info.addChunk("%s.chunk0" % self.filename, (0, 0)) #create an initial entry self.chunks = [] init = HTTPChunk( 0, self, None, resume) #initial chunk that will load complete file (if needed) self.chunks.append(init) self.m.add_handle(init.getHandle()) lastFinishCheck = 0 lastTimeCheck = 0 chunksDone = set() # list of curl handles that are finished chunksCreated = False done = False if self.info.getCount( ) > 1: # This is a resume, if we were chunked originally assume still can self.chunkSupport = True while 1: #need to create chunks if not chunksCreated and self.chunkSupport and self.size: #will be setted later by first chunk if not resume: self.info.setSize(self.size) self.info.createChunks(chunks) self.info.save() chunks = self.info.getCount() init.setRange(self.info.getChunkRange(0)) for i in range(1, chunks): c = HTTPChunk(i, self, self.info.getChunkRange(i), resume) handle = c.getHandle() if handle: self.chunks.append(c) self.m.add_handle(handle) else: #close immediatly self.log.debug("Invalid curl handle -> closed") c.close() chunksCreated = True while 1: ret, num_handles = self.m.perform() if ret != pycurl.E_CALL_MULTI_PERFORM: break t = time() # reduce these calls while lastFinishCheck + 0.5 < t: # list of failed curl handles failed = [] ex = None # save only last exception, we can only raise one anyway num_q, ok_list, err_list = self.m.info_read() for c in ok_list: chunk = self.findChunk(c) try: # check if the header implies success, else add it to failed list chunk.verifyHeader() except BadHeader, e: self.log.debug("Chunk %d failed: %s" % (chunk.id + 1, str(e))) failed.append(chunk) ex = e else: chunksDone.add(c) for c in err_list: curl, errno, msg = c chunk = self.findChunk(curl) #test if chunk was finished if errno != 23 or "0 !=" not in msg: failed.append(chunk) ex = pycurl.error(errno, msg) self.log.debug("Chunk %d failed: %s" % (chunk.id + 1, str(ex))) continue try: # check if the header implies success, else add it to failed list chunk.verifyHeader() except BadHeader, e: self.log.debug("Chunk %d failed: %s" % (chunk.id + 1, str(e))) failed.append(chunk) ex = e else: chunksDone.add(curl) if not num_q: # no more infos to get # check if init is not finished so we reset download connections # note that other chunks are closed and downloaded with init too if failed and init not in failed and init.c not in chunksDone: self.log.error( _("Download chunks failed, fallback to single connection | %s" % (str(ex)))) #list of chunks to clean and remove to_clean = filter(lambda x: x is not init, self.chunks) for chunk in to_clean: self.closeChunk(chunk) self.chunks.remove(chunk) remove(fs_encode(self.info.getChunkName(chunk.id))) #let first chunk load the rest and update the info file init.resetRange() self.info.clear() self.info.addChunk("%s.chunk0" % self.filename, (0, self.size)) self.info.save() elif failed: raise ex lastFinishCheck = t if len(chunksDone) >= len(self.chunks): if len(chunksDone) > len(self.chunks): self.log.warning( "Finished download chunks size incorrect, please report bug." ) done = True #all chunks loaded break
def iterate_results(self): while True: try: self.network_op_lock.acquire() with self.sigint_handler.handle_sigint(): queued_messages, ok_list, fail_list = ( self.multi.info_read()) finally: self.network_op_lock.release() #except Exception as ex: # # Usually that should not happen # logging.error('', exc_info=ex) # continue results = [] for curl in ok_list: results.append((True, curl, None, None, None)) for curl, ecode, emsg in fail_list: curl.grab_callback_interrupted = False try: raise pycurl.error(ecode, emsg) except Exception as exc: # pylint: disable=broad-except grab_exc = build_grab_exception(exc, curl) # grab_exc could be None if the pycurl error # was expected (could be in case of # body_maxsize and other options) if grab_exc: results.append((False, curl, ecode, emsg, grab_exc)) else: results.append((True, curl, None, None, None)) for is_ok, curl, ecode, emsg, grab_exc in results: # FORMAT: {is_ok, grab, grab_config_backup, task, # ecode, emsg, error_abbr, exc} curl_id = id(curl) task = self.registry[curl_id]['task'] grab = self.registry[curl_id]['grab'] grab_config_backup =\ self.registry[curl_id]['grab_config_backup'] try: self.network_op_lock.acquire() grab.process_request_result() except GrabTooManyRedirectsError: ecode = ERROR_TOO_MANY_REDIRECTS emsg = 'Too many meta refresh redirects' is_ok = False finally: self.network_op_lock.release() #except Exception as ex: # logging.error('', exc_info=ex) # ecode = ERROR_INTERNAL_GRAB_ERROR # emsg = 'Internal grab error' # is_ok = False grab.doc.error_code = ecode grab.doc.error_msg = emsg grab.exception = grab_exc # Free resources del self.registry[curl_id] grab.transport.curl = None if is_ok: error_abbr = None else: error_abbr = ERRNUM_TAG.get(ecode, 'unknown-%d' % ecode) yield { 'ok': is_ok, 'ecode': ecode, 'emsg': emsg, 'error_abbr': error_abbr, 'exc': grab_exc, 'grab': grab, 'grab_config_backup': grab_config_backup, }, task try: self.network_op_lock.acquire() with self.sigint_handler.handle_sigint(): self.multi.remove_handle(curl) finally: self.network_op_lock.release() curl.reset() self.freelist.append(curl) if not queued_messages: break
def _check_curl_errors(self):
    for f in self.curl_multi.info_read()[2]:
        raise pycurl.error(*f[1:])
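# --- Illustrative sketch (not from the examples above): both _check_curl_errors
# helpers rely on the shape of CurlMulti.info_read(), which returns
# (num_queued, ok_list, err_list) where each err_list entry is (handle, errno, errmsg),
# so f[1:] is exactly the (errno, errmsg) pair that pycurl.error expects.
# The URL below is a placeholder chosen to fail quickly.
import io
import pycurl

buf = io.BytesIO()
multi = pycurl.CurlMulti()
easy = pycurl.Curl()
easy.setopt(pycurl.URL, "http://localhost:1/unreachable")  # placeholder URL
easy.setopt(pycurl.WRITEDATA, buf)
multi.add_handle(easy)

# Drive the transfer to completion with the standard perform/select loop
while True:
    ret, num_handles = multi.perform()
    if ret != pycurl.E_CALL_MULTI_PERFORM:
        break
while num_handles:
    multi.select(1.0)
    while True:
        ret, num_handles = multi.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break

# Re-raise any transfer failure as pycurl.error, as the helpers above do
num_queued, ok_list, err_list = multi.info_read()
for handle, errno, errmsg in err_list:
    raise pycurl.error(errno, errmsg)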
def _download(self, chunks, resume):
    if not resume:
        self.info.clear()
        # Create an initial entry
        self.info.addChunk("%s.chunk0" % self.filename, (0, 0))

    self.chunks = []

    # Initial chunk that will load complete file (if needed)
    init = HTTPChunk(0, self, None, resume)

    self.chunks.append(init)
    self.m.add_handle(init.getHandle())

    lastFinishCheck = 0
    lastTimeCheck = 0
    # List of curl handles that are finished
    chunksDone = set()
    chunksCreated = False
    done = False

    # This is a resume, if we were chunked originally assume we still can be
    if self.info.getCount() > 1:
        self.chunkSupport = True

    while 1:
        # Need to create chunks
        # Will be set later by first chunk
        if not chunksCreated and self.chunkSupport and self.size:
            if not resume:
                self.info.setSize(self.size)
                self.info.createChunks(chunks)
                self.info.save()

            chunks = self.info.getCount()
            init.setRange(self.info.getChunkRange(0))

            for i in range(1, chunks):
                c = HTTPChunk(i, self, self.info.getChunkRange(i), resume)
                handle = c.getHandle()
                if handle:
                    self.chunks.append(c)
                    self.m.add_handle(handle)
                else:
                    # Close immediately
                    self.log.debug("Invalid curl handle -> closed")
                    c.close()

            chunksCreated = True

        while 1:
            ret, num_handles = self.m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        t = time()

        # Reduce these calls
        while lastFinishCheck + 0.5 < t:
            # List of failed curl handles
            failed = []
            # Save only last exception, we can only raise one anyway
            ex = None

            num_q, ok_list, err_list = self.m.info_read()
            for c in ok_list:
                chunk = self.findChunk(c)
                # Check if the header implies success, else add it to failed list
                try:
                    chunk.verifyHeader()
                except BadHeader as e:
                    self.log.debug("Chunk %d failed: %s" % (chunk.id + 1, str(e)))
                    failed.append(chunk)
                    ex = e
                else:
                    chunksDone.add(c)

            for c in err_list:
                curl, errno, msg = c
                chunk = self.findChunk(curl)

                # Test if chunk was finished
                if errno != 23 or "0 !=" not in msg:
                    failed.append(chunk)
                    ex = pycurl.error(errno, msg)
                    self.log.debug("Chunk %d failed: %s" % (chunk.id + 1, str(ex)))
                    continue

                # Check if the header implies success, else add it to failed list
                try:
                    chunk.verifyHeader()
                except BadHeader as e:
                    self.log.debug("Chunk %d failed: %s" % (chunk.id + 1, str(e)))
                    failed.append(chunk)
                    ex = e
                else:
                    chunksDone.add(curl)

            # No more infos to get
            if not num_q:
                # Check if init is not finished so we reset download connections,
                # note that other chunks are closed and downloaded with init too
                if failed and init not in failed and init.c not in chunksDone:
                    self.log.error(_("Download chunks failed, fallback to single connection | %s" % (str(ex))))

                    # List of chunks to clean and remove (materialised so we can
                    # safely mutate self.chunks while iterating)
                    for chunk in [x for x in self.chunks if x is not init]:
                        self.closeChunk(chunk)
                        self.chunks.remove(chunk)
                        remove(fs_encode(self.info.getChunkName(chunk.id)))

                    # Let first chunk load the rest and update the info file
                    init.resetRange()
                    self.info.clear()
                    self.info.addChunk("%s.chunk0" % self.filename, (0, self.size))
                    self.info.save()
                elif failed:
                    raise ex

                lastFinishCheck = t

                if len(chunksDone) >= len(self.chunks):
                    if len(chunksDone) > len(self.chunks):
                        self.log.warning("Finished download chunks size incorrect, please report bug.")
                    # All chunks loaded
                    done = True

                break

        # All chunks loaded
        if done:
            break

        # Calc speed once per second, averaging over 3 seconds
        if lastTimeCheck + 1 < t:
            diff = [
                c.arrived - (self.lastArrived[i] if len(self.lastArrived) > i else 0)
                for i, c in enumerate(self.chunks)
            ]

            self.lastSpeeds[1] = self.lastSpeeds[0]
            self.lastSpeeds[0] = self.speeds
            self.speeds = [float(a) / (t - lastTimeCheck) for a in diff]
            self.lastArrived = [c.arrived for c in self.chunks]
            lastTimeCheck = t
            self.updateProgress()

        if self.abort:
            raise Abort()

        # Sleep(0.003) # suppress busy waiting - limits dl speed to (1 / x) * buffersize
        self.m.select(1)

    for chunk in self.chunks:
        # Make sure downloads are written to disk
        chunk.flushFile()

    self._copyChunks()
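
The chunk handles driven by the loop above presumably map each chunk onto a byte range of the remote file. The following is a minimal sketch of building such a ranged easy handle with the standard CURLOPT_RANGE option; build_chunk_handle() is a made-up name and not pyLoad's actual HTTPChunk implementation.

import pycurl


def build_chunk_handle(url, start, end, out_path):
    # One easy handle per chunk; the byte window comes from CURLOPT_RANGE.
    fp = open(out_path, 'wb')
    handle = pycurl.Curl()
    handle.setopt(pycurl.URL, url)
    handle.setopt(pycurl.WRITEDATA, fp)
    handle.setopt(pycurl.RANGE, '%d-%d' % (start, end))  # e.g. '0-1048575'
    handle.setopt(pycurl.FOLLOWLOCATION, 1)
    return handle, fp

# Each handle would then be added to a pycurl.CurlMulti() and pumped with
# perform()/select(), much like the loop in _download() above.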
def _download(self, chunks, resume):
    if not resume:
        self.info.clear()
        self.info.addChunk("%s.chunk0" % self.filename, (0, 0)) #create an initial entry

    self.chunks = []

    init = HTTPChunk(0, self, None, resume) #initial chunk that will load complete file (if needed)

    self.chunks.append(init)
    self.m.add_handle(init.getHandle())

    lastFinishCheck = 0
    lastTimeCheck = 0
    chunksDone = set() # list of curl handles that are finished
    chunksCreated = False
    done = False

    if self.info.getCount() > 1: # this is a resume, if we were chunked originally assume we still can be
        self.chunkSupport = True

    while 1:
        #need to create chunks
        if not chunksCreated and self.chunkSupport and self.size: #will be set later by first chunk
            if not resume:
                self.info.setSize(self.size)
                self.info.createChunks(chunks)
                self.info.save()

            chunks = self.info.getCount()
            init.setRange(self.info.getChunkRange(0))

            for i in range(1, chunks):
                c = HTTPChunk(i, self, self.info.getChunkRange(i), resume)
                handle = c.getHandle()
                if handle:
                    self.chunks.append(c)
                    self.m.add_handle(handle)
                else:
                    #close immediately
                    self.log.debug("Invalid curl handle -> closed")
                    c.close()

            chunksCreated = True

        while 1:
            ret, num_handles = self.m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break

        t = time()

        # reduce these calls
        while lastFinishCheck + 0.5 < t:
            # list of failed curl handles
            failed = []
            ex = None # save only last exception, we can only raise one anyway

            num_q, ok_list, err_list = self.m.info_read()
            for c in ok_list:
                chunk = self.findChunk(c)
                try: # check if the header implies success, else add it to failed list
                    chunk.verifyHeader()
                except BadHeader as e:
                    self.log.debug("Chunk %d failed: %s" % (chunk.id + 1, str(e)))
                    failed.append(chunk)
                    ex = e
                else:
                    chunksDone.add(c)

            for c in err_list:
                curl, errno, msg = c
                chunk = self.findChunk(curl)

                #test if chunk was finished
                if errno != 23 or "0 !=" not in msg:
                    failed.append(chunk)
                    ex = pycurl.error(errno, msg)
                    self.log.debug("Chunk %d failed: %s" % (chunk.id + 1, str(ex)))
                    continue

                try: # check if the header implies success, else add it to failed list
                    chunk.verifyHeader()
                except BadHeader as e:
                    self.log.debug("Chunk %d failed: %s" % (chunk.id + 1, str(e)))
                    failed.append(chunk)
                    ex = e
                else:
                    chunksDone.add(curl)

            if not num_q: # no more infos to get
                # check if init is not finished so we reset download connections
                # note that other chunks are closed and downloaded with init too
                if failed and init not in failed and init.c not in chunksDone:
                    self.log.error(_("Download chunks failed, fallback to single connection | %s" % (str(ex))))

                    #list of chunks to clean and remove
                    to_clean = [x for x in self.chunks if x is not init]
                    for chunk in to_clean:
                        self.closeChunk(chunk)
                        self.chunks.remove(chunk)
                        remove(fs_encode(self.info.getChunkName(chunk.id)))

                    #let first chunk load the rest and update the info file
                    init.resetRange()
                    self.info.clear()
                    self.info.addChunk("%s.chunk0" % self.filename, (0, self.size))
                    self.info.save()
                elif failed:
                    raise ex

                lastFinishCheck = t

                if len(chunksDone) >= len(self.chunks):
                    if len(chunksDone) > len(self.chunks):
                        self.log.warning("Finished download chunks size incorrect, please report bug.")
                    done = True #all chunks loaded

                break
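
Both variants above special-case errno 23 with "0 !=" in the message. As a hedged illustration of why: returning a short byte count from a pycurl write callback (for example once a chunk's range is complete) makes libcurl abort the transfer with CURLE_WRITE_ERROR (23), which is then an expected condition rather than a real failure. LimitedWriter and fetch_prefix below are made-up names for this sketch.

import pycurl


class LimitedWriter(object):
    """Write callback that stops the transfer after `limit` bytes."""

    def __init__(self, fp, limit):
        self.fp = fp
        self.limit = limit
        self.written = 0

    def write(self, data):
        self.fp.write(data)
        self.written += len(data)
        if self.written >= self.limit:
            # Returning a short count from the write callback makes libcurl
            # abort the transfer with CURLE_WRITE_ERROR (errno 23).
            return 0


def fetch_prefix(url, path, limit):
    with open(path, 'wb') as fp:
        handle = pycurl.Curl()
        handle.setopt(pycurl.URL, url)
        handle.setopt(pycurl.WRITEFUNCTION, LimitedWriter(fp, limit).write)
        try:
            handle.perform()
        except pycurl.error as exc:
            if exc.args[0] != pycurl.E_WRITE_ERROR:
                raise  # only the deliberate abort is expected
        finally:
            handle.close()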
def execute(self, method, url, data=None, headers=None):
    import re
    import pycurl
    from io import BytesIO

    # headers is looked up like a mapping below, so default to an empty dict
    # instead of a shared mutable default argument.
    if headers is None:
        headers = {}

    host = self.get_host_port_from_url(url)
    if host in self.curl_session:
        curl = self.curl_session[host]
    else:
        self.curl_session[host] = pycurl.Curl()
        curl = self.curl_session[host]

    url = url.replace(" ", "%20")
    method = method.upper()
    self.server_headers = dict()

    buffer = BytesIO()

    curl.setopt(curl.URL, nfw.utils.if_unicode_to_utf8(url))
    try:
        curl.setopt(curl.WRITEDATA, buffer)
    except TypeError:
        curl.setopt(curl.WRITEFUNCTION, buffer.write)
    curl.setopt(curl.HEADERFUNCTION, self.header_function)
    curl.setopt(curl.FOLLOWLOCATION, True)
    curl.setopt(curl.SSL_VERIFYPEER, self.ssl_verify_peer)
    curl.setopt(curl.SSL_VERIFYHOST, self.ssl_verify_host)
    curl.setopt(curl.CONNECTTIMEOUT, self.connect_timeout)
    curl.setopt(curl.TIMEOUT, self.timeout)
    curl.setopt(curl.DEBUGFUNCTION, _debug)
    curl.setopt(curl.VERBOSE, 1)

    if data is not None:
        curl.setopt(curl.POSTFIELDS, nfw.utils.if_unicode_to_utf8(data))
    else:
        curl.setopt(curl.POSTFIELDS, nfw.utils.if_unicode_to_utf8(''))

    send_headers = list()
    for header in headers:
        send_header = nfw.utils.if_unicode_to_utf8(
            "%s: %s" % (header, headers[header]))
        send_headers.append(send_header)
    curl.setopt(pycurl.HTTPHEADER, send_headers)

    # Map the framework's method constants onto the verb libcurl should send.
    methods = {
        nfw.HTTP_GET: 'GET',
        nfw.HTTP_PUT: 'PUT',
        nfw.HTTP_POST: 'POST',
        nfw.HTTP_PATCH: 'PATCH',
        nfw.HTTP_DELETE: 'DELETE',
        nfw.HTTP_OPTIONS: 'OPTIONS',
        nfw.HTTP_HEAD: 'HEAD',
        nfw.HTTP_TRACE: 'TRACE',
        nfw.HTTP_CONNECT: 'CONNECT',
    }
    if method not in methods:
        raise nfw.Error("Invalid request type %s" % (method, ))
    curl.setopt(curl.CUSTOMREQUEST, nfw.utils.if_unicode_to_utf8(methods[method]))

    try:
        curl.perform()
        status = curl.getinfo(pycurl.HTTP_CODE)
    except pycurl.error as e:
        del self.curl_session[host]
        if e.args[0] == 28:
            # 28 == CURLE_OPERATION_TIMEDOUT
            raise nfw.RestClientError("Connection timeout %s" % (host, ))
        raise

    # Figure out what encoding was sent with the response, if any.
    # Check against lowercased header name.
    encoding = None
    if 'content-type' in self.server_headers:
        content_type = self.server_headers['content-type'].lower()
        match = re.search(r'charset=(\S+)', content_type)
        if match:
            encoding = match.group(1)

    if encoding is None:
        # Default encoding for JSON is UTF-8.
        # Other content types may have different default encoding,
        # or in case of binary data, may have no encoding at all.
        encoding = 'utf_8'

    body = buffer.getvalue()
    # Decode using the encoding we figured out.
    body = body.decode(encoding)

    resp_header = nfw.Headers()
    for h in self.server_headers:
        resp_header[h] = self.server_headers[h]

    return (status, resp_header, body)
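
execute() relies on self.header_function to populate self.server_headers with lowercased header names (it later reads server_headers['content-type']). Below is a minimal sketch of such a callback following the standard pycurl HEADERFUNCTION pattern; make_header_collector is not part of the original class and the URL is only illustrative.

import pycurl
from io import BytesIO


def make_header_collector(store):
    def header_function(header_line):
        # pycurl delivers raw header lines as bytes; HTTP headers are safe to
        # decode as iso-8859-1.
        header_line = header_line.decode('iso-8859-1')
        if ':' not in header_line:
            # Skip the status line and the blank line terminating the headers.
            return
        name, value = header_line.split(':', 1)
        store[name.strip().lower()] = value.strip()
    return header_function


# Usage against an illustrative URL:
server_headers = {}
body = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'http://example.com/')
curl.setopt(pycurl.WRITEDATA, body)
curl.setopt(pycurl.HEADERFUNCTION, make_header_collector(server_headers))
curl.perform()
curl.close()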