def enviaPeticion(url, dominio, ruta, cookieDicc=None):
    """Send an HTTP GET request to *url* carrying the given cookies.

    :param url: Full URL to request.
    :param dominio: Cookie domain, forwarded to ``crearCookie``.
    :param ruta: Cookie path, forwarded to ``crearCookie``.
    :param cookieDicc: Mapping of cookie names to values. Defaults to
        ``{"TESTID": "set"}`` when omitted.
    :return: Whatever ``verificacionAcceso`` returns for the response, or
        the string ``"Pagina fuera de servicio"`` on an HTTP error.
    """
    # BUG FIX: the old signature used a mutable default dict
    # (cookieDicc={"TESTID": "set"}), shared across calls; build a fresh
    # one per call instead.
    if cookieDicc is None:
        cookieDicc = {"TESTID": "set"}
    try:
        # Cookie container for this single request.
        jar = CookieJar()
        # Request object the cookie header will be attached to.
        peticion = urllib.request.Request(url=url)
        # crearCookie builds each cookie; the jar collects them.
        for key, item in cookieDicc.items():
            jar.set_cookie(crearCookie(key, item, dominio, ruta))
        # Serialize the jar onto the request's Cookie header.
        jar.add_cookie_header(peticion)
        # Issue the request.
        edmundoDantes = urllib.request.build_opener()
        abreteSesamo = edmundoDantes.open(peticion)
        RiquezaYVenganza = verificacionAcceso(abreteSesamo)
        if RiquezaYVenganza:
            print("Busca tu propio Arbol")
        else:
            print("!(Busca tu propio arbol)")
        return RiquezaYVenganza
    except urllib.error.HTTPError as err:
        # Server answered with an HTTP error status.
        print("Pagina fuera de servicio")
        return "Pagina fuera de servicio"
def testCookieAdapters(self):
    """Round-trip a cookie through the Response/Request adapters.

    Verifies that a cookie set on a ``Response`` is picked up by the jar
    via ``ResponseCookieAdapter`` and written back onto a ``Request`` via
    ``RequestCookieAdapter``.
    """
    # BUG FIX: TestCase.assert_ is a deprecated alias removed in Python
    # 3.12; use assertTrue (messages preserved verbatim).
    jar = CookieJar(policy=None)  # DefaultCookiePolicy())

    # set a cookie
    res = Response()
    tstval = str(uuid.uuid4())
    res.set_cookie("a-cookie", tstval, domain="example.com")
    cookies = jar.make_cookies(filters.ResponseCookieAdapter(res),
                               Request.blank("http://example.com"))
    for c in cookies:
        jar.set_cookie(c)

    self.assertTrue(len(jar), ("where's my cookies?"))
    self.assertTrue("a-cookie" in [c.name for c in jar],
                    "seriously, where's my cookie")

    # now put the header on the request please
    request = Request.blank("http://example.com")
    self.assertTrue(".example.com" in jar._cookies.keys(),
                    jar._cookies.keys())
    jar.add_cookie_header(filters.RequestCookieAdapter(request))
    self.assertTrue("Cookie" in request.headers,
                    (str(request), "Y NO COOKIES?"))
class HttpTransport(Transport):
    """
    HTTP transport using urllib2. Provides basic HTTP transport with
    cookie and proxy support but no authentication.
    """

    def __init__(self, **kwargs):
        """
        @param kwargs: Keyword arguments.
            - B{proxy} - An http proxy to be specified on requests.
                 The proxy is defined as {protocol:proxy,}
                    - type: I{dict}
                    - default: {}
            - B{timeout} - Set the url open timeout (seconds).
                    - type: I{float}
                    - default: 90
        """
        Transport.__init__(self)
        Unskin(self.options).update(kwargs)
        # One jar shared by every request made through this transport.
        self.cookiejar = CookieJar()
        self.proxy = {}
        self.urlopener = None

    def open(self, request):
        """Open the request URL; return the file-like response object."""
        try:
            url = request.url
            log.debug('opening (%s)', url)
            u2request = urllib.request.Request(url)
            self.proxy = self.options.proxy
            return self.u2open(u2request)
        except urllib.error.HTTPError as e:
            raise TransportError(str(e), e.code, e.fp)

    def send(self, request):
        """Send the request message; return a Reply, or None on 202/204."""
        result = None
        url = request.url
        msg = request.message
        headers = request.headers
        try:
            u2request = urllib.request.Request(url, msg, headers)
            self.addcookies(u2request)
            self.proxy = self.options.proxy
            request.headers.update(u2request.headers)
            log.debug('sending:\n%s', request)
            fp = self.u2open(u2request)
            self.getcookies(fp, u2request)
            # BUG FIX: on Python 3 fp.headers is an email.message.Message
            # which has no ``.dict`` attribute; build a plain dict of the
            # response headers instead of the old ``fp.headers.dict``.
            result = Reply(200, dict(fp.headers.items()), fp.read())
            log.debug('received:\n%s', result)
        except urllib.error.HTTPError as e:
            # 202/204 carry no body and are not errors for our purposes.
            if e.code in (202, 204):
                result = None
            else:
                raise TransportError(e.msg, e.code, e.fp)
        return result

    def addcookies(self, u2request):
        """
        Add cookies in the cookiejar to the request.
        @param u2request: A urllib2 request.
        @rtype: u2request: urllib2.Request.
        """
        self.cookiejar.add_cookie_header(u2request)

    def getcookies(self, fp, u2request):
        """
        Add cookies in the request to the cookiejar.
        @param u2request: A urllib2 request.
        @rtype: u2request: urllib2.Request.
        """
        self.cookiejar.extract_cookies(fp, u2request)

    def u2open(self, u2request):
        """
        Open a connection.
        @param u2request: A urllib2 request.
        @type u2request: urllib2.Request.
        @return: The opened file-like urllib2 object.
        @rtype: fp
        """
        tm = self.options.timeout
        url = self.u2opener()
        if self.u2ver() < 2.6:
            # Old urllib2 had no per-call timeout; fall back to the
            # process-wide socket default.
            socket.setdefaulttimeout(tm)
            return url.open(u2request)
        else:
            return url.open(u2request, timeout=tm)

    def u2opener(self):
        """
        Create a urllib opener.
        @return: An opener.
        @rtype: I{OpenerDirector}
        """
        if self.urlopener is None:
            return urllib.request.build_opener(*self.u2handlers())
        else:
            return self.urlopener

    def u2handlers(self):
        """
        Get a collection of urllib handlers.
        @return: A list of handlers to be installed in the opener.
        @rtype: [Handler,...]
        """
        handlers = []
        handlers.append(urllib.request.ProxyHandler(self.proxy))
        return handlers

    def u2ver(self):
        """
        Get the major/minor version of the urllib2 lib.
        @return: The urllib2 version; 0 when it cannot be determined.
        @rtype: float
        """
        try:
            part = urllib.request.__version__.split('.', 1)
            n = float('.'.join(part))
            return n
        except Exception as e:
            log.exception(e)
            return 0

    def __deepcopy__(self, memo=None):
        # BUG FIX: the old signature used a mutable default ``memo={}``;
        # the argument is unused but the anti-pattern is removed.
        clone = self.__class__()
        p = Unskin(self.options)
        cp = Unskin(clone.options)
        cp.update(p)
        return clone
class Cookies(MutableMapping):
    """
    HTTP Cookies, as a mutable mapping.
    """

    def __init__(self, cookies: CookieTypes = None) -> None:
        # Accept None, a plain dict of name->value, another Cookies
        # instance (copied), or a pre-built CookieJar (adopted as-is).
        if cookies is None or isinstance(cookies, dict):
            self.jar = CookieJar()
            if isinstance(cookies, dict):
                for key, value in cookies.items():
                    self.set(key, value)
        elif isinstance(cookies, Cookies):
            self.jar = CookieJar()
            for cookie in cookies.jar:
                self.jar.set_cookie(cookie)
        else:
            self.jar = cookies

    def extract_cookies(self, response: Response) -> None:
        """
        Loads any cookies based on the response `Set-Cookie` headers.
        """
        urllib_response = self._CookieCompatResponse(response)
        urllib_request = self._CookieCompatRequest(response.request)
        self.jar.extract_cookies(urllib_response, urllib_request)  # type: ignore

    def set_cookie_header(self, request: Request) -> None:
        """
        Sets an appropriate 'Cookie:' HTTP header on the `Request`.
        """
        urllib_request = self._CookieCompatRequest(request)
        self.jar.add_cookie_header(urllib_request)

    def set(self, name: str, value: str, domain: str = "", path: str = "/") -> None:
        """
        Set a cookie value by name. May optionally include domain and path.
        """
        kwargs = {
            "version": 0,
            "name": name,
            "value": value,
            "port": None,
            "port_specified": False,
            "domain": domain,
            "domain_specified": bool(domain),
            "domain_initial_dot": domain.startswith("."),
            "path": path,
            "path_specified": bool(path),
            "secure": False,
            "expires": None,
            "discard": True,
            "comment": None,
            "comment_url": None,
            "rest": {"HttpOnly": None},
            "rfc2109": False,
        }
        cookie = Cookie(**kwargs)  # type: ignore
        self.jar.set_cookie(cookie)

    def get(  # type: ignore
        self,
        name: str,
        default: typing.Optional[str] = None,
        domain: typing.Optional[str] = None,
        path: typing.Optional[str] = None,
    ) -> typing.Optional[str]:
        """
        Get a cookie by name. May optionally include domain and path
        in order to specify exactly which cookie to retrieve.

        Raises `CookieConflict` if more than one cookie matches.
        """
        # FIX: parameters that accept None are annotated Optional[str]
        # (PEP 484 bans implicit Optional); behavior is unchanged.
        value = None
        for cookie in self.jar:
            if cookie.name == name:
                if domain is None or cookie.domain == domain:  # type: ignore
                    if path is None or cookie.path == path:
                        if value is not None:
                            message = f"Multiple cookies exist with name={name}"
                            raise CookieConflict(message)
                        value = cookie.value
        if value is None:
            return default
        return value

    def delete(
        self,
        name: str,
        domain: typing.Optional[str] = None,
        path: typing.Optional[str] = None,
    ) -> None:
        """
        Delete a cookie by name. May optionally include domain and path
        in order to specify exactly which cookie to delete.
        """
        # Exact match: the jar can clear it directly.
        if domain is not None and path is not None:
            return self.jar.clear(domain, path, name)
        # Otherwise collect all matches first, then clear each one.
        remove = []
        for cookie in self.jar:
            if cookie.name == name:
                if domain is None or cookie.domain == domain:  # type: ignore
                    if path is None or cookie.path == path:
                        remove.append(cookie)
        for cookie in remove:
            self.jar.clear(cookie.domain, cookie.path, cookie.name)  # type: ignore

    def clear(
        self,
        domain: typing.Optional[str] = None,
        path: typing.Optional[str] = None,
    ) -> None:
        """
        Delete all cookies. Optionally include a domain and path in
        order to only delete a subset of all the cookies.
        """
        args = []
        if domain is not None:
            args.append(domain)
        if path is not None:
            # CookieJar.clear requires a domain when a path is given.
            assert domain is not None
            args.append(path)
        self.jar.clear(*args)

    def update(self, cookies: CookieTypes = None) -> None:  # type: ignore
        """Merge cookies from *cookies* into this container."""
        cookies = Cookies(cookies)
        for cookie in cookies.jar:
            self.jar.set_cookie(cookie)

    def __setitem__(self, name: str, value: str) -> None:
        return self.set(name, value)

    def __getitem__(self, name: str) -> str:
        value = self.get(name)
        if value is None:
            raise KeyError(name)
        return value

    def __delitem__(self, name: str) -> None:
        return self.delete(name)

    def __len__(self) -> int:
        return len(self.jar)

    def __iter__(self) -> typing.Iterator[str]:
        return (cookie.name for cookie in self.jar)

    def __bool__(self) -> bool:
        for _ in self.jar:
            return True
        return False

    class _CookieCompatRequest(urllib.request.Request):
        """
        Wraps a `Request` instance up in a compatibility interface suitable
        for use with `CookieJar` operations.
        """

        def __init__(self, request: Request) -> None:
            super().__init__(
                url=str(request.url),
                headers=dict(request.headers),
                method=request.method,
            )
            self.request = request

        def add_unredirected_header(self, key: str, value: str) -> None:
            super().add_unredirected_header(key, value)
            # Mirror the header onto the wrapped Request as well.
            self.request.headers[key] = value

    class _CookieCompatResponse:
        """
        Wraps a `Response` instance up in a compatibility interface suitable
        for use with `CookieJar` operations.
        """

        def __init__(self, response: Response):
            self.response = response

        def info(self) -> email.message.Message:
            # CookieJar reads Set-Cookie headers via .info().
            info = email.message.Message()
            for key, value in self.response.headers.items():
                info[key] = value
            return info
class HttpTransport(Transport):
    """
    Basic HTTP transport implemented using urllib2, that provides for
    cookies & proxies but no authentication.
    """

    def __init__(self, **kwargs):
        """
        @param kwargs: Keyword arguments.
            - B{proxy} - An HTTP proxy to be specified on requests.
                 The proxy is defined as {protocol:proxy,}
                    - type: I{dict}
                    - default: {}
            - B{timeout} - Set the URL open timeout (seconds).
                    - type: I{float}
                    - default: 90
        """
        Transport.__init__(self)
        Unskin(self.options).update(kwargs)
        # One jar shared by every request made through this transport.
        self.cookiejar = CookieJar()
        self.proxy = {}
        self.urlopener = None

    def open(self, request):
        """Open the request URL; return the file-like response object."""
        try:
            url = self.__get_request_url_for_urllib(request)
            log.debug('opening (%s)', url)
            u2request = urllib.request.Request(url)
            self.proxy = self.options.proxy
            return self.u2open(u2request)
        except urllib.error.HTTPError as e:
            raise TransportError(str(e), e.code, e.fp)

    def send(self, request):
        """Send the request message; return a Reply, or None on 202/204."""
        url = self.__get_request_url_for_urllib(request)
        msg = request.message
        headers = request.headers
        try:
            u2request = urllib.request.Request(url, msg, headers)
            self.addcookies(u2request)
            self.proxy = self.options.proxy
            request.headers.update(u2request.headers)
            log.debug('sending:\n%s', request)
            fp = self.u2open(u2request)
            self.getcookies(fp, u2request)
            headers = fp.headers
            # Python 2 exposes headers as a mimetools.Message whose dict
            # form lives in ``.dict``; Python 3 headers map directly.
            if sys.version_info < (3, 0):
                headers = headers.dict
            reply = Reply(http.client.OK, headers, fp.read())
            log.debug('received:\n%s', reply)
            return reply
        except urllib.error.HTTPError as e:
            # 202/204 carry no body and are not errors for our purposes;
            # implicitly returns None for them.
            if e.code not in (http.client.ACCEPTED, http.client.NO_CONTENT):
                raise TransportError(e.msg, e.code, e.fp)

    def addcookies(self, u2request):
        """
        Add cookies in the cookiejar to the request.
        @param u2request: A urllib2 request.
        @rtype: u2request: urllib2.Request.
        """
        self.cookiejar.add_cookie_header(u2request)

    def getcookies(self, fp, u2request):
        """
        Add cookies in the request to the cookiejar.
        @param u2request: A urllib2 request.
        @rtype: u2request: urllib2.Request.
        """
        self.cookiejar.extract_cookies(fp, u2request)

    def u2open(self, u2request):
        """
        Open a connection.
        @param u2request: A urllib2 request.
        @type u2request: urllib2.Request.
        @return: The opened file-like urllib2 object.
        @rtype: fp
        """
        tm = self.options.timeout
        url = self.u2opener()
        # Pre-2.6 urllib2 (Python 2 only) had no per-call timeout; fall
        # back to the process-wide socket default.
        if (sys.version_info < (3, 0)) and (self.u2ver() < 2.6):
            socket.setdefaulttimeout(tm)
            return url.open(u2request)
        return url.open(u2request, timeout=tm)

    def u2opener(self):
        """
        Create a urllib opener.
        @return: An opener.
        @rtype: I{OpenerDirector}
        """
        if self.urlopener is None:
            return urllib.request.build_opener(*self.u2handlers())
        return self.urlopener

    def u2handlers(self):
        """
        Get a collection of urllib handlers.
        @return: A list of handlers to be installed in the opener.
        @rtype: [Handler,...]
        """
        return [urllib.request.ProxyHandler(self.proxy)]

    def u2ver(self):
        """
        Get the major/minor version of the urllib2 lib.
        @return: The urllib2 version; 0 when it cannot be determined.
        @rtype: float
        """
        try:
            part = urllib.request.__version__.split('.', 1)
            return float('.'.join(part))
        except Exception as e:
            log.exception(e)
            return 0

    def __deepcopy__(self, memo=None):
        # BUG FIX: the old signature used a mutable default ``memo={}``;
        # the argument is unused but the anti-pattern is removed.
        clone = self.__class__()
        p = Unskin(self.options)
        cp = Unskin(clone.options)
        cp.update(p)
        return clone

    @staticmethod
    def __get_request_url_for_urllib(request):
        """
        Returns the given request's URL, properly encoded for use with
        urllib.

        We expect that the given request object already verified that the
        URL contains ASCII characters only and stored it as a native str
        value.

        urllib accepts URL information as a native str value and may break
        unexpectedly if given URL information in another format.

        Python 3.x httplib.client implementation must be given a unicode
        string and not a bytes object and the given string is internally
        converted to a bytes object using an explicitly specified ASCII
        encoding.

        Python 2.7 httplib implementation expects the URL passed to it to
        not be a unicode string. If it is, then passing it to the
        underlying httplib Request object will cause that object to
        forcefully convert all of its data to unicode, assuming that data
        contains ASCII data only and raising a UnicodeDecodeError exception
        if it does not (caused by simple unicode + string concatenation).

        Python 2.4 httplib implementation does not really care about this
        as it does not use the internal optimization present in the Python
        2.7 implementation causing all the requested data to be converted
        to unicode.
        """
        assert isinstance(request.url, str)
        return request.url
class HttpTransport(Transport):
    """
    HTTP transport using urllib2. Provides basic HTTP transport with
    cookie and proxy support but no authentication.
    """

    def __init__(self, **kwargs):
        """
        @param kwargs: Keyword arguments.
            - B{proxy} - An http proxy to be specified on requests.
                 The proxy is defined as {protocol:proxy,}
                    - type: I{dict}
                    - default: {}
            - B{timeout} - Set the url open timeout (seconds).
                    - type: I{float}
                    - default: 90
        """
        Transport.__init__(self)
        Unskin(self.options).update(kwargs)
        # One jar shared by every request made through this transport.
        self.cookiejar = CookieJar()
        self.proxy = {}
        self.urlopener = None

    def open(self, request):
        """Open the request URL; return the file-like response object."""
        try:
            url = request.url
            log.debug('opening (%s)', url)
            u2request = u2.Request(url)
            self.proxy = self.options.proxy
            return self.u2open(u2request)
        except HTTPError as e:
            raise TransportError(str(e), e.code, e.fp)

    def send(self, request):
        """Send the request message; return a Reply, or None on 202/204."""
        result = None
        url = request.url
        msg = request.message
        headers = request.headers
        try:
            u2request = u2.Request(url, msg, headers)
            self.addcookies(u2request)
            self.proxy = self.options.proxy
            request.headers.update(u2request.headers)
            log.debug('sending:\n%s', request)
            fp = self.u2open(u2request)
            self.getcookies(fp, u2request)
            # CLEANUP: removed commented-out dead code that built the
            # Reply from the Python-2-only ``fp.headers.dict``.
            result = Reply(200, fp.headers, fp.read())
            log.debug('received:\n%s', result)
        except HTTPError as e:
            # 202/204 carry no body and are not errors for our purposes.
            if e.code in (202, 204):
                result = None
            else:
                raise TransportError(e.msg, e.code, e.fp)
        return result

    def addcookies(self, u2request):
        """
        Add cookies in the cookiejar to the request.
        @param u2request: A urllib2 request.
        @rtype: u2request: urllib2.Request.
        """
        self.cookiejar.add_cookie_header(u2request)

    def getcookies(self, fp, u2request):
        """
        Add cookies in the request to the cookiejar.
        @param u2request: A urllib2 request.
        @rtype: u2request: urllib2.Request.
        """
        self.cookiejar.extract_cookies(fp, u2request)

    def u2open(self, u2request):
        """
        Open a connection.
        @param u2request: A urllib2 request.
        @type u2request: urllib2.Request.
        @return: The opened file-like urllib2 object.
        @rtype: fp
        """
        tm = self.options.timeout
        url = self.u2opener()
        if self.u2ver() < 2.6:
            # Old urllib2 had no per-call timeout; fall back to the
            # process-wide socket default.
            socket.setdefaulttimeout(tm)
            return url.open(u2request)
        else:
            return url.open(u2request, timeout=tm)

    def u2opener(self):
        """
        Create a urllib opener.
        @return: An opener.
        @rtype: I{OpenerDirector}
        """
        if self.urlopener is None:
            return u2.build_opener(*self.u2handlers())
        else:
            return self.urlopener

    def u2handlers(self):
        """
        Get a collection of urllib handlers.
        @return: A list of handlers to be installed in the opener.
        @rtype: [Handler,...]
        """
        handlers = []
        handlers.append(u2.ProxyHandler(self.proxy))
        return handlers

    def u2ver(self):
        """
        Get the major/minor version of the urllib2 lib.
        @return: The urllib2 version; 0 when it cannot be determined.
        @rtype: float
        """
        try:
            part = u2.__version__.split('.', 1)
            n = float('.'.join(part))
            return n
        except Exception as e:
            log.exception(e)
            return 0

    def __deepcopy__(self, memo=None):
        # BUG FIX: the old signature used a mutable default ``memo={}``;
        # the argument is unused but the anti-pattern is removed.
        clone = self.__class__()
        p = Unskin(self.options)
        cp = Unskin(clone.options)
        cp.update(p)
        return clone
class CookieTransport(TimeoutTransport):
    '''A subclass of xmlrpclib.Transport that supports cookies.'''

    # Shared, lazily-created jar; remains None until the first request.
    cookiejar = None
    # URL scheme used when building the fake cookie request URL.
    scheme = 'http'

    # Cribbed from xmlrpclib.Transport.send_user_agent
    def send_cookies(self, connection, cookie_request):
        """Write the jar's applicable Cookie headers onto *connection*.

        Creates the jar on first use; on later calls lets the jar decide
        which cookies apply to *cookie_request* and copies the resulting
        Cookie/Cookie2 headers onto the live HTTP connection.
        """
        if self.cookiejar is None:
            self.cookiejar = CookieJar()
        elif self.cookiejar:
            # Let the cookiejar figure out what cookies are appropriate
            self.cookiejar.add_cookie_header(cookie_request)
            # Pull the cookie headers out of the request object...
            cookielist = list()
            for h, v in cookie_request.header_items():
                # startswith('Cookie') matches both Cookie and Cookie2.
                if h.startswith('Cookie'):
                    cookielist.append([h, v])
            # ...and put them over the connection
            for h, v in cookielist:
                connection.putheader(h, v)

    def single_request(self, host, handler, request_body, verbose=1):
        """Issue one XML-RPC request, round-tripping cookies via the jar.

        Same flow as xmlrpclib.Transport.single_request, plus the
        send_cookies / extract_cookies additions marked ADDED below.
        """
        # issue XML-RPC request
        h = self.make_connection(host)
        if verbose:
            h.set_debuglevel(1)

        # A synthetic urllib request used only as the CookieJar's notion
        # of "the request" for matching and header generation.
        request_url = "%s://%s/" % (self.scheme, host)
        cookie_request = urllib.request.Request(request_url)

        try:
            self.send_request(h, handler, request_body)
            self.send_host(h, host)
            self.send_cookies(h, cookie_request)  # ADDED. creates cookiejar if None.
            self.send_user_agent(h)
            self.send_content(h, request_body)

            response = h.getresponse(buffering=True)

            # ADDED: parse headers and get cookies here
            # fake a response object that we can fill with the headers above
            class CookieResponse:
                def __init__(self, headers):
                    self.headers = headers

                def info(self):
                    # CookieJar reads Set-Cookie headers via .info().
                    return self.headers

            cookie_response = CookieResponse(response.msg)
            # Okay, extract the cookies from the headers
            self.cookiejar.extract_cookies(cookie_response, cookie_request)
            # And write back any changes
            # (only persistent jars, e.g. LWPCookieJar, have .save)
            if hasattr(self.cookiejar, 'save'):
                self.cookiejar.save(self.cookiejar.filename)

            if response.status == 200:
                self.verbose = verbose
                return self.parse_response(response)
        except xmlrpc.client.Fault:
            # XML-RPC level faults pass through untouched.
            raise
        except Exception:
            # All unexpected errors leave connection in
            # a strange state, so we clear it.
            self.close()
            raise

        # Non-200 status: discard any response data and raise exception
        if (response.getheader("content-length", 0)):
            response.read()
        raise xmlrpc.client.ProtocolError(
            host + handler,
            response.status, response.reason,
            response.msg,
        )
class Session:
    """A minimal Jenkins HTTP session: cookies, basic auth, CSRF crumb."""

    def __init__(self, base, auth=None):
        # auth is an optional (user, token) pair used for HTTP Basic auth.
        self.auth = auth
        self.headers = {'User-Agent': 'foobar'}
        self.context = init_ssl()
        # Cookie jar shared by every request in this session.
        self.jar = CookieJar()
        # Keep only scheme://netloc of the given base URL.
        split = urlsplit(base)
        self.base = '{}://{}'.format(split.scheme, split.netloc)
        if self.auth:
            auth = ':'.join(self.auth)
            if sys.version_info >= (3,):
                basic = base64.b64encode(auth.encode('ascii')).decode('ascii')
            else:
                basic = base64.b64encode(auth)
            self.headers['Authorization'] = 'Basic {}'.format(basic)
        self._get_crumb()

    def _get_crumb(self):
        """
        Get the necessary crumb header if our Jenkins instance is CSRF
        protected, and automatically add it to this session's default
        headers.
        """
        try:
            # The xpath makes Jenkins return "HeaderName:crumbvalue".
            args = 'xpath=concat(//crumbRequestField,":",//crumb)'
            resp = self.get_url(self.base + '/crumbIssuer/api/xml?' + args)
        except HTTPError as err:
            # only ignore the error if it's a 404 (i.e. Jenkins is not CSRF
            # protected)
            if err.code != 404:
                raise
        else:
            key, value = resp.text.split(':')
            self.headers[key] = value

    def get_url(self, url, data=None, stream=False, retries=5):
        """GET (or POST when *data* is given) *url*; retries transient
        HTTP errors; returns the response with .text or a stream."""
        headers = self.headers.copy()
        if data is not None:
            # Presence of data turns this into a form-encoded POST.
            data = urlencode(data).encode('utf-8')
            headers['Content-Type'] = 'application/x-www-form-urlencoded'
            retries = 1  # do not retry POSTs
        req = Request(url, data, headers=headers)
        self.jar.add_cookie_header(req)
        for i in range(retries):  # pragma: nocover
            try:
                response = urlopen(req, context=self.context)
            except HTTPError:
                # Re-raise on the last attempt; otherwise back off briefly.
                if i == retries - 1:
                    raise
                time.sleep(0.1)
            else:
                break
        self.jar.extract_cookies(response, req)
        # Normalize headers to a case-insensitive mapping on both 2 and 3.
        if sys.version_info >= (3,):
            response.headers = CaseInsensitiveDict(response.headers._headers)
        else:
            response.headers = CaseInsensitiveDict(response.headers.dict)
        if stream:
            return stream_response(response)
        else:
            response.text = response.read().decode('utf-8')
            return response

    def get_job_params(self, url):
        """
        Get the list of allowed parameters and their respective choices.
        """
        url = url.rstrip('/') + '/api/json'
        response = self.get_url(url)
        response = json.loads(response.text)
        props = response.get('property', [])
        definition_prop = 'hudson.model.ParametersDefinitionProperty'
        # First property whose _class marks it as a parameters definition.
        defs = next(
            (
                p['parameterDefinitions']
                for p in props
                if p.get('_class', '') == definition_prop
            ),
            [],
        )
        if not defs:
            return {}
        params = {}
        for definition in defs:
            # choices is None for free-form parameters.
            params[definition['name']] = definition.get('choices', None)
        return params

    def launch_build(self, url, params=None):
        """
        Submit job and return the queue item location.
        """
        url = url.rstrip('/') + '/'
        job_params = self.get_job_params(url)
        validate_params(job_params, params)
        # Parameterized jobs use a different build endpoint.
        url += 'buildWithParameters' if job_params else 'build'
        url += '?delay=0'
        log('Sending build request')
        data = params or ""  # urllib will send a POST with an empty string
        response = self.get_url(url, data=data)
        assert (
            'Location' in response.headers
        ), 'Something went wrong with the Jenkins API'
        location = response.headers['Location']
        assert 'queue' in location, 'Something went wrong with the Jenkins API'
        return location

    def get_queue_status(self, location):
        """
        Check the status of a queue item. Returns the build url if the
        job is already executing, or None if it's still in the queue.
        """
        queue = location.rstrip('/') + '/api/json'
        response = self.get_url(queue)
        response = json.loads(response.text)
        if response.get('cancelled', False):
            raise RuntimeError('Build was cancelled')
        if response.get('executable', False):
            return response['executable']['url']
        return None

    @deprecate(instead='wait_queue')
    def wait_queue_item(self, *args, **kwargs):
        # Deprecated alias kept for backward compatibility.
        pass

    def wait_queue(self, location, interval=5.0):
        """
        Wait until the item starts building.
        """
        while True:
            job_url = self.get_queue_status(location)
            if job_url is not None:
                break
            show_progress('Job queued', interval)
        log('')
        return job_url

    @deprecate(instead='job_status')
    def get_job_status(self, *args, **kwargs):
        # Deprecated alias kept for backward compatibility.
        pass

    def job_status(self, build_url):
        """
        Check the status of a running build.

        Returns a tuple with the status of the build and the current
        stage. The status is True on successful exit, False on failure
        or None if the build is still running.
        """
        poll_url = build_url.rstrip('/') + '/wfapi/describe'
        try:
            response = self.get_url(poll_url)
        except HTTPError as error:
            if error.code == 404:
                build_number = build_url.rstrip('/').rpartition('/')[2]
                error.msg = 'Build #%s does not exist' % build_number
            raise
        response = json.loads(response.text)
        status = response.get('status', '')
        stages = response.get('stages', [{}])
        if status == 'NOT_EXECUTED':
            if response.get('durationMillis', 0) == 0:
                # Build has just been launched. Report it as in_progress
                return None, {}
            # Build finished as not_executed. Probably an in your Jenkinsfile
            return False, stages[-1]
        elif status == 'IN_PROGRESS':
            in_progress = [
                s for s in stages if s.get('status', '') == 'IN_PROGRESS'
            ]
            in_progress = in_progress or [{}]
            return None, in_progress[0]
        else:
            # Jenkins returns false negatives in the 'status' field sometimes.
            # Instead of trusting 'status', we will determine if the build
            # failed by checking if any of the stages failed.
            last = stages[-1]
            status = all(
                s.get('status', '') in ('SUCCESS', 'NOT_EXECUTED')
                for s in stages
            )
            return status, last

    @deprecate(instead='wait_job')
    def wait_for_job(self, *args, **kwargs):
        # Deprecated alias kept for backward compatibility.
        pass

    def wait_job(self, build_url, interval=5.0):
        """
        Wait until the build finishes.
        """
        name = '#' + build_url.rstrip('/').split('/')[-1]
        last_stage = None
        while True:
            status, stage = self.job_status(build_url)
            if status is not None:
                status_name = 'SUCCESS' if status else 'FAILURE'
                log('\nJob', name, 'ended in', status_name)
                return status
            stage_name = stage.get('name', '')
            msg = stage_name or 'Build %s in progress' % name
            millis = stage.get('durationMillis', None)
            # Print the stage name on its own line when it changes.
            if stage_name != last_stage:
                last_stage = stage_name
                msg = '\n' + msg
            show_progress(msg, interval, millis=millis)

    def retrieve_log(self, build_url):
        """
        Get the build log and return it as a string.
        """
        build_url = build_url.rstrip('/') + '/'
        url = build_url + 'consoleText'
        log = ''.join(
            block.text.decode('utf-8', errors='ignore')
            for block in self.get_url(url, stream=True)
        )
        return log

    @deprecate(instead='dump_log')
    def save_log_to_file(self, *args, **kwargs):
        # Deprecated alias kept for backward compatibility.
        pass

    def dump_log(self, build_url, filename=None):
        """
        Save the build log to a file.
        """
        build_url = build_url.rstrip('/') + '/'
        # Output target priority: explicit filename, CONFIG['output'],
        # else a name derived from the job path.
        if filename:
            file = filename
        elif CONFIG['output'] and CONFIG['output'] is not True:
            file = CONFIG['output']
        else:
            job_name = build_url[build_url.find('/job/') :]
            job_name = (
                job_name.replace('/', '_').replace('_job_', '_').strip('_')
            )
            file = job_name + '.txt'
        # *file* may already be an open file-like object.
        isfile = hasattr(file, 'write')
        if not isfile:
            file = io.open(file, 'w', encoding='utf-8')
        file.write(self.retrieve_log(build_url))
        if not isfile:
            file.close()
        log('Job output saved to', file)
def unshort_url(url, parse_documents=False, enable_cookies=None, **kwargs):
    """Try to unshort the given URL (follow http redirects).

    Parameters:
        url (`str`):
            Shortened URL.

        parse_documents (`bool`, *optional*):
            If True, Unalix will also try to obtain the unshortened URL from the response's body.

        enable_cookies (`bool`, *optional*):
            True: Unalix will handle cookies for all requests.
            False: Unalix will not handle cookies.
            None (default): Unalix will handle cookies only if needed.

            In most cases, cookies returned in HTTP responses are useless. They do not need to be stored
            or sent back to the server. Keeping this as "None" should be enough for you. Only set this parameter
            to True if you get stuck at some redirect loop due to missing cookies.

        **kwargs (`bool`, *optional*):
            Optional arguments that `parse_rules` takes.

    Raises:
        ConnectionError: In case some error occurred during the request.

        TooManyRedirects: In case the request exceeded maximum allowed redirects.

        InvalidURL: In case the provided *url* is not a valid URL or hostname.

        InvalidScheme: In case the provided *url* has a invalid or unknown scheme.

        InvalidContentEncoding: In case the "Content-Enconding" header has a invalid value.

    Usage:
      >>> from unalix import unshort_url
      >>> unshort_url("https://bitly.is/Pricing-Pop-Up")
      'https://bitly.com/pages/pricing'
    """
    # Clean tracking parameters from the initial URL before requesting it.
    url = parse_rules(parse_url(url), **kwargs)

    # Select the cookie policy matching the enable_cookies tri-state.
    if enable_cookies is None:
        cookies = CookieJar(policy=allow_cookies_if_needed)
    elif enable_cookies is True:
        cookies = CookieJar(policy=allow_all_cookies)
    else:
        cookies = CookieJar(policy=deny_all_cookies)

    total_redirects = 0

    # Follow redirects manually so each hop can be cleaned by parse_rules.
    while True:
        if total_redirects > max_redirects:
            raise TooManyRedirects(
                "The request exceeded maximum allowed redirects", url
            )

        scheme, netloc, path, params, query, fragment = urlparse(url)
        connection = create_connection(scheme, netloc)

        add_missing_attributes(url, connection)

        # Re-attach the query string; urlparse split it off the path.
        if query:
            path = f"{path}?{query}"

        cookies.add_cookie_header(connection)

        headers = connection.headers
        headers.update(default_headers)

        try:
            connection.request("GET", path, headers=headers)
            response = connection.getresponse()
        except Exception as exception:
            # Any socket/protocol failure is surfaced as ConnectionError.
            raise ConnectionError(str(exception), url)

        cookies.extract_cookies(response, connection)

        redirect_url = handle_redirects(url, response)

        # A str result means another hop: clean it and loop again.
        if isinstance(redirect_url, str):
            total_redirects = total_redirects + 1
            url = parse_rules(redirect_url, **kwargs)
            continue

        # Optionally look for the target URL inside the response body.
        if parse_documents:
            extracted_url = extract_url(url, response)
            if isinstance(extracted_url, str):
                url = parse_rules(extracted_url, **kwargs)
                continue

        break

    # Close the last response if handle_redirects left it open.
    if not response.isclosed():
        response.close()

    return url
class HttpTransport(Transport):
    """
    Basic HTTP transport implemented using urllib2, that provides for
    cookies & proxies but no authentication.
    """

    def __init__(self, **kwargs):
        """
        @param kwargs: Keyword arguments.
            - B{proxy} - An HTTP proxy to be specified on requests.
                 The proxy is defined as {protocol:proxy,}
                    - type: I{dict}
                    - default: {}
            - B{timeout} - Set the URL open timeout (seconds).
                    - type: I{float}
                    - default: 90
        """
        Transport.__init__(self)
        Unskin(self.options).update(kwargs)
        # One jar shared by every request made through this transport.
        self.cookiejar = CookieJar()
        self.proxy = {}
        self.urlopener = None

    def open(self, request):
        """Open the request URL; return the file-like response object."""
        try:
            url = self.__get_request_url_for_urllib(request)
            log.debug('opening (%s)', url)
            u2request = urllib.request.Request(url)
            self.proxy = self.options.proxy
            return self.u2open(u2request)
        except urllib.error.HTTPError as e:
            raise TransportError(str(e), e.code, e.fp)

    def send(self, request):
        """Send the request message; return a Reply, or None on 202/204."""
        url = self.__get_request_url_for_urllib(request)
        msg = request.message
        headers = request.headers
        try:
            u2request = urllib.request.Request(url, msg, headers)
            self.addcookies(u2request)
            self.proxy = self.options.proxy
            request.headers.update(u2request.headers)
            log.debug('sending:\n%s', request)
            fp = self.u2open(u2request)
            self.getcookies(fp, u2request)
            headers = fp.headers
            # Python 2 exposes headers as a mimetools.Message whose dict
            # form lives in ``.dict``; Python 3 headers map directly.
            if sys.version_info < (3, 0):
                headers = headers.dict
            reply = Reply(http.client.OK, headers, fp.read())
            log.debug('received:\n%s', reply)
            return reply
        except urllib.error.HTTPError as e:
            # 202/204 carry no body and are not errors for our purposes;
            # implicitly returns None for them.
            if e.code not in (http.client.ACCEPTED, http.client.NO_CONTENT):
                raise TransportError(e.msg, e.code, e.fp)

    def addcookies(self, u2request):
        """
        Add cookies in the cookiejar to the request.
        @param u2request: A urllib2 request.
        @rtype: u2request: urllib2.Request.
        """
        self.cookiejar.add_cookie_header(u2request)

    def getcookies(self, fp, u2request):
        """
        Add cookies in the request to the cookiejar.
        @param u2request: A urllib2 request.
        @rtype: u2request: urllib2.Request.
        """
        self.cookiejar.extract_cookies(fp, u2request)

    def u2open(self, u2request):
        """
        Open a connection.
        @param u2request: A urllib2 request.
        @type u2request: urllib2.Request.
        @return: The opened file-like urllib2 object.
        @rtype: fp
        """
        tm = self.options.timeout
        url = self.u2opener()
        # Pre-2.6 urllib2 (Python 2 only) had no per-call timeout; fall
        # back to the process-wide socket default.
        if (sys.version_info < (3, 0)) and (self.u2ver() < 2.6):
            socket.setdefaulttimeout(tm)
            return url.open(u2request)
        return url.open(u2request, timeout=tm)

    def u2opener(self):
        """
        Create a urllib opener.
        @return: An opener.
        @rtype: I{OpenerDirector}
        """
        if self.urlopener is None:
            return urllib.request.build_opener(*self.u2handlers())
        return self.urlopener

    def u2handlers(self):
        """
        Get a collection of urllib handlers.
        @return: A list of handlers to be installed in the opener.
        @rtype: [Handler,...]
        """
        return [urllib.request.ProxyHandler(self.proxy)]

    def u2ver(self):
        """
        Get the major/minor version of the urllib2 lib.
        @return: The urllib2 version; 0 when it cannot be determined.
        @rtype: float
        """
        try:
            # BUG FIX: this referenced the undefined name ``urllib2``
            # (the rest of the class uses urllib.request), so the broad
            # except silently swallowed a NameError and always returned 0.
            part = urllib.request.__version__.split('.', 1)
            return float('.'.join(part))
        except Exception as e:
            log.exception(e)
            return 0

    def __deepcopy__(self, memo=None):
        # BUG FIX: the old signature used a mutable default ``memo={}``;
        # the argument is unused but the anti-pattern is removed.
        clone = self.__class__()
        p = Unskin(self.options)
        cp = Unskin(clone.options)
        cp.update(p)
        return clone

    @staticmethod
    def __get_request_url_for_urllib(request):
        """
        Returns the given request's URL, properly encoded for use with
        urllib.

        We expect that the given request object already verified that the
        URL contains ASCII characters only and stored it as a native str
        value.

        urllib accepts URL information as a native str value and may break
        unexpectedly if given URL information in another format.

        Python 3.x httplib.client implementation must be given a unicode
        string and not a bytes object and the given string is internally
        converted to a bytes object using an explicitly specified ASCII
        encoding.

        Python 2.7 httplib implementation expects the URL passed to it to
        not be a unicode string. If it is, then passing it to the
        underlying httplib Request object will cause that object to
        forcefully convert all of its data to unicode, assuming that data
        contains ASCII data only and raising a UnicodeDecodeError exception
        if it does not (caused by simple unicode + string concatenation).

        Python 2.4 httplib implementation does not really care about this
        as it does not use the internal optimization present in the Python
        2.7 implementation causing all the requested data to be converted
        to unicode.
        """
        assert isinstance(request.url, str)
        return request.url
class CookieTransport(TimeoutTransport): '''A subclass of xmlrpclib.Transport that supports cookies.''' cookiejar = None scheme = 'http' # Cribbed from xmlrpclib.Transport.send_user_agent def send_cookies(self, connection, cookie_request): if self.cookiejar is None: self.cookiejar = CookieJar() elif self.cookiejar: # Let the cookiejar figure out what cookies are appropriate self.cookiejar.add_cookie_header(cookie_request) # Pull the cookie headers out of the request object... cookielist = list() for h, v in cookie_request.header_items(): if h.startswith('Cookie'): cookielist.append([h, v]) # ...and put them over the connection for h, v in cookielist: connection.putheader(h, v) # This is the same request() method from xmlrpclib.Transport, # with a couple additions noted below def request(self, host, handler, request_body, verbose=0): h = self.make_connection(host) if verbose: h.set_debuglevel(1) request_url = "%s://%s/" % (self.scheme, host) cookie_request = urllib.request.Request(request_url) self.send_request(h, handler, request_body) self.send_host(h, host) self.send_cookies(h, cookie_request) # ADDED. creates cookiejar if None. self.send_user_agent(h) self.send_content(h, request_body) errcode, errmsg, headers = h.getreply() # ADDED: parse headers and get cookies here # fake a response object that we can fill with the headers above class CookieResponse: def __init__(self, headers): self.headers = headers def info(self): return self.headers cookie_response = CookieResponse(headers) # Okay, extract the cookies from the headers self.cookiejar.extract_cookies(cookie_response, cookie_request) # And write back any changes if hasattr(self.cookiejar, 'save'): self.cookiejar.save(self.cookiejar.filename) if errcode != 200: raise xmlrpc.client.ProtocolError( host + handler, errcode, errmsg, headers ) self.verbose = verbose try: sock = h._conn.sock except AttributeError: sock = None return self._parse_response(h.getfile(), sock)
class HttpClient(ObjectWithLogger):
    """An HTTP client with retry, redirect-following, cookie and
    conditional-download support built on http.client."""

    ATTEMPTS = 10
    LOGGER_NAME = 'HTTP'
    SLEEP_TIME = 1  # s
    TIMEOUT = 3  # s
    DEFAULT_ACCEPT_HEADERS = {
        'Accept': '*/*;q=0.1',
        'Accept-Charset': 'utf-8;q=1.0, *;q=0.1',
        'Accept-Encoding': 'gzip, br, deflate;q=1.0, *;q=0.5',
        'Accept-Language': 'en-US, en;q=1.0, *;q=0.5'
    }
    DEFAULT_ACCESS_CONTROL_REQUEST_HEADERS = {
        'Access-Control-Request-Headers':
            'Accept, Accept-Charset, Accept-Encoding, Accept-Language,'
            ' Connection, Content-Type, DNT',
        'Access-Control-Request-Method': 'GET, HEAD, OPTIONS'
    }
    DEFAULT_HEADERS = {
        'Connection': 'keep-alive',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1'
    }
    DEFAULT_HEAD_HEADERS = dict(**DEFAULT_HEADERS, **DEFAULT_ACCEPT_HEADERS)
    DEFAULT_OPTIONS_HEADERS = dict(**DEFAULT_HEADERS,
                                   **DEFAULT_ACCESS_CONTROL_REQUEST_HEADERS)
    # Tri-state capability flags: None = unknown, True/False once probed.
    isHeadAllowed = None
    isOptionsAllowed = None

    def __init__(self):
        """Initialize an HttpClient instance."""
        super().__init__()
        self.cookieJar = CookieJar()

    @contextmanager
    def connect(self, server, protocol='https', attempts=ATTEMPTS):
        """Connect to an HTTP server.

        Yields the open connection; retries up to `attempts` times, follows
        redirects by recursing, and always closes the connection on exit.
        Raises ExitException when no connection can be established.
        """
        if not can_connect_to_internet():
            self.logError('No Internet connection.')
            raise ExitException()
        self.url = urlsplit(urlunsplit((protocol, server, '', '', '')))
        if self.url.scheme == 'https':
            connectionClass = HTTPSConnection
            port = HTTPS_PORT
        elif self.url.scheme == 'http':
            connectionClass = HTTPConnection
            port = HTTP_PORT
        else:
            self.logError('Unknown protocol.')
            raise ExitException()
        while attempts:
            try:
                self.connection = connectionClass(self.url.netloc, port=port,
                                                  timeout=self.TIMEOUT)
                self.logInfo('Connecting to ' + self.url.netloc + '...')
                self.connection.connect()
                self.logInfo('...connected; checking server options...')
                self.requestServerOptions()
                self.logInfo('...connection negotiated; yielding...')
                yield self.connection
            except gaierror as error:
                self.logDebug('...error getting address info; error code '
                              + str(error.errno) + ': "' + error.strerror
                              + '"...')
                if error.errno == -2:  # Name or service not known
                    self.logDebug('netloc = ' + str(self.url.netloc))
                    # Unresolvable name: retrying is pointless, so force
                    # this to be the final attempt.
                    attempts = 1
            except HTTPException:
                raise
            except HttpRedirect as redirect:
                self.logInfo('...following redirect...')
                with self.connect(redirect.url.netloc, attempts=attempts) \
                        as connection:
                    yield connection
            else:
                # Caller's with-block completed without error: done.
                return
            finally:
                self.logInfo('...closing connection...')
                self.connection.close()
            attempts -= 1
            if attempts:
                self.logDebug('...retrying (' + str(attempts)
                              + ' more attempts)...')
                sleep(self.SLEEP_TIME)
        self.logError('...couldn\'t connect.')
        raise ExitException()

    def download(self, downloads, baseDownloadPath):
        """Download every file in `downloads` ({server: [file dicts]})
        into `baseDownloadPath`, one connection per server."""
        make_directory_if_not_exists(baseDownloadPath)
        for server, files in downloads.items():
            with self.connect(server):
                for file in files:
                    self.downloadFile(file['url'],
                                      baseDownloadPath / file['saveAs'])

    def downloadFile(self, source, destination, attempts=ATTEMPTS,
                     force=False):
        """Download `source` to the path `destination`.

        Uses If-Modified-Since (unless `force`), optional OPTIONS/HEAD
        preflights, follows same-origin redirects, and retries on HTTP
        errors up to `attempts` times.
        """
        self.logInfo('Downloading ' + source + '...')
        if isinstance(source, str):
            self.url = urlsplit(source)
        self.logDebug(self.url)
        doHead = self.isHeadAllowed
        doOptions = self.isOptionsAllowed
        headers = dict(**self.DEFAULT_HEADERS, **self.DEFAULT_ACCEPT_HEADERS)
        resource = urlunsplit(('', '', self.url.path, self.url.query, ''))
        if not force:
            last_modified = get_last_modified_time(destination)
            if last_modified:
                headers['If-Modified-Since'] = last_modified
            # TODO: use ETag
            # etag = etag(destination)
            # if etag:
            #     headers['If-None-Match'] = etag
        while attempts:
            if doOptions:
                with self.requestOptions(resource,
                                         headers=headers) as response:
                    pass
            if doHead:
                with self.requestHead(resource, headers=headers) as response:
                    # TODO: if 'Last-Modified' >
                    if not force and response.status == 304:
                        self.logInfo('...remote file not modified;'
                                     ' skipping.')
                        return
                    # NOTE(review): getheader() returns a str while
                    # get_file_size() presumably returns an int, so this
                    # equality may never hold -- confirm and normalize.
                    if response.getheader('Content-Length') \
                            == get_file_size(destination):
                        self.logInfo('...file already exists; skipping.')
                        return
            try:
                with self.request(resource, headers=headers) as response:
                    if response.status in REDIRECT_STATUS_CODES:
                        raise HttpRedirect(response.status,
                                           response.getheader('Location'))
                    elif not force and response.status == 304:
                        self.logInfo('...remote file not modified;'
                                     ' skipping.')
                        return
                    elif response.status == 200:
                        self.logInfo('...saving as ' + str(destination)
                                     + '...')
                        with destination.open('w+b') as destinationFile:
                            destinationFile.write(response.read())
                            # for chunk in response.iter_content(ONE_MEGABYTE):
                            #     if chunk:
                            #         destinationFile.write(chunk)
                            #         destinationFile.flush()
                        # TODO: store ETag
                        # if response.getheader('ETag'):
                        #     save_etag(response.headers['ETag'])
            except HTTPException as error:
                self.logError(error.args[0])
                self.logError('...HTTP error...')
            except HttpRedirect as redirect:
                # Only same-origin redirects are followed in-place; anything
                # cross-origin propagates to connect() for a new connection.
                if redirect.url.scheme != self.url.scheme \
                        or redirect.url.netloc != self.url.netloc:
                    raise redirect
                self.url = redirect.url
                resource = \
                    urlunsplit(('', '', self.url.path, self.url.query, ''))
                doHead = False
                continue
            else:
                self.logInfo('...' + source + ' downloaded.')
                return
            attempts -= 1
            if attempts:
                self.logDebug('...retrying (' + str(attempts)
                              + ' more attempts)...')
                sleep(self.SLEEP_TIME)
        self.logWarning('...couldn\'t download ' + source + '.')

    @staticmethod
    def formatHeader(header, argument):
        """Format an HTTP header."""
        return format_http_header(header, argument)

    @contextmanager
    def request(self, resource, method=GET, headers=None, messageBody=None):
        """Issue one HTTP request on the open connection and yield the
        response; cookies are attached outbound and harvested inbound."""
        if headers is None:
            headers = {}
            skipAcceptEncoding = skipHost = False
        else:
            # Caller-supplied headers win over the ones putrequest()
            # would generate automatically.
            skipAcceptEncoding = 'Accept-Encoding' in headers
            skipHost = 'Host' in headers
        request = Request(urlunsplit(self.url), headers=headers,
                          method=method)
        self.cookieJar.add_cookie_header(request)
        headers = sorted(request.header_items())
        self.logInfo('...starting request...')
        self.logDebug(method + ' ' + resource + ' HTTP/1.1')
        self.connection.putrequest(method, resource, skip_host=skipHost,
                                   skip_accept_encoding=skipAcceptEncoding)
        if headers:
            self.logDebug('...sending headers...')
            for header, argument in headers:
                self.logDebug(self.formatHeader(header, argument))
                self.connection.putheader(header, argument)
        if messageBody:
            self.logDebug('...sending message body...')
            # self.logDebug(messageBody)
            self.connection.endheaders(messageBody)
        else:
            self.connection.endheaders()
        self.logDebug('...getting response...')
        with self.connection.getresponse() as response:
            self.logInfo('...response received...')
            self.logDebug(str(response.status) + ' ' + response.reason)
            for header, argument in response.getheaders():
                self.logDebug(self.formatHeader(header, argument))
            # self.logDebug(response.read())
            self.cookieJar.extract_cookies(response, request)
            yield response

    def requestHead(self, resource, headers=None):
        """Issue a HEAD request for `resource` (context manager)."""
        if headers is None:
            headers = self.DEFAULT_HEAD_HEADERS
        return self.request(resource, HEAD, headers=headers)

    def requestOptions(self, resource, headers=None):
        """Issue an OPTIONS request for `resource` (context manager)."""
        if headers is None:
            headers = self.DEFAULT_OPTIONS_HEADERS
        return self.request(resource, OPTIONS, headers=headers)

    def requestServerOptions(self, headers=None):
        """Probe the server with a server-wide OPTIONS request and record
        which methods (HEAD/OPTIONS) it allows; raises HttpRedirect when
        the server redirects the probe."""
        if headers is None:
            headers = self.DEFAULT_HEADERS
        with self.request(SERVER_WIDE_REQUEST_TARGET, OPTIONS,
                          headers=headers) as response:
            if response.status == 200:
                if HEAD in response.getheader('Allow'):
                    self.isHeadAllowed = True
            elif response.status in REDIRECT_STATUS_CODES:
                location = response.getheader('Location')
                # FIX: this previously read 'self.server', an attribute that
                # is never assigned anywhere in the class (connect() stores
                # the parsed URL in self.url), so this branch raised
                # AttributeError.  Use the netloc of the current URL.
                if location.split('://')[1] \
                        == self.url.netloc + SERVER_WIDE_REQUEST_TARGET:
                    # Redirect merely to our own request target: drop the
                    # trailing target character.
                    location = location[:-1]
                raise HttpRedirect(response.status, location)
            elif response.status >= 400:
                # Server rejected the server-wide probe; fall back to
                # probing the root resource with OPTIONS, then HEAD.
                with self.requestOptions(SERVER_ROOT) as rootOptionsResponse:
                    if rootOptionsResponse.status == 200:
                        self.isOptionsAllowed = True
                    elif rootOptionsResponse.status >= 400:
                        self.isOptionsAllowed = False
                if not self.isOptionsAllowed:
                    with self.requestHead(SERVER_ROOT) as rootHeadResponse:
                        if rootHeadResponse.status == 200:
                            self.isHeadAllowed = True
                        elif rootHeadResponse.status >= 404:
                            self.isHeadAllowed = False