def host_path_to_url(host_path):
    """Format a host (absolute) path as an ``http://host.docker.internal`` URL.

    The path is passed through ``resolve_path`` and expressed relative to
    ``ROOT_DIR``; that relative path becomes the path component of a URL
    whose authority is ``host.docker.internal:FILE_HOST_PORT``.
    """
    relative = os.path.relpath(resolve_path(host_path), ROOT_DIR)
    authority = f"host.docker.internal:{FILE_HOST_PORT}"
    components = ("http", authority, relative, None, None, None)
    return urlunparse(components)
def parse_request_uri(self):
    """Assemble the complete request URI for the current operation.

    The endpoint is ``<protocol>://[<zone>.]<host>:<port>`` taken from
    ``self.config`` (the zone prefix is used only when the request
    properties carry a non-empty ``"zone"``). Every ``<name>`` placeholder
    in the endpoint and in ``self.operation["URI"]`` is replaced by the
    matching request property. When request params exist they are appended
    as ``key=value`` pairs, merged with any query already present in the
    URI and sorted.

    Returns the URI as a string.
    """
    props = self.parse_request_properties()

    # Build the endpoint, prefixing the host with the zone when one is set.
    zone = props.get("zone", "")
    port = str(self.config.port)
    host = self.config.host
    if zone != "":
        host = zone + "." + host
    endpoint = self.config.protocol + "://" + host + ":" + port

    # Substitute "<name>" placeholders in both the endpoint and the URI.
    uri = self.operation["URI"]
    for name, value in props.items():
        placeholder = "<%s>" % name
        endpoint = endpoint.replace(placeholder, value)
        uri = uri.replace(placeholder, value)
    full_uri = endpoint + uri

    extra = self.parse_request_params()
    if not extra:
        return full_uri

    # allow_fragments=False keeps any "#..." text inside the query.
    scheme, netloc, path, params, req_query, fragment = urlparse(
        full_uri, allow_fragments=False)
    pairs = ["%s=%s" % (key, val) for key, val in extra.items()]
    if req_query:
        pairs.append(req_query)
    base = urlunparse((scheme, netloc, path, params, "", fragment))
    return base + "?" + "&".join(sorted(pairs))
def get_url(args_dict):
    """
    Build the api url by placing the given arguments in the query of ROOT_URL.

    :param args_dict: A dictionary with the query of the url, format is
        key: value, which translates into 'http...?key=value'
    :return: ROOT_URL with its query component replaced by the url-encoded
        content of args_dict
    """
    scheme, netloc, path, params, _old_query, fragment = urlparse(ROOT_URL)
    new_query = urlencode(args_dict)
    return urlunparse((scheme, netloc, path, params, new_query, fragment))
def sign(self):
    """Set the Authorization header and return the prepared request.

    The header value is ``"QS <access_key_id>:<authorization>"`` and is also
    written to the debug log. After the request is prepared, the path of its
    URL is unquoted and quoted again so it is percent-encoded exactly once;
    all other URL components are left as parsed.
    """
    key_id = self.access_key_id
    authorization = self.get_authorization()
    self.req.headers["Authorization"] = "QS " + key_id + ":" + authorization
    self.logger.debug(self.req.headers["Authorization"])

    prepared = self.req.prepare()
    # allow_fragments=False keeps any "#..." text inside the query.
    parts = urlparse(prepared.url, allow_fragments=False)
    normalized_path = quote(unquote(parts.path))
    prepared.url = urlunparse(parts._replace(path=normalized_path))
    return prepared
def sign_query(self, expires):
    """Return the prepared request with the signature carried in the query.

    The ``Content-Type`` header is removed from the request before it is
    prepared. The path of the prepared URL is unquoted and quoted again, and
    ``signature``, ``access_key_id`` and ``expires`` are appended to the
    query string (after any query the URL already had).

    :param expires: expiry value; passed to ``get_query_signature`` and
        written into the ``expires`` query parameter
    :return: the prepared request with its ``url`` rewritten
    """
    # pop() rather than del: a request that has no Content-Type header
    # (e.g. one that was already signed once) must not raise KeyError.
    self.req.headers.pop("Content-Type", None)

    prepared = self.req.prepare()
    # allow_fragments=False keeps any "#..." text inside the query.
    scheme, netloc, path, params, req_query, fragment = urlparse(
        prepared.url, allow_fragments=False
    )
    path = quote(unquote(path))

    query = [req_query] if req_query else []
    query.extend([
        "signature=%s" % self.get_query_signature(expires),
        "access_key_id=%s" % self.access_key_id,
        "expires=%s" % str(expires),
    ])
    base = urlunparse((scheme, netloc, path, params, "", fragment))
    prepared.url = base + "?" + "&".join(query)
    return prepared
def fixurl(url):
    """Return *url* as an ASCII-only URL (Python 3).

    Each component is encoded separately: user name and password are
    percent-encoded, the host is IDNA-encoded, and path segments, params,
    query and fragment are unquoted and then percent-encoded again, so
    escapes that are already present are not encoded twice.

    :param url: the URL as ``str``, or as UTF-8 encoded ``bytes``
    :return: the encoded URL as ``str``
    """
    # Inspired from https://stackoverflow.com/a/804380
    # The standard library provides everything needed here, so the function
    # does not depend on third-party re-exports of these names.
    from urllib.parse import quote, unquote, urlparse, urlunparse

    # turn byte strings into text
    if isinstance(url, bytes):
        url = url.decode('utf8')

    # parse it
    parsed = urlparse(url)

    # divide the netloc further
    # NOTE(review): splitting host:port on the first ':' does not handle
    # bracketed IPv6 literals such as "[::1]:80".
    userpass, at, hostport = parsed.netloc.rpartition('@')
    user, colon1, pass_ = userpass.partition(':')
    host, colon2, port = hostport.partition(':')

    # encode each component
    # NOTE(review): user and password are quoted without unquoting first, so
    # credentials that are already percent-encoded get encoded again.
    user = quote(user)
    pass_ = quote(pass_)
    # str.encode('idna') yields ASCII bytes; decode so netloc stays text.
    host = host.encode('idna').decode('ascii')
    path = '/'.join(
        # safe='' so that slashes encoded inside a segment stay encoded
        quote(unquote(pce), '')
        for pce in parsed.path.split('/'))
    # Keep the ";params" part of the URL instead of silently dropping it.
    params = quote(unquote(parsed.params), '=;,')
    query = quote(unquote(parsed.query), '=&?/')
    fragment = quote(unquote(parsed.fragment))

    # put it back together
    netloc = ''.join((user, colon1, pass_, at, host, colon2, port))
    return urlunparse((parsed.scheme, netloc, path, params, query, fragment))
def download_url(self, url):
    """Download *url*, archive the exchange as WARC records, return the text.

    The path of the URL is percent-encoded first. URLs listed in
    ``self.bad_urls`` are skipped. On success one ``request`` and one
    ``response`` WARC record are created and written through
    ``self._writer``; the response record also stores the peer address and
    the encoding used for decoding.

    :param url: the URL to fetch
    :return: the decoded response body, or None when the URL is known to be
        bad, the request fails, the status is not 200, reading the body
        fails, or the body is empty
    """
    import re  # local import: only needed for the stray-'%' check below

    ignored_msg = ('RequestException happened during downloading: {0} \n\n'
                   ' The program ignores it and jumps to the next one.')

    scheme, netloc, path, params, query, fragment = urlparse(url)
    # For safety urlencode the generated URL, but leave %XX escapes that are
    # already present untouched: quoting '%' unconditionally would turn
    # "%20" into "%2520". A '%' that does not start an escape is encoded.
    path = quote(path, safe='/%')
    path = re.sub(r'%(?![0-9A-Fa-f]{2})', '%25', path)
    url = urlunparse((scheme, netloc, path, params, query, fragment))

    if url in self.bad_urls:
        self._logger_.log('INFO', 'Not downloading known bad URL: {0}'.format(url))
        return None

    try:  # The actual request
        resp = self._requests_get(url, headers=self._req_headers, stream=True)
    except RequestException as err:
        self._handle_request_exception(url, ignored_msg.format(err))
        return None

    if resp.status_code != 200:  # Not HTTP 200 OK
        message = 'Downloading failed with status code: {0} {1}'.format(resp.status_code, resp.reason)
        # The body of a streamed response is never read on this path, so
        # close the response explicitly to release its connection.
        resp.close()
        self._handle_request_exception(url, message)
        return None

    # REQUEST
    reqv_headers = resp.request.headers
    reqv_headers['Host'] = netloc

    proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
    reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(urlunparse(('', '', path, params, query, fragment)),
                                                              proto),
                                         reqv_headers.items(), is_http_request=True)
    reqv_record = self._writer.create_warc_record(url, 'request', http_headers=reqv_http_headers)

    # RESPONSE
    resp_status = '{0} {1}'.format(resp.status_code, resp.reason)
    resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3

    # Must get peer_name before the content is read
    # It has no official API for that:
    # https://github.com/kennethreitz/requests/issues/2158
    # https://github.com/urllib3/urllib3/issues/1071
    # So workaround to be compatible with windows:
    # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
    # requests-library/22513161#22513161
    try:
        peer_name = resp.raw._connection.sock.getpeername()[0]  # Must get peer_name before the content is read
    except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
        try:
            peer_name = resp.raw._connection.sock.socket.getpeername()[0]
        except AttributeError:
            peer_name = 'None'  # Socket closed and could not determine peername...

    try:
        data = resp.raw.read()  # To be able to return decoded and also write warc
    except ProtocolError as err:
        self._handle_request_exception(url, ignored_msg.format(err))
        return None

    if len(data) == 0:
        err = 'Response data has zero length!'
        self._handle_request_exception(url, ignored_msg.format(err))
        return None

    enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
    if enc is None:
        # The detector may report no encoding at all (None); decoding with
        # None would raise TypeError, so fall back to UTF-8 in that case.
        enc = detect(data)['encoding'] or 'utf-8'
    try:
        text = data.decode(enc)  # Normal decode process
    except UnicodeDecodeError:
        self._logger_.log('WARNING', '\t'.join(('DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc)))
        text = data.decode(enc, 'ignore')
    data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

    resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
    # Add extra headers like encoding because it is not stored any other way...
    resp_record = self._writer.create_warc_record(url, 'response', payload=data_stream,
                                                  http_headers=resp_http_headers,
                                                  warc_headers_dict={'WARC-IP-Address': peer_name,
                                                                     'WARC-X-Detected-Encoding': enc})
    # Everything is OK, write the two WARC records
    self._writer.write_record(reqv_record)
    self._writer.write_record(resp_record)

    return text