Exemple #1
0
def host_path_to_url(host_path):
    """Build the HTTP URL under which a host file is served to containers.

    The path is first passed through ``resolve_path`` and then made relative
    to ``ROOT_DIR``; that relative path becomes the URL path on
    ``host.docker.internal`` at ``FILE_HOST_PORT``.

    :param host_path: path to a file on the host
    :return: the resulting ``http://`` URL as a string
    """
    relative = os.path.relpath(resolve_path(host_path), ROOT_DIR)
    netloc = f"host.docker.internal:{FILE_HOST_PORT}"
    return urlunparse(("http", netloc, relative, None, None, None))
Exemple #2
0
 def parse_request_uri(self):
     """Assemble the full request URI from the config, operation and request.

     The endpoint is ``protocol://host:port``; when the request properties
     carry a non-empty ``zone`` it is prepended to the host as a subdomain.
     Every ``<name>`` placeholder in the endpoint and in the operation's URI
     template is replaced by the request property of the same name. If there
     are request params, they are merged with any query already present in
     the URI and emitted as a sorted, ``&``-joined query string.

     :return: the resolved URI string
     """
     props = self.parse_request_properties()
     zone = props.get("zone", "")
     port = str(self.config.port)

     # A zone, when given, becomes a subdomain of the configured host.
     host = self.config.host
     if zone != "":
         host = "".join([zone, ".", host])
     endpoint = "".join([self.config.protocol, "://", host, ":", port])

     request_uri = self.operation["URI"]
     for name, value in props.items():
         placeholder = "<%s>" % name
         endpoint = endpoint.replace(placeholder, value)
         request_uri = request_uri.replace(placeholder, value)

     full_uri = endpoint + request_uri
     extra_params = self.parse_request_params()
     if len(extra_params) == 0:
         return full_uri

     scheme, netloc, path, params, existing_query, fragment = urlparse(
         full_uri, allow_fragments=False)
     pieces = ["%s=%s" % (k, v) for (k, v) in extra_params.items()]
     if existing_query:
         pieces.append(existing_query)
     # The query is appended by hand so its content is emitted verbatim.
     base = urlunparse((scheme, netloc, path, params, "", fragment))
     return base + "?" + "&".join(sorted(pieces))
Exemple #3
0
def get_url(args_dict):
    """Return ``ROOT_URL`` with its query string replaced by ``args_dict``.

    :param args_dict: mapping of query keys to values; it is URL-encoded, so
        ``{"key": "value"}`` yields ``...?key=value``
    :return: the complete URL as a string
    """
    base = urlparse(ROOT_URL)
    return urlunparse(base._replace(query=urlencode(args_dict)))
 def sign(self):
     """Attach the ``Authorization`` header and return the prepared request.

     The header value is ``QS <access_key_id>:<authorization>``. After the
     request is prepared, the path of its URL is unquoted and quoted again
     so that it ends up in one normalized percent-encoded form.

     :return: the prepared request with the normalized URL
     """
     credential = "".join(
         ["QS ", self.access_key_id, ":", self.get_authorization()])
     self.req.headers["Authorization"] = credential
     self.logger.debug(self.req.headers["Authorization"])

     prepared = self.req.prepare()
     parts = urlparse(prepared.url, allow_fragments=False)
     normalized = parts._replace(path=quote(unquote(parts.path)))
     prepared.url = urlunparse(normalized)
     return prepared
 def sign_query(self, expires):
     """Return the prepared request with the signature carried in its query.

     The ``Content-Type`` header is removed before the request is prepared.
     The URL path is unquoted and quoted again, and ``signature``,
     ``access_key_id`` and ``expires`` are appended after any query string
     the URL already had.

     :param expires: expiry value; passed to ``get_query_signature`` and
         written into the ``expires`` query parameter
     :return: the prepared request with the signed URL
     """
     del self.req.headers["Content-Type"]
     prepared = self.req.prepare()
     parts = urlparse(prepared.url, allow_fragments=False)
     encoded_path = quote(unquote(parts.path))

     auth_pairs = [
         "signature=%s" % self.get_query_signature(expires),
         "access_key_id=%s" % self.access_key_id,
         "expires=%s" % str(expires),
     ]
     # Keep the original query (if any) in front of the signing parameters.
     query_items = ([parts.query] if parts.query else []) + auth_pairs

     unsigned = urlunparse((parts.scheme, parts.netloc, encoded_path,
                            parts.params, "", parts.fragment))
     prepared.url = unsigned + "?" + "&".join(query_items)
     return prepared
def fixurl(url):
    """Return ``url`` re-assembled with every component re-encoded.

    The URL is parsed, the netloc is split into user, password, host and
    port, and each piece is encoded on its own: user, password, path
    segments, query and fragment are percent-quoted from their UTF-8 bytes,
    the host is IDNA-encoded. The ``params`` component of the parsed URL is
    not carried over; the result always has empty params.

    NOTE(review): the ``unicode`` builtin and ``.decode`` on a plain string
    exist only on Python 2 -- confirm this function is not expected to run
    on Python 3.

    :param url: the URL, either a ``unicode`` object or a UTF-8 byte string
    :return: the re-encoded URL produced by ``urlunparse``
    """
    # Inspired from https://stackoverflow.com/a/804380 but using requests
    from requests.utils import urlparse, urlunparse, quote, unquote

    # turn string into unicode
    if not isinstance(url, unicode):
        url = url.decode('utf8')

    # parse it
    parsed = urlparse(url)

    # divide the netloc further: "user:pass@host:port"
    # rpartition keeps any '@' inside the credentials on the userpass side
    userpass, at, hostport = parsed.netloc.rpartition('@')
    user, colon1, pass_ = userpass.partition(':')
    # NOTE(review): splitting on the first ':' presumably mishandles
    # bracketed IPv6 hosts such as "[::1]:80" -- confirm whether they occur
    host, colon2, port = hostport.partition(':')

    # encode each component
    scheme = parsed.scheme.encode('utf8')
    user = quote(user.encode('utf8'))
    colon1 = colon1.encode('utf8')
    pass_ = quote(pass_.encode('utf8'))
    at = at.encode('utf8')
    host = host.encode('idna')
    colon2 = colon2.encode('utf8')
    port = port.encode('utf8')
    # Each path segment is unquoted and re-quoted with no safe characters,
    # so a slash that was percent-encoded inside a segment stays encoded.
    path = '/'.join(  # could be encoded slashes!
        quote(unquote(pce).encode('utf8'), '')
        for pce in parsed.path.split('/'))
    # '=', '&', '?' and '/' are left unescaped in the query
    query = quote(unquote(parsed.query).encode('utf8'), '=&?/')
    fragment = quote(unquote(parsed.fragment).encode('utf8'))

    # put it back together
    netloc = ''.join((user, colon1, pass_, at, host, colon2, port))
    # urlunparse expects (scheme, netloc, path, params, query, fragment);
    # params is deliberately emitted empty
    params = ''
    return urlunparse((scheme, netloc, path, params, query, fragment))
Exemple #7
0
    def download_url(self, url):
        """Download ``url``, archive the exchange as WARC records, return the text.

        The path of the URL is percent-encoded first, so the URL that is
        requested, logged and archived may differ from the one passed in.
        On success one ``request`` and one ``response`` WARC record are
        written through ``self._writer``.

        :param url: the absolute URL to fetch
        :return: the response body decoded to ``str``, or ``None`` when the
            URL is in ``self.bad_urls``, the request raises, the status code
            is not 200, reading the body fails, or the body is empty
        """
        # One template for every failure that is reported and then skipped.
        skip_msg = ('RequestException happened during downloading: {0} \n\n'
                    ' The program ignores it and jumps to the next one.')

        scheme, netloc, path, params, query, fragment = urlparse(url)
        # For safety urlencode the generated URL. '%' is kept safe so that
        # escapes already present in the path are not encoded a second time
        # (a plain quote() would turn '%20' into '%2520').
        path = quote(path, safe='/%')
        url = urlunparse((scheme, netloc, path, params, query, fragment))

        if url in self.bad_urls:
            self._logger_.log('INFO', 'Not downloading known bad URL: {0}'.format(url))
            return None

        try:  # The actual request
            resp = self._requests_get(url, headers=self._req_headers, stream=True)
        except RequestException as err:
            self._handle_request_exception(url, skip_msg.format(err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            self._handle_request_exception(url, 'Downloading failed with status code: {0} {1}'.format(resp.status_code,
                                                                                                      resp.reason))
            return None

        # REQUEST
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(urlunparse(('', '', path, params, query, fragment)),
                                                                  proto), reqv_headers.items(), is_http_request=True)
        reqv_record = self._writer.create_warc_record(url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason)
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[0]
        except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not determine peername...

        try:
            data = resp.raw.read()  # To be able to return decoded and also write warc
        except ProtocolError as err:
            self._handle_request_exception(url, skip_msg.format(err))
            return None

        if len(data) == 0:
            err = 'Response data has zero length!'
            self._handle_request_exception(url, skip_msg.format(err))
            return None

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            # Detection may yield no encoding at all; fall back to UTF-8
            # instead of passing None to decode(), which raises TypeError.
            enc = detect(data)['encoding'] or 'utf-8'
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger_.log('WARNING', '\t'.join(('DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc)))
            text = data.decode(enc, 'ignore')
        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(url, 'response', payload=data_stream,
                                                      http_headers=resp_http_headers,
                                                      warc_headers_dict={'WARC-IP-Address': peer_name,
                                                                         'WARC-X-Detected-Encoding': enc})
        # Everything is OK, write the two WARC records
        self._writer.write_record(reqv_record)
        self._writer.write_record(resp_record)

        return text