Example #1
 def startElementNS(self, name, qname, attrs):
     stack = self.stack
     stack.append(ElementHandler())
     current = self.current
     parent = self.parent
     base = attrs.get(BASE, None)
     if base is not None:
         base, frag = urldefrag(base)
         if parent and parent.base:
             base = urljoin(parent.base, base)
         else:
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base = urljoin(systemId, base)
     else:
         if parent:
             base = parent.base
         if base is None:
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base, frag = urldefrag(systemId)
     current.base = base
     language = attrs.get(LANG, None)
     if language is None:
         if parent:
             language = parent.language
     current.language = language
     current.start(name, qname, attrs)
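For readers unfamiliar with the two helpers used above, here is a minimal standalone sketch of the xml:base resolution this handler performs; the attribute value and the parent base are invented for illustration:

    from urllib.parse import urldefrag, urljoin

    parent_base = "http://example.org/data/catalog.rdf"   # hypothetical parent.base
    xml_base_attr = "subdir/item.rdf#top"                  # hypothetical attrs.get(BASE)

    # Drop the fragment from the declared base, then resolve it against the
    # parent's base, as the `base, frag = urldefrag(base)` branch does above.
    base, _frag = urldefrag(xml_base_attr)
    print(urljoin(parent_base, base))   # http://example.org/data/subdir/item.rdf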
Example #2
 def startElementNS(self, name, qname, attrs):
     stack = self.stack
     stack.append(ElementHandler())
     current = self.current
     parent = self.parent
     base = attrs.get(BASE, None)
     if base is not None:
         base, frag = urldefrag(base)
         if parent and parent.base:
             base = urljoin(parent.base, base)
         else:
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base = urljoin(systemId, base)
     else:
         if parent:
             base = parent.base
         if base is None:
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base, frag = urldefrag(systemId)
     current.base = base
     language = attrs.get(LANG, None)
     if language is None:
         if parent:
             language = parent.language
     current.language = language
     current.start(name, qname, attrs)
Example #3
    def check_traverse_and_set_context(self, key, node):
        """This method checks if we need to resolve a $ref.

        The decision is based on the node, e.g. whether it is a remote reference
        (starting with http) or a local one.

        As both local and remote references can be relative to the given file, a
        self.context attribute is used to distinguish whether the $ref lives in the
        original file or in an external source.

        :param key:
        :param node:
        :return: True if I have to resolve the node.
        """
        if key != "$ref":
            return False, None

        if node.startswith("#/"):  # local reference
            try:
                is_local_ref = finddict(self.openapi, fragment_to_keys(node))
            except KeyError:
                is_local_ref = False

            # Don't resolve local references already in the spec.
            if is_local_ref:
                return False, None
            # Resolve local references in external files.
            if self.context:
                return True, None

            return False, None

        if node.startswith("http"):  # url reference
            host, fragment = urldefrag(node)
            return True, host

        if node.startswith("file://"):
            raise NotImplementedError

        host, fragment = urldefrag(node)
        if self.context:
            if self.context.startswith("http"):
                p = urljoin(self.context, host)
                # log.info(f"trying to set context {p}. Was {self.context}. host is: {host}.")
                return True, p

            p = Path(self.context).parent.joinpath(host)
            # log.info(f"trying to set context {p}. Was {self.context}. host is: {host}. resolved is {p.resolve()}")
            if p.is_file():
                return True, str(p.resolve())
            else:
                log.warning("can't set context %r. Retains %r", p,
                            self.context)

        # Remote reference should use previous
        #  context. Better should be to track
        #  nodes with their context.
        return True, None
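As a side note, the classification above ultimately rests on how urldefrag splits a $ref into a host part and a fragment; a quick sketch with made-up reference strings:

    from urllib.parse import urldefrag

    for ref in ("#/components/schemas/Pet",              # local reference
                "definitions.yaml#/Pet",                  # relative file reference
                "https://example.com/api.yaml#/Pet"):     # remote reference
        host, fragment = urldefrag(ref)
        print(repr(host), repr(fragment))
    # '' '/components/schemas/Pet'
    # 'definitions.yaml' '/Pet'
    # 'https://example.com/api.yaml' '/Pet'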
Example #4
    def url(self, name, force=False):
            """
            Returns the real URL in DEBUG mode.
            """
            if settings.DEBUG and not force:
                hashed_name, fragment = name, ''
            else:
                clean_name, fragment = urldefrag(name)
                if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                    hashed_name = name
                else:
                    hashed_name = self.stored_name(clean_name)

            final_url = super(HashedFilesMixin, self).url(hashed_name)

            # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
            # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
            query_fragment = '?#' in name  # [sic!]
            if fragment or query_fragment:
                urlparts = list(urlsplit(final_url))
                if fragment and not urlparts[4]:
                    urlparts[4] = fragment
                if query_fragment and not urlparts[3]:
                    urlparts[2] += '?'
                final_url = urlunsplit(urlparts)

            return unquote(final_url)
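The '?#' special case in this Django-style url() exists because rebuilding the URL from its defragmented pieces would silently drop the bare '?' that the @font-face hack relies on. A small sketch with an invented file name:

    from urllib.parse import urldefrag, urlsplit

    name = "fonts/myfont.eot?#iefix"
    clean_name, fragment = urldefrag(name)
    print(clean_name, "|", fragment)          # fonts/myfont.eot? | iefix
    print(urlsplit(clean_name).query == "")   # True -- the '?' carries no query
    print('?#' in name)                       # True, so url() re-appends the '?'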
Example #5
    def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        bodyproducer = _RequestBodyProducer(
            request.body) if request.body else None

        start_time = time()
        d = agent.request(method, to_bytes(url, encoding='ascii'), headers,
                          bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d
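Stripping the fragment before issuing the request is deliberate: fragments are client-side only and are never sent over the wire. A minimal sketch of the URL preparation, without the Twisted machinery (the URL is made up):

    from urllib.parse import urldefrag

    request_url = "https://example.com/items?page=2#reviews"
    url = urldefrag(request_url)[0]
    print(url)                  # https://example.com/items?page=2
    print(url.encode("ascii"))  # roughly what to_bytes(url, encoding='ascii') yields here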
Example #6
    def __init__(self, request, timeout=180):
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own logic to handle redirects, there is no
        # need to add the _waitForDisconnect callback.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == 'POST':
            self.headers['Content-Length'] = 0
Example #7
def get_disk_name(ovf):
    """Get the disk format and file name from a OVF descriptor."""
    root = etree.fromstring(ovf)
    ovf_ns = root.nsmap['ovf']

    id_attr = '{%s}id' % ovf_ns
    href_attr = '{%s}href' % ovf_ns
    files = {f.get(id_attr): f.get(href_attr) for f in
             root.findall('ovf:References/ovf:File', root.nsmap)}

    # we do not care about more than one disk
    disk = root.find('ovf:DiskSection/ovf:Disk', root.nsmap)
    if disk is not None:
        format_attr = '{%s}format' % ovf_ns
        fileref_attr = '{%s}fileRef' % ovf_ns
        ovf_format = disk.get(format_attr)
        if not ovf_format:
            raise Exception("Expecting some format!")
        (format_url, _) = parse.urldefrag(ovf_format)
        try:
            disk_format = SPECS[format_url]
        except KeyError:
            raise Exception("Unknown format!")
        try:
            disk_file = files[disk.get(fileref_attr)]
        except KeyError:
            raise Exception("Unknown disk!")
        return (disk_format, disk_file)
    return None, None
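In an OVF descriptor the disk format is a specification URL whose fragment names a sub-format, so urldefrag yields exactly the key needed for a SPECS-style lookup table. A hedged sketch with an invented mapping:

    from urllib.parse import urldefrag

    SPECS = {   # hypothetical stand-in for the SPECS table used above
        "http://www.vmware.com/interfaces/specifications/vmdk.html": "vmdk",
    }

    ovf_format = "http://www.vmware.com/interfaces/specifications/vmdk.html#streamOptimized"
    format_url, subformat = urldefrag(ovf_format)
    print(SPECS[format_url], subformat)   # vmdk streamOptimized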
Example #8
 def include_root_definition(self):
     self.known_mappings['definitions'].update({
         urlparse(v._json_reference): self.descend(value=v._model_spec)
         for v in itervalues(self.swagger_spec.definitions)
         # urldefrag(url)[0] returns the url without the fragment, it is guaranteed to be present
         if urldefrag(v._json_reference)[0] == self.swagger_spec.origin_url
     })
Example #9
    def __init__(self, request, timeout=180):
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply to Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own logic to handle redirects, there is no
        # need to add the _waitForDisconnect callback.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == b'POST':
            self.headers['Content-Length'] = 0
Example #10
def escape_ajax(url):
    """
    Return the crawleable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])
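The '!' test above works because, in Python 3, urldefrag returns the URL and the raw fragment separately; a quick standalone check:

    from urllib.parse import urldefrag

    print(urldefrag("www.example.com/ajax.html#!key=value"))
    # DefragResult(url='www.example.com/ajax.html', fragment='!key=value')

    print(urldefrag("www.example.com/ajax.html#key=value").fragment.startswith('!'))
    # False -- not AJAX crawlable, so escape_ajax returns the URL unchanged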
Example #11
def escape_ajax(url):
    """
    Return the crawleable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith("!"):
        return url
    return add_or_replace_parameter(defrag, "_escaped_fragment_", frag[1:])
Example #12
def url_query_cleaner(url,
                      parameterlist=(),
                      sep='&',
                      kvsep='=',
                      remove=False,
                      unique=True,
                      keep_fragments=False):
    """Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """

    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    url = '?'.join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments:
        url += '#' + fragment
    return url
Example #13
    def traverse(self, node, key=ROOT_NODE, parents=None, cb=print, context=None):
        """ Recursively call nested elements."""

        # Trim parents breadcrumb as 4 will suffice.
        parents = parents[-4:] if parents else []

        # Unwind items as a dict or an enumerated list
        # to simplify traversal.
        if isinstance(node, (dict, list)):
            valuelist = node.items() if isinstance(node, dict) else enumerate(node)
            if key is not ROOT_NODE:
                parents.append(key)
            parents.append(node)
            for k, i in valuelist:
                self.traverse(i, k, parents, cb, context)
            return

        # Resolve HTTP references adding fragments
        # to 'schema', 'headers' or 'parameters'
        do_traverse, new_context = self.check_traverse_and_set_context(key, node)
        # If the context changes, update the global pointer too.
        # TODO: we would eventually get rid of self.context completely.
        if new_context:
            self.context = new_context
            context = new_context
        # log.info(f"test node context {key}, {node}, {do_traverse}")
        log.debug("test node context %r, %r, %r", key, node, do_traverse)
        if do_traverse:
            ancestor, needle = parents[-3:-1]
            # log.info(f"replacing: {needle} in {ancestor} with ref {node}. Parents are {parents}")
            ancestor[needle] = cb(key, node, context)

            # Get the component where to store the given item.
            component_name = self.get_component_name(needle, parents)

            # Use a pre and post traversal functions.
            # - before: append the reference to yaml_components.
            # - traverse
            # - after: deepcopy the resulting item in the yaml_components
            #          then replace it with the reference in the specs
            if component_name:
                # log.info(f"needle {needle} in components_map.")
                host, fragment = urldefrag(node)
                fragment = basename(fragment.strip("/"))
                self.yaml_components[component_name][fragment] = ancestor[needle]

            if isinstance(ancestor[needle], (dict, list)):
                self.traverse(ancestor[needle], key, parents, cb, context)

            if component_name:
                # Now the node is fully resolved. I can replace it with the
                # Deepcopy
                self.yaml_components[component_name][fragment] = deepcopy(
                    ancestor[needle]
                )
                ancestor[needle] = {
                    "$ref": "#" + join("/components", component_name, fragment)
                }
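The component key stored in yaml_components is simply the last segment of the reference's fragment; a tiny sketch of that derivation with an invented $ref:

    from os.path import basename
    from urllib.parse import urldefrag

    node = "https://example.com/defs.yaml#/components/schemas/Pet"
    host, fragment = urldefrag(node)
    print(basename(fragment.strip("/")))   # Pet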
Example #14
 def getDisplayIdentifier(self):
     """Return the display_identifier if set, else return the claimed_id.
     """
     if self.display_identifier is not None:
         return self.display_identifier
     if self.claimed_id is None:
         return None
     else:
         return urldefrag(self.claimed_id)[0]
Example #15
 def absolutize(self, uri, defrag=1):
     base = urljoin("file:", pathname2url(os.getcwd()))
     result = urljoin("%s/" % base, uri, allow_fragments=not defrag)
     if defrag:
         result = urldefrag(result)[0]
     if not defrag:
         if uri and uri[-1] == "#" and result[-1] != "#":
             result = "%s#" % result
     return URIRef(result)
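A standalone sketch of what this rdflib-style absolutize amounts to: build a file: base from the current working directory, join the URI against it, and (by default) drop any fragment. The relative path is invented and the output depends on the working directory:

    import os
    from urllib.parse import urldefrag, urljoin
    from urllib.request import pathname2url

    base = urljoin("file:", pathname2url(os.getcwd()))
    result = urljoin("%s/" % base, "data/graph.ttl#node1", allow_fragments=False)
    print(urldefrag(result)[0])   # e.g. file:///home/user/data/graph.ttl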
Example #16
    def get_yaml_reference(self, f):
        # log.info(f"Downloading {f}")
        host, fragment = urldefrag(f)
        if host not in self.yaml_cache:
            self.yaml_cache[host] = open_file_or_url(host)

        f_yaml = yaml.safe_load(self.yaml_cache[host])
        if fragment.strip("/"):
            f_yaml = finddict(f_yaml, fragment_to_keys(fragment))
        return f_yaml
Example #17
def normalizeURL(url):
    """Normalize a URL, converting normalization failures to
    DiscoveryFailure"""
    try:
        normalized = urinorm.urinorm(url)
    except ValueError as why:
        raise DiscoveryFailure(
            'Normalizing identifier: %s' % six.text_type(why), None)
    else:
        return urldefrag(normalized)[0]
Example #18
def get_yaml_reference(f, yaml_cache=None):
    #log.info(f"Downloading {f}")
    host, fragment = urldefrag(f)
    if host not in yaml_cache:
        yaml_cache[host] = urlopen(host).read()

    f_yaml = yaml.load(yaml_cache[host])
    if fragment.strip("/"):
        f_yaml = finddict(f_yaml, fragment.strip("/").split("/"))
    return f_yaml
Example #19
def update_params(_url, _debug=False, **params):
    """Update the query parameters in a URL.

    ``_url`` is any URL, with or without a query string.

    ``**params`` are query parameters to add or replace. Each value may be a
    string, a list of strings, or None. Passing a list generates multiple
    values for the same parameter. Passing None deletes the corresponding
    parameter if present.

    Return the new URL.

    *Debug mode:* if ``_debug=True``, return a tuple:
    ``[0]`` is the URL without query string or fragment,
    ``[1]`` is the final query parameters as a dict, and
    ``[2]`` is the fragment part of the original URL or the empty string.

    Usage:

    >>> update_params("foo", new1="NEW1")
    'foo?new1=NEW1'
    >>> update_params("foo?p=1", p="2")
    'foo?p=2'
    >>> update_params("foo?p=1", p=None)
    'foo'
    >>> update_params("http://example.com/foo?new1=OLD1#myfrag", new1="NEW1")
    'http://example.com/foo?new1=NEW1#myfrag'
    >>> update_params("http://example.com/foo?new1=OLD1#myfrag", new1="NEW1", _debug=True)
    ('http://example.com/foo', {'new1': 'NEW1'}, 'myfrag')
    >>> update_params("http://www.mau.de?foo=2", brrr=3)
    'http://www.mau.de?foo=2&brrr=3'
    >>> update_params("http://www.mau.de?foo=A&foo=B", foo=["C", "D"])
    'http://www.mau.de?foo=C&foo=D'
    """

    url, fragment = urldefrag(_url)
    if "?" in url:
        url, qs = url.split("?", 1)
        query = parse_qs(qs)
    else:
        query = {}
    for key in params:
        value = params[key]
        if value is not None:
            query[key] = value
        elif key in query:
            del query[key]
    if _debug:
        return url, query, fragment
    qs = urlencode(query, True)
    if qs:
        qs = "?" + qs
    if fragment:
        fragment = "#" + fragment
    return "{0}{1}{2}".format(url, qs, fragment)
Example #20
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
    """Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """

    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        if not ksv:
            continue
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    url = '?'.join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments:
        url += '#' + fragment
    return url
Example #21
    def get_yaml_reference(self, f):
        # log.info(f"Downloading {f}")
        host, fragment = urldefrag(f)
        if host not in self.yaml_cache:
            self.yaml_cache[host] = urlopen(host).read()

        f_yaml = yaml.safe_load(self.yaml_cache[host])
        if fragment.strip("/"):
            f_yaml = finddict(
                f_yaml, fragment.strip("/").split("/"))
        return f_yaml
Example #22
    def resolve_node(self, key, node):
        # log.info(f"Resolving {node}")
        n = node
        if not node.startswith('http'):
            # Check if self.context already points to node
            host, fragment = urldefrag(n)

            if self.context and self.context.endswith(host):
                n = urljoin(self.context, "#" + fragment)
            else:
                n = urljoin(self.context, node)
        _yaml = self.get_yaml_reference(n)
        return _yaml
Example #23
def url_query_cleaner(url,
                      parameterlist=(),
                      sep='&',
                      kvsep='=',
                      remove=False,
                      unique=True):
    """Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    """

    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url = urldefrag(url)[0]
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    return '?'.join([base, sep.join(querylist)]) if querylist else base
Example #24
    def resolve_node(self, key, node, context):
        """This is the callback.
        """
        # log.info(f"Resolving {node}, {context}")
        n = node
        if not node.startswith("http"):
            # Check if self.context already points to node
            host, fragment = urldefrag(n)

            if context and context.endswith(host):
                n = urljoin(context, "#" + fragment)
            else:
                n = urljoin(context, node)
        _yaml = self.get_yaml_reference(n)
        return _yaml
Example #25
    def resolve_ref(self, obj, base_url):
        ref = obj.pop('import', None)

        url = urlparse.urljoin(base_url, ref)
        if url in self.resolved:
            return self.resolved[url]
        if url in self.resolving:
            raise RuntimeError('Circular reference for url %s' % url)
        self.resolving[url] = True
        doc_url, pointer = urlparse.urldefrag(url)
        document = self.fetch(doc_url)
        fragment = copy.deepcopy(resolve_pointer(document, pointer))
        try:
            result = self.resolve_all(fragment, doc_url)
        finally:
            del self.resolving[url]
        return result
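This is the usual JSON-reference pattern: urldefrag splits an absolute reference into the document URL to fetch and the pointer to resolve inside it. A minimal, library-free sketch with made-up content (the real code delegates to fetch() and resolve_pointer()):

    from urllib.parse import urldefrag, urljoin

    base_url = "https://example.com/tools/align.cwl"
    ref = "common.cwl#/requirements/0"                 # hypothetical 'import' value

    doc_url, pointer = urldefrag(urljoin(base_url, ref))
    print(doc_url)   # https://example.com/tools/common.cwl
    print(pointer)   # /requirements/0

    document = {"requirements": [{"class": "DockerRequirement"}]}   # toy document
    node = document
    for part in pointer.strip("/").split("/"):
        node = node[int(part)] if isinstance(node, list) else node[part]
    print(node)      # {'class': 'DockerRequirement'}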
Example #26
    def resolve_ref(self, obj, base_url):
        ref = obj.pop('import', None)

        url = urlparse.urljoin(base_url, ref)
        if url in self.resolved:
            return self.resolved[url]
        if url in self.resolving:
            raise RuntimeError('Circular reference for url %s' % url)
        self.resolving[url] = True
        doc_url, pointer = urlparse.urldefrag(url)
        document = self.fetch(doc_url)
        fragment = copy.deepcopy(resolve_pointer(document, pointer))
        try:
            result = self.resolve_all(fragment, doc_url)
        finally:
            del self.resolving[url]
        return result
Example #27
    def clean_url(url):
        """Cleans any url of relative paths remaining after urljoins.

        :param url: any url containing relative path contamination
        :type url: str
        :rtype: str
        :returns: cleaned url

        usage::
            >>> clean_url('http://google.com/../../url/path/#frag')
            'http://google.com/url/path/'
            >>> clean_url('../../url/path')
            '/url/path'
            >>> clean_url('./same/dir/')
            'same/dir/'

        """
        return RELATIVE_PATHS.sub('', unquote(urldefrag(url)[0]))
Example #28
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True):
    """Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    """

    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url = urldefrag(url)[0]
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    return '?'.join([base, sep.join(querylist)]) if querylist else base
Example #29
    def traverse(self, node, key=ROOT_NODE, parents=None, cb=print):
        """ Recursively call nested elements."""

        # Trim parents breadcrumb as 4 will suffice.
        parents = parents[-4:] if parents else []

        # Unwind items as a dict or an enumerated list
        # to simplify traversal.
        if isinstance(node, (dict, list)):
            valuelist = node.items() if isinstance(node, dict) else enumerate(node)
            if key is not ROOT_NODE:
                parents.append(key)
            parents.append(node)
            for k, i in valuelist:
                self.traverse(i, k, parents, cb)
            return

        # Resolve HTTP references adding fragments
        # to 'schema', 'headers' or 'parameters'
        if key == '$ref' and node.startswith("http"):
            ancestor, needle = parents[-3:-1]
            # log.info(f"replacing: {needle} in {ancestor} with ref {node}")
            ancestor[needle] = cb(key, node)

            # Use a pre and post traversal functions.
            # - before: append the reference to yaml_components
            # - traverse
            # - after: deepcopy the resulting item in the yaml_components
            #          then replace it with the reference in the specs
            if needle in COMPONENTS_MAP:
                host, fragment = urldefrag(node)
                fragment = fragment.strip("/")
                needle_alias = COMPONENTS_MAP[needle]
                self.yaml_components[needle_alias][fragment] = ancestor[needle]
            if isinstance(ancestor[needle], (dict, list)):
                self.traverse(ancestor[needle], key, parents, cb)
            if needle in COMPONENTS_MAP:
                # Now the node is fully resolved. I can replace it with the
                # Deepcopy
                self.yaml_components[needle_alias][
                    fragment] = deepcopy(ancestor[needle])
                ancestor[needle] = {"$ref": "#" +
                                    join("/components", needle_alias, fragment)}
Example #30
 def resolve_ref(self, obj, base_url):
     ref, mixin, checksum = (obj.pop("$ref", None), obj.pop("$mixin", None), obj.pop("$checksum", None))
     ref = ref or mixin
     url = urlparse.urljoin(base_url, ref)
     if url in self.resolved:
         return self.resolved[url]
     if url in self.resolving:
         raise RuntimeError("Circular reference for url %s" % url)
     self.resolving[url] = True
     doc_url, pointer = urlparse.urldefrag(url)
     document = self.fetch(doc_url)
     fragment = copy.deepcopy(resolve_pointer(document, pointer))
     try:
         self.verify_checksum(checksum, fragment)
         if isinstance(fragment, dict) and mixin:
             fragment = dict(obj, **fragment)
         result = self.resolve_all(fragment, doc_url)
     finally:
         del self.resolving[url]
     return result
Example #31
    def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = request.method
        headers = TxHeaders(request.headers)
        bodyproducer = _RequestBodyProducer(request.body) if request.body else None

        start_time = time()
        d = agent.request(method, url, headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d
Example #32
 def resolve_ref(self, obj, base_url):
     ref, mixin, checksum = (obj.pop('$ref', None), obj.pop('$mixin', None),
                             obj.pop('$checksum', None))
     ref = ref or mixin
     url = urlparse.urljoin(base_url, ref)
     if url in self.resolved:
         return self.resolved[url]
     if url in self.resolving:
         raise RuntimeError('Circular reference for url %s' % url)
     self.resolving[url] = True
     doc_url, pointer = urlparse.urldefrag(url)
     document = self.fetch(doc_url)
     fragment = copy.deepcopy(resolve_pointer(document, pointer))
     try:
         self.verify_checksum(checksum, fragment)
         if isinstance(fragment, dict) and mixin:
             fragment = dict(obj, **fragment)
         result = self.resolve_all(fragment, doc_url)
     finally:
         del self.resolving[url]
     return result
Example #33
    def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per HTTP RFCs, but it's common behavior, and some
            # servers require this, otherwise returning HTTP 411 Length required
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this;
            # Using a producer with an empty-string sends `0` as Content-Length
            # for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None
        start_time = time()
        d = agent.request(method, to_bytes(url, encoding='ascii'), headers,
                          bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d
Example #34
    def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # Setting Content-Length: 0 even for POST requests is not a
            # MUST per HTTP RFCs, but it's common behavior, and some
            # servers require this, otherwise returning HTTP 411 Length required
            #
            # RFC 7230#section-3.3.2:
            # "a Content-Length header field is normally sent in a POST
            # request even when the value is 0 (indicating an empty payload body)."
            #
            # Twisted < 17 will not add "Content-Length: 0" by itself;
            # Twisted >= 17 fixes this;
            # Using a producer with an empty-string sends `0` as Content-Length
            # for all versions of Twisted.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None
        start_time = time()
        d = agent.request(
            method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d
Example #35
    def parse_toc_element(self, url, li):
        """Fill the toc item"""
        a_link = li.find('a', attrs={'class': 'articleLink'}, recursive=False)

        if a_link:
            # cleanup_url(urljoin(url, a_link.attrs['href']))
            return Article(a_link.text.strip(), None,
                           re.sub('^art', '', a_link.attrs['id']))

        title = li.find(['span', 'a'], attrs={'class': 'title-link'},
                        recursive=False)

        match = re.match(r'(.*?)(?: \((Articles .*)\))?$',
                         merge_spaces(title.text.strip()))
        title_text, articles = match.groups()

        section = Section(title.attrs['id'], title_text)

        if 'href' in title.attrs:
            section_url = urldefrag(urljoin(url, title.attrs['href']))[0]
            section.url_section = urljoin(url, section_url)

        for ul in find_all_non_nested(li, 'ul'):
            for child_node in ul.find_all('li', recursive=False):
                child = self.parse_toc_element(url, child_node)
                if isinstance(child, Article) and self.with_articles:
                    if section.articles is None:
                        section.articles = []
                    section.articles.append(child)
                elif isinstance(child, Section):
                    if section.children is None:
                        section.children = []
                    section.children.append(child)

        if not section.children and not self.with_articles:
            section.articles = articles

        return section
Example #36
def escape_ajax(url):
    """
    Return the crawleable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    #>>>urlparse('http://www.example.com/ajax.html?k1=v1&k2=v2#!key=value')
    #ParseResult(scheme='http', netloc='www.example.com', path='/ajax.html'
    #           , params='', query='k1=v1&k2=v2', fragment='!key=value')

    defrag, frag = urldefrag(url)
    # urldefrag() simply splits the fragment off the URL. For the URL above:
    # ('http://www.example.com/ajax.html?k1=v1&k2=v2', '!key=value')

    # If the fragment does not start with '!', return the URL unchanged.
    # Otherwise rewrite the URL with the escaped fragment.
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_',
                                    frag[1:])  # [1:] drops the leading '!'.
Example #37
    def download_request(self, request):
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            headers.removeHeader(b'Proxy-Authorization')
        bodyproducer = _RequestBodyProducer(request.body) if request.body else None

        start_time = time()
        d = agent.request(
            method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d
Example #38
    def resolve_ref(self, obj, base_url):
        ref = obj.pop('import', None)
        txt = obj.pop('include', None)

        parse = txt is None

        url = urlparse.urljoin(base_url, ref or txt)
        if url in self.resolved:
            return self.resolved[url]
        if url in self.resolving:
            raise RuntimeError('Circular reference for url %s' % url)
        self.resolving[url] = True
        doc_url, pointer = urlparse.urldefrag(url)
        try:
            document = self.fetch(doc_url, parse)
            if parse:
                fragment = (copy.deepcopy(self.index.get("#" + pointer))
                            or resolve_pointer(document, pointer))
                result = self.resolve_all(fragment, doc_url)
            else:
                result = document
        finally:
            del self.resolving[url]
        return result
Example #39
def escape_ajax(url):
    """
    Return the crawleable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    #>>>urlparse('http://www.example.com/ajax.html?k1=v1&k2=v2#!key=value')
    #ParseResult(scheme='http', netloc='www.example.com', path='/ajax.html'
    #           , params='', query='k1=v1&k2=v2', fragment='!key=value')

    defrag, frag = urldefrag(url)
    # urldefrag() simply splits the fragment off the URL. For the URL above:
    # ('http://www.example.com/ajax.html?k1=v1&k2=v2', '!key=value')

    # If the fragment does not start with '!', return the URL unchanged.
    # Otherwise rewrite the URL with the escaped fragment.
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])  # [1:] drops the leading '!'.
Example #40
    def resolve_ref(self, obj, base_url):
        ref = obj.pop('import', None)
        txt = obj.pop('include', None)

        parse = txt is None

        url = urlparse.urljoin(base_url, ref or txt)
        if url in self.resolved:
            return self.resolved[url]
        if url in self.resolving:
            raise RuntimeError('Circular reference for url %s' % url)
        self.resolving[url] = True
        doc_url, pointer = urlparse.urldefrag(url)
        try:
            document = self.fetch(doc_url, parse)
            if parse:
                fragment = (copy.deepcopy(self.index.get("#" + pointer))
                            or resolve_pointer(document, pointer))
                result = self.resolve_all(fragment, doc_url)
            else:
                result = document
        finally:
            del self.resolving[url]
        return result
Example #41
 def raw_process_reference(self, path):
     uri = "file://" + os.path.abspath(path)
     fileuri, _ = urldefrag(uri)
     return RawProcessReference(self.raw_document_loader.fetch(fileuri), uri)
Example #42
def get_base_uri(uri):
    parsed = urlparse(uri)
    if parsed.fragment:
        return "{}#".format(urldefrag(uri)[0])

    return "{}/".format(uri.rsplit('/', 1)[0])
Example #43
def _get_versioned_url(full_url, version):
    parsed_url, _ = parse.urldefrag(full_url)

    if version[-1] != '/':
        version += '/'
    return parse.urljoin(parsed_url, version)
Example #44
 def raw_process_reference(self, path):
     uri = "file://" + os.path.abspath(path)
     fileuri, _ = urldefrag(uri)
     return RawProcessReference(self.raw_document_loader.fetch(fileuri),
                                uri)