def startElementNS(self, name, qname, attrs):
    stack = self.stack
    stack.append(ElementHandler())
    current = self.current
    parent = self.parent
    base = attrs.get(BASE, None)
    if base is not None:
        base, frag = urldefrag(base)
        if parent and parent.base:
            base = urljoin(parent.base, base)
        else:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base = urljoin(systemId, base)
    else:
        if parent:
            base = parent.base
        if base is None:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base, frag = urldefrag(systemId)
    current.base = base
    language = attrs.get(LANG, None)
    if language is None:
        if parent:
            language = parent.language
    current.language = language
    current.start(name, qname, attrs)
def check_traverse_and_set_context(self, key, node):
    """Check whether a $ref needs to be resolved.

    The decision is based on the node, e.g. whether it is a remote
    reference (starting with http) or a local one. As both local and
    remote references can be relative to the given file, a self.context
    attribute is used to distinguish whether the $ref lives in the
    original file or in an external source.
    :param key:
    :param node:
    :return: (do_traverse, new_context); do_traverse is True if the node has to be resolved.
    """
    if key != "$ref":
        return False, None

    if node.startswith("#/"):
        # local reference
        try:
            is_local_ref = finddict(self.openapi, fragment_to_keys(node))
        except KeyError:
            is_local_ref = False

        # Don't resolve local references already in the spec.
        if is_local_ref:
            return False, None

        # Resolve local references in external files.
        if self.context:
            return True, None

        return False, None

    if node.startswith("http"):
        # url reference
        host, fragment = urldefrag(node)
        return True, host

    if node.startswith("file://"):
        raise NotImplementedError

    host, fragment = urldefrag(node)
    if self.context:
        if self.context.startswith("http"):
            p = urljoin(self.context, host)
            # log.info(f"trying to set context {p}. Was {self.context}. host is: {host}.")
            return True, p

        p = Path(self.context).parent.joinpath(host)
        # log.info(f"trying to set context {p}. Was {self.context}. host is: {host}. resolved is {p.resolve()}")
        if p.is_file():
            return True, str(p.resolve())
        else:
            log.warning("can't set context %r. Retains %r", p, self.context)

    # Remote references should use the previous context.
    # A better approach would be to track nodes together
    # with their context.
    return True, None
def url(self, name, force=False):
    """
    Returns the real URL in DEBUG mode.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            hashed_name = self.stored_name(clean_name)

    final_url = super(HashedFilesMixin, self).url(hashed_name)

    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)

    return unquote(final_url)
def download_request(self, request): timeout = request.meta.get('download_timeout') or self._connectTimeout agent = self._get_agent(request, timeout) # request details url = urldefrag(request.url)[0] method = to_bytes(request.method) headers = TxHeaders(request.headers) if isinstance(agent, self._TunnelingAgent): headers.removeHeader(b'Proxy-Authorization') bodyproducer = _RequestBodyProducer( request.body) if request.body else None start_time = time() d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer) # set download latency d.addCallback(self._cb_latency, request, start_time) # response body is ready to be consumed d.addCallback(self._cb_bodyready, request) d.addCallback(self._cb_bodydone, request, url) # check download timeout self._timeout_cl = reactor.callLater(timeout, d.cancel) d.addBoth(self._cb_timeout, request, url, timeout) return d
def __init__(self, request, timeout=180):
    self.url = urldefrag(request.url)[0]
    self.method = request.method
    self.body = request.body or None
    self.headers = Headers(request.headers)
    self.response_headers = None
    self.timeout = request.meta.get('download_timeout') or timeout
    self.start_time = time()
    self.deferred = defer.Deferred().addCallback(self._build_response, request)

    # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
    # to have _disconnectedDeferred. See Twisted r32329.
    # As Scrapy implements its own logic to handle redirects, there is no
    # need to add the _waitForDisconnect callback.
    # Specifically this avoids the AttributeError exception when the
    # clientConnectionFailed method is called.
    self._disconnectedDeferred = defer.Deferred()

    self._set_connection_attributes(request)

    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)

    # set Content-Length based on len of body
    if self.body is not None:
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault("Connection", "close")
    # Content-Length must be specified in POST method even with no body
    elif self.method == 'POST':
        self.headers['Content-Length'] = 0
def get_disk_name(ovf):
    """Get the disk format and file name from an OVF descriptor."""
    root = etree.fromstring(ovf)
    ovf_ns = root.nsmap['ovf']
    id_attr = '{%s}id' % ovf_ns
    href_attr = '{%s}href' % ovf_ns
    files = {f.get(id_attr): f.get(href_attr)
             for f in root.findall('ovf:References/ovf:File', root.nsmap)}
    # we do not care about more than one disk
    disk = root.find('ovf:DiskSection/ovf:Disk', root.nsmap)
    if disk is not None:
        format_attr = '{%s}format' % ovf_ns
        fileref_attr = '{%s}fileRef' % ovf_ns
        ovf_format = disk.get(format_attr)
        if not ovf_format:
            raise Exception("Expecting some format!")
        (format_url, _) = parse.urldefrag(ovf_format)
        try:
            disk_format = SPECS[format_url]
        except KeyError:
            raise Exception("Unknown format!")
        try:
            disk_file = files[disk.get(fileref_attr)]
        except KeyError:
            raise Exception("Unknown disk!")
        return (disk_format, disk_file)
    return None, None
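# Usage sketch for get_disk_name (not from the original module): the SPECS
# mapping and the OVF snippet below are illustrative stand-ins for the real
# module-level table and a real descriptor, assuming lxml.etree is available.
SPECS = {
    # hypothetical entry mapping a defragged format URL to a short format name
    "http://www.vmware.com/interfaces/specifications/vmdk.html": "vmdk",
}

sample_ovf = b"""
<ovf:Envelope xmlns:ovf="http://schemas.dmtf.org/ovf/envelope/1">
  <ovf:References>
    <ovf:File ovf:id="file1" ovf:href="disk1.vmdk"/>
  </ovf:References>
  <ovf:DiskSection>
    <ovf:Disk ovf:fileRef="file1"
        ovf:format="http://www.vmware.com/interfaces/specifications/vmdk.html#streamOptimized"/>
  </ovf:DiskSection>
</ovf:Envelope>"""

# The fragment (#streamOptimized) is dropped by urldefrag before the SPECS lookup.
print(get_disk_name(sample_ovf))  # expected: ('vmdk', 'disk1.vmdk')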
def include_root_definition(self):
    self.known_mappings['definitions'].update({
        urlparse(v._json_reference): self.descend(value=v._model_spec)
        for v in itervalues(self.swagger_spec.definitions)
        # urldefrag(url)[0] returns the url without the fragment; it is guaranteed to be present
        if urldefrag(v._json_reference)[0] == self.swagger_spec.origin_url
    })
def __init__(self, request, timeout=180):
    self._url = urldefrag(request.url)[0]
    # converting to bytes to comply with the Twisted interface
    self.url = to_bytes(self._url, encoding='ascii')
    self.method = to_bytes(request.method, encoding='ascii')
    self.body = request.body or None
    self.headers = Headers(request.headers)
    self.response_headers = None
    self.timeout = request.meta.get('download_timeout') or timeout
    self.start_time = time()
    self.deferred = defer.Deferred().addCallback(self._build_response, request)

    # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
    # to have _disconnectedDeferred. See Twisted r32329.
    # As Scrapy implements its own logic to handle redirects, there is no
    # need to add the _waitForDisconnect callback.
    # Specifically this avoids the AttributeError exception when the
    # clientConnectionFailed method is called.
    self._disconnectedDeferred = defer.Deferred()

    self._set_connection_attributes(request)

    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)

    # set Content-Length based on len of body
    if self.body is not None:
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault("Connection", "close")
    # Content-Length must be specified in POST method even with no body
    elif self.method == b'POST':
        self.headers['Content-Length'] = 0
def escape_ajax(url):
    """
    Return the crawlable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])
def escape_ajax(url):
    """
    Return the crawlable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith("!"):
        return url
    return add_or_replace_parameter(defrag, "_escaped_fragment_", frag[1:])
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=',
                      remove=False, unique=True, keep_fragments=False):
    """Clean URL arguments, leaving only those passed in ``parameterlist`` and keeping their order.

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'
    """
    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    url = '?'.join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments:
        url += '#' + fragment
    return url
def traverse(self, node, key=ROOT_NODE, parents=None, cb=print, context=None):
    """Recursively call nested elements."""
    # Trim parents breadcrumb as 4 will suffice.
    parents = parents[-4:] if parents else []

    # Unwind items as a dict or an enumerated list
    # to simplify traversal.
    if isinstance(node, (dict, list)):
        valuelist = node.items() if isinstance(node, dict) else enumerate(node)
        if key is not ROOT_NODE:
            parents.append(key)
        parents.append(node)
        for k, i in valuelist:
            self.traverse(i, k, parents, cb, context)
        return

    # Resolve HTTP references adding fragments
    # to 'schema', 'headers' or 'parameters'.
    do_traverse, new_context = self.check_traverse_and_set_context(key, node)

    # If the context changes, update the global pointer too.
    # TODO: we would eventually get rid of self.context completely.
    if new_context:
        self.context = new_context
        context = new_context

    # log.info(f"test node context {key}, {node}, {do_traverse}")
    log.debug("test node context %r, %r, %r", key, node, do_traverse)
    if do_traverse:
        ancestor, needle = parents[-3:-1]
        # log.info(f"replacing: {needle} in {ancestor} with ref {node}. Parents are {parents}")
        ancestor[needle] = cb(key, node, context)

        # Get the component in which to store the given item.
        component_name = self.get_component_name(needle, parents)

        # Use pre- and post-traversal functions:
        # - before: append the reference to yaml_components;
        # - traverse;
        # - after: deepcopy the resulting item into yaml_components,
        #   then replace it with the reference in the specs.
        if component_name:
            # log.info(f"needle {needle} in components_map.")
            host, fragment = urldefrag(node)
            fragment = basename(fragment.strip("/"))
            self.yaml_components[component_name][fragment] = ancestor[needle]

        if isinstance(ancestor[needle], (dict, list)):
            self.traverse(ancestor[needle], key, parents, cb, context)

        if component_name:
            # Now the node is fully resolved. I can replace it with the
            # deepcopy.
            self.yaml_components[component_name][fragment] = deepcopy(
                ancestor[needle]
            )
            ancestor[needle] = {
                "$ref": "#" + join("/components", component_name, fragment)
            }
def getDisplayIdentifier(self):
    """Return the display_identifier if set, else return the claimed_id.
    """
    if self.display_identifier is not None:
        return self.display_identifier
    if self.claimed_id is None:
        return None
    else:
        return urldefrag(self.claimed_id)[0]
def absolutize(self, uri, defrag=1):
    base = urljoin("file:", pathname2url(os.getcwd()))
    result = urljoin("%s/" % base, uri, allow_fragments=not defrag)
    if defrag:
        result = urldefrag(result)[0]
    if not defrag:
        if uri and uri[-1] == "#" and result[-1] != "#":
            result = "%s#" % result
    return URIRef(result)
def get_yaml_reference(self, f):
    # log.info(f"Downloading {f}")
    host, fragment = urldefrag(f)
    if host not in self.yaml_cache:
        self.yaml_cache[host] = open_file_or_url(host)

    f_yaml = yaml.safe_load(self.yaml_cache[host])
    if fragment.strip("/"):
        f_yaml = finddict(f_yaml, fragment_to_keys(fragment))
    return f_yaml
def normalizeURL(url):
    """Normalize a URL, converting normalization failures to
    DiscoveryFailure"""
    try:
        normalized = urinorm.urinorm(url)
    except ValueError as why:
        raise DiscoveryFailure(
            'Normalizing identifier: %s' % six.text_type(why), None)
    else:
        return urldefrag(normalized)[0]
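# Illustrative usage only (not from the original module); this assumes urinorm
# leaves an already-normalised http URL unchanged, so the visible effect of
# normalizeURL here is just fragment removal.
assert normalizeURL("http://example.com/path#frag") == "http://example.com/path"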
def get_yaml_reference(f, yaml_cache=None):
    # log.info(f"Downloading {f}")
    if yaml_cache is None:
        # default to a fresh per-call cache so the membership test below never runs against None
        yaml_cache = {}
    host, fragment = urldefrag(f)
    if host not in yaml_cache:
        yaml_cache[host] = urlopen(host).read()

    f_yaml = yaml.load(yaml_cache[host])
    if fragment.strip("/"):
        f_yaml = finddict(f_yaml, fragment.strip("/").split("/"))
    return f_yaml
def update_params(_url, _debug=False, **params):
    """Update the query parameters in a URL.

    ``_url`` is any URL, with or without a query string.

    ``**params`` are query parameters to add or replace. Each value may be a
    string, a list of strings, or None. Passing a list generates multiple
    values for the same parameter. Passing None deletes the corresponding
    parameter if present.

    Return the new URL.

    *Debug mode:* if ``_debug=True``, return a tuple: ``[0]`` is the URL
    without query string or fragment, ``[1]`` is the final query parameters
    as a dict, and ``[2]`` is the fragment part of the original URL or the
    empty string.

    Usage:

    >>> update_params("foo", new1="NEW1")
    'foo?new1=NEW1'
    >>> update_params("foo?p=1", p="2")
    'foo?p=2'
    >>> update_params("foo?p=1", p=None)
    'foo'
    >>> update_params("http://example.com/foo?new1=OLD1#myfrag", new1="NEW1")
    'http://example.com/foo?new1=NEW1#myfrag'
    >>> update_params("http://example.com/foo?new1=OLD1#myfrag", new1="NEW1", _debug=True)
    ('http://example.com/foo', {'new1': 'NEW1'}, 'myfrag')
    >>> update_params("http://www.mau.de?foo=2", brrr=3)
    'http://www.mau.de?foo=2&brrr=3'
    >>> update_params("http://www.mau.de?foo=A&foo=B", foo=["C", "D"])
    'http://www.mau.de?foo=C&foo=D'
    """
    url, fragment = urldefrag(_url)
    if "?" in url:
        url, qs = url.split("?", 1)
        query = parse_qs(qs)
    else:
        query = {}
    for key in params:
        value = params[key]
        if value is not None:
            query[key] = value
        elif key in query:
            del query[key]
    if _debug:
        return url, query, fragment
    qs = urlencode(query, True)
    if qs:
        qs = "?" + qs
    if fragment:
        fragment = "#" + fragment
    return "{0}{1}{2}".format(url, qs, fragment)
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=',
                      remove=False, unique=True, keep_fragments=False):
    """Clean URL arguments, leaving only those passed in ``parameterlist`` and keeping their order.

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'
    """
    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        if not ksv:
            continue
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    url = '?'.join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments:
        url += '#' + fragment
    return url
def get_yaml_reference(self, f):
    # log.info(f"Downloading {f}")
    host, fragment = urldefrag(f)
    if host not in self.yaml_cache:
        self.yaml_cache[host] = urlopen(host).read()

    f_yaml = yaml.safe_load(self.yaml_cache[host])
    if fragment.strip("/"):
        f_yaml = finddict(f_yaml, fragment.strip("/").split("/"))
    return f_yaml
def resolve_node(self, key, node):
    # log.info(f"Resolving {node}")
    n = node
    if not node.startswith('http'):
        # Check if self.context already points to node
        host, fragment = urldefrag(n)
        if self.context and self.context.endswith(host):
            n = urljoin(self.context, "#" + fragment)
        else:
            n = urljoin(self.context, node)
    _yaml = self.get_yaml_reference(n)
    return _yaml
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=',
                      remove=False, unique=True):
    """Clean URL arguments, leaving only those passed in ``parameterlist`` and keeping their order.

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>
    """
    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url = urldefrag(url)[0]
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    return '?'.join([base, sep.join(querylist)]) if querylist else base
def resolve_node(self, key, node, context):
    """This is the callback."""
    # log.info(f"Resolving {node}, {context}")
    n = node
    if not node.startswith("http"):
        # Check if context already points to node
        host, fragment = urldefrag(n)
        if context and context.endswith(host):
            n = urljoin(context, "#" + fragment)
        else:
            n = urljoin(context, node)
    _yaml = self.get_yaml_reference(n)
    return _yaml
def resolve_ref(self, obj, base_url):
    ref = obj.pop('import', None)
    url = urlparse.urljoin(base_url, ref)
    if url in self.resolved:
        return self.resolved[url]
    if url in self.resolving:
        raise RuntimeError('Circular reference for url %s' % url)
    self.resolving[url] = True
    doc_url, pointer = urlparse.urldefrag(url)
    document = self.fetch(doc_url)
    fragment = copy.deepcopy(resolve_pointer(document, pointer))
    try:
        result = self.resolve_all(fragment, doc_url)
    finally:
        del self.resolving[url]
    return result
def clean_url(url):
    """Cleans any url of relative paths remaining after urljoins.

    :param url: any url containing relative path contamination
    :type url: str
    :rtype: str
    :returns: cleaned url

    usage::

        >>> clean_url('http://google.com/../../url/path/#frag')
        'http://google.com/url/path/'
        >>> clean_url('../../url/path')
        '/url/path'
        >>> clean_url('./same/dir/')
        'same/dir/'
    """
    return RELATIVE_PATHS.sub('', unquote(urldefrag(url)[0]))
def traverse(self, node, key=ROOT_NODE, parents=None, cb=print):
    """Recursively call nested elements."""
    # Trim parents breadcrumb as 4 will suffice.
    parents = parents[-4:] if parents else []

    # Unwind items as a dict or an enumerated list
    # to simplify traversal.
    if isinstance(node, (dict, list)):
        valuelist = node.items() if isinstance(node, dict) else enumerate(node)
        if key is not ROOT_NODE:
            parents.append(key)
        parents.append(node)
        for k, i in valuelist:
            self.traverse(i, k, parents, cb)
        return

    # Resolve HTTP references adding fragments
    # to 'schema', 'headers' or 'parameters'.
    if key == '$ref' and node.startswith("http"):
        ancestor, needle = parents[-3:-1]
        # log.info(f"replacing: {needle} in {ancestor} with ref {node}")
        ancestor[needle] = cb(key, node)

        # Use pre- and post-traversal functions:
        # - before: append the reference to yaml_components;
        # - traverse;
        # - after: deepcopy the resulting item into yaml_components,
        #   then replace it with the reference in the specs.
        if needle in COMPONENTS_MAP:
            host, fragment = urldefrag(node)
            fragment = fragment.strip("/")
            needle_alias = COMPONENTS_MAP[needle]
            self.yaml_components[needle_alias][fragment] = ancestor[needle]

        if isinstance(ancestor[needle], (dict, list)):
            self.traverse(ancestor[needle], key, parents, cb)

        if needle in COMPONENTS_MAP:
            # Now the node is fully resolved. I can replace it with the
            # deepcopy.
            self.yaml_components[needle_alias][fragment] = deepcopy(ancestor[needle])
            ancestor[needle] = {
                "$ref": "#" + join("/components", needle_alias, fragment)
            }
def resolve_ref(self, obj, base_url):
    ref, mixin, checksum = (obj.pop("$ref", None),
                            obj.pop("$mixin", None),
                            obj.pop("$checksum", None))
    ref = ref or mixin
    url = urlparse.urljoin(base_url, ref)
    if url in self.resolved:
        return self.resolved[url]
    if url in self.resolving:
        raise RuntimeError("Circular reference for url %s" % url)
    self.resolving[url] = True
    doc_url, pointer = urlparse.urldefrag(url)
    document = self.fetch(doc_url)
    fragment = copy.deepcopy(resolve_pointer(document, pointer))
    try:
        self.verify_checksum(checksum, fragment)
        if isinstance(fragment, dict) and mixin:
            fragment = dict(obj, **fragment)
        result = self.resolve_all(fragment, doc_url)
    finally:
        del self.resolving[url]
    return result
def download_request(self, request): timeout = request.meta.get('download_timeout') or self._connectTimeout agent = self._get_agent(request, timeout) # request details url = urldefrag(request.url)[0] method = request.method headers = TxHeaders(request.headers) bodyproducer = _RequestBodyProducer(request.body) if request.body else None start_time = time() d = agent.request(method, url, headers, bodyproducer) # set download latency d.addCallback(self._cb_latency, request, start_time) # response body is ready to be consumed d.addCallback(self._cb_bodyready, request) d.addCallback(self._cb_bodydone, request, url) # check download timeout self._timeout_cl = reactor.callLater(timeout, d.cancel) d.addBoth(self._cb_timeout, request, url, timeout) return d
def resolve_ref(self, obj, base_url):
    ref, mixin, checksum = (obj.pop('$ref', None),
                            obj.pop('$mixin', None),
                            obj.pop('$checksum', None))
    ref = ref or mixin
    url = urlparse.urljoin(base_url, ref)
    if url in self.resolved:
        return self.resolved[url]
    if url in self.resolving:
        raise RuntimeError('Circular reference for url %s' % url)
    self.resolving[url] = True
    doc_url, pointer = urlparse.urldefrag(url)
    document = self.fetch(doc_url)
    fragment = copy.deepcopy(resolve_pointer(document, pointer))
    try:
        self.verify_checksum(checksum, fragment)
        if isinstance(fragment, dict) and mixin:
            fragment = dict(obj, **fragment)
        result = self.resolve_all(fragment, doc_url)
    finally:
        del self.resolving[url]
    return result
def download_request(self, request): timeout = request.meta.get('download_timeout') or self._connectTimeout agent = self._get_agent(request, timeout) # request details url = urldefrag(request.url)[0] method = to_bytes(request.method) headers = TxHeaders(request.headers) if isinstance(agent, self._TunnelingAgent): headers.removeHeader(b'Proxy-Authorization') if request.body: bodyproducer = _RequestBodyProducer(request.body) elif method == b'POST': # Setting Content-Length: 0 even for POST requests is not a # MUST per HTTP RFCs, but it's common behavior, and some # servers require this, otherwise returning HTTP 411 Length required # # RFC 7230#section-3.3.2: # "a Content-Length header field is normally sent in a POST # request even when the value is 0 (indicating an empty payload body)." # # Twisted < 17 will not add "Content-Length: 0" by itself; # Twisted >= 17 fixes this; # Using a producer with an empty-string sends `0` as Content-Length # for all versions of Twisted. bodyproducer = _RequestBodyProducer(b'') else: bodyproducer = None start_time = time() d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer) # set download latency d.addCallback(self._cb_latency, request, start_time) # response body is ready to be consumed d.addCallback(self._cb_bodyready, request) d.addCallback(self._cb_bodydone, request, url) # check download timeout self._timeout_cl = reactor.callLater(timeout, d.cancel) d.addBoth(self._cb_timeout, request, url, timeout) return d
def download_request(self, request): timeout = request.meta.get('download_timeout') or self._connectTimeout agent = self._get_agent(request, timeout) # request details url = urldefrag(request.url)[0] method = to_bytes(request.method) headers = TxHeaders(request.headers) if isinstance(agent, self._TunnelingAgent): headers.removeHeader(b'Proxy-Authorization') if request.body: bodyproducer = _RequestBodyProducer(request.body) elif method == b'POST': # Setting Content-Length: 0 even for POST requests is not a # MUST per HTTP RFCs, but it's common behavior, and some # servers require this, otherwise returning HTTP 411 Length required # # RFC 7230#section-3.3.2: # "a Content-Length header field is normally sent in a POST # request even when the value is 0 (indicating an empty payload body)." # # Twisted < 17 will not add "Content-Length: 0" by itself; # Twisted >= 17 fixes this; # Using a producer with an empty-string sends `0` as Content-Length # for all versions of Twisted. bodyproducer = _RequestBodyProducer(b'') else: bodyproducer = None start_time = time() d = agent.request( method, to_bytes(url, encoding='ascii'), headers, bodyproducer) # set download latency d.addCallback(self._cb_latency, request, start_time) # response body is ready to be consumed d.addCallback(self._cb_bodyready, request) d.addCallback(self._cb_bodydone, request, url) # check download timeout self._timeout_cl = reactor.callLater(timeout, d.cancel) d.addBoth(self._cb_timeout, request, url, timeout) return d
def parse_toc_element(self, url, li):
    """Fill the toc item"""
    a_link = li.find('a', attrs={'class': 'articleLink'}, recursive=False)
    if a_link:
        # cleanup_url(urljoin(url, a_link.attrs['href']))
        return Article(a_link.text.strip(), None, re.sub('^art', '', a_link.attrs['id']))

    title = li.find(['span', 'a'], attrs={'class': 'title-link'}, recursive=False)
    match = re.match(r'(.*?)(?: \((Articles .*)\))?$', merge_spaces(title.text.strip()))
    title_text, articles = match.groups()
    section = Section(title.attrs['id'], title_text)
    if 'href' in title.attrs:
        section_url = urldefrag(urljoin(url, title.attrs['href']))[0]
        section.url_section = urljoin(url, section_url)

    for ul in find_all_non_nested(li, 'ul'):
        for child_node in ul.find_all('li', recursive=False):
            child = self.parse_toc_element(url, child_node)
            if isinstance(child, Article) and self.with_articles:
                if section.articles is None:
                    section.articles = []
                section.articles.append(child)
            elif isinstance(child, Section):
                if section.children is None:
                    section.children = []
                section.children.append(child)

    if not section.children and not self.with_articles:
        section.articles = articles

    return section
def escape_ajax(url):
    """
    Return the crawlable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    # >>> urlparse('http://www.example.com/ajax.html?k1=v1&k2=v2#!key=value')
    # ParseResult(scheme='http', netloc='www.example.com', path='/ajax.html',
    #             params='', query='k1=v1&k2=v2', fragment='!key=value')
    defrag, frag = urldefrag(url)
    # urldefrag pulls the fragment out on its own. For the url above it returns:
    # ('http://www.example.com/ajax.html?k1=v1&k2=v2', '!key=value')
    # If the fragment does not start with '!', return the url unchanged;
    # otherwise rewrite the url.
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])  # [1:] strips the '!'
def download_request(self, request): timeout = request.meta.get('download_timeout') or self._connectTimeout agent = self._get_agent(request, timeout) # request details url = urldefrag(request.url)[0] method = to_bytes(request.method) headers = TxHeaders(request.headers) if isinstance(agent, self._TunnelingAgent): headers.removeHeader(b'Proxy-Authorization') bodyproducer = _RequestBodyProducer(request.body) if request.body else None start_time = time() d = agent.request( method, to_bytes(url, encoding='ascii'), headers, bodyproducer) # set download latency d.addCallback(self._cb_latency, request, start_time) # response body is ready to be consumed d.addCallback(self._cb_bodyready, request) d.addCallback(self._cb_bodydone, request, url) # check download timeout self._timeout_cl = reactor.callLater(timeout, d.cancel) d.addBoth(self._cb_timeout, request, url, timeout) return d
def resolve_ref(self, obj, base_url):
    ref = obj.pop('import', None)
    txt = obj.pop('include', None)
    parse = txt is None
    url = urlparse.urljoin(base_url, ref or txt)
    if url in self.resolved:
        return self.resolved[url]
    if url in self.resolving:
        raise RuntimeError('Circular reference for url %s' % url)
    self.resolving[url] = True
    doc_url, pointer = urlparse.urldefrag(url)
    try:
        document = self.fetch(doc_url, parse)
        if parse:
            fragment = (copy.deepcopy(self.index.get("#" + pointer))
                        or resolve_pointer(document, pointer))
            result = self.resolve_all(fragment, doc_url)
        else:
            result = document
    finally:
        del self.resolving[url]
    return result
def escape_ajax(url):
    """
    Return the crawlable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    # >>> urlparse('http://www.example.com/ajax.html?k1=v1&k2=v2#!key=value')
    # ParseResult(scheme='http', netloc='www.example.com', path='/ajax.html',
    #             params='', query='k1=v1&k2=v2', fragment='!key=value')
    defrag, frag = urldefrag(url)
    # urldefrag pulls the fragment out on its own. For the url above it returns:
    # ('http://www.example.com/ajax.html?k1=v1&k2=v2', '!key=value')
    # If the fragment does not start with '!', return the url unchanged;
    # otherwise rewrite the url.
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])  # [1:] strips the '!'
def raw_process_reference(self, path):
    uri = "file://" + os.path.abspath(path)
    fileuri, _ = urldefrag(uri)
    return RawProcessReference(self.raw_document_loader.fetch(fileuri), uri)
def get_base_uri(uri):
    parsed = urlparse(uri)
    if parsed.fragment:
        return "{}#".format(urldefrag(uri)[0])
    return "{}/".format(uri.rsplit('/', 1)[0])
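# Illustrative checks (not part of the original module), derived directly from
# the two branches above: a fragment yields "<defragged-uri>#", otherwise the
# last path segment is dropped and a trailing slash is appended.
assert get_base_uri("http://example.com/schemas/pet.json#/definitions/Pet") == \
    "http://example.com/schemas/pet.json#"
assert get_base_uri("http://example.com/schemas/pet.json") == "http://example.com/schemas/"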
def _get_versioned_url(full_url, version):
    parsed_url, _ = parse.urldefrag(full_url)
    if version[-1] != '/':
        version += '/'
    return parse.urljoin(parsed_url, version)
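# Illustrative usage (hypothetical endpoint, not from the original module):
# the fragment is dropped and the last path segment is replaced by the
# normalised version, via urljoin's relative-reference resolution.
assert _get_versioned_url("http://identity.example.com:5000/v2.0#token", "v3") == \
    "http://identity.example.com:5000/v3/"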