def startElementNS(self, name, qname, attrs):
    stack = self.stack
    stack.append(ElementHandler())
    current = self.current
    parent = self.parent
    base = attrs.get(BASE, None)
    if base is not None:
        base, frag = urldefrag(base)
        if parent and parent.base:
            base = urljoin(parent.base, base)
        else:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base = urljoin(systemId, base)
    else:
        if parent:
            base = parent.base
        if base is None:
            systemId = self.locator.getPublicId() \
                or self.locator.getSystemId()
            if systemId:
                base, frag = urldefrag(systemId)
    current.base = base
    language = attrs.get(LANG, None)
    if language is None:
        if parent:
            language = parent.language
    current.language = language
    current.start(name, qname, attrs)
def get(self):
    url = request.args.get('url')
    tags = request.args.getlist('tag')
    filters = [db.Bookmark.user == current_user.id]
    if url is not None:
        filters.append(db.Bookmark.url == urldefrag(url).url)
    # If Bookmark.tags is null, filtering will yield no results
    if tags:
        filters.append(db.Bookmark.tags.contains(tags))
    result = db.Bookmark.query.filter(*filters) \
        .order_by(
            db.Bookmark.read.desc().nullsfirst(),
            db.Bookmark.timestamp.desc()) \
        .paginate()
    headers = {}
    links = []
    if result.has_next:
        last_url = update_query(request.url, {'page': result.pages})
        links.append(lh.Link(last_url, rel='last'))
    if links:
        headers['Link'] = lh.format_links(links)
    return list(map(lambda x: x.to_dict(), result.items)), 200, headers
def post(self, id):
    update = request.get_json()
    if 'id' in update:
        return {'error': 'Updating id is not allowed'}, \
            HTTPStatus.BAD_REQUEST
    bookmark = db.Bookmark.query \
        .filter_by(id=id, user=current_user.id) \
        .first()
    if bookmark is None:
        return {'error': 'Not found'}, HTTPStatus.NOT_FOUND
    if 'url' in update:
        bookmark.url = urldefrag(update['url']).url
    if 'title' in update:
        bookmark.title = update['title']
    if 'timestamp' in update:
        bookmark.timestamp = aniso8601.parse_datetime(update['timestamp'])
    if 'read' in update:
        if update['read']:
            bookmark.read = aniso8601.parse_datetime(update['read'])
        else:
            bookmark.read = None
    if 'tags' in update:
        bookmark.tags = update['tags']
    db.db.session.add(bookmark)
    db.db.session.commit()
    return bookmark.to_dict(), HTTPStatus.OK
def get_urls(self, postings: list):
    urls = []
    for posting in postings:
        defraged = urldefrag(self.id_file[posting.doc_id])[0]
        if defraged not in urls:
            urls.append(defraged)
    return urls
def __init__(self, request, timeout=180):
    self._url = urldefrag(request.url)[0]
    # converting to bytes to comply to Twisted interface
    self.url = to_bytes(self._url, encoding='ascii')
    self.method = to_bytes(request.method, encoding='ascii')
    self.body = request.body or None
    self.headers = Headers(request.headers)
    self.response_headers = None
    self.timeout = request.meta.get('download_timeout') or timeout
    self.start_time = time()
    self.deferred = defer.Deferred().addCallback(self._build_response, request)

    # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
    # to have _disconnectedDeferred. See Twisted r32329.
    # As Scrapy implements its own logic to handle redirects there is no
    # need to add the callback _waitForDisconnect.
    # Specifically this avoids the AttributeError exception when
    # clientConnectionFailed method is called.
    self._disconnectedDeferred = defer.Deferred()

    self._set_connection_attributes(request)

    # set Host header based on url
    self.headers.setdefault('Host', self.netloc)

    # set Content-Length based on len of body
    if self.body is not None:
        self.headers['Content-Length'] = len(self.body)
        # just in case a broken http/1.1 decides to keep connection alive
        self.headers.setdefault("Connection", "close")
    # Content-Length must be specified in POST method even with no body
    elif self.method == b'POST':
        self.headers['Content-Length'] = 0
def parse_scm(collection, version):
    """Extract name, version, path and subdir out of the SCM pointer."""
    if ',' in collection:
        collection, version = collection.split(',', 1)
    elif version == '*' or not version:
        version = 'HEAD'

    if collection.startswith('git+'):
        path = collection[4:]
    else:
        path = collection

    path, fragment = urldefrag(path)
    fragment = fragment.strip(os.path.sep)

    if path.endswith(os.path.sep + '.git'):
        name = path.split(os.path.sep)[-2]
    elif '://' not in path and '@' not in path:
        name = path
    else:
        name = path.split('/')[-1]
        if name.endswith('.git'):
            name = name[:-4]

    return name, version, path, fragment
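A usage sketch for the parse_scm helper above, traced on a hypothetical git+https pointer (the repository URL and fragment are made-up examples; the fragment strip assumes a POSIX os.path.sep):

# Hypothetical inputs; traced against parse_scm() above.
name, version, path, fragment = parse_scm(
    'git+https://github.com/org/repo.git#collections/,devel', '*')
# ',' splits off the version           -> version  == 'devel'
# 'git+' prefix is stripped            -> path begins with 'https://'
# urldefrag() removes '#collections/'  -> path     == 'https://github.com/org/repo.git'
#                                         fragment == 'collections'
# trailing '.git' is dropped           -> name     == 'repo'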
def UrlQueueFilter(url, currentUrl, filterArr, inputthread):
    newlist = sum(filterArr, [])
    global SeenDB
    isseen, SeenDB, k = DBCtrl.checkinSeenDB(url, SeenDB)
    if (isseen == 1):
        return False
    elif (isseen == -1):
        DBCtrl.resetSeenDB(inputthread)
        isseen, SeenDB, k = DBCtrl.checkinSeenDB(url, SeenDB)
    for filterItem in newlist:
        if (filterItem in url):
            return False
    parseURL = urlparse(url).netloc.split('.')
    # print(url)
    # print(parseURL)
    for i in range(len(parseURL)):
        if (len(parseURL[i]) > 63):
            return False
    if ('https://' in url):
        pass
    elif ('http://' in url):
        pass
    else:
        url = urljoin(currentUrl, url)
    newurl, dummy_frag = urldefrag(url)
    return newurl
def extract_next_links(url, resp):
    # specifications number 3.2, 3.3
    next_links = set()
    base_url = urlparse(url)
    valid = [200, 201, 203]  # list of valid html status codes https://www.w3.org/Protocols/HTTP/HTRESP.html
    if resp.status in valid:
        if url in uniqueDomains:
            return list()
        # found on stackoverflow on how to read headers
        resp_content = resp.raw_response.headers['Content-Type'].split(';')[0]
        if resp_content != "text/html":
            return list()
        # uniqueDomains.add(url)
        soup = BeautifulSoup(resp.raw_response.content, "lxml")  # using lxml to read content
        if resp.status == 200 and str(soup) == "":  # making sure website is not empty
            return list()
        for link in soup.find_all("a"):  # uses beautifulsoup to find urls
            link = link.get("href")
            if link is None or link == "":
                continue
            else:
                link = link.lower()  # making link lowercase in order to defragment
                defragmented_url = urldefrag(link)[0]
                fixed_link = fix_url(defragmented_url, base_url)  # need to fix relative links
                if fixed_link not in uniqueDomains:  # makes sure only unique domains are being crawled
                    uniqueDomains.add(fixed_link)
                    next_links.add(fixed_link)
                else:
                    continue
    return list(next_links)  # returns list of set of links (makes sure links are unique)
def extract_domains(site_text):
    domains = set()
    only_a_tags = SoupStrainer("a")
    for link in BeautifulSoup(site_text, "html.parser", parse_only=only_a_tags):
        if link.has_attr('href') and urlparse(link["href"]).scheme not in ["", "mailto"]:
            domains.add(urldefrag(link["href"])[0])
    return list(domains)
def _test_link(self, link, external):
    """Check single link. Either local or remote"""
    base_link, fragment = urldefrag(link)
    if link in self.seen_links:
        return self.seen_links[link]
    if base_link in self.seen_links and not self.seen_links[base_link]:
        return False
    ret = False
    if external:
        if fragment:
            # test with HTTP GET and read to soup to check for fragment anchor
            ret = self._test_http_fragment(base_link, fragment)
        else:
            ret = test_http_head(link)
    else:
        if fragment:
            # read file to soup to check for fragment anchor
            ret = self._test_file_fragment(base_link, fragment)
        else:
            # just stat file
            ret = os.path.exists(link)
    self.seen_links[link] = ret
    return ret
def url_to_overrides(url_string):
    url = urlparse(url_string)
    if url.scheme == '':
        return OverridesFile(url.path)
    elif url.scheme == 'file':
        return OverridesFile(url.path)
    elif url.scheme == 'http' or url.scheme == 'https':
        return OverridesUrl(url.geturl())
    elif url.scheme.startswith('git+'):
        if not url.fragment:
            raise UnsupportedUrlError(
                ('Cannot handle overrides with no path given, offending url was'
                 ' {url}.')
                .format(url=url_string)
            )
        fragments = dict(
            map(lambda x: x.split('='), url.fragment.split('&'))
        )
        return OverridesGit(
            repo_url=urldefrag(url.geturl()[4:])[0],
            path=fragments['path'],
            rev=fragments.get('rev', None),
        )
    else:
        raise UnsupportedUrlError('Cannot handle common overrides url %s' % url_string)
def oauth_callback():
    if not settings.OAUTH:
        abort(404)
    resp = oauth.provider.authorized_response()
    if resp is None or isinstance(resp, OAuthException):
        log.warning("Failed OAuth: %r", resp)
        return Unauthorized("Authentication has failed.")

    response = signals.handle_oauth_session.send(provider=oauth.provider,
                                                 oauth=resp)
    for (_, role) in response:
        if role is None:
            continue
        update_role(role)
        db.session.commit()
        log.info("Logged in: %r", role)
        authz = Authz.from_role(role)
        token = authz.to_token(role=role)
        token = token.decode('utf-8')
        state = request.args.get('state')
        next_url = get_best_next_url(state, request.referrer)
        next_url, _ = urldefrag(next_url)
        next_url = '%s#token=%s' % (next_url, token)
        return redirect(next_url)

    log.error("No OAuth handler for %r was installed.", oauth.provider.name)
    return Unauthorized("Authentication has failed.")
def resolving(self, ref: str):
    """
    Context manager which resolves a JSON ``ref`` and enters the
    resolution scope of this ref.
    """
    new_uri = fixed_urljoin(self.resolution_scope, ref)
    uri, fragment = urlparse.urldefrag(new_uri)
    normalized_uri = normalize(uri)
    if normalized_uri in self.store:
        schema = self.store[normalized_uri]
    elif not uri or uri == self.base_uri:
        schema = self.schema
    else:
        schema = resolve_remote(uri, self.handlers)
        if self.cache:
            scheme = urlparse.urlsplit(normalized_uri).scheme
            if scheme != 'internal-no-cache':
                self.store[normalized_uri] = schema

    old_base_uri, old_schema = self.base_uri, self.schema
    self.base_uri, self.schema = uri, schema
    try:
        with self.in_scope(uri):
            yield resolve_path(schema, fragment)
    finally:
        self.base_uri, self.schema = old_base_uri, old_schema
def _GetMeasureDataForSlice(self, slice_id, tableMappings):
    ret = {}
    measures = sorted(
        self.graph.objects(subject=slice_id, predicate=SCHEMA.measure))
    for measure_id in measures:
        unit_codes = list(
            self.graph.objects(subject=measure_id, predicate=SCHEMA.unitCode))
        unit_texts = list(
            self.graph.objects(subject=measure_id, predicate=SCHEMA.unitText))
        csv_id = urldefrag(measure_id).fragment
        for tableMapping in tableMappings:
            if tableMapping['sourceEntity'] == measure_id:
                csv_id = str(tableMapping['columnIdentifier'])
                break
        if not csv_id:
            print("Unable to determine CSV ID for metric", measure_id,
                  file=sys.stderr)
            exit(1)
        ret[csv_id] = {
            'id': measure_id,
            'unit_code': unit_codes,
            'unit_text': unit_texts,
        }
    return ret
def url(self, name, force=False):
    """
    Returns the real URL in DEBUG mode.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            cache_key = self.cache_key(name)
            hashed_name = self.cache.get(cache_key)
            if hashed_name is None:
                hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                # set the cache if there was a miss
                # (e.g. if cache server goes down)
                self.cache.set(cache_key, hashed_name)

    final_url = super(CachedFilesMixin, self).url(hashed_name)

    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)

    return unquote(final_url)
def _url(self, hashed_name_func, name, force=False, hashed_files=None):
    """
    Return the non-hashed URL in DEBUG mode.
    """
    if settings.DEBUG and not force:
        hashed_name, fragment = name, ''
    else:
        clean_name, fragment = urldefrag(name)
        if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
            hashed_name = name
        else:
            args = (clean_name,)
            if hashed_files is not None:
                args += (hashed_files,)
            hashed_name = hashed_name_func(*args)

    final_url = super().url(hashed_name)

    # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
    # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
    query_fragment = '?#' in name  # [sic!]
    if fragment or query_fragment:
        urlparts = list(urlsplit(final_url))
        if fragment and not urlparts[4]:
            urlparts[4] = fragment
        if query_fragment and not urlparts[3]:
            urlparts[2] += '?'
        final_url = urlunsplit(urlparts)

    return unquote(final_url)
def normalize(seed_url, link):
    """
    Normalize this URL by removing hash and adding domain
    """
    # remove hash to avoid duplicates
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)
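For reference, a tiny traced example of the normalize helper above (illustrative URLs only):

# Illustrative values; traced against normalize() above.
# The fragment is dropped, then the relative path is resolved against the seed.
normalize('http://example.com/index.html', '/about.html#team')
# -> 'http://example.com/about.html'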
def get_div_link(self, tip):
    tag_a = tip.parent.find('a', class_='qlink')
    if tag_a:
        url = tag_a.get('href')
        return urldefrag(url)[0]
    else:
        return ''
def getlinks(pageurl, pageresponse, domain):
    """Returns a list of links from this page to be crawled.

    pageurl = URL of this page
    pageresponse = page content; response object from requests module
    domain = domain being crawled (None to return links to *any* domain)
    """
    soup = bs4.BeautifulSoup(pageresponse.text, "html.parser")
    # get target URLs for all links on the page
    links = [a.attrs.get('href') for a in soup.select('a[href]')]
    # remove fragment identifiers
    links = [urldefrag(link)[0] for link in links]
    # remove any empty strings
    links = [link for link in links if link]
    # if it's a relative link, change to absolute
    links = [link if bool(urlparse(link).netloc) else urljoin(pageurl, link)
             for link in links]
    # if only crawling a single domain, remove links to other domains
    if domain:
        links = [link for link in links if urlparse(link).netloc == domain]
    return links
def resolving(self, ref):
    """
    Context manager which resolves a JSON ``ref`` and enters the
    resolution scope of this ref.

    :argument str ref: reference to resolve
    """
    full_uri = urlparse.urljoin(self.resolution_scope, ref)
    uri, fragment = urlparse.urldefrag(full_uri)

    if uri in self.store:
        document = self.store[uri]
    elif not uri or uri == self.base_uri:
        document = self.referrer
    else:
        document = self.resolve_remote(uri)

    old_base_uri, old_referrer = self.base_uri, self.referrer
    self.base_uri, self.referrer = uri, document
    try:
        with self.in_scope(uri):
            yield self.resolve_fragment(document, fragment)
    finally:
        self.base_uri, self.referrer = old_base_uri, old_referrer
def __init__(self, url, previous=None, **info):
    # Apply the simple idempotent optimizations to all urls (no need to
    # ever deal with "HTTP://.."). This means case-sensitivity, and a
    # whole lot of other things that the urlnorm library will do for us.
    # We call this the original url, even though it is a bit of a lie.
    try:
        self.original_url = urlnorm.norm(url)
    except urlnorm.InvalidUrl as e:
        raise urlnorm.InvalidUrl('{}: {}'.format(e, url))

    # For the normalized url that we'll be exposing, remove the
    # fragment, and treat https and http the same.
    url, fragment = urldefrag(self.original_url)
    self.lossy_url_data = {'fragment': fragment}
    if url.startswith('https:'):
        url = 'http' + url[5:]
        self.lossy_url_data.update({'protocol': 'https'})
    self.url = url

    self.set_previous(previous)
    self.info = info
    self.post = None

    # Runtime data
    self.response = None
    self.exception = None
    self.retries = 0
def escape_ajax(url):
    """
    Return the crawlable url according to:
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])
def download_request(self, request):
    from twisted.internet import reactor
    timeout = request.meta.get('download_timeout') or self._connectTimeout
    agent = self._get_agent(request, timeout)

    # request details
    url = urldefrag(request.url)[0]
    method = to_bytes(request.method)
    headers = TxHeaders(request.headers)
    if isinstance(agent, self._TunnelingAgent):
        headers.removeHeader(b'Proxy-Authorization')
    if request.body:
        bodyproducer = _RequestBodyProducer(request.body)
    else:
        bodyproducer = None
    start_time = time()
    d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
    # set download latency
    d.addCallback(self._cb_latency, request, start_time)
    # response body is ready to be consumed
    d.addCallback(self._cb_bodyready, request)
    d.addCallback(self._cb_bodydone, request, url)
    # check download timeout
    self._timeout_cl = reactor.callLater(timeout, d.cancel)
    d.addBoth(self._cb_timeout, request, url, timeout)
    return d
def convert_link(base_url, href):
    # check for redirect
    # get content-type
    # compute href file name
    # convert to relative path
    href_url = urljoin(base_url, href)
    href_url_defrag = urldefrag(href_url)[0]
    base_path = url_to_file_name(base_url, 'text/html')  # has to be html
    if href_url_defrag in site_redirects:
        href_url_defrag = site_redirects[href_url_defrag]
    if href_url_defrag in site_urls:
        content_type = site_urls[href_url_defrag]['content-type']
        href_path = url_to_file_name(href_url, content_type)
        href_path = posixpath.relpath(href_path, start=os.path.dirname(base_path))
        return href_path
    else:
        # allow links not in site_urls if have extension
        href_path = url_to_file_name(href_url, None)
        if href_path:
            return posixpath.relpath(href_path, start=os.path.dirname(base_path))
        else:
            print('Unknown URL ' + href_url + ' not in site_urls')
            return None
def oauth_callback():
    if not settings.OAUTH:
        abort(404)
    resp = oauth.provider.authorized_response()
    if resp is None or isinstance(resp, OAuthException):
        log.warning("Failed OAuth: %r", resp)
        return Unauthorized("Authentication has failed.")

    response = signals.handle_oauth_session.send(provider=oauth.provider,
                                                 oauth=resp)
    for (_, role) in response:
        if role is None:
            continue
        db.session.commit()
        update_role(role)
        log.info("Logged in: %r", role)
        request.authz = Authz.from_role(role)
        record_audit(Audit.ACT_LOGIN)
        token = request.authz.to_token(role=role)
        token = token.decode('utf-8')
        state = request.args.get('state')
        next_url = get_best_next_url(state, request.referrer)
        next_url, _ = urldefrag(next_url)
        next_url = '%s#token=%s' % (next_url, token)
        return redirect(next_url)

    log.error("No OAuth handler for %r was installed.", oauth.provider.name)
    return Unauthorized("Authentication has failed.")
def replace_refs(cls, obj, _recursive=False, **kwargs):
    """
    Returns a deep copy of `obj` with all contained JSON reference objects
    replaced with :class:`JsonRef` instances.

    :param obj: If this is a JSON reference object, a :class:`JsonRef`
        instance will be created. If `obj` is not a JSON reference object,
        a deep copy of it will be created with all contained JSON
        reference objects replaced by :class:`JsonRef` instances
    :param base_uri: URI to resolve relative references against
    :param loader: Callable that takes a URI and returns the parsed JSON
        (defaults to global ``jsonloader``, a :class:`JsonLoader` instance)
    :param jsonschema: Flag to turn on `JSON Schema mode
        <http://json-schema.org/latest/json-schema-core.html#anchor25>`_.
        'id' keyword changes the `base_uri` for references contained within
        the object
    :param load_on_repr: If set to ``False``, :func:`repr` call on a
        :class:`JsonRef` object will not cause the reference to be loaded
        if it hasn't already. (defaults to ``True``)
    """
    store = kwargs.setdefault("_store", _URIDict())
    base_uri, frag = urlparse.urldefrag(kwargs.get("base_uri", ""))
    store_uri = None  # If this does not get set, we won't store the result
    if not frag and not _recursive:
        store_uri = base_uri
    try:
        if kwargs.get("jsonschema") and isinstance(obj["id"], basestring):
            kwargs["base_uri"] = urlparse.urljoin(
                kwargs.get("base_uri", ""), obj["id"]
            )
            store_uri = kwargs["base_uri"]
    except (TypeError, LookupError):
        pass

    try:
        if not isinstance(obj["$ref"], basestring):
            raise TypeError
    except (TypeError, LookupError):
        pass
    else:
        return cls(obj, **kwargs)

    # If our obj was not a json reference object, iterate through it,
    # replacing children with JsonRefs
    kwargs["_recursive"] = True
    path = list(kwargs.pop("_path", ()))
    if isinstance(obj, Mapping):
        obj = type(obj)(
            (k, cls.replace_refs(v, _path=path+[k], **kwargs))
            for k, v in iteritems(obj)
        )
    elif isinstance(obj, Sequence) and not isinstance(obj, basestring):
        obj = type(obj)(
            cls.replace_refs(v, _path=path+[i], **kwargs)
            for i, v in enumerate(obj)
        )
    if store_uri is not None:
        store[store_uri] = obj
    return obj
def _ExpandFootnotes(self):
    for result in self.graph.query(
            MakeSparqlSelectQuery(
                ('?ds', 'a', 'schema:StatisticalDataset'),
                ('?ds', 'schema:footnote', '?fn'),
                ns_manager=self.graph.namespace_manager)):
        if result['fn'] not in self.subjects:
            self.graph.remove(
                (result['ds'], SCHEMA.footnote, result['fn']))
            id_prefix = urldefrag(str(result['ds'])).url
            with self.getter.Fetch(str(result['fn'])) as f:
                reader = DictReader(f)
                for row in reader:
                    row_id = rdflib.URIRef(id_prefix + '#footnote=' + row['codeValue'])
                    self.graph.add((result['ds'], SCHEMA.footnote, row_id))
                    self.graph.add((row_id, rdflib.RDF.type,
                                    SCHEMA.StatisticalAnnotation))
                    for key, val in row.items():
                        fields = key.split('@')
                        if len(fields) > 1:
                            # A language code is specified
                            self.graph.add(
                                (row_id, getattr(SCHEMA, fields[0]),
                                 rdflib.Literal(val, language=fields[1])))
                        else:
                            self.graph.add((row_id, getattr(SCHEMA, key),
                                            rdflib.Literal(val)))
def getlinks(pageurl, domain, soup):
    """Returns a list of links from this page to be crawled.

    pageurl = URL of this page
    domain = domain being crawled (None to return links to *any* domain)
    soup = BeautifulSoup object for this page
    """
    # get target URLs for all links on the page
    links = [a.attrs.get('href') for a in soup.select('a[href]')]
    # remove fragment identifiers
    links = [urldefrag(link)[0] for link in links]
    # remove any empty strings
    links = [link for link in links if link]
    # if it's a relative link, change to absolute
    links = [link if bool(urlparse(link).netloc) else urljoin(pageurl, link)
             for link in links]
    # if only crawling a single domain, remove links to other domains
    if domain:
        links = [link for link in links
                 if samedomain(urlparse(link).netloc, domain)]
    return links
def validate(url):
    if url in visitedUrls:
        return
    visitedUrls.append(url)
    try:
        content = urlopen(url).read().decode("utf8")
    except:
        # Assume the content is binary.
        return

    wikiUrls = []
    invalidUrls = []
    # This may seem redundant, but without the `.find_all('a')`, soup will also
    # contain the `DocType` element which does not have an `href` attribute.
    # See <http://stackoverflow.com/questions/17943992/beautifulsoup-and-soupstrainer-for-getting-links-dont-work-with-hasattr-returni>.
    soup = BeautifulSoup(content, parse_only=SoupStrainer('a', href=True)).find_all('a')
    for externalUrl in soup:
        fullExternalUrl = urljoin(url, urldefrag(externalUrl['href']).url)
        if baseUrl in fullExternalUrl and \
                not fullExternalUrl.endswith('/_history'):
            if externalUrl.has_attr('class') and 'absent' in externalUrl['class']:
                invalidUrls.append(fullExternalUrl)
            else:
                wikiUrls.append(fullExternalUrl)

    if len(invalidUrls) > 0:
        invalidWikiPages.append((url, invalidUrls))

    for wikiUrl in wikiUrls:
        if wikiUrl not in visitedUrls:
            validate(wikiUrl)
def validate_url(url, parent_url='http:'):
    """
    Validate a URL to be a string having an explicit recognized scheme.

    Arguments:
        url: string URL
        parent_url: optional string URL from which to inherit an implicit
            scheme.

    Returns: dict having:
        valid: boolean truth value.
        url: string modified URL.
    """
    if bytes == type(url):
        url = url.decode()
    parsed_url = urlparse(url)
    if 0 < len(parsed_url.path) and '/' == parsed_url.path[0]:
        url = urldefrag(urljoin(parent_url, url))[0]
    elif not parsed_url.scheme:
        parent_scheme = urlparse(parent_url).scheme or 'http'
        url = parent_scheme + ':' + url
    parsed_url = urlparse(url)
    valid = parsed_url.scheme in ('http', 'https', '') and \
        bool(parsed_url.netloc)
    return {'valid': valid, 'url': url}
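A small traced check of how validate_url above handles a protocol-relative link (example values only):

# Example values; traced against validate_url() above.
result = validate_url('//example.com/page#frag', parent_url='https://host/dir')
# The path starts with '/', so the URL is joined to parent_url and defragmented:
#   urljoin('https://host/dir', '//example.com/page#frag') -> 'https://example.com/page#frag'
#   urldefrag(...)[0]                                       -> 'https://example.com/page'
assert result == {'valid': True, 'url': 'https://example.com/page'}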
def _parse(self, page: BeautifulSoup, url):
    seasons = OrderedDict()
    eqg = OrderedSet()
    child = page.select_one("#WikiaArticle h2")
    season = child.text
    while child.next_sibling:
        child = child.next_sibling
        if child.name == "table":
            for a in child.find_all("a", string="Transcript"):
                if not a.has_attr("class") or "new" not in a["class"]:
                    episode_url, fragment = urldefrag(a["href"])
                    episode_url = urljoin(url, episode_url)
                    if "Equestria Girls" not in season:
                        if season not in seasons:
                            seasons[season] = OrderedSet()
                        seasons[season].append(episode_url)
                    else:
                        eqg.append(episode_url)
            continue
        if child.name == "h2":
            season = child.text
            continue
    seasons["Equestria Girls"] = eqg
    return seasons
def _GetDimensionDataForSlice(self, slice_id, tableMappings):
    ret = {}
    dims = sorted(
        self.graph.objects(subject=slice_id, predicate=SCHEMA.dimension))
    for dim_id in dims:
        dim_type = list(
            self.graph.objects(subject=dim_id, predicate=rdflib.RDF.type))
        dim_equiv_types = list(
            self.graph.objects(subject=dim_id, predicate=SCHEMA.equivalentType))
        csv_id = urldefrag(dim_id).fragment
        for tableMapping in tableMappings:
            if tableMapping['sourceEntity'] == dim_id:
                csv_id = str(tableMapping['columnIdentifier'])
                break
        if not csv_id:
            print("Unable to determine CSV ID for dimension", dim_id,
                  file=sys.stderr)
            exit(1)
        ret[csv_id] = {
            'id': dim_id,
            'type': dim_type,
            'types': dim_equiv_types
        }
    return ret
def has_valid_signature(request, activity):
    ''' verify incoming signature '''
    try:
        signature = Signature.parse(request)

        key_actor = urldefrag(signature.key_id).url
        if key_actor != activity.get('actor'):
            raise ValueError("Wrong actor created signature.")

        remote_user = activitypub.resolve_remote_id(models.User, key_actor)
        if not remote_user:
            return False

        try:
            signature.verify(remote_user.key_pair.public_key, request)
        except ValueError:
            old_key = remote_user.key_pair.public_key
            remote_user = activitypub.resolve_remote_id(
                models.User, remote_user.remote_id, refresh=True)
            if remote_user.key_pair.public_key == old_key:
                raise  # Key unchanged.
            signature.verify(remote_user.key_pair.public_key, request)
    except (ValueError, requests.exceptions.HTTPError):
        return False
    return True
async def read_file(self, parser_server_queue, mongodb):
    # reads one document from the database at random.
    # if error is raised, returns.
    print("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF")
    try:
        cursor = mongodb.fetch_random_doc('https3', 1)
        for docs in cursor:
            doc = docs['file_data']
            ids = docs['_id']
            mongodb.update_doc('https3', ids)
    except Exception as e:
        return "Kill the Task"

    # extract_tags return list of all 'a' tags and url of the page parsed.
    # base_url ====> page url, a_tags ====> list of 'a' tags.
    base_url, a_tags = await self.extract_tags(doc)
    if base_url == "":
        return
    for url in a_tags:
        urls = url.get("href")
        if urls == None:
            return
        # check if link retrieved is in absolute form.
        # If False, convert to absolute and then check the structure of url formed.
        # If True, directly check the structure of url formed.
        # While checking structure, anything beyond and including [#,?,=,:] must be dropped.
        if self.is_absolute(urls):
            urls = await self.correct_url_structure(urls)
        else:
            url_absolute = urljoin(base_url, urls)
            print(" A B S O L U T E   U R L ", base_url,
                  " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            url_absolute = urldefrag(url_absolute)
            urls = await self.correct_url_structure(url_absolute[0])
        urls = unquote(urls)
        print()
        print()
        print(";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ", urls,
              ";;;;;;;;;;;;;;;;;;;;;;;;;;;;;")
        # put in the parser_server_queue for server to consume.
        await parser_server_queue.put(urls)
    print("EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE")
    return "DONE"
async def _get_data_from_page(
        self, page_tld: str, host: str,
        page_domain: str) -> Tuple[Set, Set, Set, Set]:
    urls = set()
    domains = set()
    backlinks = set()
    redirects = set()
    for link in set(self.html_page.xpath('//a')):
        if tldextract.extract(page_domain)[2] != page_tld:
            continue
        # reset to default
        href = url = link_url = link_tld = internal_link = ''
        try:
            href = link.xpath('./@href')[0].lower()
        except IndexError:
            continue
        url = unquote(urldefrag(href)[0]).replace("'", "")
        url = url.split('@')[-1:][0]  # remove mailto part
        if url == page_domain:
            # skip this option
            continue
        if len(url) > len(page_tld) + 1:
            link_url = tldextract.extract(url)
            if link_url[1] in ['mailto', 'google']:
                continue
            link_tld = link_url[2]
        if not link_tld:
            # url has no tld - it's a URI, so create the full URL
            url = url[1:] if url.startswith('./') else url
            url = url[1:] if url.startswith('//') else url
            delimiter = '' if url.startswith('/') else '/'
            internal_link = f"{page_domain}{delimiter}{url}"
        if link_tld == page_tld:
            # if url not domain:
            try:
                if len(url.split(f'.{page_tld}')[1]) > 1:
                    urls.add(url)
            except IndexError:
                pass
            if host not in url and url not in host:
                anchor = await self._get_anchor(link)
                dofollow = await self._get_rel(link)
                backlinks.add((url[:249], anchor, dofollow))
            domain = '.'.join(link_url[1:3])
            if len(domain) < 64:
                # check for maximum domain length
                domains.add(domain)
        elif internal_link:
            urls.add(internal_link)
    return domains, urls, backlinks, redirects
async def parse_links(self, response):
    links = set()
    content_type = None
    encoding = None
    price_link = []
    if response.status == 200:
        content_type = response.headers.get('content-type')
        response_url = str(response.url)
        if content_type in ('text/html', 'application/xml', 'text/html;charset=UTF-8'):
            pdict = {}
            if content_type:
                content_type, pdict = cgi.parse_header(content_type)
            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = await response.text()
                # print(text)
                '''(?i)href=["']([^\s"'<>]+)'''
                urls = set(
                    re.findall(
                        '<li style="margin-left: [-\d]+px">.*?<a href="(/s/ref=lp_\d+_nr_n_[\d+].*?)">.*?<span class="refinementLink">(.*?)</span>.*?</a>.*?</li>',
                        text, re.S | re.M))
                if urls:
                    LOGGER.info('got %r distinct urls from %r', len(urls), response.url)
                else:
                    for price_g in range(1, 100, 2):
                        low_price = price_g
                        high_price = price_g + 1
                        price_link.append(
                            "{}&low-price={}&high-price={}".format(
                                response_url, low_price, high_price))
                    if len(price_link) > 0:
                        redis_server.lpush("price_link_tmp", *price_link)
                for url in urls:
                    u, t = url
                    k = u.replace('&amp;', '&')
                    normalized = urljoin(str(response.url), k)
                    defragmented, frag = urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        print(defragmented, t)
                        '''
                        Children's Books (儿童图书) and General (科学通俗读物):
                        these two categories got stuck in redirect callbacks.
                        INFO:__main__:redirect to 'https://www.amazon.cn/s/ref=lp_2084813051_nr_n_11/460-8646033-3118437?rh=n%3A2084813051&ie=UTF8' from 'https://www.amazon.cn/s/ref=lp_2084813051_nr_n_11/460-8646033-3118437?fst=as%3Aoff&rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A2045366051%2Cn%3A2078652051%2Cn%3A2084813051%2Cn%3A2084839051&bbn=2084813051&ie=UTF8&qid=1511710241&rnid=2084813051'
                        '''
                        LOGGER.debug('previous url: %s, next url: %s, title: %s',
                                     str(response.url), defragmented, t)
                        if t == "General (科学通俗读物)":
                            LOGGER.error("错误的分类: %r", t)
                        else:
                            links.add(defragmented)
    stat = FetchStatistic(url=response.url)
    return stat, links
def splitDecodeFragment(url):
    urlPart, fragPart = urldefrag(url)
    if isPy3:
        return (urlPart, unquote(fragPart, "utf-8", errors=None))
    else:
        return _STR_UNICODE(urlPart), unquote(_STR_UNICODE(fragPart), "utf-8", errors=None)
def defragment_and_absolute(current_url, new_url):
    parse_new_url = urlparse(new_url)
    if not bool(parse_new_url.netloc):
        # if new url is a relative link, make it absolute with the current url
        new_url = urljoin(current_url, new_url)
        parse_new_url = urlparse(new_url)
    defrag = urldefrag(new_url)
    return defrag.url
def splitDecodeFragment(url):
    if url is None:
        # urldefrag returns byte strings for none, instead of unicode strings
        return _STR_UNICODE(""), _STR_UNICODE("")
    urlPart, fragPart = urldefrag(url)
    if isPy3:
        return (urlPart, unquote(fragPart, "utf-8", errors=None))
    else:
        return _STR_UNICODE(urlPart), unquote(_STR_UNICODE(fragPart), "utf-8", errors=None)
def validate(uri):
    sep = ' '
    uri = uri.split(sep, 1)[0]
    u = urlparse(uri)
    if not u.scheme or not u.netloc:
        return False
    fixed, throwaway = urldefrag(uri)
    return fixed
def _get_repo_remote(url):
    '''
    For a given git url, return the base url and branch, if present
    (otherwise return 'master').
    '''
    base_url, branch = urldefrag(url)
    branch = branch or 'master'
    return base_url, branch
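For clarity, a quick sketch of what _get_repo_remote above returns (made-up repository URLs):

# Made-up repository URLs; traced against _get_repo_remote() above.
assert _get_repo_remote('https://github.com/org/repo.git#devel') == \
    ('https://github.com/org/repo.git', 'devel')
# No fragment means the branch defaults to 'master'.
assert _get_repo_remote('https://github.com/org/repo.git') == \
    ('https://github.com/org/repo.git', 'master')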
def _meta_schemas():
    """
    Collect the urls and meta schemas from each known validator.
    """
    meta_schemas = (v.META_SCHEMA for v in validators.values())
    return dict((urlparse.urldefrag(m["id"])[0], m) for m in meta_schemas)
def test_spider(client, app, check_external_links):
    """Check that all links work

    Spiders the site, making sure all internal links point to existing pages.
    Includes fragments: any #hash in a link must correspond to an existing
    element with that id.

    If check_external_links is true, checks external links as well.
    """
    to_visit = {'http://localhost/'}
    visited = set()
    external = set()
    wanted_fragments = collections.defaultdict(set)
    page_ids = {}

    def recording_url_for(*args, **kwargs):
        url = flask.url_for(*args, **kwargs)
        if url not in visited:
            to_visit.add(urljoin('http://localhost/', url))
        return url

    app.jinja_env.globals['url_for'] = recording_url_for

    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        visited.add(url)
        links = []
        parsed = urlparse(url)
        if parsed.netloc == 'localhost':
            print('visit', url)
            page_ids[url] = []
            check_url(client, url, links, page_ids[url])
            for link in links:
                fullurl = urljoin('http://localhost/', url)
                fullurl = urljoin(fullurl, link)
                result = urldefrag(fullurl)
                defrag = result.url
                fragment = result.fragment
                if fragment:
                    wanted_fragments[defrag].add(fragment)
                if defrag not in visited:
                    to_visit.add(defrag)
        else:
            if parsed.scheme in ('http', 'https'):
                external.add(url)
            else:
                print('ignore', url)

    for url, fragments in wanted_fragments.items():
        assert fragments <= set(page_ids[url])

    if check_external_links:
        for url in external:
            print('check', url)
            check_external_link(url)
def absolutize(self, uri, defrag=1):
    base = urljoin("file:", pathname2url(os.getcwd()))
    result = urljoin("%s/" % base, uri, allow_fragments=not defrag)
    if defrag:
        result = urldefrag(result)[0]
    if not defrag:
        if uri and uri[-1] == "#" and result[-1] != "#":
            result = "%s#" % result
    return URIRef(result)
def html_to_lxml(url, text, clean=False):
    """Parse plain-text HTML into an `lxml` tree."""
    if clean:
        text = _text_from_sp(('pandoc', '--from=html', '--to=html5'), text.encode())
    html = lxml.html.document_fromstring(text)
    # Endless loops ahoy
    html.rewrite_links(lambda s: '' if urldefrag(s).url == url else s,
                       base_href=url)
    return html
def check_url(url):
    """Check the given URL by issuing a HEAD request."""
    # We don't want to include a fragment in our request.
    url, fragment = urldefrag(url)
    # Attempt to open the target URL using a HEAD request.
    request = Request(url)
    request.get_method = lambda: 'HEAD'
    request.add_header('User-Agent', USER_AGENT)
    return urlopen(request)
def crawler(startpage, maxpages=100, singledomain=True):
    """Crawl the web starting from specified page.

    1st parameter = starting page url
    maxpages = maximum number of pages to crawl
    singledomain = whether to only crawl links within startpage's domain
    """
    import requests, re, bs4
    from urllib.parse import urldefrag, urljoin, urlparse
    from collections import deque

    pagequeue = deque()  # queue of pages to be crawled
    pagequeue.append(startpage)
    crawled = []  # list of pages already crawled
    domain = urlparse(startpage).netloc  # for singledomain option

    pages = 0  # number of pages successfully crawled so far
    failed = 0  # number of pages that couldn't be crawled

    while pages < maxpages and pagequeue:
        url = pagequeue.popleft()  # get next page to crawl (FIFO queue)
        try:
            response = requests.get(url)
            if not response.headers['content-type'].startswith('text/html'):
                continue  # don't crawl non-HTML links
            soup = bs4.BeautifulSoup(response.text, "html.parser")
            print('Crawling:', url)
            pages += 1
            crawled.append(url)

            # PROCESSING CODE GOES HERE:
            # do something interesting with this page

            # get target URLs for all links on the page
            links = [a.attrs.get('href') for a in soup.select('a[href]')]
            # remove fragment identifiers
            links = [urldefrag(link)[0] for link in links]
            # remove any empty strings
            links = list(filter(None, links))
            # if it's a relative link, change to absolute
            links = [link if bool(urlparse(link).netloc) else urljoin(url, link)
                     for link in links]
            # if singledomain=True, remove links to other domains
            if singledomain:
                links = [link for link in links if (urlparse(link).netloc == domain)]

            # add these links to the queue (except if already crawled)
            for link in links:
                if link not in crawled and link not in pagequeue:
                    pagequeue.append(link)
        except:
            print("*FAILED*:", url)
            failed += 1

    print('{0} pages crawled, {1} pages failed to load.'.format(pages, failed))
def send(self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None):
    url, fragment = urldefrag(request.url)
    self.requests.update({url: 1})
    if not url in self.urls:
        raise ConnectionError("no such virtual url", request.url)
    resp = Resp(**self.urls[url])
    r = self.build_response(request, resp)
    if not stream:
        # force prefetching content unless streaming in use
        r.content
    return r
def extract_links(self, html):
    log.debug("extract_links")
    soup = BeautifulSoup(html, "lxml")
    links = []
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        href = urljoin(self.seed, href)
        href, _ = urldefrag(href)
        links.append(href)
    log.info("found {}".format(len(links)))
    return links
def filter_links(link_list, base_url):
    href_list = [element.get("href") for element in link_list]
    for i in range(len(href_list)):
        if href_list[i].startswith("http://go.usa.gov/"):
            href_list[i] = utils.resolve_redirect(href_list[i])
    href_list = [urldefrag(urljoin(base_url, href))[0] for href in href_list]
    filtered_list = [
        href for href in href_list
        if href
        and href not in BLACKLIST_REPORT_URLS
        and not href.startswith("mailto:")
    ]
    filtered_list = list(set(filtered_list))
    return filtered_list
def annotation(self):
    settings = self.request.registry.settings

    try:
        document = util.elasticsearch_client(settings).get(
            index=settings["elasticsearch_index"],
            doc_type="annotation",
            id=self.request.matchdict["id"])
    except exceptions.NotFoundError:
        statsd.incr("views.annotation.404.annotation_not_found")
        raise httpexceptions.HTTPNotFound(_("Annotation not found"))

    try:
        annotation_id, document_uri = util.parse_document(document)
    except util.InvalidAnnotationError as exc:
        statsd.incr("views.annotation.422.{}".format(exc.reason))
        raise httpexceptions.HTTPUnprocessableEntity(str(exc))

    # Remove any existing #fragment identifier from the URI before we
    # append our own.
    document_uri = parse.urldefrag(document_uri)[0]

    if not (document_uri.startswith("http://") or
            document_uri.startswith("https://")):
        statsd.incr("views.annotation.422.not_an_http_or_https_document")
        raise httpexceptions.HTTPUnprocessableEntity(
            _("Sorry, but it looks like this annotation was made on a "
              "document that is not publicly available."))

    via_url = "{via_base_url}/{uri}#annotations:{id}".format(
        via_base_url=settings["via_base_url"],
        uri=document_uri,
        id=annotation_id)

    extension_url = "{uri}#annotations:{id}".format(
        uri=document_uri, id=annotation_id)

    parsed_url = parse.urlparse(document_uri)
    pretty_url = parsed_url.netloc[:NETLOC_MAX_LENGTH]
    if len(parsed_url.netloc) > NETLOC_MAX_LENGTH:
        pretty_url = pretty_url + jinja2.Markup("…")

    statsd.incr("views.annotation.200.annotation_found")

    return {
        "data": json.dumps({
            # Warning: variable names change from python_style to
            # javaScriptStyle here!
            "chromeExtensionId": settings["chrome_extension_id"],
            "viaUrl": via_url,
            "extensionUrl": extension_url,
        }),
        "pretty_url": pretty_url
    }
def get_urls(self, document):
    urls = []
    urls_to_parse = []
    dom = html.fromstring(document)
    for href in dom.xpath('//a/@href'):
        if any(e in href for e in self.exclude):
            continue
        url = urljoin(self.base, urldefrag(href)[0])
        if url.startswith(self.base):
            if self.capture in url:
                urls_to_parse.append(url)
            urls.append(url)
    return urls, urls_to_parse
def fix_toc_entry(self, toc):
    if toc.href:
        href = urlnormalize(toc.href)
        href, frag = urldefrag(href)
        replacement = self.rename_map.get(href, None)
        if replacement is not None:
            nhref = replacement
            if frag:
                nhref = '#'.join((nhref, frag))
            toc.href = nhref
    for x in toc:
        self.fix_toc_entry(x)
def get_out_going_links(self, page, html_body):
    """extracts all the outgoing links and adds links that belong to
    main page domain for further crawling if they are not crawled yet

    This avoids:
    - links that are .zip files
    - links mentioned in href that are javascript methods
    - mailto: links
    """
    soup = BeautifulSoup(html_body, "html.parser")
    valid_links_for_this_page = []
    for a in soup.find_all('a', href=True):
        href = a['href'].lower()
        href = self.compose_url_from_href(page, href)

        # clean the href so that it will have legitimate urls instead of
        # cluttered ones and q=param prints
        href = urldefrag(href)[0]  # skip intra links [this took time to find out !] ##1
        # remove query params as only the path matters
        if href.find('?') != -1:
            href = href[:href.find('?')]  ##2

        new_page = urlparse(href)

        # add to the queue only if it doesn't cause a cycle
        # assumption: if a link ends with domain.com, assuming it can be
        # crawled to make sitemap complete
        if not str(new_page.netloc).endswith(self.start_page):  # doesn't belong to domain
            valid_links_for_this_page.append(href)
            continue

        if self.robot_allows(href) and \
                not href in self.site_map and \
                not href in self.unvisited and \
                not 'javascript:' in href and \
                not 'mailto:' in href:
            if not (href.endswith(".zip") or
                    href.endswith(".gz") or
                    href.endswith(".gzip") or
                    href.endswith(".tar") or
                    href.endswith(".bz2") or
                    href.endswith(".jpg") or
                    href.endswith(".png") or
                    href.endswith(".exe")):
                self.unvisited.add(href)
                valid_links_for_this_page.append(href)

    return valid_links_for_this_page
def crawl(base_url, follow_external_links=True, ignore_fragments=True, verify=True):
    base_netloc = urlparse(base_url).netloc
    seen = set([base_url])
    todo = [base_url]

    session = requests.Session()
    session.verify = verify

    while todo:
        url = todo.pop()

        try:
            rsp = session.get(url)
        except requests.exceptions.InvalidSchema:
            # TODO: Check if the scheme is a valid one, or otherwise
            # communicate the error to the user.
            continue

        yield rsp

        if urlparse(url).netloc != base_netloc:
            continue

        content_type, _ = cgi.parse_header(rsp.headers['content-type'])

        if content_type == 'text/html':
            urls = extract_urls_from_html(rsp.text)
        elif content_type == 'text/css':
            urls = extract_urls_from_css(rsp.text)
        else:
            # see https://bitbucket.org/ned/coveragepy/issues/497/
            continue  # pragma: no cover

        for url1 in urls:
            abs_url = urljoin(url, url1)

            if ignore_fragments:
                abs_url = urldefrag(abs_url)[0]

            if not follow_external_links:
                if urlparse(abs_url).netloc != base_netloc:
                    continue

            if abs_url not in seen:
                seen.add(abs_url)
                todo.append(abs_url)
def find_internal_links(route):
    """
    Given a route, returns (an estimate of) all routes that it links to
    in any manner (href or resource).  Won't find links that are neither
    in an "href" nor marked as a rewritable-resource link (erring on the
    side of not finding links since finding links risks half-private
    pages being Google-indexable, while finding no links in weird cases
    just means weird enough things won't be Google-indexable unless I
    explicitly mark them into that list).
    """
    result = set()
    f = route_metadata[route].file
    # TODO if we except some HTML files from rewriting then this
    # will be wrong:
    if f not in files_to_rewrite:
        return result
    contents = utils.read_file_binary(join('site', f))
    for href in re.finditer(
            br'(?<!'+urlregexps.urlbyte+br')(?:'+
            # Things that look like href= that are not a link to a page:
            # example text that talks about hrefs; <base href="...">.
            # Only <a>, <area> and possibly <link> elements can have
            # hrefs we're interested in. By excluding not-actually-links,
            # we can have broken-link detection without false positives
            # (or few enough false positives that it's trivial to work around).
            br'''<(?:[Aa]|[Aa][Rr][Ee][Aa]|[Ll][Ii][Nn][Kk])\s[^<>]*href=(?P<quote>["']?)(?P<url1>'''+urlregexps.urlbyte+br'''+)(?<!\?rr)(?P=quote)'''+
            br'''|(?P<url2>'''+urlregexps.urlbyte+br'''+)\?rr'''+
            br')(?!'+urlregexps.urlbyte+br')',
            contents):
        url = href.group('url1') or href.group('url2')
        linktype = 'rr' if href.group('url2') != None else 'href'
        ref = html.unescape(url.decode('utf-8'))
        if linktype == 'rr':
            path = config.fake_resource_route+normpath(join(dirname(f), ref))
        elif linktype == 'href':
            path = urldefrag(urljoin(route, ref))[0]
        if path in route_metadata:
            result.add(path)
        else:
            # (we don't currently try to check links to third-party websites)
            if path[:len(config.hypothetical_scheme_and_domain)] == config.hypothetical_scheme_and_domain:
                sys.stderr.write(route + ' links to nonexistent ' + ref + '\n')
                nonlocal broken_link_found
                broken_link_found = True
    return result