def test_normalize_host():
    '''
    Checks `urlcanon.normalize_host` against a table of
    (raw host, expected normalized bytes) pairs.
    '''
    cases = [
        # mixed-case ascii comes back lowercased
        ('EXAMPLE.Com', b'example.com'),
        # non-ascii labels are expected in their xn-- form
        ('₹.com', b'xn--yzg.com'),
        # trailing dots are expected to be dropped
        ('XN--fa-Hia.de..', b'xn--fa-hia.de'),
        ('☕.de', b'xn--53h.de'),
        # this host is expected to come back percent-encoded rather than
        # in xn-- form
        ('日本⒈co.jp', b'%E6%97%A5%E6%9C%AC%E2%92%88co%EF%BC%8Ejp'),
        ('☃.net', b'xn--n3h.net'),
        # percent-encoded input, including doubly-encoded input, is expected
        # to normalize to the same result as the raw unicode host
        ('%e2%98%83.n%45t', b'xn--n3h.net'),
        ('%25e2%98%%383.N%45t', b'xn--n3h.net'),
    ]
    for raw_host, expected in cases:
        assert urlcanon.normalize_host(raw_host) == expected
def test_host_matches_domain():
    '''
    Checks `urlcanon.host_matches_domain` against a table of
    (host, domain, expected truthiness) triples.
    '''
    normalize = urlcanon.normalize_host
    cases = [
        # str and bytes arguments may be mixed freely
        ('1.2.3.4', '1.2.3.4', True),
        (b'1.2.3.4', '1.2.3.4', True),
        ('1.2.3.4', b'1.2.3.4', True),
        (b'1.2.3.4', b'1.2.3.4', True),
        # a subdomain matches its parent, but not the other way around
        ('foo.example.com', 'example.com', True),
        ('example.com', 'foo.example.com', False),
        # no normalization is applied by host_matches_domain itself; the
        # caller has to normalize first
        ('foo.EXAMPLE.COM', 'example.com', False),
        (normalize('foo.EXAMPLE.COM'), 'example.com', True),
        ('☃.net', 'xn--n3h.net', False),
        ('☃.net', '☃.net', True),
        ('😬.☃.net', '☃.net', True),
        ('😬.☃.net', normalize('☃.net'), False),
        (normalize('😬.☃.net'), normalize('☃.net'), True),
    ]
    for host, domain, expected in cases:
        matched = urlcanon.host_matches_domain(host, domain)
        assert bool(matched) is expected
def test_url_matches_domain():
    '''
    Checks `urlcanon.url_matches_domain` against a table of
    (url, domain, expected truthiness) triples.
    '''
    normalize = urlcanon.normalize_host
    cases = [
        # str and bytes arguments may be mixed freely, for various schemes
        ('http://1.2.3.4/', '1.2.3.4', True),
        (b'scheme://1.2.3.4', '1.2.3.4', True),
        ('ftp://1.2.3.4/a/b/c/d', b'1.2.3.4', True),
        (b'http://1.2.3.4', b'1.2.3.4', True),
        # a url on a subdomain matches the parent domain, not vice versa
        ('http://foo.example.com', 'example.com', True),
        ('http://example.com', 'foo.example.com', False),
        # no canonicalization is applied by url_matches_domain itself; the
        # caller has to canonicalize the url / normalize the domain first
        ('http://foo.EXAMPLE.COM', 'example.com', False),
        (urlcanon.whatwg('http://foo.EXAMPLE.COM'), 'example.com', True),
        ('http://☃.net', 'xn--n3h.net', False),
        ('http://☃.net', '☃.net', True),
        ('http://😬.☃.net', '☃.net', True),
        ('http://😬.☃.net', normalize('☃.net'), False),
        (urlcanon.whatwg('https://😬.☃.net'), normalize('☃.net'), True),
    ]
    for url, domain, expected in cases:
        matched = urlcanon.url_matches_domain(url, domain)
        assert bool(matched) is expected
def _enforce_limit(self, limit_key, limit_value, soft=False):
    '''
    Rejects the current request if the stat named by `limit_key` has reached
    `limit_value`.

    `limit_key` has the form '{bucket}/{x}/{y}', e.g. 'job1/total/urls'. If
    the bucket part contains exactly one ':', e.g.
    'job1:foo.com/total/urls', the part after the colon is treated as a
    domain: the rule only applies when `self.hostname` matches that domain,
    and the stat is looked up under the normalized domain.

    Returns without doing anything if the server has no stats db, if the
    host is outside the rule's domain, or if the limit is not reached.
    Otherwise sends a 420 response (430 if `soft`) carrying a Warcprox-Meta
    header, closes the connection and raises.

    Raises:
        warcprox.RequestBlockedByRule: if the limit has been reached
    '''
    stats_db = self.server.stats_db
    if not stats_db:
        return

    stat_bucket, stat_x, stat_y = limit_key.rsplit("/", 2)
    reported_key = limit_key

    # domain-scoped rule: exactly one ':' separates parent bucket and domain
    if stat_bucket.count(':') == 1:
        parent, _, raw_domain = stat_bucket.partition(':')
        domain = urlcanon.normalize_host(raw_domain)
        if not urlcanon.host_matches_domain(self.hostname, domain):
            return
        # host is within the domain, so the rule applies; from here on use
        # the normalized form of the key
        stat_bucket = '%s:%s' % (parent, domain.decode('ascii'))
        reported_key = '%s/%s/%s' % (stat_bucket, stat_x, stat_y)

    current = stats_db.value(stat_bucket, stat_x, stat_y)
    limit_reached = (
            current and limit_value and limit_value > 0
            and current >= limit_value)
    if not limit_reached:
        return

    if soft:
        status, reason = 430, "Reached soft limit"
        label, meta_field = "soft limit", "reached-soft-limit"
    else:
        status, reason = 420, "Reached limit"
        label, meta_field = "limit", "reached-limit"

    message = "request rejected by warcprox: reached %s %s=%s\n" % (
            label, reported_key, limit_value)
    body = message.encode("utf-8")

    self.send_response(status, reason)
    self.send_header("Content-Type", "text/plain;charset=utf-8")
    self.send_header("Connection", "close")
    self.send_header("Content-Length", len(body))

    # report the stats for the whole bucket alongside the limit that was hit
    response_meta = {"stats": {stat_bucket: stats_db.value(stat_bucket)}}
    response_meta[meta_field] = {reported_key: limit_value}
    self.send_header(
            "Warcprox-Meta",
            json.dumps(response_meta, separators=(",", ":")))
    self.end_headers()

    if self.command != "HEAD":
        self.wfile.write(body)
    self.connection.close()

    raise warcprox.RequestBlockedByRule(
            "%s %s %s %s -- reached %s %s=%s" % (
                self.client_address[0], status, self.command, self.url,
                label, reported_key, limit_value))
def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
    '''
    Rejects the current request if the stat named by `limit_key` has reached
    `limit_value`.

    Args:
        buckets: collection of bucket names; the limit is only enforced if
            the (normalized) bucket named in `limit_key` is one of them
        limit_key: '{bucket}/{x}/{y}', e.g. 'job1/total/urls'; if the
            bucket part contains a ':', everything after the first ':' is
            treated as a domain and normalized before lookup, e.g.
            'job1:foo.com/total/urls'
        limit_value: threshold; nothing is enforced if it is falsy or not
            greater than zero
        soft: if true respond with 430 "Reached soft limit" instead of
            420 "Reached limit"

    Returns without doing anything if the server has no stats db, if the
    bucket is not in `buckets`, or if the limit is not reached. Otherwise
    sends the error response with a Warcprox-Meta header, closes the
    connection and raises.

    Raises:
        warcprox.RequestBlockedByRule: if the limit has been reached
    '''
    if not self.server.stats_db:
        return

    # parse limit key
    bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)

    # normalize domain if part of bucket
    if ":" in bucket0:
        b, raw_domain = bucket0.split(":", 1)
        domain = urlcanon.normalize_host(raw_domain).decode("ascii")
        bucket0 = "%s:%s" % (b, domain)
        limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)

    if bucket0 not in buckets:
        return

    value = self.server.stats_db.value(bucket0, bucket1, bucket2)
    if value and limit_value and limit_value > 0 and value >= limit_value:
        body = ("request rejected by warcprox: reached %s %s=%s\n" % (
            "soft limit" if soft else "limit", limit_key,
            limit_value)).encode("utf-8")
        if soft:
            self.send_response(430, "Reached soft limit")
        else:
            self.send_response(420, "Reached limit")
        self.send_header("Content-Type", "text/plain;charset=utf-8")
        self.send_header("Connection", "close")
        self.send_header("Content-Length", len(body))
        # report the stats for the whole bucket alongside the limit hit
        response_meta = {
            "stats": {bucket0: self.server.stats_db.value(bucket0)}
        }
        if soft:
            response_meta["reached-soft-limit"] = {limit_key: limit_value}
        else:
            response_meta["reached-limit"] = {limit_key: limit_value}
        # json.dumps documents `separators` as an (item, key) tuple; this
        # yields the same compact output as the former ",:" string
        self.send_header(
                "Warcprox-Meta",
                json.dumps(response_meta, separators=(",", ":")))
        self.end_headers()
        if self.command != "HEAD":
            self.wfile.write(body)
        self.connection.close()
        raise warcprox.RequestBlockedByRule(
                "%s %s %s %s -- reached %s %s=%s" % (
                    self.client_address[0], 430 if soft else 420,
                    self.command, self.url,
                    "soft limit" if soft else "limit",
                    limit_key, limit_value))
def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
    '''
    Rejects the current request if the stat named by `limit_key` has reached
    `limit_value`.

    Args:
        buckets: collection of bucket names; the limit is only enforced if
            the (normalized) bucket named in `limit_key` is one of them
        limit_key: '{bucket}/{x}/{y}', e.g. 'job1/total/urls'; if the
            bucket part contains a ':', everything after the first ':' is
            treated as a domain and normalized before lookup, e.g.
            'job1:foo.com/total/urls'
        limit_value: threshold; nothing is enforced if it is falsy or not
            greater than zero
        soft: if true respond with 430 "Reached soft limit" instead of
            420 "Reached limit"

    Returns without doing anything if the server has no stats db, if the
    bucket is not in `buckets`, or if the limit is not reached. Otherwise
    sends the error response with a Warcprox-Meta header, closes the
    connection and raises.

    Raises:
        warcprox.RequestBlockedByRule: if the limit has been reached
    '''
    if not self.server.stats_db:
        return

    # parse limit key
    bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)

    # normalize domain if part of bucket
    if ":" in bucket0:
        b, raw_domain = bucket0.split(":", 1)
        domain = urlcanon.normalize_host(raw_domain).decode("ascii")
        bucket0 = "%s:%s" % (b, domain)
        limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)

    if bucket0 not in buckets:
        return

    value = self.server.stats_db.value(bucket0, bucket1, bucket2)
    if value and limit_value and limit_value > 0 and value >= limit_value:
        body = ("request rejected by warcprox: reached %s %s=%s\n" % (
            "soft limit" if soft else "limit", limit_key,
            limit_value)).encode("utf-8")
        if soft:
            self.send_response(430, "Reached soft limit")
        else:
            self.send_response(420, "Reached limit")
        self.send_header("Content-Type", "text/plain;charset=utf-8")
        self.send_header("Connection", "close")
        self.send_header("Content-Length", len(body))
        # report the stats for the whole bucket alongside the limit hit
        response_meta = {
            "stats": {bucket0: self.server.stats_db.value(bucket0)}
        }
        if soft:
            response_meta["reached-soft-limit"] = {limit_key: limit_value}
        else:
            response_meta["reached-limit"] = {limit_key: limit_value}
        # json.dumps documents `separators` as an (item, key) tuple; this
        # yields the same compact output as the former ",:" string
        self.send_header(
                "Warcprox-Meta",
                json.dumps(response_meta, separators=(",", ":")))
        self.end_headers()
        if self.command != "HEAD":
            self.wfile.write(body)
        self.connection.close()
        raise warcprox.RequestBlockedByRule(
                "%s %s %s %s -- reached %s %s=%s" % (
                    self.client_address[0], 430 if soft else 420,
                    self.command, self.url,
                    "soft limit" if soft else "limit",
                    limit_key, limit_value))
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header. Each bucket
    definition can either be a string, which signifies the name of the
    bucket, or a dict. If a dict it is expected to have at least an item
    with key 'bucket' whose value is the name of the bucket. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain. Host stats are stored in the
    stats table under the key '{parent-bucket}:{domain(normalized)}'.

    A dict without a 'bucket' item is logged as a warning and skipped.

    Args:
        url: url of the request; used to decide which of the
            'tally-domains' apply
        warcprox_meta: parsed Warcprox-Meta header (dict), or None

    Returns:
        list of strings. The first element is always "__all__". If
        `warcprox_meta` specifies no stats buckets, the only other element
        is "__unspecified__".

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case, for a url that matches both domains, the return value
    would be
    ["__all__","bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
    A domain sub-bucket is only included if `url` matches that domain.
    '''
    # imported locally so this function does not depend on what the
    # enclosing module happens to import
    import logging

    buckets = ["__all__"]
    if (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        for bucket in warcprox_meta["stats"]["buckets"]:
            if isinstance(bucket, dict):
                if 'bucket' not in bucket:
                    # this is a plain function, there is no `self` whose
                    # logger could be used here
                    logging.getLogger(__name__).warning(
                            'ignoring invalid stats bucket in '
                            'warcprox-meta header %s', bucket)
                    continue
                buckets.append(bucket['bucket'])
                if bucket.get('tally-domains'):
                    canon_url = urlcanon.semantic(url)
                    for domain in bucket['tally-domains']:
                        domain = urlcanon.normalize_host(domain).decode(
                                'ascii')
                        if urlcanon.url_matches_domain(canon_url, domain):
                            buckets.append(
                                    '%s:%s' % (bucket['bucket'], domain))
            else:
                buckets.append(bucket)
    else:
        buckets.append("__unspecified__")

    return buckets
def _enforce_limit(self, limit_key, limit_value, soft=False):
    '''
    Rejects the current request if the stat named by `limit_key` has reached
    `limit_value`.

    `limit_key` has the form '{bucket}/{x}/{y}', e.g. 'job1/total/urls'. If
    the bucket part contains exactly one ':', e.g.
    'job1:foo.com/total/urls', the part after the colon is treated as a
    domain: the rule only applies when `self.hostname` matches that domain,
    and the stat is looked up under the normalized domain.

    Returns without doing anything if the server has no stats db, if the
    host is outside the rule's domain, or if the limit is not reached.
    Otherwise sends a 420 response (430 if `soft`) carrying a Warcprox-Meta
    header, closes the connection and raises.

    Raises:
        warcprox.RequestBlockedByRule: if the limit has been reached
    '''
    # without a stats db there is nothing to compare the limit against
    if not self.server.stats_db:
        return

    bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
    _limit_key = limit_key

    # if limit_key looks like 'job1:foo.com/total/urls' then we only want
    # to apply this rule if the requested url is within domain
    bucket0_fields = bucket0.split(':')
    if len(bucket0_fields) == 2:
        domain = urlcanon.normalize_host(bucket0_fields[1])
        if not urlcanon.host_matches_domain(self.hostname, domain):
            return  # else host matches, go ahead and enforce the limit
        bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
        _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)

    value = self.server.stats_db.value(bucket0, bucket1, bucket2)
    if value and limit_value and limit_value > 0 and value >= limit_value:
        body = ("request rejected by warcprox: reached %s %s=%s\n" % (
            "soft limit" if soft else "limit", _limit_key,
            limit_value)).encode("utf-8")
        if soft:
            self.send_response(430, "Reached soft limit")
        else:
            self.send_response(420, "Reached limit")
        self.send_header("Content-Type", "text/plain;charset=utf-8")
        self.send_header("Connection", "close")
        self.send_header("Content-Length", len(body))
        # report the stats for the whole bucket alongside the limit hit
        response_meta = {
            "stats": {bucket0: self.server.stats_db.value(bucket0)}
        }
        if soft:
            response_meta["reached-soft-limit"] = {_limit_key: limit_value}
        else:
            response_meta["reached-limit"] = {_limit_key: limit_value}
        self.send_header(
                "Warcprox-Meta",
                json.dumps(response_meta, separators=(",", ":")))
        self.end_headers()
        if self.command != "HEAD":
            self.wfile.write(body)
        self.connection.close()
        raise warcprox.RequestBlockedByRule(
                "%s %s %s %s -- reached %s %s=%s" % (
                    self.client_address[0], 430 if soft else 420,
                    self.command, self.url,
                    "soft limit" if soft else "limit",
                    _limit_key, limit_value))
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header. Each bucket
    definition can either be a string, which signifies the name of the
    bucket, or a dict. If a dict it is expected to have at least an item
    with key 'bucket' whose value is the name of the bucket. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain. Host stats are stored in the
    stats table under the key '{parent-bucket}:{domain(normalized)}'.

    A dict without a 'bucket' item is logged as a warning and skipped.

    Args:
        url: url of the request; used to decide which of the
            'tally-domains' apply
        warcprox_meta: parsed Warcprox-Meta header (dict), or None

    Returns:
        list of strings. The first element is always "__all__". If
        `warcprox_meta` specifies no stats buckets, the only other element
        is "__unspecified__".

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case, for a url that matches both domains, the return value
    would be
    ["__all__","bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
    A domain sub-bucket is only included if `url` matches that domain.
    '''
    # imported locally so this function does not depend on what the
    # enclosing module happens to import
    import logging

    buckets = ["__all__"]
    if (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        for bucket in warcprox_meta["stats"]["buckets"]:
            if isinstance(bucket, dict):
                if 'bucket' not in bucket:
                    # this is a plain function, there is no `self` whose
                    # logger could be used here; Logger.warn is deprecated
                    # in favor of Logger.warning
                    logging.getLogger(__name__).warning(
                            'ignoring invalid stats bucket in '
                            'warcprox-meta header %s', bucket)
                    continue
                buckets.append(bucket['bucket'])
                if bucket.get('tally-domains'):
                    canon_url = urlcanon.semantic(url)
                    for domain in bucket['tally-domains']:
                        domain = urlcanon.normalize_host(domain).decode(
                                'ascii')
                        if urlcanon.url_matches_domain(canon_url, domain):
                            buckets.append(
                                    '%s:%s' % (bucket['bucket'], domain))
            else:
                buckets.append(bucket)
    else:
        buckets.append("__unspecified__")

    return buckets
def _determine_host_port(self):
    '''
    Works out which host and port this request should be connected to and
    stores them in `self.hostname` and `self.port`.

    For a CONNECT request (`self.is_connect`) `self.path` is expected to be
    'host:port'. Otherwise `self.path` is expected to be an absolute http
    url; it is remembered as `self.url` and `self.path` is rewritten to the
    url without scheme and host.

    Raises:
        Exception: if a non-CONNECT request's url scheme is not 'http'
    '''
    if not self.is_connect:
        self.url = self.path
        parsed = urllib_parse.urlparse(self.url)
        if parsed.scheme != 'http':
            raise Exception(
                    'unable to parse request %r as a proxy request' % (
                        self.requestline))
        host = parsed.hostname
        self.port = parsed.port or 80
        # what remains of the url once scheme and host are stripped
        relative = urllib_parse.ParseResult(
                scheme='', netloc='', params=parsed.params,
                path=parsed.path or '/', query=parsed.query,
                fragment=parsed.fragment)
        self.path = urllib_parse.urlunparse(relative)
    else:
        # NOTE: the port stays a str in this branch, whereas the branch
        # above stores an int
        host, self.port = self.path.split(':')

    self.hostname = urlcanon.normalize_host(host).decode('ascii')