def test_cache_control(self):
    '''Check the TTL that get_ttl derives from Cache-Control values.'''
    fallback = 5

    def ttl_for(value):
        # Feed a single Cache-Control value through get_ttl.
        return Utility.get_ttl({'cache-control': value}, fallback)

    # These directives are expected to produce a TTL of zero
    for value in ('no-cache', 'no-store', 'must-revalidate'):
        self.assertEqual(ttl_for(value), 0)

    # s-maxage should be honored, with or without spaces around '='
    for value in ('s-maxage=10,foo', 's-maxage = 10'):
        self.assertEqual(ttl_for(value), 10)
    # A value that is not an integer is skipped, leaving the fallback
    self.assertEqual(ttl_for('s-maxage = not int'), 5)

    # max-age should be honored the same way
    for value in ('max-age=10,foo', 'max-age = 10'):
        self.assertEqual(ttl_for(value), 10)
    # A value that is not an integer is skipped, leaving the fallback
    self.assertEqual(ttl_for('max-age = not int'), 5)
def get_robot_agent(root_domain: str, protocol="http") -> Rules:
    """Locate and load the robots.txt rules for a domain.

    The robots.txt URL is probed under the ``http://``, ``https://``,
    ``http://www.`` and ``https://www.`` prefixes, in that order. The first
    candidate whose status equals ``ResponseCode.LinkOK`` is fetched through
    a ``RobotsCache`` session and wrapped in ``parser.Rules``.

    Args:
        root_domain: A bare domain, or a string starting with "http", in
            which case it is first reduced with
            ``LinkChecker.get_root_domain``.
        protocol: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The ``parser.Rules`` built from the first reachable robots.txt, or
        ``None`` when no candidate answers OK or when fetching it fails.
    """
    if root_domain.startswith("http"):
        # NOTE(review): element 4 of get_root_domain()'s result is presumably
        # the bare domain -- confirm against LinkChecker.
        root_domain = LinkChecker.get_root_domain(root_domain)[4]

    robots_url = None
    for prefix in ("http://", "https://", "http://www.", "https://www."):
        candidate = prefix + root_domain + "/robots.txt"
        try:
            status_code, _content_type = LinkChecker.get_response(candidate)
            reachable = status_code == ResponseCode.LinkOK
        except Exception:
            # Best effort: a failing variant only means we try the next one.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        if reachable:
            robots_url = candidate
            break

    if robots_url is None:
        return None

    try:
        robots = RobotsCache()
        response = robots.session.get(robots_url)
        # Never cache for less than the cache's configured minimum TTL.
        ttl = max(robots.min_ttl,
                  Utility.get_ttl(response.headers, robots.default_ttl))
        return parser.Rules(robots_url, response.status_code,
                            response.content, time.time() + ttl)
    except Exception:
        # Fetch or parse failure is reported to the caller as "no rules".
        return None
def test_cache_control(self):
    '''Make sure parsing of the ttl with cache control works'''
    default_ttl = 5
    # (Cache-Control value, TTL expected back), checked in this order
    expectations = (
        # Directives expected to give a zero TTL
        ('no-cache', 0),
        ('no-store', 0),
        ('must-revalidate', 0),
        # s-maxage is honored
        ('s-maxage=10,foo', 10),
        ('s-maxage = 10', 10),
        # Not parseable as an integer, so it is skipped
        ('s-maxage = not int', 5),
        # max-age is honored
        ('max-age=10,foo', 10),
        ('max-age = 10', 10),
        # Not parseable as an integer, so it is skipped
        ('max-age = not int', 5),
    )
    for header_value, expected_ttl in expectations:
        headers = {'cache-control': header_value}
        self.assertEqual(
            Utility.get_ttl(headers, default_ttl), expected_ttl)
def test_expires(self):
    '''Make sure we can honor Expires'''
    import datetime

    # utcnow() returns a naive datetime, for which %z renders as an empty
    # string, so the formatted headers carry no zone designator.
    http_date = '%a, %d %b %Y %H:%M:%S %z'
    ten_seconds = datetime.timedelta(seconds=10)

    # Test a plain-and-simple expires, using now as a default time
    ttl = Utility.get_ttl(
        {
            'expires':
            (datetime.datetime.utcnow() + ten_seconds).strftime(http_date)
        }, 5)
    self.assertLess(ttl, 11)
    self.assertGreater(ttl, 9)

    # Make sure this works when a date is provided. assertEqual returns
    # None, so its result is deliberately not bound to anything.
    now = datetime.datetime.utcnow()
    self.assertEqual(
        Utility.get_ttl(
            {
                'expires': (now + ten_seconds).strftime(http_date),
                'date': now.strftime(http_date)
            }, 5), 10)

    # If the date is unparseable, use 'now'
    ttl = Utility.get_ttl(
        {
            'expires':
            (datetime.datetime.utcnow() + ten_seconds).strftime(http_date),
            'date': 'not a valid time'
        }, 5)
    self.assertLess(ttl, 11)
    self.assertGreater(ttl, 9)

    # Lastly, if the 'expires' header is unparseable, the default is kept
    self.assertEqual(
        Utility.get_ttl({'expires': 'not a valid time'}, 5), 5)
def test_expires(self):
    '''Make sure we can honor Expires'''
    import datetime

    def stamp(moment):
        # Format a datetime the way the headers under test are written.
        # %z renders as an empty string for the naive datetimes used here.
        return moment.strftime('%a, %d %b %Y %H:%M:%S %z')

    delta = datetime.timedelta(seconds=10)

    # NOTE(review): datetime.now() is local time and the formatted header
    # carries no zone. If get_ttl treats zone-less dates as GMT, the 9..11
    # window checks below depend on the machine's timezone -- confirm, or
    # build these timestamps in UTC.

    # Test a plain-and-simple expires, using now as a default time
    ttl = Utility.get_ttl({
        'expires': stamp(datetime.datetime.now() + delta)
    }, 5)
    self.assertLess(ttl, 11)
    self.assertGreater(ttl, 9)

    # Make sure this works when a date is provided. The assertion's return
    # value (None) is intentionally not assigned.
    now = datetime.datetime.now()
    self.assertEqual(Utility.get_ttl({
        'expires': stamp(now + delta),
        'date': stamp(now)
    }, 5), 10)

    # If the date is unparseable, use 'now'
    ttl = Utility.get_ttl({
        'expires': stamp(datetime.datetime.now() + delta),
        'date': 'not a valid time'
    }, 5)
    self.assertLess(ttl, 11)
    self.assertGreater(ttl, 9)

    # Lastly, if the 'expires' header is unparseable, the default is kept
    self.assertEqual(Utility.get_ttl({
        'expires': 'not a valid time'
    }, 5), 5)
def _parse_robots(self, response):
    """Build the rules object for a robots response and store it by netloc."""
    # Original author's note: the expiry is a lot of work to provide and is
    # not actually used afterwards.
    floor = self.min_ttl
    from_headers = Utility.get_ttl(response.headers, self.default_ttl)
    lifetime = max(floor, from_headers)

    rules = Rules(response.url, response.status, response.body,
                  time.time() + lifetime)
    rules.parse(response.body)

    # Keyed by the netloc of the response's URL.
    self._parsers[urlparse_cached(response).netloc] = rules