Exemple #1
0
    def test_cache_control(self):
        '''Check the ttl that get_ttl derives from a cache-control header'''
        fallback = 5

        def ttl_for(header_value):
            # Run a lone cache-control header through get_ttl with the
            # fallback ttl used throughout this test.
            return Utility.get_ttl({'cache-control': header_value}, fallback)

        # These directives are expected to produce a ttl of zero
        for no_cache in ('no-cache', 'no-store', 'must-revalidate'):
            self.assertEqual(ttl_for(no_cache), 0)

        # s-maxage is honored, with or without spaces around '='
        for shared_age in ('s-maxage=10,foo', 's-maxage = 10'):
            self.assertEqual(ttl_for(shared_age), 10)
        # A non-integer s-maxage is skipped, so the fallback comes back
        self.assertEqual(ttl_for('s-maxage = not int'), 5)

        # max-age is honored the same way
        for plain_age in ('max-age=10,foo', 'max-age = 10'):
            self.assertEqual(ttl_for(plain_age), 10)
        # A non-integer max-age is skipped, so the fallback comes back
        self.assertEqual(ttl_for('max-age = not int'), 5)
    def get_robot_agent(root_domain: str, protocol="http") -> Rules:
        """Locate and parse the robots.txt of a domain.

        ``/robots.txt`` is probed under four URL prefixes in a fixed order
        (``http://``, ``https://``, ``http://www.``, ``https://www.``). The
        first URL whose status equals ``ResponseCode.LinkOK`` is fetched
        through a ``RobotsCache`` session and wrapped in ``parser.Rules``,
        with an expiry of ``time.time()`` plus the computed ttl.

        Args:
            root_domain: A bare domain, or a string starting with "http", in
                which case it is first reduced with
                ``LinkChecker.get_root_domain(root_domain)[4]``.
            protocol: Not used by this function; kept so existing callers
                that pass it keep working.

        Returns:
            The parsed rules, or ``None`` when no candidate URL answered
            with ``ResponseCode.LinkOK`` or when fetching/parsing failed.
        """
        if root_domain.startswith("http"):
            root_domain = LinkChecker.get_root_domain(root_domain)[4]

        robots_url = None
        for prefix in ("http://", "https://", "http://www.", "https://www."):
            candidate = prefix + root_domain + "/robots.txt"
            try:
                status_code, _ = LinkChecker.get_response(candidate)
            except Exception:
                # Best-effort probe: a failing variant only means the next
                # one is tried. KeyboardInterrupt/SystemExit still propagate.
                continue
            if status_code == ResponseCode.LinkOK:
                robots_url = candidate
                break

        if robots_url is None:
            return None

        try:
            robots = RobotsCache()
            req = robots.session.get(robots_url)
            # Never cache for less than the cache's configured minimum.
            ttl = max(robots.min_ttl,
                      Utility.get_ttl(req.headers, robots.default_ttl))
            return parser.Rules(robots_url, req.status_code, req.content,
                                time.time() + ttl)
        except Exception:
            # Fetching or parsing failed; callers treat None as "no rules".
            return None
Exemple #3
0
    def test_cache_control(self):
        '''Verify how cache-control header values map to a ttl'''
        # (cache-control value, expected ttl) pairs, checked in this order.
        # The fallback ttl handed to get_ttl is always 5.
        expectations = (
            # Directives expected to yield a ttl of zero
            ('no-cache', 0),
            ('no-store', 0),
            ('must-revalidate', 0),
            # s-maxage is honored
            ('s-maxage=10,foo', 10),
            ('s-maxage = 10', 10),
            # ...unless it is not an integer, then the fallback is returned
            ('s-maxage = not int', 5),
            # max-age is honored
            ('max-age=10,foo', 10),
            ('max-age = 10', 10),
            # ...unless it is not an integer, then the fallback is returned
            ('max-age = not int', 5),
        )
        for header_value, expected_ttl in expectations:
            observed = Utility.get_ttl({'cache-control': header_value}, 5)
            self.assertEqual(observed, expected_ttl)
Exemple #4
0
    def test_expires(self):
        '''Make sure we can honor Expires'''
        import datetime

        # Shared header date format. For the naive datetimes returned by
        # utcnow(), %z renders as an empty string.
        http_format = '%a, %d %b %Y %H:%M:%S %z'
        ten_seconds = datetime.timedelta(seconds=10)

        # Test a plain-and-simple expires, using now as a default time
        expires = (datetime.datetime.utcnow() + ten_seconds).strftime(
            http_format)
        ttl = Utility.get_ttl({'expires': expires}, 5)
        self.assertLess(ttl, 11)
        self.assertGreater(ttl, 9)

        # Make sure this works when a date is provided
        now = datetime.datetime.utcnow()
        self.assertEqual(
            Utility.get_ttl(
                {
                    'expires': (now + ten_seconds).strftime(http_format),
                    'date': now.strftime(http_format)
                }, 5), 10)

        # If the date is unparseable, use 'now'
        expires = (datetime.datetime.utcnow() + ten_seconds).strftime(
            http_format)
        ttl = Utility.get_ttl(
            {
                'expires': expires,
                'date': 'not a valid time'
            }, 5)
        self.assertLess(ttl, 11)
        self.assertGreater(ttl, 9)

        # Lastly, if the 'expires' header is unparseable, then pass
        self.assertEqual(
            Utility.get_ttl({'expires': 'not a valid time'}, 5), 5)
Exemple #5
0
    def test_expires(self):
        '''Make sure we can honor Expires'''
        import datetime

        # NOTE(review): datetime.now() is naive *local* time and %z renders
        # as an empty string for it, so the headers below carry no zone.
        # HTTP dates are GMT; if get_ttl reads zone-less dates as GMT, the
        # cases without a usable 'date' header depend on the machine's time
        # zone -- confirm against get_ttl.
        header_format = '%a, %d %b %Y %H:%M:%S %z'

        def in_ten_seconds():
            # A header date string ten seconds after the current local time.
            moment = datetime.datetime.now() + datetime.timedelta(seconds=10)
            return moment.strftime(header_format)

        # Test a plain-and-simple expires, using now as a default time
        ttl = Utility.get_ttl({'expires': in_ten_seconds()}, 5)
        self.assertLess(ttl, 11)
        self.assertGreater(ttl, 9)

        # Make sure this works when a date is provided
        now = datetime.datetime.now()
        headers = {
            'expires': (
                now + datetime.timedelta(seconds=10)
            ).strftime(header_format),
            'date': now.strftime(header_format)
        }
        self.assertEqual(Utility.get_ttl(headers, 5), 10)

        # If the date is unparseable, use 'now'
        ttl = Utility.get_ttl({
            'expires': in_ten_seconds(),
            'date': 'not a valid time'
        }, 5)
        self.assertLess(ttl, 11)
        self.assertGreater(ttl, 9)

        # Lastly, if the 'expires' header is unparseable, then pass
        self.assertEqual(Utility.get_ttl({
            'expires': 'not a valid time'
        }, 5), 5)
Exemple #6
0
 def _parse_robots(self, response):
     '''Build a Rules parser from a robots.txt response and remember it.

     The parser is stored in self._parsers under the netloc of the
     response, as returned by urlparse_cached.
     '''
     # A lot of work to provide the expire time which we don't actually use
     floor_ttl = self.min_ttl
     header_ttl = Utility.get_ttl(response.headers, self.default_ttl)
     ttl = max(floor_ttl, header_ttl)

     rules = Rules(
         response.url,
         response.status,
         response.body,
         time.time() + ttl,
     )
     rules.parse(response.body)

     netloc = urlparse_cached(response).netloc
     self._parsers[netloc] = rules