    def test_skip_malformed_line(self):
        """A line without a colon is malformed and must be skipped"""
        rp = robotstxtparser.RobotExclusionRulesParser()
        rp.parse('''
            User-Agent: agent
            Disallow /no/colon/in/this/line
        ''')

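        # With no colon, the Disallow line can't be parsed as a rule, so it
        # should be skipped and the path should remain allowed.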
        self.assertTrue(
            rp.is_allowed('agent', 'http://example.org/no/colon/in/this/line'))

    def test_case_insensitivity(self):
        """Make sure user-agent matches are case-insensitive"""
        rp = robotstxtparser.RobotExclusionRulesParser()
        rp.parse('''
            User-agent: Agent
            Disallow: /path
        ''')

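        # Requests under any casing of 'agent' should match the 'Agent' group.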
        self.assertFalse(rp.is_allowed('agent', 'http://example.org/path'))
        self.assertFalse(rp.is_allowed('aGeNt', 'http://example.org/path'))

    def test_utf8_bom(self):
        """If the content starts with a UTF-8 BOM, decode it as UTF-8"""
        rp = robotstxtparser.RobotExclusionRulesParser()
        rp.parse(codecs.BOM_UTF8 + b'''
            User-Agent: agent
            Allow: /path
            User-Agent: other
            Disallow: /path
        ''')

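        # The BOM should be consumed during decoding, leaving the first
        # User-Agent line intact rather than corrupting it.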
        self.assertTrue(rp.is_allowed('agent', 'http://example.org/path'))
        self.assertFalse(rp.is_allowed('other', 'http://example.org/path'))

    def test_grouping(self):
        """Multiple consecutive User-Agent lines form a single group."""
        rp = robotstxtparser.RobotExclusionRulesParser()
        rp.parse('''
            User-agent: one
            User-agent: two
            Disallow: /tmp
        ''')

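        # 'one' and 'two' share the same group of rules; other agents are
        # unaffected by that group.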
        self.assertFalse(rp.is_allowed('one', 'http://example.org/tmp'))
        self.assertFalse(rp.is_allowed('two', 'http://example.org/tmp'))
        self.assertTrue(rp.is_allowed('agent', 'http://example.org/tmp'))

    def test_honors_specific_agent(self):
        """Honor a specific user agent's group over '*' when one matches"""
        rp = robotstxtparser.RobotExclusionRulesParser()
        rp.parse('''
            User-agent: *
            Disallow: /tmp

            User-agent: agent
            Allow: /tmp
        ''')

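        # The specific 'agent' group takes precedence over '*', so its Allow
        # wins for /tmp, and /path is unrestricted in that group.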
        self.assertTrue(rp.is_allowed('agent', 'http://example.org/tmp'))
        self.assertTrue(rp.is_allowed('agent', 'http://example.org/path'))

    def test_support_grouping_blank_lines(self):
        """Blank lines within a group are ignored and do not end the group"""
        rp = robotstxtparser.RobotExclusionRulesParser()
        rp.parse('''
            User-Agent: agent
            
            Allow: /path
            
            Disallow: /tmp
            
            User-Agent: other
            
            Disallow: /path
        ''')

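        # The blank lines should not have split the 'agent' group's rules
        # apart or started a new group prematurely.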
        self.assertTrue(rp.is_allowed('agent', 'http://example.org/path'))
        self.assertFalse(rp.is_allowed('agent', 'http://example.org/tmp'))
        self.assertFalse(rp.is_allowed('other', 'http://example.org/path'))

    def test_grouping_unknown_keys(self):
        """
        When we encounter unknown keys, we should disregard any grouping that
        may have happened between User-agent lines.

        This is an example from the wild. Despite `Noindex` not being a valid
        directive, we'll not consider the '*' and 'ia_archiver' groups together.
        """
        rp = robotstxtparser.RobotExclusionRulesParser()
        rp.parse('''
            User-agent: *
            Disallow: /content/2/
            User-agent: *
            Noindex: /gb.html
            Noindex: /content/2/
            User-agent: ia_archiver
            Disallow: /
        ''')

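        # 'agent' falls under '*', which only disallows /content/2/, while
        # ia_archiver's own group disallows everything.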
        self.assertTrue(rp.is_allowed('agent', 'http://example.org/foo'))
        self.assertFalse(
            rp.is_allowed('ia_archiver', 'http://example.org/bar'))

    def test_wildcard(self):
        """Test the '*' wildcard and '$' end-of-URL anchor in directives"""
        rp = robotstxtparser.RobotExclusionRulesParser()
        rp.parse('''
            User-Agent: agent
            Disallow: */foo
            Disallow: /*.gif$
            
            User-Agent: *
            Allow: *****************/*.js$
        ''')

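        # '*' matches any run of characters and '$' anchors a pattern to the
        # end of the URL; the run of '*'s above exercises repeated-wildcard
        # handling.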
        self.assertFalse(rp.is_allowed('agent', 'http://example.org/foo'))
        self.assertFalse(
            rp.is_allowed('agent', 'http://example.org/path/foo/bar'))
        self.assertFalse(rp.is_allowed('agent',
                                       'http://example.org/image.gif'))
        self.assertTrue(rp.is_allowed('agent', 'http://example.org/image.jpg'))
        self.assertTrue(
            rp.is_allowed('other',
                          'http://example.org/inlife/daily/fashion-20160727/'))

    def test_rfc_example(self):
        """Tests the example provided by the robots.txt RFC draft"""
        rp = robotstxtparser.RobotExclusionRulesParser()
        rp.parse('''
            # /robots.txt for http://www.fict.org/
            # comments to [email protected]
            
            User-agent: unhipbot
            Disallow: /
            
            User-agent: webcrawler
            User-agent: excite
            Disallow:
            
            User-agent: *
            Disallow: /org/plans.html
            Allow: /org/
            Allow: /serv
            Allow: /~mak
            Disallow: /
        ''')

        # unhipbot: "Disallow: /" blocks everything except /robots.txt itself
        self.assertFalse(rp.is_allowed('unhipbot', 'http://example.org/'))
        self.assertFalse(
            rp.is_allowed('unhipbot', 'http://example.org/index.html'))
        self.assertTrue(
            rp.is_allowed('unhipbot', 'http://example.org/robots.txt'))
        self.assertFalse(
            rp.is_allowed('unhipbot', 'http://example.org/server.html'))
        self.assertFalse(
            rp.is_allowed('unhipbot', 'http://example.org/services/fast.html'))
        self.assertFalse(
            rp.is_allowed('unhipbot', 'http://example.org/services/slow.html'))
        self.assertFalse(
            rp.is_allowed('unhipbot', 'http://example.org/orgo.gif'))
        self.assertFalse(
            rp.is_allowed('unhipbot', 'http://example.org/org/about.html'))
        self.assertFalse(
            rp.is_allowed('unhipbot', 'http://example.org/org/plans.html'))
        self.assertFalse(
            rp.is_allowed('unhipbot', 'http://example.org/%7Ejim/jim.html'))
        self.assertFalse(
            rp.is_allowed('unhipbot', 'http://example.org/%7Emak/mak.html'))

        # webcrawler: an empty Disallow places no restrictions
        self.assertTrue(rp.is_allowed('webcrawler', 'http://example.org/'))
        self.assertTrue(
            rp.is_allowed('webcrawler', 'http://example.org/index.html'))
        self.assertTrue(
            rp.is_allowed('webcrawler', 'http://example.org/robots.txt'))
        self.assertTrue(
            rp.is_allowed('webcrawler', 'http://example.org/server.html'))
        self.assertTrue(
            rp.is_allowed('webcrawler',
                          'http://example.org/services/fast.html'))
        self.assertTrue(
            rp.is_allowed('webcrawler',
                          'http://example.org/services/slow.html'))
        self.assertTrue(
            rp.is_allowed('webcrawler', 'http://example.org/orgo.gif'))
        self.assertTrue(
            rp.is_allowed('webcrawler', 'http://example.org/org/about.html'))
        self.assertTrue(
            rp.is_allowed('webcrawler', 'http://example.org/org/plans.html'))
        self.assertTrue(
            rp.is_allowed('webcrawler', 'http://example.org/%7Ejim/jim.html'))
        self.assertTrue(
            rp.is_allowed('webcrawler', 'http://example.org/%7Emak/mak.html'))

        # excite: shares webcrawler's group, so it is also unrestricted
        self.assertTrue(rp.is_allowed('excite', 'http://example.org/'))
        self.assertTrue(
            rp.is_allowed('excite', 'http://example.org/index.html'))
        self.assertTrue(
            rp.is_allowed('excite', 'http://example.org/robots.txt'))
        self.assertTrue(
            rp.is_allowed('excite', 'http://example.org/server.html'))
        self.assertTrue(
            rp.is_allowed('excite', 'http://example.org/services/fast.html'))
        self.assertTrue(
            rp.is_allowed('excite', 'http://example.org/services/slow.html'))
        self.assertTrue(rp.is_allowed('excite', 'http://example.org/orgo.gif'))
        self.assertTrue(
            rp.is_allowed('excite', 'http://example.org/org/about.html'))
        self.assertTrue(
            rp.is_allowed('excite', 'http://example.org/org/plans.html'))
        self.assertTrue(
            rp.is_allowed('excite', 'http://example.org/%7Ejim/jim.html'))
        self.assertTrue(
            rp.is_allowed('excite', 'http://example.org/%7Emak/mak.html'))

        # All other agents fall through to the '*' group
        self.assertFalse(rp.is_allowed('anything', 'http://example.org/'))
        self.assertFalse(
            rp.is_allowed('anything', 'http://example.org/index.html'))
        self.assertTrue(
            rp.is_allowed('anything', 'http://example.org/robots.txt'))
        self.assertTrue(
            rp.is_allowed('anything', 'http://example.org/server.html'))
        self.assertTrue(
            rp.is_allowed('anything', 'http://example.org/services/fast.html'))
        self.assertTrue(
            rp.is_allowed('anything', 'http://example.org/services/slow.html'))
        self.assertFalse(
            rp.is_allowed('anything', 'http://example.org/orgo.gif'))
        self.assertTrue(
            rp.is_allowed('anything', 'http://example.org/org/about.html'))
        self.assertFalse(
            rp.is_allowed('anything', 'http://example.org/org/plans.html'))
        self.assertFalse(
            rp.is_allowed('anything', 'http://example.org/%7Ejim/jim.html'))
        self.assertTrue(
            rp.is_allowed('anything', 'http://example.org/%7Emak/mak.html'))