def test_robotstxt_ready_parser(self):
     """Once the robots.txt parser is ready, allowed URLs keep passing through."""
     mw = RobotsTxtMiddleware(self._get_successful_crawler())
     first = self.assertNotIgnored(Request('http://site.local/allowed'), mw)
     # Re-check the same URL after the parser has been populated.
     first.addCallback(
         lambda _: self.assertNotIgnored(
             Request('http://site.local/allowed'), mw))
     return first
 def test_robotstxt(self):
     """robots.txt rules: /allowed passes; /admin and /static are blocked."""
     mw = RobotsTxtMiddleware(self._get_successful_crawler())
     checks = [
         self.assertNotIgnored(Request('http://site.local/allowed'), mw),
         self.assertIgnored(Request('http://site.local/admin/main'), mw),
         self.assertIgnored(Request('http://site.local/static/'), mw),
     ]
     return DeferredList(checks, fireOnOneErrback=True)
 def test_robotstxt_meta(self):
     """dont_obey_robotstxt in request.meta bypasses robots.txt filtering."""
     mw = RobotsTxtMiddleware(self._get_successful_crawler())
     meta = {'dont_obey_robotstxt': True}
     urls = (
         'http://site.local/allowed',
         'http://site.local/admin/main',
         'http://site.local/static/',
     )
     checks = [self.assertNotIgnored(Request(u, meta=meta), mw) for u in urls]
     return DeferredList(checks, fireOnOneErrback=True)
 def test_robotstxt_empty_response(self):
     """An empty robots.txt response must be interpreted as 'allow all'."""
     mw = RobotsTxtMiddleware(self._get_emptybody_crawler())
     urls = (
         'http://site.local/allowed',
         'http://site.local/admin/main',
         'http://site.local/static/',
     )
     return DeferredList(
         [self.assertNotIgnored(Request(u), mw) for u in urls],
         fireOnOneErrback=True,
     )
# Example 5
 def test_robotstxt_user_agent_setting(self):
     """ROBOTSTXT_USER_AGENT, not USER_AGENT, is matched against robots.txt."""
     crawler = self._get_successful_crawler()
     crawler.settings.set('ROBOTSTXT_USER_AGENT', 'Examplebot')
     crawler.settings.set('USER_AGENT', 'Mozilla/5.0 (X11; Linux x86_64)')
     mw = RobotsTxtMiddleware(crawler)
     # A mock parser records which user agent the middleware checks with.
     parser = mock.MagicMock(return_value=True)
     mw.process_request_2(parser, Request('http://site.local/allowed'), None)
     parser.allowed.assert_called_once_with('http://site.local/allowed', 'Examplebot')
 def test_robotstxt_garbage(self):
     """A garbage robots.txt body is discarded, i.e. treated as 'allow all'."""
     mw = RobotsTxtMiddleware(self._get_garbage_crawler())
     urls = (
         'http://site.local',
         'http://site.local/allowed',
         'http://site.local/admin/main',
         'http://site.local/static/',
     )
     return DeferredList(
         [self.assertNotIgnored(Request(u), mw) for u in urls],
         fireOnOneErrback=True,
     )
    def test_robotstxt_immediate_error(self):
        """A synchronously-failing robots.txt download must not block requests."""
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')

        def immediate_failure(request, spider):
            # The deferred is already errback-ed when the middleware gets it.
            failed = Deferred()
            failed.errback(failure.Failure(err))
            return failed

        self.crawler.engine.download.side_effect = immediate_failure
        mw = RobotsTxtMiddleware(self.crawler)
        return self.assertNotIgnored(Request('http://site.local'), mw)
    def test_ignore_robotstxt_request(self):
        """An IgnoreRequest while fetching robots.txt is not logged as an error."""
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)

        def ignore_request(request, spider):
            dfd = Deferred()
            reactor.callFromThread(dfd.errback, failure.Failure(IgnoreRequest()))
            return dfd

        self.crawler.engine.download.side_effect = ignore_request
        mw = RobotsTxtMiddleware(self.crawler)
        # Spy on the module logger to verify nothing gets logged.
        mw_module_logger.error = mock.MagicMock()

        result = self.assertNotIgnored(Request('http://site.local/allowed'), mw)
        result.addCallback(lambda _: self.assertFalse(mw_module_logger.error.called))
        return result
    def test_robotstxt_error(self):
        """A robots.txt download failure is routed through the middleware's _logerror."""
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        err = error.DNSLookupError('Robotstxt address not found')

        def return_failure(request, spider):
            dfd = Deferred()
            reactor.callFromThread(dfd.errback, failure.Failure(err))
            return dfd

        self.crawler.engine.download.side_effect = return_failure
        mw = RobotsTxtMiddleware(self.crawler)
        # Wrap _logerror so the original behavior still runs but calls are recorded.
        mw._logerror = mock.MagicMock(side_effect=mw._logerror)
        result = mw.process_request(Request('http://site.local'), None)
        result.addCallback(lambda _: self.assertTrue(mw._logerror.called))
        return result
# Example 10
    def test_robotstxt_empty_response(self):
        """An empty robots.txt response must be interpreted as 'allow all'."""
        mw = RobotsTxtMiddleware(self._get_emptybody_crawler())
        # First request triggers the (mocked) robots.txt fetch.
        self.assertNotIgnored(Request('http://site.local'), mw)

        def check_all_allowed(_):
            for url in ('http://site.local/allowed',
                        'http://site.local/admin/main',
                        'http://site.local/static/'):
                self.assertNotIgnored(Request(url), mw)

        # Defer the checks so they run after the robots.txt download completes.
        dfd = Deferred()
        dfd.addCallback(check_all_allowed)
        reactor.callFromThread(dfd.callback, None)
        return dfd
# Example 11
    def test_robotstxt_meta(self):
        """dont_obey_robotstxt in request.meta bypasses robots.txt filtering."""
        mw = RobotsTxtMiddleware(self._get_successful_crawler())
        meta = {'dont_obey_robotstxt': True}
        self.assertNotIgnored(Request('http://site.local', meta=meta), mw)

        def check_all_allowed(_):
            for url in ('http://site.local/allowed',
                        'http://site.local/admin/main',
                        'http://site.local/static/'):
                self.assertNotIgnored(Request(url, meta=meta), mw)

        # Defer the checks so they run after the robots.txt download completes.
        dfd = Deferred()
        dfd.addCallback(check_all_allowed)
        reactor.callFromThread(dfd.callback, None)
        return dfd
 def test_robotstxt(self):
     """Disallowed paths are blocked whether percent-encoded or raw unicode."""
     mw = RobotsTxtMiddleware(self._get_successful_crawler())
     checks = [
         self.assertNotIgnored(Request("http://site.local/allowed"), mw),
     ]
     for url in (
         "http://site.local/admin/main",
         "http://site.local/static/",
         "http://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:",
         "http://site.local/wiki/Käyttäjä:",
     ):
         checks.append(self.assertIgnored(Request(url), mw))
     return DeferredList(checks, fireOnOneErrback=True)
# Example 13
    def test_robotstxt(self):
        """robots.txt rules apply once the asynchronously fetched file is parsed."""
        mw = RobotsTxtMiddleware(self._get_successful_crawler())
        # robotstxt.py fetches robots.txt asynchronously, only *after* the first
        # process_request completes — so the first request always goes through.
        # The real checks are deferred; otherwise the robots.txt download mock
        # would be called after an assertRaises failure.
        self.assertNotIgnored(Request('http://site.local'), mw)

        def check_rules(_):
            self.assertNotIgnored(Request('http://site.local/allowed'), mw)
            self.assertIgnored(Request('http://site.local/admin/main'), mw)
            self.assertIgnored(Request('http://site.local/static/'), mw)

        dfd = Deferred()
        dfd.addCallback(check_rules)
        reactor.callFromThread(dfd.callback, None)
        return dfd
# Example 14
    def test_robotstxt_garbage(self):
        """Garbage robots.txt content is discarded and treated as 'allow all'."""
        mw = RobotsTxtMiddleware(self._get_garbage_crawler())
        mw._logerror = mock.MagicMock()
        mw.process_request(Request('http://site.local'), None)
        self.assertNotIgnored(Request('http://site.local'), mw)

        def check_all_allowed(_):
            for url in ('http://site.local/allowed',
                        'http://site.local/admin/main',
                        'http://site.local/static/'):
                self.assertNotIgnored(Request(url), mw)

        # Defer the checks so they run after the robots.txt download completes.
        dfd = Deferred()
        dfd.addCallback(check_all_allowed)
        dfd.addErrback(
            lambda _: self.assertIsNone(mw._logerror.assert_any_call()))
        reactor.callFromThread(dfd.callback, None)
        return dfd
 def _get_middleware(self):
     """Build a RobotsTxtMiddleware wired to this test's crawler."""
     return RobotsTxtMiddleware(self._get_crawler())