def test_robotstxt_error(self):
        """A robots.txt download failure must trigger the middleware's error logger."""
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        dns_error = error.DNSLookupError('Robotstxt address not found')

        def fail_download(request, spider):
            # Fail the robots.txt fetch asynchronously, as a real download would.
            d = Deferred()
            reactor.callFromThread(d.errback, failure.Failure(dns_error))
            return d

        self.crawler.engine.download.side_effect = fail_download

        middleware = RobotsTxtMiddleware(self.crawler)
        # Wrap _logerror so the call can be observed while keeping its behavior.
        middleware._logerror = mock.MagicMock(side_effect=middleware._logerror)
        result = middleware.process_request(Request('http://site.local'), None)
        result.addCallback(
            lambda _: self.assertTrue(middleware._logerror.called))
        return result
 def test_robotstxt_empty_response(self):
     """An empty robots.txt body must be treated as 'allow all'."""
     middleware = RobotsTxtMiddleware(self._get_emptybody_crawler())
     checks = [
         self.assertNotIgnored(Request(url), middleware)
         for url in ('http://site.local/allowed',
                     'http://site.local/admin/main',
                     'http://site.local/static/')
     ]
     return DeferredList(checks, fireOnOneErrback=True)
# Example 3
    def test_robotstxt_immediate_error(self):
        """A synchronously-failing robots.txt download must not block the request."""
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)
        dns_failure = failure.Failure(
            error.DNSLookupError('Robotstxt address not found'))

        def fail_now(request, spider):
            # The failure is fired before the deferred is even returned.
            d = Deferred()
            d.errback(dns_failure)
            return d

        self.crawler.engine.download.side_effect = fail_now

        middleware = RobotsTxtMiddleware(self.crawler)
        return self.assertNotIgnored(Request('http://site.local'), middleware)
 def test_robotstxt_meta(self):
     """Requests carrying dont_obey_robotstxt in meta bypass robots.txt rules."""
     middleware = RobotsTxtMiddleware(self._get_successful_crawler())
     meta = {'dont_obey_robotstxt': True}
     checks = [
         self.assertNotIgnored(Request(url, meta=meta), middleware)
         for url in ('http://site.local/allowed',
                     'http://site.local/admin/main',
                     'http://site.local/static/')
     ]
     return DeferredList(checks, fireOnOneErrback=True)
 def test_robotstxt_garbage(self):
     """A garbage robots.txt response is discarded, equivalent to 'allow all'."""
     middleware = RobotsTxtMiddleware(self._get_garbage_crawler())
     checks = [
         self.assertNotIgnored(Request(url), middleware)
         for url in ('http://site.local',
                     'http://site.local/allowed',
                     'http://site.local/admin/main',
                     'http://site.local/static/')
     ]
     return DeferredList(checks, fireOnOneErrback=True)
    def test_ignore_robotstxt_request(self):
        """An IgnoreRequest while fetching robots.txt must not be logged as an error."""
        self.crawler.settings.set('ROBOTSTXT_OBEY', True)

        def drop_request(request, spider):
            d = Deferred()
            reactor.callFromThread(d.errback, failure.Failure(IgnoreRequest()))
            return d

        self.crawler.engine.download.side_effect = drop_request

        middleware = RobotsTxtMiddleware(self.crawler)
        # Spy on the module-level logger to verify nothing gets reported.
        mw_module_logger.error = mock.MagicMock()

        checked = self.assertNotIgnored(
            Request('http://site.local/allowed'), middleware)
        checked.addCallback(
            lambda _: self.assertFalse(mw_module_logger.error.called))
        return checked
 def test_robotstxt(self):
     """robots.txt rules are enforced, including percent-encoded and unicode paths."""
     middleware = RobotsTxtMiddleware(self._get_successful_crawler())
     allowed = [
         self.assertNotIgnored(Request('http://site.local/allowed'),
                               middleware),
     ]
     blocked = [
         self.assertIgnored(Request(url), middleware)
         for url in ('http://site.local/admin/main',
                     'http://site.local/static/',
                     'http://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:',
                     'http://site.local/wiki/Käyttäjä:')
     ]
     return DeferredList(allowed + blocked, fireOnOneErrback=True)
# Example 8
    def test_robotstxt_empty_response(self):
        """An empty robots.txt body must be treated as 'allow all'.

        robots.txt is fetched asynchronously, only after the first
        process_request completes, so the first request always passes;
        the real checks run in the deferred `test` callback.
        """
        middleware = RobotsTxtMiddleware(self._get_emptybody_crawler())
        self.assertNotIgnored(Request('http://site.local'), middleware)

        def test(r):
            # Return a DeferredList so a failure in any individual check
            # propagates to the test runner; the previous fire-and-forget
            # calls silently dropped asynchronous assertion failures.
            return DeferredList([
                self.assertNotIgnored(Request('http://site.local/allowed'),
                                      middleware),
                self.assertNotIgnored(Request('http://site.local/admin/main'),
                                      middleware),
                self.assertNotIgnored(Request('http://site.local/static/'),
                                      middleware),
            ], fireOnOneErrback=True)

        deferred = Deferred()
        deferred.addCallback(test)
        reactor.callFromThread(deferred.callback, None)
        return deferred
# Example 9
    def test_robotstxt_meta(self):
        """Requests carrying dont_obey_robotstxt in meta bypass robots.txt rules.

        The checks after the first request are deferred because robots.txt
        is only fetched after the first process_request completes.
        """
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        meta = {'dont_obey_robotstxt': True}
        self.assertNotIgnored(Request('http://site.local', meta=meta),
                              middleware)

        def test(r):
            # Return a DeferredList so a failure in any individual check
            # propagates to the test runner; the previous fire-and-forget
            # calls silently dropped asynchronous assertion failures.
            return DeferredList([
                self.assertNotIgnored(
                    Request('http://site.local/allowed', meta=meta),
                    middleware),
                self.assertNotIgnored(
                    Request('http://site.local/admin/main', meta=meta),
                    middleware),
                self.assertNotIgnored(
                    Request('http://site.local/static/', meta=meta),
                    middleware),
            ], fireOnOneErrback=True)

        deferred = Deferred()
        deferred.addCallback(test)
        reactor.callFromThread(deferred.callback, None)
        return deferred
# Example 10
    def test_robotstxt(self):
        """robots.txt allow/deny rules are enforced by the middleware."""
        middleware = RobotsTxtMiddleware(self._get_successful_crawler())
        # There is a bit of neglect in robotstxt.py: robots.txt is fetched
        # asynchronously, and only *after* the first process_request completes.
        # So the first process_request always succeeds; the real rule checks
        # are deferred so they run once the mocked download has fired.
        self.assertNotIgnored(Request('http://site.local'), middleware)

        def test(r):
            # Return a DeferredList so a failure in any individual check
            # propagates to the test runner; the previous fire-and-forget
            # calls silently dropped asynchronous assertion failures.
            return DeferredList([
                self.assertNotIgnored(Request('http://site.local/allowed'),
                                      middleware),
                self.assertIgnored(Request('http://site.local/admin/main'),
                                   middleware),
                self.assertIgnored(Request('http://site.local/static/'),
                                   middleware),
            ], fireOnOneErrback=True)

        deferred = Deferred()
        deferred.addCallback(test)
        reactor.callFromThread(deferred.callback, None)
        return deferred
 def _get_middleware(self):
     """Build a RobotsTxtMiddleware backed by a freshly created crawler."""
     return RobotsTxtMiddleware(self._get_crawler())
 def test_robotstxt_ready_parser(self):
     """A second request for the same netloc reuses the already-parsed robots.txt."""
     middleware = RobotsTxtMiddleware(self._get_successful_crawler())
     first = self.assertNotIgnored(
         Request('http://site.local/allowed'), middleware)
     first.addCallback(
         lambda _: self.assertNotIgnored(
             Request('http://site.local/allowed'), middleware))
     return first