Example #1
    def test_log_debug(self):
        with LogCapture() as l:
            settings = {'DUPEFILTER_DEBUG': True,
                        'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'}
            crawler = get_crawler(SimpleSpider, settings_dict=settings)
            scheduler = Scheduler.from_crawler(crawler)
            spider = SimpleSpider.from_crawler(crawler)

            dupefilter = scheduler.df
            dupefilter.open()

            r1 = Request('http://scrapytest.org/index.html')
            r2 = Request(
                'http://scrapytest.org/index.html',
                headers={'Referer': 'http://scrapytest.org/INDEX.html'},
            )

            dupefilter.log(r1, spider)
            dupefilter.log(r2, spider)

            assert crawler.stats.get_value('dupefilter/filtered') == 2
            l.check_present(('scrapy.dupefilters', 'DEBUG',
                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                ' (referer: None)')))
            l.check_present(('scrapy.dupefilters', 'DEBUG',
                ('Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                ' (referer: http://scrapytest.org/INDEX.html)')))

            dupefilter.close('finished')
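
These snippets are excerpts from Scrapy's test suite for Scheduler.from_crawler and omit their module-level setup. A minimal sketch of the imports and the FromCrawlerRFPDupeFilter helper they assume, assuming a Scrapy version where RFPDupeFilter exposes from_crawler (names mirror the tests; the exact shapes are illustrative):

from scrapy import Request, Spider
from scrapy.core.scheduler import Scheduler
from scrapy.crawler import Crawler
from scrapy.dupefilters import RFPDupeFilter
from scrapy.utils.test import get_crawler
from testfixtures import LogCapture


class SimpleSpider(Spider):
    name = 'simple'


class FromCrawlerRFPDupeFilter(RFPDupeFilter):
    # Record which constructor path built the dupefilter so tests can assert on it.
    @classmethod
    def from_crawler(cls, crawler):
        df = super().from_crawler(crawler)
        df.method = 'from_crawler'
        return df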
Example #2
    def test_df_from_settings_scheduler(self):
        settings = {'DUPEFILTER_DEBUG': True,
                    'DUPEFILTER_CLASS': __name__ + '.FromSettingsRFPDupeFilter'}
        crawler = get_crawler(settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        self.assertTrue(scheduler.df.debug)
        self.assertEqual(scheduler.df.method, 'from_settings')
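
A sketch of the FromSettingsRFPDupeFilter helper referenced here, by analogy with the from_crawler variant above (illustrative, not the verbatim test helper; RFPDupeFilter as imported in the sketch after Example #1):

class FromSettingsRFPDupeFilter(RFPDupeFilter):
    @classmethod
    def from_settings(cls, settings):
        df = super().from_settings(settings)
        df.method = 'from_settings'
        return df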
Example #3
    def test_log(self):
        with LogCapture() as l:
            settings = {
                'DUPEFILTER_DEBUG': False,
                'DUPEFILTER_CLASS': __name__ + '.FromCrawlerRFPDupeFilter'
            }
            crawler = get_crawler(SimpleSpider, settings_dict=settings)
            scheduler = Scheduler.from_crawler(crawler)
            spider = SimpleSpider.from_crawler(crawler)

            dupefilter = scheduler.df
            dupefilter.open()

            r1 = Request('http://scrapytest.org/index.html')
            r2 = Request('http://scrapytest.org/index.html')

            dupefilter.log(r1, spider)
            dupefilter.log(r2, spider)

            assert crawler.stats.get_value('dupefilter/filtered') == 2
            l.check_present(('scrapy.dupefilters', 'DEBUG', (
                'Filtered duplicate request: <GET http://scrapytest.org/index.html>'
                ' - no more duplicates will be shown'
                ' (see DUPEFILTER_DEBUG to show all duplicates)')))

            dupefilter.close('finished')
Example #4
    def test_seenreq_newlines(self):
        r"""Check that duplicate \r characters are not added to
        line endings on Windows platforms."""

        r1 = Request('http://scrapytest.org/1')

        path = tempfile.mkdtemp()
        crawler = get_crawler(settings_dict={'JOBDIR': path})
        try:
            scheduler = Scheduler.from_crawler(crawler)
            df = scheduler.df
            df.open()
            df.request_seen(r1)
            df.close('finished')

            with open(os.path.join(path, 'requests.seen'), 'rb') as seen_file:
                line = next(seen_file).decode()
                assert not line.endswith('\r\r\n')
                if sys.platform == 'win32':
                    assert line.endswith('\r\n')
                else:
                    assert line.endswith('\n')

        finally:
            shutil.rmtree(path)
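
For context, a hypothetical fragment mirroring the write path under test: the seen-requests file is opened in text mode, so writing '\n' lets Python translate it to the platform line ending; writing os.linesep explicitly would produce '\r\r\n' on Windows, which is the regression this test guards against.

import os
import tempfile

path = tempfile.mkdtemp()
with open(os.path.join(path, 'requests.seen'), 'a+') as seen:
    seen.write('0123456789abcdef' + '\n')  # one request fingerprint per line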
Example #5
    def test_df_direct_scheduler(self):
        settings = {
            'DUPEFILTER_CLASS': DirectDupeFilter,
            'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION',
        }
        crawler = get_crawler(settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        self.assertEqual(scheduler.df.method, 'n/a')
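
A sketch of the DirectDupeFilter helper, which this test passes as a class object rather than a dotted path. Since it defines no from_settings or from_crawler hook, the scheduler instantiates it directly and method stays at its class-level default (illustrative):

class DirectDupeFilter:
    method = 'n/a'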
Example #6
def _get_dupefilter(*, crawler=None, settings=None, open=True):
    if crawler is None:
        crawler = get_crawler(settings_dict=settings)
    scheduler = Scheduler.from_crawler(crawler)
    dupefilter = scheduler.df
    if open:
        dupefilter.open()
    return dupefilter
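
Hypothetical usage of this helper in a test body, with Request imported as in the sketch after Example #1 (request_seen returns False for a new request and True for a duplicate):

dupefilter = _get_dupefilter(settings={'DUPEFILTER_DEBUG': True})
assert not dupefilter.request_seen(Request('http://scrapytest.org/'))
assert dupefilter.request_seen(Request('http://scrapytest.org/'))
dupefilter.close('finished')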
Example #7
    def _incompatible(self):
        settings = dict(
            SCHEDULER_PRIORITY_QUEUE='scrapy.pqueues.DownloaderAwarePriorityQueue',
            CONCURRENT_REQUESTS_PER_IP=1,
        )
        crawler = Crawler(Spider, settings)
        scheduler = Scheduler.from_crawler(crawler)
        spider = Spider(name='spider')
        scheduler.open(spider)
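
This helper deliberately combines settings that are rejected when the scheduler opens, since DownloaderAwarePriorityQueue does not support CONCURRENT_REQUESTS_PER_IP. A hypothetical companion test asserting that:

    def test_incompatibility(self):
        with self.assertRaises(ValueError):
            self._incompatible()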
Example #8
    def test_df_from_settings_scheduler(self):
        settings = {
            'DUPEFILTER_DEBUG': True,
            'DUPEFILTER_CLASS': FromSettingsRFPDupeFilter,
            'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION',
        }
        crawler = get_crawler(settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        self.assertTrue(scheduler.df.debug)
        self.assertEqual(scheduler.df.method, 'from_settings')
Example #9
    def test_df_direct_scheduler(self):
        settings = {'DUPEFILTER_CLASS': __name__ + '.DirectDupeFilter'}
        crawler = get_crawler(settings_dict=settings)
        scheduler = Scheduler.from_crawler(crawler)
        self.assertEqual(scheduler.df.method, 'n/a')
Example #10
    def create_scheduler(self):
        self.mock_crawler = MockCrawler(self.priority_queue_cls, self.jobdir)
        self.scheduler = Scheduler.from_crawler(self.mock_crawler)
        self.spider = Spider(name='spider')
        self.scheduler.open(self.spider)
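
A hypothetical minimal stand-in for the MockCrawler fixture used above: a Crawler subclass wired with the priority queue class and job directory under test (Scrapy's real test fixture configures more settings, such as the disk and memory queue classes):

from scrapy import Spider
from scrapy.crawler import Crawler


class MockCrawler(Crawler):
    def __init__(self, priority_queue_cls, jobdir):
        settings = dict(
            SCHEDULER_PRIORITY_QUEUE=priority_queue_cls,
            JOBDIR=jobdir,
        )
        super().__init__(Spider, settings)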