def test_enabled_setting(self):
    """Middlewares honor their per-class ``*_ENABLED`` settings."""
    manager = TestMiddlewareManager(get_engine(), mw_classes=[M1, M2, MOff])
    loaded = [mw.__class__ for mw in manager.middlewares]
    self.assertListEqual(loaded, [M1, M2])
    # Each active middleware reports which setting name controls it.
    self.assertEqual(manager.middlewares[0].enabled_setting, 'M1_ENABLED')
    self.assertEqual(manager.middlewares[1].enabled_setting, 'M2_OFF')
    # Explicitly disabling both settings leaves no active middleware.
    manager = TestMiddlewareManager(
        get_engine(M1_ENABLED=False, M2_OFF=False),
        mw_classes=[M1, M2, MOff])
    loaded = [mw.__class__ for mw in manager.middlewares]
    self.assertListEqual(loaded, [])
def test_process_request(self):
    """A configured agent is applied, but never overwrites an existing one."""
    mw = RandomUserAgent(get_engine(RANDOM_USER_AGENT_LIST=['a']))
    request = mw.process_request(Request('http://github.com/'))
    self.assertEqual(request.headers['User-Agent'], 'a')
    # user agent shouldn't overwrite existing value
    mw = RandomUserAgent(get_engine(RANDOM_USER_AGENT_LIST=['b']))
    request = mw.process_request(request)
    self.assertEqual(request.headers['User-Agent'], 'a')
def setUp(self):
    """Create a DownloaderStats middleware plus one request/response pair."""
    engine = get_engine()
    self.mw = DownloaderStats(engine)
    self.stats = engine.stats
    self.req = Request('http://github.com')
    # A 400 response tied to the request above, for error-count checks.
    self.resp = Response('scrapytest.org', status=400, request=self.req)
def test_init(self):
    """Disabled middlewares are dropped and reported via a debug log line."""
    manager = TestMiddlewareManager(get_engine())
    loaded = [mw.__class__ for mw in manager.middlewares]
    self.assertListEqual(loaded, [M1, M2])
    self.assertEqual(
        self.lw.get_first_line(),
        "[crawlmi] DEBUG: Disabled <class 'crawlmi.tests.test_middleware_manager.MOff'>:")
def setUp(self):
    """Wire a LogStats extension to a fake clock and capture log output."""
    self.clock = Clock()
    engine = get_engine(LOG_STATS_INTERVAL=30)
    engine.signals = SignalManager(engine)
    self.engine = engine
    self.ls = LogStats(engine, clock=self.clock)
    # LogWrapper captures log lines so tests can assert on them.
    self.lw = LogWrapper()
    self.lw.setUp()
def setUp(self):
    """Build a fully set-up engine with a single test pipeline installed."""
    engine = get_engine(
        LOG_ENABLED=False,
        PIPELINE_BASE={'crawlmi.tests.test_engine.Pipeline': 10})
    self.engine = engine
    # Grab the clock before setup(), matching the original initialization order.
    self.clock = engine.clock
    engine.setup()
    self.sp = SignalProcessor(engine)
    self.pipeline = Pipeline.obj
def test_init(self):
    """The disabled middleware class is skipped and logged at DEBUG level."""
    manager = TestMiddlewareManager(get_engine())
    self.assertListEqual(
        [mw.__class__ for mw in manager.middlewares], [M1, M2])
    expected = ("[crawlmi] DEBUG: Disabled "
                "<class 'crawlmi.tests.test_middleware_manager.MOff'>:")
    self.assertEqual(self.lw.get_first_line(), expected)
def test_decode_chunked_transfer(self):
    """Hex chunk sizes and CRLF delimiters are stripped; payloads concatenated."""
    ct = ChunkedTransfer(get_engine())
    # Each chunk: <hex length>\r\n<payload>\r\n, terminated by a zero chunk.
    chunks = (
        '25\r\n' + 'This is the data in the first chunk\r\n\r\n',
        '1C\r\n' + 'and this is the second one\r\n\r\n',
        '3\r\n' + 'con\r\n',
        '8\r\n' + 'sequence\r\n',
        '0\r\n\r\n',
    )
    body = ct._decode_chunked_transfer(''.join(chunks))
    expected = ('This is the data in the first chunk\r\n'
                'and this is the second one\r\n'
                'consequence')
    self.assertEqual(body, expected)
def test_process_request(self):
    """Duplicate URLs are dropped until the filter is explicitly cleared."""
    engine = get_engine()
    mw = DuplicateFilter(engine)
    first = Request('http://test.org/1')
    second = Request('http://test.org/2')
    dupe = Request('http://test.org/2')
    self.assertIs(mw.process_request(first), first)
    self.assertIs(mw.process_request(second), second)
    # Same URL as `second` -> filtered out.
    self.assertIsNone(mw.process_request(dupe))
    # Clearing the filter lets the previously-seen URL through again.
    engine.signals.send(clear_duplicate_filter)
    self.assertIs(mw.process_request(dupe), dupe)
def test_memory_stats(self):
    """Exercise MemoryStats: get/set/inc/max/min, value stats, samples, dump.

    Fix: the dump assertions checked 'test3' twice and never checked
    'test4', even though 'test4' is set via min_value above — the second
    occurrence now verifies 'test4'.
    """
    stats = MemoryStats(get_engine(STATS_DUMP=True))
    # Empty store: missing keys return None or the supplied default.
    self.assertEqual(stats.get_stats(), {})
    self.assertEqual(stats.get_value('anything'), None)
    self.assertEqual(stats.get_value('anything', 'default'), 'default')
    # Plain set/get.
    stats.set_value('test', 'value')
    self.assertEqual(stats.get_stats(), {'test': 'value'})
    stats.set_value('test2', 23)
    self.assertEqual(stats.get_stats(), {'test': 'value', 'test2': 23})
    self.assertEqual(stats.get_value('test2'), 23)
    # inc_value defaults to +1, accepts an explicit increment.
    stats.inc_value('test2')
    self.assertEqual(stats.get_value('test2'), 24)
    stats.inc_value('test2', 6)
    self.assertEqual(stats.get_value('test2'), 30)
    # max_value keeps the larger value; creates the key if missing.
    stats.max_value('test2', 6)
    self.assertEqual(stats.get_value('test2'), 30)
    stats.max_value('test2', 40)
    self.assertEqual(stats.get_value('test2'), 40)
    stats.max_value('test3', 1)
    self.assertEqual(stats.get_value('test3'), 1)
    # min_value keeps the smaller value; creates the key if missing.
    stats.min_value('test2', 60)
    self.assertEqual(stats.get_value('test2'), 40)
    stats.min_value('test2', 35)
    self.assertEqual(stats.get_value('test2'), 35)
    stats.min_value('test4', 7)
    self.assertEqual(stats.get_value('test4'), 7)
    # add_value accumulates weighted statistics under one key.
    stats.add_value('stats', 3)
    stats.add_value('stats', 2, 2.0)
    statistics = stats.get_value('stats')
    self.assertTrue(eq(statistics.average, 7.0 / 3.0), statistics.average)
    # 'test4' holds a plain value, so add_value on it must fail.
    self.assertRaises(RuntimeError, stats.add_value, 'test4', 1)
    # add_sample records (value, data) pairs under one key.
    stats.add_sample('samples', 3, 'hello')
    stats.add_sample('samples', 2, 'world')
    samples = stats.get_value('samples')
    self.assertEqual(len(samples), 2)
    self.assertListEqual(samples.samples, [(3, 'hello'), (2, 'world')])
    self.assertRaises(RuntimeError, stats.add_value, 'test4', 5, '!')
    # Dumping writes every stored key to the log.
    stats.dump_stats()
    logged = self.lw.get_first_line(clear=False)
    self.assertTrue(logged.startswith('[crawlmi] INFO: Dumping crawlmi stats:'))
    logged = self.lw.get_logged()
    self.assertIn('test', logged)
    self.assertIn('test2', logged)
    self.assertIn('test3', logged)
    self.assertIn('test4', logged)
    self.assertIn('stats', logged)
    self.assertIn('samples', logged)
def test_dummy_stats(self):
    """DummyStats accepts every API call but never records anything."""
    stats = DummyStats(get_engine())
    self.assertEqual(stats.get_stats(), {})
    self.assertEqual(stats.get_value('anything'), None)
    self.assertEqual(stats.get_value('anything', 'default'), 'default')
    # None of the mutating calls below may raise or store anything.
    stats.set_value('test', 'value')
    stats.inc_value('v1')
    stats.max_value('v2', 100)
    stats.min_value('v3', 100)
    stats.set_value('test', 'value')
    stats.add_value('stats', 100)
    stats.add_value('stats', 100, 12)
    stats.add_sample('samples', 3, 'hello')
    self.assertEqual(stats.get_stats(), {})
    stats.dump_stats()
def test_tags(self):
    """`df_tag` meta keys namespace the filter; clearing affects one tag only."""
    engine = get_engine()
    mw = DuplicateFilter(engine)
    tagged1 = Request('http://test.org/', meta={'df_tag': '1'})
    tagged2 = Request('http://test.org/', meta={'df_tag': '2'})
    tagged2_dupe = Request('http://test.org/', meta={'df_tag': '2'})
    self.assertIs(mw.process_request(tagged1), tagged1)
    self.assertIs(mw.process_request(tagged2), tagged2)
    self.assertIsNone(mw.process_request(tagged2_dupe))
    # Clearing only tag '2' leaves tag '1' entries filtered.
    engine.signals.send(clear_duplicate_filter, df_tag='2')
    self.assertIsNone(mw.process_request(tagged1))
    self.assertIs(mw.process_request(tagged2), tagged2)
    self.assertIsNone(mw.process_request(tagged2_dupe))
def setUp(self):
    """Create a Redirect middleware backed by a default engine."""
    engine = get_engine()
    self.mw = Redirect(engine)
def _get_pm(self, *mw_classes):
    """Build a PipelineManager over a fresh engine with the given classes."""
    engine = get_engine()
    return PipelineManager(engine, mw_classes=mw_classes)
def setUp(self):
    """Create an HttpCompression middleware backed by a default engine."""
    engine = get_engine()
    self.mw = HttpCompression(engine)
def _get_engine(self, **kwargs):
    """Create an engine and remember its stats object on the test case."""
    engine = get_engine(**kwargs)
    # Keep a handle on the stats so individual tests can assert on them.
    self.stats = engine.stats
    return engine
def test_init2(self):
    """With explicit mw_classes, disabled classes are still dropped."""
    manager = TestMiddlewareManager(get_engine(), mw_classes=[M1, M2, MOff])
    loaded = [mw.__class__ for mw in manager.middlewares]
    self.assertListEqual(loaded, [M1, M2])
def setUp(self):
    """Create a Cookies middleware backed by a default engine."""
    engine = get_engine()
    self.mw = Cookies(engine)
def test_config(self):
    """LogStats refuses to start when the interval setting is 0."""
    engine = get_engine(LOG_STATS_INTERVAL=0)
    self.assertRaises(NotConfigured, LogStats, engine)
def test_empty_list(self):
    """With an empty configured list, the built-in default agents are used."""
    mw = RandomUserAgent(get_engine(RANDOM_USER_AGENT_LIST=[]))
    # there should be many default user agents
    self.assertGreater(len(mw.user_agents), 10)
def setUp(self):
    """Create a MetaRefreshRedirect middleware backed by a default engine."""
    engine = get_engine()
    self.mw = MetaRefreshRedirect(engine)
def setUp(self):
    """Create a Retry middleware capped at 2 retries to keep tests short."""
    self.mw = Retry(get_engine())
    self.mw.max_retry_times = 2
def _get_engine(self, **new_settings):
    """Engine built from the test's base settings plus any overrides."""
    settings = self._get_settings(**new_settings)
    return get_engine(settings)
def setUp(self):
    """Create a Canonical middleware backed by a default engine."""
    engine = get_engine()
    self.mw = Canonical(engine)
def setUp(self):
    """Set up an engine whose pipeline is the module's test Pipeline."""
    engine = get_engine(
        LOG_ENABLED=False,
        PIPELINE_BASE={"crawlmi.tests.test_engine.Pipeline": 10})
    self.engine = engine
    # Read the clock before setup(), preserving the original call order.
    self.clock = engine.clock
    engine.setup()
    self.sp = SignalProcessor(engine)
    self.pipeline = Pipeline.obj
def setUp(self):
    """Snapshot the middleware's default headers as {name: [value]}."""
    self.dh = DefaultHeaders(get_engine())
    # Header objects store lists of values; wrap each default accordingly.
    self.defaults = dict(
        (name, [value]) for name, value in self.dh.headers.iteritems())
def test_basic(self):
    """Extensions load in order, are indexable by name; disabled ones absent."""
    em = ExtensionManager(get_engine(), mw_classes=[E1, E2, EOff])
    loaded = [ext.__class__ for ext in em.middlewares]
    self.assertListEqual(loaded, [E1, E2])
    self.assertIsInstance(em['e1'], E1)
    # The disabled extension is not registered under any key.
    self.assertRaises(KeyError, em.__getitem__, 'eoff')