def test_disabled_handler(self):
    """A scheme mapped to None is dropped entirely by DownloadHandlers."""
    handlers = {"scheme": None}
    crawler = get_crawler(settings_dict={"DOWNLOAD_HANDLERS": handlers})
    dh = DownloadHandlers(crawler)
    self.assertNotIn("scheme", dh._schemes)
    # Handlers are instantiated lazily; request each one explicitly.
    for scheme in handlers:
        dh._get_handler(scheme)
    self.assertNotIn("scheme", dh._handlers)
    self.assertIn("scheme", dh._notconfigured)
def test_not_configured_handler(self):
    """A handler class path that fails to configure lands in _notconfigured."""
    handlers = {'scheme': 'tests.test_downloader_handlers.OffDH'}
    settings = {'DOWNLOAD_HANDLERS': handlers}
    dh = DownloadHandlers(get_crawler(settings_dict=settings))
    self.assertIn('scheme', dh._schemes)
    for scheme in handlers:  # force load handlers
        dh._get_handler(scheme)
    self.assertNotIn('scheme', dh._handlers)
    self.assertIn('scheme', dh._notconfigured)
def test_not_configured_handler(self):
    # NOTE(review): a second `test_not_configured_handler` appears elsewhere in
    # this file; if both live in the same TestCase the later definition shadows
    # the earlier one — confirm and rename one of them.
    handlers = {'scheme': 'tests.test_downloader_handlers.OffDH'}
    crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
    dh = DownloadHandlers(crawler)
    self.assertIn('scheme', dh._schemes)
    # Trigger lazy instantiation of every configured handler.
    for scheme in handlers:
        dh._get_handler(scheme)
    self.assertNotIn('scheme', dh._handlers)
    self.assertIn('scheme', dh._notconfigured)
def test_lazy_handlers(self):
    """A lazy handler is registered in _schemes but only built on first use."""
    handlers = {'scheme': DummyLazyDH}
    crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
    dh = DownloadHandlers(crawler)
    self.assertIn('scheme', dh._schemes)
    # Not instantiated yet — lazy handlers wait for the first request.
    self.assertNotIn('scheme', dh._handlers)
    for scheme in handlers:
        dh._get_handler(scheme)  # force load lazy handler
    self.assertIn('scheme', dh._handlers)
    self.assertNotIn('scheme', dh._notconfigured)
def test_enabled_handler(self):
    """An enabled handler appears in both _schemes and _handlers immediately."""
    handlers = {'scheme': 'tests.test_downloader_handlers.DummyDH'}
    settings = {'DOWNLOAD_HANDLERS': handlers}
    dh = DownloadHandlers(get_crawler(settings_dict=settings))
    self.assertIn('scheme', dh._schemes)
    self.assertIn('scheme', dh._handlers)
    self.assertNotIn('scheme', dh._notconfigured)
def test_not_configured_handler(self):
    # NOTE(review): this name also appears elsewhere in the file — if the
    # definitions share a TestCase, only the last one runs; confirm and rename.
    handlers = {'scheme': OffDH}
    crawler = get_crawler(settings_dict={'DOWNLOAD_HANDLERS': handlers})
    dh = DownloadHandlers(crawler)
    self.assertIn('scheme', dh._schemes)
    # Passing the class directly: it is recognised but never becomes active.
    self.assertNotIn('scheme', dh._handlers)
    self.assertIn('scheme', dh._notconfigured)
def __init__(self, crawler):
    """Set up downloader state, concurrency limits and the slot GC loop."""
    settings = crawler.settings
    self.settings = settings
    self.signals = crawler.signals
    self.slots = {}
    self.active = set()
    self.handlers = DownloadHandlers(crawler)
    # Concurrency knobs, all read once from the crawler settings.
    self.total_concurrency = settings.getint('CONCURRENT_REQUESTS')
    self.domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self.ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')
    self.randomize_delay = settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
    self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
    # Garbage-collect idle slots once a minute.
    self._slot_gc_loop = task.LoopingCall(self._slot_gc)
    self._slot_gc_loop.start(60)
def __init__(self, crawler):
    """Initialise per-spider downloader state for *crawler*.

    Per-spider containers (settings, signals, handlers, middleware) are
    dicts keyed by the spider name, matching how add_crawler() registers
    additional crawlers later.
    """
    name = crawler.spider.name
    # Lazy %-formatting: the message string is only built when DEBUG is on.
    logger.debug("%s call the downloader", name)
    # Dict literals instead of dict() + immediate key assignment.
    self.settings = {name: crawler.settings}
    self.signals = {name: crawler.signals}
    self.slots = {}
    # Exercise the MySQL connection and select the configured database.
    self.conn = getConn()
    cur = self.conn.cursor()
    self.conn.select_db(conf.get("mysql", "db"))
    self.conn.commit()
    cur.close()  # cursor is never used — presumably a connection check; TODO confirm
    self.actives = pack_dict()
    self.handlers = {name: DownloadHandlers(crawler)}
    settings = self.settings[name]
    self.total_concurrency = settings.getint('CONCURRENT_REQUESTS')
    self.domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self.ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')
    self.middleware = {name: DownloaderMiddlewareManager.from_crawler(crawler)}
    # Garbage-collect idle slots once a minute.
    self._slot_gc_loop = task.LoopingCall(self._slot_gc)
    self._slot_gc_loop.start(60)
    self.rememberseq = dict()
    self.wastedtime = 0
    mlist = settings.get("TO_CHANGE")
    if mlist is not None:
        logger.debug("getIt~!!")
        DefaultInfo.printinfo(mlist)
        self.rememberseq = mlist
        # O(n) join instead of quadratic `out += ...` concatenation in a loop;
        # output format ("key:value:" per entry) is unchanged.
        out = "".join("%s:%s:" % (k, mlist[k]) for k in mlist.keys())
        self.insert_end("", out, str(datetime.datetime.now()))
    propor_dict = settings.get("SPIDER_PROPORTION")
    if propor_dict is not None:
        logger.debug("get proportion dict %s", propor_dict)
        self.proportion_dict = propor_dict
def test_disabled_handler(self):
    """A scheme disabled with None shows up in neither registry."""
    settings = {'DOWNLOAD_HANDLERS': {'scheme': None}}
    dh = DownloadHandlers(get_crawler(settings))
    self.assertNotIn('scheme', dh._handlers)
    self.assertNotIn('scheme', dh._notconfigured)
def test_not_configured_handler(self):
    """A handler that fails to configure is tracked in _notconfigured only."""
    settings = {'DOWNLOAD_HANDLERS': {'scheme': 'tests.test_downloader_handlers.OffDH'}}
    dh = DownloadHandlers(get_crawler(settings))
    self.assertNotIn('scheme', dh._handlers)
    self.assertIn('scheme', dh._notconfigured)
def add_crawler(self, crawler):
    """Register another crawler's settings, signals, handlers and middleware
    under its spider's name."""
    name = crawler.spider.name
    self.settings[name] = crawler.settings
    self.signals[name] = crawler.signals
    self.handlers[name] = DownloadHandlers(crawler)
    self.middleware[name] = DownloaderMiddlewareManager.from_crawler(crawler)