Ejemplo n.º 1
0
 def test_disabled_handler(self):
     """Check the bookkeeping for a scheme whose handler is set to ``None``.

     The scheme must be missing from ``_schemes`` before any lookup, and
     after a lookup has been attempted it must be missing from
     ``_handlers`` and present in ``_notconfigured``.
     """
     download_handlers = {"scheme": None}
     settings = {"DOWNLOAD_HANDLERS": download_handlers}
     registry = DownloadHandlers(get_crawler(settings_dict=settings))
     self.assertNotIn("scheme", registry._schemes)
     # Request every configured scheme so handler resolution is forced.
     for name in download_handlers:
         registry._get_handler(name)
     self.assertNotIn("scheme", registry._handlers)
     self.assertIn("scheme", registry._notconfigured)
 def test_not_configured_handler(self):
     """Check the bookkeeping for a handler given by the ``OffDH`` import path.

     The scheme is expected in ``_schemes`` right after construction; once
     a lookup has been forced it must be absent from ``_handlers`` and
     recorded in ``_notconfigured``.
     """
     # NOTE(review): OffDH is defined outside this view; presumably it
     # refuses to be configured -- confirm against its definition.
     download_handlers = {'scheme': 'tests.test_downloader_handlers.OffDH'}
     settings = {'DOWNLOAD_HANDLERS': download_handlers}
     registry = DownloadHandlers(get_crawler(settings_dict=settings))
     self.assertIn('scheme', registry._schemes)
     # Request every configured scheme so handler loading is forced.
     for name in download_handlers:
         registry._get_handler(name)
     self.assertNotIn('scheme', registry._handlers)
     self.assertIn('scheme', registry._notconfigured)
Ejemplo n.º 3
0
 def test_not_configured_handler(self):
     """Verify where a scheme backed by ``OffDH`` ends up after a lookup.

     After construction the scheme is listed in ``_schemes``. After
     ``_get_handler`` has been called for it, it must not appear in
     ``_handlers`` and must appear in ``_notconfigured``.
     """
     scheme_map = {'scheme': 'tests.test_downloader_handlers.OffDH'}
     test_crawler = get_crawler(
         settings_dict={'DOWNLOAD_HANDLERS': scheme_map},
     )
     manager = DownloadHandlers(test_crawler)
     self.assertIn('scheme', manager._schemes)
     # Trigger loading of each configured handler.
     for key in scheme_map:
         manager._get_handler(key)
     self.assertNotIn('scheme', manager._handlers)
     self.assertIn('scheme', manager._notconfigured)
Ejemplo n.º 4
0
 def test_lazy_handlers(self):
     """Check that a ``DummyLazyDH`` handler is only loaded on first lookup.

     Before ``_get_handler`` is called the scheme is in ``_schemes`` but
     not in ``_handlers``; afterwards it is in ``_handlers`` and not in
     ``_notconfigured``.
     """
     scheme_map = {'scheme': DummyLazyDH}
     settings = {'DOWNLOAD_HANDLERS': scheme_map}
     manager = DownloadHandlers(get_crawler(settings_dict=settings))
     self.assertIn('scheme', manager._schemes)
     self.assertNotIn('scheme', manager._handlers)
     # Look each scheme up once to force the lazy handler to load.
     for key in scheme_map:
         manager._get_handler(key)
     self.assertIn('scheme', manager._handlers)
     self.assertNotIn('scheme', manager._notconfigured)
 def test_enabled_handler(self):
     """Check that a ``DummyDH`` handler is available right after construction.

     Without any explicit lookup the scheme must be in both ``_schemes``
     and ``_handlers``, and must not be in ``_notconfigured``.
     """
     scheme_map = {'scheme': 'tests.test_downloader_handlers.DummyDH'}
     test_crawler = get_crawler(
         settings_dict={'DOWNLOAD_HANDLERS': scheme_map},
     )
     manager = DownloadHandlers(test_crawler)
     self.assertIn('scheme', manager._schemes)
     self.assertIn('scheme', manager._handlers)
     self.assertNotIn('scheme', manager._notconfigured)
Ejemplo n.º 6
0
 def test_not_configured_handler(self):
     """Check the bookkeeping when the handler is the ``OffDH`` class object.

     Without any explicit lookup the scheme must be in ``_schemes``,
     absent from ``_handlers`` and present in ``_notconfigured``.
     """
     scheme_map = {'scheme': OffDH}
     settings = {'DOWNLOAD_HANDLERS': scheme_map}
     manager = DownloadHandlers(get_crawler(settings_dict=settings))
     self.assertIn('scheme', manager._schemes)
     self.assertNotIn('scheme', manager._handlers)
     self.assertIn('scheme', manager._notconfigured)
Ejemplo n.º 7
0
 def __init__(self, crawler):
     """Initialise downloader state from ``crawler``.

     Keeps references to the crawler's settings and signals, creates the
     empty slot and active-request containers, builds the download
     handlers and the downloader middleware manager, reads the
     concurrency and delay options from the settings, and starts a
     looping call of ``self._slot_gc`` with an interval argument of 60.
     """
     settings = crawler.settings
     self.settings = settings
     self.signals = crawler.signals
     self.slots = {}
     self.active = set()
     self.handlers = DownloadHandlers(crawler)
     # Limits and delay behaviour all come from the crawler's settings.
     self.total_concurrency = settings.getint('CONCURRENT_REQUESTS')
     self.domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
     self.ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')
     self.randomize_delay = settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
     self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
     gc_loop = task.LoopingCall(self._slot_gc)
     self._slot_gc_loop = gc_loop
     gc_loop.start(60)
Ejemplo n.º 8
0
    def __init__(self, crawler):
        """Initialise downloader state, keyed by ``crawler.spider.name``.

        The crawler's settings, signals, download handlers and downloader
        middleware are each stored in a dict under the spider's name.
        Concurrency limits are read from this crawler's settings. A
        connection object is obtained from ``getConn()`` and switched to
        the database named by the ``mysql``/``db`` option of ``conf``.

        If the ``TO_CHANGE`` setting is present it replaces
        ``self.rememberseq`` and a ``key:value:`` summary of it is passed
        to ``self.insert_end``. ``self.proportion_dict`` is assigned only
        when the ``SPIDER_PROPORTION`` setting is present.
        """
        name = crawler.spider.name
        settings = crawler.settings
        # Lazy %-style arguments: the text is only built if DEBUG is enabled.
        logger.debug("%s call the downloader", name)
        self.settings = {name: settings}
        self.signals = {name: crawler.signals}
        self.slots = {}
        self.conn = getConn()
        cur = self.conn.cursor()
        try:
            self.conn.select_db(conf.get("mysql", "db"))
            self.conn.commit()
        finally:
            # Close the cursor even when selecting the database fails.
            cur.close()
        self.actives = pack_dict()
        self.handlers = {name: DownloadHandlers(crawler)}
        self.total_concurrency = settings.getint('CONCURRENT_REQUESTS')
        self.domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self.ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')
        self.middleware = {name: DownloaderMiddlewareManager.from_crawler(crawler)}
        self._slot_gc_loop = task.LoopingCall(self._slot_gc)
        # NOTE(review): if ``task`` is twisted.internet.task, start() runs
        # the callable immediately by default, i.e. before the attributes
        # assigned below exist -- confirm _slot_gc does not read them.
        self._slot_gc_loop.start(60)
        self.rememberseq = {}
        self.wastedtime = 0
        to_change = settings.get("TO_CHANGE")
        if to_change is not None:
            logger.debug("getIt~!!")
            DefaultInfo.printinfo(to_change)
            self.rememberseq = to_change
            # Build "key:value:key:value:" in one pass instead of repeated +=.
            summary = "".join(
                key + ":" + str(value) + ":" for key, value in to_change.items()
            )
            self.insert_end("", summary, str(datetime.datetime.now()))

        proportions = settings.get("SPIDER_PROPORTION")
        if proportions is not None:
            logger.debug("get proportion dict %s", proportions)
            self.proportion_dict = proportions
Ejemplo n.º 9
0
 def test_disabled_handler(self):
     """Check that a scheme mapped to ``None`` is not tracked at all.

     Right after construction the scheme must be in neither ``_handlers``
     nor ``_notconfigured``.
     """
     scheme_map = {'scheme': None}
     test_crawler = get_crawler({'DOWNLOAD_HANDLERS': scheme_map})
     manager = DownloadHandlers(test_crawler)
     self.assertNotIn('scheme', manager._handlers)
     self.assertNotIn('scheme', manager._notconfigured)
Ejemplo n.º 10
0
 def test_not_configured_handler(self):
     """Check the bookkeeping for a handler given by the ``OffDH`` import path.

     Right after construction the scheme must be absent from ``_handlers``
     and present in ``_notconfigured``.
     """
     scheme_map = {'scheme': 'tests.test_downloader_handlers.OffDH'}
     test_crawler = get_crawler({'DOWNLOAD_HANDLERS': scheme_map})
     manager = DownloadHandlers(test_crawler)
     self.assertNotIn('scheme', manager._handlers)
     self.assertIn('scheme', manager._notconfigured)
Ejemplo n.º 11
0
 def add_crawler(self, crawler):
     """Register ``crawler``'s components under its spider's name.

     Stores the crawler's settings and signals, plus a new
     ``DownloadHandlers`` and a new downloader middleware manager built
     from the crawler, in the matching per-spider dicts. An existing
     entry under the same name is overwritten.
     """
     name = crawler.spider.name
     self.settings[name] = crawler.settings
     self.signals[name] = crawler.signals
     self.handlers[name] = DownloadHandlers(crawler)
     manager = DownloaderMiddlewareManager.from_crawler(crawler)
     self.middleware[name] = manager