def setUp(self):
     """Build the crawler, spider and downloader middleware manager for a test.

     The created objects are stored on the test case as ``crawler``,
     ``spider`` and ``mwman``. Returns whatever ``mwman.open_spider``
     returns for the new spider.
     """
     crawler = get_crawler(Spider, self.settings_dict)
     self.crawler = crawler
     spider = crawler._create_spider('foo')
     self.spider = spider
     manager = DownloaderMiddlewareManager.from_crawler(crawler)
     self.mwman = manager
     # The stats collector is opened first because some middlewares rely on it.
     crawler.stats.open_spider(spider)
     return manager.open_spider(spider)
 def setUp(self):
     """Prepare ``crawler``, ``spider`` and ``mwman`` fixtures on the test case.

     The spider is opened on the crawler's stats collector and then on the
     middleware manager; the result of the latter call is returned.
     """
     self.crawler = crawler = get_crawler(Spider, self.settings_dict)
     self.spider = spider = crawler._create_spider('foo')
     self.mwman = mwman = DownloaderMiddlewareManager.from_crawler(crawler)
     # Open stats before the manager: some middlewares depend on the
     # stats collector.
     crawler.stats.open_spider(spider)
     opened = mwman.open_spider(spider)
     return opened
Example #3
0
    def start_query_request(self, cursor=None):
        """
        Yield the search request for the current query.

        When ``cursor`` is truthy it is URL-quoted and appended to the URL
        template as a ``cursor`` parameter. After the request has been
        yielded the issued-search counter is incremented; on every 100th
        search the downloader middleware manager is rebuilt and an extra
        request that updates the cookies is yielded.
        """
        if cursor:
            template = self.url + "&cursor={cursor}"
            fields = {"query": quote(self.query), "cursor": quote(cursor)}
        else:
            template = self.url
            fields = {"query": quote(self.query)}
        url = template.format(**fields)

        yield http.Request(url,
                           callback=self.parse_result_page,
                           cookies=self.cookies,
                           headers=self.headers)

        self.num_search_issued += 1
        if self.num_search_issued % 100 != 0:
            return

        # get new SeleniumMiddleware: close the existing ones, then rebuild
        # the whole downloader middleware manager from the crawler.
        downloader = self.crawler.engine.downloader
        for middleware in downloader.middleware.middlewares:
            if not isinstance(middleware, SeleniumMiddleware):
                continue
            middleware.spider_closed()
        downloader.middleware = DownloaderMiddlewareManager.from_crawler(
            self.crawler)

        # update cookies
        yield SeleniumRequest(url="https://twitter.com/explore",
                              callback=self.update_cookies,
                              dont_filter=True)
Example #4
0
 def __init__(self, crawler):
     """Initialise downloader state from ``crawler``.

     Keeps the crawler's settings and signals, creates the download
     handlers and the downloader middleware manager, reads the concurrency
     and delay options from the settings, and starts the looping call that
     runs ``self._slot_gc``.
     """
     settings = crawler.settings
     self.settings = settings
     self.signals = crawler.signals
     self.slots = {}
     self.active = set()
     self.handlers = DownloadHandlers(crawler)

     # Concurrency limits and delay behaviour come from the settings.
     self.total_concurrency = settings.getint('CONCURRENT_REQUESTS')
     self.domain_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
     self.ip_concurrency = settings.getint('CONCURRENT_REQUESTS_PER_IP')
     self.randomize_delay = settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')

     self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)

     # The loop is stored on the instance before it is started, matching the
     # original order; 60 is the interval handed to LoopingCall.start.
     gc_loop = task.LoopingCall(self._slot_gc)
     self._slot_gc_loop = gc_loop
     gc_loop.start(60)
Example #5
0
    def __init__(self, crawler):
        """
        Initialise a downloader that keeps its state per spider name.

        Settings, signals, download handlers and the middleware manager are
        stored in dicts keyed by ``crawler.spider.name``. A database
        connection is obtained and the database named by the ``mysql``/``db``
        config entry is selected. The ``TO_CHANGE`` and ``SPIDER_PROPORTION``
        settings are optional and are only applied when they are not None.
        """
        name = crawler.spider.name
        logger.debug(name + " call the downloader")

        spider_settings = crawler.settings
        self.settings = {name: spider_settings}
        self.signals = {name: crawler.signals}
        self.slots = {}

        # Database connection: a cursor is opened and closed around the
        # select_db/commit calls, as in the original sequence.
        self.conn = getConn()
        cursor = self.conn.cursor()
        self.conn.select_db(conf.get("mysql", "db"))
        self.conn.commit()
        cursor.close()

        self.actives = pack_dict()
        self.handlers = {name: DownloadHandlers(crawler)}

        # Concurrency limits are read from this crawler's settings.
        self.total_concurrency = spider_settings.getint('CONCURRENT_REQUESTS')
        self.domain_concurrency = spider_settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
        self.ip_concurrency = spider_settings.getint('CONCURRENT_REQUESTS_PER_IP')

        self.middleware = {name: DownloaderMiddlewareManager.from_crawler(crawler)}

        self._slot_gc_loop = task.LoopingCall(self._slot_gc)
        self._slot_gc_loop.start(60)

        self.rememberseq = dict()
        self.wastedtime = 0

        to_change = spider_settings.get("TO_CHANGE")
        if to_change is not None:
            logger.debug("getIt~!!")
            DefaultInfo.printinfo(to_change)
            self.rememberseq = to_change
            # Serialise the mapping as "key:value:" pairs joined together.
            serialized = "".join(
                key + ":" + str(to_change[key]) + ":"
                for key in to_change.keys()
            )
            self.insert_end("", serialized, str(datetime.datetime.now()))

        proportions = spider_settings.get("SPIDER_PROPORTION")
        if proportions is not None:
            logger.debug("get proportion dict " + str(proportions))
            # NOTE(review): proportion_dict is only set when the setting is
            # present — confirm readers of this attribute handle its absence.
            self.proportion_dict = proportions
Example #6
0
 def add_crawler(self, crawler):
     """Register ``crawler`` under its spider's name.

     Stores the crawler's settings and signals, and creates new download
     handlers and a new downloader middleware manager for it, in the
     per-spider dicts keyed by ``crawler.spider.name``.
     """
     name = crawler.spider.name
     self.settings[name] = crawler.settings
     self.signals[name] = crawler.signals
     handlers = DownloadHandlers(crawler)
     self.handlers[name] = handlers
     manager = DownloaderMiddlewareManager.from_crawler(crawler)
     self.middleware[name] = manager