def setUp(self):
    self.crawler = get_crawler(Spider, self.settings_dict)
    self.spider = self.crawler._create_spider('foo')
    self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
    # some mw depends on stats collector
    self.crawler.stats.open_spider(self.spider)
    return self.mwman.open_spider(self.spider)
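A plausible tearDown counterpart for the fixture above (the body is an assumption mirroring setUp, not taken from the source): it closes the stats collector and the middleware manager for the spider.

def tearDown(self):
    # assumed mirror of setUp: close stats first, then the middleware manager
    self.crawler.stats.close_spider(self.spider, '')
    return self.mwman.close_spider(self.spider)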
def start_query_request(self, cursor=None):
    """Generate the search request."""
    if cursor:
        url = self.url + "&cursor={cursor}"
        url = url.format(query=quote(self.query), cursor=quote(cursor))
    else:
        url = self.url.format(query=quote(self.query))
    request = http.Request(url, callback=self.parse_result_page,
                           cookies=self.cookies, headers=self.headers)
    yield request

    self.num_search_issued += 1
    if self.num_search_issued % 100 == 0:
        # get new SeleniumMiddleware
        for m in self.crawler.engine.downloader.middleware.middlewares:
            if isinstance(m, SeleniumMiddleware):
                m.spider_closed()
        self.crawler.engine.downloader.middleware = \
            DownloaderMiddlewareManager.from_crawler(self.crawler)
        # update cookies
        yield SeleniumRequest(url="https://twitter.com/explore",
                              callback=self.update_cookies, dont_filter=True)
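The update_cookies callback targeted by the SeleniumRequest above is not shown; a hypothetical sketch, assuming scrapy-selenium exposes the WebDriver on response.request.meta['driver'], could refresh the stored cookies like this:

def update_cookies(self, response):
    # hypothetical body: read cookies back from the Selenium driver and
    # reuse them for the subsequent plain HTTP search requests
    driver = response.request.meta['driver']
    self.cookies = {c['name']: c['value'] for c in driver.get_cookies()}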
def __init__(self, crawler):
    self.settings = crawler.settings
    self.signals = crawler.signals
    self.slots = {}
    self.active = set()
    self.handlers = DownloadHandlers(crawler)
    self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
    self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
    self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
    self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
    self._slot_gc_loop = task.LoopingCall(self._slot_gc)
    self._slot_gc_loop.start(60)
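The LoopingCall above periodically invokes self._slot_gc, which is not shown here; a minimal sketch modeled on Scrapy's Downloader._slot_gc (the exact body may differ between versions) removes slots that have sat idle past the given age:

def _slot_gc(self, age=60):
    # assumes: from time import time
    # drop download slots with no active requests that have been idle
    # longer than `age` seconds
    mintime = time() - age
    for key, slot in list(self.slots.items()):
        if not slot.active and slot.lastseen + slot.delay < mintime:
            self.slots.pop(key).close()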
def __init__(self, crawler):
    logger.debug(crawler.spider.name + " calls the downloader")
    # per-spider components, keyed by spider name
    self.settings = dict()
    self.settings[crawler.spider.name] = crawler.settings
    self.signals = dict()
    self.signals[crawler.spider.name] = crawler.signals
    self.slots = {}
    # open the MySQL connection and select the configured database
    self.conn = getConn()
    cur = self.conn.cursor()
    self.conn.select_db(conf.get("mysql", "db"))
    self.conn.commit()
    cur.close()
    self.actives = pack_dict()
    self.handlers = dict()
    self.handlers[crawler.spider.name] = DownloadHandlers(crawler)
    self.total_concurrency = self.settings[crawler.spider.name].getint('CONCURRENT_REQUESTS')
    self.domain_concurrency = self.settings[crawler.spider.name].getint('CONCURRENT_REQUESTS_PER_DOMAIN')
    self.ip_concurrency = self.settings[crawler.spider.name].getint('CONCURRENT_REQUESTS_PER_IP')
    self.middleware = dict()
    self.middleware[crawler.spider.name] = DownloaderMiddlewareManager.from_crawler(crawler)
    self._slot_gc_loop = task.LoopingCall(self._slot_gc)
    self._slot_gc_loop.start(60)
    self.rememberseq = dict()
    self.wastedtime = 0
    # remember the TO_CHANGE mapping from the settings and record a
    # serialized copy via insert_end
    mlist = self.settings[crawler.spider.name].get("TO_CHANGE")
    if mlist is not None:
        logger.debug("getIt~!!")
        DefaultInfo.printinfo(mlist)
        self.rememberseq = mlist
        out = ""
        for k in mlist.keys():
            out += k + ":" + str(mlist[k]) + ":"
        self.insert_end("", out, str(datetime.datetime.now()))
    # optional per-spider proportion mapping
    propor_dict = self.settings[crawler.spider.name].get("SPIDER_PROPORTION")
    if propor_dict is not None:
        logger.debug("get proportion dict " + str(propor_dict))
        self.proportion_dict = propor_dict
def add_crawler(self, crawler):
    # register another crawler's per-spider components under its spider name
    self.settings[crawler.spider.name] = crawler.settings
    self.signals[crawler.spider.name] = crawler.signals
    self.handlers[crawler.spider.name] = DownloadHandlers(crawler)
    self.middleware[crawler.spider.name] = DownloaderMiddlewareManager.from_crawler(crawler)