def __init__(self, node_id, crawler_id, redis_config, handlers):
    super(CrawlerProcess, self).__init__()
    self.node_id = node_id
    self.crawler_id = crawler_id
    self.redis_config = redis_config
    # self.queue = mp.Queue(maxsize=MAX_QUEUE_SIZE)
    self.crawler_queue = CrawlerQueue(node_id, crawler_id, redis_config=redis_config)
    self.crawler_queue.clear()
    # self.lock = mp.Lock()
    self.handlers = handlers
    logger.debug("number of handlers attached: %d" % (len(handlers)))
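# The constructor above swaps the commented-out in-process mp.Queue for a
# CrawlerQueue keyed by (node_id, crawler_id), so queued commands live in Redis
# rather than in process memory. Below is a minimal, hedged sketch of what such
# a Redis-backed queue could look like; the key naming, JSON serialization, and
# redis-py usage are assumptions for illustration, not the package's actual
# implementation.
import json

import redis


class CrawlerQueueSketch(object):

    def __init__(self, node_id, crawler_id, redis_config=None):
        redis_config = redis_config or {}
        self.key = "crawler_queue:%s:%s" % (node_id, crawler_id)
        self.conn = redis.StrictRedis(host=redis_config.get("host", "localhost"),
                                      port=redis_config.get("port", 6379),
                                      db=redis_config.get("db", 0))

    def clear(self):
        # Drop any commands left over from a previous run.
        self.conn.delete(self.key)

    def put(self, request):
        self.conn.rpush(self.key, json.dumps(request))

    def get(self, block=True):
        if block:
            _, raw = self.conn.blpop(self.key)   # blocks until a command arrives
        else:
            raw = self.conn.lpop(self.key)
        return json.loads(raw) if raw is not None else None

    def qsize(self):
        return self.conn.llen(self.key)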
def new_crawler(self, node_id, apikeys, config, crawler_proxies=None):
    # Each crawler writes its output through a FileHandler rooted at config["output"].
    file_handler_config = {
        "name": "FileHandler",
        "args": {
            "output_folder": config["output"]
        }
    }

    # try:
    # crawler_id = md5('%s:%s' % (self.node_id, idx))
    # apikeys = self.config['apikeys'][apikey_list[idx]]
    crawler_id = apikeys['app_key']
    logger.debug('creating a new crawler: %s' % crawler_id)

    # Pull the next proxy set from the generator if none was passed in.
    if not crawler_proxies:
        crawler_proxies = next(self.proxy_generator) if self.proxy_generator else None

    crawler = UserRelationshipCrawler(node_id, crawler_id, copy.copy(apikeys),
                                      handlers=[create_handler(file_handler_config)],
                                      redis_config=copy.copy(config['redis_config']),
                                      proxies=crawler_proxies)

    # Replace any existing crawler registered under the same app_key.
    if crawler_id in self.crawlers:
        # self.crawlers[crawler_id].clear()
        del self.crawlers[crawler_id]

    self.crawlers[crawler_id] = {
        'apikeys': apikeys,
        'crawler': crawler,
        'crawler_queue': CrawlerQueue(self.node_id, crawler_id, redis_config=copy.copy(config['redis_config'])),
        'crawler_proxies': crawler_proxies
    }

    crawler.start()
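# create_handler() above turns a {"name": ..., "args": {...}} config dict into a
# handler object. The snippet below is a hedged, self-contained sketch of such a
# factory; FileHandlerStub and the registry are stand-ins, not the package's
# real FileHandler or its actual create_handler implementation.
class FileHandlerStub(object):

    def __init__(self, output_folder):
        self.output_folder = output_folder


HANDLER_REGISTRY = {
    "FileHandler": FileHandlerStub,
}


def create_handler_sketch(handler_config):
    handler_cls = HANDLER_REGISTRY[handler_config["name"]]
    return handler_cls(**handler_config["args"])


# e.g. create_handler_sketch({"name": "FileHandler", "args": {"output_folder": "./data"}})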
class CrawlerProcess(mp.Process):

    def __init__(self, node_id, crawler_id, redis_config, handlers):
        super(CrawlerProcess, self).__init__()
        self.node_id = node_id
        self.crawler_id = crawler_id
        self.redis_config = redis_config
        # self.queue = mp.Queue(maxsize=MAX_QUEUE_SIZE)
        self.crawler_queue = CrawlerQueue(node_id, crawler_id, redis_config=redis_config)
        self.crawler_queue.clear()
        # self.lock = mp.Lock()
        self.handlers = handlers
        logger.debug("number of handlers attached: %d" % (len(handlers)))

    def get_crawler_id(self):
        return self.crawler_id

    def enqueue(self, request):
        # self.queue.put(request, block=True)
        self.crawler_queue.put(request)
        return True

    def get_cmd(self):
        # return self.queue.get(block=True)
        return self.crawler_queue.get(block=True)

    def get_queue_size(self):
        # The original dropped the result; return it so callers can use it.
        return self.crawler_queue.qsize()

    def run(self):
        # Subclasses override run() with their crawl loop.
        pass
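# A hedged sketch of how a concrete subclass could implement run(): drain
# commands pushed via enqueue() until a terminate command arrives. The command
# vocabulary ('cmd', 'TERMINATE') is an assumption for illustration; the real
# crawlers define their own commands.
class EchoCrawlerProcess(CrawlerProcess):

    def run(self):
        while True:
            cmd = self.get_cmd()          # blocks on the Redis-backed crawler_queue
            if isinstance(cmd, dict) and cmd.get('cmd') == 'TERMINATE':
                break
            logger.info("received cmd: %s" % (cmd,))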
def new_crawler(self, node_id, apikeys, config, crawler_proxies=None):
    # Output goes both to flat files and to MongoDB, via two handlers.
    file_handler_config = {
        "name": "FileHandler",
        "args": {
            "output_folder": config["output"]
        }
    }
    mongo_handler_config = {
        "name": "MongoDBHandler",
        "args": {
            "config": config["mongodb"]
        }
    }

    crawler_id = apikeys['app_key']
    logger.debug('creating a new crawler: %s' % crawler_id)

    # Pull the next proxy set from the generator if none was passed in.
    if not crawler_proxies:
        crawler_proxies = next(self.proxy_generator) if self.proxy_generator else None

    crawler = TwitterCrawler(node_id, crawler_id, copy.copy(apikeys),
                             handlers=[create_handler(file_handler_config),
                                       create_handler(mongo_handler_config)],
                             redis_config=copy.copy(config['redis_config']),
                             proxies=crawler_proxies)

    # Replace any existing crawler registered under the same app_key.
    if crawler_id in self.crawlers:
        # self.crawlers[crawler_id].clear()
        del self.crawlers[crawler_id]

    self.crawlers[crawler_id] = {
        'apikeys': apikeys,
        'crawler': crawler,
        'crawler_queue': CrawlerQueue(self.node_id, crawler_id, redis_config=copy.copy(config['redis_config'])),
        'crawler_proxies': crawler_proxies
    }

    crawler.start()
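# A hedged sketch of the config shape new_crawler() reads above. The top-level
# keys "output", "mongodb", and "redis_config" appear in the code; the nested
# layouts, the apikeys fields, and the `scheduler` caller are assumptions for
# illustration only.
config = {
    "output": "./archive",                                           # FileHandler output folder
    "mongodb": {"connection_string": "mongodb://localhost:27017/"},  # assumed layout
    "redis_config": {"host": "localhost", "port": 6379, "db": 0},    # assumed layout
}

apikeys = {
    "app_key": "...",             # doubles as the crawler_id above
    "app_secret": "...",
    "oauth_token": "...",
    "oauth_token_secret": "...",
}

# scheduler.new_crawler(node_id, apikeys, config)   # hypothetical caller object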