Example #1
    def new_crawler(self, node_id, apikeys, config, crawler_proxies=None):
        # Every crawler writes its output to disk through a FileHandler.
        file_handler_config = {
            "name": "FileHandler",
            "args": {
                "output_folder": config["output"]
            }
        }

        # The Twitter app key doubles as the crawler's unique id.
        crawler_id = apikeys['app_key']
        logger.debug('creating a new crawler: %s' % crawler_id)
        if not crawler_proxies:
            crawler_proxies = next(self.proxy_generator) if self.proxy_generator else None

        crawler = UserRelationshipCrawler(node_id,
                                          crawler_id,
                                          copy.copy(apikeys),
                                          handlers=[create_handler(file_handler_config)],
                                          redis_config=copy.copy(config['redis_config']),
                                          proxies=crawler_proxies)

        # Replace any crawler already registered under this id.
        if crawler_id in self.crawlers:
            del self.crawlers[crawler_id]

        self.crawlers[crawler_id] = {
            'apikeys': apikeys,
            'crawler': crawler,
            'crawler_queue': CrawlerQueue(self.node_id,
                                          crawler_id,
                                          redis_config=copy.copy(config['redis_config'])),
            'crawler_proxies': crawler_proxies
        }
        crawler.start()
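
A minimal sketch of how new_crawler might be driven, assuming a manager class that owns self.crawlers and self.proxy_generator; the CrawlerManager name and the concrete values are hypothetical, only the "output" and "redis_config" keys are taken from the method above:

config = {
    "output": "/data/crawler_output",  # hypothetical path
    "redis_config": {"host": "127.0.0.1", "port": 6379, "db": 0},
}
apikeys = {
    "app_key": "YOUR_APP_KEY",  # doubles as crawler_id
    "app_secret": "YOUR_APP_SECRET",
    "oauth_token": "YOUR_OAUTH_TOKEN",
    "oauth_token_secret": "YOUR_OAUTH_TOKEN_SECRET",
}

manager = CrawlerManager("node-1")              # hypothetical owner of new_crawler()
manager.new_crawler("node-1", apikeys, config)  # registers and starts the crawler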
Example #2
import logging
import multiprocessing as mp

# CrawlerQueue is provided by the host project: a Redis-backed queue shared
# across processes (it replaces an earlier mp.Queue-based design).
logger = logging.getLogger(__name__)


class CrawlerProcess(mp.Process):
    def __init__(self, node_id, crawler_id, redis_config, handlers):
        super(CrawlerProcess, self).__init__()
        self.node_id = node_id
        self.crawler_id = crawler_id
        self.redis_config = redis_config

        # Start from a clean, empty queue for this crawler.
        self.crawler_queue = CrawlerQueue(node_id,
                                          crawler_id,
                                          redis_config=redis_config)
        self.crawler_queue.clear()
        self.handlers = handlers
        logger.debug("number of handlers attached: %d" % len(handlers))

    def get_crawler_id(self):
        return self.crawler_id

    def enqueue(self, request):
        self.crawler_queue.put(request)
        return True

    def get_cmd(self):
        # Blocks until a command is available on the shared queue.
        return self.crawler_queue.get(block=True)

    def get_queue_size(self):
        return self.crawler_queue.qsize()

    def run(self):
        # Concrete crawlers override run() with their main loop.
        pass
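
Because run() is left as a stub, concrete crawlers are expected to subclass CrawlerProcess and implement the main loop themselves. A minimal sketch of such a subclass, assuming each queued command is a dict with a 'cmd' field; the TERMINATE sentinel and the handler append() call are assumptions, not the project's confirmed API:

class EchoCrawler(CrawlerProcess):
    def run(self):
        # Drain the Redis-backed queue until a termination command arrives.
        while True:
            cmd = self.get_cmd()               # blocks until a command is available
            if cmd.get('cmd') == 'TERMINATE':  # hypothetical sentinel command
                break
            for handler in self.handlers:
                handler.append(cmd)            # handler API assumed, not confirmed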
Example #3
    def new_crawler(self, node_id, apikeys, config, crawler_proxies=None):
        file_handler_config = {
            "name": "FileHandler",
            "args": {
                "output_folder": config["output"]
            }
        }
        mongo_handler_config = {
            "name": "MongoDBHandler",
            "args": {
                "config": config["mongodb"]
            }
        }

        crawler_id = apikeys['app_key']
        logger.debug('creating a new crawler: %s' % crawler_id)
        if not crawler_proxies:
            crawler_proxies = next(
                self.proxy_generator) if self.proxy_generator else None

        crawler = TwitterCrawler(node_id,
                                 crawler_id,
                                 copy.copy(apikeys),
                                 handlers=[
                                     create_handler(file_handler_config),
                                     create_handler(mongo_handler_config)
                                 ],
                                 redis_config=copy.copy(
                                     config['redis_config']),
                                 proxies=crawler_proxies)

        # Replace any crawler already registered under this id.
        if crawler_id in self.crawlers:
            del self.crawlers[crawler_id]

        self.crawlers[crawler_id] = {
            'apikeys': apikeys,
            'crawler': crawler,
            'crawler_queue': CrawlerQueue(self.node_id,
                                          crawler_id,
                                          redis_config=copy.copy(config['redis_config'])),
            'crawler_proxies': crawler_proxies
        }
        crawler.start()
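
The next(self.proxy_generator) calls above imply a generator that hands out one proxy configuration per new crawler. A sketch of one way to build it with itertools.cycle (round-robin over a fixed list; the proxy dict shape follows the requests-library convention and is an assumption here):

import itertools

def make_proxy_generator(proxies):
    # Cycle through the proxy list so each new crawler gets the next entry,
    # wrapping around when the list is exhausted.
    return itertools.cycle(proxies)

proxies = [
    {'http': 'http://10.0.0.1:8080', 'https': 'http://10.0.0.1:8080'},
    {'http': 'http://10.0.0.2:8080', 'https': 'http://10.0.0.2:8080'},
]
proxy_generator = make_proxy_generator(proxies)
first = next(proxy_generator)    # first proxy dict
second = next(proxy_generator)   # second; cycles back to the first afterwards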