import json
import logging

import gevent


def schedule():
    # Dispatcher loop: drain filtered tasks from Redis (or the in-memory
    # fallback buffer) and push them onto the Celery request queues.
    global filtered_newtasks
    while True:
        try:
            # Throttle: only dispatch while every downstream queue has room.
            if (get_message_queue_size('parse') < config.max_task_queue_size * 2
                    and get_message_queue_size('pipeline') < config.max_task_queue_size * 2
                    and get_message_queue_size('schedule') < config.max_task_queue_size * 2
                    and get_message_queue_size('request') < config.max_task_queue_size):
                for crawler in celeryconfig.crawlers:
                    #tasks[crawler['name'] + '.schedule'].delay(check_task=True)
                    #for task in tasks[crawler['name'] + '.schedule'].scheduler.new_tasks_generator():
                    #    if task.get('priority', None):
                    #        #tasks[crawler['name'] + '.request_priority'].delay(task)
                    #        tasks[crawler['name'] + '.request_priority'].apply_async((task,), compression='zlib')
                    #    else:
                    #        #tasks[crawler['name'] + '.request'].delay(task)
                    #        tasks[crawler['name'] + '.request'].apply_async((task,), compression='zlib')
                    with scheduel_lock:
                        db = tasks[crawler['name'] + '.schedule'].scheduler.get_redis_connection()
                        new_tasks = db.lpop(config.filtered_task_pool_key)
                        if new_tasks:
                            new_tasks = json.loads(new_tasks)
                        else:
                            # Nothing in Redis: take the in-memory buffer instead.
                            new_tasks = filtered_newtasks
                            filtered_newtasks = []
                        for task in new_tasks:
                            # Priority tasks go to the dedicated high-priority queue.
                            if task.get('priority', None):
                                #tasks[crawler['name'] + '.request_priority'].delay(task)
                                tasks[crawler['name'] + '.request_priority'].apply_async((task,), compression='zlib')
                            else:
                                #tasks[crawler['name'] + '.request'].delay(task)
                                tasks[crawler['name'] + '.request'].apply_async((task,), compression='zlib')
            else:
                # Queues are full; back off briefly before polling again.
                gevent.sleep(1)
        except Exception as exc:
            # Persist each crawler's dedup filter before exiting, then re-raise.
            logging.exception(exc)
            for crawler in celeryconfig.crawlers:
                tasks[crawler['name'] + '.schedule'].scheduler.save_filter()
            raise
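# The loop above throttles dispatch on broker backlog via
# get_message_queue_size, which the project defines elsewhere. Below is a
# minimal sketch of one way to implement it, assuming the Redis broker
# (where each Celery queue is simply a Redis list named after the queue);
# the connection settings are placeholders, not the project's actual config.

import redis

broker = redis.StrictRedis(host='localhost', port=6379, db=0)


def get_message_queue_size(queue_name):
    # With the Redis broker, a queue's backlog is just the list length.
    return broker.llen(queue_name)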
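# A minimal sketch of the module-level plumbing the dispatcher loop
# assumes. The real project defines these elsewhere; the names (kept as
# spelled in the source, including scheduel_lock) and values here are
# illustrative only.

import gevent
import gevent.lock

# Lock guarding the shared buffer of tasks that passed the dedup filter.
scheduel_lock = gevent.lock.RLock()
filtered_newtasks = []

# Run the dispatcher as a greenlet; the gevent.sleep(1) back-off in the
# idle branch then yields to other greenlets instead of blocking.
gevent.spawn(schedule).join()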
def schedule(self, tasks=None, check_task=False, save_filter=False):
    if save_filter:
        # Persist the dedup filter and exit early.
        self.scheduler.save_filter()
        return
    # Initial tasks: no args were given, so seed from the scheduler.
    if not tasks and not check_task:
        for task in self.scheduler.init_generator():
            if task.get('priority', None):
                #app.tasks[self.crawler_name + '.request_priority'].delay(task)
                app.tasks[self.crawler_name + '.request_priority'].apply_async((task,), compression='zlib')
            else:
                #app.tasks[self.crawler_name + '.request'].delay(task)
                app.tasks[self.crawler_name + '.request'].apply_async((task,), compression='zlib')
        #group(app.tasks[self.crawler_name + '.request'].s(task)
        #      | app.tasks[self.crawler_name + '.parse'].s()
        #      | app.tasks[self.crawler_name + '.pipeline'].s()
        #      ).delay()
        #app.tasks[self.crawler_name + '.schedule'].apply_async(
        #    args=[],
        #    kwargs={
        #        'check_task': True,
        #    },
        #    eta=datetime.datetime.now())
    # Add new tasks; called via task.apply with the discovered tasks.
    elif tasks and not check_task:
        self.scheduler.add_new_task(tasks)
        #app.tasks[self.crawler_name + '.new_task'].delay(task)
    # Periodic check: move filtered tasks onto the request queues.
    elif check_task:
        #i = app.control.inspect()
        timedelta = config.new_task_check_interval
        if get_message_queue_size('request') < config.max_task_queue_size:
            for task in self.scheduler.new_tasks_generator():
                if task.get('priority', None):
                    app.tasks[self.crawler_name + '.request_priority'].delay(task)
                else:
                    app.tasks[self.crawler_name + '.request'].delay(task)
            timedelta = 1
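# For illustration, the ways the schedule task above can be invoked
# (the 'news' crawler name is a hypothetical example):

# 1. No arguments: seed the crawler's initial tasks from init_generator().
app.tasks['news.schedule'].delay()

# 2. With tasks: feed newly discovered tasks back through the dedup filter.
app.tasks['news.schedule'].delay(tasks=[{'url': 'http://example.com/page/2'}])

# 3. check_task=True: drain new tasks onto the request queues.
app.tasks['news.schedule'].delay(check_task=True)

# 4. save_filter=True: persist the dedup filter (e.g. on shutdown).
app.tasks['news.schedule'].delay(save_filter=True)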