def __init__(self):
    """Create the logger, the task queue and the Redis handles.

    The Redis connection parameters come from the module-level
    ``DB_HOST``, ``DB_PORT`` and ``DB_ID`` settings.
    """
    # Build the client first; the transactional pipeline is derived from it.
    client = redis.StrictRedis(host=DB_HOST, port=DB_PORT, db=DB_ID)
    self.rdb = client
    self.pipe = client.pipeline(transaction=True)

    self.que = PriorityQueue()
    self.logger = Logger()
class Scheduler:
    """Periodically run a crawl-then-validate cycle for every entry of ``TASKS``.

    Each task is a mapping that provides at least:

    * ``"resource"`` - an iterable of strings; each one is prefixed with the
      task name and ``DB_SPLIT_SYMBOL`` and pushed onto the Redis list
      ``DB_TASK_QUEUE_NAME``.
    * ``"interval"`` - a number that is multiplied by 60 and added to a
      ``time.time()`` value to get the next start time (so presumably
      minutes - confirm against the task configuration).

    Redis keys used: ``DB_TASK_QUEUE_NAME`` (list), ``DB_RAW_IPPOOL_NAME``
    (set) and ``DB_IPPOOL_NAME`` (sorted set).
    """

    logger = Logger()

    def __init__(self):
        """Create the logger, the task queue and the Redis client/pipeline."""
        self.logger = Logger()
        self.que = PriorityQueue()
        self.rdb = redis.StrictRedis(host=DB_HOST, port=DB_PORT, db=DB_ID)
        self.pipe = self.rdb.pipeline(transaction=True)

    @staticmethod
    def _run_crawler(spider_name):
        """Run the named Scrapy spider with the project settings until it finishes."""
        crawler = CrawlerProcess(get_project_settings())
        crawler.crawl(spider_name)
        crawler.start()

    @staticmethod
    def _run_in_subprocess(target):
        """Run ``target`` in a child process and block until it exits."""
        worker = Process(target=target)
        worker.start()
        worker.join()

    def start_spider(self):
        """Run the ``common_spider`` crawl (blocks until the crawl ends)."""
        self._run_crawler('common_spider')

    def start_validator(self):
        """Run the ``baidu_validator`` crawl (blocks until the crawl ends)."""
        self._run_crawler('baidu_validator')

    def schedule(self):
        """Run every task in ``TASKS`` now, then re-run each on its interval.

        Every task is put back on the queue after it has been processed, so
        this method only returns when ``TASKS`` is empty.
        """
        self.logger.log('start schedule')

        # Discard anything left in the queue from an earlier call.
        while not self.que.empty():
            self.que.get()

        # All tasks are initially due immediately.
        time_start = time.time()
        for taskname, task in TASKS.items():
            self.que.put(Task_dict(taskname, task, time_start))

        while not self.que.empty():
            time_now = time.time()
            task_dict = self.que.get()
            self.logger.log('now waiting for ' + task_dict.taskname)
            if time_now < task_dict.start_time:
                time.sleep(task_dict.start_time - time_now)
            self.start_processing(task_dict.taskname, task_dict.task)

            # The next run is measured from the moment this run finished.
            time_now = time.time()
            next_start = time_now + task_dict.task['interval'] * 60
            self.que.put(
                Task_dict(task_dict.taskname, task_dict.task, next_start))

    def start_processing(self, taskname, task):
        """Run one crawl + validation cycle for a single task.

        Args:
            taskname: Task name (a string); used as the prefix of every
                queued work item and in the log lines.
            task: Task mapping; only ``task["resource"]`` is read here.

        The cycle is skipped (with a log line) when the task has no
        resources, because there is nothing to queue for the spider.
        """
        self.logger.log('\n' + "*" * 54)
        self.logger.log('%-20s%s' % (taskname, 'start'))

        task_queue = [taskname + DB_SPLIT_SYMBOL + x for x in task["resource"]]
        if not task_queue:
            # LPUSH requires at least one value; sending it with none makes
            # Redis reject the command and pipe.execute() raise, which would
            # abort the whole scheduling loop after the raw pool was wiped.
            self.logger.log('%-20s%s' % (taskname, 'no resource, skipped'))
            self.logger.log("*" * 54)
            return

        self.rdb.delete(DB_RAW_IPPOOL_NAME)
        self.pipe.lpush(DB_TASK_QUEUE_NAME, *task_queue)
        self.pipe.execute()

        # Each crawl runs in its own process - presumably because Scrapy's
        # reactor cannot be started twice within one process.
        self.logger.log('%-20s%s' % (taskname, 'crawling'))
        self._run_in_subprocess(self.start_spider)

        # Move the current pool into the raw set before validating.
        ippool_size = self.ippool_turn_raw()

        self.logger.log('%-20s%s' % (taskname, 'validating'))
        self._run_in_subprocess(self.start_validator)

        # Contribution = pool size after validation minus size before it.
        ippool_size_now = self.rdb.zcard(DB_IPPOOL_NAME)
        self.logger.log(
            '%-20s%s %03d\n' %
            (taskname, 'contribution', ippool_size_now - ippool_size))
        self.logger.log("*" * 54)

    def ippool_turn_raw(self):
        """Move every member of the IP pool into the raw pool.

        Reads all members of the sorted set ``DB_IPPOOL_NAME``; when there
        are any, adds them to the set ``DB_RAW_IPPOOL_NAME`` and deletes
        ``DB_IPPOOL_NAME`` in one pipeline execution.

        Returns:
            The number of members the pool held before it was moved.
        """
        ippool = self.rdb.zrange(DB_IPPOOL_NAME, 0, -1)
        if ippool:
            self.pipe.sadd(DB_RAW_IPPOOL_NAME, *ippool)
            self.pipe.delete(DB_IPPOOL_NAME)
            self.pipe.execute()
        return len(ippool)