class Master: def msg_callback(self, ch, method, properties, body): callback_set = {'SUCCESS': self.success, 'FAIL': self.fail, 'AWAKE': self.update_slave_response_time, 'STOP': self.stop, 'ADD_SLAVE': self.add_slave, 'KILL_SLAVE': self.kill_slave, 'RESTART_SLAVE': self.restart_slave, 'STAT': self.stat, 'START': self.start, 'RECONFIGURE': self.configure, 'REFRESH': self.refresh } try: command = body[:body.find(' ')] info = body[body.find(' ')+1:] if command in callback_set: callback_set[command](ujson.loads(info)) else: logging.debug(" [x] Unknown command %r" % (str(body),)) except KeyError as e: if str(e) == "'Queue.DeclareOk'": logging.debug("Queue.DelcareOk at %r" % (str(body),)) else: logging.error("Unknown KeyError at %r:" % (str(body),)) except RuntimeError as e: if 'recursion' in str(e): logging.error('MAXIMUM RECURSION ERROR') def __init__(self, conf_file): self.config = ConfigParser.ConfigParser(allow_no_value=True) self.clean_time_gap = None self.wait_time_for_slave = None self.master_queue_name = None self.task_queue_name = None self.task_queue_size_limit = None self.task_file_name = None self.task_counter_file = None self.ssh_key = None self.s3_bucket = None self.s3_folder = None self.slave_num_every_packup = None self.slave_max_sec_each_task = None self.slave_python_version = None self.master_ip = None self.slaves_ip = None self.slave_awake_frequency = None self.configure(conf_file) self.last_wake_time = None self.repeated_timer = None self.is_started = False self.pop_forever_handler = None logging.info('Starting task manager...') self.task_manager = TaskManager(self.task_file_name, self.task_counter_file) logging.info('Starting slave manager...') self.slave_manager = SlaveManager(master_ip=self.master_ip, slaves_ip=self.slaves_ip, ssh_key=self.ssh_key, s3_bucket=self.s3_bucket, s3_folder=self.s3_folder, slave_num_every_packup=self.slave_num_every_packup, slave_max_sec_each_task=self.slave_max_sec_each_task, slave_python_version=self.slave_python_version, slave_awake_frequency=self.slave_awake_frequency, slave_buffer_size=1) logging.info('Starting connection manager...') self.message_connection = ConnectionManager(queue_name=self.master_queue_name, durable=False, callback=self.msg_callback, no_ack=True) def run(self): logging.info(' [*] Waiting for messages. To exit press CTRL+C') try: self.message_connection.start_accepting_message() except KeyboardInterrupt: logging.info('Stopping master...') master.stop(None) except EOFError: logging.info('Download finishes. Shutting down master.') master.stop(None) # except Exception as e: # logging.info(str(e)) # logging.info('Stopping master...') # TODO: write all configuration in one file def configure(self, conf_file): self.config.read(conf_file) self.clean_time_gap = self.config.getint('main', 'clean_time_gap') self.wait_time_for_slave = self.config.getint('main', 'wait_time_for_slave') self.slave_awake_frequency = self.config.get('main', 'slave_awake_frequency') self.master_ip = self.config.get('main', 'master_private_ip') self.slaves_ip = self.config.get('main', 'slaves_private_ip') self.master_queue_name = self.config.get('main', 'master_queue_name') self.task_queue_name = self.config.get('main', 'task_queue_name') self.task_file_name = self.config.get('main', 'task_file') self.task_queue_size_limit = int(self.config.get('main', 'task_queue_size_limit')) self.task_counter_file = self.config.get('main', 'task_counter_file') self.ssh_key = self.config.get('main', 'ssh_key') self.s3_bucket = self.config.get('main', 's3_bucket') self.s3_folder = self.config.get('main', 's3_folder') self.slave_num_every_packup = self.config.get('main', 'slave_num_every_packup') self.slave_max_sec_each_task = self.config.get('main', 'slave_max_sec_each_task') self.slave_python_version = self.config.get('main', 'slave_python_version') def add_slave(self, slave_info): if self.slave_manager.exist_slave(slave_info): logging.info('Slave ' + slave_info['host'] + ' already exists.') return logging.info('master: add slave' + str(slave_info)) new_slave_info = self.slave_manager.add_slave(slave_info) self.slave_manager.run_slave(new_slave_info) # TODO: def kill_slave(self, slave_info): if not self.slave_manager.exist_slave(slave_info): return logging.info('kill slave ' + str(slave_info)) self.slave_manager.kill_slave(slave_info) def restart_slave(self, slave_info): logging.info(slave_info['host']) logging.info('restart_slave' + str(slave_info)) self.kill_slave(slave_info) self.add_slave(slave_info) def start(self, info): logging.info('Master Starts') self.last_wake_time = datetime.datetime.utcnow() self.is_started = True self.pop_forever_handler = threading.Thread(target=self.start_popping_tasks) self.pop_forever_handler.start() self.repeated_timer = RepeatedTimer(self.clean_time_gap, self.notice_refresh, None) def pop_forever(self): self.start_popping_tasks() def get_task_queue_size(self): pass # TODO: There is a bottle neck here def start_popping_tasks(self): task_connection = ConnectionManager(queue_name=self.task_queue_name, durable=True, no_ack=False) eof_reached = False while self.is_started and not eof_reached: current_task_queue_size = task_connection.get_task_queue_size() while self.is_started and current_task_queue_size < self.task_queue_size_limit: task = self.task_manager.pop_task() if task is None: # TODO: Don't use Error. Just break and handle the case later in this function logging.info('EOF Reached') eof_reached = True break message = 'WORK ' + ujson.dumps(task) task_connection.publish(message) current_task_queue_size += 1 task_connection.stop() def fail(self, slave_task_info): self.task_manager.add_task(slave_task_info['task']) self.slave_manager.update_last_response(slave_task_info) def success(self, slave_task_info): slave_info = self.slave_manager.update_last_response(slave_task_info) def update_slave_response_time(self, slave_task_info): slave_info = self.slave_manager.update_last_response(slave_task_info) def stop(self, info): self.is_started = False self.notice_slaves_stop() if self.pop_forever_handler is not None: self.pop_forever_handler.join() if self.repeated_timer is not None: self.repeated_timer.stop() self.slave_manager.stop() self.task_manager.stop() self.message_connection.stop() def notice_slaves_stop(self): task_connection = ConnectionManager(queue_name=self.task_queue_name, durable=True, no_ack=False) screen_list = [key for key in self.slave_manager.slave_dict.keys()] for screen in screen_list: task_connection.publish('STOP {}') # task_connection.broadcast_task('STOP {}') task_connection.stop() def refresh(self, info): cur_progress, total_task = self.task_manager.get_progress() logging.info('downloading {}/{} files'.format(cur_progress, total_task)) if not self.is_started: return # if time interval met, check failed slave if self.last_wake_time is None: self.last_wake_time = datetime.datetime.utcnow() if self.last_wake_time + datetime.timedelta( seconds=self.clean_time_gap) > datetime.datetime.utcnow(): return failed_slaves = self.slave_manager.get_failed_slaves(self.wait_time_for_slave) if len(failed_slaves) != 0: logging.info('Finding failed slaves... ' + str(failed_slaves)) for slave in failed_slaves: self.restart_slave(slave) self.last_wake_time = datetime.datetime.utcnow() def notice_refresh(self, info): try: self.message_connection.publish('REFRESH {}') except IndexError: logging.critical('INDEX_ERROR') def stat(self, info): logging.info('=====================================') logging.info('Num of slave: ', self.slave_manager.get_num_slaves()) logging.info('=====================================') if len(info) > 0: for slave in self.slave_manager.slave_list: if slave['last_response'] is None: delta = 'new slave' else: delta = datetime.datetime.utcnow() - slave['last_response'] logging.info(slave['host'], '|', slave['queue'], '|', delta) logging.info('====================================')