コード例 #1
0
ファイル: master.py プロジェクト: habemusne/flexibloader
class Master:

    def msg_callback(self, ch, method, properties, body):
        callback_set = {'SUCCESS': self.success,
                        'FAIL': self.fail,
                        'AWAKE': self.update_slave_response_time,
                        'STOP': self.stop,
                        'ADD_SLAVE': self.add_slave,
                        'KILL_SLAVE': self.kill_slave,
                        'RESTART_SLAVE': self.restart_slave,
                        'STAT': self.stat,
                        'START': self.start,
                        'RECONFIGURE': self.configure,
                        'REFRESH': self.refresh
                        }
        try:
            command = body[:body.find(' ')]
            info = body[body.find(' ')+1:]
            if command in callback_set:
                callback_set[command](ujson.loads(info))
            else:
                logging.debug(" [x] Unknown command %r" % (str(body),))
        except KeyError as e:
            if str(e) == "'Queue.DeclareOk'":
                logging.debug("Queue.DelcareOk at %r" % (str(body),))
            else:
                logging.error("Unknown KeyError at %r:" % (str(body),))
        except RuntimeError as e:
            if 'recursion' in str(e):
                logging.error('MAXIMUM RECURSION ERROR')

    def __init__(self, conf_file):
        self.config = ConfigParser.ConfigParser(allow_no_value=True)
        self.clean_time_gap = None
        self.wait_time_for_slave = None
        self.master_queue_name = None
        self.task_queue_name = None
        self.task_queue_size_limit = None
        self.task_file_name = None
        self.task_counter_file = None
        self.ssh_key = None
        self.s3_bucket = None
        self.s3_folder = None
        self.slave_num_every_packup = None
        self.slave_max_sec_each_task = None
        self.slave_python_version = None
        self.master_ip = None
        self.slaves_ip = None
        self.slave_awake_frequency = None
        self.configure(conf_file)

        self.last_wake_time = None

        self.repeated_timer = None
        self.is_started = False
        self.pop_forever_handler = None

        logging.info('Starting task manager...')
        self.task_manager = TaskManager(self.task_file_name, self.task_counter_file)
        logging.info('Starting slave manager...')
        self.slave_manager = SlaveManager(master_ip=self.master_ip,
                                          slaves_ip=self.slaves_ip,
                                          ssh_key=self.ssh_key,
                                          s3_bucket=self.s3_bucket,
                                          s3_folder=self.s3_folder,
                                          slave_num_every_packup=self.slave_num_every_packup,
                                          slave_max_sec_each_task=self.slave_max_sec_each_task,
                                          slave_python_version=self.slave_python_version,
                                          slave_awake_frequency=self.slave_awake_frequency,
                                          slave_buffer_size=1)
        logging.info('Starting connection manager...')
        self.message_connection = ConnectionManager(queue_name=self.master_queue_name,
                                                    durable=False,
                                                    callback=self.msg_callback,
                                                    no_ack=True)

    def run(self):
        logging.info(' [*] Waiting for messages. To exit press CTRL+C')
        try:
            self.message_connection.start_accepting_message()
        except KeyboardInterrupt:
            logging.info('Stopping master...')
            master.stop(None)
        except EOFError:
            logging.info('Download finishes. Shutting down master.')
            master.stop(None)
        # except Exception as e:
        #     logging.info(str(e))
        #     logging.info('Stopping master...')

    # TODO: write all configuration in one file
    def configure(self, conf_file):
        self.config.read(conf_file)
        self.clean_time_gap = self.config.getint('main', 'clean_time_gap')
        self.wait_time_for_slave = self.config.getint('main', 'wait_time_for_slave')
        self.slave_awake_frequency = self.config.get('main', 'slave_awake_frequency')
        self.master_ip = self.config.get('main', 'master_private_ip')
        self.slaves_ip = self.config.get('main', 'slaves_private_ip')
        self.master_queue_name = self.config.get('main', 'master_queue_name')
        self.task_queue_name = self.config.get('main', 'task_queue_name')
        self.task_file_name = self.config.get('main', 'task_file')
        self.task_queue_size_limit = int(self.config.get('main', 'task_queue_size_limit'))
        self.task_counter_file = self.config.get('main', 'task_counter_file')
        self.ssh_key = self.config.get('main', 'ssh_key')
        self.s3_bucket = self.config.get('main', 's3_bucket')
        self.s3_folder = self.config.get('main', 's3_folder')
        self.slave_num_every_packup = self.config.get('main', 'slave_num_every_packup')
        self.slave_max_sec_each_task = self.config.get('main', 'slave_max_sec_each_task')
        self.slave_python_version = self.config.get('main', 'slave_python_version')

    def add_slave(self, slave_info):
        if self.slave_manager.exist_slave(slave_info):
            logging.info('Slave ' + slave_info['host'] + ' already exists.')
            return
        logging.info('master: add slave' + str(slave_info))
        new_slave_info = self.slave_manager.add_slave(slave_info)
        self.slave_manager.run_slave(new_slave_info)
        # TODO:

    def kill_slave(self, slave_info):
        if not self.slave_manager.exist_slave(slave_info):
            return
        logging.info('kill slave ' + str(slave_info))
        self.slave_manager.kill_slave(slave_info)

    def restart_slave(self, slave_info):
        logging.info(slave_info['host'])
        logging.info('restart_slave' + str(slave_info))
        self.kill_slave(slave_info)
        self.add_slave(slave_info)

    def start(self, info):
        logging.info('Master Starts')
        self.last_wake_time = datetime.datetime.utcnow()
        self.is_started = True

        self.pop_forever_handler = threading.Thread(target=self.start_popping_tasks)
        self.pop_forever_handler.start()

        self.repeated_timer = RepeatedTimer(self.clean_time_gap, self.notice_refresh, None)

    def pop_forever(self):
        self.start_popping_tasks()

    def get_task_queue_size(self):
        pass

    # TODO: There is a bottle neck here
    def start_popping_tasks(self):
        task_connection = ConnectionManager(queue_name=self.task_queue_name,
                                            durable=True, no_ack=False)
        eof_reached = False
        while self.is_started and not eof_reached:
            current_task_queue_size = task_connection.get_task_queue_size()
            while self.is_started and current_task_queue_size < self.task_queue_size_limit:
                task = self.task_manager.pop_task()
                if task is None:
                    # TODO: Don't use Error. Just break and handle the case later in this function
                    logging.info('EOF Reached')
                    eof_reached = True
                    break
                message = 'WORK ' + ujson.dumps(task)
                task_connection.publish(message)
                current_task_queue_size += 1

        task_connection.stop()

    def fail(self, slave_task_info):
        self.task_manager.add_task(slave_task_info['task'])
        self.slave_manager.update_last_response(slave_task_info)

    def success(self, slave_task_info):
        slave_info = self.slave_manager.update_last_response(slave_task_info)

    def update_slave_response_time(self, slave_task_info):
        slave_info = self.slave_manager.update_last_response(slave_task_info)

    def stop(self, info):
        self.is_started = False
        self.notice_slaves_stop()
        if self.pop_forever_handler is not None:
            self.pop_forever_handler.join()
        if self.repeated_timer is not None:
            self.repeated_timer.stop()
        self.slave_manager.stop()
        self.task_manager.stop()
        self.message_connection.stop()

    def notice_slaves_stop(self):
        task_connection = ConnectionManager(queue_name=self.task_queue_name,
                                            durable=True, no_ack=False)
        screen_list = [key for key in self.slave_manager.slave_dict.keys()]
        for screen in screen_list:
            task_connection.publish('STOP {}')
        # task_connection.broadcast_task('STOP {}')
        task_connection.stop()

    def refresh(self, info):
        cur_progress, total_task = self.task_manager.get_progress()
        logging.info('downloading {}/{} files'.format(cur_progress, total_task))
        if not self.is_started:
            return

        # if time interval met, check failed slave
        if self.last_wake_time is None:
            self.last_wake_time = datetime.datetime.utcnow()

        if self.last_wake_time + datetime.timedelta(
                seconds=self.clean_time_gap) > datetime.datetime.utcnow():
            return
        failed_slaves = self.slave_manager.get_failed_slaves(self.wait_time_for_slave)
        if len(failed_slaves) != 0:
            logging.info('Finding failed slaves... ' + str(failed_slaves))
        for slave in failed_slaves:
            self.restart_slave(slave)
        self.last_wake_time = datetime.datetime.utcnow()

    def notice_refresh(self, info):
        try:
            self.message_connection.publish('REFRESH {}')
        except IndexError:
            logging.critical('INDEX_ERROR')

    def stat(self, info):
        logging.info('=====================================')
        logging.info('Num of slave: ', self.slave_manager.get_num_slaves())
        logging.info('=====================================')
        if len(info) > 0:
            for slave in self.slave_manager.slave_list:
                if slave['last_response'] is None:
                    delta = 'new slave'
                else:
                    delta = datetime.datetime.utcnow() - slave['last_response']
                logging.info(slave['host'], '|', slave['queue'], '|', delta)
            logging.info('====================================')