# Example 1
# 0
class ATR(Singleton):
    """Hyper-parameter tuner that runs one worker process per combination.

    Combinations move through three pools: ``waiting_pool`` (not started yet),
    ``working_pool`` (a worker process was spawned for it) and
    ``finished_pool`` (its worker was terminated by ``auto_kill``).

    ``working_pool``, ``working_process``, ``lock_list``,
    ``shared_eps_num_list`` and ``shared_eps_reward_list`` are parallel lists:
    index ``i`` in each of them refers to the same worker.

    NOTE(review): a worker that exits on its own is never removed from the
    working lists; only ``auto_kill`` moves entries to ``finished_pool``.
    """

    # A worker is only considered by ``auto_kill`` once it reports more than
    # this many episodes.
    KILL_MIN_EPISODES = 1000
    # A worker is only killed when its reported reward is below this value.
    # NOTE(review): with such a low value almost nothing is ever killed; if
    # the intent was "kill the worst worker", this should start at +inf.
    KILL_REWARD_THRESHOLD = -99999
    # Upper bound, in seconds, on waiting for a terminated worker to exit.
    JOIN_TIMEOUT = 5.0

    def __init__(self, hyper_params, max_num=9999, random=True):
        """Build the combination pools and the resource manager.

        Args:
            hyper_params: Mapping of hyper-parameter name to an iterable of
                candidate values.
            max_num: Maximum number of entries allowed in ``working_pool``;
                also passed to ``ResourceManager`` as ``max_instances``.
            random: When true, waiting combinations are started in random
                order; otherwise in the order ``get_hp`` produced them.
        """
        self.hp_name = list(hyper_params.keys())
        self.hp = self.get_hp(hyper_params)
        self.max_num = max_num
        self.random = random

        self.resource_manager = ResourceManager(mem_limit=1,
                                                cpu_limit=0.1,
                                                gpu_limit=0.5,
                                                max_instances=self.max_num)

        self.waiting_pool = list(self.hp)
        self.finished_pool = []
        self.working_pool = []
        self.working_process = []

        self.lock_list = []
        self.shared_eps_num_list = []
        self.shared_eps_reward_list = []

    def get_hp(self, hyper_params):
        """Return every combination of candidate values as flat tuples.

        Each tuple holds one value per hyper-parameter, in the iteration
        order of ``hyper_params`` (the same order as ``self.hp_name``).

        Args:
            hyper_params: Mapping of hyper-parameter name to an iterable of
                candidate values.

        Returns:
            A list of tuples; empty when ``hyper_params`` is empty.
        """
        hp_list = list(hyper_params.values())
        if not hp_list:
            # product() with no arguments would yield one empty tuple, i.e.
            # a bogus job; keep "no parameters" meaning "no jobs".
            return []
        # One product over all value lists keeps tuples flat. Chaining
        # pairwise products nests them as ((a, b), c) for three or more
        # parameters, and a single parameter must still expand to one job
        # per value rather than one job holding the whole value list.
        return list(product(*hp_list))

    def start(self):
        """Run ``auto_tune`` once per second, forever.

        NOTE(review): this loop has no exit condition, so the call never
        returns, even after both pools are empty.
        """
        while True:
            self.auto_tune()
            sleep(1.0)

    def report(self):
        """Delegate to ``ResourceManager.report``."""
        self.resource_manager.report()

    def auto_tune(self):
        """Run one scheduling tick: print progress, kill, then spawn."""
        self.ask_result()
        self.auto_kill()
        self.auto_gen()

    # for test
    def ask_result(self):
        """Print index, episode count and reward of every tracked worker."""
        if not self.working_pool:
            return
        workers = zip(self.lock_list,
                      self.shared_eps_num_list,
                      self.shared_eps_reward_list)
        for i, (lock, eps_num, eps_reward) in enumerate(workers):
            # ``with`` releases the lock even if reading a value raises.
            with lock:
                shared_eps_num = eps_num.value
                shared_eps_reward = eps_reward.value
            print(i, shared_eps_num, shared_eps_reward)

    # for test
    def auto_kill(self):
        """Terminate at most one under-performing worker.

        Candidates are workers reporting more than ``KILL_MIN_EPISODES``
        episodes and a reward below ``KILL_REWARD_THRESHOLD``; among those
        the one with the lowest reward is terminated and its combination is
        moved to ``finished_pool``. Prints a message when there is nothing
        running and nothing waiting.
        """
        if not self.working_pool:
            if not self.waiting_pool:
                print('All Job Finished !')
            return

        index = -1
        min_eps_reward = self.KILL_REWARD_THRESHOLD
        workers = zip(self.lock_list,
                      self.shared_eps_num_list,
                      self.shared_eps_reward_list)
        for i, (lock, eps_num, eps_reward) in enumerate(workers):
            with lock:
                shared_eps_num = eps_num.value
                shared_eps_reward = eps_reward.value
            if (shared_eps_num > self.KILL_MIN_EPISODES
                    and shared_eps_reward < min_eps_reward):
                min_eps_reward = shared_eps_reward
                index = i
        if index < 0:
            return

        process = self.working_process.pop(index)
        process.terminate()
        # Reap the terminated child so it does not linger; the timeout
        # keeps a worker that is slow to exit from stalling the tick.
        process.join(self.JOIN_TIMEOUT)
        hyper_param = self.working_pool.pop(index)
        self.finished_pool.append(hyper_param)
        # Drop the matching entries so the parallel lists stay aligned.
        self.lock_list.pop(index)
        self.shared_eps_num_list.pop(index)
        self.shared_eps_reward_list.pop(index)

    def auto_gen(self):
        """Spawn workers for waiting combinations while resources allow.

        Stops when nothing is waiting, when ``max_num`` workers are tracked,
        or when the resource manager denies memory, CPU or GPU access (a
        negative GPU id is treated as denial).

        Raises:
            Exception: whatever ``create_process`` raises; the combination
                is put back into ``waiting_pool`` before re-raising.
        """
        while self.waiting_pool and len(self.working_pool) < self.max_num:
            if not self.resource_manager.get_memory_access():
                return
            if not self.resource_manager.get_cpu_access():
                return
            # NOTE(review): gpu_id is only checked here, it is not
            # forwarded to the worker.
            gpu_id = self.resource_manager.get_gpu_access()
            if gpu_id < 0:
                return

            index = 0
            if self.random and len(self.waiting_pool) > 1:
                index = randint(0, len(self.waiting_pool) - 1)
            hyper_param = self.waiting_pool.pop(index)

            try:
                self.create_process(hyper_param)
            except Exception:
                # Keep the combination schedulable and the pools consistent
                # when the worker could not be started.
                self.waiting_pool.insert(index, hyper_param)
                raise
            # Record as working only after the per-worker lists were
            # extended, so all parallel lists have the same length.
            self.working_pool.append(hyper_param)

    def create_process(self, hyper_param):
        """Start a worker process running ``run`` for ``hyper_param``.

        The worker is created from the ``spawn`` multiprocessing context and
        receives a lock, a shared signed-long episode counter, a shared
        double reward value and the combination itself. The process and its
        shared objects are appended to the per-worker lists.

        Args:
            hyper_param: The combination handed to ``run``.
        """
        ctx = mp.get_context('spawn')
        lock = ctx.Lock()
        shared_eps_num = ctx.Value('l', 0)
        shared_eps_reward = ctx.Value('d', 0.0)

        process = ctx.Process(target=run,
                              args=(lock, shared_eps_num, shared_eps_reward,
                                    hyper_param))
        process.start()

        self.lock_list.append(lock)
        self.shared_eps_num_list.append(shared_eps_num)
        self.shared_eps_reward_list.append(shared_eps_reward)
        self.working_process.append(process)

        print('Start:', hyper_param)

    def listener(self, event):
        """Print a message when ``event.exception`` is truthy.

        NOTE(review): nothing in this class registers or calls this method;
        presumably it is a callback for an external scheduler - confirm.
        """
        if event.exception:
            print('The job crashed :(')