def get_data(self, request, id):
    response = api.get_controller(request, id)
    data = json.loads(response.text)
    controller = Controller(data["id"], data["controller_name"],
                            data["class_name"], data["enabled"])
    return controller
def __init__(self, device, time_limit, discrete_VAE):
    """ Build vae, rnn, controller and environment. """

    self.env = gym.make('CarRacing-v0')
    self.device = device
    self.time_limit = time_limit
    self.discrete_VAE = discrete_VAE

    # Because the representation is discrete, we increase the size of the latent vector
    if self.discrete_VAE:
        LSIZE = 128

    self.vae = VAE(3, LSIZE, 1024)
    self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5)
    self.controller = Controller(LSIZE, RSIZE, ASIZE)
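# For reference: the Controller(LSIZE, RSIZE, ASIZE) used throughout these snippets is
# typically a very small network. A minimal sketch, assuming the usual world-models design
# of a single linear layer from the concatenated latent and recurrent state to the action
# vector (the actual class in each project may differ):
import torch
import torch.nn as nn

class ControllerSketch(nn.Module):
    """Illustrative only: maps [latent, hidden] -> action with one linear layer."""
    def __init__(self, latents, recurrents, actions):
        super().__init__()
        self.fc = nn.Linear(latents + recurrents, actions)

    def forward(self, latent, recurrent):
        # Concatenate the VAE latent and the MDRNN hidden state, then project to actions.
        return self.fc(torch.cat([latent, recurrent], dim=1))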
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)

################################################################################
#                                 Launch CMA                                   #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE, is_gate=args.is_gate)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(ctrl_dir, 'best.tar')
print("Attempting to load previous best...")
if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
    cur_best = -state['reward']
    # Changed so that previous models can still be loaded even if the controller
    # source code has changed (e.g. new parameters were added).
    load_model_safe_(controller, state['state_dict'])
    try:
        print(controller.gates, controller.is_gate)
    except AttributeError:
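# load_model_safe_ is referenced above but not shown. A minimal sketch of what such a
# helper might do, assuming it only copies over the saved parameters whose names and
# shapes still match the current model (the real implementation may differ):
def load_model_safe_sketch(model, saved_state_dict):
    """Illustrative only: load the entries of saved_state_dict that still fit the model."""
    own_state = model.state_dict()
    compatible = {k: v for k, v in saved_state_dict.items()
                  if k in own_state and own_state[k].shape == v.shape}
    own_state.update(compatible)
    # Parameters added after the checkpoint was saved keep their fresh initialization.
    model.load_state_dict(own_state)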
class Mutator:
    def __init__(self, args, data, opts):
        '''
        Several tasks need to be done here: first, decide how to handle the initial
        task; second, build the choice table and complete the sampling process from
        it; third, use the sampled choices to work out the model's size and pick out
        the parameters that are actually needed.
        :param model:
        :param task:
        :param args:
        :param data:
        '''
        self.args = args
        self.data = data
        self.opts = opts
        self.controller = Controller(args=self.args, task_num=self.opts.num_task)
        self.controller_optim = Adam(self.controller.parameters(),
                                     lr=args.controller_lr)
        cuda_condition = torch.cuda.is_available() and args.with_cuda
        self.device = torch.device("cuda" if cuda_condition else "cpu")
        self.controller = self.controller.to(self.device)
        self.tasks_config = []
        self.task_acc = []
        self.model_dict = []
        self.task_scope = 1  # => reuse
        self.general_scope = 1  # => new
        if self.args.adapt:
            self.task_scope += 1
        if self.args.fuse:
            self.general_scope += 1
        self.tensorboard_writer = SummaryWriter()
        self.iter = 0

    def run(self):
        print('Experiment use {}'.format(self.args.base))
        if self.args.base == 'mlp':
            report_final_eval_acc, final_log, all_acc = self.run_mlp()
        elif self.args.base == 'cnn':
            report_final_eval_acc, final_log, all_acc = self.run_cnn()
        print('Acc:')
        for items in report_final_eval_acc:
            s = ''
            for item in items:
                s += '%.3f\t' % item
            print(s)
        print(all_acc)
        print(final_log)
        print(self.args)

    def controller_sample(self, task):
        if self.args.base == 'mlp':
            steps = self.args.mlp_linear
        elif self.args.base == 'cnn':
            steps = self.args.cnn_cnn_linear + self.args.cnn_mlp_linear
        else:
            steps = 0
            raise NotImplementedError
        step_probs = []
        step_idx = []
        step_losses = []
        sample_idx = torch.tensor(0).view(-1).to(self.device)
        hidden = None
        for idx, step in enumerate(range(steps)):
            logit, hidden = self.controller(input=sample_idx, task=task, hidden=hidden)
            if self.args.greedy > 0 and random.random() < self.args.greedy:
                sample_idx = torch.tensor(
                    random.randint(
                        0, task * self.task_scope + self.general_scope - 1)).to(self.device)
                if self.args.base == 'cnn':
                    raise NotImplementedError  # greedy sampling is not supported with the cnn model yet
            else:
                sample_idx = torch.multinomial(F.softmax(logit, dim=-1), 1).view(-1)
                if idx >= self.args.cnn_cnn_linear:
                    if sample_idx == 0:
                        pass
                    elif self.general_scope > 1 and step == self.general_scope - 1:
                        pass
                    else:
                        if self.args.adapt:
                            while (sample_idx - self.general_scope) % self.task_scope + 1 == 2:
                                sample_idx = torch.multinomial(
                                    F.softmax(logit, dim=-1), 1).view(-1)
                            assert (sample_idx - self.general_scope) % self.task_scope + 1 != 2
            assert sample_idx < task * self.task_scope + self.general_scope
            step_probs.append(F.softmax(logit, dim=-1).tolist())
            step_idx.append(sample_idx.item())
            step_losses.append(
                F.cross_entropy(logit.view(1, -1), sample_idx.view(-1)))
        step_losses = torch.stack(step_losses, dim=0)
        return step_probs, step_idx, torch.mean(step_losses)

    def crop_model(self, step_idx, default_config):
        def get_layer_dict(cur_model_dict, use_dict, layer):
            # Pull the parameters of a single layer out of a model's state dict
            for key, value in use_dict.items():
                if 'Stack{}'.format(layer) in key:
                    cur_model_dict[key] = value
            return cur_model_dict

        def init_dict(last_model_dict, cur_model_dict):
            # Inherit the classify head from the most recent model
            for key, value in last_model_dict.items():
                if 'classify' in key:
                    cur_model_dict[key] = value
            return cur_model_dict

        def fuse(cur_model_dict, layer):
            temp = dict()
            for use_dict in self.model_dict:
                for key, value in use_dict.items():
                    if 'Stack{}'.format(layer) in key:
                        if key in temp.keys():
                            temp[key].append(value)
                        else:
                            temp[key] = [value]
            for key, value in temp.items():
                cur_model_dict[key] = torch.mean(
                    torch.stack(value, dim=0), dim=0)  # we assume all model shapes are equal
            return cur_model_dict

        def adapt_config(source_config):
            '''
            {'conv': [(64, 128, 3)]}
            :param source_config:
            :return:
            '''
            key = source_config.keys()
            assert len(key) == 1
            key = list(key)[0]
            assert key == 'conv'  # only the cnn can adapt, and this is ensured by the <controller_sample> method
            source_config = deepcopy(source_config)
            value = source_config[key]
            assert isinstance(value, list)
            original_tuple = value[0]
            adapt_tuple = (original_tuple[1], original_tuple[1], 1)
            value.append(adapt_tuple)
            return source_config

        cur_model_dict = dict()
        cur_model_dict = init_dict(self.model_dict[-1], cur_model_dict)
        cur_model_config = []
        create_log = ''
        for layer, step in enumerate(step_idx):
            # Choice space: [new, reuse 0, adapt 0, reuse 1, adapt 1, ...]
            # step = step.item()
            if step == 0:
                create_log += 'NEW '.format(layer)
                cur_model_config.append(default_config[layer])
            elif self.general_scope > 1 and step == self.general_scope - 1:
                create_log += 'Fuse from task above '.format(layer)
                cur_model_config.append(default_config[layer])  # we assume all shapes are equal
                cur_model_dict = fuse(cur_model_dict, layer)
            else:
                '''
                test case 1: general_scope=2, task_scope=1
                    then [0, 1, 2, 3, 4] decodes to [new, fuse, reuse0, reuse1, reuse2]
                test case 2: general_scope=1, task_scope=1
                    then [0, 1, 2, 3, 4] decodes to [new, reuse0, reuse1, reuse2, reuse3]
                '''
                task_num = (step - self.general_scope) // self.task_scope
                choice = (step - self.general_scope) % self.task_scope + 1  # adapt maybe wrong!
                use_dict = self.model_dict[task_num]
                use_config = self.tasks_config[task_num]
                if choice == 1:
                    create_log += 'REUSE from task {} '.format(task_num)
                    cur_model_dict = get_layer_dict(cur_model_dict, use_dict, layer)
                    cur_model_config.append(use_config[layer])
                elif self.args.adapt and choice == 2:
                    create_log += 'ADAPT from task {} '.format(task_num)
                    assert layer < 3
                    cur_model_dict = get_layer_dict(cur_model_dict, use_dict, layer)
                    cur_model_config.append(adapt_config(use_config[layer]))
                else:
                    raise NotImplementedError
        assert len(cur_model_config) == len(step_idx)
        return cur_model_dict, cur_model_config, create_log

    def count_reward(self, cur_acc_lis, back_acc_list):
        '''
        :param cur_acc_lis: accuracies obtained on the current task across the different sampling rounds
        :param back_acc_list: accuracies when re-evaluating the historical tasks under the current sample
        :return:
        '''
        if len(cur_acc_lis) > 1:
            beta = cur_acc_lis[-1] / max(cur_acc_lis[:-1])
        else:
            beta = 0
        alpha = []
        assert len(back_acc_list) == len(self.task_acc)
        # for origin_acc, eval_back_acc in zip(self.task_acc, back_acc_list):
        #     acc_drop = max(0, origin_acc - eval_back_acc)
        #     # acc_drop = origin_acc - eval_back_acc  # TODO, find better reward
        #     alpha.append(acc_drop / origin_acc)
        # noise = 0.001
        # alpha = 1 / (torch.mean(torch.tensor(alpha)) + noise)
        # alpha = -1 * (torch.mean(torch.tensor(alpha)))  # TODO, find better reward
        # alpha = torch.sigmoid(-1 * (torch.mean(torch.tensor(alpha)))) - 0.5
        # alpha = -1 * (torch.mean(torch.tensor(alpha))) + 0.05
        # alpha = -1 * (torch.mean(torch.tensor(alpha))) + 0.5
        # alpha = -1 * (torch.max(torch.tensor(alpha))) + 0.1
        # reward = alpha + beta
        for origin_acc, eval_back_acc in zip(self.task_acc, back_acc_list):
            # acc_drop = max(0, origin_acc - eval_back_acc)
            acc_drop = eval_back_acc / origin_acc
            alpha.append(acc_drop)
        alpha = torch.mean(torch.tensor(alpha))
        reward = alpha
        if self.args.beta:
            reward += beta
        self.tensorboard_writer.add_scalar('Reward/Sum', reward, self.iter)
        self.tensorboard_writer.add_scalar('Reward/Alpha', alpha, self.iter)
        self.tensorboard_writer.add_scalar('Reward/Beta', beta, self.iter)
        self.iter += 1
        if self.args.baseline > 0:
            reward = reward - self.args.baseline
        return reward.item()

    def run_mlp(self):
        final_log = ''
        report_final_eval_acc = \
[[0.0] * self.opts.num_task for _ in range(self.opts.num_task)] if self.args.dataset == 'mnist': input_feature = 28 * 28 elif self.args.dataset == 'cifar10': input_feature = 32 * 32 else: input_feature = 0 raise NotImplemented default_config = [{ 'mlp': (input_feature, self.args.mlp_size) }] + [{ 'mlp': (self.args.mlp_size, self.args.mlp_size) }] * (self.args.mlp_linear - 1) controller_dic = deepcopy(self.controller.state_dict()) for task in range(self.opts.num_task): print( '--------------Create Config and Dict for task {}--------------' .format(task)) if self.args.random: self.controller.load_state_dict(deepcopy(controller_dic)) elif self.args.gaussian > 0: temp = deepcopy(self.controller.state_dict()) for key, value in temp.items(): temp[key] = value + torch.randn_like(value) * ( self.args.gaussian**0.5) self.controller.load_state_dict(temp) elif self.args.random_c: temp = deepcopy(self.controller.state_dict()) for key, value in temp.items(): if 'choice' in key: temp[key] = controller_dic[key] self.controller.load_state_dict(temp) if task == 0: cur_model = MLP(default_config, self.args.mlp_size, self.opts) trainer = Trainer(model=cur_model, task=task, args=self.args, data=self.data) cur_acc, cur_model_dic = trainer.run() self.tasks_config.append(default_config) self.task_acc.append(cur_acc) self.model_dict.append(cur_model_dic) print('Task{} Best Acc is {}'.format(task, cur_acc)) report_final_eval_acc[task][:task + 1] = [cur_acc] else: best_reward = float('-inf') cur_acc_lis = [] cur_best_acc, cur_best_dic, cur_best_config, best_create_log, step_probs = 0, None, None, None, None report_back_acc_list = None if self.args.upper_bound: valid_idx = list(range(task + 1)) total_choice = list( itertools.product(valid_idx, repeat=self.args.mlp_linear)) * 5 total_step = len(total_choice) elif self.args.base_model: total_choice = [[task] * self.args.mlp_linear] total_step = 1 else: total_step = self.args.controller_steps for steps in range(total_step): if self.args.upper_bound or self.args.base_model: step_idx = list(total_choice[steps]) else: self.controller.train() step_probs, step_idx, sample_loss = self.controller_sample( task) cur_model_dict, cur_model_config, create_log = self.crop_model( step_idx, default_config) cur_model = MLP(cur_model_config, self.args.mlp_size, self.opts) trainer = Trainer(model=cur_model, task=task, args=self.args, data=self.data) trainer.reload_checkpoint(cur_model_dict) cur_acc, cur_model_dic = trainer.run( task_list=list(range(0, task))) cur_acc_lis.append(cur_acc) back_acc_list = trainer.history_eval( task_list=list(range(0, task))) reward = self.count_reward(cur_acc_lis, back_acc_list) if steps % self.args.controller_logging_step == 0: print( '-------Logging at {} step for controller-------'. format(steps)) print(create_log) print('Reward:{}. 
'.format(reward)) if step_probs: for step_prob in step_probs: print(step_prob) if reward > best_reward: best_reward = reward cur_best_dic = cur_model_dic cur_best_acc = cur_acc cur_best_config = cur_model_config report_back_acc_list = back_acc_list best_create_log = create_log if self.args.upper_bound or self.args.base_model: pass else: self.controller_optim.zero_grad() loss = sample_loss * reward loss.backward() self.controller_optim.step() print('\033[95mAfter task {}'.format(task)) print(best_create_log) final_log = final_log + best_create_log + '\n' print('best reward :{}\033[0m'.format(best_reward)) self.tasks_config.append(cur_best_config) self.task_acc.append(cur_best_acc) self.model_dict.append(cur_best_dic) report_final_eval_acc[task][:len(report_back_acc_list) + 1] = report_back_acc_list + [ cur_best_acc ] if task == self.opts.num_task - 1: all_acc = torch.mean( torch.tensor(report_back_acc_list + [cur_best_acc])).item() return report_final_eval_acc, final_log, all_acc def run_cnn(self): final_log = '' report_final_eval_acc = [[0.0] * self.opts.num_task for _ in range(self.opts.num_task)] if self.args.dataset == 'mnist': input_size = 28 input_channel = 1 elif self.args.dataset == 'cifar10': input_size = 32 input_channel = 3 else: input_feature = 0 raise NotImplemented # (((inputsize-3)//2 -2)//2-1)//2 final_size = (( (input_size - input_size // 8 + 1) // 2 - input_size // 10 + 1) // 2 - 1) // 2 default_config = [{ 'conv': [(input_channel, 64, input_size // 8)] }, { 'conv': [(64, 128, input_size // 10)] }, { 'conv': [(128, 256, 2)] }, { 'mlp': (final_size**2 * 256, 2048) }, { 'mlp': (2048, 2048) }] controller_dic = deepcopy(self.controller.state_dict()) for task in range(self.opts.num_task): print( '--------------Create Config and Dict for task {}--------------' .format(task)) if self.args.random: self.controller.load_state_dict(deepcopy(controller_dic)) elif self.args.gaussian > 0: temp = deepcopy(self.controller.state_dict()) for key, value in temp.items(): temp[key] = value + torch.randn_like(value) * ( self.args.gaussian**0.5) self.controller.load_state_dict(temp) elif self.args.random_c: temp = deepcopy(self.controller.state_dict()) for key, value in temp.items(): if 'choice' in key: temp[key] = controller_dic[key] self.controller.load_state_dict(temp) if task == 0: cur_model = CNN(default_config, self.args.cnn_linear_size, self.opts) trainer = Trainer(model=cur_model, task=task, args=self.args, data=self.data) cur_acc, cur_model_dic = trainer.run() self.tasks_config.append(default_config) self.task_acc.append(cur_acc) self.model_dict.append(cur_model_dic) print('Task{} Best Acc is {}'.format(task, cur_acc)) report_final_eval_acc[task][:task + 1] = [cur_acc] else: best_reward = float('-inf') cur_acc_lis = [] cur_best_acc, cur_best_dic, cur_best_config, best_create_log, step_probs = 0, None, None, None, None report_back_acc_list = None total_step = self.args.controller_steps for steps in range(total_step): self.controller.train() step_probs, step_idx, sample_loss = self.controller_sample( task) cur_model_dict, cur_model_config, create_log = self.crop_model( step_idx, default_config) cur_model = CNN(cur_model_config, self.args.cnn_linear_size, self.opts) trainer = Trainer(model=cur_model, task=task, args=self.args, data=self.data) trainer.reload_checkpoint(cur_model_dict) cur_acc, cur_model_dic = trainer.run( task_list=list(range(0, task))) cur_acc_lis.append(cur_acc) back_acc_list = trainer.history_eval( task_list=list(range(0, task))) reward = self.count_reward(cur_acc_lis, 
back_acc_list) if steps % self.args.controller_logging_step == 0: print( '-------Logging at {} step for controller-------'. format(steps)) print(create_log) print('Reward:{}. '.format(reward)) if step_probs: for step_prob in step_probs: print(step_prob) if reward > best_reward: best_reward = reward cur_best_dic = cur_model_dic cur_best_acc = cur_acc cur_best_config = cur_model_config report_back_acc_list = back_acc_list best_create_log = create_log if self.args.upper_bound or self.args.base_model: pass else: self.controller_optim.zero_grad() loss = sample_loss * reward loss.backward() self.controller_optim.step() print('\033[95mAfter task {}'.format(task)) print(best_create_log) final_log = final_log + best_create_log + '\n' print('best reward :{}\033[0m'.format(best_reward)) self.tasks_config.append(cur_best_config) self.task_acc.append(cur_best_acc) self.model_dict.append(cur_best_dic) report_final_eval_acc[task][:len(report_back_acc_list) + 1] = report_back_acc_list + [ cur_best_acc ] if task == self.opts.num_task - 1: all_acc = torch.mean( torch.tensor(report_back_acc_list + [cur_best_acc])).item() return report_final_eval_acc, final_log, all_acc
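# The index arithmetic in controller_sample/crop_model above encodes the per-layer choice
# space. An illustrative decoder (hypothetical helper, not part of the original class)
# makes the [NEW, FUSE, REUSE/ADAPT per task] layout explicit:
def decode_choice(step, general_scope, task_scope):
    """Illustrative only: map a sampled index to the operation crop_model performs."""
    if step == 0:
        return 'NEW', None
    if general_scope > 1 and step == general_scope - 1:
        return 'FUSE', None
    task_num = (step - general_scope) // task_scope
    choice = (step - general_scope) % task_scope + 1
    return ('REUSE' if choice == 1 else 'ADAPT'), task_num

# e.g. with general_scope=2 (fuse enabled) and task_scope=2 (adapt enabled):
# decode_choice(2, 2, 2) -> ('REUSE', 0), decode_choice(3, 2, 2) -> ('ADAPT', 0)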
# Fix numeric divergence due to bug in Cudnn
torch.backends.cudnn.benchmark = True

device = torch.device("cuda" if cuda else "cpu")
trained = 0

# model = VAE(3, LSIZE).to(device)
vae_model = VAE(3, LSIZE)
vae_model = torch.nn.DataParallel(vae_model, device_ids=[7])
vae_model.cuda(7)
vae_model.eval()

mdrnn_model = MDRNNCell(LSIZE, ASIZE, RSIZE, 5)
mdrnn_model = torch.nn.DataParallel(mdrnn_model, device_ids=[7])
mdrnn_model.cuda(7)
mdrnn_model.eval()

controller = torch.nn.DataParallel(Controller(LSIZE, RSIZE, ASIZE)).cuda()

vis = visdom.Visdom(env='dream')
image_window = vis.image(
    np.random.rand(RED_SIZE * 10, RED_SIZE * 10),
    opts=dict(title='dream!', caption='dream.'),
)

# check vae dir exists, if not, create it
dream_dir = join(args.logdir, 'dream')
vae_dir = join(args.logdir, 'vae')

reload_file = join(vae_dir, 'best.tar')
state = torch.load(reload_file)
print("Reloading model at epoch {}"
      ", with test error {}".format(state['epoch'], state['precision']))
vae_model.load_state_dict(state['state_dict'])
def train_explorer(logdir, epochs=10, n_samples=4, pop_size=4, display=True, max_workers=10): results = {} results['best'] = [] # multiprocessing variables num_workers = min(max_workers, n_samples * pop_size) time_limit = 1000 # create tmp dir if non existent and clean it if existent tmp_dir = join(logdir, 'tmp_exp') if not exists(tmp_dir): mkdir(tmp_dir) else: for fname in listdir(tmp_dir): unlink(join(tmp_dir, fname)) # create exp dir if non exitent explore_dir = join(logdir, 'explore') if not exists(explore_dir): mkdir(explore_dir) ################################################################################ # Thread routines # ################################################################################ def slave_routine(p_queue, r_queue, e_queue, p_index): """ Thread routine. Threads interact with p_queue, the parameters queue, r_queue, the result queue and e_queue the end queue. They pull parameters from p_queue, execute the corresponding rollout, then place the result in r_queue. Each parameter has its own unique id. Parameters are pulled as tuples (s_id, params) and results are pushed as (s_id, result). The same parameter can appear multiple times in p_queue, displaying the same id each time. As soon as e_queue is non empty, the thread terminate. When multiple gpus are involved, the assigned gpu is determined by the process index p_index (gpu = p_index % n_gpus). :args p_queue: queue containing couples (s_id, parameters) to evaluate :args r_queue: where to place results (s_id, results) :args e_queue: as soon as not empty, terminate :args p_index: the process index """ # init routine gpu = p_index % torch.cuda.device_count() device = torch.device( 'cuda:{}'.format(gpu) if torch.cuda.is_available() else 'cpu') # redirect streams sys.stdout = open(join(tmp_dir, str(getpid()) + '.out'), 'a') sys.stderr = open(join(tmp_dir, str(getpid()) + '.err'), 'a') # with torch.no_grad(): # r_gen = RolloutGenerator(logdir, device, time_limit) # while e_queue.empty(): # if p_queue.empty(): # sleep(.1) # else: # s_id, params = p_queue.get() # r_queue.put((s_id, r_gen.rollout(params))) with torch.no_grad(): r_gen = RolloutGenerator(logdir, device, time_limit) while e_queue.empty(): if p_queue.empty(): sleep(.1) else: s_id, params = p_queue.get() r_queue.put((s_id, r_gen.rollout(params))) ################################################################################ # Define queues and start workers # ################################################################################ p_queue = Queue() r_queue = Queue() e_queue = Queue() for p_index in range(num_workers): Process(target=slave_routine, args=(p_queue, r_queue, e_queue, p_index)).start() ################################################################################ # Evaluation # ################################################################################ def evaluate(solutions, results, rollouts=100): """ Give current controller evaluation. Evaluation is minus the cumulated reward averaged over rollout runs. 
:args solutions: CMA set of solutions :args results: corresponding results :args rollouts: number of rollouts :returns: minus averaged cumulated reward """ index_min = np.argmin(results) best_guess = solutions[index_min] restimates = [] for s_id in range(rollouts): p_queue.put((s_id, best_guess)) print("Evaluating...") for _ in tqdm(range(rollouts)): while r_queue.empty(): sleep(.1) restimates.append(r_queue.get()[1]) return best_guess, np.mean(restimates), np.std(restimates) ################################################################################ # Launch CMA # ################################################################################ controller = Controller(LSIZE, RSIZE, ASIZE) # dummy instance # define current best and load parameters cur_best = None ctrl_file = join(explore_dir, 'best.tar') print("Attempting to load previous best...") if exists(ctrl_file): state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'}) cur_best = -state['reward'] controller.load_state_dict(state['state_dict']) print("Previous best was {}...".format(-cur_best)) parameters = controller.parameters() es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1, {'popsize': pop_size}) epoch = 0 log_step = 3 while not es.stop(): if cur_best is not None and -cur_best > target_return: print("Already better than target, breaking...") break r_list = [0] * pop_size # result list solutions = es.ask() # push parameters to queue for s_id, s in enumerate(solutions): for _ in range(n_samples): p_queue.put((s_id, s)) # retrieve results if display: pbar = tqdm(total=pop_size * n_samples) for _ in range(pop_size * n_samples): while r_queue.empty(): sleep(.1) r_s_id, r = r_queue.get() r_list[r_s_id] += r / n_samples if display: pbar.update(1) if display: pbar.close() es.tell(solutions, r_list) es.disp() # evaluation and saving if epoch % log_step == log_step - 1: best_params, best, std_best = evaluate(solutions, r_list) # log the best results['best'].append(best) print("Current evaluation: {}".format(best)) if not cur_best or cur_best > best: cur_best = best print("Saving new best with value {}+-{}...".format( -cur_best, std_best)) load_parameters(best_params, controller) torch.save( { 'epoch': epoch, 'reward': -cur_best, 'state_dict': controller.state_dict() }, join(explore_dir, 'best.tar')) if -best > target_return: print( "Terminating controller training with value {}...".format( best)) break epoch += 1 es.result_pretty() e_queue.put('EOP') return results
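# A hypothetical invocation of train_explorer; the log directory and hyperparameters are
# illustrative and assume a globally defined target_return, as in the snippet above.
if __name__ == '__main__':
    results = train_explorer('exp_logs', epochs=10, n_samples=4,
                             pop_size=4, display=True, max_workers=10)
    print('Best evaluation per logging step:', results['best'])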
def __init__(self, mdir, device, time_limit, number_goals, Forward_model,
             hiddengoals: bool, curiosityreward: bool, static: bool):
    """ Build vae, rnn, controller and environment. """
    # Loading world model and vae
    vae_file, rnn_file, ctrl_file, Dtild_file, hiddenvae_file = [
        join(mdir, m, 'best.tar')
        for m in ['vae', 'mdrnn', 'ctrl', 'dtild', 'hiddenvae']
    ]
    assert exists(vae_file) and exists(rnn_file), \
        "Either vae or mdrnn is untrained."

    vae_state, rnn_state, hiddenvae_state = [
        torch.load(fname, map_location={'cuda:0': str(device)})
        for fname in (vae_file, rnn_file, hiddenvae_file)
    ]

    for m, s in (('VAE', vae_state), ('MDRNN', rnn_state),
                 ('HiddenVAE', hiddenvae_state)):
        print("Loading {} at epoch {} "
              "with test loss {}".format(m, s['epoch'], s['precision']))

    self.vae = VAE(3, LSIZE).to(device)
    self.vae.load_state_dict(vae_state['state_dict'])

    self.HiddenVAE = HiddenVAE(256, LSIZE).to(device)
    self.HiddenVAE.load_state_dict(hiddenvae_state['state_dict'])

    self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device)
    self.mdrnn.load_state_dict(
        {k.strip('_l0'): v for k, v in rnn_state['state_dict'].items()})

    self.mdrnnBIG = MDRNN(LSIZE, ASIZE, RSIZE, 5).to(device)
    self.mdrnnBIG.load_state_dict(rnn_state["state_dict"])

    self.controller = Controller(256, 256, 6).to(device)

    self.env = gym.make('MiniGrid-MultiRoom-N6-v0')
    self.device = device
    self.number_goals = number_goals
    self.time_limit = time_limit
    self.vae_state = vae_state
    self.rnn_state = rnn_state
    self.hiddenvae_state = hiddenvae_state
    self.hiddengoals = hiddengoals
    self.curiosityreward = curiosityreward
    self.static = static
    self.Forward_model = Forward_model
    self.fmodel = Dtild(32, 256, 1, 32).to(device)
class RolloutGenerator(object): """ Utility to generate rollouts. Encapsulate everything that is needed to generate rollouts in the TRUE ENV using a controller with previously trained VAE and MDRNN. :attr vae: VAE model loaded from mdir/vae :attr mdrnn: MDRNN model loaded from mdir/mdrnn :attr controller: Controller, either loaded from mdir/ctrl or randomly initialized :attr env: instance of the CarRacing-v0 gym environment :attr device: device used to run VAE, MDRNN and Controller :attr time_limit: rollouts have a maximum of time_limit timesteps """ def __init__(self, mdir, device, time_limit): """ Build vae, rnn, controller and environment. """ # Loading world model and vae vae_file, rnn_file, ctrl_file = \ [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']] assert exists(vae_file) and exists(rnn_file),\ "Either vae or mdrnn is untrained." vae_state, rnn_state = [ torch.load(fname, map_location={'cuda:0': str(device)}) for fname in (vae_file, rnn_file)] for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)): print("Loading {} at epoch {} " "with test loss {}".format( m, s['epoch'], s['precision'])) self.vae = VAE(3, LSIZE).to(device) self.vae.load_state_dict(vae_state['state_dict']) self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device) self.mdrnn.load_state_dict( {k.strip('_l0'): v for k, v in rnn_state['state_dict'].items()}) self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device) # load controller if it was previously saved if exists(ctrl_file): ctrl_state = torch.load(ctrl_file, map_location={'cuda:0': str(device)}) print("Loading Controller with reward {}".format( ctrl_state['reward'])) self.controller.load_state_dict(ctrl_state['state_dict']) self.env = gym.make('CarRacing-v0') self.device = device self.time_limit = time_limit def get_action_and_transition(self, obs, hidden): """ Get action and transition. Encode obs to latent using the VAE, then obtain estimation for next latent and next hidden state using the MDRNN and compute the controller corresponding action. :args obs: current observation (1 x 3 x 64 x 64) torch tensor :args hidden: current hidden state (1 x 256) torch tensor :returns: (action, next_hidden) - action: 1D np array - next_hidden (1 x 256) torch tensor """ _, latent_mu, _ = self.vae(obs) action = self.controller(latent_mu, hidden[0]) _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden) return action.squeeze().cpu().numpy(), next_hidden def rollout(self, params, render=False): """ Execute a rollout and returns minus cumulative reward. Load :params: into the controller and execute a single rollout. This is the main API of this class. :args params: parameters as a single 1D np array :returns: minus cumulative reward """ # copy params into the controller if params is not None: load_parameters(params, self.controller) obs = self.env.reset() # This first render is required ! self.env.render() hidden = [ torch.zeros(1, RSIZE).to(self.device) for _ in range(2)] cumulative = 0 i = 0 while True: obs = transform(obs).unsqueeze(0).to(self.device) action, hidden = self.get_action_and_transition(obs, hidden) obs, reward, done, _ = self.env.step(action) if render: self.env.render() cumulative += reward if done or i > self.time_limit: return - cumulative i += 1
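# Usage sketch for the RolloutGenerator above (directory layout and time limit are
# illustrative). rollout() returns minus the cumulative reward so that CMA-ES, which
# minimizes, can consume it directly.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
generator = RolloutGenerator('exp_dir', device, time_limit=1000)
with torch.no_grad():
    # params=None keeps whatever weights the controller was constructed/loaded with.
    neg_reward = generator.rollout(params=None, render=False)
print('cumulative reward:', -neg_reward)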
class RolloutGenerator(object): """ Utility to generate rollouts. Encapsulate everything that is needed to generate rollouts in the TRUE ENV using a controller with previously trained VAE and MDRNN. :attr vae: VAE model loaded from mdir/vae :attr mdrnn: MDRNN model loaded from mdir/mdrnn :attr controller: Controller, either loaded from mdir/ctrl or randomly initialized :attr env: instance of the CarRacing-v0 gym environment :attr device: device used to run VAE, MDRNN and Controller :attr time_limit: rollouts have a maximum of time_limit timesteps """ def __init__(self, mdir, device, time_limit, iteration_num=None, video_dir=None): """ Build vae, rnn, controller and environment. """ # Loading world model and vae vae_file, rnn_file, ctrl_file = [ join(mdir, m, "best.tar") for m in ["vae", "mdrnn", "ctrl"] ] if iteration_num is not None: vae_file, rnn_file, ctrl_file = [ join(mdir, m, "iter_{}".format(iteration_num), "best.tar") for m in ["vae", "mdrnn", "ctrl"] ] assert exists(vae_file) and exists( rnn_file), "Either vae or mdrnn is untrained." if iteration_num is not None: vae_file, rnn_file, ctrl_file = [ join(mdir, m, "iter_{}".format(iteration_num), "best.tar") for m in ["vae", "mdrnn", "ctrl"] ] assert exists(vae_file) and exists( rnn_file), "Either vae or mdrnn is untrained." print("\nRollout Generator") vae_state, rnn_state = [ torch.load(fname, map_location={"cuda:0": str(device)}) for fname in (vae_file, rnn_file) ] print("Loading VAE from {}".format(vae_file)) print("Loading RNN from {}".format(rnn_file)) for m, s in (("VAE", vae_state), ("MDRNN", rnn_state)): print("Loading {} at epoch {} " "with test loss {}".format(m, s["epoch"], s["precision"])) self.vae = VAE(3, LSIZE).to(device) self.vae.load_state_dict(vae_state["state_dict"]) self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device) self.mdrnn.load_state_dict( {k.strip("_l0"): v for k, v in rnn_state["state_dict"].items()}) self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device) # load controller if it was previously saved if exists(ctrl_file): print("Loading Controller from {}".format(ctrl_file)) ctrl_state = torch.load(ctrl_file, map_location={"cuda:0": str(device)}) print("Loading Controller with reward {}".format( ctrl_state["reward"])) self.controller.load_state_dict(ctrl_state["state_dict"]) self.env = gym.make("BipedalWalkerHardcore-v2") self.device = device self.time_limit = time_limit def get_action_and_transition(self, obs, hidden): """ Get action and transition. Encode obs to latent using the VAE, then obtain estimation for next latent and next hidden state using the MDRNN and compute the controller corresponding action. :args obs: current observation (1 x 3 x 64 x 64) torch tensor :args hidden: current hidden state (1 x 256) torch tensor :returns: (action, next_hidden) - action: 1D np array - next_hidden (1 x 256) torch tensor """ _, latent_mu, _ = self.vae(obs) action = self.controller(latent_mu, hidden[0]) _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden) return action.squeeze().cpu().numpy(), next_hidden def rollout(self, params, render=False, rollout_dir=None, rollout_num=0, video_dir=None): """ Execute a rollout and returns minus cumulative reward. Load :params: into the controller and execute a single rollout. This is the main API of this class. 
:args params: parameters as a single 1D np array :returns: minus cumulative reward """ if video_dir is not None: self.env = wrappers.Monitor( self.env, "./{}/rollout_{}/".format(video_dir, rollout_num)) # copy params into the controller if params is not None: load_parameters(params, self.controller) self.env.reset() # This first render is required ! obs = self.env.render(mode='rgb_array') hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)] cumulative = 0 i = 0 s_rollout = [] r_rollout = [] d_rollout = [] a_rollout = [] print('Starting to create the rollouts') while True: if i % 100 == 0: print("{} steps done of rollout".format(i)) obs = transform(obs).unsqueeze(0).to(self.device) action, hidden = self.get_action_and_transition(obs, hidden) _, reward, done, _ = self.env.step(action) # Save rollout data im_frame = self.env.render(mode="rgb_array") img = PIL.Image.fromarray(im_frame) img = img.resize((64, 64)) obs = np.array(img) s_rollout += [obs] r_rollout += [reward] d_rollout += [done] a_rollout += [action] if render: self.env.render() cumulative += reward if done or i > self.time_limit: print('Completed rollout with {} steps'.format(i)) if rollout_dir is not None: print("> End of rollout {}, {} frames...".format( rollout_num, len(s_rollout))) np.savez( join(rollout_dir, "rollout_{}".format(rollout_num)), observations=np.array(s_rollout), rewards=np.array(r_rollout), actions=np.array(a_rollout), terminals=np.array(d_rollout), ) self.env.reset() return -cumulative i += 1
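# The rollouts saved above are plain .npz archives; reading one back is straightforward
# (the rollout_{num} file name follows the pattern used in the snippet):
import numpy as np

data = np.load('rollout_0.npz')
observations = data['observations']  # (T, 64, 64, 3) resized frames
rewards = data['rewards']            # (T,)
actions = data['actions']            # (T, action_dim)
terminals = data['terminals']        # (T,) done flags
print(observations.shape, rewards.sum())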
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(0.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)

################################################################################
#                                 Launch CMA                                   #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(prev_ctrl_dir, "best.tar")
print("Attempting to load previous best...")
if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={"cuda:0": "cpu"})
    # cur_best = -state["reward"]
    print("Loading Controller from {}".format(ctrl_file))
    controller.load_state_dict(state["state_dict"])
    # print("Previous best was {}...".format(-cur_best))

parameters = controller.parameters()
es = cma.CMAEvolutionStrategy(
    flatten_parameters(parameters), 0.1, {"popsize": pop_size}
        elif arg == 'fuzzy':
            controller_fuzzy = True
            mono = True
            simulations /= 2
        else:
            raise GetoptError()
except GetoptError:
    usage()
    sys.exit(-1)

# simulations = 5 if len(sys.argv) == 1 else int(sys.argv[1])
for i in range(int(simulations * 2)):
    # create a controller
    if mono:
        control = FuzzyLogicController(log=log) if controller_fuzzy else Controller(log=log)
    else:
        control = Controller(log=log) if i % 2 == 0 else FuzzyLogicController(log=log)

    # create North-to-South and West-to-East lanes
    north2south = Lane(control, S=15, D=7, name='North to South', init_state=State.green)
    west2east = Lane(control, S=15, D=7, name='West to East', init_state=State.red)
M = buildMemory('weights/2019.12.07/mdn_rnn_weights')
get_hidden = K.function(M.layers[0].input, M.layers[0].output)

# In[ ]:

print(M.summary())

# In[ ]:

controller = Controller(32 + 256, 3)
controller.set_weights(np.load('./weights/C_weights.npy'))

# $$\text{Controller}: \mathbb R^{288} \rightarrow \mathbb R^3 $$

# In[ ]:

print('controller shape:')

# In[ ]:

print(controller.shape)
    for s_id in range(rollouts):
        p_queue.put((s_id, best_guess))

    print("Evaluating...")
    for _ in tqdm(range(rollouts)):
        while r_queue.empty():
            sleep(.1)
        restimates.append(r_queue.get()[1])

    return best_guess, np.mean(restimates), np.std(restimates)

################################################################################
#                                 Launch CMA                                   #
################################################################################
controller = Controller(LSIZE, RSIZE, ASIZE)  # dummy instance

# define current best and load parameters
cur_best = None
ctrl_file = join(ctrl_dir, 'best.tar')
print("Attempting to load previous best...")
if exists(ctrl_file):
    state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'})
    cur_best = -state['reward']
    controller.load_state_dict(state['state_dict'])
    print("Previous best was {}...".format(-cur_best))

parameters = controller.parameters()
es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                              {'popsize': pop_size})
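# flatten_parameters and load_parameters are used in all of these CMA-ES snippets but are
# not shown. A plausible sketch, assuming they simply round-trip between a module's
# parameters and a flat numpy vector (the project's own utilities may differ in detail):
import numpy as np
import torch

def flatten_parameters_sketch(params):
    """Illustrative only: concatenate all parameter tensors into one flat numpy vector."""
    return torch.cat([p.detach().reshape(-1) for p in params]).cpu().numpy()

def load_parameters_sketch(flat, model):
    """Illustrative only: copy a flat vector (e.g. a CMA-ES candidate) back into the model."""
    flat = torch.as_tensor(np.asarray(flat), dtype=torch.float32)
    offset = 0
    for p in model.parameters():
        n = p.numel()
        p.data.copy_(flat[offset:offset + n].view_as(p))
        offset += n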
class RolloutGenerator(object): """ Utility to generate rollouts. Encapsulate everything that is needed to generate rollouts in the TRUE ENV using a controller with previously trained VAE and MDRNN. :attr vae: VAE model loaded from mdir/vae :attr mdrnn: MDRNN model loaded from mdir/mdrnn :attr controller: Controller, either loaded from mdir/ctrl or randomly initialized :attr env: instance of the CarRacing-v0 gym environment :attr device: device used to run VAE, MDRNN and Controller :attr time_limit: rollouts have a maximum of time_limit timesteps """ def __init__(self, mdir, device, time_limit, explorer=False): """ Build vae, rnn, controller and environment. """ self.explorer = explorer # Load controllers vae_file, rnn_file, ctrl_file = \ [join(mdir, m, 'best.tar') for m in ['vae', 'mdrnn', 'ctrl']] if self.explorer: ctrl_file = join(mdir, 'exp', 'best.tar') assert exists(vae_file) and exists(rnn_file),\ "Either vae or mdrnn is untrained." vae_state, rnn_state = [ torch.load(fname, map_location={'cuda:0': str(device)}) for fname in (vae_file, rnn_file) ] for m, s in (('VAE', vae_state), ('MDRNN', rnn_state)): print("Loading {} at epoch {} " "with test loss {}".format(m, s['epoch'], s['precision'])) self.vae = VAE(3, LSIZE).to(device) self.vae.load_state_dict(vae_state['state_dict']) # MDRNNCell self.mdrnn = MDRNNCell(LSIZE, ASIZE, RSIZE, 5).to(device) self.mdrnn.load_state_dict( {k.strip('_l0'): v for k, v in rnn_state['state_dict'].items()}) self.controller = Controller(LSIZE, RSIZE, ASIZE).to(device) # load controller if it was previously saved if exists(ctrl_file): ctrl_state = torch.load(ctrl_file, map_location={'cuda:0': str(device)}) print("Loading Controller with reward {}".format( ctrl_state['reward'])) self.controller.load_state_dict(ctrl_state['state_dict']) self.env = gym.make('CarRacing-v0') self.device = device self.time_limit = time_limit self.mdrnn_notcell = MDRNN(LSIZE, ASIZE, RSIZE, 5) self.mdrnn_notcell.to(device) self.mdrnn_notcell.load_state_dict(rnn_state['state_dict']) #####$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ # VERY LAZY. Copied from the other trainmdrnn file # from trainmdrnn import get_loss, to_latent def to_latent(self, obs, next_obs): """ Transform observations to latent space. :args obs: 5D torch tensor (BSIZE, SEQ_LEN, ASIZE, SIZE, SIZE) :args next_obs: 5D torch tensor (BSIZE, SEQ_LEN, ASIZE, SIZE, SIZE) :returns: (latent_obs, latent_next_obs) - latent_obs: 4D torch tensor (BSIZE, SEQ_LEN, LSIZE) - next_latent_obs: 4D torch tensor (BSIZE, SEQ_LEN, LSIZE) """ with torch.no_grad(): obs, next_obs = [ f.upsample(x.view(-1, 3, SIZE, SIZE), size=RED_SIZE, mode='bilinear', align_corners=True) for x in (obs, next_obs) ] (obs_mu, obs_logsigma), (next_obs_mu, next_obs_logsigma) = [ self.vae(x)[1:] for x in (obs, next_obs) ] SEQ_LEN = 1 latent_obs, latent_next_obs = [ (x_mu + x_logsigma.exp() * torch.randn_like(x_mu)).view( BSIZE, SEQ_LEN, LSIZE) for x_mu, x_logsigma in [( obs_mu, obs_logsigma), (next_obs_mu, next_obs_logsigma)] ] return latent_obs, latent_next_obs def mdrnn_exp_reward(self, latent_obs, action, reward, latent_next_obs, hidden): """ # REMOVE TERMINAL Compute losses. The loss that is computed is: (GMMLoss(latent_next_obs, GMMPredicted) + MSE(reward, predicted_reward) + BCE(terminal, logit_terminal)) / (LSIZE + 2) The LSIZE + 2 factor is here to counteract the fact that the GMMLoss scales approximately linearily with LSIZE. All losses are averaged both on the batch and the sequence dimensions (the two first dimensions). 
:args latent_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor :args action: (BSIZE, SEQ_LEN, ASIZE) torch tensor :args reward: (BSIZE, SEQ_LEN) torch tensor :args latent_next_obs: (BSIZE, SEQ_LEN, LSIZE) torch tensor :returns: dictionary of losses, containing the gmm, the mse, the bce and the averaged loss. """ mus, sigmas, logpi, rs, ds, next_hidden = self.mdrnn( action, latent_obs, hidden) gmm = gmm_loss(latent_next_obs, mus, sigmas, logpi) # bce = f.binary_cross_entropy_with_logits(ds, terminal) mse = f.mse_loss(rs, reward) loss = (gmm + mse) / (LSIZE + 2) return loss.squeeze().cpu().numpy() # def recon_error_reward(self, obs, hidden, obs_new): # print('recon_error_reward') # """Find out how good the reconstruction was. # Encoding the vae to get mu and the controller action is deterministic, so its fine to be duplicated # ??? maybe remove this and the above function because of unnecessary duplication # """ # # obs_new = torch.from_numpy(np.moveaxis(obs_new, 2, 0).copy()).unsqueeze(0).to(self.device).type(torch.cuda.FloatTensor) # # obs = obs.to(self.device).type(torch.cuda.FloatTensor) # _, latent_mu, _ = self.vae(obs) # action = self.controller(latent_mu, hidden[0]) # mus, sigmas, logpi, r, d, next_hidden = self.mdrnn(action, latent_mu, hidden) # print('mus.size()', mus.size()) # print('sigmas.size()', sigmas.size()) # print('logpi.size()', logpi.size()) # print('r.size()', r.size()) # print('d.size()', d.size()) # print('next_hidden.size() [0], [1]', next_hidden[0].size(), next_hidden[1].size()) # recon_x = self.vae.decoder(mus.squeeze()).type(torch.cuda.FloatTensor) # ??? this is just mu, right? Still a bit confused # print('obs_new.size()', obs_new.size()) # print('recon_x.size()', recon_x.size()) # # reward = -1*((recon_x - obs_new) ** 2).mean() # reward = -1*F.mse_loss(recon_x, obs_new).item() def rollout(self, params, render=False): """ Execute a rollout and return reward Load :params: into the controller and execute a single rollout. This is the main API of this class. :args params: parameters as a single 1D np array :returns: minus cumulative reward if ctrl mode, cumulative recon_error if exp mode """ # copy params into the controller if params is not None: load_parameters(params, self.controller) obs = self.env.reset() # This first render is required ! self.env.render() hidden = [torch.zeros(1, RSIZE).to(self.device) for _ in range(2)] cumulative = 0 i = 0 while True: obs = transform(obs).unsqueeze(0).to(self.device) # GET ACTION _, latent_mu, _ = self.vae(obs) action = self.controller(latent_mu, hidden[0]) _, _, _, _, _, next_hidden = self.mdrnn(action, latent_mu, hidden) action = action.squeeze().cpu().numpy() next_obs, reward, done, _ = self.env.step(action) if self.explorer: latent_obs, latent_next_obs = self.to_latent( obs.unsqueeze(0), transform(next_obs).unsqueeze(0).to(self.device)) action = torch.from_numpy(action).unsqueeze(0) latent_obs = latent_obs.to(self.device).squeeze().unsqueeze(0) latent_next_obs = latent_next_obs.to( self.device).squeeze().unsqueeze(0) action = action.to(self.device) reward = torch.from_numpy(np.array(reward)).unsqueeze(0).type( torch.cuda.FloatTensor) reward = self.mdrnn_exp_reward(latent_obs, action, reward, latent_next_obs, hidden) obs = next_obs hidden = next_hidden if render: self.env.render() cumulative += reward if done or i > self.time_limit: return -cumulative i += 1
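# For reference, the exploration reward returned by mdrnn_exp_reward above is the
# world-model loss from its docstring with the terminal (BCE) term commented out:
# $$ \mathcal{L} = \frac{\mathrm{GMM}(z_{t+1}, \hat z_{t+1}) + \mathrm{MSE}(r_t, \hat r_t)}{LSIZE + 2} $$
# The LSIZE + 2 denominator counteracts the roughly linear scaling of the GMM term with LSIZE.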
def train_C_given_M(mdrnnCell, latent_dim, hidden_dim, action_dim):
    # Parameters
    num_episode = 1
    batch_size = 1
    learning_rate = 0.01
    gamma = 0.99
    done_threshold = np.log(0.5)

    interim_policy = Controller(latent_dim, hidden_dim, action_dim)
    optimizer = torch.optim.RMSprop(interim_policy.parameters(), lr=learning_rate)

    # Batch History
    state_pool = []
    action_pool = []
    reward_pool = []
    steps = 0

    for e in range(num_episode):
        # initial latent and hidden states
        z_t = torch.randn(1, LSIZE)
        h_t = 2 * [torch.zeros(1, RSIZE)]

        for t in range(1000):
            # pick action using policy net given z_t, h_t
            mean_a_t = interim_policy(z_t, h_t[0])
            action_policy_std = 0.1
            cov = action_policy_std * torch.eye(action_dim)
            stochastic_policy = MultivariateNormal(loc=mean_a_t,
                                                   covariance_matrix=cov)
            a_t = stochastic_policy.sample()

            mu, sigma, pi, r, d, n_h = mdrnnCell(a_t, z_t, h_t)

            # sample next z_t from N(mu, sigma)
            pi = pi.squeeze()
            mixt = Categorical(torch.exp(pi)).sample().item()
            z_t = mu[:, mixt, :]  # + sigma[:, mixt, :] * torch.randn_like(mu[:, mixt, :])
            h_t = n_h

            reward = -0.1
            if d >= done_threshold:
                done = True
            else:
                done = False

            state_pool.append((z_t, h_t))
            action_pool.append(a_t)
            reward_pool.append(reward)
            steps += 1

            if done:
                break

        # Update policy
        if e > 0 and e % batch_size == 0:
            # Discount reward
            running_add = 0
            for i in reversed(range(steps)):
                if reward_pool[i] == 0:
                    running_add = 0
                else:
                    running_add = running_add * gamma + reward_pool[i]
                    reward_pool[i] = running_add

            # Normalize reward
            reward_mean = np.mean(reward_pool)
            reward_std = np.std(reward_pool)
            for i in range(steps):
                reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

            # Gradient Descent
            optimizer.zero_grad()
            for i in range(steps):
                z_t, h_t = state_pool[i]
                action = action_pool[i]
                reward = reward_pool[i]

                mean_a_t = interim_policy(z_t, h_t[0])
                action_policy_std = 0.1
                cov = action_policy_std * torch.eye(action_dim)
                stochastic_policy = MultivariateNormal(loc=mean_a_t,
                                                       covariance_matrix=cov)
                loss = -stochastic_policy.log_prob(action) * reward  # Negative score function x reward
                # TODO: why do we need to use retain_graph here?
                loss.backward(retain_graph=True)

            optimizer.step()

            state_pool = []
            action_pool = []
            reward_pool = []
            steps = 0

    return interim_policy
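# Hypothetical driver for train_C_given_M: train an interim controller purely inside the
# learned dynamics model (the MDRNNCell is assumed to already hold trained weights).
mdrnn_cell = MDRNNCell(LSIZE, ASIZE, RSIZE, 5)
interim = train_C_given_M(mdrnn_cell, latent_dim=LSIZE, hidden_dim=RSIZE, action_dim=ASIZE)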
def ctrl_exp_gen_data(rollouts, datadir, logdir, noise_type, device, use_ctrl_exp, exp_prob=.5, randomness_factor=.1): """ randomness factor is the multiple we will multiply the current standard deviation by to get the std for the normal disnt std. This help because it is more resonable. Really should be updating based on parameter distances over updates, but whatever. ** read the openai parameter thing Uses fixed parameters for vae and mdrnn, but maybe change All the if use_ctrl_exp: should be switched to having the random thing inside a module, or at least consistent with the explorer way. """ assert exists(logdir), "The directory does not exist..." exp_prob = float(exp_prob) env = gym.make("CarRacing-v0") seq_len = 1000 if use_ctrl_exp: a_rollout = [] #### Load controller and explorer ctrl_file = join(logdir, 'ctrl', 'best.tar') exp_file = join(logdir, 'exp', 'best.tar') controller = Controller(LSIZE, RSIZE, ASIZE).to(device) explorer = Controller(LSIZE, RSIZE, ASIZE).to(device) if exists(ctrl_file): ctrl_state = torch.load(ctrl_file, map_location={'cuda:0': str(device)}) print("Loading Controller with reward {}".format( ctrl_state['reward'])) controller.load_state_dict(ctrl_state['state_dict']) if exists(exp_file): exp_state = torch.load(exp_file, map_location={'cuda:0': str(device)}) print("Loading Explorer with reward {}".format( exp_state['reward'])) explorer.load_state_dict(exp_state['state_dict']) # Make the generators (this is unnecessary, shoul dbe organized some other way) ctrl_gen = RolloutGeneratorSingle(logdir, device, controller) exp_gen = RolloutGeneratorSingle(logdir, device, explorer) # for parameter noise exploration def update_params_noise(model, randomness_factor): def gaussian(ins, stddev=std): return ins + Variable(torch.randn(ins.size()).cuda() * stddev) all_params = [] controller_new = controller for name, param in controller.named_parameters(): all_params.append(param) std = np.std(np.array(params)) print('Parameter mean: ', np.mean(np.array(params))) print('Parameter std: ', std) std = std * randomness_factor controller_new.apply(gaussian) return controller_new for i in range(rollouts): env.reset() env.env.viewer.window.dispatch_events() s_rollout = [] r_rollout = [] d_rollout = [] if use_ctrl_exp: # randomize the explorer and controller explorer_new = update_params_noise(explorer, randomness_factor) controller_new = update_params_noise(controller, randomness_factor) # initialize the hidden state for the model: hidden = [torch.zeros(1, RSIZE).to(device) for _ in range(2)] else: if noise_type == 'white': a_rollout = [env.action_space.sample() for _ in range(seq_len)] elif noise_type == 'brown': a_rollout = sample_continuous_policy(env.action_space, seq_len, 1. / 50) t = 0 while True: if use_ctrl_exp: # explore or exploit: if random.uniform(0, 1) < exp_prob: action, obs, hidden = ctrl_gen(obs, hidden) else: action, obs, hidden = exp_gen(obs, hidden) a_rollout.append(action) else: action = a_rollout[t] t += 1 s, r, done, _ = env.step(action) env.env.viewer.window.dispatch_events() s_rollout += [s] r_rollout += [r] d_rollout += [done] if done: print("> End of rollout {}, {} frames...".format( i, len(s_rollout))) np.savez(join(datadir, 'rollout_{}'.format(i)), observations=np.array(s_rollout), rewards=np.array(r_rollout), actions=np.array(a_rollout), terminals=np.array(d_rollout)) break