# Define actor-critic model
acmodel = utils.load_model(args.model, raise_not_found=False)
if acmodel is None:
    if args.pretrained_model:
        acmodel = utils.load_model(args.pretrained_model, raise_not_found=True)
    else:
        acmodel = ACModel(obss_preprocessor.obs_space, envs[0].action_space,
                          args.image_dim, args.memory_dim, args.instr_dim,
                          not args.no_instr, args.instr_arch, not args.no_mem, args.arch)
obss_preprocessor.vocab.save()
utils.save_model(acmodel, args.model)

if torch.cuda.is_available():
    acmodel.cuda()

# Define actor-critic algo
reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
if args.algo == "ppo":
    algo = babyai.rl.PPOAlgo(envs, acmodel, args.frames_per_proc, args.discount, args.lr,
                             args.beta1, args.beta2, args.gae_lambda, args.entropy_coef,
                             args.value_loss_coef, args.max_grad_norm, args.recurrence,
                             args.optim_eps, args.clip_eps, args.ppo_epochs, args.batch_size,
                             obss_preprocessor, reshape_reward)
else:
    raise ValueError("Incorrect algorithm name: {}".format(args.algo))

# When using extra binary information, more tensors (model params) are initialized compared
# to when we don't use that. Thus, there starts to be a difference in the random state.
# If we want to avoid it, in order to make sure that the results of supervised-loss-coef=0.
# and extra-binary-info=0 match, we need to reseed here.
class ImitationLearning(object):
    def __init__(self, args):
        self.args = args

        utils.seed(self.args.seed)

        # args.env is a list when training on multiple environments
        if getattr(args, 'multi_env', None):
            self.env = [gym.make(item) for item in args.multi_env]

            self.train_demos = []
            for demos, episodes in zip(args.multi_demos, args.multi_episodes):
                demos_path = utils.get_demos_path(demos, None, None, valid=False)
                logger.info('loading {} of {} demos'.format(episodes, demos))
                train_demos = utils.load_demos(demos_path)
                logger.info('loaded demos')
                if episodes > len(train_demos):
                    raise ValueError("there are only {} train demos in {}".format(len(train_demos), demos))
                self.train_demos.extend(train_demos[:episodes])
                logger.info('So far, {} demos loaded'.format(len(self.train_demos)))

            self.val_demos = []
            for demos, episodes in zip(args.multi_demos, [args.val_episodes] * len(args.multi_demos)):
                demos_path_valid = utils.get_demos_path(demos, None, None, valid=True)
                logger.info('loading {} of {} valid demos'.format(episodes, demos))
                valid_demos = utils.load_demos(demos_path_valid)
                logger.info('loaded demos')
                if episodes > len(valid_demos):
                    logger.info('Using all the available {} demos to evaluate valid. accuracy'.format(len(valid_demos)))
                self.val_demos.extend(valid_demos[:episodes])
                logger.info('So far, {} valid demos loaded'.format(len(self.val_demos)))

            logger.info('Loaded all demos')

            observation_space = self.env[0].observation_space
            action_space = self.env[0].action_space

        else:
            self.env = gym.make(self.args.env)

            demos_path = utils.get_demos_path(args.demos, args.env, args.demos_origin, valid=False)
            demos_path_valid = utils.get_demos_path(args.demos, args.env, args.demos_origin, valid=True)

            logger.info('loading demos')
            self.train_demos = utils.load_demos(demos_path)
            logger.info('loaded demos')
            if args.episodes:
                if args.episodes > len(self.train_demos):
                    raise ValueError("there are only {} train demos".format(len(self.train_demos)))
                self.train_demos = self.train_demos[:args.episodes]

            self.val_demos = utils.load_demos(demos_path_valid)
            if args.val_episodes > len(self.val_demos):
                logger.info('Using all the available {} demos to evaluate valid. accuracy'.format(len(self.val_demos)))
            self.val_demos = self.val_demos[:self.args.val_episodes]

            observation_space = self.env.observation_space
            action_space = self.env.action_space

        self.obss_preprocessor = utils.ObssPreprocessor(
            args.model, observation_space, getattr(self.args, 'pretrained_model', None))

        # Define actor-critic model
        self.acmodel = utils.load_model(args.model, raise_not_found=False)
        if self.acmodel is None:
            if getattr(self.args, 'pretrained_model', None):
                self.acmodel = utils.load_model(args.pretrained_model, raise_not_found=True)
            else:
                self.acmodel = ACModel(self.obss_preprocessor.obs_space, action_space,
                                       args.image_dim, args.memory_dim, args.instr_dim,
                                       not self.args.no_instr, self.args.instr_arch,
                                       not self.args.no_mem, self.args.arch)
        self.obss_preprocessor.vocab.save()
        utils.save_model(self.acmodel, args.model)

        self.acmodel.train()
        if torch.cuda.is_available():
            self.acmodel.cuda()

        self.optimizer = torch.optim.Adam(self.acmodel.parameters(), self.args.lr, eps=self.args.optim_eps)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.9)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def default_model_name(args):
        if getattr(args, 'multi_env', None):
            # It's better to specify one's own model name for this scenario
            named_envs = '-'.join(args.multi_env)
        else:
            named_envs = args.env

        # Define model name
        suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
        instr = args.instr_arch if args.instr_arch else "noinstr"
        model_name_parts = {
            'envs': named_envs,
            'arch': args.arch,
            'instr': instr,
            'seed': args.seed,
            'suffix': suffix}
        default_model_name = "{envs}_IL_{arch}_{instr}_seed{seed}_{suffix}".format(**model_name_parts)
        if getattr(args, 'pretrained_model', None):
            default_model_name = args.pretrained_model + '_pretrained_' + default_model_name
        return default_model_name

    def starting_indexes(self, num_frames):
        if num_frames % self.args.recurrence == 0:
            return np.arange(0, num_frames, self.args.recurrence)
        else:
            return np.arange(0, num_frames, self.args.recurrence)[:-1]

    def run_epoch_recurrence(self, demos, is_training=False):
        indices = list(range(len(demos)))
        if is_training:
            np.random.shuffle(indices)
        batch_size = min(self.args.batch_size, len(demos))
        offset = 0

        if not is_training:
            self.acmodel.eval()

        # Log dictionary
        log = {"entropy": [], "policy_loss": [], "accuracy": []}

        start_time = time.time()
        frames = 0
        for batch_index in range(len(indices) // batch_size):
            logger.info("batch {}, FPS so far {}".format(
                batch_index, frames / (time.time() - start_time) if frames else 0))
            batch = [demos[i] for i in indices[offset:offset + batch_size]]
            frames += sum([len(demo[3]) for demo in batch])

            _log = self.run_epoch_recurrence_one_batch(batch, is_training=is_training)

            log["entropy"].append(_log["entropy"])
            log["policy_loss"].append(_log["policy_loss"])
            log["accuracy"].append(_log["accuracy"])

            offset += batch_size

        if not is_training:
            self.acmodel.train()

        return log

    def run_epoch_recurrence_one_batch(self, batch, is_training=False):
        batch = utils.demos.transform_demos(batch)
        batch.sort(key=len, reverse=True)
        # Constructing flat batch and indices pointing to start of each demonstration
        flat_batch = []
        inds = [0]

        for demo in batch:
            flat_batch += demo
            inds.append(inds[-1] + len(demo))

        flat_batch = np.array(flat_batch)
        inds = inds[:-1]
        num_frames = len(flat_batch)

        mask = np.ones([len(flat_batch)], dtype=np.float64)
        mask[inds] = 0
        mask = torch.tensor(mask, device=self.device, dtype=torch.float).unsqueeze(1)

        # Observations, true action, values and done for each of the stored demonstrations
        obss, action_true, done = flat_batch[:, 0], flat_batch[:, 1], flat_batch[:, 2]
        action_true = torch.tensor([action for action in action_true], device=self.device, dtype=torch.long)

        # Memory to be stored
        memories = torch.zeros([len(flat_batch), self.acmodel.memory_size], device=self.device)
        episode_ids = np.zeros(len(flat_batch))
        memory = torch.zeros([len(batch), self.acmodel.memory_size], device=self.device)

        preprocessed_first_obs = self.obss_preprocessor(obss[inds], device=self.device)
        instr_embedding = self.acmodel._get_instr_embedding(preprocessed_first_obs.instr)

        # Loop terminates when every observation in the flat_batch has been handled
        while True:
            # taking observations and done located at inds
            obs = obss[inds]
            done_step = done[inds]
            preprocessed_obs = self.obss_preprocessor(obs, device=self.device)
            with torch.no_grad():
                # taking the memory till len(inds), as demos beyond that have already finished
                new_memory = self.acmodel(
                    preprocessed_obs, memory[:len(inds), :], instr_embedding[:len(inds)])['memory']

            memories[inds, :] = memory[:len(inds), :]
            memory[:len(inds), :] = new_memory
            episode_ids[inds] = range(len(inds))

            # Updating inds, by removing those indices corresponding to which the demonstrations have finished
            inds = inds[:len(inds) - sum(done_step)]
            if len(inds) == 0:
                break

            # Incrementing the remaining indices
            inds = [index + 1 for index in inds]

        # Here, actual backprop up to args.recurrence happens
        final_loss = 0
        final_entropy, final_policy_loss, final_value_loss = 0, 0, 0

        indexes = self.starting_indexes(num_frames)
        memory = memories[indexes]
        accuracy = 0
        total_frames = len(indexes) * self.args.recurrence

        for _ in range(self.args.recurrence):
            obs = obss[indexes]
            preprocessed_obs = self.obss_preprocessor(obs, device=self.device)
            action_step = action_true[indexes]
            mask_step = mask[indexes]
            model_results = self.acmodel(
                preprocessed_obs, memory * mask_step, instr_embedding[episode_ids[indexes]])
            dist = model_results['dist']
            memory = model_results['memory']

            entropy = dist.entropy().mean()
            policy_loss = -dist.log_prob(action_step).mean()
            loss = policy_loss - self.args.entropy_coef * entropy
            action_pred = dist.probs.max(1, keepdim=True)[1]
            accuracy += float((action_pred == action_step.unsqueeze(1)).sum()) / total_frames
            final_loss += loss
            final_entropy += entropy
            final_policy_loss += policy_loss
            indexes += 1

        final_loss /= self.args.recurrence

        if is_training:
            self.optimizer.zero_grad()
            final_loss.backward()
            self.optimizer.step()

        log = {}
        log["entropy"] = float(final_entropy / self.args.recurrence)
        log["policy_loss"] = float(final_policy_loss / self.args.recurrence)
        log["accuracy"] = float(accuracy)
        return log

    def validate(self, episodes, verbose=True):
        # Seed needs to be reset for each validation, to ensure consistency
        utils.seed(self.args.val_seed)

        if verbose:
            logger.info("Validating the model")
        if getattr(self.args, 'multi_env', None):
            agent = utils.load_agent(self.env[0], model_name=self.args.model, argmax=True)
        else:
            agent = utils.load_agent(self.env, model_name=self.args.model, argmax=True)

        # Setting the agent model to the current model
        agent.model = self.acmodel

        agent.model.eval()
        logs = []

        for env_name in ([self.args.env] if not getattr(self.args, 'multi_env', None)
                         else self.args.multi_env):
            logs += [batch_evaluate(agent, env_name, self.args.val_seed, episodes)]
        agent.model.train()

        return logs

    def collect_returns(self):
        logs = self.validate(episodes=self.args.eval_episodes, verbose=False)
        mean_return = {tid: np.mean(log["return_per_episode"]) for tid, log in enumerate(logs)}
        return mean_return

    def train(self, train_demos, writer, csv_writer, status_path, header, reset_status=False):
        # Load the status
        def initial_status():
            return {'i': 0, 'num_frames': 0, 'patience': 0}

        status = initial_status()
        if os.path.exists(status_path) and not reset_status:
            with open(status_path, 'r') as src:
                status = json.load(src)
        elif not os.path.exists(os.path.dirname(status_path)):
            # Ensure that the status directory exists
            os.makedirs(os.path.dirname(status_path))

        # If the batch size is larger than the number of demos, we need to lower the batch size
        if self.args.batch_size > len(train_demos):
            self.args.batch_size = len(train_demos)
            logger.info("Batch size too high. Setting it to the number of train demos ({})".format(len(train_demos)))

        # Model saved initially to avoid "Model not found Exception" during first validation step
        utils.save_model(self.acmodel, self.args.model)

        # best mean return to keep track of performance on validation set
        best_success_rate, patience, i = 0, 0, 0
        total_start_time = time.time()

        while status['i'] < getattr(self.args, 'epochs', int(1e9)):
            if 'patience' not in status:
                # if for some reason you're fine-tuning an RL-pretrained agent with IL
                status['patience'] = 0
            # Do not learn if using a pre-trained model that already lost patience
            if status['patience'] > self.args.patience:
                break
            if status['num_frames'] > self.args.frames:
                break

            status['i'] += 1
            i = status['i']
            update_start_time = time.time()

            # Learning rate scheduler
            self.scheduler.step()

            log = self.run_epoch_recurrence(train_demos, is_training=True)
            total_len = sum([len(item[3]) for item in train_demos])
            status['num_frames'] += total_len

            update_end_time = time.time()

            # Print logs
            if status['i'] % self.args.log_interval == 0:
                total_ellapsed_time = int(time.time() - total_start_time)

                fps = total_len / (update_end_time - update_start_time)
                duration = datetime.timedelta(seconds=total_ellapsed_time)

                for key in log:
                    log[key] = np.mean(log[key])

                train_data = [status['i'], status['num_frames'], fps, total_ellapsed_time,
                              log["entropy"], log["policy_loss"], log["accuracy"]]

                logger.info(
                    "U {} | F {:06} | FPS {:04.0f} | D {} | H {:.3f} | pL {: .3f} | A {: .3f}".format(*train_data))

                # Log the gathered data only when we don't evaluate the validation metrics.
                # It will be logged anyways afterwards when status['i'] % self.args.val_interval == 0
                if status['i'] % self.args.val_interval != 0:
                    # instantiate a validation_log with empty strings when no validation is done
                    validation_data = [''] * len([key for key in header if 'valid' in key])
                    assert len(header) == len(train_data + validation_data)
                    if self.args.tb:
                        for key, value in zip(header, train_data):
                            writer.add_scalar(key, float(value), status['num_frames'])
                    csv_writer.writerow(train_data + validation_data)

            if status['i'] % self.args.val_interval == 0:
                valid_log = self.validate(self.args.val_episodes)
                mean_return = [np.mean(log['return_per_episode']) for log in valid_log]
                success_rate = [np.mean([1 if r > 0 else 0 for r in log['return_per_episode']])
                                for log in valid_log]

                val_log = self.run_epoch_recurrence(self.val_demos)
                validation_accuracy = np.mean(val_log["accuracy"])

                if status['i'] % self.args.log_interval == 0:
                    validation_data = [validation_accuracy] + mean_return + success_rate
                    logger.info(("Validation: A {: .3f} " + ("| R {: .3f} " * len(mean_return) +
                                 "| S {: .3f} " * len(success_rate))).format(*validation_data))

                    assert len(header) == len(train_data + validation_data)
                    if self.args.tb:
                        for key, value in zip(header, train_data + validation_data):
                            writer.add_scalar(key, float(value), status['num_frames'])
                    csv_writer.writerow(train_data + validation_data)

                # In case of a multi-env, the update condition would be "better mean success rate"!
                if np.mean(success_rate) > best_success_rate:
                    best_success_rate = np.mean(success_rate)
                    status['patience'] = 0
                    with open(status_path, 'w') as dst:
                        json.dump(status, dst)
                    # Saving the model
                    logger.info("Saving best model")

                    if torch.cuda.is_available():
                        self.acmodel.cpu()
                    utils.save_model(self.acmodel, self.args.model + "_best")
                    self.obss_preprocessor.vocab.save(utils.get_vocab_path(self.args.model + "_best"))
                    if torch.cuda.is_available():
                        self.acmodel.cuda()
                else:
                    status['patience'] += 1
                    logger.info("Losing patience, new value={}, limit={}".format(
                        status['patience'], self.args.patience))

                if torch.cuda.is_available():
                    self.acmodel.cpu()
                utils.save_model(self.acmodel, self.args.model)
                if torch.cuda.is_available():
                    self.acmodel.cuda()
                with open(status_path, 'w') as dst:
                    json.dump(status, dst)
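# Hypothetical driver sketch (not part of the original source): one way the
# ImitationLearning class above could be wired up. The header layout, writer setup,
# and the exact set of fields on `args` are assumptions for illustration only; the
# header is chosen so that len(header) matches train_data + validation_data as the
# asserts in train() require.
#
#   args = <argparse namespace with env, demos, seed, lr, batch_size, recurrence, ...>
#   il = ImitationLearning(args)
#   header = ["update", "frames", "FPS", "duration", "entropy", "policy_loss", "train_accuracy",
#             "validation_accuracy", "validation_return", "validation_success_rate"]
#   writer = SummaryWriter(utils.get_log_dir(args.model)) if args.tb else None
#   csv_writer = csv.writer(open(os.path.join(utils.get_log_dir(args.model), "log.csv"), "a", 1))
#   status_path = os.path.join(utils.get_log_dir(args.model), "status.json")
#   il.train(il.train_demos, writer, csv_writer, status_path, header)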
class MetaLearner(nn.Module):
    """
    Meta Learner
    """
    def __init__(self, args):
        """
        :param args:
        """
        super(MetaLearner, self).__init__()

        self.update_lr = args.update_lr
        self.meta_lr = args.meta_lr
        self.task_num = args.task_num
        self.args = args

        utils.seed(self.args.seed)
        self.env = gym.make(self.args.env)

        demos_path = utils.get_demos_path(args.demos, args.env, args.demos_origin, valid=False)
        demos_path_valid = utils.get_demos_path(args.demos, args.env, args.demos_origin, valid=True)

        logger.info('loading demos')
        self.train_demos = utils.load_demos(demos_path)
        logger.info('loaded demos')
        # if args.episodes:
        #     if args.episodes > len(self.train_demos):
        #         raise ValueError("there are only {} train demos".format(len(self.train_demos)))
        #     self.train_demos = self.train_demos[:args.episodes]

        self.val_demos = utils.load_demos(demos_path_valid)
        # if args.val_episodes > len(self.val_demos):
        #     logger.info('Using all the available {} demos to evaluate valid. accuracy'.format(len(self.val_demos)))
        self.val_demos = self.val_demos[:self.args.val_episodes]

        observation_space = self.env.observation_space
        action_space = self.env.action_space

        self.obss_preprocessor = utils.ObssPreprocessor(
            args.model, observation_space, getattr(self.args, 'pretrained_model', None))

        # Define actor-critic model
        # self.net = utils.load_model(args.model, raise_not_found=False)
        # if self.net is None:
        #     if getattr(self.args, 'pretrained_model', None):
        #         self.net = utils.load_model(args.pretrained_model, raise_not_found=True)
        #     else:
        self.net = ACModel(self.obss_preprocessor.obs_space, action_space,
                           args.image_dim, args.memory_dim, args.instr_dim,
                           not self.args.no_instr, self.args.instr_arch,
                           not self.args.no_mem, self.args.arch)
        self.obss_preprocessor.vocab.save()
        # utils.save_model(self.net, args.model)

        self.fast_net = copy.deepcopy(self.net)

        self.net.train()
        self.fast_net.train()
        if torch.cuda.is_available():
            self.net.cuda()
            self.fast_net.cuda()

        self.optimizer = torch.optim.SGD(self.fast_net.parameters(), lr=self.args.update_lr)
        # self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=100, gamma=0.9)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.meta_optim = optim.Adam(self.net.parameters(), lr=self.meta_lr)

    def clip_grad_by_norm_(self, grad, max_norm):
        """
        in-place gradient clipping.
        :param grad: list of gradients
        :param max_norm: maximum norm allowable
        :return:
        """
        total_norm = 0
        counter = 0
        for g in grad:
            param_norm = g.data.norm(2)
            total_norm += param_norm.item() ** 2
            counter += 1
        total_norm = total_norm ** (1. / 2)

        clip_coef = max_norm / (total_norm + 1e-6)
        if clip_coef < 1:
            for g in grad:
                g.data.mul_(clip_coef)

        return total_norm / counter

    def starting_indexes(self, num_frames):
        if num_frames % self.args.recurrence == 0:
            return np.arange(0, num_frames, self.args.recurrence)
        else:
            return np.arange(0, num_frames, self.args.recurrence)[:-1]

    def forward_batch(self, batch, task, net='fast', is_training=True):
        if net == 'fast':
            acmodel = self.fast_net
        else:
            acmodel = self.net

        batch = utils.demos.induce_grammar(batch, task)
        batch = utils.demos.transform_demos(batch)
        batch.sort(key=len, reverse=True)
        # Constructing flat batch and indices pointing to start of each demonstration
        flat_batch = []
        inds = [0]

        for demo in batch:
            flat_batch += demo
            inds.append(inds[-1] + len(demo))

        flat_batch = np.array(flat_batch)
        inds = inds[:-1]
        num_frames = len(flat_batch)

        mask = np.ones([len(flat_batch)], dtype=np.float64)
        mask[inds] = 0
        mask = torch.tensor(mask, device=self.device, dtype=torch.float).unsqueeze(1)

        # Observations, true action, values and done for each of the stored demonstrations
        obss, action_true, done = flat_batch[:, 0], flat_batch[:, 1], flat_batch[:, 2]
        action_true = torch.tensor([action for action in action_true], device=self.device, dtype=torch.long)

        # Memory to be stored
        memories = torch.zeros([len(flat_batch), acmodel.memory_size], device=self.device)
        episode_ids = np.zeros(len(flat_batch))
        memory = torch.zeros([len(batch), acmodel.memory_size], device=self.device)

        preprocessed_first_obs = self.obss_preprocessor(obss[inds], device=self.device)
        instr_embedding = acmodel._get_instr_embedding(preprocessed_first_obs.instr)

        # Loop terminates when every observation in the flat_batch has been handled
        while True:
            # taking observations and done located at inds
            obs = obss[inds]
            done_step = done[inds]
            preprocessed_obs = self.obss_preprocessor(obs, device=self.device)
            with torch.no_grad():
                # taking the memory till len(inds), as demos beyond that have already finished
                new_memory = acmodel(preprocessed_obs, memory[:len(inds), :],
                                     instr_embedding[:len(inds)])['memory']

            memories[inds, :] = memory[:len(inds), :]
            memory[:len(inds), :] = new_memory
            episode_ids[inds] = range(len(inds))

            # Updating inds, by removing those indices corresponding to which the demonstrations have finished
            inds = inds[:len(inds) - sum(done_step)]
            if len(inds) == 0:
                break

            # Incrementing the remaining indices
            inds = [index + 1 for index in inds]

        # Here, actual backprop up to args.recurrence happens
        final_loss = 0
        final_entropy, final_policy_loss, final_value_loss = 0, 0, 0

        indexes = self.starting_indexes(num_frames)
        memory = memories[indexes]
        accuracy = 0
        total_frames = len(indexes) * self.args.recurrence

        for _ in range(self.args.recurrence):
            obs = obss[indexes]
            preprocessed_obs = self.obss_preprocessor(obs, device=self.device)
            action_step = action_true[indexes]
            mask_step = mask[indexes]
            model_results = acmodel(preprocessed_obs, memory * mask_step,
                                    instr_embedding[episode_ids[indexes]])
            dist = model_results['dist']
            memory = model_results['memory']

            entropy = dist.entropy().mean()
            policy_loss = -dist.log_prob(action_step).mean()
            loss = policy_loss - self.args.entropy_coef * entropy
            action_pred = dist.probs.max(1, keepdim=True)[1]
            accuracy += float((action_pred == action_step.unsqueeze(1)).sum()) / total_frames
            final_loss += loss
            final_entropy += entropy
            final_policy_loss += policy_loss
            indexes += 1

        final_loss /= self.args.recurrence

        # if is_training:
        #     self.optimizer.zero_grad()
        #     final_loss.backward()
        #     self.optimizer.step()

        log = {}
        log["entropy"] = float(final_entropy / self.args.recurrence)
        log["policy_loss"] = float(final_policy_loss / self.args.recurrence)
        log["accuracy"] = float(accuracy)
        return final_loss, log

    # def forward(self, x_spt, y_spt, x_qry, y_qry):
    def forward(self, demo):
        task_num = self.args.task_num
        losses = []  # losses_q[i], i is tasks idx
        logs = []
        grads = []
        self.optimizer.zero_grad()
        for i in range(task_num):
            # copy initializing net
            self.fast_net = copy.deepcopy(self.net)
            for p in self.fast_net.parameters():
                p.retain_grad()
            self.fast_net.zero_grad()
            # optimize fast net for k instances of task i
            loss_task, log = self.forward_batch(demo, i, 'fast')
            # grad = torch.autograd.grad(loss_task, self.fast_net.parameters(), allow_unused=True)
            loss_task.backward()
            grad = [x.grad for x in self.fast_net.parameters()]
            # print(grad)
            grads.append(grad)
            # self.optimizer.step()
            # loss_task, log = self.forward_batch(demo, i, 'fast')
            # losses.append(loss_task)
            logs.append(log)

        self.meta_update(demo, grads)

        # end of all tasks
        # sum over all losses on query set across all tasks
        # loss_q = sum(losses) / task_num

        # # optimize theta parameters
        # self.meta_optim.zero_grad()
        # grad = torch.autograd.grad(loss_q, self.net.parameters(), allow_unused=True)
        # print(grad)
        # # loss_q.backward()
        # for g, p in zip(grad, self.net.parameters()):
        #     p.grad = g
        # # print('meta update')
        # # for p in self.net.parameters()[:5]:
        # #     (torch.norm(p).item())
        # self.meta_optim.step()

        return logs

    def meta_update(self, demo, grads):
        print('\n Meta update \n')
        # We use a dummy forward / backward pass to get the correct grads into self.net
        loss, _ = self.forward_batch(demo, 0, 'net')
        gradients = []
        for p in self.net.parameters():
            gradients.append(torch.zeros(np.array(p.data).shape).cuda())
        # Unpack the list of grad dicts
        for i in range(len(grads[0])):
            for grad in grads:
                if grad[i] is not None:
                    gradients[i] = gradients[i] + grad[i][0]
        # gradients = [sum(grad[i][0] for grad in grads) for i in range(len(grads[0]))]
        # gradients = {k: sum(d[k] for d in ls) for k in ls[0].keys()}

        # Register a hook on each parameter in the net that replaces the current dummy grad
        # with our grads accumulated across the meta-batch
        hooks = []
        for i, p in enumerate(self.net.parameters()):
            def get_closure():
                it = i

                def replace_grad(grad):
                    ng = Variable(torch.from_numpy(np.array(gradients[it], dtype=np.float32))).cuda()
                    return ng
                return replace_grad
            try:
                hooks.append(p.register_hook(get_closure()))
            except:
                print(p)
                get_closure()

        # Compute grads for current step, replace with summed gradients as defined by hook
        self.meta_optim.zero_grad()
        loss.backward()
        # Update the net parameters with the accumulated gradient according to optimizer
        self.meta_optim.step()

        # Remove the hooks before next training phase
        for h in hooks:
            h.remove()

    def validate(self, demo):
        val_task_num = self.args.task_num
        losses = []  # losses_q[i], i is tasks idx
        logs = []
        val_logs = []
        for i in range(19):
            self.fast_net = copy.deepcopy(self.net)
            self.fast_net.zero_grad()
            # optimize fast net for k instances of task i
            for k in range(5):
                loss_task, log = self.forward_batch(demo[32 * k:32 * k + 32], 119 - i, 'fast')
                self.optimizer.zero_grad()
                loss_task.backward()
                self.optimizer.step()
                # loss_task, log = self.forward_batch(demo, i, 'fast')
                # losses.append(loss_task)
                logs.append(log)
            loss_task, log = self.forward_batch(demo[32 * k:32 * k + 32], 119 - i, 'fast')
            val_logs.append(log)

        return val_logs
def main(exp, argv):
    os.environ["BABYAI_STORAGE"] = exp.results_directory()

    # Parse arguments
    parser = ArgumentParser()
    parser.add_argument("--algo", default='ppo',
                        help="algorithm to use (default: ppo)")
    parser.add_argument("--discount", type=float, default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--reward-scale", type=float, default=20.,
                        help="Reward scale multiplier")
    parser.add_argument("--gae-lambda", type=float, default=0.99,
                        help="lambda coefficient in GAE formula (default: 0.99, 1 means no gae)")
    parser.add_argument("--value-loss-coef", type=float, default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument("--clip-eps", type=float, default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument("--ppo-epochs", type=int, default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--save-interval", type=int, default=50,
                        help="number of updates between two saves (default: 50, 0 means no saving)")
    parser.add_argument("--workers", type=int, default=8,
                        help="number of workers for PyTorch (default: 8)")
    parser.add_argument("--max-count", type=int, default=1000,
                        help="maximum number of frames to run for")
    parser.add_argument("--sample_duration", type=float, default=0.5,
                        help="sampling duration")
    parser.add_argument("--cuda", action="store_true", default=False,
                        help="whether to use cuda")
    args = parser.parse_args(argv)

    utils.seed(args.seed)

    torch_settings = init_torch(
        seed=args.seed,
        cuda=args.cuda,
        workers=args.workers,
    )

    # Generate environments
    envs = []
    for i in range(args.procs):
        env = gym.make(args.env)
        env.seed(100 * args.seed + i)
        envs.append(env)

    # Define model name
    suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    instr = args.instr_arch if args.instr_arch else "noinstr"
    mem = "mem" if not args.no_mem else "nomem"
    model_name_parts = {
        'env': args.env,
        'algo': args.algo,
        'arch': args.arch,
        'instr': instr,
        'mem': mem,
        'seed': args.seed,
        'info': '',
        'coef': '',
        'suffix': suffix}
    default_model_name = "{env}_{algo}_{arch}_{instr}_{mem}_seed{seed}{info}{coef}_{suffix}".format(**model_name_parts)
    if args.pretrained_model:
        default_model_name = args.pretrained_model + '_pretrained_' + default_model_name
    args.model = args.model.format(**model_name_parts) if args.model else default_model_name

    utils.configure_logging(args.model)
    logger = logging.getLogger(__name__)

    # Define obss preprocessor
    if 'emb' in args.arch:
        obss_preprocessor = utils.IntObssPreprocessor(args.model, envs[0].observation_space,
                                                      args.pretrained_model)
    else:
        obss_preprocessor = utils.ObssPreprocessor(args.model, envs[0].observation_space,
                                                   args.pretrained_model)

    # Define actor-critic model
    # acmodel = utils.load_model(args.model, raise_not_found=False)
    acmodel = None
    if acmodel is None:
        if args.pretrained_model:
            acmodel = utils.load_model(args.pretrained_model, raise_not_found=True)
        else:
            acmodel = ACModel(obss_preprocessor.obs_space, envs[0].action_space,
                              args.image_dim, args.memory_dim, args.instr_dim,
                              not args.no_instr, args.instr_arch, not args.no_mem, args.arch)
    obss_preprocessor.vocab.save()
    # utils.save_model(acmodel, args.model)

    if torch_settings.cuda:
        acmodel.cuda()

    # Define actor-critic algo
    reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
    if args.algo == "ppo":
        algo = babyai.rl.PPOAlgo(envs, acmodel, args.frames_per_proc, args.discount, args.lr,
                                 args.beta1, args.beta2, args.gae_lambda, args.entropy_coef,
                                 args.value_loss_coef, args.max_grad_norm, args.recurrence,
                                 args.optim_eps, args.clip_eps, args.ppo_epochs, args.batch_size,
                                 obss_preprocessor, reshape_reward)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # When using extra binary information, more tensors (model params) are initialized compared
    # to when we don't use that. Thus, there starts to be a difference in the random state.
    # If we want to avoid it, in order to make sure that the results of supervised-loss-coef=0.
    # and extra-binary-info=0 match, we need to reseed here.
    utils.seed(args.seed)

    # Restore training status
    status_path = os.path.join(utils.get_log_dir(args.model), 'status.json')
    if os.path.exists(status_path):
        with open(status_path, 'r') as src:
            status = json.load(src)
    else:
        status = {'i': 0, 'num_episodes': 0, 'num_frames': 0}

    # # Define logger and Tensorboard writer and CSV writer
    # header = (["update", "episodes", "frames", "FPS", "duration"]
    #           + ["return_" + stat for stat in ['mean', 'std', 'min', 'max']]
    #           + ["success_rate"]
    #           + ["num_frames_" + stat for stat in ['mean', 'std', 'min', 'max']]
    #           + ["entropy", "value", "policy_loss", "value_loss", "loss", "grad_norm"])
    # if args.tb:
    #     from tensorboardX import SummaryWriter
    #     writer = SummaryWriter(utils.get_log_dir(args.model))
    # csv_path = os.path.join(utils.get_log_dir(args.model), 'log.csv')
    # first_created = not os.path.exists(csv_path)
    # # we don't buffer data going in the csv log, cause we assume
    # # that one update will take much longer than one write to the log
    # csv_writer = csv.writer(open(csv_path, 'a', 1))
    # if first_created:
    #     csv_writer.writerow(header)

    # Log code state, command, availability of CUDA and model
    babyai_code = list(babyai.__path__)[0]
    try:
        last_commit = subprocess.check_output(
            'cd {}; git log -n1'.format(babyai_code), shell=True).decode('utf-8')
        logger.info('LAST COMMIT INFO:')
        logger.info(last_commit)
    except subprocess.CalledProcessError:
        logger.info('Could not figure out the last commit')
    try:
        diff = subprocess.check_output(
            'cd {}; git diff'.format(babyai_code), shell=True).decode('utf-8')
        if diff:
            logger.info('GIT DIFF:')
            logger.info(diff)
    except subprocess.CalledProcessError:
        logger.info('Could not figure out the last commit')
    logger.info('COMMAND LINE ARGS:')
    logger.info(args)
    logger.info("CUDA available: {}".format(torch.cuda.is_available()))
    logger.info(acmodel)

    # Train model
    total_start_time = time.time()
    best_success_rate = 0
    best_mean_return = 0
    test_env_name = args.env

    wrapper = iteration_wrapper(
        exp,
        sync=torch_settings.sync,
        max_count=args.max_count,
        sample_duration=args.sample_duration,
    )

    # while status['num_frames'] < args.frames:
    while True:
        with wrapper() as it:
            # Update parameters
            if wrapper.done():
                break
            update_start_time = time.time()
            logs = algo.update_parameters()
            update_end_time = time.time()

            it.set_count(logs["num_frames"])
            it.log(loss=logs["loss"])
if acmodel1 is None:
    if args.pretrained_model:
        acmodel1 = utils.load_model(args.pretrained_model, 1, raise_not_found=True)
    else:
        # torch.manual_seed(args.seed)
        acmodel1 = ACModel(obss_preprocessor.obs_space, envs1[0].action_space,
                           args.image_dim, args.memory_dim, args.instr_dim,
                           args.enc_dim, args.dec_dim,
                           not args.no_instr, args.instr_arch, not args.no_mem, args.arch,
                           args.len_message, args.num_symbols)
obss_preprocessor.vocab.save()
utils.save_model(acmodel0, args.model, 0)
utils.save_model(acmodel1, args.model, 1)

if torch.cuda.is_available():
    acmodel0.cuda()
    acmodel1.cuda()

# Define actor-critic algo
reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
if args.algo == "ppo":
    algo = babyai.rl.PPOAlgo(envs0, envs1, acmodel0, acmodel1,
                             args.frames_per_proc, args.discount, args.lr,
                             args.beta1, args.beta2, args.gae_lambda, args.entropy_coef,
                             args.value_loss_coef, args.max_grad_norm, args.recurrence,
                             args.optim_eps, args.clip_eps, args.ppo_epochs, args.batch_size,
                             obss_preprocessor, reshape_reward)
else:
    raise ValueError("Incorrect algorithm name: {}".format(args.algo))

# When using extra binary information, more tensors (model params) are initialized compared
# to when we don't use that.