def log(self, iteration, train_stats):
    # --- save model ---
    if iteration % self.args.save_interval == 0:
        save_path = os.path.join(self.tb_logger.full_output_folder, 'models')
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        torch.save(self.agent.state_dict(),
                   os.path.join(save_path, "agent{0}.pt".format(iteration)))

    if iteration % self.args.log_interval == 0:
        if self.args.policy == 'dqn':
            returns, success_rate, observations, rewards, reward_preds = self.evaluate()
        # This part is super specific for the Semi-Circle env
        # elif self.args.env_name == 'PointRobotSparse-v0':
        #     returns, success_rate, log_probs, observations, \
        #         rewards, reward_preds, reward_belief, reward_belief_discretized, points = self.evaluate()
        else:
            returns, success_rate, log_probs, observations, rewards, reward_preds = self.evaluate()

        if self.args.log_tensorboard:
            tasks_to_vis = np.random.choice(self.args.num_eval_tasks, 5)
            for i, task in enumerate(tasks_to_vis):
                self.env.reset(task)
                if PLOT_VIS:
                    self.tb_logger.writer.add_figure('policy_vis/task_{}'.format(i),
                                                     utl_eval.plot_rollouts(observations[task, :], self.env),
                                                     self._n_rl_update_steps_total)
                self.tb_logger.writer.add_figure('reward_prediction_train/task_{}'.format(i),
                                                 utl_eval.plot_rew_pred_vs_rew(rewards[task, :], reward_preds[task, :]),
                                                 self._n_rl_update_steps_total)
                # self.tb_logger.writer.add_figure('reward_prediction_train/task_{}'.format(i),
                #                                  utl_eval.plot_rew_pred_vs_reward_belief_vs_rew(rewards[task, :],
                #                                                                                 reward_preds[task, :],
                #                                                                                 reward_belief[task, :]),
                #                                  self._n_rl_update_steps_total)
                # if self.args.env_name == 'PointRobotSparse-v0':  # This part is super specific for the Semi-Circle env
                #     for t in range(0, int(self.args.trajectory_len/4), 3):
                #         self.tb_logger.writer.add_figure('discrete_belief_reward_pred_task_{}/timestep_{}'.format(i, t),
                #                                          utl_eval.plot_discretized_belief_halfcircle(reward_belief_discretized[task, t, :],
                #                                                                                      points, self.env,
                #                                                                                      observations[task, :t+1]),
                #                                          self._n_rl_update_steps_total)

            if self.args.max_rollouts_per_task > 1:
                for episode_idx in range(self.args.max_rollouts_per_task):
                    self.tb_logger.writer.add_scalar('returns_multi_episode/episode_{}'.format(episode_idx + 1),
                                                     np.mean(returns[:, episode_idx]),
                                                     self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('returns_multi_episode/sum',
                                                 np.mean(np.sum(returns, axis=-1)),
                                                 self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('returns_multi_episode/success_rate',
                                                 np.mean(success_rate),
                                                 self._n_rl_update_steps_total)
            else:
                self.tb_logger.writer.add_scalar('returns/returns_mean', np.mean(returns),
                                                 self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('returns/returns_std', np.std(returns),
                                                 self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('returns/success_rate', np.mean(success_rate),
                                                 self._n_rl_update_steps_total)

            if self.args.policy == 'dqn':
                self.tb_logger.writer.add_scalar('rl_losses/qf_loss_vs_n_updates', train_stats['qf_loss'],
                                                 self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('weights/q_network',
                                                 list(self.agent.qf.parameters())[0].mean(),
                                                 self._n_rl_update_steps_total)
                if list(self.agent.qf.parameters())[0].grad is not None:
                    param_list = list(self.agent.qf.parameters())
                    self.tb_logger.writer.add_scalar('gradients/q_network',
                                                     sum([param_list[i].grad.mean() for i in range(len(param_list))]),
                                                     self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('weights/q_target',
                                                 list(self.agent.target_qf.parameters())[0].mean(),
                                                 self._n_rl_update_steps_total)
                if list(self.agent.target_qf.parameters())[0].grad is not None:
                    param_list = list(self.agent.target_qf.parameters())
                    self.tb_logger.writer.add_scalar('gradients/q_target',
                                                     sum([param_list[i].grad.mean() for i in range(len(param_list))]),
                                                     self._n_rl_update_steps_total)
            else:
                self.tb_logger.writer.add_scalar('policy/log_prob', np.mean(log_probs),
                                                 self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('rl_losses/qf1_loss', train_stats['qf1_loss'],
                                                 self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('rl_losses/qf2_loss', train_stats['qf2_loss'],
                                                 self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('rl_losses/policy_loss', train_stats['policy_loss'],
                                                 self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('rl_losses/alpha_entropy_loss', train_stats['alpha_entropy_loss'],
                                                 self._n_rl_update_steps_total)

                # weights and gradients
                self.tb_logger.writer.add_scalar('weights/q1_network',
                                                 list(self.agent.qf1.parameters())[0].mean(),
                                                 self._n_rl_update_steps_total)
                if list(self.agent.qf1.parameters())[0].grad is not None:
                    param_list = list(self.agent.qf1.parameters())
                    self.tb_logger.writer.add_scalar('gradients/q1_network',
                                                     sum([param_list[i].grad.mean() for i in range(len(param_list))]),
                                                     self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('weights/q1_target',
                                                 list(self.agent.qf1_target.parameters())[0].mean(),
                                                 self._n_rl_update_steps_total)
                if list(self.agent.qf1_target.parameters())[0].grad is not None:
                    param_list = list(self.agent.qf1_target.parameters())
                    self.tb_logger.writer.add_scalar('gradients/q1_target',
                                                     sum([param_list[i].grad.mean() for i in range(len(param_list))]),
                                                     self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('weights/q2_network',
                                                 list(self.agent.qf2.parameters())[0].mean(),
                                                 self._n_rl_update_steps_total)
                if list(self.agent.qf2.parameters())[0].grad is not None:
                    param_list = list(self.agent.qf2.parameters())
                    self.tb_logger.writer.add_scalar('gradients/q2_network',
                                                     sum([param_list[i].grad.mean() for i in range(len(param_list))]),
                                                     self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('weights/q2_target',
                                                 list(self.agent.qf2_target.parameters())[0].mean(),
                                                 self._n_rl_update_steps_total)
                if list(self.agent.qf2_target.parameters())[0].grad is not None:
                    param_list = list(self.agent.qf2_target.parameters())
                    self.tb_logger.writer.add_scalar('gradients/q2_target',
                                                     sum([param_list[i].grad.mean() for i in range(len(param_list))]),
                                                     self._n_rl_update_steps_total)
                self.tb_logger.writer.add_scalar('weights/policy',
                                                 list(self.agent.policy.parameters())[0].mean(),
                                                 self._n_rl_update_steps_total)
                if list(self.agent.policy.parameters())[0].grad is not None:
                    param_list = list(self.agent.policy.parameters())
                    self.tb_logger.writer.add_scalar('gradients/policy',
                                                     sum([param_list[i].grad.mean() for i in range(len(param_list))]),
                                                     self._n_rl_update_steps_total)

            for k, v in [('num_rl_updates', self._n_rl_update_steps_total),
                         ('time_elapsed', time.time() - self._start_time),
                         ('iteration', iteration)]:
                self.tb_logger.writer.add_scalar(k, v, self._n_rl_update_steps_total)
            self.tb_logger.finish_iteration(iteration)

        print("Iteration -- {}, Success rate -- {:.3f}, Avg. return -- {:.3f}, Elapsed time {:5d}[s]"
              .format(iteration, np.mean(success_rate), np.mean(np.sum(returns, axis=-1)),
                      int(time.time() - self._start_time)))
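# NOTE (illustrative, not part of the original code): `log` above only reads
# `train_stats`; the keys it expects are assumed to come from the agent's
# update step and depend on the policy type:
#   - args.policy == 'dqn':          {'qf_loss': ...}
#   - otherwise (SAC-style policy):  {'qf1_loss': ..., 'qf2_loss': ...,
#                                     'policy_loss': ..., 'alpha_entropy_loss': ...}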
def train(vae, dataset, args):
    '''
    :param vae: VAE model (encoder, reward/state decoders and their optimizer)
    :param dataset: list of lists. Each inner list corresponds to one task and
                    contains torch tensors of s, a, r, s', t, with trajectories
                    indexed along dimension 1
    :param args: configuration namespace with the VAE training hyperparameters
    :return: None (the VAE is trained in place; models are saved periodically)
    '''
    if args.log_tensorboard:
        writer = SummaryWriter(args.full_save_path)

    num_tasks = len(dataset)

    start_time = time.time()
    total_updates = 0
    for iter_ in range(args.num_iters):
        n_batches = np.min([int(np.ceil(d[0].shape[1] / args.vae_batch_num_rollouts_per_task))
                            for d in dataset])
        # traj_permutation = np.random.permutation(dataset[0][0].shape[1])
        traj_permutation = np.random.permutation(np.min([d[0].shape[1] for d in dataset]))
        loss_tr, rew_loss_tr, state_loss_tr, kl_loss_tr = 0, 0, 0, 0   # initialize loss for epoch
        n_updates = 0   # count number of updates
        for i in tqdm(range(n_batches), desc="Epoch {}".format(iter_)):
            if i == n_batches - 1:
                traj_indices = traj_permutation[i * args.vae_batch_num_rollouts_per_task:]
            else:
                traj_indices = traj_permutation[i * args.vae_batch_num_rollouts_per_task:
                                                (i + 1) * args.vae_batch_num_rollouts_per_task]
            n_task_batches = int(np.ceil(num_tasks / args.tasks_batch_size))
            task_permutation = np.random.permutation(num_tasks)

            for j in range(n_task_batches):     # run over tasks
                if j == n_task_batches - 1:
                    indices = task_permutation[j * args.tasks_batch_size:]
                else:
                    indices = task_permutation[j * args.tasks_batch_size:
                                               (j + 1) * args.tasks_batch_size]

                obs, actions, rewards, next_obs = [], [], [], []
                for idx in indices:
                    # random_subset = np.random.permutation(dataset[idx][0].shape[1], )
                    # random_subset = np.random.choice(dataset[idx][0].shape[1], args.vae_batch_num_rollouts_per_task)
                    obs.append(ptu.FloatTensor(dataset[idx][0][:, traj_indices, :]))
                    actions.append(ptu.FloatTensor(dataset[idx][1][:, traj_indices, :]))
                    rewards.append(ptu.FloatTensor(dataset[idx][2][:, traj_indices, :]))
                    next_obs.append(ptu.FloatTensor(dataset[idx][3][:, traj_indices, :]))
                obs = torch.cat(obs, dim=1)
                actions = torch.cat(actions, dim=1)
                rewards = torch.cat(rewards, dim=1)
                next_obs = torch.cat(next_obs, dim=1)

                rew_recon_loss, state_recon_loss, kl_term = update_step(
                    vae, obs, actions, rewards, next_obs, args)

                # take average (this is the expectation over p(M))
                loss = args.rew_loss_coeff * rew_recon_loss + \
                       args.state_loss_coeff * state_recon_loss + \
                       args.kl_weight * kl_term

                # update
                vae.optimizer.zero_grad()
                loss.backward()
                vae.optimizer.step()

                n_updates += 1
                loss_tr += loss.item()
                rew_loss_tr += rew_recon_loss.item()
                state_loss_tr += state_recon_loss.item()
                kl_loss_tr += kl_term.item()

        print('Elapsed time: {:.2f}, loss: {:.4f} -- rew_loss: {:.4f} -- state_loss: {:.4f} -- kl: {:.4f}'
              .format(time.time() - start_time, loss_tr / n_updates, rew_loss_tr / n_updates,
                      state_loss_tr / n_updates, kl_loss_tr / n_updates))
        total_updates += n_updates

        # log tb
        if args.log_tensorboard:
            writer.add_scalar('loss/vae_loss', loss_tr / n_updates, total_updates)
            writer.add_scalar('loss/rew_recon_loss', rew_loss_tr / n_updates, total_updates)
            writer.add_scalar('loss/state_recon_loss', state_loss_tr / n_updates, total_updates)
            writer.add_scalar('loss/kl', kl_loss_tr / n_updates, total_updates)
            if args.env_name != 'GridNavi-v2':      # TODO: eval for gridworld domain
                rewards_eval, reward_preds_eval = eval_vae(dataset, vae, args)
                for task in range(NUM_EVAL_TASKS):
                    writer.add_figure('reward_prediction/task_{}'.format(task),
                                      utl_eval.plot_rew_pred_vs_rew(rewards_eval[task, :],
                                                                    reward_preds_eval[task, :]),
                                      total_updates)

        if (iter_ + 1) % args.eval_interval == 0:
            pass

        if args.save_model and (iter_ + 1) % args.save_interval == 0:
            save_path = os.path.join(os.getcwd(), args.full_save_path, 'models')
            if not os.path.exists(save_path):
                os.mkdir(save_path)
            torch.save(vae.encoder.state_dict(),
                       os.path.join(save_path, "encoder{0}.pt".format(iter_ + 1)))
            if vae.reward_decoder is not None:
                torch.save(vae.reward_decoder.state_dict(),
                           os.path.join(save_path, "reward_decoder{0}.pt".format(iter_ + 1)))
            if vae.state_decoder is not None:
                torch.save(vae.state_decoder.state_dict(),
                           os.path.join(save_path, "state_decoder{0}.pt".format(iter_ + 1)))
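# ---------------------------------------------------------------------------
# Illustrative usage sketch (an assumption for clarity, not part of the
# original module): how a dataset compatible with `train` above could be
# assembled. Shapes follow the indexing used in `train`, where trajectories
# are selected along dimension 1 of each per-task tensor; `load_task_rollouts`,
# `num_train_tasks`, `traj_len`, `n_trajs`, `obs_dim` and `act_dim` are
# hypothetical placeholders.
#
#   dataset = []
#   for task_id in range(num_train_tasks):
#       obs, actions, rewards, next_obs, terminals = load_task_rollouts(task_id)
#       # each tensor: [traj_len, n_trajs, dim], e.g. obs -> [traj_len, n_trajs, obs_dim]
#       dataset.append([obs, actions, rewards, next_obs, terminals])
#   train(vae, dataset, args)
# ---------------------------------------------------------------------------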