Example #1
    def learn(self):
        logger.info("Training")
        n_steps = 0
        # best_success_rate = 0.

        for epoch in range(self.args.n_epochs):
            residual_losses = []
            for _ in range(self.args.n_cycles):
                # Collect trajectories
                self.controller.reconfigure_heuristic(self.get_residual)
                n_steps += self.collect_trajectories(
                    self.args.num_rollouts_per_mpi)
                # Update residual
                logger.debug("Updating")
                for _ in range(self.args.n_batches):
                    residual_loss = self._update_residual()
                    residual_losses.append(
                        residual_loss.detach().cpu().numpy())
                    logger.debug('Loss', residual_loss)

                self._update_target_network(self.residual_target,
                                            self.residual)

            success_rate = self.eval_agent()
            if MPI.COMM_WORLD.Get_rank() == 0:
                print(
                    '[{}] epoch is: {}, Num steps: {}, eval success rate is: {:.3f}'
                    .format(datetime.now(), epoch, n_steps, success_rate))
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('n_steps', n_steps)
                logger.record_tabular('success_rate', success_rate)
                logger.record_tabular('residual_loss',
                                      np.mean(residual_losses))
                logger.dump_tabular()
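The _update_target_network helper called above is not shown in these examples. Below is a minimal sketch of a soft (Polyak-averaged) target update, assuming the residual is a torch.nn.Module and that a self.args.polyak coefficient exists; both are assumptions, not taken from the snippets.

    def _update_target_network(self, target, source):
        # Soft update: target <- polyak * target + (1 - polyak) * source.
        # self.args.polyak is an assumed hyperparameter name, not from the snippets.
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(self.args.polyak * target_param.data +
                                    (1 - self.args.polyak) * param.data)

With a polyak coefficient close to 1 the target network changes slowly, which keeps the bootstrapped residual targets stable between updates.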
Example #2
 def collect_trajectories(self, num_traj):
     '''
     Collect trajectories by rolling out the controller together with the learned residuals.
     '''
     logger.debug("Rolling out")
     mb_obs, mb_ag, mb_g, mb_actions, mb_s_h = [], [], [], [], []
     mb_r, mb_qpos, mb_qvel, mb_f = [], [], [], []
     for traj in range(num_traj):
         ep_obs, ep_ag, ep_g, ep_actions, ep_s_h = [], [], [], [], []
         ep_r, ep_qpos, ep_qvel, ep_f = [], [], [], []
         # observation = self.planning_env.reset()
         observation = set_sim_state_and_goal(
             self.planning_env,
             self.eval_qpos[traj],
             self.eval_qvel[traj],
             self.eval_goals[traj],
         )
         obs = observation['observation']
         ag = observation['achieved_goal']
         g = observation['desired_goal']
         s_h = self.controller.heuristic_obs_g(obs, g)
         f = self.planning_env.extract_features(obs, g)
         for _ in range(self.env_params['max_timesteps']):
             qpos = observation['sim_state'].qpos
             qvel = observation['sim_state'].qvel
             ac, info = self.controller.act(observation)
             ac_ind = self.planning_env.discrete_actions[tuple(ac)]
             logger.debug('Heuristic', info['start_node_h'])
             logger.debug('Action', ac)
             observation_new, rew, _, _ = self.planning_env.step(ac)
             # Apply dynamics residual
             observation_new, rew = self.apply_dynamics_residual(
                 observation, ac, observation_new, rew)
             self.n_planning_steps += 1
             obs_new = observation_new['observation']
             ag_new = observation_new['achieved_goal']
             if self.args.render:
                 self.planning_env.render()
             multi_append([ep_obs, ep_ag, ep_g, ep_actions, ep_s_h, ep_r, ep_qpos, ep_qvel, ep_f],
                          [obs.copy(), ag.copy(), g.copy(), ac_ind, s_h, rew, qpos.copy(), qvel.copy(), f.copy()])
             obs = obs_new.copy()
             ag = ag_new.copy()
             observation = observation_new
             s_h = self.controller.heuristic_obs_g(obs, g)
             f = self.planning_env.extract_features(obs, g)
         multi_append([ep_obs, ep_ag, ep_s_h, ep_f],
                      [obs.copy(), ag.copy(), s_h, f.copy()])
         multi_append([mb_obs, mb_ag, mb_actions, mb_g, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f],
                      [ep_obs, ep_ag, ep_actions, ep_g, ep_s_h, ep_r, ep_qpos, ep_qvel, ep_f])
     mb_obs, mb_ag, mb_g = np.array(mb_obs), np.array(mb_ag), np.array(mb_g)
     mb_actions, mb_s_h, mb_r = np.array(mb_actions), np.array(mb_s_h), np.array(mb_r)
     mb_qpos, mb_qvel, mb_f = np.array(mb_qpos), np.array(mb_qvel), np.array(mb_f)
     self.dataset.store_episode(
         [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f])
     # Update normalizer
     self._update_normalizer(
         [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f])
Example #3
 def collect_trajectories(self, num_traj):
     '''
     Collect trajectories by rolling out the controller together with the learned residuals.
     '''
     logger.debug("Rolling out")
     n_steps = 0
     mb_obs, mb_ag, mb_g, mb_actions, mb_s_h = [], [], [], [], []
     mb_r, mb_f = [], []
     for traj in range(num_traj):
         ep_obs, ep_ag, ep_g, ep_actions, ep_s_h = [], [], [], [], []
         ep_r, ep_f = [], []
         observation = self.env.reset()
         obs = observation['observation']
         ag = observation['achieved_goal']
         g = observation['desired_goal']
         s_h = self.controller.heuristic_obs_g(obs, g)
         f = self.env.extract_features(obs, g)
         for _ in range(self.env_params['max_timesteps']):
             ac, info = self.controller.act(observation)
             ac_ind = self.env.discrete_actions[tuple(ac)]
             logger.debug('Heuristic', info['start_node_h'])
             logger.debug('Action', ac)
             observation_new, rew, _, _ = self.env.step(ac)
             n_steps += 1
             obs_new = observation_new['observation']
             ag_new = observation_new['achieved_goal']
             if self.args.render:
                 self.env.render()
             ep_obs.append(obs.copy())
             ep_ag.append(ag.copy())
             ep_g.append(g.copy())
             # FIX: store the action index instead of the raw action
             ep_actions.append(ac_ind)
             ep_s_h.append(s_h)
             ep_r.append(rew)
             ep_f.append(f.copy())
             obs = obs_new.copy()
             ag = ag_new.copy()
             observation = observation_new
             s_h = self.controller.heuristic_obs_g(obs, g)
             f = self.env.extract_features(obs, g)
         ep_obs.append(obs.copy())
         ep_ag.append(ag.copy())
         ep_s_h.append(s_h)
         ep_f.append(f.copy())
         mb_obs.append(ep_obs)
         mb_ag.append(ep_ag)
         mb_actions.append(ep_actions)
         mb_g.append(ep_g)
         mb_s_h.append(ep_s_h)
         mb_r.append(ep_r)
         mb_f.append(ep_f)
     mb_obs = np.array(mb_obs)
     mb_ag = np.array(mb_ag)
     mb_g = np.array(mb_g)
     mb_actions = np.array(mb_actions)
     mb_s_h = np.array(mb_s_h)
     mb_r = np.array(mb_r)
     mb_f = np.array(mb_f)
     self.dataset.store_episode(
         [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_f])
     # Update normalizer
     self._update_normalizer(
         [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_f])
     return n_steps
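Note the asymmetry in the stored episodes above: observations, achieved goals, heuristic values, and features get one extra append after the rollout loop, so they hold T + 1 entries per episode, while goals, action indices, and rewards hold T. A hypothetical sanity check, written as if placed just before self.dataset.store_episode(...) in Example #3:

     # T = episode length; observation-like arrays carry one extra terminal entry.
     T = self.env_params['max_timesteps']
     assert mb_obs.shape[:2] == (num_traj, T + 1)
     assert mb_ag.shape[:2] == (num_traj, T + 1)
     assert mb_s_h.shape[:2] == (num_traj, T + 1)
     assert mb_f.shape[:2] == (num_traj, T + 1)
     assert mb_g.shape[:2] == (num_traj, T)
     assert mb_actions.shape[:2] == (num_traj, T)
     assert mb_r.shape[:2] == (num_traj, T)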