def learn(self):
    logger.info("Training")
    n_steps = 0
    # best_success_rate = 0.
    for epoch in range(self.args.n_epochs):
        residual_losses = []
        for _ in range(self.args.n_cycles):
            # Collect trajectories with the controller using the learned residual as heuristic
            self.controller.reconfigure_heuristic(self.get_residual)
            n_steps += self.collect_trajectories(
                self.args.num_rollouts_per_mpi)
            # Update residual
            logger.debug("Updating")
            for _ in range(self.args.n_batches):
                residual_loss = self._update_residual()
                residual_losses.append(
                    residual_loss.detach().cpu().numpy())
                logger.debug('Loss', residual_loss)
            self._update_target_network(self.residual_target, self.residual)
        success_rate = self.eval_agent()
        if MPI.COMM_WORLD.Get_rank() == 0:
            print('[{}] epoch is: {}, Num steps: {}, eval success rate is: {:.3f}'.format(
                datetime.now(), epoch, n_steps, success_rate))
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('n_steps', n_steps)
            logger.record_tabular('success_rate', success_rate)
            logger.record_tabular('residual_loss', np.mean(residual_losses))
            logger.dump_tabular()
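# NOTE: `_update_target_network` is called above but not defined in this section.
# A minimal sketch is given below, assuming the usual Polyak (soft) update for
# target networks; the coefficient name `self.args.polyak` is an assumption, not
# confirmed by this file.
def _update_target_network(self, target, source):
    # target_param <- polyak * target_param + (1 - polyak) * source_param
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            self.args.polyak * target_param.data +
            (1 - self.args.polyak) * param.data)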
def collect_trajectories(self, num_traj):
    '''
    Collect trajectories using the controller and the learned residuals
    '''
    logger.debug("Rolling out")
    n_steps = 0
    mb_obs, mb_ag, mb_g, mb_actions, mb_s_h = [], [], [], [], []
    mb_r, mb_qpos, mb_qvel, mb_f = [], [], [], []
    for traj in range(num_traj):
        ep_obs, ep_ag, ep_g, ep_actions, ep_s_h = [], [], [], [], []
        ep_r, ep_qpos, ep_qvel, ep_f = [], [], [], []
        # observation = self.planning_env.reset()
        observation = set_sim_state_and_goal(
            self.planning_env,
            self.eval_qpos[traj],
            self.eval_qvel[traj],
            self.eval_goals[traj],
        )
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        s_h = self.controller.heuristic_obs_g(obs, g)
        f = self.planning_env.extract_features(obs, g)
        for _ in range(self.env_params['max_timesteps']):
            qpos = observation['sim_state'].qpos
            qvel = observation['sim_state'].qvel
            ac, info = self.controller.act(observation)
            ac_ind = self.planning_env.discrete_actions[tuple(ac)]
            logger.debug('Heuristic', info['start_node_h'])
            logger.debug('Action', ac)
            observation_new, rew, _, _ = self.planning_env.step(ac)
            # Apply dynamics residual
            observation_new, rew = self.apply_dynamics_residual(
                observation, ac, observation_new, rew)
            self.n_planning_steps += 1
            n_steps += 1
            obs_new = observation_new['observation']
            ag_new = observation_new['achieved_goal']
            if self.args.render:
                self.planning_env.render()
            multi_append(
                [ep_obs, ep_ag, ep_g, ep_actions, ep_s_h, ep_r,
                 ep_qpos, ep_qvel, ep_f],
                [obs.copy(), ag.copy(), g.copy(), ac_ind, s_h, rew,
                 qpos.copy(), qvel.copy(), f.copy()])
            obs = obs_new.copy()
            ag = ag_new.copy()
            observation = observation_new
            s_h = self.controller.heuristic_obs_g(obs, g)
            f = self.planning_env.extract_features(obs, g)
        # Store the final observation of the episode
        multi_append([ep_obs, ep_ag, ep_s_h, ep_f],
                     [obs.copy(), ag.copy(), s_h, f.copy()])
        multi_append(
            [mb_obs, mb_ag, mb_actions, mb_g, mb_s_h, mb_r,
             mb_qpos, mb_qvel, mb_f],
            [ep_obs, ep_ag, ep_actions, ep_g, ep_s_h, ep_r,
             ep_qpos, ep_qvel, ep_f])
    mb_obs = np.array(mb_obs)
    mb_ag = np.array(mb_ag)
    mb_g = np.array(mb_g)
    mb_actions = np.array(mb_actions)
    mb_s_h = np.array(mb_s_h)
    mb_r = np.array(mb_r)
    mb_qpos = np.array(mb_qpos)
    mb_qvel = np.array(mb_qvel)
    mb_f = np.array(mb_f)
    self.dataset.store_episode(
        [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f])
    # Update normalizer
    self._update_normalizer(
        [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_qpos, mb_qvel, mb_f])
    # Return the number of steps taken so that learn() can accumulate n_steps
    return n_steps
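# NOTE: `multi_append` is used above but not defined in this section. Its
# behaviour is clear from the call sites: append each value to the list at the
# same position. A minimal sketch, assuming exactly that semantics:
def multi_append(lists, values):
    # Append values[i] to lists[i] for every position i.
    assert len(lists) == len(values)
    for lst, val in zip(lists, values):
        lst.append(val)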
def collect_trajectories(self, num_traj):
    '''
    Collect trajectories using the controller and the learned residuals
    '''
    logger.debug("Rolling out")
    n_steps = 0
    mb_obs, mb_ag, mb_g, mb_actions, mb_s_h = [], [], [], [], []
    mb_r, mb_f = [], []
    for traj in range(num_traj):
        ep_obs, ep_ag, ep_g, ep_actions, ep_s_h = [], [], [], [], []
        ep_r, ep_f = [], []
        observation = self.env.reset()
        obs = observation['observation']
        ag = observation['achieved_goal']
        g = observation['desired_goal']
        s_h = self.controller.heuristic_obs_g(obs, g)
        f = self.env.extract_features(obs, g)
        for _ in range(self.env_params['max_timesteps']):
            ac, info = self.controller.act(observation)
            ac_ind = self.env.discrete_actions[tuple(ac)]
            logger.debug('Heuristic', info['start_node_h'])
            logger.debug('Action', ac)
            observation_new, rew, _, _ = self.env.step(ac)
            n_steps += 1
            obs_new = observation_new['observation']
            ag_new = observation_new['achieved_goal']
            if self.args.render:
                self.env.render()
            ep_obs.append(obs.copy())
            ep_ag.append(ag.copy())
            ep_g.append(g.copy())
            # FIX: Store the action index instead of the raw action
            ep_actions.append(ac_ind)
            ep_s_h.append(s_h)
            ep_r.append(rew)
            ep_f.append(f.copy())
            obs = obs_new.copy()
            ag = ag_new.copy()
            observation = observation_new
            s_h = self.controller.heuristic_obs_g(obs, g)
            f = self.env.extract_features(obs, g)
        # Store the final observation of the episode
        ep_obs.append(obs.copy())
        ep_ag.append(ag.copy())
        ep_s_h.append(s_h)
        ep_f.append(f.copy())
        mb_obs.append(ep_obs)
        mb_ag.append(ep_ag)
        mb_actions.append(ep_actions)
        mb_g.append(ep_g)
        mb_s_h.append(ep_s_h)
        mb_r.append(ep_r)
        mb_f.append(ep_f)
    mb_obs = np.array(mb_obs)
    mb_ag = np.array(mb_ag)
    mb_g = np.array(mb_g)
    mb_actions = np.array(mb_actions)
    mb_s_h = np.array(mb_s_h)
    mb_r = np.array(mb_r)
    mb_f = np.array(mb_f)
    self.dataset.store_episode(
        [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_f])
    # Update normalizer
    self._update_normalizer(
        [mb_obs, mb_ag, mb_g, mb_actions, mb_s_h, mb_r, mb_f])
    return n_steps
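# NOTE: Both rollout functions index `discrete_actions` with a tuple
# (`ac_ind = self.env.discrete_actions[tuple(ac)]`), i.e. it maps an action
# tuple to an integer index. That mapping is not constructed in this section;
# the helper below is a hypothetical sketch of how such a lookup table could be
# built from per-dimension action values (names and values are assumptions).
def build_discrete_actions(per_dim_values):
    # Enumerate all combinations of per-dimension values and assign each tuple
    # a unique index, e.g. {(-1, -1): 0, (-1, 0): 1, ...}.
    from itertools import product
    return {action: idx for idx, action in enumerate(product(*per_dim_values))}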